xiaozhiliaoo · May 29, 2020 17:42
diff --git a/Main.java b/Main.java
 import org.openjdk.jmh.annotations.*;

 import java.util.Arrays;
 import java.util.Random;
 import java.util.concurrent.CountedCompleter;
 import java.util.concurrent.ExecutionException;

 /**
 * 40 Core Xeon E5-2660 v3 @ 2.60GHz
 *
 * Benchmark                                 (choice)   Mode  Cnt        Score      Error  Units
 * forkJoinBench.Main.customCC                 0  thrpt   15  3206592.104 ± 1457.036  ops/s
 * forkJoinBench.Main.customCC                 1  thrpt   15    23593.980 ±    9.608  ops/s
 * forkJoinBench.Main.customCC                 2  thrpt   15      391.796 ±    1.615  ops/s
 * forkJoinBench.Main.customCC                 3  thrpt   15      115.990 ±    0.190  ops/s
 * forkJoinBench.Main.customCC                 4  thrpt   15       81.434 ±    1.050  ops/s
 * forkJoinBench.Main.parallelStream                0  thrpt   15    23671.085 ±  252.414  ops/s
 * forkJoinBench.Main.parallelStream                1  thrpt   15    21285.229 ±  261.597  ops/s
 * forkJoinBench.Main.parallelStream                2  thrpt   15      256.318 ±   13.717  ops/s
 * forkJoinBench.Main.parallelStream                3  thrpt   15       67.165 ±    0.197  ops/s
 * forkJoinBench.Main.parallelStream                4  thrpt   15       40.167 ±    0.086  ops/s
 * forkJoinBench.Main.serial                        0  thrpt   15  3250371.613 ± 1751.204  ops/s
 * forkJoinBench.Main.serial                        1  thrpt   15    22931.864 ±    9.380  ops/s
 * forkJoinBench.Main.serial                        2  thrpt   15       22.846 ±    0.117  ops/s
 * forkJoinBench.Main.serial                        3  thrpt   15        7.644 ±    0.025  ops/s
 * forkJoinBench.Main.serial                        4  thrpt   15        4.583 ±    0.028  ops/s
 *
 */
 @BenchmarkMode(Mode.Throughput)
 @State(Scope.Benchmark)
 @Measurement(iterations = 15)
 @Fork(1)
 public class Main {
 	// just some verification that the numbers are correct
 	public static void main(String[] args) throws ExecutionException, InterruptedException {
 		Main m = new Main();
 		m.arr = arrays[3];
 		System.out.println("serial: " + m.serial()
 			+ "\nparallelStream: " + m.parallelStream()
 			+ "\ncustomCC: " + m.customCC());
 	}

 	static final int[][] arrays = new int[][] {
 		new int[100], new int[10000], new int[10_000_000], new int[30_000_000], new int[50_000_000]
 	};

 	static {
 		Random rand = new Random();
 		for (int[] arr : arrays) {
 			for (int i = 0; i < arr.length; ++i) arr[i] = rand.nextInt(200);
 		}
 	}

 	@Param({"0", "1", "2", "3", "4"})
 	int choice;
 	int[] arr;

 	@Setup(Level.Trial)
 	public void setup() {
 		arr = arrays[choice];
 	}

 	@Benchmark
 	public long serial() {
 		long res = 0;
 		for (int i : arr) if (i < 150) res += fnv(i) % 5;
 		return res;
 	}

 	@Benchmark
 	public long parallelStream() {
 		return Arrays.stream(arr)
 			.parallel()
 			.filter(i -> i < 150)
 			.mapToLong(i -> fnv(i) % 5)
 			.sum();
 	}

 	@Benchmark
 	public long customCC() {
 		return new CustomCC(arr).invoke();
 	}

 	static final class CustomCC extends CountedCompleter<Long> {
 		private static final int TARGET_LEAVES = Runtime.getRuntime().availableProcessors() << 2;
 		// ie, every leaf should be process at least 10_000 elements
 		private static final int MIN_SIZE_PER_LEAF = 10_000;

 		private static int computeInitialPending(int size) {
 			// if x = size / MIN_SIZE_PER_LEAF, and x < TARGET_LEAVES, create x leaves instead.
 			// This ensures that every leaf does at least MIN_SIZE_PER_LEAF operations.
 			int leaves = Math.min(TARGET_LEAVES, size / MIN_SIZE_PER_LEAF);
 			// If the total # leaves = 0, then pending = 0 since we shouldn't fork.
 			// Otherwise, pending = log2(leaves), as explained in the benchmark doc
 			return leaves == 0
 				? 0
 				: 31 - Integer.numberOfLeadingZeros(leaves);
 		}

 		final int[] arr;
 		int pos, size;
 		long res = 0;
 		final CustomCC[] subs;

 		public CustomCC(int[] arr) {
 			this(null, arr, arr.length, 0, computeInitialPending(arr.length));
 		}

 		private CustomCC(CustomCC parent, int[] arr, int size, int pos, int pending) {
 			super(parent, pending);
 			this.arr = arr; this.size = size; this.pos = pos;
 			subs = new CustomCC[pending];
 		}

 		@Override
 		public void compute() {
 			// subs.length === getPendingCount()
 			for (int p = subs.length - 1; p >= 0; --p) {
 				size >>>= 1;
 				CustomCC sub = new CustomCC(this, arr, size, pos + size, p);
 				subs[p] = sub;
 				sub.fork();
 			}

 			// needed for c2 to remove array index checking... modified from java's Spliterator class
 			int[] a; int i, hi;
 			if ((a = arr).length >= (hi = pos + size) && (i = pos) >= 0 && i < hi) {
 				do {
 					int e = a[i];
 					if (e < 150) res += fnv(e) % 5;
 				} while (++i < hi);
 			}

 			tryComplete();
 		}

 		@Override
 		public void onCompletion(CountedCompleter<?> caller) {
 			for (CustomCC sub : subs) res += sub.res;
 		}

 		@Override
 		public Long getRawResult() {
 			return res;
 		}
 	}

 	static final long FNV_64_PRIME  = 0x100000001b3L;
 	static final long FNV_64_OFFSET = 0xcbf29ce484222325L;
 	static long fnv(int x) {
 		long h = FNV_64_OFFSET;
 		h ^= (x >>> 24);
 		h *= FNV_64_PRIME;
 		h ^= (x >>> 16) & 0xff;
 		h *= FNV_64_PRIME;
 		h ^= (x >>> 8) & 0xff;
 		h *= FNV_64_PRIME;
 		h ^= x & 0xff;
 		h *= FNV_64_PRIME;
 		return h;
 	}

 }
	import org.openjdk.jmh.annotations.*;

	import java.util.Arrays;
	import java.util.Random;
	import java.util.concurrent.CountedCompleter;
	import java.util.concurrent.ExecutionException;

	/**
	* 40 Core Xeon E5-2660 v3 @ 2.60GHz
	*
	* Benchmark (choice) Mode Cnt Score Error Units
	* forkJoinBench.Main.customCC 0 thrpt 15 3206592.104 ± 1457.036 ops/s
	* forkJoinBench.Main.customCC 1 thrpt 15 23593.980 ± 9.608 ops/s
	* forkJoinBench.Main.customCC 2 thrpt 15 391.796 ± 1.615 ops/s
	* forkJoinBench.Main.customCC 3 thrpt 15 115.990 ± 0.190 ops/s
	* forkJoinBench.Main.customCC 4 thrpt 15 81.434 ± 1.050 ops/s
	* forkJoinBench.Main.parallelStream 0 thrpt 15 23671.085 ± 252.414 ops/s
	* forkJoinBench.Main.parallelStream 1 thrpt 15 21285.229 ± 261.597 ops/s
	* forkJoinBench.Main.parallelStream 2 thrpt 15 256.318 ± 13.717 ops/s
	* forkJoinBench.Main.parallelStream 3 thrpt 15 67.165 ± 0.197 ops/s
	* forkJoinBench.Main.parallelStream 4 thrpt 15 40.167 ± 0.086 ops/s
	* forkJoinBench.Main.serial 0 thrpt 15 3250371.613 ± 1751.204 ops/s
	* forkJoinBench.Main.serial 1 thrpt 15 22931.864 ± 9.380 ops/s
	* forkJoinBench.Main.serial 2 thrpt 15 22.846 ± 0.117 ops/s
	* forkJoinBench.Main.serial 3 thrpt 15 7.644 ± 0.025 ops/s
	* forkJoinBench.Main.serial 4 thrpt 15 4.583 ± 0.028 ops/s
	*
	*/
	@BenchmarkMode(Mode.Throughput)
	@State(Scope.Benchmark)
	@Measurement(iterations = 15)
	@Fork(1)
	public class Main {
	// just some verification that the numbers are correct
	public static void main(String[] args) throws ExecutionException, InterruptedException {
	Main m = new Main();
	m.arr = arrays[3];
	System.out.println("serial: " + m.serial()
	+ "\nparallelStream: " + m.parallelStream()
	+ "\ncustomCC: " + m.customCC());
	}

	static final int[][] arrays = new int[][] {
	new int[100], new int[10000], new int[10_000_000], new int[30_000_000], new int[50_000_000]
	};

	static {
	Random rand = new Random();
	for (int[] arr : arrays) {
	for (int i = 0; i < arr.length; ++i) arr[i] = rand.nextInt(200);
	}
	}

	@Param({"0", "1", "2", "3", "4"})
	int choice;
	int[] arr;

	@Setup(Level.Trial)
	public void setup() {
	arr = arrays[choice];
	}

	@Benchmark
	public long serial() {
	long res = 0;
	for (int i : arr) if (i < 150) res += fnv(i) % 5;
	return res;
	}

	@Benchmark
	public long parallelStream() {
	return Arrays.stream(arr)
	.parallel()
	.filter(i -> i < 150)
	.mapToLong(i -> fnv(i) % 5)
	.sum();
	}

	@Benchmark
	public long customCC() {
	return new CustomCC(arr).invoke();
	}

	static final class CustomCC extends CountedCompleter<Long> {
	private static final int TARGET_LEAVES = Runtime.getRuntime().availableProcessors() << 2;
	// ie, every leaf should be process at least 10_000 elements
	private static final int MIN_SIZE_PER_LEAF = 10_000;

	private static int computeInitialPending(int size) {
	// if x = size / MIN_SIZE_PER_LEAF, and x < TARGET_LEAVES, create x leaves instead.
	// This ensures that every leaf does at least MIN_SIZE_PER_LEAF operations.
	int leaves = Math.min(TARGET_LEAVES, size / MIN_SIZE_PER_LEAF);
	// If the total # leaves = 0, then pending = 0 since we shouldn't fork.
	// Otherwise, pending = log2(leaves), as explained in the benchmark doc
	return leaves == 0
	? 0
	: 31 - Integer.numberOfLeadingZeros(leaves);
	}

	final int[] arr;
	int pos, size;
	long res = 0;
	final CustomCC[] subs;

	public CustomCC(int[] arr) {
	this(null, arr, arr.length, 0, computeInitialPending(arr.length));
	}

	private CustomCC(CustomCC parent, int[] arr, int size, int pos, int pending) {
	super(parent, pending);
	this.arr = arr; this.size = size; this.pos = pos;
	subs = new CustomCC[pending];
	}

	@Override
	public void compute() {
	// subs.length === getPendingCount()
	for (int p = subs.length - 1; p >= 0; --p) {
	size >>>= 1;
	CustomCC sub = new CustomCC(this, arr, size, pos + size, p);
	subs[p] = sub;
	sub.fork();
	}

	// needed for c2 to remove array index checking... modified from java's Spliterator class
	int[] a; int i, hi;
	if ((a = arr).length >= (hi = pos + size) && (i = pos) >= 0 && i < hi) {
	do {
	int e = a[i];
	if (e < 150) res += fnv(e) % 5;
	} while (++i < hi);
	}

	tryComplete();
	}

	@Override
	public void onCompletion(CountedCompleter<?> caller) {
	for (CustomCC sub : subs) res += sub.res;
	}

	@Override
	public Long getRawResult() {
	return res;
	}
	}

	static final long FNV_64_PRIME = 0x100000001b3L;
	static final long FNV_64_OFFSET = 0xcbf29ce484222325L;
	static long fnv(int x) {
	long h = FNV_64_OFFSET;
	h ^= (x >>> 24);
	h *= FNV_64_PRIME;
	h ^= (x >>> 16) & 0xff;
	h *= FNV_64_PRIME;
	h ^= (x >>> 8) & 0xff;
	h *= FNV_64_PRIME;
	h ^= x & 0xff;
	h *= FNV_64_PRIME;
	return h;
	}

	}