Skip to content

Instantly share code, notes, and snippets.

@Pascal66
Last active February 4, 2020 15:02
Show Gist options
  • Save Pascal66/870c715529339734f5cfdd312bd82ae6 to your computer and use it in GitHub Desktop.
Save Pascal66/870c715529339734f5cfdd312bd82ae6 to your computer and use it in GitHub Desktop.
Fast Segmented Twin Primes SieveopenCL (Aparapi)
package fr.cridp.sieve;
import com.aparapi.Kernel;
import com.aparapi.ProfileInfo;
import com.aparapi.Range;
import com.aparapi.internal.kernel.KernelManager;
import java.math.BigInteger;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Scanner;
import java.util.stream.IntStream;
import static java.math.BigInteger.ONE;
import static java.math.BigInteger.ZERO;
/**
* This Java source file is a multiple threaded implementation to perform an
* extremely fast Segmented Sieve of Zakiya (SSoZ) to find Twin Primes <= N.
* <p>
* Inputs are single values N, of 64-bits, 0 -- 2^64 - 1.
* Output is the number of twin primes <= N; the last
* twin prime value for the range; and the total time of execution.
* <p>
* Run as Java application, and enter a range (comma or space spareted) when asked in console.
* <p>
* This java project, and updates, will be available here:
* https://github.com/Pascal66/TwinsPrimesSieve
* <p>
* <p>
* Example :
* Please enter an range of integer (comma or space separated):
* 0 1e11
* Max threads = 8
* Using Prime Generator parameters for given Pn 13
* segment size = 524288 resgroups; seg array is [1 x 8192] 64-bits
* twinprime candidates = 4945055940 ; resgroups = 3330004
* each 1485 threads has nextp[2 x 27287] array
* setup time = 0.124 secs
* perform twinprimes ssoz sieve with s=6
* sieve time = 11.17 secs
* last segment = 184276 resgroups; segment slices = 7
* total twins = 224376048; last twin = 99999999762+/-1
* total time = 11.294 secs
* <p>
* Please enter an range of integer (comma or space separated):
* 0 1e12
* Max threads = 8
* Using Prime Generator parameters for given Pn 13
* segment size = 802816 resgroups; seg array is [1 x 12544] 64-bits
* twinprime candidates = 49450550490 ; resgroups = 33300034
* each 1485 threads has nextp[2 x 78492] array
* setup time = 0.125 secs
* perform twinprimes ssoz sieve with s=6
* sieve time = 154.168 secs
* last segment = 384578 resgroups; segment slices = 42
* total twins = 1870585220; last twin = 999999999960+/-1
* total time = 154.293 secs
* <p>
* Original nim source file, and updates, available here:
* https://gist.github.com/jzakiya/6c7e1868bd749a6b1add62e3e3b2341e
* Original d source file, and updates, available here:
* https://gist.github.com/jzakiya/ae93bfa03dbc8b25ccc7f97ff8ad0f61
* Original rust source file, and updates, available here:
* https://gist.github.com/jzakiya/b96b0b70cf377dfd8feb3f35eb437225
* <p>
* Mathematical and technical basis for implementation are explained here:
* https://www.academia.edu/37952623The_Use_of_Prime_Generators_to_Implement_Fast_Twin_Primes_Sieve_of_Zakiya_SoZ_Applications_to_Number_Theory_and_Implications_for_the_Riemann_Hypotheses
* https://www.academia.edu/7583194/The_Segmented_Sieve_of_Zakiya_SSoZ
* https://www.academia.edu/19786419/PRIMES-UTILS_HANDBOOK
* <p>
* This code is provided free and subject to copyright and terms of the
* GNU General Public License Version 3, GPLv3, or greater.
* License copy/terms are here: http://www.gnu.org/licenses/
* <p>
* Copyright (c) 2017-20 Jabari Zakiya -- jzakiya at gmail dot com
* Java version 0.21.CL - Pascal Pechard -- pascal at priveyes dot net
* Version Date: 2020/01/20
* Attention ! Range 0 1E6 crash
*/
public class SSOZJCL2 {
static final BigInteger TWO = ONE.add(ONE);
static final BigInteger THREE = TWO.add(ONE);
static int segmentSize = 0; // segment size for each seg restrack
static BigInteger start_num; // lo number for range
static BigInteger end_num; // hi number for range
static BigInteger Kmax = ZERO; // number of resgroups to end_num
static BigInteger Kmin; // number of resgroups to start_num
static ArrayDeque<Integer> primes; // list of primes r1..sqrt(N)
// static long[] cnts; // hold twin primes counts for seg bytes
// static long[] lastwins; // holds largest twin prime <= num in each thread
static BigInteger PGmodulus; // PG's modulus value
static int firstResidue; // PG's first residue value
static LinkedList<Integer> restwins;// PG's list of twinpair residues
static int[] resinvrs; // PG's list of residues inverses
static int segmentSizeFactor; // segment size factor for PG and input number
static final int S = 5; // 0|3 for 1|8 resgroups/byte for 'small|large' ranges
static final int BMASK = (1 << S) - 1;
/**
* Create Prime Generator parameters for given Pn.
* Using BigInteger permit gcd and ModInverse
* For convenience, both version (Long and BigInteger are created)
*
* @param bestPrimeGenerator int
*/
private static void genPGparameters(int bestPrimeGenerator) {
System.out.println("Using Prime Generator parameters for given Pn " + bestPrimeGenerator);
final int[] primes = {2, 3, 5, 7, 11, 13, 17, 19, 23};
PGmodulus = ONE;
firstResidue = 0;
for (int prime : primes) {
firstResidue = prime;
if (prime > bestPrimeGenerator) break;
PGmodulus = PGmodulus.multiply(BigInteger.valueOf(firstResidue));
}
LinkedList<Integer> restwins = new LinkedList<>(); // save upper twin pair residues here
int[] inverses = new int[PGmodulus.intValue() + 2]; // save PG's residues inverses here
BigInteger pc = THREE.add(TWO);
int inc = 2;
BigInteger res = ZERO;
// find a residue, then modular complement
while (pc.compareTo(PGmodulus.divide(TWO)) < 0) {
// if pc a residue
if (PGmodulus.gcd(pc).equals(ONE)) {
final BigInteger pc_mc = PGmodulus.subtract(pc); // create its modular complement
int inv_r = pc.modInverse(PGmodulus).intValue();// modinv(pc, modpg); // compute residues's inverse
inverses[pc.intValue()] = inv_r; // save its inverse
inverses[inv_r] = pc.intValue(); // save its inverse inverse
inv_r = pc_mc.modInverse(PGmodulus).intValue(); // compute residues's complement inverse
inverses[pc_mc.intValue()] = inv_r; // save its inverse
inverses[inv_r] = pc_mc.intValue(); // save its inverse inverse
if (res.add(TWO).equals(pc)) {
restwins.add(pc.intValueExact());
restwins.add(pc_mc.add(TWO).intValueExact());
} // save hi_tp residues
res = pc;
}
pc = BigInteger.valueOf(pc.longValue() + inc);
inc ^= 0b110;
}
restwins.sort(Comparator.naturalOrder());
restwins.add(PGmodulus.add(ONE).intValueExact()); // last residue is last hi_tp
inverses[PGmodulus.add(ONE).intValue()] = ONE.intValue();
inverses[PGmodulus.subtract(ONE).intValue()] = PGmodulus.subtract(ONE).intValue(); // last 2 residues are self inverses
new PGparam(PGmodulus, firstResidue, restwins, inverses);
}
static class PGparam {
public static int modulus;
public static int Lkmin;
public static int Lkmax;
public static int Lstart;
public static int Lend;
public static int primesSize;
public static int segByteSize = 0;
public static int Lrange;
public PGparam(BigInteger modpg, int res_0, LinkedList<Integer> restwins, int[] inverses) {
SSOZJCL2.PGmodulus = modpg;
SSOZJCL2.firstResidue = res_0;
SSOZJCL2.restwins = restwins;
SSOZJCL2.resinvrs = inverses;
modulus = modpg.intValueExact();
Kmin = start_num.subtract(TWO).divide(modpg).add(ONE); // number of resgroups to start_num
Kmax = end_num.subtract(TWO).divide(modpg).add(ONE); // number of resgroups to end_num
Lkmin = Kmin.intValueExact();
Lkmax = Kmax.intValueExact();
// number of range resgroups, at least 1
Lrange = Lkmax - Lkmin + 1;
int n = 4; //Lrange < 37_500_000_000_000L ? 4 : Lrange < 975_000_000_000_000L ? 6 : 8;
// set seg size to optimize for selected PG
int B = (segmentSizeFactor * 1024 * n);
// segments resgroups size
segmentSize = Math.min(Lrange, B);
segByteSize = (int) ((segmentSize - 1 >>> S) + 1);
System.out.println("segment size = " + segmentSize + " resgroups; seg array is [1 x " + segByteSize + "] "+(BMASK+1L)+"-bits");
int maxpairs = Lrange * restwins.size(); // maximum number of twinprime pcs
System.out.println("twinprime candidates = " + maxpairs + " ; resgroups = " + Lrange);
}
}
/**
* Select at runtime best PG and segment size factor to use for input value.
* These are good estimates derived from PG data profiling. Can be improved.
*/
static void setSieveParameters(BigInteger start_range, BigInteger end_range) {
final BigInteger range = end_range.subtract(start_range);
int bestPG = 3;
if (end_range.compareTo(BigInteger.valueOf(49L)) < 0) {
segmentSizeFactor = 1;
bestPG = 3;
} else if (range.compareTo(BigInteger.valueOf(24_000_000L)) < 0) {
segmentSizeFactor = 16;
bestPG = 5;
} else if (range.compareTo(BigInteger.valueOf(1_100_000_000L)) < 0) {
segmentSizeFactor = 32;
bestPG = 7;
} else if (range.compareTo(BigInteger.valueOf(35_500_000_000L)) < 0) {
segmentSizeFactor = 64;
bestPG = 11;
} else if (range.compareTo(BigInteger.valueOf(15_000_000_000_000L)) < 0) {
bestPG = 13;
if (range.compareTo(BigInteger.valueOf(7_000_000_000_000L)) > 0) {
segmentSizeFactor = 384;
} else if (range.compareTo(BigInteger.valueOf(2_500_000_000_000L)) > 0) {
segmentSizeFactor = 320;
} else if (range.compareTo(BigInteger.valueOf(250_000_000_000L)) > 0) {
segmentSizeFactor = 196;
} else {
segmentSizeFactor = 128;
}
} else {
segmentSizeFactor = 384;
bestPG = 17;
}
genPGparameters(bestPG);
}
/**
* Compute the primes r1..sqrt(input_num) and store in global 'primes' array.
* Any algorithm (fast|small) is usable. Here the SoZ for P5 is used.
*/
static void sozpg(BigInteger maxRange) {
// P5's modulus value
final int P5Modulus = 30;
// P5's residue count
final int P5ResidueCounter = 8;
// P5's residues list
final int[] res = {7, 11, 13, 17, 19, 23, 29, 31};
final int[] posn = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 3, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 6, 0, 7};
// integer sqrt of sqrt(input value)
final long sqrtN = Bsqrt(maxRange).longValue();
// number of resgroups to input value
final int kmax = maxRange.subtract(BigInteger.valueOf(7L))
.divide(BigInteger.valueOf(P5Modulus))
.add(ONE).intValue();
short[] prms = new short[kmax]; // byte array of prime candidates
int modk = 0;
int r = -1;
int k = 0; // init residue parameters
// mark the multiples of the primes r1..sqrtN in 'prms'
while (true) {
r++;
if (r == P5ResidueCounter) {
r = 0;
modk += P5Modulus;
k++;
}
if ((prms[k] & (1 << r) & 0XFF) != 0) continue; // skip pc if not prime
final int prm_r = res[r]; // if prime save its residue value
final int prime = modk + prm_r; // numerate the prime value
if (prime > sqrtN) break; // we're finished when it's > sqrtN
for (int ri : res) { // mark prime's multiples in prms
final int prod = prm_r * ri - 2; // compute cross-product for prm_r|ri pair
final int bit_r = (1 << posn[mod(prod, P5Modulus)]) & 0XFF; // bit mask for prod's residue
int kpm = k * (prime + ri) + prod / P5Modulus; // 1st resgroup for prime mult
while (kpm < kmax) {
prms[kpm] |= bit_r;
kpm += prime;
}
}
}
// prms now contains the nonprime positions for the prime candidates r1..N
// extract primes into global var 'primes'
primes = new ArrayDeque<>(); // create empty dynamic array for primes
IntStream.range(0, kmax).forEach(km -> { // for each resgroup
int[] ri = {0};
for (int res_r : res) { // extract the primes in numerical order
if ((prms[km] & (1 << ri[0]++) & 0XFF) == 0) {
primes.add(P5Modulus * km + res_r);
}
}
});
while (primes.getFirst() < firstResidue) primes.pollFirst();
while (primes.getLast() > maxRange.longValue()) primes.pollLast();
}
static class twinSieve extends Kernel {
//TODO Must be outside
final int[] lastwins;
final int[] cnts;
@Constant final int[] twinsResidues;
@Constant final int[] residuesInverse;
@Constant final int[] primesArray;
int PGmodulus = 0;
int Lend = 0;
int Lstart = 0;
int firstResidueIndex = 0; //PGparam.Lkmin - 1;
int lastResidueIndex = 0; //PGparam.Lkmax;
int segSize = 0;
int segSlice = 0;
int flagCreateNextTwinPrime = 0;
@Local final int[] nextp;
@Local final int[] seg;
twinSieve(int[] twinsResidues, int[] primesArray) {
segSize = SSOZJCL2.segmentSize;
segSlice = segSize;
Lstart = PGparam.Lstart;
Lend = PGparam.Lend;
firstResidueIndex = PGparam.Lkmin - 1;
lastResidueIndex = PGparam.Lkmax;
PGmodulus = PGparam.modulus;
// Not working in constructor !?
this.twinsResidues = twinsResidues; //SSOZJCL2.restwins.stream().mapToLong(i->i).toArray();
this.primesArray = primesArray; //SSOZJCL2.primes.stream().mapToLong(i->i).toArray();
this.residuesInverse = resinvrs;
// This must be final global and not here !
// array to hold count of tps for each thread
// TODO must be outside
cnts = new int[restwins.size()];
// array to hold largest tp for each thread
lastwins = new int[restwins.size()];
seg = new int[(int) ((segSize - 1 >>> S) + 1)];
nextp = new int[primes.size()];
}
@Override
public void run() {
int thread = getGlobalId();
int upperTwinResidue = twinsResidues[thread];
int lowerTwinResidue = upperTwinResidue - 2;
int sum = 0;
// ensure high and low tps in range
if (((lastResidueIndex - 1) * PGmodulus + upperTwinResidue) > Lend) lastResidueIndex--;
if ((firstResidueIndex * PGmodulus + lowerTwinResidue) < Lstart) firstResidueIndex++;
// TODO Doesnt work in @run
// @Local int[] seg = new int[(int) ((KB - 1 >>> S) + 1)];
// @Local int[] nextp = new int[AprimesArray.length];
int highestTwinPair = 0; // max tp|resgroup, Kmax for slice
flagCreateNextTwinPrime = firstResidueIndex;
while (firstResidueIndex < lastResidueIndex) {
// set last slice resgroup size
if ((lastResidueIndex - firstResidueIndex) < segSize) segSlice = lastResidueIndex - firstResidueIndex;
for (int i = 0; i < primesArray.length; i++) {
//NextpInitPart
//This is done I time for each thread, for each prime
//Cant avoid it, since openCl doesnt Allow to call external method
if (flagCreateNextTwinPrime == firstResidueIndex) {
// find the resgroup it's in, and its residue value, and its residue inverse
int residueGroupIndex = (primesArray[i] - 2) / PGmodulus;
int residue = floorMod((primesArray[i] - 2), PGmodulus) + 2;
int residueInverse = residuesInverse[residue];
// compute the lowResidue for r for lo_tp
int lowResidue = floorMod((lowerTwinResidue * residueInverse - 2), PGmodulus) + 2;
int indexOfLowResidue = (residueGroupIndex * (primesArray[i] + lowResidue) + (residue * lowResidue - 2) / PGmodulus);
if (indexOfLowResidue < firstResidueIndex) { // if 1st mult index < start_num's
indexOfLowResidue = floorMod((firstResidueIndex - indexOfLowResidue), primesArray[i]); // how many indices short is it
if (indexOfLowResidue > 0) indexOfLowResidue = primesArray[i] - indexOfLowResidue; // adjust index value into range
} else indexOfLowResidue -= firstResidueIndex; // else here, adjust index if it was >
nextp[i << 1] = indexOfLowResidue; // lo_tp index val >= start of range
// compute the highResidue for r for hi_tp
int highResidue = floorMod((upperTwinResidue * residueInverse - 2), PGmodulus) + 2;
int indexOfHighResidue = residueGroupIndex * (primesArray[i] + highResidue) + (residue * highResidue - 2) / PGmodulus;
if (indexOfHighResidue < firstResidueIndex) { // if 1st mult index < start_num's
indexOfHighResidue = floorMod((firstResidueIndex - indexOfHighResidue), primesArray[i]); // how many indices short is it
if (indexOfHighResidue > 0) indexOfHighResidue = primesArray[i] - indexOfHighResidue; // adjust index value into range
} else indexOfHighResidue -= firstResidueIndex; // else here, adjust index if it was >
nextp[i << 1 | 1] = indexOfHighResidue; // hi_tp index val >= start of range
} // End NextpInit
// for lower twin pair residue track
int indexOfLowResidue = nextp[i << 1]; // starting from this resgroup in seg
for (; indexOfLowResidue < segSlice; indexOfLowResidue += primesArray[i])
seg[(indexOfLowResidue >>> S)] |= (1 << (indexOfLowResidue & BMASK)); // mark primenth resgroup bits prime mults and set next prime multiple resgroup
nextp[i << 1] = (indexOfLowResidue - segSlice); // save 1st resgroup in next eligible seg
// for upper twin pair residue track
int indexOfHighResidue = nextp[i << 1 | 1]; // starting from this resgroup in seg
for (; indexOfHighResidue < segSlice; indexOfHighResidue += primesArray[i]) // mark primenth resgroup bits prime mults
seg[(indexOfHighResidue >>> S)] |= (1 << (indexOfHighResidue & BMASK)); // set next prime multiple resgroup
nextp[i << 1 | 1] = (indexOfHighResidue - segSlice); // save 1st resgroup in next eligible seg
}
int upperSegmentIndex = (segSlice - 1);
seg[upperSegmentIndex >>> S] |= ((-(2 << (upperSegmentIndex & BMASK))));
// init seg twin primes count then find seg sum
int cnt = 0;
for (int m = 0; m <= (upperSegmentIndex >>> S); m++) cnt += (1 << S) - bitCount(seg[m]);
// if segment has twin primes, add the segment count to total count
if (cnt > 0) {
sum += cnt;
// from end of seg, count backwards to largest tp
while ((seg[(upperSegmentIndex >>> S)] & (1 << (upperSegmentIndex & BMASK))) != 0) upperSegmentIndex--;
// numerate its full resgroup value
highestTwinPair = firstResidueIndex + upperSegmentIndex;
}
// set 1st resgroup val of next seg slice
firstResidueIndex += segSize;
if (firstResidueIndex < lastResidueIndex)
// set seg to all primes
for (int b = 0; b <= upperSegmentIndex >>> S; b++) seg[b] = 0;
}
// when sieve done for full range, numerate largest twin prime in segs
highestTwinPair = upperTwinResidue > Lend ? 0 : highestTwinPair * PGmodulus + upperTwinResidue;
lastwins[thread] = (sum == 0 ? 1 : highestTwinPair);// store final seg tp value
cnts[thread] = sum; // sum for twin pair
}
private int floorMod(int x, int y) {
// return Math.floorMod(x,y);
return ((x >> 31) & (y - 1)) + (x % y);
}
private int bitCount(int i) {
//return Integer.bitCount(i);
i = i - ((i >>> 1) & 0x55555555);
i = (i & 0x33333333) + ((i >>> 2) & 0x33333333);
i = (i + (i >>> 4)) & 0x0f0f0f0f;
i = i + (i >>> 8);
i = i + (i >>> 16);
// i = i + (i >>> 32);
return i & 0x3f;
}
}
/**
* Main routine to setup, time, and display results for twin primes sieve.
*/
static void twinprimes_ssoz() {
System.out.println(" Max threads = " + (countProcessors()));
long ts = epochTime();
// select PrimeGenerator and seg factor Bn for input range
setSieveParameters(start_num, end_num);
final int pairscnt = restwins.size(); // number of twin pairs for selected PG
final int[] cnts = new int[restwins.size()]; // array to hold count of tps for each thread
final int[] lastwins = new int[restwins.size()]; // array to hold largest tp for each thread
// generate sieving primes
if (PGparam.Lend < 49L) primes.add((5));
else sozpg(Bsqrt(end_num)); // <= sqrt(end_num)
PGparam.primesSize = primes.size();
System.out.println("each " + pairscnt + " threads has nextp[" + 2 + " x " + PGparam.primesSize + "] array");
int twinscnt = 0; // number of twin primes in range
final int lo_range = restwins.getFirst() - 3; // lo_range = lo_tp - 1
// excluded low tp values for PGs used
for (int tp : new int[]{3, 5, 11, 17}) {
if (end_num.equals(THREE)) break; // if 3 end of range, no twin primes
if (tp >= PGparam.Lstart && tp <= lo_range) twinscnt++;
}
long te = epochTime() - ts;
System.out.println("setup time = " + te / 1e3 + " secs");
System.out.println("perform twinprimes ssoz sieve");
final long t1 = epochTime(); // start timing ssoz sieve execution
//This doesnt work in constructor !?
final int[] arestwins = restwins.stream().mapToInt(i -> i).toArray();
final int[] aprimesArray = primes.stream().mapToInt(i -> i).toArray();
twinSieve sieve = new twinSieve(arestwins, aprimesArray);
sieve.setExplicit(true);
Range twinRange = Range.create(pairscnt);
sieve.put(cnts).put(lastwins);
sieve.execute(twinRange);
sieve.get(cnts).get(lastwins);
// find largest twin prime in range
int last_twin = 0;
System.out.println("cnts Array "+Arrays.toString(cnts));
System.out.println("lastwins Array "+ Arrays.toString(lastwins));
twinscnt += Arrays.stream(cnts).sum();
last_twin = Arrays.stream(lastwins).max().orElse(0);
if (PGparam.Lend == 5L && twinscnt == 1) last_twin = 5;
// set number of resgroups in last slice
int Kn = PGparam.Lrange % segmentSize;
// if multiple of seg size set to seg size
if (Kn == 0) Kn = segmentSize;
long t2 = epochTime() - t1;
System.out.println("sieve time = " + t2 / 1e3 + " secs");
System.out.println("last segment = " + Kn + " resgroups; segment slices = " + ((PGparam.Lrange - 1) / segmentSize + 1));
System.out.println("total twins = " + twinscnt + "; last twin = " + (last_twin - 1) + "+/-1");
System.out.println("total time = " + (t2 + te) / 1e3 + " secs\n");
System.out.println(KernelManager.instance().bestDevice());
System.out.println("\nRange " + twinRange.toString());
List<ProfileInfo> profileInfo = sieve.getProfileInfo();
if (profileInfo != null && profileInfo.size() > 0) {
for (ProfileInfo p : profileInfo) {
System.out.print(p.getType() + " " + p.getLabel()
+ " " + p.getStart() / 1000 + " .. " + p.getEnd() / 1000
+ " " + (p.getEnd() - p.getStart()) / 1000 + "us"
+ " " + p.getSubmit() + " " + p.getQueued() );
}
System.out.println();
}
sieve.dispose();
}
public static void main(String[] args) {
boolean verbose = true;
if (verbose) {
// System.setProperty("com.aparapi.enableVerboseJNI", "true");
// System.setProperty("com.aparapi.dumpFlags", "true");
System.setProperty("com.aparapi.enableShowGeneratedOpenCL", "true");
// System.setProperty("com.aparapi.enableVerboseJNIOpenCLResourceTracking", "true");
System.setProperty("com.aparapi.enableExecutionModeReporting", "true");
System.setProperty("com.aparapi.logLevel", "WARNING"); //{OFF|FINEST|FINER|FINE|WARNING|SEVERE|ALL}
System.setProperty("com.aparapi.enableProfiling", "true");
System.setProperty("com.aparapi.executionMode", "GPU,CPU"); //{GPU|ACC|CPU|JTP|SEQ}
System.setProperty("com.aparapi.dumpProfilesOnExit", "true");
}
boolean tweak = true;
if (tweak) {
System.setProperty("com.aparapi.enable.PUTFIELD", "true");
// System.setProperty("com.aparapi.disable.ARETURN", "true");
System.setProperty("com.aparapi.enable.PUTSTATIC", "true");
System.setProperty("com.aparapi.enable.GETSTATIC", "true");
// System.setProperty("com.aparapi.enable.INVOKEINTERFACE", "true");
System.setProperty("com.aparapi.enable.MONITOR", "true");
System.setProperty("com.aparapi.disable.ARRAY", "false");
// System.setProperty("com.aparapi.enable.NEW", "true");
// System.setProperty("com.aparapi.enable.ATHROW", "true");
// System.setProperty("com.aparapi.disable.METHODARRAYPASSING", "true");
System.setProperty("com.aparapi.enable.ARRAYLENGTH", "true");
// System.setProperty("com.aparapi.enable.SWITCH", "true");
//System.setProperty("com.aparapi.enableAtomic32", "true");
//System.setProperty("com.aparapi.enableAtomic64", "true");
System.setProperty("com.aparapi.enableByteWrites", "true");
//System.setProperty("com.aparapi.enableDoubles", "true");
}
//if (args==null)
Scanner userInput = new Scanner(System.in).useDelimiter("[,\\s+]");
System.out.println("Please enter an range of integer (comma or space separated): ");
//Only BigDecimal understand scientific notation
BigInteger stop = userInput.nextBigDecimal().toBigIntegerExact();
BigInteger start = userInput.hasNextLine() ? userInput.nextBigDecimal().toBigIntegerExact() : THREE;
userInput.close();
if (stop.compareTo(start) < 0) {
BigInteger tmp = start;
start = stop;
stop = tmp;
}
start = start.max(THREE);
BigInteger end = stop.max(THREE);
start_num = start.or(ONE); // if start_num even add 1
end_num = end.subtract(ONE).or(ONE); // if end_num even subtract 1
PGparam.Lstart = start_num.intValueExact();
PGparam.Lend = end_num.intValueExact();
twinprimes_ssoz();
}
private static int countProcessors() {
return Runtime.getRuntime().availableProcessors();
}
/**
* Granularity tens of milliseconds.
*/
private static long epochTime() {
return System.currentTimeMillis();
}
public static BigInteger Bsqrt(BigInteger x) {
BigInteger div = BigInteger.ZERO.setBit(x.bitLength() >> 1);
BigInteger div2 = div;
// Loop until we hit the same value twice in a row, or wind up alternating.
for (; ; ) {
BigInteger y = div.add(x.divide(div)).shiftRight(1);
if (y.equals(div) || y.equals(div2)) return div.min(div2);
div2 = div;
div = y;
}
}
/**
* According to the Java Language Spec, Java's % operator is a remainder operator, not a modulo operator.
*
* @param x a long
* @param y a long
* @return a long modulo
*/
private static int mod(int x, int y) {
// return Math.floorMod(x,y);
return ((x >> 31) & (y - 1)) + (x % y);
}
}
@Pascal66
Copy link
Author

Pascal66 commented Feb 2, 2020

19:25:57: Executing task 'SSOZJCL2.main()'...

> Task :compileJava UP-TO-DATE
> Task :processResources NO-SOURCE
> Task :classes UP-TO-DATE

> Task :SSOZJCL2.main()
Please enter an range of integer (comma or space separated): 
0 1e6
 Max threads = 8
Using Prime Generator parameters for given Pn 5
segment size = 33334 resgroups; seg array is [1 x 521] 64-bits
twinprime candidates = 100002 ; resgroups = 33334
each 3 threads has nextp[2 x 165] array
setup time = 0.01 secs
perform twinprimes ssoz sieve
typedef struct This_s{
   __constant long *Arestwins;
   long kmax;
   long Lmodpg;
   long Lend;
   long kmin;
   long Lstart;
   long K1;
   long KB;
   long Kn;
   __constant long *AprimesArray;
   int AprimesArray__javaArrayLength;
   __constant long *Aresinvrs;
   __local long *nextp;
   __local long *seg;
   __global long *lastwins;
   __global long *cnts;
   int passid;
}This;
int get_pass_id(This *this){
   return this->passid;
}
int fr_cridp_sieve_SSOZJCL2$twinSieve__bitCount(This *this, long i){
   i = i - ((((unsigned long)i) >> 1) & 6148914691236517205L);
   i = (i & 3689348814741910323L) + ((((unsigned long)i) >> 2) & 3689348814741910323L);
   i = (i + (((unsigned long)i) >> 4)) & 1085102592571150095L;
   i = i + (((unsigned long)i) >> 8);
   i = i + (((unsigned long)i) >> 16);
   i = i + (((unsigned long)i) >> 32);
   return(((int)i & 127));
}
long fr_cridp_sieve_SSOZJCL2$twinSieve__mod(This *this, long x, long y){
   return((((x >> 31) & (y - 1L)) + (x % y)));
}
__kernel void run(
   __constant long *Arestwins, 
   long kmax, 
   long Lmodpg, 
   long Lend, 
   long kmin, 
   long Lstart, 
   long K1, 
   long KB, 
   long Kn, 
   __constant long *AprimesArray, 
   int AprimesArray__javaArrayLength, 
   __constant long *Aresinvrs, 
   __local long *nextp, 
   __local long *seg, 
   __global long *lastwins, 
   __global long *cnts, 
   int passid
){
   This thisStruct;
   This* this=&thisStruct;
   this->Arestwins = Arestwins;
   this->kmax = kmax;
   this->Lmodpg = Lmodpg;
   this->Lend = Lend;
   this->kmin = kmin;
   this->Lstart = Lstart;
   this->K1 = K1;
   this->KB = KB;
   this->Kn = Kn;
   this->AprimesArray = AprimesArray;
   this->AprimesArray__javaArrayLength = AprimesArray__javaArrayLength;
   this->Aresinvrs = Aresinvrs;
   this->nextp = nextp;
   this->seg = seg;
   this->lastwins = lastwins;
   this->cnts = cnts;
   this->passid = passid;
   {
      int indx = get_global_id(0);
      long r_hi = this->Arestwins[indx];
      long r_lo = r_hi - 2L;
      long sum = 0L;
      if (((((this->kmax - 1L) * this->Lmodpg) + r_hi) - this->Lend)>0){
         this->kmax=this->kmax - 1L;
      }
      if ((((this->kmin * this->Lmodpg) + (r_hi - 2L)) - this->Lstart)<0){
         this->kmin=this->kmin + 1L;
      }
      long hi_tp = 0L;
      this->K1=this->kmin;
      for (; (this->kmin - this->kmax)<0; ){
         if ((this->KB - (this->kmax - this->kmin))>0){
            this->Kn=this->kmax - this->kmin;
         }
         for (int i = 0; i<this->AprimesArray__javaArrayLength; i++){
            if ((this->K1 - this->kmin)==0){
               long k = (this->AprimesArray[i] - 2L) / this->Lmodpg;
               int r = (int)(fr_cridp_sieve_SSOZJCL2$twinSieve__mod(this, (this->AprimesArray[i] - 2L), this->Lmodpg) + 2L);
               long r_inv = this->Aresinvrs[r];
               long ro = fr_cridp_sieve_SSOZJCL2$twinSieve__mod(this, ((r_lo * r_inv) - 2L), this->Lmodpg) + 2L;
               long ko = (k * (this->AprimesArray[i] + ro)) + ((((long)r * ro) - 2L) / this->Lmodpg);
               if ((ko - this->kmin)<0){
                  ko = fr_cridp_sieve_SSOZJCL2$twinSieve__mod(this, (this->kmin - ko), this->AprimesArray[i]);
                  if ((ko - 0L)>0){
                     ko = this->AprimesArray[i] - ko;
                  }
               } else {
                  ko = ko - this->kmin;
               }
               this->nextp[i << 1]  = ko;
               long ri = fr_cridp_sieve_SSOZJCL2$twinSieve__mod(this, ((r_hi * r_inv) - 2L), this->Lmodpg) + 2L;
               long ki = (k * (this->AprimesArray[i] + ri)) + ((((long)r * ri) - 2L) / this->Lmodpg);
               if ((ki - this->kmin)<0){
                  ki = fr_cridp_sieve_SSOZJCL2$twinSieve__mod(this, (this->kmin - ki), this->AprimesArray[i]);
                  if ((ki - 0L)>0){
                     ki = this->AprimesArray[i] - ki;
                  }
               } else {
                  ki = ki - this->kmin;
               }
               this->nextp[(i << 1) | 1]  = ki;
            }
            {
               long ko = this->nextp[(i << 1)];
               for (; (ko - this->Kn)<0; ko = ko + this->AprimesArray[i]){
                  this->seg[(int)(((unsigned long)ko) >> 6)]  = this->seg[(int)(((unsigned long)ko) >> 6)] | (1L << (int)(ko & 63L));
               }
               this->nextp[i << 1]  = ko - this->Kn;
               long ki = this->nextp[((i << 1) | 1)];
               for (; (ki - this->Kn)<0; ki = ki + this->AprimesArray[i]){
                  this->seg[(int)(((unsigned long)ki) >> 6)]  = this->seg[(int)(((unsigned long)ki) >> 6)] | (1L << (int)(ki & 63L));
               }
               this->nextp[(i << 1) | 1]  = ki - this->Kn;
            }
         }
         int upk = (int)(this->Kn - 1L);
         this->seg[((unsigned int)upk) >> 6]  = this->seg[((unsigned int)upk) >> 6] | -(2L << (int)((long)upk & 63L));
         int cnt = 0;
         for (int m = 0; m<=(((unsigned int)upk) >> 6); m++){
            cnt = cnt + (64 - fr_cridp_sieve_SSOZJCL2$twinSieve__bitCount(this, this->seg[m]));
         }
         if (cnt>0){
            sum = sum + (long)cnt;
            for (; ((this->seg[(((unsigned int)upk) >> 6)] & (1L << (int)((long)upk & 63L))) - 0L)!=0; upk--){
            }
            hi_tp = this->kmin + (long)upk;
         }
         this->kmin=this->kmin + this->KB;
         
         if ((this->kmin - this->kmax)<0){
            for (int b = 0; b<=(((unsigned int)upk) >> 6); b++){
               this->seg[b]  = 0L;
            }
         }
      }
      hi_tp = ((r_hi - this->Lend)>0)?0L:((hi_tp * this->Lmodpg) + r_hi);
      this->lastwins[indx]  = ((sum - 0L)==0)?1L:hi_tp;
      this->cnts[indx]  = sum;
      return;
   }
}

execution complete: twinSieve, modes=[GPU, CPU], current = GPU
Device 1964732344400
  vendor = Advanced Micro Devices, Inc.
  type:GPU
  maxComputeUnits=6
  maxWorkItemDimensions=3
  maxWorkItemSizes={256, 256, 256}
  maxWorkWorkGroupSize=256
  globalMemSize=1073741824
  localMemSize=32768
Range global:3 local:(derived)3
sieve time = 0.989 secs
last segment = 33334 resgroups; segment slices = 1
total twins = 2; last twin = -1+/-1
total time = 0.999 secs


Profiles by Kernel Subclass (mean elapsed times in milliseconds)

Device               Count    CLASS_MODEL_BUILT          INIT_JNI  OPENCL_GENERATED   OPENCL_COMPILED   PREPARE_EXECUTE          EXECUTED  Total
----------------- [[ twinSieve ]] --------------------------------------------------------------------------------------------------
AMD<GPU>             1                  354,874             0,135            14,256           215,798             0,476            77,296  662,835


BUILD SUCCESSFUL in 8s
2 actionable tasks: 1 executed, 1 up-to-date
19:26:06: Task execution finished 'SSOZJCL2.main()'.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment