How do I implement the Sieve Of Eratosthenes using multithreaded C#?

Asked 15/1, 2011 at 14:30 Answered 19/9, 2013 at 1:49

Solved c#multithreading primes sieve-of-eratosthenes sieve

I am trying to implement Sieve Of Eratosthenes using Mutithreading. Here is my implementation:

using System;
using System.Collections.Generic;
using System.Threading;

namespace Sieve_Of_Eratosthenes 
{
    class Controller 
        {
        public static int upperLimit = 1000000000;
        public static bool[] primeArray = new bool[upperLimit];

        static void Main(string[] args) 
        {
        DateTime startTime = DateTime.Now;

        Initialize initial1 = new Initialize(0, 249999999);
        Initialize initial2 = new Initialize(250000000, 499999999);
        Initialize initial3 = new Initialize(500000000, 749999999);
        Initialize initial4 = new Initialize(750000000, 999999999);

        initial1.thread.Join();
        initial2.thread.Join();
        initial3.thread.Join();
        initial4.thread.Join();

        int sqrtLimit = (int)Math.Sqrt(upperLimit);

        Sieve sieve1 = new Sieve(249999999);
        Sieve sieve2 = new Sieve(499999999);
        Sieve sieve3 = new Sieve(749999999);
        Sieve sieve4 = new Sieve(999999999);

        for (int i = 3; i < sqrtLimit; i += 2) 
            {
            if (primeArray[i] == true) 
                {
                int squareI = i * i;

                    if (squareI <= 249999999) 
                    {
                sieve1.set(i);
                sieve2.set(i);
                sieve3.set(i);
                sieve4.set(i);
                sieve1.thread.Join();
                sieve2.thread.Join();
                sieve3.thread.Join();
                sieve4.thread.Join();
            } 
                    else if (squareI > 249999999 & squareI <= 499999999) 
                    {
                sieve2.set(i);
                sieve3.set(i);
                sieve4.set(i);
                sieve2.thread.Join();
                sieve3.thread.Join();
                sieve4.thread.Join();
            } 
                    else if (squareI > 499999999 & squareI <= 749999999) 
                    {
                sieve3.set(i);
                sieve4.set(i);
                sieve3.thread.Join();
                sieve4.thread.Join();
            } 
                    else if (squareI > 749999999 & squareI <= 999999999) 
                    {
                sieve4.set(i);
                sieve4.thread.Join();
            }
            }
        }    

        int count = 0;
        primeArray[2] = true;
        for (int i = 2; i < upperLimit; i++) 
            {
            if (primeArray[i]) 
                {
                count++;
            }
        }

        Console.WriteLine("Total: " + count);

        DateTime endTime = DateTime.Now;
        TimeSpan elapsedTime = endTime - startTime;
        Console.WriteLine("Elapsed time: " + elapsedTime.Seconds);
        }

        public class Initialize 
        {
            public Thread thread;
        private int lowerLimit;
        private int upperLimit;

        public Initialize(int lowerLimit, int upperLimit) 
            {
            this.lowerLimit = lowerLimit;
            this.upperLimit = upperLimit;
            thread = new Thread(this.InitializeArray);
            thread.Priority = ThreadPriority.Highest;
            thread.Start();
        }

        private void InitializeArray() 
            {
            for (int i = this.lowerLimit; i <= this.upperLimit; i++) 
                {
                if (i % 2 == 0) 
                    {
                    Controller.primeArray[i] = false;
            } 
                    else 
                    {
                Controller.primeArray[i] = true;
            }
            }
        }
        }

        public class Sieve 
            {
            public Thread thread;
            public int i;
            private int upperLimit;

            public Sieve(int upperLimit) 
                {
                this.upperLimit = upperLimit;
            }

        public void set(int i) 
            {
            this.i = i;
            thread = new Thread(this.primeGen);
            thread.Start();
        }

        public void primeGen() 
            {
            for (int j = this.i * this.i; j <= this.upperLimit; j += i) 
                {
                Controller.primeArray[j] = false;
            }
        }
        }
    }
}

This takes 30 seconds to produce the output, is there any way to speed this up?

Edit: Here is the TPL implementation:

public LinkedList<int> GetPrimeList(int limit) {
        LinkedList<int> primeList = new LinkedList<int>();
        bool[] primeArray = new bool[limit];

        Console.WriteLine("Initialization started...");

        Parallel.For(0, limit, i => {
            if (i % 2 == 0) {
                primeArray[i] = false;
            } else {
                primeArray[i] = true;
            }
        }
        );
        Console.WriteLine("Initialization finished...");

        /*for (int i = 0; i < limit; i++) {
            if (i % 2 == 0) {
                primeArray[i] = false;
            } else {
                primeArray[i] = true;
            }
        }*/

        int sqrtLimit = (int)Math.Sqrt(limit);
        Console.WriteLine("Operation started...");
        Parallel.For(3, sqrtLimit, i => {
            lock (this) {
                if (primeArray[i]) {
                    for (int j = i * i; j < limit; j += i) {
                        primeArray[j] = false;
                    }

                }
            }
        }
        );
        Console.WriteLine("Operation finished...");
        /*for (int i = 3; i < sqrtLimit; i += 2) {
            if (primeArray[i]) {
                for (int j = i * i; j < limit; j += i) {
                    primeArray[j] = false;
                }
            }
        }*/

        //primeList.AddLast(2);
        int count = 1;
        Console.WriteLine("Counting started...");
        Parallel.For(3, limit, i => {
            lock (this) {
                if (primeArray[i]) {
                    //primeList.AddLast(i);
                    count++;
                }
            }
        }
        );
        Console.WriteLine("Counting finished...");
        Console.WriteLine(count);

        /*for (int i = 3; i < limit; i++) {
            if (primeArray[i]) {
                primeList.AddLast(i);
            }
        }*/

        return primeList;
    }

Thank you.

Broadminded answered 15/1, 2011 at 14:30 Comment(4)

Do you have any ideas? What have you already tried? Posting a wall of code and asking us to "fix" it is rarely going to net you good answers. I think you'll find that the amount of work you put into your question is directly proportional to the amount of work people will put into their answers. – Metatherian 15/1, 2011 at 14:46

There is nothing wrong in the code so I am not seeking any "fixation", I just want to know is it possible to speed it up. What I have tried is clear from my code, isn't it? – Broadminded 15/1, 2011 at 15:14

Did you use a profiler? What hot spots, if any, did the profiler find? Did you try running it single-threaded? Was the single-threaded version faster or slower? Did you try varying the number of threads? If more threads is faster, what was the scaling factor? How many processors have you got? Is there any point to creating more than one thread per processor to work on this problem? Why or why not? Have you tried the Task Parallel Library? What were your results using it? – Holliehollifield 15/1, 2011 at 15:42

I also tried it single threaded, execution time: 45 sec. TPL takes 56 sec. I have E7500 core2duo @ 2.93GHz. – Broadminded 15/1, 2011 at 18:1

Edited:

My answer to the question is: Yes, you can definitely use the Task Parallel Library (TPL) to find the primes to one billion faster. The given code(s) in the question is slow because it isn't efficiently using memory or multiprocessing, and final output also isn't efficient.

So other than just multiprocessing, there are a huge number of things you can do to speed up the Sieve of Eratosthenese, as follows:

You sieve all numbers, even and odd, which both uses more memory (one billion bytes for your range of one billion) and is slower due to the unnecessary processing. Just using the fact that two is the only even prime so making the array represent only odd primes would half the memory requirements and reduce the number of composite number cull operations by over a factor of two so that the operation might take something like 20 seconds on your machine for primes to a billion.
Part of the reason that composite number culling over such a huge memory array is so slow is that it greatly exceeds the CPU cache sizes so that many memory accesses are to main memory in a somewhat random fashion meaning that culling a given composite number representation can take over a hundred CPU clock cycles, whereas if they were all in the L1 cache it would only take one cycle and in the L2 cache only about four cycles; not all accesses take the worst case times, but this definitely slows the processing. Using a bit packed array to represent the prime candidates will reduce the use of memory by a factor of eight and make the worst case accesses less common. While there will be a computational overhead to accessing individual bits, you will find there is a net gain as the time saving in reducing average memory access time will be greater than this cost. The simple way to implement this is to use a BitArray rather than an array of bool. Writing your own bit accesses using shift and "and" operations will be more efficient than use of the BitArray class. You will find a slight saving using BitArray and another factor of two doing your own bit operations for a single threaded performance of perhaps about ten or twelve seconds with this change.
Your output of the count of primes found is not very efficient as it requires an array access and an if condition per candidate prime. Once you have the sieve buffer as an array packed word array of bits, you can do this much more efficiently with a counting Look Up Table (LUT) which eliminates the if condition and only needs two array accesses per bit packed word. Doing this, the time to count becomes a negligible part of the work as compared to the time to cull composite numbers, for a further saving to get down to perhaps eight seconds for the count of the primes to one billion.
Further reductions in the number of processed prime candidates can be the result of applying wheel factorization, which removes say the factors of the primes 2, 3, and 5 from the processing and by adjusting the method of bit packing can also increase the effective range of a given size bit buffer by a factor of another about two. This can reduce the number of composite number culling operations by another huge factor of up to over three times, although at the cost of further computational complexity.
In order to further reduce memory use, making memory accesses even more efficient, and preparing the way for multiprocessing per page segment, one can divide the work into pages that are no larger than the L1 or L2 cache sizes. This requires that one keep a base primes table of all the primes up to the square root of the maximum prime candidate and recomputes the starting address parameters of each of the base primes used in culling across a given page segment, but this is still more efficient than using huge culling arrays. An added benefit to implementing this page segmenting is that one then does not have to specify the upper sieving limit in advance but rather can just extend the base primes as necessary as further upper pages are processed. With all of the optimizations to this point, you can likely produce the count of primes up to one billion in about 2.5 seconds.
Finally, one can put the final touches on multiprocessing the page segments using TPL or Threads, which using a buffer size of about the L2 cache size (per core) will produce an addition gain of a factor of two on your dual core non Hyper Threaded (HT) older processor as the Intel E7500 Core2Duo for an execute time to find the number of primes to one billion of about 1.25 seconds or so.

I have implemented a multi-threaded Sieve of Eratosthenes as an answer to another thread to show there isn't any advantage to the Sieve of Atkin over the Sieve of Eratosthenes. It uses the Task Parallel Library (TPL) as in Tasks and TaskFactory so requires at least DotNet Framework 4. I have further tweaked that code using all of the optimizations discussed above as an alternate answer to the same quesion. I re-post that tweaked code here with added comments and easier-to-read formatting, as follows:

  using System;
  using System.Collections;
  using System.Collections.Generic;
  using System.Linq;
  using System.Threading;
  using System.Threading.Tasks;

  class UltimatePrimesSoE : IEnumerable<ulong> {
    #region const and static readonly field's, private struct's and classes

    //one can get single threaded performance by setting NUMPRCSPCS = 1
    static readonly uint NUMPRCSPCS = (uint)Environment.ProcessorCount + 1;
    //the L1CACHEPOW can not be less than 14 and is usually the two raised to the power of the L1 or L2 cache
    const int L1CACHEPOW = 14, L1CACHESZ = (1 << L1CACHEPOW), MXPGSZ = L1CACHESZ / 2; //for buffer ushort[]
    const uint CHNKSZ = 17; //this times BWHLWRDS (below) times two should not be bigger than the L2 cache in bytes
    //the 2,3,57 factorial wheel increment pattern, (sum) 48 elements long, starting at prime 19 position
    static readonly byte[] WHLPTRN = { 2,3,1,3,2,1,2,3,3,1,3,2,1,3,2,3,4,2,1,2,1,2,4,3,
                                       2,3,1,2,3,1,3,3,2,1,2,3,1,3,2,1,2,1,5,1,5,1,2,1 }; const uint FSTCP = 11;
    static readonly byte[] WHLPOS; static readonly byte[] WHLNDX; //look up wheel position from index and vice versa
    static readonly byte[] WHLRNDUP; //to look up wheel rounded up index positon values, allow for overflow in size
    static readonly uint WCRC = WHLPTRN.Aggregate(0u, (acc, n) => acc + n); //small wheel circumference for odd numbers
    static readonly uint WHTS = (uint)WHLPTRN.Length; static readonly uint WPC = WHTS >> 4; //number of wheel candidates
    static readonly byte[] BWHLPRMS = { 2,3,5,7,11,13,17 }; const uint FSTBP = 19; //big wheel primes, following prime
    //the big wheel circumference expressed in number of 16 bit words as in a minimum bit buffer size
    static readonly uint BWHLWRDS = BWHLPRMS.Aggregate(1u, (acc, p) => acc * p) / 2 / WCRC * WHTS / 16;
    //page size and range as developed from the above
    static readonly uint PGSZ = MXPGSZ / BWHLWRDS * BWHLWRDS; static readonly uint PGRNG = PGSZ * 16 / WHTS * WCRC;
    //buffer size (multiple chunks) as produced from the above
    static readonly uint BFSZ = CHNKSZ * PGSZ, BFRNG = CHNKSZ * PGRNG; //number of uints even number of caches in chunk
    static readonly ushort[] MCPY; //a Master Copy page used to hold the lower base primes preculled version of the page
    struct Wst { public ushort msk; public byte mlt; public byte xtr; public ushort nxt; }
    static readonly byte[] PRLUT; /*Wheel Index Look Up Table */ static readonly Wst[] WSLUT; //Wheel State Look Up Table
    static readonly byte[] CLUT; // a Counting Look Up Table for very fast counting of primes

    class Bpa { //very efficient auto-resizing thread-safe read-only indexer class to hold the base primes array
      byte[] sa = new byte[0]; uint lwi = 0, lpd = 0; object lck = new object();
      public uint this[uint i] {
        get {
          if (i >= this.sa.Length) lock (this.lck) {
              var lngth = this.sa.Length; while (i >= lngth) {
                var bf = (ushort[])MCPY.Clone(); if (lngth == 0) {
                  for (uint bi = 0, wi = 0, w = 0, msk = 0x8000, v = 0; w < bf.Length;
                      bi += WHLPTRN[wi++], wi = (wi >= WHTS) ? 0 : wi) {
                    if (msk >= 0x8000) { msk = 1; v = bf[w++]; } else msk <<= 1;
                    if ((v & msk) == 0) {
                      var p = FSTBP + (bi + bi); var k = (p * p - FSTBP) >> 1;
                      if (k >= PGRNG) break; var pd = p / WCRC; var kd = k / WCRC; var kn = WHLNDX[k - kd * WCRC];
                      for (uint wrd = kd * WPC + (uint)(kn >> 4), ndx = wi * WHTS + kn; wrd < bf.Length; ) {
                        var st = WSLUT[ndx]; bf[wrd] |= st.msk; wrd += st.mlt * pd + st.xtr; ndx = st.nxt;
                      }
                    }
                  }
                }
                else { this.lwi += PGRNG; cullbf(this.lwi, bf); }
                var c = count(PGRNG, bf); var na = new byte[lngth + c]; sa.CopyTo(na, 0);
                for (uint p = FSTBP + (this.lwi << 1), wi = 0, w = 0, msk = 0x8000, v = 0;
                    lngth < na.Length; p += (uint)(WHLPTRN[wi++] << 1), wi = (wi >= WHTS) ? 0 : wi) {
                  if (msk >= 0x8000) { msk = 1; v = bf[w++]; } else msk <<= 1; if ((v & msk) == 0) {
                    var pd = p / WCRC; na[lngth++] = (byte)(((pd - this.lpd) << 6) + wi); this.lpd = pd;
                  }
                } this.sa = na;
              }
            } return this.sa[i];
        }
      }
    }
    static readonly Bpa baseprms = new Bpa(); //the base primes array using the above class

    struct PrcsSpc { public Task tsk; public ushort[] buf; } //used for multi-threading buffer array processing

    #endregion

    #region private static methods

    static int count(uint bitlim, ushort[] buf) { //very fast counting method using the CLUT look up table
      if (bitlim < BFRNG) {
        var addr = (bitlim - 1) / WCRC; var bit = WHLNDX[bitlim - addr * WCRC] - 1; addr *= WPC;
        for (var i = 0; i < 3; ++i) buf[addr++] |= (ushort)((unchecked((ulong)-2) << bit) >> (i << 4));
      }
      var acc = 0; for (uint i = 0, w = 0; i < bitlim; i += WCRC)
        acc += CLUT[buf[w++]] + CLUT[buf[w++]] + CLUT[buf[w++]]; return acc;
    }

    static void cullbf(ulong lwi, ushort[] b) { //fast buffer segment culling method using a Wheel State Look Up Table
      ulong nlwi = lwi;
      for (var i = 0u; i < b.Length; nlwi += PGRNG, i += PGSZ) MCPY.CopyTo(b, i); //copy preculled lower base primes.
      for (uint i = 0, pd = 0; ; ++i) {
        pd += (uint)baseprms[i] >> 6;
        var wi = baseprms[i] & 0x3Fu; var wp = (uint)WHLPOS[wi]; var p = pd * WCRC + PRLUT[wi];
        var k = ((ulong)p * (ulong)p - FSTBP) >> 1;
        if (k >= nlwi) break; if (k < lwi) {
          k = (lwi - k) % (WCRC * p);
          if (k != 0) {
            var nwp = wp + (uint)((k + p - 1) / p); k = (WHLRNDUP[nwp] - wp) * p - k;
          }
        }
        else k -= lwi; var kd = k / WCRC; var kn = WHLNDX[k - kd * WCRC];
        for (uint wrd = (uint)kd * WPC + (uint)(kn >> 4), ndx = wi * WHTS + kn; wrd < b.Length; ) {
          var st = WSLUT[ndx]; b[wrd] |= st.msk; wrd += st.mlt * pd + st.xtr; ndx = st.nxt;
        }
      }
    }

    static Task cullbftsk(ulong lwi, ushort[] b, Action<ushort[]> f) { // forms a task of the cull buffer operaion
      return Task.Factory.StartNew(() => { cullbf(lwi, b); f(b); });
    }

    //iterates the action over each page up to the page including the top_number,
    //making an adjustment to the top limit for the last page.
    //this method works for non-dependent actions that can be executed in any order.
    static void IterateTo(ulong top_number, Action<ulong, uint, ushort[]> actn) {
      PrcsSpc[] ps = new PrcsSpc[NUMPRCSPCS]; for (var s = 0u; s < NUMPRCSPCS; ++s) ps[s] = new PrcsSpc {
        buf = new ushort[BFSZ],
        tsk = Task.Factory.StartNew(() => { })
      };
      var topndx = (top_number - FSTBP) >> 1; for (ulong ndx = 0; ndx <= topndx; ) {
        ps[0].tsk.Wait(); var buf = ps[0].buf; for (var s = 0u; s < NUMPRCSPCS - 1; ++s) ps[s] = ps[s + 1];
        var lowi = ndx; var nxtndx = ndx + BFRNG; var lim = topndx < nxtndx ? (uint)(topndx - ndx + 1) : BFRNG;
        ps[NUMPRCSPCS - 1] = new PrcsSpc { buf = buf, tsk = cullbftsk(ndx, buf, (b) => actn(lowi, lim, b)) };
        ndx = nxtndx;
      } for (var s = 0u; s < NUMPRCSPCS; ++s) ps[s].tsk.Wait();
    }

    //iterates the predicate over each page up to the page where the predicate paramenter returns true,
    //this method works for dependent operations that need to be executed in increasing order.
    //it is somewhat slower than the above as the predicate function is executed outside the task.
    static void IterateUntil(Func<ulong, ushort[], bool> prdct) {
      PrcsSpc[] ps = new PrcsSpc[NUMPRCSPCS];
      for (var s = 0u; s < NUMPRCSPCS; ++s) {
        var buf = new ushort[BFSZ];
        ps[s] = new PrcsSpc { buf = buf, tsk = cullbftsk(s * BFRNG, buf, (bfr) => { }) };
      }
      for (var ndx = 0UL; ; ndx += BFRNG) {
        ps[0].tsk.Wait(); var buf = ps[0].buf; var lowi = ndx; if (prdct(lowi, buf)) break;
        for (var s = 0u; s < NUMPRCSPCS - 1; ++s) ps[s] = ps[s + 1];
        ps[NUMPRCSPCS - 1] = new PrcsSpc {
          buf = buf,
          tsk = cullbftsk(ndx + NUMPRCSPCS * BFRNG, buf, (bfr) => { })
        };
      }
    }

    #endregion

    #region initialization

    /// <summary>
    /// the static constructor is used to initialize the static readonly arrays.
    /// </summary>
    static UltimatePrimesSoE() {
      WHLPOS = new byte[WHLPTRN.Length + 1]; //to look up wheel position index from wheel index
      for (byte i = 0, acc = 0; i < WHLPTRN.Length; ++i) { acc += WHLPTRN[i]; WHLPOS[i + 1] = acc; }
      WHLNDX = new byte[WCRC + 1]; for (byte i = 1; i < WHLPOS.Length; ++i) {
        for (byte j = (byte)(WHLPOS[i - 1] + 1); j <= WHLPOS[i]; ++j) WHLNDX[j] = i;
      }
      WHLRNDUP = new byte[WCRC * 2]; for (byte i = 1; i < WHLRNDUP.Length; ++i) {
        if (i > WCRC) WHLRNDUP[i] = (byte)(WCRC + WHLPOS[WHLNDX[i - WCRC]]); else WHLRNDUP[i] = WHLPOS[WHLNDX[i]];
      }
      Func<ushort, int> nmbts = (v) => { var acc = 0; while (v != 0) { acc += (int)v & 1; v >>= 1; } return acc; };
      CLUT = new byte[1 << 16]; for (var i = 0; i < CLUT.Length; ++i) CLUT[i] = (byte)nmbts((ushort)(i ^ -1));
      PRLUT = new byte[WHTS]; for (var i = 0; i < PRLUT.Length; ++i) {
        var t = (uint)(WHLPOS[i] * 2) + FSTBP; if (t >= WCRC) t -= WCRC; if (t >= WCRC) t -= WCRC; PRLUT[i] = (byte)t;
      }
      WSLUT = new Wst[WHTS * WHTS]; for (var x = 0u; x < WHTS; ++x) {
        var p = FSTBP + 2u * WHLPOS[x]; var pr = p % WCRC;
        for (uint y = 0, pos = (p * p - FSTBP) / 2; y < WHTS; ++y) {
          var m = WHLPTRN[(x + y) % WHTS];
          pos %= WCRC; var posn = WHLNDX[pos]; pos += m * pr; var nposd = pos / WCRC; var nposn = WHLNDX[pos - nposd * WCRC];
          WSLUT[x * WHTS + posn] = new Wst {
            msk = (ushort)(1 << (int)(posn & 0xF)),
            mlt = (byte)(m * WPC),
            xtr = (byte)(WPC * nposd + (nposn >> 4) - (posn >> 4)),
            nxt = (ushort)(WHTS * x + nposn)
          };
        }
      }
      MCPY = new ushort[PGSZ]; foreach (var lp in BWHLPRMS.SkipWhile(p => p < FSTCP)) {
        var p = (uint)lp;
        var k = (p * p - FSTBP) >> 1; var pd = p / WCRC; var kd = k / WCRC; var kn = WHLNDX[k - kd * WCRC];
        for (uint w = kd * WPC + (uint)(kn >> 4), ndx = WHLNDX[(2 * WCRC + p - FSTBP) / 2] * WHTS + kn; w < MCPY.Length; ) {
          var st = WSLUT[ndx]; MCPY[w] |= st.msk; w += st.mlt * pd + st.xtr; ndx = st.nxt;
        }
      }
    }

    #endregion

    #region public class

    // this class implements the enumeration (IEnumerator).
    //    it works by farming out tasks culling pages, which it then processes in order by
    //    enumerating the found primes as recognized by the remaining non-composite bits
    //    in the cull page buffers.
    class nmrtr : IEnumerator<ulong>, IEnumerator, IDisposable {
      PrcsSpc[] ps = new PrcsSpc[NUMPRCSPCS]; ushort[] buf;
      public nmrtr() {
        for (var s = 0u; s < NUMPRCSPCS; ++s) ps[s] = new PrcsSpc { buf = new ushort[BFSZ] };
        for (var s = 1u; s < NUMPRCSPCS; ++s) {
          ps[s].tsk = cullbftsk((s - 1u) * BFRNG, ps[s].buf, (bfr) => { });
        } buf = ps[0].buf;
      }
      ulong _curr, i = (ulong)-WHLPTRN[WHTS - 1]; int b = -BWHLPRMS.Length - 1; uint wi = WHTS - 1; ushort v, msk = 0;
      public ulong Current { get { return this._curr; } } object IEnumerator.Current { get { return this._curr; } }
      public bool MoveNext() {
        if (b < 0) {
          if (b == -1) b += buf.Length; //no yield!!! so automatically comes around again
          else { this._curr = (ulong)BWHLPRMS[BWHLPRMS.Length + (++b)]; return true; }
        }
        do {
          i += WHLPTRN[wi++]; if (wi >= WHTS) wi = 0; if ((this.msk <<= 1) == 0) {
            if (++b >= BFSZ) {
              b = 0; for (var prc = 0; prc < NUMPRCSPCS - 1; ++prc) ps[prc] = ps[prc + 1];
              ps[NUMPRCSPCS - 1u].buf = buf;
              ps[NUMPRCSPCS - 1u].tsk = cullbftsk(i + (NUMPRCSPCS - 1u) * BFRNG, buf, (bfr) => { });
              ps[0].tsk.Wait(); buf = ps[0].buf;
            } v = buf[b]; this.msk = 1;
          }
        }
        while ((v & msk) != 0u); _curr = FSTBP + i + i; return true;
      }
      public void Reset() { throw new Exception("Primes enumeration reset not implemented!!!"); }
      public void Dispose() { }
    }

    #endregion

    #region public instance method and associated sub private method

    /// <summary>
    /// Gets the enumerator for the primes.
    /// </summary>
    /// <returns>The enumerator of the primes.</returns>
    public IEnumerator<ulong> GetEnumerator() { return new nmrtr(); }

    /// <summary>
    /// Gets the enumerator for the primes.
    /// </summary>
    /// <returns>The enumerator of the primes.</returns>
    IEnumerator IEnumerable.GetEnumerator() { return new nmrtr(); }

    #endregion

    #region public static methods

    /// <summary>
    /// Gets the count of primes up the number, inclusively.
    /// </summary>
    /// <param name="top_number">The ulong top number to check for prime.</param>
    /// <returns>The long number of primes found.</returns>
    public static long CountTo(ulong top_number) {
      if (top_number < FSTBP) return BWHLPRMS.TakeWhile(p => p <= top_number).Count();
      var cnt = (long)BWHLPRMS.Length;
      IterateTo(top_number, (lowi, lim, b) => { Interlocked.Add(ref cnt, count(lim, b)); }); return cnt;
    }

    /// <summary>
    /// Gets the sum of the primes up the number, inclusively.
    /// </summary>
    /// <param name="top_number">The uint top number to check for prime.</param>
    /// <returns>The ulong sum of all the primes found.</returns>
    public static ulong SumTo(uint top_number) {
      if (top_number < FSTBP) return (ulong)BWHLPRMS.TakeWhile(p => p <= top_number).Aggregate(0u, (acc, p) => acc += p);
      var sum = (long)BWHLPRMS.Aggregate(0u, (acc, p) => acc += p);
      Func<ulong, uint, ushort[], long> sumbf = (lowi, bitlim, buf) => {
        var acc = 0L; for (uint i = 0, wi = 0, msk = 0x8000, w = 0, v = 0; i < bitlim;
            i += WHLPTRN[wi++], wi = wi >= WHTS ? 0 : wi) {
          if (msk >= 0x8000) { msk = 1; v = buf[w++]; } else msk <<= 1;
          if ((v & msk) == 0) acc += (long)(FSTBP + ((lowi + i) << 1));
        } return acc;
      };
      IterateTo(top_number, (pos, lim, b) => { Interlocked.Add(ref sum, sumbf(pos, lim, b)); }); return (ulong)sum;
    }

    /// <summary>
    /// Gets the prime number at the zero based index number given.
    /// </summary>
    /// <param name="index">The long zero-based index number for the prime.</param>
    /// <returns>The ulong prime found at the given index.</returns>
    public static ulong ElementAt(long index) {
      if (index < BWHLPRMS.Length) return (ulong)BWHLPRMS.ElementAt((int)index);
      long cnt = BWHLPRMS.Length; var ndx = 0UL; var cycl = 0u; var bit = 0u; IterateUntil((lwi, bfr) => {
        var c = count(BFRNG, bfr); if ((cnt += c) < index) return false; ndx = lwi; cnt -= c; c = 0;
        do { var w = cycl++ * WPC; c = CLUT[bfr[w++]] + CLUT[bfr[w++]] + CLUT[bfr[w]]; cnt += c; } while (cnt < index);
        cnt -= c; var y = (--cycl) * WPC; ulong v = ((ulong)bfr[y + 2] << 32) + ((ulong)bfr[y + 1] << 16) + bfr[y];
        do { if ((v & (1UL << ((int)bit++))) == 0) ++cnt; } while (cnt <= index); --bit; return true;
      }); return FSTBP + ((ndx + cycl * WCRC + WHLPOS[bit]) << 1);
    }

    #endregion
  }

The above code will enumerate the primes to one billion in about 1.55 seconds on a four core (eight threads including HT) i7-2700K (3.5 GHz) and your E7500 will be perhaps up to four times slower due to less threads and slightly less clock speed. About three quarters of that time is just the time to run the enumeration MoveNext() method and Current property, so I provide the public static methods "CountTo", "SumTo" and "ElementAt" to compute the number or sum of primes in a range and the nth zero-based prime, respectively, without using enumeration. Using the UltimatePrimesSoE.CountTo(1000000000) static method produces 50847534 in about 0.32 seconds on my machine, so shouldn't take longer than about 1.28 seconds on the Intel E7500.

EDIT_ADD: Interestingly, this code runs 30% faster in x86 32-bit mode than in x64 64-bit mode, likely due to avoiding the slight extra overhead of extending the uint32 numbers to ulong's. All of the above timings are for 64-bit mode. END_EDIT_ADD

At almost 300 (dense) lines of code, this implementation isn't simple, but that's the cost of doing all of the described optimizations that make this code so efficient. It isn't all that many more lines of code that the other answer by Aaron Murgatroyd; although his code is less dense, his code is also about four times as slow. In fact, almost all of the execution time is spent in the final "for loop" of the my code's private static "cullbf" method, which is only four statements long plus the range condition check; all the rest of the code is just in support of repeated applications of that loop.

The reasons that this code is faster than that other answer are for the same reasons that this code is faster than your code other than he does the Step (1) optimization of only processing odd prime candidates. His use of multiprocessing is almost completely ineffective as in only a 30% advantage rather than the factor of four that should be possible on a true four core CPU when applied correctly as it threads per prime rather than for all primes over small pages, and his use of unsafe pointer array access as a method of eliminating the DotNet computational cost of an array bound check per loop actually slows the code compared to just using arrays directly including the bounds check as the DotNet Just In Time (JIT) compiler produces quite inefficient code for pointer access. In addition, his code enumerates the primes just as my code can do, which enumeration has a 10's of CPU clock cycle cost per enumerated prime, which is also slightly worse in his case as he uses the built-in C# iterators which are somewhat less efficient than my "roll-your-own" IEnumerator interface. However, for maximum speed, we should avoid enumeration entirely; however even his supplied "Count" instance method uses a "foreach" loop which means enumeration.

In summary, this answer code produces prime answers about 25 times faster than your question's code on your E7500 CPU (many more times faster on a CPU with more cores/threads) uses much less memory, and is not limited to smaller prime ranges of about the 32-bit number range, but at a cost of increased code complexity.

Marge answered 19/9, 2013 at 1:49 Comment(7)

A very nice write-up. Thanks! I tried to compile the code (I found I have csc 4.0.30319 on my system...) and there seem to be some generic types details and using statements missing, so some fiddling required, did't work for me, yet. No sweat, this is most likely a local problem, since that's my first ever go at C#. – Wirth 26/6, 2015 at 13:26

@mgaert, I have added the missing using's. BTW, I have found a way to make this about two to four times as fast so it runs only about 1.5 to 2 times slower than primesieve and faster than SoA primegen but haven't taken the time to write it up yet. The new method is based on the Bernstein SoE reference implementation used to compare against the SoA, but not crippled as compared to the limits on the amount of wheel factorization applied to his "eratspeed" implementation. It culls composite numbers by modulo sets rather than in modulo order, thus reducing the complexity of the inner cull loop. – Marge 13/11, 2015 at 3:53

@Marge Hey Gordon, would you be willing to break this down to teachable stages or tasks? The reason I ask is I would like to learn and implement how you do this - it's just really hard for me to follow. I built a Prime number benchmarking application for testing CPU performance and it's gaining a lot of traction in the overclocking world. I can't use your algorithm for personal/moral reasons despite it working 12x faster than what I have, but I could definitely learn from this. I refuse to use what I don't understand. I would like to take what I learn and do a write-up on my website. – Inextirpable 7/9, 2016 at 15:47

@RagingCain, it is already broken down into fairly small methods with comments; however, I could add some comments as to how the overall algorithm works. The hardest part to understand is likely that it uses the bit-packed wheel array to store the marked composite number representation, then "cullbf" uses a Wheel State LUT array WSLUT to advance culling per prime through the bit-packed buffer, using the pre-generated Look Up Tables (LUT's) WHLPOS, PRLUT and WHLRNDUP to calculate the first bit in the buffer to cull. The buffer is initialized with the right part of MCPY; preculled of 11/13/17. – Marge 7/9, 2016 at 23:53

@RagingCain, most of the time is spent in the final "for" loop of "cullbf" culling composite representations using the Wheel State Look Up Table (WSLUT); however, that loop is one of the limitations of this algorithm as it takes from 10 to over 20 CPU clock cycles per loop (range dependent on CPU efficiency/CPU make and model) due to the multiple array accesses in that inner loop. In Kim Walisch's C++ "primesieve", which uses a similar technique, he does major "un-rolling the loop", but that is not really possible in C#. However, there are other techniques which can speed this even further. – Marge 8/9, 2016 at 0:18

@RagingCain, As per a previous comment, this time can be reduced by a factor of 2 to 3 (again depending on CPU) by splitting the culled buffer into 48 bit-packed buffers with each buffer containing only one wheel modulo. It is both simpler and perhaps somewhat more complex: the inner loop is very simple other than handling the bit-packing, but the code to support that inner loop is somewhat complex in advancing through the modulo buffers by primes while avoiding completely recalculating the starting point per modulo - an computationally expensive operation - by using pre-calculated LUT's. – Marge 8/9, 2016 at 0:28

@RagingCain, how about you post your web-site and I can likely contact you directly through that, perhaps submitting a couple of articles explaining the different algorithms that can be linked from here? It would also help if I saw your current expertise by looking at your current code in order to be able to help you extend your knowledge. – Marge 8/9, 2016 at 0:34

My implementation with multi threading (.NET 4.0 required):

using System;
using System.Collections;
using System.Collections.Generic;
using System.Threading.Tasks;

namespace PrimeGenerator
{
    // The block element type for the bit array, 
    // use any unsigned value. WARNING: UInt64 is 
    // slower even on x64 architectures.
    using BitArrayType = System.UInt32;

    // This should never be any bigger than 256 bits - leave as is.
    using BitsPerBlockType = System.Byte;

    // The prime data type, this can be any unsigned value, the limit
    // of this type determines the limit of Prime value that can be
    // found. WARNING: UInt64 is slower even on x64 architectures.
    using PrimeType = System.UInt32;

    /// <summary>
    /// Calculates prime number using the Sieve of Eratosthenes method.
    /// </summary>
    /// <example>
    /// <code>
    ///     var lpPrimes = new Eratosthenes(1e7);
    ///     foreach (UInt32 luiPrime in lpPrimes)
    ///         Console.WriteLine(luiPrime);
    /// </example>
    public class Eratosthenes : IEnumerable<PrimeType>
    {
        #region Constants

        /// <summary>
        /// Constant for number of bits per block, calculated based on size of BitArrayType.
        /// </summary>
        const BitsPerBlockType cbBitsPerBlock = sizeof(BitArrayType) * 8;

        #endregion

        #region Protected Locals

        /// <summary>
        /// The limit for the maximum prime value to find.
        /// </summary>
        protected readonly PrimeType mpLimit;

        /// <summary>
        /// True if the class is multi-threaded
        /// </summary>
        protected readonly bool mbThreaded;

        /// <summary>
        /// The current bit array where a set bit means
        /// the odd value at that location has been determined
        /// to not be prime.
        /// </summary>
        protected BitArrayType[] mbaOddNotPrime;

        #endregion

        #region Initialisation

        /// <summary>
        /// Create Sieve of Eratosthenes generator.
        /// </summary>
        /// <param name="limit">The limit for the maximum prime value to find.</param>
        /// <param name="threaded">True if threaded, false otherwise.</param>
        public Eratosthenes(PrimeType limit, bool threaded)
        {
            // Check limit range
            if (limit > PrimeType.MaxValue - (PrimeType)Math.Sqrt(PrimeType.MaxValue))
                throw new ArgumentOutOfRangeException();

            mbThreaded = threaded;
            mpLimit = limit;

            FindPrimes();
        }

        /// <summary>
        /// Create Sieve of Eratosthenes generator in multi-threaded mode.
        /// </summary>
        /// <param name="limit">The limit for the maximum prime value to find.</param>
        public Eratosthenes(PrimeType limit)
            : this(limit, true)
        {
        }

        #endregion

        #region Private Methods

        /// <summary>
        /// Calculates compartment indexes for a multi-threaded operation.
        /// </summary>
        /// <param name="startInclusive">The inclusive starting index.</param>
        /// <param name="endExclusive">The exclusive ending index.</param>
        /// <param name="threads">The number of threads.</param>
        /// <returns>An array of thread elements plus 1 containing the starting and exclusive ending indexes to process for each thread.</returns>
        private PrimeType[] CalculateCompartments(PrimeType startInclusive, PrimeType endExclusive, ref int threads)
        {
            if (threads == 0) threads = 1;
            if (threads == -1) threads = Environment.ProcessorCount;
            if (threads > endExclusive - startInclusive) threads = (int)(endExclusive - startInclusive);

            PrimeType[] liThreadIndexes = new PrimeType[threads + 1];
            liThreadIndexes[threads] = endExclusive;
            PrimeType liIndexesPerThread = (endExclusive - startInclusive) / (PrimeType)threads;
            for (PrimeType liCount = 0; liCount < threads; liCount++)
                liThreadIndexes[liCount] = liCount * liIndexesPerThread;

            return liThreadIndexes;
        }

        /// <summary>
        /// Executes a simple for loop in parallel but only creates
        /// a set amount of threads breaking the work up evenly per thread,
        /// calling the body only once per thread, this is different
        /// to the .NET 4.0 For method which calls the body for each index.
        /// </summary>
        /// <typeparam name="ParamType">The type of parameter to pass to the body.</typeparam>
        /// <param name="startInclusive">The starting index.</param>
        /// <param name="endExclusive">The exclusive ending index.</param>
        /// <param name="parameter">The parameter to pass to the body.</param>
        /// <param name="body">The body to execute per thread.</param>
        /// <param name="threads">The number of threads to execute.</param>
        private void For<ParamType>(
            PrimeType startInclusive, PrimeType endExclusive, ParamType parameter,
            Action<PrimeType, PrimeType, ParamType> body,
            int threads)
        {
            PrimeType[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, new System.Threading.Tasks.ParallelOptions(),
                    (liThread) => { body(liThreadIndexes[liThread], liThreadIndexes[liThread + 1], parameter); }
                );
            else
                body(startInclusive, endExclusive, parameter);
        }

        /// <summary>
        /// Finds the prime number within range.
        /// </summary>
        private unsafe void FindPrimes()
        {
            // Allocate bit array.
            mbaOddNotPrime = new BitArrayType[(((mpLimit >> 1) + 1) / cbBitsPerBlock) + 1];

            // Cache Sqrt of limit.
            PrimeType lpSQRT = (PrimeType)Math.Sqrt(mpLimit);

            int liThreads = Environment.ProcessorCount;
            if (!Threaded) liThreads = 0;

            // Fix the bit array for pointer access
            fixed (BitArrayType* lpbOddNotPrime = &mbaOddNotPrime[0])
            {
                IntPtr lipBits = (IntPtr)lpbOddNotPrime;

                // Scan primes up to lpSQRT
                for (PrimeType lpN = 3; lpN <= lpSQRT; lpN += 2)
                {
                    // If the current bit value for index lpN is cleared (prime)
                    if (
                            (
                                lpbOddNotPrime[(lpN >> 1) / cbBitsPerBlock] &
                                ((BitArrayType)1 << (BitsPerBlockType)((lpN >> 1) % cbBitsPerBlock))
                            ) == 0
                        )
                    {
                        // If multi-threaded
                        if (liThreads > 1)
                        {
                            // Leave it cleared (prime) and mark all multiples of lpN*2 from lpN*lpN as not prime
                            For<PrimeType>(
                                0, ((mpLimit - (lpN * lpN)) / (lpN << 1)) + 1, lpN,
                                (start, end, n) =>
                                {
                                    BitArrayType* lpbBits = (BitArrayType*)lipBits;
                                    PrimeType lpM = n * n + (start * (n << 1));
                                    for (PrimeType lpCount = start; lpCount < end; lpCount++)
                                    {
                                        // Set as not prime
                                        lpbBits[(lpM >> 1) / cbBitsPerBlock] |=
                                            (BitArrayType)((BitArrayType)1 << (BitsPerBlockType)((lpM >> 1) % cbBitsPerBlock));

                                        lpM += n << 1;
                                    }
                                },
                                liThreads);
                        }
                        else
                        {
                            // Leave it cleared (prime) and mark all multiples of lpN*2 from lpN*lpN as not prime
                            for (PrimeType lpM = lpN * lpN; lpM <= mpLimit; lpM += lpN<<1)
                                // Set as not prime
                                lpbOddNotPrime[(lpM >> 1) / cbBitsPerBlock] |=
                                    (BitArrayType)((BitArrayType)1 << (BitsPerBlockType)((lpM >> 1) % cbBitsPerBlock));
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Gets a bit value by index.
        /// </summary>
        /// <param name="bits">The blocks containing the bits.</param>
        /// <param name="index">The index of the bit.</param>
        /// <returns>True if bit is set, false if cleared.</returns>
        private bool GetBitSafe(BitArrayType[] bits, PrimeType index)
        {
            return (bits[index / cbBitsPerBlock] & ((BitArrayType)1 << (BitsPerBlockType)(index % cbBitsPerBlock))) != 0;
        }

        #endregion

        #region Public Properties

        /// <summary>
        /// Gets whether this class is multi-threaded or not.
        /// </summary>
        public bool Threaded
        {
            get
            {
                return mbThreaded;
            }
        }

        /// <summary>
        /// Get the limit for the maximum prime value to find.
        /// </summary>
        public PrimeType Limit
        {
            get
            {
                return mpLimit;
            }
        }

        /// <summary>
        /// Returns the number of primes found in the range.
        /// </summary>
        public PrimeType Count
        {
            get
            {
                PrimeType lptCount = 0;
                foreach (PrimeType liPrime in this)
                    lptCount++;
                return lptCount;
            }
        }

        /// <summary>
        /// Determines if a value in range is prime or not.
        /// </summary>
        /// <param name="test">The value to test for primality.</param>
        /// <returns>True if the value is prime, false otherwise.</returns>
        public bool this[PrimeType test]
        {
            get
            {
                if (test > mpLimit) throw new ArgumentOutOfRangeException();
                if (test <= 1) return false;
                if (test == 2) return true;
                if ((test & 1) == 0) return false;
                return !GetBitSafe(mbaOddNotPrime, test >> 1);
            }
        }

        #endregion

        #region Public Methods

        /// <summary>
        /// Gets the enumerator for the primes.
        /// </summary>
        /// <returns>The enumerator of the primes.</returns>
        public IEnumerator<PrimeType> GetEnumerator()
        {
            // Two always prime.
            yield return 2;

            // Start at first block, second MSB.
            int liBlock = 0;
            byte lbBit = 1;
            BitArrayType lbaCurrent = mbaOddNotPrime[0] >> 1;

            // For each value in range stepping in incrments of two for odd values.
            for (PrimeType lpN = 3; lpN <= mpLimit; lpN += 2)
            {
                // If current bit not set then value is prime.
                if ((lbaCurrent & 1) == 0)
                    yield return lpN;

                // Move to NSB.
                lbaCurrent >>= 1;

                // Increment bit value.
                lbBit++;

                // If block is finished.
                if (lbBit == cbBitsPerBlock) 
                {
                    // Move to first bit of next block.
                    lbBit = 0;
                    liBlock++;
                    lbaCurrent = mbaOddNotPrime[liBlock];
                }
            }
        }

        #endregion

        #region IEnumerable<PrimeType> Implementation

        /// <summary>
        /// Gets the enumerator for the primes.
        /// </summary>
        /// <returns></returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }

        #endregion
    }
}

The multi-threading works by threading the inner most loop, this way there are no data locking issues because the multiple threads work with a subset of the array and dont overlap for each job done.

Seems to be quite fast, can generate all primes up to a limit of 1,000,000,000 on an AMD Phenom II X4 965 processor in 5.8 seconds. Special implementations like the Atkins are faster, but this is fast for the Sieve of Eratosthenes.

Unpaged answered 14/3, 2012 at 11:11 Comment(4)

Good, but you miss two optimizations: 1) in multi-threading your inner loop array range continuously increases in size until it considerably exceeds L1 cache size for increasing inefficiency in memory access times, and 2) you only eliminate even number culls whereas a full wheel factorization could eliminate more culls at some cost in increased complexity. Using these two further optimizations, I find the primes to one billion in 1.975 seconds and up to 4294967295 in 8.738 seconds on a machine not much faster than yours: https://mcmap.net/q/138502/-c-how-to-make-sieve-of-atkin-incremental. – Marge 11/10, 2013 at 5:24

cont'd: The above link only uses 2,3,5 wheel factorization but slight further gains could be made using 2,3,5,7,11,13 wheel factorization at a further cost in complexity; however, there is little point in that about two thirds of the time is spent enumerating the found primes in the foreground thread, although it might reduce run times by about a few percent due to increasing the packing of the composite number array. The above code might also benefit from using unsafe code as you do, which is not currently used. – Marge 11/10, 2013 at 5:37

Not bad but two further comments: 1) there's not much point to multi-threading here as on my i7 (eight threads) CPU there's only about a 30% speed gain using multi-threading compared to not using it; this is due to extra overheads in threading over all ranges per prime, which is quite inefficient as to memory access as mentioned in my first comment. 2) Your use of unsafe pointers actually slows the program slightly compared to just using the array directly; this is because DotNet JIT compiler doesn't optimize pointers very well even though they eliminate the array bounds checks. – Marge 3/12, 2013 at 22:15

@cont'd: Also, your inner loop is somewhat over complex and can be reduced to "mbaOddNotPrime[(lpM) / cbBitsPerBlock] |= (BitArrayType)1 << (int)lpM;" with an appropriate adjustment to the initial condition for lpM, its limit (where used) and the lpM increment, all as divided by 2 (>> 1). This works because there is already an automatic mask (% cbBitsPerBlock) of the left shift value so another is redundant, and eliminates the continual multiple ">> 1" operations for each loop. It's still slow though due to the huge memory access overhead when the array size exceeds the CPU cache sizes. – Marge 3/12, 2013 at 23:3

A while back I tried to implement The Sieve of Atkin in parallell. It was a failure. I haven't done any deeper research but it seems that both Sieve Of Eratosthenes and The Sieve of Atkin are hard to scale over multiple CPUs because the implementations I've seen uses a list of integers that is shared. Shared state is a heavy anchor to carry when you try to scale over multiple CPUs.

Pot answered 15/1, 2011 at 15:19 Comment(4)

@TapasBose, you quickly accepted an answer that basically says it can't likely be done any better than the question code? The problem with both the question code and Jonas's work is that they consider applying fine scale parallelism to the problem rather than coarser scale. The correct approach to the problem is, having divided the work into page segments, to assign threads/tasks to the work of culling each of the segment pages, sharing only the source of base primes used in the culling and the aggregation of the final results; in this way, there is very little synchronization required. – Marge 17/11, 2015 at 2:21

@GordonBGood, yes sir, I did have accepted the answer and it was quite a long ago; perhaps you are right about the reason of the acceptance. I am out of touch with C# for long time and due to professional reason I am now working with Java. Recently I have written a piece of code of SoE in Java8 using Lamdba and Parallel computing. I have seen your work in other SO threads and those are awesome. – Broadminded 17/11, 2015 at 13:27

I should have posted this as a comment and not an answer. Sorry. – Barouche 17/11, 2015 at 15:35

@TapasBose, although the question was for C#, the algorithm applies to any language including Java, especially now that Java8 supports tasks which makes the multi-threading easier. I have translated this C# code to Java (and to Scala) and the code in those languages runs at about the same speed as it does in C#. – Marge 17/11, 2015 at 22:26

Hot tags

Godot Unity Godot Help Programming Godot 4.X GUI GDScript 3D 2D Physics CSharp Godot 3.X VR XR Projects C++

Recommended topics

Hot tags