Why is Parallel.Invoke much faster if the call is in a separate method?
Asked Answered
G

3

15

I implemented the QuickSort-Algorithm 3 times and measured the time for sorting 50 million random numbers:

  1. sequential (took ~14 seconds)

  2. With Parallel.Invoke() in the same method as the sorting algorithm (took ~12 seconds)

  3. With Parallel.Invoke() in a separate method (took ~7 seconds)

So my question is: Why is Parallel.Invoke() much faster if the call is in a separate method? On my computer the 3. example was more than twice as fast as the 2.

2. With Parallel.Invoke() in the same method as the sorting algorithm

public class ParallelQuickSort
{

    private const int Threshold = 100;

    public static void Sort(int[] array)
    {
        if (array == null || array.Length == 0)
        {
            new ArgumentException("number array must be at least of length 1");
        }
        QuickSort(array, 0, array.Length - 1);
    }

    private static void QuickSort(int[] array, int left, int right)
    {
        var i = left;
        var j = right;
        var m = array[(left + right) / 2];
        while (i <= j)
        {
            while (array[i] < m) { i++; }
            while (array[j] > m) { j--; }
            if (i <= j)
            {
                var t = array[i]; array[i] = array[j]; array[j] = t;
                i++; j--;
            }
        }
        if (j - left > Threshold && right - i > Threshold)
        {
            Parallel.Invoke(
                () => QuickSort(array, left, j),
                () => QuickSort(array, i, right)
            );
        }
        else
        {
            if (j > left) { QuickSort(array, left, j); }
            if (i < right) { QuickSort(array, i, right); }
        }
    }

}

3. With Parallel.Invoke() in a separate method

public class ParallelSeparateMethodQuickSort
{

    private const int Threshold = 100;

    public static void Sort(int[] array)
    {
        if (array == null || array.Length == 0)
        {
            new ArgumentException("number array must be at least of length 1");
        }
        QuickSort(array, 0, array.Length - 1);
    }

    private static void QuickSort(int[] array, int left, int right)
    {
        var i = left;
        var j = right;
        var m = array[(left + right) / 2];
        while (i <= j)
        {
            while (array[i] < m) { i++; }
            while (array[j] > m) { j--; }
            if (i <= j)
            {
                var t = array[i]; array[i] = array[j]; array[j] = t;
                i++; j--;
            }
        }
        if (j - left > Threshold && right - i > Threshold)
        {
            ParallelInvoke(array, left, j, i, right);
        }
        else
        {
            if (j > left) { QuickSort(array, left, j); }
            if (i < right) { QuickSort(array, i, right); }
        }
    }

    private static void ParallelInvoke(int[] array, int left, int j, int i, int right)
    {
        Parallel.Invoke(
                () => QuickSort(array, left, j),
                () => QuickSort(array, i, right)
            );
    }

}

Full Code

using System;
using System.Threading.Tasks;
using System.Diagnostics;

namespace parallelQuicksort
{
    class Program
    {
        static void Main(string[] args)
        {
            const int N = 50_000_000;
            for (int i = 0; i < 5; i++)
            {
                var array = GetRandomArray(N);
                Measure("Sequential\t", () => SequentialQuickSort.Sort(array));
                array = GetRandomArray(N);
                Measure("Parallel\t", () => ParallelQuickSort.Sort(array));
                array = GetRandomArray(N);
                Measure("P. Separate Method", () => ParallelSeparateMethodQuickSort.Sort(array));
            }
        }

        private static int[] GetRandomArray(int length)
        {
            var random = new Random(4711);
            var array = new int[length];
            for (int i = 0; i < length; i++)
            {
                array[i] = random.Next();
            }
            return array;
        }

        public static void Measure(string name, Action action)
        {
            Stopwatch stopwatch = Stopwatch.StartNew();
            action();
            stopwatch.Stop();
            var time = stopwatch.ElapsedMilliseconds;
            Console.WriteLine($"{name}: \tElapsed={time}");
        }
    }

    public class SequentialQuickSort
    {
        public static void Sort(int[] array)
        {
            if (array == null || array.Length == 0)
            {
                new ArgumentException("number array must be at least of length 1");
            }
            QuickSort(array, 0, array.Length - 1);
        }

        private static void QuickSort(int[] array, int left, int right)
        {
            var i = left;
            var j = right;
            var m = array[(left + right) / 2];
            while (i <= j)
            {
                while (array[i] < m) { i++; }
                while (array[j] > m) { j--; }
                if (i <= j)
                {
                    var t = array[i]; array[i] = array[j]; array[j] = t;
                    i++; j--;
                }
            }

            if (j > left) { QuickSort(array, left, j); }
            if (i < right) { QuickSort(array, i, right); }
        }
    }

    public class ParallelQuickSort
    {

        private const int Threshold = 100;

        public static void Sort(int[] array)
        {
            if (array == null || array.Length == 0)
            {
                new ArgumentException("number array must be at least of length 1");
            }
            QuickSort(array, 0, array.Length - 1);
        }

        private static void QuickSort(int[] array, int left, int right)
        {
            var i = left;
            var j = right;
            var m = array[(left + right) / 2];
            while (i <= j)
            {
                while (array[i] < m) { i++; }
                while (array[j] > m) { j--; }
                if (i <= j)
                {
                    var t = array[i]; array[i] = array[j]; array[j] = t;
                    i++; j--;
                }
            }
            if (j - left > Threshold && right - i > Threshold)
            {
                Parallel.Invoke(
                    () => QuickSort(array, left, j),
                    () => QuickSort(array, i, right)
                );
            }
            else
            {
                if (j > left) { QuickSort(array, left, j); }
                if (i < right) { QuickSort(array, i, right); }
            }
        }

    }

    public class ParallelSeparateMethodQuickSort
    {

        private const int Threshold = 100;

        public static void Sort(int[] array)
        {
            if (array == null || array.Length == 0)
            {
                new ArgumentException("number array must be at least of length 1");
            }
            QuickSort(array, 0, array.Length - 1);
        }

        private static void QuickSort(int[] array, int left, int right)
        {
            var i = left;
            var j = right;
            var m = array[(left + right) / 2];
            while (i <= j)
            {
                while (array[i] < m) { i++; }
                while (array[j] > m) { j--; }
                if (i <= j)
                {
                    var t = array[i]; array[i] = array[j]; array[j] = t;
                    i++; j--;
                }
            }
            if (j - left > Threshold && right - i > Threshold)
            {
                ParallelInvoke(array, left, j, i, right);
            }
            else
            {
                if (j > left) { QuickSort(array, left, j); }
                if (i < right) { QuickSort(array, i, right); }
            }
        }

        private static void ParallelInvoke(int[] array, int left, int j, int i, int right)
        {
            Parallel.Invoke(
                    () => QuickSort(array, left, j),
                    () => QuickSort(array, i, right)
                );
        }

    }

}

You find my code here: https://github.com/Lazzaretti/ParallelQuicksort

Output

Sequential      :       Elapsed=14534
Parallel        :       Elapsed=11960
P. Separate Method:     Elapsed=6353
Sequential      :       Elapsed=14620
Parallel        :       Elapsed=11954
P. Separate Method:     Elapsed=6647
Sequential      :       Elapsed=14529
Parallel        :       Elapsed=11870
P. Separate Method:     Elapsed=6389
Sequential      :       Elapsed=14512
Parallel        :       Elapsed=11787
P. Separate Method:     Elapsed=6590
Sequential      :       Elapsed=16203
Parallel        :       Elapsed=11738
P. Separate Method:     Elapsed=6674
Galore answered 13/4, 2018 at 13:20 Comment(5)
the benchmarks are form the Release build ;)Galore
Well, first of all, you're getting a random array then Sorting it. Then using that sorted array in your next two calls, so your benchmark is already kind of invalid...Nate
Having said that, I changed your benchmark to call GetRandomArray(N) for each measurement (thereby getting a new unsorted array each time), and I also flip flopped the order that the methods were called, and I still observed that the P. Separate Method was indeed consistently about 2 seconds faster than the other method. So, I agree with you, this is an unexpected outcome. I'm not seeing anything obvious here that would account for this.Nate
@Nate i updated the code and the benchmark - thank you for the input! But the P. Separate is still much faster....Galore
Somehow, constructor call of enclosure in separate method is much more faster s9.postimg.cc/hw80fyx7j/perf.pngTradein
R
8

After fixing that problem with sorting already sorted array mentioned in comments, your problem still reproduces.

I think the reason is how and what is captured by lambdas you pass to Parallel.Invoke.

In first case (when Parallel.Invoke is inside QuickSort method):

Parallel.Invoke(
    () => QuickSort(array, left, j),
    () => QuickSort(array, i, right)
);

You capture 5 variables, all of which are used throughout the whole QuickSort method. All those variables become fields of compiler generated class. That means whole QuickSort method now works with object fields and not local variables. If you decompile that method you'll see something like this:

  var cDisplayClass20 = new SomeCompilerGeneratedClass();
  cDisplayClass20.array = array;
  cDisplayClass20.left = left;
  cDisplayClass20.right = right;
  cDisplayClass20.i = cDisplayClass20.left;
  cDisplayClass20.j = cDisplayClass20.right;
  int num1 = cDisplayClass20.array[(cDisplayClass20.left + cDisplayClass20.right) / 2];
  while (cDisplayClass20.i <= cDisplayClass20.j) // field access
  {
    while (cDisplayClass20.array[cDisplayClass20.i] < num1) // field access
      cDisplayClass20.i++;
    while (cDisplayClass20.array[cDisplayClass20.j] > num1) // and again
      cDisplayClass20.j--;
    if (cDisplayClass20.i <= cDisplayClass20.j) // again field access
    {
      // they are everywhere
      int num2 = cDisplayClass20.array[cDisplayClass20.i];
      cDisplayClass20.array[cDisplayClass20.i] = cDisplayClass20.array[cDisplayClass20.j];
      cDisplayClass20.array[cDisplayClass20.j] = num2;
      cDisplayClass20.i++;
      cDisplayClass20.j--;
    }
  }

Which confirms point above.

However if you move Parallel.Invoke to separate method, that is no longer the case. 5 variables are still captured, but that does not affect whole QuickSort method, because capture now happens inside separate ParallelInvoke method, and so is localized. The QuickSort still works with local variables and not fields of compiler generated class. If you decompile version with separate method, it will look exactly like you wrote:

  int index1 = left;
  int index2 = right;
  int num1 = array[(left + right) / 2];
  while (index1 <= index2) // local variables
  {
    while (array[index1] < num1) // num1 is local variable
      ++index1;
    while (array[index2] > num1)
      --index2;
    if (index1 <= index2) // local variables again
    {
      int num2 = array[index1];
      array[index1] = array[index2];
      array[index2] = num2;
      ++index1;
      --index2;
    }
  }
  ...

Now, I assume that accessing object fields (which are generally on heap) is somewhat slower than accessing local variables (which are generally on stack \ in CPU register), hence version with separate method is faster. Eric Lippert also notes in comments that:

The jitter will likely do a worse job with the fields than it will with the locals because it will not enregister them as aggressively.

You can confirm the above by modifying your first version like this:

    private static void QuickSort(int[] array, int left, int right) {
        var i = left;
        var j = right;
        var m = array[(left + right) / 2];
        while (i <= j) {
            while (array[i] < m) {
                i++;
            }

            while (array[j] > m) {
                j--;
            }

            if (i <= j) {
                var t = array[i];
                array[i] = array[j];
                array[j] = t;
                i++;
                j--;
            }
        }

        if (j - left > Threshold && right - i > Threshold) {
            // copy all variables you pass to lambda
            // so that their capture does not affect the whole method
            var tmpArray = array;
            var tmpLeft = left;
            var tmpJ = j;
            var tmpI = i;
            var tmpRight = right;
            Parallel.Invoke(
                () => QuickSort(tmpArray, tmpLeft, tmpJ),
                () => QuickSort(tmpArray, tmpI, tmpRight)
            );
        }
        else {
            if (j > left) {
                QuickSort(array, left, j);
            }

            if (i < right) {
                QuickSort(array, i, right);
            }
        }
    }
}

Then execution time of both approaches will be the same.

As @Eugene mentions in comments and in his answer - there might be other things that slow this down besides field vs local variable access - such as construction and (potentially) garbage collection of compiler-generated classes mentioned above. However, these are all consequences of the same source root - capturing local variables in closure in different ways.

Reflector answered 13/4, 2018 at 14:6 Comment(25)
But why one class works faster than another one? What is the difference from which method it was generated?Tradein
Please, check the profiler results. Parallel case closure constructor it much more slower (more than 20 times) s9.postimg.cc/hw80fyx7j/perf.pngTradein
@EugeneGorbovoy I tried to explain in answer. Summary I think is: because accessing local variable is faster than accessing object field, and version with Parallel.Invoke inside QuickSort converts almost all important local variables in QuickSort method into object fields.Reflector
Nice analysis! The jitter will likely do a worse job with the fields than it will with the locals because it will not enregister them as aggressively.Deeannadeeanne
@evk you asnwer is wrong. fields are being created anyway. But in separate method case they also need to be copied to variables before becoming fields, so it should work slower!Tradein
An interesting optimization that the C# compiler could perform here is to detect that (1) the lambdas are used in tail call position, and (2) neither lambda mutates a local, and therefore the creation of the closure can be delayed until the lambda expression is evaluated. That would be just one of many possible optimizations that we never made on lambdas.Deeannadeeanne
@EugeneGorbovoy I don't say fields are not created. My point is: one version has much larger number of field accesses than the other (the other much more often works with method local variables).Reflector
@Reflector But they have same amount of field accesses)Tradein
@EugeneGorbovoy well I think I cannot explain it more clearly unfortunately :) But I've added a sample code which shows how you can modify slow version to execute in the same time as "fast" one. Maybe that will convince you.Reflector
Good analysis. As @EricLippert notes, this is definitely an optimization that the compiler could do, it just doesn't. Eric, is this the kind of thing that the compiler team could/should be notified of?Nate
@aquinas: when I was on the compiler team Neal Gafter and I identified several areas where we could make lambdas more efficient at runtime at a cost of more compile-time analysis. I don't recall if this specifically was one, but they were all based on an analysis of when locals were read and when they were written with respect to the lambdas. I don't know if any of them got implemented after I left.Deeannadeeanne
@aquinas: The one I was particularly keen on is when you have a long-lived delegate () => M(cheap) and short-lived ()=>M(expensive). The compiler puts both cheap and expensive in the same closure and so the closure holds a reference to expensive as long as the long-lived delegate lives, even though it will never use the expensive resource. The solution is of course to generate two closure classes. I don't know if it ever got implemented.Deeannadeeanne
@I checked decompiled code, it looks same in both cases, Enclousers have same fields, constructor is same and everything same except generated class names and additional method callTradein
@Reflector Thank you! I got you point, I just dont understand why is it so. Coz it has same fields, same amount of read/write etc.. I will continue to investigate this. And what I think it is not related to c# compiler, it somehow related to optimization or jit compilationTradein
@Reflector Your right! With the new variables the code is as fast as with the additional method; but neither i'm understanding why...Galore
@EugeneGorbovoy if you look at decompiled code (ensure to disable loading source code from symbols in your decompiler, otherwise you are not looking at decompiled code), one version has, for example, while (left<= right) ..., while another has while (cDisplayClass20.i <= cDisplayClass20.j), and so on. One uses local variables, another uses fields. So it's not true that "everything is the same".Reflector
@Reflector I'm investigating IL code, i'll get back if find smth interesting..Tradein
@Galore I tried but unfortunately I cannot explain that more clearly than I did in the answer.. Main point is accessing someObject.someField is slower than accessing someLocalVariable, and compiler rewrites your method in a way that makes almost all local variables (left, right etc) fields of compiler generated class. It does that because they are captured in closure (because you use them in lambda method you pass to Parallel.Invoke).Reflector
fab Please, check the compiled IL code: pastebin.com/knTSfnbs as you see, in case of parallel method it does not use variables j, left, right etc anymore, instead it uses fields.That's what guys trying to explain to us. Honestly, I thought inside method it will stay variables until they need to be copied to class to be used in enlosure. @evk , may be you can attached my code or should I put a new answer, I think I can explain it in easier manner ?Tradein
And as you can see in my attached picture, the problem is not only in access to fields, but in fact, that in separate method enclosure class is not being instantiated if that "if" not happenedTradein
@EugeneGorbovoy I don't see how your decompiled IL differs from second code block in my answer, where I show the same (not with IL but with decompiled C# code, which I think is easier to comprehend than pure IL). But you are sure free to post separate answer.Reflector
@Reflector You are right. I was wrong about "same". I will add another answer in case somebody can get it faster..Tradein
@Reflector Thank you for explanation one more time. Plus to you. But now i'm wondering what's actually more slower here: field access or enclosure ctor calls.Tradein
@Reflector If we move temp variables outside of "if" block it shows ~30% slower. That means (ctor + field init )+GC takes approximately same time as field access instead of variables.Tradein
@EugeneGorbovoy I agree, added this to answer.Reflector
T
3

!!!!!!!!! This answer is not actual for now !!!!! I know this is not a proper answer, just want to make it visible: try to change order of tests:

Measure("P. Separate Method", () => ParallelSeparateMethodQuickSort.Sort(array));
Measure("Sequential\t", () => SequentialQuickSort.Sort(array));
Measure("Parallel\t", () => ParallelQuickSort.Sort(array));

And you will see:

P. Separate Method:     Elapsed=8710
Sequential      :       Elapsed=4140
Parallel        :       Elapsed=7928
P. Separate Method:     Elapsed=9033
Sequential      :       Elapsed=4098
Parallel        :       Elapsed=7881

So I think your tests are wrong and this question does not make sense. And quick investigation shows that in every of the tests you change your source array, so each next test has already sorted array.

p.s. But I think this question really exists. If you try to correct the code you will see, that calling separate method works faster!

p.p.s plz let me know if somebody has an answer or if question was corrected

Tradein answered 13/4, 2018 at 13:32 Comment(0)
T
2

There are two reasons for that: additional constructor call (~30% of time) and field-access instead of variable access (~30% of time). When you use enclosure directly inside your method the auto-generated class for that enclosure is being instantiated in each call of this method (which in your case leads to garbage collections, pic below).

enter image description here

And all calls to variables are now calls to fields also (which is slower) as stated by @Evk.

enter image description here

But when your enclosure is wrapped inside another method it is instantiated only when wrapper method was called. So in case of separated method enclosure object is being created only when if (j - left > Threshold && right - i > Threshold) was "true". As described by @Evk you can copy values to new variables declared inside if, it will give you same result as wrapping it in to a method.

I ran a profiler and got this (look at highlighted row): Profiler results

This is fast separate method: enter image description here

And this is slow method: enter image description here

Also, look at the compiled version (note that in slow case we access fields, not variables):

 //Compilation of "if (j - left > Threshold && right - i > Threshold)"

//Slow method:
    // [106 13 - 106 63]
    IL_012f: ldloc.0      // 'CS$<>8__locals0'
    IL_0130: ldfld        int32 parallelQuicksort.ParallelQuickSort/'<>c__DisplayClass2_0'::j
    IL_0135: ldloc.0      // 'CS$<>8__locals0'
    IL_0136: ldfld        int32 parallelQuicksort.ParallelQuickSort/'<>c__DisplayClass2_0'::left
    IL_013b: sub          
    IL_013c: ldc.i4.s     100 // 0x64
    IL_013e: ble.s        IL_0153
    IL_0140: ldloc.0      // 'CS$<>8__locals0'
    IL_0141: ldfld        int32 parallelQuicksort.ParallelQuickSort/'<>c__DisplayClass2_0'::right
    IL_0146: ldloc.0      // 'CS$<>8__locals0'
    IL_0147: ldfld        int32 parallelQuicksort.ParallelQuickSort/'<>c__DisplayClass2_0'::i
    IL_014c: sub          
    IL_014d: ldc.i4.s     100 // 0x64
    IL_014f: cgt          
    IL_0151: br.s         IL_0154
    IL_0153: ldc.i4.0     
    IL_0154: stloc.s      V_8


//fast separate method

    // [151 13 - 151 63]
    IL_006b: ldloc.1      // j
    IL_006c: ldarg.1      // left
    IL_006d: sub          
    IL_006e: ldc.i4.s     100 // 0x64
    IL_0070: ble.s        IL_007b
    IL_0072: ldarg.2      // right
    IL_0073: ldloc.0      // i
    IL_0074: sub          
    IL_0075: ldc.i4.s     100 // 0x64
    IL_0077: cgt          
    IL_0079: br.s         IL_007c
    IL_007b: ldc.i4.0     
    IL_007c: stloc.s      V_8
Tradein answered 13/4, 2018 at 15:54 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.