.NET 4.6 RC x64 is twice as slow as x86 (release version)

.NET 4.6 RC x64 is twice as slow as x86 (release build):

Consider this piece of code:

/* Spectral-norm benchmark: runs the power method on AtA, where A is the
   matrix defined by A(i, j) below, restricted to indices 0..n-1. */
class SpectralNorm
{
    /* Entry point. Uses n = 5500 unless args[0] supplies another size, then
       prints the result with 9 decimal places and the elapsed milliseconds. */
    public static void Main(String[] args)
    {
        int n = 5500;
        if (args.Length > 0) n = Int32.Parse(args[0]);

        var spec = new SpectralNorm();
        // the stopwatch is started after construction, so it covers the Approximate call
        var watch = Stopwatch.StartNew();
        var res = spec.Approximate(n);

        Console.WriteLine("{0:f9} -- {1}", res, watch.Elapsed.TotalMilliseconds);
    }

    /* Iterates u -> AtA u on vectors of length n and returns
       sqrt(u.v / v.v) computed from the final pair of vectors. */
    double Approximate(int n)
    {
        // start vector: every component set to 1
        double[] u = new double[n];
        for (int i = 0; i < n; i++) u[i] = 1;

        // second work vector, explicitly zeroed (it is overwritten by the first product below)
        double[] v = new double[n];
        for (int i = 0; i < n; i++) v[i] = 0;

        // 20 steps of the power method: 10 iterations, each doing v = AtA u and then u = AtA v
        for (int i = 0; i < 10; i++)
        {
            MultiplyAtAv(n, u, v);
            MultiplyAtAv(n, v, u);
        }

        // B=AtA         A multiplied by A transposed
        // v.Bv /(v.v)   eigenvalue estimate for v; u holds Bv after the last step above
        double vBv = 0, vv = 0;
        for (int i = 0; i < n; i++)
        {
            vBv += u[i] * v[i];
            vv += v[i] * v[i];
        }

        return Math.Sqrt(vBv / vv);
    }


    /* return element i,j of infinite matrix A */
    double A(int i, int j)
    {
        // The denominator is evaluated entirely in int arithmetic (including the
        // division by 2) and only then converted to double for 1.0 / ...
        // NOTE(review): (i + j) * (i + j + 1) would exceed Int32.MaxValue once i + j
        // reaches roughly 46,000; the default n = 5500 stays far below that - confirm
        // before running with much larger n.
        return 1.0 / ((i + j) * (i + j + 1) / 2 + i + 1);
    }

    /* multiply vector v by matrix A; the result is written to Av[0..n-1] */
    void MultiplyAv(int n, double[] v, double[] Av)
    {
        for (int i = 0; i < n; i++)
        {
            // the sum is accumulated directly in the output element, not in a local
            Av[i] = 0;
            for (int j = 0; j < n; j++) Av[i] += A(i, j) * v[j];
        }
    }

    /* multiply vector v by matrix A transposed; the result is written to Atv[0..n-1] */
    void MultiplyAtv(int n, double[] v, double[] Atv)
    {
        for (int i = 0; i < n; i++)
        {
            Atv[i] = 0;
            // A(j, i): indices swapped relative to MultiplyAv, which gives the transpose
            for (int j = 0; j < n; j++) Atv[i] += A(j, i) * v[j];
        }
    }

    /* multiply vector v by matrix A and then by matrix A transposed */
    void MultiplyAtAv(int n, double[] v, double[] AtAv)
    {
        // temporary for the intermediate product Av; a new array is allocated on every call
        double[] u = new double[n];
        MultiplyAv(n, v, u);
        MultiplyAtv(n, u, AtAv);
    }
}

On my machine, the x86 release build takes 4.5 seconds to complete, while the x64 build takes 9.5 seconds. Is there a specific flag or setting needed for x64?

UPDATE

It turns out that RyuJIT plays a role in this issue. If useLegacyJit is enabled in app.config, the result changes: this time x64 is the faster one.

<?xml version="1.0" encoding="utf-8"?>
<!-- app.config used for the comparison run with the legacy JIT enabled -->
<configuration>
  <startup>
    <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.6"/>
  </startup>
  <runtime>
    <!-- enables the legacy JIT; with this setting the x64 build was the faster one -->
    <useLegacyJit enabled="1" />
 </runtime>
</configuration>

UPDATE

The issue has now been reported to the CLR team: coreclr, issue 993.

IN002a: 000093 lea      eax, [rax+r10+1]
IN002b: 000098 cvtsi2sd xmm1, rax
IN002c: 00009C movsd    xmm2, qword ptr [@RWD00]
IN002d: 0000A4 divsd    xmm2, xmm1
IN002e: 0000A8 movsxd   eax, edi
IN002f: 0000AB movaps   xmm1, xmm2
IN0030: 0000AE mulsd    xmm1, qword ptr [r8+8*rax+16]
IN0031: 0000B5 addsd    xmm0, xmm1
IN0032: 0000B9 movsd    qword ptr [rbx], xmm0

/* Variant of MultiplyAv: multiply vector v by matrix A, with the product written
   as v[j] * A(i, j) instead of A(i, j) * v[j]. The result is written to Av[0..n-1]. */
void MultiplyAv(int n, double[] v, double[] Av)
{
    for (int i = 0; i < n; i++)
    {
        // the sum is accumulated directly in the output element
        Av[i] = 0;
        for (int j = 0; j < n; j++) Av[i] += v[j] * A(i, j); // order of operands reversed
    }
}

Recommended topics

Hot tags