How to reduce the allocations in Julia?
Asked Answered
S

2

6

I am starting to use Julia mainly because of its speed. Currently, I am solving a fixed point problem. Although the current version of my code runs fast I would like to know some methods to improve its speed.

First of all, let me summarize the algorithm.

  1. There is an initial seed called C0 that maps from the space (b,y) into an action space c, then we have C0(b,y)
  2. There is a formula that generates a rule Ct from C0.
  3. Then, using an additional restriction, I can obtain an updating of b [let's called it bt]. Thus,it generates a rule Ct(bt,y)
  4. I need to interpolate the previous rule to move from the grid bt into the original grid b. It gives me an update for C0 [let's called that C1]
  5. I will iterate until the distance between C1 and C0 is below a convergence threshold.

To implement it I created two structures:

    struct Parm
        lC::Array{Float64, 2}    # Lower limit
        uC::Array{Float64, 2}    # Upper limit
        γ::Float64               # CRRA coefficient
        δ::Float64               # factor in the euler
        γ1::Float64              # 
        r1::Float64              # inverse of the gross interest rate
        yb1::Array{Float64, 2}   # y - b(t+1)
        P::Array{Float64, 2}     # Transpose of transition matrix
    end

    mutable struct Upd1
        pol::Array{Float64,2}    # policy function
        b::Array{Float64, 1}     # exogenous grid for interpolation
        dif::Float64             # updating difference
    end

The first one is a set of parameters while the second one stores the decision rule C1. I also define some functions:

    function eulerm(x::Upd1,p::Parm)
        ct = p.δ *(x.pol.^(-p.γ)*p.P).^(-p.γ1); #Euler equation 
        bt = p.r1.*(ct .+ p.yb1);               #Endeogenous grid for bonds
        return ct,bt
    end

    function interp0!(bt::Array{Float64},ct::Array{Float64},x::Upd1, p::Parm)
        polold = x.pol;
        polnew = similar(x.pol); 
        @inbounds @simd for col in 1:size(bt,2)
            F1 = LinearInterpolation(bt[:,col], ct[:,col],extrapolation_bc=Line());
            polnew[:,col] = F1(x.b);
        end
        polnew[polnew .< p.lC] .= p.lC[polnew .< p.lC];
        polnew[polnew .> p.uC] .= p.uC[polnew .> p.uC];
        dif = maximum(abs.(polnew - polold));
        return polnew,dif
    end

    function updating!(x::Upd1,p::Parm)
        ct, bt  = eulerm(x,p);       # endogeneous grid
        x.pol, x.dif   = interp0!(bt,ct,x,p);
    end

    function conver(x::Upd1,p::Parm)
        while x.dif>1e-8
            updating!(x,p);
        end
    end

The first formula implements steps 2 and 3. The third one makes the updating (last part of step 4), and the last one iterates until convergence (step 5).

The most important function is the second one. It makes the interpolation. While I was running the function @time and @btime I realized that the largest number of allocations are in the loop inside this function. I tried to reduce it by not defining polnew and goes directly to x.pol but in this case, the results are not correct since it only need two iterations to converge (I think that Julia is thinking that polold is exactly the same than x.pol and it is updating both at the same time).

Any advice is well received.

To anyone that wants to run it by themselves, I add the rest of the required code:

    function rouwen(ρ::Float64, σ2::Float64, N::Int64)
        if (N % 2 != 1)
            return "N should be an odd number"
        end
        sigz = sqrt(σ2/(1-ρ^2));
        zn = sigz*sqrt(N-1);
        z  = range(-zn,zn,N);
        p = (1+ρ)/2;
        q =  p;
        Rho = [p 1-p;1-q q];
        for i = 3:N
            zz = zeros(i-1,1);
            Rho = p*[Rho zz; zz' 0] + (1-p)*[zz Rho; 0 zz'] + (1-q)*[zz' 0; Rho zz] + q *[0 zz'; zz Rho];
            Rho[2:end-1,:] = Rho[2:end-1,:]/2;
        end
        return z,Rho;
    end

    #############################################################
    # Parameters of the model
    ############################################################

    lb = 0;     ub = 1000;   pivb = 0.25; nb   = 500; 
    ρ  = 0.988; σz = 0.0439; μz   =-σz/2; nz   = 7;
    ϕ  = 0.0;   σe = 0.6376; μe   =-σe/2; ne   = 7;  
    β  = 0.98;  r  = 1/400;   γ   = 1; 

    b = exp10.(range(start=log10(lb+pivb), stop=log10(ub+pivb), length=nb)) .- pivb;

    #=========================================================
     Algorithm
    ======================================================== =#
    (z,Pz) = rouwen(ρ,σz, nz);
    μZ = μz/(1-ρ);
    z  = z .+ μZ;
    (ee,Pe) = rouwen(ϕ,σe,ne);
    ee = ee .+ μe;
    y = exp.(vec((z .+ ee')'));
    P  = kron(Pz,Pe);
    R  = 1 + r;
    r1 = R^(-1);
    γ1 = 1/γ;
    δ  = (β*R)^(-γ1);
    m  = R*b .+ y';
    lC = max.(m .- ub,0);
    uC = m .- lb;
    by1 = b .- y';

    # initial guess for C0
    c0 = 0.1*(m); 
    # Set of parameters
    pp  = Parm(lC,uC,γ,δ,γ1,r1,by1,P');
    # Container of results
    up1 = Upd1(c0,b,1);
    # Fixed point problem
    conver(up1,pp)

UPDATE As it was reccomend, I made the following changes to the third function

    function interp0!(bt::Array{Float64},ct::Array{Float64},x::Upd1, p::Parm)
        polold = x.pol;
        polnew = similar(x.pol); 
        @inbounds for col in 1:size(bt,2)
            F1 = LinearInterpolation(@view(bt[:,col]), @view(ct[:,col]),extrapolation_bc=Line());
            polnew[:,col] = F1(x.b);
        end
        for j in eachindex(polnew)
            polnew[j] < p.lC[j] ? polnew[j] =  p.lC[j] : nothing
            polnew[j] > p.uC[j] ? polnew[j] =  p.uC[j] : nothing
        end  
        dif = maximum(abs.(polnew - polold));
        return polnew,dif
    end

This leads to an improvement in the speed (from ~1.5 to ~1.3 seconds). And a reduction in the number of allocations. Somethings that I noted were:

  • Changing from polnew[:,col] = F1(x.b) to polnew[:,col] .= F1(x.b) can reduce the total allocations but the time is slower, why is that?
  • How should I understand the difference between @time and @btime. For this case, I have:
    up1 = Upd1(c0,b,1);
    @time conver(up1,pp)
      1.338042 seconds (385.72 k allocations: 1.157 GiB, 3.37% gc time)
    up1 = Upd1(c0,b,1);
    @btime conver(up1,pp)
      4.200 ns (0 allocations: 0 bytes)

Just to be precise, in both cases, I run it several times and I choose representative numbers for each line. Does it mean that all the time is due allocations during the compilation?

Southsoutheast answered 25/7, 2022 at 14:40 Comment(7)
I admit I haven't studied your code, but I assume you are aware of the performance tips page in the offical documentation... BDW: how did you add runnable snippets to the question ? Is it a new feature of SO?Mooch
@Mooch yes I read that tips but I have to admit that although I tried to implement them I am not proficient enough to fully understand all of them. For example, I don't know if views can improve my code [although in my trials it doesn't look to be the case]. Regarding the snippets, it was how adding code appears while I was writing the question. So, I really don't know if this is new functionality. ThanksSouthsoutheast
You're saying that the allocations are happening inside the for col in 1:size(bt,2) loop? If so, I would think that it has nothint to do with polnew or x.pol; it's almost definitely the interpolation. Just constructing the LinearInterpolation may involve allocation, and almost certainly evaluating it will also. Is there an in-place version you could use, like F1(@view(polnew[:,col]), x.b)?Scourings
Incidentally, the state of interpolation packages in Julia is unfortunate. But FWIW, I usually go for DataInterpolations.jl instead of Interpolations.jl.Scourings
@Scourings that is great I will try to use it.Southsoutheast
@Scourings actually I can not use DataInterpolations.jlbecause in the future I will need to do multiple dimensions interpolations. But thank in advance.Southsoutheast
Have you used a profiler to find the bottlenecks of your code? Especially if your code is a bit longer it often pays off to focus optimizations to the part of your code, where most of the runtime is spend, instead of optimizing everything. Julia has such a tool build in and it really pays of to learn how to use it.Collected
P
2

Start going through the "performance tips" as advised by @DNF but below you will find most important comments for your code.

  1. Vectorize vector assignments - a small dot makes big difference
julia> julia> a = rand(3,4);

julia> @btime $a[3,:] = $a[3,:] ./ 2;
  40.726 ns (2 allocations: 192 bytes)

julia> @btime $a[3,:] .= $a[3,:] ./ 2;
  20.562 ns (1 allocation: 96 bytes)
  1. Use views when doing something with subarrays:
julia> @btime sum($a[3,:]);
  18.719 ns (1 allocation: 96 bytes)

julia> @btime sum(@view($a[3,:]));
  5.600 ns (0 allocations: 0 bytes)
  1. Your code around a lines polnew[polnew .< p.lC] .= p.lC[polnew .< p.lC]; will make much less allocations when you do it with a for loop over each element of polnew

  2. @simd will have no effect on conditionals (point 3) neither when code is calling complex external functions

Pournaras answered 25/7, 2022 at 17:16 Comment(1)
Hi, I will try to incorporate your advice. Regarding (4), yes I realized that while I was running my trials. With respect to 3, I need to add that polnew[polnew .< p.lC] .= p.lC[polnew .< p.lC]; makes 10 allocations while the loop makes >900, that's why I focus my attention on that. Thank so much!!!Southsoutheast
S
2

I want to give an update about this problem. I made two main changes to my code: (i) I define my own linear interpolation function and (ii) I include the check of bounds in the interpolation. With this the new function three is

    function interp0!(bt::Array{Float64},ct::Array{Float64},x::Upd1, p::Parm)
        polold = x.pol;
        polnew = similar(x.pol); 
        @inbounds @views for col in 1:size(bt,2)
            polnew[:,col] = myint(bt[:,col],     ct[:,col],x.b[:],p.lC[:,col],p.uC[:,col]);
        end
        dif = maximum(abs.(polnew - polold));
        return polnew,dif
    end

And the interpolation is now:

    function myint(x0,y0,x1,ly,uy)
        y1 = similar(x1);
        n = size(x0,1);
        j = 1;
        @simd for i in eachindex(x1)
            while (j <= n) && (x1[i] > x0[j])
                j+=1;
            end
            if j == 1
                y1[i] = y0[1] + ((y0[2]-y0[1])/(x0[2]-x0[1]))*(x1[i]-x0[1]) ;
            elseif j == n+1
                y1[i] = y0[n] + ((y0[n]-y0[n-1])/(x0[n]-x0[n-1]))*(x1[i]-x0[n]);
            else
                y1[i] = y0[j-1]+ ((x1[i]-x0[j-1])/(x0[j]-x0[j-1]))*(y0[j]-y0[j-1]); 
            end
            y1[i] > uy[i] ? y1[i] = uy[i] : nothing;
            y1[i] < ly[i] ? y1[i] = ly[i] : nothing;
        end
        return y1;
    end

As you can see, I am taking advantage (and assuming) that both vectors that we use as basis are ordered while the two last lines in the outer loops checks the bounds imposed by lC and uC. With that I get the following total time

    up1 = Upd1(c0,b,1);
    @time conver(up1,pp)
      0.734630 seconds (28.93 k allocations: 752.214 MiB, 3.82% gc time)
    up1 = Upd1(c0,b,1);
    @btime conver(up1,pp)
      4.200 ns (0 allocations: 0 bytes)

which is almost as twice faster with ~8% of the total allocations. the use of views in the loop of the function interp0! also helps a lot.

Southsoutheast answered 26/7, 2022 at 15:23 Comment(0)

© 2022 - 2025 — McMap. All rights reserved.