((Please forgive me that I ask more than one question in a single thread. I think they are related.))
Hello, I would like to know what best practices exist in Erlang with regard to per-module precompiled data.
Example: I have a module that operates heavily on very complex regular expressions that are known a priori. re:compile/2's documentation says: “Compiling once and executing many times is far more efficient than compiling each time one wants to match”. Since re's mp() datatype is in no way specified, and as such cannot be embedded at compile time if you want a target-independent beam, one has to compile the RegEx at runtime. ((Note: re:compile/2 is only an example. Any complex function to memoize would fit my question.))
Erlang modules can have an -on_load(F/A)
attribute, denoting a function that should be executed once when the module is loaded. As such, I could compile my regexes in this function and save the result in a new ets table named ?MODULE
.
Updated after Dan's answer.
My questions are:
- If I understand ets correctly, its data is saved in another process (unlike the process dictionary), and retrieving a value from an ets table is quite expensive. (Please correct me if I am wrong!) Should the content of the ets table be copied to the process dictionary for speedup? (Remember: the data is never updated.)
- Are there any (considerable) drawbacks of putting all data as one record (instead of many table items) into the ets/process dictionary?
Working example:
%% Example module: keeps a compiled IPv4 regular expression in an ETS table
%% named after the module, filled in when the module is loaded.
-module(memoization).
-export([is_ipv4/1, fillCacheLoop/0]).
%% Cache payload. The record is named after the module; its only field,
%% re_ipv4, defaults to the value returned by re_ipv4().
-record(?MODULE, { re_ipv4 = re_ipv4() }).
%% Run fillCache/0 when the module is loaded.
-on_load(fillCache/0).
%% @doc Message loop of the process that holds the cache table.
%%
%% Handles two messages:
%%   `purge'                              - leave the loop and return `ok'.
%%   `{replace, Fresh, ReplyTo, Tag}'     - store `Fresh' under the `data'
%%       key (together with this process' pid), acknowledge to `ReplyTo'
%%       with `{on_load, Tag, ok}' and keep looping.
%%
%% The recursive call is fully qualified (`?MODULE:') on purpose, so the
%% loop continues in whatever version of the module is current.
fillCacheLoop() ->
    receive
        purge ->
            ok;
        {replace, Fresh, ReplyTo, Tag} ->
            Entry = {data, {self(), Fresh}},
            true = ets:insert(?MODULE, [Entry]),
            ReplyTo ! {on_load, Tag, ok},
            ?MODULE:fillCacheLoop()
    end.
%% @doc Populate the cache table; declared as this module's -on_load function.
%%
%% Spawns a linked helper process that either asks an already existing table
%% owner to replace its data, or creates the table and becomes its owner.
%% Returns the result reported by the helper, the helper's exit reason if it
%% terminates first, or `error' when nothing arrives within 1000 ms.
fillCache() ->
Callback = self(),
%% Unique tag used to correlate the helper's reply with this call.
Ref = make_ref(),
%% Trap exits so that the linked helper's termination shows up as an
%% {'EXIT', Pid, Reason} message in the receive below.
process_flag(trap_exit, true),
Pid = spawn_link(fun() ->
%% `catch' turns a failing lookup into a term that falls through to the
%% `_' clause (presumably the case where the table does not exist yet).
case catch ets:lookup(?MODULE, data) of
[{data, {TableOwner,_} }] ->
%% A table owner is already registered: hand it a freshly built record,
%% wait for its acknowledgement and forward that to the caller.
TableOwner ! { replace, #?MODULE{}, self(), Ref },
receive
{ on_load, Ref, Result } ->
Callback ! { on_load, Ref, Result }
end,
ok;
_ ->
%% No usable entry: create the named table, store this process' pid next
%% to the data, report success, then stay alive in fillCacheLoop/0.
?MODULE = ets:new(?MODULE, [named_table, {read_concurrency,true}]),
true = ets:insert_new(?MODULE, [{ data, {self(), #?MODULE{}} }]),
Callback ! { on_load, Ref, ok },
fillCacheLoop()
end
end),
receive
{ on_load, Ref, Result } ->
%% Detach from the helper so it no longer shares this process' fate.
unlink(Pid),
Result;
{ 'EXIT', Pid, Result } ->
%% The helper terminated before reporting; return its exit reason.
Result
after 1000 ->
%% No answer within one second.
error
end
.
%% @doc Match `Addr' against the cached IPv4 regular expression.
%%
%% Returns whatever re:run/2 returns: `{match, Captured}' or `nomatch'.
%%
%% The cache record is read from the ETS table named ?MODULE on the first
%% call in each process and then kept in that process' dictionary, so later
%% calls skip the ETS lookup. Note that once a process has stored its copy it
%% never consults the table again, so it will not see data replaced later.
%%
%% The process-dictionary key is the tuple {?MODULE, data}. The previous
%% dotted form `?MODULE.data' relied on the experimental "packages" syntax,
%% which current compilers reject.
is_ipv4(Addr) ->
    Key = {?MODULE, data},
    Data =
        case get(Key) of
            undefined ->
                %% First use in this process: fetch from ETS and remember it.
                [{data, {_Owner, Loaded}}] = ets:lookup(?MODULE, data),
                put(Key, Loaded),
                Loaded;
            Cached ->
                Cached
        end,
    re:run(Addr, Data#?MODULE.re_ipv4).
%% @doc Compile and return the regular expression for a dotted-quad IPv4
%% address.
%%
%% Each of the four octets may carry leading zeros and must be in 0..255;
%% every octet is a capture group. Crashes with a badmatch if compilation
%% fails.
%%
%% Compiled with `dollar_endonly' so that `$' matches only at the very end
%% of the subject. Without it `$' also matches just before a trailing
%% newline, and "1.2.3.4\n" would be accepted.
re_ipv4() ->
    Octet = "([1-9]?\\d|1\\d\\d|2[0-4]\\d|25[0-5])",
    %% iodata: "^0*" Octet "\.0*" Octet "\.0*" Octet "\.0*" Octet "$"
    Pattern = ["^0*", Octet, "\\.0*", Octet, "\\.0*", Octet, "\\.0*", Octet, "$"],
    {ok, Compiled} = re:compile(Pattern, [dollar_endonly]),
    Compiled.