Node.js GET Request ETIMEDOUT & ESOCKETTIMEDOUT
Asked Answered
S

4

21

I'm using Node.js - async & request module to crawl 100+ millions of websites and I keep bumping into errors ESOCKETTIMEDOUT & ETIMEDOUT after few minutes.

It works again after I restart the script. It doesn't seem to be connection limit issue because I can still do resolve4, resolveNs, resolveMx and also curl without delay.

Do you see any issue with the code? or any advice? I'd like to push up the async.queue() concurrency to at least a 1000. Thank you.

var request = require('request'),
    async = require('async'),
    mysql = require('mysql'),
    dns = require('dns'),
    url = require('url'),
    cheerio = require('cheerio'),
    iconv = require('iconv-lite'),
    charset = require('charset'),
    config = require('./spy.config'),
    pool = mysql.createPool(config.db);

iconv.skipDecodeWarning = true;

// Crawl work queue (concurrency 200). Each task is { id, domain }:
// first confirm 'www.<domain>' resolves, then issue an HTTP GET; when the
// GET fails, gather NS/A/MX records for diagnostics before completing.
var queue = async.queue(function (task, cb) {
    dns.resolve4('www.' + task.domain, function (err, addresses) {
        if (err) {
            //
            // Do something
            //
            // Defer completion so the queue never re-enters synchronously.
            setImmediate(function () {
                cb()
            });
        } else {
            request({
                url: 'http://www.' + task.domain,
                method: 'GET',
                encoding:       'binary',
                followRedirect: true,
                // BUG FIX: `pool` was declared twice (`false`, then the
                // object literal). In a JS object literal the last duplicate
                // key wins, so `pool: false` was silently dead code.
                pool:           { maxSockets: 1000 },
                // Keep sockets alive and reuse them. Opening a brand-new
                // connection per request at this rate is a known cause of
                // ETIMEDOUT / ESOCKETTIMEDOUT errors.
                forever:        true,
                timeout:        15000 // 15 sec
            }, function (error, response, body) {

                //console.info(task);

                if (!error) {
                  // If ok, do something

                } else {
                    // If not ok, do these

                    console.log(error);

                    // It keeps erroring here after few minutes, resolve4, resolveNs, resolveMx still work here.

                    // { [Error: ETIMEDOUT] code: 'ETIMEDOUT' }
                    // { [Error: ESOCKETTIMEDOUT] code: 'ESOCKETTIMEDOUT' }

                    var ns = [],
                        ip = [],
                        mx = [];
                    async.parallel([
                        function (callback) {
                            // Resolves the domain's name server records
                            dns.resolveNs(task.domain, function (err, addresses) {
                                if (!err) {
                                    ns = addresses;
                                }
                                callback();
                            });
                        }, function (callback) {
                            // Resolves the domain's IPV4 addresses
                            dns.resolve4(task.domain, function (err, addresses) {
                                if (!err) {
                                    ip = addresses;
                                }
                                callback();
                            });
                        }, function (callback) {
                            // Resolves the domain's MX records
                            dns.resolveMx(task.domain, function (err, addresses) {
                                if (!err) {
                                    addresses.forEach(function (a) {
                                        mx.push(a.exchange);
                                    });
                                }
                                callback();
                            });
                        }
                    ], function (err) {
                        if (err) {
                            // BUG FIX: the original called the undefined
                            // identifier `next(err)` here, which would throw
                            // a ReferenceError. The parallel tasks above never
                            // pass an error, but guard defensively anyway.
                            console.log(err);
                            return;
                        }

                        // do something
                    });

                }
                // Note: this completes the task as soon as the parallel DNS
                // lookups are *started* (original behavior, preserved).
                setImmediate(function () {
                    cb()
                });
            });
        }
    });
}, 200);

// When the queue is emptied we want to check if we're done
// Once every queued task has finished, schedule a completeness check
// on the next tick of the event loop.
queue.drain = function () {
    setImmediate(checkDone);
};
// No-op logging hook: verbose output is disabled, but call sites were kept.
function consoleLog(msg) {
    // Uncomment to restore verbose logging:
    //console.info(msg);
}
// If the queue is truly empty, fetch the next batch of domains;
// otherwise just note that work is still pending.
function checkDone() {
    if (queue.length() !== 0) {
        console.log("checkDone() not zero");
        return;
    }
    setImmediate(crawlQueue);
}

// Fire-and-forget query helper: runs `sql` on a pooled connection.
// BUG FIX: the original silently swallowed both connection and query
// errors; they are now logged so failures are visible, while the
// fire-and-forget contract (no throw, no callback) is preserved.
function query(sql) {
    pool.getConnection(function (err, connection) {
        if (err) {
            console.log('query: could not get connection:', err);
            return;
        }
        //console.log(sql);
        connection.query(sql, function (err, results) {
            // Release first so the connection always returns to the pool.
            connection.release();
            if (err) {
                console.log('query: statement failed:', err);
            }
        });
    });
}

// Fetch the next batch of stale domains (not updated in 30 days) and push
// them onto the work queue. Retries on DB errors; exits the process when
// no stale domains remain.
function crawlQueue() {
    pool.getConnection(function (err, connection) {
        if (!err) {
            // BUG FIX: the original SQL was missing the WHERE keyword
            // ("FROM domain last_update < ..."), which is a syntax error.
            // 2592000 seconds = 30 days.
            var sql = "SELECT * FROM domain WHERE last_update < (UNIX_TIMESTAMP() - 2592000) LIMIT 500";
            connection.query(sql, function (err, results) {
                if (!err) {
                    if (results.length) {
                        for (var i = 0, len = results.length; i < len; ++i) {
                            queue.push({"id": results[i]['id'], "domain": results[i]['domain'] });
                        }
                    } else {
                        // Nothing left to crawl: we're done.
                        process.exit();
                    }
                    connection.release();
                } else {
                    // Query failed: return the connection and retry the batch.
                    connection.release();
                    setImmediate(function () {
                        crawlQueue()
                    });
                }
            });
        } else {
            // Could not get a connection: retry on the next tick.
            setImmediate(function () {
                crawlQueue()
            });
        }
    });
}
// Kick off the first crawl batch on the next turn of the event loop.
setImmediate(crawlQueue);

And the system limits are pretty high.

    Limit                     Soft Limit           Hard Limit           Units
    Max cpu time              unlimited            unlimited            seconds
    Max file size             unlimited            unlimited            bytes
    Max data size             unlimited            unlimited            bytes
    Max stack size            8388608              unlimited            bytes
    Max core file size        0                    unlimited            bytes
    Max resident set          unlimited            unlimited            bytes
    Max processes             257645               257645               processes
    Max open files            500000               500000               files
    Max locked memory         65536                65536                bytes
    Max address space         unlimited            unlimited            bytes
    Max file locks            unlimited            unlimited            locks
    Max pending signals       257645               257645               signals
    Max msgqueue size         819200               819200               bytes
    Max nice priority         0                    0
    Max realtime priority     0                    0
    Max realtime timeout      unlimited            unlimited            us

sysctl

net.ipv4.ip_local_port_range = 10000    61000
Scrimmage answered 20/6, 2014 at 5:34 Comment(4)
why is pool (on the request) set twice?Voroshilovsk
It is to disable the pool. I still get the errors, with or without the pool and the maxSockets.Scrimmage
were you able to find the cause?Lamination
See my answer here for a solution: #35387764Hyphenate
T
23

By default, Node has 4 workers to resolve DNS queries. If your DNS query takes long-ish time, requests will block on the DNS phase, and the symptom is exactly ESOCKETTIMEDOUT or ETIMEDOUT.

Try increasing your uv thread pool size:

export UV_THREADPOOL_SIZE=128
node ...

or in index.js (or wherever your entry point is):

#!/usr/bin/env node
process.env.UV_THREADPOOL_SIZE = 128;

function main() {
   ...
}

Edit: I also wrote a blog post about it.

Tung answered 21/6, 2016 at 13:49 Comment(2)
I have the same problem, but this workaround isn't working for me. Any idea?Randle
that means either the problem is elsewhere, or you are exhausting your 128 threads. In the latter case, you will need to do DNS resolution natively -- by avoiding getaddrinfo(3).Spermatium
G
10

10/31/2017 The final solution we found is to use the keepAlive option in an agent. For example:

var pool = new https.Agent({ keepAlive: true });

// Build request options for a JSON GET against `_url`, reusing the
// shared keep-alive agent so connections are recycled between calls.
function getJsonOptions(_url) {
    var options = {
        url: _url,
        method: 'GET',
        agent: pool,
        json: true
    };
    return options;
}

Node's default pool seems to default to keepAlive=false which causes a new connection being created on each request. When too many connections are created in a short period of time, the above error would surface. My guess is that one or more routers along the path to the service blocks the connection request, probably in suspicion of a Denial-of-Service attack. In any case, the code sample above completely solved our problem.

7/16/2021 There is an easier solution to this problem:

    var http = require('http');
    http.globalAgent.keepAlive = true;

    var https = require('https');
    https.globalAgent.keepAlive = true;

I verified in code that global agents' keepAlive were set to false despite documentation saying that the default should be true.

Gossip answered 8/5, 2017 at 21:51 Comment(0)
D
2

TL:DR; Configure a wrapper of request with your settings only one time before your loop, and use forever: true setting.

const customRequest = request.defaults({ forever: true }); // wrapper
customRequest({ uri: uri });

Detailed answer;
I had the exact same issue while doing requests inside a loop. However, none of the above worked for me.

In order to stop getting ETIMEDOUT & ESOCKETTIMEDOUT on some requests after a certain amount of time, do the following:

  1. Do not configure request settings on each request in the loop. Instead, create a request wrapper with your settings only one time before the loop. As the request documentation states:

Note that if you are sending multiple requests in a loop and creating multiple new pool objects, maxSockets will not work as intended. To work around this, either use request.defaults with your pool options or create the pool object with the maxSockets property outside of the loop.

  2. However, even when implementing that, and configuring a pool with { maxSockets: Infinity }, I was still facing the error. The only configuration that solved my issue was forever: true. Which will keep the established connections alive.

Therefore, at the end my code was something like this:

const request = require('request');
// Custom wrapper
const customRequest = request.defaults({
   forever: true,
   timeout: 20000,
   encoding: null
})

loop(urls, (url) => { // for each of my urls
   customRequest({uri: url}, (err, res, body) => { 
      console.log("done"); 
   });
});

Using this strategy I was able to do about 400K requests at a rate of 25 requests per second without having any issues during the process (Ubuntu 18.04 VM, 4GB RAM, with default UV_THREADPOOL_SIZE).

Disconformity answered 30/7, 2020 at 19:17 Comment(0)
S
-1

In the request tool (https://github.com/request/request)

The http connection keep-alive is turned off by default.

You need to set option.forever = true to open this feature.

Stocky answered 10/3, 2019 at 1:48 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.