Presumably you have your own pool of proxies - what is the best way to rotate them?
First, by blindly picking a random proxy we risk repeating connections from the same proxy multiple times in a row. In addition, most connection-pattern-based blocking looks at the proxy's subnet (the 3rd number of the IP address) rather than the host, so it's best to prevent repeats at the subnet level.
It's also a good idea to track proxy performance, as not all proxies are equal - we want to use our better-performing proxies more often and let dead proxies cool down.
All of this can be done with weighted randomization which is implemented by Python's random.choices()
function:
import random
from time import time
from typing import List, Literal, Optional
class Proxy:
    """Container for a single proxy and its rotation bookkeeping.

    Args:
        ip: IPv4-style address, optionally followed by ``:port``
            (e.g. ``"10.0.121.5:8080"``). The fields are kept as strings
            and are not checked for being numeric.
        type_: proxy kind, ``"datacenter"`` (default) or ``"residential"``.

    Raises:
        ValueError: if the host part of ``ip`` does not consist of exactly
            four dot-separated fields.
    """

    def __init__(
        self,
        ip: str,
        type_: Literal["datacenter", "residential"] = "datacenter",
    ) -> None:
        self.ip: str = ip
        self.type: Literal["datacenter", "residential"] = type_

        # Drop an optional ":port" suffix, then split the address into fields.
        fields = ip.split(":")[0].split(".")
        if len(fields) != 4:
            raise ValueError(
                f"expected an IPv4-style address 'a.b.c.d[:port]', got {ip!r}"
            )
        # The 3rd field identifies the subnet, the 4th the host within it.
        _, _, self.subnet, self.host = fields

        self.status: Literal["alive", "unchecked", "dead"] = "unchecked"
        # Timestamp (seconds, as returned by time()) of the last use;
        # None until the proxy has been handed out at least once.
        self.last_used: Optional[float] = None

    def __repr__(self) -> str:
        return self.ip

    def __str__(self) -> str:
        return self.ip
class Rotator:
    """Weighted random proxy rotator.

    Every call to :meth:`get` weighs each proxy in the pool and draws one
    with ``random.choices``. Proxies on the same subnet as the previous pick
    and proxies marked ``"dead"`` are penalised; unchecked, residential and
    long-unused proxies are favoured.
    """

    def __init__(self, proxies: List["Proxy"]):
        self.proxies = proxies
        # Subnet of the proxy returned by the previous get(); None before
        # the first call.
        self._last_subnet: Optional[str] = None

    def weigh_proxy(self, proxy: "Proxy") -> float:
        """Return the selection weight of ``proxy`` (higher = more likely).

        Starts at 1000 and applies:
          * -500 if the proxy shares the subnet of the previous pick,
          * -500 if its status is "dead", +250 if it is "unchecked",
          * +250 if it is a residential proxy,
          * +1 per second elapsed since the proxy was last used.
        """
        weight = 1_000
        if proxy.subnet == self._last_subnet:
            weight -= 500
        if proxy.status == "dead":
            weight -= 500
        elif proxy.status == "unchecked":
            weight += 250
        if proxy.type == "residential":
            weight += 250
        # A never-used proxy (last_used is None) gets no idle-time bonus.
        if proxy.last_used:
            weight += time() - proxy.last_used
        return weight

    def get(self) -> "Proxy":
        """Pick a proxy by weighted random choice and record its use.

        Updates the chosen proxy's ``last_used`` timestamp and remembers its
        subnet so the next call can penalise proxies on the same subnet.

        Raises:
            IndexError: if the proxy pool is empty.
        """
        # Clamp at zero: random.choices does not support negative weights.
        weights = [max(self.weigh_proxy(p), 0) for p in self.proxies]
        if any(weights):
            proxy = random.choices(self.proxies, weights=weights, k=1)[0]
        else:
            # random.choices rejects an all-zero weight list (ValueError on
            # Python 3.9+), e.g. when every candidate is dead and on the last
            # used subnet; fall back to a uniform pick.
            proxy = random.choice(self.proxies)
        proxy.last_used = time()
        # Must be the same attribute weigh_proxy() reads, otherwise the
        # same-subnet penalty is never applied.
        self._last_subnet = proxy.subnet
        return proxy
If we mock run this Rotator we can see how weighted randoms distribute our connections:
from collections import Counter
if __name__ == "__main__":
    # Demo: run 10,000 simulated scrapes and report how often each proxy
    # was picked and how often it failed.
    proxies = [
        # these will be used more often (residential proxies weigh more)
        Proxy("xx.xx.121.1", "residential"),
        Proxy("xx.xx.121.2", "residential"),
        Proxy("xx.xx.121.3", "residential"),
        # these will be used less often
        Proxy("xx.xx.122.1"),
        Proxy("xx.xx.122.2"),
        Proxy("xx.xx.123.1"),
        Proxy("xx.xx.123.2"),
    ]
    rotator = Rotator(proxies)

    # let's mock some runs:
    _used = Counter()
    _failed = Counter()

    def mock_scrape():
        """Simulate one scrape, retrying with a fresh proxy until one succeeds."""
        # A loop instead of recursion keeps a long failure streak from
        # growing the call stack.
        while True:
            proxy = rotator.get()
            _used[proxy.ip] += 1
            # simulate proxies with .1 being significantly worse
            _fail_rate = 60 if proxy.host == "1" else 20
            # randrange(100) yields 0..99, so _fail_rate is a true percentage
            # (randint(0, 100) has 101 possible outcomes).
            if random.randrange(100) < _fail_rate:  # simulate some failure
                _failed[proxy.ip] += 1
                proxy.status = "dead"
                continue
            proxy.status = "alive"
            return

    for _ in range(10_000):
        mock_scrape()

    for proxy, count in _used.most_common():
        print(f"{proxy} was used {count:>5} times")
        print(f" failed {_failed[proxy]:>5} times")
    # example output from one run (exact counts vary between runs):
    # xx.xx.121.2 was used 2629 times
    # failed 522 times
    # xx.xx.121.3 was used 2603 times
    # failed 508 times
    # xx.xx.123.2 was used 2321 times
    # failed 471 times
    # xx.xx.122.2 was used 2302 times
    # failed 433 times
    # xx.xx.121.1 was used 1941 times
    # failed 1187 times
    # xx.xx.122.1 was used 1629 times
    # failed 937 times
    # xx.xx.123.1 was used 1572 times
    # failed 939 times
By using weighted randoms we can create a connection pattern that appears random but is smart. We can apply generic patterns like not using proxies from the same IP family in a row, as well as custom per-target logic like prioritizing North American IPs for NA targets, etc.
For more on this see my blog How to Rotate Proxies in Web Scraping