It was asked in relation to another question recently: Given an unknown length list, return a random item in it by scanning it only 1 time
I know you shouldn't, I just can't put my finger on a canonical explanation of why not.
Look at the example code:
import random, sys
def rnd(): # a function that returns a random number each call
return int(random.getrandbits(32))
class fixed: # a functor that returns the same random number each call
def __init__(self):
self._ret = rnd()
def __call__(self):
return self._ret
def sample(rnd,seq_size):
choice = 0
for item in xrange(1,seq_size):
if (rnd() % (item+1)) == 0:
choice = item
return choice
dist = [0 for i in xrange(500)]
for i in xrange(1000):
dist[sample(rnd,len(dist))] += 1
print "real",dist
print
dist = [0 for i in xrange(500)]
for i in xrange(1000):
dist[sample(fixed(),len(dist))] += 1
print "reuse",dist
The choices for the proper reservoir sampling that generates a new random number per item is nicely evenly distributed as it should be:
real [1, 3, 0, 1, 2, 3, 2, 3, 1, 2, 2, 2, 2, 0, 0, 1, 3, 3, 4, 0, 2, 1, 2, 1, 1, 4, 0, 3, 1, 1, 2, 0, 0, 0, 1, 4, 6, 2, 3, 1, 1, 3, 2, 1, 3, 3, 1, 4, 1, 1, 2, 2, 5, 1, 2, 1, 0, 3, 1, 0, 2, 6, 1, 2, 2, 1, 1, 1, 1, 3, 2, 1, 5, 4, 0, 3, 3, 4, 0, 0, 2, 1, 3, 2, 3, 0, 2, 4, 6, 3, 0, 1, 3, 0, 2, 2, 4, 3, 2, 1, 2, 1, 2, 2, 1, 4, 2, 0, 0, 1, 1, 0, 1, 4, 2, 2, 2, 1, 0, 3, 1, 2, 1, 0, 2, 2, 1, 5, 1, 5, 3, 3, 1, 0, 2, 2, 0, 3, 2, 3, 0, 1, 1, 3, 0, 1, 2, 2, 0, 1, 2, 2, 3, 2, 3, 1, 1, 0, 1, 2, 2, 2, 2, 2, 3, 2, 1, 2, 2, 2, 1, 3, 3, 1, 0, 1, 1, 0, 1, 3, 2, 1, 4, 3, 4, 1, 1, 1, 2, 1, 2, 0, 0, 0, 1, 1, 2, 6, 0, 1, 1, 0, 1, 0, 1, 2, 2, 3, 0, 1, 2, 2, 1, 0, 4, 2, 1, 2, 2, 0, 4, 4, 0, 3, 2, 2, 1, 2, 4, 1, 2, 1, 0, 2, 1, 1, 5, 1, 2, 2, 3, 2, 3, 0, 1, 2, 3, 2, 5, 2, 3, 0, 1, 1, 1, 1, 3, 4, 2, 4, 1, 2, 3, 2, 5, 2, 1, 0, 1, 1, 2, 2, 3, 1, 1, 1, 2, 1, 2, 0, 4, 1, 1, 2, 3, 4, 3, 1, 2, 3, 3, 3, 2, 1, 2, 0, 0, 4, 3, 2, 2, 5, 5, 3, 3, 3, 1, 0, 1, 3, 1, 1, 2, 4, 3, 1, 4, 4, 2, 5, 0, 5, 4, 2, 1, 0, 4, 1, 3, 3, 2, 4, 2, 3, 3, 1, 3, 3, 4, 2, 2, 1, 1, 1, 1, 3, 3, 5, 3, 2, 4, 0, 1, 3, 2, 2, 4, 2, 2, 3, 4, 5, 3, 2, 1, 2, 3, 2, 2, 2, 4, 4, 0, 1, 3, 3, 3, 4, 1, 2, 4, 0, 4, 0, 3, 2, 1, 1, 4, 2, 1, 0, 0, 0, 4, 2, 2, 1, 4, 3, 1, 1, 3, 2, 4, 3, 4, 2, 1, 1, 2, 2, 3, 3, 1, 2, 2, 1, 1, 2, 3, 1, 9, 1, 3, 4, 2, 4, 4, 0, 1, 0, 1, 0, 2, 1, 0, 1, 2, 3, 3, 6, 2, 2, 1, 2, 4, 3, 3, 3, 2, 1, 2, 1, 2, 8, 2, 3, 1, 5, 3, 0, 2, 1, 1, 4, 2, 2, 1, 2, 3, 2, 1, 0, 4, 3, 4, 3, 1, 3, 2, 3, 2, 2, 1, 0, 1, 2, 5, 3, 0, 3, 1, 2, 2, 2, 1, 0, 1, 4]
Whereas when you re-use the same random number for all items, you get a distribution skewed to the very low numbers:
reuse [92, 50, 34, 19, 23, 16, 13, 9, 9, 9, 11, 10, 6, 7, 8, 5, 5, 6, 4, 2, 2, 3, 2, 3, 3, 6, 6, 1, 4, 3, 5, 2, 2, 1, 1, 2, 3, 4, 3, 4, 1, 3, 1, 0, 0, 1, 5, 3, 1, 2, 0, 2, 0, 1, 1, 6, 2, 0, 2, 2, 4, 2, 2, 0, 2, 2, 2, 0, 3, 0, 4, 1, 2, 1, 4, 2, 2, 0, 1, 0, 1, 1, 0, 0, 0, 2, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 2, 1, 3, 1, 0, 1, 2, 0, 4, 3, 0, 0, 2, 0, 0, 1, 0, 0, 2, 0, 2, 1, 0, 1, 0, 0, 1, 1, 3, 0, 1, 1, 0, 2, 0, 1, 2, 0, 1, 1, 4, 1, 1, 1, 2, 1, 0, 1, 2, 0, 2, 1, 1, 2, 0, 1, 1, 0, 2, 0, 2, 0, 0, 2, 0, 1, 0, 2, 1, 1, 0, 0, 1, 2, 4, 1, 0, 2, 0, 1, 2, 1, 3, 0, 1, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 3, 2, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 4, 1, 0, 2, 1, 0, 0, 2, 1, 1, 3, 3, 2, 0, 1, 0, 2, 0, 1, 1, 0, 0, 3, 1, 0, 0, 1, 0, 3, 2, 2, 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 0, 4, 1, 0, 0, 2, 0, 1, 1, 0, 0, 3, 1, 3, 2, 2, 1, 3, 1, 2, 0, 1, 1, 3, 0, 3, 1, 2, 0, 2, 0, 2, 0, 3, 0, 3, 0, 3, 1, 0, 2, 3, 1, 1, 0, 1, 3, 3, 1, 1, 1, 0, 2, 1, 1, 4, 1, 1, 1, 2, 0, 3, 1, 1, 0, 4, 1, 1, 0, 1, 3, 1, 0, 1, 1, 0, 3, 3, 0, 2, 4, 0, 1, 2, 1, 6, 1, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 4, 2, 0, 1, 2, 0, 1, 4, 1, 2, 0, 5, 2, 2, 0, 6, 2, 2, 1, 3, 0, 3, 1, 1, 0, 3, 1, 4, 2, 0, 1, 0, 1, 2, 3, 1, 1, 3, 0, 0, 0, 1, 1, 4, 3, 3, 0, 0, 1, 0, 1, 1, 2, 1, 0, 2, 1, 4, 5, 1, 1, 3, 0, 1, 1, 1, 3, 1, 1, 0, 3, 3, 1, 3, 0, 1, 0, 0, 1, 1, 3, 2, 1, 0, 3, 1, 1, 3, 1, 3, 1, 2, 2, 2, 0, 0, 5, 1, 3, 0, 1, 4, 1, 1, 1, 3, 2, 1, 3, 2, 1, 3, 1, 2, 2, 3, 2, 2, 1, 0, 3, 3, 1, 3, 3, 3, 2, 1, 2, 3, 3, 3, 1, 2, 2, 2, 4, 2, 1, 5, 2, 2, 0]
What's the maths here? Why can't you re-use the same random number?