Here's a way to do it with PyMC. This method uses a fixed number of components (n_components) in the mixture model. You could attach a prior to n_components and sample over it, or simply pick something reasonable, or use the grid-search technique from my other answer to estimate the number of components. In the code below I used 10000 iterations with a burn-in of 9000, which may not be enough to get good results; I'd suggest a much larger burn-in. I also chose the priors somewhat arbitrarily, so they are worth revisiting. You'll have to experiment with it. Best of luck to you. Here is the code.
import numpy as np
import pymc as mc
import scipy.stats as stats
from matplotlib import pyplot
def generate_random_histogram():
    # Random bin locations between 0 and 100
    bin_locations = np.random.rand(10,) * 100
    bin_locations.sort()
    # Random counts at those locations
    bin_counts = np.random.randint(50, size=len(bin_locations))
    return {'loc': bin_locations, 'count': bin_counts}
def bin_widths(loc):
    # Width of each bin, taken as the gap to the next bin location
    # (the last bin reuses the previous width)
    widths = []
    for i in range(len(loc) - 1):
        widths.append(loc[i + 1] - loc[i])
    widths.append(widths[-1])
    return np.array(widths)
def mixer(name, weights, value=None):
    # Custom stochastic: assigns each data point to one mixture component.
    n = value.shape[0]

    def logp(value, weights):
        # One-hot encode the component assignments and score them against
        # the mixture weights with a multinomial likelihood.
        vals = np.zeros(shape=(n, weights.shape[1]), dtype=int)
        vals[np.arange(n), value.astype(int)] = 1
        return mc.multinomial_like(x=vals, n=1, p=weights)

    def random(weights):
        return np.argmax(np.random.multinomial(n=1, pvals=weights[0, :], size=n), 1)

    result = mc.Stochastic(logp=logp,
                           doc='A mixture component assignment node.',
                           name=name,
                           parents={'weights': weights},
                           random=random,
                           trace=True,
                           value=value,
                           dtype=int,
                           observed=False,
                           cache_depth=2,
                           plot=False,
                           verbose=0)
    return result
def create_model(lowers, uppers, count, n_components):
    n = np.sum(count)
    lower = min(lowers)
    upper = max(uppers)

    # Priors on the component means, precisions, and mixture weights
    locations = mc.Uniform(name='locations', lower=lower, upper=upper,
                           value=np.random.uniform(lower, upper, size=n_components),
                           observed=False)
    precisions = mc.Gamma(name='precisions', alpha=1, beta=1,
                          value=.001*np.ones(n_components), observed=False)
    weights = mc.CompletedDirichlet('weights',
                                    mc.Dirichlet(name='weights_ind', theta=np.ones(n_components)))
    choice = mixer('choice', weights, value=np.ones(n).astype(int))

    @mc.observed
    def histogram_data(value=count, locations=locations, precisions=precisions, weights=weights):
        if hasattr(weights, 'value'):
            weights = weights.value
        # Probability mass the current mixture assigns to each bin
        lower_cdfs = sum([weights[0, i]*stats.norm.cdf(lowers, loc=locations[i], scale=np.sqrt(1.0/precisions[i]))
                          for i in range(weights.shape[1])])
        upper_cdfs = sum([weights[0, i]*stats.norm.cdf(uppers, loc=locations[i], scale=np.sqrt(1.0/precisions[i]))
                          for i in range(weights.shape[1])])
        bin_probs = upper_cdfs - lower_cdfs
        # Extra catch-all bin for the mass falling outside the observed bins
        bin_probs = np.array(list(bin_probs) + [1.0 - np.sum(bin_probs)])
        n = np.sum(count)
        return mc.multinomial_like(x=np.array(list(count) + [0]), n=n, p=bin_probs)

    @mc.deterministic
    def location(locations=locations, choice=choice):
        return locations[choice.astype(int)]

    @mc.deterministic
    def dispersion(precisions=precisions, choice=choice):
        return precisions[choice.astype(int)]

    # Posterior-predictive draws from the fitted mixture
    data_generator = mc.Normal('data', mu=location, tau=dispersion)

    return locals()
# Generate the histogram
hist = generate_random_histogram()
loc = hist['loc']
count = hist['count']
widths = bin_widths(hist['loc'])
# loc gives the bin centers, so the bin edges sit half a width on each side
lowers = loc - widths / 2
uppers = loc + widths / 2
# Create the model
model = create_model(lowers, uppers, count, 5)
# Initialize to the MAP estimate
model = mc.MAP(model)
model.fit(method ='fmin')
# Now sample with MCMC
model = mc.MCMC(model)
model.sample(iter=10000, burn=9000, thin=300)
# Plot the mu and tau traces
mc.Matplot.plot(model.trace('locations'))
pyplot.show()
# Get the samples from the fitted pdf
sample = np.ravel(model.trace('data')[:])
# Plot the original histogram, sampled histogram, and pdf
lower = min(lowers)
upper = max(uppers)
kde = stats.gaussian_kde(sample)
x = np.arange(0,100,.1)
y = kde(x)
fig = pyplot.figure()
ax1 = fig.add_subplot(311)
pyplot.xlim(lower,upper)
ax1.bar(loc, count, width=widths)
ax2 = fig.add_subplot(312, sharex=ax1)
ax2.hist(sample, bins=loc)
ax3 = fig.add_subplot(313, sharex=ax1)
ax3.plot(x, y)
pyplot.show()
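If you'd rather read the fitted density straight off the posterior instead of KDE-smoothing the sampled data, something along these lines should work. This is only a rough sketch: it plugs the posterior means of the traced locations, precisions, and weights into the mixture PDF, which glosses over label switching between components.
# Rough sketch: evaluate the mixture PDF at the posterior means of the traces.
# Ignores label switching between components, so treat it as a quick diagnostic.
loc_post = model.trace('locations')[:].mean(axis=0)
prec_post = model.trace('precisions')[:].mean(axis=0)
w_post = model.trace('weights')[:].mean(axis=0).ravel()
grid = np.arange(0, 100, .1)
mix_pdf = sum(w_post[i] * stats.norm.pdf(grid, loc=loc_post[i], scale=np.sqrt(1.0 / prec_post[i]))
              for i in range(len(w_post)))
pyplot.plot(grid, mix_pdf)
pyplot.show()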
As you can see from the histograms, the two distributions don't look terribly alike; then again, a single histogram isn't much to go on. I would play with different numbers of components and more iterations / burn-in, but it's a project. Depending on your priorities, I suspect either @askewchan's answer or my other answer might serve you better.
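If eyeballing different component counts feels too ad hoc, one option is a small loop over candidate values of n_components, comparing the models by DIC. A minimal sketch, assuming your PyMC 2 version exposes the dic attribute on the sampler (the iteration counts below are placeholders; bump them up for real use):
# Sketch: compare a few component counts by DIC (lower is better).
# Assumes the MCMC sampler exposes a .dic attribute after sampling.
dics = {}
for k in (2, 3, 5, 8):
    m = mc.MCMC(create_model(lowers, uppers, count, k))
    m.sample(iter=10000, burn=9000, thin=10)
    dics[k] = m.dic
best_k = min(dics, key=dics.get)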
Comments:

Synchroscope: bin_locations describes the centers of bins? Otherwise (if they are bin edges) you should have len(bin_counts) = len(bin_locations) - 1.
Mullane: bin_locations are the center locations.
Synchroscope: bin_locations should be the same, or do you just mean range as in the range keyword arg to np.histogram, which indicates just the lower and upper bounds of the domain?
Mullane: The range is the same (0 to 100 in the OP) for every histogram.
Synchroscope: Do you want a continuous PDF (like what gaussian_kde or an interpolation would return)? Or is it ok to have an array-like PDF with values at bin_locations, as long as it is properly normalized on the domain? Or maybe you want something in between, say a standardized set of evenly spaced bins that the values of all histograms are given at (like np.linspace(0,100))?
Mullane: The domain is [0,100]. For it to be a PDF, the integral of the PDF within the range should be 1. Since we are working on the real domain, I should be able to discretize it at, say, regular intervals (e.g. using np.linspace).
Mullane: If [a,b] is too difficult, we can relax the constraint to (-Inf,Inf).
Synchroscope: Must the bin_locations have uniform binsize, or could they be something like [.1, .2, 5, 50, 90]?
Mullane: I can make the bin_size much smaller if that's causing a problem. Technically speaking, to make the problem well-defined, the histogram counts must correspond to intervals (otherwise it doesn't make sense to talk about PDFs). This data is fake after all, but each count must correspond to a well-defined interval.
Mullane: I tagged the question pymc, since I thought pymc could help with it.
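On the discretization point from the comments: once you have the samples (or the KDE built from them), you can evaluate the density on a regular grid over [0, 100] and renormalize so it integrates to 1. A small sketch using the kde fitted above:
# Sketch: array-like PDF on a regular grid, renormalized to integrate to 1 on [0, 100]
grid = np.linspace(0, 100, 1001)
pdf_grid = kde(grid)
pdf_grid /= np.trapz(pdf_grid, grid)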