How to compute standard deviation in Apache Beam

I'm new to Apache Beam, and I want to calculate the mean and standard deviation over a large dataset.

Given a .csv file whose rows have the form "A,B", where A and B are ints, this is basically what I have:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.textio import ReadFromText

class Split(beam.DoFn):
    """Parse one "A,B" text row into two keyed integer values."""

    def process(self, element):
        """Split a CSV line and emit its two fields under keys 'A' and 'B'.

        Args:
            element: one line of text of the form "A,B".

        Returns:
            A list ``[('A', a), ('B', b)]`` with both values as ints.

        Raises:
            ValueError: if the line does not have exactly two comma-separated
                fields, or a field is not an integer literal.
        """
        a_text, b_text = element.split(',')
        # The fields arrive as text; convert them so that numeric combiners
        # applied downstream receive numbers rather than strings.
        return [('A', int(a_text)), ('B', int(b_text))]

# Pipeline that reads 'data.csv', computes a per-key mean, and is meant to
# write a per-key standard deviation to 'std.out'.
with beam.Pipeline(options=PipelineOptions()) as p:
     # Read the file as lines of text and pass each line through the Split
     # DoFn to get the keyed values.
     rows = (p
             | ReadFromText('data.csv')
             | beam.ParDo(Split()))

     # Combine the values of each key with MeanCombineFn.
     avgs = (rows
             | beam.CombinePerKey(
                 beam.combiners.MeanCombineFn()))

     # calculate the stdv per key
     # TODO: this step is missing -- nothing in this snippet assigns `std`.

     # NOTE(review): `std` is undefined at this point, and every other step
     # above applies its transform with `|` rather than `>>` -- confirm the
     # intended operator once `std` exists.
     std >> beam.io.WriteToText('std.out')

I'd like to do something like:

class SquaredDiff(beam.DoFn):
    """Sketch of a DoFn that emits each value's difference from an average.

    NOTE(review): this illustrates intent rather than working code. `avgs`
    is neither a parameter nor an attribute; it is looked up as a free name.
    Despite the class name, the differences are not squared here.
    """
    def process(self, element):
        # Takes the second item of the first two entries of `element`.
        # NOTE(review): this assumes `element` carries the A and B pairs
        # together -- confirm against what the upstream step actually emits.
        A = element[0][1]
        B = element[1][1]
        # Emit the plain differences from avgs[0] and avgs[1], keyed 'A'/'B'.
        return [('A', A - avgs[0]), ('B', B - avgs[1])]

# Run every row through SquaredDiff, then take the per-key mean of what it
# emits and bind the result to `stdv`.
# NOTE(review): a mean of squared differences would be the variance; a square
# root would still be needed to obtain the standard deviation.
stdv = (
    rows
    | beam.ParDo(SquaredDiff())
    | beam.CombinePerKey(beam.combiners.MeanCombineFn())
)

or something, but I can't figure out how.

class MeanStddev(beam.CombineFn):
    """Combine numeric inputs into their count, mean, variance and stddev.

    The accumulator is a ``(sum, sum_of_squares, count)`` tuple, so all
    statistics come out of a single pass and partial accumulators merge by
    element-wise addition. The variance is the population variance,
    computed as ``E[x^2] - E[x]^2``.
    """

    def create_accumulator(self):
        """Return the empty accumulator ``(sum, sum_of_squares, count)``."""
        return (0.0, 0.0, 0)

    def add_input(self, sum_count, input):
        """Fold one numeric ``input`` into the accumulator ``sum_count``."""
        total, total_sq, count = sum_count
        return total + input, total_sq + input * input, count + 1

    def merge_accumulators(self, accumulators):
        """Add up partial accumulators element-wise.

        A single loop is used so that an empty or one-shot iterable of
        accumulators is handled as well as a list.
        """
        total, total_sq, count = 0.0, 0.0, 0
        for part_total, part_sq, part_count in accumulators:
            total += part_total
            total_sq += part_sq
            count += part_count
        return total, total_sq, count

    def extract_output(self, sum_count):
        """Turn the final accumulator into a dict of statistics.

        Returns:
            A dict with keys 'mean', 'variance', 'stddev' and 'count'. When
            no input was seen, the three statistics are NaN and 'count' is 0.
        """
        total, total_sq, count = sum_count
        if not count:
            nan = float('NaN')
            return {'mean': nan, 'variance': nan, 'stddev': nan, 'count': 0}

        mean = total / count
        # Rounding can push E[x^2] - E[x]^2 slightly below zero; clamp it so
        # the reported variance and stddev stay consistent with each other.
        variance = max(total_sq / count - mean * mean, 0.0)
        return {
            'mean': mean,
            'variance': variance,
            # Plain exponentiation avoids needing a numeric library here.
            'stddev': variance ** 0.5,
            'count': count,
        }

Recommended topics

Hot tags