This question makes the same point as the link I posted before:
(Is there a good way to avoid memory deep copy or to reduce time spent in multiprocessing?)
I'm getting nowhere with it because I ran into the problem of sharing a 'DataFrame' object between processes.
I have simplified the sample code below.
If anyone can revise my code so that the DataFrame is shared between processes without Manager.list, Manager.dict, or numpy sharedmem, I would greatly appreciate it.
Here's the code.
# -*- coding: UTF-8 -*-
import ctypes
import multiprocessing.sharedctypes as sharedctypes
import numpy as np
import pandas as pd
from multiprocessing import Process

def add_new_derived_column(shared_df_obj):
    # mutate the DataFrame held inside the RawValue
    # (note operator precedence: this computes A + (B / 2))
    shared_df_obj.value['new_column'] = shared_df_obj.value['A'] + shared_df_obj.value['B'] / 2
    print shared_df_obj.value.head()
'''
"new_column" Generated!!!
          A         B  new_column
0 -0.545815 -0.179209   -0.635419
1  0.654273 -2.015285   -0.353370
2  0.865932 -0.943028    0.394418
3 -0.850136  0.464778   -0.617747
4 -1.077967 -1.127802   -1.641868
'''
if __name__ == "__main__":
    dataframe = pd.DataFrame(np.random.randn(100000, 2), columns=['A', 'B'])
    # to share the DataFrame object, I use sharedctypes.RawValue
    shared_df_obj = sharedctypes.RawValue(ctypes.py_object, dataframe)
    # then I pass shared_df_obj to a multiprocessing.Process object
    process = Process(target=add_new_derived_column, args=(shared_df_obj,))
    process.start()
    process.join()
    print shared_df_obj.value.head()
'''
"new_column" disappeared;
the DataFrame object isn't shared.
          A         B
0 -0.545815 -0.179209
1  0.654273 -2.015285
2  0.865932 -0.943028
3 -0.850136  0.464778
4 -1.077967 -1.127802
'''
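As far as I can tell, sharedctypes.RawValue(ctypes.py_object, dataframe) only puts the PyObject pointer into shared memory; the DataFrame itself still lives in ordinary process memory, which is duplicated copy-on-write when the child forks, so the child mutates its own copy. The closest workaround I know is to share the underlying buffer rather than the DataFrame. Here is a minimal sketch, assuming a single fixed-size float64 column and a hypothetical helper double_column; it backs the column with multiprocessing.sharedctypes.RawArray, which is admittedly close in spirit to the numpy sharedmem approach I wanted to avoid:

import ctypes
import numpy as np
import pandas as pd
from multiprocessing import Process
from multiprocessing.sharedctypes import RawArray

def double_column(raw, n):
    # wrap the shared buffer as a numpy array (no copy) and mutate in place
    col = np.frombuffer(raw, dtype=np.float64, count=n)
    col *= 2  # writes land in shared memory, so the parent sees them

if __name__ == '__main__':
    n = 100000
    raw_a = RawArray(ctypes.c_double, n)           # shared, process-wide buffer
    a = np.frombuffer(raw_a, dtype=np.float64, count=n)
    a[:] = np.random.randn(n)                      # fill in the parent
    p = Process(target=double_column, args=(raw_a, n))
    p.start()
    p.join()
    # building the DataFrame copies the data; the sharing happens at the
    # numpy-buffer level, not at the DataFrame level
    print(pd.DataFrame({'A': a}).head())

The catch is that pandas never shares its internal blocks across processes here; each process only sees the shared numpy buffer, and any DataFrame built on top of it is local.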
Queue – the problem with that is (for large frames) the overhead associated with moving the frame through the pipe destroys any benefit from processing it in parallel (depending on what it is that you're doing exactly). – Binucleate
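For completeness, here is a rough sketch of the Queue approach that comment refers to; the names q_in and q_out are made up for illustration. It produces the right result, but the whole frame is pickled through the pipe twice, which is exactly the overhead the comment warns about:

import numpy as np
import pandas as pd
from multiprocessing import Process, Queue

def add_new_derived_column(q_in, q_out):
    df = q_in.get()                       # unpickle the whole frame
    df['new_column'] = df['A'] + df['B'] / 2
    q_out.put(df)                         # pickle the whole frame again

if __name__ == '__main__':
    dataframe = pd.DataFrame(np.random.randn(100000, 2), columns=['A', 'B'])
    q_in, q_out = Queue(), Queue()
    p = Process(target=add_new_derived_column, args=(q_in, q_out))
    p.start()
    q_in.put(dataframe)
    dataframe = q_out.get()               # get before join to avoid deadlock on a full pipe
    p.join()
    print(dataframe.head())               # 'new_column' is present, at the cost of two copies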