import pandas as pd
data1 = [
{'File': '7396ee8aea09.json', 'Last': '2019-04-25 20:24:28+00:00', 'Size': 78796, 'COLA': 'STANDARD'},
{'File': '7777682e4/ip/done', 'Last': '2019-04-24 21:45:01+00:00', 'Size': 4, 'COLA': 'CUSTOM'},
{'File': 'erparameters.json', 'Last': '2019-04-24 21:45:01+00:00', 'Size': 351, 'COLA': 'STANDARD'}
]
data2 = [
{'File': '3343e4/ip/ip.json', 'Last': '2019-04-24 21:45:01+00:00', 'Size': 20, 'COLA': 'STANDARD'},
{'File': 'ta-00000-of-00001', 'Last': '2019-04-24 22:43:17+00:00', 'Size': 140216236, 'COLA': 'STANDARD'},
{'File': '-20514.ckpt.index', 'Last': '2019-04-24 22:43:17+00:00', 'Size': 3168, 'COLA': 'STANDARD'},
]
# right now 2 for the prova but could be much more..
datas = [data1, data2]
results = None
for data in datas:
df = pd.DataFrame(data, columns=['COLA','Size','Last'])\
.groupby(['COLA'])\
.agg({
'COLA': [('COLA', 'max'), ('Count', 'count')],
'Size': [('Size', 'sum')],
'Last': [('Last', 'max')]
})
df.columns = ['COLA', 'Count', 'Size', 'Last']
if results is None:
results = df
else:
results = results.merge(df,
on=['COLA', 'Count', 'Size', 'Last'],
how='outer',
)\
.groupby(['COLA'], as_index=False)\
.agg({
'COLA': [('COLA', 'max'), ('Count', 'sum')],
'Size': [('Size', 'sum')],
'Last': [('Last', 'max')]
})
print (results)
expected results is in that format:
- COLA have all unique COLA labels
- Size is the sum of all 'Size' for that COLA label group across all sets.
- Count is the total count of that COLA label group across all sets.
- Last has the greatest date of that COLA label group across all sets.
Example:
results = [
{'COLA': 'STANDARD', 'Size': 140298571, 'Count': 5, 'Last': '2019-04-25 20:24:28+00:00'},
{'COLA': 'CUSTOM', 'Size': 4, 'Count': 1, 'Last': '2019-04-24 21:45:01+00:00'}
]