I have been trying to normalize
a very nested json file I will later analyze. What I am struggling with is how to go more than one level deep to normalize.
I went through the pandas.io.json.json_normalize documentation, since it does exactly what I want it to do.
I have been able to normalize part of it and now understand how dictionaries work, but I am still not there.
With below code I am able to get only the first level.
import json
import pandas as pd
from pandas.io.json import json_normalize
with open('authors_sample.json') as f:
d = json.load(f)
raw = json_normalize(d['hits']['hits'])
authors = json_normalize(data=d['hits']['hits'],
record_path='_source',
meta=['_id',
['_source', 'journal'],
['_source', 'title'],
['_source', 'normalized_venue_name']
])
I am trying to 'dig' into the 'authors' dictionary with below code, but the record_path = ['_source', 'authors']
throws me TypeError: string indices must be integers
. As far as I understand json_normalize
the logic should be good, but I still don't quite understand how to dive into a json with dict
vs list
.
I even went through this simple example.
authors = json_normalize(data=d['hits']['hits'],
record_path=['_source', 'authors'],
meta=['_id',
['_source', 'journal'],
['_source', 'title'],
['_source', 'normalized_venue_name']
])
Below is a chunk of the json file (5 records).
{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
u'hits': {u'hits': [{u'_id': u'7CB3F2AD',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': None,
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'Physical Review Letters',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'phys rev lett',
u'pages': None,
u'parent_keywords': [u'Chromatography',
u'Quantum mechanics',
u'Particle physics',
u'Quantum field theory',
u'Analytical chemistry',
u'Quantum chromodynamics',
u'Physics',
u'Mass spectrometry',
u'Chemistry'],
u'pub_date': u'1987-03-02 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'mass spectra', u'elementary particles', u'bound states'],
u'title': u'Evidence for a new meson: A quasinuclear NN-bar bound state',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'Physical Review Letters',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7AF8EBC3',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'affiliations': [u'Punjabi University'],
u'author_id': u'780E3459',
u'author_name': u'munish puri'},
{u'affiliations': [u'Punjabi University'],
u'author_id': u'48D92C79',
u'author_name': u'rajesh dhaliwal'},
{u'affiliations': [u'Punjabi University'],
u'author_id': u'7D9BD37C',
u'author_name': u'r s singh'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'Journal of Industrial Microbiology & Biotechnology',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'j ind microbiol biotechnol',
u'pages': None,
u'parent_keywords': [u'Nuclear medicine',
u'Psychology',
u'Hydrology',
u'Chromatography',
u'X-ray crystallography',
u'Nuclear fusion',
u'Medicine',
u'Fluid dynamics',
u'Thermodynamics',
u'Physics',
u'Gas chromatography',
u'Radiobiology',
u'Engineering',
u'Organic chemistry',
u'High-performance liquid chromatography',
u'Chemistry',
u'Organic synthesis',
u'Psychotherapist'],
u'pub_date': u'2008-04-04 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'flow rate',
u'operant conditioning',
u'packed bed reactor',
u'immobilized enzyme',
u'specific activity'],
u'title': u'Development of a stable continuous flow immobilized enzyme reactor for the hydrolysis of inulin',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'Journal of Industrial Microbiology & Biotechnology',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7521A721',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'author_id': u'7FF872BC',
u'author_name': u'barbara eileen ryan'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'The American Historical Review',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'american historical review',
u'pages': None,
u'parent_keywords': [u'Social science',
u'Politics',
u'Sociology',
u'Law'],
u'pub_date': u'1992-01-01 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'social movements'],
u'title': u"Feminism and the women's movement : dynamics of change in social movement ideology, and activism",
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'The American Historical Review',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7DAEB9A4',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'author_id': u'0299B8E9',
u'author_name': u'fraser j harbutt'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'The American Historical Review',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'american historical review',
u'pages': None,
u'parent_keywords': [u'Superconductivity',
u'Nuclear fusion',
u'Geology',
u'Chemistry',
u'Metallurgy'],
u'pub_date': u'1988-01-01 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'iron'],
u'title': u'The iron curtain : Churchill, America, and the origins of the Cold War',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'The American Historical Review',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7B3236C5',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'author_id': u'7DAB7B72',
u'author_name': u'richard m freeland'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'The American Historical Review',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'american historical review',
u'pages': None,
u'parent_keywords': [u'Political Science', u'Economics'],
u'pub_date': u'1985-01-01 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'foreign policy'],
u'title': u'The Truman Doctrine and the origins of McCarthyism : foreign policy, domestic politics, and internal security, 1946-1948',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'The American Historical Review',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'}],
u'max_score': 1.0,
u'total': 36429433},
u'timed_out': False,
u'took': 170}