I also tried to get the HAR file using a proxy such as BrowserMob Proxy.
I did a lot of research because the file I received was always empty.
What I did instead was enable the browser's performance log.
Note that this works only with ChromeDriver.
This is my driver class (in Python):
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium import webdriver
from lib.config import config
class Driver:
    """Headless Chrome wrapper that captures the browser performance log.

    The Chrome options are built once, at class-definition time, and shared
    by every instance. After each successful ``navigate`` call the entries
    returned by ``get_log('performance')`` are available on
    ``self.performance_log``.
    """

    # NOTE(review): `capabilities` is never passed to `webdriver.Chrome`
    # below. This assignment mutates the shared `DesiredCapabilities.CHROME`
    # dict in place; presumably `ChromeOptions()` picks the setting up from
    # there, so it must stay *before* the options object is created.
    # Newer ChromeDriver releases may expect the key 'goog:loggingPrefs'
    # instead of 'loggingPrefs' -- confirm against the driver version in use.
    capabilities = DesiredCapabilities.CHROME
    capabilities['loggingPrefs'] = {'performance': 'ALL'}

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument("--headless")

    # Device emulation is only switched on for mobile environments.
    mobile_emulation = {"deviceName": "Nexus 5"}
    if config.Env().is_mobile():
        chrome_options.add_experimental_option(
            "mobileEmulation", mobile_emulation)

    chrome_options.add_experimental_option(
        'perfLoggingPrefs', {"enablePage": True})

    def __init__(self):
        """Start a Chrome session using the shared class-level options."""
        # Initialise the attribute so it can be read safely before the first
        # call to `navigate` (previously this raised AttributeError).
        self.performance_log = []
        self.instance = webdriver.Chrome(
            executable_path='/usr/local/bin/chromedriver',
            options=self.chrome_options)

    def navigate(self, url):
        """Load *url* and store the resulting performance log.

        Args:
            url: Address to open; must be a ``str``.

        Raises:
            TypeError: If *url* is not a string.
        """
        if not isinstance(url, str):
            raise TypeError("URL must be a string.")
        self.instance.get(url)
        self.performance_log = self.instance.get_log('performance')
The amount of information found in the output is huge, so you'll have to filter the raw data and keep only the network response-received and request-sent objects.
import json
import secrets
def digest_log_data(performance_log, path='data.json'):
    """Dump the raw performance log to a JSON file and read it back.

    Args:
        performance_log: JSON-serialisable log entries to persist.
        path: File to write the raw dump to. Defaults to ``'data.json'``
            in the current working directory; the file is overwritten.

    Returns:
        The data parsed back from the file that was just written.
    """
    # write all raw data in a file
    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(performance_log, outfile)
    # open the file and read it back with encoding='utf-8'
    with open(path, encoding='utf-8') as data_file:
        return json.load(data_file)
def digest_raw_data(data, mongo_object=None):
    """Collect the network request/response events from performance-log entries.

    Each entry's ``'message'`` value is parsed as JSON. Events whose
    ``['message']['method']`` is ``'Network.responseReceived'`` or
    ``'Network.requestWillBeSent'`` are stored in *mongo_object* under a
    freshly generated random hex key; every other event is ignored.

    Args:
        data: Iterable of log entries, each a mapping with a JSON string
            under the ``'message'`` key.
        mongo_object: Dict to fill in place. When omitted, a new dict is
            created for this call (a shared mutable default would leak
            entries from one call into the next).

    Returns:
        The dict that was filled (the same object as *mongo_object* when one
        was supplied).
    """
    if mongo_object is None:
        mongo_object = {}
    wanted_methods = ('Network.responseReceived', 'Network.requestWillBeSent')
    for entry in data:
        data_object = json.loads(entry['message'])
        if data_object['message']['method'] in wanted_methods:
            mongo_object[secrets.token_hex(30)] = data_object
    return mongo_object
We chose to push this data into a MongoDB database, which will be analysed later by an ETL job and pushed into a Redshift database to create statistics.
I hope this is what you are looking for.
The way I'm running the script is:
import codecs
from pprint import pprint
import urllib
from lib import mongo_client
from lib.test_data import test_data as data
from jsonpath_ng.ext import parse
from IPython import embed
from lib.output_data import process_output_data as output_data
from lib.config import config
from lib import driver
browser = driver.Driver()
try:
    # get the list of urls which we need to navigate
    urls = data.url_list()
    for url in urls:
        # announce the URL first so a failing navigation is easy to identify
        print('Visiting ' + url)
        browser.navigate(config.Env().base_url() + url)
        # get performance log
        performance_log = browser.performance_log
        # digest the performance log (kept under its own name so the imported
        # `data` module alias is not shadowed inside the loop)
        log_entries = output_data.digest_log_data(performance_log)
        # initiate an empty dict
        mongo_object = {}
        # prepare the data for the mongo document
        output_data.digest_raw_data(log_entries, mongo_object)
        # load data into the mongo db
        mongo_client.populate_mongo(mongo_object)
finally:
    # always shut the browser down, even if a step above raised
    browser.instance.quit()
My main source was this one, which I've adjusted to my needs:
https://www.reddit.com/r/Python/comments/97m9iq/headless_browsers_export_to_har/
Thanks