This task puzzled me for a long time. Besides extracting the font information, I also wanted to run everything from within a Python script.
However, today I was able to solve it. Below is a script that calls the pdf2txt.py
script from the command line and then extracts the font information from the parsed PDF (the newly created HTML file).
import os
import subprocess
import sys

# Placeholder locations -- replace them with the real paths on your machine.
pathToScript = r'path\to\script\pdf2txt.py'
pathToFolder = r'path\to\file'
pathPDFinput = os.path.join(pathToFolder, 'test.pdf')
pathHTMLoutput = os.path.join(pathToFolder, 'test.html')

# Call pdf2txt.py from the command line to convert the PDF into an HTML file.
# The arguments are passed as a list (no shell), so paths containing spaces
# need no quoting. All options come before the input file so that both
# getopt-based and argparse-based versions of pdf2txt.py parse them.
# check=True raises CalledProcessError if the conversion fails instead of
# silently continuing without an HTML file.
subprocess.run(
    [
        sys.executable,  # run the script with the current interpreter
        pathToScript,
        '-o', pathHTMLoutput,
        '-S',
        '-t', 'html',
        pathPDFinput,
    ],
    check=True,
)
Extract the font size and font family for every span tag in the HTML file:
# credits to akash karothiya:
# https://mcmap.net/q/324652/-need-to-extract-all-the-font-sizes-and-the-text-using-beautifulsoup/39015419#39015419
import re
import pandas as pd
from bs4 import BeautifulSoup
# Read and parse the HTML file; the context manager closes the file handle,
# and naming the parser explicitly avoids bs4's "no parser specified" warning.
with open(pathHTMLoutput, 'r') as html:
    soup = BeautifulSoup(html, 'html.parser')

# Compile the patterns once instead of on every loop iteration.
font_size_re = re.compile(r'(?is)font-size:(.*?)px')
# Accept a font-family declaration with or without a trailing semicolon.
font_family_re = re.compile(r'(?is)font-family:(.*?)(?:;|$)')

output = []
for span in soup.find_all('span'):
    # Look only at the style attribute, so a span that merely mentions
    # "font-size" in its text is not mistaken for a styled span.
    style = span.get('style') or ''
    size_match = font_size_re.search(style)
    if size_match is None:
        continue
    fonts_size = size_match.group(1).strip()

    # Split the font family into fonts-type and fonts-style on the first
    # comma; either part is None when it is not present in the style.
    family_match = font_family_re.search(style)
    if family_match is None:
        fonts_type = None
        fonts_style = None
    else:
        parts = family_match.group(1).strip().split(',')
        fonts_type = parts[0]
        fonts_style = parts[1] if len(parts) > 1 else None

    output.append((span.text.strip(), fonts_size, fonts_type, fonts_style))

# create dataframe: one row per styled span
df = pd.DataFrame(output, columns=['text', 'fonts-size', 'fonts-type', 'fonts-style'])