For the purposes of unit testing, I want to check that the XML produced for a Word paragraph is what I expect when I parse an HTML paragraph.
How do I extract the XML itself instead of writing to a file, unzipping the file, and re-reading the word/document.xml file it contains?
e.g.
from docx import Document
import bs4
def add_parsed_html_to_paragraph(p, s):
    """Append the inline content of an HTML fragment to ``p`` as runs.

    The direct children of the first ``<p>`` element in ``s`` are walked in
    document order (or the top level of the fragment when ``s`` contains no
    ``<p>`` element). Each text node becomes one plain run; each element
    becomes one run holding the element's text, formatted as subscript for
    ``<sub>`` and superscript for ``<sup>``. Other elements are added as
    unformatted text.

    Args:
        p: Paragraph-like object providing ``add_run(text)``; the returned
            run must expose ``font.subscript`` / ``font.superscript``.
        s: HTML markup to parse.

    Returns:
        None. ``p`` is modified in place.
    """
    # Name the parser explicitly. Without it bs4 picks whichever parser is
    # installed, and parsers disagree on whether bare text is wrapped in
    # <p>, so the result would change from machine to machine.
    soup = bs4.BeautifulSoup(s, 'html.parser')

    container = soup.find('p')
    if container is None:
        # No <p> in the input (html.parser does not invent one): use the
        # fragment's top-level nodes instead of failing on None.children.
        container = soup

    for node in container.children:
        if isinstance(node, bs4.element.Tag):
            run = p.add_run(node.get_text())
            if node.name == 'sub':
                run.font.subscript = True
            elif node.name == 'sup':
                run.font.superscript = True
        elif not isinstance(node, bs4.element.PreformattedString):
            # Ordinary text node. Comments, doctypes, CDATA and similar
            # markup-only nodes are PreformattedString subclasses and carry
            # no visible text, so they are skipped.
            p.add_run(str(node))
# Sample input: plain text with the "2" marked up as a subscript. Note that
# the fragment has no enclosing <p> element of its own.
title = 'A formula: H<sub>2</sub>O.'
# Build a new document and add an empty paragraph for the runs to go into.
document = Document()
p = document.add_paragraph()
# Fill the paragraph from the HTML fragment (modifies p in place).
add_parsed_html_to_paragraph(p, title)
# ... Now I want to check p or document for the correct XML