You can easily do all of this using Pawpaw:
Code:
import sys

# Block the C accelerator module so that ElementTree falls back to its
# pure-Python implementation. This line must run before the first import of
# xml.etree.ElementTree, otherwise it has no effect.
# NOTE(review): presumably needed so Pawpaw's XmlParser can hook into the
# parser -- confirm against the Pawpaw documentation.
sys.modules['_elementtree'] = None
import xml.etree.ElementTree as ET

from pawpaw import Ito, visualization, xml

text = """<xml>
The captial of <place pid="1">South Africa</place> is <place>Pretoria</place>.
</xml>"""

# Parse with Pawpaw's parser instead of the default one; the code below relies
# on the `.ito` attribute that is then available on each element.
root = ET.fromstring(text, parser=xml.XmlParser())

print('1. ET elements:\n')
# A path ending in '/' selects all descendant elements (the root is excluded).
elements = root.findall('.//')
print(elements)
print()

print('2. Full plain text of document between start and end tags:\n')
start_tag = root.ito.find('*[d:start_tag]')
end_tag = root.ito.find('*[d:end_tag]')
# Span of the original string from the end of the root's start tag up to the
# beginning of its end tag.
ito = Ito(text, start_tag.stop, end_tag.start)
print(f'{ito:%substr!r}')
print()

print('3. Character offsets of plain text of each element:\n')
for e in elements:
    # The 'text' child segment of the element carries its character span.
    plain_text = e.ito.find('*[d:text]')
    print(f'{plain_text:%span: "%substr"}')
print()
Output:
1. ET elements:
[<Element 'place' at 0x1b0ffx203a0>, <Element 'place' at 0x1b0ffx21240>]
2. Full plain text of document between start and end tags:
'\nThe captial of <place pid="1">South Africa</place> is <place>Pretoria</place>.\n'
3. Character offsets of plain text of each element:
(36, 48) "South Africa"
(67, 75) "Pretoria"
Bonus: using Pawpaw, you can get the character offsets of any XML segment, such as:
- elements
- attributes
- namespaces
- tags
- etc.
Example:
# Render the parsed document as a tree; each row shows a segment's span,
# descriptor and substring.
tree_dump = visualization.pepo.Tree().dumps(root.ito)
print(tree_dump)
Output:
(0, 91) 'element' : '<xml>\nThe captial o…ia</place>.\n</xml>'
├──(0, 5) 'start_tag' : '<xml>'
│  └──(1, 4) 'tag' : 'xml'
│     └──(1, 4) 'name' : 'xml'
├──(5, 21) 'text' : '\nThe captial of '
├──(21, 56) 'element' : '<place pid="1">South Africa</place>'
│  ├──(21, 36) 'start_tag' : '<place pid="1">'
│  │  ├──(22, 27) 'tag' : 'place'
│  │  │  └──(22, 27) 'name' : 'place'
│  │  └──(28, 35) 'attributes' : 'pid="1"'
│  │     └──(28, 35) 'attribute' : 'pid="1"'
│  │        ├──(28, 31) 'tag' : 'pid'
│  │        │  └──(28, 31) 'name' : 'pid'
│  │        └──(33, 34) 'value' : '1'
│  ├──(36, 48) 'text' : 'South Africa'
│  └──(48, 56) 'end_tag' : '</place>'
│     └──(50, 55) 'tag' : 'place'
│        └──(50, 55) 'name' : 'place'
├──(56, 60) 'text' : ' is '
├──(60, 83) 'element' : '<place>Pretoria</place>'
│  ├──(60, 67) 'start_tag' : '<place>'
│  │  └──(61, 66) 'tag' : 'place'
│  │     └──(61, 66) 'name' : 'place'
│  ├──(67, 75) 'text' : 'Pretoria'
│  └──(75, 83) 'end_tag' : '</place>'
│     └──(77, 82) 'tag' : 'place'
│        └──(77, 82) 'name' : 'place'
├──(83, 85) 'text' : '.\n'
└──(85, 91) 'end_tag' : '</xml>'
   └──(87, 90) 'tag' : 'xml'
      └──(87, 90) 'name' : 'xml'