BS4 replace_with result is no longer in tree

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import re


def test1():
    """Reproduce the problem: replace_with() is called twice on one text node.

    Every text node that contains "identify" (case-insensitive) and is not
    already inside an <a> tag is lowercased, the keyword is turned into a
    link, and the node is replaced with the re-parsed markup -- twice.

    Returns the soup.
    """
    html = \
    ''' Identify '''
    soup = BeautifulSoup(html, features="html.parser")
    for txt in soup.findAll(text=True):
        # Skip text without the keyword, and text that is already a link.
        if not re.search('identify', txt, re.I) or txt.parent.name == 'a':
            continue
        newtext = re.sub('identify', '<a href="test.html"> test </a>', txt.lower())
        txt.replace_with(BeautifulSoup(newtext, features="html.parser"))
        # I called it twice here to make the code as small as possible.
        # Usually it would be a different newtext ..
        # which was created using the replaced txt looking for a different word to replace.
        txt.replace_with(BeautifulSoup(newtext, features="html.parser"))
    return soup


print(test1())

The first txt.replace_with(...) removes the NavigableString (here stored in the variable txt) from the document tree (doc). This effectively sets txt.parent to None.

The second txt.replace_with(...) looks at the parent property, finds None (because txt has already been removed from the tree) and raises a ValueError.

As you said at the end of your question, one solution is to call .replace_with() only once:

import re
from bs4 import BeautifulSoup

def test1():
    """Turn ``word1`` and ``word3`` into links, replacing each text node once.

    Every text node that contains ``word1`` (case-insensitive) and is not
    already inside an ``<a>`` tag gets all of its substitutions applied to a
    plain string first. The node is then swapped for the re-parsed markup
    with a single ``replace_with()`` call, because after that call the
    original node is detached from the tree and cannot be replaced again.

    Returns:
        The modified ``BeautifulSoup`` document.
    """
    html = \
    '''
    word1 word2 word3 word4
    '''
    soup = BeautifulSoup(html, features="html.parser")

    # find_all() collects the matching nodes into a list up front, so
    # replacing nodes inside the loop does not disturb the iteration.
    for txt in soup.find_all(string=True):
        if txt.parent.name == 'a' or not re.search('word1', txt, flags=re.I):
            continue

        # Substitute case-insensitively on the original text instead of
        # lowercasing it, so the case of the surrounding words is preserved.
        newtext = re.sub('word1', '<a href="test.html"> test1 </a>', txt,
                         flags=re.I)

        # ...some computations

        newtext = re.sub('word3', '<a href="test.html"> test2 </a>', newtext,
                         flags=re.I)

        # ...some more computations

        # and at the end, replace txt only once:
        txt.replace_with(BeautifulSoup(newtext, features="html.parser"))

    return soup


print(test1())

Prints:

<a href="test.html"> test1 </a> word2 <a href="test.html"> test2 </a> word4

Recommended topics

Hot tags