getElementsByTagName() equivalent for textNodes

Asked 5/4, 2010 at 16:58 Answered 7/8, 2020 at 7:38

Is there any way to get the collection of all textNode objects within a document?

getElementsByTagName() works great for Elements, but textNodes are not Elements.

Update: I realize this can be accomplished by walking the DOM - as many below suggest. I know how to write a DOM-walker function that looks at every node in the document. I was hoping there was some browser-native way to do it. After all it's a little strange that I can get all the <input>s with a single built-in call, but not all textNodes.

Tonneau answered 5/4, 2010 at 16:58 Comment(0)

133

Update:

I have outlined some basic performance tests for each of these 6 methods over 1000 runs. getElementsByTagName is the fastest but it does a half-assed job, as it does not select all elements, but only one particular type of tag ( i think p) and blindly assumes that its firstChild is a text element. It might be little flawed but its there for demonstration purpose and comparing its performance to TreeWalker. Run the tests yourselves on jsfiddle to see the results.

Using a TreeWalker
Custom Iterative Traversal
Custom Recursive Traversal
Xpath query
querySelectorAll
getElementsByTagName

Let's assume for a moment that there is a method that allows you to get all Text nodes natively. You would still have to traverse each resulting text node and call node.nodeValue to get the actual text as you would do with any DOM Node. So the issue of performance is not with iterating through text nodes, but iterating through all nodes that are not text and checking their type. I would argue (based on the results) that TreeWalker performs just as fast as getElementsByTagName, if not faster (even with getElementsByTagName playing handicapped).

Ran each test 1000 times.

Method                  Total ms        Average ms
--------------------------------------------------
document.TreeWalker          301            0.301
Iterative Traverser          769            0.769
Recursive Traverser         7352            7.352
XPath query                 1849            1.849
querySelectorAll            1725            1.725
getElementsByTagName         212            0.212

Source for each method:

TreeWalker

function nativeTreeWalker() {
    var walker = document.createTreeWalker(
        document.body, 
        NodeFilter.SHOW_TEXT, 
        null, 
        false
    );

    var node;
    var textNodes = [];

    while(node = walker.nextNode()) {
        textNodes.push(node.nodeValue);
    }
}

Recursive Tree Traversal

function customRecursiveTreeWalker() {
    var result = [];

    (function findTextNodes(current) {
        for(var i = 0; i < current.childNodes.length; i++) {
            var child = current.childNodes[i];
            if(child.nodeType == 3) {
                result.push(child.nodeValue);
            }
            else {
                findTextNodes(child);
            }
        }
    })(document.body);
}

Iterative Tree Traversal

function customIterativeTreeWalker() {
    var result = [];
    var root = document.body;

    var node = root.childNodes[0];
    while(node != null) {
        if(node.nodeType == 3) { /* Fixed a bug here. Thanks @theazureshadow */
            result.push(node.nodeValue);
        }

        if(node.hasChildNodes()) {
            node = node.firstChild;
        }
        else {
            while(node.nextSibling == null && node != root) {
                node = node.parentNode;
            }
            node = node.nextSibling;
        }
    }
}

querySelectorAll

function nativeSelector() {
    var elements = document.querySelectorAll("body, body *"); /* Fixed a bug here. Thanks @theazureshadow */
    var results = [];
    var child;
    for(var i = 0; i < elements.length; i++) {
        child = elements[i].childNodes[0];
        if(elements[i].hasChildNodes() && child.nodeType == 3) {
            results.push(child.nodeValue);
        }
    }
}

getElementsByTagName (handicap)

function getElementsByTagName() {
    var elements = document.getElementsByTagName("p");
    var results = [];
    for(var i = 0; i < elements.length; i++) {
        results.push(elements[i].childNodes[0].nodeValue);
    }
}

XPath

function xpathSelector() {
    var xpathResult = document.evaluate(
        "//*/text()", 
        document, 
        null, 
        XPathResult.ORDERED_NODE_ITERATOR_TYPE, 
        null
    );

    var results = [], res;
    while(res = xpathResult.iterateNext()) {
        results.push(res.nodeValue);  /* Fixed a bug here. Thanks @theazureshadow */
    }
}

Also, you might find this discussion helpful - http://bytes.com/topic/javascript/answers/153239-how-do-i-get-elements-text-node

Pvc answered 5/4, 2010 at 17:31 Comment(15)

I have gotten mixed results for each of the methods above in different browser - these results above are for Chrome. Firefox and Safari behave very differently. I don't have have access to IE unfortunately, but you could test these yourselves on IE to see if it works. As for browser optimization, I wouldn't worry about picking a different method for each browser as long as the differences are in order of tens of milliseconds or maybe even the low hundreds. – Pvc 7/4, 2010 at 0:29

This is a really useful answer, but beware that the different methods return very different things. Many of them only get text nodes if they're the first child of their parent. Some of them can only get the text, while others can return actual text nodes with minor modifications. There is an error in Iterative Tree Traversal that may affect its performance. Change node.nodeType = 3 to node.nodeType == 3 – Hatti 5/7, 2012 at 17:40

@Hatti - thanks for pointing out the glaring = bug. I've fixed that, and the xpath version was simply returning Text objects, and not the actual string contained in it like the other methods were doing. The method that is only getting the text of the first child is intentionally wrong, and I've mentioned that in the beginning. I will re-run the tests, and post the updated results here. All tests (except getElementsByTagName and xpath) are returning the same number of text nodes. XPath is reporting about 20 more nodes than the others which I'll ignore for now. – Pvc 6/7, 2012 at 3:14

The query selector method will also report a slightly different number of text nodes, as querySelector cannot query by text nodes. A query like body * will find only descendant elements (not text nodes) within the body element. – Pvc 6/7, 2012 at 3:23

@Anurag: Yeah, you'd have to go through all the elements and get each of their text nodes. It should also go through the body element's text nodes. Change to querySelectorAll('body, body *') – Hatti 6/7, 2012 at 23:16

IE 9+ supports TreeWalkers... =\ MDN - createTreeWalker – Waxy 18/3, 2013 at 16:59

I've made the tests equivalent and made a jsPerf: jsperf.com/text-node-traversal – Jamesy 19/4, 2013 at 10:2

Nice work @TimDown - that handicapped test was an eye-sore for a long time :) You should add it as an answer.. – Pvc 19/4, 2013 at 19:2

Pushing all nodes into an array is something you will certainly avoid in performance critical code, so I removed this very expensive calls by simply counting the nodes. createTreeWalker comes out first now: jsperf.com/text-node-traversal/5 . The numbers differ in the 3 cases (limited scope of test cases), so the results differ! – Shear 28/8, 2013 at 4:0

About XPath //*/text(), why not //text()? Is the real translation of "all textNode objects within a document". – Microspore 5/7, 2014 at 15:22

NodeFilter not supported in IE making the tree walker performance dubious at best: developer.mozilla.org/en-US/docs/Web/API/… – Bamako 5/11, 2014 at 23:1

OMG I had no idea about document.createTreeWalker - look like a fantastic method! Also your custom iterative walker is an eye opener - so much more efficient and actually easier to read/understand than the usual stand-in recursive version! – Two 25/6, 2016 at 14:0

Is "//*/text()", necessary, or would //text() also work? – Agonist 14/1, 2017 at 7:34

For Iterative Tree Traversal method above, if the document.body has next sibling, e.g., <script>, then the traversal will go to body's sibling, and eventually generates TypeError: node is null. – Hydropathy 19/12, 2018 at 17:56

jsfiddle/jsperf links are dead. – Kassia 5/6, 2021 at 5:17

Here's a modern Iterator version of the fastest TreeWalker method:

function getTextNodesIterator(el) { // Returns an iterable TreeWalker
    const walker = document.createTreeWalker(el, NodeFilter.SHOW_TEXT);
    walker[Symbol.iterator] = () => ({
        next() {
            const value = walker.nextNode();
            return {value, done: !value};
        }
    });
    return walker;
}

Usage:

for (const textNode of getTextNodesIterator(document.body)) {
    console.log(textNode)
}

Safer version

Using the iterator directly might get stuck if you move the nodes around while looping. This is safer, it returns an array:

function getTextNodes(el) { // Returns an array of Text nodes
    const walker = document.createTreeWalker(el, NodeFilter.SHOW_TEXT);
    const nodes = [];
    while (walker.nextNode()) {
        nodes.push(walker.currentNode);
    }
    return nodes;
}

Brinkley answered 13/6, 2017 at 8:22 Comment(0)

I know you specifically asked for a collection, but if you just meant that informally and didn't care if they were all joined together into one big string, you can use:

var allTextAsString = document.documentElement.textContent || document.documentElement.innerText;

...with the first item being the DOM3 standard approach. Note however that innerText appears to exclude script or style tag contents in implementations that support it (at least IE and Chrome) while textContent includes them (in Firefox and Chrome).

Soakage answered 1/4, 2011 at 14:6 Comment(1)

Thanks - that's not what I wanted though. My needs call for being able to inspect them in-place as DOM objects (like finding their parents, etc) – Tonneau 21/4, 2011 at 1:32

Here's an alternative that's a bit more idiomatic and (hopefully) easier to understand.

function getText(node) {
    // recurse into each child node
    if (node.hasChildNodes()) {
        node.childNodes.forEach(getText);
    }
    // get content of each non-empty text node
    else if (node.nodeType === Node.TEXT_NODE) {
        const text = node.textContent.trim();
        if (text) {
            console.log(text); // do something
        }
    }
}

Eiser answered 9/3, 2019 at 3:16 Comment(0)

after createTreeWalker is deprecated you can use

  /**
   * Get all text nodes under an element
   * @param {!Element} el
   * @return {Array<!Node>}
   */
  function getTextNodes(el) {
    const iterator = document.createNodeIterator(el, NodeFilter.SHOW_TEXT);
    const textNodes = [];
    let currentTextNode;
    while ((currentTextNode = iterator.nextNode())) {
      textNodes.push(currentTextNode);
    }
    return textNodes;
  }

Dumuzi answered 7/8, 2020 at 7:38 Comment(2)

Where have you seen createTreeWalker is deprecated? – Slating 24/6, 2022 at 0:43

createTreeWalker() is NOT deprecated. developer.mozilla.org/fr/docs/Web/API/Document/createTreeWalker dom.spec.whatwg.org/#dom-document-createtreewalker – Dhaulagiri 5/12, 2022 at 10:49

 document.deepText= function(hoo, fun){
        var A= [], tem;
        if(hoo){
            hoo= hoo.firstChild;
            while(hoo!= null){
                if(hoo.nodeType== 3){
                    if(typeof fun== 'function'){
                        tem= fun(hoo);
                        if(tem!= undefined) A[A.length]= tem;
                    }
                    else A[A.length]= hoo;
                }
                else A= A.concat(document.deepText(hoo, fun));
                hoo= hoo.nextSibling;
            }
        }
        return A;
    }

/* You can return an array of all the descendant text nodes of some parent element, or you can pass it some function and do something (find or replace or whatever) to the text in place.

This example returns the text of the non-whitespace textnodes in the body:

var A= document.deepText(document.body, function(t){
    var tem= t.data;
    return /\S/.test(tem)? tem: undefined;
});
alert(A.join('\n'))

Handy for search and replace, highlighting and so on

Colley answered 5/4, 2010 at 17:59 Comment(0)

var el1 = document.childNodes[0]
function get(node,ob)
{
        ob = ob || {};

        if(node.childElementCount)
        {

            ob[node.nodeName] = {}
            ob[node.nodeName]["text"] = [];
            for(var x = 0; x < node.childNodes.length;x++)
            {   
                if(node.childNodes[x].nodeType == 3)
                {
                    var txt = node.childNodes[x].nodeValue;


                    ob[node.nodeName]["text"].push(txt)
                    continue
                }
                get(node.childNodes[x],ob[node.nodeName])       
            };  
        }
        else
        {
            ob[node.nodeName]   = (node.childNodes[0] == undefined ? null :node.childNodes[0].nodeValue )
        }
        return ob
}



var o = get(el1)
console.log(o)

Diophantus answered 31/10, 2016 at 22:51 Comment(0)

Hot tags

Godot Unity Godot Help Programming Godot 4.X GUI GDScript 3D 2D Physics CSharp Godot 3.X VR XR Projects C++

Safer version

Recommended topics

Hot tags