Marking text in an HTML document
Asked Answered
M

2

6

Let's say I have the following markup:

<html>
    <head>
        <title>Page Title</title>
    </head>
    <body>
        <h1>Some title</h1>
        <p>First paragraph</p>
        <p>Second paragraph</p>
    </body>
</html>

I need to mark some parts of the text, namely "irst paragraph Secon". It would look something like this:

<html>
    <head>
        <title>Page Title</title>
    </head>
    <body>
        <h1>Some title</h1>
        <p>F
            <mark>
                irst paragraph</p><p>Secon
            </mark>
        d paragraph</p>
    </body>
</html>

But the problem is that the HTML markup would be broken. The more complex the markup, the more problems this approach would have.

Question:

I am looking for ideas on how I can take the first HTML example and apply a function that returns an HTML structure where "irst paragraph second" is specifically marked somehow.

What I currently have is:

  • the parent container of the string "First paragraph"
  • the text "irst paragraph second"
  • the offset of the text "irst" in "First paragraph"
Miguelinamiguelita answered 29/1, 2016 at 11:18 Comment(0)
G
4

If you want to highlight text in a document then this plug-in will be helpful for you.

https://github.com/julmot/jquery.mark

Example fiddle: https://jsfiddle.net/julmot/vpav6tL1/

Usage is as simple as:

$(".context").mark("keyword");
Gettysburg answered 9/2, 2016 at 9:2 Comment(2)
Awesome but without 'Separate word search' option it will not work as expected with more complex markup (e.g. when parts of the search term are found in different child elements).Renascence
@B0Andrew I can not reproduce an error. What exactly do you mean? Can you open an issue on the repo page?Halfway
R
0

In principle you have to:

  • split the documents into words
  • identify the first word by parent element
  • skip the offset
  • mark matching words

Making changes at word level will prevent you from breaking the markup. I added a working example below. However, I am not sure that it will work in all browsers.

Some of the functions like mergeWords are not used in the example but I included them because they can prove useful.

// Set to true by splitToWords(); mark() checks it so that document.body is
// split into word spans only on the first call.
var splittedToWords = false;

/**
 * Tells splitToWords() whether a node must be skipped.
 *
 * A node is skipped when it is a comment (nodeType 8), a BLOCKQUOTE, SCRIPT
 * or DIV element, or a childless node whose text contains no non-whitespace
 * character.
 *
 * @param {Node} el - Node to test.
 * @returns {boolean} true when the node should be left untouched.
 */
function ignore(el) {
  // 8 is the nodeType of a comment node.
  if (el.nodeType === 8) {
    return true;
  }

  switch (el.tagName) {
    case "BLOCKQUOTE":
    case "SCRIPT":
    case "DIV":
      return true;
  }

  // Nodes with children are never skipped here; their children are tested
  // individually by the caller.
  if (el.hasChildNodes()) {
    return false;
  }

  // Childless node: skip it only when it holds nothing but whitespace.
  return el.textContent.match(/\S+/) === null;
}

/**
 * Recursively replaces every text below `el` with one
 * <span class="word" word-index="N"> per word, so that later edits can be
 * made at word level without cutting across element boundaries.
 *
 * A "word" is a run of non-whitespace characters plus the whitespace that
 * follows it. Children for which ignore() returns true are left untouched.
 * Sets the global `splittedToWords` flag when done.
 *
 * @param {Node} el - Root of the subtree to split. A childless `el` is
 *   treated as a text node and must have a parent.
 */
function splitToWords(el) {
  var i;

  if (el.hasChildNodes()) {
    // Walk backwards: splitting child i replaces it with several spans, which
    // shifts only the indices after i in the live childNodes list.
    for (i = el.childNodes.length - 1; i >= 0; i--) {
      var node = el.childNodes[i];
      if (!ignore(node)) {
        splitToWords(node);
      }
    }
  } else { // childless node, handled as a text node
    var content = el.textContent;
    var words = content.match(/(\S+\s*)/g) || [];
    var parentNode = el.parentNode;

    // The word pattern starts at the first non-space character, so leading
    // whitespace would be lost. Keep it as a plain text node so text that
    // follows an inline sibling does not get glued to it.
    var leading = content.match(/^\s+/);
    if (leading && words.length > 0) {
      parentNode.insertBefore(document.createTextNode(leading[0]), el);
    }

    for (i = 0; i < words.length; i++) {
      var wordNode = document.createElement("span");
      wordNode.className = "word";
      // textContent stores the text verbatim; the innerText setter would
      // turn line breaks inside the word into <br> elements.
      wordNode.textContent = words[i];
      // Position of the word inside its original text node.
      wordNode.setAttribute("word-index", i);

      parentNode.insertBefore(wordNode, el);
    }
    parentNode.removeChild(el);
  }
  splittedToWords = true;
}

/**
 * Replaces `element` with its own child nodes, i.e. removes the wrapper but
 * keeps its content at the same position in the parent.
 *
 * The previous implementation detached the element and then re-inserted the
 * element itself, leaving the DOM unchanged.
 *
 * @param {Node} element - Wrapper to remove. Nothing happens when it has no
 *   parent.
 */
function unwrap(element) {
  var parent = element.parentNode;
  if (!parent) {
    return;
  }

  // Moving the first child in front of the wrapper keeps the children in
  // their original order and empties the wrapper one node at a time.
  while (element.firstChild) {
    parent.insertBefore(element.firstChild, element);
  }
  parent.removeChild(element);
}

/**
 * Undoes splitToWords(): unwraps every <span class="word"> and merges the
 * resulting adjacent text nodes back together.
 *
 * @param {Element} [el] - Subtree to restore. When omitted (or when it has no
 *   getElementsByClassName method) the whole document is processed.
 */
function mergeWords(el) {
  var root = el && typeof el.getElementsByClassName === "function" ? el : document;

  // getElementsByClassName returns a live collection that shrinks as spans
  // are removed, so iterate over a static copy.
  var words = Array.prototype.slice.call(root.getElementsByClassName("word"));

  var parents = [];
  for (var i = 0; i < words.length; i++) {
    var parent = words[i].parentNode;
    unwrap(words[i]);
    // Consecutive words usually share a parent; remember each one once.
    if (parent && parents[parents.length - 1] !== parent) {
      parents.push(parent);
    }
  }

  // Join the text nodes left side by side after unwrapping.
  for (var j = 0; j < parents.length; j++) {
    parents[j].normalize();
  }
}

/**
 * Wraps `len` characters of the element's text, starting at `pos`, in a
 * <mark> element. The element's content is replaced by the result.
 *
 * @param {HTMLElement} el - Element whose innerText is marked (a word span).
 * @param {number} pos - Zero-based start position inside the text.
 * @param {number} len - Number of characters to mark.
 */
function markWord(el, pos, len) {
  // The text is plain text but is written back through innerHTML, so
  // characters with a meaning in HTML must be escaped first.
  function escapeHtml(s) {
    return s
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;");
  }

  // Nothing sensible to mark; leave the element as it is.
  if (pos < 0 || len <= 0) {
    return;
  }

  var text = el.innerText;
  var end = pos + len;
  var pre = escapeHtml(text.substring(0, pos));
  var marked = '<mark>' + escapeHtml(text.substring(pos, end)) + '</mark>';
  var post = escapeHtml(text.substring(end));
  el.innerHTML = pre + marked + post;
}

/**
 * Marks `text` in the document, starting `offset` characters into the text of
 * `element`. The text may continue into the elements that follow.
 *
 * The document body is split into word spans on first use (see
 * splitToWords). Each word of `text` is then matched against the
 * corresponding word span and highlighted with markWord(). Words that do not
 * match are skipped; whitespace between words is not marked.
 *
 * @param {Element} element - Element that contains the start of the text.
 * @param {number} offset - Character offset of the start of `text`, counted
 *   over the innerText of the word spans inside `element`.
 * @param {string} text - Text to mark.
 */
function mark(element, offset, text) {
  if (!element) {
    return;
  }

  if (!splittedToWords) {
    splitToWords(document.body);
  }

  var words = document.getElementsByClassName("word");
  var wordsCount = words.length;

  // First word inside `element`. contains() also accepts words nested in
  // child elements, not only direct children.
  var i = -1;
  for (var k = 0; k < wordsCount; k++) {
    if (element.contains(words[k])) {
      i = k;
      break;
    }
  }
  if (i === -1) {
    return; // element holds no words
  }

  // Skip whole words that end at or before the offset. ">=" matters: an
  // offset equal to the end of a word points at the first character of the
  // next word.
  var pos = 0;
  while (i < wordsCount && offset >= pos + words[i].innerText.length) {
    pos += words[i].innerText.length;
    i++;
  }
  if (i >= wordsCount) {
    return; // offset lies beyond the last word
  }
  // Remaining offset inside the first word to mark.
  var startInWord = Math.max(0, offset - pos);

  var tWords = (text || "").match(/(\S+\s*)/g) || [];

  for (var ti = 0; ti < tWords.length && i < wordsCount; ti++, i++) {
    var wordEl = words[i];
    var tWord = tWords[ti].trim();
    // Only the first word is searched from the in-word offset; later words
    // are expected to match from their start.
    var at = wordEl.innerText.indexOf(tWord, ti === 0 ? startInWord : 0);

    if (at === -1) {
      continue; // word differs; keep going with the next pair
    }

    markWord(wordEl, at, tWord.length);
  }
}
// Demo driver: look up the paragraph with id "e", which holds the start of the
// text to mark.
var e = document.getElementById("e");

// Mark the text starting at character offset 1 of that paragraph ("irst ...").
// The last word, "Second", belongs to the following paragraph in the sample
// markup.
mark(e, 1, 'irst paragraph Second');
<h1>Some title</h1>
<p id="e">First paragraph</p>
<p>Second paragraph</p>
Renascence answered 2/2, 2016 at 9:16 Comment(5)
The only downside of this is that it doesn't mark the spacesPhiona
It can be changed though. If you mark two adjacent words (spans) including the space at the end of the word they WILL appear as one contiguous marked element.Renascence
@B0Andrew +1, This is actually a good idea but wouldn't "split the documents into words" take a long time for larger web pages?Renovate
This is so buggy, it will only work in this little situation. Wrapping "First paragraph" e.g. with <span> (jsfiddle.net/2c91f8rs) or searching with a blank at the end (jsfiddle.net/2c91f8rs/1) will throw errors. I could list dozens of situations where this throws errors. Using the plugin recommended by @Anamika Shrivastava works and, on top of that, gives you the opportunity to customize element name, class name, mark also diacritics, use synonyms, etc.Halfway
@Halfway One of the hypotheses was that "The parent element of the start of the string to be searched is known" not "any parent". Of course, if you wrap the start of the searched string with another element the function will not work. Call it a bug, fine. But don't forget this is SO. You get answers for specific problems not for every problem that could possibly arise.Renascence

© 2022 - 2024 — McMap. All rights reserved.