See WdHTMLParser class. I use this class for my forum.
Sample with WdHTMLParser :
This class parses HTML into an array:
<div>
<span>
<br />
<span>
un bout de texte
</span>
<input type="text" />
</span>
</div>
Array :
Array (
[0] => Array (
[name] => div
[args] => Array ()
[children] => Array (
[0] => Array (
[name] => span
[args] => Array ()
[children] => Array (
[0] => Array (
[name] => br
[args] => Array ()
)
[1] => Array (
[name] => span
[args] => Array ()
[children] => Array (
[0] => un bout de texte
)
)
[2] => Array (
[name] => input
[args] => Array (
[type] => text
)
)
)
)
)
)
)
WdHTMLParser array to HTML
I use this class on my website to convert the array back to HTML.
voyageWdHTML_allowattr : These attributes will be allowed.
voyageWdHTML_allowtag : These tags will be allowed.
voyageWdHTML_special : Make your own rules. Currently, I add target="_blank" to each link and replace <br>
with a new line (\n) inside pre tags.
fix_javascript : You can enable or disable this function, but it is redundant.
Sample php :
<?php
// Load the tokenizer first, then the sanitizer that is built on top of it.
include "WdHTMLParser.php";
include "parser.php";

// Hostile sample input: an inline onclick handler plus a <script> element.
$input = "<div>
<span>
<a onclick=\"alert('Hacked ! :'(');\">Check javascript</a>
<script>alert(\"lol\");</script>
</span>
</div>";

// parseBadHTML() returns a pair: [0] the malformed flag, [1] the sanitized
// HTML (or the parser's error message when the flag is set).
$sanitizer = new Parser();
$result = $sanitizer->parseBadHTML($input);
$erreur = $result[0];
$message = $result[1];

if ($erreur) {
    die("Error : ".$message);
}

echo $message;
Output :
<div>
<span>
<a target="_blank">Check javascript</a>
<pre>alert("lol");</pre>
</span>
</div>
My Parser class :
<?php
/**
 * Rebuilds HTML from the node tree produced by WdHTMLParser, keeping only
 * white-listed tags and attributes.
 *
 * A tree is a list whose items are either text (string) or an element:
 * array('name' => string, 'args' => array, 'children' => list). An element
 * without a 'children' key is a self-closed tag such as <br/>.
 */
class Parser {
    //private function fix_javascript(&$message) { }

    /**
     * Serialises the attributes of one element.
     *
     * @param array  $tab_args Attribute name => value; null values are skipped.
     * @param string $objname  Tag name (currently unused, kept for callers).
     * @return string Attributes with a leading space each, or "" if none pass.
     */
    private function voyageWdHTML_args($tab_args, $objname) {
        $html = "";
        foreach ($tab_args as $attr => $valeur) {
            // PHP turns numeric-looking array keys into integers; cast back so
            // the strict white-list lookup compares strings with strings.
            $attr = (string) $attr;
            if ($valeur === null || !$this->voyageWdHTML_allowattr($attr)) {
                continue;
            }
            // URL-bearing attributes must not carry script-capable schemes.
            if (($attr === "href" || $attr === "src") && !$this->voyageWdHTML_allowurl($valeur)) {
                continue;
            }
            $html .= " $attr=\"".htmlentities($valeur)."\"";
        }
        return $html;
    }

    /**
     * Tells whether an attribute name may be emitted.
     *
     * NOTE(review): "style" is passed through unchanged; CSS values are not
     * inspected here.
     *
     * @param string $attr
     * @return bool
     */
    private function voyageWdHTML_allowattr($attr) {
        return in_array($attr, array("align", "face", "size", "href", "title", "target", "src", "color", "style",
            "data-class", "data-format"), true);
    }

    /**
     * Tells whether a URL is safe to put in href/src.
     *
     * URLs without a scheme (relative URLs) are accepted; URLs with a scheme
     * are accepted only for http, https, ftp and mailto, which rejects
     * javascript:, vbscript:, data: and similar.
     *
     * @param string $url Attribute value, already entity-decoded.
     * @return bool
     */
    private function voyageWdHTML_allowurl($url) {
        // Browsers ignore whitespace and control characters when reading the
        // scheme ("java\tscript:"), so remove them before looking at it.
        $compact = preg_replace('#[\x00-\x20]+#', '', (string) $url);
        if (!preg_match('#^([a-z][a-z0-9+.\-]*):#i', $compact, $m)) {
            return true;
        }
        return in_array(strtolower($m[1]), array("http", "https", "ftp", "mailto"), true);
    }

    /**
     * Tells whether a tag name may be emitted as-is.
     *
     * @param string $name
     * @return bool
     */
    private function voyageWdHTML_allowtag($name) {
        return in_array($name, array("br", "b", "i", "u", "strike", "sub", "sup", "div", "ol", "ul", "li", "font", "span", "code",
            "hr", "blockquote", "cite", "a", "img", "p", "pre", "h6", "h5", "h4", "h3", "h2", "h1"), true);
    }

    /**
     * Applies per-tag rules to an element that has children:
     * links get target="_blank", and <br> children of <pre> become "\n".
     *
     * @param array $obj Element node, modified in place.
     * @return void
     */
    private function voyageWdHTML_special(&$obj) {
        if ($obj["name"] == "a") { $obj["args"]["target"] = "_blank"; }
        if ($obj["name"] == "pre") {
            // The previous array_filter() call threw its result away and its
            // callback received copies, so <br> was never replaced. Rebuild the
            // list explicitly; children other than <br> are kept unchanged.
            $children = array();
            foreach ($obj["children"] as $child) {
                if (is_array($child) && isset($child["name"]) && $child["name"] === "br") {
                    $children[] = "\n";
                } else {
                    $children[] = $child;
                }
            }
            $obj["children"] = $children;
        }
    }

    /**
     * Recursively renders a node list to HTML.
     *
     * Tags that are not white-listed are rendered as <pre>. Text nodes have
     * "<" and ">" escaped, so leftover markup in text (including comments the
     * parser restored verbatim) is displayed instead of interpreted.
     *
     * @param array $tableau Node list.
     * @param int   $lvl     Current nesting depth (informational).
     * @return string
     */
    private function voyageWdHTML($tableau, $lvl = 0) {
        $html = "";
        foreach ($tableau as $obj) {
            if (!is_array($obj)) {
                // Only the angle brackets are escaped so that entities already
                // present in the text (&amp;, &eacute;, ...) are not double-encoded.
                $html .= str_replace(array("<", ">"), array("&lt;", "&gt;"), (string) $obj);
                continue;
            }
            if (!$this->voyageWdHTML_allowtag($obj["name"])) {
                $obj["name"] = "pre";
                if (!isset($obj["children"])) {
                    $obj["children"] = array();
                }
            }
            if (isset($obj["children"])) {
                $this->voyageWdHTML_special($obj);
                $args = isset($obj["args"]) ? $obj["args"] : array();
                $html .= "<".$obj["name"].$this->voyageWdHTML_args($args, $obj["name"]).">"
                    .$this->voyageWdHTML($obj["children"], $lvl + 1)
                    ."</".$obj["name"].">";
            } else {
                // Self-closed tag: keep its white-listed attributes, otherwise
                // <img src="..."/> would come out as a bare <img>.
                $args = isset($obj["args"]) ? $obj["args"] : array();
                $html .= "<".$obj["name"].$this->voyageWdHTML_args($args, $obj["name"]).">";
            }
        }
        return $html;
    }

    /**
     * Sanitizes an HTML fragment.
     *
     * @param string $message Untrusted HTML.
     * @return array array(bool $malformed, string|null $result): the sanitized
     *               HTML, or the HTML-escaped parser error when $malformed is
     *               true (null if the parser exposes no error message).
     */
    public function parseBadHTML($message) {
        $WdHTMLParser = new WdHTMLParser();
        // The parser only recognises void tags written in self-closed form.
        $message = str_replace(array("<br>", "<hr>"), array("<br/>", "<hr/>"), $message);
        $tableau = $WdHTMLParser->parse($message);
        if ($WdHTMLParser->malformed) {
            $retour = null;
            // The message quotes user markup, so escape it before handing it
            // to callers that echo the result.
            if (isset($WdHTMLParser->error) && is_string($WdHTMLParser->error)) {
                $retour = htmlspecialchars($WdHTMLParser->error, ENT_QUOTES);
            }
        } else {
            $retour = $this->voyageWdHTML($tableau);
            //$this->fix_javascript($retour);// Optional extra safety pass
        }
        return array($WdHTMLParser->malformed, $retour);
    }
}
WdHTMLParser class
<?php
/**
 * Minimal HTML tokenizer that turns markup into a nested array.
 *
 * Each element becomes array('name' => string, 'args' => array) with an extra
 * 'children' list when the tag was opened and closed (self-closed tags such as
 * <br/> have no 'children' key). Text between tags is kept as plain strings;
 * whitespace-only text is discarded.
 */
class WdHTMLParser {
    private $encoding;
    private $matches;
    private $escaped;
    private $opened = array();

    /** @var bool True when the last parse() met a mismatched closing tag. */
    public $malformed;

    /** @var string|null First error message of the last parse(), or null. */
    public $error;

    /**
     * Parses an HTML fragment into a node tree.
     *
     * @param string      $html      Markup to parse.
     * @param string|null $namespace Optional tag-name prefix; it is inserted
     *                               into the tag regex as-is.
     * @param string      $encoding  Charset used to decode attribute entities.
     * @return array Node list (see class description).
     */
    public function parse($html, $namespace = NULL, $encoding = 'utf-8') {
        // Reset all per-run state so an instance can be reused safely, even
        // after a previous run that left tags open or met a comment.
        $this->malformed = false;
        $this->error = null;
        $this->escaped = false;
        $this->opened = array();
        $this->encoding = $encoding;

        $html = $this->escapeSpecials($html);
        // Yields triplets: text, "/" or "" (closing marker), tag content.
        $this->matches = preg_split('#<(/?)' . $namespace . '([^>]*)>#', $html, -1, PREG_SPLIT_DELIM_CAPTURE);
        $tree = $this->buildTree();
        if ($this->escaped) {
            $tree = $this->unescapeSpecials($tree);
        }
        return $tree;
    }

    /**
     * Hides comments (<!-- -->) and processing instructions (<? ?>) from the
     * tag regex by swapping their angle brackets for control characters.
     *
     * @param string $html
     * @return string
     */
    private function escapeSpecials($html) {
        $html = preg_replace_callback('#<\!--.+-->#sU', array($this, 'escapeSpecials_callback'), $html);
        $html = preg_replace_callback('#<\?.+\?>#sU', array($this, 'escapeSpecials_callback'), $html);
        return $html;
    }

    /**
     * preg_replace_callback handler: masks "<" and ">" in one match and
     * records that unescaping will be needed.
     *
     * @param array $m Regex match.
     * @return string
     */
    private function escapeSpecials_callback($m) {
        $this->escaped = true;
        $text = $m[0];
        $text = str_replace(array('<', '>'), array("\x01", "\x02"), $text);
        return $text;
    }

    /**
     * Restores the angle brackets masked by escapeSpecials() throughout a
     * tree (recursively) or in a single string.
     *
     * @param array|string $tree
     * @return array|string
     */
    private function unescapeSpecials($tree) {
        return is_array($tree) ? array_map(array($this, 'unescapeSpecials'), $tree) : str_replace(array("\x01", "\x02"), array('<', '>'), $tree);
    }

    /**
     * Consumes $this->matches and builds the node list for the current
     * nesting level; recurses for every opening tag and returns when the
     * matching closing tag (or the end of input) is reached.
     *
     * @return array
     */
    private function buildTree() {
        $nodes = array();
        $i = 0;
        $closing = false;
        while (($value = array_shift($this->matches)) !== NULL) {
            switch ($i++ % 3) {
                case 0:
                    // Text between tags. Compare against '' explicitly: a bare
                    // trim() test would also discard the text "0".
                    if (trim($value) !== '') {
                        $nodes[] = $value;
                    }
                    break;
                case 1:
                    // "/" when the upcoming tag is a closing tag.
                    $closing = ($value == '/');
                    break;
                case 2:
                    if (substr($value, -1, 1) == '/') {
                        // Self-closed tag: no children.
                        $nodes[] = $this->parseMarkup(substr($value, 0, -1));
                    } else if ($closing) {
                        $open = array_pop($this->opened);
                        if ($value !== $open) {
                            $this->error($value, $open);
                        }
                        return $nodes;
                    } else {
                        $node = $this->parseMarkup($value);
                        $this->opened[] = $node['name'];
                        $node['children'] = $this->buildTree();
                        $nodes[] = $node;
                    }
                    break;
            }
        }
        return $nodes;
    }

    /**
     * Splits the inside of a tag into its name and its attributes.
     *
     * Only double-quoted attributes are recognised; values are
     * entity-decoded.
     *
     * @param string $markup Tag content without the angle brackets.
     * @return array array('name' => string, 'args' => array)
     */
    public function parseMarkup($markup) {
        // An empty tag ("<>") or one starting with whitespace has no name.
        $name = preg_match('#^[^\s]+#', $markup, $matches) ? $matches[0] : '';
        // The name stops at whitespace so `href ="x"` yields "href", and the
        // value may be empty so `alt=""` is kept.
        preg_match_all('#\s+([^\s=]+)\s*=\s*"([^"]*)"#', $markup, $matches, PREG_SET_ORDER);
        $args = array();
        foreach ($matches as $m) {
            $args[$m[1]] = html_entity_decode($m[2], ENT_QUOTES, $this->encoding);
        }
        return array('name' => $name, 'args' => $args);
    }

    /**
     * Records a mismatched closing tag.
     *
     * Sets $malformed and stores the message in $error (first error wins).
     * Nothing is printed: the message quotes raw input markup, so callers
     * decide how to escape and display it.
     *
     * @param string      $markup   Closing tag that was found.
     * @param string|null $expected Tag that should have been closed.
     * @return void
     */
    public function error($markup, $expected) {
        $this->malformed = true;
        if ($this->error === null) {
            $this->error = sprintf('unexpected closing markup "%s", should be "%s"', $markup, $expected);
        }
    }
}
To be extra safe, you can also use this function (from mybb.com):
<?php
class Parser {
    /**
     * Defuses "javascript:" URLs and a fixed list of inline event handlers by
     * inserting an empty <b></b> element after their first letter, so the
     * browser no longer recognises them while the visible text is unchanged.
     *
     * The handler list is explicit; attributes not listed below are left
     * untouched.
     *
     * @param string $message HTML to patch, modified in place.
     * @return void
     */
    private function fix_javascript(&$message) {
        // "javascript:" written with plain letters or numeric entities.
        // Group 1 is the "j", groups 2-5 are the optional zero padding of its
        // entity forms, group 6 is the whole remainder ("avascript:").
        $scheme = "#(&\#(0*)106;?|&\#(0*)74;?|&\#x(0*)4a;?|&\#x(0*)6a;?|j)((&\#(0*)97;?|&\#(0*)65;?|a)(&\#(0*)118;?|&\#(0*)86;?|v)(&\#(0*)97;?|&\#(0*)65;?|a)(\s)?(&\#(0*)115;?|&\#(0*)83;?|s)(&\#(0*)99;?|&\#(0*)67;?|c)(&\#(0*)114;?|&\#(0*)82;?|r)(&\#(0*)105;?|&\#(0*)73;?|i)(&\#112;?|&\#(0*)80;?|p)(&\#(0*)116;?|&\#(0*)84;?|t)(&\#(0*)58;?|\:))#i";
        // The previous shared replacement "$1<b></b>$2$4" pointed at the
        // zero-padding groups here, which deleted the matched "avascript:"
        // from the text. Re-emit the remainder (group 6) instead.
        $message = preg_replace($scheme, '$1<b></b>$6', $message);

        // Inline event handlers: group 1 is the "o", group 2 the rest.
        $handlers = array(
            "#(o)(nmouseover\s?=)#i",
            "#(o)(nmouseout\s?=)#i",
            "#(o)(nmousedown\s?=)#i",
            "#(o)(nmousemove\s?=)#i",
            "#(o)(nmouseup\s?=)#i",
            "#(o)(nclick\s?=)#i",
            "#(o)(ndblclick\s?=)#i",
            "#(o)(nload\s?=)#i",
            "#(o)(nsubmit\s?=)#i",
            "#(o)(nblur\s?=)#i",
            "#(o)(nchange\s?=)#i",
            "#(o)(nfocus\s?=)#i",
            "#(o)(nselect\s?=)#i",
            "#(o)(nunload\s?=)#i",
            "#(o)(nkeypress\s?=)#i"
        );
        $message = preg_replace($handlers, '$1<b></b>$2', $message);
    }
}