2 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
5 * Loosely based on the HTML_Safe parser
11 * @author Roman Ivanov <thingol@mail.ru>
12 * @copyright 2004-2005 Roman Ivanov
13 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
15 * @link http://pear.php.net/package/HTML_Safe
22 * This parser strips down all potentially dangerous content within HTML:
24 * <li>opening tag without its closing tag</li>
25 * <li>closing tag without its opening tag</li>
26 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
27 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
28 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
29 * <li>any of these attributes: on*, data*, dynsrc</li>
30 * <li>javascript:/vbscript:/about: etc. protocols</li>
31 * <li>expression/behavior etc. in styles</li>
32 * <li>any other active content</li>
34 * It also tries to convert the code to valid XHTML, but htmltidy is a far better
35 * solution for this task.
39 * $parser = new HTML_Safe();
40 * $result = $parser->parse($doc);
45 * @author Roman Ivanov <thingol@mail.ru>
46 * @copyright 1997-2005 Roman Ivanov
47 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
48 * @version Release: @package_version@
49 * @link http://pear.php.net/package/HTML_Safe
57 * Array of prepared regular expressions for protocols (schemas) matching
62 var $_protoRegexps = array();
65 * Array of prepared regular expressions for CSS matching
70 var $_cssRegexps = array();
73 * List of single tags ("<tag />")
78 var $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );
81 * List of dangerous tags (such tags will be deleted)
86 var $deleteTags = array(
87 'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
88 'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
89 'iframe', 'layer', 'link', 'meta', 'object', 'style',
94 * List of dangerous tags (such tags will be deleted, and all content
95 * inside these tags will also be removed)
100 var $deleteTagsContent = array('script', 'style', 'title', 'xml', );
103 * Type of protocols filtering ('white' or 'black')
108 var $protocolFiltering = 'white';
111 * List of "dangerous" protocols (used for blacklist-filtering)
116 var $blackProtocols = array(
117 'about', 'chrome', 'data', 'disk', 'hcp',
118 'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
119 'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
120 'res', 'resource', 'shell', 'vbscript', 'view-source',
121 'vnd.ms.radio', 'wysiwyg',
125 * List of "safe" protocols (used for whitelist-filtering)
130 var $whiteProtocols = array(
131 'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
132 'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
137 * List of attributes that can contain protocols
142 var $protocolAttributes = array(
143 'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src',
147 * List of dangerous CSS keywords
149 * Whole style="" attribute will be removed, if parser will find one of
155 var $cssKeywords = array(
156 'absolute', 'behavior', 'behaviour', 'content', 'expression',
157 'fixed', 'include-source', 'moz-binding',
161 * List of tags that can have no "closing tag"
165 * @deprecated XHTML does not allow such tags
167 var $noClose = array();
170 * List of block-level tags that terminate a paragraph
172 * The paragraph will be closed when one of these tags is opened
177 var $closeParagraph = array(
178 'address', 'blockquote', 'center', 'dd', 'dir', 'div',
179 'dl', 'dt', 'h1', 'h2', 'h3', 'h4',
180 'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee',
181 'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre',
182 'table', 'ul', 'xmp',
186 * List of table tags, all table tags outside a table will be removed
191 var $tableTags = array(
192 'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
202 var $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );
205 * List of dangerous attributes
210 var $attributes = array('dynsrc', 'id', 'name', );
213 * List of allowed "namespaced" attributes
218 var $attributesNS = array('xml:lang', );
/**
 * Constructor.
 *
 * Delegates to HTML_Safe() so that the object is initialised on PHP
 * versions that no longer treat a method named after the class as the
 * constructor.
 *
 * @param array $opts property name => value pairs copied onto the instance
 */
function __construct($opts = array())
{
    $this->HTML_Safe($opts);
}

/**
 * PHP 4 style constructor, kept so existing callers (for example
 * subclasses calling parent::HTML_Safe()) continue to work.
 *
 * Copies every entry of $opts onto the instance, then compiles
 * $this->blackProtocols and $this->cssKeywords into the regular
 * expressions stored in $this->_protoRegexps and $this->_cssRegexps.
 *
 * @param array $opts property name => value pairs copied onto the instance
 */
function HTML_Safe($opts = array())
{
    foreach ($opts as $k => $v) {
        $this->$k = $v;
    }

    // Start from empty lists so that calling this method twice does not
    // register every expression twice.
    $this->_protoRegexps = array();
    $this->_cssRegexps   = array();

    // making regular expressions based on Proto & CSS arrays
    // Whitespace and control characters are tolerated before, between and
    // after the letters of a protocol name.
    $gap = "[\s\x01-\x1F]*";
    foreach ($this->blackProtocols as $proto) {
        $preg   = '/' . $gap;
        $length = strlen($proto);
        for ($i = 0; $i < $length; $i++) {
            // $proto[$i] replaces the removed $proto{$i} syntax; preg_quote()
            // keeps characters such as "." literal.
            $preg .= preg_quote($proto[$i], '/') . $gap;
        }
        $preg .= ':/i';
        $this->_protoRegexps[] = $preg;
    }

    foreach ($this->cssKeywords as $css) {
        $this->_cssRegexps[] = '/' . preg_quote($css, '/') . '/i';
    }
}
/**
 * Handles the writing of attributes - called from $this->_openHandler()
 *
 * An attribute is dropped when:
 *  - its name starts with "on" or "data";
 *  - its name is listed in $this->attributes;
 *  - its name contains characters other than a-z / 0-9 and is not listed
 *    in $this->attributesNS;
 *  - it is listed in $this->protocolAttributes, its value contains ":" and
 *    the protocol fails the black/white list selected by
 *    $this->protocolFiltering.
 *
 * The value of a "style" attribute is additionally passed through
 * $this->cleanStyle().
 *
 * @param array $attrs array of attributes $name => $value
 * @return string the accepted attributes as ' name="value"' pairs
 *                ('' when nothing is accepted or $attrs is not an array)
 */
function _writeAttrs ($attrs)
{
    $ret = '';
    if (!is_array($attrs)) {
        return $ret;
    }

    foreach ($attrs as $name => $value) {

        $name = strtolower($name);

        if (strpos($name, 'on') === 0) {
            continue;
        }
        if (strpos($name, 'data') === 0) {
            continue;
        }
        if (in_array($name, $this->attributes, true)) {
            continue;
        }
        if (!preg_match("/^[a-z0-9]+$/i", $name)
            && !in_array($name, $this->attributesNS, true)
        ) {
            continue;
        }

        // Attributes given without a value take their own name as value.
        if (($value === true) || is_null($value)) {
            $value = $name;
        }

        if ($name === 'style') {

            // removes insignificant backslashes
            $value = str_replace("\\", '', $value);

            // removes CSS comments; repeated until nothing changes because
            // removing one comment can bring the pieces of another together
            while (true) {
                $stripped = preg_replace("!/\*.*?\*/!s", '', $value);
                if ($stripped === null) {
                    // PCRE failure: drop the attribute rather than pass on
                    // an unchecked value
                    continue 2;
                }
                if ($stripped === $value) {
                    break;
                }
                $value = $stripped;
            }

            // replace all & to &amp; (existing &amp; is decoded first so it
            // is not escaped twice)
            $value = str_replace('&amp;', '&', $value);
            $value = str_replace('&', '&amp;', $value);
            $value = $this->cleanStyle($value);
        }

        // Decode numeric character references into a scratch copy so that an
        // encoded protocol name cannot slip past the filter. Callbacks replace
        // the "e" pattern modifier, which is no longer supported; the
        // code point is reduced to its low byte exactly as chr() did.
        $tempval = preg_replace_callback(
            '/&#(\d+);?/m',
            function ($m) {
                return chr(((int) $m[1]) & 0xFF);
            },
            $value
        );
        if (is_string($tempval)) {
            $tempval = preg_replace_callback(
                '/&#x([0-9a-f]+);?/mi',
                function ($m) {
                    // the last two hex digits are the low byte of the number
                    return chr(hexdec(substr($m[1], -2)));
                },
                $tempval
            );
        }
        if (!is_string($tempval)) {
            // PCRE failure: the value could not be checked, so drop it
            continue;
        }

        if (in_array($name, $this->protocolAttributes, true)
            && (strpos($tempval, ':') !== false)
        ) {
            if ($this->protocolFiltering == 'black') {
                foreach ($this->_protoRegexps as $proto) {
                    if (preg_match($proto, $tempval)) {
                        continue 2;
                    }
                }
            } else {
                $_tempval = explode(':', $tempval);
                // scheme names are case-insensitive
                $proto = strtolower(trim($_tempval[0]));
                if (!in_array($proto, $this->whiteProtocols, true)) {
                    continue;
                }
            }
        }

        $value = str_replace('"', '&quot;', $value);
        $ret .= ' ' . $name . '="' . $value . '"';
    }

    return $ret;
}
/**
 * Filters the declarations of an inline style string.
 *
 * The string is split with HTML_CSS_InlineStyle::_styleToArray(). A
 * declaration is dropped when its property name is listed in
 * $this->cssKeywords or when its value matches one of the expressions in
 * $this->_protoRegexps. The remaining declarations are joined back
 * together as "property:value" pairs separated by ";".
 *
 * Nothing is printed: rejected declarations are discarded silently, so
 * untrusted CSS never reaches the output unescaped.
 *
 * NOTE(review): values are only tested against the protocol expressions,
 * not against $this->cssKeywords / $this->_cssRegexps - confirm whether
 * keywords such as "expression" inside a value should also be rejected.
 *
 * @param string $str contents of a style attribute
 * @return string the filtered style string
 */
function cleanStyle ($str)
{
    require_once 'HTML/CSS/InlineStyle.php';
    $is = new HTML_CSS_InlineStyle();

    $ar = $is->_styleToArray($str);
    if (!is_array($ar)) {
        return '';
    }

    $st = array();
    foreach ($ar as $prop => $val) {
        if (in_array(strtolower(trim($prop)), $this->cssKeywords)) {
            // blacklisted property name
            continue;
        }
        foreach ($this->_protoRegexps as $proto) {
            if (preg_match($proto, $val)) {
                // the value refers to a blacklisted protocol
                continue 2;
            }
        }
        $st[] = "{$prop}:{$val}";
    }

    return implode(';', $st);
}
359 * Opening tag handler - called from $this->tidyTree()
361 * @param object $parser HTML Parser
362 * @param string $name tag name
363 * @param array $attrs tag attributes
// Builds the markup for an opening tag, or signals that the tag must not be
// written. $name is the tag name and $attrs its attributes; tidyTree() is the
// caller and only writes the tag when a string comes back.
367 function _openHandler($name, $attrs)
// Tag names are compared in lower case.
369 $name = strtolower($name);
// Tags listed in $this->deleteTagsContent and $this->deleteTags are handled
// first. NOTE(review): the statements inside these two branches are not
// visible in this view - confirm what each of them returns.
371 if (in_array($name, $this->deleteTagsContent)) {
375 if (in_array($name, $this->deleteTags)) {
// Tag names containing anything other than letters and digits are treated
// separately. NOTE(review): the live statements of this branch are not
// visible here; the code that follows is commented out.
379 if (!preg_match("/^[a-z0-9]+$/i", $name)) {
381 /*if (preg_match("!(?:\@|://)!i", $name)) {
382 return '<' . $name . '>';
383 $this->_xhtml .= '<' . $name . '>';
// Tags listed in $this->singleTags are written in self-closing form, all
// other tags as a plain opening tag; the attributes are filtered and
// serialised by $this->_writeAttrs().
388 if (in_array(strtolower($name), $this->singleTags)) {
389 return '<' . $name . $this->_writeAttrs($attrs) . '/>';
391 return '<' . $name . $this->_writeAttrs($attrs) . '>';
396 * Main parsing function
398 * @param string $doc HTML document for processing
399 * @return string Processed (X)HTML document
405 // Save all '<' symbols
406 //$doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '<', $doc);
408 // Web documents shouldn't contain the \x00 symbol
409 //$doc = str_replace("\x00", '', $doc);
411 // Opera6 bug workaround
412 //$doc = str_replace("\xC0\xBC", '<', $doc);
414 // UTF-7 encoding ASCII decode
415 //$doc = $this->repackUTF7($doc);
417 if (!extension_loaded('tidy')) {
421 if (strlen($doc) > 100000) {
422 $doc = substr($doc, 0, 100000);
424 $tree = tidy_parse_string($doc,array(),'UTF8');
426 return $this->tidyTree($tree->root());
// Parses the file $fn with tidy and returns the result of running
// $this->tidyTree() on the root node of the parsed document.
433 function parseFile($fn)
// The pre-processing steps below are disabled (commented out).
436 // Save all '<' symbols
437 //$doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '<', $doc);
439 // Web documents shouldn't contain the \x00 symbol
440 //$doc = str_replace("\x00", '', $doc);
442 // Opera6 bug workaround
443 //$doc = str_replace("\xC0\xBC", '<', $doc);
445 // UTF-7 encoding ASCII decode
446 //$doc = $this->repackUTF7($doc);
// The tidy extension is required; without it the script is terminated.
448 if (!extension_loaded('tidy')) {
449 die("Add tidy extension to extension.ini");
// Parse with tidy's default configuration, treating the input as UTF-8.
451 $tree = tidy_parse_file($fn,array(),'UTF8');
// NOTE(review): the lines between the parse call and the return are not
// visible in this view - confirm whether a failed parse is handled there.
455 return $this->tidyTree($tree->root());
// Recursively converts a tidy node and its children into a string.
// The opening markup is collected in $begin, the closing tag in $end, and
// the output of every child is appended to $begin before $end is added.
462 function tidyTree($node) {
465 switch ($node->type) {
// Text nodes: only text that is non-empty after trimming is considered.
// NOTE(review): the statements that handle the text are not visible in this
// view - confirm how (and whether) the text is escaped.
466 case TIDY_NODETYPE_TEXT:
467 if (strlen(trim($node->value))) {
470 //echo htmlspecialchars($node->value);
// Opening and self-closing tags: an optional $this->filter object gets the
// node passed to its apply() method first.
473 case TIDY_NODETYPE_STARTEND:
474 case TIDY_NODETYPE_START:
475 if (!empty($this->filter)) {
476 $this->filter->apply($node);
479 case TIDY_NODETYPE_END: // handled by start / singleTags..
481 //$this->out .= "<". htmlspecialchars($node->name) .'/>';
484 case TIDY_NODETYPE_ROOT:
489 //echo $node->name ."\n";
// Every node except the root is passed to $this->_openHandler(); a node
// without attributes is given an empty attribute array.
493 if ($node->type != TIDY_NODETYPE_ROOT) {
494 //echo htmlspecialchars(print_r($node ,true));
495 $add = $this->_openHandler($node->name, empty($node->attribute) ? array() : $node->attribute);
// A string result is the opening tag to write.
496 if (is_string($add)) {
// Tags that are not in $this->singleTags get a closing tag, placed on a new
// line unless the tag is "pre".
498 if (!in_array(strtolower($node->name), $this->singleTags)) {
499 $cr = strtolower($node->name) == 'pre' ? '' : "\n";
500 $end = $cr . '</' . $node->name . '>';
// NOTE(review): the condition guarding this return is not visible in this
// view - confirm which _openHandler() result triggers the deletion.
505 return ''; // delete this tag and all the contents..
509 // include children...
510 if(!$node->hasChildren()){
511 return $begin . $end;
// Children are rendered recursively, in document order.
513 foreach($node->child as $child){
514 // echo "child of ". $node->name . ':' . $child->type . "\n";
515 $begin .= $this->tidyTree($child);
517 return $begin . $end;
/**
 * UTF-7 decoding function
 *
 * Every "+...-" run of base64 characters found in the document is handed
 * to repackUTF7Callback() and replaced by that method's result.
 *
 * @param string $str HTML document whose UTF-7 encoded ASCII parts are recoded back to ASCII
 * @return string Decoded document
 */
function repackUTF7($str)
{
    $shiftedRun = '!\+([0-9a-zA-Z/]+)\-!';
    $recoder    = array($this, 'repackUTF7Callback');

    return preg_replace_callback($shiftedRun, $recoder, $str);
}
/**
 * Additional UTF-7 decoding function
 *
 * Used as the preg_replace_callback() handler of repackUTF7(). The first
 * captured group is base64-decoded. A decoded string that starts with a
 * (possibly empty) run of NUL-prefixed byte pairs followed by pairs whose
 * first byte is not NUL has that leading part rewritten by
 * repackUTF7Back(). Finally the NUL prefix is removed from every remaining
 * NUL-prefixed pair.
 *
 * @param array $str match array; element 1 holds the base64 text found between "+" and "-"
 * @return string Recoded string
 */
function repackUTF7Callback($str)
{
    $decoded = base64_decode($str[1]);

    $repacked = preg_replace_callback(
        '/^((?:\x00.)*)((?:[^\x00].)+)/',
        array($this, 'repackUTF7Back'),
        $decoded
    );

    return preg_replace('/\x00(.)/', '$1', $repacked);
}
/**
 * Additional UTF-7 encoding function
 *
 * Used as the preg_replace_callback() handler of repackUTF7Callback().
 * The first captured group is kept as it is; the second group is
 * base64-encoded again (without "=" padding) and wrapped in "+" and "-".
 *
 * @param array $str match array; element 1 is kept verbatim, element 2 is re-encoded
 * @return string Recoded string
 */
function repackUTF7Back($str)
{
    $kept      = $str[1];
    $reencoded = rtrim(base64_encode($str[2]), '=');

    return $kept . '+' . $reencoded . '-';
}
566 * c-hanging-comment-ender-p: nil