2 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
5 * Loosely based on the HTML_Safe parser
11 * @author Roman Ivanov <thingol@mail.ru>
12 * @copyright 2004-2005 Roman Ivanov
13 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
15 * @link http://pear.php.net/package/HTML_Safe
22 * This parser strips down all potentially dangerous content within HTML:
24 * <li>opening tag without its closing tag</li>
25 * <li>closing tag without its opening tag</li>
26 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
27 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
28 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
29 * <li>any of these attributes: on*, data*, dynsrc</li>
30 * <li>javascript:/vbscript:/about: etc. protocols</li>
31 * <li>expression/behavior etc. in styles</li>
32 * <li>any other active content</li>
34 * It also tries to convert the code to valid XHTML, but htmltidy is a far better
35 * solution for this task.
39 * $parser = new HTML_Safe();
40 * $result = $parser->parse($doc);
45 * @author Roman Ivanov <thingol@mail.ru>
46 * @copyright 1997-2005 Roman Ivanov
47 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
48 * @version Release: @package_version@
49 * @link http://pear.php.net/package/HTML_Safe
57 * Array of prepared regular expressions for protocols (schemas) matching
62 var $_protoRegexps = array();
65 * Array of prepared regular expressions for CSS matching
70 var $_cssRegexps = array();
73 * List of single tags ("<tag />")
78 var $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );
81 * List of dangerous tags (such tags will be deleted)
86 var $deleteTags = array(
87 'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
88 'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
89 'iframe', 'layer', 'link', 'meta', 'object', 'style',
94 * List of dangerous tags (such tags will be deleted, and all content
95 * inside these tags will also be removed)
100 var $deleteTagsContent = array('script', 'style', 'title', 'xml', );
103 * Type of protocols filtering ('white' or 'black')
108 var $protocolFiltering = 'white';
111 * List of "dangerous" protocols (used for blacklist-filtering)
116 var $blackProtocols = array(
117 'about', 'chrome', 'data', 'disk', 'hcp',
118 'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
119 'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
120 'res', 'resource', 'shell', 'vbscript', 'view-source',
121 'vnd.ms.radio', 'wysiwyg',
125 * List of "safe" protocols (used for whitelist-filtering)
130 var $whiteProtocols = array(
131 'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
132 'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
137 * List of attributes that can contain protocols
142 var $protocolAttributes = array(
143 'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src',
147 * List of dangerous CSS keywords
149 * The whole style="" attribute will be removed if the parser finds one of
155 var $cssKeywords = array(
156 'absolute', 'behavior', 'behaviour', 'content', 'expression',
157 'fixed', 'include-source', 'moz-binding',
161 * List of tags that can have no "closing tag"
165 * @deprecated XHTML does not allow such tags
167 var $noClose = array();
170 * List of block-level tags that terminate a paragraph
172 * The paragraph will be closed when one of these tags is opened
177 var $closeParagraph = array(
178 'address', 'blockquote', 'center', 'dd', 'dir', 'div',
179 'dl', 'dt', 'h1', 'h2', 'h3', 'h4',
180 'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee',
181 'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre',
182 'table', 'ul', 'xmp',
187 * List of table tags; any table tag outside a table will be removed
192 var $tableTags = array(
193 'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
203 var $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );
206 * List of dangerous attributes
211 var $attributes = array('dynsrc', 'id', 'name', );
214 * List of allowed "namespaced" attributes
219 var $attributesNS = array('xml:lang', );
227 function __construct($opts = array())
230 foreach ($opts as $k =>$v) {
234 //making regular expressions based on Proto & CSS arrays
235 foreach ($this->blackProtocols as $proto) {
236 $preg = "/[\s\x01-\x1F]*";
237 for ($i=0; $i<strlen($proto); $i++) {
238 $preg .= $proto[$i] . "[\s\x01-\x1F]*";
241 $this->_protoRegexps[] = $preg;
244 foreach ($this->cssKeywords as $css) {
245 $this->_cssRegexps[] = '/' . $css . '/i';
251 * Handles the writing of attributes - called from $this->_openHandler()
253 * @param array $attrs array of attributes $name => $value
257 function _writeAttrs ($attrs)
260 if (is_array($attrs)) {
261 foreach ($attrs as $name => $value) {
263 $name = strtolower($name);
265 if (strpos($name, 'on') === 0) {
268 if (strpos($name, 'data') === 0) {
271 if (in_array($name, $this->attributes)) {
274 if (!preg_match("/^[a-z0-9]+$/i", $name)) {
275 if (!in_array($name, $this->attributesNS)) {
280 if (($value === TRUE) || (is_null($value))) {
284 if ($name == 'style') {
286 // removes insignificant backslashes
287 $value = str_replace("\\", '', $value);
289 // removes CSS comments
292 $_value = preg_replace("!/\*.*?\*/!s", '', $value);
293 if ($_value == $value) break;
297 // replace all & with &amp;
298 $value = str_replace('&', '&', $value);
299 $value = str_replace('&', '&', $value);
300 $value = $this->cleanStyle($value);
303 $tempval = preg_replace_callback('/&#(\d+);?/m', function($m) { return chr($m[1]); } , $value); //"'
304 $tempval = preg_replace_callback('/&#x([0-9a-f]+);?/mi', function($m) { return chr(hexdec($m[1])); } , $tempval);
307 ///$tempval = preg_replace('/&#(\d+);?/me', "chr('\\1')", $value); //"'
308 ///$tempval = preg_replace('/&#x([0-9a-f]+);?/mei', "chr(hexdec('\\1'))", $tempval);
310 if ((in_array($name, $this->protocolAttributes)) &&
311 (strpos($tempval, ':') !== false))
313 if ($this->protocolFiltering == 'black') {
314 foreach ($this->_protoRegexps as $proto) {
315 if (preg_match($proto, $tempval)) continue 2;
318 $_tempval = explode(':', $tempval);
319 $proto = $_tempval[0];
320 if (!in_array($proto, $this->whiteProtocols)) {
326 $value = str_replace("\"", """, $value);
327 $ret .= ' ' . $name . '="' . $value . '"';
333 function cleanStyle ($str)
337 require_once 'HTML/CSS/InlineStyle.php';
338 $is = new HTML_CSS_InlineStyle();
340 $ar = $is->_styleToArray($str);
341 foreach($ar as $k=>$v) {
342 if (in_array(strtolower(trim($k)), $this->cssKeywords)) {
343 //echo "Trashing BL css keyword $k=$v <br/>";
347 foreach ($this->_protoRegexps as $proto) {
348 if (preg_match($proto, $v)) {
349 echo "$proto - Trashing $k=$v <br/>";
357 foreach($ar as $prop => $val) {
358 $st[] = "{$prop}:{$val}";
360 return implode(';', $st);
366 * Opening tag handler - called from $this->tidyTree() (formerly from HTMLSax)
368 * @param object $parser HTML Parser
369 * @param string $name tag name
370 * @param array $attrs tag attributes
374 function _openHandler($name, $attrs)
376 $name = strtolower($name);
378 if (in_array($name, $this->deleteTagsContent)) {
382 if (in_array($name, $this->deleteTags)) {
386 if (!preg_match("/^[a-z0-9]+$/i", $name)) {
388 /*if (preg_match("!(?:\@|://)!i", $name)) {
389 return '<' . $name . '>';
390 $this->_xhtml .= '<' . $name . '>';
395 if (in_array(strtolower($name), $this->singleTags)) {
396 return '<' . $name . $this->_writeAttrs($attrs) . '/>';
398 return '<' . $name . $this->_writeAttrs($attrs) . '>';
403 * Main parsing function
405 * @param string $doc HTML document for processing
406 * @return string Processed (X)HTML document
412 // Save all '<' symbols
413 //$doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '<', $doc);
415 // Web documents shouldn't contain the \x00 symbol
416 //$doc = str_replace("\x00", '', $doc);
418 // Opera6 bug workaround
419 //$doc = str_replace("\xC0\xBC", '<', $doc);
421 // UTF-7 encoding ASCII decode
422 //$doc = $this->repackUTF7($doc);
424 if (!extension_loaded('tidy')) {
427 // print_r(strlen($doc));exit;
429 if (strlen($doc) > 1000000) {
430 $doc = substr($doc, 0, 1000000);
432 $tree = tidy_parse_string($doc,array(),'UTF8');
434 // print_r($tree);exit;
436 return $this->tidyTree($tree->root());
443 function parseFile($fn)
446 // Save all '<' symbols
447 //$doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '<', $doc);
449 // Web documents shouldn't contain the \x00 symbol
450 //$doc = str_replace("\x00", '', $doc);
452 // Opera6 bug workaround
453 //$doc = str_replace("\xC0\xBC", '<', $doc);
455 // UTF-7 encoding ASCII decode
456 //$doc = $this->repackUTF7($doc);
458 if (!extension_loaded('tidy')) {
459 die("Add tidy extension to extension.ini");
461 $tree = tidy_parse_file($fn,array(),'UTF8');
465 return $this->tidyTree($tree->root());
472 function tidyTree($node)
476 switch ($node->type) {
477 case TIDY_NODETYPE_TEXT:
478 if (strlen(trim($node->value))) {
481 //echo htmlspecialchars($node->value);
484 case TIDY_NODETYPE_STARTEND:
485 case TIDY_NODETYPE_START:
486 if (!empty($this->filter)) {
487 $node = (object) (array) $node; // we can't work with the
489 $this->filter->apply($node);
492 case TIDY_NODETYPE_END: // handled by start / singleTags..
494 //$this->out .= "<". htmlspecialchars($node->name) .'/>';
497 case TIDY_NODETYPE_ROOT:
502 //echo $node->name ."\n";
506 if ($node->type != TIDY_NODETYPE_ROOT) {
507 //echo htmlspecialchars(print_r($node ,true));
508 $add = $this->_openHandler($node->name, empty($node->attribute) ? array() : $node->attribute);
509 if (is_string($add)) {
511 if (!in_array(strtolower($node->name), $this->singleTags)) {
512 $cr = strtolower($node->name) == 'pre' ? '' : "\n";
513 $end = $cr . '</' . $node->name . '>';
518 return ''; // delete this tag and all the contents..
522 // include children...
523 if(!$onode->hasChildren()){
524 return $begin . $end;
526 foreach($onode->child as $child){
527 // echo "child of ". $node->name . ':' . $child->type . "\n";
528 $begin .= $this->tidyTree($child);
530 return $begin . $end;
537 * UTF-7 decoding function
539 * @param string $str HTML document in which the ASCII parts of UTF-7 are recoded back to ASCII
540 * @return string Decoded document
543 function repackUTF7($str)
545 return preg_replace_callback('!\+([0-9a-zA-Z/]+)\-!', array($this, 'repackUTF7Callback'), $str);
549 * Additional UTF-7 decoding function (callback for preg_replace_callback())
551 * @param array $str Regex match from preg_replace_callback(); $str[1] is the base64 payload to decode
552 * @return string Recoded string
555 function repackUTF7Callback($str)
557 $str = base64_decode($str[1]);
558 $str = preg_replace_callback('/^((?:\x00.)*)((?:[^\x00].)+)/', array($this, 'repackUTF7Back'), $str);
559 return preg_replace('/\x00(.)/', '$1', $str);
563 * Additional UTF-7 encoding function (callback for preg_replace_callback())
565 * @param array $str Regex match from preg_replace_callback(); $str[1] is kept as-is, $str[2] is re-encoded as base64
566 * @return string Recoded string
569 function repackUTF7Back($str)
571 return $str[1].'+'.rtrim(base64_encode($str[2]), '=').'-';
579 * c-hanging-comment-ender-p: nil