2 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
5 * Loosely based on the HTML_Safe parser
11 * @author Roman Ivanov <thingol@mail.ru>
12 * @copyright 2004-2005 Roman Ivanov
13 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
15 * @link http://pear.php.net/package/HTML_Safe
22 * This parser strips down all potentially dangerous content within HTML:
24 * <li>opening tag without its closing tag</li>
25 * <li>closing tag without its opening tag</li>
26 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
27 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
28 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
29 * <li>any of these attributes: on*, data*, dynsrc</li>
30 * <li>javascript:/vbscript:/about: etc. protocols</li>
31 * <li>expression/behavior etc. in styles</li>
32 * <li>any other active content</li>
34 * It also tries to convert the code to valid XHTML, but htmltidy is a far better
35 * solution for this task.
39 * $parser = new HTML_Safe();
40 * $result = $parser->parse($doc);
45 * @author Roman Ivanov <thingol@mail.ru>
46 * @copyright 1997-2005 Roman Ivanov
47 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
48 * @version Release: @package_version@
49 * @link http://pear.php.net/package/HTML_Safe
57 * Array of prepared regular expressions for protocols (schemas) matching
62 var $_protoRegexps = array();
65 * Array of prepared regular expressions for CSS matching
70 var $_cssRegexps = array();
73 * List of single tags ("<tag />")
78 var $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );
81 * List of dangerous tags (such tags will be deleted)
86 var $deleteTags = array(
87 'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
88 'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
89 'iframe', 'layer', 'link', 'meta', 'object', 'style',
94 * List of dangerous tags (such tags will be deleted, and all content
95 * inside these tags will also be removed)
100 var $deleteTagsContent = array('script', 'style', 'title', 'xml', );
103 * Type of protocols filtering ('white' or 'black')
108 var $protocolFiltering = 'white';
111 * List of "dangerous" protocols (used for blacklist-filtering)
116 var $blackProtocols = array(
117 'about', 'chrome', 'data', 'disk', 'hcp',
118 'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
119 'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
120 'res', 'resource', 'shell', 'vbscript', 'view-source',
121 'vnd.ms.radio', 'wysiwyg',
125 * List of "safe" protocols (used for whitelist-filtering)
130 var $whiteProtocols = array(
131 'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
132 'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
137 * List of attributes that can contain protocols
142 var $protocolAttributes = array(
143 'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src',
147 * List of dangerous CSS keywords
149 * Whole style="" attribute will be removed, if parser will find one of
155 var $cssKeywords = array(
156 'absolute', 'behavior', 'behaviour', 'content', 'expression',
157 'fixed', 'include-source', 'moz-binding',
161 * List of tags that can have no "closing tag"
165 * @deprecated XHTML does not allow such tags
167 var $noClose = array();
170 * List of block-level tags that terminate a paragraph
172 * The paragraph will be closed when one of these tags is opened
177 var $closeParagraph = array(
178 'address', 'blockquote', 'center', 'dd', 'dir', 'div',
179 'dl', 'dt', 'h1', 'h2', 'h3', 'h4',
180 'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee',
181 'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre',
182 'table', 'ul', 'xmp',
187 * List of table tags, all table tags outside a table will be removed
192 var $tableTags = array(
193 'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
203 var $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );
206 * List of dangerous attributes
211 var $attributes = array('dynsrc', 'id', 'name', );
214 * List of allowed "namespaced" attributes
219 var $attributesNS = array('xml:lang', );
/**
 * Applies caller-supplied options, then precompiles the regular
 * expressions used for protocol and CSS keyword matching.
 *
 * Options are applied first so that a caller-provided $blackProtocols or
 * $cssKeywords list is the one the expressions are built from.
 *
 * @param array $opts map of property name => value; every entry is
 *                    assigned to the property of the same name
 */
function __construct($opts = array())
{
    foreach ($opts as $k => $v) {
        $this->$k = $v;
    }

    // One expression per blacklisted protocol. Whitespace and control
    // characters are tolerated before, between and after the letters, so
    // "java\nscript:" is still recognised as "javascript:".
    foreach ($this->blackProtocols as $proto) {
        $preg = "/[\s\x01-\x1F]*";
        $length = strlen($proto);
        for ($i = 0; $i < $length; $i++) {
            // preg_quote() keeps characters such as "." (vnd.ms.radio)
            // literal and stops a "/" from ending the pattern early.
            $preg .= preg_quote($proto[$i], '/') . "[\s\x01-\x1F]*";
        }
        $preg .= ":/i";
        $this->_protoRegexps[] = $preg;
    }

    // One case-insensitive expression per dangerous CSS keyword.
    foreach ($this->cssKeywords as $css) {
        $this->_cssRegexps[] = '/' . $css . '/i';
    }
}
250 * Handles the writing of attributes - called from $this->_openHandler()
252 * @param array $attrs array of attributes $name => $value
/**
 * Serialises a tag's attributes, leaving out everything considered unsafe.
 *
 * An attribute is skipped when its name starts with "on" or "data", is
 * listed in $this->attributes, contains non-alphanumeric characters without
 * being listed in $this->attributesNS, or (for attributes listed in
 * $this->protocolAttributes) carries a protocol rejected by the active
 * black/white list. style="" values are passed through cleanStyle().
 *
 * @param array $attrs attributes as $name => $value; a value of true or
 *                     null denotes a minimised attribute
 * @return string attribute markup, each attribute preceded by a space;
 *                empty string when $attrs is not an array
 */
function _writeAttrs ($attrs)
{
    $ret = '';
    if (!is_array($attrs)) {
        return $ret;
    }

    foreach ($attrs as $name => $value) {
        $name = strtolower($name);

        // Event handlers (on*) and data* attributes are never emitted.
        if (strpos($name, 'on') === 0 || strpos($name, 'data') === 0) {
            continue;
        }
        if (in_array($name, $this->attributes, true)) {
            continue;
        }
        // Names with punctuation are only allowed when explicitly
        // whitelisted as namespaced attributes (e.g. xml:lang).
        if (!preg_match("/^[a-z0-9]+$/i", $name)
            && !in_array($name, $this->attributesNS, true)
        ) {
            continue;
        }

        // Minimised attribute (<input checked>): XHTML form is name="name".
        if ($value === true || is_null($value)) {
            $value = $name;
        }

        if ($name === 'style') {
            // removes insignificant backslashes
            $value = str_replace("\\", '', $value);

            // removes CSS comments; repeated until stable because removing
            // one comment can splice the delimiters of another together
            while (true) {
                $_value = preg_replace("!/\*.*?\*/!s", '', $value);
                if ($_value === null || $_value == $value) {
                    break;
                }
                $value = $_value;
            }

            // normalise ampersands: decode first so "&amp;" is not
            // double-encoded, then encode every "&"
            $value = str_replace('&amp;', '&', $value);
            $value = str_replace('&', '&amp;', $value);
            $value = $this->cleanStyle($value);
        }

        // Decode numeric character references on a scratch copy so that an
        // obfuscated protocol such as "&#106;avascript:" is still detected.
        // The (int) casts keep over-long digit runs from reaching chr() as
        // out-of-range floats.
        $tempval = preg_replace_callback(
            '/&#(\d+);?/m',
            function ($m) { return chr((int) $m[1]); },
            $value
        );
        $tempval = preg_replace_callback(
            '/&#x([0-9a-f]+);?/mi',
            function ($m) { return chr((int) hexdec($m[1])); },
            $tempval
        );

        if (in_array($name, $this->protocolAttributes, true)
            && strpos($tempval, ':') !== false
        ) {
            if ($this->protocolFiltering == 'black') {
                foreach ($this->_protoRegexps as $proto) {
                    if (preg_match($proto, $tempval)) {
                        continue 2;
                    }
                }
            } else {
                // Whitelist mode: the text before the first ":" must be an
                // exact (case-sensitive) match for an allowed protocol.
                $_tempval = explode(':', $tempval);
                $proto = $_tempval[0];
                if (!in_array($proto, $this->whiteProtocols, true)) {
                    continue;
                }
            }
        }

        // The value is emitted inside double quotes, so escape them.
        $value = str_replace("\"", "&quot;", $value);
        $ret .= ' ' . $name . '="' . $value . '"';
    }

    return $ret;
}
/**
 * Filters an inline style string declaration by declaration.
 *
 * The string is split with HTML_CSS_InlineStyle::_styleToArray(). A
 * declaration is dropped when its property name is listed in
 * $this->cssKeywords or when its value matches one of the blacklisted
 * protocol expressions in $this->_protoRegexps. The remaining declarations
 * are re-joined as "property:value" pairs separated by ";".
 *
 * @param string $str content of a style="" attribute
 * @return string filtered style string (may be empty)
 */
function cleanStyle ($str)
{
    require_once 'HTML/CSS/InlineStyle.php';
    $is = new HTML_CSS_InlineStyle();

    $ar = $is->_styleToArray($str);
    foreach ($ar as $k => $v) {
        if (in_array(strtolower(trim($k)), $this->cssKeywords)) {
            unset($ar[$k]);
            continue;
        }

        foreach ($this->_protoRegexps as $proto) {
            if (preg_match($proto, $v)) {
                // Dropped silently: writing the rejected value to the
                // output here would print untrusted content unescaped.
                unset($ar[$k]);
                continue 2;
            }
        }
    }

    $st = array();
    foreach ($ar as $prop => $val) {
        $st[] = "{$prop}:{$val}";
    }
    return implode(';', $st);
}
365 * Opening tag handler - called from HTMLSax
367 * @param object $parser HTML Parser
368 * @param string $name tag name
369 * @param array $attrs tag attributes
// Builds the opening-tag markup for one element.
// The tag name is lower-cased, tested against $this->deleteTagsContent and
// $this->deleteTags and against an alphanumeric-only pattern, and is then
// returned as '<name attrs/>' when listed in $this->singleTags or as
// '<name attrs>' otherwise; the attribute text comes from $this->_writeAttrs().
// NOTE(review): the bodies of the three rejection branches are not visible in
// this copy of the file. tidyTree() drops the element and its contents when
// the result is not a string - confirm what each branch returns.
373 function _openHandler($name, $attrs)
375 $name = strtolower($name);
377 if (in_array($name, $this->deleteTagsContent)) {
381 if (in_array($name, $this->deleteTags)) {
// Names containing anything other than ASCII letters and digits take this branch.
385 if (!preg_match("/^[a-z0-9]+$/i", $name)) {
387 /*if (preg_match("!(?:\@|://)!i", $name)) {
388 return '<' . $name . '>';
389 $this->_xhtml .= '<' . $name . '>';
394 if (in_array(strtolower($name), $this->singleTags)) {
395 return '<' . $name . $this->_writeAttrs($attrs) . '/>';
397 return '<' . $name . $this->_writeAttrs($attrs) . '>';
402 * Main parsing function
404 * @param string $doc HTML document for processing
405 * @return string Processed (X)HTML document
411 // Save all '<' symbols
412 //$doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '<', $doc);
414 // Web documents shouldn't contains \x00 symbol
415 //$doc = str_replace("\x00", '', $doc);
417 // Opera6 bug workaround
418 //$doc = str_replace("\xC0\xBC", '<', $doc);
420 // UTF-7 encoding ASCII decode
421 //$doc = $this->repackUTF7($doc);
423 if (!extension_loaded('tidy')) {
426 // print_r(strlen($doc));exit;
428 if (strlen($doc) > 1000000) {
429 $doc = substr($doc, 0, 1000000);
431 $tree = tidy_parse_string($doc,array(),'UTF8');
433 // print_r($tree);exit;
435 return $this->tidyTree($tree->root());
// Parses the file $fn with the tidy extension (empty tidy configuration,
// 'UTF8' encoding) and returns the result of tidyTree() on the document root.
// Terminates the script through die() when the tidy extension is not loaded.
// NOTE(review): the result of tidy_parse_file() is used without a check;
// confirm how a missing or unreadable file should be handled.
442 function parseFile($fn)
445 // Save all '<' symbols
446 //$doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '<', $doc);
448 // Web documents shouldn't contain the \x00 symbol
449 //$doc = str_replace("\x00", '', $doc);
451 // Opera6 bug workaround
452 //$doc = str_replace("\xC0\xBC", '<', $doc);
454 // UTF-7 encoding ASCII decode
455 //$doc = $this->repackUTF7($doc);
457 if (!extension_loaded('tidy')) {
458 die("Add tidy extension to extension.ini");
460 $tree = tidy_parse_file($fn,array(),'UTF8');
464 return $this->tidyTree($tree->root());
// Recursively serialises a tidy node and its descendants to markup.
// Dispatches on $node->type. For a node that reaches the lower half and is
// not the root, the opening markup comes from $this->_openHandler(), a closing
// tag is prepared for names not listed in $this->singleTags, and the output of
// tidyTree() for every child is appended between the two.
// NOTE(review): the initialisation of $begin, $end and $onode and several case
// bodies are not visible in this copy of the file - confirm against the
// complete source before relying on this summary.
471 function tidyTree($node)
475 switch ($node->type) {
476 case TIDY_NODETYPE_TEXT:
477 if (strlen(trim($node->value))) {
480 //echo htmlspecialchars($node->value);
483 case TIDY_NODETYPE_STARTEND:
484 case TIDY_NODETYPE_START:
// An optional $this->filter is handed a plain-object copy of the node.
485 if (!empty($this->filter)) {
486 $node = (object) (array) $node; // we can't work with the
488 $this->filter->apply($node);
491 case TIDY_NODETYPE_END: // handled by start / singleTags..
493 //$this->out .= "<". htmlspecialchars($node->name) .'/>';
496 case TIDY_NODETYPE_ROOT:
501 //echo $node->name ."\n";
// The root node itself produces no tag; only its children are emitted.
505 if ($node->type != TIDY_NODETYPE_ROOT) {
506 //echo htmlspecialchars(print_r($node ,true));
507 $add = $this->_openHandler($node->name, empty($node->attribute) ? array() : $node->attribute);
508 if (is_string($add)) {
510 if (!in_array(strtolower($node->name), $this->singleTags)) {
// No newline is inserted before </pre>; every other closing tag gets one.
511 $cr = strtolower($node->name) == 'pre' ? '' : "\n";
512 $end = $cr . '</' . $node->name . '>';
517 return ''; // delete this tag and all the contents..
521 // include children...
// $onode is presumably the untouched tidy node saved before the filter cast - TODO confirm.
522 if(!$onode->hasChildren()){
523 return $begin . $end;
525 foreach($onode->child as $child){
526 // echo "child of ". $node->name . ':' . $child->type . "\n";
527 $begin .= $this->tidyTree($child);
529 return $begin . $end;
536 * UTF-7 decoding function
538 * @param string $str HTML document in which the ASCII parts of UTF-7 sequences are recoded back to ASCII
539 * @return string Decoded document
/**
 * Rewrites every "+<base64>-" sequence in a document through
 * repackUTF7Callback().
 *
 * Only runs made of the characters 0-9, a-z, A-Z and "/" between "+" and
 * "-" are treated as encoded sequences; all other text is left untouched.
 *
 * @param string $str document that may contain UTF-7 style sequences
 * @return string document with the matched sequences rewritten
 */
function repackUTF7($str)
{
    return preg_replace_callback('!\+([0-9a-zA-Z/]+)\-!', array($this, 'repackUTF7Callback'), $str);
}
548 * Additional UTF-7 decoding function (callback of repackUTF7)
550 * @param array $str Regex match array; $str[1] is the base64 part of a UTF-7 sequence
551 * @return string Recoded string
/**
 * Callback of repackUTF7(): decodes one base64 payload and unwraps the
 * part of it that is plain single-byte text.
 *
 * The decoded bytes are read as two-byte units (presumably UTF-16BE code
 * units - confirm). Leading units whose first byte is NUL are kept; a
 * following run of units whose first byte is not NUL is handed to
 * repackUTF7Back() to be re-encoded. Finally the NUL byte in front of each
 * remaining character is stripped.
 *
 * @param array $str match array; $str[1] is the base64 text between "+" and "-"
 * @return string recoded replacement for the whole match
 */
function repackUTF7Callback($str)
{
    $str = base64_decode($str[1]);
    $str = preg_replace_callback('/^((?:\x00.)*)((?:[^\x00].)+)/', array($this, 'repackUTF7Back'), $str);
    return preg_replace('/\x00(.)/', '$1', $str);
}
562 * Additional UTF-7 encoding function (callback of repackUTF7Callback)
564 * @param array $str Regex match array; $str[1] is kept as is, $str[2] is re-encoded as a UTF-7 sequence
565 * @return string Recoded string
/**
 * Callback of repackUTF7Callback(): keeps the first captured group as it
 * is and wraps the second one back into a "+<base64>-" sequence, with the
 * base64 padding ("=") removed.
 *
 * @param array $str match array; $str[1] is passed through, $str[2] is re-encoded
 * @return string $str[1] followed by the re-encoded $str[2]
 */
function repackUTF7Back($str)
{
    return $str[1].'+'.rtrim(base64_encode($str[2]), '=').'-';
}
578 * c-hanging-comment-ender-p: nil