Fix #7402 - HTML editor cleaning in PHP

author Alan <alan@roojs.com>

Fri, 16 Sep 2022 05:14:09 +0000 (13:14 +0800)

committer Alan <alan@roojs.com>

Fri, 16 Sep 2022 05:14:09 +0000 (13:14 +0800)
author Alan <alan@roojs.com>
Fri, 16 Sep 2022 05:14:09 +0000 (13:14 +0800)
committer Alan <alan@roojs.com>
Fri, 16 Sep 2022 05:14:09 +0000 (13:14 +0800)
diff --git a/HTML/Clean.php b/HTML/Clean.php

new file mode 100644 (file)

index 0000000..e138f88
--- /dev/null
+++ b/HTML/Clean.php
@@ -0,0 +1,113 @@
+<?php
+
+/**
+ * This is a PHP implementation of the Roo HTMLEditorCore onPaste method - that cleans up HTML
+ * and replaces things like tables etc..
+ */
+
+class HTML_Clean {
+
+    /**
+     * Parse an HTML string and run the whole clean-up over it.
+     *
+     * @param string $str  raw (typically pasted) HTML
+     * @param array  $opts extra properties to set on the cleaner ('dom' is always overwritten)
+     * @return HTML_Clean  cleaner holding the cleaned document in ->dom
+     */
+    static function fromHTML($str, $opts = array())
+    {
+        $str = self::cleanWordChars((string) $str);
+        // DOMDocument::loadHTML() reads markup without a charset declaration as ISO-8859-1,
+        // so every non-ASCII character is turned into a numeric entity before parsing.
+        $str = mb_encode_numericentity($str, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
+        if (trim($str) === '') {
+            // loadHTML() rejects an empty string - parse an empty document instead.
+            $str = '<html><body></body></html>';
+        }
+        $dom = new DOMDocument('1.0', 'utf8');
+        // pasted markup is rarely valid; collect parser errors rather than raising warnings.
+        $previous = libxml_use_internal_errors(true);
+        $dom->loadHTML($str);
+        libxml_clear_errors();
+        libxml_use_internal_errors($previous);
+        $opts['dom'] = $dom;
+        return new HTML_Clean($opts);
+    }
+
+    /**
+     * Replace the typographic characters Word likes to insert (dashes, curly quotes,
+     * bullets, ellipsis) with entities or plain ASCII equivalents.
+     *
+     * @param string $str UTF-8 text
+     * @return string
+     */
+    static function cleanWordChars($str)
+    {
+        $swapCodes  = array(
+             8211 =>  "&#8211;" ,
+             8212 =>  "&#8212;" ,
+             8216 =>   "'" ,
+             8217 =>  "'" ,
+             8220 =>  '"' ,
+             8221 =>  '"' ,
+             8226 =>  "*" ,
+             8230 =>  "..."
+        );
+        $map = array();
+        foreach ($swapCodes as $code => $replacement) {
+            // be explicit about the encoding rather than relying on mb_internal_encoding()
+            $map[mb_chr($code, 'UTF-8')] = $replacement;
+        }
+        return strtr($str, $map);
+    }
+
+
+    var $dom; // Dom Document.
+    var $black = array(
+        'APPLET', //
+        'BASE',   'BASEFONT', 'BGSOUND', 'BLINK',  'BODY',
+        'FRAME',  'FRAMESET', 'HEAD',    'HTML',   'ILAYER',
+        'IFRAME', 'LAYER',  'LINK',     'META',    'OBJECT',
+        'SCRIPT', 'STYLE' ,'TITLE',  'XML',
+        //'FONT' // CLEAN LATER..
+        'COLGROUP', 'COL'   // messy tables.
+    ); // blacklist of elements.
+
+    /**
+     * Copy the options onto this object, then run every filter over the document
+     * and wrap bare images in figure blocks.
+     *
+     * @param array $opts property name => value; 'dom' (a DOMDocument) is what gets cleaned
+     */
+    function __construct($opts)
+    {
+        foreach($opts as $k=>$v) {
+            $this->{$k} = $v;
+        }
+        $d = $this->dom ? $this->dom->documentElement : null;
+        if (!$d) {
+            return; // nothing to clean.
+        }
+        $this->filter('Word',array( 'node' =>  $d ));
+
+        $this->filter('StyleToTag', array(
+            'node' =>  $d   // this could add nodes to tree, so not very good to nest the walk.
+        ));
+
+        $this->filter('Attributes',array(    // does walk as well.
+            'node' => $d,
+            'attrib_white' => array('href', 'src', 'name', 'align', 'colspan', 'rowspan', 'data-display', 'data-width', 'start'),
+            'attrib_clean' => array('href', 'src' ),
+
+            'replaceComment' => true   // this is sneaked in here - as walk will get rid of comments at the same time.
+        ));
+        // is this used?!?!
+        $this->filter('Black', array( 'node' =>  $d, 'tag'  =>  $this->black ));
+        // we don't use the whitelist?
+
+
+        // should be fonts..
+        $this->filter('KeepChildren',array( 'node' =>  $d, 'tag'  =>   array(   'FONT', ':' )) );
+        $this->filter('Paragraph',array( 'node' =>  $d ));
+        $this->filter('Span',array( 'node' =>  $d ));
+        $this->filter('LongBr',array( 'node' =>  $d ));
+
+        require_once 'HTML/Clean/BlockFigure.php';
+        // snapshot the live node list - wrapping an image rewrites the tree under it.
+        $images = iterator_to_array($d->getElementsByTagName('img'), false);
+        foreach($images as $img) {
+            if ($this->findParent($img, 'figure')) {
+                continue; // already inside a figure.
+            }
+            $fig = new HTML_Clean_BlockFigure(array(
+                'image_src' => $img->getAttribute('src')
+            ));
+            $fig->updateElement($img);
+        }
+
+        require_once 'HTML/Clean/Block.php';
+        HTML_Clean_Block::initAll($d);
+
+    }
+
+    /**
+     * Find the closest ancestor element with the given tag name.
+     *
+     * @param DOMNode $node node to start from (not tested itself)
+     * @param string  $tag  tag name, compared case-insensitively
+     * @return DOMElement|false
+     */
+    private function findParent($node, $tag)
+    {
+        for ($p = $node->parentNode; $p; $p = $p->parentNode) {
+            if ($p->nodeType == XML_ELEMENT_NODE && strcasecmp($p->nodeName, $tag) == 0) {
+                return $p;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Load HTML/Clean/Filter{type}.php and construct HTML_Clean_Filter{type} with $args.
+     * The object is discarded here, so the filter has to do its work on construction.
+     *
+     * NOTE(review): HTML/Clean/FilterAttributes.php in this commit declares
+     * HTML_Clean_FilterAttribute (no trailing 's'), which does not match the class name
+     * built here for 'Attributes' - confirm and align the two.
+     *
+     * @param string $type filter name suffix, eg. 'Word'
+     * @param array  $args filter configuration
+     */
+    function filter($type, $args)
+    {
+        require_once 'HTML/Clean/Filter'. $type .'.php';
+        $cls = 'HTML_Clean_Filter'. $type;
+        new $cls($args);
+    }
+
+    /**
+     * Serialize the cleaned document.
+     *
+     * @return string|false HTML as produced by DOMDocument::saveHTML()
+     */
+    function toString()
+    {
+        return $this->dom->saveHTML();
+    }
+
+
+}
+\ No newline at end of file
diff --git a/HTML/Clean/Block.php b/HTML/Clean/Block.php

new file mode 100644 (file)

index 0000000..0af0483
--- /dev/null
+++ b/HTML/Clean/Block.php
@@ -0,0 +1,325 @@
+<?php
+
+/**
+ * This is used in the HTML editor to make content editable
+ *
+ * In our case it's used to render images and tables correctly.
+ */
+require_once 'Filter.php';
+abstract class  HTML_Clean_Block extends HTML_Clean_Filter
+{
+    var $node;
+
+    // used by context menu
+    var $context = false; // ??
+
+    /**
+     * Create the block handler for a node.
+     *
+     * The handler type is the node's data-block attribute; when that is missing
+     * (or not a plain name) it is derived from the node name (table => Table, ...).
+     *
+     * @param DOMElement $node
+     * @return HTML_Clean_Block
+     */
+    static function factory ($node)
+    {
+        $db = $node->hasAttribute('data-block') ? $node->getAttribute('data-block') : '';
+        // data-block comes from the document being cleaned and ends up in a file path,
+        // so anything other than letters is ignored.
+        if (!preg_match('/^[A-Za-z]+$/', $db)) {
+            $db = ucfirst(strtolower($node->nodeName));
+        }
+        require_once 'HTML/Clean/Block'.$db . '.php';
+        $cls = 'HTML_Clean_Block'. $db;
+
+        return new $cls(array('node' => $node ));
+
+    }
+
+
+    /**
+     * Initialize all elements from content that are 'blockable'.
+     *
+     * @static
+     * @param DOMElement   $body element to search in
+     * @param string|false $type tag name to initialize; false does table, td and figure in turn
+     */
+    static function initAll ($body, $type=false)
+    {
+        if ($type === false) {
+
+            self::initAll($body,'table');
+            self::initAll($body,'td');
+            self::initAll($body,'figure');
+            return;
+        }
+        // snapshot the live node list - building a block rewrites the tree.
+        $ar = iterator_to_array($body->getElementsByTagName($type), false);
+        foreach($ar as $a) {
+            self::factory($a);
+        }
+
+    }
+    /**
+     * Update a node with values from this object.
+     *
+     * @param DOMElement|false $node node to update; falls back to $this->node when empty
+     */
+    function updateElement ($node = false)
+    {
+        self::updateNode(empty($node) ? $this->node : $node, self::createDom($this->toObject()));
+
+    }
+    /**
+     * Convert to plain HTML (the JS editor uses this for insertAtCursor).
+     *
+     * @return string
+     */
+    function toHTML ()
+    {
+        return self::createHTML($this->toObject());
+    }
+    /**
+     * Used by readElement to extract data from a node;
+     * may need improving as it's pretty basic.
+     *
+     * @param DOMElement  $node  node to read from
+     * @param string|true $tag   tag to find (first descendant, or $node itself when it matches); true = use $node
+     * @param string|false $attr attribute name; 'html' for the contents, 'style' to read the
+     *                           style property named by $style, false to return the node itself
+     * @param string|false $style the style property - eg. text-align
+     * @return DOMElement|string the node, the value, or '' when nothing was found
+     */
+    function getVal($node, $tag , $attr = false, $style = false)
+    {
+        $n = $node;
+        // DOM tag names of parsed HTML are lower case, so compare without regard to case.
+        if ($tag !== true && strcasecmp($n->nodeName, $tag) != 0) {
+            // in theory we could do figure[3] << 3rd figure? or some more complex search..?
+            // but kiss for now.
+            $n = $node->getElementsByTagName($tag)->item(0);
+        }
+        if (!$n) {
+            return '';
+        }
+        if ($attr === false) {
+            return $n;
+        }
+        if ($attr === 'html') {
+            return $this->innerHTML($n);
+        }
+        if ($attr === 'style') {
+            $styles = $this->styleToObject($n, true);
+            $key = strtolower((string) $style);
+            return isset($styles[$key]) ? $styles[$key] : '';
+        }
+
+        return $n->hasAttribute($attr) ? $n->getAttribute($attr) : '';
+
+    }
+    /**
+     * Create a DomHelper friendly definition - for use with createHTML() / createDom()
+     * (override this).
+     */
+    abstract function toObject();
+    /**
+     * Read a node that has a 'data-block' property - and extract the values from it.
+     * @param DOMElement $node - the node
+     */
+    abstract function readElement ($node);
+
+    /**
+     * Render a definition as an HTML string.
+     *
+     * A definition is a string (returned untouched) or an array/object with 'tag'
+     * (default div), 'cls' (class), 'style' (string or property => value array),
+     * 'cn'/'children' (nested definitions), 'html' (raw inner markup) and any other
+     * key as an attribute. Attribute values are escaped; 'html' is not.
+     *
+     * @param string|array|object $o
+     * @return string
+     */
+    static function createHTML($o)
+    {
+
+        if (is_string($o)) {
+            return $o;
+        }
+        $o = self::definitionToArray($o);
+        $tag = empty($o['tag']) ? 'div' : $o['tag'];
+        $b = '<' . $tag;
+
+        foreach($o as $attr => $val) {
+            $attr = (string) $attr;
+            if (in_array($attr, array('tag', 'children', 'cn', 'html'), true)) {
+                continue;
+            }
+            if ($attr === 'style') {
+                if (is_array($val)) {
+                    $val = self::styleString($val);
+                }
+                if (is_string($val)) {
+                    $b .= ' style="' . self::escapeAttr($val) . '"';
+                }
+                continue;
+            }
+            if ($attr === 'cls') {
+                $attr = 'class';
+            } else if ($attr === 'htmlFor') {
+                $attr = 'for';
+            }
+            $b .= ' ' . $attr . '="' . self::escapeAttr($val) . '"';
+        }
+        if (preg_match('/^(?:br|frame|hr|img|input|link|meta|range|spacer|wbr|area|param|col)$/i', $tag)) {
+            return $b . "/>"; // empty
+        }
+        $b .= ">";
+        foreach (self::childDefinitions($o) as $child) {
+            $b .= self::createHTML($child);
+        }
+        if (isset($o['html'])) {
+            $b .= $o['html'];
+        }
+        return $b . '</' . $tag . '>';
+
+    }
+
+    /**
+     * Build DOM nodes from a definition (same format as createHTML()).
+     *
+     * Nodes are created in the parent's document when a parent is given, otherwise
+     * in a new document of their own.
+     *
+     * @param string|array|object $o
+     * @param DOMNode|false $parentNode node to append the result to
+     * @return DOMNode the created element (or text node for a string definition)
+     */
+    static function createDom ($o, $parentNode = false) {
+
+        if ($parentNode) {
+            $doc = $parentNode instanceof DOMDocument ? $parentNode : $parentNode->ownerDocument;
+        } else {
+            $doc = new DOMDocument('1.0', 'utf8');
+        }
+
+        if (is_string($o)) {
+            $text = $doc->createTextNode($o);
+            return  $parentNode ? $parentNode->appendChild($text) : $text;
+        }
+        $o = self::definitionToArray($o);
+        $el = $doc->createElement(empty($o['tag']) ? 'div' : $o['tag']);
+
+        foreach ($o as $attr => $val) {
+            $attr = (string) $attr;
+            if (in_array($attr, array('tag', 'ns', 'xmlns', 'children', 'cn', 'html', 'style'), true)) {
+                continue;
+            }
+            $el->setAttribute($attr === 'cls' ? 'class' : $attr, (string) $val);
+        }
+        if (isset($o['style'])) {
+            $css = is_array($o['style']) ? self::styleString($o['style']) : (string) $o['style'];
+            if ($css !== '') {
+                $el->setAttribute('style', $css);
+            }
+        }
+        foreach (self::childDefinitions($o) as $child) {
+            self::createDom($child, $el);
+        }
+        if (isset($o['html']) && (string) $o['html'] !== '') {
+            self::appendMarkup($el, (string) $o['html']);
+        }
+        if($parentNode){
+           $parentNode->appendChild($el);
+        }
+        return $el;
+    }
+
+    /**
+     * Make the tree under $from look like $to, reusing existing nodes where the
+     * node type and tag name match and replacing them where they do not.
+     *
+     * @param DOMNode $from node in the document being cleaned
+     * @param DOMNode $to   node to copy from (may belong to another document)
+     */
+    static function updateNode ($from, $to)
+    {
+        // should we handle non-standard elements?
+        $doc = $from->ownerDocument;
+        if ($doc && $to->ownerDocument && !$doc->isSameNode($to->ownerDocument)) {
+            // createDom() builds in its own document; nodes have to be imported before use.
+            $to = $doc->importNode($to, true);
+        }
+
+        if ($from->nodeType != $to->nodeType) {
+            if ($from->parentNode) {
+                $from->parentNode->replaceChild($to, $from);
+            }
+            return;
+        }
+
+        if ($from->nodeType == XML_TEXT_NODE) {
+            if ($from->data != $to->data) {
+                $from->data = $to->data;
+            }
+            return;
+        }
+        // not sure why, but from's parent node might not exist?
+        if (!$from->parentNode) {
+            return;
+        }
+        if ($from->nodeType != XML_ELEMENT_NODE || $from->tagName != $to->tagName) {
+            $from->parentNode->replaceChild($to, $from);
+            return;
+        }
+
+        // drop attributes that are gone from 'to' (but always keep ids)
+        foreach (iterator_to_array($from->attributes, false) as $attr) {
+            if ($attr->name == 'id' || $to->hasAttribute($attr->name)) {
+                continue;
+            }
+            $from->removeAttribute($attr->name);
+        }
+        // copy new / changed attributes
+        foreach ($to->attributes as $attr) {
+            if ($from->hasAttribute($attr->name) && $from->getAttribute($attr->name) === $attr->value) {
+                continue;
+            }
+            $from->setAttribute($attr->name, $attr->value);
+        }
+        // children - work on snapshots, as nodes get moved and removed below.
+        $far = iterator_to_array($from->childNodes, false);
+        $tar = iterator_to_array($to->childNodes, false);
+        $nfar = count($far);
+        $ntar = count($tar);
+
+        for ($i = 0; $i < max($ntar, $nfar); $i++) {
+            if ($i >= $nfar) {
+                $from->appendChild($tar[$i]);
+                continue;
+            }
+            if ($i >= $ntar) {
+                $from->removeChild($far[$i]);
+                continue;
+            }
+            self::updateNode($far[$i], $tar[$i]);
+        }
+
+    }
+
+    /** Normalize a definition (array or object) to an array. */
+    private static function definitionToArray($o)
+    {
+        return is_object($o) ? get_object_vars($o) : (array) $o;
+    }
+
+    /**
+     * Return the child definitions ('cn' or 'children') of a definition as a list.
+     * A single child - a string, an object, or an array with string keys - is wrapped.
+     */
+    private static function childDefinitions($o)
+    {
+        $cn = isset($o['cn']) ? $o['cn'] : (isset($o['children']) ? $o['children'] : false);
+        if ($cn === false || $cn === '' || $cn === array()) {
+            return array();
+        }
+        if (is_array($cn) && array_keys($cn) === range(0, count($cn) - 1)) {
+            return $cn;
+        }
+        return array($cn);
+    }
+
+    /** Serialize a style array as "property:value;property:value;". */
+    private static function styleString($style)
+    {
+        $css = '';
+        foreach ($style as $prop => $value) {
+            $css .= $prop . ':' . $value . ';';
+        }
+        return $css;
+    }
+
+    /** Escape a value for use inside a double quoted HTML attribute. */
+    private static function escapeAttr($val)
+    {
+        return htmlspecialchars((string) $val, ENT_QUOTES, 'UTF-8');
+    }
+
+    /**
+     * Append a markup string to an element. Markup that is not well-formed XML
+     * cannot be parsed into a fragment and is appended as plain text instead.
+     */
+    private static function appendMarkup($el, $markup)
+    {
+        $doc = $el->ownerDocument;
+        $f = $doc->createDocumentFragment();
+        $previous = libxml_use_internal_errors(true);
+        $ok = $f->appendXML($markup);
+        libxml_clear_errors();
+        libxml_use_internal_errors($previous);
+        if ($ok && $f->hasChildNodes()) {
+            $el->appendChild($f);
+            return;
+        }
+        $el->appendChild($doc->createTextNode($markup));
+    }
+
+
+}
diff --git a/HTML/Clean/BlockFigure.php b/HTML/Clean/BlockFigure.php

new file mode 100644 (file)

index 0000000..3c19784
--- /dev/null
+++ b/HTML/Clean/BlockFigure.php
@@ -0,0 +1,195 @@
+<?php 
+
+/**
+ * This handles 'figure' tags (which is what img's are auto wrapped with.)
+ */
+
+ 
+require_once 'Block.php';
+class  HTML_Clean_BlockFigure extends HTML_Clean_Block
+{
+
+    /**
+     * When a node is given its values are read and the node is rebuilt straight away;
+     * the config values are then applied as properties.
+     *
+     * @param array $cfg property name => value; 'node' is the element to read (optional)
+     */
+    function __construct($cfg) {
+        if (!empty($cfg['node'])) {
+            $this->readElement($cfg['node']);
+            $this->updateElement($cfg['node']);
+        }
+        parent::__construct($cfg);
+    }
+
+
+
+    // setable values.
+    var $image_src= '';
+    var $align= 'center';
+    var $caption = '';
+    var $caption_display = 'block';
+    var $width = '100%';
+    var $cls = '';
+    var $href = '';
+    var $video_url = '';
+
+    // margin: '2%', not used
+
+    var $text_align = 'left'; //   (left|right) alignment for the text caption default left. - not used at present
+
+
+    // used by context menu
+
+    /**
+     * Create the definition of the figure (figure > [div >] [a >] img + figcaption)
+     * for HTML_Clean_Block::createDom() / createHTML().
+     *
+     * @return array
+     */
+    function toObject ()
+    {
+        // plain text version of the caption for the alt attribute. strip_tags() is used
+        // as the caption may be empty or contain markup that is not well-formed XML.
+        $caption_plain = '';
+        if ($this->caption_display == "block") {
+            $text = html_entity_decode(strip_tags((string) $this->caption), ENT_QUOTES, 'UTF-8');
+            $collapsed = preg_replace('/\s+/u', ' ', $text);
+            // preg_replace() gives null when the text is not valid UTF-8.
+            $caption_plain = trim($collapsed === null ? $text : $collapsed);
+        }
+
+        $centered = $this->align == 'center';
+        $m = $this->width != '100%' && $centered ? '0 auto' : 0;
+
+        $iw = $centered ? $this->width : '100%';
+        $img =   array(
+            'tag' => 'img',
+            'src' => $this->image_src,
+            'alt' => $caption_plain,
+            'style'=> array(
+                'width' => $iw,
+                'max-width' => $iw . ' !important', // this is not getting rendered?
+                'margin' => $m
+
+            )
+        );
+
+        if (!empty($this->href)) {
+            $img = array(
+                'tag' => 'a',
+                'href' => $this->href,
+                'cn' => array(
+                    $img
+                )
+            );
+        }
+
+
+        if (!empty($this->video_url)) {
+            $img = array(
+                'tag' => 'div',
+                'cls' => $this->cls,
+                'frameborder' => 0,
+                'allowfullscreen' => true,
+                'width' => 420,  // these are for video tricks - that we replace the outer
+                'height' => 315,
+                'src' => $this->video_url,
+                'cn' => array(
+                    $img
+                )
+            );
+        }
+        // we remove caption totally if its hidden... - will delete data.. but otherwise we end up with fake caption
+        $captionhtml = $this->caption_display == 'none' || !strlen($this->caption) ? '' : $this->caption;
+
+        $caption_width = $centered ? $this->width : '100%';
+
+        return  array(
+            'tag' => 'figure',
+            'data-block' => 'Figure',
+            'data-width' => $this->width,
+
+
+            'style' => array(
+                'display' => 'block',
+                'float' =>  $this->align ,
+                'max-width' =>  $centered ? '100% !important' : ($this->width . ' !important'),
+                'width' => $centered ? '100%' : $this->width,
+                'margin' =>  '0px',
+                'padding' => $centered ? '0' : '0 10px' ,
+                'text-align' => $this->align   // seems to work for email..
+
+            ),
+
+
+            'align' => $this->align,
+            'cn' => array(
+                $img,
+
+                array (
+                    'tag'=> 'figcaption',
+                    'data-display' => $this->caption_display,
+                    'style' => array(
+                        'text-align' => 'left',
+                        'font-size' => '16px',
+                        'line-height' => '24px',
+                        'display' => $this->caption_display,
+                        'max-width' => $caption_width . ' !important',
+                        'margin'=> $m,
+                        'width'=> $caption_width
+                    ),
+                    'cls' => strlen($this->cls) > 0 ? ($this->cls . '-thumbnail' ) : '',
+                    'cn' => array(
+                        array(
+                            'tag' => 'div',
+                            'style'  => array(
+                                'margin-top' => '16px',
+                                'text-align' => 'left'
+                            ),
+                            'align'=> 'left',
+                            'cn' => array(
+                                array(
+                                    // we can not rely on yahoo syndication to use CSS elements - so have to use  '<i>' to encase stuff.
+                                    'tag' => 'i',
+                                    'html' => $captionhtml
+                                )
+
+                            )
+                        )
+
+                    )
+
+                )
+            )
+        );
+
+    }
+
+    /**
+     * Read the figure settings back from an existing figure node.
+     * Values that are not present in the node end up as ''.
+     *
+     * @param DOMElement $node the figure element
+     */
+    function readElement ($node)
+    {
+        // this should not really come from the link...
+        $this->video_url = $this->getVal($node, 'div', 'src');
+        $this->cls = $this->getVal($node, 'div', 'class');
+        $this->href = $this->getVal($node, 'a', 'href');
+
+
+        $this->image_src = $this->getVal($node, 'img', 'src');
+
+        $this->align = $this->getVal($node, 'figure', 'align');
+
+        // getVal() gives '' when there is no figcaption.
+        $figcaption = $this->getVal($node, 'figcaption', false);
+        if ($figcaption !== '') {
+            $this->caption = $this->getVal($figcaption, 'i', 'html');
+        }
+
+
+        $this->caption_display = $this->getVal($node, 'figcaption', 'data-display');
+        //$this->text_align = $this->getVal(node, 'figcaption', 'style','text-align');
+        $this->width = $this->getVal($node, true, 'data-width');
+        //$this->margin = $this->getVal(node, 'figure', 'style', 'margin');
+
+    }
+
+
+
+
+}
+\ No newline at end of file
diff --git a/HTML/Clean/BlockTable.php b/HTML/Clean/BlockTable.php

new file mode 100644 (file)

index 0000000..c21e9ea
--- /dev/null
+++ b/HTML/Clean/BlockTable.php
@@ -0,0 +1,147 @@
+<?php 
+
+/**
+ * This handles 'table' tags (and puts nice borders on them.)
+ */
+
+ 
+require_once 'Block.php';
+require_once 'BlockTd.php'; 
+class  HTML_Clean_BlockTable extends HTML_Clean_Block
+{
+
+    var $rows = array(); // row cols {array}
+    var $no_col = 1;
+    var $no_row = 1;
+    var $width = '100%';
+
+    /**
+     * When a node is given the table is read from it and rebuilt straight away;
+     * without a node an empty no_row x no_col grid is created.
+     *
+     * @param array $cfg property name => value; 'node' is the table element (optional)
+     */
+    function __construct($cfg) {
+
+        if (!empty($cfg['node'])) {
+            $this->readElement($cfg['node']);
+            $this->updateElement($cfg['node']);
+        }
+        parent::__construct($cfg);
+        if (!$this->node) {
+
+            for($r = 0; $r < $this->no_row; $r++) {
+                $this->rows[$r] = array();
+                for($c = 0; $c < $this->no_col; $c++) {
+                    $this->rows[$r][$c] = $this->emptyCell();
+                }
+            }
+        }
+    }
+
+    /**
+     * Create the definition of the table (table > tbody > tr > cells)
+     * for HTML_Clean_Block::createDom() / createHTML().
+     *
+     * @return array
+     */
+    function toObject ()
+    {
+        // do we have a head = not really
+        $tbody = array( 'tag' => 'tbody' , 'cn' => array() );
+
+        foreach($this->rows as $row) {
+            // does the row have any properties? ?? height?
+            $tr = array(
+                'tag' => 'tr',
+                'style' => array(
+                    'margin' => '6px',
+                    'border' => 'solid 1px #000',
+                    'text-align' => 'left'
+                ),
+                'cn' => array()
+            );
+            foreach($row as $cell) {
+                // widths ?
+                $tr['cn'][] = $cell->toObject();
+            }
+            $tbody['cn'][] = $tr;
+        }
+
+        return array(
+            'tag' => 'table',
+            'data-block' => 'Table',
+            'style' => array(
+                'width'=>  $this->width,
+                'border' => 'solid 1px #000', // ??? hard coded?
+                'border-collapse' => 'collapse'
+            ),
+            'cn' => array(
+                $tbody
+            )
+        );
+
+    }
+
+    /**
+     * Read the width, rows and cells from a table node.
+     * Only the table's own rows and cells are read - the content of a nested table
+     * stays part of the cell that contains it.
+     *
+     * @param DOMElement|false $node the table element; falls back to $this->node
+     */
+    function readElement($node)
+    {
+
+        $node  = $node ? $node : $this->node ;
+        $width = $this->getVal($node, true, 'style', 'width');
+        $this->width = $width !== '' ? $width : '100%';
+
+        $this->rows = array();
+        $this->no_row = 0;
+        foreach($this->tableRows($node) as $tr) {
+            $row =  array();
+
+            $this->no_row++;
+            $no_column = 0;
+            // th is read as well, so that header cells are not lost when the table is rebuilt.
+            foreach($this->childElements($tr, array('td', 'th')) as $td) {
+                $add = new HTML_Clean_BlockTd( array('node' => $td ));
+                $no_column += $add->colspan;
+                $row[] =   $add;
+            }
+            $this->rows[] = $row;
+            $this->no_col = max($this->no_col, $no_column);
+        }
+
+    }
+
+    /**
+     * Create a cell with default settings.
+     *
+     * @return HTML_Clean_BlockTd
+     */
+    function emptyCell () {
+        return new HTML_Clean_BlockTd(array());
+
+    }
+
+    /**
+     * The rows of a table in document order: tr elements directly below the table
+     * or directly below one of its thead / tbody / tfoot sections.
+     */
+    private function tableRows($table)
+    {
+        $rows = array();
+        foreach ($this->childElements($table, array('tr', 'thead', 'tbody', 'tfoot')) as $child) {
+            if (strtolower($child->nodeName) == 'tr') {
+                $rows[] = $child;
+                continue;
+            }
+            foreach ($this->childElements($child, array('tr')) as $tr) {
+                $rows[] = $tr;
+            }
+        }
+        return $rows;
+    }
+
+    /** Direct child elements of $parent with one of the given (lower case) tag names. */
+    private function childElements($parent, $names)
+    {
+        $found = array();
+        foreach ($parent->childNodes as $child) {
+            if ($child->nodeType == XML_ELEMENT_NODE && in_array(strtolower($child->nodeName), $names, true)) {
+                $found[] = $child;
+            }
+        }
+        return $found;
+    }
+}
+\ No newline at end of file
diff --git a/HTML/Clean/BlockTd.php b/HTML/Clean/BlockTd.php

new file mode 100644 (file)

index 0000000..ed04b87
--- /dev/null
+++ b/HTML/Clean/BlockTd.php
@@ -0,0 +1,85 @@
+<?php 
+
+/**
+ * This handles 'td' tags (and puts nice borders on them.)
+ */
+
+ 
+require_once 'Block.php';
+class  HTML_Clean_BlockTd extends HTML_Clean_Block
+{
+    var $width = ''; // should be a percent.!
+    var $textAlign = 'left';
+    var $valign = 'top';
+
+    var $colspan = 1;
+    var $rowspan = 1;
+
+    var $html = ''; // inner markup of the cell.
+
+    /**
+     * When a node is given the cell is read from it and rebuilt straight away;
+     * the config values are then applied as properties.
+     *
+     * @param array $cfg property name => value; 'node' is the td element (optional)
+     */
+    function __construct($cfg) {
+
+        if (!empty($cfg['node'])) {
+            $this->readElement($cfg['node']);
+            $this->updateElement($cfg['node']);
+        }
+        parent::__construct($cfg);
+
+    }
+
+    /**
+     * Create the definition of the cell for HTML_Clean_Block::createDom() / createHTML().
+     *
+     * @return array
+     */
+    function toObject ()
+    {
+        $ret = array(
+            'tag' => 'td',
+            'data-block' => 'Td',
+            'valign' => $this->valign,
+            'style' => array(
+                'text-align' =>  $this->textAlign,
+                'border' => 'solid 1px rgb(0, 0, 0)', // ??? hard coded?
+                'border-collapse' => 'collapse',
+                'padding' => '6px', // 8 for desktop / 4 for mobile
+                'vertical-align'=> $this->valign
+            ),
+            'html' => $this->html
+        );
+        if ($this->width != '') {
+            $ret['width'] = $this->width;
+            $ret['style']['width'] = $this->width;
+        }
+
+        // spans are only written when they differ from the default of 1.
+        if ($this->colspan > 1) {
+            $ret['colspan'] = $this->colspan ;
+        }
+        if ($this->rowspan > 1) {
+            $ret['rowspan'] = $this->rowspan ;
+        }
+
+        return $ret;
+
+    }
+
+
+    /**
+     * Read width, spans, content and alignment from a td node.
+     *
+     * @param DOMElement|false $node the td element; falls back to $this->node
+     */
+    function readElement ($node)
+    {
+        $node  = $node ? $node : $this->node ;
+
+
+        $this->width = $node->getAttribute('width');
+        // a missing attribute reads as '', which casts to 0 - spans are never below 1.
+        $this->colspan = max(1, (int) $node->getAttribute('colspan'));
+        $this->rowspan = max(1, (int) $node->getAttribute('rowspan'));
+        $this->html = $this->innerHTML($node);
+        $styles = $this->styleToObject($node,true);
+
+        if (!empty($styles['text-align'])) {
+            $this->textAlign = $styles['text-align'];
+        }
+        if ($node->hasAttribute('valign')) {
+            $this->valign = $node->getAttribute('valign');
+        }
+
+    }
+
+}
+\ No newline at end of file
diff --git a/HTML/Clean/Filter.php b/HTML/Clean/Filter.php

new file mode 100644 (file)

index 0000000..0df51ea
--- /dev/null
+++ b/HTML/Clean/Filter.php
@@ -0,0 +1,142 @@
+<?php
+
+/**
+ * Base class for all the filtering; contains a few useful routines along with the walk code.
+ * 
+ * It doesn't really use the filter parent much.
+ *
+ */ 
+
+class  HTML_Clean_Filter
+{
+    var $replaceComment = false; // default to trash these.!
+
+    var $node = false;
+    var $tag = false;
+
+    /**
+     * Copy every entry of the config onto this object as a property.
+     *
+     * @param array $cfg property name => value (eg. 'node', 'tag')
+     */
+    function __construct($cfg = array())
+    {
+        foreach($cfg as $k=>$v) {
+            $this->$k = $v;
+        }
+    }
+
+    /**
+     * Walk the tree below $dom, calling replaceTag() for every element that
+     * matches $this->tag. Children of a matching element are only walked when
+     * replaceTag() did not return false.
+     *
+     * @param DOMNode $dom node whose children are walked
+     */
+    function walk ($dom)
+    {
+        // snapshot the children - replaceTag() may remove or move nodes.
+        $ar = $this->arrayFrom($dom->childNodes);
+        foreach($ar as $e) {
+
+            if ($this->isTagMatch($e) && false === $this->replaceTag($e)) {
+                continue; // handled - do not walk its children.
+            }
+            // matching tags that want their children walked, and tags that do not match.
+            if ($e->hasChildNodes()) {
+                $this->walk($e);
+            }
+        }
+
+    }
+
+    /**
+     * Test if a node should be passed to replaceTag().
+     * Comments are handed to replaceComment() here when $this->replaceComment is set.
+     *
+     * $this->tag is true (every element), a tag name, or an array of tag names;
+     * ':' stands for any namespaced tag (eg. o:p). Names are compared without
+     * regard to case.
+     *
+     * @param DOMNode $e
+     * @return bool
+     */
+    function isTagMatch($e) {
+        if ($e->nodeType == XML_COMMENT_NODE && $this->replaceComment !== false) {
+            $this->replaceComment($e);
+            return false;
+        }
+        if ($e->nodeType != XML_ELEMENT_NODE) { //not a node.
+            return false;
+        }
+        if ($this->tag === true) { // everything
+            return true;
+        }
+        if (!is_array($this->tag) && !is_string($this->tag)) {
+            return false;
+        }
+        // the DOM reports lower case names for parsed HTML, the tag lists are upper case.
+        $tags = array_map('strtoupper', (array) $this->tag);
+        $name = strtoupper($e->tagName);
+        if (strpos($name, ':') !== false && in_array(':', $tags, true)) {
+            return true;
+        }
+        return in_array($name, $tags, true);
+
+    }
+
+
+    // dummy version - implementations should return false to not walk children.
+    function replaceTag($e) {
+        // if we avoid filtering here -> we could just call walk on all the child names.
+
+        return true;
+    }
+
+    /**
+     * Called by isTagMatch() for comment nodes when $this->replaceComment is set.
+     * This default removes the comment.
+     *
+     * @param DOMNode $node the comment node
+     */
+    function replaceComment($node)
+    {
+        if ($node->parentNode) {
+            $node->parentNode->removeChild($node);
+        }
+    }
+
+    /**
+     * Remove a node, leaving its children in its place.
+     *
+     * @param DOMNode $node
+     */
+    function removeNodeKeepChildren  ( $node)
+    {
+        $parent = $node->parentNode;
+        if (!$parent) {
+            return; // not in a tree - nowhere to move the children to.
+        }
+        foreach($this->arrayFrom($node->childNodes) as $n) {
+            // insertBefore() moves the child out of $node.
+            $parent->insertBefore($n, $node);
+        }
+        $parent->removeChild($node);
+    }
+
+    /**
+     * Copy a node list / attribute map (or anything else that can be iterated)
+     * into a plain array, keeping the keys.
+     *
+     * @param Traversable|array|null $list
+     * @return array
+     */
+    function arrayFrom($list)
+    {
+        $ret = array();
+        if (!is_array($list) && !($list instanceof Traversable)) {
+            return $ret; // eg. ->attributes of a node that is not an element.
+        }
+        foreach($list as $k=> $l) {
+            $ret[$k] = $l;
+        }
+        return $ret;
+    }
+
+    /**
+     * Serialize the children of a node (as XML, using saveXML()).
+     *
+     * @param DOMNode $n
+     * @return string
+     */
+    function innerHTML($n)
+    {
+        $ret = "";
+        foreach($n->childNodes as $c) {
+            $ret .= $c->ownerDocument->saveXML($c);
+        }
+        return $ret;
+    }
+
+    /**
+     * Move all children of one node to the end of another.
+     *
+     * @param DOMNode $from
+     * @param DOMNode $to
+     */
+    function copyInnerHTML($from, $to)
+    {
+        $ar = $this->arrayFrom($from->childNodes);
+        foreach($ar as $n) {
+            $from->removeChild($n);
+            $to->appendChild($n);
+        }
+    }
+
+    /**
+     * Parse the style attribute of a node into a property => value array.
+     * Segments without a ':' (eg. what follows a trailing ';') are skipped.
+     *
+     * @param DOMElement $node
+     * @param bool $lower lower-case the property names
+     * @return array
+     */
+    function styleToObject($node, $lower = false)
+    {
+        $styles = explode(';',$node->hasAttribute("style") ? $node->getAttribute("style")  : '');
+        $ret = array();
+        foreach($styles as $s) {
+            if (strpos($s, ':') === false) {
+                continue;
+            }
+            $kv = explode(':', $s, 2);
+            $prop = trim($kv[0]);
+
+            // what ever is left... we allow.
+            $ret[$lower ? strtolower($prop) : $prop] = trim($kv[1]);
+        }
+        return $ret;
+    }
+
+    /**
+     * Write a property => value array to the style attribute of a node.
+     *
+     * @param DOMElement $node
+     * @param array $style
+     */
+    function nodeSetStyle($node, $style)
+    {
+        $str = array();
+        foreach($style as $k=>$v) {
+            $str[] = "$k:$v";
+        }
+        $node->setAttribute('style', implode(";", $str));
+    }
+
+}
diff --git a/HTML/Clean/FilterAttributes.php b/HTML/Clean/FilterAttributes.php

new file mode 100644 (file)

index 0000000..75dad9b
--- /dev/null
+++ b/HTML/Clean/FilterAttributes.php
@@ -0,0 +1,137 @@
+<?php
+
+
+
+/**
+ *  replaces bad attributes... and attribute values
+ *  
+ *  done by walking all elements
+ *   
+ *  
+ *
+ */ 
+
+require_once 'Filter.php';
+
+// NOTE(review): the file is FilterAttributes.php but the class is named
+// HTML_Clean_FilterAttribute (singular) - confirm which name the callers use.
+class HTML_Clean_FilterAttribute  extends HTML_Clean_Filter
+{
+   
+    var $tag =  true; // all tags
+    
+    var $attrib_black = array(); // lower-case attribute names that are always removed
+    var $attrib_clean = array(); // lower-case attribute names whose value is checked by cleanAttr()
+    var $attrib_white = array(); // if not empty - only these (lower-case) attribute names are kept
+    
+    var $style_black = array(); // lower-case style properties that are always removed
+    var $style_white = array(); // if not empty - only these (lower-case) style properties are kept
+    
+    /**
+     * Applies the configuration and immediately walks $cfg['node'].
+     *
+     * @param array $cfg configuration; must contain 'node'.
+     */
+    function __construct($cfg)
+    {
+        parent::__construct($cfg);
+        $this->walk($cfg['node']);
+    } 
+    
+    /**
+     * Cleans the attributes of a single element.
+     *
+     * @param DOMElement $node the element being visited.
+     * @return bool always true, so that the children are visited as well.
+     */
+    function replaceTag ($node)
+    {
+        if (!$node->hasAttributes()) {
+            return true; // do children.
+        }
+        // snapshot - the attribute map changes as attributes are removed.
+        $ats = $this->arrayFrom($node->attributes);
+        foreach($ats as $a) {
+            $name = strtolower($a->name);
+            
+            // if we have a white list - remove everything that is NOT on it.
+            if (count($this->attrib_white) && !in_array($name, $this->attrib_white)) {
+                $node->removeAttribute($a->name);
+                continue;
+            }
+            
+            // always remove 'on' (event handlers)
+            if (substr($name, 0, 2) == 'on')  {
+                $node->removeAttribute($a->name);
+                continue;
+            }
+            
+            if (in_array($name, $this->attrib_black)) {
+                $node->removeAttribute($a->name);
+                continue;
+            }
+            if (in_array($name, $this->attrib_clean))  {
+                $this->cleanAttr($node, $a->name, $a->value);
+                continue;
+            }
+                
+            if ($name == 'style') {
+                $this->cleanStyle($node);
+                continue;
+            }
+            /// clean up MS crap..
+            // technically this should be a list of valid classes..
+            if ($name == 'class') {
+                if (preg_match('/^Mso/', $a->value) || preg_match('/^body$/', $a->value)) {
+                    $node->removeAttribute('class');
+                    continue;
+                }
+            }
+        }
+        return true; // clean children
+    }
+    
+    /**
+     * Removes a URL-carrying attribute unless its value looks safe.
+     *
+     * Kept are values that start with '.', '/', '#' or '{' (template) and
+     * values that start with one of the schemes http, https, mailto, ftp or data.
+     *
+     * @param DOMElement $node the element owning the attribute.
+     * @param string     $n    attribute name.
+     * @param string     $v    attribute value.
+     * @return void
+     */
+    function cleanAttr($node, $n,$v)
+    {
+        $v = trim($v);
+        // starts with 'dot' or 'slash', 'hash' or '{' << template
+        if (preg_match('/^(\.|\/|#|\{)/' , $v)) {
+            return;
+        }
+        // standard stuff? - should we allow data?
+        // the scheme has to be at the START of the value - an unanchored test
+        // lets values such as "javascript:alert('http:')" through.
+        if (preg_match('/^(https?|mailto|ftp|data):/i' , $v)) {
+            return;
+        }
+        
+        $node->removeAttribute($n);
+        
+    }
+    
+    /**
+     * Filters the inline style of an element against the style black/white lists.
+     *
+     * Black-listed properties are always dropped. When a white list is
+     * configured, anything not on it is dropped too; without a white list all
+     * remaining properties are kept.
+     *
+     * @param DOMElement $node the element whose style attribute is cleaned.
+     * @return void
+     */
+    function cleanStyle ($node)
+    {
+        if (preg_match('/expression/i', $node->getAttribute('style'))) { //XSS?? should we even bother..
+            $node->removeAttribute('style');
+            return;
+        }
+        $style = $this->styleToObject($node);
+        if (!is_array($style)) {
+            // nothing that could be parsed - leave the attribute untouched.
+            return;
+        }
+        $update = false;
+        foreach($style as $k=>$v) {
+            $lk = strtolower($k);
+            
+            if (in_array($lk, $this->style_black)) {
+                unset($style[$k]);
+                $update = true;
+                continue;
+            }
+            
+            // only allow white-listed properties - but only if a white list was set.
+            if (count($this->style_white) && !in_array($lk, $this->style_white)) {
+                unset($style[$k]);
+                $update = true;
+            }
+        }
+        if (!$update) {
+            return;
+        }
+        if (!count($style)) {
+            // don't leave an empty style="" behind.
+            $node->removeAttribute('style');
+            return;
+        }
+        $this->nodeSetStyle($node, $style);
+    }
+}
+\ No newline at end of file
diff --git a/HTML/Clean/FilterBlack.php b/HTML/Clean/FilterBlack.php

new file mode 100644 (file)

index 0000000..6901ced
--- /dev/null
+++ b/HTML/Clean/FilterBlack.php
@@ -0,0 +1,50 @@
+<?php
+
+/**
+ *
+ * The black list removes all matching nodes together with their children.
+ *
+ * If the tag list does not need ':' handling, we can just do a simple search by tag name.
+ *
+ */
+
+ 
+require_once 'Filter.php';
+
+class HTML_Clean_FilterBlack extends HTML_Clean_Filter
+{
+   
+ 
+    /**
+     * Applies the configuration and removes the black-listed elements.
+     *
+     * When the configured tag(s) do not include ':' a direct lookup by tag
+     * name is used; otherwise the tree below $cfg['node'] is walked.
+     *
+     * @param array $cfg configuration; 'node' is the root to clean.
+     */
+    function __construct($cfg)
+    {
+        parent::__construct($cfg);
+        if (is_array($this->tag) && !in_array(':', $this->tag)) {
+            $this->simpleReplace();
+            return;
+        }
+        if (is_string($this->tag) && ':' != $this->tag) {
+            $this->simpleReplace();
+            return;
+        }
+        $this->walk($cfg['node']);
+    } 
+ 
+    /**
+     * Removes a matched element (and with it everything below it).
+     *
+     * @param DOMNode $n the matched element.
+     * @return bool always false - the children are gone, no need to walk them.
+     */
+    function replaceTag ($n)
+    {
+        if ($n->parentNode) {
+            $n->parentNode->removeChild($n);
+        }
+        return false; // don't bother with children..
+    }
+    
+    /**
+     * Removes all elements with one of the configured tag names below $this->node.
+     *
+     * @return void
+     */
+    function simpleReplace()
+    {
+        foreach(is_array($this->tag) ? $this->tag : array($this->tag) as $t) {
+            // getElementsByTagName() compares names case-sensitively and parsed
+            // HTML stores element names in lower case, so look up the name as
+            // given and in lower case.
+            foreach(array_unique(array($t, strtolower($t))) as $name) {
+                // snapshot the list - it changes while elements are removed.
+                $ar = $this->arrayFrom($this->node->getElementsByTagName($name));
+                foreach($ar as $k) {
+                    // may already be gone as part of a removed ancestor.
+                    if ($k->parentNode) {
+                        $k->parentNode->removeChild($k);
+                    }
+                }
+            }
+        }
+    }
+    
+}
+\ No newline at end of file
diff --git a/HTML/Clean/FilterKeepChildren.php b/HTML/Clean/FilterKeepChildren.php

new file mode 100644 (file)

index 0000000..1e35cf4
--- /dev/null
+++ b/HTML/Clean/FilterKeepChildren.php
@@ -0,0 +1,61 @@
+<?php
+
+/**
+ *
+ * if the node matches, it will replace the child with children.
+ * done for wierd namespaced nodes, and stuff like font.
+ *
+ * js one extends black?
+ *
+ */
+
+ 
+require_once 'Filter.php';
+
+class HTML_Clean_FilterKeepChildren extends HTML_Clean_Filter
+{
+   
+ 
+    /**
+     * Applies the configuration and walks $cfg['node'], unless tag is false.
+     *
+     * @param array $cfg configuration; 'node' is the root to clean.
+     */
+    function __construct($cfg)
+    {
+        parent::__construct($cfg);
+        if ($this->tag === false) { //not sure why.
+            return;
+        }
+        
+        $this->walk($cfg['node']);
+    } 
+ 
+    /**
+     * Replaces a matched element by its own children.
+     *
+     * Matching children are unwrapped first (recursively), then all children
+     * are moved in front of the element and the element is removed.
+     *
+     * @param DOMNode $n the matched element.
+     * @return bool always false - the children have been handled here.
+     */
+    function replaceTag ($n)
+    {
+        //remove first.. - otherwise due to our walking method - the parent will not look at them.
+        foreach($this->arrayFrom($n->childNodes) as $t) {
+            if (!$this->isTagMatch($t)) {
+                continue;
+            }
+            $this->replaceTag($t); // this effectively walks all the children.
+        }
+        
+        $parent = $n->parentNode;
+        if (!$parent) {
+            // detached - nowhere to move the children to.
+            return false;
+        }
+        // fresh snapshot - the recursion above may have changed the child list.
+        foreach($this->arrayFrom($n->childNodes) as $t) {
+            $n->removeChild($t);
+            // what if we need to walk these???
+            $parent->insertBefore($t, $n);
+            // js code walks again.
+        }
+        $parent->removeChild($n);
+        return false; // don't walk children
+        
+    }
+    
+}
+\ No newline at end of file
diff --git a/HTML/Clean/FilterLongBr.php b/HTML/Clean/FilterLongBr.php

new file mode 100644 (file)

index 0000000..05c202d
--- /dev/null
+++ b/HTML/Clean/FilterLongBr.php
@@ -0,0 +1,76 @@
+<?php
+
+/**
+ *
+ *  br br br >>> BR ?
+ *  
+ *
+ */
+
+ 
+require_once 'Filter.php';
+
+class HTML_Clean_FilterLongBR extends HTML_Clean_Filter
+{
+   
+ 
+    /**
+     * Applies the configuration and checks every br element below $this->node.
+     *
+     * @param array $cfg configuration; 'node' is the root to clean.
+     */
+    function __construct($cfg)
+    {
+        parent::__construct($cfg);
+        // the lookup is case-sensitive: parsed HTML yields 'br', while elements
+        // created by the filters (createElement('BR')) are upper case.
+        $pp = array_merge(
+            array_values($this->arrayFrom($this->node->getElementsByTagName('br'))),
+            array_values($this->arrayFrom($this->node->getElementsByTagName('BR')))
+        );
+        foreach($pp as $p) {
+            if (!$p->parentNode) { // already removed.
+                continue;
+            }
+            $this->replaceIt($p);
+        }
+    }
+    
+    /**
+     * Decides whether a single BR is superfluous and removes it if so.
+     *
+     * A BR is removed when it is the last thing inside a TD/TH/LI/H1-H6, or
+     * when it is followed by another BR and preceded by a BR or a header
+     * (so that at most two BRs remain in a row). Whitespace-only text nodes
+     * are ignored when looking at the neighbours.
+     *
+     * @param DOMElement $node the BR element.
+     * @return bool always false.
+     */
+    function replaceIt($node)
+    {
+        
+        if (!$node->previousSibling) { // nothing before us...
+            return false;
+        }
+        
+        $ps = $node->nextSibling;
+        // find the next sibling that is not just whitespace.
+        while ($ps && $ps->nodeType == 3 && strlen(trim($ps->nodeValue)) < 1) {
+            $ps = $ps->nextSibling;
+        }
+        // we have no next sibling, and are inside one of these tags
+        if (!$ps && in_array(strtoupper($node->parentNode->nodeName), array( 'TD', 'TH', 'LI', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ))) {
+            $node->parentNode->removeChild($node); // remove last BR inside one of these tags
+            return false;
+        }
+        
+        if (!$ps || $ps->nodeType != 1) {
+            return false;
+        }
+        // next node is not a BR. (names of parsed elements are lower case, so compare upper-cased)
+        if (strtoupper($ps->nodeName) != 'BR') {
+            return false;
+        }
+        
+        $ps = $node->previousSibling;
+        while ($ps && $ps->nodeType == 3 && strlen(trim($ps->nodeValue)) < 1) {
+            $ps = $ps->previousSibling;
+        }
+        
+        if (!$ps || $ps->nodeType != 1) {
+            return false;
+        }
+        // if header or BR before.. then it's a candidate for removal.. - as we only want '2' of these..
+        // (this has to look at the PREVIOUS node, not at our parent.)
+        if (!in_array(strtoupper($ps->nodeName), array( 'BR', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ))) {
+            return false;
+        }
+        
+        $node->parentNode->removeChild($node); // remove me...
+        return false;
+    }
+}
+ 
+\ No newline at end of file
diff --git a/HTML/Clean/FilterParagraph.php b/HTML/Clean/FilterParagraph.php

new file mode 100644 (file)

index 0000000..9023b7e
--- /dev/null
+++ b/HTML/Clean/FilterParagraph.php
@@ -0,0 +1,56 @@
+<?php
+
+/**
+ *
+ * Replaces paragraphs with line breaks: every <p> element is replaced by its
+ * children followed by two <BR> elements; a paragraph that only contains
+ * whitespace is replaced by a single <BR>.
+ *
+ */
+
+ 
+require_once 'Filter.php';
+
+class HTML_Clean_FilterParagraph extends HTML_Clean_Filter
+{
+   
+ 
+    /**
+     * Applies the configuration and replaces every p element below $this->node.
+     *
+     * @param array $cfg configuration; 'node' is the root to clean.
+     */
+    function __construct($cfg)
+    {
+        parent::__construct($cfg);
+        // iterate over a snapshot: the node list shrinks as paragraphs are
+        // removed, and a fixed list guarantees that the loop ends.
+        $pp = $this->arrayFrom($this->node->getElementsByTagName('p'));
+        foreach($pp as $p) {
+            if (!$p->parentNode) {
+                continue;
+            }
+            $this->replaceIt($p);
+        }
+    }
+    
+    /**
+     * Replaces one paragraph.
+     *
+     * A paragraph holding a single whitespace-only text node becomes one BR.
+     * Any other paragraph is replaced by its children followed by two BRs.
+     *
+     * @param DOMElement $node the p element.
+     * @return void
+     */
+    function replaceIt($node)
+    {
+        $parent = $node->parentNode;
+        if (!$parent) {
+            return;
+        }
+        $doc = $node->ownerDocument;
+        
+        if ($node->childNodes->length == 1 &&
+            $node->childNodes->item(0)->nodeType == 3 &&
+            strlen(trim($node->childNodes->item(0)->textContent)) < 1
+            ) {
+            
+            // remove and replace with '<BR>';
+            $parent->replaceChild($doc->createElement('BR'), $node);
+            // the node is detached now - stop here, there is nothing left to unwrap.
+            return;
+        }
+        foreach($this->arrayFrom($node->childNodes) as $a) {
+            $node->removeChild($a);
+            // what if we need to walk these???
+            $parent->insertBefore($a, $node);
+        }
+        // now what about this?
+        // <p> &nbsp; </p>
+        
+        // double BR.
+        $parent->insertBefore($doc->createElement('BR'), $node);
+        $parent->insertBefore($doc->createElement('BR'), $node);
+        
+        $parent->removeChild($node);
+        
+    }
+}
+ 
+\ No newline at end of file
diff --git a/HTML/Clean/FilterSpan.php b/HTML/Clean/FilterSpan.php

new file mode 100644 (file)

index 0000000..346c374
--- /dev/null
+++ b/HTML/Clean/FilterSpan.php
@@ -0,0 +1,32 @@
+<?php
+
+/**
+ *
+ * remove spans without attributes.
+ * 
+ *
+ */
+
+ 
+require_once 'Filter.php';
+
+class HTML_Clean_FilterSpan extends HTML_Clean_Filter
+{
+   
+ 
+    /**
+     * Applies the configuration and unwraps every span below $this->node that
+     * has no attributes (the content of the span is kept).
+     *
+     * @param array $cfg configuration; 'node' is the root to clean.
+     */
+    function __construct($cfg)
+    {
+        parent::__construct($cfg);
+        // snapshot - the list changes while spans are being removed.
+        $ar = $this->arrayFrom($this->node->getElementsByTagName('span'));
+        foreach($ar as $a) {
+            if (!$a->parentNode || $a->hasAttributes()) {
+                continue;
+            }
+            $this->removeNodeKeepChildren($a);
+        }
+        
+    }
+    
+    
+}
+ 
+\ No newline at end of file
diff --git a/HTML/Clean/FilterStyleToTag.php b/HTML/Clean/FilterStyleToTag.php

new file mode 100644 (file)

index 0000000..6353b7e
--- /dev/null
+++ b/HTML/Clean/FilterStyleToTag.php
@@ -0,0 +1,82 @@
+<?php
+
+
+
+/**
+ *  replaces styles with HTML
+ *  
+ * 
+ *  
+ *
+ */ 
+
+require_once 'Filter.php';
+
+class HTML_Clean_FilterStyleToTag extends HTML_Clean_Filter
+{
+    
+    var $tag = true;
+    
+    // what we are going to change..
+    // tag name => style declaration(s) (lower-case property => value) that it replaces.
+    var $tags = array(
+        
+        
+        'B'  => array( 'font-weight' => 'bold' ),
+        'I' =>   array(  'font-style'  => 'italic' ),
+        
+        // h1.. h6 ?? font-size?
+        'SUP'  => array(   'vertical-align'  => 'super'),
+        'SUB' => array(   'vertical-align' => 'sub' )
+        
+    );
+    
+    /**
+     * Applies the configuration and immediately walks $cfg['node'].
+     *
+     * @param array $cfg configuration; must contain 'node'.
+     */
+    function __construct($cfg)
+    {
+        parent::__construct($cfg);
+        $this->walk($cfg['node']);
+    }
+    
+    /**
+     * Converts the style declarations listed in $tags into wrapper elements.
+     *
+     * For every entry whose declarations are all present on the element, the
+     * declarations are removed from the style and an element with that tag
+     * name is inserted between the element and its former children.
+     *
+     * @param DOMElement $node the element being visited.
+     * @return bool always true, so that the children are visited as well.
+     */
+    function replaceTag($node)
+    {
+        
+        
+        if (!$node->hasAttribute("style")) {
+            return true;
+        }
+        $inject = array();
+        $style = $this->styleToObject($node, true);
+        if (!is_array($style)) {
+            return true;
+        }
+        foreach ($this->tags as $tn => $kv) {
+            // $kv is keyed by property name - list() would only read
+            // the (non-existent) numeric offsets 0 and 1.
+            $matched = true;
+            foreach ($kv as $k => $v) {
+                if (!isset($style[$k]) || strtolower($style[$k]) != $v) {
+                    $matched = false;
+                    break;
+                }
+            }
+            if (!$matched) {
+                continue;
+            }
+            foreach (array_keys($kv) as $k) {
+                unset($style[$k]);
+            }
+            $inject[] = $tn;
+        }
+        if (!count($inject)) {
+            return true; 
+        }
+        if (count($style)) {
+            $this->nodeSetStyle($node, $style);
+        } else {
+            // nothing left - don't keep an empty style="" around.
+            $node->removeAttribute('style');
+        }
+        // snapshot of the current children, taken before the wrappers are added.
+        $cn = $this->arrayFrom($node->childNodes);
+        $nn = $node;
+        foreach($inject as $t) { 
+            // nest the wrappers: node > first > second ...
+            $nc = $node->ownerDocument->createElement($t);
+            $nn->appendChild($nc);
+            $nn = $nc;
+        }
+        // move the former children into the innermost wrapper.
+        foreach($cn as $n) {
+            $node->removeChild($n);
+            $nn->appendChild($n);
+        }
+        
+        return true; /// iterate thru
+    }
+    
+ 
+    
+}
+\ No newline at end of file
diff --git a/HTML/Clean/FilterWord.php b/HTML/Clean/FilterWord.php

new file mode 100644 (file)

index 0000000..f185cc9
--- /dev/null
+++ b/HTML/Clean/FilterWord.php
@@ -0,0 +1,241 @@
+<?php
+
+/**
+ * Does a few very specific Word paste clean-ups - doc bullets and <a name=...> tags.
+ * 
+ * Doesn't really use the Filter parent much.
+ *
+ */ 
+
+require_once 'Filter.php';
+
+class HTML_Clean_FilterWord extends HTML_Clean_Filter
+{
+    
+    var $tag = true;
+    
+    /**
+     * Runs the two Word specific clean-ups on the configured node.
+     *
+     * @param array|object $cfg configuration; 'node' is the root to clean.
+     */
+    function __construct($cfg)
+    {
+        // the other filters are configured with an array - accept that as
+        // well as the object form that was used here.
+        $node = is_array($cfg) ? $cfg['node'] : $cfg->node;
+        $this->replaceDocBullets($node);
+        $this->replaceAname($node);
+    }
+   
+    /**
+     * Finds the elements below $doc that carry the given class name.
+     *
+     * PHP's DOM classes have no getElementsByClassName(), so this is done
+     * with an XPath query. The result is a snapshot, not a live list.
+     *
+     * @param DOMNode $doc the document or element to search in.
+     * @param string  $cls a single class name (must not contain quotes).
+     * @return array list of matching elements (document order).
+     */
+    function elementsByClassName($doc, $cls)
+    {
+        $owner = ($doc instanceof DOMDocument) ? $doc : $doc->ownerDocument;
+        $xp = new DOMXPath($owner);
+        $found = $xp->query(
+            './/*[contains(concat(" ", normalize-space(@class), " "), " ' . $cls . ' ")]',
+            $doc
+        );
+        return $found ? array_values($this->arrayFrom($found)) : array();
+    }
+    
+    /**
+     * styleToObject() with lower-case keys that always yields an array.
+     *
+     * @param DOMElement $node the element to read the style from.
+     * @return array property name => value.
+     */
+    function styleOf($node)
+    {
+        $style = $this->styleToObject($node, true);
+        return is_array($style) ? $style : array();
+    }
+    
+    /**
+     * Removes the name attribute from all links; links that have no href are
+     * replaced by their content.
+     *
+     * @param DOMNode $doc the document or element to clean.
+     * @return void
+     */
+    function replaceAname  ($doc)
+    {
+        // replace all the a/name without..
+        $aa= $this->arrayFrom($doc->getElementsByTagName('a'));
+        
+        foreach ($aa as $a) {
+            if ($a->hasAttribute("name")) {
+                $a->removeAttribute("name");
+            }
+            if ($a->hasAttribute("href")) {
+                continue;
+            }
+            // reparent children.
+            $this->removeNodeKeepChildren($a);
+            
+        }
+    }
+    
+    /**
+     * Sets the class of every element in the list to "MsoListParagraph".
+     *
+     * @param mixed $list an array or node list of elements.
+     * @return void
+     */
+    function replaceClassList($list)
+    {
+        foreach($this->arrayFrom($list) as $l) {
+            $l->setAttribute('class', "MsoListParagraph");
+        }
+    }
+    
+    /**
+     * Converts the paragraphs that Word uses for bullet / numbered lists into
+     * real lists.
+     *
+     * The various list paragraph classes are normalised to MsoListParagraph
+     * first, then each run of list paragraphs is handed to replaceDocBullet().
+     *
+     * @param DOMNode $doc the document or element to clean.
+     * @return void
+     */
+    function replaceDocBullets  ($doc)
+    {
+        // this is a bit odd - but it appears some indents use ql-indent-1
+        
+        $this->replaceClassList($this->elementsByClassName($doc, 'MsoListParagraphCxSpFirst'));
+        $this->replaceClassList($this->elementsByClassName($doc, 'MsoListParagraphCxSpMiddle'));
+        $this->replaceClassList($this->elementsByClassName($doc, 'MsoListParagraphCxSpLast'));
+        $this->replaceClassList($this->elementsByClassName($doc, 'ql-indent-1'));
+
+           
+        // this is a bit hacky - we had one word document where h2 had a mso-list attribute.
+        $htwo =  $this->arrayFrom($doc->getElementsByTagName('h2'));
+        foreach($htwo as $a) {
+            if ($a->hasAttribute('style') && preg_match('/mso-list:/', $a->getAttribute('style'))) {
+                $a->setAttribute('class', "MsoListParagraph");
+            }
+        }
+        foreach($this->elementsByClassName($doc, 'MsoNormal') as $a) {
+            if ($a->hasAttribute('style') && preg_match('/mso-list:/', $a->getAttribute('style'))) {
+                $a->setAttribute('class', "MsoListParagraph");
+            } else {
+                $a->setAttribute('class', "MsoNormalx");
+            }
+        }
+       
+        // the result is a snapshot: replaceDocBullet() removes the paragraphs
+        // it has converted (or clears their class), so skip those.
+        foreach($this->elementsByClassName($doc, 'MsoListParagraph') as $p) {
+            if (!$p->parentNode) {
+                continue;
+            }
+            if (!preg_match('/(^|\s)MsoListParagraph(\s|$)/', $p->getAttribute('class'))) {
+                continue;
+            }
+            $this->replaceDocBullet($p);
+        }
+      
+    }
+    
+    /**
+     * Replaces one run of sibling list paragraphs, starting at $p, by a list.
+     *
+     * The list is an 'ol' when the bullet span of a paragraph uses a font
+     * other than Symbol, else an 'ul'. Nesting is derived from the margin-left
+     * of the paragraphs: every new margin value opens a deeper level.
+     * If $p turns out not to be a list item its class is cleared instead.
+     *
+     * @param DOMElement $p the first list paragraph of the run.
+     * @return void
+     */
+    function replaceDocBullet  ($p)
+    {
+        $parent = $p->parentNode;
+        if (!$parent) {
+            return;
+        }
+        // ownerDocument of the parent would be null if the parent is the document itself.
+        $doc = $p->ownerDocument;
+        $items = array();
+            
+        $listtype = 'ul';   
+        // gather all the siblings.
+        $ns = $p;
+        while ($ns) {
+            if ($ns->nodeType != 1) {
+                $ns = $ns->nextSibling;
+                continue;
+            }
+            $cln = $ns->hasAttribute('class') ? $ns->getAttribute('class') : '';
+            // the run ends at the first element that is NOT a list paragraph.
+            if (!preg_match('/(MsoListParagraph|ql-indent-1)/i', $cln)) {
+                break;
+            }
+            $spans = array_values($this->arrayFrom($ns->getElementsByTagName('span')));
+            if ($ns->hasAttribute('style') && preg_match('/mso-list/', $ns->getAttribute('style'))) {
+                $items[] = $ns;
+                if (count($spans) && $spans[0]->hasAttribute('style')) {
+                    $style = $this->styleOf($spans[0]);
+                    if (!empty($style['font-family']) && !preg_match('/Symbol/', $style['font-family'])) {
+                        $listtype = 'ol';
+                    }
+                }
+                $ns = $ns->nextSibling;
+                continue;
+            }
+            
+            if (!count($spans)) {
+                break;
+            }
+            $has_list  = false;
+            foreach($spans as $s) {
+                if ($s->hasAttribute('style') &&  preg_match('/mso-list/', $s->getAttribute('style'))) {
+                    $has_list = true;
+                    break;
+                }
+            }
+            if (!$has_list) {
+                break;
+            }
+            $items[] = $ns;
+            $ns = $ns->nextSibling;
+        }
+        if (!count($items)) {
+            // not a list after all - drop the class so it is not looked at again.
+            if ($ns && $ns->nodeType == 1) {
+                $ns->setAttribute('class', '');
+            }
+            return;
+        }
+        
+        $ul = $doc->createElement($listtype); // what about number lists...
+        $parent->insertBefore($ul, $p);
+        $lvl = 0;
+        $stack = array ( $ul );
+        $last_li = false;
+        
+        $margin_to_depth = array();
+        $max_margins = -1;
+        
+        foreach($items as $n)
+        {
+            $spans = $this->arrayFrom($n->getElementsByTagName('span'));
+            if (!count($spans)) {
+                $parent->removeChild($n);
+                continue; // skip it...
+            }
+                
+            $num = 1;
+            foreach($spans as $span) {
+            
+                $style = $this->styleOf($span);
+                if (empty($style['mso-list']) ) {
+                    continue;
+                }
+                if ($listtype == 'ol') {
+                    // keep the digits of the fake bullet ("3." => 3).
+                    // (PCRE has no 'g' modifier - preg_replace replaces all matches anyway.)
+                    $digits = preg_replace('/[^0-9]+/', '', $span->textContent);
+                    if (is_string($digits) && $digits !== '') {
+                        $num = (int) $digits;
+                    }
+                }
+                $span->parentNode->removeChild($span); // remove the fake bullet.
+                break;
+            }
+            $style = $this->styleOf($n); // mso-list is from the parent node.
+            if (empty($style['mso-list'])) {
+                $parent->removeChild($n);
+                continue;
+            }
+            
+            $margin = isset($style['margin-left']) ? $style['margin-left'] : '';
+            // isset() rather than empty(): depth 0 is a valid, already known depth.
+            if (!isset($margin_to_depth[$margin])) {
+                $max_margins++;
+                $margin_to_depth[$margin] = $max_margins;
+            }
+            $nlvl = $margin_to_depth[$margin] ;
+             
+            if ($nlvl > $lvl) {
+                //new indent
+                $nul = $doc->createElement($listtype); // what about number lists...
+                if (!$last_li) {
+                    $last_li = $doc->createElement('li');
+                    $stack[$lvl]->appendChild($last_li);
+                }
+                $last_li->appendChild($nul);
+                $stack[$nlvl] = $nul;
+                
+            }
+            $lvl = $nlvl;
+            
+            // not starting at 1..
+            if (!$stack[$nlvl]->hasAttribute("start") && $listtype == "ol") {
+                $stack[$nlvl]->setAttribute("start", (string) $num);
+            }
+            
+            $nli = $stack[$nlvl]->appendChild($doc->createElement('li'));
+            $last_li = $nli;
+            // move the content of the paragraph into the new list item.
+            $this->copyInnerHTML($n, $nli);
+            $parent->removeChild($n);
+        }
+    }
+    
+}
author	Alan <alan@roojs.com>
	Fri, 16 Sep 2022 05:14:09 +0000 (13:14 +0800)
committer	Alan <alan@roojs.com>
	Fri, 16 Sep 2022 05:14:09 +0000 (13:14 +0800)
HTML/Clean.php	[new file with mode: 0644]	patch \| blob
HTML/Clean/Block.php	[new file with mode: 0644]	patch \| blob
HTML/Clean/BlockFigure.php	[new file with mode: 0644]	patch \| blob
HTML/Clean/BlockTable.php	[new file with mode: 0644]	patch \| blob
HTML/Clean/BlockTd.php	[new file with mode: 0644]	patch \| blob
HTML/Clean/Filter.php	[new file with mode: 0644]	patch \| blob
HTML/Clean/FilterAttributes.php	[new file with mode: 0644]	patch \| blob
HTML/Clean/FilterBlack.php	[new file with mode: 0644]	patch \| blob
HTML/Clean/FilterKeepChildren.php	[new file with mode: 0644]	patch \| blob
HTML/Clean/FilterLongBr.php	[new file with mode: 0644]	patch \| blob
HTML/Clean/FilterParagraph.php	[new file with mode: 0644]	patch \| blob
HTML/Clean/FilterSpan.php	[new file with mode: 0644]	patch \| blob
HTML/Clean/FilterStyleToTag.php	[new file with mode: 0644]	patch \| blob
HTML/Clean/FilterWord.php	[new file with mode: 0644]	patch \| blob