HTML/WordDiff.php

author edward <edward@roojs.com>

Thu, 17 Nov 2016 06:13:57 +0000 (14:13 +0800)

committer edward <edward@roojs.com>

Thu, 17 Nov 2016 06:13:57 +0000 (14:13 +0800)
author edward <edward@roojs.com>
Thu, 17 Nov 2016 06:13:57 +0000 (14:13 +0800)
committer edward <edward@roojs.com>
Thu, 17 Nov 2016 06:13:57 +0000 (14:13 +0800)
diff --git a/HTML/WordDiff.php b/HTML/WordDiff.php

index 854b319..6f32ccb 100644 (file)
--- a/HTML/WordDiff.php
+++ b/HTML/WordDiff.php
@@ -282,93 +282,6 @@ class HTML_WordDiff
          return $sentence;
      }
      
-    
-    var $tmpWords = false;
-    function addUTF8Word($s) {
-        
-        echo "calling addUTF8Word \n";
-        print_R($s);
-        
-        $this->tmpWords[] = $s[0];
-//        print_r($this->tmpWords);
-        return ' ';
-    }
-    
-    function domExtractWords($node, $words, $charset="auto")
-    {
-        if ($this->wordMax > 0 && count($words) >  $this->wordMax) {
-            return $words;
-        }
-        //echo count($words) ."\n";
-        if (empty($node)) {
-            return $words;
-        }
-        
-        if ($node->nodeType == XML_TEXT_NODE && strlen(trim($node->textContent))) {// this is got the bug at sina....
-            
-            $str = trim($node->textContent);
-            
-            echo "node content : {$str} \n";
-            
-            if ($charset != 'auto') {
-                
-                if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
-                    //var_dump("ORIG" . $str);
-                    $str = mb_convert_encoding($str, $charset,  "UTF-8");
-                    //var_dump("$charset:" .$str);
-                    $str = mb_convert_encoding($str, "BIG5",$charset);
-                    //var_dump("BIG5:".$str);
-                    $str = mb_convert_encoding($str, "UTF-8",  "BIG5");
-                    //var_dump("UTF-8:".$str);
-                } else {
-                    $str = mb_convert_encoding($str, "UTF-8",  $charset);
-                }
-            }
-            
-            echo "node content mb convert : {$str} \n";
-            
-            //var_dump('xx'.$str);
-             //var_dump($str);
-            $this->tmpWords = $words;
-            //if ($this->isSino()) {
-            $str = preg_replace_callback('/'.$this->cjkpreg().'/u', array($this, 'addUTF8Word')  , $str);
-            //}
-            $words = $this->tmpWords;
-            // remove puncutianion..
-            $str = preg_replace('/[^\w]+/u', ' ', $str);
-            
-            echo "after replace : {$str} \n";
-            
-            foreach(preg_split('/\s+/u', $str) as $word) {
-                if($this->debug_on){
-//                    print_r(mb_detect_encoding($node->textContent));
-                    //print_r("\n");
-                }
-                if (!trim($word)) {
-                    continue;
-                }
-                // fixme - break unicode chars
-                $words[] = $word;
-            }
-            
-        }
-        if (!$node->hasChildNodes()) {
-            return $words;
-        }
-        
-        for($i = 0; $i < $node->childNodes->length; $i++) {
-            
-            $n = $node->childNodes->item($i);
-            //if($this->debug_on){
-            //    print_r($n);
-            //    print_r("\n");
-            //}
-            $words = $this->domExtractWords($n, $words,$charset);
-        }
-        return $words;
-        
-        
-    }
      function cjkpreg() {
          
          static $ret = false;
author	edward <edward@roojs.com>
	Thu, 17 Nov 2016 06:13:57 +0000 (14:13 +0800)
committer	edward <edward@roojs.com>
	Thu, 17 Nov 2016 06:13:57 +0000 (14:13 +0800)