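Fix image text: rework HTML_WordDiff word extraction (collect all DOM text first, then split it into words) and fix the $this->$target lookup in the buildWords() cache path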

[pear] / HTML / WordDiff.php
diff --git a/HTML/WordDiff.php b/HTML/WordDiff.php

index ed59a86..30b76c4 100644 (file)
--- a/HTML/WordDiff.php
+++ b/HTML/WordDiff.php
@@ -143,11 +143,13 @@ class HTML_WordDiff
      function buildWords($target = 'original')
      {
          static $cache= array();
+        
          if (isset($cache[md5($this->htmlDom)])) {
+            
              $this->$target = $cache[md5($this->htmlDom)];
              
              if ($this->wordMax < 0) {
-                $this->wordMax = array_sum(array_values($this->target)) * 10 ;
+                $this->wordMax = array_sum(array_values($this->$target)) * 10 ;
              }
              
              if($target == 'original'){
@@ -159,177 +161,130 @@ class HTML_WordDiff
              return;
          }
          
-        $a = $this->DomToStrings();
+        $words = $this->DomToStrings();
          
          if ($this->wordMax < 0) {
-            $this->wordMax = 10*count($a);
+            $this->wordMax = 10 * count($words);
          }
+        
          if($this->debug_on){
-            var_Dump("domstrings"); print_r($a);
-//            exit;
+            var_Dump("domstrings"); print_r($words);
          }
+        
          $ret = array();
          $last_w = false;
          
-        foreach($a as $str){
+        foreach($words as $str){
+            
              if(empty($str) || !trim(strlen($str))) {
                  continue;
              }
-//            if(!isset($ret[$str])){
-//                $ret[$str] = 1;
-//            
-//            } else {
-//                $ret[$str] += 1;
-//            }
-            // now deal with pairing..
+            
              if ($last_w !== false) {
                  
                  if(!isset($ret[$last_w.'|'.$str])){
                      $ret[$last_w.'|'.$str] = 1;
-
                  } else {
                      $ret[$last_w.'|'.$str] += 1;
-                }    
-                
+                }
              }
+            
              $last_w = $str;
+            
          }
-//        print_r($ret);
+
          if($target == 'original'){
              $this->countTotal = array_sum(array_values($ret));
          }else{
              $this->targetTotal= array_sum(array_values($ret));
          }
+        
          $this->$target = $ret;
+        
          $cache[md5($this->htmlDom)] = $ret;
+        
      }
      
      function DomToStrings($target = '')
      {
          $charset = 'UTF-8';
-        //if (preg_match('#charset=([^"]+)#', $this->htmlDom,$matches)) {
-            //var_dump($matches);exit;
-        //    $charset = $matches[1];
-        //}
          
          $pageDom = new DomDocument('1.0', $charset);
          $pageDom->formatOutput = true;
          
-        // change language if encoding does not match...
+        $searchPage = preg_replace('#charset=([^"]+)#', '', $this->htmlDom);
          
+        @$pageDom->loadHTML(($charset == 'UTF-8' ? '<?xml version="1.0" encoding="UTF-8"?>' : ''). $searchPage);
          
+        $sentence = $this->domExtractWords($pageDom->documentElement, array(), $charset);
          
-//        print_r(mb_detect_encoding($this->htmlDom));
-       
-        // may produce errors - so we hide them...
-        $searchPage = preg_replace('#charset=([^"]+)#', '', $this->htmlDom);
-        //$searchPage = $this->htmlDom; //@mb_convert_encoding($this->htmlDom, $charset ,  $charset=="UTF-8" ? "auto" :$charset);
-                 
-        
-//        $searchPage = mb_convert_encoding($this->htmlDom, "UTF-8",  "HTML-ENTITIES");
-//        echo $searchPage;
-//        print_r(mb_detect_encoding($searchPage));
-        
-//        $searchPage = mb_convert_encoding($this->htmlDom, "big5");
-//        if($target == 'target'){
-//            print_r($searchPage);
-//            exit;
-//        }
-//        print_r(mb_detect_encoding($searchPage));
-      // print_r($searchPage);
-        @$pageDom->loadHTML(($charset == 'UTF-8' ? '<?xml version="1.0" encoding="UTF-8"?>' : ''). $searchPage);
-//        exit;
-        $words = $this->domExtractWords($pageDom->documentElement, array(), $charset);
-       // print_r($words);exit;
          
-//        $string = preg_replace('/[^\pL\pS\pN]/u', '-', $pageDom->documentElement->getElementsByTagName('body')->item(0)->textContent);
-        if($this->debug_on){
-            print_r("parsed      ");
-            print_r($words);
-           // print_r($pageDom->saveHTML());;
-//            exit;
+        
+        
+        $content = implode('', $sentence);
+        
+        $content = preg_replace('/\n+/', ' ', $content);
+        
+        $content = preg_replace('/\s+/', ' ', $content);
+        
+        if ($charset != 'auto') {
+            if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
+                $content = mb_convert_encoding($content, $charset,  "UTF-8");
+                $content = mb_convert_encoding($content, "BIG5",$charset);
+                $content = mb_convert_encoding($content, "UTF-8",  "BIG5");
+            } else {
+                $content = mb_convert_encoding($content, "UTF-8",  $charset);
+            }
          }
+        
+        $words = "";
+        
+        for ($i = 0; $i < mb_strlen($content); $i++){
+            
+            $word = mb_substr($content, $i, 1);
+            
+            if(preg_match('/'.$this->cjkpreg().'/u', $word)){
+                $words .= " {$word} ";
+                continue;
+            }
+            
+            if (preg_match('/[^\w]+/u', $word)) {
+                $words .= ' ';
+                continue;
+            }
+            
+            $words .= $word;
+        }
+
+        $words = preg_split('/\s+/', trim($words));
+         //var_dump($words);exit;
          return $words;
      }
      
-    
-    
-    var $tmpWords = false;
-    function addUTF8Word($s) {
-        $this->tmpWords[] = $s[0];
-//        print_r($this->tmpWords);
-        return ' ';
-    }
-    
-    function domExtractWords($node, $words, $charset="auto")
+    function domExtractWords($node, $sentence, $charset)
      {
-        if ($this->wordMax > 0 && count($words) >  $this->wordMax) {
-            return $words;
-        }
-        //echo count($words) ."\n";
          if (empty($node)) {
-            return $words;
+            return $sentence;
          }
-        if ($node->nodeType == XML_TEXT_NODE && strlen(trim($node->textContent))) {// this is got the bug at sina....
-            
-            $str = trim($node->textContent);
-            if ($charset != 'auto') {
-                
-                if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
-                    //var_dump("ORIG" . $str);
-                    $str = mb_convert_encoding($str, $charset,  "UTF-8");
-                    //var_dump("$charset:" .$str);
-                    $str = mb_convert_encoding($str, "BIG5",$charset);
-                    //var_dump("BIG5:".$str);
-                    $str = mb_convert_encoding($str, "UTF-8",  "BIG5");
-                    //var_dump("UTF-8:".$str);
-                } else {
-                    $str = mb_convert_encoding($str, "UTF-8",  $charset);
-                }
-            }
-            echo "$str \n";
-            print_R(mb_strlen($str));exit;
-            
-            //var_dump('xx'.$str);
-             //var_dump($str);
-            $this->tmpWords = $words;
-            //if ($this->isSino()) {
-            $str = preg_replace_callback('/'.$this->cjkpreg().'/u', array($this, 'addUTF8Word')  , $str);
-            //}
-            $words = $this->tmpWords;
-            // remove puncutianion..
-            $str = preg_replace('/[^\w]+/u', ' ', $str);
-            
-            foreach(preg_split('/\s+/u', $str) as $word) {
-                if($this->debug_on){
-//                    print_r(mb_detect_encoding($node->textContent));
-                    //print_r("\n");
-                }
-                if (!trim($word)) {
-                    continue;
-                }
-                // fixme - break unicode chars
-                $words[] = $word;
-            }
-            
+        
+        if ($node->nodeType == XML_TEXT_NODE) {
+            $sentence[] = $node->textContent;
          }
+        
          if (!$node->hasChildNodes()) {
-            return $words;
+            return $sentence;
          }
          
          for($i = 0; $i < $node->childNodes->length; $i++) {
              
              $n = $node->childNodes->item($i);
-            //if($this->debug_on){
-            //    print_r($n);
-            //    print_r("\n");
-            //}
-            $words = $this->domExtractWords($n, $words,$charset);
+            
+            $sentence = $this->domExtractWords($n, $sentence, $charset);
          }
-        return $words;
-        
          
+        return $sentence;
      }
+    
      function cjkpreg() {
          
          static $ret = false;
@@ -337,11 +292,13 @@ class HTML_WordDiff
              return $ret;
          }
          
+        
          $ret = '['.implode('', array(
+                    "\x{0E00}-\x{0E7F}", // thai ??
                      "\x{2E80}-\x{2EFF}",      # CJK Radicals Supplement
                      "\x{2F00}-\x{2FDF}",      # Kangxi Radicals
                      "\x{2FF0}-\x{2FFF}",      # Ideographic Description Characters
-                    "\x{3000}-\x{303F}",      # CJK Symbols and Punctuation
+//                    "\x{3000}-\x{303F}",      # CJK Symbols and Punctuation
                      "\x{3040}-\x{309F}",      # Hiragana
                      "\x{30A0}-\x{30FF}",      # Katakana
                      "\x{3100}-\x{312F}",      # Bopomofo
@@ -385,36 +342,27 @@ class HTML_WordDiff
              $this->htmlDom = $file['string'];
          }
          
-//        $this->debug_on = true;
-//        print_r('is target');
          if(is_string($file) && file_exists($file)){
              $this->htmlDom = file_get_contents($file);
          }
          
-        
-        
          $m = 'buildWords';
+        
          if(!method_exists($this, $m)){
              trigger_error("Method not found ($m)");
              return;
          }
          
-        // if it's langauge is zh_HK or zh_TW -> then
-        // convert 
-        
-        
-        //print_r($this);
          $this->$m('target');
          
          $matchs = 0;
-     //  print_r($this->original);
-     //  print_r($this->target);// exit;
-        foreach($this->original as $k=>$t){
+        
+        foreach($this->original as $k => $t){
+            
              if(!isset($this->target[$k])){
                  continue;
              }
              
-//                $matchs += $this->original[$k] + $this->target[$k];
              if($this->original[$k] == $this->target[$k]){
                  $matchs += $this->original[$k];
                  continue;
@@ -424,14 +372,13 @@ class HTML_WordDiff
                  $matchs += $this->target[$k];
                  continue;
              }
+            
              $matchs += $this->original[$k];
              
          }
-//        print_r($matchs);
-//        print_r("\n");
-//        print_R(($this->countTotal + $this->targetTotal));  
-//        print_r("\n");
+        
          $percent = ( $matchs / ($this->countTotal) * 100);
+        
          return (int)$percent;
          
      }