HTML/WordDiff.php
[pear] / HTML / WordDiff.php
1 <?php
2
3 /**
4  * Description of WordDiff
5  *
6  *  require_once 'HTML/WordDiff.php';
7  *       $init = array(
8  *           'lang' => 'en',
9  *           'file' => '/home/press/rss/2014/03/31/3952.html'
10  *       );
11  *       $wd = new HTML_WordDiff($init);
12  *        $percent = $wd->compare('/home/press/rss/2014/03/31/3954.html');
13  * 
14  * 
15  * 
16  * @author chris
17  */
18 //
19 //require_once 'PEAR.php';
20 //require_once 'DB/DataObject.php';
21
/**
 * Estimates how similar two HTML documents are.
 *
 * Each document is parsed with DOMDocument, its text nodes are split into
 * words (every CJK character counts as a word of its own), and the number of
 * occurrences of each adjacent word pair ("first|second") is recorded.
 * compare() then reports which share of the original document's pairs also
 * occurs in the target document.
 *
 * Errors are reported through trigger_error(); nothing is thrown by this
 * class itself.
 */
class HTML_WordDiff
{
    /** @var string language code of the documents, e.g. 'en' or 'zh_CN' */
    public $lang = 'en';

    /** @var array word-pair => occurrence count for the original document */
    public $original = array();

    /** @var array word-pair => occurrence count for the compare target */
    public $target = array();

    /** @var int total number of word pairs in the original document */
    public $countTotal = 0;

    /** @var int total number of word pairs in the target document */
    public $targetTotal = 0;

    /**
     * @var int upper bound on the words extracted per document; a negative
     *          value means "not decided yet", 0 disables the limit
     */
    public $wordMax = -1;

    /** @var array language codes classified as non Sino-Tibetan */
    public $nonSinoTibetan = array(
        'aa',
        'ab',
        'en',
        'pt',
        'ar',
        'de',
        'fr',
        'es',
        'vi',
        'id',
    );

    /** @var array language codes classified as Sino-Tibetan */
    public $sinoTibetan = array(
        'my',
        'th',
        'ko',
        'zh_HK',
        'ja',
        'zh_TW',
        'zh_CN',
    );

    /** @var array not read anywhere in this class; kept for compatibility */
    public $alternatives = array(
        '.',
        ',',
        '--'
    );

    /** @var string|false raw HTML of the document currently being processed */
    public $htmlDom = false;

    /** @var bool when true, intermediate word lists are printed */
    public $debug_on = false;

    /** @var array|false scratch list filled by addUTF8Word() during extraction */
    public $tmpWords = false;

    /**
     * Load the original document and build its word-pair table.
     *
     * Supported keys of $config:
     *   lang     - language code of the article (required)
     *   file     - path of the HTML file to read
     *   string   - HTML contents (used when 'file' is absent or unreadable)
     *   debug_on - print intermediate results when true
     *
     * Without a configuration the object is left empty.
     *
     * @param array|false $config
     */
    public function __construct($config = false)
    {
        if (!$config) {
            return;
        }

        if (!is_array($config)) {
            trigger_error("Word Diff got error, the argument IS NOT array");
            return;
        }

        if (empty($config['lang'])) {
            trigger_error("the language is missing.");
            return;
        }

        if (empty($config['file']) && !isset($config['string'])) {
            trigger_error("File is missing");
            return;
        }

        if (isset($config['debug_on'])) {
            $this->debug_on = $config['debug_on'];
        }

        // The language has to be stored before it is validated; checking the
        // default value would accept everything. An unclassified language is
        // reported but still processed, because word extraction does not
        // depend on the classification.
        $this->lang = $config['lang'];
        if (!in_array($this->lang, $this->nonSinoTibetan, true) && !$this->isSino()) {
            trigger_error("This ({$this->lang}) language is not on our word type classification");
        }

        $this->htmlDom = isset($config['string']) ? $config['string'] : '';

        // A readable file takes precedence over the 'string' contents.
        if (isset($config['file']) && file_exists($config['file'])) {
            $contents = file_get_contents($config['file']);
            if ($contents !== false) {
                $this->htmlDom = $contents;
            }
        }

        $this->buildWords();
    }

    /**
     * Tell whether the configured language is classified as Sino-Tibetan.
     *
     * @return bool
     */
    public function isSino()
    {
        return in_array($this->lang, $this->sinoTibetan, true);
    }

    /**
     * Build the word-pair table for the HTML currently held in $htmlDom.
     *
     * The table is stored in the property named by $target and its total in
     * $countTotal ('original') or $targetTotal (anything else). Results are
     * memoised per document content for the lifetime of the request.
     *
     * @param string $target name of the property that receives the table
     *
     * @return void
     */
    public function buildWords($target = 'original')
    {
        static $cache = array();

        $key = md5((string) $this->htmlDom);

        if (isset($cache[$key])) {
            $pairs = $cache[$key];

            if ($this->wordMax < 0) {
                // Derive the limit from the table being loaded.
                $this->wordMax = array_sum($pairs) * 10;
            }
        } else {
            $words = $this->DomToStrings();

            if ($this->wordMax < 0) {
                $this->wordMax = 10 * count($words);
            }
            if ($this->debug_on) {
                var_dump("domstrings");
                print_r($words);
            }

            $pairs = array();
            $previous = false;

            foreach ($words as $word) {
                // Only blank entries are skipped; "0" is a legitimate word.
                if (!is_string($word) || trim($word) === '') {
                    continue;
                }
                if ($previous !== false) {
                    $pairKey = $previous . '|' . $word;
                    $pairs[$pairKey] = isset($pairs[$pairKey]) ? $pairs[$pairKey] + 1 : 1;
                }
                $previous = $word;
            }

            $cache[$key] = $pairs;
        }

        $this->$target = $pairs;

        if ($target == 'original') {
            $this->countTotal = array_sum($pairs);
        } else {
            $this->targetTotal = array_sum($pairs);
        }
    }

    /**
     * Parse $htmlDom and return the list of words found in its text nodes.
     *
     * Any "charset=..." declaration is removed and an XML prolog declaring
     * UTF-8 is prepended before parsing, so the input is expected to be
     * UTF-8 encoded already.
     *
     * @param string $target unused, kept for backward compatibility
     *
     * @return array list of words in document order
     */
    public function DomToStrings($target = '')
    {
        $charset = 'UTF-8';

        $pageDom = new DOMDocument('1.0', $charset);
        $pageDom->formatOutput = true;

        $html = (string) $this->htmlDom;

        // Stop at quotes, whitespace, ';' and '>' so that an unquoted or
        // single-quoted declaration cannot swallow the markup that follows.
        $searchPage = preg_replace('#charset=[^"\'\s>;]+#i', '', $html);
        if ($searchPage === null) {
            $searchPage = $html;
        }

        // Real-world HTML is rarely well formed; collect parser complaints
        // internally instead of emitting warnings, then restore the setting.
        $previous = libxml_use_internal_errors(true);
        $pageDom->loadHTML('<?xml version="1.0" encoding="UTF-8"?>' . $searchPage);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $words = $this->domExtractWords($pageDom->documentElement, array(), $charset);

        if ($this->debug_on) {
            print_r("parsed      ");
            print_r($words);
        }
        return $words;
    }

    /**
     * preg_replace_callback() handler: record the matched CJK character as a
     * word and replace it with a space in the surrounding text.
     *
     * @param array $s match data, $s[0] being the matched character
     *
     * @return string replacement text
     */
    public function addUTF8Word($s)
    {
        $this->tmpWords[] = $s[0];
        return ' ';
    }

    /**
     * Recursively collect the words below a DOM node.
     *
     * For every non-blank text node the CJK characters are appended first
     * (one word each), followed by the remaining words of that node split on
     * anything that is not a word character. Collection stops descending
     * once more than $wordMax words were gathered (when $wordMax > 0).
     *
     * @param DOMNode|null $node    node to inspect
     * @param array        $words   words collected so far
     * @param string       $charset encoding label of the text, or 'auto' to
     *                              skip conversion
     *
     * @return array $words with the words of $node appended
     */
    public function domExtractWords($node, $words, $charset = "auto")
    {
        if ($this->wordMax > 0 && count($words) > $this->wordMax) {
            return $words;
        }
        if (empty($node)) {
            return $words;
        }

        if ($node->nodeType == XML_TEXT_NODE) {
            $str = trim($node->textContent);

            if ($str !== '') {
                if ($charset != 'auto') {
                    if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
                        // NOTE(review): round trip through BIG5 for traditional
                        // Chinese pages labelled gb2312 - intent not visible
                        // here, kept unchanged.
                        $str = mb_convert_encoding($str, $charset, "UTF-8");
                        $str = mb_convert_encoding($str, "BIG5", $charset);
                        $str = mb_convert_encoding($str, "UTF-8", "BIG5");
                    } else {
                        $str = mb_convert_encoding($str, "UTF-8", $charset);
                    }
                }

                // Pull out CJK characters one by one; addUTF8Word() appends
                // them to $this->tmpWords and blanks them in the text.
                $this->tmpWords = $words;
                $rest = preg_replace_callback('/' . $this->cjkpreg() . '/u', array($this, 'addUTF8Word'), $str);
                $words = $this->tmpWords;

                // Remove punctuation, then split what is left on whitespace.
                // A null result means the text was not valid UTF-8.
                if (is_string($rest)) {
                    $rest = preg_replace('/[^\w]+/u', ' ', $rest);
                }
                if (is_string($rest)) {
                    $parts = preg_split('/\s+/u', $rest, -1, PREG_SPLIT_NO_EMPTY);
                    if ($parts !== false) {
                        foreach ($parts as $word) {
                            $words[] = $word;
                        }
                    }
                }
            }
        }

        if (!$node->hasChildNodes()) {
            return $words;
        }

        foreach ($node->childNodes as $child) {
            $words = $this->domExtractWords($child, $words, $charset);
        }
        return $words;
    }

    /**
     * Return the PCRE character class (without delimiters) matching a single
     * CJK character. It must be used with the "u" pattern modifier.
     *
     * @return string
     */
    public function cjkpreg()
    {
        static $ret = false;
        if ($ret !== false) {
            return $ret;
        }

        $ret = '[' . implode('', array(
            '\x{2E80}-\x{2EFF}',      // CJK Radicals Supplement
            '\x{2F00}-\x{2FDF}',      // Kangxi Radicals
            '\x{2FF0}-\x{2FFF}',      // Ideographic Description Characters
            '\x{3000}-\x{303F}',      // CJK Symbols and Punctuation
            '\x{3040}-\x{309F}',      // Hiragana
            '\x{30A0}-\x{30FF}',      // Katakana
            '\x{3100}-\x{312F}',      // Bopomofo
            '\x{3130}-\x{318F}',      // Hangul Compatibility Jamo
            '\x{3190}-\x{319F}',      // Kanbun
            '\x{31A0}-\x{31BF}',      // Bopomofo Extended
            '\x{31F0}-\x{31FF}',      // Katakana Phonetic Extensions
            '\x{3200}-\x{32FF}',      // Enclosed CJK Letters and Months
            '\x{3300}-\x{33FF}',      // CJK Compatibility
            '\x{3400}-\x{4DBF}',      // CJK Unified Ideographs Extension A
            '\x{4DC0}-\x{4DFF}',      // Yijing Hexagram Symbols
            '\x{4E00}-\x{9FFF}',      // CJK Unified Ideographs
            '\x{A000}-\x{A48F}',      // Yi Syllables
            '\x{A490}-\x{A4CF}',      // Yi Radicals
            '\x{AC00}-\x{D7AF}',      // Hangul Syllables
            '\x{F900}-\x{FAFF}',      // CJK Compatibility Ideographs
            '\x{FE30}-\x{FE4F}',      // CJK Compatibility Forms
            '\x{1D300}-\x{1D35F}',    // Tai Xuan Jing Symbols
            '\x{20000}-\x{2A6DF}',    // CJK Unified Ideographs Extension B
            '\x{2F800}-\x{2FA1F}'     // CJK Compatibility Ideographs Supplement
        )) . ']';

        return $ret;
    }

    /**
     * Compare a second document with the original one.
     *
     * For every word pair of the original the smaller of the two occurrence
     * counts is credited as a match; the result is the matched share of the
     * original's pairs.
     *
     * @param array|string $file either a file path or array('string' => '...')
     *
     * @return int percentage of match, truncated; 0 when the target cannot
     *             be read or the original contains no word pairs
     */
    public function compare($file)
    {
        if (is_array($file) && isset($file['string'])) {
            $this->htmlDom = $file['string'];
        } elseif (is_string($file) && file_exists($file)) {
            $contents = file_get_contents($file);
            if ($contents === false) {
                trigger_error("File is missing");
                return 0;
            }
            $this->htmlDom = $contents;
        } else {
            // Carrying on would compare the previously loaded HTML with
            // itself and report a bogus full match.
            trigger_error("File is missing");
            return 0;
        }

        $this->buildWords('target');

        if ($this->countTotal <= 0) {
            // No pairs in the original: nothing can match, avoid dividing by 0.
            return 0;
        }

        $matches = 0;
        foreach ($this->original as $pair => $count) {
            if (isset($this->target[$pair])) {
                $matches += min($count, $this->target[$pair]);
            }
        }

        // Multiply before dividing so that exact percentages are not lost to
        // floating point rounding before truncation.
        return (int) (($matches * 100) / $this->countTotal);
    }
}