HTML/WordDiff.php
[pear] / HTML / WordDiff.php
1 <?php
2
3 /**
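4  * HTML_WordDiff - estimates how similar two HTML documents are by comparing counts of adjacent word pairs extracted from their text.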
5  *
6  *  require_once 'HTML/WordDiff.php';
7  *       $init = array(
8  *           'lang' => 'en',
9  *           'file' => '/home/press/rss/2014/03/31/3952.html'
10  *       );
11  *       $wd = new HTML_WordDiff($init);
12  *        $percent = $wd->compare('/home/press/rss/2014/03/31/3954.html');
13  * 
14  * 
15  * 
16  * @author chris
17  */
18 //
19 //require_once 'PEAR.php';
20 //require_once 'DB/DataObject.php';
21
/**
 * Estimates how similar two HTML documents are.
 *
 * Each document is reduced to its text nodes, split into words (every CJK
 * character is treated as a word of its own, ASCII punctuation is dropped),
 * and then summarised as a map of "previous|current" adjacent word pairs to
 * occurrence counts. compare() reports which share of the original's pairs is
 * also present in the target.
 */
class HTML_WordDiff
{
    /** @var string language code of the document (see the two lists below) */
    public $lang = 'en';

    /** @var array "word|word" => count, built from the original document */
    public $original = array();

    /** @var array "word|word" => count, built from the document passed to compare() */
    public $target = array();

    /** @var int sum of all pair counts of the original document */
    public $countTotal = 0;

    /** @var int sum of all pair counts of the target document */
    public $targetTotal = 0;

    /** @var int upper bound on collected words used by domExtractWords(); -1 = not set yet */
    public $wordMax = -1;

    // word type classification
    /** @var array language codes classified as non Sino-Tibetan */
    public $nonSinoTibetan = array(
        'aa',
        'ab',
        'en',
        'pt',
        'ar',
        'de',
        'fr',
        'es',
        'vi',
        'id',
    );

    /** @var array language codes classified as Sino-Tibetan */
    public $sinoTibetan = array(
        'my',
        'th',
        'ko',
        'zh_HK',
        'ja',
        'zh_TW',
        'zh_CN',
    );

    /** @var array not referenced anywhere in this class; kept for external users */
    public $alternatives = array(
        '.',
        ',',
        '--'
    );

    /** @var string|false raw HTML of the document currently being processed */
    public $htmlDom = false;

    /** @var bool when true, buildWords() dumps the extracted word list */
    public $debug_on = false;

    /** @var array scratch list filled by addUTF8Word() while domExtractWords() runs */
    public $tmpWords = array();

    /**
     * Constructor
     *
     * When a valid configuration is given, the original document is loaded
     * and its word pairs are built immediately. Configuration problems are
     * reported through trigger_error() and leave the object unbuilt.
     *
     * @param array|false $config
     *   lang     = language of article (required)
     *   file     = name of file to read the HTML from
     *   string   = HTML contents (used when file is absent or unreadable)
     *   debug_on = enable debug output
     */
    function __construct($config = false)
    {
        if (!$config) {
            return;
        }

        if (!is_array($config)) {
            trigger_error("Word Diff got error, the argument IS NOT array");
            return;
        }

        if (empty($config['lang'])) {
            trigger_error("the language is missing.");
            return;
        }

        if (empty($config['file']) && !isset($config['string'])) {
            trigger_error("File is missing");
            return;
        }

        if (isset($config['debug_on'])) {
            $this->debug_on = $config['debug_on'];
        }

        // The language has to be assigned before it is classified; checking
        // the default value would make the check below meaningless.
        $this->lang = $config['lang'];

        // Unclassified languages are reported but still processed: the word
        // splitting itself does not depend on the classification.
        if (!in_array($this->lang, $this->nonSinoTibetan, true)
            && !in_array($this->lang, $this->sinoTibetan, true)
        ) {
            trigger_error("This ({$this->lang}) language is not on our word type classification");
        }

        $this->htmlDom = isset($config['string']) ? $config['string'] : '';

        if (!empty($config['file']) && file_exists($config['file'])) {
            $contents = file_get_contents($config['file']);
            if ($contents !== false) {
                $this->htmlDom = $contents;
            }
        }

        $this->buildWords();
    }

    /**
     * Tells whether the configured language is in the Sino-Tibetan list.
     *
     * @return bool
     */
    function isSino()
    {
        return in_array($this->lang, $this->sinoTibetan, true);
    }

    /**
     * Build the adjacent word pair map for the current $this->htmlDom.
     *
     * The result is stored in the property named by $target and the sum of
     * its counts in countTotal ('original') or targetTotal (anything else).
     * Results are memoised per HTML content in a static cache shared by all
     * instances for the lifetime of the process.
     *
     * @param string $target name of the property that receives the map
     *
     * @return void
     */
    function buildWords($target = 'original')
    {
        static $cache = array();

        $key = md5((string) $this->htmlDom);

        if (isset($cache[$key])) {
            $pairs = $cache[$key];

            if ($this->wordMax < 0) {
                $this->wordMax = array_sum($pairs) * 10;
            }
        } else {
            $a = $this->DomToStrings();

            if ($this->wordMax < 0) {
                $this->wordMax = 10 * count($a);
            }
            if ($this->debug_on) {
                var_dump("domstrings"); print_r($a);
            }

            $pairs = array();
            $last_w = false;

            foreach ($a as $str) {
                // compare against '' explicitly so that the word "0" survives
                if (trim($str) === '') {
                    continue;
                }
                if ($last_w !== false) {
                    $pairKey = $last_w . '|' . $str;
                    $pairs[$pairKey] = isset($pairs[$pairKey]) ? $pairs[$pairKey] + 1 : 1;
                }
                $last_w = $str;
            }

            $cache[$key] = $pairs;
        }

        $total = array_sum($pairs);
        if ($target == 'original') {
            $this->countTotal = $total;
        } else {
            $this->targetTotal = $total;
        }
        $this->$target = $pairs;
    }

    /**
     * Turn $this->htmlDom into a flat list of words.
     *
     * The markup is parsed, the contents of all text nodes are concatenated,
     * ASCII punctuation is replaced by spaces, every character matched by
     * cjkpreg() is isolated as its own word, and the result is split on
     * whitespace.
     *
     * @param string $target unused, kept for interface compatibility
     *
     * @return array list of words in document order
     */
    function DomToStrings($target = '')
    {
        $charset = 'UTF-8';

        $pageDom = new DomDocument('1.0', $charset);

        // drop charset declarations found in the markup
        $searchPage = preg_replace('#charset=([^"]+)#', '', (string) $this->htmlDom);
        if ($searchPage === null) {
            $searchPage = (string) $this->htmlDom;
        }

        // Collect parser complaints internally instead of emitting warnings;
        // real-world HTML is rarely well formed.
        $previous = libxml_use_internal_errors(true);
        // the XML declaration prefix hints the parser to read the markup as UTF-8
        $pageDom->loadHTML('<?xml version="1.0" encoding="UTF-8"?>' . $searchPage);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $sentence = $this->parse_node($pageDom->documentElement, array(), $charset);

        $content = implode('', $sentence);

        // ASCII punctuation -> space. Done bytewise: these bytes never occur
        // inside a multi-byte UTF-8 sequence.
        $content = preg_replace('/[!-\/:-@\[-`{-~]/', ' ', $content);

        // every CJK character becomes a word of its own
        $content = preg_replace('/' . $this->cjkpreg() . '/u', ' $0 ', (string) $content);
        if ($content === null) {
            trigger_error("Unable to split the document into words");
            return array();
        }

        $words = preg_split('/\s+/u', $content, -1, PREG_SPLIT_NO_EMPTY);

        return $words === false ? array() : $words;
    }

    /**
     * Collect the text of $node and all of its descendants.
     *
     * @param DOMNode|null $node     node to walk
     * @param array        $sentence text fragments collected so far
     * @param string       $charset  passed through to the recursion, not used otherwise
     *
     * @return array $sentence with the text of every text node appended in document order
     */
    function parse_node($node, $sentence, $charset)
    {
        if (empty($node)) {
            return $sentence;
        }

        if ($node->nodeType == XML_TEXT_NODE) {
            $sentence[] = $node->textContent;
        }

        if (!$node->hasChildNodes()) {
            return $sentence;
        }

        foreach ($node->childNodes as $child) {
            $sentence = $this->parse_node($child, $sentence, $charset);
        }

        return $sentence;
    }

    /**
     * preg_replace_callback() handler used by domExtractWords(): records the
     * matched character as a word and replaces it by a space.
     *
     * @param array $s match array, $s[0] being the matched character
     *
     * @return string
     */
    function addUTF8Word($s)
    {
        if (!is_array($this->tmpWords)) {
            $this->tmpWords = array();
        }
        $this->tmpWords[] = $s[0];
        return ' ';
    }

    /**
     * Recursively extract words from the text nodes below $node.
     *
     * For every non-blank text node the characters matched by cjkpreg() are
     * appended to $words first, then the remaining text is stripped of
     * non-word characters and split on whitespace. Collection stops once
     * more than $this->wordMax words have been gathered (when wordMax > 0).
     *
     * @param DOMNode|null $node    node to walk
     * @param array        $words   words collected so far
     * @param string       $charset source charset of the text, or "auto" to skip conversion
     *
     * @return array $words with the newly found words appended
     */
    function domExtractWords($node, $words, $charset = "auto")
    {
        if ($this->wordMax > 0 && count($words) > $this->wordMax) {
            return $words;
        }

        if (empty($node)) {
            return $words;
        }

        if ($node->nodeType == XML_TEXT_NODE && strlen(trim($node->textContent))) {

            $str = $this->convertToUtf8(trim($node->textContent), $charset);

            // addUTF8Word() appends to $this->tmpWords, so hand the list over
            // for the duration of the callback run and take it back afterwards
            $this->tmpWords = $words;
            $str = preg_replace_callback('/' . $this->cjkpreg() . '/u', array($this, 'addUTF8Word'), $str);
            $words = $this->tmpWords;

            // remove punctuation
            $str = preg_replace('/[^\w]+/u', ' ', (string) $str);

            $parts = preg_split('/\s+/u', (string) $str, -1, PREG_SPLIT_NO_EMPTY);
            if ($parts !== false) {
                foreach ($parts as $word) {
                    $words[] = $word;
                }
            }
        }

        if (!$node->hasChildNodes()) {
            return $words;
        }

        foreach ($node->childNodes as $child) {
            $words = $this->domExtractWords($child, $words, $charset);
        }

        return $words;
    }

    /**
     * Convert $str to UTF-8 as done for text nodes in domExtractWords().
     *
     * "auto" leaves the string untouched. For zh_HK / zh_TW documents
     * declared as gb2312 the string is round-tripped UTF-8 -> gb2312 -> BIG5
     * -> UTF-8; everything else is converted from $charset to UTF-8.
     *
     * @param string $str     text to convert
     * @param string $charset declared charset, or "auto"
     *
     * @return string
     */
    private function convertToUtf8($str, $charset)
    {
        if ($charset == 'auto') {
            return $str;
        }

        if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
            $str = mb_convert_encoding($str, $charset, "UTF-8");
            $str = mb_convert_encoding($str, "BIG5", $charset);
            return mb_convert_encoding($str, "UTF-8", "BIG5");
        }

        return mb_convert_encoding($str, "UTF-8", $charset);
    }

    /**
     * Character class (without delimiters) matching a single CJK character.
     *
     * Intended for use inside a pattern with the "u" modifier.
     *
     * @return string
     */
    function cjkpreg()
    {
        static $ret = false;
        if ($ret !== false) {
            return $ret;
        }

        $ret = '[' . implode('', array(
            '\x{2E80}-\x{2EFF}',      // CJK Radicals Supplement
            '\x{2F00}-\x{2FDF}',      // Kangxi Radicals
            '\x{2FF0}-\x{2FFF}',      // Ideographic Description Characters
            '\x{3000}-\x{303F}',      // CJK Symbols and Punctuation
            '\x{3040}-\x{309F}',      // Hiragana
            '\x{30A0}-\x{30FF}',      // Katakana
            '\x{3100}-\x{312F}',      // Bopomofo
            '\x{3130}-\x{318F}',      // Hangul Compatibility Jamo
            '\x{3190}-\x{319F}',      // Kanbun
            '\x{31A0}-\x{31BF}',      // Bopomofo Extended
            '\x{31F0}-\x{31FF}',      // Katakana Phonetic Extensions
            '\x{3200}-\x{32FF}',      // Enclosed CJK Letters and Months
            '\x{3300}-\x{33FF}',      // CJK Compatibility
            '\x{3400}-\x{4DBF}',      // CJK Unified Ideographs Extension A
            '\x{4DC0}-\x{4DFF}',      // Yijing Hexagram Symbols
            '\x{4E00}-\x{9FFF}',      // CJK Unified Ideographs
            '\x{A000}-\x{A48F}',      // Yi Syllables
            '\x{A490}-\x{A4CF}',      // Yi Radicals
            '\x{AC00}-\x{D7AF}',      // Hangul Syllables
            '\x{F900}-\x{FAFF}',      // CJK Compatibility Ideographs
            '\x{FE30}-\x{FE4F}',      // CJK Compatibility Forms
            '\x{1D300}-\x{1D35F}',    // Tai Xuan Jing Symbols
            '\x{20000}-\x{2A6DF}',    // CJK Unified Ideographs Extension B
            '\x{2F800}-\x{2FA1F}'     // CJK Compatibility Ideographs Supplement
        )) . ']';

        return $ret;
    }

    /**
     * Compare another document against the original given to the constructor.
     *
     * For every word pair of the original, the smaller of its two occurrence
     * counts (original vs. target) is credited as matched; the result is the
     * matched share of the original's total pair count.
     *
     * @param (array|string) $file either file path or array('string'=>'....')
     *
     * @return int percentage of match, rounded down; 0 when the target cannot
     *             be read or the original contains no word pairs
     */
    public function compare($file)
    {
        $html = false;

        if (is_array($file)) {
            if (isset($file['string'])) {
                $html = $file['string'];
            }
        } elseif (is_string($file) && file_exists($file)) {
            $html = file_get_contents($file);
        }

        if ($html === false) {
            // Without new content the previous document would be compared
            // again and reported as a match, so bail out instead.
            trigger_error("File is missing");
            return 0;
        }

        $this->htmlDom = $html;
        $this->buildWords('target');

        if ($this->countTotal <= 0) {
            // nothing to match against; also avoids a division by zero
            return 0;
        }

        $matchs = 0;
        foreach ($this->original as $k => $count) {
            if (isset($this->target[$k])) {
                $matchs += min($count, $this->target[$k]);
            }
        }

        // multiply before dividing: 29 / 100 * 100 is 28.999... in floating
        // point and would be truncated to 28
        return (int) floor($matchs * 100 / $this->countTotal);
    }

}