HTML/WordDiff.php
[pear] / HTML / WordDiff.php
1 <?php
2
/**
 * HTML_WordDiff - estimates how similar two HTML documents are by comparing
 * the adjacent word pairs found in their text content.
 *
 * Usage:
 *
 *     require_once 'HTML/WordDiff.php';
 *     $init = array(
 *         'lang' => 'en',
 *         'file' => '/home/press/rss/2014/03/31/3952.html'
 *     );
 *     $wd = new HTML_WordDiff($init);
 *     $percent = $wd->compare('/home/press/rss/2014/03/31/3954.html');
 *
 * @author chris
 */
18 //
19 //require_once 'PEAR.php';
20 //require_once 'DB/DataObject.php';
21
/**
 * Word-pair based similarity check between two HTML documents.
 *
 * The text of a document is tokenised into words (every CJK character is a
 * token of its own), every pair of adjacent tokens is counted, and compare()
 * reports how many of the original document's pairs also occur in the
 * target document, as a percentage of the original's pair count.
 */
class HTML_WordDiff
{
    /** @var string language code of the document being analysed */
    public $lang = 'en';

    /** @var array "word|nextWord" => occurrence count, for the original document */
    public $original = array();

    /** @var array "word|nextWord" => occurrence count, for the compare() target */
    public $target = array();

    /** @var int total number of word pairs in the original document */
    public $countTotal = 0;

    /** @var int total number of word pairs in the target document */
    public $targetTotal = 0;

    /** @var int upper bound on collected words used by domExtractWords(); -1 = not set yet */
    public $wordMax = -1;

    /** @var array language codes this class lists as "non Sino-Tibetan" */
    public $nonSinoTibetan = array(
        'aa',
        'ab',
        'en',
        'pt',
        'ar',
        'de',
        'fr',
        'es',
        'vi',
        'id',
    );

    /** @var array language codes this class lists as "Sino-Tibetan" */
    public $sinoTibetan = array(
        'my',
        'th',
        'ko',
        'zh_HK',
        'ja',
        'zh_TW',
        'zh_CN',
    );

    /** @var array not referenced by any method of this class */
    public $alternatives = array(
        '.',
        ',',
        '--'
    );

    /** @var string|false raw HTML of the document currently being processed */
    public $htmlDom = false;

    /** @var bool when true, diagnostic output is printed while parsing */
    public $debug_on = false;

    /** @var array|false scratch list shared between domExtractWords() and addUTF8Word() */
    public $tmpWords = false;

    /**
     * Constructor - loads the original document and builds its word pairs.
     *
     * Called without a config the object is left empty. An invalid config
     * raises a notice via trigger_error() and also leaves the object empty.
     *
     * @param array|false $config
     *      lang     = language of the article (required)
     *      file     = path of the HTML file to read
     *      string   = HTML contents (used when the file is absent/unreadable)
     *      debug_on = enable diagnostic output
     */
    public function __construct($config = false)
    {
        if (!$config) {
            return;
        }

        if (!is_array($config)) {
            trigger_error("Word Diff got error, the argument IS NOT array");
            return;
        }

        if (empty($config['lang'])) {
            trigger_error("the language is missing.");
            return;
        }

        if (empty($config['file']) && !isset($config['string'])) {
            trigger_error("File is missing");
            return;
        }

        if (isset($config['debug_on'])) {
            $this->debug_on = $config['debug_on'];
        }

        // The language has to be stored BEFORE it is validated; checking the
        // property first would only ever validate the default 'en'.
        $this->lang = $config['lang'];

        // An unlisted language is reported but still processed: the
        // tokeniser does not depend on the classification, and every
        // language has always been processed here.
        if (!in_array($this->lang, $this->nonSinoTibetan)
            && !in_array($this->lang, $this->sinoTibetan)
        ) {
            trigger_error("This ({$this->lang}) language is not on our word type classification");
        }

        // The file, when it exists, takes precedence over the inline string.
        $this->htmlDom = isset($config['string']) ? $config['string'] : '';
        if (isset($config['file']) && file_exists($config['file'])) {
            $this->htmlDom = file_get_contents($config['file']);
        }

        $this->buildWords();
    }

    /**
     * Tell whether the configured language is in the $sinoTibetan list.
     *
     * @return bool
     */
    public function isSino()
    {
        return in_array($this->lang, $this->sinoTibetan);
    }

    /**
     * Build the word-pair table for the HTML currently held in $htmlDom.
     *
     * The result is stored in the property named by $target and the matching
     * total (countTotal for 'original', targetTotal otherwise) is updated.
     * Results are memoised per document content in a static cache that lives
     * for the whole request and is never pruned.
     *
     * @param string $target name of the property to fill ('original' or 'target')
     *
     * @return void
     */
    public function buildWords($target = 'original')
    {
        static $cache = array();

        $key = md5($this->htmlDom);

        if (!isset($cache[$key])) {
            $words = $this->DomToStrings();

            if ($this->debug_on) {
                var_dump("domstrings");
                print_r($words);
            }

            $pairs = array();
            $previous = false;

            foreach ($words as $word) {
                // Only a genuinely empty token is skipped; "0" is a real word.
                if ($word === '') {
                    continue;
                }

                if ($previous !== false) {
                    $pairKey = $previous . '|' . $word;
                    $pairs[$pairKey] = isset($pairs[$pairKey]) ? $pairs[$pairKey] + 1 : 1;
                }

                $previous = $word;
            }

            // The token count is cached as well so that wordMax gets the same
            // value whether or not the document was served from the cache.
            $cache[$key] = array('pairs' => $pairs, 'words' => count($words));
        }

        $entry = $cache[$key];

        if ($this->wordMax < 0) {
            $this->wordMax = 10 * $entry['words'];
        }

        $total = array_sum($entry['pairs']);
        if ($target == 'original') {
            $this->countTotal = $total;
        } else {
            $this->targetTotal = $total;
        }

        $this->$target = $entry['pairs'];
    }

    /**
     * Extract the text of $htmlDom and split it into tokens.
     *
     * Every character matched by cjkpreg() becomes a token of its own; any
     * other run of word characters is one token; everything else separates
     * tokens.
     *
     * @param string $target unused, kept for backward compatibility
     *
     * @return array list of tokens (a single empty string when there is no text)
     */
    public function DomToStrings($target = '')
    {
        $charset = 'UTF-8';

        $pageDom = new DOMDocument('1.0', $charset);
        $pageDom->formatOutput = true;

        // Drop any charset declaration in the markup; the XML prolog added
        // below makes libxml read the document as UTF-8.
        $searchPage = preg_replace('#charset=([^"]+)#', '', $this->htmlDom);

        // '@' silences libxml's warnings about malformed real-world markup.
        @$pageDom->loadHTML('<?xml version="1.0" encoding="UTF-8"?>' . $searchPage);

        $sentence = $this->parse_node($pageDom->documentElement, array(), $charset);

        $content = implode('', $sentence);
        $content = (string) preg_replace('/\s+/u', ' ', $content);

        // UTF-8 to UTF-8 conversion: replaces any invalid byte sequence so
        // that the 'u' patterns below cannot fail.
        $content = mb_convert_encoding($content, 'UTF-8', $charset);

        $cjk = $this->cjkpreg();

        // Isolate every CJK character as its own token.
        $spaced = (string) preg_replace('/' . $cjk . '/u', ' $0 ', $content);

        // Everything that is neither a word character, a combining mark nor
        // a CJK character is a separator. The 'u' modifier is essential: a
        // byte-wise \w treats every non-ASCII letter as a separator and
        // splits words such as "café". Combining marks are kept explicitly
        // because not every PCRE version counts them as word characters.
        $spaced = (string) preg_replace('/(?:(?!' . $cjk . ')[^\w\p{M}])+/u', ' ', $spaced);

        return preg_split('/\s+/', trim($spaced));
    }

    /**
     * Recursively collect the content of all text nodes below $node.
     *
     * @param DOMNode|null $node     node to walk
     * @param array        $sentence text fragments collected so far
     * @param string       $charset  unused, passed through to the recursion
     *
     * @return array $sentence with this subtree's text fragments appended
     */
    public function parse_node($node, $sentence, $charset)
    {
        if (empty($node)) {
            return $sentence;
        }

        if ($node->nodeType == XML_TEXT_NODE) {
            $sentence[] = $node->textContent;
        }

        if ($node->hasChildNodes()) {
            foreach ($node->childNodes as $child) {
                $sentence = $this->parse_node($child, $sentence, $charset);
            }
        }

        return $sentence;
    }

    /**
     * preg_replace_callback() handler used by domExtractWords(): records the
     * matched character in $tmpWords and replaces it with a space.
     *
     * @param array $s regex match ($s[0] is the matched character)
     *
     * @return string the replacement, a single space
     */
    public function addUTF8Word($s)
    {
        if ($this->debug_on) {
            echo "calling addUTF8Word \n";
            print_r($s);
        }

        // Appending to a non-array is deprecated in recent PHP versions.
        if (!is_array($this->tmpWords)) {
            $this->tmpWords = array();
        }

        $this->tmpWords[] = $s[0];

        return ' ';
    }

    /**
     * Recursively collect words from the text nodes below $node.
     *
     * Collection stops descending once more than $wordMax words have been
     * gathered. Within one text node the CJK characters are appended before
     * the remaining words of that node, so token order inside a node that
     * mixes both is not preserved.
     *
     * @param DOMNode|null $node    node to walk
     * @param array        $words   words collected so far
     * @param string       $charset source charset of the text, or 'auto' for no conversion
     *
     * @return array $words with this subtree's words appended
     */
    public function domExtractWords($node, $words, $charset = "auto")
    {
        if ($this->wordMax > 0 && count($words) > $this->wordMax) {
            return $words;
        }

        if (empty($node)) {
            return $words;
        }

        if ($node->nodeType == XML_TEXT_NODE && strlen(trim($node->textContent))) {

            $str = trim($node->textContent);

            if ($this->debug_on) {
                echo "node content : {$str} \n";
            }

            if ($charset != 'auto') {
                if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
                    // Round trip through the declared charset and BIG5 before
                    // returning to UTF-8.
                    $str = mb_convert_encoding($str, $charset, "UTF-8");
                    $str = mb_convert_encoding($str, "BIG5", $charset);
                    $str = mb_convert_encoding($str, "UTF-8", "BIG5");
                } else {
                    $str = mb_convert_encoding($str, "UTF-8", $charset);
                }
            }

            if ($this->debug_on) {
                echo "node content mb convert : {$str} \n";
            }

            // addUTF8Word() appends every CJK character to $tmpWords and
            // blanks it out of $str.
            $this->tmpWords = $words;
            $str = preg_replace_callback('/' . $this->cjkpreg() . '/u', array($this, 'addUTF8Word'), $str);
            $words = $this->tmpWords;

            // Remove punctuation.
            $str = preg_replace('/[^\w]+/u', ' ', $str);

            if ($this->debug_on) {
                echo "after replace : {$str} \n";
            }

            $pieces = preg_split('/\s+/u', (string) $str);
            if (is_array($pieces)) {
                foreach ($pieces as $word) {
                    // Only empty pieces are skipped; "0" is a real word.
                    if ($word === '') {
                        continue;
                    }
                    $words[] = $word;
                }
            }
        }

        if (!$node->hasChildNodes()) {
            return $words;
        }

        foreach ($node->childNodes as $child) {
            $words = $this->domExtractWords($child, $words, $charset);
        }

        return $words;
    }

    /**
     * Character class (without delimiters) matching a single CJK character;
     * meant to be used in a pattern with the 'u' modifier.
     *
     * @return string
     */
    public function cjkpreg()
    {
        static $ret = false;

        if ($ret === false) {
            $ranges = array(
                '\x{2E80}-\x{2EFF}',      // CJK Radicals Supplement
                '\x{2F00}-\x{2FDF}',      // Kangxi Radicals
                '\x{2FF0}-\x{2FFF}',      // Ideographic Description Characters
                // \x{3000}-\x{303F} (CJK Symbols and Punctuation) is left out
                '\x{3040}-\x{309F}',      // Hiragana
                '\x{30A0}-\x{30FF}',      // Katakana
                '\x{3100}-\x{312F}',      // Bopomofo
                '\x{3130}-\x{318F}',      // Hangul Compatibility Jamo
                '\x{3190}-\x{319F}',      // Kanbun
                '\x{31A0}-\x{31BF}',      // Bopomofo Extended
                '\x{31F0}-\x{31FF}',      // Katakana Phonetic Extensions
                '\x{3200}-\x{32FF}',      // Enclosed CJK Letters and Months
                '\x{3300}-\x{33FF}',      // CJK Compatibility
                '\x{3400}-\x{4DBF}',      // CJK Unified Ideographs Extension A
                '\x{4DC0}-\x{4DFF}',      // Yijing Hexagram Symbols
                '\x{4E00}-\x{9FFF}',      // CJK Unified Ideographs
                '\x{A000}-\x{A48F}',      // Yi Syllables
                '\x{A490}-\x{A4CF}',      // Yi Radicals
                '\x{AC00}-\x{D7AF}',      // Hangul Syllables
                '\x{F900}-\x{FAFF}',      // CJK Compatibility Ideographs
                '\x{FE30}-\x{FE4F}',      // CJK Compatibility Forms
                '\x{1D300}-\x{1D35F}',    // Tai Xuan Jing Symbols
                '\x{20000}-\x{2A6DF}',    // CJK Unified Ideographs Extension B
                '\x{2F800}-\x{2FA1F}',    // CJK Compatibility Ideographs Supplement
            );

            $ret = '[' . implode('', $ranges) . ']';
        }

        return $ret;
    }

    /**
     * Compare a target document against the original given to the constructor.
     *
     * For every word pair of the original, the smaller of its two occurrence
     * counts (original vs. target) is counted as matched. The result is the
     * matched count relative to the ORIGINAL's pair total, so the comparison
     * is not symmetric.
     *
     * @param array|string $file either a file path or array('string' => '....')
     *
     * @return int percentage of match (0-100); 0 when the target file is
     *             missing or the original has no word pairs
     */
    public function compare($file)
    {
        if (is_array($file)) {
            $this->htmlDom = isset($file['string']) ? $file['string'] : '';
        } elseif (is_string($file)) {
            if (!file_exists($file)) {
                // Without this guard the previous document would silently be
                // compared against itself.
                trigger_error("File is missing ($file)");
                return 0;
            }
            $this->htmlDom = file_get_contents($file);
        }

        $this->buildWords('target');

        // An empty or one-word original has no pairs; avoid dividing by zero.
        if ($this->countTotal <= 0) {
            return 0;
        }

        $matchs = 0;
        foreach ($this->original as $pair => $count) {
            if (isset($this->target[$pair])) {
                $matchs += min($count, $this->target[$pair]);
            }
        }

        return (int) ($matchs / $this->countTotal * 100);
    }
}