4 * Description of WordDiff
6 * require_once 'HTML/WordDiff.php';
9 * 'file' => '/home/press/rss/2014/03/31/3952.html'
11 * $wd = new HTML_WordDiff($init);
12 * $percent = $wd->compare('/home/press/rss/2014/03/31/3954.html');
19 //require_once 'PEAR.php';
20 //require_once 'DB/DataObject.php';
// Instance state shared by the constructor, buildWords() and compare().
26 var $lang = 'en'; // the press release language
27 var $original = array(); // original html words: key => occurrence count, filled by buildWords('original')
28 var $target = array(); // diff target html words: key => occurrence count, filled by buildWords() for the compared document
29 var $countTotal = 0; // Total word count from original html (sum of the counts in $original)
30 var $targetTotal = 0; // Total word count from target html (sum of the counts in $target)
32 //word type classification
33 var $nonSinoTibetan = array(//non Sino-Tibetan languages
45 var $sinoTibetan = array(//Sino-Tibetan languages
55 var $alternatives = array(
61 var $htmlDom = false; // despite the name this holds the raw HTML markup string (from config 'string' or file contents), not DOM nodes
62 var $debug_on = false; // debug flag; the constructor copies $config['debug_on'] into it when that key is set
67 * @param Array $config
68 * lang = language of article
69 * file = name of file...
70 * string = string contents
// Validates $config and loads the "original" document markup into $this->htmlDom.
// Problems are reported through trigger_error() with no level argument, i.e. as
// E_USER_NOTICE, so they do not stop execution by themselves.
75 function __construct($config = false)
// $config must be an array ('lang', plus 'file' and/or 'string', optional 'debug_on').
83 if(!is_array($config)){
84 trigger_error("Word Diff got error, the argument IS NOT array");
88 if(empty($config['lang'])){
89 trigger_error("the language is missing.");
// A source is required: a non-empty 'file' path or a 'string' key (an empty string is accepted).
92 if(empty($config['file']) && !isset($config['string'])){
93 trigger_error("File is missing");
96 if (isset($config['debug_on'])) {
97 $this->debug_on = $config['debug_on'];
// NOTE(review): $this->lang is only assigned from $config['lang'] further down;
// confirm this check sees the configured language and not the default 'en'.
// NOTE(review): in_array() is used in loose mode here; fine for string codes, confirm.
102 if(!in_array($this->lang, $this->nonSinoTibetan)){
103 if(!in_array($this->lang, $this->sinoTibetan)){
104 trigger_error("This ({$this->lang}) language is not on our word type classification");
// Start from the 'string' contents (or an empty string) ...
110 $this->htmlDom = isset($config['string']) ? $config['string'] : '';
// ... and let an existing 'file' replace them when both are supplied.
113 if(isset($config['file']) && file_exists($config['file'])){
114 $this->htmlDom = file_get_contents($config['file']);
117 $this->lang = $config['lang'];
// Name of the word-building method; presumably invoked dynamically after the
// existence check below -- TODO confirm.
120 $m = 'buildWords';// default run sino-tibetan
122 if(!method_exists($this, $m)){
123 trigger_error("Method not found ($m)");
131 return in_array($this->lang, $this->sinoTibetan);
135 * set the words array
137 * for non Sino-Tibetan languages, e.g. English, French
140 * @param string $target name of the property to fill ('original' or 'target')
// Builds the word-count map for the markup currently held in $this->htmlDom and
// stores it in the property named by $target, updating the matching total
// ($countTotal for 'original', $targetTotal in the other branch).
143 function buildWords($target = 'original')
// Static local: results are memoised per md5 of the markup and persist across calls.
145 static $cache= array();
146 if (isset($cache[md5($this->htmlDom)])) {
147 $this->$target = $cache[md5($this->htmlDom)];
// A negative wordMax is treated as "not configured yet" -- presumably a sentinel; TODO confirm.
// NOTE(review): this reads $this->target (the fixed property) while every other
// line in this method uses $this->$target (the property named by the argument).
// Looks like a missing '$'; confirm before relying on wordMax for cached 'original' runs.
149 if ($this->wordMax < 0) {
150 $this->wordMax = array_sum(array_values($this->target)) * 10 ;
153 if($target == 'original'){
154 $this->countTotal = array_sum(array_values($this->$target));
156 $this->targetTotal= array_sum(array_values($this->$target));
// Cache miss: extract the word list from the markup.
162 $a = $this->DomToStrings();
164 if ($this->wordMax < 0) {
165 $this->wordMax = 10*count($a);
// NOTE(review): debug dump; presumably guarded by $this->debug_on -- confirm it
// cannot run in production.
168 var_Dump("domstrings"); print_r($a);
// NOTE(review): trim(strlen($str)) trims the *length*, not the string, so a
// whitespace-only $str is not rejected by the second test; strlen(trim($str))
// was probably intended. Also note empty("0") is true, so the word "0" is skipped.
175 if(empty($str) || !trim(strlen($str))) {
178 // if(!isset($ret[$str])){
184 // now deal with pairing..
// Consecutive words are counted as a pair under the key "<previous>|<current>".
185 if ($last_w !== false) {
187 if(!isset($ret[$last_w.'|'.$str])){
188 $ret[$last_w.'|'.$str] = 1;
191 $ret[$last_w.'|'.$str] += 1;
// Totals are the sum of all counts in the map.
198 if($target == 'original'){
199 $this->countTotal = array_sum(array_values($ret));
201 $this->targetTotal= array_sum(array_values($ret));
203 $this->$target = $ret;
204 $cache[md5($this->htmlDom)] = $ret;
// Parses $this->htmlDom with DOMDocument, flattens its text nodes into one
// whitespace-normalised string and splits that string into words.
// NOTE(review): $target is not referenced in the statements annotated here -- confirm it is used at all.
207 function DomToStrings($target = '')
211 $pageDom = new DomDocument('1.0', $charset);
212 $pageDom->formatOutput = true;
// Remove any charset=... declaration from the markup before it is parsed.
214 $searchPage = preg_replace('#charset=([^"]+)#', '', $this->htmlDom);
// '@' silences the parser warnings loadHTML() emits; for UTF-8 an XML prolog is
// prepended, presumably to make the parser read the input as UTF-8 -- TODO confirm.
216 @$pageDom->loadHTML(($charset == 'UTF-8' ? '<?xml version="1.0" encoding="UTF-8"?>' : ''). $searchPage);
// Collect the text of every text node, starting from the root element.
218 $sentence = $this->parse_node($pageDom->documentElement, array(), $charset);
220 $content = implode('', $sentence);
// Collapse newlines, then every whitespace run, to a single space.
222 $content = preg_replace('/\n+/u', ' ', $content);
224 $content = preg_replace('/\s+/u', ' ', $content);
226 if ($charset != 'auto') {
// zh_HK / zh_TW content declared as gb2312 is re-encoded UTF-8 -> gb2312 -> BIG5 -> UTF-8.
// NOTE(review): this changes encodings only; it does not map simplified glyphs
// to traditional ones, and characters with no BIG5 code point are presumably
// replaced or lost in the round trip -- confirm that is the intent.
227 if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
228 $content = mb_convert_encoding($content, $charset, "UTF-8");
229 $content = mb_convert_encoding($content, "BIG5",$charset);
230 $content = mb_convert_encoding($content, "UTF-8", "BIG5");
232 $content = mb_convert_encoding($content, "UTF-8", $charset);
// Character-by-character scan.
// NOTE(review): mb_strlen() is re-evaluated on every iteration; hoisting it would
// avoid rescanning the string each time.
238 for ($i = 0; $i < mb_strlen($content); $i++){
240 $word = mb_substr($content, $i, 1);
// NOTE(review): ctype_punct() is byte/locale based, so it presumably only
// recognises ASCII punctuation here -- confirm for multibyte characters.
244 if (ctype_punct($word)) {
// A single character matching one of the CJK ranges is handled on its own.
250 if(preg_match('/'.$this->cjkpreg().'/u', $word)){
// NOTE(review): dumps $words and terminates the script; presumably debug-only --
// confirm it is unreachable in normal operation.
259 print_R($words);exit;
// Whitespace split (no /u modifier here, unlike the patterns above).
261 foreach(preg_split('/\s+/', $content) as $word) {
// Same charset conversion chain as above, applied to $str.
269 if ($charset != 'auto') {
271 if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
272 //var_dump("ORIG" . $str);
273 $str = mb_convert_encoding($str, $charset, "UTF-8");
274 //var_dump("$charset:" .$str);
275 $str = mb_convert_encoding($str, "BIG5",$charset);
276 //var_dump("BIG5:".$str);
277 $str = mb_convert_encoding($str, "UTF-8", "BIG5");
278 //var_dump("UTF-8:".$str);
281 $str = mb_convert_encoding($str, "UTF-8", $charset);
// Recursively walks $node and appends the textContent of each text node to
// $sentence; callers use the result as the updated array.
// $charset is only forwarded to the recursive call in the statements annotated here.
291 function parse_node($node, $sentence, $charset)
297 if ($node->nodeType == XML_TEXT_NODE) {
298 $sentence[] = $node->textContent;
// Nodes without children end the recursion.
301 if (!$node->hasChildNodes()) {
// Visit children in document order, threading the accumulator through each call.
305 for($i = 0; $i < $node->childNodes->length; $i++) {
307 $n = $node->childNodes->item($i);
309 $sentence = $this->parse_node($n, $sentence, $charset);
// Scratch accumulator shared between domExtractWords() and the callback below.
316 var $tmpWords = false;
// preg_replace_callback() handler used by domExtractWords(): $s is the match
// array and $s[0] the full match, which is appended to $this->tmpWords.
// NOTE(review): the value this callback returns becomes the replacement text --
// confirm what it returns.
317 function addUTF8Word($s) {
// NOTE(review): unconditional-looking debug output; presumably guarded by
// $this->debug_on -- confirm.
319 echo "calling addUTF8Word \n";
322 $this->tmpWords[] = $s[0];
323 // print_r($this->tmpWords);
// Recursively extracts words from $node and its descendants into $words.
// The recursive call below uses the return value as the updated array.
327 function domExtractWords($node, $words, $charset="auto")
// Cap on collected words; only enforced when wordMax is positive.
329 if ($this->wordMax > 0 && count($words) > $this->wordMax) {
332 //echo count($words) ."\n";
// Only non-blank text nodes contribute words.
337 if ($node->nodeType == XML_TEXT_NODE && strlen(trim($node->textContent))) {// this is got the bug at sina....
339 $str = trim($node->textContent);
// NOTE(review): the echo statements in this method are presumably wrapped in
// debug_on checks -- confirm.
341 echo "node content : {$str} \n";
343 if ($charset != 'auto') {
// zh_HK / zh_TW content declared as gb2312 is re-encoded UTF-8 -> gb2312 -> BIG5 -> UTF-8.
345 if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
346 //var_dump("ORIG" . $str);
347 $str = mb_convert_encoding($str, $charset, "UTF-8");
348 //var_dump("$charset:" .$str);
349 $str = mb_convert_encoding($str, "BIG5",$charset);
350 //var_dump("BIG5:".$str);
351 $str = mb_convert_encoding($str, "UTF-8", "BIG5");
352 //var_dump("UTF-8:".$str);
354 $str = mb_convert_encoding($str, "UTF-8", $charset);
358 echo "node content mb convert : {$str} \n";
360 //var_dump('xx'.$str);
// Hand the accumulator to the callback through $this->tmpWords: every character
// matching the CJK ranges is passed to addUTF8Word(), which appends it.
362 $this->tmpWords = $words;
363 //if ($this->isSino()) {
364 $str = preg_replace_callback('/'.$this->cjkpreg().'/u', array($this, 'addUTF8Word') , $str);
366 $words = $this->tmpWords;
// remove punctuation: every run of non-word characters becomes a single space
368 $str = preg_replace('/[^\w]+/u', ' ', $str);
370 echo "after replace : {$str} \n";
// What is left is split on whitespace into individual words.
372 foreach(preg_split('/\s+/u', $str) as $word) {
374 // print_r(mb_detect_encoding($node->textContent));
380 // fixme - break unicode chars
385 if (!$node->hasChildNodes()) {
// Recurse into children in document order, threading $words through each call.
389 for($i = 0; $i < $node->childNodes->length; $i++) {
391 $n = $node->childNodes->item($i);
392 //if($this->debug_on){
396 $words = $this->domExtractWords($n, $words,$charset);
// Builds the regex character class used by the '/'.$this->cjkpreg().'/u' patterns.
// The $ret !== false test suggests the string is built once and reused --
// presumably $ret is a static local; TODO confirm.
405 if ($ret !== false) {
// The ranges are concatenated into one bracket expression. Several lie above
// U+FFFF, so the pattern is only meaningful with the /u modifier the callers use.
409 $ret = '['.implode('', array(
410 "\x{2E80}-\x{2EFF}", # CJK Radicals Supplement
411 "\x{2F00}-\x{2FDF}", # Kangxi Radicals
412 "\x{2FF0}-\x{2FFF}", # Ideographic Description Characters
413 "\x{3000}-\x{303F}", # CJK Symbols and Punctuation
414 "\x{3040}-\x{309F}", # Hiragana
415 "\x{30A0}-\x{30FF}", # Katakana
416 "\x{3100}-\x{312F}", # Bopomofo
417 "\x{3130}-\x{318F}", # Hangul Compatibility Jamo
418 "\x{3190}-\x{319F}", # Kanbun
419 "\x{31A0}-\x{31BF}", # Bopomofo Extended
420 "\x{31F0}-\x{31FF}", # Katakana Phonetic Extensions
421 "\x{3200}-\x{32FF}", # Enclosed CJK Letters and Months
422 "\x{3300}-\x{33FF}", # CJK Compatibility
423 "\x{3400}-\x{4DBF}", # CJK Unified Ideographs Extension A
424 "\x{4DC0}-\x{4DFF}", # Yijing Hexagram Symbols
425 "\x{4E00}-\x{9FFF}", # CJK Unified Ideographs
426 "\x{A000}-\x{A48F}", # Yi Syllables
427 "\x{A490}-\x{A4CF}", # Yi Radicals
428 "\x{AC00}-\x{D7AF}", # Hangul Syllables
429 "\x{F900}-\x{FAFF}", # CJK Compatibility Ideographs
430 "\x{FE30}-\x{FE4F}", # CJK Compatibility Forms
431 "\x{1D300}-\x{1D35F}", # Tai Xuan Jing Symbols
432 "\x{20000}-\x{2A6DF}", # CJK Unified Ideographs Extension B
433 "\x{2F800}-\x{2FA1F}" # CJK Compatibility Ideographs Supplement
445 * @param (array|string) $file either file path or array('string'=>'....')
447 * @return int $percent percentage of match
// Loads the target document, then scores how much of the original's word map
// also appears in the target, as an integer percentage of the original's total.
450 public function compare($file)
// The target markup replaces $this->htmlDom (the original's markup is overwritten).
453 if (is_array($file)) {
454 $this->htmlDom = $file['string'];
457 // $this->debug_on = true;
458 // print_r('is target');
// NOTE(review): when $file is a path that does not exist, htmlDom is not replaced
// by this statement -- confirm the original is not then compared with itself.
459 if(is_string($file) && file_exists($file)){
460 $this->htmlDom = file_get_contents($file);
466 if(!method_exists($this, $m)){
467 trigger_error("Method not found ($m)");
471 // if its language is zh_HK or zh_TW -> then
479 // print_r($this->original);
480 // print_r($this->target);// exit;
// Walk every key of the original map; keys absent from the target are handled
// by the isset() branch below.
481 foreach($this->original as $k=>$t){
482 if(!isset($this->target[$k])){
486 // $matchs += $this->original[$k] + $this->target[$k];
// The three branches below add, in effect, the smaller of the two counts:
// equal -> original count; original larger -> target count; otherwise original count.
487 if($this->original[$k] == $this->target[$k]){
488 $matchs += $this->original[$k];
492 if($this->original[$k] > $this->target[$k]){
493 $matchs += $this->target[$k];
496 $matchs += $this->original[$k];
501 // print_R(($this->countTotal + $this->targetTotal));
// The score is relative to the original's total only; targetTotal is not used,
// so extra words in the target do not lower it.
// NOTE(review): divides by $this->countTotal, which is 0 for an original with no
// words -- confirm an earlier guard exists, otherwise this is a division by zero.
503 $percent = ( $matchs / ($this->countTotal) * 100);
// The cast truncates the fractional part.
504 return (int)$percent;