4 * Description of WordDiff
6 * require_once 'HTML/WordDiff.php';
9 * 'file' => '/home/press/rss/2014/03/31/3952.html'
11 * $wd = new HTML_WordDiff($init);
12 * $percent = $wd->compare('/home/press/rss/2014/03/31/3954.html');
19 //require_once 'PEAR.php';
20 //require_once 'DB/DataObject.php';
26 var $lang = 'en'; // language code of the press release (defaults to 'en')
27 var $original = array(); // word counters built from the original HTML (written by buildWords)
28 var $target = array(); // word counters built from the HTML being compared (written when buildWords runs with $target = 'target')
29 var $countTotal = 0; // sum of the counters built from the original HTML
30 var $targetTotal = 0; // sum of the counters built from the target HTML
32 //word type classification
33 var $nonSinoTibetan = array(//non Sino-Tibetan languages
45 var $sinoTibetan = array(//Sino-Tibetan languages
55 var $alternatives = array(
61 var $htmlDom = false; // raw HTML source as a string (not a DOM object, despite the name)
62 var $debug_on = false; // copied from $config['debug_on'] by the constructor
67 * @param array $config
68 * lang = language of the article
69 * file = path of the HTML file to load
70 * string = HTML contents as a string (alternative to file)
 * debug_on = optional, copied to $this->debug_on
75 function __construct($config = false)
// Validates $config and loads the HTML to analyse into $this->htmlDom.
// trigger_error() is called without a level, so every failure below is raised
// as E_USER_NOTICE, which does not stop execution by itself.
// NOTE(review): the early exits (if any) are on lines not shown - confirm them.
83 if(!is_array($config)){
84 trigger_error("Word Diff got error, the argument IS NOT array");
88 if(empty($config['lang'])){
89 trigger_error("the language is missing.");
// either a file path or an inline HTML string must be supplied
92 if(empty($config['file']) && !isset($config['string'])){
93 trigger_error("File is missing");
96 if (isset($config['debug_on'])) {
97 $this->debug_on = $config['debug_on'];
// The language must appear in one of the two classification lists.
// NOTE(review): $this->lang is tested here, but the only visible assignment
// from $config['lang'] is further down; unless it is also set on a line not
// shown, this validates the default 'en' rather than the requested language.
102 if(!in_array($this->lang, $this->nonSinoTibetan)){
103 if(!in_array($this->lang, $this->sinoTibetan)){
104 trigger_error("This ({$this->lang}) language is not on our word type classification");
// start from the inline string (or ''), then let an existing file override it
110 $this->htmlDom = isset($config['string']) ? $config['string'] : '';
113 if(isset($config['file']) && file_exists($config['file'])){
114 $this->htmlDom = file_get_contents($config['file']);
117 $this->lang = $config['lang'];
// name of the word-building method; checked for existence before use
120 $m = 'buildWords';// default run sino-tibetan
122 if(!method_exists($this, $m)){
123 trigger_error("Method not found ($m)");
131 return in_array($this->lang, $this->sinoTibetan);
135 * Set the words array
137 * for non Sino-Tibetan languages, e.g. English, French
140 * @param string $target name of the property that receives the word array ('original' by default)
143 function buildWords($target = 'original')
// Builds the word counters for the current $this->htmlDom, stores them in
// $this->$target and updates countTotal (for 'original') or targetTotal.
// Results are memoised in a static array keyed by the md5 of the HTML; being
// static, the cache is shared by every instance for the rest of the request.
145 static $cache= array();
146 if (isset($cache[md5($this->htmlDom)])) {
147 $this->$target = $cache[md5($this->htmlDom)];
// NOTE(review): this reads the fixed property $this->target, while the
// neighbouring lines use the variable property $this->$target - confirm
// which one is intended.
149 if ($this->wordMax < 0) {
150 $this->wordMax = array_sum(array_values($this->target)) * 10 ;
153 if($target == 'original'){
154 $this->countTotal = array_sum(array_values($this->$target));
156 $this->targetTotal= array_sum(array_values($this->$target));
// cache miss: extract the word list from the HTML
162 $words = $this->DomToStrings();
// a negative wordMax is replaced by ten times the number of extracted words
164 if ($this->wordMax < 0) {
165 $this->wordMax = 10 * count($words);
// debug dump; NOTE(review): presumably guarded by $this->debug_on on a line not shown
169 var_Dump("domstrings"); print_r($words);
175 foreach($words as $str){
// NOTE(review): trim() is applied to the integer length, not to $str, so the
// second test only rejects zero-length strings; empty() also rejects the
// string '0'. Probably meant strlen(trim($str)) - confirm.
177 if(empty($str) || !trim(strlen($str))) {
// counters are keyed by the pair "<$last_w>|<current word>";
// $last_w is presumably the previous word (its assignment is not shown)
181 if ($last_w !== false) {
183 if(!isset($ret[$last_w.'|'.$str])){
184 $ret[$last_w.'|'.$str] = 1;
186 $ret[$last_w.'|'.$str] += 1;
// the total is the sum of all pair counters
194 if($target == 'original'){
195 $this->countTotal = array_sum(array_values($ret));
197 $this->targetTotal= array_sum(array_values($ret));
// publish the result and remember it for identical HTML
199 $this->$target = $ret;
200 $cache[md5($this->htmlDom)] = $ret;
203 function DomToStrings($target = '')
// Parses $this->htmlDom with DOMDocument, flattens its text nodes into one
// string, normalises whitespace and encoding, and splits the text into words.
// The return statement is not shown; buildWords iterates over the result.
// NOTE(review): $charset is assigned on lines not shown here.
207 $pageDom = new DomDocument('1.0', $charset);
208 $pageDom->formatOutput = true;
// remove every "charset=..." fragment (up to the next double quote) before parsing
210 $searchPage = preg_replace('#charset=([^"]+)#', '', $this->htmlDom);
// '@' silences the warnings loadHTML emits; for UTF-8 input an XML prolog
// declaring the encoding is prepended to the markup
212 @$pageDom->loadHTML(($charset == 'UTF-8' ? '<?xml version="1.0" encoding="UTF-8"?>' : ''). $searchPage);
// collect the text of all text nodes and join them without a separator
214 $sentence = $this->parse_node($pageDom->documentElement, array(), $charset);
216 $content = implode('', $sentence);
// newlines first, then any whitespace run, become a single space
218 $content = preg_replace('/\n+/', ' ', $content);
220 $content = preg_replace('/\s+/', ' ', $content);
// with an explicit charset: gb2312 pages in zh_HK / zh_TW are converted
// UTF-8 -> gb2312 -> BIG5 -> UTF-8; the last conversion below is presumably
// the else branch (its surrounding lines are not shown)
222 if ($charset != 'auto') {
223 if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
224 $content = mb_convert_encoding($content, $charset, "UTF-8");
225 $content = mb_convert_encoding($content, "BIG5",$charset);
226 $content = mb_convert_encoding($content, "UTF-8", "BIG5");
228 $content = mb_convert_encoding($content, "UTF-8", $charset);
// walk the text one (multibyte) character at a time
234 for ($i = 0; $i < mb_strlen($content); $i++){
236 $word = mb_substr($content, $i, 1);
// characters in the CJK ranges from cjkpreg() are tested first
238 if(preg_match('/'.$this->cjkpreg().'/u', $word)){
// NOTE(review): this pattern has no /u modifier, unlike the same pattern in
// domExtractWords, so a multibyte character is examined byte-wise - confirm.
243 if (preg_match('/[^\w]+/', $word)) {
// $words is built as a string on lines not shown, then split on whitespace
251 $words = preg_split('/\s+/', trim($words));
256 function parse_node($node, $sentence, $charset)
// Recursively appends the text of every text node under $node to $sentence.
// Callers assign the result back, so the updated $sentence is presumably
// returned (the return statements are not shown). In the visible lines
// $charset is only forwarded to the recursive call.
262 if ($node->nodeType == XML_TEXT_NODE) {
263 $sentence[] = $node->textContent;
// nothing further to do for a node without children
266 if (!$node->hasChildNodes()) {
// visit the children in document order
270 for($i = 0; $i < $node->childNodes->length; $i++) {
272 $n = $node->childNodes->item($i);
274 $sentence = $this->parse_node($n, $sentence, $charset);
281 var $tmpWords = false; // scratch word list shared between domExtractWords and addUTF8Word
282 function addUTF8Word($s) {
// Callback for the preg_replace_callback() call in domExtractWords: $s is the
// match array and $s[0] (the whole match) is appended to $this->tmpWords.
// The replacement string it returns is on a line not shown.
// debug output; NOTE(review): presumably guarded by $this->debug_on on a line not shown
284 echo "calling addUTF8Word \n";
287 $this->tmpWords[] = $s[0];
288 // print_r($this->tmpWords);
292 function domExtractWords($node, $words, $charset="auto")
// Recursively walks $node and adds the words of each non-empty text node to
// $words; the recursive call assigns the result back, so the updated list is
// presumably returned (return statements are not shown).
// word budget: applies only when wordMax is positive
294 if ($this->wordMax > 0 && count($words) > $this->wordMax) {
297 //echo count($words) ."\n";
302 if ($node->nodeType == XML_TEXT_NODE && strlen(trim($node->textContent))) {// this is got the bug at sina....
304 $str = trim($node->textContent);
// debug output; NOTE(review): the echo lines here are presumably guarded by $this->debug_on on lines not shown
306 echo "node content : {$str} \n";
// with an explicit charset: gb2312 pages in zh_HK / zh_TW are converted
// UTF-8 -> gb2312 -> BIG5 -> UTF-8; the last conversion below is presumably
// the else branch (its surrounding lines are not shown)
308 if ($charset != 'auto') {
310 if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
311 //var_dump("ORIG" . $str);
312 $str = mb_convert_encoding($str, $charset, "UTF-8");
313 //var_dump("$charset:" .$str);
314 $str = mb_convert_encoding($str, "BIG5",$charset);
315 //var_dump("BIG5:".$str);
316 $str = mb_convert_encoding($str, "UTF-8", "BIG5");
317 //var_dump("UTF-8:".$str);
319 $str = mb_convert_encoding($str, "UTF-8", $charset);
323 echo "node content mb convert : {$str} \n";
325 //var_dump('xx'.$str);
// hand $words to the callback through $this->tmpWords: every match of the
// CJK class is appended there by addUTF8Word, then the list is read back
327 $this->tmpWords = $words;
328 //if ($this->isSino()) {
329 $str = preg_replace_callback('/'.$this->cjkpreg().'/u', array($this, 'addUTF8Word') , $str);
331 $words = $this->tmpWords;
332 // remove punctuation: every run of non-word characters becomes one space
333 $str = preg_replace('/[^\w]+/u', ' ', $str);
335 echo "after replace : {$str} \n";
// split what is left on whitespace
337 foreach(preg_split('/\s+/u', $str) as $word) {
339 // print_r(mb_detect_encoding($node->textContent));
345 // fixme - break unicode chars
// nothing further to do for a node without children
350 if (!$node->hasChildNodes()) {
// recurse into the children in document order, threading $words through
354 for($i = 0; $i < $node->childNodes->length; $i++) {
356 $n = $node->childNodes->item($i);
357 //if($this->debug_on){
361 $words = $this->domExtractWords($n, $words,$charset);
370 if ($ret !== false) {
374 $ret = '['.implode('', array(
375 "\x{2E80}-\x{2EFF}", # CJK Radicals Supplement
376 "\x{2F00}-\x{2FDF}", # Kangxi Radicals
377 "\x{2FF0}-\x{2FFF}", # Ideographic Description Characters
378 // "\x{3000}-\x{303F}", # CJK Symbols and Punctuation
379 "\x{3040}-\x{309F}", # Hiragana
380 "\x{30A0}-\x{30FF}", # Katakana
381 "\x{3100}-\x{312F}", # Bopomofo
382 "\x{3130}-\x{318F}", # Hangul Compatibility Jamo
383 "\x{3190}-\x{319F}", # Kanbun
384 "\x{31A0}-\x{31BF}", # Bopomofo Extended
385 "\x{31F0}-\x{31FF}", # Katakana Phonetic Extensions
386 "\x{3200}-\x{32FF}", # Enclosed CJK Letters and Months
387 "\x{3300}-\x{33FF}", # CJK Compatibility
388 "\x{3400}-\x{4DBF}", # CJK Unified Ideographs Extension A
389 "\x{4DC0}-\x{4DFF}", # Yijing Hexagram Symbols
390 "\x{4E00}-\x{9FFF}", # CJK Unified Ideographs
391 "\x{A000}-\x{A48F}", # Yi Syllables
392 "\x{A490}-\x{A4CF}", # Yi Radicals
393 "\x{AC00}-\x{D7AF}", # Hangul Syllables
394 "\x{F900}-\x{FAFF}", # CJK Compatibility Ideographs
395 "\x{FE30}-\x{FE4F}", # CJK Compatibility Forms
396 "\x{1D300}-\x{1D35F}", # Tai Xuan Jing Symbols
397 "\x{20000}-\x{2A6DF}", # CJK Unified Ideographs Extension B
398 "\x{2F800}-\x{2FA1F}" # CJK Compatibility Ideographs Supplement
410 * @param (array|string) $file either file path or array('string'=>'....')
412 * @return int $percent percentage of match
415 public function compare($file)
// Loads the target HTML into $this->htmlDom, then scores how much of the
// original's word counters is also present in the target.
// array form: the HTML is passed inline under the 'string' key
418 if (is_array($file)) {
419 $this->htmlDom = $file['string'];
422 // $this->debug_on = true;
423 // print_r('is target');
// string form: a path to an existing file
424 if(is_string($file) && file_exists($file)){
425 $this->htmlDom = file_get_contents($file);
// $m (the word-building method name) is assigned on a line not shown
431 if(!method_exists($this, $m)){
432 trigger_error("Method not found ($m)");
436 // if its language is zh_HK or zh_TW -> then
444 // print_r($this->original);
445 // print_r($this->target);// exit;
// For every key of the original that also exists in the target, the visible
// branches add the smaller of the two counters to $matchs (the equal case
// adds the original's counter).
446 foreach($this->original as $k=>$t){
447 if(!isset($this->target[$k])){
451 // $matchs += $this->original[$k] + $this->target[$k];
452 if($this->original[$k] == $this->target[$k]){
453 $matchs += $this->original[$k];
457 if($this->original[$k] > $this->target[$k]){
458 $matchs += $this->target[$k];
461 $matchs += $this->original[$k];
466 // print_R(($this->countTotal + $this->targetTotal));
// The score is relative to the original's total only.
// NOTE(review): if the original produced no words, countTotal is 0 and this
// divides by zero (DivisionByZeroError on PHP 8) unless guarded on a line
// not shown - confirm.
468 $percent = ( $matchs / ($this->countTotal) * 100);
// the cast truncates the percentage to an integer
469 return (int)$percent;