4 * Description of WordDiff
6 * require_once 'HTML/WordDiff.php';
9 * 'file' => '/home/press/rss/2014/03/31/3952.html'
11 * $wd = new HTML_WordDiff($init);
12 * $percent = $wd->compare('/home/press/rss/2014/03/31/3954.html');
19 //require_once 'PEAR.php';
20 //require_once 'DB/DataObject.php';
26 var $lang = 'en'; // the press release language
27 var $original = array(); // original html words
28 var $target = array(); // diff target html words
29 var $countTotal = 0; // Total word count from the original html
30 var $targetTotal = 0; // Total word count from the target html
32 //word type classification
33 var $nonSinoTibetan = array(//non Sino-Tibetan languages
45 var $sinoTibetan = array(//Sino-Tibetan languages
55 var $alternatives = array(
61 var $htmlDom = false; // HTML Dom elements
62 var $debug_on = false;
67 * @param Array $config
68 * lang = language of article
69 * file = name of file...
70 * string = string contents
75 function __construct($config = false)
83 if(!is_array($config)){
84 trigger_error("Word Diff got error, the argument IS NOT array");
88 if(empty($config['lang'])){
89 trigger_error("the language is missing.");
92 if(empty($config['file']) && !isset($config['string'])){
93 trigger_error("File is missing");
96 if (isset($config['debug_on'])) {
97 $this->debug_on = $config['debug_on'];
102 if(!in_array($this->lang, $this->nonSinoTibetan)){
103 if(!in_array($this->lang, $this->sinoTibetan)){
104 trigger_error("This ({$this->lang}) language is not on our word type classification");
110 $this->htmlDom = isset($config['string']) ? $config['string'] : '';
113 if(isset($config['file']) && file_exists($config['file'])){
114 $this->htmlDom = file_get_contents($config['file']);
117 $this->lang = $config['lang'];
120 $m = 'buildWords';// default run sino-tibetan
122 if(!method_exists($this, $m)){
123 trigger_error("Method not found ($m)");
131 return in_array($this->lang, $this->sinoTibetan);
135 * set the words array
137 * for non Sino-Tibetan languages, e.g. English, French
140 * @param string $target name of the property ('original' or 'target') that receives the word counts
143 function buildWords($target = 'original')
145 static $cache= array();
146 if (isset($cache[md5($this->htmlDom)])) {
147 $this->$target = $cache[md5($this->htmlDom)];
149 if ($this->wordMax < 0) {
151 $this->wordMax = array_sum(array_values($this->$target)) * 10 ; // sum the property named by $target (just filled from the cache), not the fixed ->target array
153 if($target == 'original'){
154 $this->countTotal = array_sum(array_values($this->$target));
156 $this->targetTotal= array_sum(array_values($this->$target));
162 $a = $this->DomToStrings();
164 if ($this->wordMax < 0) {
165 $this->wordMax = 10*count($a);
168 var_Dump("domstrings"); print_r($a);
175 if(empty($str) || !strlen(trim($str))) { // true for empty() or whitespace-only $str: trim first, then measure
178 // if(!isset($ret[$str])){
184 // now deal with pairing..
185 if ($last_w !== false) {
187 if(!isset($ret[$last_w.'|'.$str])){
188 $ret[$last_w.'|'.$str] = 1;
191 $ret[$last_w.'|'.$str] += 1;
198 if($target == 'original'){
199 $this->countTotal = array_sum(array_values($ret));
201 $this->targetTotal= array_sum(array_values($ret));
203 $this->$target = $ret;
204 $cache[md5($this->htmlDom)] = $ret;
207 function DomToStrings($target = '')
210 //if (preg_match('#charset=([^"]+)#', $this->htmlDom,$matches)) {
211 //var_dump($matches);exit;
212 // $charset = $matches[1];
215 $pageDom = new DomDocument('1.0', $charset);
216 $pageDom->formatOutput = true;
218 // change language if encoding does not match...
222 // print_r(mb_detect_encoding($this->htmlDom));
224 // may produce errors - so we hide them...
225 $searchPage = preg_replace('#charset=([^"]+)#', '', $this->htmlDom);
226 //$searchPage = $this->htmlDom; //@mb_convert_encoding($this->htmlDom, $charset , $charset=="UTF-8" ? "auto" :$charset);
229 // $searchPage = mb_convert_encoding($this->htmlDom, "UTF-8", "HTML-ENTITIES");
231 // print_r(mb_detect_encoding($searchPage));
233 // $searchPage = mb_convert_encoding($this->htmlDom, "big5");
234 // if($target == 'target'){
235 // print_r($searchPage);
238 // print_r(mb_detect_encoding($searchPage));
239 // print_r($searchPage);
240 @$pageDom->loadHTML(($charset == 'UTF-8' ? '<?xml version="1.0" encoding="UTF-8"?>' : ''). $searchPage);
242 $words = $this->domExtractWords($pageDom->documentElement, array(), $charset);
243 // print_r($words);exit;
245 // $string = preg_replace('/[^\pL\pS\pN]/u', '-', $pageDom->documentElement->getElementsByTagName('body')->item(0)->textContent);
249 // print_r($pageDom->saveHTML());;
257 var $tmpWords = false;
258 function addUTF8Word($s) {
259 $this->tmpWords[] = $s[0];
260 // print_r($this->tmpWords);
264 function domExtractWords($node, $words, $charset="auto")
266 if ($this->wordMax > 0 && count($words) > $this->wordMax) {
269 //echo count($words) ."\n";
273 if ($node->nodeType == XML_TEXT_NODE && strlen(trim($node->textContent))) {// this is got the bug at sina....
275 $str = trim($node->textContent);
276 if ($charset != 'auto') {
278 if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
279 //var_dump("ORIG" . $str);
280 $str = mb_convert_encoding($str, $charset, "UTF-8");
281 //var_dump("$charset:" .$str);
282 $str = mb_convert_encoding($str, "BIG5",$charset);
283 //var_dump("BIG5:".$str);
284 $str = mb_convert_encoding($str, "UTF-8", "BIG5");
285 //var_dump("UTF-8:".$str);
287 $str = mb_convert_encoding($str, "UTF-8", $charset);
293 echo mb_strlen($str) . "\n";
295 for ($i = 1; $i <= mb_strlen($str); $i++){
296 echo mb_substr($str, 0, $i) . "\n";
302 //var_dump('xx'.$str);
304 $this->tmpWords = $words;
305 //if ($this->isSino()) {
306 $str = preg_replace_callback('/'.$this->cjkpreg().'/u', array($this, 'addUTF8Word') , $str);
308 $words = $this->tmpWords;
309 // remove punctuation..
310 $str = preg_replace('/[^\w]+/u', ' ', $str);
312 foreach(preg_split('/\s+/u', $str) as $word) {
314 // print_r(mb_detect_encoding($node->textContent));
320 // fixme - break unicode chars
325 if (!$node->hasChildNodes()) {
329 for($i = 0; $i < $node->childNodes->length; $i++) {
331 $n = $node->childNodes->item($i);
332 //if($this->debug_on){
336 $words = $this->domExtractWords($n, $words,$charset);
345 if ($ret !== false) {
349 $ret = '['.implode('', array(
350 "\x{2E80}-\x{2EFF}", # CJK Radicals Supplement
351 "\x{2F00}-\x{2FDF}", # Kangxi Radicals
352 "\x{2FF0}-\x{2FFF}", # Ideographic Description Characters
353 "\x{3000}-\x{303F}", # CJK Symbols and Punctuation
354 "\x{3040}-\x{309F}", # Hiragana
355 "\x{30A0}-\x{30FF}", # Katakana
356 "\x{3100}-\x{312F}", # Bopomofo
357 "\x{3130}-\x{318F}", # Hangul Compatibility Jamo
358 "\x{3190}-\x{319F}", # Kanbun
359 "\x{31A0}-\x{31BF}", # Bopomofo Extended
360 "\x{31F0}-\x{31FF}", # Katakana Phonetic Extensions
361 "\x{3200}-\x{32FF}", # Enclosed CJK Letters and Months
362 "\x{3300}-\x{33FF}", # CJK Compatibility
363 "\x{3400}-\x{4DBF}", # CJK Unified Ideographs Extension A
364 "\x{4DC0}-\x{4DFF}", # Yijing Hexagram Symbols
365 "\x{4E00}-\x{9FFF}", # CJK Unified Ideographs
366 "\x{A000}-\x{A48F}", # Yi Syllables
367 "\x{A490}-\x{A4CF}", # Yi Radicals
368 "\x{AC00}-\x{D7AF}", # Hangul Syllables
369 "\x{F900}-\x{FAFF}", # CJK Compatibility Ideographs
370 "\x{FE30}-\x{FE4F}", # CJK Compatibility Forms
371 "\x{1D300}-\x{1D35F}", # Tai Xuan Jing Symbols
372 "\x{20000}-\x{2A6DF}", # CJK Unified Ideographs Extension B
373 "\x{2F800}-\x{2FA1F}" # CJK Compatibility Ideographs Supplement
385 * @param (array|string) $file either file path or array('string'=>'....')
387 * @return int $percent percentage of match
390 public function compare($file)
393 if (is_array($file)) {
394 $this->htmlDom = $file['string'];
397 // $this->debug_on = true;
398 // print_r('is target');
399 if(is_string($file) && file_exists($file)){
400 $this->htmlDom = file_get_contents($file);
406 if(!method_exists($this, $m)){
407 trigger_error("Method not found ($m)");
411 // if its language is zh_HK or zh_TW -> then
419 // print_r($this->original);
420 // print_r($this->target);// exit;
421 foreach($this->original as $k=>$t){
422 if(!isset($this->target[$k])){
426 // $matchs += $this->original[$k] + $this->target[$k];
427 if($this->original[$k] == $this->target[$k]){
428 $matchs += $this->original[$k];
432 if($this->original[$k] > $this->target[$k]){
433 $matchs += $this->target[$k];
436 $matchs += $this->original[$k];
441 // print_R(($this->countTotal + $this->targetTotal));
443 $percent = $this->countTotal ? ( $matchs / ($this->countTotal) * 100) : 0; // report 0% instead of dividing by zero when the original has no counted words
444 return (int)$percent;