4 * Description of WordDiff
6 * require_once 'HTML/WordDiff.php';
9 * 'file' => '/home/press/rss/2014/03/31/3952.html'
11 * $wd = new HTML_WordDiff($init);
12 * $percent = $wd->compare('/home/press/rss/2014/03/31/3954.html');
19 //require_once 'PEAR.php';
20 //require_once 'DB/DataObject.php';
26 var $lang = 'en'; // language code of the press release; defaults to 'en'
27 var $original = array(); // word-pair counts of the original HTML (filled by buildWords('original'))
28 var $target = array(); // word-pair counts of the HTML being compared against
29 var $countTotal = 0; // sum of all counts stored in $original
30 var $targetTotal = 0; // sum of all counts stored in $target
32 // word type classification: lists tested with in_array() against $lang
33 var $nonSinoTibetan = array(//non Sino-Tibetan languages
45 var $sinoTibetan = array(//Sino-Tibetan languages
// NOTE(review): $alternatives is not referenced by any line visible in this excerpt - confirm it is still used.
55 var $alternatives = array(
61 var $htmlDom = false; // despite the name, holds the raw HTML text (config string or file contents)
// Debug switch; the constructor overrides it from $config['debug_on'] when that key is set.
62 var $debug_on = false;
67 * @param Array $config
68 * lang = language of article
69 * file = name of file...
70 * string = string contents
75 function __construct($config = false)
// Configures the differ from $config and loads the HTML text of the original document.
// Keys read by the visible code: lang, file, string, debug_on.
// Problems are reported through trigger_error(); what happens after each report
// (return or continue) is on lines not visible in this excerpt.
// The config must be an array.
83 if(!is_array($config)){
84 trigger_error("Word Diff got error, the argument IS NOT array");
// A non-empty language code is required.
88 if(empty($config['lang'])){
89 trigger_error("the language is missing.");
// A source is required: either a non-empty file entry or a string entry.
92 if(empty($config['file']) && !isset($config['string'])){
93 trigger_error("File is missing");
96 if (isset($config['debug_on'])) {
97 $this->debug_on = $config['debug_on'];
// The language must appear in one of the two classification lists.
// NOTE(review): this tests $this->lang, but the only visible assignment from
// $config['lang'] is further down, so the check appears to run against the
// default language rather than the requested one - confirm against the full file.
102 if(!in_array($this->lang, $this->nonSinoTibetan)){
103 if(!in_array($this->lang, $this->sinoTibetan)){
104 trigger_error("This ({$this->lang}) language is not on our word type classification");
// Start from the inline string (or an empty string) ...
110 $this->htmlDom = isset($config['string']) ? $config['string'] : '';
// ... and let the contents of an existing file replace it.
113 if(isset($config['file']) && file_exists($config['file'])){
114 $this->htmlDom = file_get_contents($config['file']);
117 $this->lang = $config['lang'];
// Name of the word-building method; its existence is verified next.
// The call itself is presumably on a line not shown here.
120 $m = 'buildWords';// default run sino-tibetan
122 if(!method_exists($this, $m)){
123 trigger_error("Method not found ($m)");
131 return in_array($this->lang, $this->sinoTibetan);
135 * set the words array
137 * for non Sino-Tibetan languages etc. English, French
140 * @param string $target name of the property to fill ('original' or 'target')
143 function buildWords($target = 'original')
// Counts adjacent word pairs in the current HTML text and stores the result in the
// property named by $target. Also refreshes countTotal (when $target is 'original')
// or targetTotal (otherwise), and seeds wordMax while it is still negative.
// Results are memoised by md5 of the HTML text; being a static local, the cache
// lives for the whole script run.
145 static $cache= array();
// Cache hit: reuse the stored pair counts for identical HTML text.
147 if (isset($cache[md5($this->htmlDom)])) {
149 $this->$target = $cache[md5($this->htmlDom)];
// NOTE(review): here wordMax is ten times the sum of pair counts, while the
// uncached path below uses ten times the number of words - confirm which is intended.
151 if ($this->wordMax < 0) {
152 $this->wordMax = array_sum(array_values($this->$target)) * 10 ;
155 if($target == 'original'){
156 $this->countTotal = array_sum(array_values($this->$target));
158 $this->targetTotal= array_sum(array_values($this->$target));
// Cache miss: extract the word list from the HTML text.
164 $words = $this->DomToStrings();
166 if ($this->wordMax < 0) {
167 $this->wordMax = 10 * count($words);
// NOTE(review): debug output; presumably guarded by $this->debug_on on a line not shown - confirm.
171 var_Dump("domstrings"); print_r($words);
177 foreach($words as $str){
// NOTE(review): trim(strlen($str)) trims the length, not the string;
// strlen(trim($str)) was probably intended. empty() also treats the word "0" as empty.
179 if(empty($str) || !trim(strlen($str))) {
// Only count a pair once a previous word exists.
183 if ($last_w !== false) {
// Pair key is the previous word, a pipe character, then the current word.
185 if(!isset($ret[$last_w.'|'.$str])){
186 $ret[$last_w.'|'.$str] = 1;
188 $ret[$last_w.'|'.$str] += 1;
// Record the total number of pairs for the side that was just built.
196 if($target == 'original'){
197 $this->countTotal = array_sum(array_values($ret));
199 $this->targetTotal= array_sum(array_values($ret));
202 $this->$target = $ret;
// Remember the result for later calls with the same HTML text.
204 $cache[md5($this->htmlDom)] = $ret;
208 function DomToStrings($target = '')
// Parses the current HTML text with DomDocument, gathers its text nodes and breaks
// the text into words; buildWords() iterates over the result as a list of words.
// NOTE(review): $target is not used on any visible line, and the assignment of
// $charset is not visible in this excerpt - confirm both against the full file.
212 $pageDom = new DomDocument('1.0', $charset);
213 $pageDom->formatOutput = true;
// Strip any charset=... declaration from the markup before parsing.
215 $searchPage = preg_replace('#charset=([^"]+)#', '', $this->htmlDom);
// Parse warnings are suppressed with @. For UTF-8 an XML declaration is prepended,
// presumably so the parser reads the input as UTF-8.
217 @$pageDom->loadHTML(($charset == 'UTF-8' ? '<?xml version="1.0" encoding="UTF-8"?>' : ''). $searchPage);
// Collect the text node contents of the whole document.
219 $sentence = $this->domExtractWords($pageDom->documentElement, array(), $charset);
// Join the pieces, then collapse newlines and whitespace runs into single spaces.
224 $content = implode('', $sentence);
226 $content = preg_replace('/\n+/', ' ', $content);
228 $content = preg_replace('/\s+/', ' ', $content);
// Re-encode unless the charset is 'auto'. For zh_HK / zh_TW documents declared as
// gb2312 the text goes UTF-8 to gb2312 to BIG5 and back to UTF-8; otherwise it is
// converted from $charset to UTF-8.
230 if ($charset != 'auto') {
231 if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
232 $content = mb_convert_encoding($content, $charset, "UTF-8");
233 $content = mb_convert_encoding($content, "BIG5",$charset);
234 $content = mb_convert_encoding($content, "UTF-8", "BIG5");
236 $content = mb_convert_encoding($content, "UTF-8", $charset);
// Walk the text one character at a time.
// NOTE(review): mb_strlen() is re-evaluated on every iteration and mb_substr() is
// called per character; this may be slow on long documents.
242 for ($i = 0; $i < mb_strlen($content); $i++){
244 $word = mb_substr($content, $i, 1);
// A character matching the cjkpreg() class is padded with spaces so that it becomes a word of its own.
246 if(preg_match('/'.$this->cjkpreg().'/u', $word)){
247 $words .= " {$word} ";
// Non-word characters get separate handling; that statement is not visible here.
251 if (preg_match('/[^\w]+/u', $word)) {
// Split the accumulated text on whitespace into the final word list.
259 $words = preg_split('/\s+/', trim($words));
260 //var_dump($words);exit;
264 function domExtractWords($node, $sentence, $charset)
// Recursively walks $node and accumulates text node contents in $sentence.
// Callers assign the result back to their own $sentence; the return statements
// themselves are on lines not visible in this excerpt.
// $charset is only passed down unchanged on the visible lines.
// A text node contributes its text content.
270 if ($node->nodeType == XML_TEXT_NODE) {
271 $sentence[] = $node->textContent;
// Nodes without children are checked here; the action taken is on a line not shown.
274 if (!$node->hasChildNodes()) {
// Visit every child in document order, threading the accumulator through.
278 for($i = 0; $i < $node->childNodes->length; $i++) {
280 $n = $node->childNodes->item($i);
282 $sentence = $this->domExtractWords($n, $sentence, $charset);
// Builds a regex character class by joining the Unicode ranges listed below.
// NOTE(review): the function header and the declaration of $ret are not visible in
// this excerpt; the name cjkpreg is inferred from the call in DomToStrings, and the
// early check on $ret suggests the built string is cached - confirm against the full file.
291 if ($ret !== false) {
296 $ret = '['.implode('', array(
297 "\x{0E00}-\x{0E7F}", // thai ??
298 "\x{2E80}-\x{2EFF}", # CJK Radicals Supplement
299 "\x{2F00}-\x{2FDF}", # Kangxi Radicals
300 "\x{2FF0}-\x{2FFF}", # Ideographic Description Characters
301 // "\x{3000}-\x{303F}", # CJK Symbols and Punctuation
302 "\x{3040}-\x{309F}", # Hiragana
303 "\x{30A0}-\x{30FF}", # Katakana
304 "\x{3100}-\x{312F}", # Bopomofo
305 "\x{3130}-\x{318F}", # Hangul Compatibility Jamo
306 "\x{3190}-\x{319F}", # Kanbun
307 "\x{31A0}-\x{31BF}", # Bopomofo Extended
308 "\x{31F0}-\x{31FF}", # Katakana Phonetic Extensions
309 "\x{3200}-\x{32FF}", # Enclosed CJK Letters and Months
310 "\x{3300}-\x{33FF}", # CJK Compatibility
311 "\x{3400}-\x{4DBF}", # CJK Unified Ideographs Extension A
312 "\x{4DC0}-\x{4DFF}", # Yijing Hexagram Symbols
313 "\x{4E00}-\x{9FFF}", # CJK Unified Ideographs
314 "\x{A000}-\x{A48F}", # Yi Syllables
315 "\x{A490}-\x{A4CF}", # Yi Radicals
316 "\x{AC00}-\x{D7AF}", # Hangul Syllables
317 "\x{F900}-\x{FAFF}", # CJK Compatibility Ideographs
318 "\x{FE30}-\x{FE4F}", # CJK Compatibility Forms
319 "\x{1D300}-\x{1D35F}", # Tai Xuan Jing Symbols
320 "\x{20000}-\x{2A6DF}", # CJK Unified Ideographs Extension B
321 "\x{2F800}-\x{2FA1F}" # CJK Compatibility Ideographs Supplement
333 * @param (array|string) $file either file path or array('string'=>'....')
335 * @return int $percent percentage of match
338 public function compare($file)
// Loads the comparison document into htmlDom and scores how many word pairs of
// $original are also present in $target, as an integer percentage of countTotal.
// $file is either an array with a string entry or the path of an existing file.
341 if (is_array($file)) {
342 $this->htmlDom = $file['string'];
345 if(is_string($file) && file_exists($file)){
346 $this->htmlDom = file_get_contents($file);
// NOTE(review): $m has no visible assignment in this excerpt; presumably it names
// the word-building method that is then run for the target side - confirm.
351 if(!method_exists($this, $m)){
352 trigger_error("Method not found ($m)");
// Walk every word pair of the original document.
360 foreach($this->original as $k => $t){
// Pairs absent from the target are tested here; presumably they are skipped.
362 if(!isset($this->target[$k])){
// Same count on both sides: credit that count.
366 if($this->original[$k] == $this->target[$k]){
367 $matchs += $this->original[$k];
// Original has more occurrences: credit only what the target has.
371 if($this->original[$k] > $this->target[$k]){
372 $matchs += $this->target[$k];
// Remaining case credits the original count. Assuming each branch above ends the
// iteration (those lines are not shown), the net effect is adding the smaller count.
376 $matchs += $this->original[$k];
// NOTE(review): no visible guard for countTotal being 0, which would make this a
// division by zero when the original document yields no word pairs - confirm.
380 $percent = ( $matchs / ($this->countTotal) * 100);
// Truncate to a whole percentage.
382 return (int)$percent;