function buildWords($target = 'original')
{
static $cache= array();
+
if (isset($cache[md5($this->htmlDom)])) {
+
$this->$target = $cache[md5($this->htmlDom)];
if ($this->wordMax < 0) {
- $this->wordMax = array_sum(array_values($this->target)) * 10 ;
+ $this->wordMax = array_sum(array_values($this->$target)) * 10 ;
}
if($target == 'original'){
return;
}
- $a = $this->DomToStrings();
+ $words = $this->DomToStrings();
if ($this->wordMax < 0) {
- $this->wordMax = 10*count($a);
+ $this->wordMax = 10 * count($words);
}
+
if($this->debug_on){
- var_Dump("domstrings"); print_r($a);
-// exit;
+ var_Dump("domstrings"); print_r($words);
}
+
$ret = array();
$last_w = false;
- foreach($a as $str){
+ foreach($words as $str){
+
if(empty($str) || !trim(strlen($str))) {
continue;
}
-// if(!isset($ret[$str])){
-// $ret[$str] = 1;
-//
-// } else {
-// $ret[$str] += 1;
-// }
- // now deal with pairing..
+
if ($last_w !== false) {
if(!isset($ret[$last_w.'|'.$str])){
$ret[$last_w.'|'.$str] = 1;
-
} else {
$ret[$last_w.'|'.$str] += 1;
- }
-
+ }
}
+
$last_w = $str;
+
}
-// print_r($ret);
+
if($target == 'original'){
$this->countTotal = array_sum(array_values($ret));
}else{
$this->targetTotal= array_sum(array_values($ret));
}
+
$this->$target = $ret;
+
$cache[md5($this->htmlDom)] = $ret;
+
}
/**
 * Parse $this->htmlDom as HTML and split its text into word tokens.
 *
 * The text of every text node is concatenated, whitespace runs are collapsed,
 * and the result is scanned one character at a time:
 *  - a character matched by cjkpreg() becomes a token of its own,
 *  - any other non-word character acts as a separator,
 *  - everything else accumulates into the current word.
 *
 * @param string $target Not used by this method; kept so existing callers keep working.
 *
 * @return array List of tokens. When the document has no text this is array(''),
 *               which is what preg_split() yields for an empty subject.
 */
function DomToStrings($target = '')
{
    $charset = 'UTF-8';

    $pageDom = new DomDocument('1.0', $charset);
    $pageDom->formatOutput = true;

    // Strip any charset declaration from the markup; presumably so that the
    // XML prolog prepended below is what decides the encoding (confirm).
    $searchPage = preg_replace('#charset=([^"]+)#', '', $this->htmlDom);
    $prolog = ($charset == 'UTF-8' ? '<?xml version="1.0" encoding="UTF-8"?>' : '');

    // Real-world HTML makes libxml emit warnings; collect them internally
    // instead of silencing everything with "@", then restore the old setting.
    $previousSetting = libxml_use_internal_errors(true);
    $pageDom->loadHTML($prolog . $searchPage);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $sentence = $this->domExtractWords($pageDom->documentElement, array(), $charset);

    $content = implode('', $sentence);
    $content = preg_replace('/\n+/', ' ', $content);
    $content = preg_replace('/\s+/', ' ', $content);

    // NOTE: $charset is fixed to UTF-8 above, so only the else branch runs
    // today; the gb2312 round trip is kept for when the charset is detected.
    if ($charset != 'auto') {
        if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
            $content = mb_convert_encoding($content, $charset, "UTF-8");
            $content = mb_convert_encoding($content, "BIG5", $charset);
            $content = mb_convert_encoding($content, "UTF-8", "BIG5");
        } else {
            $content = mb_convert_encoding($content, "UTF-8", $charset);
        }
    }

    // Split into characters in one pass. Calling mb_strlen()/mb_substr() per
    // index rescans the string every time and is quadratic on long pages.
    $chars = preg_split('//u', $content, -1, PREG_SPLIT_NO_EMPTY);
    if ($chars === false) {
        // PCRE could not process the text; fall back to multibyte slicing.
        $chars = array();
        $length = mb_strlen($content, 'UTF-8');
        for ($i = 0; $i < $length; $i++) {
            $chars[] = mb_substr($content, $i, 1, 'UTF-8');
        }
    }

    // The pattern does not change between characters, so build it once.
    $cjkPattern = '/' . $this->cjkpreg() . '/u';

    $spaced = "";
    foreach ($chars as $char) {
        if (preg_match($cjkPattern, $char)) {
            // Characters in the cjkpreg() ranges are tokens on their own.
            $spaced .= " {$char} ";
            continue;
        }

        if (preg_match('/[^\w]+/u', $char)) {
            // Punctuation and other non-word characters only separate words.
            $spaced .= ' ';
            continue;
        }

        $spaced .= $char;
    }

    return preg_split('/\s+/', trim($spaced));
}
-
-
- var $tmpWords = false;
- function addUTF8Word($s) {
- $this->tmpWords[] = $s[0];
-// print_r($this->tmpWords);
- return ' ';
- }
-
/**
 * Collect the text of every text node under $node, in document order.
 *
 * The subtree is walked with an explicit stack rather than recursion, so the
 * accumulator is appended to in place (no per-call array copy) and deeply
 * nested markup cannot exhaust the call stack.
 *
 * @param DOMNode|null $node     Root of the subtree to walk; an empty value
 *                               returns $sentence unchanged.
 * @param array        $sentence Fragments gathered so far; new ones are appended.
 * @param string       $charset  Not used by this method; accepted so both the
 *                               two- and three-argument call forms work.
 *
 * @return array $sentence followed by the textContent of each text node found.
 */
function domExtractWords($node, $sentence, $charset = "auto")
{
    if (empty($node)) {
        return $sentence;
    }

    $pending = array($node);

    while (!empty($pending)) {
        $current = array_pop($pending);

        if ($current->nodeType == XML_TEXT_NODE) {
            $sentence[] = $current->textContent;
        }

        // Push children last-to-first so the first child is popped (visited)
        // next, which keeps the traversal in document order.
        for ($child = $current->lastChild; $child !== null; $child = $child->previousSibling) {
            $pending[] = $child;
        }
    }

    return $sentence;
}
+
function cjkpreg() {
static $ret = false;
return $ret;
}
+
$ret = '['.implode('', array(
+ "\x{0E00}-\x{0E7F}", // thai ??
"\x{2E80}-\x{2EFF}", # CJK Radicals Supplement
"\x{2F00}-\x{2FDF}", # Kangxi Radicals
"\x{2FF0}-\x{2FFF}", # Ideographic Description Characters
- "\x{3000}-\x{303F}", # CJK Symbols and Punctuation
+// "\x{3000}-\x{303F}", # CJK Symbols and Punctuation
"\x{3040}-\x{309F}", # Hiragana
"\x{30A0}-\x{30FF}", # Katakana
"\x{3100}-\x{312F}", # Bopomofo
$this->htmlDom = $file['string'];
}
-// $this->debug_on = true;
-// print_r('is target');
if(is_string($file) && file_exists($file)){
$this->htmlDom = file_get_contents($file);
}
-
-
$m = 'buildWords';
+
if(!method_exists($this, $m)){
trigger_error("Method not found ($m)");
return;
}
- // if it's langauge is zh_HK or zh_TW -> then
- // convert
-
-
- //print_r($this);
$this->$m('target');
$matchs = 0;
- // print_r($this->original);
- // print_r($this->target);// exit;
- foreach($this->original as $k=>$t){
+
+ foreach($this->original as $k => $t){
+
if(!isset($this->target[$k])){
continue;
}
-// $matchs += $this->original[$k] + $this->target[$k];
if($this->original[$k] == $this->target[$k]){
$matchs += $this->original[$k];
continue;
$matchs += $this->target[$k];
continue;
}
+
$matchs += $this->original[$k];
}
-// print_r($matchs);
-// print_r("\n");
-// print_R(($this->countTotal + $this->targetTotal));
-// print_r("\n");
+
$percent = ( $matchs / ($this->countTotal) * 100);
+
return (int)$percent;
}