function buildWords($target = 'original')
{
static $cache= array();
+
if (isset($cache[md5($this->htmlDom)])) {
+
$this->$target = $cache[md5($this->htmlDom)];
if ($this->wordMax < 0) {
- $this->wordMax = array_sum(array_values($this->target)) * 10 ;
+ $this->wordMax = array_sum(array_values($this->$target)) * 10 ;
}
if($target == 'original'){
$ret = array();
$last_w = false;
- foreach($words as $word){
+ foreach($words as $str){
+
if(empty($str) || !trim(strlen($str))) {
continue;
}
-// if(!isset($ret[$str])){
-// $ret[$str] = 1;
-//
-// } else {
-// $ret[$str] += 1;
-// }
- // now deal with pairing..
+
if ($last_w !== false) {
if(!isset($ret[$last_w.'|'.$str])){
$ret[$last_w.'|'.$str] = 1;
-
} else {
$ret[$last_w.'|'.$str] += 1;
- }
-
+ }
}
+
$last_w = $str;
+
}
-// print_r($ret);
+
if($target == 'original'){
$this->countTotal = array_sum(array_values($ret));
}else{
$this->targetTotal= array_sum(array_values($ret));
}
+
$this->$target = $ret;
+
$cache[md5($this->htmlDom)] = $ret;
+
}
function DomToStrings($target = '')
@$pageDom->loadHTML(($charset == 'UTF-8' ? '<?xml version="1.0" encoding="UTF-8"?>' : ''). $searchPage);
- $sentence = $this->parse_node($pageDom->documentElement, array(), $charset);
+ $sentence = $this->domExtractWords($pageDom->documentElement, array(), $charset);
+
+
+
$content = implode('', $sentence);
$word = mb_substr($content, $i, 1);
if(preg_match('/'.$this->cjkpreg().'/u', $word)){
- $words .= " $word ";
+ $words .= " {$word} ";
continue;
}
- if (preg_match('/[^\w]+/', $word)) {
+ if (preg_match('/[^\w]+/u', $word)) {
$words .= ' ';
continue;
}
}
$words = preg_split('/\s+/', trim($words));
-
+ //var_dump($words);exit;
return $words;
}
- function parse_node($node, $sentence, $charset)
+ function domExtractWords($node, $sentence, $charset)
{
if (empty($node)) {
return $sentence;
$n = $node->childNodes->item($i);
- $sentence = $this->parse_node($n, $sentence, $charset);
+ $sentence = $this->domExtractWords($n, $sentence, $charset);
}
return $sentence;
}
-
- var $tmpWords = false;
- function addUTF8Word($s) {
-
- echo "calling addUTF8Word \n";
- print_R($s);
-
- $this->tmpWords[] = $s[0];
-// print_r($this->tmpWords);
- return ' ';
- }
-
- function domExtractWords($node, $words, $charset="auto")
- {
- if ($this->wordMax > 0 && count($words) > $this->wordMax) {
- return $words;
- }
- //echo count($words) ."\n";
- if (empty($node)) {
- return $words;
- }
-
- if ($node->nodeType == XML_TEXT_NODE && strlen(trim($node->textContent))) {// this is got the bug at sina....
-
- $str = trim($node->textContent);
-
- echo "node content : {$str} \n";
-
- if ($charset != 'auto') {
-
- if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
- //var_dump("ORIG" . $str);
- $str = mb_convert_encoding($str, $charset, "UTF-8");
- //var_dump("$charset:" .$str);
- $str = mb_convert_encoding($str, "BIG5",$charset);
- //var_dump("BIG5:".$str);
- $str = mb_convert_encoding($str, "UTF-8", "BIG5");
- //var_dump("UTF-8:".$str);
- } else {
- $str = mb_convert_encoding($str, "UTF-8", $charset);
- }
- }
-
- echo "node content mb convert : {$str} \n";
-
- //var_dump('xx'.$str);
- //var_dump($str);
- $this->tmpWords = $words;
- //if ($this->isSino()) {
- $str = preg_replace_callback('/'.$this->cjkpreg().'/u', array($this, 'addUTF8Word') , $str);
- //}
- $words = $this->tmpWords;
- // remove puncutianion..
- $str = preg_replace('/[^\w]+/u', ' ', $str);
-
- echo "after replace : {$str} \n";
-
- foreach(preg_split('/\s+/u', $str) as $word) {
- if($this->debug_on){
-// print_r(mb_detect_encoding($node->textContent));
- //print_r("\n");
- }
- if (!trim($word)) {
- continue;
- }
- // fixme - break unicode chars
- $words[] = $word;
- }
-
- }
- if (!$node->hasChildNodes()) {
- return $words;
- }
-
- for($i = 0; $i < $node->childNodes->length; $i++) {
-
- $n = $node->childNodes->item($i);
- //if($this->debug_on){
- // print_r($n);
- // print_r("\n");
- //}
- $words = $this->domExtractWords($n, $words,$charset);
- }
- return $words;
-
-
- }
function cjkpreg() {
static $ret = false;
return $ret;
}
+
$ret = '['.implode('', array(
+ "\x{0E00}-\x{0E7F}", // thai ??
"\x{2E80}-\x{2EFF}", # CJK Radicals Supplement
"\x{2F00}-\x{2FDF}", # Kangxi Radicals
"\x{2FF0}-\x{2FFF}", # Ideographic Description Characters
$this->htmlDom = $file['string'];
}
-// $this->debug_on = true;
-// print_r('is target');
if(is_string($file) && file_exists($file)){
$this->htmlDom = file_get_contents($file);
}
-
-
$m = 'buildWords';
+
if(!method_exists($this, $m)){
trigger_error("Method not found ($m)");
return;
}
- // if it's langauge is zh_HK or zh_TW -> then
- // convert
-
-
- //print_r($this);
$this->$m('target');
$matchs = 0;
- // print_r($this->original);
- // print_r($this->target);// exit;
- foreach($this->original as $k=>$t){
+
+ foreach($this->original as $k => $t){
+
if(!isset($this->target[$k])){
continue;
}
-// $matchs += $this->original[$k] + $this->target[$k];
if($this->original[$k] == $this->target[$k]){
$matchs += $this->original[$k];
continue;
$matchs += $this->target[$k];
continue;
}
+
$matchs += $this->original[$k];
}
-// print_r($matchs);
-// print_r("\n");
-// print_R(($this->countTotal + $this->targetTotal));
-// print_r("\n");
+
$percent = ( $matchs / ($this->countTotal) * 100);
+
return (int)$percent;
}