return $sentence;
}

/**
 * Scratch list that addUTF8Word() appends to.
 *
 * domExtractWords() copies its current word list in here before running
 * preg_replace_callback() and reads it back afterwards; the callback has
 * no other way to hand matches back. Declared as false until first use.
 */
var $tmpWords = false;

/**
 * preg_replace_callback() handler: record the matched text as a word and
 * replace it with a single space in the subject string.
 *
 * @param array $s match array from preg_replace_callback(); $s[0] is the full match
 * @return string always a single space
 */
function addUTF8Word($s) {
    // Diagnostic dump only when debugging is switched on; printing it
    // unconditionally would leak into the program's normal output.
    if (!empty($this->debug_on)) {
        echo "calling addUTF8Word \n";
        print_r($s);
    }

    // The property defaults to false; appending to false relies on
    // auto-vivification, which PHP 8.1 deprecates, so convert explicitly.
    if ($this->tmpWords === false) {
        $this->tmpWords = array();
    }

    $this->tmpWords[] = $s[0];
    return ' ';
}

/**
 * Recursively collect word tokens from the text nodes at and below $node.
 *
 * For each text node with non-blank content the text is optionally
 * re-encoded, every match of the pattern returned by cjkpreg() is appended
 * to the list as its own token (via addUTF8Word()), the remaining
 * non-word characters are collapsed to spaces, and the whitespace-separated
 * pieces are appended as tokens. Child nodes are then processed in
 * document order.
 *
 * The wordMax limit is only checked on entry, so the returned list can
 * exceed it by the tokens of the node being processed when it was reached.
 *
 * @param DOMNode|null $node    node to walk; an empty value returns $words unchanged
 * @param array        $words   tokens collected so far
 * @param string       $charset "auto" leaves the text untouched; any other value is
 *                              handed to mb_convert_encoding() as described below
 * @return array $words with the newly found tokens appended
 */
function domExtractWords($node, $words, $charset="auto")
{
    // A positive wordMax caps the walk: stop once the list has grown past it.
    if ($this->wordMax > 0 && count($words) > $this->wordMax) {
        return $words;
    }
    if (empty($node)) {
        return $words;
    }

    $debug = !empty($this->debug_on);

    // Only read textContent for text nodes: on an element it would
    // concatenate the text of the whole subtree.
    // NOTE(review): the original author flagged a bug with this check on
    // "sina" pages; no further detail was recorded.
    $str = ($node->nodeType == XML_TEXT_NODE) ? trim($node->textContent) : '';

    if (strlen($str)) {
        if ($debug) {
            echo "node content : {$str} \n";
        }

        if ($charset != 'auto') {
            if (($this->lang == 'zh_HK' || $this->lang == 'zh_TW') && $charset == 'gb2312') {
                // Round-trip UTF-8 -> gb2312 -> BIG5 -> UTF-8.
                // Presumably this filters the text down to what BIG5 can
                // represent for the traditional-Chinese locales -- confirm.
                $str = mb_convert_encoding($str, $charset, "UTF-8");
                $str = mb_convert_encoding($str, "BIG5", $charset);
                $str = mb_convert_encoding($str, "UTF-8", "BIG5");
            } else {
                $str = mb_convert_encoding($str, "UTF-8", $charset);
            }
        }

        if ($debug) {
            echo "node content mb convert : {$str} \n";
        }

        // The callback cannot return extra data, so the word list travels
        // through $this->tmpWords: each match is appended there and replaced
        // by a space in $str.
        $this->tmpWords = $words;
        $str = preg_replace_callback('/'.$this->cjkpreg().'/u', array($this, 'addUTF8Word'), $str);
        $words = $this->tmpWords;

        // Collapse punctuation and any other non-word characters to spaces.
        $str = preg_replace('/[^\w]+/u', ' ', $str);

        if ($debug) {
            echo "after replace : {$str} \n";
        }

        // PREG_SPLIT_NO_EMPTY drops only empty pieces. The previous
        // truthiness test (!trim($word)) also threw away the token "0".
        $tokens = preg_split('/\s+/u', (string) $str, -1, PREG_SPLIT_NO_EMPTY);
        if ($tokens !== false) {
            foreach ($tokens as $word) {
                // fixme - break unicode chars
                $words[] = $word;
            }
        }
    }

    if (!$node->hasChildNodes()) {
        return $words;
    }

    foreach ($node->childNodes as $child) {
        $words = $this->domExtractWords($child, $words, $charset);
    }
    return $words;
}
function cjkpreg() {
static $ret = false;