StartAnalysis -> Get***Result
* 4、对主词典使用特殊格式进行编码, 不需要载入词典到内存操作
*
* @version $id:splitword.class.php 2 11:45 2011-2-14 itplato $
* @package DedeBIZ.Libraries
* @copyright Copyright (c) 2022 DedeBIZ.COM
* @license GNU GPL v2 (https://www.dedebiz.com/license)
* @link https://www.dedebiz.com
*/
//Constant definitions
//_SP_: two-byte separator (0xFF 0xFE) that LoadDict() uses to split a converted word list
define('_SP_', chr(0xFF).chr(0xFE));
//UCS2: iconv encoding name used for every internal string conversion in this class
define('UCS2', 'ucs-2be');
class SplitWord
{
//Hash option: modulus applied by _get_index() to turn a hash into a bucket number
var $mask_value = 0xFFFF;
//Input and output character encodings (only utf-8, gbk/gb2312/gb18030 and big5 are accepted)
var $sourceCharSet = 'utf-8';
var $targetCharSet = 'utf-8';
//Result type: 1 = everything, 2 = dictionary words plus single CJK (simplified/traditional) characters and English, 3 = dictionary words and English
var $resultType = 1;
//Sentences shorter than this are not split; notSplitLen = n (Chinese characters) * 2 + 1
var $notSplitLen = 5;
//Convert all English words to lower case
var $toLower = FALSE;
//Use maximum-segmentation mode to disambiguate two-character words
var $differMax = FALSE;
//Try to merge single characters
var $unitWord = TRUE;
//Load the dictionary as soon as the class is constructed
var $loadInit = TRUE;
//Use hot-word-first mode for disambiguation
var $differFreq = FALSE;
//Source string after conversion to UCS-2 (filled by SetSource())
var $sourceString = '';
//Addon dictionary: parsed entries and the file they are read from
var $addonDic = array();
var $addonDicFile = 'data/words_addons.dic';
//Main dictionary: file handle, per-bucket cache and file name
var $dicStr = '';
var $mainDic = array();
var $mainDicHand = FALSE;
var $mainDicInfos = array();
var $mainDicFile = 'data/base_dic_full.dic';
//Whether to load the whole dictionary up front (yes: slower to load but faster to parse; no: faster to load but slower to parse, entries are read only when needed)
//NOTE(review): this remark presumably describes $isLoadAll rather than the zip file name directly below — confirm
var $mainDicFileZip = 'data/base_dic_full.zip';
var $isLoadAll = FALSE;
var $isUnpacked = FALSE;
//Maximum length of a main-dictionary word, in bytes (x / 2 characters)
var $dicWordMax = 14;
//Array produced by the coarse split (typically used for cutting sentences and similar purposes)
var $simpleResult = array();
//Final result (list of words separated by spaces)
var $finallyResult = array();
//Whether the dictionary has already been loaded
var $isLoadDic = FALSE;
//New words recognised or merged by the system
var $newWords = array();
var $foundWordStr = '';
//Time spent loading the dictionaries (seconds, measured in LoadDict())
var $loadTime = 0;
/**
 * Constructor: records the charsets and source text, then optionally loads the dictionaries.
 *
 * @param string $source_charset charset of $source (utf-8, gb* or big5)
 * @param string $target_charset charset used for output
 * @param bool   $load_all       stored as-is in $isLoadAll
 * @param string $source         text to analyse; may be empty
 *
 * @return void
 */
function __construct($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source='')
{
    $this->SetSource( $source, $source_charset, $target_charset );
    $this->isLoadAll = $load_all;
    //NOTE(review): this probes DEDEINC.'/...' whereas LoadDict() prefixes DEDEINC.'/libraries/' — confirm which location is intended
    $unpackedDic = DEDEINC.'/'.$this->mainDicFile;
    if ( file_exists($unpackedDic) )
    {
        $this->isUnpacked = TRUE;
    }
    //Dictionaries are loaded immediately unless $loadInit has been switched off
    if ( $this->loadInit )
    {
        $this->LoadDict();
    }
}
//Legacy (PHP 4 style) constructor name; forwards every argument unchanged to __construct()
function SplitWord($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source='')
{
$this->__construct($source_charset, $target_charset, $load_all, $source);
}
/**
 * Destructor: releases the main dictionary file handle if one is still open.
 *
 * @return void
 */
function __destruct()
{
    //is_resource() is FALSE both for FALSE and for a handle that was already closed,
    //so fclose() is only ever called on a live stream (PHP 8 throws on anything else)
    if ( is_resource($this->mainDicHand) )
    {
        fclose( $this->mainDicHand );
    }
    $this->mainDicHand = FALSE;
}
/**
 * Compute the dictionary bucket number for a key.
 *
 * The key is hashed byte by byte from its last byte to its first
 * (h = h * 33 XOR byte, kept within 31 bits) and the result is reduced
 * modulo $this->mask_value.
 *
 * @param string $key bytes to hash
 * @return int bucket number in the range 0 .. mask_value - 1
 */
function _get_index( $key )
{
    $hash = 0x238f13af;
    for ($pos = strlen($key) - 1; $pos >= 0; $pos--)
    {
        //h + (h << 5) is h * 33; the mask keeps the running value inside 31 bits
        $hash = (($hash + ($hash << 5)) ^ ord($key[$pos])) & 0x7fffffff;
    }
    return ($hash % $this->mask_value);
}
/**
 * Look a word up in the main dictionary file.
 *
 * The file starts with an index of 8-byte records, one per bucket number
 * (see _get_index()): an unsigned int offset, a 16-bit length and a further
 * 16-bit field this method does not use. The bytes at that offset are a
 * serialized array keyed by word. Decoded buckets are cached in $mainDicInfos.
 *
 * @param string $key  word to look up
 * @param string $type 'word' returns the entry stored for $key; any other value
 *                     returns the whole bucket that contains $key
 * @return mixed the entry or bucket, or FALSE when the word is unknown or the
 *               dictionary cannot be read
 */
function GetWordInfos( $key, $type='word' )
{
    $keynum = (int)$this->_get_index( $key );
    if ( isset($this->mainDicInfos[ $keynum ]) )
    {
        $data = $this->mainDicInfos[ $keynum ];
    } else {
        //Open the dictionary lazily; without a usable handle nothing can be looked up
        if ( !$this->mainDicHand )
        {
            $this->mainDicHand = fopen($this->mainDicFile, 'r');
        }
        if ( !is_resource($this->mainDicHand) )
        {
            return FALSE;
        }
        //Read the 8-byte index record of this bucket
        if ( fseek($this->mainDicHand, $keynum * 8, SEEK_SET) !== 0 )
        {
            return FALSE;
        }
        $dat = fread($this->mainDicHand, 8);
        if ( $dat === FALSE || strlen($dat) < 8 )
        {
            return FALSE;
        }
        $arr = unpack('I1s/n1l/n1c', $dat);
        //A zero length marks an empty bucket
        if ( !is_array($arr) || $arr['l'] == 0 )
        {
            return FALSE;
        }
        if ( fseek($this->mainDicHand, $arr['s'], SEEK_SET) !== 0 )
        {
            return FALSE;
        }
        $raw = fread($this->mainDicHand, $arr['l']);
        //NOTE(review): unserialize() is only acceptable here because the dictionary is a local
        //file shipped with the application; never point $mainDicFile at untrusted data
        $data = ($raw === FALSE) ? FALSE : @unserialize($raw);
        $this->mainDicInfos[ $keynum ] = $data;
    }
    if ( !is_array($data) || !isset($data[$key]) )
    {
        return FALSE;
    }
    return ($type=='word' ? $data[$key] : $data);
}
/**
 * Set the text to analyse and the charsets used for input and output.
 *
 * Both charset names are stored lower-cased and the previous results are
 * cleared. A non-empty $source is converted to UCS2 and stored in
 * $sourceString. The charset family is recognised case-insensitively by its
 * prefix: "utf", "gb" or "big".
 *
 * @param string $source         text to analyse
 * @param string $source_charset charset of $source
 * @param string $target_charset charset used for output
 *
 * @return bool TRUE when $source was converted; FALSE when it is empty, the
 *              charset is not supported or the conversion failed
 */
function SetSource( $source, $source_charset='utf-8', $target_charset='utf-8' )
{
    $this->sourceCharSet = strtolower($source_charset);
    $this->targetCharSet = strtolower($target_charset);
    $this->simpleResult = array();
    $this->finallyResult = array();
    if ( $source == '' )
    {
        return FALSE;
    }
    //Match on the lower-cased name so that 'UTF-8', 'GBK' or 'Big5' are accepted too
    if ( strncmp($this->sourceCharSet, 'utf', 3) === 0 ) {
        $from = 'utf-8';
    }
    else if ( strncmp($this->sourceCharSet, 'gb', 2) === 0 ) {
        //gb18030 is used for every gb* name (gbk, gb2312, gb18030)
        $from = 'gb18030';
    }
    else if ( strncmp($this->sourceCharSet, 'big', 3) === 0 ) {
        $from = 'big5';
    } else {
        return FALSE;
    }
    $converted = @iconv($from, UCS2, $source);
    if ( $converted === FALSE )
    {
        //Do not keep a boolean in $sourceString and do not report success
        $this->sourceString = '';
        return FALSE;
    }
    $this->sourceString = $converted;
    return TRUE;
}
/**
 * Set the result type (stored in $resultType; the original note says it only
 * takes effect when finallyResult is fetched)
 * @param $rstype 1 = everything, 2 = special symbols removed
 *
 * @return void
 */
function SetResultType( $rstype )
{
$this->resultType = $rstype;
}
/**
 * Load the dictionaries.
 *
 * Resolves the dictionary file names below DEDEINC/libraries/, opens the main
 * dictionary (only when $isUnpacked is set) and parses the addon dictionary
 * into $addonDic. Addon lines whose second character is ':' select the current
 * group key (their first character); every other line is a comma separated
 * word list stored as $addonDic[group][UCS2 word] = byte length.
 *
 * @param string $maindic optional path of an alternative main dictionary;
 *                        ignored when empty or when the file does not exist
 * @return void
 */
function LoadDict( $maindic='' )
{
    //Prefix each path once only, so that a repeated call does not build a broken path
    $base = DEDEINC.'/libraries/';
    foreach ( array('addonDicFile', 'mainDicFile', 'mainDicFileZip') as $prop )
    {
        if ( strncmp($this->$prop, $base, strlen($base)) !== 0 )
        {
            $this->$prop = $base.$this->$prop;
        }
    }
    $startt = microtime(TRUE);
    $dicAddon = $this->addonDicFile;
    if ( $maindic=='' || !file_exists($maindic) )
    {
        $dicWords = $this->mainDicFile;
    } else {
        $dicWords = $maindic;
        $this->mainDicFile = $maindic;
    }
    //Main dictionary: only opened here, entries are read on demand
    if ( $this->isUnpacked )
    {
        //Release a handle left over from an earlier call before opening a new one
        if ( is_resource($this->mainDicHand) )
        {
            fclose($this->mainDicHand);
        }
        $this->mainDicHand = fopen($dicWords, 'r');
    }
    //Addon dictionary: a missing or unreadable file is treated as empty
    $ds = is_file($dicAddon) ? file($dicAddon) : FALSE;
    if ( $ds === FALSE )
    {
        $ds = array();
    }
    $hw = '';
    $spstr = NULL;
    foreach ( $ds as $d )
    {
        $d = trim($d);
        if ( $d=='' ) continue;
        if ( substr($d, 1, 1)==':' )
        {
            $hw = substr($d, 0, 1);
            continue;
        }
        //The utf-8 form of the separator is the same for every line; convert it once
        if ( $spstr === NULL )
        {
            $spstr = iconv(UCS2, 'utf-8', _SP_);
        }
        //Swap the commas for the separator, convert the whole line, then split on the separator
        $wall = iconv('utf-8', UCS2, str_replace(',', $spstr, $d));
        if ( $wall === FALSE ) continue;
        foreach ( explode(_SP_, $wall) as $estr )
        {
            $this->addonDic[$hw][$estr] = strlen($estr);
        }
    }
    $this->loadTime = microtime(TRUE) - $startt;
    $this->isLoadDic = TRUE;
}
/**
 * Tell whether a word exists in the main dictionary.
 *
 * @param string $word word to look up
 * @return bool TRUE unless GetWordInfos() returns FALSE for the word
 */
function IsWord( $word )
{
    return ( FALSE !== $this->GetWordInfos( $word ) );
}
/**
 * Build the property suffix of a word from its dictionary entry.
 *
 * @param string $word word in the internal unicode encoding
 * @return string "/" followed by element 1 and element 0 of the entry, or "/s"
 *                when the word is shorter than 4 bytes or has no such entry
 */
function GetWordProperty($word)
{
    if ( strlen($word) >= 4 )
    {
        $entry = $this->GetWordInfos($word);
        if ( isset($entry[1]) )
        {
            return '/'.$entry[1].$entry[0];
        }
    }
    return '/s';
}
/**
 * Register the property information of a word (usually a newly found word).
 *
 * Words shorter than 4 bytes are ignored. The first registration stores
 * $infos under the word in $mainDicInfos and sets its counter in $newWords
 * to 1; later registrations increment that counter and the stored 'c' value.
 *
 * @param string $word  word in the internal unicode encoding
 * @param array  $infos array('c' => frequency, 'm' => property)
 * @return void
 */
function SetWordInfos($word, $infos)
{
    if ( strlen($word)<4 )
    {
        return ;
    }
    if ( !isset($this->mainDicInfos[$word]) )
    {
        $this->newWords[$word] = 1;
        $this->mainDicInfos[$word] = $infos;
        return ;
    }
    //Counters are created on demand instead of incrementing an undefined key
    $this->newWords[$word] = isset($this->newWords[$word]) ? $this->newWords[$word] + 1 : 1;
    if ( is_array($this->mainDicInfos[$word]) )
    {
        $this->mainDicInfos[$word]['c'] = isset($this->mainDicInfos[$word]['c']) ? $this->mainDicInfos[$word]['c'] + 1 : 1;
    }
}
/**
 * Run the analysis on $sourceString.
 *
 * The string is read two bytes at a time and cut into coarse segments that
 * are stored in $simpleResult as array('w' => text, 't' => type). Segments
 * that are not pure numbers are passed to _deep_analysis(), and
 * _sort_finally_result() is called at the end. Text between the book-title
 * marks U+300A and U+300B is kept as one segment of type 13 and registered
 * as a new word.
 *
 * Segment types: 1 CJK text, 2 English/digits/symbols ('.', '@', '#', '+'),
 * 3 ANSI symbol, 4 pure number, 5 non-ANSI symbol or unsupported character.
 *
 * @param bool $optimize passed through to _deep_analysis()
 * @return void
 */
function StartAnalysis($optimize=TRUE)
{
    if ( !$this->isLoadDic )
    {
        $this->LoadDict();
    }
    $this->simpleResult = $this->finallyResult = array();
    //A trailing space guarantees that the last pending segment gets flushed
    $this->sourceString .= chr(0).chr(32);
    $slen = strlen($this->sourceString);
    //Full-width to half-width table: 0xFF00..0xFF5E map to 0x20..0x7E
    $sbcArr = array();
    for($i=0xFF00; $i < 0xFF5F; $i++)
    {
        $sbcArr[$i] = $i - 0xFEE0;
    }
    //Coarse split of the string
    $onstr = '';
    $lastc = 1;
    $s = 0;
    //Both patterns are constant, so they are built once instead of once per character
    $ansiWordMatch = '/[0-9a-z@#%\+\.-]/i';
    $notNumberMatch = '/[a-z@#%\+]/i';
    for($i=0; $i < $slen; $i++)
    {
        $c = $this->sourceString[$i].$this->sourceString[++$i];
        $cn = hexdec(bin2hex($c));
        $cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;
        //ANSI character
        if ($cn < 0x80)
        {
            if ( preg_match($ansiWordMatch, chr($cn)) )
            {
                //Word character: close a pending segment of another type first
                if ( $lastc != 2 && $onstr != '') {
                    $this->simpleResult[$s]['w'] = $onstr;
                    $this->simpleResult[$s]['t'] = $lastc;
                    $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                    $s++;
                    $onstr = '';
                }
                $lastc = 2;
                $onstr .= chr(0).chr($cn);
            } else {
                //ANSI symbol: close the pending segment
                if ( $onstr != '' )
                {
                    $this->simpleResult[$s]['w'] = $onstr;
                    if ( $lastc==2 )
                    {
                        //No letter or @#%+ inside: the segment is a pure number
                        if ( !preg_match($notNumberMatch, iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
                    }
                    $this->simpleResult[$s]['t'] = $lastc;
                    if ( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                    $s++;
                }
                $onstr = '';
                $lastc = 3;
                //Characters below 31 are dropped; the others become a segment of their own
                if ($cn < 31)
                {
                    continue;
                } else {
                    $this->simpleResult[$s]['w'] = chr(0).chr($cn);
                    $this->simpleResult[$s]['t'] = 3;
                    $s++;
                }
            }
        }
        //Non-ANSI character
        else
        {
            //Regular text
            if ( ($cn>0x3FFF && $cn < 0x9FA6) || ($cn>0xF8FF && $cn < 0xFA2D)
            || ($cn>0xABFF && $cn < 0xD7A4) || ($cn>0x3040 && $cn < 0x312B) )
            {
                if ( $lastc != 1 && $onstr != '')
                {
                    $this->simpleResult[$s]['w'] = $onstr;
                    if ( $lastc==2 )
                    {
                        if ( !preg_match($notNumberMatch, iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
                    }
                    $this->simpleResult[$s]['t'] = $lastc;
                    if ( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                    $s++;
                    $onstr = '';
                }
                $lastc = 1;
                $onstr .= $c;
            }
            //Special symbol
            else
            {
                if ( $onstr != '' )
                {
                    $this->simpleResult[$s]['w'] = $onstr;
                    if ( $lastc==2 )
                    {
                        if ( !preg_match($notNumberMatch, iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
                    }
                    $this->simpleResult[$s]['t'] = $lastc;
                    if ( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                    $s++;
                }
                //Book title: look ahead for the closing mark U+300B
                if ( $cn == 0x300A )
                {
                    $tmpw = '';
                    $n = 1;
                    $isok = FALSE;
                    $ew = chr(0x30).chr(0x0B);
                    while(TRUE)
                    {
                        //Stop as soon as either byte of the next character is missing
                        if (!isset($this->sourceString[$i+$n]) || !isset($this->sourceString[$i+$n+1]))
                        break;
                        $w = $this->sourceString[$i+$n].$this->sourceString[$i+$n+1];
                        if ( $w == $ew )
                        {
                            $this->simpleResult[$s]['w'] = $c;
                            $this->simpleResult[$s]['t'] = 5;
                            $s++;
                            $this->simpleResult[$s]['w'] = $tmpw;
                            //Test for a known title BEFORE marking it as known, otherwise it is never reported
                            if ( $tmpw != '' && !isset($this->newWords[$tmpw]) )
                            {
                                $this->foundWordStr .= $this->_out_string_encoding($tmpw).'/nb, ';
                                $this->SetWordInfos($tmpw, array('c'=>1, 'm'=>'nb'));
                            }
                            $this->newWords[$tmpw] = 1;
                            $this->simpleResult[$s]['t'] = 13;
                            $s++;
                            //In maximum-segmentation mode the title is segmented as well
                            if ( $this->differMax )
                            {
                                $this->simpleResult[$s]['w'] = $tmpw;
                                $this->simpleResult[$s]['t'] = 21;
                                $this->_deep_analysis($tmpw, $lastc, $s, $optimize);
                                $s++;
                            }
                            $this->simpleResult[$s]['w'] = $ew;
                            $this->simpleResult[$s]['t'] = 5;
                            $s++;
                            //Resume after the closing mark
                            $i = $i + $n + 1;
                            $isok = TRUE;
                            $onstr = '';
                            $lastc = 5;
                            break;
                        } else {
                            $n = $n+2;
                            $tmpw .= $w;
                            //Give up when no closing mark shows up within 60 bytes
                            if ( strlen($tmpw) > 60 )
                            {
                                break;
                            }
                        }
                    }//while
                    if ( !$isok )
                    {
                        //No closing mark: the opening mark is an ordinary symbol
                        $this->simpleResult[$s]['w'] = $c;
                        $this->simpleResult[$s]['t'] = 5;
                        $s++;
                        $onstr = '';
                        $lastc = 5;
                    }
                    continue;
                }
                $onstr = '';
                $lastc = 5;
                //U+3000 is dropped; any other symbol becomes a segment of its own
                if ( $cn==0x3000 )
                {
                    continue;
                } else {
                    $this->simpleResult[$s]['w'] = $c;
                    $this->simpleResult[$s]['t'] = 5;
                    $s++;
                }
            }
        }
    }
    //Post-process the segmentation result
    $this->_sort_finally_result();
}
/**
 * Segment one coarse segment in depth.
 *
 * Segments whose type is not 1 are appended to $finallyResult unchanged
 * (lower-cased when $toLower is set). A CJK segment (type 1) is passed to
 * _deep_analysis_cn(), unless it is shorter than both $notSplitLen and
 * 5 bytes: such a short segment is either merged into a preceding pure
 * number, when the addon dictionary group 'u' contains it or its first
 * character, or emitted as a single word.
 *
 * @param string $str      segment text; may be shortened when a suffix is split off
 * @param int    $ctype    segment type (1 = CJK text)
 * @param int    $spos     index of the segment in $simpleResult
 * @param bool   $optimize passed through to _deep_analysis_cn()
 * @return void
 */
function _deep_analysis( &$str, $ctype, $spos, $optimize=TRUE )
{
    //Non-CJK segment
    if ( $ctype != 1 )
    {
        if ( $this->toLower ) {
            $this->finallyResult[$spos][] = strtolower($str);
        } else {
            $this->finallyResult[$spos][] = $str;
        }
        return ;
    }
    $slen = strlen($str);
    //Segments of regular length go through the dictionary based segmentation
    if ( $slen >= $this->notSplitLen || $slen >= 5 )
    {
        $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize );
        return ;
    }
    //Short segment: look at the type of the previous coarse segment
    $lastType = 0;
    if ( $spos > 0 ) $lastType = $this->simpleResult[$spos-1]['t'];
    if ( $lastType==4 && ( isset($this->addonDic['u'][$str]) || isset($this->addonDic['u'][substr($str, 0, 2)]) ) )
    {
        $str2 = '';
        //Only the first character is in group 'u': split off the second one when group 's' contains it
        if ( !isset($this->addonDic['u'][$str]) && isset($this->addonDic['s'][substr($str, 2, 2)]) )
        {
            $str2 = substr($str, 2, 2);
            $str = substr($str, 0, 2);
        }
        //Merge into the preceding number, which keeps type 4
        $ww = $this->simpleResult[$spos - 1]['w'].$str;
        $this->simpleResult[$spos - 1]['w'] = $ww;
        $this->simpleResult[$spos - 1]['t'] = 4;
        if ( !isset($this->newWords[$this->simpleResult[$spos - 1]['w']]) )
        {
            $this->foundWordStr .= $this->_out_string_encoding( $ww ).'/mu, ';
            $this->SetWordInfos($ww, array('c'=>1, 'm'=>'mu'));
        }
        //The current segment has been consumed by the merge
        $this->simpleResult[$spos]['w'] = '';
        if ( $str2 != '' )
        {
            $this->finallyResult[$spos-1][] = $ww;
            $this->finallyResult[$spos-1][] = $str2;
        }
    } else {
        $this->finallyResult[$spos][] = $str;
    }
}
/**
* 中文的深入分词
* @parem $str
* @return void
*/
function _deep_analysis_cn( &$str, $lastec, $spos, $slen, $optimize=TRUE )
{
$quote1 = chr(0x20).chr(0x1C);
$tmparr = array();
$hasw = 0;
//如果前一个词为“,并且字符串小于3个字符当成一个词处理
if ( $spos > 0 && $slen < 11 && $this->simpleResult[$spos-1]['w']==$quote1 )
{
$tmparr[] = $str;
if ( !isset($this->newWords[$str]) )
{
$this->foundWordStr .= $this->_out_string_encoding($str).'/nq, ';
$this->SetWordInfos($str, array('c'=>1, 'm'=>'nq'));
}
if ( !$this->differMax )
{
$this->finallyResult[$spos][] = $str;
return ;
}
}
//进行切分
for($i=$slen-1; $i > 0; $i -= 2)
{
//单个词
$nc = $str[$i-1].$str[$i];
//是否已经到最后两个字
if ( $i <= 2 )
{
$tmparr[] = $nc;
$i = 0;
break;
}
$isok = FALSE;
$i = $i + 1;
for($k=$this->dicWordMax; $k>1; $k=$k-2)
{
if ($i < $k) continue;
$w = substr($str, $i-$k, $k);
if ( strlen($w) <= 2 )
{
$i = $i - 1;
break;
}
if ( $this->IsWord( $w ) )
{
$tmparr[] = $w;
$i = $i - $k + 1;
$isok = TRUE;
break;
}
}
//echo '