国内流行的内容管理系统(CMS)多端全媒体解决方案 https://www.dedebiz.com
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1033 lines
37KB

  1. <?php
  // Direct-access guard: stop immediately (printing 'dedebiz') unless the including
  // script has already defined the DEDEINC constant.
  if (!defined('DEDEINC')) exit ('dedebiz');
  3. /**
  4. * Unicode编码词典的php分词器
  5. *
  6. * 1、只适用于php5,必要函数 iconv
  7. * 2、本程序是使用RMM逆向匹配算法进行分词的,词库需要特别编译,本类里提供了 MakeDict() 方法
  8. * 3、简单操作流程:SetSource -> StartAnalysis -> Get***Result
  9. * 4、对主词典使用特殊格式进行编码, 不需要载入词典到内存操作
  10. *
  11. * @version $id:splitword.class.php 2 11:45 2011-2-14 itplato $
  12. * @package DedeBIZ.Libraries
  13. * @copyright Copyright (c) 2022 DedeBIZ.COM
  14. * @license GNU GPL v2 (https://www.dedebiz.com/license)
  15. * @link https://www.dedebiz.com
  16. */
  // Constant definitions.
  // UCS2: iconv name of the internal working encoding (big-endian UCS-2).
  // _SP_: two-byte marker (0xFF 0xFE) used as a separator when add-on dictionary
  //       entries are converted to UCS-2 in bulk.
  define('UCS2', 'ucs-2be');
  define('_SP_', "\xFF\xFE");
  20. class SplitWord
  21. {
  // Hash option: modulus used by _get_index() to turn a word into a bucket number
  var $mask_value = 0xFFFF;
  // Input and output character sets (only utf-8, gbk/gb2312/gb18030 and big5 are handled)
  var $sourceCharSet = 'utf-8';
  var $targetCharSet = 'utf-8';
  // Type of result produced: 1 = everything, 2 = dictionary words plus single CJK
  // characters and English, 3 = dictionary words and English
  var $resultType = 1;
  // Sentences shorter than this are not split; notSplitLen = n (Chinese characters) * 2 + 1
  var $notSplitLen = 5;
  // Convert all English words to lower case
  var $toLower = FALSE;
  // Use maximum-segmentation mode to disambiguate two-word combinations
  var $differMax = FALSE;
  // Try to merge single characters into new words
  var $unitWord = TRUE;
  // Load the dictionaries directly when the class is constructed
  var $loadInit = TRUE;
  // Use popular-word-first mode for disambiguation
  var $differFreq = FALSE;
  // Source string after conversion to unicode (UCS-2)
  var $sourceString = '';
  // Add-on dictionary: $addonDic[<group letter>][<UCS-2 word>] = byte length
  var $addonDic = array();
  var $addonDicFile = 'data/words_addons.dic';
  // Main dictionary
  var $dicStr = '';
  var $mainDic = array();
  // Open file handle of the main dictionary, or FALSE while it is not open
  var $mainDicHand = FALSE;
  // Cache of main-dictionary buckets that have already been read
  var $mainDicInfos = array();
  var $mainDicFile = 'data/base_dic_full.dic';
  // Zip archive of the main dictionary; within this class only its path is resolved (in LoadDict)
  var $mainDicFileZip = 'data/base_dic_full.zip';
  // Whether to load the whole dictionary up front (yes: slower to load but faster to
  // parse; no: faster to load but slower to parse, entries are read only when needed)
  var $isLoadAll = FALSE;
  // Set to TRUE by the constructor when the main dictionary file is found on disk
  var $isUnpacked = FALSE;
  // Maximum length of a main-dictionary word, x / 2
  var $dicWordMax = 14;
  // Array produced by the coarse split (typically used for cutting out sentences etc.)
  var $simpleResult = array();
  // Final result (list of words)
  var $finallyResult = array();
  // Whether the dictionaries have been loaded
  var $isLoadDic = FALSE;
  // New words recognised or merged by the analyser
  var $newWords = array();
  var $foundWordStr = '';
  // Time spent loading the dictionaries
  var $loadTime = 0;
  /**
   * Constructor.
   *
   * Stores the source text and the charsets, records whether the unpacked main
   * dictionary is present on disk and, when $loadInit is TRUE, loads the
   * dictionaries straight away.
   *
   * @param string $source_charset charset of $source (utf-8, gb* or big5)
   * @param string $target_charset charset used for the words that are returned
   * @param bool   $load_all       stored in $isLoadAll
   * @param string $source         text to analyse (may be empty)
   *
   * @return void
   */
  function __construct($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source='')
  {
      $this->SetSource( $source, $source_charset, $target_charset );
      $this->isLoadAll = $load_all;
      // Probe the location LoadDict() actually opens (DEDEINC/libraries/<mainDicFile>);
      // checking DEDEINC/<mainDicFile> made isUnpacked describe a different file.
      if ( file_exists(DEDEINC.'/libraries/'.$this->mainDicFile) )
      {
          $this->isUnpacked = TRUE;
      }
      if ( $this->loadInit )
      {
          $this->LoadDict();
      }
  }
  /**
   * Method named after the class (old-style constructor name); it only forwards
   * its arguments, unchanged, to __construct().
   */
  function SplitWord($source_charset='utf-8', $target_charset='utf-8', $load_all=TRUE, $source='')
  {
  $this->__construct($source_charset, $target_charset, $load_all, $source);
  }
  /**
   * Destructor: closes the main dictionary file handle if one was opened.
   */
  function __destruct()
  {
      if ( $this->mainDicHand === FALSE )
      {
          return;
      }
      @fclose( $this->mainDicHand );
  }
  /**
   * Compute the bucket index of a key.
   *
   * Walks the key from its last byte to its first, mixing each byte into a
   * 31-bit hash, and reduces the result modulo $mask_value.
   *
   * @param string $key
   * @return int bucket number in the range 0 .. $mask_value - 1
   */
  function _get_index( $key )
  {
      $hash = 0x238f13af;
      for ($pos = strlen($key) - 1; $pos >= 0; $pos--)
      {
          $hash = ($hash + ($hash << 5)) ^ ord($key[$pos]);
          $hash &= 0x7fffffff;
      }
      return $hash % $this->mask_value;
  }
  /**
   * Look a word up in the main dictionary file.
   *
   * The file starts with a table of 8-byte header slots (offset, length, count),
   * one per bucket; the bucket itself is a serialized array keyed by word.
   * Buckets that have been read are cached in $mainDicInfos.
   *
   * @param string $key  UCS-2 encoded word
   * @param string $type 'word' to get the entry of $key, anything else (e.g.
   *                     'key_groups') to get the whole bucket containing $key
   * @return array|false the entry / bucket, or FALSE when the word is unknown or
   *                     the dictionary cannot be read
   */
  function GetWordInfos( $key, $type='word' )
  {
      $keynum = (int)$this->_get_index( $key );
      if ( isset($this->mainDicInfos[ $keynum ]) )
      {
          $data = $this->mainDicInfos[ $keynum ];
      } else {
          if ( !$this->mainDicHand )
          {
              // Binary mode: the file holds packed integers and serialized data.
              $this->mainDicHand = fopen($this->mainDicFile, 'rb');
              if ( !$this->mainDicHand )
              {
                  // Without a dictionary no word can be confirmed.
                  $this->mainDicHand = FALSE;
                  return FALSE;
              }
          }
          fseek($this->mainDicHand, $keynum * 8, SEEK_SET);
          $dat = fread($this->mainDicHand, 8);
          if ( $dat === FALSE || strlen($dat) < 8 )
          {
              // Truncated or unreadable header slot.
              return FALSE;
          }
          $arr = unpack('I1s/n1l/n1c', $dat);
          if ( $arr['l'] == 0 )
          {
              // Empty bucket.
              return FALSE;
          }
          fseek($this->mainDicHand, $arr['s'], SEEK_SET);
          $data = @unserialize(fread($this->mainDicHand, $arr['l']));
          $this->mainDicInfos[ $keynum ] = $data;
      }
      if ( !is_array($data) || !isset($data[$key]) )
      {
          return FALSE;
      }
      return ($type=='word' ? $data[$key] : $data);
  }
  /**
   * Set the source string and the charsets.
   *
   * The text is converted to UCS-2 and stored in $sourceString; previous coarse
   * and final results are cleared. When $source is empty or the charset is not
   * supported, $sourceString is left as it was.
   *
   * @param string $source         text to analyse
   * @param string $source_charset charset of $source: utf*, gb* or big* (case-insensitive)
   * @param string $target_charset charset used for returned words
   *
   * @return bool TRUE when the text was converted and stored, FALSE when $source
   *              is empty, the charset is unsupported or the conversion failed
   */
  function SetSource( $source, $source_charset='utf-8', $target_charset='utf-8' )
  {
      $this->sourceCharSet = strtolower($source_charset);
      $this->targetCharSet = strtolower($target_charset);
      $this->simpleResult = array();
      $this->finallyResult = array();
      if ( $source == '' )
      {
          return FALSE;
      }
      // Match against the lower-cased name so 'UTF-8', 'GBK', 'Big5' are accepted too.
      $charset = $this->sourceCharSet;
      if ( preg_match("/^utf/", $charset) ) {
          $utf8 = $source;
      }
      else if ( preg_match("/^gb/", $charset) ) {
          $utf8 = @iconv('gb18030', 'utf-8', $source);
      }
      else if ( preg_match("/^big/", $charset) ) {
          $utf8 = @iconv('big5', 'utf-8', $source);
      } else {
          return FALSE;
      }
      $converted = ($utf8 === FALSE) ? FALSE : @iconv('utf-8', UCS2, $utf8);
      if ( $converted === FALSE )
      {
          // Conversion failed: do not leave FALSE in a property that is used as a string.
          $this->sourceString = '';
          return FALSE;
      }
      $this->sourceString = $converted;
      return TRUE;
  }
  /**
   * Set the result type (only takes effect when reading finallyResult).
   * @param $rstype 1 = everything, 2 = drop special symbols
   *
   * @return void
   */
  function SetResultType( $rstype )
  {
  $this->resultType = $rstype;
  }
  /**
   * Load the dictionaries.
   *
   * Resolves the dictionary paths below DEDEINC/libraries/, opens the main
   * dictionary (only when $isUnpacked is TRUE) and reads the add-on dictionary
   * into $addonDic. In the add-on file a line of the form "x:..." selects group
   * letter x; every other non-empty line is a comma separated word list that is
   * added to the current group.
   *
   * @param string $maindic optional path of an alternative main dictionary; used
   *                        only when the file exists
   * @return void
   */
  function LoadDict( $maindic='' )
  {
      $base = DEDEINC.'/libraries/';
      // Prefix each path only once so that calling LoadDict() again does not
      // produce "<base><base>data/...".
      if ( strpos($this->addonDicFile, $base) !== 0 )
      {
          $this->addonDicFile = $base.$this->addonDicFile;
      }
      if ( strpos($this->mainDicFile, $base) !== 0 )
      {
          $this->mainDicFile = $base.$this->mainDicFile;
      }
      if ( strpos($this->mainDicFileZip, $base) !== 0 )
      {
          $this->mainDicFileZip = $base.$this->mainDicFileZip;
      }
      $startt = microtime(TRUE);
      $dicAddon = $this->addonDicFile;
      if ( $maindic=='' || !file_exists($maindic) )
      {
          $dicWords = $this->mainDicFile;
      } else {
          $dicWords = $maindic;
          $this->mainDicFile = $maindic;
      }
      //Main dictionary: only opened here, entries are read on demand
      if ( $this->isUnpacked )
      {
          if ( $this->mainDicHand !== FALSE )
          {
              // Do not leak the handle of a previous load.
              @fclose( $this->mainDicHand );
          }
          $this->mainDicHand = fopen($dicWords, 'rb');
      }
      //Add-on dictionary
      $ds = file($dicAddon);
      if ( $ds === FALSE )
      {
          // file() has already raised a warning; continue with no add-on words.
          $ds = array();
      }
      // UTF-8 form of the separator; identical for every line, so convert it once.
      $spstr = iconv(UCS2, 'utf-8', _SP_);
      $hw = '';
      foreach($ds as $d)
      {
          $d = trim($d);
          if ( $d=='' ) continue;
          if ( substr($d, 1, 1)==':' ) {
              // Group header line: remember the group letter.
              $hw = substr($d, 0, 1);
          } else {
              // Join the words with the separator, convert the whole line to
              // UCS-2 in one go, then split it again on the UCS-2 separator.
              $ws = explode(',', $d);
              $wall = iconv('utf-8', UCS2, join($spstr, $ws));
              $ws = explode(_SP_, $wall);
              foreach($ws as $word)
              {
                  $this->addonDic[$hw][$word] = strlen($word);
              }
          }
      }
      $this->loadTime = microtime(TRUE) - $startt;
      $this->isLoadDic = TRUE;
  }
  /**
   * Tell whether a word exists in the main dictionary.
   *
   * @param string $word UCS-2 encoded word
   * @return bool TRUE when GetWordInfos() finds an entry for the word
   */
  function IsWord( $word )
  {
      return $this->GetWordInfos( $word ) !== FALSE;
  }
  /**
   * Get the part-of-speech / frequency tag of a word.
   *
   * @param string $word UCS-2 encoded word
   * @return string "/" followed by elements 1 and 0 of the dictionary entry, or
   *                "/s" for words shorter than 4 bytes and for unknown words
   */
  function GetWordProperty($word)
  {
      if ( strlen($word) >= 4 )
      {
          $infos = $this->GetWordInfos($word);
          if ( isset($infos[1]) )
          {
              return '/'.$infos[1].$infos[0];
          }
      }
      return '/s';
  }
  /**
   * Record the information of a word (normally a newly recognised one).
   *
   * Words shorter than 4 bytes are ignored. A word seen for the first time is
   * stored with a counter of 1; for a word that is already stored both the
   * counter in $newWords and its 'c' value are incremented.
   *
   * @param string $word  UCS-2 encoded word
   * @param array  $infos array('c' => frequency, 'm' => part of speech)
   * @return void
   */
  function SetWordInfos($word, $infos)
  {
      if ( strlen($word) >= 4 )
      {
          if ( !isset($this->mainDicInfos[$word]) )
          {
              $this->newWords[$word] = 1;
              $this->mainDicInfos[$word] = $infos;
          } else {
              $this->newWords[$word]++;
              $this->mainDicInfos[$word]['c']++;
          }
      }
  }
  /**
   * Run the analysis on the current source string.
   *
   * Pass 1 (this method) walks the UCS-2 string two bytes at a time and cuts it
   * into coarse segments stored in $simpleResult as array('w' => text, 't' => type).
   * Segment types: 1 CJK text, 2 English/digits/symbols ('.', '@', '#', '+'),
   * 3 ANSI symbol, 4 pure number, 5 non-ANSI symbol or unsupported character,
   * 13 book title found between U+300A and U+300B, 21 copy of a book title kept
   * for further splitting in maximum-segmentation mode.
   * Each finished text segment is handed to _deep_analysis(); at the end
   * _sort_finally_result() merges everything into $finallyResult.
   *
   * @param bool $optimize whether the fine-grained result should be optimised
   * @return void
   */
  function StartAnalysis($optimize=TRUE)
  {
      if ( !$this->isLoadDic )
      {
          $this->LoadDict();
      }
      $this->simpleResult = $this->finallyResult = array();
      // A trailing UCS-2 space makes the loop below flush the last pending segment.
      $this->sourceString .= chr(0).chr(32);
      $slen = strlen($this->sourceString);
      //Full-width to half-width table: U+FF00..U+FF5E => 0x20..0x7E
      $sbcArr = array();
      $j = 0;
      for($i=0xFF00; $i < 0xFF5F; $i++)
      {
          $scb = 0x20 + $j;
          $j++;
          $sbcArr[$i] = $scb;
      }
      //Coarse split of the string
      $onstr = '';   // text of the segment being collected
      $lastc = 1;    // type of the segment being collected (see list above)
      $s = 0;        // index of the next coarse segment
      $ansiWordMatch = "[0-9a-z@#%\+\.-]";
      $notNumberMatch = "[a-z@#%\+]";
      for($i=0; $i < $slen; $i++)
      {
          // A dangling single byte cannot form a UCS-2 unit.
          if ( !isset($this->sourceString[$i+1]) )
          {
              break;
          }
          $c = $this->sourceString[$i].$this->sourceString[++$i];
          $cn = hexdec(bin2hex($c));
          $cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;
          //ANSI character
          if ($cn < 0x80)
          {
              if ( preg_match('/'.$ansiWordMatch.'/i', chr($cn)) )
              {
                  // Word character: close a pending segment of another type first.
                  if ( $lastc != 2 && $onstr != '') {
                      $this->simpleResult[$s]['w'] = $onstr;
                      $this->simpleResult[$s]['t'] = $lastc;
                      $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                      $s++;
                      $onstr = '';
                  }
                  $lastc = 2;
                  $onstr .= chr(0).chr($cn);
              } else {
                  // ANSI symbol: close the pending segment.
                  if ( $onstr != '' )
                  {
                      $this->simpleResult[$s]['w'] = $onstr;
                      if ( $lastc==2 )
                      {
                          // No letter or @#%+ inside: the segment is a pure number.
                          if ( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
                      }
                      $this->simpleResult[$s]['t'] = $lastc;
                      if ( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                      $s++;
                  }
                  $onstr = '';
                  $lastc = 3;
                  if ($cn < 31)
                  {
                      // Control characters are dropped.
                      continue;
                  } else {
                      $this->simpleResult[$s]['w'] = chr(0).chr($cn);
                      $this->simpleResult[$s]['t'] = 3;
                      $s++;
                  }
              }
          }
          //Non-ANSI character
          else
          {
              //Regular text
              if ( ($cn>0x3FFF && $cn < 0x9FA6) || ($cn>0xF8FF && $cn < 0xFA2D)
              || ($cn>0xABFF && $cn < 0xD7A4) || ($cn>0x3040 && $cn < 0x312B) )
              {
                  if ( $lastc != 1 && $onstr != '')
                  {
                      $this->simpleResult[$s]['w'] = $onstr;
                      if ( $lastc==2 )
                      {
                          if ( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
                      }
                      $this->simpleResult[$s]['t'] = $lastc;
                      if ( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                      $s++;
                      $onstr = '';
                  }
                  $lastc = 1;
                  $onstr .= $c;
              }
              //Special symbol
              else
              {
                  if ( $onstr != '' )
                  {
                      $this->simpleResult[$s]['w'] = $onstr;
                      if ( $lastc==2 )
                      {
                          if ( !preg_match('/'.$notNumberMatch.'/i', iconv(UCS2, 'utf-8', $onstr)) ) $lastc = 4;
                      }
                      $this->simpleResult[$s]['t'] = $lastc;
                      if ( $lastc != 4 ) $this->_deep_analysis($onstr, $lastc, $s, $optimize);
                      $s++;
                  }
                  //Book title detection: U+300A opens, U+300B closes
                  if ( $cn == 0x300A )
                  {
                      $tmpw = '';
                      $n = 1;
                      $isok = FALSE;
                      $ew = chr(0x30).chr(0x0B);
                      while(TRUE)
                      {
                          // Stop as soon as either byte of the next unit is missing.
                          if (!isset($this->sourceString[$i+$n]) || !isset($this->sourceString[$i+$n+1]))
                          break;
                          $w = $this->sourceString[$i+$n].$this->sourceString[$i+$n+1];
                          if ( $w == $ew )
                          {
                              $this->simpleResult[$s]['w'] = $c;
                              $this->simpleResult[$s]['t'] = 5;
                              $s++;
                              $this->simpleResult[$s]['w'] = $tmpw;
                              // Register the title as a new word the first time it is seen.
                              // The check has to come before the word is marked as known,
                              // otherwise the registration can never run.
                              if ( $tmpw != '' && !isset($this->newWords[$tmpw]) )
                              {
                                  $this->foundWordStr .= $this->_out_string_encoding($tmpw).'/nb, ';
                                  $this->SetWordInfos($tmpw, array('c'=>1, 'm'=>'nb'));
                                  // SetWordInfos() ignores words shorter than 4 bytes; mark
                                  // those as seen here so they are reported only once.
                                  $this->newWords[$tmpw] = 1;
                              }
                              $this->simpleResult[$s]['t'] = 13;
                              $s++;
                              //Maximum-segmentation mode: split the title further
                              if ( $this->differMax )
                              {
                                  $this->simpleResult[$s]['w'] = $tmpw;
                                  $this->simpleResult[$s]['t'] = 21;
                                  $this->_deep_analysis($tmpw, $lastc, $s, $optimize);
                                  $s++;
                              }
                              $this->simpleResult[$s]['w'] = $ew;
                              $this->simpleResult[$s]['t'] = 5;
                              $s++;
                              // Continue scanning after the closing mark.
                              $i = $i + $n + 1;
                              $isok = TRUE;
                              $onstr = '';
                              $lastc = 5;
                              break;
                          } else {
                              $n = $n+2;
                              $tmpw .= $w;
                              // Give up on titles longer than 60 bytes.
                              if ( strlen($tmpw) > 60 )
                              {
                                  break;
                              }
                          }
                      }//while
                      if ( !$isok )
                      {
                          // No closing mark: keep the opening mark as a plain symbol.
                          $this->simpleResult[$s]['w'] = $c;
                          $this->simpleResult[$s]['t'] = 5;
                          $s++;
                          $onstr = '';
                          $lastc = 5;
                      }
                      continue;
                  }
                  $onstr = '';
                  $lastc = 5;
                  if ( $cn==0x3000 )
                  {
                      // Ideographic space is dropped.
                      continue;
                  } else {
                      $this->simpleResult[$s]['w'] = $c;
                      $this->simpleResult[$s]['t'] = 5;
                      $s++;
                  }
              }
          }
      }
      //Merge coarse and fine results
      $this->_sort_finally_result();
  }
  /**
   * Fine-grained analysis of one coarse segment.
   *
   * CJK segments ($ctype == 1) shorter than $notSplitLen bytes are handled
   * specially: a segment of at most two characters that follows a pure number
   * and is (or starts with) a unit word from add-on group 'u' is merged into
   * that number; other short segments are kept whole. Everything else goes to
   * _deep_analysis_cn(). Segments of any other type are stored as one word,
   * lower-cased when $toLower is set.
   *
   * @param string $str      segment text (UCS-2); may be shortened by this method
   * @param int    $ctype    segment type: 1 = CJK text, anything else is treated as ANSI text
   * @param int    $spos     index of the segment in $simpleResult
   * @param bool   $optimize passed on to _deep_analysis_cn()
   * @return void
   */
  function _deep_analysis( &$str, $ctype, $spos, $optimize=TRUE )
  {
      //CJK sentence
      if ( $ctype==1 )
      {
          $slen = strlen($str);
          //Sentence shorter than the configured minimum split length
          if ( $slen < $this->notSplitLen )
          {
              $lastType = 0;
              if ( $spos > 0 ) $lastType = $this->simpleResult[$spos-1]['t'];
              if ($slen < 5)
              {
                  if ( $lastType==4 && ( isset($this->addonDic['u'][$str]) || isset($this->addonDic['u'][substr($str, 0, 2)]) ) )
                  {
                      $str2 = '';
                      // Only the first character is a unit and the second one is in
                      // group 's': split them and merge just the unit.
                      if ( !isset($this->addonDic['u'][$str]) && isset($this->addonDic['s'][substr($str, 2, 2)]) )
                      {
                          $str2 = substr($str, 2, 2);
                          $str = substr($str, 0, 2);
                      }
                      // Append the unit to the preceding number segment.
                      $ww = $this->simpleResult[$spos - 1]['w'].$str;
                      $this->simpleResult[$spos - 1]['w'] = $ww;
                      $this->simpleResult[$spos - 1]['t'] = 4;
                      if ( !isset($this->newWords[$this->simpleResult[$spos - 1]['w']]) )
                      {
                          $this->foundWordStr .= $this->_out_string_encoding( $ww ).'/mu, ';
                          $this->SetWordInfos($ww, array('c'=>1, 'm'=>'mu'));
                      }
                      // The current segment has been consumed.
                      $this->simpleResult[$spos]['w'] = '';
                      if ( $str2 != '' )
                      {
                          $this->finallyResult[$spos-1][] = $ww;
                          $this->finallyResult[$spos-1][] = $str2;
                      }
                  } else {
                      $this->finallyResult[$spos][] = $str;
                  }
              } else {
                  $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize );
              }
          }
          //Sentence of normal length: dictionary based splitting
          else {
              $this->_deep_analysis_cn( $str, $ctype, $spos, $slen, $optimize );
          }
      }
      //ANSI text: optionally converted to lower case
      else {
          if ( $this->toLower ) {
              $this->finallyResult[$spos][] = strtolower($str);
          } else {
              $this->finallyResult[$spos][] = $str;
          }
      }
  }
  /**
   * Dictionary based splitting of a CJK segment.
   *
   * Scans the segment from its end towards its start; at each position the
   * longest dictionary word (at most $dicWordMax bytes) ending there is taken,
   * otherwise the single character. The words are stored in reading order in
   * $finallyResult[$spos] and, when $optimize is set, passed to _optimize_result().
   *
   * @param string $str      segment text (UCS-2)
   * @param int    $lastec   segment type (not used by this method)
   * @param int    $spos     index of the segment in $simpleResult
   * @param int    $slen     byte length of $str
   * @param bool   $optimize whether to run _optimize_result() on the result
   * @return void
   */
  function _deep_analysis_cn( &$str, $lastec, $spos, $slen, $optimize=TRUE )
  {
      $quote1 = chr(0x20).chr(0x1C);
      $tmparr = array();
      //A segment shorter than 11 bytes (at most 5 characters) that directly follows
      //an opening double quotation mark (U+201C) is treated as one word
      if ( $spos > 0 && $slen < 11 && $this->simpleResult[$spos-1]['w']==$quote1 )
      {
          $tmparr[] = $str;
          if ( !isset($this->newWords[$str]) )
          {
              $this->foundWordStr .= $this->_out_string_encoding($str).'/nq, ';
              $this->SetWordInfos($str, array('c'=>1, 'm'=>'nq'));
          }
          if ( !$this->differMax )
          {
              $this->finallyResult[$spos][] = $str;
              return ;
          }
      }
      //Split; $i is the index of the low byte of the character being examined
      for($i=$slen-1; $i > 0; $i -= 2)
      {
          //Single character at the current position
          $nc = $str[$i-1].$str[$i];
          //Reached the first character of the segment
          if ( $i <= 2 )
          {
              $tmparr[] = $nc;
              $i = 0;
              break;
          }
          $isok = FALSE;
          // From here on $i is the number of bytes up to and including the current character.
          $i = $i + 1;
          for($k=$this->dicWordMax; $k>1; $k=$k-2)
          {
              if ($i < $k) continue;
              $w = substr($str, $i-$k, $k);
              if ( strlen($w) <= 2 )
              {
                  // Down to a single character: restore $i and give up.
                  $i = $i - 1;
                  break;
              }
              if ( $this->IsWord( $w ) )
              {
                  $tmparr[] = $w;
                  // Position $i so that the loop's "-= 2" lands on the character before the word.
                  $i = $i - $k + 1;
                  $isok = TRUE;
                  break;
              }
          }
          //No dictionary word ends here: keep the single character
          if (!$isok) $tmparr[] = $nc;
      }
      $wcount = count($tmparr);
      if ( $wcount==0 ) return ;
      // Words were collected back to front.
      $this->finallyResult[$spos] = array_reverse($tmparr);
      //Optimise the result (disambiguation, new words, numerals, names, ...)
      if ( $optimize )
      {
          $this->_optimize_result( $this->finallyResult[$spos], $spos );
      }
  }
  /**
   * Optimise the word list of one segment: merge numeral + unit, surname + given
   * name, word + suffix and pairs of single characters into new words.
   *
   * Coarse segment types used here (the 't' values of $simpleResult): 1 CJK text,
   * 2 English/digits/symbols ('.', '@', '#', '+'), 3 ANSI symbol, 4 pure number,
   * 5 non-ANSI symbol or unsupported character.
   * Add-on dictionary groups as they are used below: 'c' numerals, 'u' units,
   * 'n' leading words (usually surnames), 'a' suffix words, 's' and 't' words
   * that must not be merged.
   *
   * @param array $smarr word list of the segment; replaced by the optimised list
   * @param int   $spos  index of the segment in $simpleResult
   * @return void
   */
  function _optimize_result( &$smarr, $spos )
  {
      $newarr = array();
      $prePos = $spos - 1;
      $arlen = count($smarr);
      $i = $j = 0;
      //Numeral + unit across the segment boundary: the previous coarse segment is a
      //number (or numeral) and this segment starts with a unit
      if ( $prePos > -1 && !isset($this->finallyResult[$prePos]) )
      {
          $lastw = $this->simpleResult[$prePos]['w'];
          $lastt = $this->simpleResult[$prePos]['t'];
          if ( ($lastt==4 || isset( $this->addonDic['c'][$lastw] )) && isset( $this->addonDic['u'][$smarr[0]] ) )
          {
              $this->simpleResult[$prePos]['w'] = $lastw.$smarr[0];
              $this->simpleResult[$prePos]['t'] = 4;
              if ( !isset($this->newWords[ $this->simpleResult[$prePos]['w'] ]) )
              {
                  $this->foundWordStr .= $this->_out_string_encoding( $this->simpleResult[$prePos]['w'] ).'/mu, ';
                  $this->SetWordInfos($this->simpleResult[$prePos]['w'], array('c'=>1, 'm'=>'mu'));
              }
              $smarr[0] = '';
              $i++;
          }
      }
      for(; $i < $arlen; $i++)
      {
          // Last word: nothing to pair it with.
          if ( !isset( $smarr[$i+1] ) )
          {
              $newarr[$j] = $smarr[$i];
              break;
          }
          $cw = $smarr[$i];
          $nw = $smarr[$i+1];
          $ischeck = FALSE;
          //Numeral + unit
          if ( isset( $this->addonDic['c'][$cw] ) && isset( $this->addonDic['u'][$nw] ) )
          {
              //Maximum-segmentation mode keeps the words as they were before merging
              if ($this->differMax)
              {
                  $newarr[$j] = chr(0).chr(0x28);
                  $j++;
                  $newarr[$j] = $cw;
                  $j++;
                  $newarr[$j] = $nw;
                  $j++;
                  $newarr[$j] = chr(0).chr(0x29);
                  $j++;
              }
              $newarr[$j] = $cw.$nw;
              if ( !isset($this->newWords[$newarr[$j]]) )
              {
                  $this->foundWordStr .= $this->_out_string_encoding( $newarr[$j] ).'/mu, ';
                  $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'mu'));
              }
              $j++; $i++; $ischeck = TRUE;
          }
          //Leading word (usually a surname)
          else if ( isset( $this->addonDic['n'][ $smarr[$i] ] ) )
          {
              $is_rs = FALSE;
              //A following word tagged 'r' or 'c', or a very frequent one, is not taken as a given name
              if ( strlen($nw)==4 )
              {
                  $winfos = $this->_get_word_meta($nw);
                  if ( $winfos !== FALSE && ($winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) )
                  {
                      $is_rs = TRUE;
                  }
              }
              if ( !isset($this->addonDic['s'][$nw]) && strlen($nw)<5 && !$is_rs )
              {
                  $newarr[$j] = $cw.$nw;
                  //Try to take a third single character as part of the name
                  if ( strlen($nw)==2 && isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && !isset( $this->addonDic['s'][$smarr[$i+2]] ) )
                  {
                      $newarr[$j] .= $smarr[$i+2];
                      $i++;
                  }
                  if ( !isset($this->newWords[$newarr[$j]]) )
                  {
                      $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'nr'));
                      $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/nr, ';
                  }
                  //To guard against mistakes, keep the parts of the merged name in parentheses
                  if (strlen($nw)==4)
                  {
                      $j++;
                      $newarr[$j] = chr(0).chr(0x28);
                      $j++;
                      $newarr[$j] = $cw;
                      $j++;
                      $newarr[$j] = $nw;
                      $j++;
                      $newarr[$j] = chr(0).chr(0x29);
                  }
                  $j++; $i++; $ischeck = TRUE;
              }
          }
          //Suffix word (place names etc.)
          else if ( isset($this->addonDic['a'][$nw]) )
          {
              $is_rs = FALSE;
              //A word tagged 'a', 'r' or 'c', or a very frequent one, is not used as the stem
              if ( strlen($cw)>2 )
              {
                  $winfos = $this->_get_word_meta($cw);
                  if ( $winfos !== FALSE && ($winfos['m']=='a' || $winfos['m']=='r' || $winfos['m']=='c' || $winfos['c']>500) )
                  {
                      $is_rs = TRUE;
                  }
              }
              if ( !isset($this->addonDic['s'][$cw]) && !$is_rs )
              {
                  $newarr[$j] = $cw.$nw;
                  if ( !isset($this->newWords[$newarr[$j]]) )
                  {
                      $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/na, ';
                      $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'na'));
                  }
                  $i++; $j++; $ischeck = TRUE;
              }
          }
          //New word recognition: two adjacent single characters
          else if ($this->unitWord)
          {
              if (strlen($cw)==2 && strlen($nw)==2
              && !isset($this->addonDic['s'][$cw]) && !isset($this->addonDic['t'][$cw]) && !isset($this->addonDic['a'][$cw])
              && !isset($this->addonDic['s'][$nw]) && !isset($this->addonDic['c'][$nw]))
              {
                  $newarr[$j] = $cw.$nw;
                  //Try to take a third single character (suffix or unit)
                  if ( isset($smarr[$i+2]) && strlen($smarr[$i+2])==2 && (isset( $this->addonDic['a'][$smarr[$i+2]] ) || isset( $this->addonDic['u'][$smarr[$i+2]] )) )
                  {
                      $newarr[$j] .= $smarr[$i+2];
                      $i++;
                  }
                  if ( !isset($this->newWords[$newarr[$j]]) )
                  {
                      $this->foundWordStr .= $this->_out_string_encoding($newarr[$j]).'/ms, ';
                      $this->SetWordInfos($newarr[$j], array('c'=>1, 'm'=>'ms'));
                  }
                  $i++; $j++; $ischeck = TRUE;
              }
          }
          //No rule matched
          if ( !$ischeck )
          {
              $newarr[$j] = $cw;
              //Two-word disambiguation in maximum-segmentation mode: look for dictionary
              //words formed by $cw plus the leading characters of $nw
              if ( $this->differMax && !isset($this->addonDic['s'][$cw]) && strlen($cw) < 5 && strlen($nw) < 7)
              {
                  $slen = strlen($nw);
                  for($y=2; $y <= $slen-2; $y=$y+2)
                  {
                      $nhead = substr($nw, $y-2, 2);
                      $nfont = $cw.substr($nw, 0, $y-2);
                      if ( $this->IsWord( $nfont.$nhead ) )
                      {
                          if ( strlen($cw) > 2 ) $j++;
                          $newarr[$j] = $nfont.$nhead;
                      }
                  }
              }
              $j++;
          }
      }
      $smarr = $newarr;
  }
  /**
   * Read part of speech and frequency of a dictionary word in one fixed shape.
   *
   * MakeDict() stores an entry as a list of two values, while the rules in
   * _optimize_result() work with the keys 'm' and 'c'; this helper accepts both
   * shapes. For the list shape element 1 is taken as the part of speech and
   * element 0 as the frequency, the same order GetWordProperty() prints them in.
   * NOTE(review): confirm that order against the dictionary source file.
   *
   * @param string $word UCS-2 encoded word
   * @return array|false array('m' => part of speech, 'c' => frequency as int),
   *                     or FALSE when the word has no dictionary entry
   */
  function _get_word_meta( $word )
  {
      $infos = $this->GetWordInfos($word);
      if ( !is_array($infos) )
      {
          return FALSE;
      }
      $pos = isset($infos['m']) ? $infos['m'] : (isset($infos[1]) ? $infos[1] : '');
      $freq = isset($infos['c']) ? $infos['c'] : (isset($infos[0]) ? $infos[0] : 0);
      return array('m' => $pos, 'c' => (int)$freq);
  }
  /**
   * Merge coarse and fine results into the flat $finallyResult list.
   *
   * For every non-empty coarse segment: when fine-grained words exist they are
   * emitted (non-empty ones only) with type 20; otherwise the coarse segment
   * itself is emitted unless its type is 21.
   *
   * @return void
   */
  function _sort_finally_result()
  {
      $merged = array();
      foreach($this->simpleResult as $idx => $coarse)
      {
          if ( empty($coarse['w']) ) continue;
          $fine = isset($this->finallyResult[$idx]) ? $this->finallyResult[$idx] : array();
          if ( count($fine) > 0 )
          {
              foreach($fine as $word)
              {
                  if ( empty($word) ) continue;
                  $merged[] = array('w' => $word, 't' => 20);
              }
          }
          else if ( $coarse['t'] != 21 )
          {
              $merged[] = array('w' => $coarse['w'], 't' => $coarse['t']);
          }
      }
      $this->finallyResult = $merged;
  }
  /**
   * Convert a UCS-2 string to the output charset.
   *
   * @param string $str UCS-2 encoded string
   * @return string the text in utf-8 (charset code 1), gb18030 (code 2) or big5
   *                (any other code returned by _source_result_charset())
   */
  function _out_string_encoding( &$str )
  {
      $utf8 = iconv(UCS2, 'utf-8', $str);
      switch ( $this->_source_result_charset() )
      {
          case 1:
              return $utf8;
          case 2:
              return iconv('utf-8', 'gb18030', $utf8);
          default:
              return iconv('utf-8', 'big5', $utf8);
      }
  }
  /**
   * Get the final result as one string.
   *
   * @param string $spword        separator written in front of every word
   * @param bool   $word_meanings append the tag from GetWordProperty() to each word
   * @return string the words, each preceded by $spword; with result type 2 the
   *                symbol segments (types 3 and 5) are left out, and a word that
   *                is a single space is always left out
   */
  function GetFinallyResult($spword=' ', $word_meanings=FALSE)
  {
      $out = '';
      $skipSymbols = ($this->resultType==2);
      foreach($this->finallyResult as $item)
      {
          if ( $skipSymbols && ($item['t']==3 || $item['t']==5) )
          {
              continue;
          }
          $tag = $word_meanings ? $this->GetWordProperty($item['w']) : '';
          $word = $this->_out_string_encoding($item['w']);
          if ( $word == ' ' )
          {
              continue;
          }
          $out .= $spword.$word.$tag;
      }
      return $out;
  }
  /**
   * Get the coarse result without segment types.
   *
   * @return array list of segment texts in the output charset; empty segments
   *               and single spaces are left out
   */
  function GetSimpleResult()
  {
      $words = array();
      foreach($this->simpleResult as $segment)
      {
          if ( empty($segment['w']) )
          {
              continue;
          }
          $text = $this->_out_string_encoding($segment['w']);
          if ( $text == ' ' )
          {
              continue;
          }
          $words[] = $text;
      }
      return $words;
  }
  /**
   * Get the coarse result including segment types (1 CJK text, 2 ANSI words
   * including full-width, 3 ANSI punctuation including full-width, 4 numbers
   * including full-width, 5 CJK punctuation or unrecognised characters).
   *
   * @return array array(index => array('w' => text, 't' => type)), keeping the
   *               indexes of $simpleResult; single spaces are left out
   */
  function GetSimpleResultAll()
  {
      $segments = array();
      foreach($this->simpleResult as $idx => $segment)
      {
          $text = $this->_out_string_encoding($segment['w']);
          if ( $text == ' ' )
          {
              continue;
          }
          $segments[$idx] = array('w' => $text, 't' => $segment['t']);
      }
      return $segments;
  }
  /**
   * Get the word index of the final result.
   *
   * @return array array(word => number of occurrences); with result type 2 the
   *               symbol segments (types 3 and 5) are not counted, single spaces
   *               are never counted
   */
  function GetFinallyIndex()
  {
      $counts = array();
      foreach($this->finallyResult as $item)
      {
          if ( $this->resultType==2 && ($item['t']==3 || $item['t']==5) )
          {
              continue;
          }
          $word = $this->_out_string_encoding($item['w']);
          if ( $word == ' ' )
          {
              continue;
          }
          $counts[$word] = isset($counts[$word]) ? $counts[$word] + 1 : 1;
      }
      return $counts;
  }
  /**
   * Map the target charset to a numeric code.
   *
   * @return int 1 for utf*, 2 for gb*, 3 for big*, 4 for anything else
   */
  function _source_result_charset()
  {
      if ( preg_match("/^utf/", $this->targetCharSet) )
      {
          return 1;
      }
      if ( preg_match("/^gb/", $this->targetCharSet) )
      {
          return 2;
      }
      if ( preg_match("/^big/", $this->targetCharSet) )
      {
          return 3;
      }
      return 4;
  }
  /**
   * Compile a text dictionary into the binary main-dictionary format.
   *
   * Input: one entry per line, three comma separated fields, UTF-8; lines that
   * start with '@' are skipped. Output: $mask_value header slots of 8 bytes
   * (offset as 'I', length and entry count as 'n'), followed by the serialized
   * buckets. The whole dictionary is held in memory while compiling, so PHP
   * needs a sufficiently large memory limit.
   *
   * @param string $source_file UTF-8 text dictionary
   * @param string $target_file file to write; defaults to $mainDicFile
   * @return bool TRUE on success, FALSE when a file cannot be opened or a bucket
   *              does not fit the 16-bit length field
   */
  function MakeDict( $source_file, $target_file='' )
  {
      $target_file = ($target_file=='' ? $this->mainDicFile : $target_file);
      $fp = fopen($source_file, 'r');
      if ( $fp === FALSE )
      {
          return FALSE;
      }
      $allk = array();
      // Compare against FALSE so that a line such as "0" does not end the loop early.
      while( ($line = fgets($fp, 512)) !== FALSE )
      {
          if ( $line[0]=='@' ) continue;
          $fields = explode(',', $line);
          // Blank or malformed lines would otherwise become junk entries.
          if ( count($fields) < 3 ) continue;
          list($w, $r, $a) = $fields;
          $a = trim( $a );
          $w = iconv('utf-8', UCS2, $w);
          if ( $w === FALSE || $w === '' ) continue;
          $allk[ $this->_get_index( $w ) ][ $w ] = array($r, $a);
      }
      fclose( $fp );
      // Build header and data completely before the target file is touched.
      $heade_rarr = array();
      $alldat = '';
      $start_pos = $this->mask_value * 8;
      foreach( $allk as $k => $v )
      {
          $dat = serialize( $v );
          $dlen = strlen($dat);
          if ( $dlen > 0xFFFF )
          {
              // pack('n') would silently truncate the length and corrupt the bucket.
              trigger_error('MakeDict: bucket '.$k.' is too large for the dictionary format', E_USER_WARNING);
              return FALSE;
          }
          $alldat .= $dat;
          $heade_rarr[ $k ] = array($start_pos, $dlen, count( $v ));
          $start_pos += $dlen;
      }
      unset( $allk );
      $fp = fopen($target_file, 'wb');
      if ( $fp === FALSE )
      {
          return FALSE;
      }
      for($i=0; $i < $this->mask_value; $i++)
      {
          if ( !isset($heade_rarr[$i]) )
          {
              $heade_rarr[$i] = array(0, 0, 0);
          }
          fwrite($fp, pack("Inn", $heade_rarr[$i][0], $heade_rarr[$i][1], $heade_rarr[$i][2]));
      }
      fwrite( $fp, $alldat);
      fclose( $fp );
      return TRUE;
  }
  /**
   * Export the entries of the main dictionary as text.
   *
   * Writes one line per word: the word in UTF-8 followed by elements 0 and 1 of
   * its entry, comma separated.
   *
   * @param string $targetfile file to write
   * @return bool TRUE on success, FALSE when the dictionary or the target file
   *              cannot be opened
   */
  function ExportDict( $targetfile )
  {
      if ( !$this->mainDicHand )
      {
          // 'rw' is not a valid fopen mode; the dictionary is only read here.
          $this->mainDicHand = fopen($this->mainDicFile, 'rb');
          if ( !$this->mainDicHand )
          {
              $this->mainDicHand = FALSE;
              return FALSE;
          }
      }
      $fp = fopen($targetfile, 'w');
      if ( $fp === FALSE )
      {
          return FALSE;
      }
      // The header has exactly $mask_value slots (0 .. $mask_value - 1); reading one
      // more would interpret the first bytes of the data area as a header slot.
      for($i=0; $i < $this->mask_value; $i++)
      {
          fseek($this->mainDicHand, $i * 8, SEEK_SET);
          $dat = fread($this->mainDicHand, 8);
          if ( $dat === FALSE || strlen($dat) < 8 )
          {
              // Truncated header: nothing more can be read.
              break;
          }
          $arr = unpack('I1s/n1l/n1c', $dat);
          if ( $arr['l'] == 0 )
          {
              continue;
          }
          fseek($this->mainDicHand, $arr['s'], SEEK_SET);
          $data = @unserialize(fread($this->mainDicHand, $arr['l']));
          if ( !is_array($data) ) continue;
          foreach($data as $k => $v)
          {
              $w = iconv(UCS2, 'utf-8', $k);
              fwrite($fp, "{$w},{$v[0]},{$v[1]}\n");
          }
      }
      fclose( $fp );
      return TRUE;
  }
  1030. }