下面是两个分词算法,一个是二元分词,一个是词典分词:
CODE:
<?php
/**
 * Split UTF-8 text into tokens using the "2-base cut" (overlapping bigram) scheme.
 *
 * - Runs of single-byte characters are split on spaces; each non-empty run
 *   becomes one token.
 * - Every pair of directly adjacent multi-byte characters becomes one token,
 *   so a run of N multi-byte characters yields N - 1 overlapping bigrams.
 *   A multi-byte character with no multi-byte neighbour yields no token.
 *
 * The input is assumed to be valid UTF-8; the sequence length is taken from
 * the lead byte only and continuation bytes are not validated.
 *
 * @param string $text UTF-8 encoded input.
 * @return array Tokens keyed by the byte offset at which each token starts.
 */
function _tokenizer ($text)
{
    $len = strlen ($text);
    $last_mbc = '';
    $tmp = '';
    $tokens = array ();
    for ($i = 0; $i < $len; $i++) {
        $c = $text[$i];
        $v = ord ($c);

        // Byte length of the sequence, derived from its lead byte.
        if ($v >= 0xf0) {
            $width = 4;
        }
        elseif ($v >= 0xe0) {
            $width = 3;
        }
        elseif ($v >= 0xc0) {
            $width = 2;
        }
        else {
            $width = 1;
        }

        if ($width > 1) {
            // A multi-byte character ends the pending single-byte word;
            // store it instead of silently discarding it.
            if ($tmp !== '') {
                $tokens[$i - strlen ($tmp)] = $tmp;
                $tmp = '';
            }
            // substr() clamps at the end of the string, so a truncated
            // trailing sequence cannot trigger an out-of-range offset.
            $mbc = substr ($text, $i, $width);
            if ($last_mbc !== '') {
                // The previous character ends right before $i, so the
                // bigram starts strlen($last_mbc) bytes earlier.
                $tokens[$i - strlen ($last_mbc)] = $last_mbc . $mbc;
            }
            $last_mbc = $mbc;
            // Skip the remaining bytes of this character.
            $i += strlen ($mbc) - 1;
        }
        else {
            // Any single-byte character breaks the chain of bigrams.
            $last_mbc = '';
            if ($c === ' ') {
                // Compare against '' so that the word "0" is kept.
                if ($tmp !== '') {
                    $tokens[$i - strlen ($tmp)] = $tmp;
                }
                $tmp = '';
            }
            else {
                $tmp .= $c;
            }
        }
    }
    // Emit the word still pending when the input ends without a space.
    if ($tmp !== '') {
        $tokens[$len - strlen ($tmp)] = $tmp;
    }
    return $tokens;
}
/**
 * Split UTF-8 text into tokens using dictionary-based segmentation.
 *
 * - Runs of single-byte characters are split on spaces; each non-empty run
 *   becomes one token.
 * - Each run of consecutive multi-byte characters is segmented greedily from
 *   left to right: at every position the longest candidate (at most 4
 *   characters) accepted by $this->dict->find() is taken; if none matches,
 *   a single character is consumed.
 *
 * The input is assumed to be valid UTF-8; the sequence length is taken from
 * the lead byte only. $this->dict is expected to provide find($string)
 * returning a truthy value for known words (inferred from its use here;
 * confirm against the dictionary class).
 *
 * @param string $text     UTF-8 encoded input.
 * @param bool   $non_word When true, single multi-byte characters that did
 *                         not match a dictionary word are emitted as tokens
 *                         too; when false only multi-character dictionary
 *                         words are emitted.
 * @return array Tokens keyed by the byte offset at which each token starts.
 */
function _tokenizer_dict ($text, $non_word = false)
{
    $len = strlen ($text);
    $mbc_str = array ();
    $tmp = '';
    $tokens = array ();
    // The loop runs one step past the last byte: position $len acts as a
    // virtual terminator so that a single-byte word or a multi-byte run
    // still pending at the end of the input is flushed like any other.
    for ($i = 0; $i <= $len; $i++) {
        $at_end = ($i == $len);
        $c = $at_end ? '' : $text[$i];
        $v = $at_end ? 0 : ord ($c);

        // Byte length of the sequence, derived from its lead byte.
        if ($v >= 0xf0) {
            $width = 4;
        }
        elseif ($v >= 0xe0) {
            $width = 3;
        }
        elseif ($v >= 0xc0) {
            $width = 2;
        }
        else {
            $width = 1;
        }

        if ($width > 1) {
            // A multi-byte character ends the pending single-byte word;
            // store it instead of silently discarding it.
            if ($tmp !== '') {
                $tokens[$i - strlen ($tmp)] = $tmp;
                $tmp = '';
            }
            // substr() clamps at the end of the string, so a truncated
            // trailing sequence cannot trigger an out-of-range offset.
            $mbc = substr ($text, $i, $width);
            $mbc_str[] = $mbc;
            // Skip the remaining bytes of this character.
            $i += strlen ($mbc) - 1;
            continue;
        }

        // Single-byte character or end of input.
        if ($at_end || $c === ' ') {
            // Compare against '' so that the word "0" is kept.
            if ($tmp !== '') {
                $tokens[$i - strlen ($tmp)] = $tmp;
            }
            $tmp = '';
        }
        else {
            $tmp .= $c;
        }

        if (count ($mbc_str) > 0) {
            // The multi-byte run ended right before $i; segment it now.
            $start_offset = $i - strlen (implode ('', $mbc_str));
            $total = count ($mbc_str);
            $pos = 0;
            while ($pos < $total) {
                // Fall back to a single character when no longer candidate
                // is found in the dictionary.
                $word = $mbc_str[$pos];
                $taken = 1;
                // Try the longest candidate first (max. 4 characters).
                for ($j = min (4, $total - $pos); $j > 1; $j--) {
                    $test = implode ('', array_slice ($mbc_str, $pos, $j));
                    if ($this->dict->find ($test)) {
                        $word = $test;
                        $taken = $j;
                        break;
                    }
                }
                // Single characters are only reported when $non_word is set.
                if ($taken > 1 || $non_word) {
                    $tokens[$start_offset] = $word;
                }
                $start_offset += strlen ($word);
                $pos += $taken;
            }
            $mbc_str = array ();
        }
    }
    return $tokens;
}
?>