/**
 * Split a multi-byte string into search tokens: ASCII words plus bigrams
 * (pairs of consecutive multi-byte characters).
 *
 * The string is scanned byte-wise with a per-language pattern:
 *  - 'chinese_big5': runs of ASCII letters, or a Big5 double-byte character
 *    (lead 0xA1-0xFE, trail 0x40-0x7E or 0xA1-0xFE);
 *  - 'chinese' (GB2312): runs of ASCII letters, or a double-byte character;
 *  - 'japanese' / 'korean' (UTF-8): runs of ASCII bytes, or a lead byte
 *    followed by continuation bytes.
 *
 * Matches shorter than two bytes are ignored. A match that starts with an
 * ASCII byte and contains a letter is emitted as a word of its own. Every
 * other match is treated as one character and emitted joined to the
 * previous such match, so a lone character produces no token. Pairing works
 * on the sequence of matches, so characters separated only by unmatched
 * bytes or by an ASCII word are still paired.
 *
 * @param string $str  Text to split, in the byte encoding implied by $lang.
 * @param string $lang One of 'chinese_big5', 'chinese', 'japanese', 'korean'.
 *
 * @return array List of distinct tokens, each passed through _esc(), in
 *               order of first appearance. Empty for an unsupported $lang
 *               or when nothing matches.
 */
function mb_word_split($str, $lang)
{
	$m = array();
	switch ($lang) {
		case 'chinese_big5':
			/* Big5 trail bytes are 0x40-0x7E and 0xA1-0xFE; both ranges must be
			 * accepted or most characters are broken into discarded single bytes. */
			preg_match_all('!((?:[A-Za-z]+) | (?:[\xa1-\xfe] [\x40-\x7e\xa1-\xfe] | [\xa1-\xfe] ) )!xs', $str, $m);
			break;
		case 'chinese': /* gb2312 */
			preg_match_all('!((?:[A-Za-z]+) | (?:[\xa1-\xf7] [\xa1-\xfe] ) | ( [\xa1-\xfe] [\x40-\x7e] | [\xa1-\xfe] ) )!xs', $str, $m);
			break;
		case 'japanese': /* utf-8 */
		case 'korean':
			preg_match_all('!((?:[\x0-\x7f]+) | (?:[\xc0-\xfd]{1}[\x80-\xbf]+) )!xs', $str, $m);
			break;
	}
	/* Unsupported language ($m untouched) or no matches at all. */
	if (empty($m[0])) {
		return array();
	}

	$tokens = array();
	$prev = '';
	foreach ($m[0] as $v) {
		/* Skip single-byte matches (one-letter words, stray lead bytes). */
		if (!isset($v[1])) {
			continue;
		}

		/* Only a match that STARTS with an ASCII byte can be a word; a
		 * double-byte character may carry an ASCII letter as its trail byte
		 * and must not be mistaken for one. */
		if (ord($v[0]) < 0x80 && preg_match('![A-Za-z]!', $v)) {
			$tokens[] = $v;
			continue;
		}

		if ($prev !== '') {
			$tokens[] = $prev . $v;
		}
		$prev = $v;
	}

	/* De-duplicate the finished tokens, not the raw matches: removing
	 * repeated characters before pairing would drop real bigrams and join
	 * characters that are not neighbours in the text. */
	$m2 = array();
	foreach (array_unique($tokens) as $t) {
		$m2[] = _esc($t);
	}
	return $m2;
}
[Updated on: Wed, 28 February 2007 12:33]
Report message to a moderator