Wednesday, May 30, 2007

中文分词模块 from Huang Jiahua

#!/usr/bin/python

# -*- coding: UTF-8 -*-
# Author: Huang Jiahua
# Last modified: 2004-08-25

__revision__ = '0.1'

#切分关键字,要求预先转换为 Unicode 类型
# 分开中文,非中文 -> 按 seps 列表分隔 -> 对中文二元分词 -> 合并 -> 返回数组

import sys
##sys.setappdefaultencoding('utf8')

# Separator characters: keyword text is cut wherever one of these occurs.
def _utuni(strr):
    """Return *strr* as unicode text.

    UTF-8 encoded byte strings are decoded; values that are already text
    are returned unchanged.
    """
    if isinstance(strr, bytes):
        return strr.decode('utf8')
    return strr


seps = [
    # ASCII whitespace and punctuation.  The original entry "\#" was a
    # two-character string (backslash + '#') and could never equal a single
    # character, so '#' was silently not a separator.
    u" ", u"\t", u"\n", u"\r", u",", u"<", u">", u"?", u"!",
    u";", u"#", u":", u".", u"'", u'"', u"(", u")", u"{", u"}",
    u"[", u"]", u"|", u"_", u"=",
    # Full-width / CJK punctuation, written as escapes so the list does not
    # depend on the file's encoding.
    # NOTE(review): the source repeated the ASCII characters here, which
    # presumably were full-width forms flattened by a lossy conversion;
    # confirm the exact set (in particular the bracket pair) against the
    # original file.
    u"\u3000",  # ideographic space
    u"\uff0c",  # full-width comma
    u"\uff1f",  # full-width question mark
    u"\u3002",  # ideographic full stop
    u"\u3001",  # ideographic comma
    u"\u201c",  # left double quotation mark
    u"\u201d",  # right double quotation mark
    u"\u300a",  # left double angle bracket
    u"\u300b",  # right double angle bracket
    u"\u3010",  # left black lenticular bracket
    u"\u3011",  # right black lenticular bracket
    u"\uff01",  # full-width exclamation mark
    u"\uff08",  # full-width left parenthesis
    u"\uff09",  # full-width right parenthesis
]
# Normalise to unicode.  A real list (not map()) is required because the
# membership test is repeated for every character of every input.
seps = [_utuni(sep) for sep in seps]

# Module-level scratch state shared by the helper functions below.
# _zhkeys / _askeys: dicts used as sets of Chinese / non-Chinese keys.
_zhkeys, _askeys = {}, {}
# _zhstr / _asstr: character lists holding the Chinese / non-Chinese parts.
_zhstr, _asstr = [], []

def _zhsplitkey(stri):
    """Record the bigrams of *stri* in the module-level dict ``_zhkeys``.

    Every pair of adjacent characters becomes a key of ``_zhkeys`` (the
    value is always ``''``; the dict is used as a set).

    A one-character string has no bigram; it is recorded as a key by
    itself so that a lone character is not lost, and it is also returned,
    as before.  For any other length the function returns ``None``.
    """
    length = len(stri)
    if length == 1:
        # Previously the character was only returned, and the caller in
        # this module ignores the return value, so it disappeared.
        _zhkeys[stri] = ''
        return stri
    # Overlapping two-character windows: "ABCD" -> "AB", "BC", "CD".
    for pos in range(length - 1):
        _zhkeys[stri[pos:pos + 2]] = ''

def _fenzhas(stri):
    """Separate *stri* into Chinese and non-Chinese characters.

    The result is stored in the module-level lists ``_zhstr`` (characters
    at or above U+4E00) and ``_asstr`` (everything else); both lists are
    reset on every call.  Whenever the text switches from one class to the
    other, a single space is appended to the list that is being resumed so
    that runs which were not adjacent in the input stay separate.

    Returns ``0`` if the first character cannot be examined (empty input,
    or a value that cannot be indexed or compared with text); otherwise
    returns ``None``.
    """
    global _asstr
    global _zhstr
    _zhstr = []
    _asstr = []
    try:
        # Probe only: checks that the input is non-empty and comparable.
        stri[0] >= u'\u4e00'
    except (IndexError, TypeError):
        return 0

    # 0 = nothing seen yet, 1 = last char was Chinese, 2 = last was not.
    state = 0
    for char in stri:
        if char >= u'\u4e00':
            # The original tested state == 0 here, which only fired at the
            # very start of the input; Chinese runs separated by other text
            # were therefore joined and yielded bigrams spanning the gap.
            if state == 2:
                _zhstr.append(' ')
            _zhstr.append(char)
            state = 1
        else:
            if state == 1:
                _asstr.append(' ')
            _asstr.append(char)
            state = 2

def _fenseps(stri):
    """Split *stri* at every character listed in the module-level ``seps``.

    Returns a list of the distinct non-empty pieces found between
    separators.  The order of the list is not significant.
    """
    pieces = {}  # dict used as a set so repeated pieces collapse
    start = 0    # index where the current piece begins
    for pos, char in enumerate(stri):
        if char in seps:
            # Leading or adjacent separators enclose nothing; skip them
            # instead of recording an empty-string piece.
            if pos > start:
                pieces[stri[start:pos]] = ''
            start = pos + 1
    # Trailing piece after the last separator, if any.
    if len(stri) > start:
        pieces[stri[start:]] = ''
    return list(pieces)

def splitkey(stri):
    """Split unicode text into a list of distinct search keys.

    Steps: ``_fenzhas`` separates Chinese from non-Chinese characters;
    each part is cut at the separator characters by ``_fenseps``; every
    Chinese piece is passed to ``_zhsplitkey`` for bigram splitting, while
    non-Chinese pieces are kept whole; finally both key sets are merged.

    *stri* is expected to be unicode text already.  The module-level
    scratch variables ``_zhstr``, ``_asstr``, ``_zhkeys`` and ``_askeys``
    are reset before and after the work.  Returns a list of keys in no
    particular order.
    """
    global _zhstr
    global _asstr
    global _zhkeys
    global _askeys
    _zhkeys = {}
    _askeys = {}
    _zhstr = []
    _asstr = []
    _fenzhas(stri)  # fills _zhstr and _asstr
    zh_pieces = _fenseps(''.join(_zhstr))
    as_pieces = _fenseps(''.join(_asstr))
    _zhstr = []
    _asstr = []
    for piece in zh_pieces:
        _zhsplitkey(piece)  # adds the piece's keys to _zhkeys
    for piece in as_pieces:
        # Guard against empty pieces so '' never becomes a key.
        if piece:
            _askeys[piece] = ''

    alkeys = {}
    alkeys.update(_zhkeys)
    alkeys.update(_askeys)
    alkeys.pop('', None)
    _zhkeys = {}
    _askeys = {}
    # list() rather than .keys() so callers always receive a real list.
    return list(alkeys)

if __name__ == "__main__":
    # Command-line test: split the first argument, or standard input when
    # no argument is given, and print the keys separated by spaces.
    import sys

    if len(sys.argv) > 1:
        keyy = sys.argv[1]
    else:
        keyy = sys.stdin.read()
    # Python 2 supplies byte strings here, which are taken to be UTF-8;
    # Python 3 already supplies text.
    if isinstance(keyy, bytes):
        keyy = keyy.decode('utf8')
    keyyy = splitkey(keyy)
    if keyyy:
        line = u' '.join(keyyy)
        if sys.version_info[0] < 3:
            # Python 2's print needs encoded bytes for non-ASCII output.
            line = line.encode('utf8')
        print(line)

No comments: