最近需要用到文本的拼音相似度计算,看了hankcs大神的hanlp里面通过ac自动机实现拼音的存储,想把它转成python版本的。开始啃AC自动机吧。
AC自动机建立在Trie树和KMP字符串匹配算法。首先啃Trie树。
关于Trie树的概念,http://blog.csdn.net/v_july_v/article/details/6897097这一篇讲得很好,还附赠了后缀树。
我所要做的是把utf-8编码的中文词和拼音对应起来。Utf-8编码将一个汉字编码成3个byte,每个byte按照16进制存储。鉴于这种情况,需要构造一个256 Trie,即每一层可能有256个节点。
看了几个程序后,集众人智慧,写了一个自己的。
# coding:utf-8
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
class TrieNode(object):
def __init__(self):
self.one_byte = {}
self.value = None
self.is_word = False
class Trie256(object):
def __init__(self):
self.root = TrieNode()
def getUtf8String(self, string):
bytes_array = bytearray(string.encode("utf-8"))
return bytes_array
def insert(self, bytes_array, str):
node = self.root
for byte in bytes_array:
child = node.one_byte.get(byte)
if child == None:
node.one_byte[byte] = TrieNode()
node = node.one_byte[byte]
node.is_word = True
node.value = str
def find(self, bytes_array):
node = self.root
for byte in bytes_array:
child = node.one_byte.get(byte)
if child == None:
print "No this word in this Trie."
return None
node = node.one_byte[byte]
if not node.is_word:
print "It is not a word."
return None
else:
return node.value
def modify(self, bytes_array, str):
node = self.root
for byte in bytes_array:
child = node.one_byte.get(byte)
if child == None:
print "This word is not in this Trie, we will insert it."
node.one_byte[byte] = TrieNode()
node = node.one_byte[byte]
if not node.is_word:
print "This word is not a word in this Trie, we will make it a word."
node.is_word = True
node.value = str
else:
print "modify this word..."
node.value = str
def delete(self, bytes_array):
node = self.root
for byte in bytes_array:
child = node.one_byte.get(byte)
if child == None:
print "This word is not in this Trie."
break
node = node.one_byte[byte]
if not node.is_word:
print "It is not a word."
else:
node.is_word = False
node.value = None
child = node.one_byte.keys()
if len(child) == 0:
node.one_byte.clear()
def print_item(self, p, indent=0):
if p:
ind = '' + '\t' * indent
for key in p.one_byte.keys():
label = "'%s' : " % key
print ind + label + '{'
self.print_item(p.one_byte[key], indent + 1)
#print ind + ' ' * len(label) + '}'
#self.print_item(p.one_byte[key], indent + 1)
if __name__ == "__main__":
trie = Trie256()
with open("dictionary/pinyin.txt", 'r') as fd:
line = fd.readline()
while line:
line_split = line.split('=')
word = line_split[0]
pinyin = line_split[1].strip()
bytes = trie.getUtf8String(word)
sentence = ''
for byte in bytes:
sentence = sentence + 'x' + str(byte)
print sentence
trie.insert(bytes, pinyin)
line = fd.readline()
trie.print_item(trie.root)
bytes = trie.getUtf8String("一分钟".decode("utf-8"))
for byte in bytes:
print byte
print trie.find(bytes)