#encoding:utf-8
from collections import defaultdict
import sys
# Python 2 idiom: reload the sys module and make UTF-8 the interpreter-wide
# default string encoding, so implicit byte-string <-> unicode conversions
# performed later in this file use UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
class LBTrie:
    """A simple trie (prefix tree) built from nested dictionaries.

    Every node is a dict mapping a single character to its child node.  A
    node that additionally holds the key ``''`` marks the end of a stored
    word.

    Attributes:
        trie: Root node of the tree.
        size: Number of distinct words stored so far.
    """

    def __init__(self):
        self.trie = {}
        self.size = 0

    def add(self, word):
        """Insert *word* into the trie, ignoring surrounding whitespace.

        Args:
            word: The word to store.

        Returns:
            True if the word was not stored before; False if it was already
            present or is empty after stripping.
        """
        word = word.strip()
        if not word:
            # Nothing to store; the root never carries an end marker.
            return False
        node = self.trie
        for ch in word:
            node = node.setdefault(ch, {})
        if '' in node:
            return False
        # The '' key flags this node as the last character of a whole word.
        node[''] = ''
        self.size += 1
        return True

    def search(self, word):
        """Return True if *word* was stored as a complete word.

        Surrounding whitespace is stripped exactly as in ``add`` so that both
        methods agree on what a word is.  Prefixes of stored words do not
        count as matches.
        """
        node = self.trie
        for ch in word.strip():
            if ch not in node:
                return False
            node = node[ch]
        # Only a node carrying the end marker terminates a stored word.
        return '' in node

    def output(self):
        """Print the whole trie to standard output once; returns None."""
        self.__print_item(self.trie)

    def __print_item(self, p, indent=0):
        """Recursively print node *p*; *indent* is the number of leading tabs."""
        if not p:
            # Empty node, or the '' value stored under an end marker.
            return
        ind = '\t' * indent
        for key in p:
            label = "'%s' : " % key
            print(ind + label + '{')
            self.__print_item(p[key], indent + 1)
            # Align the closing brace under the opening one.
            print(ind + ' ' * len(label) + '}')
def codeutil(strs):
    """Return *strs* as text containing only characters representable in GBK.

    Args:
        strs: A UTF-8 encoded byte string, or already-decoded text.

    Returns:
        The text with invalid UTF-8 sequences and characters that have no GBK
        encoding silently dropped.
    """
    # Decode only real byte strings; calling .decode() on text would depend on
    # the interpreter's default encoding (and does not exist on Python 3).
    if isinstance(strs, bytes):
        strs = strs.decode('utf8', 'ignore')
    # The round trip through GBK discards anything GBK cannot represent.
    return strs.encode('GBK', 'ignore').decode('GBK', 'ignore')
if __name__ == '__main__':
    # Script entry point: index every word of content.txt in a trie, write the
    # word frequencies (most frequent first) to tree.txt, then run a sample
    # lookup.
    trie_obj = LBTrie()
    countdic = defaultdict(int)

    # Add words.  Every occurrence is tallied; the trie only needs each word
    # once, so the return value of add() is not used for counting.
    with open('content.txt', 'r') as corpus:
        for record in corpus:
            for word in record.split(' '):
                # Drop the line terminator / stray whitespace so the same word
                # is not counted under several different keys.
                word = word.strip()
                if not word:
                    continue
                trie_obj.add(codeutil(word))
                countdic[word] += 1

    resortedcountdic = sorted(countdic.items(), key=lambda item: item[1], reverse=True)

    # Each entry is written as "<word>\t<count>\t".
    with open('tree.txt', 'w+') as tree:
        for word, count in resortedcountdic:
            tree.write(codeutil(word) + '\t' + str(count) + '\t')

    # Look up a word.
    if trie_obj.search(codeutil('氨基酸')):
        print('Yes')
    else:
        print('No')
# Trie树实现词频统计与查找
# 原文作者:Trie树
# 原文地址: https://blog.csdn.net/IqqIqqIqqIqq/article/details/54561975
# 本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。