Trie树实现词频统计与查找

#encoding:utf-8
from collections import defaultdict
import sys
# Python 2-only idiom: sys.setdefaultencoding is removed from the sys module
# namespace at interpreter start-up, so sys is reloaded to get it back and the
# process-wide default string encoding is then set to UTF-8.
reload(sys) 
sys.setdefaultencoding('utf8') 
class LBTrie(object):
    """A simple dict-based trie (prefix tree).

    Every node is a dict that maps a single character to its child node.
    A node at which a stored word ends additionally holds the empty-string
    key ``''`` as an end-of-word marker.

    Attributes:
        trie: The root node of the tree.
        size: Number of distinct words currently stored.
    """

    def __init__(self):
        self.trie = {}
        self.size = 0

    def add(self, word):
        """Insert ``word`` into the trie.

        Leading and trailing whitespace is stripped first. A word that is
        empty after stripping leaves the trie unchanged.

        Args:
            word: The string to insert.

        Returns:
            Always True. The value is kept so existing callers that test the
            result keep working.
        """
        word = word.strip()
        node = self.trie
        for char in word:
            node = node.setdefault(char, {})
        # Mark the end of the word; only count it the first time it is seen.
        if word and '' not in node:
            node[''] = ''
            self.size += 1
        return True

    def search(self, word):
        """Return True if ``word`` was previously added as a complete word.

        The query is stripped the same way ``add`` strips its input, so
        surrounding whitespace never affects the result. A mere prefix of a
        stored word does not match.

        Args:
            word: The string to look up.

        Returns:
            True if the word is stored, otherwise False.
        """
        node = self.trie
        for char in word.strip():
            if char not in node:
                return False
            node = node[char]
        # Only a node carrying the end-of-word marker terminates a word.
        return '' in node

    def output(self):
        """Print the whole trie to standard output, once."""
        self.__print_item(self.trie)

    def __print_item(self, p, indent=0):
        """Recursively print node ``p``; ``indent`` is the tab depth.

        Each key is printed as ``'key' : {`` followed by its subtree and a
        matching closing brace aligned under the opening one.
        """
        if not p:
            # Empty node, or the '' value stored under the end-of-word marker.
            return
        ind = '\t' * indent
        for key in p:
            label = "'%s' : " % key
            print(ind + label + '{')
            self.__print_item(p[key], indent + 1)
            print(ind + ' ' * len(label) + '}')

def codeutil(strs):
    """Return ``strs`` as text limited to characters representable in GBK.

    Byte input is decoded as UTF-8 with undecodable bytes dropped; input
    that is already text is used as-is. The text is then round-tripped
    through the GBK codec with errors ignored, so any character that has no
    GBK encoding is silently removed.

    Args:
        strs: UTF-8 encoded bytes, or an already-decoded text string.

    Returns:
        The decoded text without characters that GBK cannot encode.
    """
    if isinstance(strs, bytes):
        text = strs.decode('utf8', 'ignore')
    else:
        text = strs
    return text.encode('GBK', 'ignore').decode('GBK', 'ignore')

if __name__ == '__main__':
    # Script entry point: index every space-separated token of content.txt
    # in a trie, write the token frequencies to tree.txt, then look up one
    # sample word and report the result.
    trie_obj = LBTrie()
    countdic = defaultdict(int)
    # The with-statement closes (and flushes) both files even if an error
    # occurs part-way through.
    with open('content.txt', 'r') as corpus, open('tree.txt', 'w+') as tree:
        # Iterate the file lazily instead of loading every line up front.
        for record in corpus:
            for word in record.split(' '):
                # Counts are keyed by the raw token exactly as read.
                if trie_obj.add(codeutil(word)):
                    countdic[word] += 1
        # Most frequent tokens first; each entry is "token<TAB>count<TAB>".
        resortedcountdic = sorted(countdic.items(), key=lambda item: item[1], reverse=True)
        for word, count in resortedcountdic:
            tree.write(codeutil(word) + '\t' + str(count) + '\t')
    # Look up a sample word in the trie.
    if trie_obj.search(codeutil('氨基酸')):
        print('Yes')
    else:
        print('No')
    原文作者:Trie树
    原文地址: https://blog.csdn.net/IqqIqqIqqIqq/article/details/54561975
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞