First, import the required modules (numpy, re, and xlwt are also needed for the spreadsheet step later, so they are included here):
import jieba
import docx
import os
import numpy as np
import re
import xlwt
from gensim import corpora, models, similarities
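If any of these are missing, they can be installed from PyPI under the names jieba, python-docx, gensim, numpy, and xlwt.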
Process the set of target files:
path = "C:/Users/尚鑫/Desktop/sx1"  # directory holding the target files
files = os.listdir(path)  # list the file names under the directory
print(files)
texts = []
for file in files:
    f = docx.Document(path + '/' + file)  # open each .docx file in the folder
    text = ''
    for para in f.paragraphs:
        text += para.text  # text accumulates the full content of one file
    texts.append(text)  # collect each file's content as one element of the list
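One caveat (an assumption about the folder, not part of the original): os.listdir returns every entry in the directory, so docx.Document will fail if anything other than a .docx file is present. A minimal guard:
files = [file for file in files if file.endswith('.docx')]  # hypothetical filter: keep only .docx files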
Tokenize the file contents with jieba:
a = []
for text in texts:
    b = [word for word in jieba.cut(text)]
    a.append(b)
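As a minimal illustration of what jieba.cut yields (the sentence is an invented example, not one of the target files):
words = list(jieba.cut('我来到北京清华大学'))  # hypothetical input sentence
print(words)  # expected output along the lines of ['我', '来到', '北京', '清华大学']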
Similarly, the test documents get the same treatment. Here the test documents are the target documents themselves: we loop over them, taking each one in turn as the test document (converting the test document to a vector and computing its similarities, shown below, happens inside this loop):
ff = []
for i in range(len(a)):
    test_list = a[i]
Build a corpus from the token lists obtained above, using a gensim dictionary (this depends only on the target documents, so it needs to be built just once, outside the loop):
dictionary = corpora.Dictionary(a)  # build the vocabulary (bag of words)
dictionary.keys()  # the integer ids assigned to every word in the vocabulary
dictionary.token2id  # the word -> id mapping
corpus = [dictionary.doc2bow(doc) for doc in a]  # build the corpus with doc2bow
The corpus is a list of vectors: each element is the bag-of-words vector of one tokenized target document, i.e. a list of (token_id, count) pairs. We convert the test document into the same representation:
doc_test_vec = dictionary.doc2bow(test_list)
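To make the shape of these vectors concrete, a minimal sketch with invented tokens (not the real data):
d = corpora.Dictionary([['苹果', '香蕉'], ['香蕉', '橙子']])  # hypothetical two-document vocabulary
print(d.doc2bow(['苹果', '香蕉', '香蕉']))  # (token_id, count) pairs, e.g. [(0, 1), (1, 2)]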
Similarity analysis (note the inner loop variable is j, so it does not shadow the outer loop's i; ff.append collects each result for the spreadsheet step):
tfidf = models.TfidfModel(corpus)  # fit a TF-IDF model on the corpus
tfidf[doc_test_vec]  # the TF-IDF weight of each word in the test document
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary.keys()))  # similarity index over the target documents
sim = index[tfidf[doc_test_vec]]  # similarity of the test document to every target document
for j in range(len(sim)):
    print('Similarity to file', j + 1, ':', sim[j])  # output similarities in traversal order
print(sorted(enumerate(sim), key=lambda item: -item[1]))  # output (index, similarity) pairs sorted from high to low
ff.append(sim)  # collect this test document's similarity vector
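A small follow-up sketch (not in the original): because each test document is itself one of the targets, the top-ranked entry is that document with similarity close to 1.0, so the most similar other document is the second entry of the sorted list:
ranked = sorted(enumerate(sim), key=lambda item: -item[1])
best_other = ranked[1]  # ranked[0] is the test document matched against itself
print('Most similar other file:', best_other[0], 'with similarity', best_other[1])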
Next, we automatically write the collected data into a spreadsheet:
xls = xlwt.Workbook()
sht1 = xls.add_sheet('sheet1')
file_studentname = []
for studentname in files:
    studentnames = re.sub(r"\.docx$", "", studentname)  # strip the trailing .docx with a regex
    file_studentname.append(studentnames)
for k, filename in zip(np.arange(len(files)), file_studentname):
    sht1.write(int(k + 1), 0, filename)  # row headers: one file name per row
    sht1.write(0, int(k + 1), filename)  # column headers: one file name per column
for i in range(len(ff)):
    h = ff[i].tolist()  # convert the numpy array into a plain list
    for j in range(len(h)):
        sht1.write(j + 1, i + 1, float(h[j]))
xls.save('C:/Users/尚鑫/Desktop/相似度2.xls')
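The resulting sheet is a square matrix: the cell at row j, column i holds the similarity between file j and test file i, and the diagonal entries are close to 1.0 because each document is compared with itself.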
That completes the program. The full code is attached below:
import jieba
import docx
import os
import numpy as np
import re
import xlwt
from gensim import corpora, models, similarities

path = "C:/Users/尚鑫/Desktop/sx1"
files = os.listdir(path)
print(files)

# Read every .docx file under the target directory
texts = []
for file in files:
    f = docx.Document(path + '/' + file)
    text = ''
    for para in f.paragraphs:
        text += para.text
    texts.append(text)

# Tokenize each file's content with jieba
a = []
for text in texts:
    b = [word for word in jieba.cut(text)]
    a.append(b)

# Build the dictionary, corpus, TF-IDF model and similarity index once:
# they depend only on the target documents, not on the current test document
dictionary = corpora.Dictionary(a)
corpus = [dictionary.doc2bow(doc) for doc in a]
tfidf = models.TfidfModel(corpus)
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary.keys()))

# Take each target document in turn as the test document
ff = []
for i in range(len(a)):
    test_list = a[i]
    doc_test_vec = dictionary.doc2bow(test_list)
    sim = index[tfidf[doc_test_vec]]
    for j in range(len(sim)):
        print('Against file', i, ': similarity to file', j, 'is', sim[j])
    print(sorted(enumerate(sim), key=lambda item: -item[1]))
    ff.append(sim)
print(len(ff))

# Write the similarity matrix to a spreadsheet
xls = xlwt.Workbook()
sht1 = xls.add_sheet('sheet1')
file_studentname = []
for studentname in files:
    studentnames = re.sub(r"\.docx$", "", studentname)
    file_studentname.append(studentnames)
for k, filename in zip(np.arange(len(files)), file_studentname):
    sht1.write(int(k + 1), 0, filename)
    sht1.write(0, int(k + 1), filename)
for i in range(len(ff)):
    h = ff[i].tolist()
    for j in range(len(h)):
        sht1.write(j + 1, i + 1, float(h[j]))
xls.save('C:/Users/尚鑫/Desktop/相似度2.xls')