Detecting the similarity of the files in a folder and writing the results to a spreadsheet

First, import the required modules:

import jieba
import docx
import os
import re
import numpy as np
import xlwt
from gensim import corpora, models, similarities
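These scripts rely on a handful of third-party packages. Assuming a standard Python environment, they can be installed with pip (note that the docx module is provided by the python-docx package):

pip install jieba python-docx gensim xlwt numpy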

Process the set of target files:

path="C:/Users/尚鑫/Desktop/sx1"  #目标文件目录
files=os.listdir(path)  #遍历目录下子文件名
print(files)
texts=[]
for file in files:
    f = docx.Document(path+'/'+ file)  #读取文件夹里所有子文件
    text = ''
    for para in f.paragraphs:
        text += para.text      #text为每个子文件的文件内容
    texts.append(text)   #将每个子文件的内容作为一个元素组合成列表

Tokenize the content of each file:

a=[]
for text in texts:
    b=[word for word in jieba.cut(text)]  # split one document into a list of words
    a.append(b)
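As a quick illustration of what the tokenizer produces (using jieba's own example sentence, not one of the target documents):

import jieba
# jieba.cut returns a generator of tokens; wrap it in list() to inspect it
print(list(jieba.cut('我来到北京清华大学')))
# Typical output: ['我', '来到', '北京', '清华大学']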

The test file gets the same treatment. Here the test documents are the target documents themselves: the loop below picks each target document in turn as the test document, so every file ends up compared against every other.

ff=[]   # will hold one similarity vector per test document
for i in range(len(a)):
    test_list=a[i]
    # the steps below run inside this loop; each similarity vector
    # sim is appended to ff (see the full program at the end)

Build a dictionary from the tokenized texts above and use it to construct the corpus:

dictionary=corpora.Dictionary(a)  # build the vocabulary (bag of words)
dictionary.keys()  # the ids assigned to every word (inspection only)
dictionary.token2id   # the token-to-id mapping (inspection only)
corpus=[dictionary.doc2bow(doc) for doc in a]  # build the corpus with doc2bow

The corpus is a list of vectors: each element is the bag-of-words vector of one tokenized target document, i.e. a list of (token id, count) pairs. The test document is converted into the same kind of vector:

doc_test_vec = dictionary.doc2bow(test_list)
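To see what these bag-of-words vectors look like, here is a minimal sketch with two made-up token lists (hypothetical data, not the target documents; the exact ids may differ):

from gensim import corpora
docs = [['苹果', '香蕉', '苹果'], ['香蕉', '橙子']]
d = corpora.Dictionary(docs)
print(d.token2id)          # e.g. {'苹果': 0, '香蕉': 1, '橙子': 2}
print(d.doc2bow(docs[0]))  # e.g. [(0, 2), (1, 1)] -- (token id, count) pairs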

Similarity analysis:

tfidf = models.TfidfModel(corpus)   # fit a TF-IDF model on the corpus
tfidf[doc_test_vec]   # TF-IDF weight of every word in the test document (inspection only)
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary.keys()))  # build a similarity index over the target documents
sim = index[tfidf[doc_test_vec]]
for i in range(len(sim)):
    print('Similarity to file', i+1, ':', sim[i])  # similarities in directory order
print(sorted(enumerate(sim), key=lambda item: -item[1]))  # (index, similarity) pairs, most similar first
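Under the hood, the index scores each pair as the cosine similarity of the TF-IDF vectors. A minimal sketch that checks one score by hand, reusing corpus, tfidf and doc_test_vec from above (the results should match sim up to floating-point noise):

import math

def cosine(v1, v2):
    # v1, v2 are sparse (id, weight) lists as returned by tfidf[...]
    d1, d2 = dict(v1), dict(v2)
    dot = sum(w * d2.get(i, 0.0) for i, w in d1.items())
    n1 = math.sqrt(sum(w * w for w in d1.values()))
    n2 = math.sqrt(sum(w * w for w in d2.values()))
    return dot / (n1 * n2) if n1 and n2 else 0.0

# should be close to sim[0] from the index above
print(cosine(tfidf[doc_test_vec], tfidf[corpus[0]]))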

Finally, write the results to a spreadsheet:

xls=xlwt.Workbook()
sht1=xls.add_sheet('sheet1')
file_studentname=[]
for studentname in files:
    studentnames=re.sub(r"\.docx$","",studentname)  # strip the .docx extension
    file_studentname.append(studentnames)
for k,filename in zip(np.arange(len(files)),file_studentname):
    sht1.write(int(k+1),0,filename)   # row labels
    sht1.write(0,int(k+1),filename)   # column labels
for i in range(len(ff)):
    h=ff[i].tolist()  # convert the numpy similarity vector to a plain list
    for j in range(len(h)):
        sht1.write(j+1,i+1,float(h[j]))
xls.save('C:/Users/尚鑫/Desktop/相似度2.xls')
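Two notes on this step. xlwt writes legacy .xls workbooks, which are capped at 65536 rows and 256 columns, more than enough here. And instead of the regex, os.path.splitext strips any extension without pattern-matching pitfalls; an equivalent sketch:

import os
# same result as the re.sub call above, but works for any extension
file_studentname = [os.path.splitext(name)[0] for name in files]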

That is the whole program. Here is the complete code:

import jieba
import docx
import os
import numpy as np
import re
import xlwt
from gensim import corpora, models, similarities


path="C:/Users/尚鑫/Desktop/sx1"
files=os.listdir(path)
print(files)
texts=[]
for file in files:
    f = docx.Document(path+'/'+ file)
    text = ''
    for para in f.paragraphs:
        text += para.text
    texts.append(text)
a=[]
for text in texts:
    b=[word for word in jieba.cut(text)]
    a.append(b)
ff=[]
for i in range(len(a)):
     test_list=a[i]
     dictionary=corpora.Dictionary(a)
     # dictionary.keys()
     # dictionary.token2id
     corpus=[dictionary.doc2bow(doc) for doc in a]
     doc_test_vec = dictionary.doc2bow(test_list)
     tfidf = models.TfidfModel(corpus)
     tfidf[doc_test_vec]
     index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary.keys()))
     sim = index[tfidf[doc_test_vec]]
     for j in range(len(sim)):
         
          print('与第',i,'个文件对比:','第', j, '文件的相似度为:', sim[j])
     print(sorted(enumerate(sim), key=lambda item: -item[1]))
      
     ff.append(sim)
print(len(ff))
xls=xlwt.Workbook()
sht1=xls.add_sheet('sheet1')
file_studentname=[]
for studentname in files:
    studentnames=re.sub(".docx","",studentname)
    file_studentname.append(studentnames)
# for i in range(len(ff)):
for k,filename in zip(np.arange(len(files)),file_studentname):
    sht1.write(int(k+1),0,filename)
    sht1.write(0,int(k+1) , filename)
for i in range(len(ff)):
    for j in range(len(ff[i])):
        h=ff[i].tolist()
        sht1.write(j+1,i+1,float(h[j]))
xls.save('C:/Users/尚鑫/Desktop/相似度2.xls')

Original author: xin1996_
Original article: https://blog.csdn.net/xin1996_/article/details/84234715