import docx
import os
from win32com import client as wc
import pandas as pd
# Directory that is scanned for Word documents: the current working directory.
word_paths = os.getcwd()
# Convert .doc to .docx through Word itself; without this step the
# corresponding .docx cannot be opened.
def convertdoc_docx(path):
    """Convert every ``.doc`` file directly inside *path* to ``.docx``.

    Each document is opened through the Word COM automation object and saved
    next to the original with only the extension swapped to ``.docx``.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        None. Progress is printed to stdout.
    """
    # Match on the real extension (case-insensitively) so that names which
    # merely end in the letters "doc" are not picked up.
    doc_list = [
        os.path.abspath(os.path.join(path, name))
        for name in os.listdir(path)
        if os.path.splitext(name)[1].lower() == '.doc'
    ]
    if not doc_list:
        # Nothing to convert: do not start Word at all.
        return

    word = wc.Dispatch('Word.Application')
    try:
        for doc_path in doc_list:
            print(doc_path)
            # Swap only the extension; a plain str.replace('doc', 'docx')
            # would also rewrite "doc" inside directory or file names.
            save_path = os.path.splitext(doc_path)[0] + '.docx'
            doc = word.Documents.Open(doc_path)
            try:
                # 12 is Word's wdFormatXMLDocument (.docx) save format.
                doc.SaveAs(save_path, 12, False, "", True,
                           "", False, False, False, False)
            finally:
                # Release the document even if saving failed.
                doc.Close()
            print('{} Save sucessfully '.format(save_path))
    finally:
        # Always shut the Word instance down, even if a conversion failed.
        word.Quit()
def docx2dataframe(filepath) -> pd.DataFrame:
    """Read a .docx file and return its first readable table as a DataFrame.

    Tables are tried in document order and the first one that can be read is
    returned: one DataFrame row per table row, every cell holding the cell's
    text as a string ('' for empty cells). No header row is inferred.

    Args:
        filepath: Path of the .docx file to read.

    Returns:
        The table contents, or an empty DataFrame when the document contains
        no table that could be read.
    """
    doc = docx.Document(filepath)
    for table_no, table in enumerate(doc.tables):
        try:
            rows = [[cell.text for cell in row.cells] for row in table.rows]
            width = max([len(table.columns)] + [len(cells) for cells in rows])
        except Exception as exc:
            # Best effort: failing tables are expected to be rare and can be
            # added by hand, so report the problem and try the next table.
            print('{}: table {} skipped ({!r})'.format(filepath, table_no, exc))
            continue
        # Pad short rows so irregular tables still form a rectangular grid
        # instead of being discarded.
        padded = [cells + [''] * (width - len(cells)) for cells in rows]
        return pd.DataFrame(padded)
    # No table at all, or none readable: keep the declared return type.
    return pd.DataFrame()
if __name__ == "__main__":
    # Convert every .doc file in the working directory to .docx first.
    convertdoc_docx(word_paths)
    # Paths of all .docx files to export.
    docx_list = [os.path.join(word_paths, name)
                 for name in os.listdir(word_paths)
                 if str(name).endswith('.docx')]
    if docx_list:
        # The context manager writes and closes the workbook exactly once,
        # also when an error occurs; the previous per-iteration
        # ExcelWriter.save() call is not available in pandas 2.x.
        with pd.ExcelWriter('target.xlsx') as excel_writer:
            for index, docx_file in enumerate(docx_list):
                frame = docx2dataframe(docx_file)
                if frame is None:
                    # No usable table: write an empty sheet so that sheet
                    # numbers keep matching the file order.
                    frame = pd.DataFrame()
                frame.to_excel(excel_writer=excel_writer,
                               sheet_name=f'sheet{index}', index=False)
                print('\r' + str(index), end='')  # show progress
    else:
        # Avoid creating a workbook that would contain no sheets.
        print('No .docx files found in {}'.format(word_paths))
# 批量复制提取Word中所有的表格到Excel(Python办公自动化)
# 原文作者:Honour Van
# 原文地址: https://blog.csdn.net/weixin_45502929/article/details/121855657
# 本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
# 原文地址: https://blog.csdn.net/weixin_45502929/article/details/121855657
# 本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。