python将word文件批量转成pdf

2024年5月19日 165次阅读来源: Valerie2020

遍历目录下的文件

调用glob
遍历指定目录下的所有文件和文件夹，默认不递归遍历；如需递归，可以手动实现，也可以使用 `**` 通配符并传入 `recursive=True`（Python 3.5 及以上）。

import glob as gb
# Non-recursive listing: the trailing "*" pattern is expanded by glob and
# each resulting path is printed on its own line.
paths = gb.glob('D:\\文件夹\\*')
for entry in paths:
    print(entry)

调用os.walk
遍历指定目录下的所有文件和文件夹，递归遍历，功能强大，推荐使用。

import os
# Recursive listing: os.walk yields one (directory, subdirectories, files)
# triple per directory; print each file's joined path followed by its name.
for folder, _subdirs, names in os.walk('d:\\2\\'):
    for name in names:
        print(os.path.join(folder, name), name)

DIY
遍历指定目录下的所有文件和文件夹，递归遍历，自主编写，扩展性强，可以学习练手。

import os;  
# Accumulator filled in place by DirAll(); callers read the result from here.
files = []


def DirAll(pathName):
    """Recursively collect the paths of all files below ``pathName``.

    Every regular entry found is appended to the module-level ``files``
    list. Entries named ``$RECYCLE.BIN`` or ``System Volume Information``
    are skipped at every level.

    Args:
        pathName: Directory to scan. A path that does not exist, or that
            is not a directory, is ignored and nothing is appended.

    Returns:
        None. Results are accumulated in ``files``.
    """
    # isdir (rather than exists) so that a regular-file path is ignored
    # instead of making os.listdir raise NotADirectoryError.
    if not os.path.isdir(pathName):
        return
    for entry in os.listdir(pathName):
        if entry in ("$RECYCLE.BIN", "System Volume Information"):
            continue
        full = os.path.join(pathName, entry)
        if os.path.isdir(full):
            DirAll(full)
            continue
        # Re-join the split path so that redundant trailing separators in
        # the parent part do not end up in the stored path.
        parent, name = os.path.split(full)
        if parent.endswith(os.sep):
            files.append(parent + name)
        else:
            files.append(parent + os.sep + name)

# Fill the module-level ``files`` list, then print one collected path per line.
DirAll("D:\\2\\")
for collected in files:
    print(collected)
    # Python 2 variant, kept for reference:
    # print collected.decode('gbk').encode('utf-8')

Word转换为PDF的库

Python中针对Word转换为PDF的库有：

仅能在Windows上运行
- win32com：通过 Windows COM 组件（win32com）调用 Word 服务（Word.Application），实现 Word 到 PDF 文件的转换。因此，该 Python 程序必须在安装了 Word（可能至少需要 2007 版本）的 Windows 机器上运行。

from win32com.client import Dispatch
from os import walk

wdFormatPDF = 17


def doc2pdf(input_file):
    """Convert one Word document to a PDF stored next to it.

    The PDF gets the same directory and base name as ``input_file`` with
    the extension replaced by ``.pdf``. The conversion is delegated to a
    ``Word.Application`` COM object.

    Args:
        input_file: Path of the ``.doc`` or ``.docx`` file to convert.
            NOTE(review): Word presumably needs an absolute path here;
            confirm against the caller.
    """
    import os

    # splitext swaps only the real extension. The previous
    # replace(".docx", ".pdf") left ".doc" names untouched, so the PDF was
    # written over the source file, and it also rewrote ".docx" anywhere
    # else in the path.
    pdf_path = os.path.splitext(input_file)[0] + ".pdf"

    word = Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(input_file)
        try:
            doc.SaveAs(pdf_path, FileFormat=wdFormatPDF)
        finally:
            doc.Close()
    finally:
        # Always quit so a failed conversion does not leave Word running.
        word.Quit()


if __name__ == "__main__":
    doc_files = []
    directory = "C:\\Users\\xkw\\Desktop\\destData"
    # Walk the whole tree and hand every .doc/.docx file to doc2pdf.
    for folder, _subdirs, names in walk(directory):
        for name in names:
            if not name.endswith((".doc", ".docx")):
                continue
            doc2pdf(f"{folder}\\{name}")

comtypes

import os
import sys
import re

import comtypes.client

wdFormatPDF = 17

def covx_to_pdf(infile, outfile):
    """Convert a Word document to PDF through a Word.Application COM object.

    Args:
        infile: Path of the Word document to open.
        outfile: Path of the PDF file to write.
    """
    print('making:',outfile)
    word = comtypes.client.CreateObject('Word.Application')
    try:
        doc = word.Documents.Open(infile)
        try:
            # Pass the format explicitly; without FileFormat the document
            # is saved in Word's default format regardless of the name.
            doc.SaveAs(outfile, FileFormat=wdFormatPDF)
        finally:
            doc.Close()
    finally:
        # Always quit so a failed conversion does not leave Word running.
        word.Quit()

# Number of documents handed to covx_to_pdf.
total = 0

# Walk the current directory and convert every Word document whose file name
# contains a town ("...镇" / "...乡") followed by a village ("...村") into
# output/<town>/<village>.pdf below the directory the document was found in.
for root, dirs, files in os.walk('.'):
    for filespath in files:
        # Only Word documents are valid inputs. The previous check accepted
        # names ending in "pdf" and wrote the result with a ".doc" suffix.
        if not filespath.lower().endswith(('.doc', '.docx')):
            continue
        p = os.path.abspath(os.path.join(root, filespath))
        zhen = re.search(r'(\w+镇)|(\w+乡)', filespath)
        cun = re.search(r'((\w+镇)|(\w+乡))(\w+村)', filespath)
        if not (zhen and cun):
            continue
        print(p, zhen.groups(), cun.groups())
        zhen_name = zhen.group(1) or zhen.group(2)
        cun_name = cun.group(4)
        outp = os.path.abspath(
            os.path.join(root, 'output', zhen_name, cun_name + '.pdf')
        )
        # Create the target folder only when a file is actually converted.
        os.makedirs(os.path.dirname(outp), exist_ok=True)
        total += 1
        covx_to_pdf(p, outp)

print('共生成',total)

mac用户
- docx2pdf：专门用于 word 转 pdf，Windows 和 macOS 均可使用（需要本机已安装 Microsoft Word）

pip install docx2pdf
# Convert a single file: the first argument is the input .docx path, the
# second the output .pdf path.
from docx2pdf import convert
convert("input.docx", "output.pdf")

#查找当前目录下的全部word文件
import os
import glob
from pathlib import Path

path = os.getcwd() + '/'
p = Path(path)  # search root: the current working directory
# "**/*.docx" matches .docx files in this directory and all subdirectories.
FileList = list(p.glob("**/*.docx"))

# Convert every document found, writing each PDF next to its source.
for file in FileList:
    # with_suffix swaps the extension ("a.docx" -> "a.pdf"); appending
    # ".pdf" to the full name produced "a.docx.pdf".
    convert(file, str(file.with_suffix(".pdf")))

参考链接
https://www.zhihu.com/people/valerie-98-45
https://blog.csdn.net/san1156/article/details/77885995
https://blog.csdn.net/kewei168/article/details/84574301
https://www.cnblogs.com/xiangnan/p/7040093.html

    原文作者：Valerie2020
    原文地址: https://blog.csdn.net/qq_42771083/article/details/107387640
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。