Python 文件内容读取

2023年1月27日 · 205次阅读 · 来源：过桥0811

背景

计划实现一个文件中心，支持检索常见类型文件（txt、Excel、Word、CSV、图片、PPT、PDF）的内容。

依赖包

#pip install baidu-aip
from aip import AipOcr
#pip install xlrd
import xlrd
import os
#pip install csv23
import csv23
#pip install docx2txt
import docx2txt
#pip install pypiwin32
from win32com import client as wc
#pip install python-pptx
from pptx import Presentation
#pip install wand
from wand.image import Image
# If wand raises an error because ImageMagick is missing, install ImageMagick:
# http://docs.wand-py.org/en/latest/guide/install.html#install-imagemagick-on-windows
# https://imagemagick.org/script/download.php#windows
# If wand raises FailedToExecuteCommand `"gswin32c.exe"`, install Ghostscript:
# http://ghostscript.com/download/gsdnld.html

基础文件读取

.txt

def readTxt(fileUrl):
    """Return the contents of a text file with all spaces removed.

    For every line, trailing whitespace is stripped, only the text before
    the first tab character is kept, and spaces are removed; the resulting
    pieces are concatenated without separators.

    The file is decoded as UTF-8 first and, if that fails, as GBK (the same
    fallback order ``readCsv`` uses), so the result no longer depends on the
    platform's default encoding.

    Args:
        fileUrl: Path of the text file to read.

    Returns:
        The concatenated text, or an empty string if the path does not exist.

    Raises:
        UnicodeDecodeError: If the file is neither valid UTF-8 nor valid GBK.
    """
    if not os.path.exists(fileUrl):
        return ""

    def _collect(encoding):
        # Decode the whole file with one encoding; a failure part-way through
        # discards everything so the caller can restart cleanly.
        with open(fileUrl, 'r', encoding=encoding) as f:
            return "".join(
                line.rstrip().split('\t')[0].replace(' ', '') for line in f
            )

    try:
        return _collect('utf-8')
    except UnicodeDecodeError:
        return _collect('gbk')

.xls .xlsx

def readExcel(fileUrl):
    """Return the text of every cell in an Excel workbook, spaces removed.

    Sheets are visited in the order reported by the workbook and rows from
    top to bottom.  Each cell value is converted with ``str()``, the cells of
    a row are joined without separators, and spaces are removed.

    Args:
        fileUrl: Path of the workbook to read.

    Returns:
        The concatenated cell text, or an empty string if the path does not
        exist.
    """
    if not os.path.exists(fileUrl):
        return ""

    workbook = xlrd.open_workbook(fileUrl)
    pieces = []
    for sheet_name in workbook.sheet_names():
        worksheet = workbook.sheet_by_name(sheet_name)
        for row_index in range(worksheet.nrows):
            row_text = ''.join(str(value) for value in worksheet.row_values(row_index))
            pieces.append(row_text.replace(' ', ''))
    return "".join(pieces)

.docx

def readDocx(fileUrl):
    """Return the text of a .docx file with all whitespace removed.

    The text is extracted with ``docx2txt.process`` and then split on
    whitespace and re-joined, which drops spaces, tabs and line breaks.

    Args:
        fileUrl: Path of the .docx file to read.

    Returns:
        The extracted text, or an empty string if the path does not exist.
    """
    if not os.path.exists(fileUrl):
        return ""

    raw_text = docx2txt.process(fileUrl)
    tokens = raw_text.split()
    return "".join(tokens)

.doc

def readDoc(fileUrl):
    """Return the text of a legacy .doc file.

    Drives a local Word installation through COM (pypiwin32) to save the
    document as a temporary ``<path>.docx`` next to the original, reads that
    file with ``readDocx``, and deletes it afterwards.

    Word is always asked to quit and the temporary file is always removed,
    even when opening, converting or reading fails.

    Args:
        fileUrl: Path of the .doc file to read.

    Returns:
        The text returned by ``readDocx`` for the converted document.
    """
    AbsolutePath = os.path.abspath(fileUrl)
    # Use one absolute path for saving, reading and deleting so all three
    # steps are guaranteed to refer to the same file.
    tempDocx = AbsolutePath + ".docx"

    word = wc.Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(AbsolutePath)
        try:
            # 12 is Word's file-format code for .docx (wdFormatXMLDocument).
            doc.SaveAs(tempDocx, 12, False, "", True, "", False, False, False, False)
        finally:
            doc.Close()
    finally:
        # Never leave a Word process running in the background.
        word.Quit()

    try:
        return readDocx(tempDocx)
    finally:
        if os.path.exists(tempDocx):
            os.remove(tempDocx)

其他文件读取

.csv

def readCsv(fileUrl):
    """Return the text of every field in a CSV file, spaces removed.

    The file is parsed with ``csv23.open_csv`` using its default encoding
    first; if decoding fails it is parsed again from the start as GBK.  The
    fields of each row are joined without separators and spaces are removed.

    Args:
        fileUrl: Path of the CSV file to read.

    Returns:
        The concatenated field text, or an empty string if the path does not
        exist.

    Raises:
        UnicodeDecodeError: If the file cannot be decoded as GBK either.
    """
    if not os.path.exists(fileUrl):
        return ""

    def _collect(**open_kwargs):
        # Build the result for a single decoding attempt.  Nothing is shared
        # between attempts, so rows decoded before a failure are not
        # duplicated by the retry.
        with csv23.open_csv(fileUrl, **open_kwargs) as reader:
            return "".join(''.join(row).replace(' ', '') for row in reader)

    try:
        return _collect()
    except UnicodeDecodeError:
        # Only a decoding problem justifies retrying with another encoding.
        return _collect(encoding='gbk')

图片

def readImage(fileUrl):
    """Return the text recognised in an image by Baidu OCR, spaces removed.

    The image bytes are sent to ``AipOcr.basicGeneral``; the ``words`` value
    of every entry in the response's ``words_result`` list is stripped of
    spaces and concatenated.

    Args:
        fileUrl: Path of the image file to recognise.

    Returns:
        The recognised text, or an empty string if the path does not exist.

    Raises:
        RuntimeError: If the response contains no ``words_result`` entry
            (presumably an API error payload).  Previously this surfaced as
            an opaque ``TypeError`` from iterating ``None``.
    """
    if not os.path.exists(fileUrl):
        return ""

    # Placeholder credentials: replace with the real Baidu AIP application
    # values before use.
    APP_ID = 'xxxxx'
    API_KEY = 'xxxxxxxxxxxxxxxx'
    SECRET_KEY = 'xxxxxxxxxxxxxxxxxxxxxxx'
    client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

    # Read the bytes first so the file is closed before the network call.
    with open(fileUrl, 'rb') as f:
        img = f.read()

    msg = client.basicGeneral(img)
    words_result = msg.get('words_result')
    if words_result is None:
        raise RuntimeError(
            "Baidu OCR returned no words_result for %s: %r" % (fileUrl, msg)
        )

    return "".join(
        (item.get('words') or '').replace(' ', '') for item in words_result
    )

.pptx

def readPptx(fileUrl,extend_table = False,extend_image = False):
    """Return the text of a .pptx presentation with all whitespace removed.

    Shapes that have a text frame always contribute their text.  For shapes
    without a text frame, table cell text and OCR text from pictures are
    included only when the corresponding flag is set.

    Picture bytes are written to a uniquely named temporary file for OCR, so
    no file in the current working directory is overwritten, and that file
    is removed even when OCR fails.

    Args:
        fileUrl: Path of the .pptx file to read.
        extend_table: When true, also include the text of table cells.
        extend_image: When true, also run ``readImage`` on embedded pictures
            and include the recognised text.

    Returns:
        The collected text with every whitespace character removed.
    """
    # Imported locally so this function does not depend on a file-level import.
    import tempfile

    pieces = []
    ppt = Presentation(fileUrl)

    for slide in ppt.slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                pieces.append(shape.text)
                continue

            # Extract text from pictures via OCR.
            if extend_image and hasattr(shape, 'image'):
                image = shape.image
                # Keep the picture's extension on the temporary file name.
                suffix = os.path.splitext(image.filename or "")[1]
                fd, tempPath = tempfile.mkstemp(suffix=suffix)
                try:
                    with os.fdopen(fd, 'wb') as f:
                        f.write(image.blob)
                    pieces.append(readImage(tempPath))
                finally:
                    os.remove(tempPath)

            # Extract table contents.
            if extend_table and shape.has_table:
                for row in shape.table.rows:
                    for cell in row.cells:
                        pieces.append(cell.text)

    return "".join("".join(pieces).split())

.ppt

def readPpt(fileUrl,extend_table = False,extend_image = False):
    """Return the text of a legacy .ppt presentation.

    Drives a local PowerPoint installation through COM (pypiwin32) to save
    the presentation as a temporary ``<path>.pptx`` next to the original,
    reads that file with ``readPptx``, and deletes it afterwards.

    PowerPoint is always asked to quit and the temporary file is always
    removed, even when opening, converting or reading fails.

    Args:
        fileUrl: Path of the .ppt file to read.
        extend_table: Passed through to ``readPptx``.
        extend_image: Passed through to ``readPptx``.

    Returns:
        The text returned by ``readPptx`` for the converted presentation.
    """
    AbsolutePath = os.path.abspath(fileUrl)
    # Use one absolute path for saving, reading and deleting so all three
    # steps are guaranteed to refer to the same file.
    tempPptx = AbsolutePath + ".pptx"

    powerpoint = wc.Dispatch('PowerPoint.Application')
    try:
        ppt = powerpoint.Presentations.Open(AbsolutePath)
        ppt.SaveAs(tempPptx)
    finally:
        # Never leave a PowerPoint process running in the background.
        powerpoint.Quit()

    try:
        return readPptx(tempPptx,extend_table,extend_image)
    finally:
        if os.path.exists(tempPptx):
            os.remove(tempPptx)

.pdf

def readPdf(fileUrl):
    """Return the OCR text of every page of a PDF file.

    The PDF is rendered with wand at 300 dpi and converted to JPEG.  Each
    page is written in turn to a temporary ``<fileUrl>.jpg`` file, passed to
    ``readImage``, and the file is removed again.

    All wand image objects are closed when no longer needed, and the
    temporary JPEG is removed even when OCR fails.

    Args:
        fileUrl: Path of the PDF file to read.

    Returns:
        The recognised text of all pages, concatenated in page order.
    """
    pieces = []
    tempJpg = fileUrl + '.jpg'

    with Image(filename=fileUrl, resolution=300) as image_pdf:
        with image_pdf.convert('jpg') as image_jpeg:
            # wand exposes each PDF page as one entry of ``sequence``.
            for page in image_jpeg.sequence:
                with Image(image=page) as img_page:
                    blob = img_page.make_blob('jpg')
                try:
                    with open(tempJpg, 'wb') as ff:
                        ff.write(blob)
                    # Run OCR on the rendered page.
                    pieces.append(readImage(tempJpg))
                finally:
                    if os.path.exists(tempJpg):
                        os.remove(tempJpg)

    return "".join(pieces)

    原文作者：过桥0811
    原文地址: https://www.jianshu.com/p/056e94ca301e
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。