Python筛选EXCEL数据

2024年4月2日 108次阅读来源: 毛维

Python筛选EXCEL数据

Python筛选EXCEL数据

Python筛选EXCEL数据

在实际业务中，我们经常会遇到需要清洗 Excel 数据的场景。本次代码处理的是客户个人基本信息的清洗，校验项包括身份证、性别、国籍、电话、职业、身份证地址、居住地地址、职业描述以及证件起止日期。我们可以根据实际需要对所有项目一并校验，也可以只校验其中的某几项，实际代码如下：

配置文件名：config.ini

[File]
### 填写待处理文件信息
file_name1 = C:\Users\Administrator\Desktop\新建 Microsoft Excel 工作表.xlsx
sheet_name = Sheet1

[9yaosu]
### 请填写待校验数据在数据表中所在列的列名（列字母，例如 A、B、C）；某项数据不需要校验时，将对应的列名留空即可
### 身份证列名
id_position =S
### 性别列名
sex_position =
### 国籍列名
nationality_position =R
### 电话列名
phone_position =L
### 职业列名
job_position =
### 身份证地址列名
id_address_position =O
### 居住地址列名
live_address_position =
### 职业描述列名
job_description_position =W
### 证件起始时间、截止时间列名(两项必须同时填写或同时留空)
certificates_start_time_position =J
certificates_end_time_position =K

python脚本文件：xx.py

# coding=utf-8
import configparser
import os
import re
import time
from xlrd import xldate_as_tuple
from datetime import datetime
import openpyxl
import xlrd
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE


def fix_illegal(s):
    """Remove every match of openpyxl's ``ILLEGAL_CHARACTERS_RE`` from a cell value.

    Args:
        s: A cell value of any type. Only ``str`` values are cleaned.

    Returns:
        The cleaned string when ``s`` is a ``str``; otherwise ``s`` itself,
        unchanged (numbers, ``None``, bytes, ...).
    """
    # An explicit type check replaces the former bare ``except:``, which also
    # swallowed KeyboardInterrupt and any genuine programming error.
    if not isinstance(s, str):
        return s
    return ILLEGAL_CHARACTERS_RE.sub("", s)


# Matches one Chinese character (same range the address and job checks always used).
CHINESE_CHAR_RE = re.compile("[\u4e00-\u9fbb]")

# An address must mention at least one of these to be accepted.
ADDRESS_KEYWORDS = ("省", "市", "区", "县", "镇", "村", "湾", "弯", "巷", "弄",
                    "公司", "厂", "室", "号", "户", "乡", "组")

# Job descriptions that carry no information and are therefore rejected.
PLACEHOLDER_JOB_DESCRIPTIONS = frozenset({"无", "无无", "一般人员", "一般员工", "一贝人员"})

# End dates that mark a certificate as valid long-term.
LONG_TERM_END_DATES = ("20991231", "21991231")

# Accepted distances, in years, between certificate start and end dates.
VALID_PERIOD_YEARS = (5, 10, 20)


def column_index(letters):
    """Convert a spreadsheet column name ("A", "S", "AA", ...) to a 1-based index.

    Args:
        letters: Column name from the configuration; case-insensitive.

    Returns:
        The 1-based column index, or 0 when the name is blank or contains
        anything other than the letters A-Z (0 means "skip this check").
    """
    name = (letters or "").strip().upper()
    if not name or not all("A" <= ch <= "Z" for ch in name):
        return 0
    index = 0
    for ch in name:
        index = index * 26 + (ord(ch) - ord("A") + 1)
    return index


def cell_text(value):
    """Return a cell value as stripped text.

    Numeric cells arrive as floats; a whole-number float such as
    ``13800000000.0`` is rendered without the trailing ``.0`` so that digit
    checks on phone numbers and similar fields still work.
    """
    if value is None:
        return ""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


def check_id_card(id_card):
    """Return True when ``id_card`` is 18 characters: 17 digits plus a digit or "X"."""
    return (len(id_card) == 18
            and id_card[:17].isdecimal()
            and (id_card[17].isdecimal() or id_card[17] == "X"))


def check_sex(sex, id_card=""):
    """Validate the sex field, cross-checking it against the ID number when possible.

    Args:
        sex: Text of the sex cell.
        id_card: Text of the ID cell, or "" when no ID column is configured.

    Returns:
        When the second-to-last character of ``id_card`` is a digit, True only
        if the sex it encodes (odd -> "男", even -> "女") occurs in ``sex``.
        Otherwise True if ``sex`` contains "男" or "女".
    """
    if len(id_card) >= 2 and id_card[-2].isdecimal():
        expected = "女" if int(id_card[-2]) % 2 == 0 else "男"
        return expected in sex
    # The ID is missing or malformed (reported separately), so only require
    # that the sex field itself is meaningful.
    return "男" in sex or "女" in sex


def check_address(address):
    """Return True when ``address`` has more than 9 Chinese characters and an address keyword."""
    if len(CHINESE_CHAR_RE.findall(address)) <= 9:
        return False
    # Test each keyword against the whole address so that multi-character
    # keywords such as "公司" can match.
    return any(keyword in address for keyword in ADDRESS_KEYWORDS)


def check_job_description(text):
    """Return True when ``text`` is a non-placeholder description containing Chinese characters."""
    return (bool(text)
            and text not in PLACEHOLDER_JOB_DESCRIPTIONS
            and CHINESE_CHAR_RE.search(text) is not None)


def normalize_date(value, datemode=0):
    """Normalize a date cell to compact text such as "20200105".

    Args:
        value: Either an Excel serial date (float) or date text that may use
            "-" or "/" separators.
        datemode: Workbook date mode passed through to ``xldate_as_tuple``.

    Returns:
        The date as ``YYYYMMDD`` for serial dates, the text with "-" and "/"
        removed otherwise, or "" when a serial date cannot be converted.
    """
    if value is None:
        return ""
    if isinstance(value, float):
        try:
            # Same compact layout as the text branch, so both kinds of cell
            # can be compared with each other.
            return datetime(*xldate_as_tuple(value, datemode)).strftime("%Y%m%d")
        except ValueError:
            return ""
    return str(value).replace("-", "").replace("/", "").strip()


def check_validity_period(start, end):
    """Check a certificate validity period given as normalized date text.

    Args:
        start: Normalized start date.
        end: Normalized end date.

    Returns:
        False when ``start`` is empty or contains "1899". Otherwise True when
        ``end`` contains a long-term marker, or when the years differ by 5, 10
        or 20 and the text after the year is identical in both dates.
    """
    if not start or "1899" in start:
        return False
    if any(marker in end for marker in LONG_TERM_END_DATES):
        return True
    try:
        years = int(end[:4]) - int(start[:4])
    except ValueError:
        return False
    return years in VALID_PERIOD_YEARS and start[4:] == end[4:]


def main():
    """Split the configured worksheet into a valid-rows and an invalid-rows workbook.

    Reads ``config.ini`` from the working directory, validates every data row
    of the configured sheet with the checks whose column is configured, and
    writes "完整数据.xlsx" (rows passing all checks) and "异常数据.xlsx" (failing
    rows with the failure reasons appended) next to the input file.
    """
    cf = configparser.ConfigParser()
    cf.read("config.ini", encoding="utf-8")
    start_time = time.time()

    # Input workbook and the sheet to process.
    file_name1 = cf.get("File", "file_name1")
    sheet_name = cf.get("File", "sheet_name")

    # Both result files are written next to the input file.
    output_dir = os.path.dirname(file_name1)
    empty_file = os.path.join(output_dir, "异常数据.xlsx")
    full_file = os.path.join(output_dir, "完整数据.xlsx")

    # NOTE(review): xlrd 2.0+ reads only .xls files; opening an .xlsx path
    # needs xlrd < 2.0 - confirm the installed version.
    excel = xlrd.open_workbook(file_name1)
    sheet = excel.sheet_by_name(sheet_name)
    rows = sheet.nrows
    row1 = sheet.row_values(0)

    # Workbook collecting the rows that fail at least one check.
    empty_excel = openpyxl.Workbook()
    empty_sheet = empty_excel.active
    empty_sheet.title = sheet_name
    empty_sheet.append(row1)

    # Workbook collecting the rows that pass every check.
    full_excel = openpyxl.Workbook()
    full_sheet = full_excel.active
    full_sheet.title = sheet_name
    full_sheet.append(row1)

    # 1-based column of every field to validate; 0 disables that check.
    id_position = column_index(cf.get("9yaosu", "id_position"))
    sex_position = column_index(cf.get("9yaosu", "sex_position"))
    nationality_position = column_index(cf.get("9yaosu", "nationality_position"))
    phone_position = column_index(cf.get("9yaosu", "phone_position"))
    job_position = column_index(cf.get("9yaosu", "job_position"))
    id_address_position = column_index(cf.get("9yaosu", "id_address_position"))
    live_address_position = column_index(cf.get("9yaosu", "live_address_position"))
    job_description_position = column_index(cf.get("9yaosu", "job_description_position"))
    certificates_start_time_position = column_index(
        cf.get("9yaosu", "certificates_start_time_position"))
    certificates_end_time_position = column_index(
        cf.get("9yaosu", "certificates_end_time_position"))

    valid_count = 0
    invalid_count = 0

    for i in range(1, rows):
        row_value = sheet.row_values(i)

        # Every check defaults to "passed" so unconfigured checks are skipped.
        sex_ok = nationality_ok = phone_ok = job_ok = True
        id_address_ok = live_address_ok = True
        job_description_ok = id_ok = period_ok = True

        # ID number and sex.
        if id_position > 0:
            id_card = cell_text(row_value[id_position - 1])
            id_ok = check_id_card(id_card)
            if sex_position > 0:
                sex_ok = check_sex(cell_text(row_value[sex_position - 1]), id_card)
        elif sex_position > 0:
            sex_ok = check_sex(cell_text(row_value[sex_position - 1]))

        # Nationality.
        if nationality_position > 0:
            nationality = cell_text(row_value[nationality_position - 1])
            nationality_ok = "中国" in nationality or "CN" in nationality

        # Phone number: all digits, 8 or 11 characters long.
        if phone_position > 0:
            phone = cell_text(row_value[phone_position - 1])
            phone_ok = phone.isdecimal() and len(phone) in (8, 11)

        # Job: must not be blank.
        if job_position > 0:
            job_ok = bool(cell_text(row_value[job_position - 1]))

        # Address on the ID card.
        if id_address_position > 0:
            id_address_ok = check_address(cell_text(row_value[id_address_position - 1]))

        # Residential address.
        if live_address_position > 0:
            live_address_ok = check_address(cell_text(row_value[live_address_position - 1]))

        # Job description.
        if job_description_position > 0:
            job_description_ok = check_job_description(
                cell_text(row_value[job_description_position - 1]))

        # Certificate validity period; needs both date columns.
        if certificates_start_time_position > 0 and certificates_end_time_position > 0:
            certificates_start_time = normalize_date(
                row_value[certificates_start_time_position - 1], excel.datemode)
            certificates_end_time = normalize_date(
                row_value[certificates_end_time_position - 1], excel.datemode)
            # A blank end date is treated as long-term validity.
            period_ok = check_validity_period(
                certificates_start_time, certificates_end_time or "21991231")

        # Failure reasons, in a fixed order.
        checks = (
            (sex_ok, "【性别不合法】"),
            (nationality_ok, "【国籍不合法】"),
            (phone_ok, "【电话不合法】"),
            (job_ok, "【职业不合法】"),
            (id_address_ok and live_address_ok, "【地址不合法】"),
            (job_description_ok, "【职业描述不合法】"),
            (id_ok, "【身份证不合法】"),
            (period_ok, "【证件起始日期不合法】"),
        )
        reason = "".join(message for passed, message in checks if not passed)

        row_value1 = list(map(fix_illegal, row_value))
        if reason:
            empty_sheet.append(row_value1 + [reason])
            invalid_count += 1
        else:
            full_sheet.append(row_value1)
            valid_count += 1

        print("\r", f"处理数据第{i}条....", end="", flush=True)

    # Summary of valid / invalid row counts.
    print("\n")
    print(f"完整数据数量是:{valid_count}")
    print(f"异常数据数量是:{invalid_count}")

    empty_excel.save(empty_file)
    full_excel.save(full_file)

    end_time = time.time()
    print(f"总耗费时间是:{end_time - start_time}s")


if __name__ == '__main__':
    main()

    原文作者：毛维
    原文地址: https://blog.csdn.net/weixin_43046974/article/details/111713025
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。