Python筛选EXCEL数据
Python筛选EXCEL数据
在实际业务中,我们经常会遇到需要清洗 Excel 数据的场景。本文的代码用于清洗客户个人基本信息,校验项包括身份证、性别、国籍、电话、职业、身份证地址、居住地地址、职业描述以及证件起止日期。可以根据实际需要对所有字段一并校验,也可以只校验其中某几项。实际代码如下:
配置文件名:config.ini
[File]
### 填写待处理文件信息
file_name1 = C:\Users\Administrator\Desktop\新建 Microsoft Excel 工作表.xlsx
sheet_name = Sheet1
[9yaosu]
### 请填写需要校验的数据在工作表中的列标(例如 A、B、C);某一项不需要校验时,等号后面留空即可
### 身份证列名
id_position =S
### 性别列名
sex_position =
### 国籍列名
nationality_position =R
### 电话列名
phone_position =L
### 职业列名
job_position =
### 身份证地址列名
id_address_position =O
### 居住地址列名
live_address_position =
### 职业描述列名
job_description_position =W
### 证件起始时间、截止时间列名(两项必须同时填写,或者同时留空)
certificates_start_time_position =J
certificates_end_time_position =K
python脚本文件:xx.py
# coding=utf-8
import configparser
import os
import re
import time
from xlrd import xldate_as_tuple
from datetime import datetime
import openpyxl
import xlrd
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
def fix_illegal(s):
    """Remove every match of openpyxl's ``ILLEGAL_CHARACTERS_RE`` from a cell value.

    Args:
        s: A cell value. Only ``str`` values are cleaned.

    Returns:
        ``s`` with the matched characters removed when it is a string;
        otherwise ``s`` itself, unchanged.
    """
    # Non-string cells (numbers, booleans, ...) cannot be fed to a str regex.
    # Returning them untouched matches what the former bare ``except`` did,
    # without also swallowing unrelated errors such as KeyboardInterrupt.
    if not isinstance(s, str):
        return s
    return ILLEGAL_CHARACTERS_RE.sub(r"", s)
# Config section holding the column letters of the fields to validate.
CONFIG_SECTION = "9yaosu"

# Option names read from CONFIG_SECTION; each maps to one column of the sheet.
POSITION_OPTIONS = (
    "id_position",
    "sex_position",
    "nationality_position",
    "phone_position",
    "job_position",
    "id_address_position",
    "live_address_position",
    "job_description_position",
    "certificates_start_time_position",
    "certificates_end_time_position",
)

# Character range counted by the address and job-description checks.
CJK_CHAR_RE = re.compile("[\u4e00-\u9fbb]")

# An address must contain at least one of these markers to be accepted.
ADDRESS_KEYWORDS = (
    "省", "市", "区", "县", "镇", "村", "湾", "弯", "巷", "弄",
    "公司", "厂", "室", "号", "户", "乡", "组",
)

# Job descriptions that carry no information and are therefore rejected.
JOB_DESCRIPTION_PLACEHOLDERS = frozenset(
    {"无", "无无", "一般人员", "一般员工", "一贝人员"}
)

# End dates accepted regardless of the start date.
LONG_TERM_END_DATES = ("20991231", "21991231")

# End date substituted when the end-date cell is empty.
DEFAULT_END_DATE = "21991231"

# Accepted differences, in years, between the start and end dates.
VALIDITY_YEARS = (5, 10, 20)


def column_to_index(letters):
    """Convert an Excel column label to a 1-based column index.

    Args:
        letters: Column label such as ``"S"`` or ``"AA"`` (case-insensitive).

    Returns:
        The 1-based index, or 0 when the label is empty or not made of the
        letters A-Z. 0 means "this field is not configured".
    """
    label = letters.strip().upper()
    if not label or not all("A" <= ch <= "Z" for ch in label):
        return 0
    index = 0
    for ch in label:
        index = index * 26 + (ord(ch) - ord("A") + 1)
    return index


def cell_at(row_value, position):
    """Return the cell at 1-based ``position``, or ``""`` when out of range."""
    if 0 < position <= len(row_value):
        return row_value[position - 1]
    return ""


def cell_text(value):
    """Return a cell value as stripped text.

    Numeric cells arrive as floats; a whole-number float such as
    ``13800000000.0`` is rendered without the trailing ``.0`` so that digit
    checks (phone number, ID number) still work.
    """
    if value is None:
        return ""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


def check_id_card(id_card):
    """Return True when ``id_card`` is 17 digits followed by a digit or ``X``."""
    return (
        len(id_card) == 18
        and id_card[:-1].isdecimal()
        and (id_card[-1].isdecimal() or id_card[-1] == "X")
    )


def check_sex(sex, id_card=""):
    """Validate the sex field, cross-checking it against the ID when possible.

    When the second-to-last character of ``id_card`` is a digit, an even digit
    requires "女" and an odd digit requires "男" to appear in ``sex``.
    Otherwise (no ID, or an ID too short / malformed to derive a sex from)
    ``sex`` only has to contain "男" or "女".
    """
    if len(id_card) >= 2 and id_card[-2].isdecimal():
        expected = "女" if int(id_card[-2]) % 2 == 0 else "男"
        return expected in sex
    return "男" in sex or "女" in sex


def check_nationality(nationality):
    """Return True when the nationality mentions "中国" or "CN"."""
    return "中国" in nationality or "CN" in nationality


def check_phone(phone):
    """Return True when ``phone`` is all digits and 8 or 11 characters long."""
    return phone.isdecimal() and len(phone) in (8, 11)


def check_address(address):
    """Return True when ``address`` looks like a real address.

    It must contain more than 9 characters in the ``CJK_CHAR_RE`` range and
    at least one entry of ``ADDRESS_KEYWORDS``. Keywords are matched as
    substrings so that multi-character entries such as "公司" can match.
    """
    if len(CJK_CHAR_RE.findall(address)) <= 9:
        return False
    return any(keyword in address for keyword in ADDRESS_KEYWORDS)


def check_job_description(job_description):
    """Return True when the job description is informative.

    It must be non-empty, must not be one of
    ``JOB_DESCRIPTION_PLACEHOLDERS`` and must contain at least one character
    in the ``CJK_CHAR_RE`` range.
    """
    if not job_description or job_description in JOB_DESCRIPTION_PLACEHOLDERS:
        return False
    return CJK_CHAR_RE.search(job_description) is not None


def normalize_date(value, datemode=0):
    """Return a date cell as compact ``YYYYMMDD`` text.

    Args:
        value: A float Excel serial date, or text such as ``"2020-01-01"``
            or ``"2020/01/01"``.
        datemode: Workbook date mode passed to ``xldate_as_tuple``.

    Returns:
        The date without separators. A float that cannot be converted to a
        date yields ``""`` so that the caller reports the row as invalid.
    """
    if isinstance(value, float):
        try:
            return datetime(*xldate_as_tuple(value, datemode)).strftime("%Y%m%d")
        except ValueError:
            # Raised for serials that do not describe a real calendar date.
            return ""
    return str(value).replace("-", "").replace("/", "").strip()


def check_certificate_period(start, end):
    """Validate a certificate validity period given as ``YYYYMMDD`` strings.

    The start must be non-empty and must not contain "1899". The end must
    either contain one of ``LONG_TERM_END_DATES`` or lie exactly 5, 10 or 20
    years after the start with an identical month-day part.
    """
    if not start or "1899" in start:
        return False
    if any(marker in end for marker in LONG_TERM_END_DATES):
        return True
    try:
        start_year = int(start[:4])
        end_year = int(end[:4])
    except ValueError:
        return False
    return end_year - start_year in VALIDITY_YEARS and start[4:] == end[4:]


def validate_row(row_value, positions, datemode=0):
    """Run every configured check on one data row.

    Args:
        row_value: List of cell values for the row.
        positions: Mapping from option name (see ``POSITION_OPTIONS``) to the
            1-based column index; 0 or a missing key disables that check.
        datemode: Workbook date mode used to decode float date cells.

    Returns:
        The concatenated failure reasons, or ``""`` when the row passes.
    """
    def configured(option):
        return positions.get(option, 0) > 0

    def text(option):
        return cell_text(cell_at(row_value, positions[option]))

    reasons = []
    id_card = text("id_position") if configured("id_position") else ""

    if configured("sex_position") and not check_sex(text("sex_position"), id_card):
        reasons.append("【性别不合法】")
    if configured("nationality_position") and not check_nationality(
            text("nationality_position")):
        reasons.append("【国籍不合法】")
    if configured("phone_position") and not check_phone(text("phone_position")):
        reasons.append("【电话不合法】")
    if configured("job_position") and not text("job_position"):
        reasons.append("【职业不合法】")

    # Both address columns share a single failure reason.
    address_ok = all(
        check_address(text(option))
        for option in ("id_address_position", "live_address_position")
        if configured(option)
    )
    if not address_ok:
        reasons.append("【地址不合法】")

    if configured("job_description_position") and not check_job_description(
            text("job_description_position")):
        reasons.append("【职业描述不合法】")
    if configured("id_position") and not check_id_card(id_card):
        reasons.append("【身份证不合法】")

    # The period is only checked when both the start and end columns are set.
    if (configured("certificates_start_time_position")
            and configured("certificates_end_time_position")):
        raw_start = cell_at(row_value, positions["certificates_start_time_position"])
        raw_end = cell_at(row_value, positions["certificates_end_time_position"])
        start = normalize_date(raw_start, datemode)
        # An empty end-date cell is treated as a long-term certificate.
        end = normalize_date(raw_end, datemode) if raw_end else DEFAULT_END_DATE
        if not check_certificate_period(start, end):
            reasons.append("【证件起始日期不合法】")

    return "".join(reasons)


def main():
    """Split the configured sheet into a clean workbook and an abnormal one.

    Reads ``config.ini`` from the working directory, validates every data row
    of the configured sheet, and writes passing rows to ``完整数据.xlsx`` and
    failing rows (with the failure reasons appended as an extra column) to
    ``异常数据.xlsx`` next to the input file.
    """
    cf = configparser.ConfigParser()
    cf.read("config.ini", encoding="utf-8")
    start_time = time.time()

    # Input workbook and the sheet to process.
    file_name1 = cf.get("File", "file_name1")
    sheet_name = cf.get("File", "sheet_name")

    # Output workbooks live in the same directory as the input file.
    out_dir = os.path.dirname(file_name1)
    empty_file = os.path.join(out_dir, "异常数据.xlsx")
    full_file = os.path.join(out_dir, "完整数据.xlsx")

    # NOTE(review): xlrd 2.x only reads .xls files; if the input is .xlsx,
    # confirm the installed xlrd version supports it.
    excel = xlrd.open_workbook(file_name1)
    sheet = excel.sheet_by_name(sheet_name)
    rows = sheet.nrows
    header = sheet.row_values(0) if rows else []

    # Workbook collecting rows that fail at least one check.
    empty_excel = openpyxl.Workbook()
    empty_sheet = empty_excel.active
    empty_sheet.title = sheet_name

    # Workbook collecting rows that pass every check.
    full_excel = openpyxl.Workbook()
    full_sheet = full_excel.active
    full_sheet.title = sheet_name

    if header:
        empty_sheet.append(header)
        full_sheet.append(header)

    # A missing or empty option disables the corresponding check.
    positions = {
        option: column_to_index(cf.get(CONFIG_SECTION, option, fallback=""))
        for option in POSITION_OPTIONS
    }

    full_count = 0
    abnormal_count = 0
    for i in range(1, rows):
        row_value = sheet.row_values(i)
        reason = validate_row(row_value, positions, excel.datemode)
        cleaned = list(map(fix_illegal, row_value))
        if reason:
            empty_sheet.append(cleaned + [reason])
            abnormal_count += 1
        else:
            full_sheet.append(cleaned)
            full_count += 1
        print("\r", f"处理数据第{i}条....", end="", flush=True)

    # Report how many rows went to each workbook.
    print("\n")
    print(f"完整数据数量是:{full_count}")
    print(f"异常数据数量是:{abnormal_count}")

    empty_excel.save(empty_file)
    full_excel.save(full_file)

    end_time = time.time()
    print(f"总耗费时间是:{end_time - start_time}s")


if __name__ == '__main__':
    main()