第一次使用 Python 编写项目程序,实现从地址中提取省、市,并补全地址中缺失的省、市名称。
已知本地地址表 file1.xls,其中包含省、市、县三列名称。
从数据库取出的地址先通过 jieba 分词,再与本地地址表逐行比较,最后将匹配到的省和市分别写入 Province、City 两列。
from pandas import *
from numpy import *
import pandas as pd
import numpy as np
import MySQLdb
import jieba
def match_region(tokens, regions):
    """Resolve the province and city for a tokenized address.

    Args:
        tokens: Address tokens in order (e.g. the jieba segmentation of the
            address string). Only the first two tokens are examined.
        regions: Iterable of ``(province, city, county)`` string triples
            taken from the local lookup table, scanned in order.

    Returns:
        A ``(province, city)`` tuple. Both are ``""`` when nothing matches.
        The city is ``"not"`` when only the province could be identified.
    """
    province, city = "", ""
    if not tokens:
        return province, city

    first = tokens[0]
    second = tokens[1] if len(tokens) > 1 else None

    for prov_name, city_name, county_name in regions:
        # A county or city hit identifies the whole row: stop immediately.
        if first in county_name or first in city_name:
            return prov_name, city_name
        if first in prov_name:
            province = prov_name
            if second is None:
                # Nothing left to identify the city with.
                return province, "not"
            if second in city_name or second in county_name:
                return province, city_name
            # Province is known but this row is not the right city; keep
            # scanning, a later row of the same province may match.
            city = "not"
    return province, city


def main():
    """Read customer addresses from MySQL and export province/city columns.

    Each address is tokenized with jieba and matched against the local
    province/city/county table; the result is written to an Excel file.
    """
    # Open the database connection; it is always closed, even on failure.
    conn = MySQLdb.connect(host="*.*.*.*", user="*", passwd="*", port=0, db="*", charset="utf8")
    try:
        cursor = conn.cursor()
        try:
            # Avoid garbled (mis-decoded) text coming back from the server.
            cursor.execute("SET NAMES utf8")
        finally:
            cursor.close()
        sql1 = "SELECT customer_id,customer_name,address FROM t_customer ORDER BY customer_id limit 50"
        # read_sql returns a DataFrame.
        customers = pd.read_sql(sql1, con=conn)
    finally:
        conn.close()

    # Result columns, empty until a match is found.
    customers['Province'] = ''
    customers['City'] = ''

    # Local lookup table: the first three columns are province, city, county.
    lookup = pd.read_excel('D:\\file\\file1.xls')
    # Build plain string triples once; blank cells become "" so that the
    # substring tests in match_region never hit a float NaN.
    regions = list(
        lookup.iloc[:, :3].fillna('').astype(str).itertuples(index=False, name=None)
    )

    # The address is the third selected column; access it by position.
    for index, address in customers.iloc[:, 2].items():
        # SQL NULL arrives as None/NaN; an identity test against NaN would
        # miss it, so use pd.isna.
        if pd.isna(address):
            continue
        # Drop whitespace-only tokens so they cannot become the lookup key.
        tokens = [tok for tok in jieba.cut(str(address), cut_all=False) if tok.strip()]
        province, city = match_region(tokens, regions)
        customers.loc[index, 'Province'] = province
        customers.loc[index, 'City'] = city

    # Write the classified data to Excel.
    customers.to_excel('D:\\file\\省市分类表.xlsx')


if __name__ == "__main__":
    main()