Python 省市分词提取

第一次使用Python编写项目程序,实现地址的省市提取,补全缺省词。
已知地址表file1.xls,包含省、市、县名称。
从数据库取出的地址通过jieba分词,再与本地表比较,最后将省和市分别列出来。
from pandas import *
from numpy import *
import pandas as pd
import numpy as np
import MySQLdb
import jieba
 # Province/city extraction script.
# Loads customer addresses from MySQL, segments each address with jieba,
# matches the leading segments against a local region table read from Excel,
# and writes the result (with added Province/City columns) to a new workbook.


def _match_province_city(segments, regions):
    """Return the ``(province, city)`` pair matched by an address's segments.

    Args:
        segments: Word segments of one address, in order.
        regions: Iterable of 3-tuples of strings taken from the first three
            columns of the local region table. Column 0 is written to
            ``Province`` and column 1 to ``City``; column 2 is presumably the
            county name -- confirm against file1.xls.

    Returns:
        A ``(province, city)`` tuple. Both are ``''`` when nothing matched.
        ``city`` is the literal ``"not"`` when only the province could be
        identified.
    """
    province, city = '', ''
    if not segments:
        return province, city

    first = segments[0]
    second = segments[1] if len(segments) > 1 else None

    for region_province, region_city, region_county in regions:
        # The first segment names a county or a city: the row gives both
        # the province and the city directly.
        if first in region_county or first in region_city:
            return region_province, region_city

        if first in region_province:
            province = region_province
            if second is None:
                # Nothing follows the province, so no city can be derived.
                city = "not"
                break
            if second in region_city or second in region_county:
                city = region_city
                break
            # Province matched but not this row's city/county: remember the
            # miss and keep scanning the remaining rows.
            city = "not"

    return province, city


# Open the database connection.
conn = MySQLdb.connect(host="*.*.*.*", user="*", passwd="*", port=0, db="*", charset="utf8")
try:
    # Create a cursor from the connection.
    cursor = conn.cursor()
    try:
        # Force utf8 on the session to avoid garbled text.
        cursor.execute("SET NAMES utf8")
        # SQL statement.
        sql1 = "SELECT customer_id,customer_name,address FROM t_customer ORDER BY customer_id limit 50"
        # read_sql returns a DataFrame.
        dfSql = pd.read_sql(sql1, con=conn)
    finally:
        # Close the cursor even if the query fails.
        cursor.close()
finally:
    # Close the connection even if the query fails; nothing below uses it.
    conn.close()

# Add the two output columns.
dfSql['Province'] = ''
dfSql['City'] = ''

# Load the local province/city/county lookup table.
dfExcel = pd.read_excel('D:\\file\\file1.xls')

# Convert the lookup table once instead of re-iterating the DataFrame for
# every address. Blank cells become '' so substring tests never hit a float.
regions = [
    tuple('' if pd.isna(cell) else str(cell) for cell in row[:3])
    for row in dfExcel.itertuples(index=False, name=None)
]

# Walk every address fetched from the database (third selected column).
for index, address in dfSql.iloc[:, 2].items():
    # pd.isna catches both None (SQL NULL) and NaN; an identity test against
    # a NaN constant does not.
    if pd.isna(address):
        continue
    seg_list = list(jieba.cut(str(address), cut_all=False))

    # Compare the segments with the local region table.
    province, city = _match_province_city(seg_list, regions)
    dfSql.at[index, 'Province'] = province
    dfSql.at[index, 'City'] = city

# Write the result to Excel.
dfSql.to_excel('D:\\file\\省市分类表.xlsx')

    原文作者:ValleysWind
    原文地址: https://blog.csdn.net/appleGZ/article/details/78751542
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞