介绍
-- Candidates table; "id" is the key that contributors.candidate_id refers to.
CREATE TABLE "candidates" (
    "id"          INTEGER PRIMARY KEY NOT NULL,
    "first_name"  VARCHAR,
    "last_name"   VARCHAR,
    "middle_name" VARCHAR,
    "party"       VARCHAR NOT NULL
);
-- Contributors table; every row must reference a candidate through candidate_id.
CREATE TABLE "contributors" (
    "id"           INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
    "last_name"    VARCHAR,
    "first_name"   VARCHAR,
    "middle_name"  VARCHAR,
    "street_1"     VARCHAR,
    "street_2"     VARCHAR,
    "city"         VARCHAR,
    "state"        VARCHAR,
    "zip"          VARCHAR,
    "amount"       INTEGER,
    "date"         DATETIME,
    "candidate_id" INTEGER NOT NULL,
    -- declared on its own line with an explicit type, like every other text column
    "name"         VARCHAR,
    FOREIGN KEY(candidate_id) REFERENCES candidates(id)
);
dfcond对应的是contributors表,dfuser对应的是candidates表
数据查询
- 单条件
查找first_name为‘John’的数据
dfcond.query("first_name=='John'")
dfcond[dfcond.first_name=='John']
dfcond.loc[dfcond.first_name=='John']
上面三个语句等价
###SQL
select * from contributors where first_name == 'John'
- 多条件
查找last_name为Ahrens,订单大于500的数据
dfcond.query("last_name=='Ahrens' and amount>500")
###SQL
select * from contributors where last_name=='Ahrens' and amount>500
- 空值
查找state为空的数据
dfcond[dfcond.state.isnull()]
###SQL
select * from contributors where state is null
- 多值选择
查找state为VA或者WA的数据
dfcond[dfcond.state.isin(['VA','WA'])]
###SQL
select * from contributors where state in ('VA','WA')
- 区间查找
查找订单介于10到50之间(包含10和50)的数据
dfcond.query("10<=amount<=50")
###SQL
select * from contributors where amount between 10 and 50
- 重复值
查找first_name,last_name组合不重复的数据(去重)
# Distinct (first_name, last_name) pairs. Each column name must be its own list
# element; the single string 'first_name,last_name' is not a column label.
dfcond[['first_name', 'last_name']].drop_duplicates()
# Whole rows, de-duplicated on the same two columns, keeping the first occurrence.
dfcond.drop_duplicates(subset=['first_name', 'last_name'], keep='first')
keep还可以选择'last',或者False删除所有重复项
###SQL
select distinct first_name,last_name from contributors
- 数据返回量控制
返回三条记录
dfcond[0:3]
dfcond.iloc[0:3]
###SQL
select * from contributors limit 3
- 带有函数的例子
订单值大于最大订单值减去2000
dfcond[dfcond.amount>dfcond.amount.max()-2000]
###SQL
-- amount must exceed (largest amount - 2000)
select * from contributors where amount > (select max(amount) - 2000 from contributors)
- 联合查找
从candidates表中查找last_name为Obama,然后去contributors中查找与其有关的记录。
cid = dfuser.query("last_name=='Obama'")['id'].values[0]
dfcond.query('candidate_id=={}'.format(cid))
###SQL
select * from contributors where candidate_id =
(select id from candidates where last_name='Obama')
#也可以进行隐式连接
-- Implicit (comma) join: the join condition is written in the WHERE clause.
select contributors.last_name,contributors.amount from contributors,candidates where
candidates.last_name='Obama' and candidates.id = contributors.candidate_id
- 内联
dfcond.merge(dfuser,left_on='candidate_id',right_on='id')
###SQL
-- Implicit inner join: the join condition is written in the WHERE clause.
select * from contributors,candidates where contributors.candidate_id =
candidates.id
-- Explicit inner join. The keyword is INNER: when it is misspelled as "inter",
-- SQLite reads "inter" as an alias for contributors, and a table-qualified
-- contributors.candidate_id is then reported as an unknown column.
select * from contributors inner join candidates on contributors.candidate_id=candidates.id
- 左联/右联/全联
dfcond.merge(dfuser,left_on='candidate_id',right_on='id',how='left/right/outer')
###SQL
select * from contributors left join candidates on candidate_id=candidates.id
select * from contributors right join candidates on candidate_id=candidates.id
select * from contributors full join candidates on candidate_id=candidates.id
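left,right,full的区别:left join保留左表(contributors)的所有行,右表(candidates)没有匹配时对应列为NULL;right join保留右表的所有行,左表没有匹配时对应列为NULL;full join同时保留两张表的所有行。注意:SQLite从3.39.0版本起才支持right join和full join。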
行列操作
- 列选择
选择first_name列
dfcond['first_name']
dfcond[['first_name','last_name']]#多列选择
###SQL
select first_name from contributors
select first_name,last_name from contributors
- 新增加一列
增加一列name
dfcond['name'] = dfcond['last_name']+','+dfcond['first_name']
dfcond.assign(name=dfcond.last_name+":"+dfcond.first_name)
###对于sql要修改数据表
ALTER TABLE contributors ADD COLUMN name varchar(255);
- 列删除
删除name列
del dfcond['name']
###SQL
alter table contributors drop column name
- 行删除
删除所有last_name为Ahrens的行
# Approach 1: make last_name the index, drop rows by label, then restore the column.
df2 = dfcond.copy()
df2.set_index('last_name', inplace=True)
df2.drop(['Ahrens', 'Akin'], inplace=True)  # accepts a single label or a list of labels
df2.reset_index(inplace=True)  # reset_index returns a new frame unless inplace=True
# Approach 2 (simpler): keep only the rows whose last_name is not 'Ahrens'.
ndf = dfcond.query('last_name!="Ahrens"')
###SQL
-- delete the rows for one last name
delete from contributors where last_name='Ahrens'
-- delete the rows for several last names at once
delete from contributors where last_name in ('Ahrens','Akin')
数据修改
- 多行数据修改
dfcond.loc[dfcond.state=='VA','name'] = "Junk"
###SQL
-- string literals take single quotes; double quotes denote identifiers in SQL
update contributors set name = 'Junk' where state = 'VA'
聚合Aggregate
- 极值
###amount最大值
dfcond.describe()###获取平均值,极值(貌似只对数据有效)
dfcond.amount.max()
dfcond[dfcond.amount.max()==dfcond.amount]
###SQL
-- NOTE(review): relies on SQLite's bare-column behaviour -- with a single max() aggregate
-- the other columns are taken from the row holding the maximum; other engines may reject
-- this query or return an arbitrary row. Confirm the target engine before reusing it.
select *,max(amount) as maxcol from contributors
- 计数
dfcond.count()
# returns the count for every column; null values are not counted
dfcond.info()  # reports the same per-column non-null counts
dfcond.XX.value_counts()  # XX stands for a column name; tallies each distinct value, like collections.Counter
###SQL
select count(amount) as countcol from contributors
#平均值
select avg(amount) as avgcol from contributors
- groupby
根据state分类汇总
dfcond.groupby('state')['amount'].sum()  # total amount per state, same as SUM(amount) ... GROUP BY state
gb = dfcond.groupby('state')  # GroupBy object that can be reused
CA = gb.get_group('CA')  # rows belonging to a single group
###SQL
SELECT state,SUM(amount) FROM contributors GROUP BY state
排序
df = dfcond.sort_values(by=['last_name'],ascending=False)#采用降序排列
###SQL
SELECT * FROM contributors ORDER BY last_name DESC;
pandas与数据库之间的转换
from sqlite3 import dbapi2 as sq3
import os
PATHSTART="."
def get_db(dbfile):
    """Open a SQLite connection to ``dbfile``.

    ``dbfile`` is joined onto PATHSTART with os.path.join, so a relative name is
    resolved under PATHSTART while an absolute path is used as given.

    Returns the sqlite3 connection object.
    """
    sqlite_db = sq3.connect(os.path.join(PATHSTART, dbfile))
    return sqlite_db
def init_db(dbfile, schema):
    """Creates the database tables.

    Opens ``dbfile`` through get_db, runs the SQL script ``schema`` on it,
    commits, and returns the open connection.
    """
    db = get_db(dbfile)
    db.cursor().executescript(schema)
    db.commit()
    return db
# Create cancont.db from the schema, then load both DataFrames into their tables.
db = init_db("cancont.db", ourschema)
# if_exists="append" inserts into the tables the schema created; index=False leaves out the DataFrame index.
dfuser.to_sql("candidates", db, if_exists="append", index=False)
dfcond.to_sql("contributors", db, if_exists="append", index=False)
另一种数据插入方法
ins="""
INSERT INTO candidates (id, first_name, last_name, middle_name, party) \
VALUES (?,?,?,?,?);
"""
# candidates.txt is pipe-delimited and its first line is the header row
with open('./candidates.txt') as f:
    lines = f.readlines()
cur = db.cursor()
for line in lines[1:]:
    if not line.strip():
        # a blank line (e.g. at the end of the file) cannot be unpacked into five fields
        continue
    zid,first_name,last_name,middle_name,party = line.strip().split('|')
    print(zid,first_name,last_name,middle_name,party)
    vals = (int(zid),first_name,last_name,middle_name,party)
    print(vals)
    cur.execute(ins,vals)
# the inserts run inside an implicit transaction; commit so they are actually saved
db.commit()
sql语句执行函数
def make_query(sql, params=()):
    """Run ``sql`` on the module-level ``db`` connection and return every row.

    sql:    the SQL statement to execute.
    params: optional values bound to the placeholders in ``sql``; defaults to
            none, so existing single-argument calls behave as before.

    Returns the list produced by fetchall().
    """
    c = db.cursor().execute(sql, params)
    return c.fetchall()
把查询结果转换为dataframe对象
def make_frame(data,col_names):
    """Turn query result rows into a DataFrame.

    data:      sequence of row tuples, as returned by fetchall().
    col_names: column names; the i-th name labels the i-th field of every row.

    Returns a DataFrame with one column per name, in the order given.
    """
    # transpose the rows into columns: one list of values per column name
    columns = {name: [row[i] for row in data] for i, name in enumerate(col_names)}
    # DataFrame.from_items was removed in pandas 1.0; the constructor accepts the dict directly
    return pd.DataFrame(columns, columns=list(col_names))
上面的函数需要列名
col_names = [field[1] for field in make_query("pragma table_info(contributors)")]
#这是针对sqlite
使用例子
make_frame(make_query("select * from contributors where state is null"),col_names)
如果是sqlite数据库,可以直接如下读取
pd.read_sql("SELECT * FROM candidates WHERE party= 'D';", db)