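# pandas统计数据项重复值次数和删除 (pandas: counting and removing duplicated rows)
# 转 (reposted from): https://blog.csdn.net/qq_35203425/article/details/80830911
# Pandas学习笔记之重复数据统计 (Pandas study notes: duplicate-data statistics)
# https://blog.csdn.net/lansecheng/article/details/75085675
# Pandas数据基础(索引、排序、连接、去重、分箱、异常处理) (Pandas data basics: indexing, sorting, joining, de-duplication, binning, anomaly handling)
# https://blog.csdn.net/niuniuyuh/article/details/77102442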
from pandas import DataFrame
# Demo frame: rows 4 and 5 repeat row 0 on every column, row 6 repeats row 3
# on every column, and row 7 repeats row 3 only on (key1, key2).
df = DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a', 'a', 'b', 'b'],
    'key2': ['one', 'two', 'one', 'two', 'one', 'one', 'two', 'two'],
    'key3': [1, 2, 3, 2, 1, 1, 2, 3],
})
print(df)
#   key1 key2  key3
# 0    a  one     1
# 1    a  two     2
# 2    b  one     3
# 3    b  two     2
# 4    a  one     1
# 5    a  one     1
# 6    b  two     2
# 7    b  two     3

# --- Find duplicated rows ---
# duplicated() flags a row when an earlier row holds the same values; the
# first occurrence itself is not flagged (see the results below).
print(df[df.duplicated()])  # duplicates across all columns
#   key1 key2  key3
# 4    a  one     1
# 5    a  one     1
# 6    b  two     2

print(df[df.duplicated(['key1', 'key2'])])  # duplicates on key1 + key2 only
#   key1 key2  key3
# 4    a  one     1
# 5    a  one     1
# 6    b  two     2
# 7    b  two     3

# Same selection, with the column subset held in a variable.
column_names = ['key1', 'key2']
print(df[df.duplicated(column_names)])  # duplicates on key1 + key2 only
#   key1 key2  key3
# 4    a  one     1
# 5    a  one     1
# 6    b  two     2
# 7    b  two     3

# --- Count duplicated rows ---
# count() reports the number of non-null values per column among the rows
# flagged as duplicates.
dup = df[df.duplicated()].count()
print(dup)
# key1    3
# key2    3
# key3    3
# dtype: int64

# --- Remove duplicated rows ---
# `~` is the element-wise NOT for a boolean mask; unary minus is not the
# documented way to invert one. Equivalent to df.drop_duplicates().
nodup = df[~df.duplicated()]
print(nodup)
#   key1 key2  key3
# 0    a  one     1
# 1    a  two     2
# 2    b  one     3
# 3    b  two     2
# 7    b  two     3

# Remove rows that are duplicated on key1 + key2 only.
nodup = df[~df.duplicated(['key1', 'key2'])]
print(nodup)
#   key1 key2  key3
# 0    a  one     1
# 1    a  two     2
# 2    b  one     3
# 3    b  two     2
# Column subset used to decide whether two exposure-log rows are duplicates.
# NOTE(review): presumably this is every column of ExposureLog, since the note
# below says the all-column and column_names selections return the same rows;
# confirm against where ExposureLog is loaded.
column_names = [
    'adRequest_id', 'adRequest_time', 'adPosition_id', 'user_id', 'ExAd_id',
    'ExAd_materialSize', 'ExAd_bid', 'pctr', 'quality_ecpm', 'totalEcpm',
]
# De-duplicating on all column labels: the first two selections below give
# the same result.
# ExposureLog[ExposureLog.duplicated()]  # default: rows equal on all columns
# ExposureLog[ExposureLog.duplicated(column_names)]  # 878919 rows
# ExposureLog[ExposureLog.duplicated(['adRequest_id', 'adPosition_id', 'ExAd_id'])]

# Drop the duplicated rows. `~` is the element-wise NOT for a boolean mask;
# unary minus is not the documented way to invert one.
ExposureLog_nodup = ExposureLog[~ExposureLog.duplicated(column_names)]
print(len(ExposureLog_nodup))  # 101507776 = 102386695 (total) - 878919 (dup)

# Save the de-duplicated log to totalExposureLog_nodup.csv, without the index.
ExposureLog_nodup.to_csv("./data/totalExposureLog_nodup.csv", index=False)