In [1]:
import pandas as pd import numpy as np df=pd.DataFrame({'A':['foo','bar','foo','bar', 'foo','bar','foo','foo'], 'B':['one','one','two','three', 'two','two','one','three'], 'C':np.random.randn(8), 'D':np.random.randn(8)})#randn=>具有标准正态分布 df
Out[1]:
A | B | C | D | |
---|---|---|---|---|
0 | foo | one | 1.126165 | -0.676814 |
1 | bar | one | -1.429697 | -0.464149 |
2 | foo | two | -0.383661 | -0.309679 |
3 | bar | three | 0.945099 | 1.375307 |
4 | foo | two | -0.296882 | -0.630503 |
5 | bar | two | 2.526570 | -1.142886 |
6 | foo | one | -0.848323 | -0.310705 |
7 | foo | three | -1.683177 | -1.371868 |
In [2]:
grouped=df.groupby('A') grouped
Out[2]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000000052092B0>
In [3]:
grouped.count()#统计次数
Out[3]:
B | C | D | |
---|---|---|---|
A | |||
bar | 3 | 3 | 3 |
foo | 5 | 5 | 5 |
In [4]:
grouped=df.groupby(['A','B'])#索引多项 grouped.count()
Out[4]:
C | D | ||
---|---|---|---|
A | B | ||
bar | one | 1 | 1 |
three | 1 | 1 | |
two | 1 | 1 | |
foo | one | 2 | 2 |
three | 1 | 1 | |
two | 2 | 2 |
In [5]:
def get_letter_type(letter): if letter.lower() in 'aeiou': return 'a' else: return 'b' grouped=df.groupby(get_letter_type,axis=1) grouped.count().iloc[0]
Out[5]:
a 1 b 3 Name: 0, dtype: int64
In [6]:
s=pd.Series([1,2,3,1,2,3],[8,7,6,8,7,6]) s
Out[6]:
8 1 7 2 6 3 8 1 7 2 6 3 dtype: int64
1.指定多个索引的多个操作
In [7]:
grouped=s.groupby(level=0)#默认从0开始 grouped.first()#索引的第一部分,自动去除重复部分;也可以指定last()
Out[7]:
6 3 7 2 8 1 dtype: int64
In [8]:
grouped.sum()
Out[8]:
6 6 7 4 8 2 dtype: int64
In [9]:
grouped=s.groupby(level=0,sort=False) grouped.first()
Out[9]:
8 1 7 2 6 3 dtype: int64
2.单独索引某一列的某一个元素:多重索引
In [10]:
df2=pd.DataFrame({'X':['A','B','A','B'],'Y':[1,2,3,4]}) df2
Out[10]:
X | Y | |
---|---|---|
0 | A | 1 |
1 | B | 2 |
2 | A | 3 |
3 | B | 4 |
2-1 多重所以方法一
In [11]:
df2.groupby(['X']).get_group('A')#关注具体的某一个键值
Out[11]:
X | Y | |
---|---|---|
0 | A | 1 |
2 | A | 3 |
2-2 多重所以方法二
In [12]:
arrays=[['foo','bar','foo','bar', 'foo','bar','foo','foo'], ['one','one','two','three','two','two','one','three']] index=pd.MultiIndex.from_arrays(arrays,names=['first','second'])#添加索引名字 s=pd.Series(np.random.randn(8),index=index)#对索引键来添加值 s
Out[12]:
first second foo one -0.518263 bar one 0.583992 foo two 1.338273 bar three -0.671916 foo two 0.633448 bar two 0.144302 foo one 0.828419 three -0.834918 dtype: float64
In [13]:
grouped=s.groupby(level=0)#索引第一列 grouped.sum()
Out[13]:
first bar 0.056377 foo 1.446958 dtype: float64
In [14]:
grouped=s.groupby(level='second')#索引第二列,也可以指定名字 grouped.sum()
Out[14]:
second one 0.894148 three -1.506835 two 2.116022 dtype: float64
3 aggregate:以A B为键求和
In [15]:
grouped=df.groupby(['A','B']) grouped.aggregate(np.sum)
Out[15]:
C | D | ||
---|---|---|---|
A | B | ||
bar | one | -1.429697 | -0.464149 |
three | 0.945099 | 1.375307 | |
two | 2.526570 | -1.142886 | |
foo | one | 0.277842 | -0.987519 |
three | -1.683177 | -1.371868 | |
two | -0.680543 | -0.940182 |
In [16]:
grouped=df.groupby(['A','B'],as_index=False)#as_index=False :不去除重复的行,是一行行索引 grouped.aggregate(np.sum)
Out[16]:
A | B | C | D | |
---|---|---|---|---|
0 | bar | one | -1.429697 | -0.464149 |
1 | bar | three | 0.945099 | 1.375307 |
2 | bar | two | 2.526570 | -1.142886 |
3 | foo | one | 0.277842 | -0.987519 |
4 | foo | three | -1.683177 | -1.371868 |
5 | foo | two | -0.680543 | -0.940182 |
In [17]:
grouped=df.groupby(['A','B']).sum().reset_index()#重新构建索引 grouped
Out[17]:
A | B | C | D | |
---|---|---|---|---|
0 | bar | one | -1.429697 | -0.464149 |
1 | bar | three | 0.945099 | 1.375307 |
2 | bar | two | 2.526570 | -1.142886 |
3 | foo | one | 0.277842 | -0.987519 |
4 | foo | three | -1.683177 | -1.371868 |
5 | foo | two | -0.680543 | -0.940182 |
In [18]:
grouped=df.groupby(['A','B']) grouped.size()#统计出现次数
Out[18]:
A B bar one 1 three 1 two 1 foo one 2 three 1 two 2 dtype: int64
7.得出统计特性值
In [19]:
grouped.describe().head()
Out[19]:
C | D | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | ||
A | B | ||||||||||||||||
bar | one | 1.0 | -1.429697 | NaN | -1.429697 | -1.429697 | -1.429697 | -1.429697 | -1.429697 | 1.0 | -0.464149 | NaN | -0.464149 | -0.464149 | -0.464149 | -0.464149 | -0.464149 |
three | 1.0 | 0.945099 | NaN | 0.945099 | 0.945099 | 0.945099 | 0.945099 | 0.945099 | 1.0 | 1.375307 | NaN | 1.375307 | 1.375307 | 1.375307 | 1.375307 | 1.375307 | |
two | 1.0 | 2.526570 | NaN | 2.526570 | 2.526570 | 2.526570 | 2.526570 | 2.526570 | 1.0 | -1.142886 | NaN | -1.142886 | -1.142886 | -1.142886 | -1.142886 | -1.142886 | |
foo | one | 2.0 | 0.138921 | 1.396174 | -0.848323 | -0.354701 | 0.138921 | 0.632543 | 1.126165 | 2.0 | -0.493760 | 0.258878 | -0.676814 | -0.585287 | -0.493760 | -0.402232 | -0.310705 |
three | 1.0 | -1.683177 | NaN | -1.683177 | -1.683177 | -1.683177 | -1.683177 | -1.683177 | 1.0 | -1.371868 | NaN | -1.371868 | -1.371868 | -1.371868 | -1.371868 | -1.371868 |
- 得出指定的统计指标 agg操作
In [20]:
grouped=df.groupby('A') grouped['C'].agg([np.sum,np.mean,np.std])
Out[20]:
sum | mean | std | |
---|---|---|---|
A | |||
bar | 2.041972 | 0.680657 | 1.991346 |
foo | -2.085878 | -0.417176 | 1.023003 |
In [21]:
grouped['C'].agg({'sum1':np.sum,'mean1':np.mean,'std1':np.std})#改名字
E:\software\Anaconda3 5.2.0\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: using a dict on a Series for aggregation is deprecated and will be removed in a future version. Use named aggregation instead. >>> grouper.agg(name_1=func_1, name_2=func_2) """Entry point for launching an IPython kernel.
Out[21]:
sum1 | mean1 | std1 | |
---|---|---|---|
A | |||
bar | 2.041972 | 0.680657 | 1.991346 |
foo | -2.085878 | -0.417176 | 1.023003 |