python
pandas
groupby
aggregate
© Copyright 2017, Akshay Sehgal | 3rd Oct 21
"Groupby" is probably one of the most basic data pre-processing steps that a Data Scientist should master as soon as possible. Interestingly enough, you find it in almost every scripting language that claims to work well with databases.
Most of us were introduced to grouping through the SQL GROUP BY statement, which lets a user summarize or aggregate a given dataset. Python brings the pandas groupby method to the table, which is highly pythonic in its syntax and equally versatile, if not more so. But the utility of a groupby goes well beyond aggregation. In this notebook, I will showcase a few examples where you can really exploit this method for various other use-cases.
#SQL query to group by Col1 and Col2
#and get the mean of Col3 and the sum of Col4 respectively
SELECT Col1, Col2, AVG(Col3), SUM(Col4)
FROM Table
GROUP BY Col1, Col2
Before we can start writing code, let's explore the basics behind a groupby operation. The core concept behind any groupby operation is a three step process called Split-Apply-Combine.
Here is a diagram to make this more intuitive.
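To make the three steps concrete, here is a minimal hand-rolled sketch of Split-Apply-Combine on a small list of (group, value) records, with no pandas involved (the data is made up for illustration):

```python
# Hand-rolled split-apply-combine: sum the values per group.
records = [('a', 1), ('b', 2), ('a', 3), ('b', 4)]

# 1. Split: bucket rows by their group key
buckets = {}
for key, value in records:
    buckets.setdefault(key, []).append(value)

# 2. Apply: run an aggregation on each bucket independently
# 3. Combine: collect the per-group results into one output
sums = {key: sum(values) for key, values in buckets.items()}
print(sums)  # {'a': 4, 'b': 6}
```

A pandas groupby does exactly this, just vectorized and with far less ceremony.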
The syntax for using the groupby method in pandas comprises two parts: a grouper object and an aggregator. The general structure looks like the following -
dataset.groupby(['grouping column(s)'])['output column(s)'].aggregation()
|_____________________________________||________________________________|
                   |                                    |
        grouper object (split)           aggregation (apply & combine)
#Load dependencies
import pandas as pd
import numpy as np
import seaborn as sns
titanic = sns.load_dataset('titanic').dropna() #drop missing data
print(titanic.shape)
titanic.head()
(182, 15)
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
6 | 0 | 1 | male | 54.0 | 0 | 0 | 51.8625 | S | First | man | True | E | Southampton | no | True |
10 | 1 | 3 | female | 4.0 | 1 | 1 | 16.7000 | S | Third | child | False | G | Southampton | yes | False |
11 | 1 | 1 | female | 58.0 | 0 | 0 | 26.5500 | S | First | woman | False | C | Southampton | yes | True |
Question: What is the total number of passengers from each class who survived?
#Step 1: Create grouper
grouper = titanic.groupby(['class'])
#Step 2: Filter column and apply aggregation
grouper['survived'].sum().reset_index()
class | survived | |
---|---|---|
0 | First | 106 |
1 | Second | 12 |
2 | Third | 5 |
You would usually do this in a single statement, as follows:
titanic.groupby(['class'])['survived'].sum().reset_index()
class | survived | |
---|---|---|
0 | First | 106 |
1 | Second | 12 |
2 | Third | 5 |
Note: reset_index() brings the grouping columns back from the index into regular columns of the dataframe.
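As an aside, passing as_index=False to groupby achieves the same result without a separate reset_index() call. A quick sketch on a toy frame (column names are illustrative, not from the titanic data):

```python
import pandas as pd

df = pd.DataFrame({'key': ['x', 'x', 'y'], 'val': [1, 2, 3]})

# as_index=False keeps 'key' as a regular column instead of the index
out = df.groupby('key', as_index=False)['val'].sum()
print(out.columns.tolist())  # ['key', 'val']
```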
Question: What was the average fare for passengers from each town?
titanic.groupby(['embark_town'])['fare'].mean().reset_index()
embark_town | fare | |
---|---|---|
0 | Cherbourg | 103.342503 |
1 | Queenstown | 90.000000 |
2 | Southampton | 64.922862 |
We can pass a list of features to groupby() to increase the number of levels for grouping the data, as below.
Question: What was the average fare for male vs female passengers from each town?
titanic.groupby(['embark_town','sex'])['fare'].mean().reset_index()
embark_town | sex | fare | |
---|---|---|---|
0 | Cherbourg | female | 104.169609 |
1 | Cherbourg | male | 102.435355 |
2 | Queenstown | female | 90.000000 |
3 | Queenstown | male | 90.000000 |
4 | Southampton | female | 79.251179 |
5 | Southampton | male | 52.674461 |
Similarly, we can select a list of variables to which the aggregate function should be applied.
Question: What was the average fare and age for male vs female passengers from each town?
titanic.groupby(['embark_town','sex'])[['fare', 'age']].mean().reset_index()
embark_town | sex | fare | age | |
---|---|---|---|---|
0 | Cherbourg | female | 104.169609 | 35.352941 |
1 | Cherbourg | male | 102.435355 | 39.774194 |
2 | Queenstown | female | 90.000000 | 33.000000 |
3 | Queenstown | male | 90.000000 | 44.000000 |
4 | Southampton | female | 79.251179 | 30.952830 |
5 | Southampton | male | 52.674461 | 37.595484 |
Before we go further and try other, more complex scenarios, let's try to understand the data structures we are working with, so that we can be much more creative with our approaches and get a deeper understanding of how they work.
You can imagine the pipeline of the above code as follows -

1. titanic.groupby(['embark_town']) splits the data into the relevant groups
2. ['fare'] selects the fare column from each of those groups
3. mean() is applied to this column for each group; the results are then combined and returned as the aggregated dataset

Let's see what the grouper object looks like for a better understanding.
grouper = titanic.groupby(['embark_town'])
#Print dtype for each of the elements in the grouper
[(type(k),type(g)) for k,g in grouper]
[(str, pandas.core.frame.DataFrame), (str, pandas.core.frame.DataFrame), (str, pandas.core.frame.DataFrame)]
So, this shows that if we iterate over the grouper object, it's nothing but a sequence of (key, dataframe) tuples.
Let's see what each of those is.
#Print shape for the dataframe groups
[(k,g.shape) for k,g in grouper]
[('Cherbourg', (65, 15)), ('Queenstown', (2, 15)), ('Southampton', (115, 15))]
The key for each of the tuples/groups is a value from the grouping column (in this case embark_town), and the value is just the complete dataframe filtered to that key! If we print one of the dataframes from this grouper, you can see that all rows in this slice of data contain Queenstown as the embark_town, as shown below.
print(list(grouper)[1][0]) #print key
list(grouper)[1][1] #print dataframe
Queenstown
survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
245 | 0 | 1 | male | 44.0 | 2 | 0 | 90.0 | Q | First | man | True | C | Queenstown | no | False |
412 | 1 | 1 | female | 33.0 | 1 | 0 | 90.0 | Q | First | woman | False | C | Queenstown | yes | False |
Similarly, let's see what the grouper object looks like for multiple grouping features. The 'key' in this case is a tuple with the group combination, which, after aggregation, gets set as the index of the final output.
grouper = titanic.groupby(['embark_town','sex'])
#Print dtype for each of the elements in the grouper
[(k,g.shape) for k,g in grouper]
[(('Cherbourg', 'female'), (34, 15)), (('Cherbourg', 'male'), (31, 15)), (('Queenstown', 'female'), (1, 15)), (('Queenstown', 'male'), (1, 15)), (('Southampton', 'female'), (53, 15)), (('Southampton', 'male'), (62, 15))]
#The grouping columns become the index after the groupby aggregation
titanic.groupby(['embark_town','sex'])['age'].mean()
embark_town  sex
Cherbourg    female    35.352941
             male      39.774194
Queenstown   female    33.000000
             male      44.000000
Southampton  female    30.952830
             male      37.595484
Name: age, dtype: float64
There are multiple ways of aggregating your grouper object. You can use apply() or agg() to write your own custom aggregators, but pandas makes it much easier by providing a ton of in-built aggregators such as sum() or mean(), as we discussed in the examples above. Let's go through a few scenarios and explore how we can use these aggregations.
Pandas provides a ton of aggregation methods to quickly get the statistics you are looking for. Below are a few of the commonly used ones; more details on these can be found in the official pandas documentation.
Function | Description |
---|---|
mean( ) | Compute mean of groups |
median( ) | Compute median of groups |
mode( ) | Compute mode of groups |
sum( ) | Compute sum of group values |
prod( ) | Compute product of group values |
size( ) | Compute group sizes |
count( ) | Compute count of group |
std( ) | Standard deviation of groups |
var( ) | Compute variance of groups |
skew( ) | Compute skewness/3rd moment of groups |
kurt( ) | Compute kurtosis/4th moment of groups |
sem( ) | Standard error of the mean of groups |
mad( ) | Mean absolute deviation for each group |
describe( ) | Generates descriptive statistics |
first( ) | Compute first of group values |
last( ) | Compute last of group values |
nth( ) | Take nth value, or a subset if n is a list |
min( ) | Compute min of group values |
max( ) | Compute max of group values |
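One subtlety worth calling out from the table above: size() counts all rows in the group (including missing values), while count() counts only non-null values per column. A small sketch on a toy frame (column names are illustrative):

```python
import pandas as pd
import numpy as np

df = pd.DataFrame({'grp': ['a', 'a', 'b'],
                   'val': [1.0, np.nan, 2.0]})

# size(): rows per group, NaNs included -> a: 2, b: 1
print(df.groupby('grp').size())
# count(): non-null values per group   -> a: 1, b: 1
print(df.groupby('grp')['val'].count())
```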
This is by far the most popular way of applying a custom function to a dataframe or, in this case, to each of the dataframe slices defined by the grouper. The behavior of the apply() method with groupby is similar to the standard one.
You can apply it to each row (or column) of a dataframe input (if you have more than one column for aggregation) or to a series (if you have a single column for aggregation). Within the function, you can either work directly with the individual series or just write your own lambda function. Here are a few ways of using the apply function.
Question: Get the unique set of ages for each age category (who column) from each town.
titanic.groupby(['embark_town', 'who'])['age'].apply(set).reset_index()
embark_town | who | age | |
---|---|---|---|
0 | Cherbourg | man | {17.0, 18.0, 23.0, 24.0, 25.0, 26.0, 27.0, 30.... |
1 | Cherbourg | woman | {16.0, 17.0, 18.0, 19.0, 21.0, 22.0, 23.0, 24.... |
2 | Queenstown | man | {44.0} |
3 | Queenstown | woman | {33.0} |
4 | Southampton | child | {0.92, 1.0, 2.0, 3.0, 4.0, 6.0, 11.0, 14.0, 15.0} |
5 | Southampton | man | {19.0, 21.0, 25.0, 27.0, 28.0, 29.0, 31.0, 32.... |
6 | Southampton | woman | {16.0, 17.0, 18.0, 19.0, 21.0, 22.0, 23.0, 24.... |
Question: Get the range (max - min) of ages for each age category (who column) from each town.
titanic.groupby(['embark_town', 'who'])['age'].apply(lambda x: x.max()-x.min()).reset_index()
embark_town | who | age | |
---|---|---|---|
0 | Cherbourg | man | 54.00 |
1 | Cherbourg | woman | 44.00 |
2 | Queenstown | man | 0.00 |
3 | Queenstown | woman | 0.00 |
4 | Southampton | child | 14.08 |
5 | Southampton | man | 61.00 |
6 | Southampton | woman | 47.00 |
Question: Get the mean fare-to-age ratio for each age category (who column) from each town.
titanic.groupby(['embark_town', 'who']).apply(lambda x: (x['fare']/x['age']).mean())
embark_town  who
Cherbourg    man       3.146083
             woman     3.614103
Queenstown   man       2.045455
             woman     2.727273
Southampton  child    28.956893
             man       1.410745
             woman     2.593897
dtype: float64
Sooner or later, you will find it necessary to work with multiple aggregations over multiple columns at once. This is where the agg() method comes in. Here is a quick example of how you can use multiple in-built functions over multiple columns at once.
The general way to do this is to create a dictionary with the requirements and pass it to the agg()
function. There are a few ways to structure the dictionary -
##Single function per column
{
'column1': 'function1',
'column2': 'function2'
}
##Multiple functions per column
{
'column1': ['function1', 'function2'],
'column2': ['function3', 'function4']
}
Question: Get the mean of fare, AND median of age for each age category (who
column) from each town
#Define aggregations as a dictionary
g = {'fare':'mean',
'age':'median'
}
titanic.groupby(['embark_town', 'who']).agg(g).reset_index()
embark_town | who | fare | age | |
---|---|---|---|---|
0 | Cherbourg | man | 102.435355 | 36.0 |
1 | Cherbourg | woman | 104.169609 | 37.0 |
2 | Queenstown | man | 90.000000 | 44.0 |
3 | Queenstown | woman | 90.000000 | 33.0 |
4 | Southampton | child | 77.379485 | 4.0 |
5 | Southampton | man | 51.071515 | 40.0 |
6 | Southampton | woman | 77.686436 | 33.0 |
Question: Get the sum & mean of fare, AND median, min and max of age for each age category (who
column) from each town
#Define aggregations as a dictionary
g = {'fare':['sum', 'mean'],
'age':['median', 'min', 'max']
}
titanic.groupby(['embark_town', 'who']).agg(g).reset_index()
embark_town | who | fare | age | ||||
---|---|---|---|---|---|---|---|
sum | mean | median | min | max | |||
0 | Cherbourg | man | 3175.4960 | 102.435355 | 36.0 | 17.00 | 71.0 |
1 | Cherbourg | woman | 3541.7667 | 104.169609 | 37.0 | 16.00 | 60.0 |
2 | Queenstown | man | 90.0000 | 90.000000 | 44.0 | 44.00 | 44.0 |
3 | Queenstown | woman | 90.0000 | 90.000000 | 33.0 | 33.00 | 33.0 |
4 | Southampton | child | 1005.9333 | 77.379485 | 4.0 | 0.92 | 15.0 |
5 | Southampton | man | 2808.9333 | 51.071515 | 40.0 | 19.00 | 80.0 |
6 | Southampton | woman | 3651.2625 | 77.686436 | 33.0 | 16.00 | 63.0 |
As of pandas >= 0.25, another way to call agg is named aggregation, where each output column is defined with a ('column', 'function') tuple.
Let's demonstrate that with an example.
Question: Get the sum & mean of fare, AND min and max of age for each age category (who
column) from each town, but rename columns
#Define aggregations directly as columns and tuples
titanic.groupby(['embark_town', 'who']).agg(A=('fare', 'sum'),
B=('fare', 'mean'),
C=('age', 'min'),
D=('age', 'max')).reset_index()
embark_town | who | A | B | C | D | |
---|---|---|---|---|---|---|
0 | Cherbourg | man | 3175.4960 | 102.435355 | 17.00 | 71.0 |
1 | Cherbourg | woman | 3541.7667 | 104.169609 | 16.00 | 60.0 |
2 | Queenstown | man | 90.0000 | 90.000000 | 44.00 | 44.0 |
3 | Queenstown | woman | 90.0000 | 90.000000 | 33.00 | 33.0 |
4 | Southampton | child | 1005.9333 | 77.379485 | 0.92 | 15.0 |
5 | Southampton | man | 2808.9333 | 51.071515 | 19.00 | 80.0 |
6 | Southampton | woman | 3651.2625 | 77.686436 | 16.00 | 63.0 |
As you might expect, modifying the aggregate functions to include lambda functions is a way to apply your own custom functions to specific columns. Here are a few examples.
#Define aggregations as a dictionary
g = {'fare':lambda x: x.sum(),
'age' :lambda x: x.max()
}
titanic.groupby(['embark_town', 'who']).agg(g).reset_index()
embark_town | who | fare | age | |
---|---|---|---|---|
0 | Cherbourg | man | 3175.4960 | 71.0 |
1 | Cherbourg | woman | 3541.7667 | 60.0 |
2 | Queenstown | man | 90.0000 | 44.0 |
3 | Queenstown | woman | 90.0000 | 33.0 |
4 | Southampton | child | 1005.9333 | 15.0 |
5 | Southampton | man | 2808.9333 | 80.0 |
6 | Southampton | woman | 3651.2625 | 63.0 |
Apart from aggregating, you can use groupby to transform columns based on the grouper object. This requires the transform() function, which returns the same number of rows as the original dataset, with the function applied based on the grouping defined. Let's consider the following question.
Question: Create a new column that returns the average fare for the age group (who
column) the passenger belongs to.
titanic.groupby('who')['fare'].transform(lambda x: x.mean())
1      88.817429
3      88.817429
6      69.821026
10     77.379485
11     88.817429
         ...
871    88.817429
872    69.821026
879    88.817429
887    88.817429
889    69.821026
Name: fare, Length: 182, dtype: float64
Notice that the output series has the length of the original titanic dataframe, but contains only 3 unique values [88.8, 69.8, 77.3], one for each of ['woman', 'man', 'child']. This makes the grouper object highly versatile in the way you can use it for data preprocessing.
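Because transform() preserves the original row count, a common use-case is group-wise feature engineering, e.g. expressing each value relative to its own group's mean. A minimal sketch on a toy frame (the column names are illustrative):

```python
import pandas as pd

df = pd.DataFrame({'grp': ['a', 'a', 'b', 'b'],
                   'val': [1.0, 3.0, 10.0, 30.0]})

# Deviation of each row's value from its own group's mean:
# group 'a' mean is 2.0, group 'b' mean is 20.0
df['val_demeaned'] = df['val'] - df.groupby('grp')['val'].transform('mean')
print(df['val_demeaned'].tolist())  # [-1.0, 1.0, -10.0, 10.0]
```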
Let's introduce a few advanced cases where you end up using groupby for data preprocessing.
The grouper object doesn't need to explicitly come from the dataframe. As long as the length of the grouper equals the number of rows in the dataframe, you can group the rows by any custom grouper.
df = pd.DataFrame({'A':[1,2,3,4,8,10,12,13],
'B':[1,2,2,3,1,3,2,3]
})
#custom grouping
even_odd = ['even' if i%2==0 else 'odd' for i in df['A']]
df.groupby(even_odd)['B'].mean()
even 2.2 odd 2.0 Name: B, dtype: float64
Question: Get the sum of column B of the given dataframe based on sequentially occurring groups in column A, i.e., in [1,1,2,2,1,1] the first group of 1's should be a separate group from the second set of 1's.
We can solve this by creating a custom grouper: shift the column by 1 and compare it with the original. Wherever the values differ, the boolean flips. Taking a cumsum over this boolean then yields a new group id every time the value changes between consecutive rows. Here is the solution for a similar problem I solved on Stack Overflow.
df = pd.DataFrame({'A':[1,1,2,2,2,1,1,3,3], #<- column to group on
'B':[1,7,2,4,1,8,2,1,3] #<- column to aggregate
})
grouper = (df['A']!=df['A'].shift()).cumsum()
df.groupby(grouper).agg({'A':'mean','B':'sum'}).reset_index(drop=True)
A | B | |
---|---|---|
0 | 1 | 8 |
1 | 2 | 7 |
2 | 1 | 10 |
3 | 3 | 4 |
Question: A dataframe contains rows for only a few dates for each id. The goal is to re-index the dataframe to a fixed date range, for each id individually, filling the missing data with 0 values.
Here we can create a custom index using pandas.date_range. Then, after setting the original date column as the index, we can apply pandas.DataFrame.reindex, along with a groupby on the id column, to reindex each group with the new date range, filling empty values with 0.
d = {'id': [11, 11, 11, 11, 13, 13, 13],
'date': ['2017-06-01','2017-06-03','2017-06-05','2017-06-06','2017-06-01','2017-06-02','2017-06-07'],
'value': [1, 7, 8, 2, 9, 2, 11]
}
df = pd.DataFrame(d)
df['date'] = pd.to_datetime(df['date'])
print("Input dataframe:")
df
Input dataframe:
id | date | value | |
---|---|---|---|
0 | 11 | 2017-06-01 | 1 |
1 | 11 | 2017-06-03 | 7 |
2 | 11 | 2017-06-05 | 8 |
3 | 11 | 2017-06-06 | 2 |
4 | 13 | 2017-06-01 | 9 |
5 | 13 | 2017-06-02 | 2 |
6 | 13 | 2017-06-07 | 11 |
#custom date range
idx = pd.date_range('2017-06-01','2017-06-07')
#set original date column as index
df.set_index('date', inplace=True)
#groupby and apply pd.DataFrame.reindex to apply the new index, filling missing values with 0
df.groupby('id').apply(pd.DataFrame.reindex, idx, fill_value=0).drop(columns='id').reset_index()
id | level_1 | value | |
---|---|---|---|
0 | 11 | 2017-06-01 | 1 |
1 | 11 | 2017-06-02 | 0 |
2 | 11 | 2017-06-03 | 7 |
3 | 11 | 2017-06-04 | 0 |
4 | 11 | 2017-06-05 | 8 |
5 | 11 | 2017-06-06 | 2 |
6 | 11 | 2017-06-07 | 0 |
7 | 13 | 2017-06-01 | 9 |
8 | 13 | 2017-06-02 | 2 |
9 | 13 | 2017-06-03 | 0 |
10 | 13 | 2017-06-04 | 0 |
11 | 13 | 2017-06-05 | 0 |
12 | 13 | 2017-06-06 | 0 |
13 | 13 | 2017-06-07 | 11 |
Here I discuss 3 ways that are popularly used to group data, depending on the data structures and libraries you are already working with.

1. collections.defaultdict
2. numpy.split() to group an array
3. itertools.groupby()

Let's say we have a list of tuples with keys and values which we need to group.
data = list(zip(np.random.randint(0,4,(10,)), np.random.randint(0,100,(10,))))
print(data)
[(1, 41), (1, 30), (2, 70), (3, 82), (2, 68), (0, 18), (3, 97), (1, 37), (3, 8), (0, 51)]
A useful way of grouping data is to use defaultdict. A defaultdict can store the grouping values as keys, and the grouped values as a list (or the result of a custom function on them).
from collections import defaultdict
d = defaultdict(list)
for k,v in data:
d[k].append(v)
grouped_data = dict(d)
print(grouped_data)
{1: [41, 30, 37], 2: [70, 68], 3: [82, 97, 8], 0: [18, 51]}
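Once the values are collected, aggregating is just a dictionary comprehension over the grouped lists, e.g. a per-key mean. A sketch using a small fixed sample for illustration (the data above was randomly generated, so your values will differ):

```python
from collections import defaultdict

data = [(1, 41), (1, 30), (2, 70), (0, 18), (0, 51)]

# Group values by key
d = defaultdict(list)
for k, v in data:
    d[k].append(v)

# Aggregate each group: mean of the collected values
means = {k: sum(v) / len(v) for k, v in d.items()}
print(means)  # {1: 35.5, 2: 70.0, 0: 34.5}
```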
Another way of splitting an array into a list of sub-arrays based on a grouping key is np.split, along with the index positions for each group returned by np.unique. The only caveat is that the array needs to be sorted by the grouping column first.
import numpy as np
#sorted numpy array (sorted by the grouping column)
a = np.array(data)
a = a[np.argsort(a[:, 0])]
#Take the index positions for the unique values using return_index
#and start from the second one to split the data
groups = np.split(a[:,1], np.unique(a[:,0], return_index=True)[1][1:])
print(groups)
[array([18, 51]), array([41, 30, 37]), array([70, 68]), array([82, 97, 8])]
Finally, itertools.groupby() groups consecutive items that share the same key, which is why the data must be sorted by the key first.
import itertools
items = sorted(data, key=lambda x:x[0])
grouper = itertools.groupby(items, key=lambda x:x[0])
groups = [list(g) for k, g in grouper]
groups
[[(0, 18), (0, 51)], [(1, 41), (1, 30), (1, 37)], [(2, 70), (2, 68)], [(3, 82), (3, 97), (3, 8)]]
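Since each group g is an iterator of (key, value) tuples, the same grouper can feed an aggregation directly. A sketch, again on a small fixed sample for illustration:

```python
import itertools

data = [(1, 41), (2, 70), (1, 30), (0, 18), (0, 51)]

# itertools.groupby only groups consecutive equal keys, so sort first
items = sorted(data, key=lambda x: x[0])
sums = {k: sum(v for _, v in g)
        for k, g in itertools.groupby(items, key=lambda x: x[0])}
print(sums)  # {0: 69, 1: 71, 2: 70}
```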