xxxxxxxxxx
# car_sales is a DataFrame and "Make" is one of its columns (features).
# Group the rows by "Make" and take the mean of every numeric column per group.
# numeric_only=True skips non-numeric columns; without it pandas >= 2.0 raises
# a TypeError as soon as the frame contains a column it cannot average.
car_sales.groupby(["Make"]).mean(numeric_only=True)
xxxxxxxxxx
# Build a small example frame and collect the "b" values of each "a" group
# into a list. The frame must be bound to ``df`` for the groupby to use it.
df = pd.DataFrame({'a': ['A', 'A', 'B', 'B', 'B', 'C'], 'b': [1, 2, 5, 5, 4, 6]})
df.groupby('a')['b'].apply(list)
# Out:
# a
# A       [1, 2]
# B    [5, 5, 4]
# C          [6]
# Name: b, dtype: object
xxxxxxxxxx
# Group the first field of every item in ``mylist`` by the item's second field.
# ``values`` holds the distinct second fields (the group keys); ``newlist``
# gets one sub-list per key. A set is unordered, so the order of the groups
# in ``newlist`` is not guaranteed.
values = {item[1] for item in mylist}
newlist = [
    [item[0] for item in mylist if item[1] == key]
    for key in values
]
xxxxxxxxxx
# Step 1: build the example frame.
df = pd.DataFrame({'a': ['A', 'A', 'B', 'B', 'B', 'C'], 'b': [1, 2, 5, 5, 4, 6]})
df
# Out:
#    a  b
# 0  A  1
# 1  A  2
# 2  B  5
# 3  B  5
# 4  B  4
# 5  C  6

# Step 2: collect the "b" values of each "a" group into a list (a Series
# indexed by the group key).
df.groupby('a')['b'].apply(list)
# Out:
# a
# A       [1, 2]
# B    [5, 5, 4]
# C          [6]
# Name: b, dtype: object

# Step 3: same grouping, but turned back into a DataFrame whose list column
# is named "new".
df1 = df.groupby('a')['b'].apply(list).reset_index(name='new')
df1
# Out:
#    a        new
# 0  A     [1, 2]
# 1  B  [5, 5, 4]
# 2  C        [6]
xxxxxxxxxx
# Calculate the sum of sales grouped by month.
# The .dt accessor requires the "date" column to hold datetime values.
# NOTE: .dt.month is only the month number, so the same month of different
# years lands in one group; group on year and month if the data spans years.
df.groupby(df.date.dt.month)['sales'].sum()
# Out:
# date
# 1    34
# 2    44
# 3    31
# Name: sales, dtype: int64
xxxxxxxxxx
# Way 1: group by "col1" and "col2", then compute the min, max and sum of
# "another_col" within each group.
# The aggregations are given as strings instead of the builtins min/max/sum:
# pandas >= 2.1 emits a FutureWarning for the builtin callables, while the
# string names select the same pandas implementations and column labels.
df.groupby(["col1", "col2"], as_index=False)["another_col"].agg(["min", "max", "sum"])
# Way 2: a dict maps each column to the list of aggregations to apply to it.
df.groupby("cat_col").agg({"col1": ["mean", "std"], "col2": ["median"]})
# Way 3: named aggregation, output_column=(input_column, aggregation).
books.groupby("some_col").agg(
    mean_col1=("col1", "mean"),
    std_col2=("col2", "std"),
    median_col3=("col3", "median"),
)
# Multi-index groupby
df.groupby(level=0).agg({'col': 'mean'})  # Outermost = level 0
# Size per group
df.groupby('col').size()
xxxxxxxxxx
# Usage example: combine several per-group statistics into one summary table.

# Create the example DataFrame first so the groupby below has data to work on.
keys = np.array(
    [
        ["A", "B"],
        ["A", "B"],
        ["A", "B"],
        ["A", "B"],
        ["C", "D"],
        ["C", "D"],
        ["C", "D"],
        ["E", "F"],
        ["E", "F"],
        ["G", "H"],
    ]
)
# Build the key columns and the random value columns separately and place them
# side by side; stacking them into one array would turn every float into a
# string that then has to be converted back.
df = pd.concat(
    [
        pd.DataFrame(keys, columns=["col1", "col2"]),
        pd.DataFrame(
            np.random.randn(len(keys), 4).round(2),
            columns=["col3", "col4", "col5", "col6"],
        ),
    ],
    axis=1,
)

# Group once and reuse the groupby object for every statistic.
gb = df.groupby(["col1", "col2"])
counts = gb.size().to_frame(name="counts")
# Named aggregation yields the renamed columns directly; joining on the shared
# group index puts them next to the counts, and reset_index() turns the group
# keys back into ordinary columns.
summary = counts.join(
    gb.agg(
        col3_mean=("col3", "mean"),
        col4_median=("col4", "median"),
        col4_min=("col4", "min"),
    )
).reset_index()