mirror of
https://github.com/koodiklinikka/palkkakysely.git
synced 2026-01-26 03:14:03 +00:00
42 lines
998 B
Python
42 lines
998 B
Python
from typing import Optional
|
|
|
|
import pandas as pd
|
|
|
|
|
|
def q25(x):
    """Return the 0.25 quantile (first quartile) of ``x``.

    ``x`` only needs to expose a ``quantile`` method (for example a pandas
    Series). Kept as a named function so it can be passed to ``agg``
    alongside the string aggregations.
    """
    return x.quantile(q=0.25)
|
|
|
|
|
|
def q50(x):
    """Return the 0.5 quantile (median) of ``x``.

    ``x`` only needs to expose a ``quantile`` method (for example a pandas
    Series). Kept as a named function so it can be passed to ``agg``
    alongside the string aggregations.
    """
    return x.quantile(q=0.5)
|
|
|
|
|
|
def q75(x):
    """Return the 0.75 quantile (third quartile) of ``x``.

    ``x`` only needs to expose a ``quantile`` method (for example a pandas
    Series). Kept as a named function so it can be passed to ``agg``
    alongside the string aggregations.
    """
    return x.quantile(q=0.75)
|
|
|
|
|
|
def q90(x):
    """Return the 0.9 quantile (90th percentile) of ``x``.

    ``x`` only needs to expose a ``quantile`` method (for example a pandas
    Series). Kept as a named function so it can be passed to ``agg``
    alongside the string aggregations.
    """
    return x.quantile(q=0.9)
|
|
|
|
|
|
def get_categorical_stats(
    df: pd.DataFrame,
    category_col: str,
    value_col: str,
    *,
    na_as_category: Optional[str] = None,
) -> pd.DataFrame:
    """Summarise the positive numeric values of ``value_col`` per category.

    Values are coerced to numbers first; rows whose value is non-numeric,
    missing, zero or negative are excluded before grouping. The caller's
    frame is never modified.

    Args:
        df: Source data containing ``category_col`` and ``value_col``.
        category_col: Name of the column to group by.
        value_col: Name of the column whose values are aggregated.
        na_as_category: When set to a non-empty string, rows with a missing
            category are kept and grouped under this label. When ``None``
            (or empty), the category column is left as-is, so rows with a
            missing category fall out through ``groupby``'s default NA
            handling.

    Returns:
        A frame indexed by category with the columns ``mean``, ``min``,
        ``max``, ``median``, ``count``, ``q25``, ``q50``, ``q75`` and
        ``q90``.
    """
    # Coerce up front so that non-numeric entries turn into NaN and can be
    # filtered out together with genuinely missing values.
    numeric = pd.to_numeric(df[value_col], errors="coerce")

    # The parentheses are essential: `&` binds tighter than `>`, so the
    # unparenthesised form evaluates `(notna & values) > 0`, which lets
    # negative floats through and drops even integers.
    keep = numeric.notna() & (numeric > 0)

    # Copy the filtered rows so the column assignments below operate on an
    # independent frame rather than on a slice of the caller's data.
    df = df.loc[keep, [category_col, value_col]].copy()
    df[value_col] = numeric[keep]

    if na_as_category:
        # Go through the nullable string dtype so that every kind of missing
        # marker is recognised, then relabel and convert to a categorical.
        labels = df[category_col].astype("string").fillna(na_as_category)
        df[category_col] = labels.astype("category")

    grouped = df.groupby(category_col)[value_col]
    return grouped.agg(
        ["mean", "min", "max", "median", "count", q25, q50, q75, q90]
    )
|