mirror of
https://github.com/koodiklinikka/palkkakysely.git
synced 2026-03-13 19:03:40 +00:00
103 lines
2.5 KiB
Python
103 lines
2.5 KiB
Python
from typing import Optional
|
|
|
|
import pandas as pd
|
|
|
|
|
|
def q25(x):
    """
    Return the 0.25 quantile (first quartile) of ``x``.

    ``x`` is anything exposing a pandas-style ``quantile`` method,
    such as a Series.
    """
    first_quartile = x.quantile(0.25)
    return first_quartile
|
|
|
|
|
|
def q50(x):
    """
    Return the 0.5 quantile (median) of ``x``.

    ``x`` is anything exposing a pandas-style ``quantile`` method,
    such as a Series.
    """
    middle = x.quantile(0.5)
    return middle
|
|
|
|
|
|
def q75(x):
    """
    Return the 0.75 quantile (third quartile) of ``x``.

    ``x`` is anything exposing a pandas-style ``quantile`` method,
    such as a Series.
    """
    third_quartile = x.quantile(0.75)
    return third_quartile
|
|
|
|
|
|
def q90(x):
    """
    Return the 0.9 quantile (90th percentile) of ``x``.

    ``x`` is anything exposing a pandas-style ``quantile`` method,
    such as a Series.
    """
    ninetieth = x.quantile(0.9)
    return ninetieth
|
|
|
|
|
|
def get_categorical_stats(
    df: pd.DataFrame,
    category_col: str,
    value_col: str,
    *,
    na_as_category: Optional[str] = None,
) -> pd.DataFrame:
    """
    Summarize ``value_col`` per distinct value of ``category_col``.

    The value column is coerced to numeric; rows whose value is
    non-numeric, missing, zero or negative are discarded before
    grouping. The caller's frame is never modified.

    Args:
        df: Input data containing both columns.
        category_col: Name of the column to group by.
        value_col: Name of the column to aggregate.
        na_as_category: If given (and non-empty), missing values in
            ``category_col`` are relabelled with this string via
            ``rename_na`` so that they form a group of their own.

    Returns:
        A DataFrame indexed by category with the columns ``mean``,
        ``min``, ``max``, ``median``, ``count``, ``q25``, ``q50``,
        ``q75`` and ``q90``.
    """
    # Work on a copy so the coercion below does not leak to the caller.
    df = df.copy()
    df[value_col] = pd.to_numeric(df[value_col], errors="coerce")

    # The comparison must be parenthesized: `&` binds tighter than `>`,
    # so `a & b > 0` would evaluate `(a & b) > 0` and let negative
    # values through (or drop even integers via a bitwise AND).
    is_positive_number = df[value_col].notna() & (df[value_col] > 0)
    df = df[is_positive_number]

    if na_as_category:
        rename_na(df, category_col, na_as_category)

    # observed=False is passed explicitly so the grouping behaviour for
    # categorical columns does not depend on the pandas default.
    group = df[[category_col, value_col]].groupby(category_col, observed=False)
    return group[value_col].agg(
        ["mean", "min", "max", "median", "count", q25, q50, q75, q90],
    )
|
|
|
|
|
|
def rename_na(df: pd.DataFrame, col: str, na_name: str) -> None:
    """
    Relabel missing values of ``df[col]`` as ``na_name``, in place.

    The column is converted to strings, its missing entries are
    replaced by ``na_name``, and the result is stored back on ``df``
    as a categorical column. Nothing is returned.
    """
    as_text = df[col].astype("string")
    labelled = as_text.fillna(na_name)
    df[col] = labelled.astype("category")
|
|
|
|
|
|
def _split_multiselect(value: str, sep: str = ", ") -> list[str]:
|
|
"""
|
|
Split a multiselect string by separator, but not inside parentheses.
|
|
|
|
E.g. "CI/CD (GitHub Actions, GitLab CI, Jenkins), AWS"
|
|
-> ["CI/CD (GitHub Actions, GitLab CI, Jenkins)", "AWS"]
|
|
"""
|
|
parts = []
|
|
depth = 0
|
|
current = []
|
|
i = 0
|
|
while i < len(value):
|
|
if value[i] == "(":
|
|
depth += 1
|
|
current.append(value[i])
|
|
elif value[i] == ")":
|
|
depth -= 1
|
|
current.append(value[i])
|
|
elif depth == 0 and value[i:].startswith(sep):
|
|
parts.append("".join(current).strip())
|
|
current = []
|
|
i += len(sep)
|
|
continue
|
|
else:
|
|
current.append(value[i])
|
|
i += 1
|
|
remaining = "".join(current).strip()
|
|
if remaining:
|
|
parts.append(remaining)
|
|
return parts
|
|
|
|
|
|
def explode_multiselect(
    series: pd.Series,
    *,
    sep: str = ", ",
    top_n: int | None = None,
) -> pd.Series:
    """
    Explode a multiselect column into counts of its individual values.

    Missing entries are ignored. Every remaining entry is split on
    ``sep`` (separators inside parentheses do not split), the pieces
    are stripped, empty pieces are discarded, and the occurrences of
    each distinct piece are counted.

    Args:
        series: Column of separator-joined answers.
        sep: Separator between options within one answer.
        top_n: If not None, keep only the first ``top_n`` entries of
            the sorted counts.

    Returns:
        A Series of counts indexed by individual value, sorted
        descending.
    """
    # Cast to object before splitting: for a column with no answers at all
    # (e.g. an all-NaN float column) `apply` on the empty remainder would
    # keep the numeric dtype and the `.str` accessor below would raise.
    answers = series.dropna().astype(object)
    counts = (
        answers.apply(lambda v: _split_multiselect(v, sep))
        .explode()
        .str.strip()
        .loc[lambda s: s != ""]
        .value_counts()
    )
    if top_n is not None:
        counts = counts.head(top_n)
    return counts
|