mirror of
https://github.com/koodiklinikka/palkkakysely.git
synced 2026-03-13 10:03:34 +00:00
Fix multiselect atom split (#22)
This commit is contained in:
@@ -45,6 +45,38 @@ def rename_na(df: pd.DataFrame, col: str, na_name: str) -> None:
|
|||||||
df[col] = df[col].astype("category")
|
df[col] = df[col].astype("category")
|
||||||
|
|
||||||
|
|
||||||
|
def _split_multiselect(value: str, sep: str = ", ") -> list[str]:
    """
    Split a multiselect string by separator, but not inside parentheses.

    E.g. "CI/CD (GitHub Actions, GitLab CI, Jenkins), AWS"
    -> ["CI/CD (GitHub Actions, GitLab CI, Jenkins)", "AWS"]

    Each returned item is stripped of surrounding whitespace. An empty
    trailing item is dropped; empty items between two separators are kept
    as "".

    :param value: The raw multiselect answer.
    :param sep: The separator between options; must be non-empty.
    :return: The options in order of appearance.
    :raises ValueError: If ``sep`` is empty (same as ``str.split``).
    """
    if not sep:
        # An empty separator matches at every position without advancing
        # the cursor, which would loop forever; fail like str.split does.
        raise ValueError("empty separator")

    parts: list[str] = []
    current: list[str] = []
    depth = 0
    i = 0
    while i < len(value):
        char = value[i]
        if char == "(":
            depth += 1
        elif char == ")":
            # Clamp at zero: a stray ")" (e.g. a ":)" smiley) must not
            # push the depth negative and disable splitting for the rest
            # of the string.
            depth = max(depth - 1, 0)
        elif depth == 0 and value.startswith(sep, i):
            # startswith with an offset avoids copying the tail of the
            # string on every character.
            parts.append("".join(current).strip())
            current = []
            i += len(sep)
            continue
        current.append(char)
        i += 1

    remaining = "".join(current).strip()
    if remaining:
        parts.append(remaining)
    return parts
|
||||||
|
|
||||||
|
|
||||||
def explode_multiselect(
|
def explode_multiselect(
|
||||||
series: pd.Series,
|
series: pd.Series,
|
||||||
*,
|
*,
|
||||||
@@ -59,7 +91,7 @@ def explode_multiselect(
|
|||||||
"""
|
"""
|
||||||
counts = (
|
counts = (
|
||||||
series.dropna()
|
series.dropna()
|
||||||
.str.split(sep)
|
.apply(lambda v: _split_multiselect(v, sep))
|
||||||
.explode()
|
.explode()
|
||||||
.str.strip()
|
.str.strip()
|
||||||
.loc[lambda s: s != ""]
|
.loc[lambda s: s != ""]
|
||||||
|
|||||||
Reference in New Issue
Block a user