From 77347c1890cc98c0c86a2a932ff3e07270dcf8e3 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 12 Mar 2026 17:14:48 +0200 Subject: [PATCH] Fix multiselect atom split (#22) --- pulkka/data_utils.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/pulkka/data_utils.py b/pulkka/data_utils.py index 1a5adad..c6d460e 100644 --- a/pulkka/data_utils.py +++ b/pulkka/data_utils.py @@ -45,6 +45,38 @@ def rename_na(df: pd.DataFrame, col: str, na_name: str) -> None: df[col] = df[col].astype("category") +def _split_multiselect(value: str, sep: str = ", ") -> list[str]: + """ + Split a multiselect string by separator, but not inside parentheses. + + E.g. "CI/CD (GitHub Actions, GitLab CI, Jenkins), AWS" + -> ["CI/CD (GitHub Actions, GitLab CI, Jenkins)", "AWS"] + """ + parts = [] + depth = 0 + current = [] + i = 0 + while i < len(value): + if value[i] == "(": + depth += 1 + current.append(value[i]) + elif value[i] == ")": + depth -= 1 + current.append(value[i]) + elif depth == 0 and value[i:].startswith(sep): + parts.append("".join(current).strip()) + current = [] + i += len(sep) + continue + else: + current.append(value[i]) + i += 1 + remaining = "".join(current).strip() + if remaining: + parts.append(remaining) + return parts + + def explode_multiselect( series: pd.Series, *, @@ -59,7 +91,7 @@ def explode_multiselect( """ counts = ( series.dropna() - .str.split(sep) + .apply(lambda v: _split_multiselect(v, sep)) .explode() .str.strip() .loc[lambda s: s != ""]