From 663cd3d3494ed04e18a34814aeb61b19fceff27b Mon Sep 17 00:00:00 2001
From: Aarni Koskela <akx@iki.fi>
Date: Wed, 11 Mar 2026 11:08:14 +0200
Subject: [PATCH] Add 2025 survey data support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 2025 survey uses a single English-only xlsx (instead of separate
fi/en files) with a restructured schema: compensation is split into
base salary, commission, lomaraha, bonus, and equity components;
working time is h/week instead of percentage; and competitive salary
is categorical instead of boolean. Vuositulot is now synthesized
from the component fields.

Drop COLUMN_MAP_2024, COLUMN_MAP_2024_EN_TO_FI, KIKY_OTHER_COL,
split_boolean_column_to_other, and the separate fi/en expected row
counts; VALUE_MAP_2024_EN_TO_FI becomes VALUE_MAP_2025 and IDS_TO_DROP
becomes IDS_TO_DROP_2025. map_sukupuoli (now unused), apply_fixups, and
the gender value lists are kept. The removed code exists in version
history.

- KKPALKKA now includes base salary + commission (median 5500 → 5800)
- Apply map_numberlike to tuntilaskutus and vuosilaskutus columns to
  handle string values like "60 000" and "100 000"
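- Filter out zeros when computing tunnusluvut on the index page so
  stats reflect actual reported values
- Drop duplicate submissions (rows identical on every column except
  Timestamp and ID), keeping the earliest one
- Add explode_multiselect helper for counting comma-separated
  multiselect answers
- Format euro amounts on the index page with browser-native fi-FI
  currency formatting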

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 pulkka/column_maps.py | 149 +++++++++++++++++++---------------
 pulkka/data_ingest.py | 180 +++++++++++++++++++++---------------------
 pulkka/data_utils.py  |  25 ++++++
 template/index.html   |  18 ++++-
 4 files changed, 213 insertions(+), 159 deletions(-)

diff --git a/pulkka/column_maps.py b/pulkka/column_maps.py
index 56f7d60..4f09a35 100644
--- a/pulkka/column_maps.py
+++ b/pulkka/column_maps.py
@@ -3,9 +3,6 @@ from __future__ import annotations
 IKA_COL = "Ikä"
 KAUPUNKI_COL = "Kaupunki"
 KIKY_COL = "Onko palkkasi nykyroolissasi mielestäsi kilpailukykyinen?"
-KIKY_OTHER_COL = (
-    "Onko palkkasi nykyroolissasi mielestäsi kilpailukykyinen? (muut vastaukset)"
-)
 KKPALKKA_COL = "Kuukausipalkka"
 KK_TULOT_COL = "Kk-tulot (laskennallinen)"
 KK_TULOT_NORM_COL = "Kk-tulot (laskennallinen, normalisoitu)"
@@ -18,9 +15,7 @@ PALKANSAAJA_VAI_LASKUTTAJA_COL = "Palkansaaja vai laskuttaja"
 PALVELUT_COL = "Palvelut"
 ROOLI_COL = "Rooli"
 ROOLI_NORM_COL = "Rooli (normalisoitu)"
-SIIRTYNYT_COL = (
-    "Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2023 jälkeen?"
-)
+SIIRTYNYT_COL = "Siirtynyt palkansaaja/laskuttaja"
 SUKUPUOLI_COL = "Sukupuoli"
 TUNTILASKUTUS_ALV0_COL = "Tuntilaskutus (ALV 0%, euroina)"
 TYOAIKA_COL = "Työaika"
@@ -30,68 +25,67 @@ VUOSILASKUTUS_ALV0_COL = "Vuosilaskutus (ALV 0%, euroina)"
 VUOSITULOT_COL = "Vuositulot"
 ID_COL = "Vastaustunniste"
 
-COLUMN_MAP_2024 = {
-    "Timestamp": "Timestamp",
-    "Oletko palkansaaja vai laskuttaja?": PALKANSAAJA_VAI_LASKUTTAJA_COL,
-    "Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2023 jälkeen?": SIIRTYNYT_COL,
-    "Ikä": "Ikä",
-    "Sukupuoli": "Sukupuoli",
-    "Työkokemus alalta (vuosina)": TYOKOKEMUS_COL,
-    "Koulutustaustasi": "Koulutustaustasi",
-    "Tulojen muutos viime vuodesta (%)": "Tulojen muutos viime vuodesta (%)",
-    "Montako vuotta olet tehnyt laskuttavaa työtä alalla?": "Montako vuotta olet tehnyt laskuttavaa työtä alalla?",
-    "Mitä palveluja tarjoat?": PALVELUT_COL,
-    "Tuntilaskutus (ALV 0%, euroina)": TUNTILASKUTUS_ALV0_COL,
-    "Vuosilaskutus (ALV 0%, euroina)": VUOSILASKUTUS_ALV0_COL,
-    "Hankitko asiakkaasi itse suoraan vai käytätkö välitysfirmojen palveluita?": "Hankitko asiakkaasi itse suoraan vai käytätkö välitysfirmojen palveluita?",
-    "Mistä asiakkaat ovat?": MISTA_ASIAKKAAT_COL,
-    "Työpaikka": "Työpaikka",
-    "Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee?": KAUPUNKI_COL,
-    "Millaisessa yrityksessä työskentelet?": MILLAISESSA_COL,
-    "Työaika": TYOAIKA_COL,
-    "Kuinka suuren osan ajasta teet lähityönä toimistolla?": LAHITYO_COL,
-    "Rooli / titteli": ROOLI_COL,
-    "Kuukausipalkka (brutto, euroina)": KKPALKKA_COL,
-    "Vuositulot (sis. bonukset, osingot yms, euroina)": VUOSITULOT_COL,
-    "Vapaa kuvaus kokonaiskompensaatiomallista": "Vapaa kuvaus kokonaiskompensaatiomallista",
-    "Onko palkkasi nykyroolissasi mielestäsi kilpailukykyinen?": KIKY_COL,
-    "Vapaa sana": "Vapaa sana",
-    "Palautetta kyselystä ja ideoita ensi vuoden kyselyyn": PALAUTE_COL,
-}
+COMMISSION_COL = "Provisio (kk, brutto)"
+LOMARAHA_COL = "Lomaraha (EUR)"
+BONUS_COL = "Bonus (EUR)"
+EQUITY_COL = "Osakkeet/optiot (EUR)"
+SENIORITY_COL = "Seniority"
 
-COLUMN_MAP_2024_EN_TO_FI = {
+COLUMN_MAP_2025 = {
     "Timestamp": "Timestamp",
-    "Employee or entrepreneur": "Oletko palkansaaja vai laskuttaja?",
-    "Have you switched from employment to entrepreneurship or vice versa after 1.10.2023?": "Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2023 jälkeen?",
-    "Age": "Ikä",
-    "Gender": "Sukupuoli",
-    "Relevant work experience from the industry (in years)": "Työkokemus alalta (vuosina)",
+    "Employee or entrepreneur": PALKANSAAJA_VAI_LASKUTTAJA_COL,
+    "Switched from employment to entrepreneurship, or vice versa, in 2025?": SIIRTYNYT_COL,
+    "Age": IKA_COL,
+    "Gender": SUKUPUOLI_COL,
+    "Finnish fluency": "Suomen kielen taito",
+    "Work language": "Työkieli",
+    "Relevant work experience from the industry (in years)": TYOKOKEMUS_COL,
+    "Years at current employer": "Vuosia nykyisellä työnantajalla",
+    "Companies worked for": "Työpaikkojen lukumäärä",
+    "Company size": "Yrityksen koko",
     "Education": "Koulutustaustasi",
-    "Change in income from last year (in %)": "Tulojen muutos viime vuodesta (%)",
-    "How many years have you worked as an entrepreneur in this industry?": "Montako vuotta olet tehnyt laskuttavaa työtä alalla?",
-    "What services do you offer?": "Mitä palveluja tarjoat?",
-    "Hourly rate (VAT 0%, in euros)": "Tuntilaskutus (ALV 0%, euroina)",
-    "Yearly billing (VAT 0%, in euros)": "Vuosilaskutus (ALV 0%, euroina)",
+    "Field of Study": "Opintoala",
+    "Change in pay rate from last year (%)": "Tulojen muutos viime vuodesta (%)",
+    "Years as entrepreneur": "Montako vuotta olet tehnyt laskuttavaa työtä alalla?",
+    "What services do you offer?": PALVELUT_COL,
+    "Hourly rate (VAT 0%, in euros)": TUNTILASKUTUS_ALV0_COL,
+    "Yearly billing (VAT 0%, in euros)": VUOSILASKUTUS_ALV0_COL,
+    "Billable hours per week": "Laskutettavat tunnit viikossa",
+    "Weeks not billing": "Viikot ilman laskutusta",
+    "Billing methods": "Laskutustavat",
+    "Contract length": "Sopimuksen pituus",
     "Do you use agencies or find your clients yourself?": "Hankitko asiakkaasi itse suoraan vai käytätkö välitysfirmojen palveluita?",
-    "Where are your clients from?": "Mistä asiakkaat ovat?",
-    "Company": "Työpaikka",
-    "In which city is your office?": "Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee?",
-    "What kind of a company you work in?": "Millaisessa yrityksessä työskentelet?",
-    "Full time / part time": "Työaika",
-    "How much of your work time you spend in company office? (in %)": "Kuinka suuren osan ajasta teet lähityönä toimistolla?",
-    "Role / title": "Rooli / titteli",
-    "Monthly salary (gross, in EUR)": "Kuukausipalkka (brutto, euroina)",
-    "Yearly income (incl. bonuses, etc; in EUR)": "Vuositulot (sis. bonukset, osingot yms, euroina)",
+    "Where are your clients from?": MISTA_ASIAKKAAT_COL,
+    "Company": TYOPAIKKA_COL,
+    "City": KAUPUNKI_COL,
+    "What kind of a company you work in?": MILLAISESSA_COL,
+    "Working time (h/week)": TYOAIKA_COL,
+    "Time in office (%)": LAHITYO_COL,
+    "Role / title": ROOLI_COL,
+    "Seniority level": SENIORITY_COL,
+    "Formal Seniority": "Virallinen senioriteetti",
+    "Base salary (gross, monthly EUR)": KKPALKKA_COL,
+    "Commission (gross, monthly EUR)": COMMISSION_COL,
+    "Lomaraha (Holiday bonus, in EUR)": LOMARAHA_COL,
+    "Bonus (EUR)": BONUS_COL,
+    "Equity (EUR)": EQUITY_COL,
     "Free description of your compensation model": "Vapaa kuvaus kokonaiskompensaatiomallista",
-    "Is your salary competitive?": "Onko palkkasi nykyroolissasi mielestäsi kilpailukykyinen?",
+    "Competitive salary": KIKY_COL,
+    "Bonus": "Bonukset (kuvaus)",
+    "Non-fringe benefits": "Edut (ei luontoisedut)",
+    "Yearly Tax-Free Benefits (EUR)": "Vuosittaiset verovapaat edut (EUR)",
+    "Fringe benefits (luontoisedut)": "Luontoisedut",
+    "Operating system": "Käyttöjärjestelmä",
+    "Language": "Ohjelmointikieli",
+    "Web Frameworks": "Web-kehykset",
+    "Data Engineering & Machine Learning": "Data & ML",
+    "DevOps & Cloud Platforms": "DevOps & pilvi",
+    "Databases": "Tietokannat",
     "What was left unasked that you want to answer to?": "Vapaa sana",
-    "Feedback of the survey": "Palautetta kyselystä ja ideoita ensi vuoden kyselyyn",
+    "Feedback of the survey": PALAUTE_COL,
 }
 
-# ensure all columns have translations
-assert set(COLUMN_MAP_2024.keys()) == set(COLUMN_MAP_2024_EN_TO_FI.values())
-
-VALUE_MAP_2024_EN_TO_FI = {
+VALUE_MAP_2025 = {
     PALKANSAAJA_VAI_LASKUTTAJA_COL: {
         "Employee": "Palkansaaja",
         "Entrepreneur": "Laskuttaja",
@@ -122,9 +116,36 @@ VALUE_MAP_2024_EN_TO_FI = {
         "Tampere (etänä Berliiniin)": "Tampere",
         "Turku/remote (HQ Austin, TX)": "Turku",
         "Ulkomailla": "Ulkomaat",
+        "Remote": "Etätyö",
+        "remote": "Etätyö",
+        "Fully remote": "Etätyö",
+        "Fully remote work": "Etätyö",
+        "100% remote, no main office": "Etätyö",
+        "Completely distributed and remote": "Etätyö",
+        "Remote without HQ": "Etätyö",
+        "Remote (US)": "Ulkomaat",
+        "Outside Finland": "Ulkomaat",
+        "Abroad": "Ulkomaat",
+        "No centrla office, multiple locations with employees": "Etätyö",
     },
     MILLAISESSA_COL: {
         "Product company with softaware as their core business": "Tuotetalossa, jonka core-bisnes on softa",
+        "Product company with software as their core business": "Tuotetalossa, jonka core-bisnes on softa",
+        "A company where software is a support role (for example banks or healthcare)": "Yritys, jossa softa tukirooli",
+        "Consulting": "Konsultointi",
+        "Public or third sector": "Julkinen/kolmas sektori",
+    },
+    SUKUPUOLI_COL: {
+        "Male": "mies",
+        "Female": "nainen",
+        "Non-binary": "muu",
+        "Prefer not to say": None,
+    },
+    KIKY_COL: {
+        "Above market": "Yli markkinatason",
+        "Average market": "Markkinataso",
+        "Below market": "Alle markkinatason",
+        "Not sure": "En osaa sanoa",
     },
 }
 
@@ -211,9 +232,7 @@ MALE_GENDER_VALUES = (
     "äiä",
 )
 
-IDS_TO_DROP = {
-    "0bf579f8b0a771b9",  # 2 euron palkka, rooli "2"
-    "9a3b73d810f6e983",  # apache hyökkäyshelikopteri
+IDS_TO_DROP_2025 = {
+    "18121abbdb13303c",  # duplicate of d5ac88f64a922e6c (submitted 3 min later)
 }
-FI_EXPECTED_ROW_COUNT = 682
-EN_EXPECTED_ROW_COUNT = 51
+EXPECTED_ROW_COUNT_2025 = 683
diff --git a/pulkka/data_ingest.py b/pulkka/data_ingest.py
index 2afaf5e..28976ab 100644
--- a/pulkka/data_ingest.py
+++ b/pulkka/data_ingest.py
@@ -5,27 +5,25 @@ import re
 import warnings
 
 import numpy as np
-import pandas
 import pandas as pd
 
 from pulkka.column_maps import (
-    BOOLEAN_TEXT_TO_BOOLEAN_MAP,
-    COLUMN_MAP_2024,
-    COLUMN_MAP_2024_EN_TO_FI,
+    BONUS_COL,
+    COLUMN_MAP_2025,
+    COMMISSION_COL,
     COMPANY_MAP,
-    EN_EXPECTED_ROW_COUNT,
+    EQUITY_COL,
+    EXPECTED_ROW_COUNT_2025,
     FEMALE_GENDER_VALUES,
-    FI_EXPECTED_ROW_COUNT,
     ID_COL,
-    IDS_TO_DROP,
+    IDS_TO_DROP_2025,
     IKA_COL,
     KIKY_COL,
-    KIKY_OTHER_COL,
     KK_TULOT_COL,
     KK_TULOT_NORM_COL,
     KKPALKKA_COL,
     LAHITYO_COL,
-    LANG_COL,
+    LOMARAHA_COL,
     MALE_GENDER_VALUES,
     NO_GENDER_VALUES,
     OTHER_GENDER_VALUES,
@@ -34,16 +32,18 @@ from pulkka.column_maps import (
     ROOLI_COL,
     ROOLI_NORM_COL,
     SUKUPUOLI_COL,
+    TUNTILASKUTUS_ALV0_COL,
     TYOAIKA_COL,
     TYOKOKEMUS_COL,
     TYOPAIKKA_COL,
-    VALUE_MAP_2024_EN_TO_FI,
+    VALUE_MAP_2025,
+    VUOSILASKUTUS_ALV0_COL,
     VUOSITULOT_COL,
 )
 from pulkka.config import DATA_DIR, YEAR
 
 
-def map_sukupuoli(r: pd.Series) -> str | None:
+def map_sukupuoli(r: pd.Series) -> str | None:  # Unused in 2025
     value = r[SUKUPUOLI_COL]
     if not isinstance(value, str):
         return value
@@ -84,44 +84,29 @@ def map_numberlike(d):
     return d
 
 
-def ucfirst(val):
+def ucfirst(val: object) -> object:  # non-str input is passed through unchanged
     if isinstance(val, str):
-        return val[0].upper() + val[1:]
+        return val[:1].upper() + val[1:]  # slice, not index: "" must not raise
     return val
 
 
 def hash_row(r: pd.Series) -> str:
-    source_data = f"{r[LANG_COL]}.{int(r.Timestamp.timestamp() * 1000)}"
+    source_data = (
+        f"en.{int(r.Timestamp.timestamp() * 1000)}"  # NB (2025): hard-codes `en`!
+    )
     return hashlib.sha256(source_data.encode()).hexdigest()[:16]
 
 
 def read_initial_dfs() -> pd.DataFrame:
-    df_fi: pd.DataFrame = pd.read_excel(
-        DATA_DIR / "results-fi.xlsx",
-        skiprows=[1],  # Google Sheets exports one empty row
-    )
-    df_fi[LANG_COL] = "fi"
+    df: pd.DataFrame = pd.read_excel(DATA_DIR / "data.xlsx")
+    df.columns = df.columns.str.strip()
 
-    if len(df_fi) < FI_EXPECTED_ROW_COUNT:
+    if len(df) < EXPECTED_ROW_COUNT_2025:
         raise ValueError(
-            f"Expected at least {FI_EXPECTED_ROW_COUNT} rows in the Finnish data, got {len(df_fi)}",
+            f"Expected at least {EXPECTED_ROW_COUNT_2025} rows, got {len(df)}",
         )
 
-    df_en: pd.DataFrame = pd.read_excel(
-        DATA_DIR / "results-en.xlsx",
-        skiprows=[1],  # Google Sheets exports one empty row
-    )
-    df_en[LANG_COL] = "en"
-
-    if len(df_fi) < EN_EXPECTED_ROW_COUNT:
-        raise ValueError(
-            f"Expected at least {EN_EXPECTED_ROW_COUNT} rows in the English data, got {len(df_en)}",
-        )
-
-    df_en = df_en.rename(columns=COLUMN_MAP_2024_EN_TO_FI)
-    df = pd.concat([df_fi, df_en], ignore_index=True)
     df = df[df["Timestamp"].notna()]  # Remove rows with no timestamp
-    df[LANG_COL] = df[LANG_COL].astype("category")
     # Give each row a unique hash ID
     df[ID_COL] = df.apply(hash_row, axis=1)
     # Ensure truncated sha is unique
@@ -146,38 +131,75 @@ def map_case_insensitive(series: pd.Series, mapping: dict[str, str]) -> pd.Serie
 
 
 def read_data() -> pd.DataFrame:
-    if YEAR != "2024":
+    if YEAR != "2025":
         raise ValueError(
-            "This code only works for 2024. "
+            "This code only works for 2025. "
             "Please use an older revision for older data.",
         )
     df = read_initial_dfs()
 
-    df = df.rename(columns=COLUMN_MAP_2024)
+    df = df.rename(columns=COLUMN_MAP_2025)
 
-    for col, val_map in VALUE_MAP_2024_EN_TO_FI.items():
-        df[col] = df[col].map(val_map).fillna(df[col]).astype("category")
+    for col, val_map in VALUE_MAP_2025.items():  # unmapped values pass through; a None target becomes NA
+        df[col] = df[col].map(lambda v, m=val_map: m.get(v, v)).astype("category")
 
     # Drop known bogus data
-    df = df.drop(df[df[ID_COL].isin(IDS_TO_DROP)].index)
+    df = df.drop(df[df[ID_COL].isin(IDS_TO_DROP_2025)].index)
 
-    df[SUKUPUOLI_COL] = df.apply(map_sukupuoli, axis=1).astype("category")
+    # Drop duplicate submissions: rows identical on all columns except
+    # Timestamp and ID (keep the earliest submission)
+    content_cols = [c for c in df.columns if c not in ("Timestamp", ID_COL)]
+    before = len(df)
+    df = df.sort_values("Timestamp").drop_duplicates(subset=content_cols, keep="first")
+    n_dupes = before - len(df)
+    if n_dupes:
+        warnings.warn(f"Dropped {n_dupes} duplicate submission(s)")
+
+    # Gender is already mapped via VALUE_MAP_2025
+    df[SUKUPUOLI_COL] = df[SUKUPUOLI_COL].astype("category")
     df[IKA_COL] = df[IKA_COL].astype("category")
+    df[KIKY_COL] = df[KIKY_COL].astype("category")
 
-    # Assume that people entering 37.5 (hours) as their työaika means 100%
-    df.loc[df[TYOAIKA_COL] == 37.5, TYOAIKA_COL] = 100
-    # Assume there is no actual 10x koodari among us
-    df.loc[df[TYOAIKA_COL] == 1000, TYOAIKA_COL] = 100
-
-    df[TYOAIKA_COL] = to_percentage(df[TYOAIKA_COL], 100)
+    # Working time is in h/week — normalize to fraction of 37.5h
+    df[TYOAIKA_COL] = to_percentage(df[TYOAIKA_COL], 37.5)
+    # Time in office is already a percentage
     df[LAHITYO_COL] = to_percentage(df[LAHITYO_COL], 100)
 
-    # Split out non-boolean answers from KIKY_COL to KIKY_OTHER_COL
-    df = split_boolean_column_to_other(df, KIKY_COL, KIKY_OTHER_COL)
-
     # Try to clean up numbers with spaces, etc. to real numbers
     df[KKPALKKA_COL] = df[KKPALKKA_COL].apply(map_numberlike)
-    df[VUOSITULOT_COL] = df[VUOSITULOT_COL].apply(map_numberlike)
+    df[TUNTILASKUTUS_ALV0_COL] = pd.to_numeric(
+        df[TUNTILASKUTUS_ALV0_COL].apply(map_numberlike),
+        errors="coerce",
+    )
+    df[VUOSILASKUTUS_ALV0_COL] = pd.to_numeric(
+        df[VUOSILASKUTUS_ALV0_COL].apply(map_numberlike),
+        errors="coerce",
+    )
+
+    # Synthesize Vuositulot from components:
+    # (base_salary + commission) * 12 + lomaraha + bonus + equity
+    for comp_col in [COMMISSION_COL, LOMARAHA_COL, BONUS_COL, EQUITY_COL]:
+        df[comp_col] = pd.to_numeric(
+            df[comp_col].apply(map_numberlike),
+            errors="coerce",
+        ).fillna(0)
+
+    # Fold commission into monthly salary so KKPALKKA = base + commission
+    df[KKPALKKA_COL] = (
+        pd.to_numeric(df[KKPALKKA_COL], errors="coerce").fillna(0) + df[COMMISSION_COL]
+    )
+
+    base_yearly = df[KKPALKKA_COL] * 12
+    lomaraha = df.get(LOMARAHA_COL, 0)
+    bonus = df.get(BONUS_COL, 0)
+    equity = df.get(EQUITY_COL, 0)
+
+    df[VUOSITULOT_COL] = base_yearly + lomaraha + bonus + equity
+    # If base salary is missing/zero, vuositulot should be NaN
+    df.loc[
+        pd.to_numeric(df[KKPALKKA_COL], errors="coerce").fillna(0) == 0,
+        VUOSITULOT_COL,
+    ] = np.nan
 
     # Fix up Työpaikka
     df[TYOPAIKKA_COL] = df[TYOPAIKKA_COL].replace("-", np.nan)
@@ -214,7 +236,7 @@ def read_data() -> pd.DataFrame:
     return df
 
 
-def to_percentage(ser: pandas.Series, norm_max: float) -> pandas.Series:
+def to_percentage(ser: pd.Series, norm_max: float) -> pd.Series:
     """
     Convert a series of numbers to a percentage
     """
@@ -227,54 +249,16 @@ def to_percentage(ser: pandas.Series, norm_max: float) -> pandas.Series:
     return ser.clip(lower=0)
 
 
-def split_boolean_column_to_other(df, col, other_col):
-    df[col] = df[col].replace(BOOLEAN_TEXT_TO_BOOLEAN_MAP)
-    df[other_col] = df[col].apply(
-        lambda r: r if (r and not isinstance(r, bool)) else None,
-    )
-    df[col] = (
-        df[col]
-        .apply(
-            lambda value: (
-                ["Ei", "Kyllä"][value]
-                if isinstance(value, bool)
-                else (np.nan if not value else "Muu")
-            ),
-        )
-        .astype("category")
-    )
-    # reorder columns so that other_col is right after col
-    cols = list(df.columns)
-    cols.remove(other_col)
-    cols.insert(cols.index(col) + 1, other_col)
-    df = df[cols]
-    return df
-
-
-def force_age_numeric(df):
+def force_age_numeric(df: pd.DataFrame) -> pd.DataFrame:
     age_map = {}
     for cat in df[IKA_COL].cat.categories:
-        m = re.match(r"^(\d+)-(\d+) v", cat)
+        m = re.match(r"^(\d+)-(\d+)( v)?", cat)
         if m:
             age_map[cat] = int(round(float(m.group(1)) + float(m.group(2))) / 2)
     df[IKA_COL] = df[IKA_COL].apply(lambda r: age_map.get(r, r))
     return df
 
 
-def main():
-    pd.set_option("display.max_column", None)
-    pd.set_option("display.max_rows", None)
-    pd.set_option("display.max_seq_items", None)
-    pd.set_option("display.max_colwidth", 500)
-    pd.set_option("expand_frame_repr", True)
-    df = read_data()
-    print(df.head())
-
-
-if __name__ == "__main__":
-    main()
-
-
 def apply_fixups(df: pd.DataFrame, fixups: list[tuple[dict, dict]]) -> pd.DataFrame:
     for match_cond, replace_cond in fixups:
         match_keys, match_values = zip(*match_cond.items())
@@ -286,3 +270,17 @@ def apply_fixups(df: pd.DataFrame, fixups: list[tuple[dict, dict]]) -> pd.DataFr
         replace_keys, replace_values = zip(*replace_cond.items())
         df.loc[ix, list(replace_keys)] = replace_values
     return df
+
+
+def main() -> None:
+    pd.set_option("display.max_column", None)
+    pd.set_option("display.max_rows", None)
+    pd.set_option("display.max_seq_items", None)
+    pd.set_option("display.max_colwidth", 500)
+    pd.set_option("expand_frame_repr", True)
+    df = read_data()
+    print(df.head())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pulkka/data_utils.py b/pulkka/data_utils.py
index 002a4ee..1a5adad 100644
--- a/pulkka/data_utils.py
+++ b/pulkka/data_utils.py
@@ -43,3 +43,28 @@ def rename_na(df: pd.DataFrame, col: str, na_name: str) -> None:
     df[col] = df[col].astype("string")
     df.loc[df[col].isna(), col] = na_name
     df[col] = df[col].astype("category")
+
+
+def explode_multiselect(
+    series: pd.Series,
+    *,
+    sep: str = ", ",
+    top_n: int | None = None,
+) -> pd.Series:
+    """
+    Explode a comma-separated multiselect column into value counts.
+
+    Returns a Series of counts indexed by individual values,
+    sorted descending. Optionally limited to top_n entries.
+    """
+    counts = (
+        series.dropna()
+        .str.split(sep)
+        .explode()
+        .str.strip()
+        .loc[lambda s: s != ""]
+        .value_counts()
+    )
+    if top_n is not None:
+        counts = counts.head(top_n)
+    return counts
diff --git a/template/index.html b/template/index.html
index fa26013..12a4847 100644
--- a/template/index.html
+++ b/template/index.html
@@ -1,8 +1,11 @@
 {% extends "_base.html" %}
+{% macro eur_span(number) -%}
+    <span title="{{ number }}" class="eur" data-number="{{ number }}">{{ number }}&nbsp;€</span>
+{%- endmacro %}
 {% macro tunnusluvut_points(df, col_name, title) %}
-    {% with num_kk = df[pd.to_numeric(df[col_name], errors='coerce').notnull()][col_name] %}
-        <li title="n = {{ num_kk.count() }}">{{ title }}, keskiarvo = {{ num_kk.mean()|round(0) }} €</li>
-        <li title="n = {{ num_kk.count() }}">{{ title }}, mediaani = {{ num_kk.median()|round(0) }} €</li>
+    {% with num_kk = df[col_name][pd.to_numeric(df[col_name], errors='coerce') > 0].dropna() %}
+        <li title="n = {{ num_kk.count() }}">{{ title }}, keskiarvo = {{ eur_span(num_kk.mean()|round(0)) }}</li>
+        <li title="n = {{ num_kk.count() }}">{{ title }}, mediaani = {{ eur_span(num_kk.median()|round(0)) }}</li>
     {% endwith %}
 {% endmacro %}
 {% block body %}
@@ -74,6 +77,15 @@
         International</a> (CC&nbsp;BY&nbsp;4.0).<br>
         Mankelointityökalujen lisenssi on <a href="https://opensource.org/licenses/MIT">MIT</a>.
     </p>
+    <script>
+    for (const eur of document.querySelectorAll(".eur")) {
+      // data-number is already rounded to whole euros; format as fi-FI currency without a ",00" tail
+      const number = parseFloat(eur.dataset.number);
+      if (!isNaN(number)) {
+        eur.textContent = number.toLocaleString("fi-FI", {style: "currency", currency: "EUR", minimumFractionDigits: 0, maximumFractionDigits: 0});
+      }
+    }
+    </script>
 {% endblock %}
 {% block footer %}
     <footer>