mirror of
https://github.com/koodiklinikka/palkkakysely.git
synced 2026-03-13 03:03:42 +00:00
Add 2025 survey data support
The 2025 survey uses a single English-only xlsx (instead of separate fi/en files) with a restructured schema: compensation is split into base salary, commission, lomaraha, bonus, and equity components; working time is h/week instead of percentage; and competitive salary is categorical instead of boolean. Vuositulot is now synthesized from the component fields.

Drop COLUMN_MAP_2024, COLUMN_MAP_2024_EN_TO_FI, VALUE_MAP_2024_EN_TO_FI, read_initial_dfs_2024, read_data_2024, map_sukupuoli, map_vuositulot, split_boolean_column_to_other, apply_fixups, and the associated gender value lists and boolean text maps. All of this exists in version history.

- KKPALKKA now includes base salary + commission (median 5500 → 5800)
- Apply map_numberlike to tuntilaskutus and vuosilaskutus columns to handle string values like "60 000" and "100 000"
- Filter out zeros when computing tunnusluvut on the index page so stats reflect actual reported values

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,9 +3,6 @@ from __future__ import annotations
|
|||||||
IKA_COL = "Ikä"
|
IKA_COL = "Ikä"
|
||||||
KAUPUNKI_COL = "Kaupunki"
|
KAUPUNKI_COL = "Kaupunki"
|
||||||
KIKY_COL = "Onko palkkasi nykyroolissasi mielestäsi kilpailukykyinen?"
|
KIKY_COL = "Onko palkkasi nykyroolissasi mielestäsi kilpailukykyinen?"
|
||||||
KIKY_OTHER_COL = (
|
|
||||||
"Onko palkkasi nykyroolissasi mielestäsi kilpailukykyinen? (muut vastaukset)"
|
|
||||||
)
|
|
||||||
KKPALKKA_COL = "Kuukausipalkka"
|
KKPALKKA_COL = "Kuukausipalkka"
|
||||||
KK_TULOT_COL = "Kk-tulot (laskennallinen)"
|
KK_TULOT_COL = "Kk-tulot (laskennallinen)"
|
||||||
KK_TULOT_NORM_COL = "Kk-tulot (laskennallinen, normalisoitu)"
|
KK_TULOT_NORM_COL = "Kk-tulot (laskennallinen, normalisoitu)"
|
||||||
@@ -18,9 +15,7 @@ PALKANSAAJA_VAI_LASKUTTAJA_COL = "Palkansaaja vai laskuttaja"
|
|||||||
PALVELUT_COL = "Palvelut"
|
PALVELUT_COL = "Palvelut"
|
||||||
ROOLI_COL = "Rooli"
|
ROOLI_COL = "Rooli"
|
||||||
ROOLI_NORM_COL = "Rooli (normalisoitu)"
|
ROOLI_NORM_COL = "Rooli (normalisoitu)"
|
||||||
SIIRTYNYT_COL = (
|
SIIRTYNYT_COL = "Siirtynyt palkansaaja/laskuttaja"
|
||||||
"Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2023 jälkeen?"
|
|
||||||
)
|
|
||||||
SUKUPUOLI_COL = "Sukupuoli"
|
SUKUPUOLI_COL = "Sukupuoli"
|
||||||
TUNTILASKUTUS_ALV0_COL = "Tuntilaskutus (ALV 0%, euroina)"
|
TUNTILASKUTUS_ALV0_COL = "Tuntilaskutus (ALV 0%, euroina)"
|
||||||
TYOAIKA_COL = "Työaika"
|
TYOAIKA_COL = "Työaika"
|
||||||
@@ -30,68 +25,67 @@ VUOSILASKUTUS_ALV0_COL = "Vuosilaskutus (ALV 0%, euroina)"
|
|||||||
VUOSITULOT_COL = "Vuositulot"
|
VUOSITULOT_COL = "Vuositulot"
|
||||||
ID_COL = "Vastaustunniste"
|
ID_COL = "Vastaustunniste"
|
||||||
|
|
||||||
COLUMN_MAP_2024 = {
|
COMMISSION_COL = "Provisio (kk, brutto)"
|
||||||
"Timestamp": "Timestamp",
|
LOMARAHA_COL = "Lomaraha (EUR)"
|
||||||
"Oletko palkansaaja vai laskuttaja?": PALKANSAAJA_VAI_LASKUTTAJA_COL,
|
BONUS_COL = "Bonus (EUR)"
|
||||||
"Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2023 jälkeen?": SIIRTYNYT_COL,
|
EQUITY_COL = "Osakkeet/optiot (EUR)"
|
||||||
"Ikä": "Ikä",
|
SENIORITY_COL = "Seniority"
|
||||||
"Sukupuoli": "Sukupuoli",
|
|
||||||
"Työkokemus alalta (vuosina)": TYOKOKEMUS_COL,
|
|
||||||
"Koulutustaustasi": "Koulutustaustasi",
|
|
||||||
"Tulojen muutos viime vuodesta (%)": "Tulojen muutos viime vuodesta (%)",
|
|
||||||
"Montako vuotta olet tehnyt laskuttavaa työtä alalla?": "Montako vuotta olet tehnyt laskuttavaa työtä alalla?",
|
|
||||||
"Mitä palveluja tarjoat?": PALVELUT_COL,
|
|
||||||
"Tuntilaskutus (ALV 0%, euroina)": TUNTILASKUTUS_ALV0_COL,
|
|
||||||
"Vuosilaskutus (ALV 0%, euroina)": VUOSILASKUTUS_ALV0_COL,
|
|
||||||
"Hankitko asiakkaasi itse suoraan vai käytätkö välitysfirmojen palveluita?": "Hankitko asiakkaasi itse suoraan vai käytätkö välitysfirmojen palveluita?",
|
|
||||||
"Mistä asiakkaat ovat?": MISTA_ASIAKKAAT_COL,
|
|
||||||
"Työpaikka": "Työpaikka",
|
|
||||||
"Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee?": KAUPUNKI_COL,
|
|
||||||
"Millaisessa yrityksessä työskentelet?": MILLAISESSA_COL,
|
|
||||||
"Työaika": TYOAIKA_COL,
|
|
||||||
"Kuinka suuren osan ajasta teet lähityönä toimistolla?": LAHITYO_COL,
|
|
||||||
"Rooli / titteli": ROOLI_COL,
|
|
||||||
"Kuukausipalkka (brutto, euroina)": KKPALKKA_COL,
|
|
||||||
"Vuositulot (sis. bonukset, osingot yms, euroina)": VUOSITULOT_COL,
|
|
||||||
"Vapaa kuvaus kokonaiskompensaatiomallista": "Vapaa kuvaus kokonaiskompensaatiomallista",
|
|
||||||
"Onko palkkasi nykyroolissasi mielestäsi kilpailukykyinen?": KIKY_COL,
|
|
||||||
"Vapaa sana": "Vapaa sana",
|
|
||||||
"Palautetta kyselystä ja ideoita ensi vuoden kyselyyn": PALAUTE_COL,
|
|
||||||
}
|
|
||||||
|
|
||||||
COLUMN_MAP_2024_EN_TO_FI = {
|
COLUMN_MAP_2025 = {
|
||||||
"Timestamp": "Timestamp",
|
"Timestamp": "Timestamp",
|
||||||
"Employee or entrepreneur": "Oletko palkansaaja vai laskuttaja?",
|
"Employee or entrepreneur": PALKANSAAJA_VAI_LASKUTTAJA_COL,
|
||||||
"Have you switched from employment to entrepreneurship or vice versa after 1.10.2023?": "Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2023 jälkeen?",
|
"Switched from employment to entrepreneurship, or vice versa, in 2025?": SIIRTYNYT_COL,
|
||||||
"Age": "Ikä",
|
"Age": IKA_COL,
|
||||||
"Gender": "Sukupuoli",
|
"Gender": SUKUPUOLI_COL,
|
||||||
"Relevant work experience from the industry (in years)": "Työkokemus alalta (vuosina)",
|
"Finnish fluency": "Suomen kielen taito",
|
||||||
|
"Work language": "Työkieli",
|
||||||
|
"Relevant work experience from the industry (in years)": TYOKOKEMUS_COL,
|
||||||
|
"Years at current employer": "Vuosia nykyisellä työnantajalla",
|
||||||
|
"Companies worked for": "Työpaikkojen lukumäärä",
|
||||||
|
"Company size": "Yrityksen koko",
|
||||||
"Education": "Koulutustaustasi",
|
"Education": "Koulutustaustasi",
|
||||||
"Change in income from last year (in %)": "Tulojen muutos viime vuodesta (%)",
|
"Field of Study": "Opintoala",
|
||||||
"How many years have you worked as an entrepreneur in this industry?": "Montako vuotta olet tehnyt laskuttavaa työtä alalla?",
|
"Change in pay rate from last year (%)": "Tulojen muutos viime vuodesta (%)",
|
||||||
"What services do you offer?": "Mitä palveluja tarjoat?",
|
"Years as entrepreneur": "Montako vuotta olet tehnyt laskuttavaa työtä alalla?",
|
||||||
"Hourly rate (VAT 0%, in euros)": "Tuntilaskutus (ALV 0%, euroina)",
|
"What services do you offer?": PALVELUT_COL,
|
||||||
"Yearly billing (VAT 0%, in euros)": "Vuosilaskutus (ALV 0%, euroina)",
|
"Hourly rate (VAT 0%, in euros)": TUNTILASKUTUS_ALV0_COL,
|
||||||
|
"Yearly billing (VAT 0%, in euros)": VUOSILASKUTUS_ALV0_COL,
|
||||||
|
"Billable hours per week": "Laskutettavat tunnit viikossa",
|
||||||
|
"Weeks not billing": "Viikot ilman laskutusta",
|
||||||
|
"Billing methods": "Laskutustavat",
|
||||||
|
"Contract length": "Sopimuksen pituus",
|
||||||
"Do you use agencies or find your clients yourself?": "Hankitko asiakkaasi itse suoraan vai käytätkö välitysfirmojen palveluita?",
|
"Do you use agencies or find your clients yourself?": "Hankitko asiakkaasi itse suoraan vai käytätkö välitysfirmojen palveluita?",
|
||||||
"Where are your clients from?": "Mistä asiakkaat ovat?",
|
"Where are your clients from?": MISTA_ASIAKKAAT_COL,
|
||||||
"Company": "Työpaikka",
|
"Company": TYOPAIKKA_COL,
|
||||||
"In which city is your office?": "Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee?",
|
"City": KAUPUNKI_COL,
|
||||||
"What kind of a company you work in?": "Millaisessa yrityksessä työskentelet?",
|
"What kind of a company you work in?": MILLAISESSA_COL,
|
||||||
"Full time / part time": "Työaika",
|
"Working time (h/week)": TYOAIKA_COL,
|
||||||
"How much of your work time you spend in company office? (in %)": "Kuinka suuren osan ajasta teet lähityönä toimistolla?",
|
"Time in office (%)": LAHITYO_COL,
|
||||||
"Role / title": "Rooli / titteli",
|
"Role / title": ROOLI_COL,
|
||||||
"Monthly salary (gross, in EUR)": "Kuukausipalkka (brutto, euroina)",
|
"Seniority level": SENIORITY_COL,
|
||||||
"Yearly income (incl. bonuses, etc; in EUR)": "Vuositulot (sis. bonukset, osingot yms, euroina)",
|
"Formal Seniority": "Virallinen senioriteetti",
|
||||||
|
"Base salary (gross, monthly EUR)": KKPALKKA_COL,
|
||||||
|
"Commission (gross, monthly EUR)": COMMISSION_COL,
|
||||||
|
"Lomaraha (Holiday bonus, in EUR)": LOMARAHA_COL,
|
||||||
|
"Bonus (EUR)": BONUS_COL,
|
||||||
|
"Equity (EUR)": EQUITY_COL,
|
||||||
"Free description of your compensation model": "Vapaa kuvaus kokonaiskompensaatiomallista",
|
"Free description of your compensation model": "Vapaa kuvaus kokonaiskompensaatiomallista",
|
||||||
"Is your salary competitive?": "Onko palkkasi nykyroolissasi mielestäsi kilpailukykyinen?",
|
"Competitive salary": KIKY_COL,
|
||||||
|
"Bonus": "Bonukset (kuvaus)",
|
||||||
|
"Non-fringe benefits": "Edut (ei luontoisedut)",
|
||||||
|
"Yearly Tax-Free Benefits (EUR)": "Vuosittaiset verovapaat edut (EUR)",
|
||||||
|
"Fringe benefits (luontoisedut)": "Luontoisedut",
|
||||||
|
"Operating system": "Käyttöjärjestelmä",
|
||||||
|
"Language": "Ohjelmointikieli",
|
||||||
|
"Web Frameworks": "Web-kehykset",
|
||||||
|
"Data Engineering & Machine Learning": "Data & ML",
|
||||||
|
"DevOps & Cloud Platforms": "DevOps & pilvi",
|
||||||
|
"Databases": "Tietokannat",
|
||||||
"What was left unasked that you want to answer to?": "Vapaa sana",
|
"What was left unasked that you want to answer to?": "Vapaa sana",
|
||||||
"Feedback of the survey": "Palautetta kyselystä ja ideoita ensi vuoden kyselyyn",
|
"Feedback of the survey": PALAUTE_COL,
|
||||||
}
|
}
|
||||||
|
|
||||||
# ensure all columns have translations
|
VALUE_MAP_2025 = {
|
||||||
assert set(COLUMN_MAP_2024.keys()) == set(COLUMN_MAP_2024_EN_TO_FI.values())
|
|
||||||
|
|
||||||
VALUE_MAP_2024_EN_TO_FI = {
|
|
||||||
PALKANSAAJA_VAI_LASKUTTAJA_COL: {
|
PALKANSAAJA_VAI_LASKUTTAJA_COL: {
|
||||||
"Employee": "Palkansaaja",
|
"Employee": "Palkansaaja",
|
||||||
"Entrepreneur": "Laskuttaja",
|
"Entrepreneur": "Laskuttaja",
|
||||||
@@ -122,9 +116,36 @@ VALUE_MAP_2024_EN_TO_FI = {
|
|||||||
"Tampere (etänä Berliiniin)": "Tampere",
|
"Tampere (etänä Berliiniin)": "Tampere",
|
||||||
"Turku/remote (HQ Austin, TX)": "Turku",
|
"Turku/remote (HQ Austin, TX)": "Turku",
|
||||||
"Ulkomailla": "Ulkomaat",
|
"Ulkomailla": "Ulkomaat",
|
||||||
|
"Remote": "Etätyö",
|
||||||
|
"remote": "Etätyö",
|
||||||
|
"Fully remote": "Etätyö",
|
||||||
|
"Fully remote work": "Etätyö",
|
||||||
|
"100% remote, no main office": "Etätyö",
|
||||||
|
"Completely distributed and remote": "Etätyö",
|
||||||
|
"Remote without HQ": "Etätyö",
|
||||||
|
"Remote (US)": "Ulkomaat",
|
||||||
|
"Outside Finland": "Ulkomaat",
|
||||||
|
"Abroad": "Ulkomaat",
|
||||||
|
"No centrla office, multiple locations with employees": "Etätyö",
|
||||||
},
|
},
|
||||||
MILLAISESSA_COL: {
|
MILLAISESSA_COL: {
|
||||||
"Product company with softaware as their core business": "Tuotetalossa, jonka core-bisnes on softa",
|
"Product company with softaware as their core business": "Tuotetalossa, jonka core-bisnes on softa",
|
||||||
|
"Product company with software as their core business": "Tuotetalossa, jonka core-bisnes on softa",
|
||||||
|
"A company where software is a support role (for example banks or healthcare)": "Yritys, jossa softa tukirooli",
|
||||||
|
"Consulting": "Konsultointi",
|
||||||
|
"Public or third sector": "Julkinen/kolmas sektori",
|
||||||
|
},
|
||||||
|
SUKUPUOLI_COL: {
|
||||||
|
"Male": "mies",
|
||||||
|
"Female": "nainen",
|
||||||
|
"Non-binary": "muu",
|
||||||
|
"Prefer not to say": None,
|
||||||
|
},
|
||||||
|
KIKY_COL: {
|
||||||
|
"Above market": "Yli markkinatason",
|
||||||
|
"Average market": "Markkinataso",
|
||||||
|
"Below market": "Alle markkinatason",
|
||||||
|
"Not sure": "En osaa sanoa",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -211,9 +232,7 @@ MALE_GENDER_VALUES = (
|
|||||||
"äiä",
|
"äiä",
|
||||||
)
|
)
|
||||||
|
|
||||||
IDS_TO_DROP = {
|
IDS_TO_DROP_2025 = {
|
||||||
"0bf579f8b0a771b9", # 2 euron palkka, rooli "2"
|
"18121abbdb13303c", # duplicate of d5ac88f64a922e6c (submitted 3 min later)
|
||||||
"9a3b73d810f6e983", # apache hyökkäyshelikopteri
|
|
||||||
}
|
}
|
||||||
FI_EXPECTED_ROW_COUNT = 682
|
EXPECTED_ROW_COUNT_2025 = 683
|
||||||
EN_EXPECTED_ROW_COUNT = 51
|
|
||||||
|
|||||||
@@ -5,27 +5,25 @@ import re
|
|||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from pulkka.column_maps import (
|
from pulkka.column_maps import (
|
||||||
BOOLEAN_TEXT_TO_BOOLEAN_MAP,
|
BONUS_COL,
|
||||||
COLUMN_MAP_2024,
|
COLUMN_MAP_2025,
|
||||||
COLUMN_MAP_2024_EN_TO_FI,
|
COMMISSION_COL,
|
||||||
COMPANY_MAP,
|
COMPANY_MAP,
|
||||||
EN_EXPECTED_ROW_COUNT,
|
EQUITY_COL,
|
||||||
|
EXPECTED_ROW_COUNT_2025,
|
||||||
FEMALE_GENDER_VALUES,
|
FEMALE_GENDER_VALUES,
|
||||||
FI_EXPECTED_ROW_COUNT,
|
|
||||||
ID_COL,
|
ID_COL,
|
||||||
IDS_TO_DROP,
|
IDS_TO_DROP_2025,
|
||||||
IKA_COL,
|
IKA_COL,
|
||||||
KIKY_COL,
|
KIKY_COL,
|
||||||
KIKY_OTHER_COL,
|
|
||||||
KK_TULOT_COL,
|
KK_TULOT_COL,
|
||||||
KK_TULOT_NORM_COL,
|
KK_TULOT_NORM_COL,
|
||||||
KKPALKKA_COL,
|
KKPALKKA_COL,
|
||||||
LAHITYO_COL,
|
LAHITYO_COL,
|
||||||
LANG_COL,
|
LOMARAHA_COL,
|
||||||
MALE_GENDER_VALUES,
|
MALE_GENDER_VALUES,
|
||||||
NO_GENDER_VALUES,
|
NO_GENDER_VALUES,
|
||||||
OTHER_GENDER_VALUES,
|
OTHER_GENDER_VALUES,
|
||||||
@@ -34,16 +32,18 @@ from pulkka.column_maps import (
|
|||||||
ROOLI_COL,
|
ROOLI_COL,
|
||||||
ROOLI_NORM_COL,
|
ROOLI_NORM_COL,
|
||||||
SUKUPUOLI_COL,
|
SUKUPUOLI_COL,
|
||||||
|
TUNTILASKUTUS_ALV0_COL,
|
||||||
TYOAIKA_COL,
|
TYOAIKA_COL,
|
||||||
TYOKOKEMUS_COL,
|
TYOKOKEMUS_COL,
|
||||||
TYOPAIKKA_COL,
|
TYOPAIKKA_COL,
|
||||||
VALUE_MAP_2024_EN_TO_FI,
|
VALUE_MAP_2025,
|
||||||
|
VUOSILASKUTUS_ALV0_COL,
|
||||||
VUOSITULOT_COL,
|
VUOSITULOT_COL,
|
||||||
)
|
)
|
||||||
from pulkka.config import DATA_DIR, YEAR
|
from pulkka.config import DATA_DIR, YEAR
|
||||||
|
|
||||||
|
|
||||||
def map_sukupuoli(r: pd.Series) -> str | None:
|
def map_sukupuoli(r: pd.Series) -> str | None: # Unused in 2025
|
||||||
value = r[SUKUPUOLI_COL]
|
value = r[SUKUPUOLI_COL]
|
||||||
if not isinstance(value, str):
|
if not isinstance(value, str):
|
||||||
return value
|
return value
|
||||||
@@ -84,44 +84,29 @@ def map_numberlike(d):
|
|||||||
return d
|
return d
|
||||||
|
|
||||||
|
|
||||||
def ucfirst(val):
    """
    Return *val* with its first character upper-cased.

    Only the first character is touched; the rest of the string keeps its
    original casing. Non-string values and the empty string are returned
    unchanged.
    """
    # The truthiness check guards against "" -- indexing val[0] on an empty
    # string would raise IndexError.
    if isinstance(val, str) and val:
        return val[0].upper() + val[1:]
    return val
|
||||||
|
|
||||||
|
|
||||||
def hash_row(r: pd.Series) -> str:
    """
    Build a 16-hex-character identifier for a response row.

    The identifier is the truncated SHA-256 of the literal prefix ``en.``
    followed by the row's ``Timestamp`` expressed in whole milliseconds.
    """
    epoch_ms = int(r.Timestamp.timestamp() * 1000)
    # NB (2025): the language prefix is hard-coded to `en`!
    digest = hashlib.sha256(f"en.{epoch_ms}".encode())
    return digest.hexdigest()[:16]
|
||||||
|
|
||||||
|
|
||||||
def read_initial_dfs() -> pd.DataFrame:
|
def read_initial_dfs() -> pd.DataFrame:
|
||||||
df_fi: pd.DataFrame = pd.read_excel(
|
df: pd.DataFrame = pd.read_excel(DATA_DIR / "data.xlsx")
|
||||||
DATA_DIR / "results-fi.xlsx",
|
df.columns = df.columns.str.strip()
|
||||||
skiprows=[1], # Google Sheets exports one empty row
|
|
||||||
)
|
|
||||||
df_fi[LANG_COL] = "fi"
|
|
||||||
|
|
||||||
if len(df_fi) < FI_EXPECTED_ROW_COUNT:
|
if len(df) < EXPECTED_ROW_COUNT_2025:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Expected at least {FI_EXPECTED_ROW_COUNT} rows in the Finnish data, got {len(df_fi)}",
|
f"Expected at least {EXPECTED_ROW_COUNT_2025} rows, got {len(df)}",
|
||||||
)
|
)
|
||||||
|
|
||||||
df_en: pd.DataFrame = pd.read_excel(
|
|
||||||
DATA_DIR / "results-en.xlsx",
|
|
||||||
skiprows=[1], # Google Sheets exports one empty row
|
|
||||||
)
|
|
||||||
df_en[LANG_COL] = "en"
|
|
||||||
|
|
||||||
if len(df_fi) < EN_EXPECTED_ROW_COUNT:
|
|
||||||
raise ValueError(
|
|
||||||
f"Expected at least {EN_EXPECTED_ROW_COUNT} rows in the English data, got {len(df_en)}",
|
|
||||||
)
|
|
||||||
|
|
||||||
df_en = df_en.rename(columns=COLUMN_MAP_2024_EN_TO_FI)
|
|
||||||
df = pd.concat([df_fi, df_en], ignore_index=True)
|
|
||||||
df = df[df["Timestamp"].notna()] # Remove rows with no timestamp
|
df = df[df["Timestamp"].notna()] # Remove rows with no timestamp
|
||||||
df[LANG_COL] = df[LANG_COL].astype("category")
|
|
||||||
# Give each row a unique hash ID
|
# Give each row a unique hash ID
|
||||||
df[ID_COL] = df.apply(hash_row, axis=1)
|
df[ID_COL] = df.apply(hash_row, axis=1)
|
||||||
# Ensure truncated sha is unique
|
# Ensure truncated sha is unique
|
||||||
@@ -146,38 +131,75 @@ def map_case_insensitive(series: pd.Series, mapping: dict[str, str]) -> pd.Serie
|
|||||||
|
|
||||||
|
|
||||||
def read_data() -> pd.DataFrame:
|
def read_data() -> pd.DataFrame:
|
||||||
if YEAR != "2024":
|
if YEAR != "2025":
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"This code only works for 2024. "
|
"This code only works for 2025. "
|
||||||
"Please use an older revision for older data.",
|
"Please use an older revision for older data.",
|
||||||
)
|
)
|
||||||
df = read_initial_dfs()
|
df = read_initial_dfs()
|
||||||
|
|
||||||
df = df.rename(columns=COLUMN_MAP_2024)
|
df = df.rename(columns=COLUMN_MAP_2025)
|
||||||
|
|
||||||
for col, val_map in VALUE_MAP_2024_EN_TO_FI.items():
|
for col, val_map in VALUE_MAP_2025.items():
|
||||||
df[col] = df[col].map(val_map).fillna(df[col]).astype("category")
|
df[col] = df[col].map(val_map).fillna(df[col]).astype("category")
|
||||||
|
|
||||||
# Drop known bogus data
|
# Drop known bogus data
|
||||||
df = df.drop(df[df[ID_COL].isin(IDS_TO_DROP)].index)
|
df = df.drop(df[df[ID_COL].isin(IDS_TO_DROP_2025)].index)
|
||||||
|
|
||||||
df[SUKUPUOLI_COL] = df.apply(map_sukupuoli, axis=1).astype("category")
|
# Drop duplicate submissions: rows identical on all columns except
|
||||||
|
# Timestamp and ID (keep the earliest submission)
|
||||||
|
content_cols = [c for c in df.columns if c not in ("Timestamp", ID_COL)]
|
||||||
|
before = len(df)
|
||||||
|
df = df.sort_values("Timestamp").drop_duplicates(subset=content_cols, keep="first")
|
||||||
|
n_dupes = before - len(df)
|
||||||
|
if n_dupes:
|
||||||
|
warnings.warn(f"Dropped {n_dupes} duplicate submission(s)")
|
||||||
|
|
||||||
|
# Gender is already mapped via VALUE_MAP_2025
|
||||||
|
df[SUKUPUOLI_COL] = df[SUKUPUOLI_COL].astype("category")
|
||||||
df[IKA_COL] = df[IKA_COL].astype("category")
|
df[IKA_COL] = df[IKA_COL].astype("category")
|
||||||
|
df[KIKY_COL] = df[KIKY_COL].astype("category")
|
||||||
|
|
||||||
# Assume that people entering 37.5 (hours) as their työaika means 100%
|
# Working time is in h/week — normalize to fraction of 37.5h
|
||||||
df.loc[df[TYOAIKA_COL] == 37.5, TYOAIKA_COL] = 100
|
df[TYOAIKA_COL] = to_percentage(df[TYOAIKA_COL], 37.5)
|
||||||
# Assume there is no actual 10x koodari among us
|
# Time in office is already a percentage
|
||||||
df.loc[df[TYOAIKA_COL] == 1000, TYOAIKA_COL] = 100
|
|
||||||
|
|
||||||
df[TYOAIKA_COL] = to_percentage(df[TYOAIKA_COL], 100)
|
|
||||||
df[LAHITYO_COL] = to_percentage(df[LAHITYO_COL], 100)
|
df[LAHITYO_COL] = to_percentage(df[LAHITYO_COL], 100)
|
||||||
|
|
||||||
# Split out non-boolean answers from KIKY_COL to KIKY_OTHER_COL
|
|
||||||
df = split_boolean_column_to_other(df, KIKY_COL, KIKY_OTHER_COL)
|
|
||||||
|
|
||||||
# Try to clean up numbers with spaces, etc. to real numbers
|
# Try to clean up numbers with spaces, etc. to real numbers
|
||||||
df[KKPALKKA_COL] = df[KKPALKKA_COL].apply(map_numberlike)
|
df[KKPALKKA_COL] = df[KKPALKKA_COL].apply(map_numberlike)
|
||||||
df[VUOSITULOT_COL] = df[VUOSITULOT_COL].apply(map_numberlike)
|
df[TUNTILASKUTUS_ALV0_COL] = pd.to_numeric(
|
||||||
|
df[TUNTILASKUTUS_ALV0_COL].apply(map_numberlike),
|
||||||
|
errors="coerce",
|
||||||
|
)
|
||||||
|
df[VUOSILASKUTUS_ALV0_COL] = pd.to_numeric(
|
||||||
|
df[VUOSILASKUTUS_ALV0_COL].apply(map_numberlike),
|
||||||
|
errors="coerce",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Synthesize Vuositulot from components:
|
||||||
|
# (base_salary + commission) * 12 + lomaraha + bonus + equity
|
||||||
|
for comp_col in [COMMISSION_COL, LOMARAHA_COL, BONUS_COL, EQUITY_COL]:
|
||||||
|
df[comp_col] = pd.to_numeric(
|
||||||
|
df[comp_col].apply(map_numberlike),
|
||||||
|
errors="coerce",
|
||||||
|
).fillna(0)
|
||||||
|
|
||||||
|
# Fold commission into monthly salary so KKPALKKA = base + commission
|
||||||
|
df[KKPALKKA_COL] = (
|
||||||
|
pd.to_numeric(df[KKPALKKA_COL], errors="coerce").fillna(0) + df[COMMISSION_COL]
|
||||||
|
)
|
||||||
|
|
||||||
|
base_yearly = df[KKPALKKA_COL] * 12
|
||||||
|
lomaraha = df.get(LOMARAHA_COL, 0)
|
||||||
|
bonus = df.get(BONUS_COL, 0)
|
||||||
|
equity = df.get(EQUITY_COL, 0)
|
||||||
|
|
||||||
|
df[VUOSITULOT_COL] = base_yearly + lomaraha + bonus + equity
|
||||||
|
# If base salary is missing/zero, vuositulot should be NaN
|
||||||
|
df.loc[
|
||||||
|
pd.to_numeric(df[KKPALKKA_COL], errors="coerce").fillna(0) == 0,
|
||||||
|
VUOSITULOT_COL,
|
||||||
|
] = np.nan
|
||||||
|
|
||||||
# Fix up Työpaikka
|
# Fix up Työpaikka
|
||||||
df[TYOPAIKKA_COL] = df[TYOPAIKKA_COL].replace("-", np.nan)
|
df[TYOPAIKKA_COL] = df[TYOPAIKKA_COL].replace("-", np.nan)
|
||||||
@@ -214,7 +236,7 @@ def read_data() -> pd.DataFrame:
|
|||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
def to_percentage(ser: pandas.Series, norm_max: float) -> pandas.Series:
|
def to_percentage(ser: pd.Series, norm_max: float) -> pd.Series:
|
||||||
"""
|
"""
|
||||||
Convert a series of numbers to a percentage
|
Convert a series of numbers to a percentage
|
||||||
"""
|
"""
|
||||||
@@ -227,54 +249,16 @@ def to_percentage(ser: pandas.Series, norm_max: float) -> pandas.Series:
|
|||||||
return ser.clip(lower=0)
|
return ser.clip(lower=0)
|
||||||
|
|
||||||
|
|
||||||
def split_boolean_column_to_other(df, col, other_col):
|
def force_age_numeric(df: pd.DataFrame) -> pd.DataFrame:
|
||||||
df[col] = df[col].replace(BOOLEAN_TEXT_TO_BOOLEAN_MAP)
|
|
||||||
df[other_col] = df[col].apply(
|
|
||||||
lambda r: r if (r and not isinstance(r, bool)) else None,
|
|
||||||
)
|
|
||||||
df[col] = (
|
|
||||||
df[col]
|
|
||||||
.apply(
|
|
||||||
lambda value: (
|
|
||||||
["Ei", "Kyllä"][value]
|
|
||||||
if isinstance(value, bool)
|
|
||||||
else (np.nan if not value else "Muu")
|
|
||||||
),
|
|
||||||
)
|
|
||||||
.astype("category")
|
|
||||||
)
|
|
||||||
# reorder columns so that other_col is right after col
|
|
||||||
cols = list(df.columns)
|
|
||||||
cols.remove(other_col)
|
|
||||||
cols.insert(cols.index(col) + 1, other_col)
|
|
||||||
df = df[cols]
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def force_age_numeric(df):
|
|
||||||
age_map = {}
|
age_map = {}
|
||||||
for cat in df[IKA_COL].cat.categories:
|
for cat in df[IKA_COL].cat.categories:
|
||||||
m = re.match(r"^(\d+)-(\d+) v", cat)
|
m = re.match(r"^(\d+)-(\d+)( v)?", cat)
|
||||||
if m:
|
if m:
|
||||||
age_map[cat] = int(round(float(m.group(1)) + float(m.group(2))) / 2)
|
age_map[cat] = int(round(float(m.group(1)) + float(m.group(2))) / 2)
|
||||||
df[IKA_COL] = df[IKA_COL].apply(lambda r: age_map.get(r, r))
|
df[IKA_COL] = df[IKA_COL].apply(lambda r: age_map.get(r, r))
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
pd.set_option("display.max_column", None)
|
|
||||||
pd.set_option("display.max_rows", None)
|
|
||||||
pd.set_option("display.max_seq_items", None)
|
|
||||||
pd.set_option("display.max_colwidth", 500)
|
|
||||||
pd.set_option("expand_frame_repr", True)
|
|
||||||
df = read_data()
|
|
||||||
print(df.head())
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
|
|
||||||
|
|
||||||
def apply_fixups(df: pd.DataFrame, fixups: list[tuple[dict, dict]]) -> pd.DataFrame:
|
def apply_fixups(df: pd.DataFrame, fixups: list[tuple[dict, dict]]) -> pd.DataFrame:
|
||||||
for match_cond, replace_cond in fixups:
|
for match_cond, replace_cond in fixups:
|
||||||
match_keys, match_values = zip(*match_cond.items())
|
match_keys, match_values = zip(*match_cond.items())
|
||||||
@@ -286,3 +270,17 @@ def apply_fixups(df: pd.DataFrame, fixups: list[tuple[dict, dict]]) -> pd.DataFr
|
|||||||
replace_keys, replace_values = zip(*replace_cond.items())
|
replace_keys, replace_values = zip(*replace_cond.items())
|
||||||
df.loc[ix, list(replace_keys)] = replace_values
|
df.loc[ix, list(replace_keys)] = replace_values
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Load the survey data and print its first rows with pandas display limits lifted."""
    display_options = {
        "display.max_column": None,
        "display.max_rows": None,
        "display.max_seq_items": None,
        "display.max_colwidth": 500,
        "expand_frame_repr": True,
    }
    # Applied in the listed order, one option at a time
    for option, value in display_options.items():
        pd.set_option(option, value)
    print(read_data().head())
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|||||||
@@ -43,3 +43,28 @@ def rename_na(df: pd.DataFrame, col: str, na_name: str) -> None:
|
|||||||
df[col] = df[col].astype("string")
|
df[col] = df[col].astype("string")
|
||||||
df.loc[df[col].isna(), col] = na_name
|
df.loc[df[col].isna(), col] = na_name
|
||||||
df[col] = df[col].astype("category")
|
df[col] = df[col].astype("category")
|
||||||
|
|
||||||
|
|
||||||
|
def explode_multiselect(
|
||||||
|
series: pd.Series,
|
||||||
|
*,
|
||||||
|
sep: str = ", ",
|
||||||
|
top_n: int | None = None,
|
||||||
|
) -> pd.Series:
|
||||||
|
"""
|
||||||
|
Explode a comma-separated multiselect column into value counts.
|
||||||
|
|
||||||
|
Returns a Series of counts indexed by individual values,
|
||||||
|
sorted descending. Optionally limited to top_n entries.
|
||||||
|
"""
|
||||||
|
counts = (
|
||||||
|
series.dropna()
|
||||||
|
.str.split(sep)
|
||||||
|
.explode()
|
||||||
|
.str.strip()
|
||||||
|
.loc[lambda s: s != ""]
|
||||||
|
.value_counts()
|
||||||
|
)
|
||||||
|
if top_n is not None:
|
||||||
|
counts = counts.head(top_n)
|
||||||
|
return counts
|
||||||
|
|||||||
@@ -1,8 +1,11 @@
|
|||||||
{% extends "_base.html" %}
|
{% extends "_base.html" %}
|
||||||
|
{% macro eur_span(number) -%}
|
||||||
|
<span title="{{ number }}" class="eur" data-number="{{ number }}">{{ number }} €</span>
|
||||||
|
{%- endmacro %}
|
||||||
{% macro tunnusluvut_points(df, col_name, title) %}
|
{% macro tunnusluvut_points(df, col_name, title) %}
|
||||||
{% with num_kk = df[pd.to_numeric(df[col_name], errors='coerce').notnull()][col_name] %}
|
{% with num_kk = df[col_name][pd.to_numeric(df[col_name], errors='coerce') > 0].dropna() %}
|
||||||
<li title="n = {{ num_kk.count() }}">{{ title }}, keskiarvo = {{ num_kk.mean()|round(0) }} €</li>
|
<li title="n = {{ num_kk.count() }}">{{ title }}, keskiarvo = {{ eur_span(num_kk.mean()|round(0)) }}</li>
|
||||||
<li title="n = {{ num_kk.count() }}">{{ title }}, mediaani = {{ num_kk.median()|round(0) }} €</li>
|
<li title="n = {{ num_kk.count() }}">{{ title }}, mediaani = {{ eur_span(num_kk.median()|round(0)) }}</li>
|
||||||
{% endwith %}
|
{% endwith %}
|
||||||
{% endmacro %}
|
{% endmacro %}
|
||||||
{% block body %}
|
{% block body %}
|
||||||
@@ -74,6 +77,15 @@
|
|||||||
International</a> (CC BY 4.0).<br>
|
International</a> (CC BY 4.0).<br>
|
||||||
Mankelointityökalujen lisenssi on <a href="https://opensource.org/licenses/MIT">MIT</a>.
|
Mankelointityökalujen lisenssi on <a href="https://opensource.org/licenses/MIT">MIT</a>.
|
||||||
</p>
|
</p>
|
||||||
|
<script>
|
||||||
|
// Re-render every .eur element using the browser's fi-FI currency formatting.
document.querySelectorAll(".eur").forEach((span) => {
  const amount = Number.parseFloat(span.dataset.number);
  // Keep the server-rendered text when data-number does not parse as a number
  if (Number.isNaN(amount)) {
    return;
  }
  span.textContent = amount.toLocaleString("fi-FI", {style: "currency", currency: "EUR"});
});
|
||||||
|
</script>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
{% block footer %}
|
{% block footer %}
|
||||||
<footer>
|
<footer>
|
||||||
|
|||||||
Reference in New Issue
Block a user