mirror of
https://github.com/koodiklinikka/palkkakysely.git
synced 2026-01-26 03:14:03 +00:00
Update for 2024
This commit is contained in:
6
Makefile
6
Makefile
@@ -1,8 +1,8 @@
|
||||
YEAR := 2023
|
||||
YEAR := 2024
|
||||
DATA_DIR := data/${YEAR}
|
||||
OUT_DIR := out/${YEAR}
|
||||
DOCUMENT_ID_FI := 1sycmd6DGqHj9-0k6D8HclzlRghxqoVaBZNSZye1Jdbg
|
||||
DOCUMENT_ID_EN := 1pmrQWsja3wRVF02PyEGO2F_CgttobTbxGUGjQ5K4H4Y
|
||||
DOCUMENT_ID_FI := 1dvyVEJkn3_osBeKGIlhKmid671jjH7zYgcyH1BjiGF8
|
||||
DOCUMENT_ID_EN := 1o1uakk1pkoUCtx2OGJhLclxt_uraYA-uK3DH8yCYHN4
|
||||
XLSX_URL_FI := https://docs.google.com/spreadsheets/d/$(DOCUMENT_ID_FI)/export?format=xlsx
|
||||
TSV_URL_FI := https://docs.google.com/spreadsheets/d/$(DOCUMENT_ID_FI)/export?format=tsv
|
||||
XLSX_URL_EN := https://docs.google.com/spreadsheets/d/$(DOCUMENT_ID_EN)/export?format=xlsx
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
<li><a href="2021/">2021</a></li>
|
||||
<li><a href="2022/">2022</a></li>
|
||||
<li><a href="2023/">2023</a></li>
|
||||
<li><a href="2024/">2024</a></li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@@ -19,7 +19,7 @@ PALVELUT_COL = "Palvelut"
|
||||
ROOLI_COL = "Rooli"
|
||||
ROOLI_NORM_COL = "Rooli (normalisoitu)"
|
||||
SIIRTYNYT_COL = (
|
||||
"Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2022 jälkeen?"
|
||||
"Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2023 jälkeen?"
|
||||
)
|
||||
SUKUPUOLI_COL = "Sukupuoli"
|
||||
TUNTILASKUTUS_ALV0_COL = "Tuntilaskutus (ALV 0%, euroina)"
|
||||
@@ -30,10 +30,10 @@ VUOSILASKUTUS_ALV0_COL = "Vuosilaskutus (ALV 0%, euroina)"
|
||||
VUOSITULOT_COL = "Vuositulot"
|
||||
ID_COL = "Vastaustunniste"
|
||||
|
||||
COLUMN_MAP_2023 = {
|
||||
COLUMN_MAP_2024 = {
|
||||
"Timestamp": "Timestamp",
|
||||
"Oletko palkansaaja vai laskuttaja?": PALKANSAAJA_VAI_LASKUTTAJA_COL,
|
||||
"Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2022 jälkeen?": SIIRTYNYT_COL,
|
||||
"Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2023 jälkeen?": SIIRTYNYT_COL,
|
||||
"Ikä": "Ikä",
|
||||
"Sukupuoli": "Sukupuoli",
|
||||
"Työkokemus alalta (vuosina)": TYOKOKEMUS_COL,
|
||||
@@ -59,10 +59,10 @@ COLUMN_MAP_2023 = {
|
||||
"Palautetta kyselystä ja ideoita ensi vuoden kyselyyn": PALAUTE_COL,
|
||||
}
|
||||
|
||||
COLUMN_MAP_2023_EN_TO_FI = {
|
||||
COLUMN_MAP_2024_EN_TO_FI = {
|
||||
"Timestamp": "Timestamp",
|
||||
"Employee or entrepreneur": "Oletko palkansaaja vai laskuttaja?",
|
||||
"Have you switched from employment to entrepreneurship or vice versa after 1.10.2022?": "Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2022 jälkeen?",
|
||||
"Have you switched from employment to entrepreneurship or vice versa after 1.10.2023?": "Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2023 jälkeen?",
|
||||
"Age": "Ikä",
|
||||
"Gender": "Sukupuoli",
|
||||
"Relevant work experience from the industry (in years)": "Työkokemus alalta (vuosina)",
|
||||
@@ -89,9 +89,9 @@ COLUMN_MAP_2023_EN_TO_FI = {
|
||||
}
|
||||
|
||||
# ensure all columns have translations
|
||||
assert set(COLUMN_MAP_2023.keys()) == set(COLUMN_MAP_2023_EN_TO_FI.values())
|
||||
assert set(COLUMN_MAP_2024.keys()) == set(COLUMN_MAP_2024_EN_TO_FI.values())
|
||||
|
||||
VALUE_MAP_2023_EN_TO_FI = {
|
||||
VALUE_MAP_2024_EN_TO_FI = {
|
||||
PALKANSAAJA_VAI_LASKUTTAJA_COL: {
|
||||
"Employee": "Palkansaaja",
|
||||
"Entrepreneur": "Laskuttaja",
|
||||
@@ -112,8 +112,16 @@ VALUE_MAP_2023_EN_TO_FI = {
|
||||
"Finland": "Suomesta",
|
||||
},
|
||||
KAUPUNKI_COL: {
|
||||
"PK-Seutu (Helsinki, Espoo, Vantaa)": "PK-seutu",
|
||||
"Asun Porissa, toimisto Helsingissä, sijainnilla ei vaikutusta palkkaan": "Pori",
|
||||
"Capital region (Helsinki, Espoo, Vantaa)": "PK-seutu",
|
||||
"Firmalla ei ole toimistoa": "Etätyöfirma",
|
||||
"Hajautettu": "Etätyöfirma",
|
||||
"New York City": "New York",
|
||||
"New York, NY, USA": "New York",
|
||||
"PK-Seutu (Helsinki, Espoo, Vantaa)": "PK-seutu",
|
||||
"Tampere (etänä Berliiniin)": "Tampere",
|
||||
"Turku/remote (HQ Austin, TX)": "Turku",
|
||||
"Ulkomailla": "Ulkomaat",
|
||||
},
|
||||
MILLAISESSA_COL: {
|
||||
"Product company with softaware as their core business": "Tuotetalossa, jonka core-bisnes on softa",
|
||||
@@ -166,23 +174,44 @@ ROLE_MAP = {
|
||||
"Full-stack cloud developer": FULL_STACK_ROLE,
|
||||
"Fullstack developer, web apps": FULL_STACK_ROLE,
|
||||
}
|
||||
|
||||
NO_GENDER_VALUES = {
|
||||
"-",
|
||||
"ei liity asiaan",
|
||||
"epärelevantti",
|
||||
"jänis",
|
||||
"kyllä, kiitos",
|
||||
"leppäkerttu",
|
||||
"tihkutympönen",
|
||||
"on",
|
||||
"yes",
|
||||
}
|
||||
|
||||
OTHER_GENDER_VALUES = {
|
||||
"muu",
|
||||
"muu/ei",
|
||||
"non-binary, afab",
|
||||
"muunsukupuolinen",
|
||||
}
|
||||
|
||||
FEMALE_GENDER_VALUES = (
|
||||
"f",
|
||||
"n",
|
||||
"women",
|
||||
)
|
||||
|
||||
MALE_GENDER_VALUES = (
|
||||
"he / him / male",
|
||||
"ihminen. kikkelillä.",
|
||||
"m i ä s",
|
||||
"m",
|
||||
"mail", # probably a typo
|
||||
"male presenting",
|
||||
"male",
|
||||
"man",
|
||||
"meis",
|
||||
"mie", # probably mies?
|
||||
"miekkonen",
|
||||
"mies",
|
||||
"miesoletettu",
|
||||
"miäs",
|
||||
"ukko",
|
||||
"äiä",
|
||||
)
|
||||
|
||||
IDS_TO_DROP = {
|
||||
"6cab61607da9c2b6", # hupsu taisteluhelikopteri
|
||||
"aefdb9e69b1621d5", # See "SUBMITTED TWICE, SORRY!!" in English data
|
||||
"0bf579f8b0a771b9", # 2 euron palkka, rooli "2"
|
||||
"9a3b73d810f6e983", # apache hyökkäyshelikopteri
|
||||
}
|
||||
|
||||
@@ -10,9 +10,10 @@ import pandas as pd
|
||||
|
||||
from pulkka.column_maps import (
|
||||
BOOLEAN_TEXT_TO_BOOLEAN_MAP,
|
||||
COLUMN_MAP_2023,
|
||||
COLUMN_MAP_2023_EN_TO_FI,
|
||||
COLUMN_MAP_2024,
|
||||
COLUMN_MAP_2024_EN_TO_FI,
|
||||
COMPANY_MAP,
|
||||
FEMALE_GENDER_VALUES,
|
||||
ID_COL,
|
||||
IDS_TO_DROP,
|
||||
IKA_COL,
|
||||
@@ -23,6 +24,7 @@ from pulkka.column_maps import (
|
||||
KKPALKKA_COL,
|
||||
LAHITYO_COL,
|
||||
LANG_COL,
|
||||
MALE_GENDER_VALUES,
|
||||
NO_GENDER_VALUES,
|
||||
OTHER_GENDER_VALUES,
|
||||
PALVELUT_COL,
|
||||
@@ -33,7 +35,7 @@ from pulkka.column_maps import (
|
||||
TYOAIKA_COL,
|
||||
TYOKOKEMUS_COL,
|
||||
TYOPAIKKA_COL,
|
||||
VALUE_MAP_2023_EN_TO_FI,
|
||||
VALUE_MAP_2024_EN_TO_FI,
|
||||
VUOSITULOT_COL,
|
||||
)
|
||||
from pulkka.config import DATA_DIR, YEAR
|
||||
@@ -49,19 +51,11 @@ def map_sukupuoli(r: pd.Series) -> str | None:
|
||||
"nainen" in value
|
||||
or "female" in value
|
||||
or "woman" in value
|
||||
or value == "f"
|
||||
or value == "women"
|
||||
or value in FEMALE_GENDER_VALUES
|
||||
):
|
||||
return "nainen"
|
||||
|
||||
if (
|
||||
"mies" in value
|
||||
or "uros" in value
|
||||
or "miäs" in value
|
||||
or "äiä" in value
|
||||
or "male" in value
|
||||
or value in ("m", "man", "m i ä s", "ukko")
|
||||
):
|
||||
if value.strip() in MALE_GENDER_VALUES:
|
||||
return "mies"
|
||||
|
||||
if value in NO_GENDER_VALUES:
|
||||
@@ -70,7 +64,7 @@ def map_sukupuoli(r: pd.Series) -> str | None:
|
||||
if value in OTHER_GENDER_VALUES:
|
||||
return "muu"
|
||||
|
||||
raise NotImplementedError(f"Unknown sukupuoli: {value} (row ID {r[ID_COL]})")
|
||||
raise NotImplementedError(f"Unknown sukupuoli: {value!r} (row ID {r[ID_COL]})")
|
||||
|
||||
|
||||
def map_vuositulot(r):
|
||||
@@ -110,7 +104,7 @@ def read_initial_dfs() -> pd.DataFrame:
|
||||
skiprows=[1], # Google Sheets exports one empty row
|
||||
)
|
||||
df_en[LANG_COL] = "en"
|
||||
df_en = df_en.rename(columns=COLUMN_MAP_2023_EN_TO_FI)
|
||||
df_en = df_en.rename(columns=COLUMN_MAP_2024_EN_TO_FI)
|
||||
df = pd.concat([df_fi, df_en], ignore_index=True)
|
||||
df = df[df["Timestamp"].notna()] # Remove rows with no timestamp
|
||||
df[LANG_COL] = df[LANG_COL].astype("category")
|
||||
@@ -130,23 +124,24 @@ def map_case_insensitive(series: pd.Series, mapping: dict[str, str]) -> pd.Serie
|
||||
def map_value(v):
|
||||
if v is np.nan:
|
||||
return ""
|
||||
assert isinstance(v, str)
|
||||
if not isinstance(v, str):
|
||||
raise TypeError(f"Unexpected value {v!r} of type {type(v)}")
|
||||
return lower_mapping.get(v.lower().strip(), v)
|
||||
|
||||
return series.apply(map_value).fillna(series)
|
||||
|
||||
|
||||
def read_data() -> pd.DataFrame:
|
||||
if YEAR != "2023":
|
||||
if YEAR != "2024":
|
||||
raise ValueError(
|
||||
"This code only works for 2023. "
|
||||
"This code only works for 2024. "
|
||||
"Please use an older revision for older data.",
|
||||
)
|
||||
df = read_initial_dfs()
|
||||
|
||||
df = df.rename(columns=COLUMN_MAP_2023)
|
||||
df = df.rename(columns=COLUMN_MAP_2024)
|
||||
|
||||
for col, val_map in VALUE_MAP_2023_EN_TO_FI.items():
|
||||
for col, val_map in VALUE_MAP_2024_EN_TO_FI.items():
|
||||
df[col] = df[col].map(val_map).fillna(df[col]).astype("category")
|
||||
|
||||
# Drop known bogus data
|
||||
@@ -192,14 +187,7 @@ def read_data() -> pd.DataFrame:
|
||||
df = apply_fixups(
|
||||
df,
|
||||
[
|
||||
(
|
||||
{ID_COL: "a01216a11026d749", VUOSITULOT_COL: 620000},
|
||||
{VUOSITULOT_COL: 62000},
|
||||
),
|
||||
(
|
||||
{ID_COL: "79a200f529f6919b", VUOSITULOT_COL: 1500},
|
||||
{VUOSITULOT_COL: 150_000},
|
||||
),
|
||||
# ({ID_COL: "..."}, {VUOSITULOT_COL: 62000}),
|
||||
],
|
||||
)
|
||||
# Fill in Vuositulot as 12.5 * Kk-tulot if empty
|
||||
|
||||
@@ -33,7 +33,7 @@ def get_categorical_stats(
|
||||
if na_as_category:
|
||||
rename_na(df, category_col, na_as_category)
|
||||
# ... then carry on.
|
||||
group = df[[category_col, value_col]].groupby(category_col)
|
||||
group = df[[category_col, value_col]].groupby(category_col, observed=False)
|
||||
return group[value_col].agg(
|
||||
["mean", "min", "max", "median", "count", q25, q50, q75, q90],
|
||||
)
|
||||
|
||||
@@ -79,7 +79,7 @@ def main():
|
||||
)
|
||||
env.globals.update(
|
||||
{
|
||||
"date": datetime.datetime.utcnow(),
|
||||
"date": datetime.datetime.now(datetime.UTC),
|
||||
"cm": column_maps,
|
||||
"pd": pandas,
|
||||
"np": numpy,
|
||||
|
||||
Reference in New Issue
Block a user