Merge pull request #18 from koodiklinikka/2024

2024 Edition
This commit is contained in:
Aarni Koskela
2024-10-28 11:54:13 +02:00
committed by GitHub
14 changed files with 175 additions and 148 deletions

View File

@@ -14,26 +14,27 @@ jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.11"
python-version: "3.12"
cache: pip
- uses: pre-commit/action@v3.0.0
- uses: pre-commit/action@v3.0.1
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.11
uses: actions/setup-python@v4
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: pip
- name: Install dependencies
run: python -m pip install -r requirements.txt
- name: Build
run: make -j3
- uses: actions/setup-node@v3
python-version: "3.12"
- uses: astral-sh/setup-uv@v3
with:
version: "0.4.x"
enable-cache: true
cache-dependency-glob: "**/requirements*.txt"
- run: uv pip install --system -r requirements.txt
- run: make -j3
- uses: actions/setup-node@v4
with:
node-version: "20"
cache: yarn

View File

@@ -3,16 +3,13 @@ ci:
autofix_prs: false
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.291
rev: v0.7.1
hooks:
- id: ruff
args:
- --fix
- repo: https://github.com/psf/black
rev: 23.9.1
hooks:
- id: black
- id: ruff-format
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v3.0.3
rev: v3.1.0
hooks:
- id: prettier

View File

@@ -1,8 +1,8 @@
YEAR := 2023
YEAR := 2024
DATA_DIR := data/${YEAR}
OUT_DIR := out/${YEAR}
DOCUMENT_ID_FI := 1sycmd6DGqHj9-0k6D8HclzlRghxqoVaBZNSZye1Jdbg
DOCUMENT_ID_EN := 1pmrQWsja3wRVF02PyEGO2F_CgttobTbxGUGjQ5K4H4Y
DOCUMENT_ID_FI := 1dvyVEJkn3_osBeKGIlhKmid671jjH7zYgcyH1BjiGF8
DOCUMENT_ID_EN := 1o1uakk1pkoUCtx2OGJhLclxt_uraYA-uK3DH8yCYHN4
XLSX_URL_FI := https://docs.google.com/spreadsheets/d/$(DOCUMENT_ID_FI)/export?format=xlsx
TSV_URL_FI := https://docs.google.com/spreadsheets/d/$(DOCUMENT_ID_FI)/export?format=tsv
XLSX_URL_EN := https://docs.google.com/spreadsheets/d/$(DOCUMENT_ID_EN)/export?format=xlsx

View File

@@ -14,6 +14,7 @@
<li><a href="2021/">2021</a></li>
<li><a href="2022/">2022</a></li>
<li><a href="2023/">2023</a></li>
<li><a href="2024/">2024</a></li>
</ul>
</body>
</html>

View File

@@ -1,4 +1,5 @@
from bokeh import models as bm, plotting as bp
from bokeh import models as bm
from bokeh import plotting as bp
from bokeh.transform import factor_cmap
from pandas import DataFrame

View File

@@ -19,7 +19,7 @@ PALVELUT_COL = "Palvelut"
ROOLI_COL = "Rooli"
ROOLI_NORM_COL = "Rooli (normalisoitu)"
SIIRTYNYT_COL = (
"Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2022 jälkeen?"
"Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2023 jälkeen?"
)
SUKUPUOLI_COL = "Sukupuoli"
TUNTILASKUTUS_ALV0_COL = "Tuntilaskutus (ALV 0%, euroina)"
@@ -30,10 +30,10 @@ VUOSILASKUTUS_ALV0_COL = "Vuosilaskutus (ALV 0%, euroina)"
VUOSITULOT_COL = "Vuositulot"
ID_COL = "Vastaustunniste"
COLUMN_MAP_2023 = {
COLUMN_MAP_2024 = {
"Timestamp": "Timestamp",
"Oletko palkansaaja vai laskuttaja?": PALKANSAAJA_VAI_LASKUTTAJA_COL,
"Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2022 jälkeen?": SIIRTYNYT_COL,
"Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2023 jälkeen?": SIIRTYNYT_COL,
"Ikä": "Ikä",
"Sukupuoli": "Sukupuoli",
"Työkokemus alalta (vuosina)": TYOKOKEMUS_COL,
@@ -59,10 +59,10 @@ COLUMN_MAP_2023 = {
"Palautetta kyselystä ja ideoita ensi vuoden kyselyyn": PALAUTE_COL,
}
COLUMN_MAP_2023_EN_TO_FI = {
COLUMN_MAP_2024_EN_TO_FI = {
"Timestamp": "Timestamp",
"Employee or entrepreneur": "Oletko palkansaaja vai laskuttaja?",
"Have you switched from employment to entrepreneurship or vice versa after 1.10.2022?": "Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2022 jälkeen?",
"Have you switched from employment to entrepreneurship or vice versa after 1.10.2023?": "Oletko siirtynyt palkansaajasta laskuttajaksi tai päinvastoin 1.10.2023 jälkeen?",
"Age": "Ikä",
"Gender": "Sukupuoli",
"Relevant work experience from the industry (in years)": "Työkokemus alalta (vuosina)",
@@ -89,9 +89,9 @@ COLUMN_MAP_2023_EN_TO_FI = {
}
# ensure all columns have translations
assert set(COLUMN_MAP_2023.keys()) == set(COLUMN_MAP_2023_EN_TO_FI.values())
assert set(COLUMN_MAP_2024.keys()) == set(COLUMN_MAP_2024_EN_TO_FI.values())
VALUE_MAP_2023_EN_TO_FI = {
VALUE_MAP_2024_EN_TO_FI = {
PALKANSAAJA_VAI_LASKUTTAJA_COL: {
"Employee": "Palkansaaja",
"Entrepreneur": "Laskuttaja",
@@ -112,8 +112,16 @@ VALUE_MAP_2023_EN_TO_FI = {
"Finland": "Suomesta",
},
KAUPUNKI_COL: {
"PK-Seutu (Helsinki, Espoo, Vantaa)": "PK-seutu",
"Asun Porissa, toimisto Helsingissä, sijainnilla ei vaikutusta palkkaan": "Pori",
"Capital region (Helsinki, Espoo, Vantaa)": "PK-seutu",
"Firmalla ei ole toimistoa": "Etätyöfirma",
"Hajautettu": "Etätyöfirma",
"New York City": "New York",
"New York, NY, USA": "New York",
"PK-Seutu (Helsinki, Espoo, Vantaa)": "PK-seutu",
"Tampere (etänä Berliiniin)": "Tampere",
"Turku/remote (HQ Austin, TX)": "Turku",
"Ulkomailla": "Ulkomaat",
},
MILLAISESSA_COL: {
"Product company with softaware as their core business": "Tuotetalossa, jonka core-bisnes on softa",
@@ -166,23 +174,44 @@ ROLE_MAP = {
"Full-stack cloud developer": FULL_STACK_ROLE,
"Fullstack developer, web apps": FULL_STACK_ROLE,
}
NO_GENDER_VALUES = {
"-",
"ei liity asiaan",
"epärelevantti",
"jänis",
"kyllä, kiitos",
"leppäkerttu",
"tihkutympönen",
"on",
"yes",
}
OTHER_GENDER_VALUES = {
"muu",
"muu/ei",
"non-binary, afab",
"muunsukupuolinen",
}
FEMALE_GENDER_VALUES = (
"f",
"n",
"women",
)
MALE_GENDER_VALUES = (
"he / him / male",
"ihminen. kikkelillä.",
"m i ä s",
"m",
"mail", # probably a typo
"male presenting",
"male",
"man",
"meis",
"mie", # probably mies?
"miekkonen",
"mies",
"miesoletettu",
"miäs",
"ukko",
"äiä",
)
IDS_TO_DROP = {
"6cab61607da9c2b6", # hupsu taisteluhelikopteri
"aefdb9e69b1621d5", # See "SUBMITTED TWICE, SORRY!!" in English data
"0bf579f8b0a771b9", # 2 euron palkka, rooli "2"
"9a3b73d810f6e983", # apache hyökkäyshelikopteri
}

View File

@@ -8,35 +8,37 @@ import numpy as np
import pandas
import pandas as pd
from pulkka.config import DATA_DIR, YEAR
from pulkka.column_maps import (
COLUMN_MAP_2023_EN_TO_FI,
KIKY_COL,
KKPALKKA_COL,
PALVELUT_COL,
TYOAIKA_COL,
VUOSITULOT_COL,
TYOPAIKKA_COL,
ROOLI_COL,
KIKY_OTHER_COL,
BOOLEAN_TEXT_TO_BOOLEAN_MAP,
COLUMN_MAP_2024,
COLUMN_MAP_2024_EN_TO_FI,
COMPANY_MAP,
SUKUPUOLI_COL,
ROLE_MAP,
COLUMN_MAP_2023,
VALUE_MAP_2023_EN_TO_FI,
LAHITYO_COL,
IKA_COL,
LANG_COL,
KK_TULOT_COL,
KK_TULOT_NORM_COL,
NO_GENDER_VALUES,
OTHER_GENDER_VALUES,
TYOKOKEMUS_COL,
ROOLI_NORM_COL,
FEMALE_GENDER_VALUES,
ID_COL,
IDS_TO_DROP,
IKA_COL,
KIKY_COL,
KIKY_OTHER_COL,
KK_TULOT_COL,
KK_TULOT_NORM_COL,
KKPALKKA_COL,
LAHITYO_COL,
LANG_COL,
MALE_GENDER_VALUES,
NO_GENDER_VALUES,
OTHER_GENDER_VALUES,
PALVELUT_COL,
ROLE_MAP,
ROOLI_COL,
ROOLI_NORM_COL,
SUKUPUOLI_COL,
TYOAIKA_COL,
TYOKOKEMUS_COL,
TYOPAIKKA_COL,
VALUE_MAP_2024_EN_TO_FI,
VUOSITULOT_COL,
)
from pulkka.config import DATA_DIR, YEAR
def map_sukupuoli(r: pd.Series) -> str | None:
@@ -49,19 +51,11 @@ def map_sukupuoli(r: pd.Series) -> str | None:
"nainen" in value
or "female" in value
or "woman" in value
or value == "f"
or value == "women"
or value in FEMALE_GENDER_VALUES
):
return "nainen"
if (
"mies" in value
or "uros" in value
or "miäs" in value
or "äiä" in value
or "male" in value
or value in ("m", "man", "m i ä s", "ukko")
):
if value.strip() in MALE_GENDER_VALUES:
return "mies"
if value in NO_GENDER_VALUES:
@@ -70,7 +64,7 @@ def map_sukupuoli(r: pd.Series) -> str | None:
if value in OTHER_GENDER_VALUES:
return "muu"
raise NotImplementedError(f"Unknown sukupuoli: {value} (row ID {r[ID_COL]})")
raise NotImplementedError(f"Unknown sukupuoli: {value!r} (row ID {r[ID_COL]})")
def map_vuositulot(r):
@@ -110,7 +104,7 @@ def read_initial_dfs() -> pd.DataFrame:
skiprows=[1], # Google Sheets exports one empty row
)
df_en[LANG_COL] = "en"
df_en = df_en.rename(columns=COLUMN_MAP_2023_EN_TO_FI)
df_en = df_en.rename(columns=COLUMN_MAP_2024_EN_TO_FI)
df = pd.concat([df_fi, df_en], ignore_index=True)
df = df[df["Timestamp"].notna()] # Remove rows with no timestamp
df[LANG_COL] = df[LANG_COL].astype("category")
@@ -130,23 +124,24 @@ def map_case_insensitive(series: pd.Series, mapping: dict[str, str]) -> pd.Serie
def map_value(v):
if v is np.nan:
return ""
assert isinstance(v, str)
if not isinstance(v, str):
raise TypeError(f"Unexpected value {v!r} of type {type(v)}")
return lower_mapping.get(v.lower().strip(), v)
return series.apply(map_value).fillna(series)
def read_data() -> pd.DataFrame:
if YEAR != "2023":
if YEAR != "2024":
raise ValueError(
"This code only works for 2023. "
"This code only works for 2024. "
"Please use an older revision for older data.",
)
df = read_initial_dfs()
df = df.rename(columns=COLUMN_MAP_2023)
df = df.rename(columns=COLUMN_MAP_2024)
for col, val_map in VALUE_MAP_2023_EN_TO_FI.items():
for col, val_map in VALUE_MAP_2024_EN_TO_FI.items():
df[col] = df[col].map(val_map).fillna(df[col]).astype("category")
# Drop known bogus data
@@ -192,14 +187,7 @@ def read_data() -> pd.DataFrame:
df = apply_fixups(
df,
[
(
{ID_COL: "a01216a11026d749", VUOSITULOT_COL: 620000},
{VUOSITULOT_COL: 62000},
),
(
{ID_COL: "79a200f529f6919b", VUOSITULOT_COL: 1500},
{VUOSITULOT_COL: 150_000},
),
# ({ID_COL: "..."}, {VUOSITULOT_COL: 62000}),
],
)
# Fill in Vuositulot as 12.5 * Kk-tulot if empty

View File

@@ -33,7 +33,7 @@ def get_categorical_stats(
if na_as_category:
rename_na(df, category_col, na_as_category)
# ... then carry on.
group = df[[category_col, value_col]].groupby(category_col)
group = df[[category_col, value_col]].groupby(category_col, observed=False)
return group[value_col].agg(
["mean", "min", "max", "median", "count", q25, q50, q75, q90],
)

View File

@@ -1,20 +1,20 @@
import bokeh.plotting as bp
import bokeh.models as bm
import bokeh.layouts as bl
import bokeh.models as bm
import bokeh.plotting as bp
from pandas import DataFrame
from pulkka.chart_utils import (
gender_colormap,
get_categorical_stats_plot,
get_df_hover_tool,
set_yaxis_cash,
get_categorical_stats_plot,
)
from pulkka.column_maps import (
IKA_COL,
KAUPUNKI_COL,
SUKUPUOLI_COL,
TYOKOKEMUS_COL,
VUOSITULOT_COL,
KAUPUNKI_COL,
IKA_COL,
SUKUPUOLI_COL,
)
from pulkka.config import OUT_DIR
from pulkka.data_ingest import read_data

View File

@@ -1,9 +1,9 @@
import pandas as pd
from ydata_profiling import ProfileReport
from pulkka.column_maps import KKPALKKA_COL, VUOSITULOT_COL
from pulkka.config import OUT_DIR
from pulkka.data_ingest import read_data, force_age_numeric
from ydata_profiling import ProfileReport
from pulkka.data_ingest import force_age_numeric, read_data
def main():

View File

@@ -79,7 +79,7 @@ def main():
)
env.globals.update(
{
"date": datetime.datetime.utcnow(),
"date": datetime.datetime.now(datetime.UTC),
"cm": column_maps,
"pd": pandas,
"np": numpy,

View File

@@ -3,4 +3,5 @@ jinja2
matplotlib<3.8.0 # ydata-profiling compat
openpyxl
pandas
setuptools # implicitly required by ydata-profiling
ydata-profiling

View File

@@ -1,65 +1,68 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile requirements.in
#
attrs==23.1.0
# This file was autogenerated by uv via the following command:
# uv pip compile requirements.in -o requirements.txt
annotated-types==0.7.0
# via pydantic
attrs==24.2.0
# via visions
bokeh==2.4.3
# via -r requirements.in
certifi==2023.7.22
certifi==2024.8.30
# via requests
charset-normalizer==3.2.0
charset-normalizer==3.4.0
# via requests
contourpy==1.1.1
contourpy==1.3.0
# via matplotlib
cycler==0.11.0
cycler==0.12.1
# via matplotlib
dacite==1.8.1
# via ydata-profiling
et-xmlfile==1.1.0
# via openpyxl
fonttools==4.42.1
fonttools==4.54.1
# via matplotlib
htmlmin==0.1.12
# via ydata-profiling
idna==3.4
idna==3.10
# via requests
imagehash==4.3.1
# via
# visions
# ydata-profiling
jinja2==3.1.2
jinja2==3.1.4
# via
# -r requirements.in
# bokeh
# ydata-profiling
joblib==1.3.2
joblib==1.4.2
# via phik
kiwisolver==1.4.5
kiwisolver==1.4.7
# via matplotlib
markupsafe==2.1.3
llvmlite==0.43.0
# via numba
markupsafe==3.0.1
# via jinja2
matplotlib==3.7.3
matplotlib==3.7.5
# via
# -r requirements.in
# phik
# seaborn
# wordcloud
# ydata-profiling
multimethod==1.10
multimethod==1.12
# via
# visions
# ydata-profiling
networkx==3.1
networkx==3.4.1
# via visions
numpy==1.23.5
numba==0.60.0
# via ydata-profiling
numpy==1.26.4
# via
# bokeh
# contourpy
# imagehash
# matplotlib
# numba
# pandas
# patsy
# phik
@@ -70,14 +73,14 @@ numpy==1.23.5
# visions
# wordcloud
# ydata-profiling
openpyxl==3.1.2
openpyxl==3.1.5
# via -r requirements.in
packaging==23.1
packaging==24.1
# via
# bokeh
# matplotlib
# statsmodels
pandas==2.0.3
pandas==2.2.3
# via
# -r requirements.in
# phik
@@ -85,68 +88,72 @@ pandas==2.0.3
# statsmodels
# visions
# ydata-profiling
patsy==0.5.3
patsy==0.5.6
# via statsmodels
phik==0.12.3
phik==0.12.4
# via ydata-profiling
pillow==10.0.1
pillow==11.0.0
# via
# bokeh
# imagehash
# matplotlib
# visions
# wordcloud
pydantic==1.10.12
pydantic==2.9.2
# via ydata-profiling
pyparsing==3.1.1
pydantic-core==2.23.4
# via pydantic
pyparsing==3.2.0
# via matplotlib
python-dateutil==2.8.2
python-dateutil==2.9.0.post0
# via
# matplotlib
# pandas
pytz==2023.3.post1
pytz==2024.2
# via pandas
pywavelets==1.4.1
pywavelets==1.7.0
# via imagehash
pyyaml==6.0.1
pyyaml==6.0.2
# via
# bokeh
# ydata-profiling
requests==2.31.0
requests==2.32.3
# via ydata-profiling
scipy==1.11.2
scipy==1.13.1
# via
# imagehash
# phik
# statsmodels
# ydata-profiling
seaborn==0.12.2
seaborn==0.13.2
# via ydata-profiling
setuptools==75.2.0
# via -r requirements.in
six==1.16.0
# via
# patsy
# python-dateutil
statsmodels==0.14.0
statsmodels==0.14.4
# via ydata-profiling
tangled-up-in-unicode==0.2.0
# via visions
tornado==6.3.3
tornado==6.4.1
# via bokeh
tqdm==4.66.1
tqdm==4.66.5
# via ydata-profiling
typeguard==2.13.3
typeguard==4.3.0
# via ydata-profiling
typing-extensions==4.8.0
typing-extensions==4.12.2
# via
# bokeh
# pydantic
tzdata==2023.3
# pydantic-core
# typeguard
tzdata==2024.2
# via pandas
urllib3==2.0.5
urllib3==2.2.3
# via requests
visions[type_image_path]==0.7.5
visions==0.7.6
# via ydata-profiling
wordcloud==1.9.2
wordcloud==1.9.3
# via ydata-profiling
ydata-profiling==4.5.1
ydata-profiling==4.11.0
# via -r requirements.in

View File

@@ -1,8 +1,10 @@
[lint]
ignore = [
"E501",
"PD901", # I know "df" isn't the best name
]
extend-select = [
"COM",
"I",
"PD",
]