mirror of
https://github.com/koodiklinikka/palkkakysely.git
synced 2026-01-26 11:23:59 +00:00
Merge pull request #17 from koodiklinikka/ids
Ids, vertical HTML, data fixups
This commit is contained in:
@@ -28,6 +28,7 @@ TYOKOKEMUS_COL = "Työkokemus alalta (vuosina)"
|
||||
TYOPAIKKA_COL = "Työpaikka"
|
||||
VUOSILASKUTUS_ALV0_COL = "Vuosilaskutus (ALV 0%, euroina)"
|
||||
VUOSITULOT_COL = "Vuositulot"
|
||||
ID_COL = "Vastaustunniste"
|
||||
|
||||
COLUMN_MAP_2023 = {
|
||||
"Timestamp": "Timestamp",
|
||||
@@ -172,7 +173,6 @@ NO_GENDER_VALUES = {
|
||||
"jänis",
|
||||
"kyllä, kiitos",
|
||||
"leppäkerttu",
|
||||
"taisteluhelikopteri",
|
||||
"tihkutympönen",
|
||||
"yes",
|
||||
}
|
||||
@@ -182,7 +182,7 @@ OTHER_GENDER_VALUES = {
|
||||
"non-binary, afab",
|
||||
}
|
||||
|
||||
TIMESTAMPS_TO_DROP = {
|
||||
# See "SUBMITTED TWICE, SORRY!!" in English data:
|
||||
"2023-09-08 13:24:46.740",
|
||||
IDS_TO_DROP = {
|
||||
"6cab61607da9c2b6", # hupsu taisteluhelikopteri
|
||||
"aefdb9e69b1621d5", # See "SUBMITTED TWICE, SORRY!!" in English data
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
import warnings
|
||||
|
||||
@@ -33,11 +34,13 @@ from pulkka.column_maps import (
|
||||
OTHER_GENDER_VALUES,
|
||||
TYOKOKEMUS_COL,
|
||||
ROOLI_NORM_COL,
|
||||
TIMESTAMPS_TO_DROP,
|
||||
ID_COL,
|
||||
IDS_TO_DROP,
|
||||
)
|
||||
|
||||
|
||||
def map_sukupuoli(value: str) -> str | None:
|
||||
def map_sukupuoli(r: pd.Series) -> str | None:
|
||||
value = r[SUKUPUOLI_COL]
|
||||
if not isinstance(value, str):
|
||||
return value
|
||||
|
||||
@@ -67,7 +70,7 @@ def map_sukupuoli(value: str) -> str | None:
|
||||
if value in OTHER_GENDER_VALUES:
|
||||
return "muu"
|
||||
|
||||
raise NotImplementedError(f"Unknown sukupuoli: {value}")
|
||||
raise NotImplementedError(f"Unknown sukupuoli: {value} (row ID {r[ID_COL]})")
|
||||
|
||||
|
||||
def map_vuositulot(r):
|
||||
@@ -91,6 +94,11 @@ def ucfirst(val):
|
||||
return val
|
||||
|
||||
|
||||
def hash_row(r: pd.Series) -> str:
|
||||
source_data = f"{r[LANG_COL]}.{int(r.Timestamp.timestamp() * 1000)}"
|
||||
return hashlib.sha256(source_data.encode()).hexdigest()[:16]
|
||||
|
||||
|
||||
def read_initial_dfs() -> pd.DataFrame:
|
||||
df_fi: pd.DataFrame = pd.read_excel(
|
||||
DATA_DIR / "results-fi.xlsx",
|
||||
@@ -106,6 +114,10 @@ def read_initial_dfs() -> pd.DataFrame:
|
||||
df = pd.concat([df_fi, df_en], ignore_index=True)
|
||||
df = df[df["Timestamp"].notna()] # Remove rows with no timestamp
|
||||
df[LANG_COL] = df[LANG_COL].astype("category")
|
||||
# Give each row a unique hash ID
|
||||
df[ID_COL] = df.apply(hash_row, axis=1)
|
||||
# Ensure truncated sha is unique
|
||||
assert len(df[ID_COL].unique()) == len(df)
|
||||
return df
|
||||
|
||||
|
||||
@@ -137,13 +149,10 @@ def read_data() -> pd.DataFrame:
|
||||
for col, val_map in VALUE_MAP_2023_EN_TO_FI.items():
|
||||
df[col] = df[col].map(val_map).fillna(df[col]).astype("category")
|
||||
|
||||
# Drop bogus data
|
||||
df = df.drop(df[df[SUKUPUOLI_COL] == "taisteluhelikopteri"].index)
|
||||
# Drop known bogus data
|
||||
df = df.drop(df[df[ID_COL].isin(IDS_TO_DROP)].index)
|
||||
|
||||
# Drop rows by timestamps known to be duplicate
|
||||
df = df.drop(df[df["Timestamp"].isin(TIMESTAMPS_TO_DROP)].index)
|
||||
|
||||
df[SUKUPUOLI_COL] = df[SUKUPUOLI_COL].apply(map_sukupuoli).astype("category")
|
||||
df[SUKUPUOLI_COL] = df.apply(map_sukupuoli, axis=1).astype("category")
|
||||
df[IKA_COL] = df[IKA_COL].astype("category")
|
||||
|
||||
# Assume that people entering 37.5 (hours) as their työaika means 100%
|
||||
@@ -180,11 +189,19 @@ def read_data() -> pd.DataFrame:
|
||||
df[TYOKOKEMUS_COL] = df[TYOKOKEMUS_COL].round()
|
||||
|
||||
# Fix known bogus data
|
||||
df.loc[
|
||||
(df[KKPALKKA_COL] == 4900) & (df[VUOSITULOT_COL] == 620000),
|
||||
VUOSITULOT_COL,
|
||||
] = 62000
|
||||
|
||||
df = apply_fixups(
|
||||
df,
|
||||
[
|
||||
(
|
||||
{ID_COL: "a01216a11026d749", VUOSITULOT_COL: 620000},
|
||||
{VUOSITULOT_COL: 62000},
|
||||
),
|
||||
(
|
||||
{ID_COL: "79a200f529f6919b", VUOSITULOT_COL: 1500},
|
||||
{VUOSITULOT_COL: 150_000},
|
||||
),
|
||||
],
|
||||
)
|
||||
# Fill in Vuositulot as 12.5 * Kk-tulot if empty
|
||||
df[VUOSITULOT_COL] = df.apply(map_vuositulot, axis=1)
|
||||
|
||||
@@ -252,3 +269,16 @@ def main():
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
def apply_fixups(df: pd.DataFrame, fixups: list[tuple[dict, dict]]) -> pd.DataFrame:
|
||||
for match_cond, replace_cond in fixups:
|
||||
match_keys, match_values = zip(*match_cond.items())
|
||||
ix = df[list(match_keys)].eq(list(match_values)).all(axis=1)
|
||||
if not ix.any():
|
||||
raise ValueError(
|
||||
f"Fixup match condition {match_cond} did not match any rows",
|
||||
)
|
||||
replace_keys, replace_values = zip(*replace_cond.items())
|
||||
df.loc[ix, list(replace_keys)] = replace_values
|
||||
return df
|
||||
|
||||
@@ -26,6 +26,18 @@ def write_massaged_files(env, df):
|
||||
body_class="table-body",
|
||||
),
|
||||
)
|
||||
with open(OUT_DIR / "data-vertical.html", "w") as f:
|
||||
with io.StringIO() as s:
|
||||
for _, row in df.iterrows():
|
||||
row.dropna().to_frame().to_html(s, header=False, na_rep="", border=0)
|
||||
s.write("\n")
|
||||
table_html = s.getvalue()
|
||||
f.write(
|
||||
env.get_template("_table.html").render(
|
||||
table_html=table_html,
|
||||
body_class="table-body",
|
||||
),
|
||||
)
|
||||
df.to_csv(OUT_DIR / "data.csv", index=False)
|
||||
df.to_excel(OUT_DIR / "data.xlsx", index=False)
|
||||
df.to_json(
|
||||
|
||||
@@ -48,6 +48,7 @@
|
||||
<ul>
|
||||
<li><a href="data.csv">Lähdedata (CSV)</a></li>
|
||||
<li><a href="data.html">Lähdedata (HTML)</a></li>
|
||||
<li><a href="data-vertical.html">Vastaukset eriteltyinä (HTML)</a></li>
|
||||
<li><a href="data.json">Lähdedata (JSON)</a></li>
|
||||
<li><a href="data.xlsx">Lähdedata (XLSX)</a></li>
|
||||
</ul>
|
||||
|
||||
@@ -27,6 +27,7 @@ body.table-body {
|
||||
|
||||
body.table-body table {
|
||||
border-collapse: collapse;
|
||||
margin-bottom: 1em;
|
||||
}
|
||||
|
||||
body.table-body td,
|
||||
@@ -35,6 +36,10 @@ body.table-body th {
|
||||
border: 1px solid #999;
|
||||
}
|
||||
|
||||
body.table-body tr th {
|
||||
text-align: left;
|
||||
}
|
||||
|
||||
h1,
|
||||
h2,
|
||||
h3 {
|
||||
|
||||
Reference in New Issue
Block a user