Merge pull request #17 from koodiklinikka/ids

Ids, vertical HTML, data fixups
2026-03-12 16:03:25 +00:00 · 2023-09-28 16:48:32 +03:00
parent 2195f6621d 773aad8749
commit 001610bbc3
5 changed files with 66 additions and 18 deletions
--- a/pulkka/column_maps.py
+++ b/pulkka/column_maps.py
@@ -28,6 +28,7 @@ TYOKOKEMUS_COL = "Työkokemus alalta (vuosina)"
 TYOPAIKKA_COL = "Työpaikka"
 VUOSILASKUTUS_ALV0_COL = "Vuosilaskutus (ALV 0%, euroina)"
 VUOSITULOT_COL = "Vuositulot"
+ID_COL = "Vastaustunniste"

 COLUMN_MAP_2023 = {
    "Timestamp": "Timestamp",
@@ -172,7 +173,6 @@ NO_GENDER_VALUES = {
    "jänis",
    "kyllä, kiitos",
    "leppäkerttu",
-    "taisteluhelikopteri",
    "tihkutympönen",
    "yes",
 }
@@ -182,7 +182,7 @@ OTHER_GENDER_VALUES = {
    "non-binary, afab",
 }

-TIMESTAMPS_TO_DROP = {
-    # See "SUBMITTED TWICE, SORRY!!" in English data:
-    "2023-09-08 13:24:46.740",
+IDS_TO_DROP = {  # ID_COL values of response rows that read_data() drops as known bogus data
+    "6cab61607da9c2b6",  # silly "taisteluhelikopteri" (attack helicopter) answer
+    "aefdb9e69b1621d5",  # See "SUBMITTED TWICE, SORRY!!" in English data
+}
--- a/pulkka/data_ingest.py
+++ b/pulkka/data_ingest.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import hashlib
 import re
 import warnings

@@ -33,11 +34,13 @@ from pulkka.column_maps import (
    OTHER_GENDER_VALUES,
    TYOKOKEMUS_COL,
    ROOLI_NORM_COL,
-    TIMESTAMPS_TO_DROP,
+    ID_COL,
+    IDS_TO_DROP,
 )


-def map_sukupuoli(value: str) -> str | None:
+def map_sukupuoli(r: pd.Series) -> str | None:
+    value = r[SUKUPUOLI_COL]
    if not isinstance(value, str):
        return value

@@ -67,7 +70,7 @@ def map_sukupuoli(value: str) -> str | None:
    if value in OTHER_GENDER_VALUES:
        return "muu"

-    raise NotImplementedError(f"Unknown sukupuoli: {value}")
+    raise NotImplementedError(f"Unknown sukupuoli: {value} (row ID {r[ID_COL]})")


 def map_vuositulot(r):
@@ -91,6 +94,11 @@ def ucfirst(val):
    return val


+def hash_row(r: pd.Series) -> str:  # Return a short hex ID for one row, derived from its LANG_COL value and Timestamp.
+    source_data = f"{r[LANG_COL]}.{int(r.Timestamp.timestamp() * 1000)}"  # "<LANG_COL value>.<timestamp() * 1000 truncated to int>" -- presumably epoch milliseconds; confirm
+    return hashlib.sha256(source_data.encode()).hexdigest()[:16]  # keep only the first 16 hex characters of the SHA-256 digest
+
+
 def read_initial_dfs() -> pd.DataFrame:
    df_fi: pd.DataFrame = pd.read_excel(
        DATA_DIR / "results-fi.xlsx",
@@ -106,6 +114,10 @@ def read_initial_dfs() -> pd.DataFrame:
    df = pd.concat([df_fi, df_en], ignore_index=True)
    df = df[df["Timestamp"].notna()]  # Remove rows with no timestamp
    df[LANG_COL] = df[LANG_COL].astype("category")
+    # Give each row a unique hash ID
+    df[ID_COL] = df.apply(hash_row, axis=1)
+    # Ensure truncated sha is unique
+    assert len(df[ID_COL].unique()) == len(df)
    return df


@@ -137,13 +149,10 @@ def read_data() -> pd.DataFrame:
    for col, val_map in VALUE_MAP_2023_EN_TO_FI.items():
        df[col] = df[col].map(val_map).fillna(df[col]).astype("category")

-    # Drop bogus data
-    df = df.drop(df[df[SUKUPUOLI_COL] == "taisteluhelikopteri"].index)
+    # Drop known bogus data
+    df = df.drop(df[df[ID_COL].isin(IDS_TO_DROP)].index)

-    # Drop rows by timestamps known to be duplicate
-    df = df.drop(df[df["Timestamp"].isin(TIMESTAMPS_TO_DROP)].index)
-
-    df[SUKUPUOLI_COL] = df[SUKUPUOLI_COL].apply(map_sukupuoli).astype("category")
+    df[SUKUPUOLI_COL] = df.apply(map_sukupuoli, axis=1).astype("category")
    df[IKA_COL] = df[IKA_COL].astype("category")

    # Assume that people entering 37.5 (hours) as their työaika means 100%
@@ -180,11 +189,19 @@ def read_data() -> pd.DataFrame:
    df[TYOKOKEMUS_COL] = df[TYOKOKEMUS_COL].round()

    # Fix known bogus data
-    df.loc[
-        (df[KKPALKKA_COL] == 4900) & (df[VUOSITULOT_COL] == 620000),
-        VUOSITULOT_COL,
-    ] = 62000
-
+    df = apply_fixups(
+        df,
+        [
+            (
+                {ID_COL: "a01216a11026d749", VUOSITULOT_COL: 620000},
+                {VUOSITULOT_COL: 62000},
+            ),
+            (
+                {ID_COL: "79a200f529f6919b", VUOSITULOT_COL: 1500},
+                {VUOSITULOT_COL: 150_000},
+            ),
+        ],
+    )
    # Fill in Vuositulot as 12.5 * Kk-tulot if empty
    df[VUOSITULOT_COL] = df.apply(map_vuositulot, axis=1)

@@ -252,3 +269,16 @@ def main():

 if __name__ == "__main__":
    main()
+
+
+def apply_fixups(df: pd.DataFrame, fixups: list[tuple[dict, dict]]) -> pd.DataFrame:  # Apply each (match, replacement) fixup to df in place and return df.
+    for match_cond, replace_cond in fixups:  # each fixup is ({column: required value}, {column: new value})
+        match_keys, match_values = zip(*match_cond.items())  # split the dict into parallel tuples of columns and values
+        ix = df[list(match_keys)].eq(list(match_values)).all(axis=1)  # boolean row mask: True where every match column equals its required value
+        if not ix.any():  # a fixup that matches no row is reported as an error instead of being ignored
+            raise ValueError(
+                f"Fixup match condition {match_cond} did not match any rows",
+            )
+        replace_keys, replace_values = zip(*replace_cond.items())  # same split for the replacement columns and values
+        df.loc[ix, list(replace_keys)] = replace_values  # overwrite the replacement columns on every matched row
+    return df  # same object as the argument. NOTE(review): this def sits below the `__main__` guard; if main() reaches read_data() in a script run, the name is not bound yet -- confirm and move above the guard
--- a/pulkka/massage_outputs.py
+++ b/pulkka/massage_outputs.py
@@ -26,6 +26,18 @@ def write_massaged_files(env, df):
                body_class="table-body",
            ),
        )
+    with open(OUT_DIR / "data-vertical.html", "w") as f:
+        with io.StringIO() as s:
+            for _, row in df.iterrows():
+                row.dropna().to_frame().to_html(s, header=False, na_rep="", border=0)
+                s.write("\n")
+            table_html = s.getvalue()
+        f.write(
+            env.get_template("_table.html").render(
+                table_html=table_html,
+                body_class="table-body",
+            ),
+        )
    df.to_csv(OUT_DIR / "data.csv", index=False)
    df.to_excel(OUT_DIR / "data.xlsx", index=False)
    df.to_json(
--- a/template/index.html
+++ b/template/index.html
@@ -48,6 +48,7 @@
    <ul>
        <li><a href="data.csv">Lähdedata (CSV)</a></li>
        <li><a href="data.html">Lähdedata (HTML)</a></li>
+        <li><a href="data-vertical.html">Vastaukset eriteltyinä (HTML)</a></li>
        <li><a href="data.json">Lähdedata (JSON)</a></li>
        <li><a href="data.xlsx">Lähdedata (XLSX)</a></li>
    </ul>
--- a/template/style.css
+++ b/template/style.css
@@ -27,6 +27,7 @@ body.table-body {

 body.table-body table {
  border-collapse: collapse;
+  margin-bottom: 1em;
 }

 body.table-body td,
@@ -35,6 +36,10 @@ body.table-body th {
  border: 1px solid #999;
 }

+body.table-body tr th {
+  text-align: left;
+}
+
 h1,
 h2,
 h3 {