Packagify pulkka

2026-02-23 00:55:55 +00:00 · 2022-08-31 15:05:37 +03:00
parent df22bd90f1
commit cdc6d9cc89
9 changed files with 10 additions and 10 deletions
--- a/pulkka/__init__.py
+++ b/pulkka/__init__.py
--- a/pulkka/chart_utils.py
+++ b/pulkka/chart_utils.py
@@ -0,0 +1,84 @@
+from bokeh import models as bm, plotting as bp
+from bokeh.transform import factor_cmap
+from pandas import DataFrame
+
+from pulkka.data_utils import get_categorical_stats
+
+# Radius of the per-category marker circles in get_categorical_stats_plot;
+# the min/max bar there is drawn 2.5 times this wide.
+CAT_Q_RADIUS = 0.1
+
+# Color mapping on the "Sukupuoli" field for the factors "mies" and "nainen".
+gender_colormap = factor_cmap("Sukupuoli", ["#4834d4", "#eb4d4b"], ["mies", "nainen"])
+
+
+def get_df_hover_tool(df: DataFrame):
+    """
+    Build a hover tool whose tooltip lists every column of `df`.
+    """
+    tooltips = []
+    for column in df.columns:
+        # Each entry is (label, "@{column}"); the braces wrap the column name.
+        tooltips.append((column, f"@{{{column}}}"))
+    return bm.HoverTool(tooltips=tooltips)
+
+
+def set_yaxis_cash(plot):
+    """
+    Label the y axis of `plot` as "Vuositulot" and format its ticks as euros.
+    """
+    euro_formatter = bm.NumeralTickFormatter(format="€0")
+    plot.yaxis.axis_label = "Vuositulot"
+    plot.yaxis[0].formatter = euro_formatter
+
+
+def get_categorical_stats_plot(df, *, category, value, na_as_category=None, line=True):
+    """
+    Plot per-category statistics of `value` grouped by `category`.
+
+    A bar spans min..max for each category, circles mark q25/q75/q90, and the
+    median and mean are drawn as lines (`line=True`) or as circles.
+    """
+    stats = get_categorical_stats(df, category, value, na_as_category=na_as_category)
+    stats = stats.reset_index()
+    stats[category] = stats[category].astype("category")
+    categories = stats[category]
+
+    plot = bp.figure(
+        title=f"{category}/{value}", x_range=list(categories.cat.categories)
+    )
+    set_yaxis_cash(plot)
+
+    # Full range of the values as a translucent bar.
+    plot.vbar(
+        categories,
+        CAT_Q_RADIUS * 2.5,
+        stats["max"],
+        stats["min"],
+        color="#a4b0be",
+        fill_alpha=0.7,
+    )
+
+    # Quantile markers, in legend order.
+    quantile_colors = (("q25", "#f368e0"), ("q75", "#00d2d3"), ("q90", "#ff9f43"))
+    for stat_name, stat_color in quantile_colors:
+        plot.circle(
+            categories,
+            stats[stat_name],
+            radius=CAT_Q_RADIUS,
+            legend_label=stat_name,
+            color=stat_color,
+        )
+
+    # Central tendency: connected lines or stand-alone markers.
+    for stat_name, stat_color in (("median", "#1289A7"), ("mean", "#B53471")):
+        if line:
+            plot.line(
+                categories,
+                stats[stat_name],
+                legend_label=stat_name,
+                color=stat_color,
+                line_width=4,
+            )
+        else:
+            plot.circle(
+                categories,
+                stats[stat_name],
+                radius=CAT_Q_RADIUS,
+                legend_label=stat_name,
+                color=stat_color,
+            )
+    return plot
--- a/pulkka/copy_massaged_data.py
+++ b/pulkka/copy_massaged_data.py
@@ -0,0 +1,18 @@
+from pulkka.data_ingest import read_data
+
+
+def main():
+    """
+    Export the cleaned-up survey data as HTML, CSV, Excel and JSON under "out/".
+    """
+    df = read_data()
+    # The tabular formats share the same "no index column" setting.
+    tabular_writers = (
+        (df.to_html, "out/data.html"),
+        (df.to_csv, "out/data.csv"),
+        (df.to_excel, "out/data.xlsx"),
+    )
+    for write, path in tabular_writers:
+        write(path, index=False)
+    json_options = dict(orient="records", date_format="iso", force_ascii=False)
+    df.to_json("out/data.json", **json_options)
+
+
+if __name__ == "__main__":
+    main()
--- a/pulkka/data_ingest.py
+++ b/pulkka/data_ingest.py
@@ -0,0 +1,123 @@
+import re
+
+import numpy as np
+import pandas as pd
+
+# Long survey question headers -> short column names (used by read_data to
+# rename the columns of the spreadsheet).
+COLUMN_MAP = {
+    "Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee?": "Kaupunki",
+    "Työaika (jos työsuhteessa)": "Työaika",
+    "Etänä vai paikallisesti?": "Etä",
+    "Vuositulot (sis. bonukset, osingot yms) / Vuosilaskutus (jos laskutat)": "Vuositulot",
+    "Kuukausipalkka (jos työntekijä) (brutto)": "Kuukausipalkka",
+    "Onko palkkasi nykyroolissasi mielestäsi kilpailukykyinen?": "Kilpailukykyinen",
+}
+
+# Remote-work answers -> short labels (read_data maps the "Etä" column
+# through this).
+ETATYO_MAP = {
+    "Pääosin tai kokonaan etätyö": "Etä",
+    "Pääosin tai kokonaan toimistolla": "Toimisto",
+    "Noin 50/50 hybridimalli": "50/50",
+}
+
+
+def map_sukupuoli(value: str):
+    """
+    Normalize a free-text gender answer into "nainen", "mies" or "muu".
+
+    Non-string values (e.g. missing answers) are returned unchanged.
+    """
+    if not isinstance(value, str):
+        return value
+
+    answer = value.lower()
+    # Check the female markers first: "female" also contains "male".
+    if any(marker in answer for marker in ("nainen", "female")):
+        return "nainen"
+
+    male_markers = ("mies", "uros", "miäs", "äiä", "male")
+    if answer == "m" or any(marker in answer for marker in male_markers):
+        return "mies"
+
+    # Map the handful of outliers into "muu" (so a given value but not specified)
+    return "muu"
+
+
+def map_vuositulot(r):
+    """
+    Return the yearly income of row `r`, estimating it when it is missing.
+
+    :param r: A row (or mapping) with "Vuositulot" and "Kuukausipalkka" entries.
+    :return: "Vuositulot" as-is when present, else 12.5 * "Kuukausipalkka".
+    """
+    yearly = r["Vuositulot"]
+    # Use pd.isna instead of `is np.nan`: an identity check only recognizes the
+    # np.nan singleton and misses other NaN objects (float("nan"), np.float64).
+    if pd.isna(yearly):
+        return r["Kuukausipalkka"] * 12.5
+    return yearly
+
+
+def map_numberlike(d):
+    """
+    Parse a number-like string (e.g. "4 500") into a float.
+
+    :param d: Any cell value.
+    :return: The parsed float when `d` is a string that is a valid number once
+             all whitespace is removed; otherwise `d` unchanged.
+    """
+    if isinstance(d, str):
+        try:
+            # Raw string: "\s" in a plain literal is an invalid escape sequence.
+            return float(re.sub(r"\s+", "", d))
+        except ValueError:
+            # Not a number after all; fall through and keep the original text.
+            pass
+    return d
+
+
+def map_ika(d):
+    """
+    Correct the mislabeled "30-35 v" age bracket; other values pass through.
+    """
+    # Early answers had a wrong bracket here
+    return "31-35 v" if d == "30-35 v" else d
+
+
+def read_data() -> pd.DataFrame:
+    """
+    Read the survey answers from "data/results.xlsx" and clean them up.
+
+    Columns are renamed per COLUMN_MAP, categorical answers are normalized and
+    converted to category dtype, number-like strings are parsed, and a
+    "Kk-tulot" column is derived as "Vuositulot" / 12.
+
+    :return: The cleaned-up survey data.
+    """
+    df: pd.DataFrame = pd.read_excel(
+        "data/results.xlsx",
+        skiprows=[1],  # Google Sheets exports one empty row
+    )
+    df.rename(columns=COLUMN_MAP, inplace=True)
+
+    # Assign the result back rather than calling replace(inplace=True) on a
+    # column selection: that is chained assignment and does not update `df`
+    # when pandas Copy-on-Write is active.
+    df["Kaupunki"] = df["Kaupunki"].replace(
+        "PK-Seutu (Helsinki, Espoo, Vantaa)", "PK-Seutu"
+    )
+    df["Kaupunki"] = df["Kaupunki"].astype("category")
+    df["Sukupuoli"] = df["Sukupuoli"].apply(map_sukupuoli).astype("category")
+    df["Ikä"] = df["Ikä"].apply(map_ika).astype("category")
+    # Turn työaika into 0% - 100%
+    df["Työaika"] = pd.to_numeric(df["Työaika"], errors="coerce").clip(0, 1)
+
+    df["Etä"] = df["Etä"].map(ETATYO_MAP).astype("category")
+    df["Kilpailukykyinen"] = df["Kilpailukykyinen"].replace(
+        {"Kyllä": True, "Ei": False}
+    )
+
+    # Try to clean up numbers with spaces, etc. to real numbers
+    df["Kuukausipalkka"] = df["Kuukausipalkka"].apply(map_numberlike)
+    df["Vuositulot"] = df["Vuositulot"].apply(map_numberlike)
+
+    # Remove Oy, Oyj, etc. from work places.
+    # "oyj?" followed by a word boundary strips the whole suffix; the previous
+    # pattern r"\s+oy|oyj$" matched " Oy" first and turned "Foo Oyj" into "Fooj".
+    company_suffix = re.compile(r"\s+oyj?\b", flags=re.I)
+    df["Työpaikka"] = df["Työpaikka"].replace(company_suffix, "")
+
+    # Fill in Vuositulot as 12.5 * Kk-tulot if empty
+    df["Vuositulot"] = df.apply(map_vuositulot, axis=1)
+
+    # Synthesize kk-tulot from Vuositulot
+    df["Kk-tulot"] = pd.to_numeric(df["Vuositulot"], errors="coerce") / 12
+    return df
+
+
+def force_tulot_numeric(df):
+    """
+    Coerce the income columns of `df` to numbers, in place.
+
+    Values that cannot be parsed become NaN. Returns the same `df`.
+    """
+    for income_column in ("Kuukausipalkka", "Vuositulot"):
+        df[income_column] = pd.to_numeric(df[income_column], errors="coerce")
+    return df
+
+
+def force_age_numeric(df):
+    """
+    Replace "NN-MM v" age brackets in the "Ikä" column with their midpoint.
+
+    The midpoint is an integer, rounded down. Categories that do not look like
+    a bracket are left unchanged. `df` is modified in place and returned.
+
+    :param df: Frame whose "Ikä" column has a categorical dtype.
+    """
+    # Raw string: "\d" in a plain literal is an invalid escape sequence.
+    bracket_re = re.compile(r"(\d+)-(\d+) v")
+    age_map = {}
+    for cat in df["Ikä"].cat.categories:
+        m = bracket_re.match(cat)
+        if m:
+            low, high = int(m.group(1)), int(m.group(2))
+            # Floor division states the rounding explicitly; the earlier
+            # int(round(low + high) / 2) produced the same truncated value.
+            age_map[cat] = (low + high) // 2
+    df["Ikä"] = df["Ikä"].apply(lambda r: age_map.get(r, r))
+    return df
+
+
+def main():
+    """
+    Print the first rows of the cleaned-up data with untruncated display.
+    """
+    # Full option names: abbreviated names only resolve through pandas' regex
+    # matching and can break if a similarly named option is added later.
+    pd.set_option("display.max_columns", None)
+    pd.set_option("display.max_rows", None)
+    pd.set_option("display.max_seq_items", None)
+    pd.set_option("display.max_colwidth", 500)
+    pd.set_option("display.expand_frame_repr", True)
+    df = read_data()
+    print(df.head())
+
+
+if __name__ == "__main__":
+    main()
--- a/pulkka/data_utils.py
+++ b/pulkka/data_utils.py
@@ -0,0 +1,41 @@
+from typing import Optional
+
+import pandas as pd
+
+
+def q25(x):
+    """
+    Return the 0.25 quantile of `x`.
+    """
+    return x.quantile(q=0.25)
+
+
+def q50(x):
+    """
+    Return the 0.5 quantile of `x`.
+    """
+    return x.quantile(q=0.5)
+
+
+def q75(x):
+    """
+    Return the 0.75 quantile of `x`.
+    """
+    return x.quantile(q=0.75)
+
+
+def q90(x):
+    """
+    Return the 0.9 quantile of `x`.
+    """
+    return x.quantile(q=0.9)
+
+
+def get_categorical_stats(
+    df: pd.DataFrame,
+    category_col: str,
+    value_col: str,
+    *,
+    na_as_category: Optional[str] = None,
+) -> pd.DataFrame:
+    """
+    Compute summary statistics of `value_col` for each value of `category_col`.
+
+    Rows whose value is not numeric or not positive are ignored. The caller's
+    frame is not modified.
+
+    :param df: Source data.
+    :param category_col: Column to group by.
+    :param value_col: Column to summarize.
+    :param na_as_category: If given, rows with a missing category are grouped
+                           under this label.
+    :return: Frame indexed by category with the columns mean, min, max,
+             median, count, q25, q50, q75 and q90.
+    """
+    # Drop records where value is not numeric before grouping...
+    df = df.copy()
+    df[value_col] = pd.to_numeric(df[value_col], errors="coerce")
+    # The comparison must be parenthesized: `&` binds tighter than `>`, so
+    # `notna() & values > 0` would be evaluated as `(notna() & values) > 0`.
+    is_usable = df[value_col].notna() & (df[value_col] > 0)
+    df = df[is_usable].copy()
+    if na_as_category:
+        df[category_col] = df[category_col].astype("string")
+        df.loc[df[category_col].isna(), category_col] = na_as_category
+        df[category_col] = df[category_col].astype("category")
+    # ... then carry on.
+    group = df[[category_col, value_col]].groupby(category_col)
+    return group[value_col].agg(
+        ["mean", "min", "max", "median", "count", q25, q50, q75, q90]
+    )
--- a/pulkka/generate_charts.py
+++ b/pulkka/generate_charts.py
@@ -0,0 +1,64 @@
+import bokeh.plotting as bp
+import bokeh.models as bm
+import bokeh.layouts as bl
+from pandas import DataFrame
+
+from pulkka.chart_utils import (
+    gender_colormap,
+    get_df_hover_tool,
+    set_yaxis_cash,
+    get_categorical_stats_plot,
+)
+from pulkka.data_ingest import read_data
+
+# Registry of plot generator functions, filled in by the `plot_this` decorator.
+plot_funcs = set()
+
+
+def plot_this(fn):
+    """
+    Decorator for marking a function as a plot generator.
+
+    Registers `fn` in `plot_funcs` and returns it unchanged, so the decorated
+    name stays bound to the function instead of being rebound to None.
+    """
+    plot_funcs.add(fn)
+    return fn
+
+
+@plot_this
+def plot_kokemus_tulot(df: DataFrame):
+    """
+    Scatter plot of "Työkokemus" against "Vuositulot", colored by gender.
+    """
+    plot = bp.figure(title="Kokemus/Vuositulot")
+    plot.add_tools(get_df_hover_tool(df))
+    plot.xaxis.axis_label = "Työkokemus (v)"
+    set_yaxis_cash(plot)
+    plot.circle(
+        x="Työkokemus",
+        y="Vuositulot",
+        source=bm.ColumnDataSource(df),
+        color=gender_colormap,
+        size=10,
+    )
+    return plot
+
+
+@plot_this
+def plot_ika_vuositulot(df: DataFrame):
+    """
+    Categorical statistics plot of "Vuositulot" grouped by "Ikä".
+    """
+    plot = get_categorical_stats_plot(df, value="Vuositulot", category="Ikä")
+    return plot
+
+
+@plot_this
+def plot_sukupuoli_vuositulot(df: DataFrame):
+    """
+    Categorical statistics plot of "Vuositulot" grouped by "Sukupuoli".
+
+    Rows without a gender are grouped under "EOS".
+    """
+    options = dict(category="Sukupuoli", value="Vuositulot", na_as_category="EOS")
+    return get_categorical_stats_plot(df, **options)
+
+
+@plot_this
+def plot_kaupunki_vuositulot(df: DataFrame):
+    """
+    Categorical statistics plot of "Vuositulot" grouped by "Kaupunki".
+
+    Median and mean are drawn as markers, and the x axis labels are vertical.
+    """
+    city_plot = get_categorical_stats_plot(
+        df,
+        value="Vuositulot",
+        category="Kaupunki",
+        line=False,
+    )
+    city_plot.xaxis.major_label_orientation = "vertical"
+    return city_plot
+
+
+def main():
+    """
+    Render every registered plot into one HTML page at "out/charts.html".
+    """
+    df = read_data()
+    # `plot_funcs` is a set, so sort by function name for a fixed chart order.
+    ordered_funcs = sorted(plot_funcs, key=lambda f: f.__name__)
+    plots = []
+    for func in ordered_funcs:
+        plots.append(func(df))
+    bp.output_file("out/charts.html", title="Koodiklinikan Palkkakysely")
+    layout = bl.grid(plots, ncols=2, sizing_mode="stretch_both")
+    bp.save(layout)
+
+
+if __name__ == "__main__":
+    main()
--- a/pulkka/generate_profiling.py
+++ b/pulkka/generate_profiling.py
@@ -0,0 +1,14 @@
+from pulkka.data_ingest import read_data, force_tulot_numeric, force_age_numeric
+from pandas_profiling import ProfileReport
+
+
+def main():
+    """
+    Write a profiling report of the survey data to "out/profiling_report.html".
+    """
+    # Incomes and ages are forced numeric before the data is profiled.
+    numeric_df = force_age_numeric(force_tulot_numeric(read_data()))
+    ProfileReport(numeric_df).to_file("out/profiling_report.html")
+
+
+if __name__ == "__main__":
+    main()
--- a/pulkka/massage_templates.py
+++ b/pulkka/massage_templates.py
@@ -0,0 +1,33 @@
+import datetime
+import glob
+import os
+
+import jinja2
+import numpy
+import pandas
+
+from pulkka.data_ingest import read_data
+
+
+def main():
+    """
+    Render every file under "template/" with Jinja2 into the "out/" directory.
+
+    Templates get `date` (current naive UTC datetime), `pd`, `np` and `df`
+    (the cleaned-up survey data) as context.
+    """
+    env = jinja2.Environment(
+        autoescape=True,
+    )
+    data = {
+        "date": datetime.datetime.utcnow(),
+        "pd": pandas,
+        "np": numpy,
+        "df": read_data(),
+    }
+    for filename in glob.glob("template/*"):
+        out_filename = os.path.join("out", os.path.relpath(filename, "template"))
+        # Explicit UTF-8: without it the platform default encoding is used,
+        # which can corrupt non-ASCII text when reading or writing.
+        with open(filename, "r", encoding="utf-8") as inf:
+            tpl: jinja2.Template = env.from_string(inf.read())
+            content = tpl.render(data)
+        with open(out_filename, "w", encoding="utf-8") as outf:
+            outf.write(content)
+        print(filename, "=>", out_filename)
+
+
+if __name__ == "__main__":
+    main()