mirror of
https://github.com/koodiklinikka/palkkakysely.git
synced 2026-02-23 00:55:55 +00:00
Packagify pulkka
This commit is contained in:
0
pulkka/__init__.py
Normal file
0
pulkka/__init__.py
Normal file
84
pulkka/chart_utils.py
Normal file
84
pulkka/chart_utils.py
Normal file
@@ -0,0 +1,84 @@
|
||||
from bokeh import models as bm, plotting as bp
|
||||
from bokeh.transform import factor_cmap
|
||||
from pandas import DataFrame
|
||||
|
||||
from pulkka.data_utils import get_categorical_stats
|
||||
|
||||
# Radius handed to the quantile/median/mean circle glyphs; the min-max bar
# width is derived from it as well.
CAT_Q_RADIUS = 0.1

# Colour transform keyed on the "Sukupuoli" column: one palette entry per
# listed factor, in the same order.
gender_colormap = factor_cmap(
    "Sukupuoli",
    palette=["#4834d4", "#eb4d4b"],
    factors=["mies", "nainen"],
)
|
||||
|
||||
|
||||
def get_df_hover_tool(df: DataFrame):
    """
    Build a hover tool with one tooltip row per column of ``df``.

    Every row is labelled with the column name and displays the data-source
    field of the same name, referenced as ``@{column}``.
    """
    tooltip_rows = []
    for column in df.columns:
        field_ref = f"@{{{column}}}"
        tooltip_rows.append((column, field_ref))
    return bm.HoverTool(tooltips=tooltip_rows)
|
||||
|
||||
|
||||
def set_yaxis_cash(plot):
    """
    Label the y axis "Vuositulot" and format its ticks with the "€0" pattern.

    The plot is modified in place; nothing is returned.
    """
    euro_ticks = bm.NumeralTickFormatter(format="€0")
    plot.yaxis.axis_label = "Vuositulot"
    plot.yaxis[0].formatter = euro_ticks
|
||||
|
||||
|
||||
def get_categorical_stats_plot(df, *, category, value, na_as_category=None, line=True):
    """
    Plot per-category summary statistics of ``value`` as a bokeh figure.

    The statistics come from ``get_categorical_stats``. For every category
    the figure shows a bar spanning min..max, circles for the q25/q75/q90
    quantiles, and the median and mean drawn either as lines (``line=True``)
    or as circles (``line=False``).

    :param df: Source data frame.
    :param category: Name of the column to group by (used as the x axis).
    :param value: Name of the numeric column to summarise.
    :param na_as_category: Forwarded to ``get_categorical_stats``.
    :param line: Draw median/mean as lines instead of circles.
    :return: The populated bokeh figure.
    """
    stats = get_categorical_stats(df, category, value, na_as_category=na_as_category)
    stats = stats.reset_index()
    stats[category] = stats[category].astype("category")
    x_values = stats[category]

    plot = bp.figure(
        title=f"{category}/{value}",
        x_range=list(x_values.cat.categories),
    )
    set_yaxis_cash(plot)

    # Range bar; the positional arguments are x, width, top and bottom.
    plot.vbar(
        x_values,
        CAT_Q_RADIUS * 2.5,
        stats["max"],
        stats["min"],
        color="#a4b0be",
        fill_alpha=0.7,
    )

    # Quantile markers, always drawn as circles.
    quantile_styles = (
        ("q25", "#f368e0"),
        ("q75", "#00d2d3"),
        ("q90", "#ff9f43"),
    )
    for stat_name, stat_color in quantile_styles:
        plot.circle(
            x_values,
            stats[stat_name],
            radius=CAT_Q_RADIUS,
            legend_label=stat_name,
            color=stat_color,
        )

    # Central tendencies: connected lines or stand-alone circles.
    central_styles = (
        ("median", "#1289A7"),
        ("mean", "#B53471"),
    )
    for stat_name, stat_color in central_styles:
        if line:
            plot.line(
                x_values,
                stats[stat_name],
                legend_label=stat_name,
                color=stat_color,
                line_width=4,
            )
        else:
            plot.circle(
                x_values,
                stats[stat_name],
                radius=CAT_Q_RADIUS,
                legend_label=stat_name,
                color=stat_color,
            )
    return plot
|
||||
18
pulkka/copy_massaged_data.py
Normal file
18
pulkka/copy_massaged_data.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from pulkka.data_ingest import read_data
|
||||
|
||||
|
||||
def main():
    """
    Read the cleaned survey data and export it under ``out/`` as HTML, CSV,
    Excel and JSON, all without the index column.
    """
    frame = read_data()
    frame.to_html("out/data.html", index=False)
    frame.to_csv("out/data.csv", index=False)
    frame.to_excel("out/data.xlsx", index=False)
    json_options = {
        "orient": "records",
        "date_format": "iso",
        "force_ascii": False,
    }
    frame.to_json("out/data.json", **json_options)


if __name__ == "__main__":
    main()
|
||||
123
pulkka/data_ingest.py
Normal file
123
pulkka/data_ingest.py
Normal file
@@ -0,0 +1,123 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Long survey question headers -> short column names used throughout the code.
COLUMN_MAP = {
    "Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee?": "Kaupunki",
    "Työaika (jos työsuhteessa)": "Työaika",
    "Etänä vai paikallisesti?": "Etä",
    "Vuositulot (sis. bonukset, osingot yms) / Vuosilaskutus (jos laskutat)": "Vuositulot",
    "Kuukausipalkka (jos työntekijä) (brutto)": "Kuukausipalkka",
    "Onko palkkasi nykyroolissasi mielestäsi kilpailukykyinen?": "Kilpailukykyinen",
}

# Full remote-work answer texts -> short labels stored in the "Etä" column.
ETATYO_MAP = {
    "Pääosin tai kokonaan etätyö": "Etä",
    "Pääosin tai kokonaan toimistolla": "Toimisto",
    "Noin 50/50 hybridimalli": "50/50",
}
|
||||
|
||||
|
||||
def map_sukupuoli(value: str):
    """
    Normalise a free-text gender answer to "nainen", "mies" or "muu".

    Matching is case-insensitive and substring based. The female markers are
    checked first, because "female" also contains "male". Non-string input
    (e.g. a missing value) is returned unchanged.
    """
    if not isinstance(value, str):
        return value

    lowered = value.lower()
    if any(marker in lowered for marker in ("nainen", "female")):
        return "nainen"

    male_markers = ("mies", "uros", "miäs", "äiä", "male")
    if lowered == "m" or any(marker in lowered for marker in male_markers):
        return "mies"

    # The handful of remaining outliers: an answer was given, but it is not
    # one of the recognised values.
    return "muu"
|
||||
|
||||
|
||||
def map_vuositulot(r):
    """
    Return the annual income for one row, estimating it when it is missing.

    :param r: A row-like mapping with "Vuositulot" and "Kuukausipalkka" keys.
    :return: The row's "Vuositulot" value as-is when it is present; otherwise
             12.5 times the numeric "Kuukausipalkka" (NaN when that is
             missing or not a number).
    """
    annual = r["Vuositulot"]
    # pd.isna() recognises every missing-value flavour (float NaN, numpy NaN,
    # None); an identity test against np.nan only matches that one object.
    if pd.isna(annual):
        # Coerce so that an unparseable monthly salary yields NaN rather than
        # raising on str * float.
        monthly = pd.to_numeric(r["Kuukausipalkka"], errors="coerce")
        return monthly * 12.5
    return annual
|
||||
|
||||
|
||||
def map_numberlike(d):
    """
    Convert a number written as text (possibly with spaces) to a float.

    All whitespace is removed from string input before parsing, so values
    such as "4 500" become 4500.0. Strings that still do not parse as a
    float, and all non-string values, are returned unchanged.
    """
    if isinstance(d, str):
        try:
            # Raw string: "\s" in a plain literal is an invalid escape sequence.
            return float(re.sub(r"\s+", "", d))
        except ValueError:
            # Not number-like after all; fall through and keep the text.
            pass
    return d
|
||||
|
||||
|
||||
def map_ika(d):
    """
    Correct the mislabelled "30-35 v" age bracket to "31-35 v".

    Early answers had a wrong bracket here. Every other value is returned
    unchanged.
    """
    return "31-35 v" if d == "30-35 v" else d
|
||||
|
||||
|
||||
def read_data() -> pd.DataFrame:
    """
    Load the survey answers from ``data/results.xlsx`` and clean them up.

    Steps, in order: shorten the column names, normalise the city, gender,
    age and remote-work answers into categoricals, clip the working-time
    fraction to 0..1, turn the yes/no competitiveness answer into booleans,
    parse number-like salary text, strip "Oy"/"Oyj" suffixes from workplace
    names, fill missing annual income from the monthly salary and add a
    derived "Kk-tulot" (annual income / 12) column.

    :return: The cleaned data frame.
    """
    df: pd.DataFrame = pd.read_excel(
        "data/results.xlsx",
        skiprows=[1],  # Google Sheets exports one empty row
    )
    df = df.rename(columns=COLUMN_MAP)

    # Assign the replace() results back instead of calling replace(inplace=True)
    # on a selected column: mutating through such a selection is not guaranteed
    # to reach the frame itself.
    df["Kaupunki"] = (
        df["Kaupunki"]
        .replace("PK-Seutu (Helsinki, Espoo, Vantaa)", "PK-Seutu")
        .astype("category")
    )
    df["Sukupuoli"] = df["Sukupuoli"].apply(map_sukupuoli).astype("category")
    df["Ikä"] = df["Ikä"].apply(map_ika).astype("category")
    # Turn työaika into 0% - 100%
    df["Työaika"] = pd.to_numeric(df["Työaika"], errors="coerce").clip(0, 1)

    df["Etä"] = df["Etä"].map(ETATYO_MAP).astype("category")
    df["Kilpailukykyinen"] = df["Kilpailukykyinen"].replace(
        {"Kyllä": True, "Ei": False}
    )

    # Try to clean up numbers with spaces, etc. to real numbers
    df["Kuukausipalkka"] = df["Kuukausipalkka"].apply(map_numberlike)
    df["Vuositulot"] = df["Vuositulot"].apply(map_numberlike)

    # Remove a trailing " Oy" / " Oyj" from work places. The alternation is
    # grouped and anchored as a whole, with the longer suffix first, so that
    # "Foo Oyj" becomes "Foo" (not "Fooj") and " oy" inside a name is kept.
    company_suffix = re.compile(r"\s+(?:oyj|oy)$", flags=re.I)
    df["Työpaikka"] = df["Työpaikka"].replace(company_suffix, "", regex=True)

    # Fill in Vuositulot as 12.5 * Kk-tulot if empty
    df["Vuositulot"] = df.apply(map_vuositulot, axis=1)

    # Synthesize kk-tulot from Vuositulot
    df["Kk-tulot"] = pd.to_numeric(df["Vuositulot"], errors="coerce") / 12
    return df
|
||||
|
||||
|
||||
def force_tulot_numeric(df):
    """
    Coerce the "Kuukausipalkka" and "Vuositulot" columns to numbers.

    Values that cannot be parsed become NaN. The frame is modified in place
    and also returned.
    """
    for income_column in ("Kuukausipalkka", "Vuositulot"):
        df[income_column] = pd.to_numeric(df[income_column], errors="coerce")
    return df
|
||||
|
||||
|
||||
def force_age_numeric(df):
    """
    Replace "<low>-<high> v" age brackets in the "Ikä" column with a number.

    Each category matching that pattern is mapped to the integer midpoint of
    its bounds, truncated towards zero (e.g. "15-20 v" -> 17). Categories
    that do not match are left as they are. The frame is modified in place
    and also returned.

    :param df: Data frame whose "Ikä" column has a categorical dtype.
    """
    age_map = {}
    for cat in df["Ikä"].cat.categories:
        # Only text can be matched; skip anything else instead of raising.
        if not isinstance(cat, str):
            continue
        m = re.match(r"^(\d+)-(\d+) v", cat)
        if m:
            low, high = int(m.group(1)), int(m.group(2))
            age_map[cat] = (low + high) // 2
    df["Ikä"] = df["Ikä"].apply(lambda r: age_map.get(r, r))
    return df
|
||||
|
||||
|
||||
def main():
    """
    Print the first rows of the cleaned data with pandas' display limits
    lifted, for eyeballing the ingest result.
    """
    # Full option names: abbreviated names are resolved by pattern matching
    # and can stop being unique when pandas adds similarly named options.
    pd.set_option("display.max_columns", None)
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_seq_items", None)
    pd.set_option("display.max_colwidth", 500)
    pd.set_option("display.expand_frame_repr", True)
    df = read_data()
    print(df.head())


if __name__ == "__main__":
    main()
|
||||
41
pulkka/data_utils.py
Normal file
41
pulkka/data_utils.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def q25(x):
    """
    Return the 0.25 quantile of ``x``.
    """
    lower_quartile = x.quantile(q=0.25)
    return lower_quartile
|
||||
|
||||
|
||||
def q50(x):
    """
    Return the 0.5 quantile of ``x``.
    """
    middle = x.quantile(q=0.5)
    return middle
|
||||
|
||||
|
||||
def q75(x):
    """
    Return the 0.75 quantile of ``x``.
    """
    upper_quartile = x.quantile(q=0.75)
    return upper_quartile
|
||||
|
||||
|
||||
def q90(x):
    """
    Return the 0.9 quantile of ``x``.
    """
    ninetieth = x.quantile(q=0.9)
    return ninetieth
|
||||
|
||||
|
||||
def get_categorical_stats(
    df: pd.DataFrame,
    category_col: str,
    value_col: str,
    *,
    na_as_category: Optional[str] = None,
) -> pd.DataFrame:
    """
    Compute summary statistics of ``value_col`` for each ``category_col`` value.

    Rows whose value is missing, not numeric, or not strictly positive are
    dropped first. The input frame itself is never modified.

    :param df: Source data frame.
    :param category_col: Name of the column to group by.
    :param value_col: Name of the column to summarise.
    :param na_as_category: When given (and non-empty), rows with a missing
                           category are grouped under this label.
    :return: A frame indexed by category with the columns mean, min, max,
             median, count, q25, q50, q75 and q90.
    """
    # Drop records where value is not numeric before grouping...
    df = df.copy()
    df[value_col] = pd.to_numeric(df[value_col], errors="coerce")
    # The comparison needs its own parentheses: "&" binds tighter than ">",
    # so without them the mask is (notna & values) > 0, which lets negative
    # values through.
    is_valid = df[value_col].notna() & (df[value_col] > 0)
    # Copy the filtered rows so the assignments below act on an independent
    # frame rather than on a slice.
    df = df[is_valid].copy()
    if na_as_category:
        df[category_col] = df[category_col].astype("string")
        df.loc[df[category_col].isna(), category_col] = na_as_category
        df[category_col] = df[category_col].astype("category")
    # ... then carry on.
    # observed=False spells out how categorical groupers are handled here:
    # categories left without rows still get a result row.
    group = df[[category_col, value_col]].groupby(category_col, observed=False)
    return group[value_col].agg(
        ["mean", "min", "max", "median", "count", q25, q50, q75, q90]
    )
|
||||
64
pulkka/generate_charts.py
Normal file
64
pulkka/generate_charts.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import bokeh.plotting as bp
|
||||
import bokeh.models as bm
|
||||
import bokeh.layouts as bl
|
||||
from pandas import DataFrame
|
||||
|
||||
from pulkka.chart_utils import (
|
||||
gender_colormap,
|
||||
get_df_hover_tool,
|
||||
set_yaxis_cash,
|
||||
get_categorical_stats_plot,
|
||||
)
|
||||
from pulkka.data_ingest import read_data
|
||||
|
||||
# Registry of every function marked with @plot_this.
plot_funcs = set()


def plot_this(fn):
    """
    Decorator for marking a function as a plot generator.

    The function is added to ``plot_funcs`` and returned unchanged, so the
    decorated name keeps referring to the function instead of ``None``.
    """
    plot_funcs.add(fn)
    return fn
|
||||
|
||||
|
||||
@plot_this
def plot_kokemus_tulot(df: DataFrame):
    """
    Scatter plot of "Vuositulot" against "Työkokemus", one circle per row,
    coloured with ``gender_colormap`` and with a hover tool over all columns.
    """
    figure = bp.figure(title="Kokemus/Vuositulot")
    figure.add_tools(get_df_hover_tool(df))
    figure.xaxis.axis_label = "Työkokemus (v)"
    set_yaxis_cash(figure)
    figure.circle(
        x="Työkokemus",
        y="Vuositulot",
        source=bm.ColumnDataSource(df),
        color=gender_colormap,
        size=10,
    )
    return figure
|
||||
|
||||
|
||||
@plot_this
def plot_ika_vuositulot(df: DataFrame):
    """
    Categorical statistics plot of "Vuositulot" grouped by "Ikä".
    """
    plot = get_categorical_stats_plot(df, category="Ikä", value="Vuositulot")
    return plot
|
||||
|
||||
|
||||
@plot_this
def plot_sukupuoli_vuositulot(df: DataFrame):
    """
    Categorical statistics plot of "Vuositulot" grouped by "Sukupuoli";
    rows without a gender are grouped under "EOS".
    """
    plot = get_categorical_stats_plot(
        df,
        category="Sukupuoli",
        value="Vuositulot",
        na_as_category="EOS",
    )
    return plot
|
||||
|
||||
|
||||
@plot_this
def plot_kaupunki_vuositulot(df: DataFrame):
    """
    Categorical statistics plot of "Vuositulot" grouped by "Kaupunki",
    with median/mean drawn as circles and vertical x-axis labels.
    """
    city_plot = get_categorical_stats_plot(
        df,
        category="Kaupunki",
        value="Vuositulot",
        line=False,
    )
    city_plot.xaxis.major_label_orientation = "vertical"
    return city_plot
|
||||
|
||||
|
||||
def main():
    """
    Render every registered plot and save them as a two-column grid in
    ``out/charts.html``.
    """
    df = read_data()
    # Sort by function name so the chart order does not depend on set ordering.
    plots = []
    for plot_func in sorted(plot_funcs, key=lambda f: f.__name__):
        plots.append(plot_func(df))
    bp.output_file("out/charts.html", title="Koodiklinikan Palkkakysely")
    layout = bl.grid(plots, ncols=2, sizing_mode="stretch_both")
    bp.save(layout)


if __name__ == "__main__":
    main()
|
||||
14
pulkka/generate_profiling.py
Normal file
14
pulkka/generate_profiling.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from pulkka.data_ingest import read_data, force_tulot_numeric, force_age_numeric
|
||||
from pandas_profiling import ProfileReport
|
||||
|
||||
|
||||
def main():
    """
    Write a profiling report of the survey data, with incomes and ages
    forced to numbers, to ``out/profiling_report.html``.
    """
    numeric_df = force_age_numeric(force_tulot_numeric(read_data()))
    ProfileReport(numeric_df).to_file("out/profiling_report.html")


if __name__ == "__main__":
    main()
|
||||
33
pulkka/massage_templates.py
Normal file
33
pulkka/massage_templates.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import datetime
|
||||
import glob
|
||||
import os
|
||||
|
||||
import jinja2
|
||||
import numpy
|
||||
import pandas
|
||||
|
||||
from pulkka.data_ingest import read_data
|
||||
|
||||
|
||||
def main():
    """
    Render every file matched by ``template/*`` with Jinja2 into ``out/``.

    Each template is rendered with ``date`` (current UTC time as a naive
    datetime), ``pd`` (pandas), ``np`` (numpy) and ``df`` (the cleaned
    survey data) in its context. Files are read and written as UTF-8.
    """
    env = jinja2.Environment(
        autoescape=True,
    )
    data = {
        # Same naive UTC value utcnow() produced, without the deprecated call.
        "date": datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None),
        "pd": pandas,
        "np": numpy,
        "df": read_data(),
    }
    for filename in glob.glob("template/*"):
        out_filename = os.path.join("out", os.path.relpath(filename, "template"))
        # Explicit encoding: the platform default is locale dependent and may
        # not be able to represent the non-ASCII text in the templates/data.
        with open(filename, "r", encoding="utf-8") as inf:
            tpl: jinja2.Template = env.from_string(inf.read())
        content = tpl.render(data)
        # Make sure the target directory exists before writing into it.
        os.makedirs(os.path.dirname(out_filename), exist_ok=True)
        with open(out_filename, "w", encoding="utf-8") as outf:
            outf.write(content)
        print(filename, "=>", out_filename)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user