Packagify pulkka

This commit is contained in:
Aarni Koskela
2022-08-31 15:05:37 +03:00
parent df22bd90f1
commit cdc6d9cc89
9 changed files with 10 additions and 10 deletions

0
pulkka/__init__.py Normal file
View File

84
pulkka/chart_utils.py Normal file
View File

@@ -0,0 +1,84 @@
from bokeh import models as bm, plotting as bp
from bokeh.transform import factor_cmap
from pandas import DataFrame
from pulkka.data_utils import get_categorical_stats
# Radius passed to the circle glyphs drawn by get_categorical_stats_plot;
# the width of the min/max bar is 2.5 times this value.
CAT_Q_RADIUS = 0.1
# Colour mapping keyed on the "Sukupuoli" (gender) column: the two palette
# colours are listed in the same order as the factors "mies" and "nainen".
gender_colormap = factor_cmap("Sukupuoli", ["#4834d4", "#eb4d4b"], ["mies", "nainen"])
def get_df_hover_tool(df: DataFrame):
    """Build a hover tool with one tooltip row per column of ``df``.

    Each row is labelled with the column name and shows the value of the
    field reference ``@{<column name>}``.
    """
    tooltips = []
    for column in df.columns:
        tooltips.append((column, f"@{{{column}}}"))
    return bm.HoverTool(tooltips=tooltips)
def set_yaxis_cash(plot):
    """Label the y-axis "Vuositulot" and give its first axis a "€0" numeral format."""
    euro_formatter = bm.NumeralTickFormatter(format="€0")
    plot.yaxis.axis_label = "Vuositulot"
    plot.yaxis[0].formatter = euro_formatter
def get_categorical_stats_plot(df, *, category, value, na_as_category=None, line=True):
    """Plot per-category summary statistics of ``value``.

    The statistics come from ``get_categorical_stats``. For every category the
    figure shows a bar spanning min..max, circles for the 25th, 75th and 90th
    percentiles, and the median and mean.

    :param df: Source data frame.
    :param category: Name of the column to group by (used as the x-axis).
    :param value: Name of the column to summarise.
    :param na_as_category: Forwarded to ``get_categorical_stats``.
    :param line: When true, median and mean are drawn as lines across the
        categories; otherwise as circles like the percentiles.
    :return: The figure.
    """
    stats = get_categorical_stats(df, category, value, na_as_category=na_as_category)
    stats.reset_index(inplace=True)
    stats[category] = stats[category].astype("category")
    x_values = stats[category]

    plot = bp.figure(
        title=f"{category}/{value}",
        x_range=list(x_values.cat.categories),
    )
    set_yaxis_cash(plot)

    # Bar from the minimum to the maximum of each category.
    plot.vbar(
        x_values,
        CAT_Q_RADIUS * 2.5,
        stats["max"],
        stats["min"],
        color="#a4b0be",
        fill_alpha=0.7,
    )

    # Percentile markers.
    percentile_colors = (
        ("q25", "#f368e0"),
        ("q75", "#00d2d3"),
        ("q90", "#ff9f43"),
    )
    for stat_name, stat_color in percentile_colors:
        plot.circle(
            x_values,
            stats[stat_name],
            radius=CAT_Q_RADIUS,
            legend_label=stat_name,
            color=stat_color,
        )

    # Central tendency: lines or circles depending on ``line``.
    central_colors = (
        ("median", "#1289A7"),
        ("mean", "#B53471"),
    )
    for stat_name, stat_color in central_colors:
        if line:
            plot.line(
                x_values,
                stats[stat_name],
                legend_label=stat_name,
                color=stat_color,
                line_width=4,
            )
        else:
            plot.circle(
                x_values,
                stats[stat_name],
                radius=CAT_Q_RADIUS,
                legend_label=stat_name,
                color=stat_color,
            )
    return plot

View File

@@ -0,0 +1,18 @@
from pulkka.data_ingest import read_data
def main():
    """Write the ingested data to ``out/`` as HTML, CSV, Excel and JSON."""
    df = read_data()

    # The three tabular writers take the same arguments.
    tabular_exports = (
        (df.to_html, "out/data.html"),
        (df.to_csv, "out/data.csv"),
        (df.to_excel, "out/data.xlsx"),
    )
    for write, path in tabular_exports:
        write(path, index=False)

    json_options = {
        "orient": "records",
        "date_format": "iso",
        "force_ascii": False,
    }
    df.to_json("out/data.json", **json_options)


if __name__ == "__main__":
    main()

123
pulkka/data_ingest.py Normal file
View File

@@ -0,0 +1,123 @@
import re
import numpy as np
import pandas as pd
# Maps the long question texts used as column headers in the source sheet to
# the short column names used by the rest of the code (applied with
# DataFrame.rename in read_data).
COLUMN_MAP = {
"Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee?": "Kaupunki",
"Työaika (jos työsuhteessa)": "Työaika",
"Etänä vai paikallisesti?": "Etä",
"Vuositulot (sis. bonukset, osingot yms) / Vuosilaskutus (jos laskutat)": "Vuositulot",
"Kuukausipalkka (jos työntekijä) (brutto)": "Kuukausipalkka",
"Onko palkkasi nykyroolissasi mielestäsi kilpailukykyinen?": "Kilpailukykyinen",
}
# Short labels for the answers in the "Etä" (remote work) column. read_data
# applies this with Series.map, so answers not listed here become missing values.
ETATYO_MAP = {
"Pääosin tai kokonaan etätyö": "Etä",
"Pääosin tai kokonaan toimistolla": "Toimisto",
"Noin 50/50 hybridimalli": "50/50",
}
def map_sukupuoli(value: object):
    """Normalise a free-text gender answer to "nainen", "mies" or "muu".

    :param value: The raw answer. Non-string values (for example NaN for an
        unanswered question) are returned unchanged.
    :return: "nainen" or "mies" when the answer contains a recognised
        spelling, "muu" for any other string, otherwise ``value`` itself.
    """
    if not isinstance(value, str):
        return value

    # Strip surrounding whitespace so that padded answers such as " M " are
    # still recognised by the exact "m" comparison below.
    normalized = value.strip().lower()

    # The female spellings must be tested first: "female" contains "male".
    if "nainen" in normalized or "female" in normalized:
        return "nainen"

    male_spellings = ("mies", "uros", "miäs", "äiä", "male")
    if normalized == "m" or any(spelling in normalized for spelling in male_spellings):
        return "mies"

    # Map the handful of outliers into "muu" (so a given value but not specified)
    return "muu"
def map_vuositulot(r):
    """Return the annual income of row ``r``, estimating it when missing.

    :param r: A mapping-like row with "Vuositulot" and "Kuukausipalkka" entries.
    :return: ``r["Vuositulot"]`` when present, otherwise
        ``r["Kuukausipalkka"] * 12.5``.
    """
    annual = r["Vuositulot"]
    # pd.isna instead of ``is np.nan``: an identity test only recognises the
    # np.nan singleton and misses other NaN objects (float("nan"),
    # np.float64 NaN) as well as None.
    if pd.isna(annual):
        return r["Kuukausipalkka"] * 12.5
    return annual
def map_numberlike(d):
    """Convert a number written as text (possibly with spaces) to a float.

    :param d: Any value. Only strings are examined.
    :return: The parsed float when ``d`` is a string that is a valid number
        once all whitespace is removed; otherwise ``d`` unchanged.
    """
    if not isinstance(d, str):
        return d
    try:
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        return float(re.sub(r"\s+", "", d))
    except ValueError:
        # Not a number after all; keep the original text.
        return d
def map_ika(d):
    """Return the age bracket ``d`` with the early, mislabelled "30-35 v" corrected."""
    # Early answers had a wrong bracket here
    return "31-35 v" if d == "30-35 v" else d
def read_data() -> pd.DataFrame:
    """Load ``data/results.xlsx`` and normalise its columns.

    Steps, in order: rename the question columns via ``COLUMN_MAP``; shorten
    the capital-region city label; normalise gender, age bracket and remote
    work answers into categories; coerce working time to a number clipped to
    0..1; turn "Kyllä"/"Ei" into booleans; parse number-like salary strings;
    strip a trailing "Oy"/"Oyj" from workplace names; fill in missing annual
    income from the monthly salary; and add a derived "Kk-tulot" column.

    :return: The cleaned data frame.
    """
    df: pd.DataFrame = pd.read_excel(
        "data/results.xlsx",
        skiprows=[1],  # Google Sheets exports one empty row
    )
    df.rename(columns=COLUMN_MAP, inplace=True)

    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: the in-place form is a chained update that does not
    # reach ``df`` when pandas Copy-on-Write is active.
    df["Kaupunki"] = df["Kaupunki"].replace(
        "PK-Seutu (Helsinki, Espoo, Vantaa)", "PK-Seutu"
    )
    df["Kaupunki"] = df["Kaupunki"].astype("category")
    df["Sukupuoli"] = df["Sukupuoli"].apply(map_sukupuoli).astype("category")
    df["Ikä"] = df["Ikä"].apply(map_ika).astype("category")

    # Turn työaika into 0% - 100%
    df["Työaika"] = pd.to_numeric(df["Työaika"], errors="coerce").clip(0, 1)

    df["Etä"] = df["Etä"].map(ETATYO_MAP).astype("category")
    df["Kilpailukykyinen"] = df["Kilpailukykyinen"].replace(
        {"Kyllä": True, "Ei": False}
    )

    # Try to clean up numbers with spaces, etc. to real numbers
    df["Kuukausipalkka"] = df["Kuukausipalkka"].apply(map_numberlike)
    df["Vuositulot"] = df["Vuositulot"].apply(map_numberlike)

    # Remove a trailing Oy / Oyj from work places. The suffix is anchored to
    # the end of the name as a whole: the previous pattern r"\s+oy|oyj$"
    # parsed as (\s+oy) | (oyj$), which turned "Nokia Oyj" into "Nokiaj" and
    # removed " oy" from the middle of names.
    df["Työpaikka"] = df["Työpaikka"].replace(
        re.compile(r"\s+oyj?\s*$", flags=re.I), ""
    )

    # Fill in Vuositulot as 12.5 * Kk-tulot if empty
    df["Vuositulot"] = df.apply(map_vuositulot, axis=1)

    # Synthesize kk-tulot from Vuositulot
    df["Kk-tulot"] = pd.to_numeric(df["Vuositulot"], errors="coerce") / 12
    return df
def force_tulot_numeric(df):
    """Coerce the salary columns of ``df`` to numbers in place and return ``df``.

    Values in "Kuukausipalkka" and "Vuositulot" that cannot be parsed as
    numbers become NaN.
    """
    for income_column in ("Kuukausipalkka", "Vuositulot"):
        df[income_column] = pd.to_numeric(df[income_column], errors="coerce")
    return df
def force_age_numeric(df):
    """Replace "NN-MM v" age brackets in ``df["Ikä"]`` with a single number.

    Each category of the (categorical) "Ikä" column that starts with
    "<low>-<high> v" is mapped to the integer midpoint ``(low + high) // 2``.
    Categories that do not match are left as they are.

    :param df: Data frame whose "Ikä" column has a categorical dtype.
    :return: The same ``df``, modified in place.
    """
    age_map = {}
    for cat in df["Ikä"].cat.categories:
        if not isinstance(cat, str):
            # Only textual bracket labels can be parsed.
            continue
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        m = re.match(r"^(\d+)-(\d+) v", cat)
        if m:
            low, high = int(m.group(1)), int(m.group(2))
            # Floor of the midpoint, e.g. "30-35 v" -> 32.
            age_map[cat] = (low + high) // 2
    df["Ikä"] = df["Ikä"].apply(lambda r: age_map.get(r, r))
    return df
def main():
    """Print the first rows of the ingested data with wide display settings."""
    display_options = (
        ("display.max_column", None),
        ("display.max_rows", None),
        ("display.max_seq_items", None),
        ("display.max_colwidth", 500),
        ("expand_frame_repr", True),
    )
    for option_name, option_value in display_options:
        pd.set_option(option_name, option_value)

    print(read_data().head())


if __name__ == "__main__":
    main()

41
pulkka/data_utils.py Normal file
View File

@@ -0,0 +1,41 @@
from typing import Optional
import pandas as pd
def q25(x):
    """Return the 25th percentile of ``x``."""
    return x.quantile(q=0.25)
def q50(x):
    """Return the 50th percentile of ``x``."""
    return x.quantile(q=0.5)
def q75(x):
    """Return the 75th percentile of ``x``."""
    return x.quantile(q=0.75)
def q90(x):
    """Return the 90th percentile of ``x``."""
    return x.quantile(q=0.9)
def get_categorical_stats(
    df: pd.DataFrame,
    category_col: str,
    value_col: str,
    *,
    na_as_category: Optional[str] = None,
) -> pd.DataFrame:
    """Summarise ``value_col`` for each group of ``category_col``.

    Rows whose value is not numeric or not strictly positive are dropped
    before grouping. The input frame is not modified.

    :param df: Source data.
    :param category_col: Name of the column to group by.
    :param value_col: Name of the column to summarise.
    :param na_as_category: When given, rows with a missing category are kept
        and grouped under this label.
    :return: A frame indexed by category with the columns mean, min, max,
        median, count, q25, q50, q75 and q90.
    """
    # Drop records where value is not numeric before grouping...
    df = df.copy()
    df[value_col] = pd.to_numeric(df[value_col], errors="coerce")
    # The comparison must be parenthesised: ``&`` binds tighter than ``>``, so
    # ``a.notna() & a > 0`` is evaluated as ``(a.notna() & a) > 0`` and lets
    # negative values through.
    is_positive = df[value_col].notna() & (df[value_col] > 0)
    # Copy the filtered rows so the assignments below write to an independent
    # frame rather than to a slice of the unfiltered one.
    df = df.loc[is_positive].copy()
    if na_as_category:
        # A categorical column cannot take a label outside its categories, so
        # go through the string dtype to insert the placeholder.
        df[category_col] = df[category_col].astype("string")
        df.loc[df[category_col].isna(), category_col] = na_as_category
        df[category_col] = df[category_col].astype("category")
    # ... then carry on.
    group = df[[category_col, value_col]].groupby(category_col)
    return group[value_col].agg(
        ["mean", "min", "max", "median", "count", q25, q50, q75, q90]
    )

64
pulkka/generate_charts.py Normal file
View File

@@ -0,0 +1,64 @@
import bokeh.plotting as bp
import bokeh.models as bm
import bokeh.layouts as bl
from pandas import DataFrame
from pulkka.chart_utils import (
gender_colormap,
get_df_hover_tool,
set_yaxis_cash,
get_categorical_stats_plot,
)
from pulkka.data_ingest import read_data
# Registry of plot generator functions, filled by the ``plot_this`` decorator.
plot_funcs = set()


def plot_this(fn):
    """
    Decorator for marking a function as a plot generator.

    The function is added to ``plot_funcs`` and returned unchanged, so the
    decorated name still refers to the function (without the return, the
    name would be rebound to ``None``).
    """
    plot_funcs.add(fn)
    return fn
@plot_this
def plot_kokemus_tulot(df: DataFrame):
    """Scatter plot of "Vuositulot" against "Työkokemus", coloured by gender."""
    plot = bp.figure(title="Kokemus/Vuositulot")
    plot.add_tools(get_df_hover_tool(df))
    plot.xaxis.axis_label = "Työkokemus (v)"
    set_yaxis_cash(plot)
    plot.circle(
        x="Työkokemus",
        y="Vuositulot",
        source=bm.ColumnDataSource(df),
        color=gender_colormap,
        size=10,
    )
    return plot
@plot_this
def plot_ika_vuositulot(df: DataFrame):
    """Summary statistics of "Vuositulot" for each "Ikä" bracket."""
    grouping = {"category": "Ikä", "value": "Vuositulot"}
    return get_categorical_stats_plot(df, **grouping)
@plot_this
def plot_sukupuoli_vuositulot(df: DataFrame):
    """Summary statistics of "Vuositulot" per "Sukupuoli"; missing answers are grouped as "EOS"."""
    grouping = {
        "category": "Sukupuoli",
        "value": "Vuositulot",
        "na_as_category": "EOS",
    }
    return get_categorical_stats_plot(df, **grouping)
@plot_this
def plot_kaupunki_vuositulot(df: DataFrame):
    """Summary statistics of "Vuositulot" per "Kaupunki", with vertical x-axis labels.

    Median and mean are drawn as circles instead of lines (``line=False``).
    """
    city_plot = get_categorical_stats_plot(
        df,
        category="Kaupunki",
        value="Vuositulot",
        line=False,
    )
    city_plot.xaxis.major_label_orientation = "vertical"
    return city_plot
def main():
    """Render every registered plot into ``out/charts.html`` as a two-column grid."""
    df = read_data()

    # ``plot_funcs`` is a set; sorting by function name gives a fixed order.
    plots = []
    for plot_func in sorted(plot_funcs, key=lambda f: f.__name__):
        plots.append(plot_func(df))

    bp.output_file("out/charts.html", title="Koodiklinikan Palkkakysely")
    layout = bl.grid(plots, ncols=2, sizing_mode="stretch_both")
    bp.save(layout)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,14 @@
from pulkka.data_ingest import read_data, force_tulot_numeric, force_age_numeric
from pandas_profiling import ProfileReport
def main():
    """Write a profiling report of the data to ``out/profiling_report.html``.

    Salary and age columns are forced to numeric values first.
    """
    numeric_df = force_age_numeric(force_tulot_numeric(read_data()))
    ProfileReport(numeric_df).to_file("out/profiling_report.html")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,33 @@
import datetime
import glob
import os
import jinja2
import numpy
import pandas
from pulkka.data_ingest import read_data
def main():
    """Render every file in ``template/`` with Jinja2 and write it to ``out/``.

    Each template is rendered with these variables: ``date`` (the current UTC
    time), ``pd`` (pandas), ``np`` (numpy) and ``df`` (the ingested data).
    The output file keeps the template's name.
    """
    env = jinja2.Environment(
        autoescape=True,
    )
    data = {
        "date": datetime.datetime.utcnow(),
        "pd": pandas,
        "np": numpy,
        "df": read_data(),
    }
    for filename in glob.glob("template/*"):
        out_filename = os.path.join("out", os.path.relpath(filename, "template"))
        # Read and write explicitly as UTF-8: the platform default encoding
        # varies, and the rendered data contains non-ASCII text.
        with open(filename, "r", encoding="utf-8") as inf:
            tpl: jinja2.Template = env.from_string(inf.read())
        content = tpl.render(data)
        with open(out_filename, "w", encoding="utf-8") as outf:
            outf.write(content)
        print(filename, "=>", out_filename)


if __name__ == "__main__":
    main()