diff --git a/.gitignore b/.gitignore
index 27a39b0..659603f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 *.py[cod]
-*.html
\ No newline at end of file
+*.html
+out
diff --git a/chart_utils.py b/chart_utils.py
new file mode 100644
index 0000000..6f2db0c
--- /dev/null
+++ b/chart_utils.py
@@ -0,0 +1,44 @@
+from bokeh import models as bm, plotting as bp
+from bokeh.transform import factor_cmap
+from pandas import DataFrame
+
+from data_utils import get_categorical_stats
+
+# Maps the "Sukupuoli" column's "mies"/"nainen" factors to fixed colors.
+gender_colormap = factor_cmap("Sukupuoli", ["#4834d4", "#eb4d4b"], ["mies", "nainen"])
+
+
+def get_df_hover_tool(df: DataFrame):
+    """Return a HoverTool with one tooltip row per column of ``df``."""
+    # The triple braces render as ``@{column}``, the field-reference form that
+    # also works for column names containing spaces or special characters.
+    return bm.HoverTool(tooltips=[(c, f"@{{{c}}}") for c in df.columns])
+
+
+def set_yaxis_cash(plot):
+    """Label the y axis "Vuositulot" and apply the "€0" numeral tick format."""
+    plot.yaxis.axis_label = "Vuositulot"
+    plot.yaxis[0].formatter = bm.NumeralTickFormatter(format="€0")
+
+
+def get_categorical_stats_plot(df, *, category):
+    """Plot min-max bars plus median and mean lines of "Vuositulot" per category.
+
+    ``category`` names the column of ``df`` to group by; returns the figure.
+    """
+    df = get_categorical_stats(df, category, "Vuositulot")
+    df.reset_index(inplace=True)
+    df[category] = df[category].astype("category")
+    plot = bp.figure(
+        title=f"{category}/tulot", x_range=list(df[category].cat.categories)
+    )
+    set_yaxis_cash(plot)
+    # Positional arguments are x, width, top, bottom.
+    plot.vbar(df[category], 0.4, df["max"], df["min"], color="#a4b0be")
+    plot.line(
+        df[category], df["median"], legend_label="median", color="#1289A7", line_width=4
+    )
+    plot.line(
+        df[category], df["mean"], legend_label="mean", color="#B53471", line_width=4
+    )
+    return plot
diff --git a/data_utils.py b/data_utils.py
new file mode 100644
index 0000000..9a4bf3b
--- /dev/null
+++ b/data_utils.py
@@ -0,0 +1,21 @@
+import pandas as pd
+
+
+def get_categorical_stats(
+    df: pd.DataFrame, category_col: str, value_col: str
+) -> pd.DataFrame:
+    """Return mean/min/max/median of ``value_col`` for each ``category_col`` group.
+
+    Rows whose value is non-numeric, missing, or not strictly positive are
+    ignored.  The input frame is not modified.
+    """
+    # Drop records where value is not numeric before grouping...
+    df = df.copy()
+    df[value_col] = pd.to_numeric(df[value_col], errors="coerce")
+    # Parenthesize the comparison: ``&`` binds tighter than ``>``, so without
+    # the parentheses the mask is ``(notna & values) > 0``, not the intended one.
+    # NaN > 0 is False, so coerced values are dropped as well.
+    df = df[df[value_col].notna() & (df[value_col] > 0)]
+    # ... then carry on.
+    group = df[[category_col, value_col]].groupby(category_col)
+    return group[value_col].agg(["mean", "min", "max", "median"])
diff --git a/generate_charts.py b/generate_charts.py
new file mode 100644
index 0000000..f9bdafa
--- /dev/null
+++ b/generate_charts.py
@@ -0,0 +1,59 @@
+import bokeh.plotting as bp
+import bokeh.models as bm
+import bokeh.layouts as bl
+from pandas import DataFrame
+
+from chart_utils import (
+    gender_colormap,
+    get_df_hover_tool,
+    set_yaxis_cash,
+    get_categorical_stats_plot,
+)
+from data_ingest import read_data
+
+# Registry of plot generators, filled in by the ``plot_this`` decorator.
+plot_funcs = set()
+
+
+def plot_this(fn):
+    """
+    Decorator for marking a function as a plot generator.
+
+    Registers ``fn`` in ``plot_funcs`` and returns it unchanged, so the
+    decorated name stays bound to the function instead of ``None``.
+    """
+    plot_funcs.add(fn)
+    return fn
+
+
+@plot_this
+def plot_kokemus_tulot(df: DataFrame):
+    """Scatter "Vuositulot" against "Työkokemus", colored by gender_colormap."""
+    source = bm.ColumnDataSource(df)
+    plot = bp.figure(title="Kokemus/Tulot")
+    plot.add_tools(get_df_hover_tool(df))
+    plot.xaxis.axis_label = "Työkokemus (v)"
+    set_yaxis_cash(plot)
+    plot.circle(
+        x="Työkokemus", y="Vuositulot", source=source, color=gender_colormap, size=10
+    )
+    return plot
+
+
+@plot_this
+def plot_ika_tulot(df: DataFrame):
+    """Plot the per-"Ikä" income statistics chart."""
+    return get_categorical_stats_plot(df, category="Ikä")
+
+
+def main():
+    """Read the data, render every registered plot, and save out/charts.html."""
+    df = read_data()
+    # Sets are unordered; sort by name for a deterministic chart order.
+    plots = [func(df) for func in sorted(plot_funcs, key=lambda f: f.__name__)]
+    bp.output_file("out/charts.html", title="Koodiklinikan Palkkakysely")
+    bp.save(bl.column(plots))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/out/.gitkeep b/out/.gitkeep
new file mode 100644
index 0000000..e69de29