Centralise the data and output directories in pulkka/config.py (DATA_DIR and
OUT_DIR, each overridable through the environment variable of the same name,
defaulting to "data" and "out") and use them in place of the hard-coded
"data/..." and "out/..." path strings in the ingest and generator scripts.

diff --git a/pulkka/config.py b/pulkka/config.py
new file mode 100644
index 0000000..655474e
--- /dev/null
+++ b/pulkka/config.py
@@ -0,0 +1,5 @@
+import os
+from pathlib import Path
+
+DATA_DIR = Path(os.environ.get("DATA_DIR", "data"))
+OUT_DIR = Path(os.environ.get("OUT_DIR", "out"))
diff --git a/pulkka/copy_massaged_data.py b/pulkka/copy_massaged_data.py
index 0b765b9..b710ccb 100644
--- a/pulkka/copy_massaged_data.py
+++ b/pulkka/copy_massaged_data.py
@@ -1,13 +1,14 @@
+from pulkka.config import OUT_DIR
 from pulkka.data_ingest import read_data
 
 
 def main():
     df = read_data()
-    df.to_html("out/data.html", index=False)
-    df.to_csv("out/data.csv", index=False)
-    df.to_excel("out/data.xlsx", index=False)
+    df.to_html(OUT_DIR / "data.html", index=False)
+    df.to_csv(OUT_DIR / "data.csv", index=False)
+    df.to_excel(OUT_DIR / "data.xlsx", index=False)
     df.to_json(
-        "out/data.json",
+        OUT_DIR / "data.json",
         orient="records",
         date_format="iso",
         force_ascii=False,
diff --git a/pulkka/data_ingest.py b/pulkka/data_ingest.py
index 8a0c327..a6da781 100644
--- a/pulkka/data_ingest.py
+++ b/pulkka/data_ingest.py
@@ -3,6 +3,8 @@ import re
 import numpy as np
 import pandas as pd
 
+from pulkka.config import DATA_DIR
+
 COLUMN_MAP = {
     "Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee?": "Kaupunki",
     "Työaika (jos työsuhteessa)": "Työaika",
@@ -61,7 +63,7 @@ def map_ika(d):
 
 def read_data() -> pd.DataFrame:
     df: pd.DataFrame = pd.read_excel(
-        "data/results.xlsx",
+        DATA_DIR / "results.xlsx",
         skiprows=[1],  # Google Sheets exports one empty row
     )
     df.rename(columns=COLUMN_MAP, inplace=True)
diff --git a/pulkka/generate_charts.py b/pulkka/generate_charts.py
index 7de3c22..08a0cc7 100644
--- a/pulkka/generate_charts.py
+++ b/pulkka/generate_charts.py
@@ -9,6 +9,7 @@ from pulkka.chart_utils import (
     set_yaxis_cash,
     get_categorical_stats_plot,
 )
+from pulkka.config import OUT_DIR
 from pulkka.data_ingest import read_data
 
 plot_funcs = set()
@@ -56,7 +57,7 @@ def plot_kaupunki_vuositulot(df: DataFrame):
 def main():
     df = read_data()
     plots = [func(df) for func in sorted(plot_funcs, key=lambda f: f.__name__)]
-    bp.output_file("out/charts.html", title="Koodiklinikan Palkkakysely")
+    bp.output_file(OUT_DIR / "charts.html", title="Koodiklinikan Palkkakysely")
     bp.save(bl.grid(plots, ncols=2, sizing_mode="stretch_both"))
 
 
diff --git a/pulkka/generate_profiling.py b/pulkka/generate_profiling.py
index 24789d4..0942bd5 100644
--- a/pulkka/generate_profiling.py
+++ b/pulkka/generate_profiling.py
@@ -1,3 +1,4 @@
+from pulkka.config import OUT_DIR
 from pulkka.data_ingest import read_data, force_tulot_numeric, force_age_numeric
 from pandas_profiling import ProfileReport
 
@@ -7,7 +8,7 @@ def main():
     df = force_tulot_numeric(df)
     df = force_age_numeric(df)
     profile = ProfileReport(df)
-    profile.to_file("out/profiling_report.html")
+    profile.to_file(OUT_DIR / "profiling_report.html")
 
 
 if __name__ == "__main__":
diff --git a/pulkka/massage_templates.py b/pulkka/massage_templates.py
index eb43ef1..bc49385 100644
--- a/pulkka/massage_templates.py
+++ b/pulkka/massage_templates.py
@@ -6,6 +6,7 @@ import jinja2
 import numpy
 import pandas
 
+from pulkka.config import OUT_DIR
 from pulkka.data_ingest import read_data
 
 
@@ -20,7 +21,7 @@ def main():
         "df": read_data(),
     }
     for filename in glob.glob("template/*"):
-        out_filename = os.path.join("out", os.path.relpath(filename, "template"))
+        out_filename = OUT_DIR / os.path.relpath(filename, "template")
         with open(filename, "r") as inf:
             tpl: jinja2.Template = env.from_string(inf.read())
             content = tpl.render(data)