From 335cf1506496613d7fee64d0a3158d102be66a4a Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Mon, 10 Oct 2022 12:07:48 +0300 Subject: [PATCH] Apply some data fixes --- pulkka/copy_massaged_data.py | 2 +- pulkka/data_ingest.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pulkka/copy_massaged_data.py b/pulkka/copy_massaged_data.py index b710ccb..d2b3ca9 100644 --- a/pulkka/copy_massaged_data.py +++ b/pulkka/copy_massaged_data.py @@ -4,7 +4,7 @@ from pulkka.data_ingest import read_data def main(): df = read_data() - df.to_html(OUT_DIR / "data.html", index=False) + df.to_html(OUT_DIR / "data.html", index=False, na_rep="") df.to_csv(OUT_DIR / "data.csv", index=False) df.to_excel(OUT_DIR / "data.xlsx", index=False) df.to_json( diff --git a/pulkka/data_ingest.py b/pulkka/data_ingest.py index 03ecdaa..efefb2d 100644 --- a/pulkka/data_ingest.py +++ b/pulkka/data_ingest.py @@ -106,8 +106,10 @@ def read_data() -> pd.DataFrame: # Fill in Vuositulot as 12.5 * Kk-tulot if empty df["Vuositulot"] = df.apply(map_vuositulot, axis=1) - # Fudge a single known outlier + + # Fudge some known outliers df.loc[df.Vuositulot == 912500, 'Vuositulot'] = 91250 + df.loc[df.Kuukausipalkka == 87000, 'Kuukausipalkka'] = 7250 # Synthesize kk-tulot from Vuositulot df["Kk-tulot"] = pd.to_numeric(df["Vuositulot"], errors="coerce") / 12