diff --git a/data_ingest.py b/data_ingest.py
index 9beb070..a8db4ed 100644
--- a/data_ingest.py
+++ b/data_ingest.py
@@ -84,6 +84,9 @@ def read_data() -> pd.DataFrame:
 
     # Fill in Vuositulot as 12.5 * Kk-tulot if empty
     df["Vuositulot"] = df.apply(map_vuositulot, axis=1)
+
+    # Synthesize Kk-tulot as Vuositulot / 12 (replaces the whole column; non-numeric -> NaN)
+    df["Kk-tulot"] = pd.to_numeric(df["Vuositulot"], errors="coerce") / 12
 
     return df
 
@@ -93,6 +96,23 @@ def force_tulot_numeric(df):
     return df
 
 
+def force_age_numeric(df):
+    """Map "<low>-<high> v" age brackets in df["Ikä"] to their integer midpoint.
+
+    Categories that do not match the bracket pattern are left as they are.
+    Modifies ``df`` in place and returns it.
+    """
+    age_map = {}
+    for cat in df["Ikä"].cat.categories:
+        m = re.match(r"^(\d+)-(\d+) v", cat)
+        if m:
+            low, high = int(m.group(1)), int(m.group(2))
+            # Parenthesize the sum: "low + high / 2" would only halve the upper bound.
+            age_map[cat] = round((low + high) / 2)
+    df["Ikä"] = df["Ikä"].apply(lambda r: age_map.get(r, r))
+    return df
+
+
 def main():
     pd.set_option("display.max_column", None)
     pd.set_option("display.max_rows", None)
diff --git a/generate_profiling.py b/generate_profiling.py
index a751276..792150e 100644
--- a/generate_profiling.py
+++ b/generate_profiling.py
@@ -1,10 +1,11 @@
-from data_ingest import read_data, force_tulot_numeric
+from data_ingest import read_data, force_tulot_numeric, force_age_numeric
 from pandas_profiling import ProfileReport
 
 
 def main():
     df = read_data()
     df = force_tulot_numeric(df)
+    df = force_age_numeric(df)
     profile = ProfileReport(df)
     profile.to_file("out/profiling_report.html")
 