mirror of
https://github.com/koodiklinikka/palkkakysely.git
synced 2026-02-04 11:48:14 +00:00
Add new data bits
This commit is contained in:
@@ -84,6 +84,9 @@ def read_data() -> pd.DataFrame:
|
||||
|
||||
# Fill in Vuositulot as 12.5 * Kk-tulot if empty
|
||||
df["Vuositulot"] = df.apply(map_vuositulot, axis=1)
|
||||
|
||||
# Synthesize kk-tulot from Vuositulot
|
||||
df["Kk-tulot"] = pd.to_numeric(df["Vuositulot"], errors="coerce") / 12
|
||||
return df
|
||||
|
||||
|
||||
@@ -93,6 +96,16 @@ def force_tulot_numeric(df):
|
||||
return df
|
||||
|
||||
|
||||
def force_age_numeric(df):
    """Replace age-range categories in the "Ikä" column with integer midpoints.

    Every category of ``df["Ikä"]`` that starts with ``"<low>-<high> v"``
    (for example ``"26-30 v"``) is mapped to the midpoint of the range,
    rounded to an integer with Python's built-in ``round``. Categories that
    do not match the pattern are left unchanged.

    The column must be categorical, because the ``.cat`` accessor is used to
    enumerate its categories.

    Args:
        df: DataFrame with a categorical "Ikä" column.

    Returns:
        The same DataFrame object, with the "Ikä" column reassigned.
    """
    # Raw string so that ``\d`` is a regex escape, not a string escape.
    age_range = re.compile(r"^(\d+)-(\d+) v")
    age_map = {}
    for cat in df["Ikä"].cat.categories:
        m = age_range.match(cat)
        if m:
            low, high = int(m.group(1)), int(m.group(2))
            # Sum first, then halve: without the parentheses only the upper
            # bound is divided and the result is not the midpoint.
            age_map[cat] = int(round((low + high) / 2))
    df["Ikä"] = df["Ikä"].apply(lambda r: age_map.get(r, r))
    return df
|
||||
|
||||
|
||||
def main():
|
||||
pd.set_option("display.max_column", None)
|
||||
pd.set_option("display.max_rows", None)
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
from data_ingest import read_data, force_tulot_numeric
|
||||
from data_ingest import read_data, force_tulot_numeric, force_age_numeric
|
||||
from pandas_profiling import ProfileReport
|
||||
|
||||
|
||||
def main():
    """Build a pandas-profiling HTML report from the ingested survey data.

    Reads the data, passes it through the income and age numeric
    conversions, and writes the report to ``out/profiling_report.html``.
    """
    # Same pipeline order as before: read -> income conversion -> age conversion.
    prepared = force_age_numeric(force_tulot_numeric(read_data()))
    ProfileReport(prepared).to_file("out/profiling_report.html")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user