Improve column maps, drop duplicate row

This commit is contained in:
Aarni Koskela
2023-09-25 09:18:09 +03:00
parent d75eedbebb
commit d71d0a188c
2 changed files with 23 additions and 10 deletions

View File

@@ -33,6 +33,8 @@ from pulkka.column_maps import (
OTHER_GENDER_VALUES,
TYOKOKEMUS_COL,
ROOLI_NORM_COL,
PALAUTE_COL,
TIMESTAMPS_TO_DROP,
)
@@ -139,6 +141,9 @@ def read_data() -> pd.DataFrame:
# Drop bogus data
df = df.drop(df[df[SUKUPUOLI_COL] == "taisteluhelikopteri"].index)
# Drop rows whose timestamps are known to be duplicates
df = df.drop(df[df["Timestamp"].isin(TIMESTAMPS_TO_DROP)].index)
df[SUKUPUOLI_COL] = df[SUKUPUOLI_COL].apply(map_sukupuoli).astype("category")
df[IKA_COL] = df[IKA_COL].astype("category")