diff --git a/pulkka/column_maps.py b/pulkka/column_maps.py
index 2b518a0..facbe11 100644
--- a/pulkka/column_maps.py
+++ b/pulkka/column_maps.py
@@ -28,6 +28,7 @@ TYOKOKEMUS_COL = "Työkokemus alalta (vuosina)"
 TYOPAIKKA_COL = "Työpaikka"
 VUOSILASKUTUS_ALV0_COL = "Vuosilaskutus (ALV 0%, euroina)"
 VUOSITULOT_COL = "Vuositulot"
+ID_COL = "Vastaustunniste"
 
 COLUMN_MAP_2023 = {
     "Timestamp": "Timestamp",
diff --git a/pulkka/data_ingest.py b/pulkka/data_ingest.py
index 2fbfcfa..c03ffea 100644
--- a/pulkka/data_ingest.py
+++ b/pulkka/data_ingest.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import hashlib
 import re
 import warnings
 
@@ -34,6 +35,7 @@ from pulkka.column_maps import (
     TYOKOKEMUS_COL,
     ROOLI_NORM_COL,
     TIMESTAMPS_TO_DROP,
+    ID_COL,
 )
 
 
@@ -91,6 +93,11 @@ def ucfirst(val):
     return val
 
 
+def hash_row(r: pd.Series) -> str:
+    source_data = f"{r[LANG_COL]}.{int(r.Timestamp.timestamp() * 1000)}"
+    return hashlib.sha256(source_data.encode()).hexdigest()[:16]
+
+
 def read_initial_dfs() -> pd.DataFrame:
     df_fi: pd.DataFrame = pd.read_excel(
         DATA_DIR / "results-fi.xlsx",
@@ -106,6 +113,10 @@ def read_initial_dfs() -> pd.DataFrame:
     df = pd.concat([df_fi, df_en], ignore_index=True)
     df = df[df["Timestamp"].notna()]  # Remove rows with no timestamp
     df[LANG_COL] = df[LANG_COL].astype("category")
+    # Give each row a unique hash ID
+    df[ID_COL] = df.apply(hash_row, axis=1)
+    # Ensure truncated sha is unique
+    assert len(df[ID_COL].unique()) == len(df)
     return df
 
 
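
For illustration, here is a minimal standalone sketch of the ID scheme this diff introduces: the response ID (ID_COL, "Vastaustunniste") is the first 16 hex characters of a SHA-256 digest of "<language>.<timestamp in milliseconds>". The data and the plain "lang" column below are placeholders, not the real survey columns.

import hashlib

import pandas as pd


def hash_row(lang: str, ts: pd.Timestamp) -> str:
    # SHA-256 over "<language>.<milliseconds since epoch>", truncated to 16 hex chars.
    source_data = f"{lang}.{int(ts.timestamp() * 1000)}"
    return hashlib.sha256(source_data.encode()).hexdigest()[:16]


# Placeholder data; the real pipeline reads LANG_COL and Timestamp from Excel.
df = pd.DataFrame(
    {
        "lang": ["fi", "en"],
        "Timestamp": pd.to_datetime(
            ["2023-05-01 12:00:00.123", "2023-05-01 12:00:07.456"]
        ),
    }
)
df["Vastaustunniste"] = df.apply(lambda r: hash_row(r["lang"], r["Timestamp"]), axis=1)
assert df["Vastaustunniste"].nunique() == len(df)  # same uniqueness check as the diff
print(df)

Because the digest only covers the language and the millisecond timestamp, two responses in the same language with identical timestamps would get the same ID; the assert in the diff catches that case as well as any collision caused by truncating the digest to 16 characters.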