mirror of
https://github.com/koodiklinikka/palkkakysely.git
synced 2026-01-26 03:14:03 +00:00
Initial data ingestion/massage
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -0,0 +1,2 @@
|
||||
*.py[cod]
|
||||
*.html
|
||||
93
data_ingest.py
Normal file
93
data_ingest.py
Normal file
@@ -0,0 +1,93 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Maps the long question texts used as column headers in the results
# spreadsheet to the short column names the rest of this module works with.
# Applied via DataFrame.rename() in read_data().
COLUMN_MAP = {
    "Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee?": "Kaupunki",
    "Työaika (jos työsuhteessa)": "Työaika",
    "Etänä vai paikallisesti?": "Etä",
    "Vuositulot (sis. bonukset, osingot yms) / Vuosilaskutus (jos laskutat)": "Vuositulot",
    "Kuukausipalkka (jos työntekijä) (brutto)": "Kuukausipalkka",
    "Onko palkkasi nykyroolissasi mielestäsi kilpailukykyinen?": "Kilpailukykyinen",
}
|
||||
|
||||
# Maps the full answer texts of the remote-work question to short labels.
# Applied to the "Etä" column with Series.map() in read_data().
ETATYO_MAP = {
    "Pääosin tai kokonaan etätyö": "Etä",
    "Pääosin tai kokonaan toimisto": "Toimisto",
    "Noin 50/50 hybridimalli": "50/50",
}
|
||||
|
||||
|
||||
def map_sukupuoli(value: str):
    """Normalize a free-text gender answer to "nainen", "mies" or "muu".

    Matching is case-insensitive and substring-based. The female keywords
    are tested first, so "female" is not caught by the "male" keyword.
    Any other string is bucketed as "muu" (an answer was given, but it is
    not one of the recognized spellings). Non-string values (e.g. missing
    cells) are returned unchanged.
    """
    if not isinstance(value, str):
        return value

    lowered = value.lower()

    female_keywords = ("nainen", "female")
    if any(keyword in lowered for keyword in female_keywords):
        return "nainen"

    male_keywords = ("mies", "uros", "miäs", "äiä", "male")
    if lowered == "m" or any(keyword in lowered for keyword in male_keywords):
        return "mies"

    # Map the handful of outliers into "muu".
    return "muu"
|
||||
|
||||
|
||||
def map_vuositulot(r):
    """Return the yearly income for row *r*, estimating it when missing.

    *r* is a row-like mapping with "Vuositulot" and "Kuukausipalkka" keys
    (read_data() passes each DataFrame row via ``df.apply(..., axis=1)``).
    If the yearly figure is missing, it is estimated as 12.5 times the
    monthly salary; otherwise the reported yearly figure is returned as-is.
    """
    yearly = r["Vuositulot"]
    # Use pd.isna() rather than `is np.nan`: the identity test only matches
    # the np.nan singleton itself, so other NaN objects (float("nan"),
    # np.float64 NaN) and None would be treated as real values and the
    # fallback would silently be skipped.
    if pd.isna(yearly):
        # NOTE(review): the 12.5 multiplier presumably accounts for holiday
        # pay on top of 12 monthly salaries -- confirm with the survey authors.
        return r["Kuukausipalkka"] * 12.5
    return yearly
|
||||
|
||||
|
||||
def map_numberlike(d):
    """Convert a number-like string such as "4 500" to a float.

    All whitespace is removed from string input before parsing. Strings
    that still cannot be parsed as a float, and all non-string values,
    are returned unchanged.
    """
    if isinstance(d, str):
        try:
            # Raw string: "\s" in a plain literal is an invalid escape
            # sequence and triggers a warning on current Python versions.
            return float(re.sub(r"\s+", "", d))
        except ValueError:
            # Not a number after all; fall through and return it untouched.
            pass
    return d
|
||||
|
||||
|
||||
def read_data() -> pd.DataFrame:
    """Load the survey results from ``data/results.xlsx`` and clean them up.

    Steps, in order:
      * rename the long question headers to short names (COLUMN_MAP),
      * shorten the capital-region city label and convert "Kaupunki",
        "Sukupuoli", "Ikä" and "Etä" to categoricals,
      * map "Kyllä"/"Ei" in "Kilpailukykyinen" to True/False,
      * parse number-like strings in the salary columns,
      * fill a missing "Vuositulot" from "Kuukausipalkka".

    Returns:
        The cleaned DataFrame.
    """
    df: pd.DataFrame = pd.read_excel(
        "data/results.xlsx",
        skiprows=[1],  # Google Sheets exports one empty row
    )
    df = df.rename(columns=COLUMN_MAP)

    # Assign results back instead of calling replace(..., inplace=True) on
    # df["col"]: in-place mutation of a column selection is chained
    # assignment, which does not update df under pandas Copy-on-Write.
    df["Kaupunki"] = (
        df["Kaupunki"]
        .replace("PK-Seutu (Helsinki, Espoo, Vantaa)", "PK-Seutu")
        .astype("category")
    )
    df["Sukupuoli"] = df["Sukupuoli"].apply(map_sukupuoli).astype("category")
    df["Ikä"] = df["Ikä"].astype("category")

    df["Etä"] = df["Etä"].map(ETATYO_MAP).astype("category")
    df["Kilpailukykyinen"] = df["Kilpailukykyinen"].replace(
        {"Kyllä": True, "Ei": False}
    )

    # Try to clean up numbers with spaces, etc. to real numbers
    df["Kuukausipalkka"] = df["Kuukausipalkka"].apply(map_numberlike)
    df["Vuositulot"] = df["Vuositulot"].apply(map_numberlike)

    # Fill in Vuositulot as 12.5 * Kk-tulot if empty
    df["Vuositulot"] = df.apply(map_vuositulot, axis=1)
    return df
|
||||
|
||||
|
||||
def main():
    """Print the first rows of the cleaned survey data.

    The pandas display options are set first so that the printed frame
    is not truncated in columns, rows or sequence items.
    """
    display_options = {
        "display.max_column": None,
        "display.max_rows": None,
        "display.max_seq_items": None,
        "display.max_colwidth": 500,
        "expand_frame_repr": True,
    }
    for option_name, option_value in display_options.items():
        pd.set_option(option_name, option_value)

    frame = read_data()
    print(frame.head())
|
||||
|
||||
|
||||
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
3
requirements.in
Normal file
3
requirements.in
Normal file
@@ -0,0 +1,3 @@
|
||||
bokeh
|
||||
openpyxl
|
||||
pandas
|
||||
44
requirements.txt
Normal file
44
requirements.txt
Normal file
@@ -0,0 +1,44 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile
|
||||
# To update, run:
|
||||
#
|
||||
# pip-compile requirements.in
|
||||
#
|
||||
bokeh==2.2.3
|
||||
# via -r requirements.in
|
||||
et-xmlfile==1.0.1
|
||||
# via openpyxl
|
||||
jdcal==1.4.1
|
||||
# via openpyxl
|
||||
jinja2==2.11.3
|
||||
# via bokeh
|
||||
markupsafe==1.1.1
|
||||
# via jinja2
|
||||
numpy==1.20.1
|
||||
# via
|
||||
# bokeh
|
||||
# pandas
|
||||
openpyxl==3.0.6
|
||||
# via -r requirements.in
|
||||
packaging==20.9
|
||||
# via bokeh
|
||||
pandas==1.2.2
|
||||
# via -r requirements.in
|
||||
pillow==8.1.0
|
||||
# via bokeh
|
||||
pyparsing==2.4.7
|
||||
# via packaging
|
||||
python-dateutil==2.8.1
|
||||
# via
|
||||
# bokeh
|
||||
# pandas
|
||||
pytz==2021.1
|
||||
# via pandas
|
||||
pyyaml==5.4.1
|
||||
# via bokeh
|
||||
six==1.15.0
|
||||
# via python-dateutil
|
||||
tornado==6.1
|
||||
# via bokeh
|
||||
typing-extensions==3.7.4.3
|
||||
# via bokeh
|
||||
Reference in New Issue
Block a user