diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0ebf9a6..3038e06 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -14,37 +14,21 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2.3.1 - - name: Set up Python 3.9 - uses: actions/setup-python@v2 + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 with: - python-version: 3.9 - - name: Cache pip - uses: actions/cache@v2 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} - restore-keys: | - ${{ runner.os }}-pip- - ${{ runner.os }}- + python-version: "3.10" + cache: pip - name: Install dependencies - run: | - python -m pip install --upgrade pip wheel - python -m pip install -r requirements.txt + run: python -m pip install -r requirements.txt - name: Build run: make -j3 - - uses: actions/setup-node@v2 + - uses: actions/setup-node@v3 with: - node-version: '12' - - name: Get yarn cache directory path - id: yarn-cache-dir-path - run: echo "::set-output name=dir::$(yarn cache dir)" - - uses: actions/cache@v2 - with: - path: ${{ steps.yarn-cache-dir-path.outputs.dir }} - key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }} - restore-keys: | - ${{ runner.os }}-yarn- + node-version: '16' + cache: yarn + cache-dependency-path: analysaattori/yarn.lock - run: yarn working-directory: analysaattori - run: yarn build @@ -58,3 +42,4 @@ jobs: with: branch: gh-pages folder: out + if: ${{ github.event_name == 'push' }} diff --git a/Makefile b/Makefile index cc970ba..cc4f64c 100644 --- a/Makefile +++ b/Makefile @@ -1,27 +1,41 @@ -.PHONY: data/results.xlsx data/results.tsv +DATA_DIR := data/2021 +OUT_DIR := out +XLSX_URL := https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export?format=xlsx +TSV_URL := https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export?format=tsv -out: all-data copy-raw-data copy-massaged-data static charts profiling +export DATA_DIR +export OUT_DIR -copy-raw-data: all-data - cp data/results.xlsx out/raw.xlsx - cp data/results.tsv out/raw.tsv +.PHONY: $(DATA_DIR)/results.xlsx $(DATA_DIR)/results.tsv + +all: all-data copy-raw-data copy-massaged-data static charts profiling + +$(OUT_DIR): + mkdir -p $(OUT_DIR) + +copy-raw-data: all-data $(OUT_DIR) + cp $(DATA_DIR)/results.xlsx $(OUT_DIR)/raw.xlsx + cp $(DATA_DIR)/results.tsv $(OUT_DIR)/raw.tsv copy-massaged-data: all-data - python copy_massaged_data.py + python -m pulkka.copy_massaged_data static: all-data - python massage_templates.py + python -m pulkka.massage_templates charts: all-data - python generate_charts.py + python -m pulkka.generate_charts profiling: all-data - python generate_profiling.py + python -m pulkka.generate_profiling -all-data: data/results.xlsx data/results.tsv +all-data: $(DATA_DIR)/results.xlsx $(DATA_DIR)/results.tsv -data/results.xlsx: - curl -fsSL -o $@ "https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export?format=xlsx" +$(DATA_DIR): + mkdir -p $(DATA_DIR) -data/results.tsv: - curl -fsSL -o $@ "https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export?format=tsv" +$(DATA_DIR)/results.xlsx: $(DATA_DIR) + curl -fsSL -o $@ $(XLSX_URL) + +$(DATA_DIR)/results.tsv: $(DATA_DIR) + curl -fsSL -o $@ $(TSV_URL) diff --git a/copy_massaged_data.py b/copy_massaged_data.py deleted file mode 100644 index b38316c..0000000 --- a/copy_massaged_data.py +++ /dev/null @@ -1,18 +0,0 @@ -from data_ingest import read_data - - -def main(): - df = read_data() - df.to_html("out/data.html", index=False) - df.to_csv("out/data.csv", index=False) - df.to_excel("out/data.xlsx", index=False) - df.to_json( - "out/data.json", - orient="records", - date_format="iso", - force_ascii=False, - ) - - -if __name__ == "__main__": - main() diff --git a/data/results.tsv b/data/2021/results.tsv similarity index 98% rename from data/results.tsv rename to data/2021/results.tsv index dfb2a5b..02c12f8 100644 --- a/data/results.tsv +++ b/data/2021/results.tsv @@ -492,4 +492,11 @@ Timestamp Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee? Ik 2/25/2021 14:10:33 Tampere 21-25 v naisoletettu 1 Työntekijä / palkollinen 50% Systems Administrator ja firmän sisäinen 1st line -tukihessu Pääosin tai kokonaan toimistolla 1081 14000 Kyllä Kk-palkkani on varsinkin vaihteleva, koska riippuu vuorolisistä (mahdollisista pyhä- ja yövuoroista ja tuurauksista). Jonkinlaisen oletuksen nyt yritin lyödä vuositulolle, mutta taitaa jäädä todellisuudessa hivenen sen alle. 2/25/2021 21:17:36 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v Mies 10 Työntekijä / palkollinen 100% Full-stack ohjemistokehittäjä Pääosin tai kokonaan toimistolla 4600 58000 Kyllä 2/26/2021 9:33:00 Oulu 46-50 v Mies 21 Työntekijä / palkollinen 100% Backend-koodari Pääosin tai kokonaan etätyö 5000 70000 Kyllä Nokia -2/26/2021 12:16:20 Tampere 36-40 v Mies 15 Työntekijä / palkollinen 100% Ohjelmistosuunnittelija Pääosin tai kokonaan toimistolla 4300 53750 Ei Gofore \ No newline at end of file +2/26/2021 12:16:20 Tampere 36-40 v Mies 15 Työntekijä / palkollinen 100% Ohjelmistosuunnittelija Pääosin tai kokonaan toimistolla 4300 53750 Ei Gofore +2/26/2021 12:21:52 Tampere 31-35 v Mies 11 Freelancer 100% frontend Pääosin tai kokonaan etätyö 157300 Kyllä +2/26/2021 12:46:37 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v M 11 Työntekijä / palkollinen 100% Arkkitehti Pääosin tai kokonaan toimistolla 6500 81250 Kyllä Siili +2/26/2021 12:47:26 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v Nainen 3 Työntekijä / palkollinen 100% Full-stack Noin 50/50 hybridimalli 3800 - Ei +2/26/2021 13:24:36 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v Mies Työntekijä / palkollinen 100% Ohjelmistokehittäjä Noin 50/50 hybridimalli 75000 Kyllä Vincit +2/26/2021 16:28:30 Tampere 41-45 v Mies 20 Työntekijä / palkollinen 100% full-stack Pääosin tai kokonaan toimistolla 4800 61000 Kyllä +2/27/2021 12:38:01 Tampere 31-35 v Mies 9 Työntekijä / palkollinen 100% backend ja devops Pääosin tai kokonaan etätyö 4270 54000 Ei +2/27/2021 17:49:25 Kouvola 31-35 v Mies 2 Työntekijä / palkollinen 100% Full-stack Ohjelmistosuunnittelija Pääosin tai kokonaan etätyö 2800 Ei \ No newline at end of file diff --git a/data/2021/results.xlsx b/data/2021/results.xlsx new file mode 100644 index 0000000..e1e9954 Binary files /dev/null and b/data/2021/results.xlsx differ diff --git a/data/results.xlsx b/data/results.xlsx deleted file mode 100644 index 0005e3f..0000000 Binary files a/data/results.xlsx and /dev/null differ diff --git a/pulkka/__init__.py b/pulkka/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chart_utils.py b/pulkka/chart_utils.py similarity index 97% rename from chart_utils.py rename to pulkka/chart_utils.py index 98bd63d..6eb238f 100644 --- a/chart_utils.py +++ b/pulkka/chart_utils.py @@ -2,7 +2,7 @@ from bokeh import models as bm, plotting as bp from bokeh.transform import factor_cmap from pandas import DataFrame -from data_utils import get_categorical_stats +from pulkka.data_utils import get_categorical_stats CAT_Q_RADIUS = 0.1 diff --git a/pulkka/config.py b/pulkka/config.py new file mode 100644 index 0000000..655474e --- /dev/null +++ b/pulkka/config.py @@ -0,0 +1,5 @@ +import os +from pathlib import Path + +DATA_DIR = Path(os.environ.get("DATA_DIR", "data")) +OUT_DIR = Path(os.environ.get("OUT_DIR", "out")) diff --git a/pulkka/copy_massaged_data.py b/pulkka/copy_massaged_data.py new file mode 100644 index 0000000..b710ccb --- /dev/null +++ b/pulkka/copy_massaged_data.py @@ -0,0 +1,19 @@ +from pulkka.config import OUT_DIR +from pulkka.data_ingest import read_data + + +def main(): + df = read_data() + df.to_html(OUT_DIR / "data.html", index=False) + df.to_csv(OUT_DIR / "data.csv", index=False) + df.to_excel(OUT_DIR / "data.xlsx", index=False) + df.to_json( + OUT_DIR / "data.json", + orient="records", + date_format="iso", + force_ascii=False, + ) + + +if __name__ == "__main__": + main() diff --git a/data_ingest.py b/pulkka/data_ingest.py similarity index 98% rename from data_ingest.py rename to pulkka/data_ingest.py index 8a0c327..a6da781 100644 --- a/data_ingest.py +++ b/pulkka/data_ingest.py @@ -3,6 +3,8 @@ import re import numpy as np import pandas as pd +from pulkka.config import DATA_DIR + COLUMN_MAP = { "Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee?": "Kaupunki", "Työaika (jos työsuhteessa)": "Työaika", @@ -61,7 +63,7 @@ def map_ika(d): def read_data() -> pd.DataFrame: df: pd.DataFrame = pd.read_excel( - "data/results.xlsx", + DATA_DIR / "results.xlsx", skiprows=[1], # Google Sheets exports one empty row ) df.rename(columns=COLUMN_MAP, inplace=True) diff --git a/data_utils.py b/pulkka/data_utils.py similarity index 100% rename from data_utils.py rename to pulkka/data_utils.py diff --git a/generate_charts.py b/pulkka/generate_charts.py similarity index 88% rename from generate_charts.py rename to pulkka/generate_charts.py index 6df9729..08a0cc7 100644 --- a/generate_charts.py +++ b/pulkka/generate_charts.py @@ -3,13 +3,14 @@ import bokeh.models as bm import bokeh.layouts as bl from pandas import DataFrame -from chart_utils import ( +from pulkka.chart_utils import ( gender_colormap, get_df_hover_tool, set_yaxis_cash, get_categorical_stats_plot, ) -from data_ingest import read_data +from pulkka.config import OUT_DIR +from pulkka.data_ingest import read_data plot_funcs = set() @@ -56,7 +57,7 @@ def plot_kaupunki_vuositulot(df: DataFrame): def main(): df = read_data() plots = [func(df) for func in sorted(plot_funcs, key=lambda f: f.__name__)] - bp.output_file("out/charts.html", title="Koodiklinikan Palkkakysely") + bp.output_file(OUT_DIR / "charts.html", title="Koodiklinikan Palkkakysely") bp.save(bl.grid(plots, ncols=2, sizing_mode="stretch_both")) diff --git a/generate_profiling.py b/pulkka/generate_profiling.py similarity index 55% rename from generate_profiling.py rename to pulkka/generate_profiling.py index 792150e..0942bd5 100644 --- a/generate_profiling.py +++ b/pulkka/generate_profiling.py @@ -1,4 +1,5 @@ -from data_ingest import read_data, force_tulot_numeric, force_age_numeric +from pulkka.config import OUT_DIR +from pulkka.data_ingest import read_data, force_tulot_numeric, force_age_numeric from pandas_profiling import ProfileReport @@ -7,7 +8,7 @@ def main(): df = force_tulot_numeric(df) df = force_age_numeric(df) profile = ProfileReport(df) - profile.to_file("out/profiling_report.html") + profile.to_file(OUT_DIR / "profiling_report.html") if __name__ == "__main__": diff --git a/massage_templates.py b/pulkka/massage_templates.py similarity index 81% rename from massage_templates.py rename to pulkka/massage_templates.py index 5aa2a6f..bc49385 100644 --- a/massage_templates.py +++ b/pulkka/massage_templates.py @@ -6,7 +6,8 @@ import jinja2 import numpy import pandas -from data_ingest import read_data +from pulkka.config import OUT_DIR +from pulkka.data_ingest import read_data def main(): @@ -20,7 +21,7 @@ def main(): "df": read_data(), } for filename in glob.glob("template/*"): - out_filename = os.path.join("out", os.path.relpath(filename, "template")) + out_filename = OUT_DIR / os.path.relpath(filename, "template") with open(filename, "r") as inf: tpl: jinja2.Template = env.from_string(inf.read()) content = tpl.render(data) diff --git a/requirements.in b/requirements.in index f987e5c..84c3d59 100644 --- a/requirements.in +++ b/requirements.in @@ -2,4 +2,4 @@ bokeh jinja2 openpyxl pandas -https://github.com/akx/pandas-profiling/archive/no-phik.zip#egg=pandas-profiling +pandas-profiling diff --git a/requirements.txt b/requirements.txt index d765ab0..aa81a67 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,56 +1,59 @@ # -# This file is autogenerated by pip-compile +# This file is autogenerated by pip-compile with python 3.10 # To update, run: # # pip-compile requirements.in # -attrs==20.3.0 - # via - # pandas-profiling - # visions -bokeh==2.2.3 +attrs==22.1.0 + # via visions +bokeh==2.4.3 # via -r requirements.in -certifi==2020.12.5 +certifi==2022.6.15 # via requests -chardet==4.0.0 +charset-normalizer==2.1.1 # via requests -confuse==1.4.0 - # via pandas-profiling -cycler==0.10.0 +cycler==0.11.0 # via matplotlib -decorator==4.4.2 - # via networkx -et-xmlfile==1.0.1 +et-xmlfile==1.1.0 # via openpyxl +fonttools==4.37.1 + # via matplotlib htmlmin==0.1.12 # via pandas-profiling -idna==2.10 +idna==3.3 # via requests -imagehash==4.2.0 +imagehash==4.2.1 # via visions -jdcal==1.4.1 - # via openpyxl -jinja2==2.11.3 +jinja2==3.1.2 # via # -r requirements.in # bokeh # pandas-profiling -joblib==1.0.1 - # via pandas-profiling -kiwisolver==1.3.1 +joblib==1.1.0 + # via + # pandas-profiling + # phik +kiwisolver==1.4.4 # via matplotlib -markupsafe==1.1.1 - # via jinja2 -matplotlib==3.3.4 +markupsafe==2.1.1 + # via + # jinja2 + # pandas-profiling +matplotlib==3.5.3 # via # missingno # pandas-profiling + # phik # seaborn -missingno==0.4.2 +missingno==0.5.1 # via pandas-profiling -networkx==2.5 +multimethod==1.8 + # via + # pandas-profiling + # visions +networkx==2.8.6 # via visions -numpy==1.20.1 +numpy==1.23.2 # via # bokeh # imagehash @@ -58,73 +61,82 @@ numpy==1.20.1 # missingno # pandas # pandas-profiling + # phik # pywavelets # scipy # seaborn # visions -openpyxl==3.0.6 +openpyxl==3.0.10 # via -r requirements.in -packaging==20.9 - # via bokeh -https://github.com/akx/pandas-profiling/archive/no-phik.zip#egg=pandas-profiling - # via -r requirements.in -pandas==1.2.2 +packaging==21.3 + # via + # bokeh + # matplotlib +pandas==1.4.4 # via # -r requirements.in # pandas-profiling + # phik # seaborn # visions -pillow==8.1.0 +pandas-profiling==3.2.0 + # via -r requirements.in +phik==0.12.2 + # via pandas-profiling +pillow==9.2.0 # via # bokeh # imagehash # matplotlib # visions -pyparsing==2.4.7 +pydantic==1.10.1 + # via pandas-profiling +pyparsing==3.0.9 # via # matplotlib # packaging -python-dateutil==2.8.1 +python-dateutil==2.8.2 # via - # bokeh # matplotlib # pandas -pytz==2021.1 +pytz==2022.2.1 # via pandas -pywavelets==1.1.1 +pywavelets==1.3.0 # via imagehash -pyyaml==5.4.1 +pyyaml==6.0 # via # bokeh - # confuse -requests==2.25.1 + # pandas-profiling +requests==2.28.1 # via pandas-profiling -scipy==1.6.1 +scipy==1.9.1 # via # imagehash # missingno # pandas-profiling + # phik # seaborn -seaborn==0.11.1 +seaborn==0.11.2 # via # missingno # pandas-profiling -six==1.15.0 +six==1.16.0 # via - # cycler # imagehash # python-dateutil -tangled-up-in-unicode==0.0.6 +tangled-up-in-unicode==0.2.0 # via # pandas-profiling # visions -tornado==6.1 +tornado==6.2 # via bokeh -tqdm==4.57.0 +tqdm==4.64.0 # via pandas-profiling -typing-extensions==3.7.4.3 - # via bokeh -urllib3==1.26.3 +typing-extensions==4.3.0 + # via + # bokeh + # pydantic +urllib3==1.26.12 # via requests -visions[type_image_path]==0.6.0 +visions[type_image_path]==0.7.4 # via pandas-profiling