Merge pull request #2 from koodiklinikka/prepare-2022

Prepare for 2022
This commit is contained in:
Aarni Koskela
2022-08-31 15:31:30 +03:00
committed by GitHub
17 changed files with 152 additions and 123 deletions

View File

@@ -14,37 +14,21 @@ jobs:
build: build:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v2.3.1 - uses: actions/checkout@v3
- name: Set up Python 3.9 - name: Set up Python 3.10
uses: actions/setup-python@v2 uses: actions/setup-python@v3
with: with:
python-version: 3.9 python-version: "3.10"
- name: Cache pip cache: pip
uses: actions/cache@v2
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
${{ runner.os }}-
- name: Install dependencies - name: Install dependencies
run: | run: python -m pip install -r requirements.txt
python -m pip install --upgrade pip wheel
python -m pip install -r requirements.txt
- name: Build - name: Build
run: make -j3 run: make -j3
- uses: actions/setup-node@v2 - uses: actions/setup-node@v3
with: with:
node-version: '12' node-version: '16'
- name: Get yarn cache directory path cache: yarn
id: yarn-cache-dir-path cache-dependency-path: analysaattori/yarn.lock
run: echo "::set-output name=dir::$(yarn cache dir)"
- uses: actions/cache@v2
with:
path: ${{ steps.yarn-cache-dir-path.outputs.dir }}
key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}
restore-keys: |
${{ runner.os }}-yarn-
- run: yarn - run: yarn
working-directory: analysaattori working-directory: analysaattori
- run: yarn build - run: yarn build
@@ -58,3 +42,4 @@ jobs:
with: with:
branch: gh-pages branch: gh-pages
folder: out folder: out
if: ${{ github.event_name == 'push' }}

View File

@@ -1,27 +1,41 @@
.PHONY: data/results.xlsx data/results.tsv DATA_DIR := data/2021
OUT_DIR := out
XLSX_URL := https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export?format=xlsx
TSV_URL := https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export?format=tsv
out: all-data copy-raw-data copy-massaged-data static charts profiling export DATA_DIR
export OUT_DIR
copy-raw-data: all-data .PHONY: $(DATA_DIR)/results.xlsx $(DATA_DIR)/results.tsv
cp data/results.xlsx out/raw.xlsx
cp data/results.tsv out/raw.tsv all: all-data copy-raw-data copy-massaged-data static charts profiling
$(OUT_DIR):
mkdir -p $(OUT_DIR)
copy-raw-data: all-data $(OUT_DIR)
cp $(DATA_DIR)/results.xlsx $(OUT_DIR)/raw.xlsx
cp $(DATA_DIR)/results.tsv $(OUT_DIR)/raw.tsv
copy-massaged-data: all-data copy-massaged-data: all-data
python copy_massaged_data.py python -m pulkka.copy_massaged_data
static: all-data static: all-data
python massage_templates.py python -m pulkka.massage_templates
charts: all-data charts: all-data
python generate_charts.py python -m pulkka.generate_charts
profiling: all-data profiling: all-data
python generate_profiling.py python -m pulkka.generate_profiling
all-data: data/results.xlsx data/results.tsv all-data: $(DATA_DIR)/results.xlsx $(DATA_DIR)/results.tsv
data/results.xlsx: $(DATA_DIR):
curl -fsSL -o $@ "https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export?format=xlsx" mkdir -p $(DATA_DIR)
data/results.tsv: $(DATA_DIR)/results.xlsx: $(DATA_DIR)
curl -fsSL -o $@ "https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export?format=tsv" curl -fsSL -o $@ $(XLSX_URL)
$(DATA_DIR)/results.tsv: $(DATA_DIR)
curl -fsSL -o $@ $(TSV_URL)

View File

@@ -1,18 +0,0 @@
from data_ingest import read_data
def main():
    """Dump the ingested survey data under ``out/`` in four formats.

    The frame returned by ``read_data()`` is written as HTML, CSV and XLSX
    (all without the index column) and as a JSON list of records.
    """
    frame = read_data()

    # The three tabular writers take the same arguments, so drive them
    # from one table instead of repeating the call.
    tabular_exports = (
        (frame.to_html, "out/data.html"),
        (frame.to_csv, "out/data.csv"),
        (frame.to_excel, "out/data.xlsx"),
    )
    for write, target in tabular_exports:
        write(target, index=False)

    # JSON: one object per row, ISO-formatted dates, non-ASCII text kept as-is.
    json_options = {
        "orient": "records",
        "date_format": "iso",
        "force_ascii": False,
    }
    frame.to_json("out/data.json", **json_options)


if __name__ == "__main__":
    main()

View File

@@ -493,3 +493,10 @@ Timestamp Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee? Ik
2/25/2021 21:17:36 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v Mies 10 Työntekijä / palkollinen 100% Full-stack ohjemistokehittäjä Pääosin tai kokonaan toimistolla 4600 58000 Kyllä 2/25/2021 21:17:36 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v Mies 10 Työntekijä / palkollinen 100% Full-stack ohjemistokehittäjä Pääosin tai kokonaan toimistolla 4600 58000 Kyllä
2/26/2021 9:33:00 Oulu 46-50 v Mies 21 Työntekijä / palkollinen 100% Backend-koodari Pääosin tai kokonaan etätyö 5000 70000 Kyllä Nokia 2/26/2021 9:33:00 Oulu 46-50 v Mies 21 Työntekijä / palkollinen 100% Backend-koodari Pääosin tai kokonaan etätyö 5000 70000 Kyllä Nokia
2/26/2021 12:16:20 Tampere 36-40 v Mies 15 Työntekijä / palkollinen 100% Ohjelmistosuunnittelija Pääosin tai kokonaan toimistolla 4300 53750 Ei Gofore 2/26/2021 12:16:20 Tampere 36-40 v Mies 15 Työntekijä / palkollinen 100% Ohjelmistosuunnittelija Pääosin tai kokonaan toimistolla 4300 53750 Ei Gofore
2/26/2021 12:21:52 Tampere 31-35 v Mies 11 Freelancer 100% frontend Pääosin tai kokonaan etätyö 157300 Kyllä
2/26/2021 12:46:37 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v M 11 Työntekijä / palkollinen 100% Arkkitehti Pääosin tai kokonaan toimistolla 6500 81250 Kyllä Siili
2/26/2021 12:47:26 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v Nainen 3 Työntekijä / palkollinen 100% Full-stack Noin 50/50 hybridimalli 3800 - Ei
2/26/2021 13:24:36 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v Mies Työntekijä / palkollinen 100% Ohjelmistokehittäjä Noin 50/50 hybridimalli 75000 Kyllä Vincit
2/26/2021 16:28:30 Tampere 41-45 v Mies 20 Työntekijä / palkollinen 100% full-stack Pääosin tai kokonaan toimistolla 4800 61000 Kyllä
2/27/2021 12:38:01 Tampere 31-35 v Mies 9 Työntekijä / palkollinen 100% backend ja devops Pääosin tai kokonaan etätyö 4270 54000 Ei
2/27/2021 17:49:25 Kouvola 31-35 v Mies 2 Työntekijä / palkollinen 100% Full-stack Ohjelmistosuunnittelija Pääosin tai kokonaan etätyö 2800 Ei
Can't render this file because it contains an unexpected character in line 232 and column 265.

BIN
data/2021/results.xlsx Normal file

Binary file not shown.

Binary file not shown.

0
pulkka/__init__.py Normal file
View File

View File

@@ -2,7 +2,7 @@ from bokeh import models as bm, plotting as bp
from bokeh.transform import factor_cmap from bokeh.transform import factor_cmap
from pandas import DataFrame from pandas import DataFrame
from data_utils import get_categorical_stats from pulkka.data_utils import get_categorical_stats
CAT_Q_RADIUS = 0.1 CAT_Q_RADIUS = 0.1

5
pulkka/config.py Normal file
View File

@@ -0,0 +1,5 @@
import os
from pathlib import Path
# Input directory; taken from the DATA_DIR environment variable, else "data".
DATA_DIR = Path(os.getenv("DATA_DIR", "data"))
# Output directory; taken from the OUT_DIR environment variable, else "out".
OUT_DIR = Path(os.getenv("OUT_DIR", "out"))

View File

@@ -0,0 +1,19 @@
from pulkka.config import OUT_DIR
from pulkka.data_ingest import read_data
def main():
    """Export the ingested survey data into ``OUT_DIR`` in four formats.

    Writes ``data.html``, ``data.csv`` and ``data.xlsx`` (without the index
    column) and ``data.json`` (a list of records with ISO-formatted dates
    and non-ASCII characters left unescaped).

    The output directory is created first if it does not exist, so the
    module also works when run on its own rather than after a step that
    happens to have created the directory already.
    """
    # The writers below open their target path directly and fail if the
    # parent directory is missing, so make sure it is there.
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    df = read_data()
    df.to_html(OUT_DIR / "data.html", index=False)
    df.to_csv(OUT_DIR / "data.csv", index=False)
    df.to_excel(OUT_DIR / "data.xlsx", index=False)
    df.to_json(
        OUT_DIR / "data.json",
        orient="records",
        date_format="iso",
        force_ascii=False,
    )


if __name__ == "__main__":
    main()

View File

@@ -3,6 +3,8 @@ import re
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from pulkka.config import DATA_DIR
COLUMN_MAP = { COLUMN_MAP = {
"Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee?": "Kaupunki", "Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee?": "Kaupunki",
"Työaika (jos työsuhteessa)": "Työaika", "Työaika (jos työsuhteessa)": "Työaika",
@@ -61,7 +63,7 @@ def map_ika(d):
def read_data() -> pd.DataFrame: def read_data() -> pd.DataFrame:
df: pd.DataFrame = pd.read_excel( df: pd.DataFrame = pd.read_excel(
"data/results.xlsx", DATA_DIR / "results.xlsx",
skiprows=[1], # Google Sheets exports one empty row skiprows=[1], # Google Sheets exports one empty row
) )
df.rename(columns=COLUMN_MAP, inplace=True) df.rename(columns=COLUMN_MAP, inplace=True)

View File

@@ -3,13 +3,14 @@ import bokeh.models as bm
import bokeh.layouts as bl import bokeh.layouts as bl
from pandas import DataFrame from pandas import DataFrame
from chart_utils import ( from pulkka.chart_utils import (
gender_colormap, gender_colormap,
get_df_hover_tool, get_df_hover_tool,
set_yaxis_cash, set_yaxis_cash,
get_categorical_stats_plot, get_categorical_stats_plot,
) )
from data_ingest import read_data from pulkka.config import OUT_DIR
from pulkka.data_ingest import read_data
plot_funcs = set() plot_funcs = set()
@@ -56,7 +57,7 @@ def plot_kaupunki_vuositulot(df: DataFrame):
def main(): def main():
df = read_data() df = read_data()
plots = [func(df) for func in sorted(plot_funcs, key=lambda f: f.__name__)] plots = [func(df) for func in sorted(plot_funcs, key=lambda f: f.__name__)]
bp.output_file("out/charts.html", title="Koodiklinikan Palkkakysely") bp.output_file(OUT_DIR / "charts.html", title="Koodiklinikan Palkkakysely")
bp.save(bl.grid(plots, ncols=2, sizing_mode="stretch_both")) bp.save(bl.grid(plots, ncols=2, sizing_mode="stretch_both"))

View File

@@ -1,4 +1,5 @@
from data_ingest import read_data, force_tulot_numeric, force_age_numeric from pulkka.config import OUT_DIR
from pulkka.data_ingest import read_data, force_tulot_numeric, force_age_numeric
from pandas_profiling import ProfileReport from pandas_profiling import ProfileReport
@@ -7,7 +8,7 @@ def main():
df = force_tulot_numeric(df) df = force_tulot_numeric(df)
df = force_age_numeric(df) df = force_age_numeric(df)
profile = ProfileReport(df) profile = ProfileReport(df)
profile.to_file("out/profiling_report.html") profile.to_file(OUT_DIR / "profiling_report.html")
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -6,7 +6,8 @@ import jinja2
import numpy import numpy
import pandas import pandas
from data_ingest import read_data from pulkka.config import OUT_DIR
from pulkka.data_ingest import read_data
def main(): def main():
@@ -20,7 +21,7 @@ def main():
"df": read_data(), "df": read_data(),
} }
for filename in glob.glob("template/*"): for filename in glob.glob("template/*"):
out_filename = os.path.join("out", os.path.relpath(filename, "template")) out_filename = OUT_DIR / os.path.relpath(filename, "template")
with open(filename, "r") as inf: with open(filename, "r") as inf:
tpl: jinja2.Template = env.from_string(inf.read()) tpl: jinja2.Template = env.from_string(inf.read())
content = tpl.render(data) content = tpl.render(data)

View File

@@ -2,4 +2,4 @@ bokeh
jinja2 jinja2
openpyxl openpyxl
pandas pandas
https://github.com/akx/pandas-profiling/archive/no-phik.zip#egg=pandas-profiling pandas-profiling

View File

@@ -1,56 +1,59 @@
# #
# This file is autogenerated by pip-compile # This file is autogenerated by pip-compile with python 3.10
# To update, run: # To update, run:
# #
# pip-compile requirements.in # pip-compile requirements.in
# #
attrs==20.3.0 attrs==22.1.0
# via # via visions
# pandas-profiling bokeh==2.4.3
# visions
bokeh==2.2.3
# via -r requirements.in # via -r requirements.in
certifi==2020.12.5 certifi==2022.6.15
# via requests # via requests
chardet==4.0.0 charset-normalizer==2.1.1
# via requests # via requests
confuse==1.4.0 cycler==0.11.0
# via pandas-profiling
cycler==0.10.0
# via matplotlib # via matplotlib
decorator==4.4.2 et-xmlfile==1.1.0
# via networkx
et-xmlfile==1.0.1
# via openpyxl # via openpyxl
fonttools==4.37.1
# via matplotlib
htmlmin==0.1.12 htmlmin==0.1.12
# via pandas-profiling # via pandas-profiling
idna==2.10 idna==3.3
# via requests # via requests
imagehash==4.2.0 imagehash==4.2.1
# via visions # via visions
jdcal==1.4.1 jinja2==3.1.2
# via openpyxl
jinja2==2.11.3
# via # via
# -r requirements.in # -r requirements.in
# bokeh # bokeh
# pandas-profiling # pandas-profiling
joblib==1.0.1 joblib==1.1.0
# via pandas-profiling # via
kiwisolver==1.3.1 # pandas-profiling
# phik
kiwisolver==1.4.4
# via matplotlib # via matplotlib
markupsafe==1.1.1 markupsafe==2.1.1
# via jinja2 # via
matplotlib==3.3.4 # jinja2
# pandas-profiling
matplotlib==3.5.3
# via # via
# missingno # missingno
# pandas-profiling # pandas-profiling
# phik
# seaborn # seaborn
missingno==0.4.2 missingno==0.5.1
# via pandas-profiling # via pandas-profiling
networkx==2.5 multimethod==1.8
# via
# pandas-profiling
# visions
networkx==2.8.6
# via visions # via visions
numpy==1.20.1 numpy==1.23.2
# via # via
# bokeh # bokeh
# imagehash # imagehash
@@ -58,73 +61,82 @@ numpy==1.20.1
# missingno # missingno
# pandas # pandas
# pandas-profiling # pandas-profiling
# phik
# pywavelets # pywavelets
# scipy # scipy
# seaborn # seaborn
# visions # visions
openpyxl==3.0.6 openpyxl==3.0.10
# via -r requirements.in # via -r requirements.in
packaging==20.9 packaging==21.3
# via bokeh # via
https://github.com/akx/pandas-profiling/archive/no-phik.zip#egg=pandas-profiling # bokeh
# via -r requirements.in # matplotlib
pandas==1.2.2 pandas==1.4.4
# via # via
# -r requirements.in # -r requirements.in
# pandas-profiling # pandas-profiling
# phik
# seaborn # seaborn
# visions # visions
pillow==8.1.0 pandas-profiling==3.2.0
# via -r requirements.in
phik==0.12.2
# via pandas-profiling
pillow==9.2.0
# via # via
# bokeh # bokeh
# imagehash # imagehash
# matplotlib # matplotlib
# visions # visions
pyparsing==2.4.7 pydantic==1.10.1
# via pandas-profiling
pyparsing==3.0.9
# via # via
# matplotlib # matplotlib
# packaging # packaging
python-dateutil==2.8.1 python-dateutil==2.8.2
# via # via
# bokeh
# matplotlib # matplotlib
# pandas # pandas
pytz==2021.1 pytz==2022.2.1
# via pandas # via pandas
pywavelets==1.1.1 pywavelets==1.3.0
# via imagehash # via imagehash
pyyaml==5.4.1 pyyaml==6.0
# via # via
# bokeh # bokeh
# confuse # pandas-profiling
requests==2.25.1 requests==2.28.1
# via pandas-profiling # via pandas-profiling
scipy==1.6.1 scipy==1.9.1
# via # via
# imagehash # imagehash
# missingno # missingno
# pandas-profiling # pandas-profiling
# phik
# seaborn # seaborn
seaborn==0.11.1 seaborn==0.11.2
# via # via
# missingno # missingno
# pandas-profiling # pandas-profiling
six==1.15.0 six==1.16.0
# via # via
# cycler
# imagehash # imagehash
# python-dateutil # python-dateutil
tangled-up-in-unicode==0.0.6 tangled-up-in-unicode==0.2.0
# via # via
# pandas-profiling # pandas-profiling
# visions # visions
tornado==6.1 tornado==6.2
# via bokeh # via bokeh
tqdm==4.57.0 tqdm==4.64.0
# via pandas-profiling # via pandas-profiling
typing-extensions==3.7.4.3 typing-extensions==4.3.0
# via bokeh # via
urllib3==1.26.3 # bokeh
# pydantic
urllib3==1.26.12
# via requests # via requests
visions[type_image_path]==0.6.0 visions[type_image_path]==0.7.4
# via pandas-profiling # via pandas-profiling