Merge pull request #2 from koodiklinikka/prepare-2022

Prepare for 2022
This commit is contained in:
Aarni Koskela
2022-08-31 15:31:30 +03:00
committed by GitHub
17 changed files with 152 additions and 123 deletions

View File

@@ -14,37 +14,21 @@ jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2.3.1
- name: Set up Python 3.9
uses: actions/setup-python@v2
- uses: actions/checkout@v3
- name: Set up Python 3.10
uses: actions/setup-python@v3
with:
python-version: 3.9
- name: Cache pip
uses: actions/cache@v2
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
${{ runner.os }}-
python-version: "3.10"
cache: pip
- name: Install dependencies
run: |
python -m pip install --upgrade pip wheel
python -m pip install -r requirements.txt
run: python -m pip install -r requirements.txt
- name: Build
run: make -j3
- uses: actions/setup-node@v2
- uses: actions/setup-node@v3
with:
node-version: '12'
- name: Get yarn cache directory path
id: yarn-cache-dir-path
run: echo "::set-output name=dir::$(yarn cache dir)"
- uses: actions/cache@v2
with:
path: ${{ steps.yarn-cache-dir-path.outputs.dir }}
key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}
restore-keys: |
${{ runner.os }}-yarn-
node-version: '16'
cache: yarn
cache-dependency-path: analysaattori/yarn.lock
- run: yarn
working-directory: analysaattori
- run: yarn build
@@ -58,3 +42,4 @@ jobs:
with:
branch: gh-pages
folder: out
if: ${{ github.event_name == 'push' }}

View File

@@ -1,27 +1,41 @@
.PHONY: data/results.xlsx data/results.tsv
# Where the spreadsheet exports are stored and where build output is written.
DATA_DIR := data/2021
OUT_DIR := out

# Both exports come from the same Google Sheets document; only the requested
# format differs.
SHEET_EXPORT_URL := https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export
XLSX_URL := $(SHEET_EXPORT_URL)?format=xlsx
TSV_URL := $(SHEET_EXPORT_URL)?format=tsv
out: all-data copy-raw-data copy-massaged-data static charts profiling
# Export both directories into the environment of every recipe so the Python
# steps resolve the same locations (pulkka/config.py reads DATA_DIR and OUT_DIR
# from the environment).
export DATA_DIR
export OUT_DIR
copy-raw-data: all-data
cp data/results.xlsx out/raw.xlsx
cp data/results.tsv out/raw.tsv
# The spreadsheet exports are declared phony, so make re-downloads them on
# every run instead of trusting an existing file.
.PHONY: $(DATA_DIR)/results.xlsx $(DATA_DIR)/results.tsv

# The pipeline steps are commands, not files: declare them phony so a file or
# directory that happens to share a name can never shadow them.
.PHONY: all all-data copy-raw-data copy-massaged-data static charts profiling

# Default goal: fetch the data and build every output artifact.
all: all-data copy-raw-data copy-massaged-data static charts profiling
# Create the output directory (and any missing parents) on demand.
$(OUT_DIR):
	mkdir -p $@
# Copy the untouched spreadsheet exports into the output directory.
# $(OUT_DIR) is an order-only prerequisite: it has to exist before copying,
# but its timestamp (which changes whenever a file is written into it) must
# not take part in the up-to-date decision.
copy-raw-data: all-data | $(OUT_DIR)
	cp "$(DATA_DIR)/results.xlsx" "$(OUT_DIR)/raw.xlsx"
	cp "$(DATA_DIR)/results.tsv" "$(OUT_DIR)/raw.tsv"
copy-massaged-data: all-data
python copy_massaged_data.py
python -m pulkka.copy_massaged_data
static: all-data
python massage_templates.py
python -m pulkka.massage_templates
charts: all-data
python generate_charts.py
python -m pulkka.generate_charts
profiling: all-data
python generate_profiling.py
python -m pulkka.generate_profiling
all-data: data/results.xlsx data/results.tsv
all-data: $(DATA_DIR)/results.xlsx $(DATA_DIR)/results.tsv
data/results.xlsx:
curl -fsSL -o $@ "https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export?format=xlsx"
# Create the data directory (and any missing parents) on demand.
$(DATA_DIR):
	mkdir -p $@
data/results.tsv:
curl -fsSL -o $@ "https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export?format=tsv"
# Download the survey responses as an Excel workbook.
# The URL is quoted because it contains `?`, which the shell would otherwise
# treat as a glob character. The data directory is order-only: it must exist,
# but its timestamp is irrelevant to the download.
$(DATA_DIR)/results.xlsx: | $(DATA_DIR)
	curl -fsSL -o "$@" "$(XLSX_URL)"
# Download the survey responses as tab-separated values.
# The URL is quoted because it contains `?`, which the shell would otherwise
# treat as a glob character. The data directory is order-only: it must exist,
# but its timestamp is irrelevant to the download.
$(DATA_DIR)/results.tsv: | $(DATA_DIR)
	curl -fsSL -o "$@" "$(TSV_URL)"

View File

@@ -1,18 +0,0 @@
from data_ingest import read_data
def main():
    """Write the ingested survey data to out/ as HTML, CSV, Excel and JSON."""
    frame = read_data()

    # Tabular exports: the DataFrame index is left out of all three.
    frame.to_html("out/data.html", index=False)
    frame.to_csv("out/data.csv", index=False)
    frame.to_excel("out/data.xlsx", index=False)

    # JSON export: one object per row, ISO-formatted dates, non-ASCII
    # characters written as-is rather than escaped.
    json_options = {
        "orient": "records",
        "date_format": "iso",
        "force_ascii": False,
    }
    frame.to_json("out/data.json", **json_options)


if __name__ == "__main__":
    main()

View File

@@ -492,4 +492,11 @@ Timestamp Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee? Ik
2/25/2021 14:10:33 Tampere 21-25 v naisoletettu 1 Työntekijä / palkollinen 50% Systems Administrator ja firmän sisäinen 1st line -tukihessu Pääosin tai kokonaan toimistolla 1081 14000 Kyllä Kk-palkkani on varsinkin vaihteleva, koska riippuu vuorolisistä (mahdollisista pyhä- ja yövuoroista ja tuurauksista). Jonkinlaisen oletuksen nyt yritin lyödä vuositulolle, mutta taitaa jäädä todellisuudessa hivenen sen alle.
2/25/2021 21:17:36 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v Mies 10 Työntekijä / palkollinen 100% Full-stack ohjemistokehittäjä Pääosin tai kokonaan toimistolla 4600 58000 Kyllä
2/26/2021 9:33:00 Oulu 46-50 v Mies 21 Työntekijä / palkollinen 100% Backend-koodari Pääosin tai kokonaan etätyö 5000 70000 Kyllä Nokia
2/26/2021 12:16:20 Tampere 36-40 v Mies 15 Työntekijä / palkollinen 100% Ohjelmistosuunnittelija Pääosin tai kokonaan toimistolla 4300 53750 Ei Gofore
2/26/2021 12:16:20 Tampere 36-40 v Mies 15 Työntekijä / palkollinen 100% Ohjelmistosuunnittelija Pääosin tai kokonaan toimistolla 4300 53750 Ei Gofore
2/26/2021 12:21:52 Tampere 31-35 v Mies 11 Freelancer 100% frontend Pääosin tai kokonaan etätyö 157300 Kyllä
2/26/2021 12:46:37 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v M 11 Työntekijä / palkollinen 100% Arkkitehti Pääosin tai kokonaan toimistolla 6500 81250 Kyllä Siili
2/26/2021 12:47:26 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v Nainen 3 Työntekijä / palkollinen 100% Full-stack Noin 50/50 hybridimalli 3800 - Ei
2/26/2021 13:24:36 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v Mies Työntekijä / palkollinen 100% Ohjelmistokehittäjä Noin 50/50 hybridimalli 75000 Kyllä Vincit
2/26/2021 16:28:30 Tampere 41-45 v Mies 20 Työntekijä / palkollinen 100% full-stack Pääosin tai kokonaan toimistolla 4800 61000 Kyllä
2/27/2021 12:38:01 Tampere 31-35 v Mies 9 Työntekijä / palkollinen 100% backend ja devops Pääosin tai kokonaan etätyö 4270 54000 Ei
2/27/2021 17:49:25 Kouvola 31-35 v Mies 2 Työntekijä / palkollinen 100% Full-stack Ohjelmistosuunnittelija Pääosin tai kokonaan etätyö 2800 Ei
Can't render this file because it contains an unexpected character in line 232 and column 265.

BIN
data/2021/results.xlsx Normal file

Binary file not shown.

Binary file not shown.

0
pulkka/__init__.py Normal file
View File

View File

@@ -2,7 +2,7 @@ from bokeh import models as bm, plotting as bp
from bokeh.transform import factor_cmap
from pandas import DataFrame
from data_utils import get_categorical_stats
from pulkka.data_utils import get_categorical_stats
CAT_Q_RADIUS = 0.1

5
pulkka/config.py Normal file
View File

@@ -0,0 +1,5 @@
import os
from pathlib import Path
# Directory the survey exports are read from; taken from the DATA_DIR
# environment variable, falling back to "data".
DATA_DIR = Path(os.getenv("DATA_DIR", "data"))

# Directory generated files are written to; taken from the OUT_DIR
# environment variable, falling back to "out".
OUT_DIR = Path(os.getenv("OUT_DIR", "out"))

View File

@@ -0,0 +1,19 @@
from pulkka.config import OUT_DIR
from pulkka.data_ingest import read_data
def main():
    """Export the ingested survey data to OUT_DIR as HTML, CSV, Excel and JSON.

    The output directory is created first when it is missing, so this module
    can run on its own, or as one of several parallel make jobs, before
    anything else has created OUT_DIR.
    """
    df = read_data()

    # The pandas writers do not create missing parent directories, and nothing
    # guarantees OUT_DIR exists by the time this step runs.
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    df.to_html(OUT_DIR / "data.html", index=False)
    df.to_csv(OUT_DIR / "data.csv", index=False)
    df.to_excel(OUT_DIR / "data.xlsx", index=False)
    # One JSON object per row, ISO-formatted dates, non-ASCII characters
    # written as-is rather than escaped.
    df.to_json(
        OUT_DIR / "data.json",
        orient="records",
        date_format="iso",
        force_ascii=False,
    )


if __name__ == "__main__":
    main()

View File

@@ -3,6 +3,8 @@ import re
import numpy as np
import pandas as pd
from pulkka.config import DATA_DIR
COLUMN_MAP = {
"Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee?": "Kaupunki",
"Työaika (jos työsuhteessa)": "Työaika",
@@ -61,7 +63,7 @@ def map_ika(d):
def read_data() -> pd.DataFrame:
df: pd.DataFrame = pd.read_excel(
"data/results.xlsx",
DATA_DIR / "results.xlsx",
skiprows=[1], # Google Sheets exports one empty row
)
df.rename(columns=COLUMN_MAP, inplace=True)

View File

@@ -3,13 +3,14 @@ import bokeh.models as bm
import bokeh.layouts as bl
from pandas import DataFrame
from chart_utils import (
from pulkka.chart_utils import (
gender_colormap,
get_df_hover_tool,
set_yaxis_cash,
get_categorical_stats_plot,
)
from data_ingest import read_data
from pulkka.config import OUT_DIR
from pulkka.data_ingest import read_data
plot_funcs = set()
@@ -56,7 +57,7 @@ def plot_kaupunki_vuositulot(df: DataFrame):
def main():
df = read_data()
plots = [func(df) for func in sorted(plot_funcs, key=lambda f: f.__name__)]
bp.output_file("out/charts.html", title="Koodiklinikan Palkkakysely")
bp.output_file(OUT_DIR / "charts.html", title="Koodiklinikan Palkkakysely")
bp.save(bl.grid(plots, ncols=2, sizing_mode="stretch_both"))

View File

@@ -1,4 +1,5 @@
from data_ingest import read_data, force_tulot_numeric, force_age_numeric
from pulkka.config import OUT_DIR
from pulkka.data_ingest import read_data, force_tulot_numeric, force_age_numeric
from pandas_profiling import ProfileReport
@@ -7,7 +8,7 @@ def main():
df = force_tulot_numeric(df)
df = force_age_numeric(df)
profile = ProfileReport(df)
profile.to_file("out/profiling_report.html")
profile.to_file(OUT_DIR / "profiling_report.html")
if __name__ == "__main__":

View File

@@ -6,7 +6,8 @@ import jinja2
import numpy
import pandas
from data_ingest import read_data
from pulkka.config import OUT_DIR
from pulkka.data_ingest import read_data
def main():
@@ -20,7 +21,7 @@ def main():
"df": read_data(),
}
for filename in glob.glob("template/*"):
out_filename = os.path.join("out", os.path.relpath(filename, "template"))
out_filename = OUT_DIR / os.path.relpath(filename, "template")
with open(filename, "r") as inf:
tpl: jinja2.Template = env.from_string(inf.read())
content = tpl.render(data)

View File

@@ -2,4 +2,4 @@ bokeh
jinja2
openpyxl
pandas
https://github.com/akx/pandas-profiling/archive/no-phik.zip#egg=pandas-profiling
pandas-profiling

View File

@@ -1,56 +1,59 @@
#
# This file is autogenerated by pip-compile
# This file is autogenerated by pip-compile with python 3.10
# To update, run:
#
# pip-compile requirements.in
#
attrs==20.3.0
# via
# pandas-profiling
# visions
bokeh==2.2.3
attrs==22.1.0
# via visions
bokeh==2.4.3
# via -r requirements.in
certifi==2020.12.5
certifi==2022.6.15
# via requests
chardet==4.0.0
charset-normalizer==2.1.1
# via requests
confuse==1.4.0
# via pandas-profiling
cycler==0.10.0
cycler==0.11.0
# via matplotlib
decorator==4.4.2
# via networkx
et-xmlfile==1.0.1
et-xmlfile==1.1.0
# via openpyxl
fonttools==4.37.1
# via matplotlib
htmlmin==0.1.12
# via pandas-profiling
idna==2.10
idna==3.3
# via requests
imagehash==4.2.0
imagehash==4.2.1
# via visions
jdcal==1.4.1
# via openpyxl
jinja2==2.11.3
jinja2==3.1.2
# via
# -r requirements.in
# bokeh
# pandas-profiling
joblib==1.0.1
# via pandas-profiling
kiwisolver==1.3.1
joblib==1.1.0
# via
# pandas-profiling
# phik
kiwisolver==1.4.4
# via matplotlib
markupsafe==1.1.1
# via jinja2
matplotlib==3.3.4
markupsafe==2.1.1
# via
# jinja2
# pandas-profiling
matplotlib==3.5.3
# via
# missingno
# pandas-profiling
# phik
# seaborn
missingno==0.4.2
missingno==0.5.1
# via pandas-profiling
networkx==2.5
multimethod==1.8
# via
# pandas-profiling
# visions
networkx==2.8.6
# via visions
numpy==1.20.1
numpy==1.23.2
# via
# bokeh
# imagehash
@@ -58,73 +61,82 @@ numpy==1.20.1
# missingno
# pandas
# pandas-profiling
# phik
# pywavelets
# scipy
# seaborn
# visions
openpyxl==3.0.6
openpyxl==3.0.10
# via -r requirements.in
packaging==20.9
# via bokeh
https://github.com/akx/pandas-profiling/archive/no-phik.zip#egg=pandas-profiling
# via -r requirements.in
pandas==1.2.2
packaging==21.3
# via
# bokeh
# matplotlib
pandas==1.4.4
# via
# -r requirements.in
# pandas-profiling
# phik
# seaborn
# visions
pillow==8.1.0
pandas-profiling==3.2.0
# via -r requirements.in
phik==0.12.2
# via pandas-profiling
pillow==9.2.0
# via
# bokeh
# imagehash
# matplotlib
# visions
pyparsing==2.4.7
pydantic==1.10.1
# via pandas-profiling
pyparsing==3.0.9
# via
# matplotlib
# packaging
python-dateutil==2.8.1
python-dateutil==2.8.2
# via
# bokeh
# matplotlib
# pandas
pytz==2021.1
pytz==2022.2.1
# via pandas
pywavelets==1.1.1
pywavelets==1.3.0
# via imagehash
pyyaml==5.4.1
pyyaml==6.0
# via
# bokeh
# confuse
requests==2.25.1
# pandas-profiling
requests==2.28.1
# via pandas-profiling
scipy==1.6.1
scipy==1.9.1
# via
# imagehash
# missingno
# pandas-profiling
# phik
# seaborn
seaborn==0.11.1
seaborn==0.11.2
# via
# missingno
# pandas-profiling
six==1.15.0
six==1.16.0
# via
# cycler
# imagehash
# python-dateutil
tangled-up-in-unicode==0.0.6
tangled-up-in-unicode==0.2.0
# via
# pandas-profiling
# visions
tornado==6.1
tornado==6.2
# via bokeh
tqdm==4.57.0
tqdm==4.64.0
# via pandas-profiling
typing-extensions==3.7.4.3
# via bokeh
urllib3==1.26.3
typing-extensions==4.3.0
# via
# bokeh
# pydantic
urllib3==1.26.12
# via requests
visions[type_image_path]==0.6.0
visions[type_image_path]==0.7.4
# via pandas-profiling