mirror of
https://github.com/koodiklinikka/palkkakysely.git
synced 2026-01-26 11:23:59 +00:00
37
.github/workflows/build.yml
vendored
37
.github/workflows/build.yml
vendored
@@ -14,37 +14,21 @@ jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2.3.1
|
||||
- name: Set up Python 3.9
|
||||
uses: actions/setup-python@v2
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python 3.10
|
||||
uses: actions/setup-python@v3
|
||||
with:
|
||||
python-version: 3.9
|
||||
- name: Cache pip
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ~/.cache/pip
|
||||
key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-pip-
|
||||
${{ runner.os }}-
|
||||
python-version: "3.10"
|
||||
cache: pip
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip wheel
|
||||
python -m pip install -r requirements.txt
|
||||
run: python -m pip install -r requirements.txt
|
||||
- name: Build
|
||||
run: make -j3
|
||||
- uses: actions/setup-node@v2
|
||||
- uses: actions/setup-node@v3
|
||||
with:
|
||||
node-version: '12'
|
||||
- name: Get yarn cache directory path
|
||||
id: yarn-cache-dir-path
|
||||
run: echo "::set-output name=dir::$(yarn cache dir)"
|
||||
- uses: actions/cache@v2
|
||||
with:
|
||||
path: ${{ steps.yarn-cache-dir-path.outputs.dir }}
|
||||
key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-yarn-
|
||||
node-version: '16'
|
||||
cache: yarn
|
||||
cache-dependency-path: analysaattori/yarn.lock
|
||||
- run: yarn
|
||||
working-directory: analysaattori
|
||||
- run: yarn build
|
||||
@@ -58,3 +42,4 @@ jobs:
|
||||
with:
|
||||
branch: gh-pages
|
||||
folder: out
|
||||
if: ${{ github.event_name == 'push' }}
|
||||
|
||||
42
Makefile
42
Makefile
@@ -1,27 +1,41 @@
|
||||
.PHONY: data/results.xlsx data/results.tsv
|
||||
DATA_DIR := data/2021
|
||||
OUT_DIR := out
|
||||
XLSX_URL := https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export?format=xlsx
|
||||
TSV_URL := https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export?format=tsv
|
||||
|
||||
out: all-data copy-raw-data copy-massaged-data static charts profiling
|
||||
export DATA_DIR
|
||||
export OUT_DIR
|
||||
|
||||
copy-raw-data: all-data
|
||||
cp data/results.xlsx out/raw.xlsx
|
||||
cp data/results.tsv out/raw.tsv
|
||||
.PHONY: $(DATA_DIR)/results.xlsx $(DATA_DIR)/results.tsv
|
||||
|
||||
all: all-data copy-raw-data copy-massaged-data static charts profiling
|
||||
|
||||
$(OUT_DIR):
|
||||
mkdir -p $(OUT_DIR)
|
||||
|
||||
copy-raw-data: all-data $(OUT_DIR)
|
||||
cp $(DATA_DIR)/results.xlsx $(OUT_DIR)/raw.xlsx
|
||||
cp $(DATA_DIR)/results.tsv $(OUT_DIR)/raw.tsv
|
||||
|
||||
copy-massaged-data: all-data
|
||||
python copy_massaged_data.py
|
||||
python -m pulkka.copy_massaged_data
|
||||
|
||||
static: all-data
|
||||
python massage_templates.py
|
||||
python -m pulkka.massage_templates
|
||||
|
||||
charts: all-data
|
||||
python generate_charts.py
|
||||
python -m pulkka.generate_charts
|
||||
|
||||
profiling: all-data
|
||||
python generate_profiling.py
|
||||
python -m pulkka.generate_profiling
|
||||
|
||||
all-data: data/results.xlsx data/results.tsv
|
||||
all-data: $(DATA_DIR)/results.xlsx $(DATA_DIR)/results.tsv
|
||||
|
||||
data/results.xlsx:
|
||||
curl -fsSL -o $@ "https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export?format=xlsx"
|
||||
$(DATA_DIR):
|
||||
mkdir -p $(DATA_DIR)
|
||||
|
||||
data/results.tsv:
|
||||
curl -fsSL -o $@ "https://docs.google.com/spreadsheets/d/1l-Zgf1HqaFGd8gRA8kQzaxJ3R7eJy29ORUS8pr5o0nk/export?format=tsv"
|
||||
$(DATA_DIR)/results.xlsx: $(DATA_DIR)
|
||||
curl -fsSL -o $@ $(XLSX_URL)
|
||||
|
||||
$(DATA_DIR)/results.tsv: $(DATA_DIR)
|
||||
curl -fsSL -o $@ $(TSV_URL)
|
||||
|
||||
@@ -1,18 +0,0 @@
|
||||
from data_ingest import read_data
|
||||
|
||||
|
||||
def main():
|
||||
df = read_data()
|
||||
df.to_html("out/data.html", index=False)
|
||||
df.to_csv("out/data.csv", index=False)
|
||||
df.to_excel("out/data.xlsx", index=False)
|
||||
df.to_json(
|
||||
"out/data.json",
|
||||
orient="records",
|
||||
date_format="iso",
|
||||
force_ascii=False,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -492,4 +492,11 @@ Timestamp Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee? Ik
|
||||
2/25/2021 14:10:33 Tampere 21-25 v naisoletettu 1 Työntekijä / palkollinen 50% Systems Administrator ja firmän sisäinen 1st line -tukihessu Pääosin tai kokonaan toimistolla 1081 14000 Kyllä Kk-palkkani on varsinkin vaihteleva, koska riippuu vuorolisistä (mahdollisista pyhä- ja yövuoroista ja tuurauksista). Jonkinlaisen oletuksen nyt yritin lyödä vuositulolle, mutta taitaa jäädä todellisuudessa hivenen sen alle.
|
||||
2/25/2021 21:17:36 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v Mies 10 Työntekijä / palkollinen 100% Full-stack ohjemistokehittäjä Pääosin tai kokonaan toimistolla 4600 58000 Kyllä
|
||||
2/26/2021 9:33:00 Oulu 46-50 v Mies 21 Työntekijä / palkollinen 100% Backend-koodari Pääosin tai kokonaan etätyö 5000 70000 Kyllä Nokia
|
||||
2/26/2021 12:16:20 Tampere 36-40 v Mies 15 Työntekijä / palkollinen 100% Ohjelmistosuunnittelija Pääosin tai kokonaan toimistolla 4300 53750 Ei Gofore
|
||||
2/26/2021 12:16:20 Tampere 36-40 v Mies 15 Työntekijä / palkollinen 100% Ohjelmistosuunnittelija Pääosin tai kokonaan toimistolla 4300 53750 Ei Gofore
|
||||
2/26/2021 12:21:52 Tampere 31-35 v Mies 11 Freelancer 100% frontend Pääosin tai kokonaan etätyö 157300 Kyllä
|
||||
2/26/2021 12:46:37 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v M 11 Työntekijä / palkollinen 100% Arkkitehti Pääosin tai kokonaan toimistolla 6500 81250 Kyllä Siili
|
||||
2/26/2021 12:47:26 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v Nainen 3 Työntekijä / palkollinen 100% Full-stack Noin 50/50 hybridimalli 3800 - Ei
|
||||
2/26/2021 13:24:36 PK-Seutu (Helsinki, Espoo, Vantaa) 31-35 v Mies Työntekijä / palkollinen 100% Ohjelmistokehittäjä Noin 50/50 hybridimalli 75000 Kyllä Vincit
|
||||
2/26/2021 16:28:30 Tampere 41-45 v Mies 20 Työntekijä / palkollinen 100% full-stack Pääosin tai kokonaan toimistolla 4800 61000 Kyllä
|
||||
2/27/2021 12:38:01 Tampere 31-35 v Mies 9 Työntekijä / palkollinen 100% backend ja devops Pääosin tai kokonaan etätyö 4270 54000 Ei
|
||||
2/27/2021 17:49:25 Kouvola 31-35 v Mies 2 Työntekijä / palkollinen 100% Full-stack Ohjelmistosuunnittelija Pääosin tai kokonaan etätyö 2800 Ei
|
||||
|
Can't render this file because it contains an unexpected character in line 232 and column 265.
|
BIN
data/2021/results.xlsx
Normal file
BIN
data/2021/results.xlsx
Normal file
Binary file not shown.
Binary file not shown.
0
pulkka/__init__.py
Normal file
0
pulkka/__init__.py
Normal file
@@ -2,7 +2,7 @@ from bokeh import models as bm, plotting as bp
|
||||
from bokeh.transform import factor_cmap
|
||||
from pandas import DataFrame
|
||||
|
||||
from data_utils import get_categorical_stats
|
||||
from pulkka.data_utils import get_categorical_stats
|
||||
|
||||
CAT_Q_RADIUS = 0.1
|
||||
|
||||
5
pulkka/config.py
Normal file
5
pulkka/config.py
Normal file
@@ -0,0 +1,5 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
DATA_DIR = Path(os.environ.get("DATA_DIR", "data"))
|
||||
OUT_DIR = Path(os.environ.get("OUT_DIR", "out"))
|
||||
19
pulkka/copy_massaged_data.py
Normal file
19
pulkka/copy_massaged_data.py
Normal file
@@ -0,0 +1,19 @@
|
||||
from pulkka.config import OUT_DIR
|
||||
from pulkka.data_ingest import read_data
|
||||
|
||||
|
||||
def main():
|
||||
df = read_data()
|
||||
df.to_html(OUT_DIR / "data.html", index=False)
|
||||
df.to_csv(OUT_DIR / "data.csv", index=False)
|
||||
df.to_excel(OUT_DIR / "data.xlsx", index=False)
|
||||
df.to_json(
|
||||
OUT_DIR / "data.json",
|
||||
orient="records",
|
||||
date_format="iso",
|
||||
force_ascii=False,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -3,6 +3,8 @@ import re
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from pulkka.config import DATA_DIR
|
||||
|
||||
COLUMN_MAP = {
|
||||
"Missä kaupungissa työpaikkasi pääasiallinen toimisto sijaitsee?": "Kaupunki",
|
||||
"Työaika (jos työsuhteessa)": "Työaika",
|
||||
@@ -61,7 +63,7 @@ def map_ika(d):
|
||||
|
||||
def read_data() -> pd.DataFrame:
|
||||
df: pd.DataFrame = pd.read_excel(
|
||||
"data/results.xlsx",
|
||||
DATA_DIR / "results.xlsx",
|
||||
skiprows=[1], # Google Sheets exports one empty row
|
||||
)
|
||||
df.rename(columns=COLUMN_MAP, inplace=True)
|
||||
@@ -3,13 +3,14 @@ import bokeh.models as bm
|
||||
import bokeh.layouts as bl
|
||||
from pandas import DataFrame
|
||||
|
||||
from chart_utils import (
|
||||
from pulkka.chart_utils import (
|
||||
gender_colormap,
|
||||
get_df_hover_tool,
|
||||
set_yaxis_cash,
|
||||
get_categorical_stats_plot,
|
||||
)
|
||||
from data_ingest import read_data
|
||||
from pulkka.config import OUT_DIR
|
||||
from pulkka.data_ingest import read_data
|
||||
|
||||
plot_funcs = set()
|
||||
|
||||
@@ -56,7 +57,7 @@ def plot_kaupunki_vuositulot(df: DataFrame):
|
||||
def main():
|
||||
df = read_data()
|
||||
plots = [func(df) for func in sorted(plot_funcs, key=lambda f: f.__name__)]
|
||||
bp.output_file("out/charts.html", title="Koodiklinikan Palkkakysely")
|
||||
bp.output_file(OUT_DIR / "charts.html", title="Koodiklinikan Palkkakysely")
|
||||
bp.save(bl.grid(plots, ncols=2, sizing_mode="stretch_both"))
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from data_ingest import read_data, force_tulot_numeric, force_age_numeric
|
||||
from pulkka.config import OUT_DIR
|
||||
from pulkka.data_ingest import read_data, force_tulot_numeric, force_age_numeric
|
||||
from pandas_profiling import ProfileReport
|
||||
|
||||
|
||||
@@ -7,7 +8,7 @@ def main():
|
||||
df = force_tulot_numeric(df)
|
||||
df = force_age_numeric(df)
|
||||
profile = ProfileReport(df)
|
||||
profile.to_file("out/profiling_report.html")
|
||||
profile.to_file(OUT_DIR / "profiling_report.html")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -6,7 +6,8 @@ import jinja2
|
||||
import numpy
|
||||
import pandas
|
||||
|
||||
from data_ingest import read_data
|
||||
from pulkka.config import OUT_DIR
|
||||
from pulkka.data_ingest import read_data
|
||||
|
||||
|
||||
def main():
|
||||
@@ -20,7 +21,7 @@ def main():
|
||||
"df": read_data(),
|
||||
}
|
||||
for filename in glob.glob("template/*"):
|
||||
out_filename = os.path.join("out", os.path.relpath(filename, "template"))
|
||||
out_filename = OUT_DIR / os.path.relpath(filename, "template")
|
||||
with open(filename, "r") as inf:
|
||||
tpl: jinja2.Template = env.from_string(inf.read())
|
||||
content = tpl.render(data)
|
||||
@@ -2,4 +2,4 @@ bokeh
|
||||
jinja2
|
||||
openpyxl
|
||||
pandas
|
||||
https://github.com/akx/pandas-profiling/archive/no-phik.zip#egg=pandas-profiling
|
||||
pandas-profiling
|
||||
|
||||
120
requirements.txt
120
requirements.txt
@@ -1,56 +1,59 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile
|
||||
# This file is autogenerated by pip-compile with python 3.10
|
||||
# To update, run:
|
||||
#
|
||||
# pip-compile requirements.in
|
||||
#
|
||||
attrs==20.3.0
|
||||
# via
|
||||
# pandas-profiling
|
||||
# visions
|
||||
bokeh==2.2.3
|
||||
attrs==22.1.0
|
||||
# via visions
|
||||
bokeh==2.4.3
|
||||
# via -r requirements.in
|
||||
certifi==2020.12.5
|
||||
certifi==2022.6.15
|
||||
# via requests
|
||||
chardet==4.0.0
|
||||
charset-normalizer==2.1.1
|
||||
# via requests
|
||||
confuse==1.4.0
|
||||
# via pandas-profiling
|
||||
cycler==0.10.0
|
||||
cycler==0.11.0
|
||||
# via matplotlib
|
||||
decorator==4.4.2
|
||||
# via networkx
|
||||
et-xmlfile==1.0.1
|
||||
et-xmlfile==1.1.0
|
||||
# via openpyxl
|
||||
fonttools==4.37.1
|
||||
# via matplotlib
|
||||
htmlmin==0.1.12
|
||||
# via pandas-profiling
|
||||
idna==2.10
|
||||
idna==3.3
|
||||
# via requests
|
||||
imagehash==4.2.0
|
||||
imagehash==4.2.1
|
||||
# via visions
|
||||
jdcal==1.4.1
|
||||
# via openpyxl
|
||||
jinja2==2.11.3
|
||||
jinja2==3.1.2
|
||||
# via
|
||||
# -r requirements.in
|
||||
# bokeh
|
||||
# pandas-profiling
|
||||
joblib==1.0.1
|
||||
# via pandas-profiling
|
||||
kiwisolver==1.3.1
|
||||
joblib==1.1.0
|
||||
# via
|
||||
# pandas-profiling
|
||||
# phik
|
||||
kiwisolver==1.4.4
|
||||
# via matplotlib
|
||||
markupsafe==1.1.1
|
||||
# via jinja2
|
||||
matplotlib==3.3.4
|
||||
markupsafe==2.1.1
|
||||
# via
|
||||
# jinja2
|
||||
# pandas-profiling
|
||||
matplotlib==3.5.3
|
||||
# via
|
||||
# missingno
|
||||
# pandas-profiling
|
||||
# phik
|
||||
# seaborn
|
||||
missingno==0.4.2
|
||||
missingno==0.5.1
|
||||
# via pandas-profiling
|
||||
networkx==2.5
|
||||
multimethod==1.8
|
||||
# via
|
||||
# pandas-profiling
|
||||
# visions
|
||||
networkx==2.8.6
|
||||
# via visions
|
||||
numpy==1.20.1
|
||||
numpy==1.23.2
|
||||
# via
|
||||
# bokeh
|
||||
# imagehash
|
||||
@@ -58,73 +61,82 @@ numpy==1.20.1
|
||||
# missingno
|
||||
# pandas
|
||||
# pandas-profiling
|
||||
# phik
|
||||
# pywavelets
|
||||
# scipy
|
||||
# seaborn
|
||||
# visions
|
||||
openpyxl==3.0.6
|
||||
openpyxl==3.0.10
|
||||
# via -r requirements.in
|
||||
packaging==20.9
|
||||
# via bokeh
|
||||
https://github.com/akx/pandas-profiling/archive/no-phik.zip#egg=pandas-profiling
|
||||
# via -r requirements.in
|
||||
pandas==1.2.2
|
||||
packaging==21.3
|
||||
# via
|
||||
# bokeh
|
||||
# matplotlib
|
||||
pandas==1.4.4
|
||||
# via
|
||||
# -r requirements.in
|
||||
# pandas-profiling
|
||||
# phik
|
||||
# seaborn
|
||||
# visions
|
||||
pillow==8.1.0
|
||||
pandas-profiling==3.2.0
|
||||
# via -r requirements.in
|
||||
phik==0.12.2
|
||||
# via pandas-profiling
|
||||
pillow==9.2.0
|
||||
# via
|
||||
# bokeh
|
||||
# imagehash
|
||||
# matplotlib
|
||||
# visions
|
||||
pyparsing==2.4.7
|
||||
pydantic==1.10.1
|
||||
# via pandas-profiling
|
||||
pyparsing==3.0.9
|
||||
# via
|
||||
# matplotlib
|
||||
# packaging
|
||||
python-dateutil==2.8.1
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# bokeh
|
||||
# matplotlib
|
||||
# pandas
|
||||
pytz==2021.1
|
||||
pytz==2022.2.1
|
||||
# via pandas
|
||||
pywavelets==1.1.1
|
||||
pywavelets==1.3.0
|
||||
# via imagehash
|
||||
pyyaml==5.4.1
|
||||
pyyaml==6.0
|
||||
# via
|
||||
# bokeh
|
||||
# confuse
|
||||
requests==2.25.1
|
||||
# pandas-profiling
|
||||
requests==2.28.1
|
||||
# via pandas-profiling
|
||||
scipy==1.6.1
|
||||
scipy==1.9.1
|
||||
# via
|
||||
# imagehash
|
||||
# missingno
|
||||
# pandas-profiling
|
||||
# phik
|
||||
# seaborn
|
||||
seaborn==0.11.1
|
||||
seaborn==0.11.2
|
||||
# via
|
||||
# missingno
|
||||
# pandas-profiling
|
||||
six==1.15.0
|
||||
six==1.16.0
|
||||
# via
|
||||
# cycler
|
||||
# imagehash
|
||||
# python-dateutil
|
||||
tangled-up-in-unicode==0.0.6
|
||||
tangled-up-in-unicode==0.2.0
|
||||
# via
|
||||
# pandas-profiling
|
||||
# visions
|
||||
tornado==6.1
|
||||
tornado==6.2
|
||||
# via bokeh
|
||||
tqdm==4.57.0
|
||||
tqdm==4.64.0
|
||||
# via pandas-profiling
|
||||
typing-extensions==3.7.4.3
|
||||
# via bokeh
|
||||
urllib3==1.26.3
|
||||
typing-extensions==4.3.0
|
||||
# via
|
||||
# bokeh
|
||||
# pydantic
|
||||
urllib3==1.26.12
|
||||
# via requests
|
||||
visions[type_image_path]==0.6.0
|
||||
visions[type_image_path]==0.7.4
|
||||
# via pandas-profiling
|
||||
|
||||
Reference in New Issue
Block a user