Compare commits
1 Commits
d378116d53
...
css-grid-l
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2c7b0a2700 |
@@ -1,71 +0,0 @@
|
||||
{
|
||||
"hooks": {
|
||||
"PreToolUse": [
|
||||
{
|
||||
"matcher": "Edit|Write",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "file=$(jq -r '.tool_input.file_path // empty'); if [[ \"$file\" == *README.md || \"$file\" == *index.html ]]; then echo 'BLOCKED: README.md and index.html are generated artifacts. Edit create_listing.py instead, then run /regen-listings.' >&2; exit 2; fi"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": "Edit|Write",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "file=$(jq -r '.tool_input.file_path // empty'); if [[ \"$file\" == */uv.lock || \"$file\" == uv.lock ]]; then echo 'BLOCKED: uv.lock is auto-generated. Modify pyproject.toml and run uv sync instead.' >&2; exit 2; fi"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": "Edit|Write",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "file=$(jq -r '.tool_input.file_path // empty'); if [[ \"$file\" == *.github/workflows/*.yml || \"$file\" == .github/workflows/*.yml ]]; then echo 'BLOCKED: CI workflows use pinned action SHAs with # version comments for security. Edit workflow files carefully and maintain the SHA-pinning convention.' >&2; exit 2; fi"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": "Edit|Write",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "file=$(jq -r '.tool_input.file_path // empty'); if [[ \"$file\" == */renovate.json || \"$file\" == renovate.json ]]; then echo 'BLOCKED: renovate.json is rarely edited. Make changes deliberately and confirm with the user first.' >&2; exit 2; fi"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": "Edit|Write",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "file=$(jq -r '.tool_input.file_path // empty'); if [[ \"$file\" == emoji/* || \"$file\" == */emoji/* ]]; then echo 'BLOCKED: Emoji image files should not be written by Claude. Manage images manually or use /dedup-check.' >&2; exit 2; fi"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"PostToolUse": [
|
||||
{
|
||||
"matcher": "Edit|Write",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "file=$(jq -r '.tool_input.file_path // empty'); if [[ \"$file\" == *.py ]]; then uv run ruff check --fix \"$file\" 2>&1 | tail -5 && uv run ruff format \"$file\" 2>&1 | tail -3; fi"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": "Edit|Write",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "file=$(jq -r '.tool_input.file_path // empty'); if [[ \"$file\" == *.py ]]; then uv run pytest --tb=short -q 2>&1 | tail -5; fi"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -1,10 +0,0 @@
|
||||
---
|
||||
name: dedup-check
|
||||
description: Run dedup in dry-run mode and report duplicate groups found
|
||||
disable-model-invocation: true
|
||||
---
|
||||
|
||||
## Steps
|
||||
|
||||
1. Run `uv run dedup --dry-run` from the project root
|
||||
2. Summarize the output — report how many duplicate groups were found and which files are involved
|
||||
@@ -1,11 +0,0 @@
|
||||
---
|
||||
name: regen-listings
|
||||
description: Regenerate README.md and index.html from emoji/ contents and verify output
|
||||
disable-model-invocation: true
|
||||
---
|
||||
|
||||
## Steps
|
||||
|
||||
1. Run `uv run python3 create_listing.py` from the project root
|
||||
2. Confirm both `README.md` and `index.html` exist and are non-empty
|
||||
3. Report the file sizes of both generated files
|
||||
20
.gitattributes
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
* text=auto
|
||||
*.md text diff=markdown
|
||||
*.php text diff=php
|
||||
|
||||
.gitattributes export-ignore
|
||||
.gitignore export-ignore
|
||||
|
||||
# Graphics
|
||||
*.gif binary
|
||||
*.gifv binary
|
||||
*.jpg binary
|
||||
*.jpeg binary
|
||||
*.png binary
|
||||
# SVG is treated as text (diffable) here; use "*.svg binary" instead to treat it as an asset.
|
||||
*.svg text
|
||||
*.svgz binary
|
||||
*.tif binary
|
||||
*.tiff binary
|
||||
*.wbmp binary
|
||||
*.webp binary
|
||||
4
.github/workflows/compress-images.yml
vendored
@@ -11,7 +11,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout Repo
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
|
||||
uses: actions/checkout@v2
|
||||
- name: Compress Images
|
||||
id: calibre
|
||||
uses: calibreapp/image-actions@main
|
||||
@@ -20,7 +20,7 @@ jobs:
|
||||
compressOnly: true
|
||||
- name: Create New Pull Request If Needed
|
||||
if: steps.calibre.outputs.markdown != ''
|
||||
uses: peter-evans/create-pull-request@c0f553fe549906ede9cf27b5156039d195d2ece0 # v8
|
||||
uses: peter-evans/create-pull-request@v3
|
||||
with:
|
||||
title: Compressed Images Nightly
|
||||
branch-suffix: timestamp
|
||||
|
||||
30
.github/workflows/generate-listings.yml
vendored
@@ -1,30 +0,0 @@
|
||||
# Regenerates README.md and index.html and pushes them back to the branch.
name: Generate Listings
on:
  # Only pushes to master that touch the emoji images or the generator script
  # trigger this workflow.
  push:
    paths:
      - 'emoji/**'
      - 'create_listing.py'
    branches:
      - master
jobs:
  generate:
    runs-on: ubuntu-latest
    steps:
      # Actions are pinned to a commit SHA; the trailing comment records the tag.
      - name: Checkout Repo
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6

      - name: Set up Python
        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
        with:
          python-version: '3.14'

      - name: Generate listings
        run: python3 create_listing.py

      # Stage both generated files, commit only when something is actually
      # staged (the `git diff --staged --quiet ||` guard), then push.
      - name: Commit changes
        run: |
          git config --local user.email "github-actions[bot]@users.noreply.github.com"
          git config --local user.name "github-actions[bot]"
          git add README.md index.html
          git diff --staged --quiet || git commit -m "Update listings"
          git push
|
||||
2
.github/workflows/pr-compress-images.yml
vendored
@@ -16,7 +16,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout Repo
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Compress Images
|
||||
uses: calibreapp/image-actions@main
|
||||
|
||||
25
.github/workflows/test.yml
vendored
@@ -1,25 +0,0 @@
|
||||
# Runs the pytest suite through uv.
name: Tests

on:
  # Both triggers are limited to changes in Python sources or in the
  # dependency manifests (pyproject.toml / uv.lock).
  push:
    paths:
      - '**.py'
      - 'pyproject.toml'
      - 'uv.lock'
  pull_request:
    paths:
      - '**.py'
      - 'pyproject.toml'
      - 'uv.lock'

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      # Actions are pinned to a commit SHA; the trailing comment records the tag.
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
      - uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # v7
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
        with:
          python-version: '3.14'
      # Install the project together with its dev dependencies, then run tests.
      - run: uv sync --dev
      - run: uv run pytest -v
|
||||
2
.gitignore
vendored
@@ -1,2 +0,0 @@
|
||||
__pycache__/
|
||||
.claude/settings.local.json
|
||||
52
CLAUDE.md
@@ -1,52 +0,0 @@
|
||||
# CLAUDE.md
|
||||
|
||||
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||
|
||||
## Project Overview
|
||||
|
||||
Personal emoji/emote collection for chat apps (Slack, Discord, etc.). Contains 3000+ custom emoji images in `emoji/` with Python tooling for maintenance: listing generation and perceptual deduplication.
|
||||
|
||||
## Commands
|
||||
|
||||
```bash
|
||||
# Install dependencies (uses uv package manager)
|
||||
uv sync
|
||||
|
||||
# Regenerate README.md and index.html from emoji/ contents
|
||||
uv run python3 create_listing.py
|
||||
|
||||
# Find duplicate emojis (dry run)
|
||||
uv run python3 dedup.py --dry-run
|
||||
|
||||
# Find duplicates with custom threshold (0=exact match, default)
|
||||
uv run python3 dedup.py --threshold 5 --dry-run
|
||||
|
||||
# Actually remove duplicates
|
||||
uv run python3 dedup.py --dir emoji/
|
||||
|
||||
# Or via uv entry point
|
||||
uv run dedup --dry-run
|
||||
|
||||
# Run tests
|
||||
uv run pytest
|
||||
|
||||
# Run tests with verbose output
|
||||
uv run pytest -v
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
Two standalone Python scripts, no shared modules:
|
||||
|
||||
- **`create_listing.py`** — Generates `README.md` (HTML tables) and `index.html` (searchable dark-theme SPA) from all images in `emoji/`. No dependencies beyond stdlib. Both output files are auto-generated and committed by CI on push.
|
||||
|
||||
- **`dedup.py`** — Finds and removes duplicate images using multi-algorithm perceptual hashing (pHash, aHash, dHash, colorHash). Uses Union-Find clustering. Animated GIFs get extra frame-by-frame verification including timing. Keeps alphabetically-first filename per duplicate group.
|
||||
|
||||
## Key Conventions
|
||||
|
||||
- Python >=3.11 required; dependencies managed via `uv` with `uv.lock`
|
||||
- Image formats: `.png`, `.gif`, `.jpg`, `.jpeg`
|
||||
- `README.md` and `index.html` are generated artifacts — edit the scripts, not the outputs
|
||||
- CI uses pinned action SHAs (not tags) for security
|
||||
- Dependency updates managed by Renovate bot
|
||||
- Always use `uv run` to execute Python commands (e.g. `uv run pytest`, `uv run ruff`, `uv run python3 script.py`) to ensure the correct virtualenv and dependencies are used
|
||||
68
_create-listing.php
Normal file
@@ -0,0 +1,68 @@
|
||||
<?php
/**
 * Generates README.md: a listing of every image in emoji/, grouped under a
 * heading per first character of the file name and rendered as CSS grids.
 *
 * Run from the repository root (the glob pattern and the output path are
 * relative to the current working directory).
 */

$output  = 'README.md';
$per_row = 5;
$files   = glob( 'emoji/*.{png,gif,jpg,jpeg}', GLOB_BRACE );
$listing = [];

// glob() returns false on error, so validate before sorting or counting.
if ( ! is_array( $files ) || count( $files ) < 1 ) {
	die( 'No images to continue with.' );
}

sort( $files );

/**
 * Returns the trailing name component of a path.
 *
 * @param string $file Path to a file.
 * @return string File name without any leading directories.
 */
function get_basename( string $file ): string {
	return basename( $file );
}

// Bucket the files by the first character of their file name. Anything that
// is not an ASCII letter or a colon (including an empty first character after
// trimming) goes into one shared bucket.
foreach ( $files as $file ) {
	$first = trim( substr( get_basename( $file ), 0, 1 ) );

	if ( '' === $first || preg_match( '/([^a-zA-Z:])/', $first ) ) {
		$first = '\[^a-zA-Z:\]';
	}

	if ( ! array_key_exists( $first, $listing ) ) {
		$listing[ $first ] = [];
	}

	$listing[ $first ][] = $file;
}

$contents = "# Emotes\n\n";

$contents .= sprintf(
	"Listing of %d emojis last refreshed: %s",
	count( $files ),
	date( 'c' )
) . "\n\n";

$contents .= "<!-- markdownlint-disable-file MD033 -->\n";

// One cell per image; the format never changes, so build it once.
$format = '<div style=\'border:1px solid #eee;padding:.5rem\'>'
	. '<img width=\'30\' src="%1$s" alt="%1$s"><br>'
	. '<kbd style=\'display:inline-block;max-width: 15vw;white-space: nowrap;overflow:auto\'>%2$s</kbd></div>';

foreach ( $listing as $header => $icons ) {
	$contents .= sprintf( "\n## %s\n\n", $header );

	// The number of grid columns follows $per_row instead of a literal.
	$contents .= sprintf(
		'<div style="text-align: center;display:grid;grid-template-columns: repeat(%d, 1fr);grid-template-rows: minmax(70px, auto);">',
		$per_row
	) . "\n";

	foreach ( $icons as $icon ) {
		// PATHINFO_FILENAME strips only the final extension, so a name such
		// as "foo.bar.png" is shown as "foo.bar" rather than "foo".
		$name = pathinfo( $icon, PATHINFO_FILENAME );

		// Escape both values: they are placed inside HTML attributes/text.
		$contents .= sprintf(
			$format,
			htmlspecialchars( $icon, ENT_QUOTES ),
			htmlspecialchars( $name, ENT_QUOTES )
		) . "\n";
	}

	$contents .= "</div>\n";
}

if ( false === file_put_contents( $output, $contents ) ) {
	fwrite( STDERR, "Could not write {$output}\n" );
	exit( 1 );
}
|
||||
@@ -1,228 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate README.md and index.html with emoji listings."""
|
||||
|
||||
import html
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
|
||||
PER_ROW = 10
|
||||
EMOJI_DIR = Path("emoji")
|
||||
EXTENSIONS = (".png", ".gif", ".jpg", ".jpeg")
|
||||
|
||||
|
||||
def generate_readme(files: list[Path]) -> None:
    """Generate README.md with HTML tables of all emoji images.

    Files are grouped by the lowercased first character of their name; every
    name that does not start with an ASCII letter shares one group, which is
    listed first. Each group becomes a ``##`` heading followed by a table
    holding ``PER_ROW`` images per row. The result is written to ``README.md``
    in the current working directory and a summary line is printed.

    Args:
        files: Emoji image paths, in the order they should be listed.
    """
    listing = defaultdict(list)
    for file in files:
        first_char = file.name[0].lower()
        if not re.match(r"[a-z]", first_char):
            first_char = r"\[^a-zA-Z:\]"
        listing[first_char].append(file)

    per_row_width = f"{100 // PER_ROW}%"
    contents = "# Emotes\n\n"

    # The backslash-prefixed "special" group sorts ahead of the letter groups.
    for header in sorted(listing.keys(), key=lambda x: (not x.startswith("\\"), x)):
        icons = listing[header]
        contents += f"## {header}\n\n"
        contents += '<table style="text-align: center;width: 100%">\n'

        for i in range(0, len(icons), PER_ROW):
            chunk = icons[i : i + PER_ROW]
            contents += "<tr>\n"

            for icon in chunk:
                # The src is URL-encoded; alt/title are HTML-escaped so that a
                # file name containing quotes, "&" or "<" cannot break the
                # attribute it is placed in.
                encoded_path = f"emoji/{quote(icon.name)}"
                display_path = html.escape(f"emoji/{icon.name}")
                name = html.escape(icon.stem)

                contents += (
                    f"<td style='width: {per_row_width}'>"
                    f"<img width='30' src=\"{encoded_path}\" "
                    f'alt="{display_path}" title=":{name}:"></td>\n'
                )

            contents += "</tr>\n"

        contents += "</table>\n\n"

    contents += f"\n\n Generated: {datetime.now(timezone.utc).isoformat()}"

    Path("README.md").write_text(contents, encoding="utf-8")
    print(f"Generated README.md with {len(files)} emojis")
|
||||
|
||||
|
||||
def generate_html(files: list[Path]) -> None:
    """Write index.html: a searchable emoji grid grouped by first character.

    Names starting with an ASCII letter are grouped under that (lowercased)
    letter; everything else is grouped under "#", which is rendered first with
    the heading "0-9 / Special". The page is written to ``index.html`` in the
    current working directory and a summary line is printed.

    Args:
        files: Emoji image paths, in the order they should appear.
    """
    # Bucket the files by the first character of their name.
    groups = defaultdict(list)
    for path in files:
        initial = path.name[0].lower()
        bucket = initial if re.match(r"[a-z]", initial) else "#"
        groups[bucket].append(path)

    # Render one <section> per bucket, "#" ahead of the letters.
    sections = []
    for bucket in sorted(groups.keys(), key=lambda k: (k != "#", k)):
        if bucket == "#":
            heading = "0-9 / Special"
        else:
            heading = bucket.upper()

        tiles = []
        for path in groups[bucket]:
            label = html.escape(path.stem)
            src = f"emoji/{quote(path.name)}"
            tiles.append(
                f' <div class="emoji" data-keyword="{label}">'
                f'<img src="{src}" alt="{label}" title=":{label}:"></div>'
            )

        grid_markup = "\n".join(tiles)
        sections.append(
            f' <section data-group="{html.escape(bucket)}">\n'
            f" <h2>{heading}</h2>\n"
            f' <div class="grid">\n{grid_markup}\n </div>\n'
            f" </section>"
        )

    # Doubled braces below are literal braces in the CSS/JS of the page.
    contents = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Emotes</title>
<style>
* {{ box-sizing: border-box; }}
body {{
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
margin: 0;
padding: 20px;
background: #1a1a1a;
color: #fff;
}}
#search {{
width: 100%;
max-width: 400px;
padding: 12px 16px;
font-size: 16px;
border: 2px solid #333;
border-radius: 8px;
background: #2a2a2a;
color: #fff;
margin-bottom: 20px;
}}
#search:focus {{
outline: none;
border-color: #666;
}}
#search::placeholder {{
color: #888;
}}
section {{
margin-bottom: 24px;
}}
section.hidden {{
display: none;
}}
h2 {{
font-size: 18px;
font-weight: 600;
margin: 0 0 12px 0;
color: #ccc;
}}
.grid {{
display: grid;
grid-template-columns: repeat(auto-fill, minmax(50px, 1fr));
gap: 8px;
}}
.emoji {{
display: flex;
align-items: center;
justify-content: center;
padding: 8px;
background: #2a2a2a;
border-radius: 6px;
transition: background 0.15s;
}}
.emoji:hover {{
background: #3a3a3a;
}}
.emoji img {{
width: 32px;
height: 32px;
object-fit: contain;
}}
.emoji.hidden {{
display: none;
}}
#count {{
color: #888;
font-size: 14px;
margin-bottom: 12px;
}}
h1 {{
margin: 0 0 20px 0;
font-size: 24px;
}}
h1 a {{
color: #fff;
text-decoration: none;
}}
h1 a:hover {{
text-decoration: underline;
}}
</style>
</head>
<body>
<h1><a href="https://github.com/ivuorinen/emoji">ivuorinen/emoji</a></h1>
<input type="text" id="search" placeholder="Search emojis..." autofocus>
<div id="count">{len(files)} emojis</div>
<div id="content">
{chr(10).join(sections)}
</div>
<script>
let timeout;
const search = document.getElementById('search');
const emojis = document.querySelectorAll('.emoji');
const sections = document.querySelectorAll('section');
const count = document.getElementById('count');
const total = emojis.length;

search.addEventListener('input', function(e) {{
clearTimeout(timeout);
timeout = setTimeout(() => {{
const query = e.target.value.toLowerCase();
let visible = 0;
emojis.forEach(el => {{
const match = el.dataset.keyword.toLowerCase().includes(query);
el.classList.toggle('hidden', !match);
if (match) visible++;
}});
sections.forEach(sec => {{
const hasVisible = sec.querySelector('.emoji:not(.hidden)');
sec.classList.toggle('hidden', !hasVisible);
}});
count.textContent = query ? visible + ' of ' + total + ' emojis' : total + ' emojis';
}}, 150);
}});
</script>
</body>
</html>
"""

    Path("index.html").write_text(contents, encoding="utf-8")
    print(f"Generated index.html with {len(files)} emojis")
|
||||
|
||||
|
||||
def main():
    """Collect the images in EMOJI_DIR and regenerate both listing files.

    Raises:
        SystemExit: if no file with one of the EXTENSIONS suffixes is found.
    """
    images = [
        entry for entry in EMOJI_DIR.iterdir() if entry.suffix.lower() in EXTENSIONS
    ]
    images.sort()

    if len(images) == 0:
        raise SystemExit("No images to continue with.")

    for build in (generate_readme, generate_html):
        build(images)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
374
dedup.py
@@ -1,374 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Find and remove duplicate emoji files using perceptual hashing."""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
import imagehash
|
||||
from PIL import Image
|
||||
|
||||
EXTENSIONS = (".png", ".gif", ".jpg", ".jpeg")
|
||||
|
||||
# Number of hash algorithms that must agree for images to be considered similar
|
||||
MIN_HASH_AGREEMENT = 4
|
||||
|
||||
# Maximum file size difference ratio for duplicates (0.02 = 2% difference allowed)
|
||||
MAX_SIZE_DIFF_RATIO = 0.02
|
||||
|
||||
|
||||
@dataclass
class ImageInfo:
    """Perceptual hashes plus basic metadata describing one image file."""

    phash: imagehash.ImageHash
    ahash: imagehash.ImageHash
    dhash: imagehash.ImageHash
    colorhash: imagehash.ImageHash
    width: int
    height: int
    n_frames: int  # 1 for static images
    md5: str  # File content hash for exact duplicate detection

    def _has_degenerate_hash(self) -> bool:
        """Return True when pHash, aHash and dHash are all the all-zero hash.

        Such images (likely mostly transparent) cannot be compared in a
        meaningful way. The color hash is not part of this check.
        """
        structural_hashes = (self.phash, self.ahash, self.dhash)
        return all(str(h) == "0000000000000000" for h in structural_hashes)

    def is_candidate(self, other: "ImageInfo", threshold: int) -> tuple[bool, int, int]:
        """
        Fast pre-filter deciding whether two images may be duplicates.

        Returns (is_candidate, agreements, total_distance), where agreements is
        the number of hashes whose distance is within ``threshold`` and
        total_distance is the sum of all four hash distances. A mismatch in
        dimensions or frame count yields (False, 0, 999).

        Animated images still need frame-by-frame verification afterwards.
        """
        mismatch = (False, 0, 999)

        # Dimensions and frame count must be identical before hashes matter.
        if (self.width, self.height) != (other.width, other.height):
            return mismatch
        if self.n_frames != other.n_frames:
            return mismatch

        phash_dist = self.phash - other.phash
        ahash_dist = self.ahash - other.ahash
        dhash_dist = self.dhash - other.dhash
        chash_dist = self.colorhash - other.colorhash

        distances = [phash_dist, ahash_dist, dhash_dist, chash_dist]
        total_distance = sum(distances)
        agreements = len([d for d in distances if d <= threshold])

        # Animated images: all four hashes must agree within the threshold
        # (the frame check confirms the match later).
        if self.n_frames != 1:
            return agreements >= MIN_HASH_AGREEMENT, agreements, total_distance

        # Static images: accept a re-compressed/re-exported copy, i.e.
        # identical aHash, dHash and colorHash with pHash off by at most 10 ...
        recompressed = (
            ahash_dist == 0 and dhash_dist == 0 and chash_dist == 0 and phash_dist <= 10
        )
        # ... or an exact match on every hash.
        if recompressed or total_distance == 0:
            return True, agreements, total_distance
        return False, agreements, total_distance

    def is_animated(self) -> bool:
        """Return True when the image has more than one frame."""
        return self.n_frames > 1
|
||||
|
||||
|
||||
class UnionFind:
    """Union-Find (disjoint-set) structure for clustering similar images.

    Elements are registered lazily the first time they are passed to
    ``find`` or ``union``.
    """

    def __init__(self):
        # Maps each element to its parent; a root maps to itself.
        self.parent = {}

    def find(self, x):
        """Return the root of the set containing ``x``.

        Walks the parent chain iteratively (so a long chain cannot exhaust the
        interpreter's recursion limit) and then points every visited element
        directly at the root.
        """
        parent = self.parent

        # First pass: locate the root, registering unseen elements on the way.
        root = x
        while parent.setdefault(root, root) != root:
            root = parent[root]

        # Second pass: path compression.
        node = x
        while node != root:
            following = parent[node]
            parent[node] = root
            node = following

        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y``."""
        px, py = self.find(x), self.find(y)
        if px != py:
            self.parent[px] = py
|
||||
|
||||
|
||||
def _compute_hashes(img: Image.Image) -> tuple[imagehash.ImageHash, ...]:
    """Return (pHash, aHash, dHash, colorHash) for a single image or frame.

    The image is converted to RGBA first, unless it already is RGBA, so that
    transparency is handled consistently.
    """
    rgba = img if img.mode == "RGBA" else img.convert("RGBA")
    hashers = (
        imagehash.phash,
        imagehash.average_hash,
        imagehash.dhash,
        imagehash.colorhash,
    )
    return tuple(hasher(rgba) for hasher in hashers)
|
||||
|
||||
|
||||
def _compute_md5(path: Path) -> str:
|
||||
"""Compute MD5 hash of file contents."""
|
||||
md5 = hashlib.md5()
|
||||
with open(path, "rb") as f:
|
||||
for chunk in iter(lambda: f.read(8192), b""):
|
||||
md5.update(chunk)
|
||||
return md5.hexdigest()
|
||||
|
||||
|
||||
def _get_gif_frame_info(path: Path) -> list[tuple[str, int]] | None:
    """
    Collect the perceptual hash and duration of every frame of an image.

    Returns a list of (hash_string, duration_ms) tuples, one per frame, or
    None when the image has a single frame or anything goes wrong while
    reading it (errors are deliberately swallowed; callers fall back to a
    different comparison).
    """
    try:
        with Image.open(path) as img:
            frame_total = getattr(img, "n_frames", 1)
            if frame_total <= 1:
                return None

            collected = []
            index = 0
            while index < frame_total:
                img.seek(index)
                snapshot = img.copy()
                if snapshot.mode != "RGBA":
                    snapshot = snapshot.convert("RGBA")
                # A frame without a recorded duration counts as 0.
                delay = img.info.get("duration", 0)
                collected.append((str(imagehash.phash(snapshot)), delay))
                index += 1
            return collected
    except Exception:
        return None
|
||||
|
||||
|
||||
def _gifs_are_identical(path1: Path, path2: Path) -> bool:
    """
    Decide whether two GIFs carry the same frames with the same timing.

    Returns True only when every frame hash and every frame duration match.
    When frame data is unavailable for either file, the two files are
    compared by MD5 instead.
    """
    frames1 = _get_gif_frame_info(path1)
    frames2 = _get_gif_frame_info(path2)

    if frames1 is not None and frames2 is not None:
        # Same number of frames, and identical (hash, duration) pairs.
        return len(frames1) == len(frames2) and frames1 == frames2

    # At least one file is not a readable multi-frame image.
    return _compute_md5(path1) == _compute_md5(path2)
|
||||
|
||||
|
||||
def compute_image_info(path: Path) -> ImageInfo | None:
    """
    Build the ImageInfo (metadata, perceptual hashes, MD5) for one file.

    Animated images are hashed on their middle frame, falling back to the
    first frame when seeking to the middle raises EOFError. Any failure is
    reported with a printed warning and results in None.
    """
    try:
        digest = _compute_md5(path)

        with Image.open(path) as img:
            width, height = img.size
            frame_total = getattr(img, "n_frames", 1)

            if getattr(img, "is_animated", False):
                try:
                    img.seek(frame_total // 2)
                    hashes = _compute_hashes(img.copy())
                except EOFError:
                    img.seek(0)
                    hashes = _compute_hashes(img)
            else:
                hashes = _compute_hashes(img)

            return ImageInfo(
                phash=hashes[0],
                ahash=hashes[1],
                dhash=hashes[2],
                colorhash=hashes[3],
                width=width,
                height=height,
                n_frames=frame_total,
                md5=digest,
            )

    except Exception as e:
        print(f" Warning: Could not process {path.name}: {e}")
        return None
|
||||
|
||||
|
||||
def _files_size_similar(path1: Path, path2: Path) -> bool:
    """Return True when the two file sizes differ by at most MAX_SIZE_DIFF_RATIO.

    The difference is measured relative to the larger file. If either file is
    empty, the sizes only count as similar when both are empty.
    """
    bytes1, bytes2 = path1.stat().st_size, path2.stat().st_size
    if 0 in (bytes1, bytes2):
        return bytes1 == bytes2
    larger = max(bytes1, bytes2)
    return abs(bytes1 - bytes2) / larger <= MAX_SIZE_DIFF_RATIO
|
||||
|
||||
|
||||
def _verify_duplicate_pair(path_i: Path, info_i: ImageInfo, path_j: Path, info_j: ImageInfo, threshold: int) -> bool:
    """
    Confirm that two candidate images really are duplicates.

    When both images are animated, their frames and timing are compared.
    Otherwise the earlier hash-based candidate match is accepted as is (this
    covers re-compressed copies whose file sizes differ). ``threshold`` is
    accepted but not consulted here.
    """
    both_animated = info_i.is_animated() and info_j.is_animated()
    if not both_animated:
        return True
    return _gifs_are_identical(path_i, path_j)
|
||||
|
||||
|
||||
def find_similar_groups(files: list[Path], threshold: int) -> list[list[tuple[Path, ImageInfo]]]:
    """Cluster similar images with union-find; return groups of two or more.

    Files that cannot be processed, or whose hashes are degenerate, are left
    out. Every remaining pair is pre-filtered with ``is_candidate``, animated
    pairs must also have similar file sizes, and surviving pairs are confirmed
    with ``_verify_duplicate_pair`` before being merged.
    """
    # Gather the usable images.
    images: list[tuple[Path, ImageInfo]] = []
    for file in files:
        info = compute_image_info(file)
        if info is None or info._has_degenerate_hash():
            continue
        images.append((file, info))

    if not images:
        return []

    uf = UnionFind()
    image_total = len(images)
    for i in range(image_total):
        path_i, info_i = images[i]
        uf.find(i)  # Register i even if it never gets merged
        for j in range(i + 1, image_total):
            path_j, info_j = images[j]

            # Cheap filter on metadata and hash distances.
            if not info_i.is_candidate(info_j, threshold)[0]:
                continue

            # Static images may be compressed differently, so the size check
            # applies to animated images only.
            if info_i.is_animated() and not _files_size_similar(path_i, path_j):
                continue

            if _verify_duplicate_pair(path_i, info_i, path_j, info_j, threshold):
                uf.union(i, j)

    # Collect the members of each cluster under its root.
    clusters: dict[int, list[tuple[Path, ImageInfo]]] = {}
    for i, entry in enumerate(images):
        clusters.setdefault(uf.find(i), []).append(entry)

    return [members for members in clusters.values() if len(members) > 1]
|
||||
|
||||
|
||||
def deduplicate(groups: list[list[tuple[Path, ImageInfo]]], dry_run: bool, threshold: int) -> tuple[int, int]:
    """Report each duplicate group and delete all but one file per group.

    Within a group the file whose lowercased name sorts first is kept. With
    ``dry_run`` nothing is deleted and the second returned number is the count
    of files that would be removed.

    Returns:
        (number of groups, number of files removed or to be removed).
    """
    action = "WOULD DELETE" if dry_run else "DELETE"
    deleted = 0

    for group in groups:
        ranked = sorted(group, key=lambda entry: entry[0].name.lower())
        (keep_path, keep_info), extras = ranked[0], ranked[1:]

        # Hash agreement figures against the kept file, for display only.
        verdicts = [keep_info.is_candidate(info, threshold) for _, info in extras]

        if keep_info.is_animated():
            frames_str = f", {keep_info.n_frames} frames"
        else:
            frames_str = ""
        print(f"\nSimilar group ({len(group)} files, {keep_info.width}x{keep_info.height}{frames_str}):")
        print(f" KEEP: {keep_path.name}")

        for (path, _info), (_, agreements, total_dist) in zip(extras, verdicts):
            print(f" {action}: {path.name} (agreements: {agreements}/4, dist: {total_dist})")
            if dry_run:
                continue
            path.unlink()
            deleted += 1

    if dry_run:
        deleted = sum(len(g) - 1 for g in groups)
    return len(groups), deleted
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: find duplicate images in a directory and remove them.

    Options:
        --threshold  Integer similarity threshold (0 = exact, the default).
        --dry-run    Show duplicates without deleting anything.
        --dir        Directory to scan (default: ``emoji``).

    Progress, per-group details and a summary are printed to stdout. The
    function returns ``None`` in every case, including when the directory is
    unusable or no image files / duplicates are found.
    """
    parser = argparse.ArgumentParser(description="Find and remove duplicate emoji files using perceptual hashing.")
    parser.add_argument(
        "--threshold",
        type=int,
        default=0,
        help="Similarity threshold (0=exact, default=0)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show duplicates without deleting",
    )
    parser.add_argument(
        "--dir",
        type=Path,
        default=Path("emoji"),
        help="Directory to scan (default: emoji/)",
    )
    args = parser.parse_args()

    emoji_dir = args.dir
    if not emoji_dir.exists():
        print(f"Error: Directory '{emoji_dir}' does not exist.")
        return
    if not emoji_dir.is_dir():
        # iterdir() raises NotADirectoryError on a regular file; report it
        # the same way as a missing directory instead of crashing.
        print(f"Error: '{emoji_dir}' is not a directory.")
        return

    # Keep regular files only (a sub-directory named e.g. "x.png" is not an
    # image), and sort so the scan order does not depend on the filesystem.
    files = sorted(f for f in emoji_dir.iterdir() if f.suffix.lower() in EXTENSIONS and f.is_file())

    if not files:
        print(f"No image files found in {emoji_dir}/ folder.")
        return

    print(f"Scanning {len(files)} files (threshold: {args.threshold})...")
    if args.dry_run:
        print("(dry-run mode - no files will be deleted)")

    groups = find_similar_groups(files, args.threshold)

    if not groups:
        print("\nNo similar images found.")
        return

    group_count, removed = deduplicate(groups, args.dry_run, args.threshold)

    print("\n--- Summary ---")
    print(f"Files scanned: {len(files)}")
    print(f"Similar groups: {group_count}")
    if args.dry_run:
        print(f"Files to remove: {removed}")
    else:
        print(f"Files removed: {removed}")
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
BIN
emoji/+2.png
|
Before Width: | Height: | Size: 945 B After Width: | Height: | Size: 1.6 KiB |
BIN
emoji/000.png
|
Before Width: | Height: | Size: 1.4 KiB |
BIN
emoji/01x.png
|
Before Width: | Height: | Size: 3.3 KiB After Width: | Height: | Size: 15 KiB |
BIN
emoji/10000.png
|
Before Width: | Height: | Size: 6.1 KiB |
|
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 5.7 KiB |
BIN
emoji/10x.png
|
Before Width: | Height: | Size: 5.3 KiB After Width: | Height: | Size: 21 KiB |
|
Before Width: | Height: | Size: 5.1 KiB |
|
Before Width: | Height: | Size: 1.9 KiB After Width: | Height: | Size: 2.1 KiB |
|
Before Width: | Height: | Size: 6.2 KiB |
BIN
emoji/5g.png
|
Before Width: | Height: | Size: 1.9 KiB After Width: | Height: | Size: 9.6 KiB |
BIN
emoji/6-5.png
|
Before Width: | Height: | Size: 1.2 KiB |
|
Before Width: | Height: | Size: 23 KiB After Width: | Height: | Size: 25 KiB |
|
Before Width: | Height: | Size: 3.8 KiB |
|
Before Width: | Height: | Size: 3.2 KiB |
|
Before Width: | Height: | Size: 5.2 KiB |
|
Before Width: | Height: | Size: 5.0 KiB |
BIN
emoji/99.png
|
Before Width: | Height: | Size: 466 B After Width: | Height: | Size: 1.2 KiB |
|
Before Width: | Height: | Size: 51 KiB |
|
Before Width: | Height: | Size: 26 KiB |
|
Before Width: | Height: | Size: 56 KiB |
BIN
emoji/SCP096.png
|
Before Width: | Height: | Size: 3.0 KiB |
BIN
emoji/SCP173.png
|
Before Width: | Height: | Size: 2.4 KiB |
BIN
emoji/SCP999.png
|
Before Width: | Height: | Size: 1.7 KiB |
|
Before Width: | Height: | Size: 25 KiB |
|
Before Width: | Height: | Size: 15 KiB |
|
Before Width: | Height: | Size: 17 KiB |
|
Before Width: | Height: | Size: 4.2 KiB |
|
Before Width: | Height: | Size: 4.5 KiB |
|
Before Width: | Height: | Size: 49 KiB |
|
Before Width: | Height: | Size: 9.7 KiB |
|
Before Width: | Height: | Size: 18 KiB |
|
Before Width: | Height: | Size: 45 KiB |
BIN
emoji/aa.png
|
Before Width: | Height: | Size: 3.2 KiB After Width: | Height: | Size: 12 KiB |
BIN
emoji/aaa.gif
|
Before Width: | Height: | Size: 228 KiB |
BIN
emoji/aaa.png
|
Before Width: | Height: | Size: 1.3 KiB After Width: | Height: | Size: 3.9 KiB |
BIN
emoji/aatos.png
|
Before Width: | Height: | Size: 903 B After Width: | Height: | Size: 2.7 KiB |
|
Before Width: | Height: | Size: 6.1 KiB After Width: | Height: | Size: 6.1 KiB |
BIN
emoji/abua.jpg
|
Before Width: | Height: | Size: 2.0 KiB |
|
Before Width: | Height: | Size: 1.7 KiB |
|
Before Width: | Height: | Size: 31 KiB |
|
Before Width: | Height: | Size: 2.6 KiB |
BIN
emoji/admin.png
|
Before Width: | Height: | Size: 6.6 KiB |
|
Before Width: | Height: | Size: 15 KiB |
BIN
emoji/afk.png
|
Before Width: | Height: | Size: 2.8 KiB After Width: | Height: | Size: 38 KiB |
BIN
emoji/africa.png
|
Before Width: | Height: | Size: 973 B |
BIN
emoji/ah.png
|
Before Width: | Height: | Size: 5.4 KiB |
|
Before Width: | Height: | Size: 14 KiB |
BIN
emoji/ahhhhh.png
|
Before Width: | Height: | Size: 56 KiB |
BIN
emoji/ahshit.png
|
Before Width: | Height: | Size: 2.2 KiB |
BIN
emoji/aia.png
|
Before Width: | Height: | Size: 2.1 KiB After Width: | Height: | Size: 5.2 KiB |
BIN
emoji/aiet.png
|
Before Width: | Height: | Size: 5.4 KiB |
|
Before Width: | Height: | Size: 1.9 KiB |
|
Before Width: | Height: | Size: 7.0 KiB |
|
Before Width: | Height: | Size: 8.1 KiB After Width: | Height: | Size: 32 KiB |
BIN
emoji/ajk.png
|
Before Width: | Height: | Size: 2.2 KiB After Width: | Height: | Size: 7.4 KiB |
BIN
emoji/aku.png
|
Before Width: | Height: | Size: 2.5 KiB |
|
Before Width: | Height: | Size: 3.7 KiB After Width: | Height: | Size: 3.7 KiB |
|
Before Width: | Height: | Size: 2.6 KiB After Width: | Height: | Size: 14 KiB |
BIN
emoji/akx.png
|
Before Width: | Height: | Size: 7.5 KiB After Width: | Height: | Size: 28 KiB |
|
Before Width: | Height: | Size: 5.3 KiB |
|
Before Width: | Height: | Size: 6.4 KiB |
|
Before Width: | Height: | Size: 462 B |
BIN
emoji/alibi.png
|
Before Width: | Height: | Size: 7.4 KiB After Width: | Height: | Size: 8.7 KiB |
|
Before Width: | Height: | Size: 16 KiB |
|
Before Width: | Height: | Size: 3.8 KiB |
BIN
emoji/alien.png
|
Before Width: | Height: | Size: 5.6 KiB |
|
Before Width: | Height: | Size: 5.7 KiB |
BIN
emoji/alko.png
|
Before Width: | Height: | Size: 1.1 KiB After Width: | Height: | Size: 3.9 KiB |
|
Before Width: | Height: | Size: 7.2 KiB |
BIN
emoji/all-in.gif
|
Before Width: | Height: | Size: 14 KiB |
|
Before Width: | Height: | Size: 3.0 KiB |
BIN
emoji/allu.png
|
Before Width: | Height: | Size: 4.2 KiB After Width: | Height: | Size: 14 KiB |
|
Before Width: | Height: | Size: 317 B |
BIN
emoji/alma.png
|
Before Width: | Height: | Size: 646 B |
BIN
emoji/almond.gif
|
Before Width: | Height: | Size: 616 B |
|
Before Width: | Height: | Size: 4.2 KiB After Width: | Height: | Size: 12 KiB |
|
Before Width: | Height: | Size: 6.8 KiB After Width: | Height: | Size: 15 KiB |
|
Before Width: | Height: | Size: 4.3 KiB After Width: | Height: | Size: 11 KiB |
|
Before Width: | Height: | Size: 5.5 KiB After Width: | Height: | Size: 13 KiB |
|
Before Width: | Height: | Size: 4.3 KiB After Width: | Height: | Size: 11 KiB |
|
Before Width: | Height: | Size: 3.2 KiB After Width: | Height: | Size: 8.2 KiB |
|
Before Width: | Height: | Size: 3.4 KiB After Width: | Height: | Size: 9.1 KiB |
|
Before Width: | Height: | Size: 3.2 KiB After Width: | Height: | Size: 8.4 KiB |
|
Before Width: | Height: | Size: 5.8 KiB After Width: | Height: | Size: 13 KiB |
|
Before Width: | Height: | Size: 3.8 KiB After Width: | Height: | Size: 10 KiB |
|
Before Width: | Height: | Size: 5.1 KiB After Width: | Height: | Size: 11 KiB |