Compare commits

..

1 Commits

Author SHA1 Message Date
Ismo Vuorinen
2c7b0a2700 Test css grid based listing 2022-08-17 10:33:10 +03:00
3173 changed files with 1270 additions and 9401 deletions

View File

@@ -1,71 +0,0 @@
{
"hooks": {
"PreToolUse": [
{
"matcher": "Edit|Write",
"hooks": [
{
"type": "command",
"command": "file=$(jq -r '.tool_input.file_path // empty'); if [[ \"$file\" == *README.md || \"$file\" == *index.html ]]; then echo 'BLOCKED: README.md and index.html are generated artifacts. Edit create_listing.py instead, then run /regen-listings.' >&2; exit 2; fi"
}
]
},
{
"matcher": "Edit|Write",
"hooks": [
{
"type": "command",
"command": "file=$(jq -r '.tool_input.file_path // empty'); if [[ \"$file\" == */uv.lock || \"$file\" == uv.lock ]]; then echo 'BLOCKED: uv.lock is auto-generated. Modify pyproject.toml and run uv sync instead.' >&2; exit 2; fi"
}
]
},
{
"matcher": "Edit|Write",
"hooks": [
{
"type": "command",
"command": "file=$(jq -r '.tool_input.file_path // empty'); if [[ \"$file\" == *.github/workflows/*.yml || \"$file\" == .github/workflows/*.yml ]]; then echo 'BLOCKED: CI workflows use pinned action SHAs with # version comments for security. Edit workflow files carefully and maintain the SHA-pinning convention.' >&2; exit 2; fi"
}
]
},
{
"matcher": "Edit|Write",
"hooks": [
{
"type": "command",
"command": "file=$(jq -r '.tool_input.file_path // empty'); if [[ \"$file\" == */renovate.json || \"$file\" == renovate.json ]]; then echo 'BLOCKED: renovate.json is rarely edited. Make changes deliberately and confirm with the user first.' >&2; exit 2; fi"
}
]
},
{
"matcher": "Edit|Write",
"hooks": [
{
"type": "command",
"command": "file=$(jq -r '.tool_input.file_path // empty'); if [[ \"$file\" == emoji/* || \"$file\" == */emoji/* ]]; then echo 'BLOCKED: Emoji image files should not be written by Claude. Manage images manually or use /dedup-check.' >&2; exit 2; fi"
}
]
}
],
"PostToolUse": [
{
"matcher": "Edit|Write",
"hooks": [
{
"type": "command",
"command": "file=$(jq -r '.tool_input.file_path // empty'); if [[ \"$file\" == *.py ]]; then uv run ruff check --fix \"$file\" 2>&1 | tail -5 && uv run ruff format \"$file\" 2>&1 | tail -3; fi"
}
]
},
{
"matcher": "Edit|Write",
"hooks": [
{
"type": "command",
"command": "file=$(jq -r '.tool_input.file_path // empty'); if [[ \"$file\" == *.py ]]; then uv run pytest --tb=short -q 2>&1 | tail -5; fi"
}
]
}
]
}
}

View File

@@ -1,10 +0,0 @@
---
name: dedup-check
description: Run dedup in dry-run mode and report duplicate groups found
disable-model-invocation: true
---
## Steps
1. Run `uv run dedup --dry-run` from the project root
2. Summarize the output — report how many duplicate groups were found and which files are involved

View File

@@ -1,11 +0,0 @@
---
name: regen-listings
description: Regenerate README.md and index.html from emoji/ contents and verify output
disable-model-invocation: true
---
## Steps
1. Run `uv run python3 create_listing.py` from the project root
2. Confirm both `README.md` and `index.html` exist and are non-empty
3. Report the file sizes of both generated files

20
.gitattributes vendored Normal file
View File

@@ -0,0 +1,20 @@
* text=auto
*.md text diff=markdown
*.php text diff=php
.gitattributes export-ignore
.gitignore export-ignore
# Graphics
*.gif binary
*.gifv binary
*.jpg binary
*.jpeg binary
*.png binary
# SVG kept as text so diffs stay readable (unlike the binary graphics above).
*.svg text
*.svgz binary
*.tif binary
*.tiff binary
*.wbmp binary
*.webp binary

View File

@@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout Repo - name: Checkout Repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 uses: actions/checkout@v2
- name: Compress Images - name: Compress Images
id: calibre id: calibre
uses: calibreapp/image-actions@main uses: calibreapp/image-actions@main
@@ -20,7 +20,7 @@ jobs:
compressOnly: true compressOnly: true
- name: Create New Pull Request If Needed - name: Create New Pull Request If Needed
if: steps.calibre.outputs.markdown != '' if: steps.calibre.outputs.markdown != ''
uses: peter-evans/create-pull-request@c0f553fe549906ede9cf27b5156039d195d2ece0 # v8 uses: peter-evans/create-pull-request@v3
with: with:
title: Compressed Images Nightly title: Compressed Images Nightly
branch-suffix: timestamp branch-suffix: timestamp

View File

@@ -1,30 +0,0 @@
# Regenerates README.md and index.html whenever emoji/ or the generator
# script changes on master, then commits the result back to the branch.
name: Generate Listings
on:
  push:
    paths:
      - 'emoji/**'
      - 'create_listing.py'
    branches:
      - master

# The final step pushes a commit, so the GITHUB_TOKEN needs write access
# to repository contents (the default token may be read-only).
permissions:
  contents: write

jobs:
  generate:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repo
        # Pinned to a commit SHA (repo convention) rather than a mutable tag.
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
      - name: Set up Python
        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
        with:
          python-version: '3.14'
      - name: Generate listings
        # create_listing.py is stdlib-only, so no dependency install is needed.
        run: python3 create_listing.py
      - name: Commit changes
        # Only commits when the regenerated files actually differ.
        run: |
          git config --local user.email "github-actions[bot]@users.noreply.github.com"
          git config --local user.name "github-actions[bot]"
          git add README.md index.html
          git diff --staged --quiet || git commit -m "Update listings"
          git push

View File

@@ -16,7 +16,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout Repo - name: Checkout Repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 uses: actions/checkout@v2
- name: Compress Images - name: Compress Images
uses: calibreapp/image-actions@main uses: calibreapp/image-actions@main

View File

@@ -1,25 +0,0 @@
# Runs the pytest suite whenever Python sources, project metadata, or the
# dependency lockfile change, on both push and pull request.
name: Tests
on:
  push:
    paths:
      - '**.py'
      - 'pyproject.toml'
      - 'uv.lock'
  pull_request:
    paths:
      - '**.py'
      - 'pyproject.toml'
      - 'uv.lock'
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      # Actions are pinned to commit SHAs (repo convention) for supply-chain safety.
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
      - uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # v7
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
        with:
          python-version: '3.14'
      # Install dev dependencies from uv.lock, then run the suite.
      - run: uv sync --dev
      - run: uv run pytest -v

2
.gitignore vendored
View File

@@ -1,2 +0,0 @@
__pycache__/
.claude/settings.local.json

View File

@@ -1,52 +0,0 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## Project Overview
Personal emoji/emote collection for chat apps (Slack, Discord, etc.). Contains 3000+ custom emoji images in `emoji/` with Python tooling for maintenance: listing generation and perceptual deduplication.
## Commands
```bash
# Install dependencies (uses uv package manager)
uv sync
# Regenerate README.md and index.html from emoji/ contents
uv run python3 create_listing.py
# Find duplicate emojis (dry run)
uv run python3 dedup.py --dry-run
# Find duplicates with a custom threshold (default 0 = exact match)
uv run python3 dedup.py --threshold 5 --dry-run
# Actually remove duplicates
uv run python3 dedup.py --dir emoji/
# Or via uv entry point
uv run dedup --dry-run
# Run tests
uv run pytest
# Run tests with verbose output
uv run pytest -v
```
## Architecture
Two standalone Python scripts, no shared modules:
- **`create_listing.py`** — Generates `README.md` (HTML tables) and `index.html` (searchable dark-theme SPA) from all images in `emoji/`. No dependencies beyond stdlib. Both output files are auto-generated and committed by CI on push.
- **`dedup.py`** — Finds and removes duplicate images using multi-algorithm perceptual hashing (pHash, aHash, dHash, colorHash). Uses Union-Find clustering. Animated GIFs get extra frame-by-frame verification including timing. Keeps alphabetically-first filename per duplicate group.
## Key Conventions
- Python >=3.11 required; dependencies managed via `uv` with `uv.lock`
- Image formats: `.png`, `.gif`, `.jpg`, `.jpeg`
- `README.md` and `index.html` are generated artifacts — edit the scripts, not the outputs
- CI uses pinned action SHAs (not tags) for security
- Dependency updates managed by Renovate bot
- Always use `uv run` to execute Python commands (e.g. `uv run pytest`, `uv run ruff`, `uv run python3 script.py`) to ensure the correct virtualenv and dependencies are used

5063
README.md

File diff suppressed because it is too large Load Diff

68
_create-listing.php Normal file
View File

@@ -0,0 +1,68 @@
<?php
// Generates README.md: a grid listing of every emoji image in emoji/.
// Images are grouped by the first character of their filename; names that
// do not start with an ASCII letter or ':' share one escaped bucket.
$output  = 'README.md';
$per_row = 5;
$files   = glob( 'emoji/*.{png,gif,jpg,jpeg}', GLOB_BRACE );
$listing = [];
sort( $files );
if ( count( $files ) < 1 ) {
	die( 'No images to continue with.' );
}
/**
 * Return the final path component of a file path.
 */
function get_basename( string $file ): string {
	$parts = explode( DIRECTORY_SEPARATOR, $file );
	return end( $parts );
}
foreach ( $files as $file ) {
	$first = get_basename( $file );
	$first = str_replace( 'emoji/', '', $first );
	$first = trim( $first[0] );
	// Bucket all non-letter, non-colon starters under a single header.
	if ( preg_match( '/([^a-zA-Z:])/', $first ) ) {
		$first = '\[^a-zA-Z:\]';
	}
	if ( ! array_key_exists( $first, $listing ) ) {
		$listing[ $first ] = [];
	}
	$listing[ $first ][] = $file;
}
$contents  = "# Emotes\n\n";
$contents .= sprintf(
	"Listing of %d emojis last refreshed: %s",
	count( $files ),
	date( 'c' )
) . "\n\n";
$contents .= "<!-- markdownlint-disable-file MD033 -->\n";
foreach ( $listing as $header => $icons ) {
	$contents .= sprintf( "\n## %s\n\n", $header );
	// Derive the column count from $per_row instead of hard-coding 5, so
	// the CSS grid stays in sync if $per_row is changed.
	$contents .= '<div style="text-align: center;display:grid;grid-template-columns: repeat(' . $per_row . ', 1fr);grid-template-rows: minmax(70px, auto);">' . "\n";
	foreach ( $icons as $icon ) {
		// pathinfo() keeps the full stem for multi-dot names ("a.b.png" -> "a.b"),
		// where explode('.', ..., 2) would truncate it to "a".
		$name   = pathinfo( get_basename( $icon ), PATHINFO_FILENAME );
		$format = '<div style=\'border:1px solid #eee;padding:.5rem\'>'
			. '<img width=\'30\' src="%1$s" alt="%1$s"><br>'
			. '<kbd style=\'display:inline-block;max-width: 15vw;white-space: nowrap;overflow:auto\'>%2$s</kbd></div>';
		$contents .= sprintf( $format, $icon, $name ) . "\n";
	}
	$contents .= "</div>\n";
}
file_put_contents( $output, $contents );

View File

@@ -1,228 +0,0 @@
#!/usr/bin/env python3
"""Generate README.md and index.html with emoji listings."""
import html
import re
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import quote
# Number of emoji cells per table row in README.md.
PER_ROW = 10
# Directory scanned for emoji image files.
EMOJI_DIR = Path("emoji")
# File extensions treated as emoji images (matched case-insensitively).
EXTENSIONS = (".png", ".gif", ".jpg", ".jpeg")
def generate_readme(files: list[Path]) -> None:
    """Write README.md containing HTML tables of all emoji images.

    Images are bucketed by the lowercased first character of their
    filename; names not starting with a-z share one escaped bucket that
    sorts ahead of the alphabetical headers.
    """
    groups = defaultdict(list)
    for path in files:
        key = path.name[0].lower()
        if not re.match(r"[a-z]", key):
            key = r"\[^a-zA-Z:\]"
        groups[key].append(path)

    cell_width = f"{100 // PER_ROW}%"
    parts = ["# Emotes\n\n"]
    # The escaped bucket starts with a backslash, so `not k.startswith("\\")`
    # is False for it and it sorts before every letter header.
    for header in sorted(groups, key=lambda k: (not k.startswith("\\"), k)):
        parts.append(f"## {header}\n\n")
        parts.append('<table style="text-align: center;width: 100%">\n')
        icons = groups[header]
        for start in range(0, len(icons), PER_ROW):
            parts.append("<tr>\n")
            for icon in icons[start : start + PER_ROW]:
                # src is percent-encoded for URLs; alt keeps the raw path.
                src = f"emoji/{quote(icon.name)}"
                parts.append(
                    f"<td style='width: {cell_width}'>"
                    f"<img width='30' src=\"{src}\" "
                    f'alt="emoji/{icon.name}" title=":{icon.stem}:"></td>\n'
                )
            parts.append("</tr>\n")
        parts.append("</table>\n\n")
    parts.append(f"\n\n Generated: {datetime.now(timezone.utc).isoformat()}")
    Path("README.md").write_text("".join(parts), encoding="utf-8")
    print(f"Generated README.md with {len(files)} emojis")
def generate_html(files: list[Path]) -> None:
    """Generate index.html with searchable emoji grid grouped alphabetically.

    The page is a self-contained dark-theme SPA: inline CSS, inline JS with a
    debounced search box that filters tiles by filename stem.
    """
    # Group files by first character; anything outside a-z shares the "#" bucket.
    listing = defaultdict(list)
    for file in files:
        first_char = file.name[0].lower()
        if not re.match(r"[a-z]", first_char):
            first_char = "#"
        listing[first_char].append(file)
    # Build grouped HTML; the "#" bucket sorts first via the key tuple.
    sections = []
    for header in sorted(listing.keys(), key=lambda x: (x != "#", x)):
        display_header = "0-9 / Special" if header == "#" else header.upper()
        emoji_items = []
        for file in listing[header]:
            name = file.stem
            # Percent-encode the path for the src URL; escape the stem for HTML.
            encoded_path = f"emoji/{quote(file.name)}"
            escaped_name = html.escape(name)
            emoji_items.append(
                f' <div class="emoji" data-keyword="{escaped_name}">'
                f'<img src="{encoded_path}" alt="{escaped_name}" title=":{escaped_name}:"></div>'
            )
        sections.append(
            f' <section data-group="{html.escape(header)}">\n'
            f" <h2>{display_header}</h2>\n"
            f' <div class="grid">\n{chr(10).join(emoji_items)}\n </div>\n'
            f" </section>"
        )
    # Full page template; doubled braces escape literal { } inside the f-string.
    contents = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Emotes</title>
<style>
* {{ box-sizing: border-box; }}
body {{
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
margin: 0;
padding: 20px;
background: #1a1a1a;
color: #fff;
}}
#search {{
width: 100%;
max-width: 400px;
padding: 12px 16px;
font-size: 16px;
border: 2px solid #333;
border-radius: 8px;
background: #2a2a2a;
color: #fff;
margin-bottom: 20px;
}}
#search:focus {{
outline: none;
border-color: #666;
}}
#search::placeholder {{
color: #888;
}}
section {{
margin-bottom: 24px;
}}
section.hidden {{
display: none;
}}
h2 {{
font-size: 18px;
font-weight: 600;
margin: 0 0 12px 0;
color: #ccc;
}}
.grid {{
display: grid;
grid-template-columns: repeat(auto-fill, minmax(50px, 1fr));
gap: 8px;
}}
.emoji {{
display: flex;
align-items: center;
justify-content: center;
padding: 8px;
background: #2a2a2a;
border-radius: 6px;
transition: background 0.15s;
}}
.emoji:hover {{
background: #3a3a3a;
}}
.emoji img {{
width: 32px;
height: 32px;
object-fit: contain;
}}
.emoji.hidden {{
display: none;
}}
#count {{
color: #888;
font-size: 14px;
margin-bottom: 12px;
}}
h1 {{
margin: 0 0 20px 0;
font-size: 24px;
}}
h1 a {{
color: #fff;
text-decoration: none;
}}
h1 a:hover {{
text-decoration: underline;
}}
</style>
</head>
<body>
<h1><a href="https://github.com/ivuorinen/emoji">ivuorinen/emoji</a></h1>
<input type="text" id="search" placeholder="Search emojis..." autofocus>
<div id="count">{len(files)} emojis</div>
<div id="content">
{chr(10).join(sections)}
</div>
<script>
let timeout;
const search = document.getElementById('search');
const emojis = document.querySelectorAll('.emoji');
const sections = document.querySelectorAll('section');
const count = document.getElementById('count');
const total = emojis.length;
search.addEventListener('input', function(e) {{
clearTimeout(timeout);
timeout = setTimeout(() => {{
const query = e.target.value.toLowerCase();
let visible = 0;
emojis.forEach(el => {{
const match = el.dataset.keyword.toLowerCase().includes(query);
el.classList.toggle('hidden', !match);
if (match) visible++;
}});
sections.forEach(sec => {{
const hasVisible = sec.querySelector('.emoji:not(.hidden)');
sec.classList.toggle('hidden', !hasVisible);
}});
count.textContent = query ? visible + ' of ' + total + ' emojis' : total + ' emojis';
}}, 150);
}});
</script>
</body>
</html>
"""
    Path("index.html").write_text(contents, encoding="utf-8")
    print(f"Generated index.html with {len(files)} emojis")
def main():
    """Collect emoji image paths and regenerate both listing files."""
    images = sorted(
        path for path in EMOJI_DIR.iterdir() if path.suffix.lower() in EXTENSIONS
    )
    if not images:
        raise SystemExit("No images to continue with.")
    generate_readme(images)
    generate_html(images)


if __name__ == "__main__":
    main()

374
dedup.py
View File

@@ -1,374 +0,0 @@
#!/usr/bin/env python3
"""Find and remove duplicate emoji files using perceptual hashing."""
import argparse
import hashlib
from pathlib import Path
from dataclasses import dataclass
import imagehash
from PIL import Image
EXTENSIONS = (".png", ".gif", ".jpg", ".jpeg")
# Number of hash algorithms that must agree for images to be considered similar
MIN_HASH_AGREEMENT = 4
# Maximum file size difference ratio for duplicates (e.g., 0.05 = 5% difference allowed)
MAX_SIZE_DIFF_RATIO = 0.02
@dataclass
class ImageInfo:
    """Container for image metadata and hashes."""

    phash: imagehash.ImageHash
    ahash: imagehash.ImageHash
    dhash: imagehash.ImageHash
    colorhash: imagehash.ImageHash
    width: int
    height: int
    n_frames: int  # 1 for static images
    md5: str  # File content hash for exact duplicate detection

    def _has_degenerate_hash(self) -> bool:
        """Check if this image has degenerate (all-zero) hashes, indicating mostly transparent content."""
        zero_hash = "0000000000000000"
        # If 3+ hashes are all zeros, the image is likely mostly transparent
        # NOTE(review): only pHash/aHash/dHash are inspected here, so ">= 3"
        # in practice means all three of them are zero (colorHash is ignored).
        zero_count = sum(1 for h in [str(self.phash), str(self.ahash), str(self.dhash)] if h == zero_hash)
        return zero_count >= 3

    def is_candidate(self, other: "ImageInfo", threshold: int) -> tuple[bool, int, int]:
        """
        Check if two images are candidate duplicates based on metadata and hashes.
        Returns (is_candidate, agreements, total_distance).
        This is a fast pre-filter. GIFs require additional frame verification.
        """
        # Dimensions must match exactly
        if self.width != other.width or self.height != other.height:
            return False, 0, 999
        # Frame count must match for animated images
        if self.n_frames != other.n_frames:
            return False, 0, 999
        # Calculate perceptual hash distances (one Hamming distance per algorithm)
        distances = [
            self.phash - other.phash,
            self.ahash - other.ahash,
            self.dhash - other.dhash,
            self.colorhash - other.colorhash,
        ]
        total_distance = sum(distances)
        agreements = sum(1 for d in distances if d <= threshold)
        # For static images: detect re-compressed/re-exported duplicates
        # Require identical structure AND color, with small perceptual variance:
        # - aHash=0 AND dHash=0 AND colorHash=0 AND pHash <= 10
        # - OR all 4 hashes match exactly (total_distance = 0)
        if self.n_frames == 1:
            phash_dist = self.phash - other.phash
            ahash_dist = self.ahash - other.ahash
            dhash_dist = self.dhash - other.dhash
            chash_dist = self.colorhash - other.colorhash
            # Identical structure + color, small perceptual variance = re-compressed image
            if ahash_dist == 0 and dhash_dist == 0 and chash_dist == 0 and phash_dist <= 10:
                return True, agreements, total_distance
            # All hashes match exactly
            if total_distance == 0:
                return True, agreements, total_distance
            return False, agreements, total_distance
        # For animated images: require all 4 hashes to agree (will be verified by frame check)
        return agreements >= MIN_HASH_AGREEMENT, agreements, total_distance

    def is_animated(self) -> bool:
        """Check if this is an animated image (multiple frames)."""
        return self.n_frames > 1
class UnionFind:
    """Disjoint-set structure used to cluster similar images."""

    def __init__(self):
        # Maps each element to its parent; roots map to themselves.
        self.parent = {}

    def find(self, x):
        """Return the root of x's set, creating a singleton on first sight.

        Applies path compression: the node is re-pointed at its root.
        """
        root = self.parent.setdefault(x, x)
        if root != x:
            root = self.find(root)
            self.parent[x] = root
        return root

    def union(self, x, y):
        """Merge the sets containing x and y (no-op if already joined)."""
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x != root_y:
            self.parent[root_x] = root_y
def _compute_hashes(img: Image.Image) -> tuple[imagehash.ImageHash, ...]:
    """Compute all hash types for a single image/frame."""
    # Normalise to RGBA so transparency hashes consistently across formats.
    rgba = img if img.mode == "RGBA" else img.convert("RGBA")
    hashers = (imagehash.phash, imagehash.average_hash, imagehash.dhash, imagehash.colorhash)
    return tuple(hash_fn(rgba) for hash_fn in hashers)
def _compute_md5(path: Path) -> str:
"""Compute MD5 hash of file contents."""
md5 = hashlib.md5()
with open(path, "rb") as f:
for chunk in iter(lambda: f.read(8192), b""):
md5.update(chunk)
return md5.hexdigest()
def _get_gif_frame_info(path: Path) -> list[tuple[str, int]] | None:
    """Return (phash_string, duration_ms) for every frame of a GIF.

    Returns None for single-frame images or anything PIL cannot open.
    """
    try:
        with Image.open(path) as img:
            frame_count = getattr(img, "n_frames", 1)
            if frame_count <= 1:
                return None
            frames = []
            for index in range(frame_count):
                img.seek(index)
                frame = img.copy()
                if frame.mode != "RGBA":
                    frame = frame.convert("RGBA")
                # After seek(), img.info carries this frame's duration.
                frames.append((str(imagehash.phash(frame)), img.info.get("duration", 0)))
            return frames
    except Exception:
        # Deliberate best-effort: unreadable files are treated as non-GIFs.
        return None
def _gifs_are_identical(path1: Path, path2: Path) -> bool:
    """Frame-by-frame comparison of two GIFs, including per-frame timing.

    True only when frame counts, frame hashes, and durations all match.
    """
    frames_a = _get_gif_frame_info(path1)
    frames_b = _get_gif_frame_info(path2)
    if frames_a is None or frames_b is None:
        # Not multi-frame GIFs: fall back to exact content comparison via MD5.
        return _compute_md5(path1) == _compute_md5(path2)
    if len(frames_a) != len(frames_b):
        return False
    return frames_a == frames_b
def compute_image_info(path: Path) -> ImageInfo | None:
    """Build an ImageInfo (metadata plus all hashes) for one file.

    Animated images are hashed on their middle frame to dodge blank or
    fade-in first frames. Returns None when the file cannot be processed.
    """
    try:
        md5 = _compute_md5(path)
        with Image.open(path) as img:
            width, height = img.size
            n_frames = getattr(img, "n_frames", 1)
            if getattr(img, "is_animated", False):
                try:
                    img.seek(n_frames // 2)
                    hashes = _compute_hashes(img.copy())
                except EOFError:
                    # Seek overshot: fall back to the first frame.
                    img.seek(0)
                    hashes = _compute_hashes(img)
            else:
                hashes = _compute_hashes(img)
            phash, ahash, dhash, colorhash = hashes
            return ImageInfo(
                phash=phash,
                ahash=ahash,
                dhash=dhash,
                colorhash=colorhash,
                width=width,
                height=height,
                n_frames=n_frames,
                md5=md5,
            )
    except Exception as e:
        print(f" Warning: Could not process {path.name}: {e}")
        return None
def _files_size_similar(path1: Path, path2: Path) -> bool:
    """Check if two files have similar sizes (within MAX_SIZE_DIFF_RATIO)."""
    size_a = path1.stat().st_size
    size_b = path2.stat().st_size
    # Guard against division by zero when either file is empty.
    if 0 in (size_a, size_b):
        return size_a == size_b
    return abs(size_a - size_b) / max(size_a, size_b) <= MAX_SIZE_DIFF_RATIO
def _verify_duplicate_pair(path_i: Path, info_i: ImageInfo, path_j: Path, info_j: ImageInfo, threshold: int) -> bool:
    """
    Verify if two candidate images are true duplicates.

    Animated pairs get a strict frame-and-timing comparison; static pairs
    are accepted outright because the perceptual pre-filter already
    vetted them (handles re-compressed/re-exported duplicates).
    """
    if not (info_i.is_animated() and info_j.is_animated()):
        return True
    return _gifs_are_identical(path_i, path_j)
def find_similar_groups(files: list[Path], threshold: int) -> list[list[tuple[Path, ImageInfo]]]:
    """Find groups of similar images using multi-hash consensus and union-find."""
    # Hash every readable file, dropping degenerate (mostly transparent)
    # images whose all-zero hashes cannot be compared meaningfully.
    images: list[tuple[Path, ImageInfo]] = []
    for candidate in files:
        info = compute_image_info(candidate)
        if info is not None and not info._has_degenerate_hash():
            images.append((candidate, info))
    if not images:
        return []
    # Pairwise candidate scan; union-find merges transitive matches into clusters.
    uf = UnionFind()
    for i, (path_i, info_i) in enumerate(images):
        uf.find(i)  # Ensure a singleton cluster exists even with no matches.
        for j in range(i + 1, len(images)):
            path_j, info_j = images[j]
            is_candidate, _, _ = info_i.is_candidate(info_j, threshold)
            if not is_candidate:
                continue
            # Animated candidates must also be close in file size; static
            # images may legitimately differ (re-compression), so no size check.
            if info_i.is_animated() and not _files_size_similar(path_i, path_j):
                continue
            # Final verification: GIFs compared frame-by-frame, static pass through.
            if _verify_duplicate_pair(path_i, info_i, path_j, info_j, threshold):
                uf.union(i, j)
    # Bucket by cluster root and keep only groups with actual duplicates.
    clusters: dict[int, list[tuple[Path, ImageInfo]]] = {}
    for index, entry in enumerate(images):
        clusters.setdefault(uf.find(index), []).append(entry)
    return [group for group in clusters.values() if len(group) > 1]
def deduplicate(groups: list[list[tuple[Path, ImageInfo]]], dry_run: bool, threshold: int) -> tuple[int, int]:
"""Remove duplicates, keeping first alphabetically. Returns (groups, removed)."""
total_removed = 0
for group in groups:
# Sort by filename alphabetically
sorted_group = sorted(group, key=lambda x: x[0].name.lower())
keep_path, keep_info = sorted_group[0]
remove = sorted_group[1:]
# Calculate agreement info for display
agreements_info = [keep_info.is_candidate(info, threshold) for _, info in remove]
frames_str = f", {keep_info.n_frames} frames" if keep_info.is_animated() else ""
print(f"\nSimilar group ({len(group)} files, {keep_info.width}x{keep_info.height}{frames_str}):")
print(f" KEEP: {keep_path.name}")
for (path, info), (_, agreements, total_dist) in zip(remove, agreements_info):
action = "WOULD DELETE" if dry_run else "DELETE"
print(f" {action}: {path.name} (agreements: {agreements}/4, dist: {total_dist})")
if not dry_run:
path.unlink()
total_removed += 1
if dry_run:
return len(groups), sum(len(g) - 1 for g in groups)
return len(groups), total_removed
def main():
    """CLI entry point: scan a directory and report/remove duplicate images."""
    parser = argparse.ArgumentParser(description="Find and remove duplicate emoji files using perceptual hashing.")
    parser.add_argument(
        "--threshold",
        type=int,
        default=0,
        help="Similarity threshold (0=exact, default=0)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show duplicates without deleting",
    )
    parser.add_argument(
        "--dir",
        type=Path,
        default=Path("emoji"),
        help="Directory to scan (default: emoji/)",
    )
    args = parser.parse_args()

    target = args.dir
    if not target.exists():
        print(f"Error: Directory '{target}' does not exist.")
        return
    candidates = [entry for entry in target.iterdir() if entry.suffix.lower() in EXTENSIONS]
    if not candidates:
        print(f"No image files found in {target}/ folder.")
        return

    print(f"Scanning {len(candidates)} files (threshold: {args.threshold})...")
    if args.dry_run:
        print("(dry-run mode - no files will be deleted)")
    groups = find_similar_groups(candidates, args.threshold)
    if not groups:
        print("\nNo similar images found.")
        return

    group_count, removed = deduplicate(groups, args.dry_run, args.threshold)
    print("\n--- Summary ---")
    print(f"Files scanned: {len(candidates)}")
    print(f"Similar groups: {group_count}")
    # Label differs between preview and destructive runs.
    if args.dry_run:
        print(f"Files to remove: {removed}")
    else:
        print(f"Files removed: {removed}")


if __name__ == "__main__":
    main()

Binary file not shown.

Before

Width:  |  Height:  |  Size: 945 B

After

Width:  |  Height:  |  Size: 1.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.8 KiB

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.0 KiB

After

Width:  |  Height:  |  Size: 5.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.0 KiB

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.9 KiB

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.2 KiB

After

Width:  |  Height:  |  Size: 9.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 23 KiB

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.0 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 466 B

After

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 51 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 62 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.6 KiB

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 228 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

After

Width:  |  Height:  |  Size: 3.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 903 B

After

Width:  |  Height:  |  Size: 2.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.1 KiB

After

Width:  |  Height:  |  Size: 6.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.0 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.8 KiB

After

Width:  |  Height:  |  Size: 38 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 63 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.1 KiB

After

Width:  |  Height:  |  Size: 5.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 9.2 KiB

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.4 KiB

After

Width:  |  Height:  |  Size: 7.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.7 KiB

After

Width:  |  Height:  |  Size: 3.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.0 KiB

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 9.0 KiB

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 462 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.4 KiB

After

Width:  |  Height:  |  Size: 8.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.2 KiB

After

Width:  |  Height:  |  Size: 3.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.7 KiB

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 317 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 646 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 616 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.5 KiB

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.3 KiB

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.7 KiB

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.8 KiB

After

Width:  |  Height:  |  Size: 13 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.5 KiB

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.2 KiB

After

Width:  |  Height:  |  Size: 8.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.4 KiB

After

Width:  |  Height:  |  Size: 9.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.2 KiB

After

Width:  |  Height:  |  Size: 8.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.8 KiB

After

Width:  |  Height:  |  Size: 13 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.8 KiB

After

Width:  |  Height:  |  Size: 10 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.1 KiB

After

Width:  |  Height:  |  Size: 11 KiB

Some files were not shown because too many files have changed in this diff Show More