feat(ghcid): add diacritics normalization and transliteration scripts
- Add fix_ghcid_diacritics.py for normalizing non-ASCII in GHCIDs - Add resolve_diacritics_collisions.py for collision handling - Add transliterate_emic_names.py for non-Latin script handling - Add transliteration tests
This commit is contained in:
parent
6a6557bbe8
commit
891692a4d6
4 changed files with 2212 additions and 0 deletions
325
scripts/fix_ghcid_diacritics.py
Normal file
325
scripts/fix_ghcid_diacritics.py
Normal file
|
|
@@ -0,0 +1,325 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fix GHCID abbreviations containing diacritics.
|
||||
|
||||
This script normalizes diacritics in GHCID abbreviation components to ASCII,
|
||||
regenerates UUIDs and numeric IDs, updates GHCID history, and renames files.
|
||||
|
||||
Rule: ABBREV-DIACRITICS
|
||||
See: .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md
|
||||
|
||||
Usage:
|
||||
python scripts/fix_ghcid_diacritics.py --dry-run # Preview changes
|
||||
python scripts/fix_ghcid_diacritics.py # Apply changes
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import unicodedata
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import yaml
|
||||
|
||||
# GHCID namespace UUID for deterministic UUID generation
|
||||
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8") # URL namespace
|
||||
|
||||
# Regex pattern for common diacritics
|
||||
DIACRITICS_PATTERN = re.compile(r'[ČŘŠŽĚŮÁÉÍÓÚÝÄÖÜßÑÇÀÈÌÒÙŁŃŚŹŻĄĘÅØÆŐŰÂÊÎÔÛčřšžěůáéíóúýäöüñçàèìòùłńśźżąęåøæőűâêîôû]')
|
||||
|
||||
|
||||
# Letters with NO NFD decomposition (they carry no combining mark), mapped to
# conventional ASCII replacements.  Without this table, names containing e.g.
# 'Ł', 'Ø', 'Æ' or 'ß' match DIACRITICS_PATTERN but pass through NFD
# stripping unchanged, so the affected GHCIDs would never be fixed.
_NON_DECOMPOSABLE = str.maketrans({
    'ß': 'ss', 'ẞ': 'SS',
    'Ø': 'O', 'ø': 'o',
    'Æ': 'AE', 'æ': 'ae',
    'Ł': 'L', 'ł': 'l',
    'Đ': 'D', 'đ': 'd',
})


def normalize_diacritics(text: str) -> str:
    """
    Normalize diacritics to ASCII equivalents.

    Uses Unicode NFD decomposition to separate base characters from
    combining marks, then removes the combining marks.  Letters that do
    not decompose under NFD (ß, Ø, Æ, Ł, Đ) are mapped explicitly first.

    Examples:
        "Č" → "C"
        "Ř" → "R"
        "Ö" → "O"
        "ñ" → "n"
        "Ł" → "L"
    """
    # Map the non-decomposable letters first; NFD cannot touch them.
    text = text.translate(_NON_DECOMPOSABLE)
    # NFD decomposition separates base characters from combining marks
    normalized = unicodedata.normalize('NFD', text)
    # Remove combining marks (category 'Mn' = Mark, Nonspacing)
    ascii_text = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    return ascii_text
|
||||
|
||||
|
||||
def has_diacritics_in_ghcid(ghcid: str) -> bool:
    """Check if GHCID contains any diacritics (in any component).

    Diacritics can appear in:
    - Region code (e.g., '31' is fine, but city code 'ČB' has diacritics)
    - City code (e.g., 'TŘE' for Třebíč)
    - Abbreviation (e.g., 'VHSPAOČRZS')
    """
    return DIACRITICS_PATTERN.search(ghcid) is not None
|
||||
|
||||
|
||||
def has_diacritics_in_abbreviation(ghcid: str) -> bool:
    """Check if GHCID abbreviation component contains diacritics."""
    # GHCID format: CC-RR-CCC-T-ABBREV or CC-RR-CCC-T-ABBREV-suffix;
    # the abbreviation is the fifth dash-separated component (index 4).
    components = ghcid.split('-')
    if len(components) < 5:
        return False
    return DIACRITICS_PATTERN.search(components[4]) is not None
|
||||
|
||||
|
||||
def fix_ghcid_diacritics(ghcid: str) -> str:
    """
    Fix diacritics in ALL GHCID components.

    Normalizes diacritics in all parts: country, region, city, type,
    abbreviation, and any suffix components.
    """
    # Normalize each dash-separated component and reassemble.
    return '-'.join(normalize_diacritics(component) for component in ghcid.split('-'))
|
||||
|
||||
|
||||
def generate_uuid_v5(ghcid_string: str) -> str:
    """Generate deterministic UUID v5 from GHCID string."""
    # uuid5 is name-based (SHA-1 over namespace + name), so the same GHCID
    # always yields the same UUID.
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
|
||||
|
||||
|
||||
def generate_uuid_v8_sha256(ghcid_string: str) -> str:
    """Generate UUID v8 from SHA-256 hash of GHCID string."""
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    raw = bytearray(digest[:16])  # a UUID is exactly 16 bytes
    # High nibble of byte 6 is the version field: force it to 8 (custom).
    raw[6] = (raw[6] & 0x0F) | 0x80
    # Top two bits of byte 8 are the variant field: force 10xx (RFC 4122).
    raw[8] = (raw[8] & 0x3F) | 0x80
    return str(uuid.UUID(bytes=bytes(raw)))
|
||||
|
||||
|
||||
def generate_numeric_id(ghcid_string: str) -> int:
    """Generate 64-bit numeric ID from SHA-256 hash."""
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    # First 8 bytes, big-endian, interpreted as an unsigned 64-bit integer.
    return int.from_bytes(digest[:8], byteorder='big')
|
||||
|
||||
|
||||
def process_file(file_path: Path, dry_run: bool = True) -> Optional[dict]:
    """
    Process a single YAML file to fix GHCID diacritics.

    Normalizes the GHCID, regenerates all derived identifiers (UUID v5,
    SHA-256-based UUID v8, 64-bit numeric ID), prepends a GHCID history
    entry, updates the identifiers section, rewrites the file, and renames
    it to match the new GHCID.

    Args:
        file_path: Path to the custodian YAML file.
        dry_run: When True, compute and report the change without touching
            the file.

    Returns:
        Dict with change info (old/new GHCID, UUIDs, numeric IDs and, after
        a rename, 'new_file'), or None if no change is needed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {file_path}: {e}")
        return None

    if not data or 'ghcid' not in data:
        return None

    # 'or' guards: YAML can contain an explicit null for a present key, in
    # which case .get(key, default) returns None and would crash below.
    ghcid_section = data.get('ghcid') or {}
    old_ghcid = ghcid_section.get('ghcid_current') or ''

    if not has_diacritics_in_ghcid(old_ghcid):
        return None

    # Fix the GHCID
    new_ghcid = fix_ghcid_diacritics(old_ghcid)

    # Normalization can be a no-op (e.g. letters the normalizer cannot map);
    # nothing to do in that case.
    if new_ghcid == old_ghcid:
        return None

    # Regenerate every identifier that is derived from the GHCID string.
    new_uuid_v5 = generate_uuid_v5(new_ghcid)
    new_uuid_v8 = generate_uuid_v8_sha256(new_ghcid)
    new_numeric = generate_numeric_id(new_ghcid)
    timestamp_now = datetime.now(timezone.utc).isoformat()

    change_info = {
        'file': str(file_path),
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'old_uuid': ghcid_section.get('ghcid_uuid', ''),
        'new_uuid': new_uuid_v5,
        'old_numeric': ghcid_section.get('ghcid_numeric', 0),
        'new_numeric': new_numeric,
    }

    if dry_run:
        return change_info

    # Update ghcid section (the original GHCID is preserved in the history).
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['ghcid_uuid'] = new_uuid_v5
    ghcid_section['ghcid_uuid_sha256'] = new_uuid_v8
    ghcid_section['ghcid_numeric'] = new_numeric

    # Add history entry for the fix ('or []' guards an explicit YAML null).
    ghcid_history = ghcid_section.get('ghcid_history') or []

    new_history_entry = {
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp_now,
        'reason': f"Normalized diacritics to ASCII per ABBREV-DIACRITICS rule (was: {old_ghcid})"
    }

    # Mark the previous (most recent) entry as superseded.
    if ghcid_history and 'valid_to' not in ghcid_history[0]:
        ghcid_history[0]['valid_to'] = timestamp_now

    # New entry goes first: history is newest-first.
    ghcid_section['ghcid_history'] = [new_history_entry] + ghcid_history
    data['ghcid'] = ghcid_section

    # Update identifiers section: every GHCID-derived scheme gets its new value.
    identifiers = data.get('identifiers') or []
    for ident in identifiers:
        scheme = ident.get('identifier_scheme')
        if scheme == 'GHCID':
            ident['identifier_value'] = new_ghcid
        elif scheme == 'GHCID_UUID':
            ident['identifier_value'] = new_uuid_v5
        elif scheme == 'GHCID_UUID_SHA256':
            ident['identifier_value'] = new_uuid_v8
        elif scheme == 'GHCID_NUMERIC':
            ident['identifier_value'] = str(new_numeric)
    data['identifiers'] = identifiers

    # Write updated file
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Rename file to match new GHCID
    new_filename = f"{new_ghcid}.yaml"
    if file_path.name != new_filename:
        new_file_path = file_path.parent / new_filename
        if new_file_path.exists():
            # Don't rename if target exists
            print(f" Warning: Target file already exists: {new_file_path}")
        else:
            shutil.move(str(file_path), str(new_file_path))
            change_info['new_file'] = str(new_file_path)

    return change_info
|
||||
|
||||
|
||||
def find_affected_files(custodian_dir: Path) -> list[Path]:
    """Find all YAML files with diacritics in their GHCID.

    Uses filename-based detection for speed, since filenames match GHCID
    (no need to parse the YAML just to read the GHCID back out).

    Args:
        custodian_dir: Directory containing custodian YAML files.

    Returns:
        Paths whose stem (the GHCID) matches DIACRITICS_PATTERN.
    """
    # Path.rglob replaces the previous subprocess call to the external
    # 'find' binary: it is portable (no POSIX tool dependency) and, unlike
    # the old glob("*.yaml") fallback, searches subdirectories recursively
    # exactly as 'find' did — the two code paths previously disagreed.
    return [
        yaml_file
        for yaml_file in custodian_dir.rglob("*.yaml")
        if DIACRITICS_PATTERN.search(yaml_file.stem)
    ]
|
||||
|
||||
|
||||
def main():
    """CLI entry point: scan for affected files, then fix or preview them."""
    parser = argparse.ArgumentParser(
        description="Fix GHCID abbreviations containing diacritics"
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="Preview changes without modifying files"
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=0,
        help="Limit number of files to process (0 = no limit)"
    )
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('data/custodian'),
        help="Path to custodian directory"
    )

    args = parser.parse_args()

    custodian_dir = args.custodian_dir
    if not custodian_dir.exists():
        print(f"Error: Directory not found: {custodian_dir}")
        # Non-zero exit status for the shell (returned via exit(main())).
        return 1

    print(f"Scanning {custodian_dir} for files with diacritics in GHCID abbreviation...")
    affected_files = find_affected_files(custodian_dir)

    print(f"Found {len(affected_files)} affected files")

    # Optional cap on how many files are touched in one run.
    if args.limit > 0:
        affected_files = affected_files[:args.limit]
        print(f"Limited to {args.limit} files")

    if args.dry_run:
        print("\n=== DRY RUN (no changes will be made) ===\n")
    else:
        print("\n=== APPLYING CHANGES ===\n")

    changes = []
    for i, file_path in enumerate(affected_files, 1):
        print(f"[{i}/{len(affected_files)}] Processing {file_path.name}...")
        # process_file returns None when nothing needed changing.
        change = process_file(file_path, dry_run=args.dry_run)
        if change:
            changes.append(change)
            print(f" {change['old_ghcid']} → {change['new_ghcid']}")

    print(f"\n=== SUMMARY ===")
    print(f"Files processed: {len(affected_files)}")
    print(f"Files changed: {len(changes)}")

    if args.dry_run and changes:
        print("\nTo apply changes, run without --dry-run flag")

    # Show country distribution
    if changes:
        countries = {}
        for c in changes:
            # Country code is the first dash-separated GHCID component.
            cc = c['old_ghcid'].split('-')[0]
            countries[cc] = countries.get(cc, 0) + 1

        print("\nBy country:")
        # Most-affected countries first (descending count).
        for cc, count in sorted(countries.items(), key=lambda x: -x[1]):
            print(f" {cc}: {count}")

    return 0


if __name__ == '__main__':
    exit(main())
|
||||
270
scripts/resolve_diacritics_collisions.py
Normal file
270
scripts/resolve_diacritics_collisions.py
Normal file
|
|
@@ -0,0 +1,270 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Resolve GHCID collisions caused by diacritics normalization.
|
||||
|
||||
When a file with diacritics normalizes to the same GHCID as an existing file,
|
||||
the diacritics file gets a name suffix per AGENTS.md collision rules.
|
||||
|
||||
Usage:
|
||||
python scripts/resolve_diacritics_collisions.py --dry-run # Preview changes
|
||||
python scripts/resolve_diacritics_collisions.py # Apply changes
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import unicodedata
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import yaml
|
||||
|
||||
# GHCID namespace UUID for deterministic UUID generation
|
||||
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")
|
||||
|
||||
# Regex pattern for common diacritics
|
||||
DIACRITICS_PATTERN = re.compile(r'[ČŘŠŽĚŮÁÉÍÓÚÝÄÖÜßÑÇÀÈÌÒÙŁŃŚŹŻĄĘÅØÆŐŰÂÊÎÔÛčřšžěůáéíóúýäöüñçàèìòùłńśźżąęåøæőűâêîôûĎŇŤďňť]')
|
||||
|
||||
|
||||
# Letters with NO NFD decomposition (they carry no combining mark), mapped to
# conventional ASCII replacements.  Without this table, names containing e.g.
# 'Ł', 'Ø', 'Æ' or 'ß' match DIACRITICS_PATTERN but pass through unchanged.
_NON_DECOMPOSABLE = str.maketrans({
    'ß': 'ss', 'ẞ': 'SS',
    'Ø': 'O', 'ø': 'o',
    'Æ': 'AE', 'æ': 'ae',
    'Ł': 'L', 'ł': 'l',
    'Đ': 'D', 'đ': 'd',
})


def normalize_diacritics(text: str) -> str:
    """Normalize diacritics to ASCII equivalents.

    NFD-decomposes the text and strips combining marks; letters that do not
    decompose under NFD (ß, Ø, Æ, Ł, Đ) are mapped explicitly first.
    """
    text = text.translate(_NON_DECOMPOSABLE)
    normalized = unicodedata.normalize('NFD', text)
    ascii_text = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    return ascii_text
|
||||
|
||||
|
||||
def generate_name_suffix(native_name: str) -> str:
    """Convert native language institution name to snake_case suffix."""
    # Strip diacritics: NFD-decompose, then drop nonspacing combining marks.
    decomposed = unicodedata.normalize('NFD', native_name)
    ascii_only = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Lowercase, drop punctuation, turn whitespace/hyphen runs into
    # underscores, drop anything non-alphanumeric that remains, then
    # collapse repeated underscores and trim them from the ends.
    text = ascii_only.lower()
    text = re.sub(r"[''`\",.:;!?()[\]{}‒–—]", '', text)
    text = re.sub(r'[\s\-]+', '_', text)
    text = re.sub(r'[^a-z0-9_]', '', text)
    return re.sub(r'_+', '_', text).strip('_')
|
||||
|
||||
|
||||
def generate_uuid_v5(ghcid_string: str) -> str:
    """Generate deterministic UUID v5 from GHCID string."""
    # Name-based UUID: same GHCID in, same UUID out.
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)
    return str(derived)
|
||||
|
||||
|
||||
def generate_uuid_v8_sha256(ghcid_string: str) -> str:
    """Generate UUID v8 from SHA-256 hash of GHCID string."""
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    raw = bytearray(digest[:16])  # a UUID is exactly 16 bytes
    raw[6] = (raw[6] & 0x0F) | 0x80  # version nibble -> 8 (custom)
    raw[8] = (raw[8] & 0x3F) | 0x80  # variant bits -> 10xx (RFC 4122)
    return str(uuid.UUID(bytes=bytes(raw)))
|
||||
|
||||
|
||||
def generate_numeric_id(ghcid_string: str) -> int:
    """Generate 64-bit numeric ID from SHA-256 hash."""
    # First 8 digest bytes, big-endian, as an unsigned 64-bit integer.
    digest = hashlib.sha256(ghcid_string.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], byteorder='big')
|
||||
|
||||
|
||||
def find_collision_pairs(custodian_dir: Path) -> list[tuple[Path, Path, str]]:
    """Find files with diacritics that collide with existing ASCII files.

    Returns list of (diacritics_file, ascii_file, ascii_ghcid).
    """
    pairs = []
    for candidate in custodian_dir.glob("*.yaml"):
        stem = candidate.stem  # filename without .yaml — this is the GHCID
        if not DIACRITICS_PATTERN.search(stem):
            continue
        # Would this GHCID, once normalized to ASCII, clash with a file
        # that already exists under the normalized name?
        normalized_stem = normalize_diacritics(stem)
        target = custodian_dir / f"{normalized_stem}.yaml"
        if target.exists():
            pairs.append((candidate, target, normalized_stem))
    return pairs
|
||||
|
||||
|
||||
def resolve_collision(diacritics_file: Path, ascii_ghcid: str, dry_run: bool = True) -> Optional[dict]:
    """
    Resolve a collision by adding a name suffix to the diacritics file.

    The diacritics file gets a name suffix since it's being added later:
    its new GHCID is "<ascii_ghcid>-<snake_case institution name>".  All
    derived identifiers are regenerated, a history entry is prepended, and
    the file is rewritten and renamed.

    Args:
        diacritics_file: YAML file whose normalized GHCID collides.
        ascii_ghcid: The already-taken ASCII GHCID it collides with.
        dry_run: When True, compute and report the change without touching
            the file.

    Returns:
        Dict with change info, or None when the file is unreadable, empty,
        or has no institution name to build a suffix from.
    """
    try:
        with open(diacritics_file, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        print(f" Error reading {diacritics_file}: {e}")
        return None

    if not data:
        return None

    # Get institution name for suffix.  'or' guards: YAML can hold an
    # explicit null for a present key, making .get(key, default) return None.
    original_entry = data.get('original_entry') or {}
    inst_name = original_entry.get('name') or ''

    if not inst_name:
        print(f" Warning: No institution name found in {diacritics_file}")
        return None

    # Generate name suffix
    name_suffix = generate_name_suffix(inst_name)

    # Create new GHCID with name suffix
    new_ghcid = f"{ascii_ghcid}-{name_suffix}"

    # Get old GHCID from file; fall back to the filename stem.
    ghcid_section = data.get('ghcid') or {}
    old_ghcid = ghcid_section.get('ghcid_current') or diacritics_file.stem

    # Regenerate every identifier derived from the GHCID string.
    new_uuid_v5 = generate_uuid_v5(new_ghcid)
    new_uuid_v8 = generate_uuid_v8_sha256(new_ghcid)
    new_numeric = generate_numeric_id(new_ghcid)
    timestamp_now = datetime.now(timezone.utc).isoformat()

    change_info = {
        'file': str(diacritics_file),
        'institution_name': inst_name,
        'old_ghcid': old_ghcid,
        'new_ghcid': new_ghcid,
        'name_suffix': name_suffix,
    }

    if dry_run:
        return change_info

    # Update ghcid section
    ghcid_section['ghcid_current'] = new_ghcid
    ghcid_section['ghcid_uuid'] = new_uuid_v5
    ghcid_section['ghcid_uuid_sha256'] = new_uuid_v8
    ghcid_section['ghcid_numeric'] = new_numeric

    # Add history entry ('or []' guards an explicit YAML null).
    ghcid_history = ghcid_section.get('ghcid_history') or []

    new_history_entry = {
        'ghcid': new_ghcid,
        'ghcid_numeric': new_numeric,
        'valid_from': timestamp_now,
        'reason': f"Name suffix added to resolve collision with {ascii_ghcid} (was: {old_ghcid})"
    }

    # Mark the previous (most recent) entry as superseded.
    if ghcid_history and 'valid_to' not in ghcid_history[0]:
        ghcid_history[0]['valid_to'] = timestamp_now

    # New entry goes first: history is newest-first.
    ghcid_section['ghcid_history'] = [new_history_entry] + ghcid_history
    data['ghcid'] = ghcid_section

    # Update identifiers section: every GHCID-derived scheme gets its new value.
    identifiers = data.get('identifiers') or []
    for ident in identifiers:
        scheme = ident.get('identifier_scheme')
        if scheme == 'GHCID':
            ident['identifier_value'] = new_ghcid
        elif scheme == 'GHCID_UUID':
            ident['identifier_value'] = new_uuid_v5
        elif scheme == 'GHCID_UUID_SHA256':
            ident['identifier_value'] = new_uuid_v8
        elif scheme == 'GHCID_NUMERIC':
            ident['identifier_value'] = str(new_numeric)
    data['identifiers'] = identifiers

    # Write updated file
    with open(diacritics_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Rename file to match new GHCID
    new_file_path = diacritics_file.parent / f"{new_ghcid}.yaml"

    if new_file_path.exists():
        print(f" Warning: Target file already exists: {new_file_path}")
    else:
        shutil.move(str(diacritics_file), str(new_file_path))
        change_info['new_file'] = str(new_file_path)

    return change_info
|
||||
|
||||
|
||||
def main():
    """CLI entry point: find collision pairs, then resolve or preview them."""
    parser = argparse.ArgumentParser(
        description="Resolve GHCID collisions caused by diacritics normalization"
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help="Preview changes without modifying files"
    )
    parser.add_argument(
        '--custodian-dir',
        type=Path,
        default=Path('data/custodian'),
        help="Path to custodian directory"
    )

    args = parser.parse_args()
    custodian_dir = args.custodian_dir

    if not custodian_dir.exists():
        print(f"Error: Directory not found: {custodian_dir}")
        # Non-zero exit status for the shell (returned via exit(main())).
        return 1

    print(f"Scanning {custodian_dir} for diacritics collision pairs...")
    collisions = find_collision_pairs(custodian_dir)

    print(f"Found {len(collisions)} collision pairs\n")

    if args.dry_run:
        print("=== DRY RUN (no changes will be made) ===\n")
    else:
        print("=== APPLYING CHANGES ===\n")

    changes = []
    for i, (diacritics_file, ascii_file, ascii_ghcid) in enumerate(collisions, 1):
        print(f"[{i}/{len(collisions)}] Collision:")
        print(f" Diacritics file: {diacritics_file.name}")
        print(f" Collides with: {ascii_file.name}")

        # resolve_collision returns None when the file can't be processed.
        change = resolve_collision(diacritics_file, ascii_ghcid, dry_run=args.dry_run)

        if change:
            changes.append(change)
            print(f" Institution: {change['institution_name']}")
            print(f" GHCID change: {change['old_ghcid']} → {change['new_ghcid']}")
            # 'new_file' is only present when the rename actually happened.
            if 'new_file' in change:
                print(f" New file: {Path(change['new_file']).name}")
        print()

    print(f"=== SUMMARY ===")
    print(f"Collisions found: {len(collisions)}")
    print(f"Files resolved: {len(changes)}")

    if args.dry_run and changes:
        print("\nTo apply changes, run without --dry-run flag")

    return 0


if __name__ == '__main__':
    exit(main())
|
||||
1267
scripts/transliterate_emic_names.py
Normal file
1267
scripts/transliterate_emic_names.py
Normal file
File diff suppressed because it is too large
Load diff
350
tests/test_transliteration.py
Normal file
350
tests/test_transliteration.py
Normal file
|
|
@@ -0,0 +1,350 @@
|
|||
"""
|
||||
Unit tests for transliteration functions.
|
||||
|
||||
Tests the scripts/transliterate_emic_names.py module for converting
|
||||
non-Latin script institution names to Latin characters.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
# Add project root to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from scripts.transliterate_emic_names import (
|
||||
detect_script,
|
||||
transliterate,
|
||||
transliterate_for_abbreviation,
|
||||
transliterate_cyrillic,
|
||||
transliterate_chinese,
|
||||
transliterate_japanese,
|
||||
transliterate_korean,
|
||||
transliterate_arabic,
|
||||
transliterate_hebrew,
|
||||
transliterate_greek,
|
||||
transliterate_devanagari,
|
||||
transliterate_armenian,
|
||||
transliterate_georgian,
|
||||
transliterate_thai,
|
||||
transliterate_sinhala,
|
||||
transliterate_khmer,
|
||||
)
|
||||
|
||||
|
||||
class TestScriptDetection:
    """Tests for script detection function.

    Each test asserts that detect_script returns the expected lowercase
    script name for a sample string in that script.
    """

    def test_detect_latin(self):
        assert detect_script("Hello World") == "latin"
        assert detect_script("Rijksmuseum Amsterdam") == "latin"

    def test_detect_cyrillic(self):
        assert detect_script("Институт") == "cyrillic"
        assert detect_script("Музей") == "cyrillic"

    def test_detect_chinese(self):
        assert detect_script("故宮博物院") == "chinese"
        assert detect_script("中国国家图书馆") == "chinese"

    def test_detect_japanese(self):
        # Japanese with hiragana or katakana
        assert detect_script("こんにちは") == "japanese"
        assert detect_script("カタカナ") == "japanese"

    def test_detect_korean(self):
        assert detect_script("국립중앙박물관") == "korean"

    def test_detect_arabic(self):
        assert detect_script("المكتبة الوطنية") == "arabic"

    def test_detect_hebrew(self):
        assert detect_script("ארכיון") == "hebrew"

    def test_detect_greek(self):
        assert detect_script("Μουσείο") == "greek"

    def test_detect_devanagari(self):
        assert detect_script("राजस्थान") == "devanagari"

    def test_detect_thai(self):
        assert detect_script("สำนักหอจดหมายเหตุ") == "thai"
        assert detect_script("กรุงเทพ") == "thai"

    def test_detect_sinhala(self):
        assert detect_script("පේරාදෙණිය") == "sinhala"
        assert detect_script("ජාතික කෞතුකාගාර") == "sinhala"

    def test_detect_khmer(self):
        assert detect_script("សារមន្ទីរ") == "khmer"
        assert detect_script("ភ្នំពេញ") == "khmer"
|
||||
|
||||
|
||||
class TestCyrillicTransliteration:
    """Tests for Cyrillic (Russian/Ukrainian/etc.) transliteration."""

    def test_russian_basic(self):
        assert transliterate_cyrillic("Музей", "ru") == "Muzey"

    def test_russian_institute(self):
        result = transliterate_cyrillic("Институт восточных рукописей РАН", "ru")
        assert "Institut" in result
        assert "vostochnykh" in result

    def test_russian_hard_soft_signs(self):
        # Hard and soft signs should be removed, not transliterated.
        result = transliterate_cyrillic("объект", "ru")
        assert "ъ" not in result
        assert "ь" not in result

    def test_ukrainian(self):
        result = transliterate_cyrillic("Київ", "uk")
        # Should handle Ukrainian-specific letters; only the initial letter
        # is pinned here since romanization schemes vary.
        assert "K" in result or "k" in result
|
||||
|
||||
|
||||
class TestChineseTransliteration:
    """Tests for Chinese (Hanzi to Pinyin) transliteration."""

    def test_museum_vocabulary(self):
        result = transliterate_chinese("博物館")
        # Accepts either pinyin ("bo...") or a Japanese-style reading
        # ("haku...") for this shared CJK vocabulary.
        assert "bo" in result.lower() or "haku" in result.lower()

    def test_national_palace_museum(self):
        result = transliterate_chinese("故宮博物院")
        # Should contain pinyin for these characters
        assert len(result) > 0
        assert result != "故宮博物院"  # Should be transliterated

    def test_dongba_museum(self):
        result = transliterate_chinese("东巴文化博物院")
        assert "dong" in result.lower()
        assert "wen" in result.lower()
|
||||
|
||||
|
||||
class TestJapaneseTransliteration:
    """Tests for Japanese (Kanji/Kana to Romaji) transliteration."""

    def test_national_museum(self):
        result = transliterate_japanese("国立博物館")
        assert "koku" in result.lower()
        assert "ritsu" in result.lower()

    def test_tokyo_national_museum(self):
        result = transliterate_japanese("東京国立博物館")
        # Accept both long-vowel styles ("tou"/"kyou" vs "to"/"kyo").
        assert "tou" in result.lower() or "to" in result.lower()
        assert "kyou" in result.lower() or "kyo" in result.lower()

    def test_hiragana(self):
        assert transliterate_japanese("あいうえお") == "aiueo"

    def test_katakana(self):
        assert transliterate_japanese("アイウエオ") == "aiueo"
|
||||
|
||||
|
||||
class TestKoreanTransliteration:
    """Tests for Korean (Hangul to Revised Romanization) transliteration."""

    def test_national_museum(self):
        result = transliterate_korean("국립중앙박물관")
        # Should contain romanized syllables; both RR ("guk") and
        # McCune-Reischauer-style ("kuk") initials are accepted.
        assert len(result) > 0
        assert "guk" in result.lower() or "kuk" in result.lower()

    def test_simple_hangul(self):
        result = transliterate_korean("한글")
        assert "han" in result.lower()
|
||||
|
||||
|
||||
class TestArabicTransliteration:
    """Tests for Arabic script transliteration."""

    def test_national_library(self):
        result = transliterate_arabic("المكتبة الوطنية")
        # Accept both consonant-only ("mktb") and vowelized ("maktab") forms.
        assert "mktb" in result.lower() or "maktab" in result.lower()

    def test_basic_letters(self):
        result = transliterate_arabic("كتاب")
        assert "k" in result.lower()
        assert "t" in result.lower()
|
||||
|
||||
|
||||
class TestHebrewTransliteration:
    """Tests for Hebrew script transliteration."""

    def test_archive(self):
        result = transliterate_hebrew("ארכיון")
        # Should contain transliterated letters (exact scheme not pinned).
        assert len(result) > 0

    def test_basic_letters(self):
        result = transliterate_hebrew("שלום")
        assert "sh" in result.lower()
|
||||
|
||||
|
||||
class TestGreekTransliteration:
    """Tests for Greek script transliteration."""

    def test_museum(self):
        result = transliterate_greek("Μουσείο")
        # Accept both letter-for-letter ("Moyseio") and phonetic ("Mouseio").
        assert "Moyseio" in result or "Mouseio" in result

    def test_archaeological(self):
        result = transliterate_greek("Αρχαιολογικό")
        assert "Archaiologiko" in result
|
||||
|
||||
|
||||
class TestDevanagariTransliteration:
    """Tests for Devanagari (Hindi/Nepali) transliteration."""

    def test_rajasthan(self):
        result = transliterate_devanagari("राजस्थान")
        # ISO 15919 uses "aa" for long vowels, so "raaj" not "raj"
        assert "raaj" in result.lower() or "raj" in result.lower()

    def test_basic_consonants(self):
        result = transliterate_devanagari("क")
        assert "k" in result.lower()
|
||||
|
||||
|
||||
class TestThaiTransliteration:
    """Tests for Thai script transliteration (RTGS)."""

    def test_national_archives(self):
        # สำนักหอจดหมายเหตุแห่งชาติ = National Archives of Thailand
        result = transliterate_thai("สำนักหอจดหมายเหตุแห่งชาติ")
        assert "samnak" in result.lower()
        assert "haeng chat" in result.lower()

    def test_national_library(self):
        # สำนักหอสมุดแห่งชาติ = National Library of Thailand
        result = transliterate_thai("สำนักหอสมุดแห่งชาติ")
        assert "ho samut" in result.lower()

    def test_national_museum(self):
        # พิพิธภัณฑสถานแห่งชาติ พระนคร = Bangkok National Museum
        result = transliterate_thai("พิพิธภัณฑสถานแห่งชาติ พระนคร")
        assert "phiphitthaphan" in result.lower()
        assert "phra nakhon" in result.lower()

    def test_siam_society(self):
        # สยามสมาคมในพระบรมราชูปถัมภ์ = Siam Society
        result = transliterate_thai("สยามสมาคมในพระบรมราชูปถัมภ์")
        assert "sayam" in result.lower()
        assert "samakhom" in result.lower()

    def test_wat_temple(self):
        # วัดโพธิ์ราม = Wat Pho Ram
        result = transliterate_thai("วัดโพธิ์ราม")
        assert "wat" in result.lower()
        assert "pho" in result.lower()
        assert "ram" in result.lower()

    def test_empty_without_library(self):
        # Even without pythainlp, should return transliterated result (not empty)
        result = transliterate_thai("กรุงเทพ")
        # Should get 'krung thep' from vocabulary lookup
        assert len(result) > 0
|
||||
|
||||
|
||||
class TestSinhalaTransliteration:
    """Tests for Sinhala script transliteration (ISO 15919)."""

    def test_university_peradeniya(self):
        # පේරාදෙණිය විශ්වවිද් යාලය = University of Peradeniya
        result = transliterate_sinhala("පේරාදෙණිය විශ්වවිද් යාලය")
        assert "peradeniya" in result.lower()
        assert "vishvavid" in result.lower()

    def test_national_museums(self):
        # ජාතික කෞතුකාගාර දෙපාර්තමේන්තුව = Department of National Museums
        result = transliterate_sinhala("ජාතික කෞතුකාගාර දෙපාර්තමේන්තුව")
        assert "jathika" in result.lower()
        assert "kauthukagara" in result.lower()

    def test_basic_consonants(self):
        # Basic consonant test
        result = transliterate_sinhala("ක")  # ka
        assert "k" in result.lower()

    def test_output_not_empty(self):
        # Sinhala should never return empty string
        result = transliterate_sinhala("කොළඹ")  # Colombo
        assert len(result) > 0
|
||||
|
||||
|
||||
class TestKhmerTransliteration:
    """Tests for Khmer script transliteration (UNGEGN)."""

    def test_tuol_sleng(self):
        # សារមន្ទីរទួលស្លែង = Tuol Sleng Genocide Museum
        result = transliterate_khmer("សារមន្ទីរទួលស្លែង")
        assert "tuol sleng" in result.lower()

    def test_phnom_penh(self):
        # ភ្នំពេញ = Phnom Penh
        result = transliterate_khmer("ភ្នំពេញ")
        assert "phnom penh" in result.lower()

    def test_angkor(self):
        # អង្គរ = Angkor
        result = transliterate_khmer("អង្គរ")
        assert "angkor" in result.lower()

    def test_output_not_empty(self):
        # Khmer should never return empty string
        result = transliterate_khmer("សារមន្ទីរ")
        assert len(result) > 0
|
||||
|
||||
|
||||
class TestTransliterateForAbbreviation:
    """Tests for the main abbreviation function."""

    def test_russian_cleanup(self):
        result = transliterate_for_abbreviation("Институт восточных рукописей РАН", "ru")
        # Should be clean Latin text (ASCII, or at most spaces/hyphens/apostrophes).
        assert result.isascii() or all(c.isalnum() or c in " -'" for c in result)

    def test_chinese_cleanup(self):
        result = transliterate_for_abbreviation("东巴文化博物院", "zh")
        # Should be clean Latin text, or the "[REQUIRES ...]" review marker
        # when automatic transliteration is not possible.
        assert result.isascii() or "[REQUIRES" in result

    def test_korean_cleanup(self):
        result = transliterate_for_abbreviation("국립중앙박물관", "ko")
        assert result.isascii()

    def test_special_characters_removed(self):
        # Special characters should be removed for abbreviation
        result = transliterate_for_abbreviation("Test (Museum) & Gallery", "en")
        assert "&" not in result
        assert "(" not in result
|
||||
|
||||
|
||||
class TestIntegration:
    """Integration tests using the main transliterate function."""

    def test_auto_detect_russian(self):
        # No language argument: script must be auto-detected.
        result = transliterate("Музей")
        assert result.isascii()

    def test_auto_detect_korean(self):
        result = transliterate("박물관")
        assert result.isascii()

    def test_latin_passthrough(self):
        # Latin input must come back completely unchanged.
        result = transliterate("Rijksmuseum Amsterdam")
        assert result == "Rijksmuseum Amsterdam"

    def test_with_explicit_language(self):
        result = transliterate("故宮博物院", lang="zh")
        assert len(result) > 0
        # Should not be original Chinese (unless flagged for manual review).
        assert "故" not in result or "[REQUIRES" in result
|
||||
|
||||
|
||||
# Allow running this test module directly: python tests/test_transliteration.py
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
|
||||
Loading…
Reference in a new issue