glam/scripts/fix_collision_victims.py
kempersc 0c36429257 feat(scripts): Add batch crawling and data quality scripts
- batch_crawl4ai_recrawl.py: Retry failed URL crawls
- batch_firecrawl_recrawl.py: FireCrawl batch processing
- batch_httpx_scrape.py: HTTPX-based scraping
- detect_name_mismatch.py: Find name mismatches in data
- enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment
- fix_collision_victims.py: GHCID collision resolution
- fix_generic_platform_names*.py: Platform name cleanup
- fix_ghcid_type.py: GHCID type corrections
- fix_simon_kemper_contamination.py: Data cleanup
- scan_dutch_data_quality.py: Data quality scanning
- transform_crawl4ai_to_digital_platform.py: Data transformation
2025-12-15 01:47:46 +01:00

281 lines
9.2 KiB
Python

#!/usr/bin/env python3
"""
Fix GHCID collision victim files.
These files have a trailing dash in their filename (e.g., NL-DR-ASS-L-BD-.yaml)
indicating they were collision victims whose internal GHCID was incorrectly set
to their collision partner's GHCID instead of getting their own unique GHCID.
This script:
1. Reads the institution's real name from original_entry.organisatie
2. Generates a proper name suffix from that name
3. Creates a new unique GHCID with the proper suffix
4. Regenerates all GHCID-derived identifiers (UUID, numeric)
5. Updates the file with correct identifiers
6. Renames the file to match the new GHCID
"""
import hashlib
import re
import shutil
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import yaml
# Namespace and URL prefix combined with a GHCID to derive deterministic UUIDs.
GHCID_NAMESPACE = uuid.NAMESPACE_URL
GHCID_URL_PREFIX = "https://glam.registry/"

# Skip words for abbreviation generation (Dutch and common English words).
# Words are grouped by kind for readability; a word listed in more than one
# group (e.g. 'of', 'in') simply collapses to a single set member.
SKIP_WORDS = {
    # Dutch articles, prepositions and conjunctions
    *"de het een van voor in op te den der des s aan bij met naar om tot".split(),
    *"uit over onder door en of".split(),
    # Organisation-form words
    *"stichting vereniging foundation".split(),
    # English articles and prepositions
    *"the a an of in at on to for with from by as".split(),
    # Institution-type words
    *"museum bibliotheek archief collectie".split(),
}
def normalize_diacritics(text: str) -> str:
    """Strip combining diacritical marks from *text*.

    The text is decomposed to Unicode NFD form and every character whose
    Unicode category is ``Mn`` (nonspacing mark) is dropped. Characters that
    have no canonical decomposition (for example ``ø``) pass through unchanged.
    """
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(
        filter(lambda ch: unicodedata.category(ch) != 'Mn', decomposed)
    )
def generate_name_suffix(native_name: str) -> str:
    """Convert a native-language institution name to a snake_case suffix.

    The name is stripped of diacritics, lowercased, and then passed through a
    fixed sequence of regex substitutions. Any character that is not
    ``a-z``, ``0-9`` or ``_`` after those steps is removed, so the result may
    be an empty string when the name contains no such characters.

    Examples:
        "Biblionet Drenthe POI" -> "biblionet_drenthe_poi"
        "Fries Verzetsmuseum"   -> "fries_verzetsmuseum"
        "Musée d'Orsay"         -> "musee_dorsay"
    """
    text = normalize_diacritics(native_name).lower()

    # Applied strictly in this order.
    substitutions = (
        # Drop apostrophes, commas, and other punctuation
        (r"[''`\",.:;!?()[\]{}]", ''),
        # Runs of whitespace/hyphens become a single underscore
        (r'[\s\-]+', '_'),
        # Drop anything that is still not alphanumeric or underscore
        (r'[^a-z0-9_]', ''),
        # Collapse repeated underscores
        (r'_+', '_'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # No leading/trailing underscores in the final suffix.
    return text.strip('_')
def generate_ghcid_uuid(ghcid: str) -> str:
    """Return the UUID v5 for *ghcid* as a string.

    The UUID name is ``GHCID_URL_PREFIX`` followed by the GHCID, hashed within
    ``GHCID_NAMESPACE``; the same GHCID therefore always yields the same UUID.
    """
    ghcid_url = f"{GHCID_URL_PREFIX}{ghcid}"
    derived = uuid.uuid5(GHCID_NAMESPACE, ghcid_url)
    return str(derived)
def generate_ghcid_uuid_sha256(ghcid: str) -> str:
    """Return the secondary ("sha256") UUID for *ghcid* as a string.

    Despite the name, this is computed with ``uuid.uuid5`` (a UUID v5) over
    ``GHCID_URL_PREFIX`` + ``"sha256/"`` + the GHCID; no SHA-256 digest is
    taken here.

    NOTE(review): the name suggests a SHA-256 based UUID (v8) was intended.
    Confirm against the project's canonical GHCID generator before changing
    this, since any change alters every derived identifier.
    """
    sha256_url = f"{GHCID_URL_PREFIX}sha256/{ghcid}"
    derived = uuid.uuid5(GHCID_NAMESPACE, sha256_url)
    return str(derived)
def generate_ghcid_numeric(ghcid: str) -> int:
    """Return an unsigned 64-bit integer derived from *ghcid*.

    The value is the first 8 bytes of the SHA-256 digest of the encoded
    GHCID, read as a big-endian integer (equivalent to parsing the first 16
    hex digits of the hex digest).
    """
    digest = hashlib.sha256(ghcid.encode()).digest()
    return int.from_bytes(digest[:8], 'big')
def fix_collision_victim(file_path: Path, dry_run: bool = False) -> Optional[Path]:
    """Fix a single collision victim file.

    The new GHCID is the file stem (without trailing dashes) plus a suffix
    derived from ``original_entry.organisatie``. The ``ghcid`` section and the
    GHCID-derived entries in ``identifiers`` are regenerated, the previous
    values are appended to ``ghcid_history``, and the file is renamed to
    ``<new GHCID>.yaml``.

    Nothing is written when the rename target already exists, so a name clash
    never leaves a half-fixed file behind.

    Args:
        file_path: Path to the collision victim YAML file
        dry_run: If True, only print what would be done

    Returns:
        New file path after renaming, or None if skipped/failed
    """
    separator = '=' * 80
    print(f"\n{separator}")
    print(f"Processing: {file_path.name}")
    print(separator)

    # Read file. Explicit UTF-8 so the result does not depend on the locale.
    try:
        with open(file_path, encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        # Best-effort batch processing: report and move on to the next file.
        print(f" ERROR: Could not read file: {e}")
        return None

    if data is None:
        print(" SKIP: File is empty or invalid")
        return None
    if not isinstance(data, dict):
        print(" ERROR: Top-level YAML content is not a mapping")
        return None

    # Get institution name (tolerate a missing or null 'original_entry')
    original_entry = data.get('original_entry') or {}
    org_name = (
        original_entry.get('organisatie')
        if isinstance(original_entry, dict)
        else None
    )
    if not org_name:
        print(" ERROR: No organisatie found in original_entry")
        return None
    print(f" Institution: {org_name}")

    # Get current GHCID info (tolerate a missing or null 'ghcid' section)
    ghcid_data = data.get('ghcid') or {}
    old_ghcid = ghcid_data.get('ghcid_current', '')
    print(f" Old GHCID: {old_ghcid}")

    # Extract base GHCID from filename (remove trailing dash)
    base_ghcid = file_path.stem.rstrip('-')
    print(f" Base GHCID: {base_ghcid}")

    # Generate new name suffix from institution name
    name_suffix = generate_name_suffix(org_name)
    print(f" Name suffix: {name_suffix}")
    if not name_suffix:
        # An empty suffix would recreate the trailing-dash GHCID being fixed.
        print(f" ERROR: Could not derive a name suffix from '{org_name}'")
        return None

    # Create new GHCID
    new_ghcid = f"{base_ghcid}-{name_suffix}"
    print(f" New GHCID: {new_ghcid}")
    new_file_path = file_path.parent / f"{new_ghcid}.yaml"

    # Check if this would be the same as old (only filename is wrong)
    if new_ghcid == old_ghcid:
        if file_path.name == new_file_path.name:
            print(" SKIP: GHCID and filename both correct")
            return None
        print(" GHCID correct, but filename wrong - needs rename only")
        if new_file_path.exists():
            print(f" ERROR: Target file already exists: {new_file_path.name}")
            return None
        if dry_run:
            print(f" DRY RUN: Would rename to {new_file_path.name}")
            return None
        shutil.move(str(file_path), str(new_file_path))
        print(f" Renamed: {file_path.name} -> {new_file_path.name}")
        return new_file_path

    # Generate new identifiers
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)
    print(f" New UUID: {new_uuid}")
    print(f" New numeric: {new_numeric}")

    # Refuse BEFORE touching the file: rewriting the content and then failing
    # to rename would leave a file whose name and GHCID disagree.
    if new_file_path.exists():
        print(f" ERROR: Target file already exists: {new_file_path.name}")
        return None

    if dry_run:
        print(f" DRY RUN: Would update file and rename to {new_ghcid}.yaml")
        return None

    # Update GHCID section
    timestamp = datetime.now(timezone.utc).isoformat()

    # Preserve old GHCID in history (tolerate a missing or null history)
    ghcid_history = list(ghcid_data.get('ghcid_history') or [])
    ghcid_history.append({
        'ghcid': old_ghcid,
        'ghcid_uuid': ghcid_data.get('ghcid_uuid', ''),
        'ghcid_numeric': ghcid_data.get('ghcid_numeric', 0),
        'valid_from': ghcid_data.get('generated_at', ''),
        'valid_to': timestamp,
        'reason': f"Collision fix: had partner's GHCID, corrected to institution's own GHCID based on name '{org_name}'"
    })

    # NOTE(review): this replaces the whole 'ghcid' mapping, so any keys other
    # than the ones listed here are dropped - confirm that is intended.
    data['ghcid'] = {
        'ghcid_current': new_ghcid,
        'ghcid_uuid': new_uuid,
        'ghcid_uuid_sha256': new_uuid_sha256,
        'ghcid_numeric': new_numeric,
        'generated_at': timestamp,
        'ghcid_history': ghcid_history
    }

    # Update identifiers list; entries with other schemes are kept unchanged.
    updated_identifiers = []
    for ident in data.get('identifiers') or []:
        if isinstance(ident, dict):
            scheme = ident.get('identifier_scheme', '')
            if scheme == 'GHCID':
                ident['identifier_value'] = new_ghcid
                ident['identifier_url'] = f"https://w3id.org/heritage/custodian/{new_ghcid}"
            elif scheme == 'GHCID_UUID':
                ident['identifier_value'] = new_uuid
            elif scheme == 'GHCID_NUMERIC':
                ident['identifier_value'] = str(new_numeric)
        updated_identifiers.append(ident)
    data['identifiers'] = updated_identifiers

    # Write updated data back to the original path first, then rename. If the
    # rename is interrupted, a re-run lands in the "rename only" branch above.
    # UTF-8 is required because allow_unicode=True emits non-ASCII text as-is.
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print(" Updated file content")

    # Rename file to match new GHCID
    shutil.move(str(file_path), str(new_file_path))
    print(f" Renamed: {file_path.name} -> {new_file_path.name}")
    return new_file_path
def main():
    """Command-line entry point.

    Processes either the single file given with ``--file`` or every
    ``NL-*-.yaml`` file in ``data/custodian`` (relative to the current working
    directory), then prints a summary. With ``--dry-run`` nothing is modified.

    A file counts as "fixed" when ``fix_collision_victim`` returns a new path.
    Otherwise it counts under "Errors/Empty" when it is empty or cannot be
    stat'ed, and under "Skipped" in every other case.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fix GHCID collision victim files')
    parser.add_argument('--dry-run', action='store_true', help='Only show what would be done')
    parser.add_argument('--file', type=str, help='Process only this specific file')
    args = parser.parse_args()

    custodian_dir = Path('data/custodian')
    if args.file:
        files = [Path(args.file)]
    else:
        # Find all collision victim files (trailing dash pattern)
        files = sorted(custodian_dir.glob('NL-*-.yaml'))
    print(f"Found {len(files)} collision victim file(s)")

    fixed = 0
    skipped = 0
    errors = 0
    for f in files:
        result = fix_collision_victim(f, dry_run=args.dry_run)
        if result is not None:
            fixed += 1
            continue

        # Not fixed: distinguish empty/missing files from ordinary skips.
        try:
            is_empty = f.stat().st_size == 0
        except OSError:
            # E.g. --file pointed at a path that does not exist; the read
            # error was already reported, so count it instead of crashing.
            errors += 1
            continue

        if is_empty:
            print(f"\n EMPTY FILE: {f.name} - should be deleted")
            errors += 1
        else:
            skipped += 1

    separator = '=' * 80
    print(f"\n{separator}")
    print("SUMMARY")
    print(separator)
    print(f" Fixed: {fixed}")
    print(f" Skipped: {skipped}")
    print(f" Errors/Empty: {errors}")


if __name__ == '__main__':
    main()