- batch_crawl4ai_recrawl.py: Retry failed URL crawls
- batch_firecrawl_recrawl.py: FireCrawl batch processing
- batch_httpx_scrape.py: HTTPX-based scraping
- detect_name_mismatch.py: Find name mismatches in data
- enrich_dutch_custodians_crawl4ai.py: Dutch custodian enrichment
- fix_collision_victims.py: GHCID collision resolution
- fix_generic_platform_names*.py: Platform name cleanup
- fix_ghcid_type.py: GHCID type corrections
- fix_simon_kemper_contamination.py: Data cleanup
- scan_dutch_data_quality.py: Data quality scanning
- transform_crawl4ai_to_digital_platform.py: Data transformation
281 lines
9.2 KiB
Python
281 lines
9.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fix GHCID collision victim files.
|
|
|
|
These files have a trailing dash in their filename (e.g., NL-DR-ASS-L-BD-.yaml)
|
|
indicating they were collision victims whose internal GHCID was incorrectly set
|
|
to their collision partner's GHCID instead of getting their own unique GHCID.
|
|
|
|
This script:
|
|
1. Reads the institution's real name from original_entry.organisatie
|
|
2. Generates a proper name suffix from that name
|
|
3. Creates a new unique GHCID with the proper suffix
|
|
4. Regenerates all GHCID-derived identifiers (UUID, numeric)
|
|
5. Updates the file with correct identifiers
|
|
6. Renames the file to match the new GHCID
|
|
"""
|
|
|
|
import hashlib
|
|
import re
|
|
import shutil
|
|
import unicodedata
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import yaml
|
|
|
|
# Inputs to the UUID v5 derivation: the standard URL namespace plus the
# registry prefix that is prepended to a GHCID to form the hashed name.
GHCID_NAMESPACE = uuid.NAMESPACE_URL
GHCID_URL_PREFIX = "https://glam.registry/"

# Words skipped during abbreviation generation. Grouped by kind below;
# the result is a plain set, so the order of the groups is irrelevant.
SKIP_WORDS = {
    # Dutch articles, prepositions and conjunctions
    'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der',
    'des', 's', 'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over',
    'onder', 'door', 'en', 'of',
    # English articles and prepositions
    'the', 'a', 'an', 'at', 'on', 'to', 'for', 'with', 'from', 'by', 'as',
    # Legal-form and institution-type words
    'stichting', 'vereniging', 'foundation',
    'museum', 'bibliotheek', 'archief', 'collectie',
}
|
|
|
|
|
|
def normalize_diacritics(text: str) -> str:
    """Strip combining marks from *text*.

    The text is decomposed with Unicode NFD and every character whose
    category is ``Mn`` (nonspacing mark) is dropped, so an accented letter
    is reduced to its base letter. Characters without a decomposition are
    returned unchanged.
    """
    kept = []
    for char in unicodedata.normalize('NFD', text):
        if unicodedata.category(char) == 'Mn':
            continue
        kept.append(char)
    return ''.join(kept)
|
|
|
|
|
|
def generate_name_suffix(native_name: str) -> str:
    """Turn an institution name into a lowercase snake_case suffix.

    Diacritics are stripped and the name is lowercased, then a fixed
    sequence of substitutions is applied; leading and trailing underscores
    are removed from the result.

    Examples:
        "Biblionet Drenthe POI" → "biblionet_drenthe_poi"
        "Fries Verzetsmuseum" → "fries_verzetsmuseum"
        "Musée d'Orsay" → "musee_dorsay"
    """
    suffix = normalize_diacritics(native_name).lower()

    # Order matters: punctuation is deleted outright (so "d'orsay" joins up)
    # before whitespace/hyphen runs are turned into separators.
    substitutions = (
        (r"[''`\",.:;!?()[\]{}]", ''),   # drop quotes and punctuation
        (r'[\s\-]+', '_'),               # whitespace/hyphen runs -> underscore
        (r'[^a-z0-9_]', ''),             # drop anything else that is left
        (r'_+', '_'),                    # collapse repeated underscores
    )
    for pattern, replacement in substitutions:
        suffix = re.sub(pattern, replacement, suffix)

    return suffix.strip('_')
|
|
|
|
|
|
def generate_ghcid_uuid(ghcid: str) -> str:
    """Return the UUID v5 string for *ghcid*.

    The hashed name is ``GHCID_URL_PREFIX`` followed by the GHCID, in the
    ``GHCID_NAMESPACE`` namespace.
    """
    name = "{}{}".format(GHCID_URL_PREFIX, ghcid)
    derived = uuid.uuid5(GHCID_NAMESPACE, name)
    return str(derived)
|
|
|
|
|
|
def generate_ghcid_uuid_sha256(ghcid: str) -> str:
    """Return the secondary ("sha256") UUID string for *ghcid*.

    The value is a UUID v5 over ``GHCID_URL_PREFIX`` + ``"sha256/"`` + the
    GHCID, in the ``GHCID_NAMESPACE`` namespace.

    NOTE(review): despite the function name, no SHA-256 digest is computed
    here; ``uuid.uuid5`` is SHA-1 based. If the field is meant to hold a
    real SHA-256-derived UUID v8, this needs to be reconciled with however
    the rest of the project generates it - confirm before changing, since
    any change alters the stored identifiers.
    """
    name = "{}sha256/{}".format(GHCID_URL_PREFIX, ghcid)
    derived = uuid.uuid5(GHCID_NAMESPACE, name)
    return str(derived)
|
|
|
|
|
|
def generate_ghcid_numeric(ghcid: str) -> int:
    """Return an unsigned 64-bit integer derived from *ghcid*.

    The value is the first 8 bytes of the SHA-256 digest of the encoded
    GHCID, read as a big-endian integer (the same number as the first 16
    hex digits of the digest).
    """
    digest = hashlib.sha256(ghcid.encode()).digest()
    return int.from_bytes(digest[:8], byteorder='big')
|
|
|
|
|
|
def fix_collision_victim(file_path: Path, dry_run: bool = False) -> Optional[Path]:
    """Fix a single collision victim file.

    The new GHCID is the file stem (without trailing dashes) plus a suffix
    generated from ``original_entry.organisatie``. The ``ghcid`` section and
    the GHCID-related entries in ``identifiers`` are regenerated, the previous
    values are appended to ``ghcid_history``, and the file is renamed to
    ``<new GHCID>.yaml``.

    Every check that can abort the fix (unreadable file, missing name, empty
    suffix, rename target already present) runs before the file is touched,
    so a failure never leaves a half-updated file behind.

    Args:
        file_path: Path to the collision victim YAML file
        dry_run: If True, only print what would be done

    Returns:
        New file path after renaming, or None if skipped/failed
    """
    print(f"\n{'='*80}")
    print(f"Processing: {file_path.name}")
    print(f"{'='*80}")

    # Read file. UTF-8 is explicit because the file is written back with
    # allow_unicode=True and must not depend on the platform default.
    try:
        with open(file_path, encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
        print(f" ERROR: Could not read file: {e}")
        return None

    # An empty document loads as None; anything that is not a mapping
    # cannot be a custodian record either.
    if not isinstance(data, dict):
        print(" SKIP: File is empty or invalid")
        return None

    # Get institution name (a YAML null for original_entry is treated as missing)
    raw_name = _as_mapping(data.get('original_entry')).get('organisatie')
    if not raw_name:
        print(" ERROR: No organisatie found in original_entry")
        return None
    # YAML may load an all-digit name as a number; the suffix code needs text.
    org_name = raw_name if isinstance(raw_name, str) else str(raw_name)

    print(f" Institution: {org_name}")

    # Get current GHCID info
    ghcid_data = _as_mapping(data.get('ghcid'))
    old_ghcid = ghcid_data.get('ghcid_current') or ''
    print(f" Old GHCID: {old_ghcid}")

    # Extract base GHCID from filename (remove trailing dash)
    base_ghcid = file_path.stem.rstrip('-')
    print(f" Base GHCID: {base_ghcid}")

    # Generate new name suffix from institution name
    name_suffix = generate_name_suffix(org_name)
    print(f" Name suffix: {name_suffix}")

    # A name without any ASCII letters or digits produces an empty suffix,
    # which would recreate the very trailing-dash GHCID being repaired.
    if not name_suffix:
        print(f" ERROR: Could not derive a name suffix from '{org_name}'")
        return None

    # Create new GHCID
    new_ghcid = f"{base_ghcid}-{name_suffix}"
    print(f" New GHCID: {new_ghcid}")

    new_file_path = file_path.parent / f"{new_ghcid}.yaml"

    # Check if this would be the same as old (only filename is wrong)
    if new_ghcid == old_ghcid:
        if file_path.name == new_file_path.name:
            print(" SKIP: GHCID and filename both correct")
            return None

        print(" GHCID correct, but filename wrong - needs rename only")
        if dry_run:
            print(f" DRY RUN: Would rename to {new_file_path.name}")
            return None
        return _rename_victim_file(file_path, new_file_path)

    # Refuse before modifying anything: writing first and failing on the
    # rename afterwards would leave new content under the old filename.
    if new_file_path.exists():
        print(f" ERROR: Target file already exists: {new_file_path.name}")
        return None

    # Generate new identifiers
    new_uuid = generate_ghcid_uuid(new_ghcid)
    new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
    new_numeric = generate_ghcid_numeric(new_ghcid)

    print(f" New UUID: {new_uuid}")
    print(f" New numeric: {new_numeric}")

    if dry_run:
        print(f" DRY RUN: Would update file and rename to {new_ghcid}.yaml")
        return None

    # Update GHCID section
    timestamp = datetime.now(timezone.utc).isoformat()

    # Preserve old GHCID in history (tolerating a null/odd history value)
    existing_history = ghcid_data.get('ghcid_history')
    ghcid_history = list(existing_history) if isinstance(existing_history, list) else []

    # Add history entry for the fix
    ghcid_history.append({
        'ghcid': old_ghcid,
        'ghcid_uuid': ghcid_data.get('ghcid_uuid', ''),
        'ghcid_numeric': ghcid_data.get('ghcid_numeric', 0),
        'valid_from': ghcid_data.get('generated_at', ''),
        'valid_to': timestamp,
        'reason': f"Collision fix: had partner's GHCID, corrected to institution's own GHCID based on name '{org_name}'"
    })

    # NOTE(review): the section is replaced wholesale, so any other keys the
    # old 'ghcid' mapping carried are dropped - confirm that is intended.
    data['ghcid'] = {
        'ghcid_current': new_ghcid,
        'ghcid_uuid': new_uuid,
        'ghcid_uuid_sha256': new_uuid_sha256,
        'ghcid_numeric': new_numeric,
        'generated_at': timestamp,
        'ghcid_history': ghcid_history
    }

    # Update identifiers list; entries that are not mappings pass through untouched
    existing_identifiers = data.get('identifiers')
    identifiers = existing_identifiers if isinstance(existing_identifiers, list) else []
    updated_identifiers = []
    for ident in identifiers:
        if isinstance(ident, dict):
            scheme = ident.get('identifier_scheme', '')
            if scheme == 'GHCID':
                ident['identifier_value'] = new_ghcid
                ident['identifier_url'] = f"https://w3id.org/heritage/custodian/{new_ghcid}"
            elif scheme == 'GHCID_UUID':
                ident['identifier_value'] = new_uuid
            elif scheme == 'GHCID_NUMERIC':
                ident['identifier_value'] = str(new_numeric)
        updated_identifiers.append(ident)
    data['identifiers'] = updated_identifiers

    # Serialise first so a dump error cannot truncate the existing file,
    # then write updated data back to file
    serialized = yaml.dump(data, default_flow_style=False, allow_unicode=True, sort_keys=False)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(serialized)

    print(" Updated file content")

    # Rename file to match new GHCID
    return _rename_victim_file(file_path, new_file_path)


def _as_mapping(value) -> dict:
    """Return *value* when it is a dict, otherwise an empty dict (e.g. for YAML null)."""
    return value if isinstance(value, dict) else {}


def _rename_victim_file(file_path: Path, new_file_path: Path) -> Optional[Path]:
    """Move *file_path* to *new_file_path* without overwriting an existing file.

    Returns the new path, or None after printing an error when the target exists.
    """
    if new_file_path.exists():
        print(f" ERROR: Target file already exists: {new_file_path.name}")
        return None

    shutil.move(str(file_path), str(new_file_path))
    print(f" Renamed: {file_path.name} → {new_file_path.name}")
    return new_file_path
|
|
|
|
|
|
def main():
    """Command-line entry point: fix one file or every collision victim found.

    Without ``--file``, all files matching ``NL-*-.yaml`` in ``data/custodian``
    (relative to the current working directory) are processed. A summary of
    fixed, skipped and errored/empty files is printed at the end.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Fix GHCID collision victim files')
    parser.add_argument('--dry-run', action='store_true', help='Only show what would be done')
    parser.add_argument('--file', type=str, help='Process only this specific file')
    args = parser.parse_args()

    custodian_dir = Path('data/custodian')

    if args.file:
        files = [Path(args.file)]
    else:
        # Find all collision victim files (trailing dash pattern)
        files = sorted(custodian_dir.glob('NL-*-.yaml'))

    print(f"Found {len(files)} collision victim file(s)")

    fixed = 0
    skipped = 0
    errors = 0

    for f in files:
        # A mistyped --file path used to crash in f.stat() below after the
        # fixer had already reported the read error; count it as an error.
        if not f.is_file():
            print(f"\n ERROR: File not found: {f}")
            errors += 1
            continue

        result = fix_collision_victim(f, dry_run=args.dry_run)
        if result is not None:
            fixed += 1
            continue

        # Nothing was fixed: tell empty files (which should be removed)
        # apart from ordinary skips.
        try:
            is_empty = f.stat().st_size == 0
        except OSError:
            is_empty = False

        if is_empty:
            print(f"\n EMPTY FILE: {f.name} - should be deleted")
            errors += 1
        else:
            skipped += 1

    print(f"\n{'='*80}")
    print("SUMMARY")
    print(f"{'='*80}")
    print(f" Fixed: {fixed}")
    print(f" Skipped: {skipped}")
    print(f" Errors/Empty: {errors}")


if __name__ == '__main__':
    main()
|