glam/scripts/fix_fake_isil_urls.py
2025-12-09 08:02:27 +01:00

143 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""
Fix fake isil.org URLs in custodian files.
ISIL codes are identifiers, NOT URLs. The domain isil.org does not exist.
This script removes the fake identifier_url field for ISIL identifiers,
keeping only the identifier_value.
For countries with real ISIL registries, we could map to actual URLs:
- Switzerland: https://www.isil.nb.admin.ch/en/?isil={code}
- Germany: https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}
- Netherlands: https://isil.bibliotheek.nl/
- Austria: https://www.onb.ac.at/isil/
But these require country-specific logic.
For now, we simply remove the fake URLs.
"""
import os
import sys
import yaml
from pathlib import Path
from datetime import datetime
# Custom YAML representer to preserve formatting
class QuotedString(str):
    """Marker ``str`` subclass with no added behavior.

    It exists only so a YAML representer can be registered for this type,
    letting selected strings be emitted with single-quote style.
    """
def quoted_presenter(dumper, data):
    """Represent *data* as a single-quoted YAML string scalar.

    Delegates to ``dumper.represent_scalar`` with the standard YAML string
    tag and the single-quote style, and returns whatever that call returns.
    """
    string_tag = 'tag:yaml.org,2002:str'
    single_quote_style = "'"
    return dumper.represent_scalar(string_tag, data, style=single_quote_style)
# Register quoted_presenter as the YAML representer for QuotedString values,
# so dumping one goes through the single-quote presenter.
yaml.add_representer(QuotedString, quoted_presenter)
def fix_file(filepath: Path, dry_run: bool = False) -> bool:
    """Remove fake ``isil.org`` URLs from the ISIL identifiers of one YAML file.

    The file is parsed with ``yaml.safe_load``. Every mapping in the
    top-level ``identifiers`` list whose ``identifier_scheme`` is ``'ISIL'``
    and whose ``identifier_url`` contains ``'isil.org/'`` loses its
    ``identifier_url`` key. All other keys (including ``identifier_value``)
    are left untouched.

    Args:
        filepath: Path of the YAML file to inspect.
        dry_run: When true, only report whether the file would change;
            nothing is written.

    Returns:
        True if at least one fake URL was found (and, unless ``dry_run``,
        removed and written back). False if the file contains no fake URL,
        cannot be parsed, or is not a mapping with an ``identifiers`` list.

    Note:
        The file is rewritten with ``yaml.dump``, so comments and the original
        formatting of a modified file are not preserved.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Cheap substring test so unaffected files are never parsed.
    if 'https://isil.org/' not in content:
        return False

    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as e:
        print(f"  ERROR parsing {filepath}: {e}")
        return False

    # An empty document loads as None, and a YAML document may also be a list
    # or scalar at top level; none of those can carry an identifiers list.
    if not isinstance(data, dict):
        print(f"  SKIP {filepath}: top-level YAML value is not a mapping")
        return False

    identifiers = data.get('identifiers')
    if not isinstance(identifiers, list):
        return False

    modified = False
    for identifier in identifiers:
        # Tolerate malformed entries (null, plain strings) instead of crashing.
        if not isinstance(identifier, dict):
            continue
        if identifier.get('identifier_scheme') != 'ISIL':
            continue
        url = identifier.get('identifier_url')
        if isinstance(url, str) and 'isil.org/' in url:
            if not dry_run:
                del identifier['identifier_url']
            # Flag the hit in dry-run mode too, so the caller can count the
            # files that would be fixed.
            modified = True

    if modified and not dry_run:
        # Serialize completely before opening the file for writing: opening
        # with 'w' truncates, so a failure inside yaml.dump would otherwise
        # leave an empty or partial file behind.
        output = yaml.dump(
            data,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(output)

    return modified
def main():
    """Scan ``data/custodian`` for fake isil.org URLs and fix the affected files.

    Looks at every ``*.yaml`` file directly inside ``data/custodian``
    (relative to the current working directory), selects those whose text
    contains ``https://isil.org/`` and passes each to ``fix_file``. Progress
    and a summary are printed to stdout.

    Pass ``--dry-run`` on the command line to report what would change
    without writing anything.

    Raises:
        SystemExit: With status 1 when ``data/custodian`` does not exist.
    """
    dry_run = '--dry-run' in sys.argv

    print("=" * 70)
    print("Fix Fake ISIL.org URLs")
    print("=" * 70)
    print(f"Timestamp: {datetime.now().isoformat()}")
    print(f"Mode: {'DRY RUN' if dry_run else 'LIVE'}")
    print()

    custodian_dir = Path("data/custodian")
    if not custodian_dir.exists():
        print("ERROR: data/custodian directory not found")
        sys.exit(1)

    files_to_fix = []
    error_count = 0

    print("Scanning for files with fake isil.org URLs...")
    # sorted() makes the processing order (and the progress output) repeatable.
    for filepath in sorted(custodian_dir.glob("*.yaml")):
        # One unreadable or non-UTF-8 file must not abort the whole scan;
        # report it, count it as an error, and move on.
        try:
            content = filepath.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            print(f"  ERROR: {filepath}: {e}")
            error_count += 1
            continue
        if 'https://isil.org/' in content:
            files_to_fix.append(filepath)

    print(f"Found {len(files_to_fix)} files with fake URLs")
    print()

    if not files_to_fix:
        print("No files to fix!")
        if error_count:
            print(f"Errors: {error_count}")
        return

    fixed_count = 0
    for i, filepath in enumerate(files_to_fix):
        if i % 500 == 0:
            print(f"[{i}/{len(files_to_fix)}] Processing...")
        # Top-level boundary: log any per-file failure and keep going so the
        # remaining files are still processed.
        try:
            if fix_file(filepath, dry_run):
                fixed_count += 1
        except Exception as e:
            print(f"  ERROR: {filepath}: {e}")
            error_count += 1

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files scanned: {len(files_to_fix)}")
    print(f"Files {'would be ' if dry_run else ''}fixed: {fixed_count}")
    print(f"Errors: {error_count}")

    if dry_run:
        print()
        print("Run without --dry-run to apply fixes")
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()