#!/usr/bin/env python3
"""
Fix fake isil.org URLs in custodian files.

ISIL codes are identifiers, NOT URLs. The domain isil.org does not exist.
This script removes the fake identifier_url field for ISIL identifiers,
keeping only the identifier_value.

For countries with real ISIL registries, we could map to actual URLs:
- Switzerland: https://www.isil.nb.admin.ch/en/?isil={code}
- Germany: https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}
- Netherlands: https://isil.bibliotheek.nl/
- Austria: https://www.onb.ac.at/isil/

But these require country-specific logic. For now, we simply remove the
fake URLs.

Usage: run from the directory that contains ``data/custodian``. Pass
``--dry-run`` to count the files that would change without writing anything.
"""

import os
import sys
from datetime import datetime
from pathlib import Path

import yaml

# Substring used as a cheap whole-file pre-filter before any YAML parsing.
# NOTE(review): this is stricter than FAKE_ISIL_DOMAIN below, so a file whose
# only fake URL uses e.g. "http://" is never reached -- confirm that is intended.
FAKE_ISIL_URL_PREFIX = 'https://isil.org/'

# Substring that marks an individual identifier_url value as fake.
FAKE_ISIL_DOMAIN = 'isil.org/'


# Custom YAML representer to preserve formatting
class QuotedString(str):
    """String subclass that is dumped as a single-quoted YAML scalar.

    Nothing in this script constructs a QuotedString; the representer is
    only registered so that such values would be emitted single-quoted.
    """


def quoted_presenter(dumper, data):
    """Represent ``data`` as a single-quoted YAML string scalar."""
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style="'")


yaml.add_representer(QuotedString, quoted_presenter)


def _strip_fake_isil_urls(data, dry_run: bool) -> bool:
    """Remove fake ISIL URLs from ``data['identifiers']``; report if any matched.

    Entries are only mutated when ``dry_run`` is false, but a match is
    reported either way so that dry runs can count affected files.
    """
    identifiers = data.get('identifiers')
    if not isinstance(identifiers, list):
        # Missing, null, or unexpectedly shaped: nothing we can safely edit.
        return False

    matched = False
    for identifier in identifiers:
        if not isinstance(identifier, dict):
            continue
        if identifier.get('identifier_scheme') != 'ISIL':
            continue
        url = identifier.get('identifier_url', '')
        # Non-string values (null, numbers, ...) cannot be a fake URL.
        if isinstance(url, str) and FAKE_ISIL_DOMAIN in url:
            if not dry_run:
                # Remove the fake URL; identifier_value is left untouched.
                del identifier['identifier_url']
            matched = True
    return matched


def fix_file(filepath: Path, dry_run: bool = False) -> bool:
    """Fix fake isil.org URLs in a single file.

    Args:
        filepath: YAML file to inspect and, unless ``dry_run``, rewrite.
        dry_run: When true, detect matches but leave the file untouched.

    Returns:
        True if at least one ISIL identifier carried a fake URL (removed,
        or would be removed in a dry run); False otherwise, including when
        the file cannot be parsed or is not a YAML mapping.

    Note:
        A rewritten file is re-serialized by ``yaml.dump`` from the parsed
        data, so comments and original formatting are not preserved.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Quick check if file contains fake isil.org URL
    if FAKE_ISIL_URL_PREFIX not in content:
        return False

    # Parse YAML
    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as e:
        print(f"  ERROR parsing {filepath}: {e}")
        return False

    if not isinstance(data, dict):
        print(f"  ERROR parsing {filepath}: top-level YAML is not a mapping")
        return False

    # ISIL values under wikidata_enrichment.wikidata_identifiers carry no
    # URL, so only the top-level identifiers list needs checking.
    modified = _strip_fake_isil_urls(data, dry_run)

    if modified and not dry_run:
        # Serialize before opening the file for writing: a dump failure then
        # cannot leave a truncated file behind.
        serialized = yaml.dump(
            data,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)

    return modified


def main():
    """Main routine.

    Scans ``data/custodian/*.yaml`` for the fake URL prefix, runs
    ``fix_file`` on every match and prints a summary. ``--dry-run`` on the
    command line disables writing. Exits with status 1 if the directory is
    missing.
    """
    dry_run = '--dry-run' in sys.argv

    print("=" * 70)
    print("Fix Fake ISIL.org URLs")
    print("=" * 70)
    print(f"Timestamp: {datetime.now().isoformat()}")
    print(f"Mode: {'DRY RUN' if dry_run else 'LIVE'}")
    print()

    custodian_dir = Path("data/custodian")

    if not custodian_dir.is_dir():
        print("ERROR: data/custodian directory not found")
        sys.exit(1)

    fixed_count = 0
    error_count = 0

    # Find all files with fake isil.org URLs
    files_to_fix = []

    print("Scanning for files with fake isil.org URLs...")

    for filepath in custodian_dir.glob("*.yaml"):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            # One unreadable file must not abort the whole scan.
            print(f"  ERROR: {filepath}: {e}")
            error_count += 1
            continue
        if FAKE_ISIL_URL_PREFIX in content:
            files_to_fix.append(filepath)

    print(f"Found {len(files_to_fix)} files with fake URLs")
    print()

    if not files_to_fix:
        print("No files to fix!")
        return

    # Process files
    for i, filepath in enumerate(files_to_fix):
        if i % 500 == 0:
            print(f"[{i}/{len(files_to_fix)}] Processing...")

        try:
            if fix_file(filepath, dry_run):
                fixed_count += 1
        except Exception as e:
            # Top-level boundary: report and keep going with the other files.
            print(f"  ERROR: {filepath}: {e}")
            error_count += 1

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files scanned: {len(files_to_fix)}")
    print(f"Files {'would be ' if dry_run else ''}fixed: {fixed_count}")
    print(f"Errors: {error_count}")

    if dry_run:
        print()
        print("Run without --dry-run to apply fixes")


if __name__ == "__main__":
    main()