143 lines
4.1 KiB
Python
143 lines
4.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fix fake isil.org URLs in custodian files.
|
|
|
|
ISIL codes are identifiers, NOT URLs. The domain isil.org does not exist.
|
|
This script removes the fake identifier_url field for ISIL identifiers,
|
|
keeping only the identifier_value.
|
|
|
|
For countries with real ISIL registries, we could map to actual URLs:
|
|
- Switzerland: https://www.isil.nb.admin.ch/en/?isil={code}
|
|
- Germany: https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}
|
|
- Netherlands: https://isil.bibliotheek.nl/
|
|
- Austria: https://www.onb.ac.at/isil/
|
|
But these require country-specific logic.
|
|
|
|
For now, we simply remove the fake URLs.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Custom YAML representer to preserve formatting
|
|
class QuotedString(str):
    """Marker ``str`` subclass with no behaviour of its own.

    It exists only so that a dedicated YAML representer can be attached to
    it: values wrapped in this type are told apart from ordinary strings by
    their class.
    """
|
|
|
|
def quoted_presenter(dumper, data):
    """Represent *data* as a single-quoted YAML string scalar.

    Delegates to ``dumper.represent_scalar`` using the standard YAML string
    tag and the single-quote style, and returns whatever that call returns.
    """
    yaml_str_tag = 'tag:yaml.org,2002:str'
    return dumper.represent_scalar(yaml_str_tag, data, style="'")
|
|
|
|
# Module-level side effect: tell PyYAML to serialise QuotedString instances
# through quoted_presenter (i.e. as single-quoted scalars).
yaml.add_representer(QuotedString, quoted_presenter)
|
|
|
|
def fix_file(filepath: Path, dry_run: bool = False) -> bool:
    """Remove fake ``isil.org`` URLs from the ISIL identifiers of one YAML file.

    The file is parsed only when its raw text contains ``https://isil.org/``.
    Each mapping in the top-level ``identifiers`` list whose
    ``identifier_scheme`` is ``'ISIL'`` and whose ``identifier_url`` contains
    ``isil.org/`` has its ``identifier_url`` key deleted; every other key
    (including ``identifier_value``) is left as is.

    Args:
        filepath: YAML file to inspect and, if needed, rewrite in place.
        dry_run: When true, only report whether the file would change;
            nothing is written.

    Returns:
        True if at least one fake URL was found (and, unless ``dry_run``,
        removed and the file rewritten). False if the file needs no change,
        cannot be parsed as YAML, or does not have the expected structure.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Cheap substring test first so untouched files are never YAML-parsed.
    if 'https://isil.org/' not in content:
        return False

    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as e:
        print(f"  ERROR parsing {filepath}: {e}")
        return False

    # safe_load may return any YAML node type (list, scalar, None); only a
    # mapping can carry an 'identifiers' key.
    if not isinstance(data, dict):
        return False

    identifiers = data.get('identifiers')
    if not isinstance(identifiers, list):
        return False

    modified = False
    for identifier in identifiers:
        # Tolerate malformed entries instead of aborting the whole file.
        if not isinstance(identifier, dict):
            continue
        if identifier.get('identifier_scheme') != 'ISIL':
            continue
        url = identifier.get('identifier_url')
        if not (isinstance(url, str) and 'isil.org/' in url):
            continue
        if not dry_run:
            del identifier['identifier_url']
        # Set in dry-run mode too, so callers can count would-be fixes.
        modified = True

    # ISIL entries under wikidata_enrichment.wikidata_identifiers hold only a
    # value, no URL, so nothing needs to be done there.

    if modified and not dry_run:
        # Serialise to a string before opening the file for writing: if
        # dumping fails, the original file has not been truncated yet.
        # NOTE: the document is re-serialised as a whole, so YAML comments
        # and the original quoting/layout are not preserved.
        rendered = yaml.dump(
            data, default_flow_style=False, allow_unicode=True, sort_keys=False
        )
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(rendered)

    return modified
|
|
|
|
|
|
def main():
    """Scan ``data/custodian`` and strip fake isil.org URLs from its YAML files.

    Passing ``--dry-run`` on the command line reports what would change
    without writing anything. The run has two phases: a scan that collects
    every ``*.yaml`` file whose text contains ``https://isil.org/``, then a
    pass that calls ``fix_file`` on each collected file. A file that cannot
    be read or processed is reported and counted as an error; it never aborts
    the batch.

    Exits with status 1 if the ``data/custodian`` directory (relative to the
    current working directory) does not exist.
    """
    dry_run = '--dry-run' in sys.argv

    print("=" * 70)
    print("Fix Fake ISIL.org URLs")
    print("=" * 70)
    print(f"Timestamp: {datetime.now().isoformat()}")
    print(f"Mode: {'DRY RUN' if dry_run else 'LIVE'}")
    print()

    custodian_dir = Path("data/custodian")

    if not custodian_dir.exists():
        print("ERROR: data/custodian directory not found")
        sys.exit(1)

    files_to_fix = []
    scanned_count = 0
    error_count = 0

    print("Scanning for files with fake isil.org URLs...")

    # Sorted so that progress output and processing order are reproducible.
    for filepath in sorted(custodian_dir.glob("*.yaml")):
        scanned_count += 1
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            # One unreadable or non-UTF-8 file must not abort the whole scan.
            print(f"  ERROR: {filepath}: {e}")
            error_count += 1
            continue
        if 'https://isil.org/' in content:
            files_to_fix.append(filepath)

    print(f"Found {len(files_to_fix)} files with fake URLs")
    print()

    if not files_to_fix:
        print("No files to fix!")
        if error_count:
            print(f"Errors: {error_count}")
        return

    fixed_count = 0

    for i, filepath in enumerate(files_to_fix):
        if i % 500 == 0:
            print(f"[{i}/{len(files_to_fix)}] Processing...")

        # Deliberately broad: this is the per-file boundary of a batch job,
        # and every failure is reported and counted rather than swallowed.
        try:
            if fix_file(filepath, dry_run):
                fixed_count += 1
        except Exception as e:
            print(f"  ERROR: {filepath}: {e}")
            error_count += 1

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    # Number of files actually examined, not just the ones that matched.
    print(f"Files scanned: {scanned_count}")
    print(f"Files {'would be ' if dry_run else ''}fixed: {fixed_count}")
    print(f"Errors: {error_count}")

    if dry_run:
        print()
        print("Run without --dry-run to apply fixes")


if __name__ == "__main__":
    main()
|