glam/scripts/add_valid_isil_urls_fast.py
2025-12-09 10:46:43 +01:00

178 lines
6.1 KiB
Python

#!/usr/bin/env python3
"""
Add valid country-specific ISIL registry URLs to custodian files.
Fast version that only processes files matching country prefix.
ISIL Registry URL Patterns by Country:
- CH (Switzerland): https://www.isil.nb.admin.ch/en/?isil={code}
- DE (Germany): https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}
- AT (Austria): https://sigel.staatsbibliothek-berlin.de/en/isil/{code} (managed by Germany)
- JP (Japan): No individual lookup URL (CSV downloads only from NDL)
- BE (Belgium): http://isil.kbr.be/ (libraries only, no direct lookup)
- NL (Netherlands): No public lookup URL found
- BY (Belarus): No public lookup URL (list only at nlb.by)
- BG (Bulgaria): No public lookup URL (managed by National Library)
- CZ (Czech): No public lookup URL (managed by nkp.cz)
Author: GLAM Project
Date: 2025-12-09
"""
import os
import re
import sys
from pathlib import Path
from typing import Optional
import yaml
# Maps an ISIL country prefix to the URL template of that country's registry.
# A template carries a single ``{code}`` placeholder for the full ISIL code;
# ``None`` marks a known country whose registry has no per-record lookup page.
ISIL_URL_PATTERNS = {
    # --- registries with a resolvable per-code lookup URL ---
    'CH': 'https://www.isil.nb.admin.ch/en/?isil={code}',
    'DE': 'https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}',
    'AT': 'https://sigel.staatsbibliothek-berlin.de/en/isil/{code}',

    # --- registries without an individual lookup URL ---
    'JP': None,
    'BE': None,
    'NL': None,
    'BY': None,
    'BG': None,
    'CZ': None,
}
def get_country_from_isil(isil_code: str) -> Optional[str]:
    """Return the two-letter prefix of an ISIL code, or None.

    The prefix is the pair of upper-case ASCII letters that precedes the
    first hyphen (``'CH-000001-0'`` -> ``'CH'``). Empty/None input and codes
    that do not start with such a prefix yield None.
    """
    # Guard first so falsy input never reaches the regex engine.
    if not isil_code:
        return None

    prefix = re.match(r'^([A-Z]{2})-', isil_code)
    return prefix.group(1) if prefix else None
def get_isil_url(isil_code: str) -> Optional[str]:
    """Build the registry lookup URL for *isil_code*, if one exists.

    The country prefix is taken from the code via ``get_country_from_isil``
    and looked up in ``ISIL_URL_PATTERNS``. None is returned when the code
    has no recognisable prefix, the country is not listed, or the country is
    listed with no lookup template.
    """
    country = get_country_from_isil(isil_code)
    template = ISIL_URL_PATTERNS.get(country) if country else None
    if not template:
        return None
    # The full ISIL code (prefix included) fills the {code} placeholder.
    return template.format(code=isil_code)
# Matches one ISIL identifier entry: the scheme line followed by its value
# line. Group 1 is the value line's indentation (reused for the URL line),
# group 2 the ISIL code, possibly with a trailing quote that is stripped later.
_ISIL_ENTRY_RE = re.compile(
    r'identifier_scheme: ISIL\s*\n([ \t]*)identifier_value: [\'"]?([A-Z]{2}-[^\s\n]+)'
)


def _find_entry_url_line(content: str, line_end: int, indent: str) -> Optional[tuple]:
    """Locate the ``identifier_url:`` key that belongs to one identifier entry.

    Scans the lines after the ``identifier_value`` line (which ends at offset
    *line_end*) for a key at exactly the same indentation as that line.
    The scan stops at the first line that is indented less than the entry or
    that starts a new ``identifier_scheme:``; blank and comment lines are
    skipped.

    Returns:
        ``(start, stop)`` offsets of the URL line's text (newlines excluded),
        or None when the entry has no ``identifier_url`` key.
    """
    width = len(indent)
    size = len(content)
    pos = line_end
    while pos < size:
        start = pos + 1
        stop = content.find('\n', start)
        if stop == -1:
            stop = size
        line = content[start:stop]
        body = line.lstrip(' \t')
        if body and not body.startswith('#'):
            depth = len(line) - len(body)
            # A shallower line or a new scheme means the entry is over.
            if depth < width or body.startswith('identifier_scheme:'):
                break
            if depth == width and body.startswith('identifier_url:'):
                return start, stop
        pos = stop
    return None


def process_file_fast(filepath: Path, dry_run: bool = True) -> dict:
    """Add or repair the registry URL of the ISIL identifier in one file.

    The file is edited as plain text (no YAML parsing) so that formatting and
    comments are preserved. Only the first ISIL entry whose value starts with
    a two-letter prefix is considered.

    Behaviour:
      * If the entry already has an ``identifier_url`` that is not an
        ``isil.org`` placeholder, the file is left untouched.
      * If it has an ``isil.org`` placeholder, that line is replaced in place.
      * Otherwise a new ``identifier_url`` line is inserted directly below the
        ``identifier_value`` line, using that line's indentation.
      * Nothing is changed when ``get_isil_url`` has no URL for the code.

    Args:
        filepath: Path of the custodian file to process.
        dry_run: When True (default) the change is computed and reported but
            the file is not written.

    Returns:
        A dict with ``modified`` (bool), ``isil_code`` (str or None) and
        ``url_added`` (str or None). On failure an ``error`` key holding the
        exception text is added; exceptions are not propagated.
    """
    stats = {'modified': False, 'isil_code': None, 'url_added': None}
    try:
        content = filepath.read_text(encoding='utf-8')

        isil_match = _ISIL_ENTRY_RE.search(content)
        if not isil_match:
            return stats
        indent = isil_match.group(1)
        isil_code = isil_match.group(2).strip("'\"")
        stats['isil_code'] = isil_code

        # Work from the end of the value *line* so that anything trailing the
        # value (e.g. a comment) stays on its own line.
        line_end = content.find('\n', isil_match.end())
        if line_end == -1:
            line_end = len(content)

        # Judge the entry's own URL line, not the whole file: an unrelated
        # mention of isil.org elsewhere must not trigger a rewrite.
        url_span = _find_entry_url_line(content, line_end, indent)
        if url_span is not None and 'isil.org' not in content[url_span[0]:url_span[1]]:
            return stats

        url = get_isil_url(isil_code)
        if not url:
            return stats

        url_line = f'{indent}identifier_url: {url}'
        if url_span is not None:
            # Splice by offset; a textual replace could hit an identical line
            # belonging to a different identifier earlier in the file.
            start, stop = url_span
            content = content[:start] + url_line + content[stop:]
        else:
            content = content[:line_end] + '\n' + url_line + content[line_end:]
        stats['modified'] = True
        stats['url_added'] = url

        if not dry_run:
            filepath.write_text(content, encoding='utf-8')
        return stats
    except Exception as e:
        # Deliberate per-file boundary: one unreadable or unwritable file is
        # reported through the stats dict and must not abort the batch.
        stats['error'] = str(e)
        return stats
def main(argv=None):
    """Command-line entry point: add ISIL registry URLs for one country.

    Globs ``<COUNTRY>-*.yaml`` in the custodian directory, runs
    ``process_file_fast`` on every match and prints a summary. Without
    ``--apply`` nothing is written (dry run).

    Args:
        argv: Optional list of command-line arguments. None (the default)
            makes argparse read ``sys.argv[1:]``.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Add valid ISIL registry URLs to custodian files')
    parser.add_argument('--apply', action='store_true', help='Actually apply the changes')
    parser.add_argument('--country', type=str, required=True,
                        help='Country code prefix (e.g., AT, DE, CH)')
    # The default keeps the historical location; override it on other machines.
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'),
                        help='Directory containing the custodian YAML files')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    args = parser.parse_args(argv)

    dry_run = not args.apply
    # ISIL prefixes and the pattern table are upper-case; accept "ch" as "CH".
    country = args.country.strip().upper()

    print(f"Processing {country} files...")
    if dry_run:
        print("=== DRY RUN MODE (use --apply to make changes) ===\n")

    # Only process files matching country prefix
    pattern = f'{country}-*.yaml'
    # Sorted so that verbose output and error reports are reproducible.
    files = sorted(args.custodian_dir.glob(pattern))
    print(f"Found {len(files)} files matching {pattern}")

    url_pattern = ISIL_URL_PATTERNS.get(country)
    if url_pattern:
        print(f"URL pattern: {url_pattern}")
    else:
        print(f"No URL pattern available for {country}")
        return

    # Process files
    modified_count = 0
    error_count = 0
    for filepath in files:
        stats = process_file_fast(filepath, dry_run=dry_run)
        if stats.get('modified'):
            modified_count += 1
            if args.verbose:
                print(f"{filepath.name}: {stats['isil_code']} -> {stats['url_added']}")
        if stats.get('error'):
            error_count += 1
            print(f" ERROR: {filepath.name}: {stats['error']}")

    print(f"\n{'Would modify' if dry_run else 'Modified'}: {modified_count} files")
    if error_count:
        print(f"Errors: {error_count} files")
    if dry_run and modified_count > 0:
        print("\n>>> Use --apply to make changes <<<")


if __name__ == '__main__':
    main()