#!/usr/bin/env python3
"""
Add valid country-specific ISIL registry URLs to custodian files.

Fast version that only processes files matching country prefix.

ISIL Registry URL Patterns by Country:
- CH (Switzerland): https://www.isil.nb.admin.ch/en/?isil={code}
- DE (Germany): https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}
- AT (Austria): https://sigel.staatsbibliothek-berlin.de/en/isil/{code} (managed by Germany)
- JP (Japan): No individual lookup URL (CSV downloads only from NDL)
- BE (Belgium): http://isil.kbr.be/ (libraries only, no direct lookup)
- NL (Netherlands): No public lookup URL found
- BY (Belarus): No public lookup URL (list only at nlb.by)
- BG (Bulgaria): No public lookup URL (managed by National Library)
- CZ (Czech): No public lookup URL (managed by nkp.cz)

Author: GLAM Project
Date: 2025-12-09
"""

import argparse
import os
import re
import sys
from pathlib import Path
from typing import List, Optional, Tuple

try:
    import yaml
except ImportError:
    # Nothing in this script references PyYAML (files are edited as plain
    # text), so a missing installation must not prevent the tool from running.
    yaml = None

# ISIL registry URL patterns by country code
ISIL_URL_PATTERNS = {
    # Countries with resolvable lookup URLs
    'CH': 'https://www.isil.nb.admin.ch/en/?isil={code}',
    'DE': 'https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}',
    'AT': 'https://sigel.staatsbibliothek-berlin.de/en/isil/{code}',
    # Countries without individual lookup URLs
    'JP': None,
    'BE': None,
    'NL': None,
    'BY': None,
    'BG': None,
    'CZ': None,
}

# Directory scanned by main() unless --custodian-dir is given.
DEFAULT_CUSTODIAN_DIR = Path('/Users/kempersc/apps/glam/data/custodian')

# Two uppercase letters followed by a hyphen at the start of an ISIL code.
_COUNTRY_PREFIX_RE = re.compile(r'^([A-Z]{2})-')

# An "identifier_scheme: ISIL" line followed by its "identifier_value" line.
# The value may be bare or wrapped in one matching pair of quotes.
_ISIL_ENTRY_RE = re.compile(
    r'identifier_scheme: ISIL\s*\n\s*(?P<key>identifier_value): '
    r'(?P<quote>[\'"]?)(?P<code>[A-Z]{2}-[^\s\'"]+)(?P=quote)'
)

_URL_KEY = 'identifier_url:'


def get_country_from_isil(isil_code: str) -> Optional[str]:
    """Extract country code from ISIL code.

    Returns the two-letter prefix (e.g. ``'AT'`` for ``'AT-123'``), or None
    when the code is empty/None or does not start with two uppercase letters
    followed by a hyphen.
    """
    if not isil_code:
        return None
    match = _COUNTRY_PREFIX_RE.match(isil_code)
    return match.group(1) if match else None


def get_isil_url(isil_code: str) -> Optional[str]:
    """Generate the registry URL for an ISIL code if available.

    Returns None when the country prefix cannot be determined or when
    ISIL_URL_PATTERNS has no lookup URL for that country.
    """
    country = get_country_from_isil(isil_code)
    if not country:
        return None
    pattern = ISIL_URL_PATTERNS.get(country)
    return pattern.format(code=isil_code) if pattern else None


def _find_sibling_url_line(content: str, start: int,
                           indent: str) -> Optional[Tuple[int, int, str]]:
    """Find the ``identifier_url`` key that belongs to the ISIL entry.

    Scans line by line from offset *start* (the line after the
    ``identifier_value`` line). The entry is considered to end at the first
    line that is indented less than *indent* or that starts another
    ``identifier_scheme``. Blank lines and comment lines are skipped.

    Returns ``(line_start, line_end, value)`` for the URL line, where
    *line_end* excludes the newline, or None if the entry has no such key.
    """
    pos = start
    length = len(content)
    while pos < length:
        line_end = content.find('\n', pos)
        if line_end == -1:
            line_end = length
        line = content[pos:line_end]
        body = line.lstrip(' \t')
        stripped = body.strip()
        if stripped and not stripped.startswith('#'):
            line_indent = len(line) - len(body)
            if line_indent < len(indent) or 'identifier_scheme:' in body:
                return None
            # Only a key at exactly the sibling indent is part of this mapping.
            if line_indent == len(indent) and body.startswith(_URL_KEY):
                return pos, line_end, body[len(_URL_KEY):].strip()
        pos = line_end + 1
    return None


def process_file_fast(filepath: Path, dry_run: bool = True) -> dict:
    """Process a single file - fast version using string operations.

    Looks for the first ISIL identifier entry in the file and makes sure it
    carries an ``identifier_url`` pointing at the country's registry:

    * an entry that already has a URL not mentioning ``isil.org`` is left alone;
    * an empty URL, or one mentioning ``isil.org``, is replaced in place;
    * otherwise a new ``identifier_url`` line is inserted directly below the
      ``identifier_value`` line, using that line's indentation so the new key
      is a sibling in the same YAML mapping.

    Args:
        filepath: File to inspect.
        dry_run: When True (the default) the file is never written.

    Returns:
        A dict with keys ``modified`` (bool), ``isil_code`` and ``url_added``
        (str or None). An ``error`` key holding the message is added when the
        file cannot be read or written.
    """
    stats = {'modified': False, 'isil_code': None, 'url_added': None}

    try:
        content = filepath.read_text(encoding='utf-8')
    except (OSError, UnicodeError) as e:
        stats['error'] = str(e)
        return stats

    isil_match = _ISIL_ENTRY_RE.search(content)
    if not isil_match:
        return stats

    isil_code = isil_match.group('code')
    stats['isil_code'] = isil_code

    # Indentation of the identifier_value line; the URL key must match it.
    key_start = isil_match.start('key')
    indent = content[content.rfind('\n', 0, key_start) + 1:key_start]

    # End of the identifier_value line (keeps any trailing comment on it).
    value_line_end = content.find('\n', isil_match.end())
    if value_line_end == -1:
        value_line_end = len(content)

    existing = _find_sibling_url_line(content, value_line_end + 1, indent)
    if existing is not None and existing[2] and 'isil.org' not in existing[2]:
        # This entry already has a URL of its own; nothing to do.
        return stats

    url = get_isil_url(isil_code)
    if not url:
        return stats

    url_line = f'{indent}{_URL_KEY} {url}'
    if existing is not None:
        # Splice by position so an identical line elsewhere is never touched.
        line_start, line_end, _ = existing
        content = content[:line_start] + url_line + content[line_end:]
    else:
        content = content[:value_line_end] + '\n' + url_line + content[value_line_end:]
    stats['modified'] = True
    stats['url_added'] = url

    if not dry_run:
        try:
            filepath.write_text(content, encoding='utf-8')
        except (OSError, UnicodeError) as e:
            # The file on disk is unchanged, so do not report it as modified.
            stats['modified'] = False
            stats['error'] = str(e)

    return stats


def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; None means ``sys.argv[1:]`` (argparse
            default), which keeps ``main()`` working as before.
    """
    parser = argparse.ArgumentParser(description='Add valid ISIL registry URLs to custodian files')
    parser.add_argument('--apply', action='store_true', help='Actually apply the changes')
    parser.add_argument('--country', type=str, required=True, help='Country code prefix (e.g., AT, DE, CH)')
    parser.add_argument('--custodian-dir', type=Path, default=DEFAULT_CUSTODIAN_DIR,
                        help='Directory containing the custodian YAML files')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    args = parser.parse_args(argv)

    dry_run = not args.apply
    # ISIL_URL_PATTERNS keys are uppercase, so accept "at" as well as "AT".
    country = args.country.strip().upper()

    print(f"Processing {country} files...")
    if dry_run:
        print("=== DRY RUN MODE (use --apply to make changes) ===\n")

    custodian_dir = args.custodian_dir
    if not custodian_dir.is_dir():
        print(f"Custodian directory not found: {custodian_dir}")
        return

    # Only process files matching country prefix
    pattern = f'{country}-*.yaml'
    files = sorted(custodian_dir.glob(pattern))  # sorted for reproducible output
    print(f"Found {len(files)} files matching {pattern}")

    url_pattern = ISIL_URL_PATTERNS.get(country)
    if url_pattern:
        print(f"URL pattern: {url_pattern}")
    else:
        print(f"No URL pattern available for {country}")
        return

    # Process files
    modified_count = 0
    error_count = 0
    for filepath in files:
        stats = process_file_fast(filepath, dry_run=dry_run)
        if stats.get('modified'):
            modified_count += 1
            if args.verbose:
                print(f" ✓ {filepath.name}: {stats['isil_code']} -> {stats['url_added']}")
        if stats.get('error'):
            error_count += 1
            print(f" ERROR: {filepath.name}: {stats['error']}")

    print(f"\n{'Would modify' if dry_run else 'Modified'}: {modified_count} files")
    if error_count:
        print(f"Errors: {error_count} files")
    if dry_run and modified_count > 0:
        print("\n>>> Use --apply to make changes <<<")


if __name__ == '__main__':
    main()