#!/usr/bin/env python3
"""
Add valid country-specific ISIL registry URLs to custodian files.

This script replaces the fake isil.org URLs with actual country-specific
registry lookup URLs where available.

ISIL Registry URL Patterns by Country:
- CH (Switzerland): https://www.isil.nb.admin.ch/en/?isil={code}
- DE (Germany): https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}
- AT (Austria): https://sigel.staatsbibliothek-berlin.de/en/isil/{code} (managed by Germany)
- JP (Japan): No individual lookup URL (CSV downloads only from NDL)
- BE (Belgium): http://isil.kbr.be/ (libraries only, no direct lookup)
- NL (Netherlands): No public lookup URL found
- BY (Belarus): No public lookup URL (list only at nlb.by)
- BG (Bulgaria): No public lookup URL (managed by National Library)
- CZ (Czech): No public lookup URL (managed by nkp.cz)

Countries WITH resolvable URLs:
- CH: Swiss National Library registry
- DE: German ISIL Agency (Sigel)
- AT: Austrian institutions are indexed in German ISIL Agency

Countries WITHOUT resolvable URLs (ISIL is identifier only):
- JP, BE, NL, BY, BG, CZ

Author: GLAM Project
Date: 2025-12-09
"""

import os
import re
import sys
from pathlib import Path
from typing import Optional

import yaml

# Default location of the custodian YAML files; override with --custodian-dir.
DEFAULT_CUSTODIAN_DIR = Path('/Users/kempersc/apps/glam/data/custodian')

# ISIL registry URL patterns by country code
ISIL_URL_PATTERNS = {
    # Countries with resolvable lookup URLs
    'CH': 'https://www.isil.nb.admin.ch/en/?isil={code}',
    'DE': 'https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}',
    'AT': 'https://sigel.staatsbibliothek-berlin.de/en/isil/{code}',
    # Countries without individual lookup URLs
    # ISIL is identifier only - no URL should be added
    'JP': None,  # NDL provides CSV downloads only
    'BE': None,  # KBR has form but no lookup by ISIL
    'NL': None,  # No public registry found
    'BY': None,  # List only at nlb.by
    'BG': None,  # National Library list only
    'CZ': None,  # National Library list only
}

# Registry information URLs (not lookup but info pages)
ISIL_REGISTRY_INFO = {
    'JP': 'https://www.ndl.go.jp/en/library/isil/index.html',
    'CH': 'https://www.nb.admin.ch/snl/en/home/information-professionals/isil.html',
    'BE': 'https://www.kbr.be/en/isil/',
    'NL': 'https://www.nationaalarchief.nl/archiveren/kennisbank/isil-codes',
    'BY': 'https://nlb.by/en/for-librarians/international-standard-identifier-for-libraries-and-related-organizations-isil/',
    'BG': 'https://www.nationallibrary.bg/www/en/national-isil-register/',
    'CZ': 'https://www.en.nkp.cz/',
    'DE': 'https://sigel.staatsbibliothek-berlin.de/en/',
    'AT': 'https://sigel.staatsbibliothek-berlin.de/en/',
}


def get_country_from_isil(isil_code: str) -> Optional[str]:
    """Extract the two-letter country prefix from an ISIL code.

    Args:
        isil_code: ISIL identifier such as ``"CH-000001-5"``.

    Returns:
        The two upper-case letters preceding the first hyphen, or ``None``
        when the code is empty or does not start with such a prefix.
    """
    if not isil_code:
        return None

    # ISIL format: XX-... where XX is ISO 3166-1 alpha-2 country code
    match = re.match(r'^([A-Z]{2})-', isil_code)
    if match:
        return match.group(1)
    return None


def get_isil_url(isil_code: str) -> Optional[str]:
    """Generate the registry lookup URL for an ISIL code if available.

    Args:
        isil_code: ISIL identifier such as ``"DE-1"``.

    Returns:
        The URL built from the country's entry in ``ISIL_URL_PATTERNS``, or
        ``None`` when the country cannot be determined or has no lookup
        pattern (entry missing or ``None``).
    """
    country = get_country_from_isil(isil_code)
    if not country:
        return None

    pattern = ISIL_URL_PATTERNS.get(country)
    if pattern:
        return pattern.format(code=isil_code)
    return None


def process_custodian_file(
    filepath: Path,
    dry_run: bool = True,
    country_filter: Optional[str] = None,
) -> dict:
    """Process a single custodian YAML file and add ISIL URLs where appropriate.

    For every entry of the top-level ``identifiers`` list whose
    ``identifier_scheme`` is ``'ISIL'``:

    * an existing ``identifier_url`` that does not contain ``isil.org`` is
      left alone;
    * otherwise the URL is set from ``get_isil_url`` when a lookup pattern
      exists, or removed when none exists.

    Args:
        filepath: Path of the YAML file to process.
        dry_run: When true (the default) the file is never written; the
            returned stats describe what would have been done.
        country_filter: Optional two-letter country code. When given, ISIL
            identifiers of any other country are neither modified nor
            recorded in ``isil_code``/``country``, so a filtered run cannot
            rewrite files belonging to other countries.

    Returns:
        A dict with the keys ``file``, ``isil_found``, ``url_added``,
        ``country``, ``isil_code`` and ``no_url_available``; an ``error`` key
        holding the exception text is added when processing failed.
    """
    stats = {
        'file': str(filepath),
        'isil_found': False,
        'url_added': False,
        'country': None,
        'isil_code': None,
        'no_url_available': False,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse YAML
        data = yaml.safe_load(content)
        if not data:
            return stats

        # Find ISIL identifiers
        identifiers = data.get('identifiers', [])
        if not identifiers:
            return stats

        modified = False
        for identifier in identifiers:
            if identifier.get('identifier_scheme') != 'ISIL':
                continue

            isil_code = identifier.get('identifier_value')
            if not isil_code:
                continue

            stats['isil_found'] = True
            country = get_country_from_isil(isil_code)

            # Honour the country filter *before* touching the record, so
            # that other countries' identifiers are never rewritten.
            if country_filter and country != country_filter:
                continue

            stats['isil_code'] = isil_code
            stats['country'] = country

            # Check if URL already exists
            current_url = identifier.get('identifier_url')
            if current_url and 'isil.org' not in current_url:
                # Already has a valid URL, skip
                continue

            # Get the appropriate URL
            new_url = get_isil_url(isil_code)
            if new_url:
                identifier['identifier_url'] = new_url
                stats['url_added'] = True
                modified = True
            else:
                # Remove any fake URL, keep identifier without URL
                if 'identifier_url' in identifier:
                    del identifier['identifier_url']
                    modified = True
                stats['no_url_available'] = True

        if modified and not dry_run:
            # Write back the modified file
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False,
                          allow_unicode=True, sort_keys=False)

        return stats

    except Exception as e:
        # Per-file boundary of a batch run: one unreadable or malformed file
        # must not abort the whole run, so the failure is reported via stats.
        stats['error'] = str(e)
        return stats


def main():
    """Command-line entry point: process custodian files and print a summary.

    Runs in dry-run mode unless ``--apply`` is given. ``--country`` restricts
    both the modifications and the report to one country code.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Add valid ISIL registry URLs to custodian files')
    # --dry-run is the default and is kept for explicitness; only --apply
    # switches the script into write mode.
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Show what would be done without making changes (default)')
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the changes')
    parser.add_argument('--country', type=str, default=None,
                        help='Only process files with ISIL codes from this country (e.g., CH, DE)')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')
    parser.add_argument('--custodian-dir', type=Path, default=DEFAULT_CUSTODIAN_DIR,
                        help='Directory containing the custodian YAML files')

    args = parser.parse_args()

    dry_run = not args.apply
    # ISIL country prefixes are upper-case; accept "ch" as well as "CH".
    country_filter = args.country.upper() if args.country else None

    if dry_run:
        print("=== DRY RUN MODE (use --apply to make changes) ===\n")
    else:
        print("=== APPLYING CHANGES ===\n")

    # Print registry info
    print("ISIL Registry URL Patterns:")
    print("-" * 60)
    for country, pattern in sorted(ISIL_URL_PATTERNS.items()):
        if pattern:
            print(f" {country}: {pattern}")
        else:
            print(f" {country}: No lookup URL available (identifier only)")
    print()

    # Find custodian files; sorted so that --limit selects a stable subset.
    custodian_dir = args.custodian_dir
    yaml_files = sorted(custodian_dir.glob('*.yaml'))

    print(f"Found {len(yaml_files)} custodian files")

    if args.limit:
        yaml_files = yaml_files[:args.limit]
        print(f"Limited to {args.limit} files")

    # Process files
    stats_summary = {
        'total': 0,
        'with_isil': 0,
        'urls_added': 0,
        'no_url_available': 0,
        'by_country': {},
    }

    for filepath in yaml_files:
        stats = process_custodian_file(filepath, dry_run=dry_run,
                                       country_filter=country_filter)
        stats_summary['total'] += 1

        # Report failures first so the country filter below cannot hide them.
        if stats.get('error'):
            print(f" ERROR: {filepath.name}: {stats['error']}")

        if not stats.get('isil_found'):
            continue

        stats_summary['with_isil'] += 1
        # The 'country' key always exists but may be None (unparseable code
        # or filtered out), so a .get() default would never apply.
        country = stats.get('country') or 'Unknown'

        # Filter by country if specified
        if country_filter and country != country_filter:
            continue

        country_stats = stats_summary['by_country'].setdefault(country, {
            'count': 0,
            'urls_added': 0,
            'no_url': 0,
        })
        country_stats['count'] += 1

        if stats.get('url_added'):
            stats_summary['urls_added'] += 1
            country_stats['urls_added'] += 1
            if args.verbose:
                print(f" ✓ {filepath.name}: {stats['isil_code']} -> URL added")

        if stats.get('no_url_available'):
            stats_summary['no_url_available'] += 1
            country_stats['no_url'] += 1
            if args.verbose:
                print(f" - {filepath.name}: {stats['isil_code']} (no URL available)")

    # Print summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {stats_summary['total']}")
    print(f"Files with ISIL codes: {stats_summary['with_isil']}")
    print(f"URLs added: {stats_summary['urls_added']}")
    print(f"No URL available (identifier only): {stats_summary['no_url_available']}")

    print("\nBy Country:")
    for country, cstats in sorted(stats_summary['by_country'].items()):
        # Countries missing from the table have no lookup URL either.
        url_pattern = ISIL_URL_PATTERNS.get(country)
        has_url = "✓" if url_pattern else "✗"
        print(f" {country}: {cstats['count']} ISIL codes, {cstats['urls_added']} URLs added, {cstats['no_url']} no URL [{has_url}]")

    if dry_run:
        print("\n>>> This was a dry run. Use --apply to make changes. <<<")


if __name__ == '__main__':
    main()