glam/scripts/add_valid_isil_urls.py
2025-12-09 10:46:43 +01:00

272 lines
9.7 KiB
Python

#!/usr/bin/env python3
"""
Add valid country-specific ISIL registry URLs to custodian files.
This script replaces the fake isil.org URLs with actual country-specific
registry lookup URLs where available.
ISIL Registry URL Patterns by Country:
- CH (Switzerland): https://www.isil.nb.admin.ch/en/?isil={code}
- DE (Germany): https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}
- AT (Austria): https://sigel.staatsbibliothek-berlin.de/en/isil/{code} (managed by Germany)
- JP (Japan): No individual lookup URL (CSV downloads only from NDL)
- BE (Belgium): http://isil.kbr.be/ (libraries only, no direct lookup)
- NL (Netherlands): No public lookup URL found
- BY (Belarus): No public lookup URL (list only at nlb.by)
- BG (Bulgaria): No public lookup URL (managed by National Library)
- CZ (Czech): No public lookup URL (managed by nkp.cz)
Countries WITH resolvable URLs:
- CH: Swiss National Library registry
- DE: German ISIL Agency (Sigel)
- AT: Austrian institutions are indexed in German ISIL Agency
Countries WITHOUT resolvable URLs (ISIL is identifier only):
- JP, BE, NL, BY, BG, CZ
Author: GLAM Project
Date: 2025-12-09
"""
import os
import re
import sys
from pathlib import Path
from typing import Optional
import yaml
# Lookup-URL templates for national ISIL registries, keyed by the two-letter
# country prefix of the ISIL code. ``{code}`` is substituted with the complete
# ISIL code. A value of ``None`` means the country is known but its registry
# has no per-code lookup page.
ISIL_URL_PATTERNS = {
    # Registries that expose a resolvable per-code lookup URL
    'CH': 'https://www.isil.nb.admin.ch/en/?isil={code}',
    'DE': 'https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}',
    'AT': 'https://sigel.staatsbibliothek-berlin.de/en/isil/{code}',

    # Registries without a per-code lookup: the ISIL stays an identifier
    # only, and no URL should be attached to it
    'JP': None,  # NDL offers CSV downloads only
    'BE': None,  # KBR has a form, but no lookup by ISIL
    'NL': None,  # no public registry found
    'BY': None,  # list only, at nlb.by
    'BG': None,  # National Library publishes a list only
    'CZ': None,  # National Library publishes a list only
}
# General information pages about each national ISIL registry, keyed by
# country code. These are landing/info pages, not per-code lookup URLs.
ISIL_REGISTRY_INFO = {
    'JP': 'https://www.ndl.go.jp/en/library/isil/index.html',
    'CH': 'https://www.nb.admin.ch/snl/en/home/information-professionals/isil.html',
    'BE': 'https://www.kbr.be/en/isil/',
    'NL': 'https://www.nationaalarchief.nl/archiveren/kennisbank/isil-codes',
    'BY': 'https://nlb.by/en/for-librarians/international-standard-identifier-for-libraries-and-related-organizations-isil/',
    'BG': 'https://www.nationallibrary.bg/www/en/national-isil-register/',
    'CZ': 'https://www.en.nkp.cz/',
    # Germany and Austria share the same info page
    'DE': 'https://sigel.staatsbibliothek-berlin.de/en/',
    'AT': 'https://sigel.staatsbibliothek-berlin.de/en/',
}
def get_country_from_isil(isil_code: str) -> Optional[str]:
    """Extract the two-letter country prefix from an ISIL code.

    Args:
        isil_code: The ISIL code, expected in the form ``XX-...`` where
            ``XX`` is two uppercase ASCII letters.

    Returns:
        The two-letter prefix, or ``None`` when the value is empty, is not a
        string, or does not start with two uppercase letters followed by a
        hyphen (lowercase or longer prefixes are not recognised).
    """
    # Values read from YAML are not guaranteed to be strings; passing e.g. an
    # int to re.match would raise TypeError, so treat it as "no country".
    if not isinstance(isil_code, str) or not isil_code:
        return None

    # ISIL format: XX-... where XX is ISO 3166-1 alpha-2 country code
    match = re.match(r'^([A-Z]{2})-', isil_code)
    return match.group(1) if match else None
def get_isil_url(isil_code: str) -> Optional[str]:
    """Build the registry lookup URL for an ISIL code.

    Returns ``None`` when no country prefix can be extracted from the code,
    or when ``ISIL_URL_PATTERNS`` has no (non-empty) template for that
    country; otherwise returns the template with ``{code}`` filled in by the
    full ISIL code.
    """
    country_code = get_country_from_isil(isil_code)
    # Only consult the template table when a country prefix was recognised.
    template = ISIL_URL_PATTERNS.get(country_code) if country_code else None
    if not template:
        return None
    return template.format(code=isil_code)
def process_custodian_file(filepath: Path, dry_run: bool = True) -> dict:
    """Process a single custodian YAML file and add ISIL URLs where appropriate.

    For every entry under ``identifiers`` whose ``identifier_scheme`` is
    ``'ISIL'``, the ``identifier_url`` is set to the country-specific lookup
    URL when one exists. When none exists, an ``isil.org`` URL (or an empty
    one) is removed instead. Entries that already carry a URL not containing
    ``isil.org`` are left alone.

    Args:
        filepath: Path of the YAML file to process.
        dry_run: When True (the default) the file is never written; the
            returned stats describe what would have been changed.

    Returns:
        A dict with the keys ``file``, ``isil_found``, ``url_added``,
        ``country``, ``isil_code`` and ``no_url_available``. ``country`` and
        ``isil_code`` reflect the last ISIL identifier seen in the file.
        If anything raises, the exception text is stored under ``error`` and
        the stats gathered so far are returned.
    """
    stats = {
        'file': str(filepath),
        'isil_found': False,
        'url_added': False,
        'country': None,
        'isil_code': None,
        'no_url_available': False,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse YAML
        data = yaml.safe_load(content)
        if not data:
            return stats

        # Find ISIL identifiers
        identifiers = data.get('identifiers', [])
        if not identifiers:
            return stats

        modified = False
        for identifier in identifiers:
            # A malformed (non-mapping) entry must not abort processing of
            # the other identifiers in the same file.
            if not isinstance(identifier, dict):
                continue
            if identifier.get('identifier_scheme') != 'ISIL':
                continue

            isil_code = identifier.get('identifier_value')
            if not isil_code:
                continue

            stats['isil_found'] = True
            stats['isil_code'] = isil_code
            stats['country'] = get_country_from_isil(isil_code)

            # Check if URL already exists
            current_url = identifier.get('identifier_url')
            if current_url and 'isil.org' not in current_url:
                # Already has a valid URL, skip
                continue

            # Get the appropriate URL
            new_url = get_isil_url(isil_code)
            if new_url:
                identifier['identifier_url'] = new_url
                stats['url_added'] = True
                modified = True
            else:
                # Remove any fake URL, keep identifier without URL
                if 'identifier_url' in identifier:
                    del identifier['identifier_url']
                    modified = True
                stats['no_url_available'] = True

        if modified and not dry_run:
            # Serialise before opening the file for writing: opening with 'w'
            # truncates immediately, so a failure inside yaml.dump would
            # otherwise leave an empty or partial custodian file behind.
            serialized = yaml.dump(data, default_flow_style=False,
                                   allow_unicode=True, sort_keys=False)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(serialized)

        return stats
    except Exception as e:
        # Per-file boundary: report the failure to the caller through the
        # stats dict so that one bad file does not stop a batch run.
        stats['error'] = str(e)
        return stats
def main():
    """Command-line entry point: scan custodian YAML files and fix ISIL URLs.

    Runs in dry-run mode unless ``--apply`` is given. Prints the known URL
    patterns, processes every ``*.yaml`` file in the custodian directory
    (optionally capped by ``--limit``), and prints a summary with per-country
    counts.

    With ``--country`` only files whose ISIL country matches are modified and
    counted per country. Non-matching files still count towards the "total"
    and "with ISIL" figures.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Add valid ISIL registry URLs to custodian files')
    # Kept for backward compatibility: dry run is the default and is only
    # switched off by --apply, so this flag has no effect of its own.
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Show what would be done without making changes (default)')
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the changes')
    parser.add_argument('--country', type=str, default=None,
                        help='Only process files with ISIL codes from this country (e.g., CH, DE)')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'),
                        help='Directory containing the custodian YAML files')
    args = parser.parse_args()

    dry_run = not args.apply
    # Country prefixes extracted from ISIL codes are uppercase, so normalise
    # the filter to match.
    country_filter = args.country.upper() if args.country else None

    if dry_run:
        print("=== DRY RUN MODE (use --apply to make changes) ===\n")
    else:
        print("=== APPLYING CHANGES ===\n")

    # Print registry info
    print("ISIL Registry URL Patterns:")
    print("-" * 60)
    for country, pattern in sorted(ISIL_URL_PATTERNS.items()):
        if pattern:
            print(f" {country}: {pattern}")
        else:
            print(f" {country}: No lookup URL available (identifier only)")
    print()

    # Find custodian files; sorted so that --limit selects a reproducible
    # subset regardless of directory listing order.
    custodian_dir = args.custodian_dir
    yaml_files = sorted(custodian_dir.glob('*.yaml'))
    print(f"Found {len(yaml_files)} custodian files")
    if args.limit:
        yaml_files = yaml_files[:args.limit]
        print(f"Limited to {args.limit} files")

    # Process files
    stats_summary = {
        'total': 0,
        'with_isil': 0,
        'urls_added': 0,
        'no_url_available': 0,
        'by_country': {},
    }

    # When a country filter is active, every file is first probed without
    # writing, and only files of the requested country are then processed
    # for real. Otherwise --apply would rewrite files of all countries.
    probe_only = dry_run or bool(country_filter)

    for filepath in yaml_files:
        stats = process_custodian_file(filepath, dry_run=probe_only)
        stats_summary['total'] += 1

        if stats.get('isil_found'):
            stats_summary['with_isil'] += 1
            # The 'country' key is always present but may be None; a None key
            # would later break sorted() on the per-country table.
            country = stats.get('country') or 'Unknown'

            # Filter by country if specified
            if country_filter and country != country_filter:
                continue

            if probe_only and not dry_run:
                # The probe matched the filter: now apply the change.
                stats = process_custodian_file(filepath, dry_run=False)

            cstats = stats_summary['by_country'].setdefault(country, {
                'count': 0,
                'urls_added': 0,
                'no_url': 0,
            })
            cstats['count'] += 1

            if stats.get('url_added'):
                stats_summary['urls_added'] += 1
                cstats['urls_added'] += 1
                if args.verbose:
                    print(f"{filepath.name}: {stats['isil_code']} -> URL added")

            if stats.get('no_url_available'):
                stats_summary['no_url_available'] += 1
                cstats['no_url'] += 1
                if args.verbose:
                    print(f" - {filepath.name}: {stats['isil_code']} (no URL available)")

        if stats.get('error'):
            print(f" ERROR: {filepath.name}: {stats['error']}")

    # Print summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {stats_summary['total']}")
    print(f"Files with ISIL codes: {stats_summary['with_isil']}")
    print(f"URLs added: {stats_summary['urls_added']}")
    print(f"No URL available (identifier only): {stats_summary['no_url_available']}")
    print("\nBy Country:")
    for country, cstats in sorted(stats_summary['by_country'].items()):
        # Countries missing from the table have no lookup URL either, so no
        # truthy default is used here.
        has_url = "lookup URL" if ISIL_URL_PATTERNS.get(country) else "no lookup URL"
        print(f" {country}: {cstats['count']} ISIL codes, {cstats['urls_added']} URLs added, {cstats['no_url']} no URL [{has_url}]")

    if dry_run:
        print("\n>>> This was a dry run. Use --apply to make changes. <<<")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()