272 lines
9.7 KiB
Python
272 lines
9.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Add valid country-specific ISIL registry URLs to custodian files.
|
|
|
|
This script replaces the fake isil.org URLs with actual country-specific
|
|
registry lookup URLs where available.
|
|
|
|
ISIL Registry URL Patterns by Country:
|
|
- CH (Switzerland): https://www.isil.nb.admin.ch/en/?isil={code}
|
|
- DE (Germany): https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}
|
|
- AT (Austria): https://sigel.staatsbibliothek-berlin.de/en/isil/{code} (managed by Germany)
|
|
- JP (Japan): No individual lookup URL (CSV downloads only from NDL)
|
|
- BE (Belgium): http://isil.kbr.be/ (libraries only, no direct lookup)
|
|
- NL (Netherlands): No public lookup URL found
|
|
- BY (Belarus): No public lookup URL (list only at nlb.by)
|
|
- BG (Bulgaria): No public lookup URL (managed by National Library)
|
|
- CZ (Czech): No public lookup URL (managed by nkp.cz)
|
|
|
|
Countries WITH resolvable URLs:
|
|
- CH: Swiss National Library registry
|
|
- DE: German ISIL Agency (Sigel)
|
|
- AT: Austrian institutions are indexed in German ISIL Agency
|
|
|
|
Countries WITHOUT resolvable URLs (ISIL is identifier only):
|
|
- JP, BE, NL, BY, BG, CZ
|
|
|
|
Author: GLAM Project
|
|
Date: 2025-12-09
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
import yaml
|
|
|
|
# Per-country ISIL lookup URL templates, keyed by the two-letter prefix of the
# ISIL code. The ``{code}`` placeholder is filled with the full ISIL code.
# A value of None marks a country for which no per-institution lookup URL
# exists, so the ISIL is kept as a bare identifier and no URL is added.
ISIL_URL_PATTERNS = {
    # Resolvable: an institution can be looked up directly by its ISIL code.
    "CH": "https://www.isil.nb.admin.ch/en/?isil={code}",
    "DE": "https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}",
    "AT": "https://sigel.staatsbibliothek-berlin.de/en/isil/{code}",
    # Not resolvable: every country below maps to None.
    **dict.fromkeys(
        (
            "JP",  # NDL provides CSV downloads only
            "BE",  # KBR has a form but no lookup by ISIL
            "NL",  # No public registry found
            "BY",  # List only at nlb.by
            "BG",  # National Library list only
            "CZ",  # National Library list only
        )
    ),
}
|
|
|
|
# Human-readable information pages about each national ISIL registry.
# These are general landing pages, not per-institution lookup URLs.
ISIL_REGISTRY_INFO = dict(
    JP="https://www.ndl.go.jp/en/library/isil/index.html",
    CH="https://www.nb.admin.ch/snl/en/home/information-professionals/isil.html",
    BE="https://www.kbr.be/en/isil/",
    NL="https://www.nationaalarchief.nl/archiveren/kennisbank/isil-codes",
    BY="https://nlb.by/en/for-librarians/international-standard-identifier-for-libraries-and-related-organizations-isil/",
    BG="https://www.nationallibrary.bg/www/en/national-isil-register/",
    CZ="https://www.en.nkp.cz/",
    # Germany and Austria share the same agency landing page.
    DE="https://sigel.staatsbibliothek-berlin.de/en/",
    AT="https://sigel.staatsbibliothek-berlin.de/en/",
)
|
|
|
|
|
|
def get_country_from_isil(isil_code: str) -> Optional[str]:
    """Return the two-letter country prefix of an ISIL code.

    The prefix is recognised only when the code starts with exactly two
    uppercase ASCII letters immediately followed by a hyphen (``XX-...``).

    Args:
        isil_code: The ISIL code to inspect; may be empty or None.

    Returns:
        The two-letter prefix, or None when the code is empty/None or does
        not start with such a prefix.
    """
    # Nothing to inspect for an empty or missing code.
    if not isil_code:
        return None

    prefix = re.match(r'^([A-Z]{2})-', isil_code)
    return prefix.group(1) if prefix else None
|
|
|
|
|
|
def get_isil_url(isil_code: str) -> Optional[str]:
    """Build the registry lookup URL for an ISIL code, if one exists.

    Args:
        isil_code: The full ISIL code (e.g. ``CH-000001-2``).

    Returns:
        The URL template from ``ISIL_URL_PATTERNS`` for the code's country
        with ``{code}`` replaced by ``isil_code``, or None when the country
        prefix cannot be determined or the country has no lookup template.
    """
    country = get_country_from_isil(isil_code)
    # Only consult the table when a country prefix was actually recognised.
    template = ISIL_URL_PATTERNS.get(country) if country else None
    return template.format(code=isil_code) if template else None
|
|
|
|
|
|
def process_custodian_file(filepath: Path, dry_run: bool = True) -> dict:
    """Fix the ISIL identifier URLs in one custodian YAML file.

    For every entry under ``identifiers`` whose ``identifier_scheme`` is
    ``ISIL``, a missing URL or one containing ``isil.org`` is replaced by the
    country-specific lookup URL. When no lookup URL exists for the country,
    any existing ``identifier_url`` is removed instead. Entries that already
    carry a URL without ``isil.org`` are left untouched.

    Args:
        filepath: Path of the YAML file to process.
        dry_run: When True (the default) the file is never written; the
            returned stats still describe what would change.

    Returns:
        A dict with keys ``file``, ``isil_found``, ``url_added``,
        ``country``, ``isil_code`` and ``no_url_available``. ``country`` and
        ``isil_code`` reflect the last ISIL entry seen. If any exception
        occurs, its message is stored under ``error`` and the stats gathered
        so far are returned.
    """
    stats = dict(
        file=str(filepath),
        isil_found=False,
        url_added=False,
        country=None,
        isil_code=None,
        no_url_available=False,
    )

    try:
        with open(filepath, 'r', encoding='utf-8') as source:
            raw_text = source.read()

        document = yaml.safe_load(raw_text)
        # An empty document or an absent/empty identifier list means no work.
        entries = document.get('identifiers', []) if document else None
        if not entries:
            return stats

        needs_write = False
        for entry in entries:
            if entry.get('identifier_scheme') != 'ISIL':
                continue
            code = entry.get('identifier_value')
            if not code:
                continue

            # Record the find before deriving the country so that the stats
            # already reflect it should the country lookup raise.
            stats['isil_found'] = True
            stats['isil_code'] = code
            stats['country'] = get_country_from_isil(code)

            # Only a missing URL or an isil.org URL is eligible for change.
            existing_url = entry.get('identifier_url')
            replaceable = not existing_url or 'isil.org' in existing_url
            if not replaceable:
                continue

            lookup_url = get_isil_url(code)
            if not lookup_url:
                # No registry lookup for this country: drop any fake URL and
                # keep the identifier on its own.
                if 'identifier_url' in entry:
                    del entry['identifier_url']
                    needs_write = True
                stats['no_url_available'] = True
                continue

            entry['identifier_url'] = lookup_url
            stats['url_added'] = True
            needs_write = True

        if needs_write and not dry_run:
            # Persist the updated document back to the same file.
            with open(filepath, 'w', encoding='utf-8') as target:
                yaml.dump(document, target, default_flow_style=False, allow_unicode=True, sort_keys=False)

    except Exception as exc:
        stats['error'] = str(exc)

    return stats
|
|
|
|
|
|
def main():
    """Command-line entry point: scan custodian YAML files and fix ISIL URLs.

    Runs as a dry run unless ``--apply`` is given. Prints the known registry
    URL patterns, processes every ``*.yaml`` file in the custodian directory
    (optionally capped by ``--limit``), and prints an overall and per-country
    summary.

    With ``--country``, only files whose ISIL code belongs to that country
    are modified and reported; files of other countries are never written.
    Exits with an argparse error if the custodian directory does not exist.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Add valid ISIL registry URLs to custodian files')
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Show what would be done without making changes (default)')
    parser.add_argument('--apply', action='store_true',
                        help='Actually apply the changes')
    parser.add_argument('--country', type=str, default=None,
                        help='Only process files with ISIL codes from this country (e.g., CH, DE)')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of files to process')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Verbose output')
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'),
                        help='Directory containing the custodian YAML files')

    args = parser.parse_args()

    # --dry-run is accepted for explicitness only; --apply alone decides.
    dry_run = not args.apply
    # Country prefixes are matched in upper case, so normalise the filter.
    country_filter = args.country.upper() if args.country else None

    custodian_dir = args.custodian_dir
    if not custodian_dir.is_dir():
        # A missing directory would otherwise look like "0 files found".
        parser.error(f"custodian directory not found: {custodian_dir}")

    if dry_run:
        print("=== DRY RUN MODE (use --apply to make changes) ===\n")
    else:
        print("=== APPLYING CHANGES ===\n")

    # Print registry info
    print("ISIL Registry URL Patterns:")
    print("-" * 60)
    for country, pattern in sorted(ISIL_URL_PATTERNS.items()):
        if pattern:
            print(f" {country}: {pattern}")
        else:
            print(f" {country}: No lookup URL available (identifier only)")
    print()

    # Find custodian files; sorted so that --limit selects a stable subset.
    yaml_files = sorted(custodian_dir.glob('*.yaml'))

    print(f"Found {len(yaml_files)} custodian files")

    if args.limit:
        yaml_files = yaml_files[:args.limit]
        print(f"Limited to {args.limit} files")

    # Process files
    stats_summary = {
        'total': 0,
        'with_isil': 0,
        'urls_added': 0,
        'no_url_available': 0,
        'by_country': {},
    }

    for filepath in yaml_files:
        if country_filter:
            # Probe without writing first: the country is only known after
            # parsing, and files of other countries must never be modified.
            stats = process_custodian_file(filepath, dry_run=True)
            selected = (stats.get('country') or 'Unknown') == country_filter
            if selected and not dry_run:
                stats = process_custodian_file(filepath, dry_run=False)
        else:
            stats = process_custodian_file(filepath, dry_run=dry_run)

        stats_summary['total'] += 1

        # Report errors for every file, including ones the filter skips.
        if stats.get('error'):
            print(f" ERROR: {filepath.name}: {stats['error']}")

        if not stats.get('isil_found'):
            continue
        stats_summary['with_isil'] += 1

        # 'country' is always present in stats but is None for codes without
        # a recognised prefix; map that to a string so keys stay sortable.
        country = stats.get('country') or 'Unknown'

        # Filter by country if specified
        if country_filter and country != country_filter:
            continue

        per_country = stats_summary['by_country'].setdefault(
            country, {'count': 0, 'urls_added': 0, 'no_url': 0}
        )
        per_country['count'] += 1

        if stats.get('url_added'):
            stats_summary['urls_added'] += 1
            per_country['urls_added'] += 1
            if args.verbose:
                print(f" ✓ {filepath.name}: {stats['isil_code']} -> URL added")

        if stats.get('no_url_available'):
            stats_summary['no_url_available'] += 1
            per_country['no_url'] += 1
            if args.verbose:
                print(f" - {filepath.name}: {stats['isil_code']} (no URL available)")

    # Print summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {stats_summary['total']}")
    print(f"Files with ISIL codes: {stats_summary['with_isil']}")
    print(f"URLs added: {stats_summary['urls_added']}")
    print(f"No URL available (identifier only): {stats_summary['no_url_available']}")

    print("\nBy Country:")
    for country, cstats in sorted(stats_summary['by_country'].items()):
        # Countries missing from the table have no lookup URL either.
        has_url = "✓" if ISIL_URL_PATTERNS.get(country) else "✗"
        print(f" {country}: {cstats['count']} ISIL codes, {cstats['urls_added']} URLs added, {cstats['no_url']} no URL [{has_url}]")

    if dry_run:
        print("\n>>> This was a dry run. Use --apply to make changes. <<<")
|
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|