178 lines
6.1 KiB
Python
178 lines
6.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Add valid country-specific ISIL registry URLs to custodian files.
|
|
|
|
Fast version that only processes files matching country prefix.
|
|
|
|
ISIL Registry URL Patterns by Country:
|
|
- CH (Switzerland): https://www.isil.nb.admin.ch/en/?isil={code}
|
|
- DE (Germany): https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}
|
|
- AT (Austria): https://sigel.staatsbibliothek-berlin.de/en/isil/{code} (managed by Germany)
|
|
- JP (Japan): No individual lookup URL (CSV downloads only from NDL)
|
|
- BE (Belgium): http://isil.kbr.be/ (libraries only, no direct lookup)
|
|
- NL (Netherlands): No public lookup URL found
|
|
- BY (Belarus): No public lookup URL (list only at nlb.by)
|
|
- BG (Bulgaria): No public lookup URL (managed by National Library)
|
|
- CZ (Czech): No public lookup URL (managed by nkp.cz)
|
|
|
|
Author: GLAM Project
|
|
Date: 2025-12-09
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
import yaml
|
|
|
|
# ISIL registry URL patterns by country code
|
|
# Registry lookup URL templates keyed by the two-letter ISIL country prefix.
# A string value is a template containing a ``{code}`` placeholder; a ``None``
# value marks a country that is known but has no per-code lookup page.
ISIL_URL_PATTERNS = {
    # Registries that can be queried for a single ISIL code
    'CH': 'https://www.isil.nb.admin.ch/en/?isil={code}',
    'DE': 'https://sigel.staatsbibliothek-berlin.de/suche/?isil={code}',
    'AT': 'https://sigel.staatsbibliothek-berlin.de/en/isil/{code}',

    # Registries without an individual lookup URL (every value is None)
    **dict.fromkeys(('JP', 'BE', 'NL', 'BY', 'BG', 'CZ')),
}
|
|
|
|
|
|
def get_country_from_isil(isil_code: str) -> Optional[str]:
    """Return the two-letter prefix of an ISIL code, or None.

    The prefix is the pair of upper-case ASCII letters that starts the code
    and is immediately followed by a hyphen (e.g. ``DE`` in ``DE-1``).
    Empty/None input and codes without such a prefix yield None.
    """
    # Guard falsy input first so re.match is never handed None.
    prefix_match = re.match(r'^([A-Z]{2})-', isil_code) if isil_code else None
    return prefix_match.group(1) if prefix_match else None
|
|
|
|
|
|
def get_isil_url(isil_code: str) -> Optional[str]:
    """Build the registry lookup URL for an ISIL code.

    Returns None when the code has no recognisable country prefix, when the
    country is not listed in ``ISIL_URL_PATTERNS``, or when its entry there
    is None (no per-code lookup URL).
    """
    country = get_country_from_isil(isil_code)
    # Only consult the table when a prefix was actually extracted.
    template = ISIL_URL_PATTERNS.get(country) if country else None
    return template.format(code=isil_code) if template else None
|
|
|
|
|
|
def process_file_fast(filepath: Path, dry_run: bool = True) -> dict:
    """Add or refresh the ISIL registry URL in a single custodian file.

    The file is edited as plain text with regex/string operations; it is not
    parsed as YAML. The first ``identifier_scheme: ISIL`` entry that is
    followed by an ``identifier_value`` line is located, and an
    ``identifier_url`` line is inserted directly after the value (or the
    ``identifier_url`` line that immediately follows it is replaced).

    Args:
        filepath: Path of the custodian file to process.
        dry_run: When True (the default) the file is never written; the
            returned stats still report what would change.

    Returns:
        A dict with the keys ``modified`` (bool), ``isil_code`` (str or None)
        and ``url_added`` (str or None). If any exception is raised while
        processing, an ``error`` key holding the exception text is added and
        the stats gathered so far are returned.
    """
    stats = {'modified': False, 'isil_code': None, 'url_added': None}

    try:
        content = filepath.read_text(encoding='utf-8')

        # Find ISIL code
        isil_match = re.search(
            r'identifier_scheme: ISIL\s*\n\s*identifier_value: ([A-Z]{2}-[^\s\n]+)',
            content,
        )
        if not isil_match:
            return stats

        isil_code = isil_match.group(1).strip("'\"")
        stats['isil_code'] = isil_code
        value_end = isil_match.end()

        # Skip files whose ISIL entry already carries a URL. Files mentioning
        # 'isil.org' are never skipped here (presumably those hold URLs from
        # an earlier pass that should be replaced - confirm with the data).
        # Offsets come from the regex match so this check and the edit below
        # always refer to the same entry.
        if 'isil.org' not in content:
            url_pos = content.find('identifier_url:', value_end)
            next_scheme_pos = content.find('identifier_scheme:', value_end)
            if url_pos != -1 and (next_scheme_pos == -1 or url_pos < next_scheme_pos):
                # Already has URL for this ISIL
                return stats

        # Get URL for this country
        url = get_isil_url(isil_code)
        if not url:
            return stats

        # Reuse the indentation of the identifier_value line so the new key
        # lines up with its siblings instead of relying on a fixed indent.
        value_line_start = content.rfind('\n', 0, isil_match.start(1)) + 1
        value_key_pos = content.index('identifier_value:', value_line_start)
        indent = content[value_line_start:value_key_pos]
        new_line = f'\n{indent}identifier_url: {url}'

        # An identifier_url line directly after the value is replaced;
        # otherwise the new line is inserted. Splicing by offset guarantees
        # that only this occurrence is touched, even if an identical line
        # appears earlier in the file.
        existing_url = re.compile(r'\s*identifier_url:[^\n]*').match(content, value_end)
        replace_end = existing_url.end() if existing_url else value_end
        updated = content[:value_end] + new_line + content[replace_end:]

        # Report a modification only when the text really changes.
        if updated != content:
            stats['modified'] = True
            stats['url_added'] = url
            if not dry_run:
                filepath.write_text(updated, encoding='utf-8')

        return stats

    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the
        # whole batch, so the error is reported through the stats instead.
        stats['error'] = str(e)
        return stats
|
|
|
|
|
|
def main():
    """Command-line entry point: add ISIL registry URLs for one country.

    Parses the command line, selects the ``<COUNTRY>-*.yaml`` files in the
    custodian directory and runs ``process_file_fast`` on each. Files are
    only written when ``--apply`` is given; otherwise this is a dry run.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Add valid ISIL registry URLs to custodian files')
    parser.add_argument('--apply', action='store_true', help='Actually apply the changes')
    parser.add_argument('--country', type=str, required=True,
                        help='Country code prefix (e.g., AT, DE, CH)')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    # The previously hard-coded location stays the default, so existing
    # invocations behave as before while other checkouts can override it.
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'),
                        help='Directory containing the custodian YAML files')

    args = parser.parse_args()
    dry_run = not args.apply
    # Pattern keys are upper case; accept e.g. "de" as well.
    country = args.country.strip().upper()

    print(f"Processing {country} files...")
    if dry_run:
        print("=== DRY RUN MODE (use --apply to make changes) ===\n")

    custodian_dir = args.custodian_dir
    if not custodian_dir.is_dir():
        # Say so explicitly instead of reporting a misleading "Found 0 files".
        print(f"Custodian directory not found: {custodian_dir}")
        return

    # Only process files matching country prefix (sorted for stable output)
    pattern = f'{country}-*.yaml'
    files = sorted(custodian_dir.glob(pattern))

    print(f"Found {len(files)} files matching {pattern}")

    url_pattern = ISIL_URL_PATTERNS.get(country)
    if url_pattern:
        print(f"URL pattern: {url_pattern}")
    else:
        print(f"No URL pattern available for {country}")
        return

    # Process files
    modified_count = 0
    for filepath in files:
        stats = process_file_fast(filepath, dry_run=dry_run)
        if stats.get('modified'):
            modified_count += 1
            if args.verbose:
                print(f" ✓ {filepath.name}: {stats['isil_code']} -> {stats['url_added']}")
        if stats.get('error'):
            print(f" ERROR: {filepath.name}: {stats['error']}")

    print(f"\n{'Would modify' if dry_run else 'Modified'}: {modified_count} files")

    if dry_run and modified_count > 0:
        print("\n>>> Use --apply to make changes <<<")
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|