#!/usr/bin/env python3
"""
Integrate CH-Annotator data into custodian files.

This script:
1. Reads CH-Annotator enhanced files from data/instances/*_ch_annotator.yaml
2. Matches institutions to existing custodian files by ISIL, Wikidata ID, or name
3. Adds ch_annotator section to matching custodian files
4. Creates new custodian files for unmatched institutions
5. Generates an integration report

Usage:
    python scripts/integrate_ch_annotator_to_custodian.py [--dry-run] [--report-only]

Options:
    --dry-run      Preview changes without writing files
    --report-only  Only generate matching report, no file changes
"""

import argparse
import os
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# NOTE: `yaml` (PyYAML, third-party) is imported lazily inside load_yaml/save_yaml
# so this module can be imported -- e.g. for testing the matching logic -- even
# when PyYAML is not installed. The script itself still requires it at runtime.

# Paths (resolved relative to this script's location: scripts/ -> project root)
PROJECT_ROOT = Path(__file__).parent.parent
CH_ANNOTATOR_DIR = PROJECT_ROOT / "data" / "instances"
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
REPORTS_DIR = PROJECT_ROOT / "reports"


def load_yaml(path: Path) -> Any:
    """Load a YAML file, handling various formats (list, dict, scalar)."""
    import yaml  # deferred: see module-level NOTE
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def save_yaml(path: Path, data: Any) -> None:
    """Save data to a YAML file with human-friendly formatting (keys kept in order)."""
    import yaml  # deferred: see module-level NOTE
    with open(path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)


def normalize_isil(isil: str) -> str:
    """Normalize an ISIL code for comparison: strip, uppercase, drop all spaces."""
    if not isil:
        return ""
    return isil.strip().upper().replace(" ", "")


def normalize_wikidata(qid: str) -> str:
    """Normalize a Wikidata ID for comparison.

    Accepts either a bare Q-number or a full entity URL; in the URL case the
    trailing path segment is taken as the Q-number.
    """
    if not qid:
        return ""
    if '/' in str(qid):
        qid = str(qid).split('/')[-1]
    return str(qid).strip().upper()


def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    Lowercases, removes punctuation, and collapses runs of whitespace so that
    superficially different spellings compare equal.
    """
    if not name:
        return ""
    name = name.lower()
    name = re.sub(r'[^\w\s]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name


def extract_identifiers_from_ch_annotator(institution: Dict) -> Dict[str, str]:
    """Extract normalized identifiers (isil / wikidata / name) from a CH-Annotator record.

    Returns a dict with any of the keys 'isil', 'wikidata', 'name' (normalized)
    and 'name_original' (verbatim) that could be derived from the record.
    """
    identifiers = {}
    for ident in institution.get('identifiers', []):
        # `or ''` guards against an explicit None value stored under the key.
        scheme = (ident.get('identifier_scheme') or '').upper()
        value = ident.get('identifier_value', '')
        if scheme == 'ISIL':
            identifiers['isil'] = normalize_isil(value)
        elif scheme == 'WIKIDATA':
            identifiers['wikidata'] = normalize_wikidata(value)
    # Also check for name
    if institution.get('name'):
        identifiers['name'] = normalize_name(institution['name'])
        identifiers['name_original'] = institution['name']
    return identifiers


def build_custodian_index(custodian_dir: Path) -> Dict[str, Dict]:
    """
    Build an index of custodian files by various identifiers.

    Returns dict with keys: 'by_isil', 'by_wikidata', 'by_ghcid', 'by_name',
    'all_files'. Each maps {identifier: file_path}. Unreadable files are
    skipped with a warning rather than aborting the whole run.
    """
    index = {
        'by_isil': {},
        'by_wikidata': {},
        'by_ghcid': {},
        'by_name': {},
        'all_files': {}
    }
    if not custodian_dir.exists():
        print(f"Warning: Custodian directory does not exist: {custodian_dir}")
        return index

    yaml_files = list(custodian_dir.glob("*.yaml"))
    print(f"Indexing {len(yaml_files)} custodian files...")

    for file_path in yaml_files:
        try:
            data = load_yaml(file_path)
            if not data:
                continue
            # Store file path
            index['all_files'][file_path.name] = file_path

            # GHCID defaults to the filename stem; an explicit GHCID identifier
            # in the record (below) may add another key pointing at the same file.
            ghcid = file_path.stem
            index['by_ghcid'][ghcid] = file_path

            for ident in data.get('identifiers', []):
                scheme = (ident.get('identifier_scheme') or '').upper()
                value = ident.get('identifier_value', '')
                if scheme == 'GHCID' and value:
                    index['by_ghcid'][value] = file_path

            # Extract ISIL and name from original_entry (Dutch source data may
            # use 'organisatie' / 'isil-code_na' field names).
            original = data.get('original_entry', {})
            if original:
                isil = (original.get('isil-code_na')
                        or original.get('isil_code')
                        or original.get('ISIL'))
                if isil:
                    index['by_isil'][normalize_isil(isil)] = file_path
                name = (original.get('organisatie')
                        or original.get('name')
                        or original.get('institution_name'))
                if name:
                    index['by_name'][normalize_name(name)] = file_path

            # Extract Wikidata from enrichment
            wikidata = data.get('wikidata_enrichment', {})
            if wikidata:
                qid = wikidata.get('wikidata_entity_id')
                if qid:
                    index['by_wikidata'][normalize_wikidata(qid)] = file_path

            # Also check original entry for wikidata
            if original and original.get('wikidata_id'):
                index['by_wikidata'][normalize_wikidata(original['wikidata_id'])] = file_path

        except Exception as e:
            print(f"Warning: Error processing {file_path.name}: {e}")
            continue

    print(f"Indexed: {len(index['by_isil'])} ISIL, {len(index['by_wikidata'])} Wikidata, "
          f"{len(index['by_ghcid'])} GHCID, {len(index['by_name'])} names")
    return index


def find_matching_custodian(
    institution: Dict,
    custodian_index: Dict
) -> Tuple[Optional[Path], str]:
    """
    Find matching custodian file for a CH-Annotator institution.

    Match priority: ISIL (most reliable) > Wikidata > exact normalized name.

    Returns: (file_path, match_type) or (None, 'no_match')
    """
    identifiers = extract_identifiers_from_ch_annotator(institution)

    # Priority 1: ISIL match (most reliable)
    if identifiers.get('isil'):
        if identifiers['isil'] in custodian_index['by_isil']:
            return custodian_index['by_isil'][identifiers['isil']], 'isil'

    # Priority 2: Wikidata match
    if identifiers.get('wikidata'):
        if identifiers['wikidata'] in custodian_index['by_wikidata']:
            return custodian_index['by_wikidata'][identifiers['wikidata']], 'wikidata'

    # Priority 3: Exact name match (less reliable)
    if identifiers.get('name'):
        if identifiers['name'] in custodian_index['by_name']:
            return custodian_index['by_name'][identifiers['name']], 'name'

    return None, 'no_match'


def extract_ch_annotator_section(institution: Dict) -> Dict:
    """Extract the ch_annotator section from an institution.

    NOTE(review): this returns the live sub-dict, not a copy, so callers that
    mutate it (add_ch_annotator_to_custodian does) also mutate the source
    institution record.
    """
    return institution.get('ch_annotator', {})


def add_ch_annotator_to_custodian(
    custodian_data: Dict,
    ch_annotator_section: Dict,
    source_file: str
) -> Dict:
    """Attach a CH-Annotator section (plus provenance note) to custodian data.

    An 'integration_note' recording the source file and UTC timestamp is added
    to the section; if the custodian already had a non-empty ch_annotator
    section, the note also flags that a previous annotation was present (the
    old section is replaced, not merged).
    """
    if 'ch_annotator' in custodian_data and custodian_data['ch_annotator']:
        ch_annotator_section['integration_note'] = {
            'integrated_from': source_file,
            'integration_date': datetime.now(timezone.utc).isoformat(),
            'previous_annotation_present': True
        }
    else:
        ch_annotator_section['integration_note'] = {
            'integrated_from': source_file,
            'integration_date': datetime.now(timezone.utc).isoformat()
        }
    custodian_data['ch_annotator'] = ch_annotator_section
    return custodian_data


def load_ch_annotator_files() -> List[Tuple[Path, List[Dict]]]:
    """Load all CH-Annotator files and return list of (path, institutions).

    Accepts either a bare list of institutions or a dict with an
    'institutions' key; anything else is skipped with a warning.
    """
    files = list(CH_ANNOTATOR_DIR.glob("*_ch_annotator.yaml"))
    results = []
    for file_path in files:
        try:
            data = load_yaml(file_path)
            # Handle different formats
            if isinstance(data, list):
                institutions = data
            elif isinstance(data, dict):
                institutions = data.get('institutions', [])
            else:
                print(f"Warning: Unexpected format in {file_path.name}")
                continue
            results.append((file_path, institutions))
            print(f"Loaded {len(institutions)} institutions from {file_path.name}")
        except Exception as e:
            print(f"Error loading {file_path.name}: {e}")
            continue
    return results


def generate_report(
    stats: Dict,
    matches: List[Dict],
    unmatched: List[Dict],
    report_path: Path
) -> None:
    """Write a Markdown integration report (summary, per-source match rates,
    first 50 matched and unmatched institutions) to report_path."""
    report = f"""# CH-Annotator to Custodian Integration Report

Generated: {datetime.now(timezone.utc).isoformat()}

## Summary

| Metric | Count |
|--------|-------|
| Total CH-Annotator files processed | {stats['files_processed']} |
| Total institutions in CH-Annotator files | {stats['total_institutions']} |
| Matched to existing custodian files | {stats['matched']} |
| - Matched by ISIL | {stats['match_by_isil']} |
| - Matched by Wikidata | {stats['match_by_wikidata']} |
| - Matched by Name | {stats['match_by_name']} |
| Unmatched (no custodian file) | {stats['unmatched']} |
| Custodian files updated | {stats['files_updated']} |
| New custodian files created | {stats['files_created']} |
| Errors | {stats['errors']} |

## Match Rate by Source File

| Source File | Institutions | Matched | Unmatched | Match Rate |
|-------------|--------------|---------|-----------|------------|
"""
    for source, source_stats in stats.get('by_source', {}).items():
        total = source_stats['total']
        matched = source_stats['matched']
        unmatched_count = source_stats['unmatched']
        rate = f"{(matched/total*100):.1f}%" if total > 0 else "N/A"
        report += f"| {source} | {total} | {matched} | {unmatched_count} | {rate} |\n"

    report += """
## Match Details

### Matched Institutions (first 50)

| Institution Name | Match Type | Custodian File |
|------------------|------------|----------------|
"""
    for match in matches[:50]:
        # `or 'Unknown'` also covers an explicit None stored under 'name'
        # (which `.get(..., 'Unknown')` would pass through and crash on [:50]).
        name = (match.get('name') or 'Unknown')[:50]
        match_type = match.get('match_type', 'unknown')
        file_name = match.get('custodian_file', 'N/A')
        if isinstance(file_name, Path):
            file_name = file_name.name
        report += f"| {name} | {match_type} | {file_name} |\n"

    if len(matches) > 50:
        report += f"\n... and {len(matches) - 50} more matches\n"

    report += """
### Unmatched Institutions (first 50)

These institutions from CH-Annotator files have no matching custodian file:

| Institution Name | Source File | Identifiers |
|------------------|-------------|-------------|
"""
    for inst in unmatched[:50]:
        name = (inst.get('name') or 'Unknown')[:40]
        source = inst.get('source_file', 'Unknown')
        idents = []
        for ident in inst.get('identifiers', [])[:2]:
            value = str(ident.get('identifier_value') or '')  # None-safe slice
            idents.append(f"{ident.get('identifier_scheme')}: {value[:20]}")
        ident_str = ", ".join(idents) if idents else "None"
        report += f"| {name} | {source} | {ident_str} |\n"

    if len(unmatched) > 50:
        report += f"\n... and {len(unmatched) - 50} more unmatched institutions\n"

    report += """
## Notes

- **ISIL match**: Most reliable, based on standard library/archive identifier
- **Wikidata match**: Reliable, based on unique Wikidata Q-number
- **Name match**: Less reliable, based on normalized name comparison

## Next Steps

1. Review unmatched institutions - may need manual matching or new custodian files
2. Validate integration by spot-checking updated custodian files
3. Run LinkML validation on updated files
"""
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"\nReport saved to: {report_path}")


def main():
    """Entry point: index custodians, match institutions, apply updates, report.

    NOTE(review): the module docstring says unmatched institutions get new
    custodian files, but no code path creates them -- 'files_created' is always
    0. Confirm intended behavior before relying on that claim.
    """
    parser = argparse.ArgumentParser(description='Integrate CH-Annotator data into custodian files')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without writing')
    parser.add_argument('--report-only', action='store_true', help='Only generate report, no changes')
    args = parser.parse_args()

    print("=" * 60)
    print("CH-Annotator to Custodian Integration")
    print("=" * 60)
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
    if args.report_only:
        print("REPORT ONLY MODE - Only generating matching report")

    # Build index of existing custodian files
    print("\n1. Building custodian file index...")
    custodian_index = build_custodian_index(CUSTODIAN_DIR)

    # Load CH-Annotator files
    print("\n2. Loading CH-Annotator files...")
    ch_annotator_files = load_ch_annotator_files()
    if not ch_annotator_files:
        print("No CH-Annotator files found!")
        return 0  # explicit: nothing to do, exit cleanly (was implicit None)

    # Process institutions
    print("\n3. Matching institutions to custodian files...")
    stats = {
        'files_processed': len(ch_annotator_files),
        'total_institutions': 0,
        'matched': 0,
        'unmatched': 0,
        'match_by_isil': 0,
        'match_by_wikidata': 0,
        'match_by_name': 0,
        'files_updated': 0,
        'files_created': 0,
        'errors': 0,
        'by_source': {}
    }
    all_matches = []
    all_unmatched = []
    files_to_update = {}  # custodian_path -> list of ch_annotator sections

    for source_path, institutions in ch_annotator_files:
        source_name = source_path.name
        source_stats = {'total': len(institutions), 'matched': 0, 'unmatched': 0}

        for institution in institutions:
            stats['total_institutions'] += 1

            # Find matching custodian file
            match_path, match_type = find_matching_custodian(institution, custodian_index)

            if match_path:
                stats['matched'] += 1
                source_stats['matched'] += 1
                if match_type == 'isil':
                    stats['match_by_isil'] += 1
                elif match_type == 'wikidata':
                    stats['match_by_wikidata'] += 1
                elif match_type == 'name':
                    stats['match_by_name'] += 1
                all_matches.append({
                    'name': institution.get('name'),
                    'match_type': match_type,
                    'custodian_file': match_path,
                    'source_file': source_name
                })
                # Queue for update
                if match_path not in files_to_update:
                    files_to_update[match_path] = []
                files_to_update[match_path].append({
                    'institution': institution,
                    'source_file': source_name
                })
            else:
                stats['unmatched'] += 1
                source_stats['unmatched'] += 1
                all_unmatched.append({
                    'name': institution.get('name'),
                    'identifiers': institution.get('identifiers', []),
                    'source_file': source_name
                })

        stats['by_source'][source_name] = source_stats

    # Update custodian files
    if not args.report_only and not args.dry_run:
        print(f"\n4. Updating {len(files_to_update)} custodian files...")
        for custodian_path, updates in files_to_update.items():
            try:
                custodian_data = load_yaml(custodian_path)
                # Use the first matching institution's CH-Annotator section
                # (usually there's only one match per custodian file)
                first_update = updates[0]
                ch_annotator_section = extract_ch_annotator_section(first_update['institution'])
                if ch_annotator_section:
                    custodian_data = add_ch_annotator_to_custodian(
                        custodian_data,
                        ch_annotator_section,
                        first_update['source_file']
                    )
                    save_yaml(custodian_path, custodian_data)
                    stats['files_updated'] += 1
            except Exception as e:
                print(f"Error updating {custodian_path.name}: {e}")
                stats['errors'] += 1
    elif args.dry_run:
        print(f"\n4. Would update {len(files_to_update)} custodian files (dry run)")
        stats['files_updated'] = len(files_to_update)

    # Generate report
    print("\n5. Generating report...")
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_path = REPORTS_DIR / f"CH_ANNOTATOR_CUSTODIAN_INTEGRATION_{timestamp}.md"
    generate_report(stats, all_matches, all_unmatched, report_path)

    # Print summary
    print("\n" + "=" * 60)
    print("INTEGRATION SUMMARY")
    print("=" * 60)
    print(f"Total institutions processed: {stats['total_institutions']}")
    # Guard against ZeroDivisionError when all loaded files were empty.
    total = stats['total_institutions']
    rate = f" ({stats['matched'] / total * 100:.1f}%)" if total else ""
    print(f"Matched: {stats['matched']}{rate}")
    print(f"  - By ISIL: {stats['match_by_isil']}")
    print(f"  - By Wikidata: {stats['match_by_wikidata']}")
    print(f"  - By Name: {stats['match_by_name']}")
    print(f"Unmatched: {stats['unmatched']}")
    print(f"Custodian files updated: {stats['files_updated']}")
    print(f"Errors: {stats['errors']}")
    return 0


if __name__ == '__main__':
    sys.exit(main())