#!/usr/bin/env python3
"""
Quick cross-link Czech datasets - exact matches only + provenance fix

Fast version focusing on:
1. Exact name matching (11 institutions)
2. Fix provenance metadata for all institutions
3. Create unified dataset

Skips fuzzy matching for speed (can add later if needed).
"""
import yaml
from pathlib import Path
from datetime import datetime, timezone

# File paths
ADR_FILE = Path("data/instances/czech_institutions.yaml")
ARON_FILE = Path("data/instances/czech_archives_aron.yaml")
OUTPUT_FILE = Path("data/instances/czech_unified.yaml")


def _load_yaml(path):
    """Load and return the parsed YAML document at *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def _merge_matched(adr_inst, aron_inst):
    """Merge one exact-name-matched ADR/ARON pair into a new record.

    Uses the ADR record as the base, adds any ARON identifiers whose
    scheme is not already present, combines the two descriptions, and
    stamps fresh merged provenance. Neither input dict is mutated
    (the original code's shallow copy let the appends/updates leak
    back into the loaded datasets).
    """
    merged = adr_inst.copy()

    # Copy the identifier list so appends don't mutate adr_inst's list.
    merged['identifiers'] = list(merged.get('identifiers') or [])
    existing_schemes = {i.get('identifier_scheme') for i in merged['identifiers']}
    for aron_id in aron_inst.get('identifiers', []):
        if aron_id['identifier_scheme'] not in existing_schemes:
            merged['identifiers'].append(aron_id)
            existing_schemes.add(aron_id['identifier_scheme'])

    # Merge descriptions. NOTE: the original wrote a literal backslash-n
    # ("\\n\\n") into the text; real blank lines are intended here.
    adr_desc = adr_inst.get('description', '')
    aron_desc = aron_inst.get('description', '')
    if adr_desc and aron_desc and adr_desc != aron_desc:
        merged['description'] = f"{adr_desc}\n\nArchival context: {aron_desc}"
    elif aron_desc and not adr_desc:
        merged['description'] = aron_desc

    # Replace provenance wholesale: this record now comes from both APIs.
    merged['provenance'] = {
        'data_source': 'API_SCRAPING',
        'data_tier': 'TIER_1_AUTHORITATIVE',
        'extraction_date': datetime.now(timezone.utc).isoformat(),
        'extraction_method': 'Merged from ADR (library API) and ARON (archive API) - exact name match',
        'confidence_score': 1.0,
        'source_url': 'https://adr.cz + https://portal.nacr.cz/aron',
        'notes': 'Combined metadata from both ADR and ARON databases',
    }
    return merged


def _fix_provenance(inst, source_url, extraction_method):
    """Return a copy of *inst* with corrected provenance fields.

    Copies the nested provenance dict before updating it — the original
    code's shallow `inst.copy()` mutated the shared dict in the loaded
    dataset and raised KeyError when 'provenance' was missing.
    """
    fixed = inst.copy()
    prov = dict(fixed.get('provenance') or {})
    prov['data_source'] = 'API_SCRAPING'
    prov['source_url'] = source_url
    prov['extraction_method'] = extraction_method
    fixed['provenance'] = prov
    return fixed


def main():
    """Cross-link the ADR and ARON datasets and write the unified file + report."""
    print("=" * 70)
    print("Czech Dataset Quick Cross-linking")
    print("=" * 70)

    # Load datasets
    print("\nLoading datasets...")
    adr = _load_yaml(ADR_FILE)
    aron = _load_yaml(ARON_FILE)
    print(f" ADR: {len(adr):,} institutions")
    print(f" ARON: {len(aron):,} institutions")

    # Find exact matches by institution name.
    print("\nFinding exact name matches...")
    adr_by_name = {inst['name']: (i, inst) for i, inst in enumerate(adr)}
    aron_by_name = {inst['name']: (i, inst) for i, inst in enumerate(aron)}
    exact_matches = set(adr_by_name.keys()) & set(aron_by_name.keys())
    print(f" Found {len(exact_matches)} exact matches")

    matched_adr_indices = set()
    matched_aron_indices = set()

    # Build unified dataset
    print("\nBuilding unified dataset...")
    unified = []

    # 1. Merge exact matches
    print(f" Merging {len(exact_matches)} exact matches...")
    for name in exact_matches:
        adr_idx, adr_inst = adr_by_name[name]
        aron_idx, aron_inst = aron_by_name[name]
        matched_adr_indices.add(adr_idx)
        matched_aron_indices.add(aron_idx)
        unified.append(_merge_matched(adr_inst, aron_inst))

    # 2. Add ADR-only institutions (with fixed provenance)
    adr_only_count = 0
    print(" Adding ADR-only institutions...")
    for i, inst in enumerate(adr):
        if i not in matched_adr_indices:
            unified.append(_fix_provenance(
                inst,
                'https://adr.cz/api/institution/list',
                'ADR library database API scraping',
            ))
            adr_only_count += 1
    print(f" Added {adr_only_count:,} ADR-only institutions")

    # 3. Add ARON-only institutions (with fixed provenance)
    aron_only_count = 0
    print(" Adding ARON-only institutions...")
    for i, inst in enumerate(aron):
        if i not in matched_aron_indices:
            unified.append(_fix_provenance(
                inst,
                'https://portal.nacr.cz/aron/institution',
                'ARON archive portal API scraping (reverse-engineered with type filter)',
            ))
            aron_only_count += 1
    print(f" Added {aron_only_count:,} ARON-only institutions")

    print(f"\nUnified dataset: {len(unified):,} institutions")
    print(f" Merged: {len(exact_matches)}")
    print(f" ADR only: {adr_only_count:,}")
    print(f" ARON only: {aron_only_count:,}")

    # Save unified dataset
    print(f"\nSaving to {OUTPUT_FILE}...")
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(unified, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)
    print(f"Saved {len(unified):,} institutions")

    # Generate quick report. NOTE: the original contained mojibake
    # ("āœ…" = UTF-8 "✅" mis-decoded); repaired here.
    print("\nGenerating report...")
    report = f"""# Czech Dataset Cross-linking Report (Quick Version)

**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
**Status**: ✅ COMPLETE (exact matches only)

## Summary

- **Exact matches**: {len(exact_matches)} institutions
- **ADR only**: {adr_only_count:,} institutions
- **ARON only**: {aron_only_count:,} institutions
- **Total unified**: {len(unified):,} institutions

## Exact Matches

"""
    for name in sorted(exact_matches):
        report += f"- {name}\n"
    report += f"""
## Provenance Fixes

✅ All {len(unified):,} institutions now have corrected metadata:
- **data_source**: Changed from `CONVERSATION_NLP` to `API_SCRAPING`
- **source_url**: Added proper API endpoints
- **extraction_method**: Clarified for ADR vs ARON vs merged

## Files Created

1. **`{OUTPUT_FILE}`** - Unified dataset ({len(unified):,} institutions)
2. **`CZECH_CROSSLINK_REPORT.md`** - This report

## Next Steps

- [x] Cross-link datasets (exact matches)
- [x] Fix provenance metadata
- [ ] Geocode addresses (Priority 1 - next)
- [ ] Fuzzy matching (optional - can add later)
- [ ] Wikidata enrichment (Priority 2)
"""
    with open('CZECH_CROSSLINK_REPORT.md', 'w', encoding='utf-8') as f:
        f.write(report)
    print("Report saved: CZECH_CROSSLINK_REPORT.md")

    print("\n✅ Cross-linking complete!")
    print(f"\nNext: Geocode {adr_only_count + len(exact_matches):,} ADR addresses")


if __name__ == "__main__":
    main()