#!/usr/bin/env python3
"""Merge SPARQL fuzzy matches with ISIL-based enrichments.

Strategy:
1. Start with SPARQL enriched file (has 180 NL fuzzy matches)
2. Add ISIL-based Wikidata IDs from current file where missing
3. Preserve ALL real Q-numbers, prioritize SPARQL fuzzy matches
4. Remove synthetic Q-numbers (>Q100000000)
"""

from pathlib import Path
from datetime import datetime, timezone
import yaml
import sys

# Q-numbers at or above this value are synthetic placeholders minted by an
# earlier pipeline stage, not real Wikidata entities.
SYNTHETIC_QNUMBER_THRESHOLD = 100_000_000


def is_real_qnumber(q_value: str) -> bool:
    """Check if Q-number is real (not synthetic).

    A real Q-number is a string of the form ``Q<digits>`` whose numeric part
    is below ``SYNTHETIC_QNUMBER_THRESHOLD``. Anything else (empty, missing
    the ``Q`` prefix, non-numeric suffix) is rejected.
    """
    if not q_value or not q_value.startswith('Q'):
        return False
    try:
        return int(q_value[1:]) < SYNTHETIC_QNUMBER_THRESHOLD
    except ValueError:
        return False


def get_wikidata_id(institution: dict) -> str | None:
    """Extract the first *real* Wikidata Q-number from an institution.

    Scans ``institution['identifiers']`` for a ``Wikidata`` scheme entry;
    synthetic Q-numbers are ignored. Returns ``None`` when no real Q-number
    is present.
    """
    for id_obj in institution.get('identifiers', []):
        if id_obj.get('identifier_scheme') == 'Wikidata':
            q_val = id_obj.get('identifier_value', '')
            if is_real_qnumber(q_val):
                return q_val
    return None


def merge_identifiers(sparql_inst: dict, isil_inst: dict) -> tuple[list, int]:
    """Merge identifiers from both sources, prioritizing SPARQL.

    SPARQL identifiers always win: an ISIL identifier is only appended when
    its scheme is not already present on the SPARQL institution. Synthetic
    Wikidata Q-numbers from the ISIL side are skipped outright. The SPARQL
    institution's identifier list is mutated in place.

    Fixes over the previous revision: duplicate schemes within the ISIL list
    are no longer all appended (newly added schemes are now tracked), and
    identifiers with an empty scheme are no longer copied over.

    Returns:
        (updated identifiers list, count of identifiers added)
    """
    sparql_ids = sparql_inst.get('identifiers', [])

    # Schemes already present on the SPARQL side; we only need membership,
    # not the values, because SPARQL values are never overwritten.
    existing_schemes = {
        id_obj['identifier_scheme']
        for id_obj in sparql_ids
        if id_obj.get('identifier_scheme')
    }

    added_count = 0
    for id_obj in isil_inst.get('identifiers', []):
        scheme = id_obj.get('identifier_scheme', '')
        value = id_obj.get('identifier_value', '')
        # Skip synthetic Q-numbers
        if scheme == 'Wikidata' and not is_real_qnumber(value):
            continue
        if scheme and scheme not in existing_schemes:
            sparql_ids.append(id_obj)
            existing_schemes.add(scheme)  # prevent dupes within the ISIL list
            added_count += 1

    return sparql_ids, added_count


def merge_datasets(sparql_data: list, isil_data: list) -> tuple[list, dict]:
    """Merge SPARQL and ISIL datasets.

    Institutions in *sparql_data* are updated in place: missing Wikidata
    identifiers are filled from the matching ISIL record (matched by ``id``),
    provenance is annotated, and synthetic Q-numbers are stripped.

    Returns:
        (merged_data, stats) where stats counts additions/removals and the
        set of countries that received extra enrichment.
    """
    # Index ISIL data by institution id for O(1) lookup.
    isil_by_id = {inst['id']: inst for inst in isil_data}

    stats = {
        'total_institutions': len(sparql_data),
        'isil_additions': 0,
        'identifiers_added': 0,
        'synthetic_removed': 0,
        'countries_enriched': set(),
    }

    merged = []
    for sparql_inst in sparql_data:
        isil_inst = isil_by_id.get(sparql_inst['id'])

        if isil_inst:
            # Only pull ISIL identifiers when SPARQL lacks a real Wikidata
            # ID but the ISIL record has one.
            if not get_wikidata_id(sparql_inst) and get_wikidata_id(isil_inst):
                updated_ids, added = merge_identifiers(sparql_inst, isil_inst)
                sparql_inst['identifiers'] = updated_ids

                if added > 0:
                    stats['isil_additions'] += 1
                    stats['identifiers_added'] += added

                    # Track country of the first location. The ``or [{}]``
                    # guard also covers an explicit empty locations list,
                    # which previously raised IndexError.
                    country = (sparql_inst.get('locations') or [{}])[0].get('country', '')
                    if country:
                        stats['countries_enriched'].add(country)

                    # Record the enrichment step in provenance (idempotent).
                    prov = sparql_inst.get('provenance', {})
                    if isinstance(prov, dict):
                        method = prov.get('extraction_method', '')
                        if 'ISIL batch enrichment' not in method:
                            prov['extraction_method'] = f"{method} + ISIL batch enrichment"

        # Strip synthetic Wikidata Q-numbers. Rebuild the list instead of
        # calling list.remove() per item (quadratic and order-fragile).
        if 'identifiers' in sparql_inst:
            kept = []
            for id_obj in sparql_inst['identifiers']:
                if (id_obj.get('identifier_scheme') == 'Wikidata'
                        and not is_real_qnumber(id_obj.get('identifier_value', ''))):
                    stats['synthetic_removed'] += 1
                else:
                    kept.append(id_obj)
            sparql_inst['identifiers'] = kept

        merged.append(sparql_inst)

    return merged, stats


def _is_nl(inst: dict) -> bool:
    """True when any of the institution's locations is in the Netherlands."""
    return any(loc.get('country') == 'NL' for loc in inst.get('locations', []))


def _pct(part: int, total: int) -> float:
    """Percentage of *part* in *total*; 0.0 when *total* is zero."""
    return part / total * 100 if total else 0.0


def main():
    """Load both enrichment files, merge them, and write the merged YAML."""
    base_dir = Path(__file__).parent.parent
    instances_dir = base_dir / "data" / "instances" / "global"
    sparql_file = instances_dir / "global_heritage_institutions_sparql_enriched.yaml"
    isil_file = instances_dir / "global_heritage_institutions_wikidata_enriched.yaml"
    output_file = instances_dir / "global_heritage_institutions_merged.yaml"

    print("="*80)
    print("šŸ”€ MERGE SPARQL FUZZY MATCHES + ISIL ENRICHMENTS")
    print("="*80)
    print("\nšŸ“– Loading datasets...\n")

    # Load SPARQL enriched (has fuzzy matches)
    print("   Loading SPARQL enriched file...")
    with open(sparql_file, 'r', encoding='utf-8') as f:
        sparql_data = yaml.safe_load(f)
    print(f"   āœ… {len(sparql_data):,} institutions")

    # Load ISIL enriched (has ISIL matches)
    print("   Loading ISIL enriched file...")
    with open(isil_file, 'r', encoding='utf-8') as f:
        isil_data = yaml.safe_load(f)
    print(f"   āœ… {len(isil_data):,} institutions\n")

    # Count current NL coverage (guarded against an empty NL subset).
    nl_sparql = sum(1 for inst in sparql_data if _is_nl(inst) and get_wikidata_id(inst))
    nl_total = sum(1 for inst in sparql_data if _is_nl(inst))
    print(f"šŸ“Š Netherlands coverage before merge: {nl_sparql}/{nl_total} ({_pct(nl_sparql, nl_total):.1f}%)\n")

    # Merge
    print("šŸ”€ Merging datasets...\n")
    merged_data, stats = merge_datasets(sparql_data, isil_data)

    # Count NL coverage after merge
    nl_merged = sum(1 for inst in merged_data if _is_nl(inst) and get_wikidata_id(inst))

    # Write output
    print("šŸ’¾ Writing merged dataset...\n")
    header = f"""---
# Global Heritage Institutions - Merged SPARQL + ISIL Enrichment
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(merged_data):,}
# ISIL additions: {stats['isil_additions']:,}
# Identifiers added: {stats['identifiers_added']:,}
# Synthetic Q-numbers removed: {stats['synthetic_removed']:,}
# Countries with additional enrichment: {', '.join(sorted(stats['countries_enriched']))}

"""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(merged_data, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=120)

    # Previously this print statement was split across a broken f-string.
    print(f"āœ… Complete! Output: {output_file}\n")

    # Final report
    print("="*80)
    print("šŸ“Š MERGE REPORT")
    print("="*80)
    print("\n✨ Results:")
    print(f"   Total institutions: {stats['total_institutions']:,}")
    print(f"   Institutions with ISIL additions: {stats['isil_additions']:,}")
    print(f"   Total identifiers added: {stats['identifiers_added']:,}")
    print(f"   Synthetic Q-numbers removed: {stats['synthetic_removed']:,}")
    print(f"   Countries enriched: {', '.join(sorted(stats['countries_enriched'])) or 'None'}")
    print("\nšŸ‡³šŸ‡± Netherlands Wikidata coverage:")
    print(f"   Before: {nl_sparql}/{nl_total} ({_pct(nl_sparql, nl_total):.1f}%)")
    print(f"   After:  {nl_merged}/{nl_total} ({_pct(nl_merged, nl_total):.1f}%)")
    print(f"   Change: {nl_merged - nl_sparql:+,} matches")
    print("="*80 + "\n")


if __name__ == "__main__":
    main()