#!/usr/bin/env python3 """ Add manually discovered Wikidata identifiers from direct SPARQL searches. FOUND: - Egyptian Museum Cairo (EMC) → Q201219 (Egyptian Museum) - Grand Egyptian Museum (GEM) → Q2583681 (Grand Egyptian Museum) - Bibliotheca Alexandrina → Q501851 (Bibliotheca Alexandrina) """ import yaml from pathlib import Path from datetime import datetime, timezone input_file = Path("data/instances/egypt_institutions_wikidata_corrected.yaml") with open(input_file, 'r', encoding='utf-8') as f: institutions = yaml.safe_load(f) print("="*80) print("šŸ” Adding manually discovered Wikidata identifiers") print("="*80) print() manual_matches = { "Egyptian Museum Cairo (EMC)": { "qid": "Q201219", "label": "Egyptian Museum", "method": "Direct SPARQL search" }, "Grand Egyptian Museum (GEM)": { "qid": "Q2583681", "label": "Grand Egyptian Museum", "method": "Direct SPARQL search" }, "Bibliotheca Alexandrina": { "qid": "Q501851", "label": "Bibliotheca Alexandrina", "method": "Direct SPARQL search" } } added_count = 0 for inst in institutions: name = inst.get('name', '') if name in manual_matches: match = manual_matches[name] # Check if already has Wikidata identifiers = inst.get('identifiers', []) has_wikidata = any(i.get('identifier_scheme') == 'Wikidata' for i in identifiers) if not has_wikidata: # Add Wikidata identifier new_identifier = { 'identifier_scheme': 'Wikidata', 'identifier_value': match['qid'], 'identifier_url': f"https://www.wikidata.org/wiki/{match['qid']}" } if 'identifiers' not in inst: inst['identifiers'] = [] inst['identifiers'].append(new_identifier) # Add provenance note if 'provenance' not in inst: inst['provenance'] = {} inst['provenance']['wikidata_enrichment'] = { 'method': match['method'], 'enrichment_date': datetime.now(timezone.utc).isoformat(), 'wikidata_label': match['label'], 'verified': True } print(f"āœ… ADDED: {name}") print(f" Q-number: {match['qid']} ({match['label']})") print(f" Method: {match['method']}\n") added_count += 1 # Write updated dataset with open(input_file, 'w', encoding='utf-8') as f: yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, width=120) # Calculate final coverage total = len(institutions) with_wikidata = sum(1 for inst in institutions if any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers', []))) coverage = (with_wikidata / total * 100) if total > 0 else 0 print("="*80) print("šŸ“Š MANUAL ENRICHMENT RESULTS") print("="*80) print(f"āœ… Institutions added: {added_count}") print(f"šŸ“ˆ Wikidata coverage: {coverage:.1f}% ({with_wikidata}/{total})") print(f"\nšŸ’¾ Updated: {input_file}") print("="*80)