#!/usr/bin/env python3
"""
Great Britain Heritage Institutions Enrichment - Batch 1
==========================================================

Strategy: Fuzzy name matching with Wikidata SPARQL queries
Threshold: 0.85 (same as Georgia Batch 1)
Target: 4 GB institutions (0% current coverage)
Goal: Achieve 50%+ Wikidata coverage
"""

import os
import time
from datetime import datetime, timezone

import yaml
from rapidfuzz import fuzz
from SPARQLWrapper import SPARQLWrapper, JSON

# Wikidata SPARQL endpoint
WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"


def query_wikidata_gb_institutions():
    """Query Wikidata for British heritage institutions.

    Returns:
        list: SPARQL result bindings (one dict per institution), or an
        empty list if the query fails.
    """
    sparql = SPARQLWrapper(WIKIDATA_ENDPOINT)

    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemAltLabel ?coord ?website ?viaf ?isil ?inception WHERE {
      # Archives, research centers, databases in Great Britain
      VALUES ?type {
        wd:Q166118    # archives
        wd:Q21045422  # research database
        wd:Q31855     # research institute
        wd:Q7315155   # research center
        wd:Q3918      # university (for university-based archives/research centers)
      }
      ?item wdt:P31/wdt:P279* ?type .
      ?item wdt:P17 wd:Q145 .  # Country: United Kingdom

      OPTIONAL { ?item wdt:P625 ?coord }
      OPTIONAL { ?item wdt:P856 ?website }
      OPTIONAL { ?item wdt:P214 ?viaf }
      OPTIONAL { ?item wdt:P791 ?isil }
      OPTIONAL { ?item wdt:P571 ?inception }

      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en,ar" .
        ?item rdfs:label ?itemLabel .
        ?item skos:altLabel ?itemAltLabel .
      }
    }
    LIMIT 1000
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    print("šŸ” Querying Wikidata for British heritage institutions...")

    try:
        results = sparql.query().convert()
        institutions = results['results']['bindings']
        print(f"   āœ… Found {len(institutions)} British institutions in Wikidata\n")
        return institutions
    except Exception as e:
        # Best-effort: a failed query degrades to "no matches" rather than a crash.
        print(f"   āŒ Query failed: {e}")
        return []


def fuzzy_match_institutions(our_institutions, wikidata_institutions, threshold=0.85):
    """Match our institutions to Wikidata using fuzzy name matching.

    Args:
        our_institutions: list of institution dicts (each with a 'name' key).
        wikidata_institutions: SPARQL bindings from query_wikidata_gb_institutions().
        threshold: minimum similarity (0-1) to accept a match.

    Returns:
        list of dicts with keys 'institution', 'wikidata', 'score'.
    """
    matches = []

    for our_inst in our_institutions:
        our_name = our_inst['name'].lower()
        best_match = None
        best_score = 0

        for wd_inst in wikidata_institutions:
            wd_label = wd_inst.get('itemLabel', {}).get('value', '').lower()

            # Try main label
            score = fuzz.ratio(our_name, wd_label)
            if score > best_score:
                best_score = score
                best_match = wd_inst

            # Try alternative labels. The Wikidata label service returns
            # ?itemAltLabel as a single comma-separated string of ALL alt
            # labels, so score each one individually instead of fuzzing
            # against the concatenation (which almost never matches).
            if 'itemAltLabel' in wd_inst:
                for alt_label in wd_inst['itemAltLabel']['value'].split(', '):
                    alt_score = fuzz.ratio(our_name, alt_label.lower())
                    if alt_score > best_score:
                        best_score = alt_score
                        best_match = wd_inst

        if best_score >= threshold * 100:  # rapidfuzz returns 0-100
            matches.append({
                'institution': our_inst,
                'wikidata': best_match,
                'score': best_score / 100
            })
            print(f"   āœ… Match (score={best_score/100:.2f}): {our_inst['name']}")
            print(f"      → {best_match['itemLabel']['value']} ({best_match['item']['value'].split('/')[-1]})")
        else:
            print(f"   āŒ No match: {our_inst['name']} (best score: {best_score/100:.2f})")

    return matches


def enrich_with_wikidata(institution, wikidata_data, match_score):
    """Add Wikidata identifiers and metadata to institution.

    Mutates `institution` in place (identifiers, coordinates, founding
    date, provenance) and also returns it for convenience.

    Args:
        institution: our institution dict (mutated in place).
        wikidata_data: one SPARQL binding for the matched Wikidata item.
        match_score: fuzzy-match score (0-1), recorded in provenance.

    Returns:
        dict: the enriched institution.
    """
    q_id = wikidata_data['item']['value'].split('/')[-1]

    # Add Wikidata identifier
    if 'identifiers' not in institution:
        institution['identifiers'] = []

    # Check if Wikidata identifier already exists (idempotent enrichment)
    has_wikidata = any(i.get('identifier_scheme') == 'Wikidata'
                       for i in institution['identifiers'])
    if not has_wikidata:
        institution['identifiers'].append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': q_id,
            'identifier_url': f"https://www.wikidata.org/wiki/{q_id}"
        })

    # Add VIAF if available
    if 'viaf' in wikidata_data:
        has_viaf = any(i.get('identifier_scheme') == 'VIAF'
                       for i in institution['identifiers'])
        if not has_viaf:
            institution['identifiers'].append({
                'identifier_scheme': 'VIAF',
                'identifier_value': wikidata_data['viaf']['value'],
                'identifier_url': f"https://viaf.org/viaf/{wikidata_data['viaf']['value']}"
            })

    # Add ISIL if available
    if 'isil' in wikidata_data:
        has_isil = any(i.get('identifier_scheme') == 'ISIL'
                       for i in institution['identifiers'])
        if not has_isil:
            institution['identifiers'].append({
                'identifier_scheme': 'ISIL',
                'identifier_value': wikidata_data['isil']['value']
            })

    # Add coordinates if available and not already present
    if 'coord' in wikidata_data:
        coord_str = wikidata_data['coord']['value']
        # Parse WKT "Point(lon lat)" format
        coord_str = coord_str.replace('Point(', '').replace(')', '')
        lon, lat = map(float, coord_str.split())
        for location in institution.get('locations', []):
            if location.get('country') == 'GB' and 'latitude' not in location:
                location['latitude'] = lat
                location['longitude'] = lon

    # Add founding date if available (strip time component from xsd:dateTime)
    if 'inception' in wikidata_data:
        institution['founding_date'] = wikidata_data['inception']['value'].split('T')[0]

    # Update provenance
    if 'provenance' not in institution:
        institution['provenance'] = {}
    if 'notes' not in institution['provenance']:
        institution['provenance']['notes'] = []
    institution['provenance']['notes'].append(
        f"Batch 1: Fuzzy name match (score={match_score:.2f}) - Wikidata {q_id}"
    )
    institution['provenance']['last_updated'] = datetime.now(timezone.utc).isoformat()
    institution['provenance']['wikidata_verified'] = True

    return institution


def main():
    """Run the Batch 1 enrichment pipeline end to end.

    Loads the unified dataset, filters GB institutions, queries Wikidata,
    fuzzy-matches, enriches in place, saves results, and prints a summary.
    """
    print("=" * 80)
    print("šŸ‡¬šŸ‡§ Great Britain Heritage Institutions Enrichment - Batch 1")
    print("=" * 80)
    print("\nStrategy: Fuzzy name matching (threshold 0.85)\n")

    # Load our dataset
    print("šŸ“‚ Loading unified global dataset...")
    with open('data/instances/all/globalglam-20251111.yaml', 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)

    # Filter GB institutions
    gb_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'GB' for loc in inst.get('locations', []))
    ]
    print(f"   āœ… Found {len(gb_institutions)} GB institutions\n")

    # Query Wikidata
    wikidata_institutions = query_wikidata_gb_institutions()
    time.sleep(1)  # Be nice to Wikidata

    # Fuzzy matching
    print(f"šŸ”— Matching institutions (threshold=0.85)...\n")
    matches = fuzzy_match_institutions(gb_institutions, wikidata_institutions,
                                       threshold=0.85)
    print(f"\nšŸ“Š Found {len(matches)} matches\n")

    # Enrich institutions
    if matches:
        print("✨ Enriching institutions with Wikidata metadata...\n")
        for match in matches:
            enrich_with_wikidata(
                match['institution'],
                match['wikidata'],
                match['score']
            )
            print(f"   āœ… Enriched: {match['institution']['name']}")

    # Save results (directory derived from output_path so they can't drift apart)
    output_path = 'data/instances/great_britain/gb_institutions_enriched_batch1.yaml'
    print(f"\nšŸ’¾ Saving Batch 1 results to {output_path}...")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(gb_institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)
    print("   āœ… Saved\n")

    # Summary (guard against an empty GB subset to avoid ZeroDivisionError)
    total = len(gb_institutions)
    enriched_count = sum(
        1 for inst in gb_institutions
        if any(i.get('identifier_scheme') == 'Wikidata'
               for i in inst.get('identifiers', []))
    )
    coverage = (enriched_count / total * 100) if total else 0.0

    print("=" * 80)
    print("šŸ“Š BATCH 1 RESULTS")
    print("=" * 80)
    print(f"Total institutions: {total}")
    print(f"Wikidata enriched: {enriched_count} ({coverage:.1f}%)")
    print(f"Still need enrichment: {total - enriched_count}")

    if total and enriched_count >= total * 0.5:
        print("\nāœ… SUCCESS: Achieved 50%+ Wikidata coverage goal!")
    else:
        print(f"\nāš ļø  Below 50% goal. Batch 2 (alternative names) recommended.")

    print("\n")


if __name__ == '__main__':
    main()