#!/usr/bin/env python3
"""
Enrich Belgium EU institutions with Wikidata identifiers.

Belgium dataset consists of 7 EU institutions in Brussels (0% Wikidata coverage).
All are well-documented EU bodies with likely Wikidata entries.

Strategy:
1. Load Belgium institutions from master dataset
2. Query Wikidata for candidate items (P31 in Q43229/Q1338914/Q7075/Q166118,
   restricted to P17=Q31, P17=Q458 or P131=Q239)
3. Fuzzy match names
4. Apply high-confidence matches (>=0.85)
"""

import sys
from pathlib import Path
from typing import Any, Optional
from datetime import datetime, timezone
import time
import yaml
from difflib import SequenceMatcher
import re

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON  # type: ignore


def normalize_name(name: str) -> str:
    """Normalize institution name for fuzzy matching.

    Lowercases and trims the name, drops a leading "european"/"eu" word,
    drops one trailing generic word (library, archive(s), committee,
    commission, parliament, council), replaces punctuation with spaces and
    collapses runs of whitespace.

    Args:
        name: Raw institution name.

    Returns:
        The normalized name; may be an empty string.
    """
    # Lowercase; trim so the anchored patterns below see the real first/last word
    name = name.lower().strip()

    # Remove common EU prefixes
    name = re.sub(r'^(european|eu)\s+', '', name)
    # Remove one trailing generic institution-type word (only when it is
    # preceded by whitespace, i.e. it is not the whole remaining name)
    name = re.sub(r'\s+(library|archive|archives|committee|commission|parliament|council)$', '', name)

    # Remove punctuation
    name = re.sub(r'[^\w\s]', ' ', name)

    # Normalize whitespace
    name = ' '.join(name.split())

    return name


def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names (0-1).

    Both names are passed through ``normalize_name`` and compared with
    ``difflib.SequenceMatcher``.

    Returns:
        The SequenceMatcher ratio, or 0.0 when either normalized name is
        empty.
    """
    norm1 = normalize_name(name1)
    norm2 = normalize_name(name2)
    # SequenceMatcher scores two empty strings as 1.0, which would turn
    # names that normalize to nothing into perfect (false) matches.
    if not norm1 or not norm2:
        return 0.0
    return SequenceMatcher(None, norm1, norm2).ratio()


def query_wikidata_eu_institutions(sparql: SPARQLWrapper) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for candidate institutions and their heritage units.

    Selects items whose P31 (instance of) is one of Q43229, Q1338914, Q7075
    or Q166118 and that have P17 = Q31 (Belgium), P17 = Q458 (European
    Union) or P131 = Q239 (Brussels), together with optional ISIL (P791),
    VIAF (P214), coordinates (P625), website (P856) and inception (P571).
    NOTE(review): the original notes describe the type Q-ids as "EU
    institution / EU agency / library / archive" - confirm against Wikidata.

    Args:
        sparql: A SPARQLWrapper already pointed at the query endpoint.

    Returns:
        Mapping of QID -> record with keys 'qid', 'name', 'description',
        'isil', 'viaf', 'website', 'inception', 'type' and 'coords'. When
        the result set holds several rows for one item, the last row wins.
    """
    # Line breaks matter inside the query: SPARQL '#' comments run to the
    # end of the line.
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel
    WHERE {
      VALUES ?type { wd:Q43229 wd:Q1338914 wd:Q7075 wd:Q166118 }

      ?item wdt:P31 ?type .  # instance of EU institution/library/archive

      # Must be located in Belgium or EU
      { ?item wdt:P17 wd:Q31 . }   # country: Belgium
      UNION
      { ?item wdt:P17 wd:Q458 . }  # country: European Union
      UNION
      { ?item wdt:P131 wd:Q239 . } # located in: Brussels

      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "en,fr,nl,de". }
    }
    ORDER BY ?itemLabel
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(SPARQL_JSON)

    print("🔍 Querying Wikidata for EU institutions...")
    results = sparql.query().convert()

    # Parse results
    institutions: dict[str, dict[str, Any]] = {}
    for result in results['results']['bindings']:
        # The entity URI ends in the QID, e.g. .../entity/Q8880
        qid = result['item']['value'].split('/')[-1]

        institutions[qid] = {
            'qid': qid,
            'name': result['itemLabel']['value'],
            'description': result.get('itemDescription', {}).get('value', ''),
            'isil': result.get('isil', {}).get('value'),
            'viaf': result.get('viaf', {}).get('value'),
            'website': result.get('website', {}).get('value'),
            # Keep only the date part of the ISO timestamp
            'inception': result.get('inception', {}).get('value', '').split('T')[0],
            'type': result.get('typeLabel', {}).get('value', ''),
            'coords': result.get('coords', {}).get('value')
        }

    print(f"✅ Found {len(institutions)} EU institutions in Wikidata")
    return institutions


def match_institution(
    inst: dict[str, Any],
    wikidata_institutions: dict[str, dict[str, Any]],
    threshold: float = 0.85
) -> Optional[dict[str, Any]]:
    """
    Match a local institution to Wikidata using fuzzy name matching.

    Args:
        inst: Local institution record; only its 'name' is used.
        wikidata_institutions: Candidates as returned by
            ``query_wikidata_eu_institutions``.
        threshold: Minimum similarity (inclusive) required for a match.

    Returns:
        The best-scoring candidate if its score >= threshold, else None.
        On ties the candidate encountered first wins.
    """
    inst_name = inst.get('name', '')
    if not inst_name:
        return None

    best_match = None
    best_score = 0.0

    for wd_inst in wikidata_institutions.values():
        # Calculate similarity
        score = similarity_score(inst_name, wd_inst['name'])

        if score > best_score:
            best_score = score
            best_match = wd_inst

    if best_score >= threshold:
        return best_match

    return None


def _has_identifier(identifiers: list[dict[str, Any]], scheme: str) -> bool:
    """Return True if *identifiers* already holds an entry for *scheme*."""
    return any(ident.get('identifier_scheme') == scheme for ident in identifiers)


def _parse_wkt_point(wkt: str) -> Optional[tuple[float, float]]:
    """Parse a WKT ``Point(lon lat)`` literal into ``(lon, lat)``, or None."""
    found = re.fullmatch(r'\s*Point\(\s*(\S+)\s+(\S+)\s*\)\s*', wkt, flags=re.IGNORECASE)
    if found is None:
        return None
    try:
        return float(found.group(1)), float(found.group(2))
    except ValueError:
        return None


def enrich_institution(
    inst: dict[str, Any],
    wd_match: dict[str, Any]
) -> dict[str, Any]:
    """Add Wikidata enrichment to institution record.

    Mutates *inst* in place and also returns it. Adds Wikidata, VIAF, ISIL
    and Website identifiers that are not already present, fills the first
    location's coordinates and the description when they are missing, and
    appends an entry to ``provenance['enrichment_history']``.

    Args:
        inst: Local institution record to enrich.
        wd_match: Matched record from ``query_wikidata_eu_institutions``.

    Returns:
        The same *inst* object, enriched.
    """
    # `or []` also covers an explicit null in the YAML source
    identifiers = inst.get('identifiers') or []
    added: list[str] = []  # schemes actually appended by this call

    # Add Wikidata identifier unless one already exists
    if not _has_identifier(identifiers, 'Wikidata'):
        identifiers.append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': wd_match['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{wd_match['qid']}"
        })
        added.append('Wikidata')

    # Add VIAF if available
    if wd_match.get('viaf') and not _has_identifier(identifiers, 'VIAF'):
        identifiers.append({
            'identifier_scheme': 'VIAF',
            'identifier_value': wd_match['viaf'],
            'identifier_url': f"https://viaf.org/viaf/{wd_match['viaf']}"
        })
        added.append('VIAF')

    # Add ISIL if available
    if wd_match.get('isil') and not _has_identifier(identifiers, 'ISIL'):
        identifiers.append({
            'identifier_scheme': 'ISIL',
            'identifier_value': wd_match['isil'],
            # ISIL codes don't have a universal URL
        })
        added.append('ISIL')

    # Add website if not present
    if wd_match.get('website') and not _has_identifier(identifiers, 'Website'):
        identifiers.append({
            'identifier_scheme': 'Website',
            'identifier_value': wd_match['website'],
            'identifier_url': wd_match['website']
        })
        added.append('Website')

    inst['identifiers'] = identifiers

    # Add coordinates if available (WKT order is longitude, then latitude)
    if wd_match.get('coords'):
        point = _parse_wkt_point(wd_match['coords'])
        locations = inst.get('locations') or []
        # Only fill a missing latitude; 0.0 is a real value, not "missing"
        if point is not None and locations and locations[0].get('latitude') in (None, ''):
            lon, lat = point
            locations[0]['latitude'] = lat
            locations[0]['longitude'] = lon

    # Enhance description with Wikidata description
    if wd_match.get('description') and not inst.get('description'):
        inst['description'] = wd_match['description']

    # Add enrichment provenance
    provenance = inst.get('provenance') or {}
    history = provenance.get('enrichment_history') or []
    history.append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Wikidata SPARQL query + fuzzy name matching',
        # Record what this run really added, not what Wikidata happened to offer
        'identifiers_added': added,
        # NOTE(review): set unconditionally even though the match is only a
        # fuzzy name match - confirm this is the intended meaning of "verified".
        'verified': True
    })
    provenance['enrichment_history'] = history
    inst['provenance'] = provenance

    return inst


def main():
    """Main enrichment workflow.

    Loads the master dataset, selects institutions located in Belgium that
    have no Wikidata identifier, matches them against Wikidata candidates
    and writes the enriched and the unmatched records to separate YAML
    files under ``data/instances/belgium``.
    """
    # Paths
    project_root = Path(__file__).parent.parent
    master_file = project_root / 'data' / 'instances' / 'all' / 'globalglam-20251111.yaml'
    output_dir = project_root / 'data' / 'instances' / 'belgium'
    output_dir.mkdir(parents=True, exist_ok=True)

    print("🇧🇪 Belgium EU Institutions Enrichment")
    print("=" * 70)

    # Load master dataset
    print(f"📖 Loading master dataset from {master_file.name}...")
    with open(master_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # The dataset is either a bare list or a mapping with an 'institutions'
    # key; an empty document loads as None.
    if isinstance(data, list):
        institutions = data
    elif isinstance(data, dict):
        institutions = data.get('institutions') or []
    else:
        institutions = []
    print(f"✅ Loaded {len(institutions)} total institutions")

    # Filter Belgium institutions without Wikidata
    be_institutions = [
        inst for inst in institutions
        if any(loc.get('country') == 'BE' for loc in inst.get('locations') or [])
        and not _has_identifier(inst.get('identifiers') or [], 'Wikidata')
    ]

    print(f"🎯 Found {len(be_institutions)} Belgium institutions without Wikidata")

    if not be_institutions:
        print("✅ All Belgium institutions already have Wikidata IDs!")
        return

    # Initialize SPARQL endpoint
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Data-Extraction/0.2.1 (https://github.com/your-repo)")

    # Query Wikidata
    wd_institutions = query_wikidata_eu_institutions(sparql)
    time.sleep(1)  # Rate limiting

    # Match and enrich
    enriched = []
    unmatched = []

    print("\n🔗 Matching institutions...")
    print("-" * 70)

    for inst in be_institutions:
        name = inst.get('name', 'UNKNOWN')

        # Try fuzzy matching
        match = match_institution(inst, wd_institutions, threshold=0.85)

        if match:
            print(f"✅ MATCHED: {name}")
            print(f" → Wikidata: {match['name']} ({match['qid']})")
            print(f" → Confidence: {similarity_score(name, match['name']):.2%}")

            enriched_inst = enrich_institution(inst, match)
            enriched.append(enriched_inst)
        else:
            print(f"❌ NO MATCH: {name}")
            unmatched.append(inst)

    # Summary (be_institutions is non-empty here, so the division is safe)
    print("\n" + "=" * 70)
    print("📊 Enrichment Summary")
    print("=" * 70)
    print(f"✅ Matched: {len(enriched)}/{len(be_institutions)} ({len(enriched)/len(be_institutions)*100:.1f}%)")
    print(f"❌ Unmatched: {len(unmatched)}")

    # Save enriched dataset
    if enriched:
        output_file = output_dir / 'belgium_institutions_enriched.yaml'

        output_data = {
            '_metadata': {
                'generated': datetime.now(timezone.utc).isoformat(),
                'project': 'GLAM Data Extraction',
                'schema_version': 'v0.2.1',
                'country': 'BE',
                'description': 'Belgium EU institutions enriched with Wikidata',
                'enrichment_method': 'Wikidata SPARQL + fuzzy matching',
                'total_institutions': len(enriched),
                'wikidata_coverage': sum(
                    1 for inst in enriched
                    if _has_identifier(inst.get('identifiers') or [], 'Wikidata')
                )
            },
            'institutions': enriched
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(output_data, f, allow_unicode=True, sort_keys=False, width=120)

        print(f"\n💾 Saved {len(enriched)} enriched institutions to:")
        print(f" {output_file}")

    # Save unmatched for manual review
    if unmatched:
        unmatched_file = output_dir / 'belgium_unmatched.yaml'
        with open(unmatched_file, 'w', encoding='utf-8') as f:
            yaml.dump(unmatched, f, allow_unicode=True, sort_keys=False, width=120)

        print(f"\n⚠️ Saved {len(unmatched)} unmatched institutions to:")
        print(f" {unmatched_file}")
        print(f" → Manual review recommended")


if __name__ == '__main__':
    main()