glam/scripts/enrich_belgium_eu.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

361 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Belgium EU institutions with Wikidata identifiers.
Belgium dataset consists of 7 EU institutions in Brussels (0% Wikidata coverage).
All are well-documented EU bodies with likely Wikidata entries.
Strategy:
1. Load Belgium institutions from master dataset
2. Query Wikidata for EU institutions (P31=Q43229, P17=Q29999)
3. Fuzzy match names
4. Apply high-confidence matches (>0.85)
"""
import sys
from pathlib import Path
from typing import Any, Optional
from datetime import datetime, timezone
import time
import yaml
from difflib import SequenceMatcher
import re
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
def normalize_name(name: str) -> str:
    """Reduce an institution name to a canonical form for fuzzy matching.

    Lower-cases the name, drops a leading "european"/"eu" word and a
    trailing institution-type word (library, archive(s), committee,
    commission, parliament, council), replaces punctuation with spaces
    and collapses runs of whitespace.
    """
    normalized = name.lower()
    # Strip EU boilerplate from both ends of the name.
    normalized = re.sub(r'^(european|eu)\s+', '', normalized)
    normalized = re.sub(
        r'\s+(library|archive|archives|committee|commission|parliament|council)$',
        '',
        normalized,
    )
    # Punctuation becomes whitespace, then whitespace is collapsed.
    normalized = re.sub(r'[^\w\s]', ' ', normalized)
    return ' '.join(normalized.split())
def similarity_score(name1: str, name2: str) -> float:
    """Return the fuzzy similarity (0.0-1.0) of two institution names.

    Both names are run through normalize_name() before comparison, so
    EU prefixes, type suffixes and punctuation do not affect the score.
    """
    return SequenceMatcher(
        None, normalize_name(name1), normalize_name(name2)
    ).ratio()
def query_wikidata_eu_institutions(sparql: SPARQLWrapper) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for EU institutions and their heritage units.

    Queries for items that are an instance (P31) of any of:
    - EU institutions (Q43229)
    - EU agencies (Q1338914)
    - libraries/archives (Q7075 / Q166118, per the inline query comment)
    and that are located in Belgium (P17=Q31), the European Union
    (P17=Q458) or Brussels (P131=Q239).

    Args:
        sparql: SPARQLWrapper already pointed at the Wikidata endpoint
            (configured by the caller, see main()).

    Returns:
        Mapping of Wikidata QID -> record with keys 'qid', 'name',
        'description', 'isil', 'viaf', 'website', 'inception', 'type',
        'coords'. 'isil'/'viaf'/'website'/'coords' are None when absent;
        'description'/'inception'/'type' default to the empty string.
    """
    query = """
 SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel
 WHERE {
 VALUES ?type { wd:Q43229 wd:Q1338914 wd:Q7075 wd:Q166118 }
 ?item wdt:P31 ?type . # instance of EU institution/library/archive
 # Must be located in Belgium or EU
 { ?item wdt:P17 wd:Q31 . } # country: Belgium
 UNION
 { ?item wdt:P17 wd:Q458 . } # country: European Union
 UNION
 { ?item wdt:P131 wd:Q239 . } # located in: Brussels
 OPTIONAL { ?item wdt:P791 ?isil . }
 OPTIONAL { ?item wdt:P214 ?viaf . }
 OPTIONAL { ?item wdt:P625 ?coords . }
 OPTIONAL { ?item wdt:P856 ?website . }
 OPTIONAL { ?item wdt:P571 ?inception . }
 SERVICE wikibase:label { bd:serviceParam wikibase:language "en,fr,nl,de". }
 }
 ORDER BY ?itemLabel
 """
    sparql.setQuery(query)
    sparql.setReturnFormat(SPARQL_JSON)
    print("🔍 Querying Wikidata for EU institutions...")
    # Network call: convert() parses the endpoint's JSON response.
    results = sparql.query().convert()
    # Parse results
    institutions = {}
    for result in results['results']['bindings']:
        # Entity URI looks like http://www.wikidata.org/entity/Q123 —
        # the QID is the last path segment.
        qid = result['item']['value'].split('/')[-1]
        # NOTE(review): an item matching several ?type values (or having
        # several websites/ISILs) yields multiple bindings; later rows
        # silently overwrite earlier ones for the same QID — confirm this
        # is acceptable.
        institutions[qid] = {
            'qid': qid,
            'name': result['itemLabel']['value'],
            'description': result.get('itemDescription', {}).get('value', ''),
            'isil': result.get('isil', {}).get('value'),
            'viaf': result.get('viaf', {}).get('value'),
            'website': result.get('website', {}).get('value'),
            # P571 arrives as an xsd:dateTime string; keep only the date
            # part (empty string when the property is absent).
            'inception': result.get('inception', {}).get('value', '').split('T')[0],
            'type': result.get('typeLabel', {}).get('value', ''),
            'coords': result.get('coords', {}).get('value')
        }
    print(f"✅ Found {len(institutions)} EU institutions in Wikidata")
    return institutions
def match_institution(
    inst: dict[str, Any],
    wikidata_institutions: dict[str, dict[str, Any]],
    threshold: float = 0.85
) -> Optional[dict[str, Any]]:
    """
    Match a local institution to Wikidata using fuzzy name matching.

    Scores the local record's name against every Wikidata candidate and
    returns the highest-scoring candidate provided its score reaches
    ``threshold``. Returns None when the record has no name or no
    candidate scores high enough.
    """
    local_name = inst.get('name', '')
    if not local_name:
        return None
    top_score = 0.0
    top_candidate = None
    for candidate in wikidata_institutions.values():
        current = similarity_score(local_name, candidate['name'])
        # Strict '>' keeps the first of equally-scored candidates.
        if current > top_score:
            top_score = current
            top_candidate = candidate
    return top_candidate if top_score >= threshold else None
def enrich_institution(
    inst: dict[str, Any],
    wd_match: dict[str, Any]
) -> dict[str, Any]:
    """Add Wikidata enrichment to institution record.

    Mutates ``inst`` in place (and returns it): appends Wikidata, VIAF,
    ISIL and Website identifiers that are not already present, fills in
    coordinates on the first location when it has none, copies the
    Wikidata description when the record lacks one, and appends an
    enrichment-provenance entry.

    Args:
        inst: Local institution record (keys: 'identifiers',
            'locations', 'description', 'provenance', ...).
        wd_match: Wikidata candidate as returned by
            query_wikidata_eu_institutions().

    Returns:
        The same ``inst`` dict, enriched.
    """
    identifiers = inst.get('identifiers', [])
    added_schemes: list[str] = []  # schemes actually appended, for provenance

    def _add_identifier(scheme: str, value: Any, url: str) -> None:
        """Append an identifier unless that scheme is already present."""
        if any(ident.get('identifier_scheme') == scheme for ident in identifiers):
            return
        identifiers.append({
            'identifier_scheme': scheme,
            'identifier_value': value,
            'identifier_url': url,
        })
        added_schemes.append(scheme)

    qid = wd_match['qid']
    _add_identifier('Wikidata', qid, f"https://www.wikidata.org/wiki/{qid}")
    if wd_match.get('viaf'):
        _add_identifier('VIAF', wd_match['viaf'],
                        f"https://viaf.org/viaf/{wd_match['viaf']}")
    if wd_match.get('isil'):
        # NOTE(review): isil.org is not a known ISIL resolver — confirm
        # the intended URL template before relying on these links.
        _add_identifier('ISIL', wd_match['isil'],
                        f"https://isil.org/{wd_match['isil']}")
    if wd_match.get('website'):
        _add_identifier('Website', wd_match['website'], wd_match['website'])
    inst['identifiers'] = identifiers

    # Coordinates arrive as WKT "Point(lon lat)"; only fill the first
    # location, and only when it has no latitude yet.
    if wd_match.get('coords'):
        coords = wd_match['coords'].replace('Point(', '').replace(')', '').split()
        lon, lat = float(coords[0]), float(coords[1])
        locations = inst.get('locations', [])
        if locations and not locations[0].get('latitude'):
            locations[0]['latitude'] = lat
            locations[0]['longitude'] = lon

    # Enhance description with Wikidata description (never overwrite).
    if wd_match.get('description') and not inst.get('description'):
        inst['description'] = wd_match['description']

    # Add enrichment provenance.
    provenance = inst.get('provenance', {})
    provenance.setdefault('enrichment_history', []).append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Wikidata SPARQL query + fuzzy name matching',
        # BUG FIX: record only the schemes that were actually appended
        # above — previously this always claimed 'Wikidata' (plus VIAF/
        # ISIL when present on the match) even when the identifier
        # already existed, and never recorded 'Website' additions.
        'identifiers_added': added_schemes,
        'verified': True
    })
    inst['provenance'] = provenance
    return inst
def main():
    """Main enrichment workflow.

    Steps:
      1. Load the master dataset YAML.
      2. Select institutions with a 'BE' location and no Wikidata
         identifier.
      3. Query Wikidata once for candidate EU institutions.
      4. Fuzzy-match each local record (threshold 0.85) and enrich the
         matches.
      5. Write enriched records and unmatched leftovers to separate YAML
         files under data/instances/belgium/.
    """
    # Paths — resolved relative to the project root (one level above the
    # directory containing this script).
    project_root = Path(__file__).parent.parent
    master_file = project_root / 'data' / 'instances' / 'all' / 'globalglam-20251111.yaml'
    output_dir = project_root / 'data' / 'instances' / 'belgium'
    output_dir.mkdir(parents=True, exist_ok=True)
    print("🇧🇪 Belgium EU Institutions Enrichment")
    print("=" * 70)
    # Load master dataset — accepts either a bare list of institutions
    # or a mapping with an 'institutions' key.
    print(f"📖 Loading master dataset from {master_file.name}...")
    with open(master_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    institutions = data if isinstance(data, list) else data.get('institutions', [])
    print(f"✅ Loaded {len(institutions)} total institutions")
    # Filter Belgium institutions without Wikidata
    be_institutions = [
        i for i in institutions
        if i.get('locations')
        and any(loc.get('country') == 'BE' for loc in i.get('locations', []))
        and not any(
            id.get('identifier_scheme') == 'Wikidata'
            for id in i.get('identifiers', [])
        )
    ]
    print(f"🎯 Found {len(be_institutions)} Belgium institutions without Wikidata")
    if not be_institutions:
        print("✅ All Belgium institutions already have Wikidata IDs!")
        return
    # Initialize SPARQL endpoint.
    # NOTE(review): the User-Agent still contains a placeholder repo URL
    # ("your-repo") — replace before running against the live endpoint.
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Data-Extraction/0.2.1 (https://github.com/your-repo)")
    # Query Wikidata
    wd_institutions = query_wikidata_eu_institutions(sparql)
    time.sleep(1)  # Rate limiting
    # Match and enrich
    enriched = []
    unmatched = []
    print("\n🔗 Matching institutions...")
    print("-" * 70)
    for inst in be_institutions:
        name = inst.get('name', 'UNKNOWN')
        # Try fuzzy matching
        match = match_institution(inst, wd_institutions, threshold=0.85)
        if match:
            print(f"✅ MATCHED: {name}")
            print(f" → Wikidata: {match['name']} ({match['qid']})")
            # Re-computes the score purely for display.
            print(f" → Confidence: {similarity_score(name, match['name']):.2%}")
            enriched_inst = enrich_institution(inst, match)
            enriched.append(enriched_inst)
        else:
            print(f"❌ NO MATCH: {name}")
            unmatched.append(inst)
    # Summary — division is safe: the empty case returned early above.
    print("\n" + "=" * 70)
    print(f"📊 Enrichment Summary")
    print("=" * 70)
    print(f"✅ Matched: {len(enriched)}/{len(be_institutions)} ({len(enriched)/len(be_institutions)*100:.1f}%)")
    print(f"❌ Unmatched: {len(unmatched)}")
    # Save enriched dataset
    if enriched:
        output_file = output_dir / 'belgium_institutions_enriched.yaml'
        output_data = {
            '_metadata': {
                'generated': datetime.now(timezone.utc).isoformat(),
                'project': 'GLAM Data Extraction',
                'schema_version': 'v0.2.1',
                'country': 'BE',
                'description': 'Belgium EU institutions enriched with Wikidata',
                'enrichment_method': 'Wikidata SPARQL + fuzzy matching',
                'total_institutions': len(enriched),
                # Count of enriched records carrying a Wikidata identifier.
                'wikidata_coverage': sum(1 for i in enriched if any(
                    id.get('identifier_scheme') == 'Wikidata'
                    for id in i.get('identifiers', [])
                ))
            },
            'institutions': enriched
        }
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(output_data, f, allow_unicode=True, sort_keys=False, width=120)
        print(f"\n💾 Saved {len(enriched)} enriched institutions to:")
        print(f" {output_file}")
    # Save unmatched for manual review
    if unmatched:
        unmatched_file = output_dir / 'belgium_unmatched.yaml'
        with open(unmatched_file, 'w', encoding='utf-8') as f:
            yaml.dump(unmatched, f, allow_unicode=True, sort_keys=False, width=120)
        print(f"\n⚠️ Saved {len(unmatched)} unmatched institutions to:")
        print(f" {unmatched_file}")
        print(f" → Manual review recommended")


if __name__ == '__main__':
    main()