361 lines
12 KiB
Python
Executable file
361 lines
12 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Belgium EU institutions with Wikidata identifiers.
|
|
|
|
Belgium dataset consists of 7 EU institutions in Brussels (0% Wikidata coverage).
|
|
All are well-documented EU bodies with likely Wikidata entries.
|
|
|
|
Strategy:
|
|
1. Load Belgium institutions from master dataset
|
|
2. Query Wikidata for EU institutions (P31=Q43229, P17=Q29999)
|
|
3. Fuzzy match names
|
|
4. Apply high-confidence matches (>0.85)
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
from datetime import datetime, timezone
|
|
import time
|
|
import yaml
|
|
from difflib import SequenceMatcher
|
|
import re
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy comparison.

    Lowercases, strips common EU prefixes and trailing institution-type
    words, replaces punctuation with spaces, and collapses whitespace.
    """
    lowered = name.lower()

    # Drop a leading "european"/"eu" and a trailing institution-type word
    # so e.g. "European Parliament Library" and "Parliament" compare closely.
    lowered = re.sub(r'^(european|eu)\s+', '', lowered)
    lowered = re.sub(r'\s+(library|archive|archives|committee|commission|parliament|council)$', '', lowered)

    # Punctuation becomes whitespace, then runs of whitespace collapse.
    lowered = re.sub(r'[^\w\s]', ' ', lowered)
    return ' '.join(lowered.split())
|
|
|
|
|
|
def similarity_score(name1: str, name2: str) -> float:
    """Return a 0-1 fuzzy similarity ratio between two normalized names."""
    return SequenceMatcher(
        None, normalize_name(name1), normalize_name(name2)
    ).ratio()
|
|
|
|
|
|
def query_wikidata_eu_institutions(sparql: SPARQLWrapper) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for EU institutions and their heritage units.

    Queries for:
    - EU institutions (Q43229)
    - EU agencies (Q1338914)
    - Located in Belgium (Q31) or EU (Q458)

    Returns a mapping of QID -> institution record with optional ISIL,
    VIAF, coordinates, website and inception date fields.
    """

    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel
    WHERE {
      VALUES ?type { wd:Q43229 wd:Q1338914 wd:Q7075 wd:Q166118 }

      ?item wdt:P31 ?type .  # instance of EU institution/library/archive

      # Must be located in Belgium or EU
      { ?item wdt:P17 wd:Q31 . }   # country: Belgium
      UNION
      { ?item wdt:P17 wd:Q458 . }  # country: European Union
      UNION
      { ?item wdt:P131 wd:Q239 . } # located in: Brussels

      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "en,fr,nl,de". }
    }
    ORDER BY ?itemLabel
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(SPARQL_JSON)

    print("🔍 Querying Wikidata for EU institutions...")
    response = sparql.query().convert()

    def opt(binding: dict[str, Any], key: str, default: Any = None) -> Any:
        # Pull an OPTIONAL binding's value, falling back when absent.
        return binding.get(key, {}).get('value', default)

    institutions: dict[str, dict[str, Any]] = {}
    for binding in response['results']['bindings']:
        # The entity URI ends with the QID, e.g. .../entity/Q8889.
        qid = binding['item']['value'].rsplit('/', 1)[-1]
        institutions[qid] = {
            'qid': qid,
            'name': binding['itemLabel']['value'],
            'description': opt(binding, 'itemDescription', ''),
            'isil': opt(binding, 'isil'),
            'viaf': opt(binding, 'viaf'),
            'website': opt(binding, 'website'),
            # Keep only the date part of the xsd:dateTime literal.
            'inception': opt(binding, 'inception', '').split('T')[0],
            'type': opt(binding, 'typeLabel', ''),
            'coords': opt(binding, 'coords'),
        }

    print(f"✅ Found {len(institutions)} EU institutions in Wikidata")
    return institutions
|
|
|
|
|
|
def match_institution(
    inst: dict[str, Any],
    wikidata_institutions: dict[str, dict[str, Any]],
    threshold: float = 0.85
) -> Optional[dict[str, Any]]:
    """
    Match a local institution to Wikidata using fuzzy name matching.

    Returns best match if score > threshold, else None.
    """
    local_name = inst.get('name', '')
    if not local_name:
        # Nothing to match against without a name.
        return None

    top_score = 0.0
    top_match: Optional[dict[str, Any]] = None

    # Track the single highest-scoring Wikidata candidate.
    for candidate in wikidata_institutions.values():
        score = similarity_score(local_name, candidate['name'])
        if score > top_score:
            top_score, top_match = score, candidate

    return top_match if top_score >= threshold else None
|
|
|
|
|
|
def enrich_institution(
    inst: dict[str, Any],
    wd_match: dict[str, Any]
) -> dict[str, Any]:
    """Merge Wikidata-derived data into an institution record (in place).

    Adds Wikidata/VIAF/ISIL/Website identifiers (skipping any scheme the
    record already carries), fills in missing coordinates and description,
    and appends an entry to the record's enrichment provenance history.

    Args:
        inst: Institution record from the master dataset; mutated in place.
        wd_match: Matched Wikidata record as produced by
            query_wikidata_eu_institutions().

    Returns:
        The same (mutated) institution record.
    """
    identifiers = inst.get('identifiers', [])
    added_schemes: list[str] = []  # schemes actually added, for provenance

    def _has_scheme(scheme: str) -> bool:
        # True when the record already has an identifier of this scheme.
        return any(
            entry.get('identifier_scheme') == scheme for entry in identifiers
        )

    def _add(scheme: str, value: Any, url: Optional[str]) -> None:
        # Append an identifier unless the scheme is already present.
        if _has_scheme(scheme):
            return
        entry: dict[str, Any] = {
            'identifier_scheme': scheme,
            'identifier_value': value,
        }
        if url is not None:
            entry['identifier_url'] = url
        identifiers.append(entry)
        added_schemes.append(scheme)

    _add('Wikidata', wd_match['qid'],
         f"https://www.wikidata.org/wiki/{wd_match['qid']}")
    if wd_match.get('viaf'):
        _add('VIAF', wd_match['viaf'],
             f"https://viaf.org/viaf/{wd_match['viaf']}")
    if wd_match.get('isil'):
        # ISIL codes have no universal resolver URL, so none is recorded.
        _add('ISIL', wd_match['isil'], None)
    if wd_match.get('website'):
        _add('Website', wd_match['website'], wd_match['website'])

    inst['identifiers'] = identifiers

    # Parse the WKT "Point(lon lat)" literal and fill in coordinates, but
    # only when a location exists and has no latitude yet.
    if wd_match.get('coords'):
        coords = wd_match['coords'].replace('Point(', '').replace(')', '').split()
        lon, lat = float(coords[0]), float(coords[1])

        locations = inst.get('locations', [])
        if locations and not locations[0].get('latitude'):
            locations[0]['latitude'] = lat
            locations[0]['longitude'] = lon

    # Prefer any existing description; only borrow Wikidata's when missing.
    if wd_match.get('description') and not inst.get('description'):
        inst['description'] = wd_match['description']

    # Record provenance. Unlike the previous version, identifiers_added now
    # reflects exactly which schemes were appended in this run (previously
    # 'Wikidata' was always claimed and 'Website' never listed).
    provenance = inst.get('provenance', {})
    provenance.setdefault('enrichment_history', []).append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Wikidata SPARQL query + fuzzy name matching',
        'identifiers_added': added_schemes,
        'verified': True
    })
    inst['provenance'] = provenance

    return inst
|
|
|
|
|
|
def main():
    """Main enrichment workflow.

    Loads the master dataset, selects Belgian institutions that lack a
    Wikidata identifier, fuzzy-matches them against EU institutions queried
    from Wikidata, then writes enriched records and unmatched records to
    separate YAML files under data/instances/belgium/.
    """

    # Paths — this script lives in a subdirectory of the project root.
    project_root = Path(__file__).parent.parent
    master_file = project_root / 'data' / 'instances' / 'all' / 'globalglam-20251111.yaml'
    output_dir = project_root / 'data' / 'instances' / 'belgium'
    output_dir.mkdir(parents=True, exist_ok=True)

    print("🇧🇪 Belgium EU Institutions Enrichment")
    print("=" * 70)

    # Load master dataset
    print(f"📖 Loading master dataset from {master_file.name}...")
    with open(master_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # The master file may be a bare list of records or a mapping with an
    # 'institutions' key — accept both shapes.
    institutions = data if isinstance(data, list) else data.get('institutions', [])
    print(f"✅ Loaded {len(institutions)} total institutions")

    # Filter Belgium institutions without Wikidata: at least one location
    # in BE and no existing identifier with scheme 'Wikidata'.
    be_institutions = [
        i for i in institutions
        if i.get('locations')
        and any(loc.get('country') == 'BE' for loc in i.get('locations', []))
        and not any(
            id.get('identifier_scheme') == 'Wikidata'
            for id in i.get('identifiers', [])
        )
    ]

    print(f"🎯 Found {len(be_institutions)} Belgium institutions without Wikidata")

    if not be_institutions:
        # Nothing to do — early exit also guards the percentage division below.
        print("✅ All Belgium institutions already have Wikidata IDs!")
        return

    # Initialize SPARQL endpoint (custom User-Agent per Wikimedia policy).
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Data-Extraction/0.2.1 (https://github.com/your-repo)")

    # Query Wikidata
    wd_institutions = query_wikidata_eu_institutions(sparql)
    time.sleep(1)  # Rate limiting

    # Match and enrich
    enriched = []
    unmatched = []

    print("\n🔗 Matching institutions...")
    print("-" * 70)

    for inst in be_institutions:
        name = inst.get('name', 'UNKNOWN')

        # Try fuzzy matching against the Wikidata candidates.
        match = match_institution(inst, wd_institutions, threshold=0.85)

        if match:
            print(f"✅ MATCHED: {name}")
            print(f"   → Wikidata: {match['name']} ({match['qid']})")
            # NOTE: recomputes the similarity purely for display.
            print(f"   → Confidence: {similarity_score(name, match['name']):.2%}")

            enriched_inst = enrich_institution(inst, match)
            enriched.append(enriched_inst)
        else:
            print(f"❌ NO MATCH: {name}")
            unmatched.append(inst)

    # Summary
    print("\n" + "=" * 70)
    print(f"📊 Enrichment Summary")
    print("=" * 70)
    print(f"✅ Matched: {len(enriched)}/{len(be_institutions)} ({len(enriched)/len(be_institutions)*100:.1f}%)")
    print(f"❌ Unmatched: {len(unmatched)}")

    # Save enriched dataset (only written when at least one match succeeded).
    if enriched:
        output_file = output_dir / 'belgium_institutions_enriched.yaml'

        output_data = {
            '_metadata': {
                'generated': datetime.now(timezone.utc).isoformat(),
                'project': 'GLAM Data Extraction',
                'schema_version': 'v0.2.1',
                'country': 'BE',
                'description': 'Belgium EU institutions enriched with Wikidata',
                'enrichment_method': 'Wikidata SPARQL + fuzzy matching',
                'total_institutions': len(enriched),
                # Count of enriched records that now carry a Wikidata identifier.
                'wikidata_coverage': sum(1 for i in enriched if any(
                    id.get('identifier_scheme') == 'Wikidata'
                    for id in i.get('identifiers', [])
                ))
            },
            'institutions': enriched
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(output_data, f, allow_unicode=True, sort_keys=False, width=120)

        print(f"\n💾 Saved {len(enriched)} enriched institutions to:")
        print(f"   {output_file}")

    # Save unmatched for manual review
    if unmatched:
        unmatched_file = output_dir / 'belgium_unmatched.yaml'

        with open(unmatched_file, 'w', encoding='utf-8') as f:
            yaml.dump(unmatched, f, allow_unicode=True, sort_keys=False, width=120)

        print(f"\n⚠️ Saved {len(unmatched)} unmatched institutions to:")
        print(f"   {unmatched_file}")
        print(f"   → Manual review recommended")


if __name__ == '__main__':
    main()
|