glam/scripts/enrich_low_coverage_countries_fuzzy.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

472 lines
17 KiB
Python

#!/usr/bin/env python3
"""
Enrich low-coverage countries using fuzzy name matching with Wikidata.
Target countries with <30% Wikidata coverage:
- Brazil (BR): 1.0%
- Belgium (BE): 0.0%
- Italy (IT): 0.0%
- Thailand (TH): <30%
- Norway (NO): <30%
- Vietnam (VN): <30%
Strategy:
1. Query Wikidata for museums/libraries/archives in target country
2. Fuzzy match institution names (normalized, threshold 0.85)
3. Verify type compatibility (don't match museum → archive)
4. Enrich with Wikidata IDs, VIAF, founding dates, websites
"""
import sys
from pathlib import Path
from typing import Any, Optional
from datetime import datetime, timezone
import time
import yaml
from difflib import SequenceMatcher
import re
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
def normalize_name(name: str) -> str:
    """Reduce an institution name to a canonical form for fuzzy comparison.

    Lowercases, strips leading/trailing GLAM-type words (multilingual),
    drops legal/organizational markers and punctuation, then collapses
    whitespace runs into single spaces.
    """
    cleaned = name.lower()
    # Strip a leading institution-type word ("Museum ...", "Biblioteca ...").
    cleaned = re.sub(r'^(stichting|gemeentearchief|regionaal archief|museum|museu|museo|biblioteca|bibliotheek|library|archive|archief|archivo)[\s\-]+', '', cleaned)
    # Strip the same kind of word when it trails the name ("... Museum").
    cleaned = re.sub(r'[\s\-]+(archief|museum|museo|museu|bibliotheek|biblioteca|library|archive|archivo)$', '', cleaned)
    # Drop legal-entity markers (SA, NV, vzw, Ltd, GmbH, ...).
    cleaned = re.sub(r'\b(s\.a\.|sa|nv|bv|vzw|asbl|inc|ltd|gmbh)\b', '', cleaned)
    # Replace remaining punctuation with spaces, then normalize whitespace.
    cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
    return ' '.join(cleaned.split())
def similarity_score(name1: str, name2: str) -> float:
    """Return how alike two institution names are, in [0, 1], after normalization."""
    left, right = normalize_name(name1), normalize_name(name2)
    return SequenceMatcher(None, left, right).ratio()
def institution_type_compatible(inst_type: str, inst_name: str, wd_name: str, wd_desc: str) -> bool:
    """
    Check whether a local institution and a Wikidata candidate have compatible types.

    Prevents cross-type matches such as museum -> archive or library -> museum:
    whenever the local record's name/type text signals a museum, archive, or
    library, the Wikidata name/description text must signal the same kind.
    """
    # Multilingual keyword lists per GLAM kind.
    kind_keywords = {
        'museum': ['museum', 'museo', 'museu', 'muzeum'],
        'archive': ['archief', 'archive', 'archivo', 'archivio'],
        'library': ['bibliotheek', 'biblioteca', 'library', 'bibliothèque', 'bibliothek'],
    }
    local_text = (inst_name + ' ' + inst_type).lower()
    wd_text = (wd_name + ' ' + wd_desc).lower()

    def signals(text: str, kind: str) -> bool:
        # True when any keyword for this kind appears in the text.
        return any(kw in text for kw in kind_keywords[kind])

    # A kind asserted locally must also appear on the Wikidata side.
    for kind in ('museum', 'archive', 'library'):
        if signals(local_text, kind) and not signals(wd_text, kind):
            return False
    return True
def query_country_institutions(sparql: SPARQLWrapper, country_code: str) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for GLAM institutions in a specific country.

    Args:
        sparql: Configured SPARQLWrapper pointed at the Wikidata endpoint
            (JSON return format, POST method — set up by the caller).
        country_code: ISO 3166-1 alpha-2 code; must be in the mapping below.

    Returns:
        Dict keyed by Wikidata QID. Each value carries name, description,
        type label, an "identifiers" dict (ISIL/VIAF/Website when present),
        plus optional founding_date and latitude/longitude. Returns an empty
        dict for an unmapped country code or on any query failure.
    """
    # Map ISO 3166-1 alpha-2 to Wikidata QIDs
    country_qids = {
        "BR": "Q155",  # Brazil
        "BE": "Q31",   # Belgium
        "IT": "Q38",   # Italy
        "NO": "Q20",   # Norway
        "TH": "Q869",  # Thailand
        "VN": "Q881",  # Vietnam
        "MX": "Q96",   # Mexico
        "CL": "Q298",  # Chile
    }
    qid = country_qids.get(country_code)
    if not qid:
        print(f" ⚠️ No Wikidata QID mapping for country code: {country_code}")
        return {}
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?typeLabel ?isil ?viaf ?coords ?website ?inception
    WHERE {{
      # Institution is in target country
      ?item wdt:P17 wd:{qid} .
      # Institution is a GLAM type
      VALUES ?type {{
        wd:Q7075      # library
        wd:Q166118    # archive
        wd:Q33506     # museum
        wd:Q1007870   # art gallery
        wd:Q28564     # public library
        wd:Q11396180  # academic library
        wd:Q207694    # art museum
        wd:Q2772772   # history museum
      }}
      ?item wdt:P31 ?type .
      # Optional enrichment data
      OPTIONAL {{ ?item wdt:P791 ?isil . }}      # ISIL code
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}      # VIAF ID
      OPTIONAL {{ ?item wdt:P625 ?coords . }}    # Coordinates
      OPTIONAL {{ ?item wdt:P856 ?website . }}   # Official website
      OPTIONAL {{ ?item wdt:P571 ?inception . }} # Founding date
      # Get labels (adjust languages by region)
      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "en,pt,es,nl,fr,it,no,th,vi" .
      }}
    }}
    LIMIT 2000
    """
    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
        bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
        # Parse result rows into a dict keyed by QID; later rows for the same
        # item (cartesian product of OPTIONALs) simply overwrite earlier ones.
        results: dict[str, dict[str, Any]] = {}
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            # FIX: use a distinct name (was `qid`) so the country QID bound
            # above is no longer shadowed inside this loop.
            item_qid = item_uri.split("/")[-1] if item_uri else None
            if not item_qid or not item_qid.startswith("Q"):
                continue
            result = {
                "qid": item_qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "type": binding.get("typeLabel", {}).get("value", ""),
                "identifiers": {}
            }
            if "isil" in binding:
                result["identifiers"]["ISIL"] = binding["isil"]["value"]
            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime literal.
                result["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                # WKT literal, e.g. "Point(lon lat)" — note lon comes first.
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)
            results[item_qid] = result
        return results
    except Exception as e:
        # Best-effort script: report the failure and let the caller skip
        # this country rather than aborting the whole run.
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return {}
def fuzzy_match_institutions(
institutions: list[dict[str, Any]],
wikidata_results: dict[str, dict[str, Any]],
threshold: float = 0.85
) -> list[tuple[int, str, float, dict[str, Any]]]:
"""
Fuzzy match institutions with Wikidata results.
Returns: List of (institution_idx, qid, confidence_score, wd_data)
"""
matches = []
for idx, inst in enumerate(institutions):
inst_name = inst.get("name", "")
inst_type = inst.get("institution_type", "")
if not inst_name:
continue
# Skip if already has real Wikidata ID
has_wikidata = any(
id_obj.get("identifier_scheme") == "Wikidata" and
id_obj.get("identifier_value", "").startswith("Q") and
int(id_obj.get("identifier_value", "Q999999999")[1:]) < 100000000
for id_obj in inst.get("identifiers", [])
)
if has_wikidata:
continue
# Find best match
best_score = 0.0
best_qid = None
best_data = None
for qid, wd_data in wikidata_results.items():
wd_name = wd_data.get("name", "")
wd_desc = wd_data.get("description", "")
if not wd_name:
continue
# Check type compatibility
if not institution_type_compatible(inst_type, inst_name, wd_name, wd_desc):
continue
score = similarity_score(inst_name, wd_name)
if score > best_score:
best_score = score
best_qid = qid
best_data = wd_data
# Only include matches above threshold
if best_score >= threshold and best_qid and best_data:
matches.append((idx, best_qid, best_score, best_data))
return matches
def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """Enrich an institution record in place with Wikidata data.

    Adds a Wikidata identifier (plus ISIL/VIAF/Website when present and not
    already recorded), fills in a missing founding date, and completes the
    first location's coordinates when it has none. When anything changed,
    appends "Wikidata fuzzy enrichment" to the provenance extraction_method.

    Args:
        inst: Local institution record (mutated in place).
        wd_data: Wikidata match as produced by query_country_institutions.

    Returns:
        True if any field was added or changed.
    """
    enriched = False
    # Ensure an identifiers list exists to append into.
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}
    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True
    # Add other identifiers (ISIL / VIAF / Website) not already present.
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            identifiers_list.append(id_obj)
            enriched = True
    # Add founding date only when the record has none.
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        enriched = True
    # Add coordinates to the first location if it is missing a latitude.
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first_loc = locations[0]
            if isinstance(first_loc, dict) and first_loc.get("latitude") is None:
                first_loc["latitude"] = wd_data["latitude"]
                first_loc["longitude"] = wd_data["longitude"]
                enriched = True
    # Update provenance
    if enriched:
        # BUG FIX: previously `inst.get("provenance", {})` produced a detached
        # dict when the key was missing — the update was silently discarded.
        # setdefault attaches the new dict to the record.
        prov = inst.setdefault("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + Wikidata fuzzy enrichment"
            else:
                prov["extraction_method"] = "Wikidata fuzzy enrichment"
    return enriched
def main() -> None:
    """Entry point: fuzzy-enrich low-coverage countries and rewrite the dataset.

    Loads the global institutions YAML, queries Wikidata per target country,
    fuzzy-matches unenriched records, applies enrichments in place, and
    rewrites the same YAML file when at least one record changed.
    """
    base_dir = Path(__file__).parent.parent
    # NOTE(review): input_file and output_file are the same path — the
    # dataset is enriched and rewritten in place.
    input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    # Target countries with low coverage
    target_countries = ["BR", "BE", "IT", "NO", "TH", "VN"]
    print("="*80)
    print("🌍 LOW-COVERAGE COUNTRIES FUZZY MATCHING")
    print("="*80)
    print(f"\n🎯 Target countries: {', '.join(target_countries)}\n")
    print(f"📖 Loading dataset...\n")
    start_time = time.time()
    with open(input_file, 'r', encoding='utf-8') as f:
        # Expected to parse to a list of institution dicts.
        institutions = yaml.safe_load(f)
    load_time = time.time() - start_time
    print(f"✅ Loaded {len(institutions):,} institutions in {load_time:.1f}s\n")
    # Setup SPARQL client for the Wikidata endpoint.
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2")
    total_enriched = 0
    # Process each country independently; failures/empties just skip ahead.
    for country_code in target_countries:
        print("="*80)
        print(f"🌍 Processing {country_code}")
        print("="*80)
        # Indices (into `institutions`) of records located in this country.
        country_institutions_idx = [
            idx for idx, inst in enumerate(institutions)
            if any(loc.get('country') == country_code for loc in inst.get('locations', []))
        ]
        if not country_institutions_idx:
            print(f" ⚠️ No institutions found for {country_code}\n")
            continue
        print(f" Found {len(country_institutions_idx):,} institutions")
        # Indices of records lacking a real Wikidata QID (numeric part below
        # 100M; larger values are treated as synthetic placeholders).
        country_without_wikidata = [
            idx for idx in country_institutions_idx
            if not any(
                id_obj.get("identifier_scheme") == "Wikidata" and
                id_obj.get("identifier_value", "").startswith("Q") and
                int(id_obj.get("identifier_value", "Q999999999")[1:]) < 100000000
                for id_obj in institutions[idx].get("identifiers", [])
            )
        ]
        current_coverage = (len(country_institutions_idx) - len(country_without_wikidata)) / len(country_institutions_idx) * 100
        print(f" Current Wikidata coverage: {current_coverage:.1f}%")
        print(f" Institutions needing enrichment: {len(country_without_wikidata):,}\n")
        # Fetch GLAM candidates for this country from Wikidata (network call).
        print(f"🔍 Querying Wikidata for {country_code} institutions...")
        print(" (This may take 30-60 seconds)\n")
        wikidata_results = query_country_institutions(sparql, country_code)
        print(f"✅ Found {len(wikidata_results):,} {country_code} institutions in Wikidata\n")
        if not wikidata_results:
            print(f" ⚠️ No Wikidata results for {country_code}, skipping\n")
            continue
        # Fuzzy-match unenriched local records against the candidates.
        print("🔗 Fuzzy matching names (threshold: 0.85)...\n")
        country_insts = [institutions[idx] for idx in country_without_wikidata]
        matches = fuzzy_match_institutions(country_insts, wikidata_results, threshold=0.85)
        print(f"✨ Found {len(matches):,} high-confidence matches\n")
        # Show a few matches for manual eyeballing before applying.
        if matches:
            print(f"📋 Sample matches (showing first 5):")
            for i, (local_idx, qid, score, wd_data) in enumerate(matches[:5]):
                inst = country_insts[local_idx]
                print(f"\n{i+1}. Confidence: {score:.3f}")
                print(f" Local: {inst.get('name')}")
                print(f" Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
                print(f" Type: {wd_data.get('type', 'Unknown')}")
            print(f"\n✅ Applying {len(matches)} matches for {country_code}...\n")
            country_enriched = 0
            for local_idx, qid, score, wd_data in matches:
                # local_idx indexes country_insts; map back to the global list.
                global_idx = country_without_wikidata[local_idx]
                if enrich_institution(institutions[global_idx], wd_data):
                    country_enriched += 1
            print(f"✨ Enriched {country_enriched:,} institutions")
            # Projected coverage assuming each enrichment added a Wikidata ID.
            new_coverage = (len(country_institutions_idx) - len(country_without_wikidata) + country_enriched) / len(country_institutions_idx) * 100
            print(f" Coverage: {current_coverage:.1f}% → {new_coverage:.1f}% (+{new_coverage - current_coverage:.1f}%)\n")
            total_enriched += country_enriched
        else:
            print(f" ❌ No matches found for {country_code}\n")
        # Rate limiting between countries (be polite to the public endpoint).
        time.sleep(2.0)
    # Write output only when something actually changed.
    if total_enriched > 0:
        print("="*80)
        print("💾 Writing enriched dataset...")
        print("="*80 + "\n")
        with open(output_file, 'w', encoding='utf-8') as f:
            # YAML document marker plus comment header describing this run.
            header = f"""---
# Global Heritage Institutions - Low-Coverage Countries Fuzzy Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Countries processed: {', '.join(target_countries)}
# Total new matches: {total_enriched:,}
"""
            f.write(header)
            yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
        print(f"✅ Complete! Output: {output_file}\n")
        # Final report
        print("="*80)
        print("📊 ENRICHMENT REPORT")
        print("="*80)
        print(f"\n✨ Total institutions enriched: {total_enriched:,}")
        print(f"⏱️ Processing time: {(time.time()-start_time)/60:.1f} minutes")
        print("="*80 + "\n")
    else:
        print("❌ No institutions enriched\n")
# Run only when executed as a script (keeps the module importable for tests).
if __name__ == "__main__":
    main()