glam/scripts/enrich_latam_institutions_fuzzy.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

500 lines
18 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Latin American institutions using fuzzy name matching in Wikidata.
This script addresses low coverage in Brazil (1%), Mexico (21%), and Chile (29%)
by querying Wikidata for heritage institutions using name-based searches.
Strategy:
1. Find institutions without Wikidata IDs in target countries
2. Query Wikidata for museums/archives/libraries in each country
3. Fuzzy match names (normalized)
4. Apply high-confidence matches (>0.85)
Countries:
- Brazil (BR, Q155): 1% → 15-25% expected
- Mexico (MX, Q96): 21% → 35-45% expected
- Chile (CL, Q298): 29% → 40-50% expected
"""
import sys
from pathlib import Path
from typing import Any, Optional
from datetime import datetime, timezone
import time
import yaml
from difflib import SequenceMatcher
import re
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
# Country configurations: ISO 3166-1 alpha-2 code -> display name,
# the country's Wikidata QID (used in the SPARQL country filter),
# and a flag emoji for console output.
COUNTRIES = {
    'BR': {'name': 'Brazil', 'qid': 'Q155', 'flag': '🇧🇷'},
    'MX': {'name': 'Mexico', 'qid': 'Q96', 'flag': '🇲🇽'},
    'CL': {'name': 'Chile', 'qid': 'Q298', 'flag': '🇨🇱'}
}
def normalize_name(name: str) -> str:
    """Return a normalized form of *name* for fuzzy comparison.

    Lowercases, strips common GLAM-sector type words from the start/end of
    the name (Spanish, then Portuguese, then English — order matters so the
    result matches across languages), replaces punctuation with spaces and
    collapses runs of whitespace.
    """
    text = name.lower()
    # (prefix pattern, suffix pattern) per language, applied in this order.
    affix_patterns = (
        (r'^(fundación|museo|biblioteca|archivo|centro)\s+',
         r'\s+(museo|biblioteca|archivo|nacional|regional|municipal)$'),
        (r'^(fundação|museu|biblioteca|arquivo|centro)\s+',
         r'\s+(museu|biblioteca|arquivo|nacional|estadual|municipal)$'),
        (r'^(foundation|museum|library|archive|center|centre)\s+',
         r'\s+(museum|library|archive|national|regional|municipal)$'),
    )
    for prefix_re, suffix_re in affix_patterns:
        text = re.sub(prefix_re, '', text)
        text = re.sub(suffix_re, '', text)
    # Punctuation becomes whitespace, then whitespace is collapsed.
    text = re.sub(r'[^\w\s]', ' ', text)
    return ' '.join(text.split())
def similarity_score(name1: str, name2: str) -> float:
    """Similarity ratio in [0.0, 1.0] between two normalized institution names."""
    return SequenceMatcher(None, normalize_name(name1), normalize_name(name2)).ratio()
def query_wikidata_institutions(
    sparql: SPARQLWrapper,
    country_qid: str,
    institution_types: list[str]
) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for heritage institutions in a specific country.

    country_qid: Wikidata QID for country (Q155=Brazil, Q96=Mexico, Q298=Chile)
    institution_types: List of Wikidata QIDs for institution types
        (Q33506 museum, Q7075 library, Q166118 archive)

    Returns a mapping of item QID -> parsed record with name, description,
    type label, identifiers (ISIL/VIAF/Website), optional founding_date and
    coordinates. Returns an empty dict on query failure.
    """
    types_values = " ".join(f"wd:{qid}" for qid in institution_types)
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel
    WHERE {{
      VALUES ?type {{ {types_values} }}
      ?item wdt:P31 ?type .            # instance of museum/library/archive
      ?item wdt:P17 wd:{country_qid} . # country
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,pt,en" . }}
    }}
    LIMIT 2000
    """
    sparql.setQuery(query)
    try:
        raw = sparql.query().convert()
        bindings = raw.get("results", {}).get("bindings", []) if isinstance(raw, dict) else []
        parsed: dict[str, dict[str, Any]] = {}
        for row in bindings:
            item_uri = row.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            if not qid or not qid.startswith("Q"):
                continue
            record: dict[str, Any] = {
                "qid": qid,
                "name": row.get("itemLabel", {}).get("value", ""),
                "description": row.get("itemDescription", {}).get("value", ""),
                "type": row.get("typeLabel", {}).get("value", ""),
                "identifiers": {},
            }
            # Optional external identifiers, copied only when bound.
            for var, scheme in (("isil", "ISIL"), ("viaf", "VIAF"), ("website", "Website")):
                if var in row:
                    record["identifiers"][scheme] = row[var]["value"]
            if "inception" in row:
                # Keep only the date part of the xsd:dateTime literal.
                record["founding_date"] = row["inception"]["value"].split("T")[0]
            if "coords" in row:
                wkt = row["coords"]["value"]
                # WKT literals look like "Point(lon lat)" — note lon comes first.
                if wkt.startswith("Point("):
                    lon, lat = wkt[6:-1].split()
                    record["latitude"] = float(lat)
                    record["longitude"] = float(lon)
            parsed[qid] = record
        return parsed
    except Exception as e:
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return {}
def institution_type_compatible(inst_name: str, wd_type: str) -> bool:
    """Check that a local name and a Wikidata type label agree on GLAM kind.

    If the institution's name explicitly says "museum", "archive" or
    "library" (in any supported language), the Wikidata type label must
    mention the same kind — this avoids e.g. museum/archive cross-matches.
    Names with no explicit kind are compatible with everything.
    """
    # One keyword tuple per kind (multilingual).
    keyword_sets = (
        ('museum', 'museo', 'museu', 'musée'),
        ('archief', 'archive', 'archivo', 'arquivo'),
        ('bibliotheek', 'library', 'biblioteca', 'bibliothèque'),
    )
    name_lower = inst_name.lower()
    type_lower = wd_type.lower()
    for keywords in keyword_sets:
        name_has_kind = any(kw in name_lower for kw in keywords)
        type_has_kind = any(kw in type_lower for kw in keywords)
        # A kind stated in the name must be confirmed by the Wikidata type.
        if name_has_kind and not type_has_kind:
            return False
    return True
def _is_real_wikidata_id(id_obj: Any) -> bool:
    """True if *id_obj* is a well-formed, non-synthetic Wikidata identifier.

    Synthetic placeholders use huge Q-numbers (>= 100_000_000). Previously a
    malformed value such as "Q" or "Qabc" made int() raise ValueError, and a
    non-dict identifier entry raised AttributeError, crashing the whole run;
    both now simply count as "not a real Wikidata ID".
    """
    if not isinstance(id_obj, dict):
        return False
    if id_obj.get("identifier_scheme") != "Wikidata":
        return False
    value = id_obj.get("identifier_value", "")
    if not value.startswith("Q"):
        return False
    try:
        return int(value[1:]) < 100000000
    except ValueError:
        return False


def fuzzy_match_institutions(
    institutions: list[dict[str, Any]],
    wikidata_results: dict[str, dict[str, Any]],
    threshold: float = 0.85
) -> list[tuple[int, str, float, dict[str, Any]]]:
    """
    Fuzzy match institutions with Wikidata results.

    institutions: local institution records (need at least a "name" key)
    wikidata_results: QID -> record, as returned by query_wikidata_institutions
    threshold: minimum similarity ratio (0-1) to accept a match

    Returns: List of (institution_idx, qid, confidence_score, wd_data)
    """
    matches = []
    for idx, inst in enumerate(institutions):
        inst_name = inst.get("name", "")
        if not inst_name:
            continue
        # Skip if already has a real (non-synthetic) Wikidata ID
        if any(_is_real_wikidata_id(id_obj) for id_obj in inst.get("identifiers", [])):
            continue
        # Find the best-scoring, type-compatible candidate
        best_score = 0.0
        best_qid = None
        best_data = None
        for qid, wd_data in wikidata_results.items():
            wd_name = wd_data.get("name", "")
            if not wd_name:
                continue
            # Avoid museum/library/archive cross-kind matches
            if not institution_type_compatible(inst_name, wd_data.get("type", "")):
                continue
            score = similarity_score(inst_name, wd_name)
            if score > best_score:
                best_score, best_qid, best_data = score, qid, wd_data
        # Only include matches above threshold
        if best_score >= threshold and best_qid and best_data:
            matches.append((idx, best_qid, best_score, best_data))
    return matches
def enrich_institution(inst: dict[str, Any], wd_data: dict[str, Any]) -> bool:
    """Enrich a local institution record with Wikidata data, in place.

    Adds or replaces the Wikidata QID (synthetic placeholders get replaced),
    copies identifiers we don't have yet (ISIL/VIAF/Website), fills in the
    founding date and missing first-location coordinates, and notes the
    enrichment in the provenance block. Returns True if anything changed.
    """
    changed = False
    if not inst.get("identifiers"):
        inst["identifiers"] = []
    ids = inst["identifiers"]
    known_schemes = {entry.get("identifier_scheme", "") for entry in ids if isinstance(entry, dict)}

    qid = wd_data["qid"]
    wikidata_entry = {
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}"
    }
    # Replace an existing (possibly synthetic) Wikidata ID, or append a new one.
    existing_pos = next(
        (pos for pos, entry in enumerate(ids)
         if isinstance(entry, dict) and entry.get("identifier_scheme") == "Wikidata"),
        None,
    )
    if existing_pos is None:
        ids.append(wikidata_entry)
        changed = True
    elif ids[existing_pos].get("identifier_value", "") != qid:
        ids[existing_pos] = wikidata_entry
        changed = True

    # Copy over identifier schemes we don't already have.
    for scheme, value in wd_data.get("identifiers", {}).items():
        if scheme in known_schemes:
            continue
        entry = {
            "identifier_scheme": scheme,
            "identifier_value": value
        }
        if scheme == "VIAF":
            entry["identifier_url"] = f"https://viaf.org/viaf/{value}"
        elif scheme == "Website":
            entry["identifier_url"] = value
        ids.append(entry)
        changed = True

    # Founding date: fill only when absent locally.
    if "founding_date" in wd_data and not inst.get("founding_date"):
        inst["founding_date"] = wd_data["founding_date"]
        changed = True

    # Coordinates: fill the first location only when it has none.
    if "latitude" in wd_data and "longitude" in wd_data:
        locations = inst.get("locations", [])
        if isinstance(locations, list) and len(locations) > 0:
            first = locations[0]
            if isinstance(first, dict) and first.get("latitude") is None:
                first["latitude"] = wd_data["latitude"]
                first["longitude"] = wd_data["longitude"]
                changed = True

    # Record how the enrichment happened.
    if changed:
        prov = inst.get("provenance", {})
        if isinstance(prov, dict):
            method = prov.get("extraction_method", "")
            prov["extraction_method"] = (
                f"{method} + Wikidata enrichment (fuzzy name match)"
                if method else "Wikidata enrichment (fuzzy name match)"
            )
    return changed
def _primary_location(inst: dict[str, Any]) -> dict[str, Any]:
    """Return the institution's first location dict, or {} if there is none.

    Guards against both a missing 'locations' key AND an empty list: the
    previous inline form inst.get('locations', [{}])[0] raised IndexError
    when 'locations' was present but empty (the default only applies when
    the key is missing entirely).
    """
    locations = inst.get('locations') or [{}]
    first = locations[0]
    return first if isinstance(first, dict) else {}


def process_country(
    institutions: list[dict[str, Any]],
    country_code: str,
    sparql: SPARQLWrapper
) -> tuple[int, int]:
    """
    Process a single country's institutions.

    Filters the dataset down to *country_code*, queries Wikidata for that
    country's museums/libraries/archives, fuzzy matches names and applies
    every match above the 0.85 confidence threshold, mutating *institutions*
    in place.

    Returns: (institutions_without_wikidata, enriched_count)
    """
    country_info = COUNTRIES[country_code]
    print(f"\n{'='*80}")
    print(f"{country_info['flag']} {country_info['name'].upper()} ({country_code})")
    print(f"{'='*80}\n")

    # Filter institutions for this country
    country_institutions_idx = [
        idx for idx, inst in enumerate(institutions)
        if _primary_location(inst).get('country') == country_code
    ]
    print(f"📊 Found {len(country_institutions_idx):,} {country_info['name']} institutions")

    def _has_real_wikidata(inst: dict[str, Any]) -> bool:
        """True if the record holds a well-formed, non-synthetic Wikidata QID."""
        for id_obj in inst.get("identifiers", []):
            if not isinstance(id_obj, dict):
                continue
            if id_obj.get("identifier_scheme") != "Wikidata":
                continue
            value = id_obj.get("identifier_value", "")
            if not value.startswith("Q"):
                continue
            try:
                # Synthetic placeholder IDs use huge Q-numbers.
                if int(value[1:]) < 100000000:
                    return True
            except ValueError:
                # Malformed QID (e.g. "Q" or "Qabc") — previously crashed int().
                continue
        return False

    # Count those without a real Wikidata ID
    without_wikidata = [
        idx for idx in country_institutions_idx
        if not _has_real_wikidata(institutions[idx])
    ]
    with_wikidata = len(country_institutions_idx) - len(without_wikidata)
    current_coverage = with_wikidata / len(country_institutions_idx) * 100 if country_institutions_idx else 0
    print(f"✅ With Wikidata: {with_wikidata:,} ({current_coverage:.1f}%)")
    print(f"❓ Without Wikidata: {len(without_wikidata):,}\n")
    if not without_wikidata:
        print("✨ All institutions already have Wikidata IDs!")
        return 0, 0

    # Query Wikidata
    print(f"🔍 Querying Wikidata for {country_info['name']} museums, libraries, and archives...")
    print(" (This may take 30-60 seconds)\n")
    institution_types = ["Q33506", "Q7075", "Q166118"]  # museum, library, archive
    wikidata_results = query_wikidata_institutions(sparql, country_info['qid'], institution_types)
    print(f"✅ Found {len(wikidata_results):,} {country_info['name']} institutions in Wikidata\n")
    if not wikidata_results:
        print("⚠️ No Wikidata results, skipping fuzzy matching")
        return len(without_wikidata), 0

    # Fuzzy match
    print("🔗 Fuzzy matching names (threshold: 0.85)...\n")
    country_insts = [institutions[idx] for idx in without_wikidata]
    matches = fuzzy_match_institutions(country_insts, wikidata_results, threshold=0.85)
    print(f"✨ Found {len(matches):,} high-confidence matches\n")

    if matches:
        # Show sample matches before applying
        print(f"{'='*80}")
        print(f"📋 SAMPLE MATCHES (Top 5)")
        print(f"{'='*80}")
        for i, (local_idx, qid, score, wd_data) in enumerate(matches[:5]):
            inst = country_insts[local_idx]
            print(f"\n{i+1}. Confidence: {score:.3f}")
            print(f" Local: {inst.get('name')} ({_primary_location(inst).get('city', 'Unknown')})")
            print(f" Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
            print(f" Type: {wd_data.get('type', 'Unknown')}")
            if "ISIL" in wd_data.get("identifiers", {}):
                print(f" ISIL: {wd_data['identifiers']['ISIL']}")
        print(f"\n{'='*80}\n")

        # Apply all matches (local match index maps back to the global index)
        print("✅ Applying all matches...\n")
        enriched_count = 0
        for local_idx, qid, score, wd_data in matches:
            global_idx = without_wikidata[local_idx]
            if enrich_institution(institutions[global_idx], wd_data):
                enriched_count += 1
        new_coverage = (with_wikidata + enriched_count) / len(country_institutions_idx) * 100 if country_institutions_idx else 0
        print(f"✨ Enriched {enriched_count:,} institutions")
        print(f"📈 Coverage: {current_coverage:.1f}% → {new_coverage:.1f}%\n")
        return len(without_wikidata), enriched_count
    else:
        print("❌ No matches found. Try lowering threshold.\n")
        return len(without_wikidata), 0
def main():
    """Run the Latin-America fuzzy-matching enrichment pipeline end to end.

    Loads the Wikidata-enriched dataset, processes BR/MX/CL in turn (with a
    pause between countries to respect Wikidata rate limits), then writes the
    enriched dataset with a commented YAML header and prints a final report.
    """
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_latam_enriched.yaml"

    print("="*80)
    print("🌎 LATIN AMERICA INSTITUTIONS FUZZY MATCHING")
    print("="*80)
    print(f"\n📖 Loading dataset...\n")
    start_time = time.time()
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    load_time = time.time() - start_time
    print(f"✅ Loaded {len(institutions):,} institutions in {load_time:.1f}s")

    # Setup SPARQL client (POST avoids URL-length limits on large queries)
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2")

    # Process each country
    total_without_wikidata = 0
    total_enriched = 0
    country_codes = ['BR', 'MX', 'CL']
    for i, country_code in enumerate(country_codes):
        without, enriched = process_country(institutions, country_code, sparql)
        total_without_wikidata += without
        total_enriched += enriched
        # Rate limiting - be nice to Wikidata. Skip the pause after the last
        # country (positional check instead of hard-coding 'CL', so the list
        # can be extended without silently reintroducing a trailing sleep).
        if i < len(country_codes) - 1:
            print("⏸️ Waiting 5 seconds (Wikidata rate limiting)...\n")
            time.sleep(5)

    # Write output
    print("="*80)
    print("💾 WRITING ENRICHED DATASET")
    print("="*80 + "\n")
    # Count BR/MX/CL institutions safely: an empty 'locations' list must not
    # raise IndexError (inst.get('locations', [{}])[0] would, since the
    # default only applies when the key is missing entirely).
    latam_count = sum(
        1 for inst in institutions
        if (inst.get('locations') or [{}])[0].get('country') in ('BR', 'MX', 'CL')
    )
    header = f"""---
# Global Heritage Institutions - Latin America Fuzzy Match Enriched
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(institutions):,}
# Latin America institutions processed: {latam_count:,}
# New Latin America matches: {total_enriched:,}
"""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
    print(f"✅ Complete! Output: {output_file}\n")

    # Final report
    print("="*80)
    print("📊 FINAL ENRICHMENT REPORT")
    print("="*80)
    print(f"\n✨ Results:")
    print(f" Total institutions enriched: {total_enriched:,}")
    print(f" Latin America institutions without Wikidata: {total_without_wikidata - total_enriched:,}")
    print(f"\n⏱️ Total processing time: {(time.time()-start_time)/60:.1f} minutes")
    print("="*80 + "\n")


if __name__ == "__main__":
    main()