glam/scripts/enrich_phase2_mexico.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

441 lines
16 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Phase 2 Enrichment: Mexico (MX)
Target: 192 institutions, 17.7% Wikidata coverage → 35%+ (67+ institutions)
Strategy: SPARQL batch query + fuzzy name matching (Spanish normalization)
Based on: Brazil Phase 2 methodology (achieved 32.5% coverage from 13.7%)
GLAM Data Extraction Project - Phase 2: High-Volume Country Enrichment
"""
import sys
from pathlib import Path
from typing import Any, Dict, List, Tuple, Optional
from datetime import datetime, timezone
import time
import yaml
from difflib import SequenceMatcher
import re
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON
def normalize_name(name: str) -> str:
    """Normalize institution name for fuzzy matching (Spanish + English).

    Lowercases the name, strips leading/trailing institution-type words in
    Spanish and English, drops parenthesised abbreviations and punctuation,
    and collapses runs of whitespace to single spaces.
    """
    cleaned = name.lower()
    # (pattern, replacement) pairs applied in order: type-word prefixes and
    # suffixes are deleted outright; parentheticals and punctuation become a
    # space so the surrounding words stay separated.
    substitutions = (
        (r'^(fundación|museo|biblioteca|archivo|centro|memorial|parque|galería)\s+', ''),
        (r'\s+(museo|biblioteca|archivo|nacional|estatal|municipal|federal|regional|memorial)$', ''),
        (r'^(foundation|museum|library|archive|center|centre|memorial|park|gallery)\s+', ''),
        (r'\s+(museum|library|archive|national|state|federal|regional|municipal|memorial)$', ''),
        (r'\s*\([^)]*\)\s*', ' '),
        (r'[^\w\s]', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    # Normalize whitespace
    return ' '.join(cleaned.split())
def similarity_score(name1: str, name2: str) -> float:
    """Calculate similarity between two names (0-1).

    Both names are passed through normalize_name() first, so type words,
    punctuation, and case differences do not depress the ratio.
    """
    return SequenceMatcher(
        None,
        normalize_name(name1),
        normalize_name(name2),
    ).ratio()
def query_wikidata_mexican_institutions(sparql: SPARQLWrapper) -> Dict[str, Dict[str, Any]]:
    """
    Query Wikidata for ALL heritage institutions in Mexico.
    Institution types: museums, libraries, archives, galleries, universities with collections

    Returns a dict keyed by QID; each value holds name/description/type,
    an ``identifiers`` sub-dict (ISIL/VIAF/Website), and optional
    ``founding_date``/``latitude``/``longitude``. Returns {} on any error.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel
    WHERE {
      VALUES ?type { wd:Q33506 wd:Q7075 wd:Q166118 wd:Q207694 wd:Q473972 wd:Q641635 }
      ?item wdt:P31/wdt:P279* ?type . # instance of (or subclass of) institution type
      ?item wdt:P17 wd:Q96 . # country = Mexico (Q96)
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en,pt" . }
    }
    LIMIT 5000
    """
    sparql.setQuery(query)
    try:
        raw = sparql.query().convert()
        if isinstance(raw, dict):
            bindings = raw.get("results", {}).get("bindings", [])
        else:
            bindings = []
        # Parse results into dict keyed by QID
        parsed: Dict[str, Dict[str, Any]] = {}
        for row in bindings:
            uri = row.get("item", {}).get("value", "")
            qid = uri.split("/")[-1] if uri else None
            if not qid or not qid.startswith("Q"):
                continue
            entry: Dict[str, Any] = {
                "qid": qid,
                "name": row.get("itemLabel", {}).get("value", ""),
                "description": row.get("itemDescription", {}).get("value", ""),
                "type": row.get("typeLabel", {}).get("value", ""),
                "identifiers": {},
            }
            # Optional external identifiers, mapped to their scheme labels.
            for field, scheme in (("isil", "ISIL"), ("viaf", "VIAF"), ("website", "Website")):
                if field in row:
                    entry["identifiers"][scheme] = row[field]["value"]
            if "inception" in row:
                # Keep only the date part of the ISO timestamp.
                entry["founding_date"] = row["inception"]["value"].split("T")[0]
            if "coords" in row:
                # Wikidata P625 comes back as WKT "Point(lon lat)".
                point = row["coords"]["value"]
                if point.startswith("Point("):
                    lon, lat = point[6:-1].split()
                    entry["latitude"] = float(lat)
                    entry["longitude"] = float(lon)
            parsed[qid] = entry
        return parsed
    except Exception as e:
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return {}
def institution_type_compatible(inst_name: str, inst_type: str, wd_type: str) -> bool:
    """Check if institution types are compatible (prevent museum → library mismatches).

    An institution counts as a given category if either its explicit type code
    matches or a category keyword appears in its name; a locally categorised
    institution is only compatible with a Wikidata entry of the same category.
    """
    # MIXED, OFFICIAL_INSTITUTION, EDUCATION_PROVIDER can match any type
    if inst_type in ["MIXED", "OFFICIAL_INSTITUTION", "EDUCATION_PROVIDER"]:
        return True
    name_lower = inst_name.lower()
    wd_lower = wd_type.lower()
    # (keywords, explicit type code) per category, Spanish/Portuguese/English/French.
    categories = (
        (['museo', 'museu', 'museum'], "MUSEUM"),
        (['archivo', 'arquivo', 'archive'], "ARCHIVE"),
        (['biblioteca', 'library', 'bibliothèque'], "LIBRARY"),
        (['galería', 'galeria', 'gallery', 'galerie'], "GALLERY"),
    )
    for keywords, type_code in categories:
        local_match = inst_type == type_code or any(kw in name_lower for kw in keywords)
        wd_match = any(kw in wd_lower for kw in keywords)
        # A locally categorised institution must find the same category on the
        # Wikidata side, otherwise reject the candidate.
        if local_match and not wd_match:
            return False
    return True
def fuzzy_match_institutions(
    institutions: List[Dict[str, Any]],
    wikidata_results: Dict[str, Dict[str, Any]],
    threshold: float = 0.70
) -> List[Tuple[int, str, float, Dict[str, Any]]]:
    """
    Fuzzy match Mexican institutions with Wikidata results.
    Returns: List of (institution_idx, qid, confidence_score, wd_data)

    Institutions that already carry a real Wikidata QID (value starting with
    "Q") are skipped; for the rest, the single best-scoring type-compatible
    candidate is kept when it reaches ``threshold``.
    """
    matched: List[Tuple[int, str, float, Dict[str, Any]]] = []
    for position, record in enumerate(institutions):
        local_name = record.get("name", "")
        local_type = record.get("institution_type", "")
        if not local_name:
            continue
        # Skip records that are already linked to a genuine Wikidata entity.
        already_linked = any(
            entry.get("identifier_scheme") == "Wikidata"
            and entry.get("identifier_value", "").startswith("Q")
            for entry in record.get("identifiers", []) or []
        )
        if already_linked:
            continue
        # Scan every candidate, remembering only the single best score.
        top_score = 0.0
        top_qid = None
        top_data = None
        for qid, candidate in wikidata_results.items():
            candidate_name = candidate.get("name", "")
            if not candidate_name:
                continue
            # Reject candidates whose category contradicts the local record.
            if not institution_type_compatible(local_name, local_type, candidate.get("type", "")):
                continue
            score = similarity_score(local_name, candidate_name)
            if score > top_score:
                top_score, top_qid, top_data = score, qid, candidate
        # Only keep matches at or above the confidence threshold.
        if top_score >= threshold and top_qid and top_data:
            matched.append((position, top_qid, top_score, top_data))
    return matched
def enrich_institution(inst: Dict[str, Any], wd_data: Dict[str, Any], match_score: float) -> bool:
    """Enrich an institution with Wikidata data. Returns True if enriched.

    Mutates ``inst`` in place: adds a Wikidata identifier plus any
    ISIL/VIAF/Website identifiers from ``wd_data`` that are not already
    present, then records the enrichment in the provenance metadata.

    Args:
        inst: Institution record (dict) to enrich; modified in place.
        wd_data: Parsed Wikidata result ("qid", "name", "identifiers", ...).
        match_score: Fuzzy-match confidence (0-1), stored in the history entry.

    Returns:
        True if at least one identifier was added.
    """
    enriched = False
    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []
    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}
    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True
    # Add other identifiers (ISIL / VIAF / Website) not already present
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }
            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value
            identifiers_list.append(id_obj)
            enriched = True
    # Update provenance with enrichment metadata
    if enriched:
        # BUG FIX: the previous `inst.get("provenance", {})` returned a
        # detached dict when the key was missing, so the extraction-method
        # update and enrichment history were silently lost. setdefault()
        # attaches the new dict to the record; an existing non-dict
        # provenance value is still left untouched, as before.
        prov = inst.setdefault("provenance", {})
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + Phase 2 Mexico Wikidata enrichment"
            else:
                prov["extraction_method"] = "Phase 2 Mexico Wikidata enrichment"
            # Add enrichment history
            if "enrichment_history" not in prov:
                prov["enrichment_history"] = []
            prov["enrichment_history"].append({
                "enrichment_date": datetime.now(timezone.utc).isoformat(),
                "enrichment_method": "SPARQL query + fuzzy name matching (Spanish normalization, 70% threshold)",
                "enrichment_source": [f"https://www.wikidata.org/wiki/{wd_data['qid']}"],
                "match_score": match_score,
                "enrichment_notes": f"Phase 2: Fuzzy matched '{inst.get('name')}' to Wikidata '{wd_data.get('name')}'"
            })
    return enriched
def main() -> None:
    """Run the Phase 2 Mexico enrichment pipeline end to end.

    Steps: load the master YAML dataset, select Mexican (country == 'MX')
    institutions, back up the master file, query Wikidata via SPARQL,
    fuzzy-match names, apply matches to the master list in place, and
    write the updated dataset back to disk.
    """
    base_dir = Path(__file__).parent.parent
    master_file = base_dir / "data" / "instances" / "all" / "globalglam-20251111.yaml"
    backup_file = base_dir / "data" / "instances" / "all" / "globalglam-20251111.yaml.phase2_mexico_backup"
    print("="*80)
    print("🇲🇽 PHASE 2 MEXICO WIKIDATA ENRICHMENT")
    print("="*80)
    print(f"\n📖 Loading master dataset: {master_file.name}\n")
    start_time = time.time()
    # Load master dataset (expected: a YAML list of institution dicts)
    with open(master_file, 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)
    # Filter Mexico institutions
    mexico_institutions = [
        inst for inst in all_institutions
        if inst.get('locations') and any(loc.get('country') == 'MX' for loc in inst['locations'])
    ]
    # Get Mexico institution indices in master dataset
    # NOTE: must use the exact same filter as above so that
    # mexico_indices[i] is the master-list position of mexico_institutions[i].
    mexico_indices = [
        i for i, inst in enumerate(all_institutions)
        if inst.get('locations') and any(loc.get('country') == 'MX' for loc in inst['locations'])
    ]
    load_time = time.time() - start_time
    print(f"✅ Loaded {len(all_institutions):,} total institutions in {load_time:.1f}s")
    print(f"✅ Found {len(mexico_institutions):,} Mexican institutions\n")
    # Count Wikidata coverage (any "Wikidata" scheme counts, regardless of value)
    with_wikidata = sum(
        1 for inst in mexico_institutions
        if inst.get('identifiers') and any(
            id_obj.get("identifier_scheme") == "Wikidata"
            for id_obj in inst['identifiers']
        )
    )
    without_wikidata = len(mexico_institutions) - with_wikidata
    current_coverage = (with_wikidata / len(mexico_institutions) * 100) if mexico_institutions else 0
    print(f"✅ With Wikidata: {with_wikidata:,} ({current_coverage:.1f}%)")
    print(f"❓ Without Wikidata: {without_wikidata:,}\n")
    if without_wikidata == 0:
        print("✨ All Mexican institutions already have Wikidata IDs!")
        return
    # Create backup of the full dataset before any mutation
    print(f"💾 Creating backup: {backup_file.name}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
    print(f"✅ Backup created\n")
    # Query Wikidata
    print("🔍 Querying Wikidata for Mexican heritage institutions...")
    print("   (This may take 30-60 seconds)\n")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    # POST avoids URL-length limits for the large query text
    sparql.setMethod('POST')
    sparql.setTimeout(120)  # 2 minute timeout
    # Custom User-Agent per Wikimedia API etiquette
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2 (Phase 2 Mexico Enrichment)")
    query_start = time.time()
    wikidata_results = query_wikidata_mexican_institutions(sparql)
    query_time = time.time() - query_start
    print(f"✅ Found {len(wikidata_results):,} Mexican institutions in Wikidata (query took {query_time:.1f}s)\n")
    if not wikidata_results:
        print("⚠️ No Wikidata results, aborting enrichment")
        return
    # Fuzzy match local records against Wikidata candidates
    print("🔗 Fuzzy matching names (threshold: 0.70, Spanish normalization)...\n")
    matches = fuzzy_match_institutions(mexico_institutions, wikidata_results, threshold=0.70)
    print(f"✨ Found {len(matches):,} high-confidence matches\n")
    if not matches:
        print("❌ No matches found. Try lowering threshold below 0.70.\n")
        return
    # Show sample matches (highest confidence first) for manual spot-checking
    print(f"{'='*80}")
    print(f"📋 SAMPLE MATCHES (Top 10)")
    print(f"{'='*80}")
    sorted_matches = sorted(matches, key=lambda x: x[2], reverse=True)
    for i, (idx, qid, score, wd_data) in enumerate(sorted_matches[:10], 1):
        inst = mexico_institutions[idx]
        # Fall back to region, then "Unknown", when the first location has no city
        city = inst.get("locations", [{}])[0].get("city", inst.get("locations", [{}])[0].get("region", "Unknown"))
        print(f"\n{i}. Confidence: {score:.3f}")
        print(f"   Local:     {inst.get('name')} ({city})")
        print(f"   Wikidata:  {wd_data.get('name')} ({wd_data.get('qid')})")
        print(f"   Type:      {wd_data.get('type', 'Unknown')}")
    print(f"\n{'='*80}\n")
    # Apply matches to master dataset
    print("✅ Applying matches to master dataset...\n")
    enriched_count = 0
    for local_idx, qid, score, wd_data in matches:
        # Translate the Mexico-list index back to the master-list index
        master_idx = mexico_indices[local_idx]
        if enrich_institution(all_institutions[master_idx], wd_data, score):
            enriched_count += 1
    new_coverage = (with_wikidata + enriched_count) / len(mexico_institutions) * 100
    print(f"✨ Enriched {enriched_count:,} institutions")
    print(f"📈 Coverage: {current_coverage:.1f}% → {new_coverage:.1f}%")
    print(f"   (+{new_coverage - current_coverage:.1f} percentage points)\n")
    # Write updated master dataset back over the original file
    print("="*80)
    print("💾 WRITING UPDATED MASTER DATASET")
    print("="*80 + "\n")
    print(f"📝 Writing {len(all_institutions):,} institutions to disk...")
    print("   (This may take 2-3 minutes for large datasets)\n")
    write_start = time.time()
    with open(master_file, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
    write_time = time.time() - write_start
    print(f"✅ Updated: {master_file} (write took {write_time:.1f}s)\n")
    # Final summary
    print("="*80)
    print("📊 ENRICHMENT COMPLETE")
    print("="*80)
    print(f"\n✨ Results:")
    print(f"   Mexican institutions enriched: {enriched_count:,}")
    print(f"   Coverage increase: {current_coverage:.1f}% → {new_coverage:.1f}% (+{new_coverage - current_coverage:.1f}pp)")
    print(f"   Remaining without Wikidata: {without_wikidata - enriched_count:,}")
    print(f"   Overall dataset: {len(all_institutions):,} institutions")
    print(f"\n⏱️ Total processing time: {(time.time()-start_time)/60:.1f} minutes")
    print(f"\n🎯 Phase 2 Target: 35%+ coverage (67+ institutions)")
    if new_coverage >= 35:
        print(f"   ✅ TARGET ACHIEVED!")
    else:
        print(f"   ⏳ In progress... ({new_coverage:.1f}% / 35%)")
    print("="*80 + "\n")
# Script entry point: run the enrichment only when executed directly.
if __name__ == "__main__":
    main()