- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
441 lines
16 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Phase 2 Enrichment: Mexico (MX)
|
|
|
|
Target: 192 institutions, 17.7% Wikidata coverage → 35%+ (67+ institutions)
|
|
Strategy: SPARQL batch query + fuzzy name matching (Spanish normalization)
|
|
Based on: Brazil Phase 2 methodology (achieved 32.5% coverage from 13.7%)
|
|
|
|
GLAM Data Extraction Project - Phase 2: High-Volume Country Enrichment
|
|
"""
|
|
|
|
import re
import sys
import time
import unicodedata
from datetime import datetime, timezone
from difflib import SequenceMatcher
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import yaml
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching (Spanish + English).

    Lowercases, strips common GLAM prefixes/suffixes in both languages,
    removes parenthesized abbreviations and punctuation, collapses
    whitespace, and finally folds diacritics so accented and unaccented
    spellings (e.g. "antropología" vs "antropologia") compare equal.
    """
    name = name.lower()

    # Remove common prefixes/suffixes (Spanish + English).
    # NOTE: the accented patterns ('fundación', 'galería') must run BEFORE
    # the diacritic fold at the bottom, or they would never match.
    name = re.sub(r'^(fundación|museo|biblioteca|archivo|centro|memorial|parque|galería)\s+', '', name)
    name = re.sub(r'\s+(museo|biblioteca|archivo|nacional|estatal|municipal|federal|regional|memorial)$', '', name)
    name = re.sub(r'^(foundation|museum|library|archive|center|centre|memorial|park|gallery)\s+', '', name)
    name = re.sub(r'\s+(museum|library|archive|national|state|federal|regional|municipal|memorial)$', '', name)

    # Remove abbreviations in parentheses, e.g. "(INAH)"
    name = re.sub(r'\s*\([^)]*\)\s*', ' ', name)

    # Remove punctuation
    name = re.sub(r'[^\w\s]', ' ', name)

    # Normalize whitespace
    name = ' '.join(name.split())

    # FIX: fold diacritics (NFKD decompose, drop combining marks) so that
    # "méxico" and "mexico" normalize identically. Without this, accented
    # vs unaccented variants of the same name scored below the 0.70 fuzzy
    # threshold despite the "Spanish normalization" contract.
    name = unicodedata.normalize('NFKD', name)
    name = ''.join(ch for ch in name if not unicodedata.combining(ch))

    return name
|
|
|
|
|
def similarity_score(name1: str, name2: str) -> float:
    """Return a similarity ratio in [0, 1] between two normalized names."""
    return SequenceMatcher(None, normalize_name(name1), normalize_name(name2)).ratio()
|
|
|
|
|
def query_wikidata_mexican_institutions(sparql: SPARQLWrapper) -> Dict[str, Dict[str, Any]]:
    """
    Query Wikidata for ALL heritage institutions in Mexico.

    Institution types: museums, libraries, archives, galleries, universities with collections

    Returns a dict keyed by QID; each value holds name/description/type,
    an "identifiers" sub-dict (ISIL/VIAF/Website), and optionally
    founding_date and latitude/longitude. Returns {} on any query error.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel
    WHERE {
      VALUES ?type { wd:Q33506 wd:Q7075 wd:Q166118 wd:Q207694 wd:Q473972 wd:Q641635 }

      ?item wdt:P31/wdt:P279* ?type .  # instance of (or subclass of) institution type
      ?item wdt:P17 wd:Q96 .           # country = Mexico (Q96)

      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "es,en,pt" . }
    }
    LIMIT 5000
    """

    sparql.setQuery(query)

    try:
        raw_response = sparql.query().convert()
        if isinstance(raw_response, dict):
            bindings = raw_response.get("results", {}).get("bindings", [])
        else:
            bindings = []

        # Build the result dict keyed by QID; later rows for the same item
        # (e.g. multiple types) overwrite earlier ones.
        parsed: Dict[str, Dict[str, Any]] = {}
        for row in bindings:
            uri = row.get("item", {}).get("value", "")
            qid = uri.rsplit("/", 1)[-1] if uri else None
            if not (qid and qid.startswith("Q")):
                continue

            entry: Dict[str, Any] = {
                "qid": qid,
                "name": row.get("itemLabel", {}).get("value", ""),
                "description": row.get("itemDescription", {}).get("value", ""),
                "type": row.get("typeLabel", {}).get("value", ""),
                "identifiers": {},
            }

            # Optional identifier fields -> scheme names used downstream
            for field, scheme in (("isil", "ISIL"), ("viaf", "VIAF"), ("website", "Website")):
                if field in row:
                    entry["identifiers"][scheme] = row[field]["value"]

            if "inception" in row:
                # Keep only the date part of the xsd:dateTime literal
                entry["founding_date"] = row["inception"]["value"].split("T")[0]

            if "coords" in row:
                # WKT literal: "Point(lon lat)"
                point = row["coords"]["value"]
                if point.startswith("Point("):
                    lon_text, lat_text = point[6:-1].split()
                    entry["latitude"] = float(lat_text)
                    entry["longitude"] = float(lon_text)

            parsed[qid] = entry

        return parsed

    except Exception as e:
        print(f"\n❌ Error querying Wikidata: {e}")
        import traceback
        traceback.print_exc()
        return {}
|
|
|
|
|
def institution_type_compatible(inst_name: str, inst_type: str, wd_type: str) -> bool:
    """Check if institution types are compatible (prevent museum → library mismatches).

    An institution is considered to "be" a category when its name contains
    one of that category's keywords (es/pt/en/fr) or its declared type code
    matches; the Wikidata candidate must then carry the same category.
    """
    # Flexible local type codes may match any Wikidata type.
    if inst_type in ("MIXED", "OFFICIAL_INSTITUTION", "EDUCATION_PROVIDER"):
        return True

    name_lc = inst_name.lower()
    wd_lc = wd_type.lower()

    # (keywords, local type code) per category
    categories = (
        (('museo', 'museu', 'museum'), "MUSEUM"),
        (('archivo', 'arquivo', 'archive'), "ARCHIVE"),
        (('biblioteca', 'library', 'bibliothèque'), "LIBRARY"),
        (('galería', 'galeria', 'gallery', 'galerie'), "GALLERY"),
    )

    # If the local record clearly belongs to a category, the Wikidata
    # candidate's type label must mention that category too.
    for keywords, type_code in categories:
        local_has = any(kw in name_lc for kw in keywords) or inst_type == type_code
        wd_has = any(kw in wd_lc for kw in keywords)
        if local_has and not wd_has:
            return False

    return True
|
|
|
|
|
def fuzzy_match_institutions(
    institutions: List[Dict[str, Any]],
    wikidata_results: Dict[str, Dict[str, Any]],
    threshold: float = 0.70
) -> List[Tuple[int, str, float, Dict[str, Any]]]:
    """
    Fuzzy match Mexican institutions with Wikidata results.

    Returns: List of (institution_idx, qid, confidence_score, wd_data)
    """
    matches: List[Tuple[int, str, float, Dict[str, Any]]] = []

    for idx, record in enumerate(institutions):
        local_name = record.get("name", "")
        local_type = record.get("institution_type", "")
        if not local_name:
            continue

        # Skip records that already carry a real Wikidata QID.
        already_linked = any(
            ident.get("identifier_scheme") == "Wikidata"
            and ident.get("identifier_value", "").startswith("Q")
            for ident in record.get("identifiers", []) or []
        )
        if already_linked:
            continue

        # Scan every Wikidata candidate, keeping the highest-scoring
        # type-compatible one (first occurrence wins on ties).
        top_score = 0.0
        top_qid = None
        top_data = None

        for qid, candidate in wikidata_results.items():
            cand_name = candidate.get("name", "")
            if not cand_name:
                continue

            # Reject cross-category pairings (e.g. museum vs library).
            if not institution_type_compatible(local_name, local_type, candidate.get("type", "")):
                continue

            score = similarity_score(local_name, cand_name)
            if score > top_score:
                top_score = score
                top_qid = qid
                top_data = candidate

        # Keep only matches at or above the confidence threshold.
        if top_qid is not None and top_data is not None and top_score >= threshold:
            matches.append((idx, top_qid, top_score, top_data))

    return matches
|
|
|
|
|
def enrich_institution(inst: Dict[str, Any], wd_data: Dict[str, Any], match_score: float) -> bool:
    """Enrich an institution with Wikidata data. Returns True if enriched.

    Adds a Wikidata identifier plus any additional identifiers found on
    Wikidata (ISIL/VIAF/Website), skipping schemes the institution already
    has, and records enrichment provenance metadata in place.

    Args:
        inst: Institution record (mutated in place).
        wd_data: Parsed Wikidata result with "qid", "name", "identifiers".
        match_score: Fuzzy-match confidence (0-1), stored in provenance.

    Returns:
        True if at least one identifier was added, else False.
    """
    enriched = False

    if "identifiers" not in inst or not inst["identifiers"]:
        inst["identifiers"] = []

    identifiers_list = inst["identifiers"]
    existing_schemes = {i.get("identifier_scheme", "") for i in identifiers_list if isinstance(i, dict)}

    # Add Wikidata ID
    if "Wikidata" not in existing_schemes:
        identifiers_list.append({
            "identifier_scheme": "Wikidata",
            "identifier_value": wd_data["qid"],
            "identifier_url": f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })
        enriched = True

    # Add other identifiers (schemes in wd_data are unique, so no need to
    # refresh existing_schemes inside the loop)
    wd_identifiers = wd_data.get("identifiers", {})
    for scheme, value in wd_identifiers.items():
        if scheme not in existing_schemes:
            id_obj = {
                "identifier_scheme": scheme,
                "identifier_value": value
            }

            if scheme == "VIAF":
                id_obj["identifier_url"] = f"https://viaf.org/viaf/{value}"
            elif scheme == "Website":
                id_obj["identifier_url"] = value

            identifiers_list.append(id_obj)
            enriched = True

    # Update provenance with enrichment metadata
    if enriched:
        # BUG FIX: previously `inst.get("provenance", {})` returned a
        # detached dict when the key was missing, so the enrichment method
        # and history were written to a throwaway object and silently lost.
        # Attach a fresh dict to the record instead. A non-dict provenance
        # value (other than None) is still left untouched, as before.
        prov = inst.get("provenance")
        if prov is None:
            prov = {}
            inst["provenance"] = prov
        if isinstance(prov, dict):
            existing_method = prov.get("extraction_method", "")
            if existing_method:
                prov["extraction_method"] = f"{existing_method} + Phase 2 Mexico Wikidata enrichment"
            else:
                prov["extraction_method"] = "Phase 2 Mexico Wikidata enrichment"

            # Append an audit entry describing this enrichment pass
            prov.setdefault("enrichment_history", []).append({
                "enrichment_date": datetime.now(timezone.utc).isoformat(),
                "enrichment_method": "SPARQL query + fuzzy name matching (Spanish normalization, 70% threshold)",
                "enrichment_source": [f"https://www.wikidata.org/wiki/{wd_data['qid']}"],
                "match_score": match_score,
                "enrichment_notes": f"Phase 2: Fuzzy matched '{inst.get('name')}' to Wikidata '{wd_data.get('name')}'"
            })

    return enriched
|
|
|
|
|
def main():
    """Run the Phase 2 Mexico enrichment pipeline end-to-end.

    Steps: load the master YAML dataset, isolate Mexican (MX) institutions,
    back up the file, batch-query Wikidata over SPARQL, fuzzy-match names,
    apply identifier enrichment in place, and rewrite the master dataset.
    """
    base_dir = Path(__file__).parent.parent
    master_file = base_dir / "data" / "instances" / "all" / "globalglam-20251111.yaml"
    backup_file = base_dir / "data" / "instances" / "all" / "globalglam-20251111.yaml.phase2_mexico_backup"

    print("="*80)
    print("🇲🇽 PHASE 2 MEXICO WIKIDATA ENRICHMENT")
    print("="*80)
    print(f"\n📖 Loading master dataset: {master_file.name}\n")

    start_time = time.time()

    # Load master dataset (a YAML list of institution dicts)
    with open(master_file, 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)

    # Filter Mexico institutions
    mexico_institutions = [
        inst for inst in all_institutions
        if inst.get('locations') and any(loc.get('country') == 'MX' for loc in inst['locations'])
    ]

    # Get Mexico institution indices in master dataset — parallel to
    # mexico_institutions; used below to map local match indices back to
    # positions in all_institutions when applying enrichment.
    mexico_indices = [
        i for i, inst in enumerate(all_institutions)
        if inst.get('locations') and any(loc.get('country') == 'MX' for loc in inst['locations'])
    ]

    load_time = time.time() - start_time
    print(f"✅ Loaded {len(all_institutions):,} total institutions in {load_time:.1f}s")
    print(f"✅ Found {len(mexico_institutions):,} Mexican institutions\n")

    # Count Wikidata coverage before enrichment
    with_wikidata = sum(
        1 for inst in mexico_institutions
        if inst.get('identifiers') and any(
            id_obj.get("identifier_scheme") == "Wikidata"
            for id_obj in inst['identifiers']
        )
    )

    without_wikidata = len(mexico_institutions) - with_wikidata
    current_coverage = (with_wikidata / len(mexico_institutions) * 100) if mexico_institutions else 0

    print(f"✅ With Wikidata: {with_wikidata:,} ({current_coverage:.1f}%)")
    print(f"❓ Without Wikidata: {without_wikidata:,}\n")

    # Nothing to do — every MX institution already linked
    if without_wikidata == 0:
        print("✨ All Mexican institutions already have Wikidata IDs!")
        return

    # Create backup before mutating the master dataset on disk
    print(f"💾 Creating backup: {backup_file.name}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
    print(f"✅ Backup created\n")

    # Query Wikidata
    print("🔍 Querying Wikidata for Mexican heritage institutions...")
    print("   (This may take 30-60 seconds)\n")

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.setTimeout(120)  # 2 minute timeout
    # Custom UA per Wikimedia etiquette for automated queries
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2 (Phase 2 Mexico Enrichment)")

    query_start = time.time()
    wikidata_results = query_wikidata_mexican_institutions(sparql)
    query_time = time.time() - query_start

    print(f"✅ Found {len(wikidata_results):,} Mexican institutions in Wikidata (query took {query_time:.1f}s)\n")

    # Abort without touching the master file if the query failed/was empty
    if not wikidata_results:
        print("⚠️  No Wikidata results, aborting enrichment")
        return

    # Fuzzy match local names against Wikidata candidates
    print("🔗 Fuzzy matching names (threshold: 0.70, Spanish normalization)...\n")

    matches = fuzzy_match_institutions(mexico_institutions, wikidata_results, threshold=0.70)

    print(f"✨ Found {len(matches):,} high-confidence matches\n")

    if not matches:
        print("❌ No matches found. Try lowering threshold below 0.70.\n")
        return

    # Show sample matches (top 10 by confidence) for manual sanity review
    print(f"{'='*80}")
    print(f"📋 SAMPLE MATCHES (Top 10)")
    print(f"{'='*80}")

    sorted_matches = sorted(matches, key=lambda x: x[2], reverse=True)

    for i, (idx, qid, score, wd_data) in enumerate(sorted_matches[:10], 1):
        inst = mexico_institutions[idx]
        # Prefer city; fall back to region, then "Unknown"
        city = inst.get("locations", [{}])[0].get("city", inst.get("locations", [{}])[0].get("region", "Unknown"))

        print(f"\n{i}. Confidence: {score:.3f}")
        print(f"   Local: {inst.get('name')} ({city})")
        print(f"   Wikidata: {wd_data.get('name')} ({wd_data.get('qid')})")
        print(f"   Type: {wd_data.get('type', 'Unknown')}")

    print(f"\n{'='*80}\n")

    # Apply matches to master dataset (mutates records in place via
    # mexico_indices mapping back into all_institutions)
    print("✅ Applying matches to master dataset...\n")
    enriched_count = 0

    for local_idx, qid, score, wd_data in matches:
        master_idx = mexico_indices[local_idx]
        if enrich_institution(all_institutions[master_idx], wd_data, score):
            enriched_count += 1

    new_coverage = (with_wikidata + enriched_count) / len(mexico_institutions) * 100

    print(f"✨ Enriched {enriched_count:,} institutions")
    print(f"📈 Coverage: {current_coverage:.1f}% → {new_coverage:.1f}%")
    print(f"   (+{new_coverage - current_coverage:.1f} percentage points)\n")

    # Write updated master dataset back to the same file
    print("="*80)
    print("💾 WRITING UPDATED MASTER DATASET")
    print("="*80 + "\n")
    print(f"📝 Writing {len(all_institutions):,} institutions to disk...")
    print("   (This may take 2-3 minutes for large datasets)\n")

    write_start = time.time()
    with open(master_file, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
    write_time = time.time() - write_start

    print(f"✅ Updated: {master_file} (write took {write_time:.1f}s)\n")

    # Final summary
    print("="*80)
    print("📊 ENRICHMENT COMPLETE")
    print("="*80)
    print(f"\n✨ Results:")
    print(f"   Mexican institutions enriched: {enriched_count:,}")
    print(f"   Coverage increase: {current_coverage:.1f}% → {new_coverage:.1f}% (+{new_coverage - current_coverage:.1f}pp)")
    print(f"   Remaining without Wikidata: {without_wikidata - enriched_count:,}")
    print(f"   Overall dataset: {len(all_institutions):,} institutions")
    print(f"\n⏱️  Total processing time: {(time.time()-start_time)/60:.1f} minutes")
    print(f"\n🎯 Phase 2 Target: 35%+ coverage (67+ institutions)")
    if new_coverage >= 35:
        print(f"   ✅ TARGET ACHIEVED!")
    else:
        print(f"   ⏳ In progress... ({new_coverage:.1f}% / 35%)")
    print("="*80 + "\n")
|
# Script entry point: run the enrichment only when executed directly.
if __name__ == "__main__":
    main()