glam/scripts/enrich_chilean_batch2_universities.py

#!/usr/bin/env python3
"""
Chilean Heritage Institutions - Batch 2 Wikidata Enrichment (Universities Focus)
Improved strategy based on Batch 1 learnings:
- Focus on universities (excellent Wikidata coverage)
- Better name normalization (strip possessives, handle word order)
- Geographic filtering in SPARQL queries
- Higher success rate expected (universities have standardized names)

BATCH 2 TARGET INSTITUTIONS (5 major universities):
1. Universidad de Chile - Santiago
2. Universidad de Santiago de Chile (USACH) - Santiago
3. Universidad de Concepción - Concepción
4. Universidad Austral de Chile - Valdivia
5. Pontificia Universidad Católica de Chile - Santiago

SUCCESS CRITERIA:
- Batch 1: 2/90 with Wikidata (2.2%)
- Goal: 7/90 with Wikidata (7.8%)
- Expected success rate: 100% for universities

IMPROVEMENTS FROM BATCH 1:
1. Name normalization: strip possessives ("Universidad's" → "Universidad")
2. Geographic filtering: add a city/region constraint to the SPARQL query
3. Multiple name variants: try both full and abbreviated names
4. Better fuzzy matching: use token_set_ratio for word-order variations
"""
import re
import time
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
import yaml
from rapidfuzz import fuzz
# Wikidata SPARQL endpoint
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
# Batch 2: Major Chilean universities (high success probability)
BATCH_2_TARGETS = [
{
'name_pattern': 'Universidad de Chile',
'name_variants': ['Universidad de Chile', 'U. de Chile', 'UChile'],
'region': 'Santiago',
'city': 'Santiago',
'inst_type': 'EDUCATION_PROVIDER',
'wikidata_class': 'Q3918', # university
'notes': 'Oldest and most prestigious public university in Chile (founded 1842)'
},
{
'name_pattern': 'Universidad de Santiago de Chile',
'name_variants': ['Universidad de Santiago de Chile', 'USACH', 'U. de Santiago'],
'region': 'Santiago',
'city': 'Santiago',
'inst_type': 'EDUCATION_PROVIDER',
'wikidata_class': 'Q3918',
'notes': 'Major public university in Santiago (founded 1849 as Escuela de Artes y Oficios)'
},
{
'name_pattern': 'Universidad de Concepción',
'name_variants': ['Universidad de Concepción', 'UdeC', 'U. de Concepción'],
'region': 'Concepción',
'city': 'Concepción',
'inst_type': 'EDUCATION_PROVIDER',
'wikidata_class': 'Q3918',
'notes': 'Third oldest university in Chile (founded 1919)'
},
{
'name_pattern': 'Universidad Austral de Chile',
'name_variants': ['Universidad Austral de Chile', 'UACh', 'U. Austral'],
'region': 'Valdivia',
'city': 'Valdivia',
'inst_type': 'EDUCATION_PROVIDER',
'wikidata_class': 'Q3918',
'notes': 'Public university in southern Chile (founded 1954)'
},
{
'name_pattern': 'Pontificia Universidad Católica de Chile',
'name_variants': [
'Pontificia Universidad Católica de Chile',
'UC Chile',
'PUC',
'Universidad Católica de Chile'
],
'region': 'Santiago',
'city': 'Santiago',
'inst_type': 'EDUCATION_PROVIDER',
'wikidata_class': 'Q3918',
'notes': 'Leading private Catholic university (founded 1888)'
}
]
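
# Assumed shape of each institution record in the YAML file, inferred from the
# field accesses in the functions below (a sketch; the real schema may carry
# additional fields):
#
#   - name: Universidad de Chile
#     institution_type: EDUCATION_PROVIDER
#     locations:
#       - city: Santiago
#         region: Santiago
#     identifiers:
#       - identifier_scheme: Wikidata
#         identifier_value: Q...        # Q-number placeholder
#     provenance:
#       extraction_method: manual curation
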
def normalize_name(name: str) -> str:
"""Normalize institution name for better matching."""
    # Remove possessive markers (straight or curly apostrophe)
    name = re.sub(r"['’]s\b", "", name)
# Remove leading/trailing whitespace
name = name.strip()
# Normalize whitespace
name = re.sub(r'\s+', ' ', name)
return name
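
# Illustration (hypothetical input) of what normalize_name does:
#   normalize_name("Universidad de Chile's   Biblioteca Central")
#   -> "Universidad de Chile Biblioteca Central"
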
def matches_institution(inst: Dict[str, Any], target: Dict[str, Any]) -> bool:
"""Check if institution matches target criteria with improved name matching."""
# Normalize institution name
inst_name = normalize_name(inst.get('name', '')).lower()
# Check against all name variants
name_variants = target.get('name_variants', [target['name_pattern']])
matched_name = False
for variant in name_variants:
normalized_variant = normalize_name(variant).lower()
# Try multiple fuzzy matching strategies
scores = [
fuzz.ratio(inst_name, normalized_variant),
fuzz.partial_ratio(inst_name, normalized_variant),
fuzz.token_set_ratio(inst_name, normalized_variant)
]
max_score = max(scores)
if max_score >= 75: # Lower threshold to catch variations
matched_name = True
break
if not matched_name:
return False
# Check institution type
if inst.get('institution_type') != target['inst_type']:
return False
# Check location (region or city)
locations = inst.get('locations', [])
if not locations:
return False
location = locations[0]
region = location.get('region', '')
city = location.get('city', '')
    # Match loosely by region or city: a value may appear in either field
target_region = target.get('region', '')
target_city = target.get('city', '')
location_match = False
if target_region and (region == target_region or city == target_region):
location_match = True
if target_city and (city == target_city or region == target_city):
location_match = True
return location_match
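
# Minimal usage sketch (hypothetical record; field names mirror the accesses above):
#   inst = {'name': "Universidad de Chile's",
#           'institution_type': 'EDUCATION_PROVIDER',
#           'locations': [{'city': 'Santiago', 'region': 'Santiago'}]}
#   matches_institution(inst, BATCH_2_TARGETS[0])  # -> True
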
def has_wikidata(inst: Dict[str, Any]) -> bool:
"""Check if institution already has Wikidata identifier."""
return any(
id_obj.get('identifier_scheme') == 'Wikidata'
for id_obj in inst.get('identifiers', [])
)
def query_wikidata_with_location(
name_variants: List[str],
city: Optional[str],
inst_class: str
) -> List[Dict[str, Any]]:
"""Query Wikidata with geographic filtering for better precision."""
# Build filter for city if provided
city_filter = ""
if city:
# Map Chilean city names to Wikidata Q-numbers (add as needed)
city_mapping = {
'Santiago': 'Q2887',
'Concepción': 'Q5775',
'Valdivia': 'Q3883'
}
if city in city_mapping:
city_q = city_mapping[city]
city_filter = f"""
?item wdt:P131* wd:{city_q} . # Located in or subdivision of city
"""
query = f"""
SELECT ?item ?itemLabel ?itemDescription ?viaf ?isil WHERE {{
    ?item wdt:P31/wdt:P279* wd:{inst_class} .  # Instance (or subclass) of target class
    ?item wdt:P17 wd:Q298 .  # Country: Chile
    {city_filter}
    OPTIONAL {{ ?item wdt:P214 ?viaf }}  # VIAF identifier (P214)
    OPTIONAL {{ ?item wdt:P791 ?isil }}  # ISIL code (P791)
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
}}
LIMIT 100
"""
headers = {
'User-Agent': 'GLAM-Data-Extractor/0.2 (heritage-data-project; batch2-universities)',
'Accept': 'application/json'
}
try:
response = requests.get(
WIKIDATA_SPARQL,
params={'query': query, 'format': 'json'},
headers=headers,
timeout=30
)
response.raise_for_status()
results = response.json()
bindings = results.get('results', {}).get('bindings', [])
# Extract relevant fields
matches = []
for binding in bindings:
item_uri = binding.get('item', {}).get('value', '')
q_number = item_uri.split('/')[-1] if item_uri else None
if q_number:
matches.append({
'q_number': q_number,
'label': binding.get('itemLabel', {}).get('value', ''),
'description': binding.get('itemDescription', {}).get('value', ''),
'viaf': binding.get('viaf', {}).get('value', None),
'isil': binding.get('isil', {}).get('value', None)
})
return matches
except Exception as e:
print(f" ⚠️ Wikidata query error: {e}")
return []
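
# Each returned match is a plain dict; the values below are illustrative, not
# verified against live Wikidata:
#   {'q_number': 'Q...', 'label': 'Universidad de Chile',
#    'description': 'public university in Santiago, Chile',
#    'viaf': '...', 'isil': None}
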
def fuzzy_match_wikidata_improved(
inst_name: str,
name_variants: List[str],
wd_results: List[Dict[str, Any]]
) -> tuple[Optional[Dict[str, Any]], float]:
"""Improved fuzzy matching with multiple strategies."""
best_match = None
    best_score = 0.0  # Highest similarity seen so far (rapidfuzz scores are floats)
# Normalize institution name
inst_name_norm = normalize_name(inst_name).lower()
for result in wd_results:
wd_label = normalize_name(result['label']).lower()
# Try matching against institution name
scores = [
fuzz.ratio(inst_name_norm, wd_label),
fuzz.partial_ratio(inst_name_norm, wd_label),
fuzz.token_set_ratio(inst_name_norm, wd_label),
fuzz.token_sort_ratio(inst_name_norm, wd_label)
]
# Also try matching against target name variants
for variant in name_variants:
variant_norm = normalize_name(variant).lower()
scores.extend([
fuzz.ratio(variant_norm, wd_label),
fuzz.token_set_ratio(variant_norm, wd_label),
fuzz.token_sort_ratio(variant_norm, wd_label)
])
score = max(scores)
if score > best_score:
best_score = score
best_match = result
return best_match, best_score
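
# Why token_set_ratio/token_sort_ratio are included: they ignore word order,
# which plain ratio penalizes. For example:
#   fuzz.ratio("universidad austral de chile", "austral universidad de chile")             # < 100
#   fuzz.token_sort_ratio("universidad austral de chile", "austral universidad de chile")  # 100.0
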
def add_wikidata_identifier(
inst: Dict[str, Any],
q_number: str,
confidence: float,
notes: str
) -> Dict[str, Any]:
"""Add Wikidata identifier to institution with provenance tracking."""
wikidata_id = {
'identifier_scheme': 'Wikidata',
'identifier_value': q_number,
'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
}
if 'identifiers' not in inst:
inst['identifiers'] = []
inst['identifiers'].append(wikidata_id)
    # Update provenance
    if 'provenance' in inst:
        old_method = inst['provenance'].get('extraction_method', '')
        enrichment_note = (
            f" + Wikidata enrichment (Batch 2 universities, confidence={confidence:.2f})"
        )
        inst['provenance']['extraction_method'] = f"{old_method}{enrichment_note}"
        if notes:
            # Persist the target's curatorial note with the provenance record
            inst['provenance']['enrichment_notes'] = notes
return inst
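
# After enrichment, the institution's identifiers list gains an entry such as
# (Q-number illustrative):
#   {'identifier_scheme': 'Wikidata',
#    'identifier_value': 'Q...',
#    'identifier_url': 'https://www.wikidata.org/wiki/Q...'}
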
def main():
data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'chile' / 'chilean_institutions_batch1_enriched.yaml'
backup_file = data_file.with_suffix('.batch2_backup')
output_file = data_file.with_name('chilean_institutions_batch2_enriched.yaml')
print("=" * 80)
print("Chilean Heritage Institutions - Batch 2 Wikidata Enrichment")
print("Universities Focus - Improved Matching Strategy")
print("Session: November 9, 2025")
print("Target: 5 major universities")
print("=" * 80)
print()
# Load data
print(f"📂 Loading: {data_file}")
with open(data_file, 'r', encoding='utf-8') as f:
institutions = yaml.safe_load(f)
print(f" Total institutions: {len(institutions)}")
# Check existing Wikidata coverage
with_wikidata_before = sum(1 for inst in institutions if has_wikidata(inst))
print(f" Current Wikidata coverage: {with_wikidata_before}/{len(institutions)} ({with_wikidata_before/len(institutions)*100:.1f}%)")
print()
# Create backup
print(f"💾 Creating backup: {backup_file}")
with open(backup_file, 'w', encoding='utf-8') as f:
yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
print()
print("Batch 2 Enrichment Process:")
print("-" * 80)
# Process each target
enriched_count = 0
skipped_count = 0
not_found_count = 0
manual_review_count = 0
for i, target in enumerate(BATCH_2_TARGETS, 1):
print(f"\n[{i}/{len(BATCH_2_TARGETS)}] 🎓 Searching: {target['name_pattern']}")
print(f" Location: {target['city']}, {target['region']}")
print(f" Name variants: {', '.join(target['name_variants'][:3])}")
# Find matching institution in dataset
matched = None
for inst in institutions:
if matches_institution(inst, target):
matched = inst
break
if not matched:
print(f" ❌ NOT FOUND in dataset")
print(f" (Check if institution name matches any variant)")
not_found_count += 1
continue
print(f" ✓ Found: {matched.get('name')}")
# Check if already has Wikidata
if has_wikidata(matched):
existing_q = next(
(id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
if id_obj.get('identifier_scheme') == 'Wikidata'),
None
)
print(f" ⏭️ Already enriched with {existing_q}")
skipped_count += 1
continue
# Query Wikidata with location filtering
print(f" 🌐 Querying Wikidata (universities in {target['city']})...")
time.sleep(1.5) # Rate limiting
wd_results = query_wikidata_with_location(
target['name_variants'],
target.get('city'),
target['wikidata_class']
)
if not wd_results:
print(f" ⚠️ No Wikidata results found")
manual_review_count += 1
continue
print(f" 📊 Found {len(wd_results)} Wikidata candidates")
# Improved fuzzy matching
best_match, match_score = fuzzy_match_wikidata_improved(
matched['name'],
target['name_variants'],
wd_results
)
        if not best_match or match_score < 70:
            print(f" ⚠️ No good match found (similarity below 70)")
manual_review_count += 1
continue
print(f" 🎯 Best match: {best_match['label']} ({best_match['q_number']})")
print(f" Similarity: {match_score:.1f}%")
if best_match.get('description'):
print(f" Description: {best_match['description']}")
if best_match.get('viaf'):
print(f" VIAF: {best_match['viaf']}")
# Confidence-based decision
if match_score >= 85:
print(f" ✅ HIGH CONFIDENCE - Auto-accepting")
add_wikidata_identifier(
matched,
best_match['q_number'],
match_score / 100,
target['notes']
)
enriched_count += 1
elif match_score >= 75:
print(f" ⚠️ MEDIUM CONFIDENCE - Needs manual verification")
print(f" Verify at: https://www.wikidata.org/wiki/{best_match['q_number']}")
manual_review_count += 1
else:
print(f" ❌ LOW CONFIDENCE - Skipping")
manual_review_count += 1
print()
print("=" * 80)
print("Batch 2 Summary:")
print("-" * 80)
print(f"✅ Auto-enriched: {enriched_count}")
print(f"⚠️ Manual review: {manual_review_count}")
print(f"⏭️ Already enriched: {skipped_count}")
print(f"❌ Not found: {not_found_count}")
# Calculate updated coverage
with_wikidata_after = sum(1 for inst in institutions if has_wikidata(inst))
print()
print("Chilean Institution Coverage:")
print(f" Total: {len(institutions)}")
print(f" Before Batch 2: {with_wikidata_before} ({with_wikidata_before/len(institutions)*100:.1f}%)")
print(f" After Batch 2: {with_wikidata_after} ({with_wikidata_after/len(institutions)*100:.1f}%)")
print(f" Improvement: +{with_wikidata_after - with_wikidata_before} institutions")
# Save if any enrichments
if enriched_count > 0:
print()
print(f"💾 Saving enriched data to: {output_file}")
with open(output_file, 'w', encoding='utf-8') as f:
yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
print()
print("✅ Batch 2 enrichment complete!")
print()
print("NEXT STEPS:")
print("1. Review medium-confidence candidates")
print("2. Create Batch 3 targeting major museums:")
print(" - Museo Histórico y Antropológico (Valdivia)")
print(" - Museo Colchagua (Santa Cruz)")
print(" - Museo Gabriela Mistral (Vicuña)")
print(" - Museo Antropológico Padre Sebastián Englert (Easter Island)")
print(" - Casa Museo Isla Negra (Pablo Neruda)")
print("3. Continue until 20+ institutions enriched (22% coverage)")
else:
print()
print("⚠️ No automatic enrichments - all require manual review")
print()
print("DEBUGGING TIPS:")
print("1. Check if institution names in dataset match target name_variants")
print("2. Verify institution_type field matches target")
print("3. Check if location (city/region) matches target")
print("4. Review name normalization logic in matches_institution()")
if __name__ == '__main__':
main()