- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
491 lines
17 KiB
Python
491 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Chilean Heritage Institutions - Batch 2 Wikidata Enrichment (Universities Focus)
|
|
|
|
Improved strategy based on Batch 1 learnings:
|
|
- Focus on universities (excellent Wikidata coverage)
|
|
- Better name normalization (strip possessives, handle word order)
|
|
- Geographic filtering in SPARQL queries
|
|
- Higher success rate expected (universities have standardized names)
|
|
|
|
BATCH 2 TARGET INSTITUTIONS (5 major universities):
|
|
1. Universidad de Chile - Santiago
|
|
2. Universidad de Santiago de Chile (USACH) - Santiago
|
|
3. Universidad de Concepción - Concepción
|
|
4. Universidad Austral de Chile - Valdivia
|
|
5. Pontificia Universidad Católica de Chile - Santiago
|
|
|
|
SUCCESS CRITERIA:
|
|
- Batch 1: 2/90 with Wikidata (2.2%)
|
|
- Goal: 7/90 with Wikidata (7.8%)
|
|
- Expected success rate: 100% for universities
|
|
|
|
IMPROVEMENTS FROM BATCH 1:
|
|
1. Name normalization: Remove "'s", "Universidad's" → "Universidad"
|
|
2. Geographic filtering: Add city/region to SPARQL query
|
|
3. Multiple name variants: Try both full and abbreviated names
|
|
4. Better fuzzy matching: Use token_set_ratio for word order variations
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Any, Optional
|
|
import requests
|
|
import time
|
|
from rapidfuzz import fuzz
|
|
import re
|
|
|
|
# Wikidata SPARQL endpoint (WDQS); queried via HTTP GET in
# query_wikidata_with_location().
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
|
|
|
|
# Batch 2: Major Chilean universities (high success probability)
#
# Each entry describes one enrichment target:
#   name_pattern   - canonical institution name
#   name_variants  - accepted spellings/abbreviations tried during matching
#   region, city   - expected location (used to filter dataset and SPARQL hits)
#   inst_type      - must equal the dataset record's institution_type field
#   wikidata_class - Wikidata class Q-number used in the P31/P279* query
#   notes          - human-readable context (not used programmatically)
BATCH_2_TARGETS = [
    {
        'name_pattern': 'Universidad de Chile',
        'name_variants': ['Universidad de Chile', 'U. de Chile', 'UChile'],
        'region': 'Santiago',
        'city': 'Santiago',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',  # university
        'notes': 'Oldest and most prestigious public university in Chile (founded 1842)'
    },
    {
        'name_pattern': 'Universidad de Santiago de Chile',
        'name_variants': ['Universidad de Santiago de Chile', 'USACH', 'U. de Santiago'],
        'region': 'Santiago',
        'city': 'Santiago',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',
        'notes': 'Major public university in Santiago (founded 1849 as Escuela de Artes y Oficios)'
    },
    {
        'name_pattern': 'Universidad de Concepción',
        'name_variants': ['Universidad de Concepción', 'UdeC', 'U. de Concepción'],
        'region': 'Concepción',
        'city': 'Concepción',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',
        'notes': 'Third oldest university in Chile (founded 1919)'
    },
    {
        'name_pattern': 'Universidad Austral de Chile',
        'name_variants': ['Universidad Austral de Chile', 'UACh', 'U. Austral'],
        'region': 'Valdivia',
        'city': 'Valdivia',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',
        'notes': 'Public university in southern Chile (founded 1954)'
    },
    {
        'name_pattern': 'Pontificia Universidad Católica de Chile',
        'name_variants': [
            'Pontificia Universidad Católica de Chile',
            'UC Chile',
            'PUC',
            'Universidad Católica de Chile'
        ],
        'region': 'Santiago',
        'city': 'Santiago',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',
        'notes': 'Leading private Catholic university (founded 1888)'
    }
]
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    - Strips English possessive markers ("Universidad's" -> "Universidad").
      Generalized to also accept the typographic apostrophe (U+2019), which
      commonly appears in names pasted from word processors or web pages.
    - Collapses runs of internal whitespace to a single space and trims the
      leading/trailing whitespace.

    Args:
        name: Raw institution name.

    Returns:
        Normalized name suitable for case-insensitive fuzzy comparison.
    """
    # Remove possessive markers; ['’] matches ASCII and typographic apostrophes.
    name = re.sub(r"['’]s\b", "", name)

    # Collapse internal whitespace, then trim the ends.
    name = re.sub(r'\s+', ' ', name).strip()

    return name
|
|
|
|
|
|
def matches_institution(inst: Dict[str, Any], target: Dict[str, Any]) -> bool:
    """Check whether a dataset institution record matches a batch target.

    A match requires all three of:
      1. a fuzzy name match (score >= 75) against any of the target's
         name variants,
      2. exact equality of the institution_type field, and
      3. a location match: some location's city/region equals the target's
         city or region.

    Fix: the original implementation inspected only ``locations[0]``, so a
    multi-campus institution whose matching city appeared in a later
    location entry was wrongly rejected; all locations are now checked.

    Args:
        inst: Institution record (expects 'name', 'institution_type',
            'locations' keys).
        target: Target spec from BATCH_2_TARGETS.

    Returns:
        True if the record satisfies all three criteria.
    """
    inst_name = normalize_name(inst.get('name', '')).lower()

    # Name check: try every variant with several fuzzy strategies; the
    # token_set strategy tolerates word-order differences.
    name_variants = target.get('name_variants', [target['name_pattern']])
    matched_name = False
    for variant in name_variants:
        normalized_variant = normalize_name(variant).lower()
        max_score = max(
            fuzz.ratio(inst_name, normalized_variant),
            fuzz.partial_ratio(inst_name, normalized_variant),
            fuzz.token_set_ratio(inst_name, normalized_variant),
        )
        if max_score >= 75:  # lower threshold to catch variations
            matched_name = True
            break

    if not matched_name:
        return False

    # Type check: must match exactly (e.g. 'EDUCATION_PROVIDER').
    if inst.get('institution_type') != target['inst_type']:
        return False

    # Location check: no locations recorded means no match.
    locations = inst.get('locations', [])
    if not locations:
        return False

    target_region = target.get('region', '')
    target_city = target.get('city', '')

    # Accept if ANY recorded location agrees with the target. The
    # cross-comparison (city vs region and vice versa) tolerates records
    # where the two fields are swapped in the source data.
    for location in locations:
        region = location.get('region', '')
        city = location.get('city', '')
        if target_region and (region == target_region or city == target_region):
            return True
        if target_city and (city == target_city or region == target_city):
            return True

    return False
|
|
|
|
|
|
def has_wikidata(inst: Dict[str, Any]) -> bool:
    """Return True when the institution already carries a Wikidata identifier.

    Args:
        inst: Institution record; may lack an 'identifiers' key entirely.

    Returns:
        True if any identifier entry uses the 'Wikidata' scheme.
    """
    for identifier in inst.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'Wikidata':
            return True
    return False
|
|
|
|
|
|
def query_wikidata_with_location(
    name_variants: List[str],
    city: Optional[str],
    inst_class: str
) -> List[Dict[str, Any]]:
    """Query Wikidata for Chilean institutions of a class, optionally by city.

    NOTE: ``name_variants`` is NOT used in the SPARQL query itself — the
    query filters only by class, country (Chile) and optional city; name
    disambiguation happens afterwards in fuzzy_match_wikidata_improved().
    The parameter is kept for interface stability with existing callers.

    Args:
        name_variants: Candidate names (currently unused here; see note).
        city: Chilean city name. Adds a P131* (located-in) constraint when
            the city has a known Q-number mapping; unknown cities silently
            fall back to a country-wide query.
        inst_class: Wikidata class Q-number (e.g. 'Q3918' = university).

    Returns:
        Candidate dicts with keys q_number/label/description/viaf/isil,
        or an empty list when the query fails.
    """
    # Optional geographic constraint, only for cities we can map to Q-numbers.
    city_filter = ""
    if city:
        # Map Chilean city names to Wikidata Q-numbers (add as needed)
        city_mapping = {
            'Santiago': 'Q2887',
            'Concepción': 'Q5775',
            'Valdivia': 'Q3883'
        }

        if city in city_mapping:
            city_q = city_mapping[city]
            city_filter = f"""
            ?item wdt:P131* wd:{city_q} .  # Located in or subdivision of city
            """

    query = f"""
    SELECT ?item ?itemLabel ?itemDescription ?viaf ?isil WHERE {{
        ?item wdt:P31/wdt:P279* wd:{inst_class} .
        ?item wdt:P17 wd:Q298 .  # Country: Chile
        {city_filter}
        OPTIONAL {{ ?item wdt:P214 ?viaf }}
        OPTIONAL {{ ?item wdt:P791 ?isil }}
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 100
    """

    # Custom User-Agent per WDQS etiquette (generic agents get throttled).
    headers = {
        'User-Agent': 'GLAM-Data-Extractor/0.2 (heritage-data-project; batch2-universities)',
        'Accept': 'application/json'
    }

    try:
        response = requests.get(
            WIKIDATA_SPARQL,
            params={'query': query, 'format': 'json'},
            headers=headers,
            timeout=30
        )
        response.raise_for_status()
        results = response.json()
    except (requests.RequestException, ValueError) as e:
        # Narrowed from bare Exception: network/HTTP failures and JSON
        # decoding errors (json.JSONDecodeError is a ValueError) are the
        # expected failure modes; anything else should surface as a bug.
        print(f" ⚠️ Wikidata query error: {e}")
        return []

    bindings = results.get('results', {}).get('bindings', [])

    # Extract the fields we care about; the Q-number is the last URI segment.
    matches = []
    for binding in bindings:
        item_uri = binding.get('item', {}).get('value', '')
        q_number = item_uri.split('/')[-1] if item_uri else None

        if q_number:
            matches.append({
                'q_number': q_number,
                'label': binding.get('itemLabel', {}).get('value', ''),
                'description': binding.get('itemDescription', {}).get('value', ''),
                'viaf': binding.get('viaf', {}).get('value', None),
                'isil': binding.get('isil', {}).get('value', None)
            })

    return matches
|
|
|
|
|
|
def fuzzy_match_wikidata_improved(
    inst_name: str,
    name_variants: List[str],
    wd_results: List[Dict[str, Any]]
) -> tuple[Optional[Dict[str, Any]], float]:
    """Pick the Wikidata candidate whose label best matches the institution.

    Every candidate label is scored against the dataset name (four fuzzy
    strategies) and against each configured name variant (three strategies,
    no partial_ratio). The candidate with the single highest score wins.

    Args:
        inst_name: Institution name as recorded in the dataset.
        name_variants: Alternative names/abbreviations from the target spec.
        wd_results: Candidates from query_wikidata_with_location().

    Returns:
        Tuple of (best candidate dict or None, best score on a 0-100 scale).
    """
    inst_norm = normalize_name(inst_name).lower()

    best_match: Optional[Dict[str, Any]] = None
    best_score = 0

    for candidate in wd_results:
        label = normalize_name(candidate['label']).lower()

        # Score the candidate against the dataset name: four strategies so
        # substrings and word-order variations are all tolerated.
        candidate_score = max(
            fuzz.ratio(inst_norm, label),
            fuzz.partial_ratio(inst_norm, label),
            fuzz.token_set_ratio(inst_norm, label),
            fuzz.token_sort_ratio(inst_norm, label),
        )

        # Also score against every target variant (three strategies — no
        # partial_ratio here, mirroring the original scoring scheme).
        for variant in name_variants:
            variant_norm = normalize_name(variant).lower()
            candidate_score = max(
                candidate_score,
                fuzz.ratio(variant_norm, label),
                fuzz.token_set_ratio(variant_norm, label),
                fuzz.token_sort_ratio(variant_norm, label),
            )

        # Keep the single best-scoring candidate across all results.
        if candidate_score > best_score:
            best_match, best_score = candidate, candidate_score

    return best_match, best_score
|
|
|
|
|
|
def add_wikidata_identifier(
    inst: Dict[str, Any],
    q_number: str,
    confidence: float,
    notes: str
) -> Dict[str, Any]:
    """Attach a Wikidata identifier to an institution, with provenance.

    Mutates ``inst`` in place and also returns it for convenience.

    Args:
        inst: Institution record; 'identifiers' list is created if absent.
        q_number: Wikidata Q-number (e.g. 'Q123').
        confidence: Fuzzy-match confidence in [0, 1], recorded in the
            provenance extraction_method string.
        notes: Human-readable context. NOTE: currently unused — kept for
            interface compatibility; callers already pass target['notes'].

    Returns:
        The same (mutated) institution dict.
    """
    # setdefault creates the list when missing and appends in one pass
    # (replaces the original explicit existence check).
    inst.setdefault('identifiers', []).append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    })

    # Append an audit-trail note so downstream consumers can see this
    # identifier came from automated Batch 2 enrichment.
    if 'provenance' in inst:
        old_method = inst['provenance'].get('extraction_method', '')
        enrichment_note = (
            f" + Wikidata enrichment (Batch 2 universities, confidence={confidence:.2f})"
        )
        inst['provenance']['extraction_method'] = f"{old_method}{enrichment_note}"

    return inst
|
|
|
|
|
|
def main():
    """Run Batch 2 Wikidata enrichment over the Chilean institutions dataset.

    Workflow: load the Batch 1 YAML output, write a backup, then for each of
    the five university targets find the matching dataset record, query
    Wikidata with geographic filtering, fuzzy-match the candidates, and
    auto-accept only high-confidence (score >= 85) matches. The enriched
    dataset is written to a new Batch 2 file only when at least one record
    was actually enriched.
    """
    # Input is the Batch 1 output; Batch 2 writes a sibling file so earlier
    # batch results are never overwritten.
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'chile' / 'chilean_institutions_batch1_enriched.yaml'
    backup_file = data_file.with_suffix('.batch2_backup')
    output_file = data_file.with_name('chilean_institutions_batch2_enriched.yaml')

    print("=" * 80)
    print("Chilean Heritage Institutions - Batch 2 Wikidata Enrichment")
    print("Universities Focus - Improved Matching Strategy")
    print("Session: November 9, 2025")
    print("Target: 5 major universities")
    print("=" * 80)
    print()

    # Load data — presumably a YAML list of institution dicts (TODO confirm
    # against the Batch 1 writer).
    print(f"📂 Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f" Total institutions: {len(institutions)}")

    # Snapshot current Wikidata coverage for the before/after report.
    with_wikidata_before = sum(1 for inst in institutions if has_wikidata(inst))
    print(f" Current Wikidata coverage: {with_wikidata_before}/{len(institutions)} ({with_wikidata_before/len(institutions)*100:.1f}%)")
    print()

    # Create a full backup before any mutation.
    print(f"💾 Creating backup: {backup_file}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print()

    print("Batch 2 Enrichment Process:")
    print("-" * 80)

    # Per-target outcome counters for the summary report.
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0
    manual_review_count = 0

    for i, target in enumerate(BATCH_2_TARGETS, 1):
        print(f"\n[{i}/{len(BATCH_2_TARGETS)}] 🎓 Searching: {target['name_pattern']}")
        print(f" Location: {target['city']}, {target['region']}")
        print(f" Name variants: {', '.join(target['name_variants'][:3])}")

        # Find the first dataset record matching this target.
        matched = None
        for inst in institutions:
            if matches_institution(inst, target):
                matched = inst
                break

        if not matched:
            print(f" ❌ NOT FOUND in dataset")
            print(f" (Check if institution name matches any variant)")
            not_found_count += 1
            continue

        print(f" ✓ Found: {matched.get('name')}")

        # Skip records that already carry a Wikidata identifier.
        if has_wikidata(matched):
            existing_q = next(
                (id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f" ⏭️ Already enriched with {existing_q}")
            skipped_count += 1
            continue

        # Query Wikidata with location filtering.
        print(f" 🌐 Querying Wikidata (universities in {target['city']})...")
        time.sleep(1.5)  # Rate limiting: be polite to the public WDQS endpoint

        wd_results = query_wikidata_with_location(
            target['name_variants'],
            target.get('city'),
            target['wikidata_class']
        )

        if not wd_results:
            print(f" ⚠️ No Wikidata results found")
            manual_review_count += 1
            continue

        print(f" 📊 Found {len(wd_results)} Wikidata candidates")

        # Pick the best candidate by fuzzy name similarity.
        best_match, match_score = fuzzy_match_wikidata_improved(
            matched['name'],
            target['name_variants'],
            wd_results
        )

        # NOTE(review): best_match is only None when wd_results is empty
        # (handled above); the "threshold < 70" wording here does not match
        # any threshold applied in fuzzy_match_wikidata_improved().
        if not best_match:
            print(f" ⚠️ No good match found (threshold < 70)")
            manual_review_count += 1
            continue

        print(f" 🎯 Best match: {best_match['label']} ({best_match['q_number']})")
        print(f" Similarity: {match_score:.1f}%")
        if best_match.get('description'):
            print(f" Description: {best_match['description']}")
        if best_match.get('viaf'):
            print(f" VIAF: {best_match['viaf']}")

        # Confidence-based decision: >=85 auto-accept, 75-84 manual review,
        # <75 reject.
        if match_score >= 85:
            print(f" ✅ HIGH CONFIDENCE - Auto-accepting")
            add_wikidata_identifier(
                matched,
                best_match['q_number'],
                match_score / 100,
                target['notes']
            )
            enriched_count += 1
        elif match_score >= 75:
            print(f" ⚠️ MEDIUM CONFIDENCE - Needs manual verification")
            print(f" Verify at: https://www.wikidata.org/wiki/{best_match['q_number']}")
            manual_review_count += 1
        else:
            print(f" ❌ LOW CONFIDENCE - Skipping")
            manual_review_count += 1

    print()
    print("=" * 80)
    print("Batch 2 Summary:")
    print("-" * 80)
    print(f"✅ Auto-enriched: {enriched_count}")
    print(f"⚠️ Manual review: {manual_review_count}")
    print(f"⏭️ Already enriched: {skipped_count}")
    print(f"❌ Not found: {not_found_count}")

    # Calculate updated coverage
    with_wikidata_after = sum(1 for inst in institutions if has_wikidata(inst))

    print()
    print("Chilean Institution Coverage:")
    print(f" Total: {len(institutions)}")
    print(f" Before Batch 2: {with_wikidata_before} ({with_wikidata_before/len(institutions)*100:.1f}%)")
    print(f" After Batch 2: {with_wikidata_after} ({with_wikidata_after/len(institutions)*100:.1f}%)")
    print(f" Improvement: +{with_wikidata_after - with_wikidata_before} institutions")

    # Save only when something was actually enriched (avoids writing an
    # unchanged copy of the dataset).
    if enriched_count > 0:
        print()
        print(f"💾 Saving enriched data to: {output_file}")
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

        print()
        print("✅ Batch 2 enrichment complete!")
        print()
        print("NEXT STEPS:")
        print("1. Review medium-confidence candidates")
        print("2. Create Batch 3 targeting major museums:")
        print(" - Museo Histórico y Antropológico (Valdivia)")
        print(" - Museo Colchagua (Santa Cruz)")
        print(" - Museo Gabriela Mistral (Vicuña)")
        print(" - Museo Antropológico Padre Sebastián Englert (Easter Island)")
        print(" - Casa Museo Isla Negra (Pablo Neruda)")
        print("3. Continue until 20+ institutions enriched (22% coverage)")
    else:
        print()
        print("⚠️ No automatic enrichments - all require manual review")
        print()
        print("DEBUGGING TIPS:")
        print("1. Check if institution names in dataset match target name_variants")
        print("2. Verify institution_type field matches target")
        print("3. Check if location (city/region) matches target")
        print("4. Review name normalization logic in matches_institution()")
|
|
|
|
|
# Script entry point: run the Batch 2 enrichment workflow when executed
# directly (importing this module performs no work).
if __name__ == '__main__':
    main()
|