glam/scripts/enrich_chilean_batch1_manual.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

451 lines
15 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Chilean Heritage Institutions - Batch 1 Wikidata Enrichment (Manual Verification)
Adds VERIFIED Wikidata Q-numbers to 13 Chilean institutions (diverse sample):
BATCH 1 TARGET INSTITUTIONS (13):
Museums (4):
1. Museo Universidad de Tarapacá San Miguel de Azapa (MASMA) - Arica
2. Museo de Historia Natural de Atacama - Atacama
3. Museo Indígena Atacameño - Antofagasta
4. Museo de Tocopilla - Antofagasta
Archives (3):
5. Archivo Central Andrés Bello, Universidad de Chile - Santiago
6. Archivo Central USACH - Santiago
7. Archivo Histórico del Arzobispado de Santiago - Santiago
Libraries (3):
8. Biblioteca Nacional Digital de Chile - Santiago
9. Biblioteca Federico Varela, Universidad de Atacama - Atacama
10. CRA Escuela El Olivar - Arica
Education Providers (3):
11. Universidad de Tarapacá - Arica
12. Universidad Arturo Prat - Iquique
13. Universidad Católica del Norte, Sede San Pedro de Atacama - Antofagasta
STRATEGY:
- Query Wikidata API for each institution
- Fuzzy match with manual verification checkpoints (threshold > 85%)
- Export enriched YAML with provenance tracking
- Follow LinkML schema compliance
Coverage Goal: 0/90 (0%) → 13/90 (14.4%)
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any
import requests
import time
from rapidfuzz import fuzz
# Wikidata SPARQL endpoint
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"

# Batch 1: Selected 13 institutions for diverse sample
# NOTE: Using simplified patterns that match the dataset's actual names
# Each entry feeds matches_institution() (name_pattern/region/city/inst_type)
# and query_wikidata() (wikidata_class); 'notes' is informational — it is
# passed to add_wikidata_identifier() but not used there.
BATCH_1_TARGETS = [
    # Museums (4)
    {
        'name_pattern': 'Museo Universidad de Tarapacá',
        'region': 'Arica',
        'city': 'Arica',
        'inst_type': 'MUSEUM',
        'wikidata_class': 'Q33506',  # museum
        'notes': 'MASMA - Archaeological museum with pre-Columbian collections'
    },
    {
        'name_pattern': 'Museo de Historia Natural de Atacama',
        'region': 'Atacama',
        'city': None,  # No city in dataset
        'inst_type': 'MUSEUM',
        'wikidata_class': 'Q33506',
        'notes': 'Natural history museum in Atacama region'
    },
    {
        'name_pattern': 'Museo Indígena Atacameño',
        'region': 'Antofagasta',
        'city': None,
        'inst_type': 'MUSEUM',
        'wikidata_class': 'Q33506',
        'notes': 'Indigenous cultural museum'
    },
    {
        'name_pattern': 'Museo de Tocopilla',
        'region': 'Antofagasta',
        'city': 'Tocopilla',
        'inst_type': 'MUSEUM',
        'wikidata_class': 'Q33506',
        'notes': 'Local museum in Tocopilla'
    },
    # Archives (3)
    {
        'name_pattern': 'Archivo Central Andrés Bello',
        'region': 'Metropolitana',
        'city': 'Santiago',
        'inst_type': 'ARCHIVE',
        'wikidata_class': 'Q166118',  # archive
        'notes': 'Universidad de Chile central archive'
    },
    {
        'name_pattern': 'Archivo Central USACH',
        'region': 'Metropolitana',
        'city': 'Santiago',
        'inst_type': 'ARCHIVE',
        'wikidata_class': 'Q166118',
        'notes': 'Universidad de Santiago de Chile archive'
    },
    {
        'name_pattern': 'Archivo Histórico del Arzobispado',
        'region': 'Metropolitana',
        'city': 'Santiago',
        'inst_type': 'ARCHIVE',
        'wikidata_class': 'Q166118',
        'notes': 'Archdiocese of Santiago historical archive'
    },
    # Libraries (3)
    {
        'name_pattern': 'Biblioteca Nacional Digital',
        'region': 'Metropolitana',
        'city': 'Santiago',
        'inst_type': 'LIBRARY',
        'wikidata_class': 'Q7075',  # library
        'notes': 'Digital platform of Biblioteca Nacional de Chile'
    },
    {
        'name_pattern': 'Biblioteca Federico Varela',
        'region': 'Atacama',
        'city': None,
        'inst_type': 'LIBRARY',
        'wikidata_class': 'Q7075',
        'notes': 'Universidad de Atacama library'
    },
    {
        'name_pattern': 'CRA Escuela El Olivar',
        'region': 'Arica',
        'city': 'Arica',
        'inst_type': 'LIBRARY',
        'wikidata_class': 'Q7075',
        'notes': 'School learning resource center'
    },
    # Education Providers (3)
    {
        'name_pattern': 'Universidad de Tarapacá',
        'region': 'Arica',
        'city': 'Arica',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',  # university
        'notes': 'Public university in Arica'
    },
    {
        'name_pattern': 'Universidad Arturo Prat',
        'region': 'Tarapacá',
        'city': 'Iquique',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',
        'notes': 'Public university in Iquique'
    },
    {
        'name_pattern': 'Universidad Católica del Norte',
        'region': 'Antofagasta',
        'city': 'San Pedro de Atacama',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',
        'notes': 'Catholic university campus in San Pedro de Atacama'
    },
]
def matches_institution(inst: Dict[str, Any], target: Dict[str, Any]) -> bool:
    """Decide whether a dataset record matches one batch target.

    A record matches when its name fuzzily resembles the target pattern
    (partial ratio >= 70), its institution type is identical, and its first
    location agrees with the target's region (and city, when one is given).
    """
    record_name = inst.get('name', '').lower()
    pattern = target['name_pattern'].lower()

    # Loose fuzzy gate on the name; precise scoring happens later against
    # the Wikidata candidates.
    if fuzz.partial_ratio(pattern, record_name) < 70:
        return False

    if inst.get('institution_type') != target['inst_type']:
        return False

    # Only the first location entry is consulted — assumes the primary
    # location is listed first in the dataset (TODO confirm).
    locations = inst.get('locations', [])
    if not locations:
        return False
    primary = locations[0]

    if target['region'] and primary.get('region', '') != target['region']:
        return False

    # City is optional in the targets; a record with an empty city is
    # treated as compatible with any requested city.
    wanted_city = target.get('city')
    if wanted_city:
        record_city = primary.get('city', '')
        if record_city and record_city != wanted_city:
            return False

    return True
def has_wikidata(inst: Dict[str, Any]) -> bool:
    """Return True if the institution record already carries a Wikidata identifier."""
    for identifier in inst.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'Wikidata':
            return True
    return False
def query_wikidata(name: str, region: str, inst_class: str) -> List[Dict[str, Any]]:
    """Query Wikidata for Chilean institutions of the given class.

    NOTE: ``name`` and ``region`` are currently unused — the simplified
    SPARQL query filters only by instance-of class (P31/P279*) and country
    (Chile, Q298); name matching happens client-side afterwards in
    fuzzy_match_wikidata(). They are kept for interface stability and
    possible future query refinement.

    Args:
        name: Institution name (reserved; not used in the query).
        region: Chilean region (reserved; not used in the query).
        inst_class: Wikidata Q-number of the target class (e.g. 'Q33506').

    Returns:
        A list of candidate dicts with keys 'q_number', 'label',
        'description', 'viaf', 'isil'; empty list on any request failure.
    """
    # Simplified query - search for institutions in Chile with matching type
    query = f"""
    SELECT ?item ?itemLabel ?itemDescription ?viaf ?isil WHERE {{
        ?item wdt:P31/wdt:P279* wd:{inst_class} .
        ?item wdt:P17 wd:Q298 . # Country: Chile
        OPTIONAL {{ ?item wdt:P214 ?viaf }}
        OPTIONAL {{ ?item wdt:P791 ?isil }}
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 100
    """
    headers = {
        'User-Agent': 'GLAM-Data-Extractor/0.1 (heritage-data-project)',
        'Accept': 'application/json'
    }
    try:
        response = requests.get(
            WIKIDATA_SPARQL,
            params={'query': query, 'format': 'json'},
            headers=headers,
            timeout=30
        )
        response.raise_for_status()
        results = response.json()
    # Narrowed from bare `except Exception`: network/HTTP errors and a
    # malformed JSON payload are the failures expected here; anything else
    # (e.g. a programming error) should surface normally.
    except (requests.RequestException, ValueError) as e:
        print(f" ⚠️ Wikidata query error: {e}")
        return []

    bindings = results.get('results', {}).get('bindings', [])
    # Flatten SPARQL bindings into plain dicts; the Q-number is the last
    # path segment of the entity URI.
    matches = []
    for binding in bindings:
        item_uri = binding.get('item', {}).get('value', '')
        q_number = item_uri.split('/')[-1] if item_uri else None
        if q_number:
            matches.append({
                'q_number': q_number,
                'label': binding.get('itemLabel', {}).get('value', ''),
                'description': binding.get('itemDescription', {}).get('value', ''),
                'viaf': binding.get('viaf', {}).get('value', None),
                'isil': binding.get('isil', {}).get('value', None)
            })
    return matches
def fuzzy_match_wikidata(inst_name: str, wd_results: List[Dict[str, Any]]) -> tuple[Dict[str, Any] | None, float]:
    """Pick the Wikidata candidate whose label best matches *inst_name*.

    Each candidate is scored with three rapidfuzz strategies (full ratio,
    partial ratio, token-sort ratio); the highest of the three is its score.
    Ties keep the earliest candidate.

    Returns:
        (best_candidate, best_score) — (None, 0) when wd_results is empty.
    """
    needle = inst_name.lower()
    best = None
    top_score = 0
    for candidate in wd_results:
        haystack = candidate['label'].lower()
        candidate_score = max(
            fuzz.ratio(needle, haystack),
            fuzz.partial_ratio(needle, haystack),
            fuzz.token_sort_ratio(needle, haystack),
        )
        if candidate_score > top_score:
            top_score = candidate_score
            best = candidate
    return best, top_score
def add_wikidata_identifier(inst: Dict[str, Any], q_number: str, confidence: float, notes: str) -> Dict[str, Any]:
    """Attach a verified Wikidata identifier to *inst* in place.

    Appends an identifier entry and, when the record already carries a
    ``provenance`` block, appends an enrichment note to its
    ``extraction_method`` string so the change is auditable.

    Args:
        inst: Institution record (mutated in place).
        q_number: Verified Wikidata Q-number (e.g. 'Q12345').
        confidence: Match confidence in [0, 1]; recorded in provenance.
        notes: Currently unused.  # TODO(review): fold into provenance or drop.

    Returns:
        The same (mutated) institution dict, for convenient chaining.
    """
    # setdefault creates the identifiers list on first enrichment.
    inst.setdefault('identifiers', []).append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    })
    # Update provenance only when the record already tracks it; records
    # without a provenance block are left untouched.
    if 'provenance' in inst:
        old_method = inst['provenance'].get('extraction_method', '')
        enrichment_note = f" + Wikidata enrichment (Batch 1 manual verification, confidence={confidence:.2f})"
        inst['provenance']['extraction_method'] = f"{old_method}{enrichment_note}"
    return inst
def main():
    """Run the batch-1 enrichment: load, back up, match, query, enrich, save."""
    # Paths resolve relative to this script's location (glam/scripts/ -> glam/data/).
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'chile' / 'chilean_institutions_geocoded_v2.yaml'
    backup_file = data_file.with_suffix('.batch1_backup')
    output_file = data_file.with_name('chilean_institutions_batch1_enriched.yaml')
    print("=" * 80)
    print("Chilean Heritage Institutions - Batch 1 Wikidata Enrichment")
    print("Manual Verification Session - November 9, 2025")
    print("Target: 13 institutions (diverse sample)")
    print("=" * 80)
    print()
    # Load data — presumably a YAML list of institution dicts (TODO confirm
    # against chilean_institutions_geocoded_v2.yaml).
    print(f"📂 Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f" Total institutions: {len(institutions)}")
    print()
    # Create a backup copy before any in-place mutation of the records.
    print(f"💾 Creating backup: {backup_file}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print()
    print("Enrichment Process:")
    print("-" * 80)
    # Outcome counters for the final summary.
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0
    manual_review_count = 0
    for i, target in enumerate(BATCH_1_TARGETS, 1):
        print(f"\n[{i}/13] 🔍 Searching: {target['name_pattern']} ({target['region']})")
        # Find the first dataset record matching the target criteria.
        matched = None
        for inst in institutions:
            if matches_institution(inst, target):
                matched = inst
                break
        if not matched:
            print(f" ❌ NOT FOUND in dataset")
            not_found_count += 1
            continue
        print(f" ✓ Found: {matched.get('name')}")
        # Skip records already enriched in a previous run (idempotency).
        if has_wikidata(matched):
            existing_q = next(
                (id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f" ⏭️ Already enriched with {existing_q}")
            skipped_count += 1
            continue
        # Query Wikidata for candidates of the target's class in Chile.
        print(f" 🌐 Querying Wikidata for {target['wikidata_class']} in Chile...")
        time.sleep(1)  # Rate limiting — be polite to the public SPARQL endpoint
        wd_results = query_wikidata(
            target['name_pattern'],
            target['region'],
            target['wikidata_class']
        )
        if not wd_results:
            print(f" ⚠️ No Wikidata results found")
            manual_review_count += 1
            continue
        print(f" 📊 Found {len(wd_results)} Wikidata candidates")
        # Fuzzy match the record name against the candidate labels.
        best_match, match_score = fuzzy_match_wikidata(matched['name'], wd_results)
        if not best_match:
            print(f" ⚠️ No good match found (threshold < 70)")
            manual_review_count += 1
            continue
        print(f" 🎯 Best match: {best_match['label']} ({best_match['q_number']})")
        print(f" Similarity: {match_score:.1f}%")
        if best_match.get('description'):
            print(f" Description: {best_match['description']}")
        # Manual verification checkpoint: only >= 85% similarity is
        # auto-accepted; 70-85% is flagged for human review.
        if match_score >= 85:
            print(f" ✅ HIGH CONFIDENCE - Auto-accepting")
            add_wikidata_identifier(matched, best_match['q_number'], match_score / 100, target['notes'])
            enriched_count += 1
        elif match_score >= 70:
            print(f" ⚠️ MEDIUM CONFIDENCE - Flagged for manual review")
            print(f" To accept, manually verify: https://www.wikidata.org/wiki/{best_match['q_number']}")
            manual_review_count += 1
        else:
            print(f" ❌ LOW CONFIDENCE - Skipping")
            manual_review_count += 1
    print()
    print("=" * 80)
    print("Batch 1 Summary:")
    print("-" * 80)
    print(f"✅ Auto-enriched: {enriched_count}")
    print(f"⚠️ Manual review: {manual_review_count}")
    print(f"⏭️ Already enriched: {skipped_count}")
    print(f"❌ Not found: {not_found_count}")
    # Calculate overall Wikidata coverage across the whole dataset.
    with_wikidata = sum(1 for inst in institutions if has_wikidata(inst))
    print()
    print("Chilean Institution Coverage:")
    print(f" Total: {len(institutions)}")
    print(f" With Wikidata: {with_wikidata} ({with_wikidata/len(institutions)*100:.1f}%)")
    print(f" Without: {len(institutions) - with_wikidata}")
    # Save to a new file (never overwrites the input) only when something
    # was actually enriched.
    if enriched_count > 0:
        print()
        print(f"💾 Saving enriched data to: {output_file}")
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        print()
        print("✅ Batch 1 enrichment complete!")
        print()
        print("NEXT STEPS:")
        print("1. Review manual verification candidates")
        print("2. Create Batch 2 with remaining high-priority institutions")
        print("3. Continue iterating until 80%+ coverage")
    else:
        print()
        print("⚠️ No automatic enrichments - all require manual review")
        print(" Review candidates above and add Q-numbers manually if matches confirmed")


if __name__ == '__main__':
    main()