- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
451 lines
15 KiB
Python
Executable file
451 lines
15 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Chilean Heritage Institutions - Batch 1 Wikidata Enrichment (Manual Verification)
|
|
|
|
Adds VERIFIED Wikidata Q-numbers to 13 Chilean institutions (diverse sample):
|
|
|
|
BATCH 1 TARGET INSTITUTIONS (13):
|
|
Museums (4):
|
|
1. Museo Universidad de Tarapacá San Miguel de Azapa (MASMA) - Arica
|
|
2. Museo de Historia Natural de Atacama - Atacama
|
|
3. Museo Indígena Atacameño - Antofagasta
|
|
4. Museo de Tocopilla - Antofagasta
|
|
|
|
Archives (3):
|
|
5. Archivo Central Andrés Bello, Universidad de Chile - Santiago
|
|
6. Archivo Central USACH - Santiago
|
|
7. Archivo Histórico del Arzobispado de Santiago - Santiago
|
|
|
|
Libraries (3):
|
|
8. Biblioteca Nacional Digital de Chile - Santiago
|
|
9. Biblioteca Federico Varela, Universidad de Atacama - Atacama
|
|
10. CRA Escuela El Olivar - Arica
|
|
|
|
Education Providers (3):
|
|
11. Universidad de Tarapacá - Arica
|
|
12. Universidad Arturo Prat - Iquique
|
|
13. Universidad Católica del Norte, Sede San Pedro de Atacama - Antofagasta
|
|
|
|
STRATEGY:
|
|
- Query Wikidata API for each institution
|
|
- Fuzzy match with manual verification checkpoints (threshold > 85%)
|
|
- Export enriched YAML with provenance tracking
|
|
- Follow LinkML schema compliance
|
|
|
|
Coverage Goal: 0/90 (0%) → 13/90 (14.4%)
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Any
|
|
import requests
|
|
import time
|
|
from rapidfuzz import fuzz
|
|
|
|
# Wikidata SPARQL endpoint
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"

# Batch 1: Selected 13 institutions for diverse sample
# NOTE: Using simplified patterns that match the dataset's actual names
#
# Each target dict drives one enrichment pass in main():
#   name_pattern   - fuzzy-matched (partial ratio) against the dataset 'name'
#   region         - exact match against the first location's 'region'
#   city           - exact match when both sides provide one; None = no check
#   inst_type      - exact match against the dataset 'institution_type'
#   wikidata_class - Q-number of the Wikidata class used in the SPARQL query
#   notes          - free-text context carried through the enrichment call
BATCH_1_TARGETS = [
    # Museums (4)
    {
        'name_pattern': 'Museo Universidad de Tarapacá',
        'region': 'Arica',
        'city': 'Arica',
        'inst_type': 'MUSEUM',
        'wikidata_class': 'Q33506',  # museum
        'notes': 'MASMA - Archaeological museum with pre-Columbian collections'
    },
    {
        'name_pattern': 'Museo de Historia Natural de Atacama',
        'region': 'Atacama',
        'city': None,  # No city in dataset
        'inst_type': 'MUSEUM',
        'wikidata_class': 'Q33506',
        'notes': 'Natural history museum in Atacama region'
    },
    {
        'name_pattern': 'Museo Indígena Atacameño',
        'region': 'Antofagasta',
        'city': None,
        'inst_type': 'MUSEUM',
        'wikidata_class': 'Q33506',
        'notes': 'Indigenous cultural museum'
    },
    {
        'name_pattern': 'Museo de Tocopilla',
        'region': 'Antofagasta',
        'city': 'Tocopilla',
        'inst_type': 'MUSEUM',
        'wikidata_class': 'Q33506',
        'notes': 'Local museum in Tocopilla'
    },

    # Archives (3)
    {
        'name_pattern': 'Archivo Central Andrés Bello',
        'region': 'Metropolitana',
        'city': 'Santiago',
        'inst_type': 'ARCHIVE',
        'wikidata_class': 'Q166118',  # archive
        'notes': 'Universidad de Chile central archive'
    },
    {
        'name_pattern': 'Archivo Central USACH',
        'region': 'Metropolitana',
        'city': 'Santiago',
        'inst_type': 'ARCHIVE',
        'wikidata_class': 'Q166118',
        'notes': 'Universidad de Santiago de Chile archive'
    },
    {
        'name_pattern': 'Archivo Histórico del Arzobispado',
        'region': 'Metropolitana',
        'city': 'Santiago',
        'inst_type': 'ARCHIVE',
        'wikidata_class': 'Q166118',
        'notes': 'Archdiocese of Santiago historical archive'
    },

    # Libraries (3)
    {
        'name_pattern': 'Biblioteca Nacional Digital',
        'region': 'Metropolitana',
        'city': 'Santiago',
        'inst_type': 'LIBRARY',
        'wikidata_class': 'Q7075',  # library
        'notes': 'Digital platform of Biblioteca Nacional de Chile'
    },
    {
        'name_pattern': 'Biblioteca Federico Varela',
        'region': 'Atacama',
        'city': None,
        'inst_type': 'LIBRARY',
        'wikidata_class': 'Q7075',
        'notes': 'Universidad de Atacama library'
    },
    {
        'name_pattern': 'CRA Escuela El Olivar',
        'region': 'Arica',
        'city': 'Arica',
        'inst_type': 'LIBRARY',
        'wikidata_class': 'Q7075',
        'notes': 'School learning resource center'
    },

    # Education Providers (3)
    {
        'name_pattern': 'Universidad de Tarapacá',
        'region': 'Arica',
        'city': 'Arica',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',  # university
        'notes': 'Public university in Arica'
    },
    {
        'name_pattern': 'Universidad Arturo Prat',
        'region': 'Tarapacá',
        'city': 'Iquique',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',
        'notes': 'Public university in Iquique'
    },
    {
        'name_pattern': 'Universidad Católica del Norte',
        'region': 'Antofagasta',
        'city': 'San Pedro de Atacama',
        'inst_type': 'EDUCATION_PROVIDER',
        'wikidata_class': 'Q3918',
        'notes': 'Catholic university campus in San Pedro de Atacama'
    },
]
|
|
|
|
|
|
def matches_institution(inst: Dict[str, Any], target: Dict[str, Any]) -> bool:
    """Decide whether a dataset record satisfies a batch target's criteria.

    Checks, in order: fuzzy name screen (partial ratio >= 70), exact
    institution-type match, exact region match (when the target names one),
    and exact city match (only when both target and record carry a city).
    """
    candidate_name = inst.get('name', '').lower()
    wanted_name = target['name_pattern'].lower()

    # Cheap fuzzy screen first; 70 is deliberately permissive — the precise
    # matching happens later against Wikidata labels.
    if fuzz.partial_ratio(wanted_name, candidate_name) < 70:
        return False

    if inst.get('institution_type') != target['inst_type']:
        return False

    locations = inst.get('locations') or []
    if not locations:
        return False
    primary = locations[0]

    # Region must match exactly when the target specifies one.
    if target['region'] and primary.get('region', '') != target['region']:
        return False

    wanted_city = target.get('city')
    if wanted_city:
        record_city = primary.get('city', '')
        # A record missing its city is not disqualified — only a conflict is.
        if record_city and record_city != wanted_city:
            return False

    return True
|
|
|
|
|
|
def has_wikidata(inst: Dict[str, Any]) -> bool:
    """Return True when the record already carries a Wikidata identifier."""
    for entry in inst.get('identifiers', []):
        if entry.get('identifier_scheme') == 'Wikidata':
            return True
    return False
|
|
|
|
|
|
def query_wikidata(name: str, region: str, inst_class: str) -> List[Dict[str, Any]]:
    """Query Wikidata for Chilean institutions of the given class.

    Args:
        name: Target institution name. NOTE(review): currently unused — the
            SPARQL query filters only by class and country; name matching
            happens downstream via fuzzy matching. Confirm this is intended,
            since LIMIT 100 may truncate the candidate set before the right
            item is returned.
        region: Target region. NOTE(review): also unused in the query.
        inst_class: Wikidata class Q-number (e.g. 'Q33506' for museum),
            matched transitively via P31/P279*.

    Returns:
        List of candidate dicts with keys 'q_number', 'label', 'description',
        'viaf', 'isil' (last two may be None); empty list on any error.
    """
    # Simplified query - search for institutions in Chile with matching type
    query = f"""
    SELECT ?item ?itemLabel ?itemDescription ?viaf ?isil WHERE {{
        ?item wdt:P31/wdt:P279* wd:{inst_class} .
        ?item wdt:P17 wd:Q298 . # Country: Chile
        OPTIONAL {{ ?item wdt:P214 ?viaf }}
        OPTIONAL {{ ?item wdt:P791 ?isil }}
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 100
    """

    # Wikimedia endpoints require a descriptive User-Agent.
    headers = {
        'User-Agent': 'GLAM-Data-Extractor/0.1 (heritage-data-project)',
        'Accept': 'application/json'
    }

    try:
        response = requests.get(
            WIKIDATA_SPARQL,
            params={'query': query, 'format': 'json'},
            headers=headers,
            timeout=30
        )
        response.raise_for_status()

        results = response.json()
        bindings = results.get('results', {}).get('bindings', [])

        # Extract relevant fields from the SPARQL JSON binding structure.
        matches = []
        for binding in bindings:
            item_uri = binding.get('item', {}).get('value', '')
            # Entity URIs look like http://www.wikidata.org/entity/Q42;
            # the Q-number is the last path segment.
            q_number = item_uri.split('/')[-1] if item_uri else None

            if q_number:
                matches.append({
                    'q_number': q_number,
                    'label': binding.get('itemLabel', {}).get('value', ''),
                    'description': binding.get('itemDescription', {}).get('value', ''),
                    'viaf': binding.get('viaf', {}).get('value', None),
                    'isil': binding.get('isil', {}).get('value', None)
                })

        return matches

    # Broad catch is deliberate best-effort: any network/HTTP/JSON failure
    # degrades to "no candidates" so the batch run can continue.
    except Exception as e:
        print(f" ⚠️ Wikidata query error: {e}")
        return []
|
|
|
|
|
|
def fuzzy_match_wikidata(inst_name: str, wd_results: List[Dict[str, Any]]) -> tuple[Dict[str, Any] | None, float]:
    """Pick the Wikidata candidate whose label best matches *inst_name*.

    Scores each candidate with three rapidfuzz strategies (plain, partial,
    token-sort) and keeps the per-candidate maximum. Returns the best
    candidate and its score; (None, 0) when *wd_results* is empty.
    """
    query = inst_name.lower()
    top_candidate = None
    top_score = 0

    for candidate in wd_results:
        label = candidate['label'].lower()

        # Multiple strategies so word order and extra tokens don't sink a
        # genuinely matching label.
        score = max(
            fuzz.ratio(query, label),
            fuzz.partial_ratio(query, label),
            fuzz.token_sort_ratio(query, label),
        )

        if score > top_score:
            top_score = score
            top_candidate = candidate

    return top_candidate, top_score
|
|
|
|
|
|
def add_wikidata_identifier(inst: Dict[str, Any], q_number: str, confidence: float, notes: str) -> Dict[str, Any]:
    """Attach a Wikidata identifier to *inst* in place and record provenance.

    Args:
        inst: Institution record; mutated in place and also returned.
        q_number: Verified Wikidata Q-number, e.g. 'Q12345'.
        confidence: Match confidence in [0, 1]; recorded in provenance.
        notes: Reviewer context appended to the provenance trail
            (skipped when empty).

    Returns:
        The same *inst* dict, for convenience.
    """
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    }

    # Source records may lack an identifiers list entirely.
    inst.setdefault('identifiers', []).append(wikidata_id)

    # Extend (never replace) the extraction-method provenance trail.
    if 'provenance' in inst:
        old_method = inst['provenance'].get('extraction_method', '')
        enrichment_note = f" + Wikidata enrichment (Batch 1 manual verification, confidence={confidence:.2f})"
        if notes:
            # FIX: *notes* was previously accepted but silently dropped.
            enrichment_note += f" [{notes}]"
        inst['provenance']['extraction_method'] = f"{old_method}{enrichment_note}"

    return inst
|
|
|
|
|
|
def main() -> None:
    """Run the Batch 1 enrichment end to end.

    Loads the geocoded Chilean institutions YAML, writes a backup, then for
    each of the 13 batch targets: locates the record in the dataset, queries
    Wikidata for candidates, fuzzy-matches labels, and auto-accepts only
    high-confidence (>= 85%) matches. Medium-confidence matches are printed
    for manual review. The enriched dataset is saved to a separate output
    file only when at least one record was auto-enriched.
    """
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'chile' / 'chilean_institutions_geocoded_v2.yaml'
    backup_file = data_file.with_suffix('.batch1_backup')
    output_file = data_file.with_name('chilean_institutions_batch1_enriched.yaml')

    print("=" * 80)
    print("Chilean Heritage Institutions - Batch 1 Wikidata Enrichment")
    print("Manual Verification Session - November 9, 2025")
    print("Target: 13 institutions (diverse sample)")
    print("=" * 80)
    print()

    # Load data — expected to be a YAML list of institution dicts.
    # NOTE(review): an empty file would make the coverage percentage below
    # divide by zero; confirm the dataset is always non-empty.
    print(f"📂 Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f" Total institutions: {len(institutions)}")
    print()

    # Create backup before mutating anything in memory / on disk.
    print(f"💾 Creating backup: {backup_file}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print()

    print("Enrichment Process:")
    print("-" * 80)

    # Per-outcome counters for the summary report.
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0
    manual_review_count = 0

    # Process each target
    for i, target in enumerate(BATCH_1_TARGETS, 1):
        print(f"\n[{i}/13] 🔍 Searching: {target['name_pattern']} ({target['region']})")

        # Find the first matching institution in the dataset.
        matched = None
        for inst in institutions:
            if matches_institution(inst, target):
                matched = inst
                break

        if not matched:
            print(f" ❌ NOT FOUND in dataset")
            not_found_count += 1
            continue

        print(f" ✓ Found: {matched.get('name')}")

        # Skip records already enriched (idempotent re-runs).
        if has_wikidata(matched):
            existing_q = next(
                (id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f" ⏭️ Already enriched with {existing_q}")
            skipped_count += 1
            continue

        # Query Wikidata for candidates of the target's class in Chile.
        print(f" 🌐 Querying Wikidata for {target['wikidata_class']} in Chile...")
        time.sleep(1)  # Rate limiting

        wd_results = query_wikidata(
            target['name_pattern'],
            target['region'],
            target['wikidata_class']
        )

        if not wd_results:
            print(f" ⚠️ No Wikidata results found")
            manual_review_count += 1
            continue

        print(f" 📊 Found {len(wd_results)} Wikidata candidates")

        # Fuzzy match dataset name against candidate labels.
        best_match, match_score = fuzzy_match_wikidata(matched['name'], wd_results)

        if not best_match:
            print(f" ⚠️ No good match found (threshold < 70)")
            manual_review_count += 1
            continue

        print(f" 🎯 Best match: {best_match['label']} ({best_match['q_number']})")
        print(f" Similarity: {match_score:.1f}%")
        if best_match.get('description'):
            print(f" Description: {best_match['description']}")

        # Manual verification checkpoint: only >= 85% is auto-accepted;
        # confidence is stored as a 0–1 fraction (score / 100).
        if match_score >= 85:
            print(f" ✅ HIGH CONFIDENCE - Auto-accepting")
            add_wikidata_identifier(matched, best_match['q_number'], match_score / 100, target['notes'])
            enriched_count += 1
        elif match_score >= 70:
            print(f" ⚠️ MEDIUM CONFIDENCE - Flagged for manual review")
            print(f" To accept, manually verify: https://www.wikidata.org/wiki/{best_match['q_number']}")
            manual_review_count += 1
        else:
            print(f" ❌ LOW CONFIDENCE - Skipping")
            manual_review_count += 1

    print()
    print("=" * 80)
    print("Batch 1 Summary:")
    print("-" * 80)
    print(f"✅ Auto-enriched: {enriched_count}")
    print(f"⚠️ Manual review: {manual_review_count}")
    print(f"⏭️ Already enriched: {skipped_count}")
    print(f"❌ Not found: {not_found_count}")

    # Calculate coverage across the whole dataset (not just this batch).
    with_wikidata = sum(1 for inst in institutions if has_wikidata(inst))

    print()
    print("Chilean Institution Coverage:")
    print(f" Total: {len(institutions)}")
    print(f" With Wikidata: {with_wikidata} ({with_wikidata/len(institutions)*100:.1f}%)")
    print(f" Without: {len(institutions) - with_wikidata}")

    # Save to a new file (original input is left untouched) only if
    # at least one record was actually enriched.
    if enriched_count > 0:
        print()
        print(f"💾 Saving enriched data to: {output_file}")
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

        print()
        print("✅ Batch 1 enrichment complete!")
        print()
        print("NEXT STEPS:")
        print("1. Review manual verification candidates")
        print("2. Create Batch 2 with remaining high-priority institutions")
        print("3. Continue iterating until 80%+ coverage")
    else:
        print()
        print("⚠️ No automatic enrichments - all require manual review")
        print(" Review candidates above and add Q-numbers manually if matches confirmed")
|
|
|
|
|
|
# Script entry point: run only when executed directly, not on import.
if __name__ == '__main__':
    main()
|