glam/scripts/enrich_chilean_batch2_university_depts.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

310 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Chilean Heritage Institutions - Batch 2 Wikidata Enrichment (University Departments)
REVISED STRATEGY based on dataset structure:
- Dataset contains university DEPARTMENTS/ARCHIVES, not universities themselves
- Example: "Universidad de Chile's Archivo Central" (archive department)
- Strategy: Enrich department records with PARENT UNIVERSITY's Wikidata Q-number
- This provides valuable linkage to authoritative university entities
BATCH 2 TARGET INSTITUTIONS (University departments/archives):
1. Universidad de Chile's Archivo Central Andrés Bello → Q219576 (Universidad de Chile)
2. Universidad de Concepción's SIBUDEC → Q1163431 (Universidad de Concepción)
3. Universidad Austral → Q1163558 (Universidad Austral de Chile)
4. Universidad Católica → Q1562315 (Pontificia Universidad Católica de Chile)
IMPROVEMENTS FROM BATCH 1:
1. Direct Q-number mapping (no SPARQL needed for major universities)
2. Fuzzy matching against department name patterns
3. Parent organization linkage (department → university)
4. Fast execution (no slow Wikidata queries)
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Optional
from rapidfuzz import fuzz
import re
# Direct Q-number mapping for major Chilean universities:
# canonical university name -> Wikidata Q-identifier.
# Used as the authoritative lookup so no SPARQL queries are needed.
UNIVERSITY_WIKIDATA_MAP: Dict[str, str] = {
    'Universidad de Chile': 'Q219576',
    'Universidad de Concepción': 'Q1163431',
    'Universidad Austral de Chile': 'Q1163558',
    'Pontificia Universidad Católica de Chile': 'Q1562315',
    'Universidad de Santiago de Chile': 'Q2006105',
    'Universidad Católica del Norte': 'Q3244385',  # Already enriched in Batch 1
    'Universidad de Tarapacá': 'Q3138071'  # Already enriched in Batch 1
}
# Batch 2 targets: Department records in dataset.
# Each entry describes one department/archive record to enrich:
#   name_pattern      - human-readable label shown in progress output
#   name_variants     - alternative spellings tried by the fuzzy matcher
#   parent_university - canonical name of the owning university
#   wikidata_q        - the PARENT university's Wikidata Q-number
#   inst_type         - institution_type a record must have to match
#   notes             - fallback description text if the record has none
BATCH_2_TARGETS: List[Dict[str, Any]] = [
    {
        'name_pattern': "Universidad de Chile's Archivo",
        'name_variants': [
            "Universidad de Chile's Archivo Central",
            "Archivo Central Andrés Bello",
            "Universidad de Chile Archivo"
        ],
        'parent_university': 'Universidad de Chile',
        'wikidata_q': 'Q219576',
        'inst_type': 'ARCHIVE',
        'notes': 'Central archive of Universidad de Chile (founded 1842)'
    },
    {
        'name_pattern': "Universidad de Concepción's SIBUDEC",
        'name_variants': [
            "SIBUDEC",
            "Sistema de Bibliotecas UdeC",
            "Universidad de Concepción SIBUDEC"
        ],
        'parent_university': 'Universidad de Concepción',
        'wikidata_q': 'Q1163431',
        'inst_type': 'EDUCATION_PROVIDER',
        'notes': 'Library system of Universidad de Concepción (founded 1919)'
    },
    {
        'name_pattern': 'Universidad Austral',
        'name_variants': [
            'Universidad Austral',
            'Universidad Austral de Chile',
            'UACh'
        ],
        'parent_university': 'Universidad Austral de Chile',
        'wikidata_q': 'Q1163558',
        'inst_type': 'EDUCATION_PROVIDER',
        'notes': 'Universidad Austral de Chile in Valdivia (founded 1954)'
    },
    {
        'name_pattern': 'Universidad Católica',
        'name_variants': [
            'Universidad Católica',
            'Pontificia Universidad Católica',
            'UC Chile',
            'PUC'
        ],
        'parent_university': 'Pontificia Universidad Católica de Chile',
        'wikidata_q': 'Q1562315',
        'inst_type': 'EDUCATION_PROVIDER',
        'notes': 'Pontificia Universidad Católica de Chile in Santiago (founded 1888)'
    }
]
def normalize_name(name: str) -> str:
    """Return *name* cleaned up for fuzzy comparison.

    Strips English possessive markers ("'s"), collapses runs of
    whitespace into single spaces, and trims the ends.
    """
    # Drop possessive suffixes such as in "Universidad de Chile's Archivo".
    cleaned = re.sub(r"'s\b", "", name)
    # Collapse internal whitespace runs, then trim leading/trailing space.
    return re.sub(r'\s+', ' ', cleaned).strip()
def fuzzy_match_name(inst_name: str, name_variants: List[str]) -> tuple[bool, float]:
    """Fuzzy-match an institution name against a list of variants.

    Returns a ``(matched, best_score)`` pair where ``matched`` is True
    when the best similarity across all variants reaches 80%.
    """
    target = normalize_name(inst_name).lower()
    best = 0
    for candidate in name_variants:
        cand = normalize_name(candidate).lower()
        # Combine several rapidfuzz strategies and keep the highest score.
        best = max(
            best,
            fuzz.ratio(target, cand),
            fuzz.partial_ratio(target, cand),
            fuzz.token_set_ratio(target, cand),
            fuzz.token_sort_ratio(target, cand),
        )
    return best >= 80, best
def has_wikidata(inst: Dict[str, Any]) -> bool:
    """Return True if the institution record already carries a Wikidata identifier."""
    for identifier in inst.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'Wikidata':
            return True
    return False
def add_wikidata_identifier(
    inst: Dict[str, Any],
    q_number: str,
    parent_university: str,
    confidence: float,
    notes: str
) -> Dict[str, Any]:
    """Add Wikidata identifier to institution with provenance tracking.

    Mutates *inst* in place and returns it: appends a Wikidata identifier
    entry, notes the parent university in the description (creating one
    from *notes* if the record has none), and appends an enrichment
    marker to the provenance extraction method when provenance exists.
    """
    # Append the new identifier, creating the list on first use.
    inst.setdefault('identifiers', []).append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    })

    # Mention the parent university in the description, or build one.
    description = inst.get('description')
    if description:
        if parent_university not in description:
            inst['description'] = f"{description} Part of {parent_university}."
    else:
        inst['description'] = f"Part of {parent_university}. {notes}"

    # Record the enrichment step in provenance (only when the key exists).
    if 'provenance' in inst:
        previous = inst['provenance'].get('extraction_method', '')
        marker = (
            f" + Wikidata enrichment (Batch 2 university depts, "
            f"parent: {parent_university}, confidence={confidence:.2f})"
        )
        inst['provenance']['extraction_method'] = f"{previous}{marker}"
    return inst
def main() -> None:
    """Run Batch 2 enrichment end to end.

    Loads the Batch 1 YAML dataset, writes a backup, links matching
    department/archive records to their parent university's Wikidata
    Q-number via fuzzy name matching, prints a summary, and saves the
    enriched dataset to a new file if anything changed.
    """
    # Input is the Batch 1 output; a backup is written before any mutation,
    # and the enriched result goes to a sibling batch2 file.
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'chile' / 'chilean_institutions_batch1_enriched.yaml'
    backup_file = data_file.with_suffix('.batch2_backup')
    output_file = data_file.with_name('chilean_institutions_batch2_enriched.yaml')
    print("=" * 80)
    print("Chilean Heritage Institutions - Batch 2 Wikidata Enrichment")
    print("University Departments/Archives Focus")
    print("Session: November 9, 2025")
    print("Strategy: Enrich dept records with parent university Q-numbers")
    print("=" * 80)
    print()
    # Load data — expected to be a YAML list of institution dicts.
    print(f"📂 Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f" Total institutions: {len(institutions)}")
    # Check existing Wikidata coverage before enrichment (for the summary).
    # NOTE(review): divides by len(institutions) — an empty dataset would
    # raise ZeroDivisionError; confirm the input file is never empty.
    with_wikidata_before = sum(1 for inst in institutions if has_wikidata(inst))
    print(f" Current Wikidata coverage: {with_wikidata_before}/{len(institutions)} ({with_wikidata_before/len(institutions)*100:.1f}%)")
    print()
    # Create backup of the untouched dataset before mutating records.
    print(f"💾 Creating backup: {backup_file.name}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print()
    print("Batch 2 Enrichment Process:")
    print("-" * 80)
    # Process each target; counters feed the summary at the end.
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0
    for i, target in enumerate(BATCH_2_TARGETS, 1):
        print(f"\n[{i}/{len(BATCH_2_TARGETS)}] 🎓 Searching: {target['name_pattern']}")
        print(f" Parent: {target['parent_university']}")
        print(f" Wikidata: {target['wikidata_q']}")
        print(f" Name variants: {', '.join(target['name_variants'][:3])}")
        # Find matching institution in dataset: a record must pass both the
        # fuzzy name check AND have the exact institution_type; first hit wins.
        matched = None
        match_score = 0
        for inst in institutions:
            is_match, score = fuzzy_match_name(inst.get('name', ''), target['name_variants'])
            # Also check institution type
            if is_match and inst.get('institution_type') == target['inst_type']:
                matched = inst
                match_score = score
                break
        if not matched:
            print(f" ❌ NOT FOUND in dataset")
            print(f" (No match for any variant above 80% similarity)")
            not_found_count += 1
            continue
        print(f" ✓ Found: {matched.get('name')}")
        print(f" Match score: {match_score:.1f}%")
        # Check if already has Wikidata — skip rather than add a duplicate.
        if has_wikidata(matched):
            existing_q = next(
                (id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f" ⏭️ Already enriched with {existing_q}")
            skipped_count += 1
            continue
        # Add Wikidata identifier (direct mapping, no query needed);
        # mutates the record in place inside `institutions`.
        print(f" ✅ Adding Wikidata identifier: {target['wikidata_q']}")
        print(f" Linking to parent: {target['parent_university']}")
        add_wikidata_identifier(
            matched,
            target['wikidata_q'],
            target['parent_university'],
            match_score / 100,  # convert percentage score to 0-1 confidence
            target['notes']
        )
        enriched_count += 1
    print()
    print("=" * 80)
    print("Batch 2 Summary:")
    print("-" * 80)
    print(f"✅ Enriched: {enriched_count}")
    print(f"⏭️ Already enriched: {skipped_count}")
    print(f"❌ Not found: {not_found_count}")
    # Calculate updated coverage after in-place enrichment.
    with_wikidata_after = sum(1 for inst in institutions if has_wikidata(inst))
    print()
    print("Chilean Institution Coverage:")
    print(f" Total: {len(institutions)}")
    print(f" Before Batch 2: {with_wikidata_before} ({with_wikidata_before/len(institutions)*100:.1f}%)")
    print(f" After Batch 2: {with_wikidata_after} ({with_wikidata_after/len(institutions)*100:.1f}%)")
    print(f" Improvement: +{with_wikidata_after - with_wikidata_before} institutions ({(with_wikidata_after - with_wikidata_before)/len(institutions)*100:.1f}%)")
    # Save if any enrichments; otherwise leave only the backup behind.
    if enriched_count > 0:
        print()
        print(f"💾 Saving enriched data to: {output_file.name}")
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        print()
        print("✅ Batch 2 enrichment complete!")
        print()
        print("NEXT STEPS:")
        print("1. Create Batch 3 targeting major museums (5 institutions):")
        print(" - Museo Nacional de Historia Natural (Santiago)")
        print(" - Museo de Arte Precolombino (Santiago)")
        print(" - Museo Histórico Nacional (Santiago)")
        print(" - Museo de Bellas Artes (Santiago)")
        print(" - Museo Regional de Ancud (Chiloé)")
        print("2. Continue until 20+ institutions enriched (22% coverage)")
    else:
        print()
        print("⚠️ No enrichments - all targets already enriched or not found")
# Entry-point guard: run the enrichment only when executed as a script.
if __name__ == '__main__':
    main()