- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
310 lines
11 KiB
Python
310 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Chilean Heritage Institutions - Batch 2 Wikidata Enrichment (University Departments)
|
|
|
|
REVISED STRATEGY based on dataset structure:
|
|
- Dataset contains university DEPARTMENTS/ARCHIVES, not universities themselves
|
|
- Example: "Universidad de Chile's Archivo Central" (archive department)
|
|
- Strategy: Enrich department records with PARENT UNIVERSITY's Wikidata Q-number
|
|
- This provides valuable linkage to authoritative university entities
|
|
|
|
BATCH 2 TARGET INSTITUTIONS (University departments/archives):
|
|
1. Universidad de Chile's Archivo Central Andrés Bello → Q219576 (Universidad de Chile)
|
|
2. Universidad de Concepción's SIBUDEC → Q1163431 (Universidad de Concepción)
|
|
3. Universidad Austral → Q1163558 (Universidad Austral de Chile)
|
|
4. Universidad Católica → Q1562315 (Pontificia Universidad Católica de Chile)
|
|
|
|
IMPROVEMENTS FROM BATCH 1:
|
|
1. Direct Q-number mapping (no SPARQL needed for major universities)
|
|
2. Fuzzy matching against department name patterns
|
|
3. Parent organization linkage (department → university)
|
|
4. Fast execution (no slow Wikidata queries)
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Any, Optional
|
|
from rapidfuzz import fuzz
|
|
import re
|
|
|
|
# Direct Q-number mapping for major Chilean universities
# Maps canonical university name -> Wikidata Q-number, so no SPARQL lookup
# is needed for these well-known entities.
# NOTE(review): this map is not referenced elsewhere in this file — the Batch 2
# targets below carry their own 'wikidata_q' values. Kept for reference/reuse;
# confirm before removing.
UNIVERSITY_WIKIDATA_MAP = {
    'Universidad de Chile': 'Q219576',
    'Universidad de Concepción': 'Q1163431',
    'Universidad Austral de Chile': 'Q1163558',
    'Pontificia Universidad Católica de Chile': 'Q1562315',
    'Universidad de Santiago de Chile': 'Q2006105',
    'Universidad Católica del Norte': 'Q3244385',  # Already enriched in Batch 1
    'Universidad de Tarapacá': 'Q3138071'  # Already enriched in Batch 1
}
|
|
|
# Batch 2 targets: Department records in dataset
# Each entry describes one department/archive record to enrich:
#   name_pattern      - human-readable label, used only in progress output
#   name_variants     - alternate spellings fed to the fuzzy matcher
#   parent_university - name of the authoritative parent organization
#   wikidata_q        - the PARENT university's Wikidata Q-number
#   inst_type         - institution_type the dataset record must carry to match
#   notes             - fallback description text when the record has none
BATCH_2_TARGETS = [
    {
        'name_pattern': "Universidad de Chile's Archivo",
        'name_variants': [
            "Universidad de Chile's Archivo Central",
            "Archivo Central Andrés Bello",
            "Universidad de Chile Archivo"
        ],
        'parent_university': 'Universidad de Chile',
        'wikidata_q': 'Q219576',
        'inst_type': 'ARCHIVE',
        'notes': 'Central archive of Universidad de Chile (founded 1842)'
    },
    {
        'name_pattern': "Universidad de Concepción's SIBUDEC",
        'name_variants': [
            "SIBUDEC",
            "Sistema de Bibliotecas UdeC",
            "Universidad de Concepción SIBUDEC"
        ],
        'parent_university': 'Universidad de Concepción',
        'wikidata_q': 'Q1163431',
        'inst_type': 'EDUCATION_PROVIDER',
        'notes': 'Library system of Universidad de Concepción (founded 1919)'
    },
    {
        'name_pattern': 'Universidad Austral',
        'name_variants': [
            'Universidad Austral',
            'Universidad Austral de Chile',
            'UACh'
        ],
        'parent_university': 'Universidad Austral de Chile',
        'wikidata_q': 'Q1163558',
        'inst_type': 'EDUCATION_PROVIDER',
        'notes': 'Universidad Austral de Chile in Valdivia (founded 1954)'
    },
    {
        'name_pattern': 'Universidad Católica',
        'name_variants': [
            'Universidad Católica',
            'Pontificia Universidad Católica',
            'UC Chile',
            'PUC'
        ],
        'parent_university': 'Pontificia Universidad Católica de Chile',
        'wikidata_q': 'Q1562315',
        'inst_type': 'EDUCATION_PROVIDER',
        'notes': 'Pontificia Universidad Católica de Chile in Santiago (founded 1888)'
    }
]
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Return *name* cleaned up for fuzzy comparison.

    Drops English possessive markers ("'s"), trims the ends, and collapses
    internal runs of whitespace to single spaces.
    """
    # "Universidad de Chile's Archivo" -> "Universidad de Chile Archivo"
    without_possessive = re.sub(r"'s\b", "", name)
    # Trim, then squeeze any remaining whitespace runs to one space.
    return re.sub(r'\s+', ' ', without_possessive.strip())
|
|
|
|
|
def fuzzy_match_name(inst_name: str, name_variants: List[str]) -> tuple[bool, float]:
    """Fuzzy-compare *inst_name* against each candidate in *name_variants*.

    Returns ``(matched, best_score)`` where *matched* is True when the best
    score across all variants and scoring strategies reaches the 80%
    similarity threshold.
    """
    target = normalize_name(inst_name).lower()

    # Four complementary rapidfuzz scorers: exact ratio, substring-style
    # partial match, and the two token-based scorers that tolerate word
    # reordering and duplication.
    scorers = (
        fuzz.ratio,
        fuzz.partial_ratio,
        fuzz.token_set_ratio,
        fuzz.token_sort_ratio,
    )

    best = 0
    for variant in name_variants:
        candidate = normalize_name(variant).lower()
        best = max(best, max(scorer(target, candidate) for scorer in scorers))

    return best >= 80, best
|
|
|
|
|
def has_wikidata(inst: Dict[str, Any]) -> bool:
    """Return True when *inst* already carries a Wikidata identifier."""
    for id_obj in inst.get('identifiers', []):
        if id_obj.get('identifier_scheme') == 'Wikidata':
            return True
    return False
|
|
|
|
|
def add_wikidata_identifier(
    inst: Dict[str, Any],
    q_number: str,
    parent_university: str,
    confidence: float,
    notes: str
) -> Dict[str, Any]:
    """Attach a Wikidata identifier to *inst* in place and return it.

    Besides appending the identifier record, this annotates the description
    with the parent university (once) and, when a provenance record exists,
    appends an enrichment note to its extraction_method field.
    """
    inst.setdefault('identifiers', []).append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    })

    # Mention the parent university in the description exactly once;
    # synthesize a description from the notes when none exists.
    description = inst.get('description')
    if description:
        if parent_university not in description:
            inst['description'] = f"{description} Part of {parent_university}."
    else:
        inst['description'] = f"Part of {parent_university}. {notes}"

    # Record how this enrichment happened, preserving the prior method text.
    if 'provenance' in inst:
        enrichment_note = (
            f" + Wikidata enrichment (Batch 2 university depts, "
            f"parent: {parent_university}, confidence={confidence:.2f})"
        )
        prior = inst['provenance'].get('extraction_method', '')
        inst['provenance']['extraction_method'] = f"{prior}{enrichment_note}"

    return inst
|
|
|
|
|
def main():
    """Run the Batch 2 enrichment pass over the Batch 1 output file.

    Loads the Batch 1 enriched YAML, writes a backup, fuzzy-matches each
    Batch 2 target against the dataset, attaches the parent university's
    Wikidata Q-number to matched records, reports coverage statistics, and
    saves the result to a new Batch 2 output file.
    """
    # Input is the output of the Batch 1 enrichment run; this script's
    # directory layout is assumed to be <repo>/scripts/<this file>.
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'chile' / 'chilean_institutions_batch1_enriched.yaml'
    # NOTE: with_suffix replaces '.yaml', so the backup has no .yaml extension.
    backup_file = data_file.with_suffix('.batch2_backup')
    output_file = data_file.with_name('chilean_institutions_batch2_enriched.yaml')

    print("=" * 80)
    print("Chilean Heritage Institutions - Batch 2 Wikidata Enrichment")
    print("University Departments/Archives Focus")
    print("Session: November 9, 2025")
    print("Strategy: Enrich dept records with parent university Q-numbers")
    print("=" * 80)
    print()

    # Load data — expected to be a YAML list of institution dicts.
    print(f"📂 Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"   Total institutions: {len(institutions)}")

    # Check existing Wikidata coverage before enrichment (for the summary).
    with_wikidata_before = sum(1 for inst in institutions if has_wikidata(inst))
    print(f"   Current Wikidata coverage: {with_wikidata_before}/{len(institutions)} ({with_wikidata_before/len(institutions)*100:.1f}%)")
    print()

    # Create backup of the input before mutating anything on disk.
    print(f"💾 Creating backup: {backup_file.name}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print()

    print("Batch 2 Enrichment Process:")
    print("-" * 80)

    # Process each target, tallying outcomes for the summary.
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0

    for i, target in enumerate(BATCH_2_TARGETS, 1):
        print(f"\n[{i}/{len(BATCH_2_TARGETS)}] 🎓 Searching: {target['name_pattern']}")
        print(f"   Parent: {target['parent_university']}")
        print(f"   Wikidata: {target['wikidata_q']}")
        print(f"   Name variants: {', '.join(target['name_variants'][:3])}")

        # Find the first matching institution in the dataset; a match must
        # clear the fuzzy threshold AND have the expected institution_type.
        matched = None
        match_score = 0

        for inst in institutions:
            is_match, score = fuzzy_match_name(inst.get('name', ''), target['name_variants'])

            # Also check institution type to avoid cross-type false positives.
            if is_match and inst.get('institution_type') == target['inst_type']:
                matched = inst
                match_score = score
                break

        if not matched:
            print(f"   ❌ NOT FOUND in dataset")
            print(f"      (No match for any variant above 80% similarity)")
            not_found_count += 1
            continue

        print(f"   ✓ Found: {matched.get('name')}")
        print(f"     Match score: {match_score:.1f}%")

        # Skip records that already carry a Wikidata identifier (idempotency).
        if has_wikidata(matched):
            existing_q = next(
                (id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f"   ⏭️  Already enriched with {existing_q}")
            skipped_count += 1
            continue

        # Add Wikidata identifier (direct mapping, no query needed).
        print(f"   ✅ Adding Wikidata identifier: {target['wikidata_q']}")
        print(f"      Linking to parent: {target['parent_university']}")

        add_wikidata_identifier(
            matched,
            target['wikidata_q'],
            target['parent_university'],
            match_score / 100,  # convert percentage score to 0-1 confidence
            target['notes']
        )
        enriched_count += 1

    print()
    print("=" * 80)
    print("Batch 2 Summary:")
    print("-" * 80)
    print(f"✅ Enriched: {enriched_count}")
    print(f"⏭️  Already enriched: {skipped_count}")
    print(f"❌ Not found: {not_found_count}")

    # Calculate updated coverage after in-memory enrichment.
    with_wikidata_after = sum(1 for inst in institutions if has_wikidata(inst))

    print()
    print("Chilean Institution Coverage:")
    print(f"  Total: {len(institutions)}")
    print(f"  Before Batch 2: {with_wikidata_before} ({with_wikidata_before/len(institutions)*100:.1f}%)")
    print(f"  After Batch 2: {with_wikidata_after} ({with_wikidata_after/len(institutions)*100:.1f}%)")
    print(f"  Improvement: +{with_wikidata_after - with_wikidata_before} institutions ({(with_wikidata_after - with_wikidata_before)/len(institutions)*100:.1f}%)")

    # Save only if something changed; the input file itself is never modified.
    if enriched_count > 0:
        print()
        print(f"💾 Saving enriched data to: {output_file.name}")
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

        print()
        print("✅ Batch 2 enrichment complete!")
        print()
        print("NEXT STEPS:")
        print("1. Create Batch 3 targeting major museums (5 institutions):")
        print("   - Museo Nacional de Historia Natural (Santiago)")
        print("   - Museo de Arte Precolombino (Santiago)")
        print("   - Museo Histórico Nacional (Santiago)")
        print("   - Museo de Bellas Artes (Santiago)")
        print("   - Museo Regional de Ancud (Chiloé)")
        print("2. Continue until 20+ institutions enriched (22% coverage)")
    else:
        print()
        print("⚠️  No enrichments - all targets already enriched or not found")


if __name__ == '__main__':
    main()
|