Changelog:
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering the extraction patterns (ISIL, Wikidata, VIAF, city names) and the classification of institutions (museum, library, archive), plus tests for extracted entities and result handling.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF, including tests for temporal properties, compliance with W3C Organization Ontology patterns, and correct PROV-O provenance links on extracted partnerships.

208 lines · 7 KiB · Python · Executable file
#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 8 Wikidata Enrichment (Libraries)

Uses bulk SPARQL matches from query_wikidata_chilean_libraries.py
2 libraries with verified Q-numbers from Wikidata Query Service

Target: 54/90 institutions (60% coverage)
"""

import yaml
from pathlib import Path
from datetime import datetime, timezone
# Batch 8: 2 libraries from SPARQL bulk query.
# Each entry feeds find_institution() (name/city lookup) and
# enrich_institution() (q_number, confidence, notes, optional city fix).
BATCH_8_ENRICHMENTS = [
    {
        "name": "Biblioteca Nacional Digital",
        "city": "Iquique",
        "q_number": "Q18924152",
        "wikidata_name": "Biblioteca Nacional Digital de Chile",
        "confidence": "partial",
        "notes": "SPARQL match - partial name (full official title in Wikidata). Note: City may be incorrect in our data - this is a national digital library, not specific to Iquique.",
    },
    {
        "name": "William Mulloy Library",
        "city": "Isla de Pascua",  # Updated from Unknown
        "q_number": "Q8015912",
        "wikidata_name": "Biblioteca William Mulloy",
        "confidence": "partial",
        "founded": "2002",
        "notes": "SPARQL match - partial name (Spanish vs English). Easter Island archaeological library.",
    },
]
def load_yaml(file_path: Path) -> list:
    """Parse the YAML document stored at *file_path* and return it."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return yaml.safe_load(handle)
def save_yaml(data: list, file_path: Path) -> None:
    """Serialise *data* to *file_path* as block-style, unicode-safe YAML."""
    dump_options = dict(
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
        width=120,
        indent=2,
    )
    with open(file_path, 'w', encoding='utf-8') as handle:
        yaml.dump(data, handle, **dump_options)
def find_institution(institutions: list, name: str, city: str) -> dict:
    """Return the institution dict matching *name* (and, when known, *city*).

    Args:
        institutions: Institution dicts loaded from YAML.
        name: Exact institution name to match.
        city: Expected city; a falsy value or "Unknown" skips the city check.

    Returns:
        The matching dict (a reference into *institutions*, so callers may
        mutate it in place).

    Raises:
        ValueError: If no institution matches.
    """
    for inst in institutions:
        if inst['name'] != name:
            continue
        # No usable city in the enrichment data -> a name match is enough.
        if not city or city == "Unknown":
            return inst
        # BUG FIX: the original indexed inst.get('locations', [{}])[0]
        # unconditionally, which raised IndexError when 'locations' was
        # present but an empty list. Treat that case as "city unknown".
        locations = inst.get('locations') or [{}]
        inst_city = locations[0].get('city', '')
        # "Isla de Pascua" is allowed to match regardless of the recorded
        # city (our data may store the English name or no city at all).
        if inst_city == city or city == "Isla de Pascua":
            return inst
    raise ValueError(f"Institution not found: {name} ({city})")
def enrich_institution(inst: dict, enrichment: dict) -> None:
    """Add a Wikidata identifier and provenance metadata to *inst* in place.

    No-op (with a warning) when the institution already carries a Wikidata
    identifier. Also fills in the city when our record has none or only the
    placeholder "Unknown".

    Args:
        inst: Institution dict (mutated in place).
        enrichment: Batch entry providing 'q_number', 'wikidata_name',
            'confidence', 'notes' and optionally 'city'.
    """

    # Skip institutions that were already enriched in an earlier batch.
    existing_ids = inst.get('identifiers', [])
    has_wikidata = any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in existing_ids
    )
    if has_wikidata:
        print(f"   ⚠️  {inst['name']} already has Wikidata identifier")
        return

    # Add the Wikidata identifier record.
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': enrichment['q_number'],
        'identifier_url': f"https://www.wikidata.org/wiki/{enrichment['q_number']}"
    }
    inst.setdefault('identifiers', []).append(wikidata_id)

    # Update the city if it was missing or "Unknown".
    # BUG FIX: the original check only tested for a falsy city, so the
    # literal placeholder "Unknown" was never replaced despite the comment.
    if enrichment.get('city') and enrichment['city'] != "Unknown":
        locations = inst.get('locations', [])
        if locations:
            current_city = locations[0].get('city')
            if not current_city or current_city == "Unknown":
                locations[0]['city'] = enrichment['city']
                print(f"   📍 Updated city to: {enrichment['city']}")

    # Record how and when this enrichment happened.
    provenance = inst.setdefault('provenance', {})
    provenance['enrichment_method'] = 'Wikidata SPARQL bulk query (Batch 8 - Libraries)'
    provenance['enrichment_date'] = datetime.now(timezone.utc).isoformat()
    provenance['wikidata_match_confidence'] = enrichment['confidence']

    # Normalise 'notes' to a list before appending this batch's note
    # (older records may store a single string).
    if 'notes' not in provenance:
        provenance['notes'] = []
    elif isinstance(provenance['notes'], str):
        provenance['notes'] = [provenance['notes']]
    provenance['notes'].append(f"Batch 8: {enrichment['notes']}")

    print(f"   ✅ Added Wikidata: {enrichment['q_number']} ({enrichment['wikidata_name']})")
def _has_wikidata(inst: dict) -> bool:
    """Return True when *inst* already carries a Wikidata identifier."""
    return any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in inst.get('identifiers', [])
    )


def main():
    """Run the Batch 8 enrichment: load, back up, enrich, save, report."""
    # Hoisted from mid-function in the original; keep it function-local to
    # avoid touching the module's import block.
    from collections import defaultdict

    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 8 ENRICHMENT (LIBRARIES)")
    print("=" * 80)
    print()

    # Load the previous batch's output as this batch's input.
    input_file = Path('data/instances/chile/chilean_institutions_batch7_enriched.yaml')
    print(f"📖 Loading: {input_file}")
    institutions = load_yaml(input_file)
    print(f"   Loaded {len(institutions)} institutions")
    print()

    # Back up the input before mutating anything.
    backup_file = input_file.with_suffix('.yaml.batch8_backup')
    print(f"💾 Creating backup: {backup_file}")
    save_yaml(institutions, backup_file)
    print()

    # Apply enrichments; failures are reported per entry, not fatal.
    print(f"🔧 Applying {len(BATCH_8_ENRICHMENTS)} enrichments...")
    print()

    enriched_count = 0
    for i, enrichment in enumerate(BATCH_8_ENRICHMENTS, 1):
        print(f"{i}. {enrichment['name']} ({enrichment['city']})")
        try:
            inst = find_institution(institutions, enrichment['name'], enrichment['city'])
            enrich_institution(inst, enrichment)
            enriched_count += 1
        except ValueError as e:
            print(f"   ❌ {e}")
        except Exception as e:
            # Best-effort batch run: report and continue with the next entry.
            print(f"   ❌ Error: {e}")
        print()

    # Save enriched data under the Batch 8 name.
    output_file = Path('data/instances/chile/chilean_institutions_batch8_enriched.yaml')
    print(f"💾 Saving enriched data: {output_file}")
    save_yaml(institutions, output_file)
    print()

    # Summary statistics.
    print("=" * 80)
    print("ENRICHMENT SUMMARY")
    print("=" * 80)
    print()

    total = len(institutions)
    with_wikidata = sum(1 for inst in institutions if _has_wikidata(inst))

    # BUG FIX: guard against an empty input file (ZeroDivisionError in the
    # original percentage computation).
    overall_pct = with_wikidata / total * 100 if total else 0.0

    print(f"Total institutions: {total}")
    print(f"With Wikidata: {with_wikidata} ({overall_pct:.1f}%)")
    print(f"Batch 8 enrichments: {enriched_count}")
    print()

    # Coverage broken down by institution type.
    by_type = defaultdict(lambda: {'total': 0, 'with_wd': 0})
    for inst in institutions:
        stats = by_type[inst.get('institution_type', 'UNKNOWN')]
        stats['total'] += 1
        if _has_wikidata(inst):
            stats['with_wd'] += 1

    print("Coverage by type:")
    for inst_type in sorted(by_type):
        stats = by_type[inst_type]
        pct = stats['with_wd'] / stats['total'] * 100 if stats['total'] > 0 else 0
        status = "✅" if pct == 100 else "⭐" if pct >= 50 else ""
        print(f"  {status} {inst_type}: {stats['with_wd']}/{stats['total']} ({pct:.1f}%)")
    print()

    print("🎉 Batch 8 enrichment complete!")
    print(f"📊 New coverage: {with_wikidata}/{total} ({overall_pct:.1f}%)")
# Script entry point.
if __name__ == '__main__':
    main()
|