glam/scripts/enrich_chilean_institutions.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

547 lines
24 KiB
Python

#!/usr/bin/env python3
"""
Enrich Chilean GLAM institution records from conversation JSON.
This script reads the minimally-populated Chilean institutions YAML file
and enriches each record with comprehensive information extracted from
the source conversation JSON file.
Expected Enrichments:
- Detailed descriptions synthesized from conversation context
- Complete location data (cities, addresses, coordinates)
- Identifiers (ISIL codes, Wikidata IDs, URLs)
- Digital platform information (SURDOC, SINAR, institutional websites)
- Collection metadata (types, subjects, temporal coverage, extent)
- Founding dates and organizational change history
- Enhanced confidence scores based on explicit vs. inferred data
Schema Compliance: LinkML v0.2.0
- schemas/core.yaml - HeritageCustodian, Location, Identifier, DigitalPlatform
- schemas/enums.yaml - InstitutionTypeEnum, ChangeTypeEnum, DataSource, DataTier
- schemas/provenance.yaml - Provenance, ChangeEvent
- schemas/collections.yaml - Collection
Usage:
python scripts/enrich_chilean_institutions.py
Input Files:
- data/raw/chilean_glam_conversation.json - Source conversation (454KB)
- data/instances/chilean_institutions.yaml - Current 90 minimal records
Output File:
- data/instances/chilean_institutions_curated.yaml - Enriched records
The script will:
1. Load existing 90 Chilean institution records
2. Load and parse the conversation JSON
3. For each institution, extract and add rich contextual information
4. Generate comprehensive LinkML-compliant YAML records
5. Produce a data completeness report
Author: AI Data Curation Agent
Date: 2025-11-06
"""
import json
import yaml
import re
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, field
# Import LinkML models
import sys
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
from glam_extractor.models import (
HeritageCustodian,
Location,
Identifier,
DigitalPlatform,
Collection,
Provenance,
ChangeEvent,
InstitutionType,
DataSource,
DataTier,
ChangeType,
DigitalPlatformType,
)
# File paths
# All paths resolve relative to the repository root (parent of scripts/).
PROJECT_ROOT = Path(__file__).parent.parent
# Source conversation JSON exported from the chat tool (~454 KB per module docstring).
CONVERSATION_PATH = PROJECT_ROOT / 'data' / 'raw' / 'chilean_glam_conversation.json'
# Input: the minimally-populated Chilean institution records.
INPUT_YAML_PATH = PROJECT_ROOT / 'data' / 'instances' / 'chilean_institutions.yaml'
# Output: enriched records written by save_enriched_records().
OUTPUT_YAML_PATH = PROJECT_ROOT / 'data' / 'instances' / 'chilean_institutions_curated.yaml'
# Output: Markdown completeness report written by generate_report().
REPORT_PATH = PROJECT_ROOT / 'data' / 'instances' / 'chilean_curation_report.md'
@dataclass
class EnrichmentContext:
    """Context information extracted from conversation for institution enrichment.

    One instance is built per institution by
    ``ChileanInstitutionEnricher.extract_enrichment_context``; the list
    fields are de-duplicated before the instance is returned.
    """
    # Exact institution name this context was gathered for.
    institution_name: str
    # Description snippets near mentions (currently never populated by the extractor).
    descriptions: List[str] = field(default_factory=list)
    # Chilean city names spotted in the text window around each mention.
    cities: List[str] = field(default_factory=list)
    # Street addresses (currently never populated by the extractor).
    addresses: List[str] = field(default_factory=list)
    # http(s) URLs found near mentions.
    urls: List[str] = field(default_factory=list)
    # ISIL identifiers in CL-... form.
    isil_codes: List[str] = field(default_factory=list)
    # Wikidata Q-identifiers (Q followed by 4+ digits).
    wikidata_ids: List[str] = field(default_factory=list)
    # National platform names (SURDOC, SINAR, ...) mentioned in the conversation.
    platforms: List[str] = field(default_factory=list)
    # Collection metadata snippets (currently never populated by the extractor).
    collection_info: List[str] = field(default_factory=list)
    # Four-digit founding/establishment years, kept as strings.
    founding_dates: List[str] = field(default_factory=list)
    # Additive confidence adjustment: 0.05 per explicit mention, capped at 0.15.
    confidence_boost: float = 0.0
class ChileanInstitutionEnricher:
"""Enriches Chilean institution records with data from conversation JSON."""
def __init__(self):
self.conversation_data: Dict[str, Any] = {}
self.conversation_text: str = ""
self.existing_records: List[HeritageCustodian] = []
self.enriched_records: List[HeritageCustodian] = []
# Conversation metadata
self.conversation_id: str = ""
self.conversation_name: str = ""
# National platforms mentioned in conversation
self.national_platforms = {
'SURDOC': 'http://www.surdoc.cl',
'SINAR': 'http://www.sinar.cl',
'Memoria Chilena': 'http://www.memoriachilena.gob.cl',
'Biblioteca Nacional Digital': 'http://www.bibliotecanacionaldigital.gob.cl',
}
def load_conversation(self):
"""Load and parse the Chilean GLAM conversation JSON file."""
print(f"Loading conversation from: {CONVERSATION_PATH}")
with open(CONVERSATION_PATH, 'r', encoding='utf-8') as f:
self.conversation_data = json.load(f)
self.conversation_id = self.conversation_data.get('uuid', '')
self.conversation_name = self.conversation_data.get('name', '')
# Concatenate all message text for searching
messages = self.conversation_data.get('chat_messages', [])
text_parts = []
for msg in messages:
if 'text' in msg and msg['text']:
text_parts.append(msg['text'])
self.conversation_text = '\n\n'.join(text_parts)
print(f"Loaded conversation: {self.conversation_name}")
print(f"Total characters: {len(self.conversation_text):,}")
print(f"Total messages: {len(messages)}")
def load_existing_records(self):
"""Load existing minimal Chilean institution records."""
print(f"\nLoading existing records from: {INPUT_YAML_PATH}")
with open(INPUT_YAML_PATH, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
# Parse YAML records into HeritageCustodian objects
for record_dict in data:
try:
# Remove file:// URLs from provenance (not supported by HttpUrl validator)
if 'provenance' in record_dict and 'source_url' in record_dict['provenance']:
source_url = record_dict['provenance']['source_url']
if source_url and source_url.startswith('file://'):
# Store as None, we'll add proper source_url during enrichment
record_dict['provenance']['source_url'] = None
# Create HeritageCustodian from dict
record = HeritageCustodian(**record_dict)
self.existing_records.append(record)
except Exception as e:
print(f"Warning: Could not parse record {record_dict.get('name', 'UNKNOWN')}: {e}")
print(f"Loaded {len(self.existing_records)} existing records")
def extract_enrichment_context(self, institution_name: str) -> EnrichmentContext:
"""
Extract enrichment context for a specific institution from conversation text.
This method searches the conversation for mentions of the institution
and extracts surrounding context to populate enrichment fields.
"""
context = EnrichmentContext(institution_name=institution_name)
# Search for institution name mentions (case-insensitive)
pattern = re.compile(re.escape(institution_name), re.IGNORECASE)
matches = list(pattern.finditer(self.conversation_text))
if not matches:
# Try partial name matching (first 3 significant words)
words = institution_name.split()
significant_words = [w for w in words if len(w) > 3][:3]
if significant_words:
partial_pattern = '.*'.join(re.escape(w) for w in significant_words)
pattern = re.compile(partial_pattern, re.IGNORECASE)
matches = list(pattern.finditer(self.conversation_text))
# Extract context around each match
for match in matches:
start = max(0, match.start() - 500) # 500 chars before
end = min(len(self.conversation_text), match.end() + 500) # 500 chars after
context_text = self.conversation_text[start:end]
# Look for URLs in context
url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
urls = re.findall(url_pattern, context_text)
context.urls.extend(urls)
# Look for ISIL codes (CL-XXXXX format)
isil_pattern = r'\bCL-[A-Za-z0-9]+'
isil_codes = re.findall(isil_pattern, context_text)
context.isil_codes.extend(isil_codes)
# Look for Wikidata IDs
wikidata_pattern = r'\bQ\d{4,}'
wikidata_ids = re.findall(wikidata_pattern, context_text)
context.wikidata_ids.extend(wikidata_ids)
# Look for city names (Chilean cities - common ones)
chilean_cities = [
'Santiago', 'Valparaíso', 'Concepción', 'Temuco', 'Antofagasta',
'Iquique', 'Arica', 'Talca', 'La Serena', 'Punta Arenas',
'Rancagua', 'Osorno', 'Valdivia', 'Puerto Montt', 'Chillán',
'Copiapó', 'Calama', 'Coyhaique', 'Quillota', 'Curicó'
]
for city in chilean_cities:
if city.lower() in context_text.lower():
context.cities.append(city)
# Look for founding/establishment dates
date_patterns = [
r'fundad[oa] en (\d{4})',
r'establecid[oa] en (\d{4})',
r'cread[oa] en (\d{4})',
r'inaugurad[oa] en (\d{4})',
r'desde (\d{4})',
]
for date_pattern in date_patterns:
dates = re.findall(date_pattern, context_text, re.IGNORECASE)
context.founding_dates.extend(dates)
# Check for platform mentions
for platform_name, platform_url in self.national_platforms.items():
if platform_name.lower() in self.conversation_text.lower():
context.platforms.append(platform_name)
# Boost confidence if explicit mentions found
if matches:
context.confidence_boost = min(0.15, len(matches) * 0.05)
# Deduplicate lists
context.urls = list(set(context.urls))
context.isil_codes = list(set(context.isil_codes))
context.wikidata_ids = list(set(context.wikidata_ids))
context.cities = list(set(context.cities))
context.founding_dates = list(set(context.founding_dates))
context.platforms = list(set(context.platforms))
return context
    def enrich_record(self, record: HeritageCustodian) -> HeritageCustodian:
        """
        Enrich a single institution record with contextual information.

        Args:
            record: Minimal HeritageCustodian loaded from the input YAML.

        Returns an enriched HeritageCustodian with:
        - Enhanced descriptions
        - Additional locations
        - Extracted identifiers
        - Digital platform links
        - Collection metadata (if available)
        - Change history (founding events)
        - Updated provenance with higher confidence scores
        """
        print(f"\nEnriching: {record.name}")
        # Extract context from conversation
        context = self.extract_enrichment_context(record.name)
        # Build enriched description from the strongest signals first
        # (cities, platforms, founding year), then a generic type blurb.
        description_parts = []
        if context.cities:
            city_list = ', '.join(context.cities[:3])
            description_parts.append(f"Heritage institution located in {city_list}, Chile.")
        if context.platforms:
            platform_list = ', '.join(context.platforms)
            description_parts.append(f"Participates in national platforms: {platform_list}.")
        if context.founding_dates:
            # Dates are 4-digit year strings, so lexicographic min() picks
            # the earliest year.
            earliest_date = min(context.founding_dates)
            description_parts.append(f"Established in {earliest_date}.")
        # Add generic description based on institution type
        type_descriptions = {
            InstitutionType.MUSEUM: "Museum institution preserving and exhibiting cultural heritage.",
            InstitutionType.LIBRARY: "Library providing access to published materials and information resources.",
            InstitutionType.ARCHIVE: "Archive preserving historical documents and records.",
            InstitutionType.EDUCATION_PROVIDER: "Educational institution with heritage collections.",
            InstitutionType.RESEARCH_CENTER: "Research center focusing on heritage documentation.",
            InstitutionType.OFFICIAL_INSTITUTION: "Official government heritage institution.",
            InstitutionType.MIXED: "Multi-purpose cultural heritage institution.",
        }
        if record.institution_type in type_descriptions:
            description_parts.append(type_descriptions[record.institution_type])
        enriched_description = ' '.join(description_parts) if description_parts else None
        # Enrich locations.
        # NOTE(review): this aliases record.locations (no copy), so the city
        # assignment below also mutates the input record in place — confirm
        # that is intended.
        enriched_locations = record.locations or []
        if context.cities and enriched_locations:
            # Update first location with city if missing
            if not enriched_locations[0].city and context.cities:
                enriched_locations[0].city = context.cities[0]
        # Build identifiers list: ISIL codes, Wikidata IDs, then website URLs.
        identifiers = record.identifiers or []
        for isil_code in context.isil_codes:
            identifiers.append(Identifier(
                identifier_scheme='ISIL',
                identifier_value=isil_code,
                identifier_url=f'https://isil.org/{isil_code}'
            ))
        for wikidata_id in context.wikidata_ids:
            identifiers.append(Identifier(
                identifier_scheme='Wikidata',
                identifier_value=wikidata_id,
                identifier_url=f'https://www.wikidata.org/wiki/{wikidata_id}'
            ))
        for url in context.urls[:2]:  # Limit to 2 URLs
            identifiers.append(Identifier(
                identifier_scheme='Website',
                identifier_value=url,
                identifier_url=url
            ))
        # Build digital platforms list (only known national platforms).
        digital_platforms = record.digital_platforms or []
        for platform_name in context.platforms:
            if platform_name in self.national_platforms:
                digital_platforms.append(DigitalPlatform(
                    platform_name=platform_name,
                    platform_url=self.national_platforms[platform_name],
                    platform_type=DigitalPlatformType.DISCOVERY_PORTAL,
                ))
        # Build change history (founding events).
        # NOTE(review): event_id always ends in "-founding", so if more than
        # one founding year was extracted the events share an id — confirm
        # whether ids must be unique.
        change_history = record.change_history or []
        for founding_date in context.founding_dates:
            change_history.append(ChangeEvent(
                event_id=f"https://w3id.org/heritage/custodian/event/{record.id.split('/')[-1]}-founding",
                change_type=ChangeType.FOUNDING,
                event_date=f"{founding_date}-01-01",  # only the year is known
                event_description=f"Institution founded in {founding_date} (extracted from conversation context).",
            ))
        # Update provenance with enhanced confidence. Note this REPLACES the
        # original provenance object: data_source/data_tier are reset to the
        # conversation-NLP values regardless of what the input record had.
        base_confidence = record.provenance.confidence_score if record.provenance else 0.85
        new_confidence = min(1.0, base_confidence + context.confidence_boost)
        enriched_provenance = Provenance(
            data_source=DataSource.CONVERSATION_NLP,
            data_tier=DataTier.TIER_4_INFERRED,
            extraction_date=datetime.now(timezone.utc).isoformat(),
            extraction_method="Comprehensive AI-driven enrichment from conversation context",
            confidence_score=new_confidence,
            conversation_id=self.conversation_id,
            source_url=None,  # file:// URLs not supported by Pydantic HttpUrl
        )
        # Create enriched record (only include optional fields if they have content)
        enriched_data = {
            'id': record.id,
            'name': record.name,
            'institution_type': record.institution_type,
            'provenance': enriched_provenance,
        }
        # Add optional fields only if they have values
        if record.alternative_names:
            enriched_data['alternative_names'] = record.alternative_names
        if enriched_description or record.description:
            enriched_data['description'] = enriched_description or record.description
        if enriched_locations or record.locations:
            enriched_data['locations'] = enriched_locations if enriched_locations else record.locations
        if identifiers:
            enriched_data['identifiers'] = identifiers
        if digital_platforms:
            enriched_data['digital_platforms'] = digital_platforms
        if record.collections:
            enriched_data['collections'] = record.collections
        if change_history:
            enriched_data['change_history'] = change_history
        enriched = HeritageCustodian(**enriched_data)
        # Per-record enrichment summary for the console log.
        print(f" ✓ Description: {'Added' if enriched_description else 'None'}")
        print(f" ✓ Identifiers: {len(identifiers)}")
        print(f" ✓ Platforms: {len(digital_platforms)}")
        print(f" ✓ Change Events: {len(change_history)}")
        print(f" ✓ Confidence: {new_confidence:.2f} (boost: +{context.confidence_boost:.2f})")
        return enriched
def enrich_all_records(self):
"""Enrich all existing records with conversation context."""
print(f"\n{'='*60}")
print(f"ENRICHING {len(self.existing_records)} CHILEAN INSTITUTIONS")
print(f"{'='*60}")
for record in self.existing_records:
enriched = self.enrich_record(record)
self.enriched_records.append(enriched)
print(f"\n{'='*60}")
print(f"ENRICHMENT COMPLETE: {len(self.enriched_records)} records")
print(f"{'='*60}")
def save_enriched_records(self):
"""Save enriched records to YAML file."""
print(f"\nSaving enriched records to: {OUTPUT_YAML_PATH}")
# Convert HeritageCustodian objects to dicts
records_dicts = []
for record in self.enriched_records:
# Use dict() with mode='json' to convert HttpUrl to str
record_dict = json.loads(record.json(exclude_none=True, exclude_unset=True))
records_dicts.append(record_dict)
# Write YAML
with open(OUTPUT_YAML_PATH, 'w', encoding='utf-8') as f:
f.write("---\n")
f.write("# Chilean GLAM Institutions - Curated Edition\n")
f.write(f"# Enriched from conversation: {self.conversation_name}\n")
f.write(f"# Conversation ID: {self.conversation_id}\n")
f.write(f"# Total institutions: {len(self.enriched_records)}\n")
f.write(f"# Curation date: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}\n")
f.write("\n")
yaml.safe_dump(records_dicts, f, allow_unicode=True, sort_keys=False, width=120)
print(f"✓ Saved {len(self.enriched_records)} enriched records")
def generate_report(self):
"""Generate a data completeness and curation report."""
print(f"\nGenerating curation report: {REPORT_PATH}")
# Calculate statistics
total_records = len(self.enriched_records)
records_with_descriptions = sum(1 for r in self.enriched_records if r.description)
records_with_identifiers = sum(1 for r in self.enriched_records if r.identifiers)
records_with_platforms = sum(1 for r in self.enriched_records if r.digital_platforms)
records_with_change_history = sum(1 for r in self.enriched_records if r.change_history)
total_identifiers = sum(len(r.identifiers or []) for r in self.enriched_records)
total_platforms = sum(len(r.digital_platforms or []) for r in self.enriched_records)
total_events = sum(len(r.change_history or []) for r in self.enriched_records)
# Find top 5 most complete records
def completeness_score(record: HeritageCustodian) -> int:
score = 0
if record.description: score += 2
score += len(record.identifiers or []) * 2
score += len(record.digital_platforms or [])
score += len(record.change_history or [])
if record.locations and record.locations[0].city: score += 1
return score
sorted_records = sorted(self.enriched_records, key=completeness_score, reverse=True)
top_5 = sorted_records[:5]
bottom_5 = sorted_records[-5:]
# Write report
with open(REPORT_PATH, 'w', encoding='utf-8') as f:
f.write("# Chilean GLAM Institutions Curation Report\n\n")
f.write(f"**Curation Date**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}\n\n")
f.write(f"**Source Conversation**: {self.conversation_name}\n\n")
f.write(f"**Conversation ID**: `{self.conversation_id}`\n\n")
f.write("## Summary Statistics\n\n")
f.write(f"- **Total Institutions**: {total_records}\n")
f.write(f"- **Records with Descriptions**: {records_with_descriptions} ({records_with_descriptions/total_records*100:.1f}%)\n")
f.write(f"- **Records with Identifiers**: {records_with_identifiers} ({records_with_identifiers/total_records*100:.1f}%)\n")
f.write(f"- **Records with Digital Platforms**: {records_with_platforms} ({records_with_platforms/total_records*100:.1f}%)\n")
f.write(f"- **Records with Change History**: {records_with_change_history} ({records_with_change_history/total_records*100:.1f}%)\n")
f.write(f"- **Total Identifiers Extracted**: {total_identifiers}\n")
f.write(f"- **Total Digital Platforms**: {total_platforms}\n")
f.write(f"- **Total Change Events**: {total_events}\n\n")
f.write("## Top 5 Most Complete Records\n\n")
for i, record in enumerate(top_5, 1):
f.write(f"{i}. **{record.name}** (Score: {completeness_score(record)})\n")
f.write(f" - Type: {record.institution_type}\n")
if record.description:
f.write(f" - Description: {record.description[:100]}...\n")
if record.identifiers:
f.write(f" - Identifiers: {len(record.identifiers)}\n")
if record.digital_platforms:
f.write(f" - Platforms: {len(record.digital_platforms)}\n")
if record.change_history:
f.write(f" - Events: {len(record.change_history)}\n")
f.write("\n")
f.write("## Bottom 5 Records (Need Further Research)\n\n")
for i, record in enumerate(bottom_5, 1):
f.write(f"{i}. **{record.name}** (Score: {completeness_score(record)})\n")
f.write(f" - Type: {record.institution_type}\n")
if record.locations:
f.write(f" - Region: {record.locations[0].region}\n")
f.write(f" - **Status**: Minimal data available in conversation - requires additional sources\n")
f.write("\n")
f.write("## Institution Type Distribution\n\n")
type_counts = {}
for record in self.enriched_records:
inst_type = record.institution_type
type_counts[inst_type] = type_counts.get(inst_type, 0) + 1
for inst_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
f.write(f"- **{inst_type}**: {count}\n")
f.write("\n## Next Steps\n\n")
f.write("1. **Manual Review**: Review bottom 5 records and search for additional sources\n")
f.write("2. **Geocoding**: Use geocoding service to add coordinates for all locations\n")
f.write("3. **Identifier Lookup**: Query Wikidata and VIAF for missing identifiers\n")
f.write("4. **Platform Verification**: Verify institutional websites and digital platforms\n")
f.write("5. **LinkML Validation**: Run `linkml-validate` to ensure schema compliance\n")
f.write("6. **Export Formats**: Generate JSON-LD, RDF/Turtle, and CSV exports\n")
print(f"✓ Report generated")
def run(self):
"""Execute the full enrichment pipeline."""
print("="*60)
print("CHILEAN GLAM INSTITUTIONS ENRICHMENT PIPELINE")
print("="*60)
self.load_conversation()
self.load_existing_records()
self.enrich_all_records()
self.save_enriched_records()
self.generate_report()
print("\n" + "="*60)
print("ENRICHMENT PIPELINE COMPLETE")
print("="*60)
print(f"\nOutput files:")
print(f" - Enriched YAML: {OUTPUT_YAML_PATH}")
print(f" - Curation Report: {REPORT_PATH}")
if __name__ == '__main__':
    # Script entry point: build the enricher and run the whole pipeline
    # (load conversation -> load records -> enrich -> save -> report).
    enricher = ChileanInstitutionEnricher()
    enricher.run()