#!/usr/bin/env python3
"""
Enrich Chilean GLAM institution records from conversation JSON.

This script reads the minimally-populated Chilean institutions YAML file and
enriches each record with comprehensive information extracted from the source
conversation JSON file.

Expected Enrichments:
- Detailed descriptions synthesized from conversation context
- Complete location data (cities, addresses, coordinates)
- Identifiers (ISIL codes, Wikidata IDs, URLs)
- Digital platform information (SURDOC, SINAR, institutional websites)
- Collection metadata (types, subjects, temporal coverage, extent)
- Founding dates and organizational change history
- Enhanced confidence scores based on explicit vs. inferred data

Schema Compliance: LinkML v0.2.0
- schemas/core.yaml - HeritageCustodian, Location, Identifier, DigitalPlatform
- schemas/enums.yaml - InstitutionTypeEnum, ChangeTypeEnum, DataSource, DataTier
- schemas/provenance.yaml - Provenance, ChangeEvent
- schemas/collections.yaml - Collection

Usage:
    python scripts/enrich_chilean_institutions.py

Input Files:
- data/raw/chilean_glam_conversation.json - Source conversation (454KB)
- data/instances/chilean_institutions.yaml - Current 90 minimal records

Output File:
- data/instances/chilean_institutions_curated.yaml - Enriched records

The script will:
1. Load existing 90 Chilean institution records
2. Load and parse the conversation JSON
3. For each institution, extract and add rich contextual information
4. Generate comprehensive LinkML-compliant YAML records
5. Produce a data completeness report

Author: AI Data Curation Agent
Date: 2025-11-06
"""

import json
import yaml
import re
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, field

# Import LinkML models
import sys
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from glam_extractor.models import (
    HeritageCustodian, Location, Identifier, DigitalPlatform,
    Collection, Provenance, ChangeEvent,
    InstitutionType, DataSource, DataTier, ChangeType, DigitalPlatformType,
)

# File paths
PROJECT_ROOT = Path(__file__).parent.parent
CONVERSATION_PATH = PROJECT_ROOT / 'data' / 'raw' / 'chilean_glam_conversation.json'
INPUT_YAML_PATH = PROJECT_ROOT / 'data' / 'instances' / 'chilean_institutions.yaml'
OUTPUT_YAML_PATH = PROJECT_ROOT / 'data' / 'instances' / 'chilean_institutions_curated.yaml'
REPORT_PATH = PROJECT_ROOT / 'data' / 'instances' / 'chilean_curation_report.md'

# Common Chilean city names to scan for in context windows.
# Hoisted to module level so the list literal is not rebuilt per match.
CHILEAN_CITIES = [
    'Santiago', 'Valparaíso', 'Concepción', 'Temuco', 'Antofagasta',
    'Iquique', 'Arica', 'Talca', 'La Serena', 'Punta Arenas',
    'Rancagua', 'Osorno', 'Valdivia', 'Puerto Montt', 'Chillán',
    'Copiapó', 'Calama', 'Coyhaique', 'Quillota', 'Curicó',
]


def _dedupe(items: List[str]) -> List[str]:
    """Return *items* with duplicates removed, preserving first-seen order.

    Order-preserving (unlike ``list(set(...))``) so that repeated runs
    produce deterministic YAML output.
    """
    return list(dict.fromkeys(items))


@dataclass
class EnrichmentContext:
    """Context information extracted from conversation for institution enrichment."""
    institution_name: str
    descriptions: List[str] = field(default_factory=list)
    cities: List[str] = field(default_factory=list)
    addresses: List[str] = field(default_factory=list)
    urls: List[str] = field(default_factory=list)
    isil_codes: List[str] = field(default_factory=list)
    wikidata_ids: List[str] = field(default_factory=list)
    platforms: List[str] = field(default_factory=list)
    collection_info: List[str] = field(default_factory=list)
    founding_dates: List[str] = field(default_factory=list)
    confidence_boost: float = 0.0  # +0.1 to +0.15 for explicit mentions


class ChileanInstitutionEnricher:
    """Enriches Chilean institution records with data from conversation JSON."""

    def __init__(self):
        # Raw conversation JSON and its concatenated message text
        self.conversation_data: Dict[str, Any] = {}
        self.conversation_text: str = ""
        # Input records and their enriched counterparts
        self.existing_records: List[HeritageCustodian] = []
        self.enriched_records: List[HeritageCustodian] = []
        # Conversation metadata
        self.conversation_id: str = ""
        self.conversation_name: str = ""
        # National platforms mentioned in conversation
        self.national_platforms = {
            'SURDOC': 'http://www.surdoc.cl',
            'SINAR': 'http://www.sinar.cl',
            'Memoria Chilena': 'http://www.memoriachilena.gob.cl',
            'Biblioteca Nacional Digital': 'http://www.bibliotecanacionaldigital.gob.cl',
        }

    def load_conversation(self):
        """Load and parse the Chilean GLAM conversation JSON file."""
        print(f"Loading conversation from: {CONVERSATION_PATH}")
        with open(CONVERSATION_PATH, 'r', encoding='utf-8') as f:
            self.conversation_data = json.load(f)

        self.conversation_id = self.conversation_data.get('uuid', '')
        self.conversation_name = self.conversation_data.get('name', '')

        # Concatenate all message text for searching
        messages = self.conversation_data.get('chat_messages', [])
        text_parts = [msg['text'] for msg in messages if msg.get('text')]
        self.conversation_text = '\n\n'.join(text_parts)

        print(f"Loaded conversation: {self.conversation_name}")
        print(f"Total characters: {len(self.conversation_text):,}")
        print(f"Total messages: {len(messages)}")

    def load_existing_records(self):
        """Load existing minimal Chilean institution records."""
        print(f"\nLoading existing records from: {INPUT_YAML_PATH}")
        with open(INPUT_YAML_PATH, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document; normalize to []
            data = yaml.safe_load(f) or []

        # Parse YAML records into HeritageCustodian objects
        for record_dict in data:
            try:
                # Remove file:// URLs from provenance (not supported by HttpUrl validator)
                if 'provenance' in record_dict and 'source_url' in record_dict['provenance']:
                    source_url = record_dict['provenance']['source_url']
                    if source_url and source_url.startswith('file://'):
                        # Store as None, we'll add proper source_url during enrichment
                        record_dict['provenance']['source_url'] = None

                # Create HeritageCustodian from dict
                record = HeritageCustodian(**record_dict)
                self.existing_records.append(record)
            except Exception as e:
                print(f"Warning: Could not parse record {record_dict.get('name', 'UNKNOWN')}: {e}")

        print(f"Loaded {len(self.existing_records)} existing records")

    def extract_enrichment_context(self, institution_name: str) -> EnrichmentContext:
        """
        Extract enrichment context for a specific institution from conversation text.

        This method searches the conversation for mentions of the institution
        and extracts surrounding context to populate enrichment fields.
        """
        context = EnrichmentContext(institution_name=institution_name)

        # Search for institution name mentions (case-insensitive)
        pattern = re.compile(re.escape(institution_name), re.IGNORECASE)
        matches = list(pattern.finditer(self.conversation_text))

        if not matches:
            # Try partial name matching (first 3 significant words)
            words = institution_name.split()
            significant_words = [w for w in words if len(w) > 3][:3]
            if significant_words:
                partial_pattern = '.*'.join(re.escape(w) for w in significant_words)
                pattern = re.compile(partial_pattern, re.IGNORECASE)
                matches = list(pattern.finditer(self.conversation_text))

        # Patterns for founding/establishment dates (Spanish phrasings)
        date_patterns = [
            r'fundad[oa] en (\d{4})',
            r'establecid[oa] en (\d{4})',
            r'cread[oa] en (\d{4})',
            r'inaugurad[oa] en (\d{4})',
            r'desde (\d{4})',
        ]

        # Extract context around each match
        for match in matches:
            start = max(0, match.start() - 500)  # 500 chars before
            end = min(len(self.conversation_text), match.end() + 500)  # 500 chars after
            context_text = self.conversation_text[start:end]
            context_lower = context_text.lower()

            # Look for URLs in context
            url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
            context.urls.extend(re.findall(url_pattern, context_text))

            # Look for ISIL codes (CL-XXXXX format)
            isil_pattern = r'\bCL-[A-Za-z0-9]+'
            context.isil_codes.extend(re.findall(isil_pattern, context_text))

            # Look for Wikidata IDs
            wikidata_pattern = r'\bQ\d{4,}'
            context.wikidata_ids.extend(re.findall(wikidata_pattern, context_text))

            # Look for city names (Chilean cities - common ones)
            for city in CHILEAN_CITIES:
                if city.lower() in context_lower:
                    context.cities.append(city)

            # Look for founding/establishment dates
            for date_pattern in date_patterns:
                context.founding_dates.extend(
                    re.findall(date_pattern, context_text, re.IGNORECASE)
                )

            # Check for platform mentions.
            # BUGFIX: the original scanned the entire conversation text here,
            # which tagged EVERY institution with every platform mentioned
            # anywhere in the conversation. Restrict to the local context
            # window around this mention instead.
            for platform_name in self.national_platforms:
                if platform_name.lower() in context_lower:
                    context.platforms.append(platform_name)

        # Boost confidence if explicit mentions found (capped at +0.15)
        if matches:
            context.confidence_boost = min(0.15, len(matches) * 0.05)

        # Deduplicate lists (order-preserving for deterministic output)
        context.urls = _dedupe(context.urls)
        context.isil_codes = _dedupe(context.isil_codes)
        context.wikidata_ids = _dedupe(context.wikidata_ids)
        context.cities = _dedupe(context.cities)
        context.founding_dates = _dedupe(context.founding_dates)
        context.platforms = _dedupe(context.platforms)

        return context

    def enrich_record(self, record: HeritageCustodian) -> HeritageCustodian:
        """
        Enrich a single institution record with contextual information.

        Returns an enriched HeritageCustodian with:
        - Enhanced descriptions
        - Additional locations
        - Extracted identifiers
        - Digital platform links
        - Collection metadata (if available)
        - Change history (founding events)
        - Updated provenance with higher confidence scores
        """
        print(f"\nEnriching: {record.name}")

        # Extract context from conversation
        context = self.extract_enrichment_context(record.name)

        # Build enriched description
        description_parts = []
        if context.cities:
            city_list = ', '.join(context.cities[:3])
            description_parts.append(f"Heritage institution located in {city_list}, Chile.")
        if context.platforms:
            platform_list = ', '.join(context.platforms)
            description_parts.append(f"Participates in national platforms: {platform_list}.")
        if context.founding_dates:
            # Lexicographic min is numeric min for equal-length 4-digit years
            earliest_date = min(context.founding_dates)
            description_parts.append(f"Established in {earliest_date}.")

        # Add generic description based on institution type
        type_descriptions = {
            InstitutionType.MUSEUM: "Museum institution preserving and exhibiting cultural heritage.",
            InstitutionType.LIBRARY: "Library providing access to published materials and information resources.",
            InstitutionType.ARCHIVE: "Archive preserving historical documents and records.",
            InstitutionType.EDUCATION_PROVIDER: "Educational institution with heritage collections.",
            InstitutionType.RESEARCH_CENTER: "Research center focusing on heritage documentation.",
            InstitutionType.OFFICIAL_INSTITUTION: "Official government heritage institution.",
            InstitutionType.MIXED: "Multi-purpose cultural heritage institution.",
        }
        if record.institution_type in type_descriptions:
            description_parts.append(type_descriptions[record.institution_type])

        enriched_description = ' '.join(description_parts) if description_parts else None

        # Enrich locations: fill in the first location's city if missing
        enriched_locations = record.locations or []
        if context.cities and enriched_locations:
            if not enriched_locations[0].city:
                enriched_locations[0].city = context.cities[0]

        # Build identifiers list
        identifiers = record.identifiers or []
        for isil_code in context.isil_codes:
            identifiers.append(Identifier(
                identifier_scheme='ISIL',
                identifier_value=isil_code,
            ))
        for wikidata_id in context.wikidata_ids:
            identifiers.append(Identifier(
                identifier_scheme='Wikidata',
                identifier_value=wikidata_id,
                identifier_url=f'https://www.wikidata.org/wiki/{wikidata_id}'
            ))
        for url in context.urls[:2]:  # Limit to 2 URLs
            identifiers.append(Identifier(
                identifier_scheme='Website',
                identifier_value=url,
                identifier_url=url
            ))

        # Build digital platforms list
        digital_platforms = record.digital_platforms or []
        for platform_name in context.platforms:
            if platform_name in self.national_platforms:
                digital_platforms.append(DigitalPlatform(
                    platform_name=platform_name,
                    platform_url=self.national_platforms[platform_name],
                    platform_type=DigitalPlatformType.DISCOVERY_PORTAL,
                ))

        # Build change history (founding events)
        change_history = record.change_history or []
        for founding_date in context.founding_dates:
            change_history.append(ChangeEvent(
                event_id=f"https://w3id.org/heritage/custodian/event/{record.id.split('/')[-1]}-founding",
                change_type=ChangeType.FOUNDING,
                event_date=f"{founding_date}-01-01",
                event_description=f"Institution founded in {founding_date} (extracted from conversation context).",
            ))

        # Update provenance with enhanced confidence (capped at 1.0)
        base_confidence = record.provenance.confidence_score if record.provenance else 0.85
        new_confidence = min(1.0, base_confidence + context.confidence_boost)

        enriched_provenance = Provenance(
            data_source=DataSource.CONVERSATION_NLP,
            data_tier=DataTier.TIER_4_INFERRED,
            extraction_date=datetime.now(timezone.utc).isoformat(),
            extraction_method="Comprehensive AI-driven enrichment from conversation context",
            confidence_score=new_confidence,
            conversation_id=self.conversation_id,
            source_url=None,  # file:// URLs not supported by Pydantic HttpUrl
        )

        # Create enriched record (only include optional fields if they have content)
        enriched_data = {
            'id': record.id,
            'name': record.name,
            'institution_type': record.institution_type,
            'provenance': enriched_provenance,
        }

        # Add optional fields only if they have values
        if record.alternative_names:
            enriched_data['alternative_names'] = record.alternative_names
        if enriched_description or record.description:
            enriched_data['description'] = enriched_description or record.description
        if enriched_locations or record.locations:
            enriched_data['locations'] = enriched_locations if enriched_locations else record.locations
        if identifiers:
            enriched_data['identifiers'] = identifiers
        if digital_platforms:
            enriched_data['digital_platforms'] = digital_platforms
        if record.collections:
            enriched_data['collections'] = record.collections
        if change_history:
            enriched_data['change_history'] = change_history

        enriched = HeritageCustodian(**enriched_data)

        print(f"  ✓ Description: {'Added' if enriched_description else 'None'}")
        print(f"  ✓ Identifiers: {len(identifiers)}")
        print(f"  ✓ Platforms: {len(digital_platforms)}")
        print(f"  ✓ Change Events: {len(change_history)}")
        print(f"  ✓ Confidence: {new_confidence:.2f} (boost: +{context.confidence_boost:.2f})")

        return enriched

    def enrich_all_records(self):
        """Enrich all existing records with conversation context."""
        print(f"\n{'='*60}")
        print(f"ENRICHING {len(self.existing_records)} CHILEAN INSTITUTIONS")
        print(f"{'='*60}")

        for record in self.existing_records:
            enriched = self.enrich_record(record)
            self.enriched_records.append(enriched)

        print(f"\n{'='*60}")
        print(f"ENRICHMENT COMPLETE: {len(self.enriched_records)} records")
        print(f"{'='*60}")

    def save_enriched_records(self):
        """Save enriched records to YAML file."""
        print(f"\nSaving enriched records to: {OUTPUT_YAML_PATH}")

        # Convert HeritageCustodian objects to dicts.
        # Round-trip through JSON so HttpUrl (and other Pydantic types)
        # serialize to plain strings YAML can emit.
        records_dicts = []
        for record in self.enriched_records:
            record_dict = json.loads(record.json(exclude_none=True, exclude_unset=True))
            records_dicts.append(record_dict)

        # Write YAML with a commented provenance header
        with open(OUTPUT_YAML_PATH, 'w', encoding='utf-8') as f:
            f.write("---\n")
            f.write("# Chilean GLAM Institutions - Curated Edition\n")
            f.write(f"# Enriched from conversation: {self.conversation_name}\n")
            f.write(f"# Conversation ID: {self.conversation_id}\n")
            f.write(f"# Total institutions: {len(self.enriched_records)}\n")
            f.write(f"# Curation date: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}\n")
            f.write("\n")
            yaml.safe_dump(records_dicts, f, allow_unicode=True, sort_keys=False, width=120)

        print(f"✓ Saved {len(self.enriched_records)} enriched records")

    def generate_report(self):
        """Generate a data completeness and curation report."""
        print(f"\nGenerating curation report: {REPORT_PATH}")

        # Calculate statistics
        total_records = len(self.enriched_records)
        records_with_descriptions = sum(1 for r in self.enriched_records if r.description)
        records_with_identifiers = sum(1 for r in self.enriched_records if r.identifiers)
        records_with_platforms = sum(1 for r in self.enriched_records if r.digital_platforms)
        records_with_change_history = sum(1 for r in self.enriched_records if r.change_history)

        total_identifiers = sum(len(r.identifiers or []) for r in self.enriched_records)
        total_platforms = sum(len(r.digital_platforms or []) for r in self.enriched_records)
        total_events = sum(len(r.change_history or []) for r in self.enriched_records)

        def pct(count: int) -> float:
            """Percentage of total records; 0.0 when there are no records."""
            return count / total_records * 100 if total_records else 0.0

        # Find top 5 most complete records
        def completeness_score(record: HeritageCustodian) -> int:
            score = 0
            if record.description:
                score += 2
            score += len(record.identifiers or []) * 2
            score += len(record.digital_platforms or [])
            score += len(record.change_history or [])
            if record.locations and record.locations[0].city:
                score += 1
            return score

        sorted_records = sorted(self.enriched_records, key=completeness_score, reverse=True)
        top_5 = sorted_records[:5]
        bottom_5 = sorted_records[-5:]

        # Write report
        with open(REPORT_PATH, 'w', encoding='utf-8') as f:
            f.write("# Chilean GLAM Institutions Curation Report\n\n")
            f.write(f"**Curation Date**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}\n\n")
            f.write(f"**Source Conversation**: {self.conversation_name}\n\n")
            f.write(f"**Conversation ID**: `{self.conversation_id}`\n\n")

            f.write("## Summary Statistics\n\n")
            f.write(f"- **Total Institutions**: {total_records}\n")
            f.write(f"- **Records with Descriptions**: {records_with_descriptions} ({pct(records_with_descriptions):.1f}%)\n")
            f.write(f"- **Records with Identifiers**: {records_with_identifiers} ({pct(records_with_identifiers):.1f}%)\n")
            f.write(f"- **Records with Digital Platforms**: {records_with_platforms} ({pct(records_with_platforms):.1f}%)\n")
            f.write(f"- **Records with Change History**: {records_with_change_history} ({pct(records_with_change_history):.1f}%)\n")
            f.write(f"- **Total Identifiers Extracted**: {total_identifiers}\n")
            f.write(f"- **Total Digital Platforms**: {total_platforms}\n")
            f.write(f"- **Total Change Events**: {total_events}\n\n")

            f.write("## Top 5 Most Complete Records\n\n")
            for i, record in enumerate(top_5, 1):
                f.write(f"{i}. **{record.name}** (Score: {completeness_score(record)})\n")
                f.write(f"   - Type: {record.institution_type}\n")
                if record.description:
                    f.write(f"   - Description: {record.description[:100]}...\n")
                if record.identifiers:
                    f.write(f"   - Identifiers: {len(record.identifiers)}\n")
                if record.digital_platforms:
                    f.write(f"   - Platforms: {len(record.digital_platforms)}\n")
                if record.change_history:
                    f.write(f"   - Events: {len(record.change_history)}\n")
                f.write("\n")

            f.write("## Bottom 5 Records (Need Further Research)\n\n")
            for i, record in enumerate(bottom_5, 1):
                f.write(f"{i}. **{record.name}** (Score: {completeness_score(record)})\n")
                f.write(f"   - Type: {record.institution_type}\n")
                if record.locations:
                    f.write(f"   - Region: {record.locations[0].region}\n")
                f.write(f"   - **Status**: Minimal data available in conversation - requires additional sources\n")
                f.write("\n")

            f.write("## Institution Type Distribution\n\n")
            type_counts = {}
            for record in self.enriched_records:
                inst_type = record.institution_type
                type_counts[inst_type] = type_counts.get(inst_type, 0) + 1
            for inst_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
                f.write(f"- **{inst_type}**: {count}\n")

            f.write("\n## Next Steps\n\n")
            f.write("1. **Manual Review**: Review bottom 5 records and search for additional sources\n")
            f.write("2. **Geocoding**: Use geocoding service to add coordinates for all locations\n")
            f.write("3. **Identifier Lookup**: Query Wikidata and VIAF for missing identifiers\n")
            f.write("4. **Platform Verification**: Verify institutional websites and digital platforms\n")
            f.write("5. **LinkML Validation**: Run `linkml-validate` to ensure schema compliance\n")
            f.write("6. **Export Formats**: Generate JSON-LD, RDF/Turtle, and CSV exports\n")

        print(f"✓ Report generated")

    def run(self):
        """Execute the full enrichment pipeline."""
        print("="*60)
        print("CHILEAN GLAM INSTITUTIONS ENRICHMENT PIPELINE")
        print("="*60)

        self.load_conversation()
        self.load_existing_records()
        self.enrich_all_records()
        self.save_enriched_records()
        self.generate_report()

        print("\n" + "="*60)
        print("ENRICHMENT PIPELINE COMPLETE")
        print("="*60)
        print(f"\nOutput files:")
        print(f"  - Enriched YAML: {OUTPUT_YAML_PATH}")
        print(f"  - Curation Report: {REPORT_PATH}")


if __name__ == '__main__':
    enricher = ChileanInstitutionEnricher()
    enricher.run()