547 lines
24 KiB
Python
547 lines
24 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Chilean GLAM institution records from conversation JSON.
|
|
|
|
This script reads the minimally-populated Chilean institutions YAML file
|
|
and enriches each record with comprehensive information extracted from
|
|
the source conversation JSON file.
|
|
|
|
Expected Enrichments:
|
|
- Detailed descriptions synthesized from conversation context
|
|
- Complete location data (cities, addresses, coordinates)
|
|
- Identifiers (ISIL codes, Wikidata IDs, URLs)
|
|
- Digital platform information (SURDOC, SINAR, institutional websites)
|
|
- Collection metadata (types, subjects, temporal coverage, extent)
|
|
- Founding dates and organizational change history
|
|
- Enhanced confidence scores based on explicit vs. inferred data
|
|
|
|
Schema Compliance: LinkML v0.2.0
|
|
- schemas/core.yaml - HeritageCustodian, Location, Identifier, DigitalPlatform
|
|
- schemas/enums.yaml - InstitutionTypeEnum, ChangeTypeEnum, DataSource, DataTier
|
|
- schemas/provenance.yaml - Provenance, ChangeEvent
|
|
- schemas/collections.yaml - Collection
|
|
|
|
Usage:
|
|
python scripts/enrich_chilean_institutions.py
|
|
|
|
Input Files:
|
|
- data/raw/chilean_glam_conversation.json - Source conversation (454KB)
|
|
- data/instances/chilean_institutions.yaml - Current 90 minimal records
|
|
|
|
Output File:
|
|
- data/instances/chilean_institutions_curated.yaml - Enriched records
|
|
|
|
The script will:
|
|
1. Load existing 90 Chilean institution records
|
|
2. Load and parse the conversation JSON
|
|
3. For each institution, extract and add rich contextual information
|
|
4. Generate comprehensive LinkML-compliant YAML records
|
|
5. Produce a data completeness report
|
|
|
|
Author: AI Data Curation Agent
|
|
Date: 2025-11-06
|
|
"""
|
|
|
|
import json
|
|
import yaml
|
|
import re
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Any, Optional
|
|
from dataclasses import dataclass, field
|
|
|
|
# Import LinkML models
|
|
import sys
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
|
|
|
|
from glam_extractor.models import (
|
|
HeritageCustodian,
|
|
Location,
|
|
Identifier,
|
|
DigitalPlatform,
|
|
Collection,
|
|
Provenance,
|
|
ChangeEvent,
|
|
InstitutionType,
|
|
DataSource,
|
|
DataTier,
|
|
ChangeType,
|
|
DigitalPlatformType,
|
|
)
|
|
|
|
# File paths
|
|
PROJECT_ROOT = Path(__file__).parent.parent
|
|
CONVERSATION_PATH = PROJECT_ROOT / 'data' / 'raw' / 'chilean_glam_conversation.json'
|
|
INPUT_YAML_PATH = PROJECT_ROOT / 'data' / 'instances' / 'chilean_institutions.yaml'
|
|
OUTPUT_YAML_PATH = PROJECT_ROOT / 'data' / 'instances' / 'chilean_institutions_curated.yaml'
|
|
REPORT_PATH = PROJECT_ROOT / 'data' / 'instances' / 'chilean_curation_report.md'
|
|
|
|
|
|
@dataclass
|
|
class EnrichmentContext:
|
|
"""Context information extracted from conversation for institution enrichment."""
|
|
institution_name: str
|
|
descriptions: List[str] = field(default_factory=list)
|
|
cities: List[str] = field(default_factory=list)
|
|
addresses: List[str] = field(default_factory=list)
|
|
urls: List[str] = field(default_factory=list)
|
|
isil_codes: List[str] = field(default_factory=list)
|
|
wikidata_ids: List[str] = field(default_factory=list)
|
|
platforms: List[str] = field(default_factory=list)
|
|
collection_info: List[str] = field(default_factory=list)
|
|
founding_dates: List[str] = field(default_factory=list)
|
|
confidence_boost: float = 0.0 # +0.1 to +0.15 for explicit mentions
|
|
|
|
|
|
class ChileanInstitutionEnricher:
|
|
"""Enriches Chilean institution records with data from conversation JSON."""
|
|
|
|
def __init__(self):
|
|
self.conversation_data: Dict[str, Any] = {}
|
|
self.conversation_text: str = ""
|
|
self.existing_records: List[HeritageCustodian] = []
|
|
self.enriched_records: List[HeritageCustodian] = []
|
|
|
|
# Conversation metadata
|
|
self.conversation_id: str = ""
|
|
self.conversation_name: str = ""
|
|
|
|
# National platforms mentioned in conversation
|
|
self.national_platforms = {
|
|
'SURDOC': 'http://www.surdoc.cl',
|
|
'SINAR': 'http://www.sinar.cl',
|
|
'Memoria Chilena': 'http://www.memoriachilena.gob.cl',
|
|
'Biblioteca Nacional Digital': 'http://www.bibliotecanacionaldigital.gob.cl',
|
|
}
|
|
|
|
def load_conversation(self):
|
|
"""Load and parse the Chilean GLAM conversation JSON file."""
|
|
print(f"Loading conversation from: {CONVERSATION_PATH}")
|
|
with open(CONVERSATION_PATH, 'r', encoding='utf-8') as f:
|
|
self.conversation_data = json.load(f)
|
|
|
|
self.conversation_id = self.conversation_data.get('uuid', '')
|
|
self.conversation_name = self.conversation_data.get('name', '')
|
|
|
|
# Concatenate all message text for searching
|
|
messages = self.conversation_data.get('chat_messages', [])
|
|
text_parts = []
|
|
for msg in messages:
|
|
if 'text' in msg and msg['text']:
|
|
text_parts.append(msg['text'])
|
|
|
|
self.conversation_text = '\n\n'.join(text_parts)
|
|
print(f"Loaded conversation: {self.conversation_name}")
|
|
print(f"Total characters: {len(self.conversation_text):,}")
|
|
print(f"Total messages: {len(messages)}")
|
|
|
|
def load_existing_records(self):
|
|
"""Load existing minimal Chilean institution records."""
|
|
print(f"\nLoading existing records from: {INPUT_YAML_PATH}")
|
|
with open(INPUT_YAML_PATH, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
|
|
# Parse YAML records into HeritageCustodian objects
|
|
for record_dict in data:
|
|
try:
|
|
# Remove file:// URLs from provenance (not supported by HttpUrl validator)
|
|
if 'provenance' in record_dict and 'source_url' in record_dict['provenance']:
|
|
source_url = record_dict['provenance']['source_url']
|
|
if source_url and source_url.startswith('file://'):
|
|
# Store as None, we'll add proper source_url during enrichment
|
|
record_dict['provenance']['source_url'] = None
|
|
|
|
# Create HeritageCustodian from dict
|
|
record = HeritageCustodian(**record_dict)
|
|
self.existing_records.append(record)
|
|
except Exception as e:
|
|
print(f"Warning: Could not parse record {record_dict.get('name', 'UNKNOWN')}: {e}")
|
|
|
|
print(f"Loaded {len(self.existing_records)} existing records")
|
|
|
|
def extract_enrichment_context(self, institution_name: str) -> EnrichmentContext:
|
|
"""
|
|
Extract enrichment context for a specific institution from conversation text.
|
|
|
|
This method searches the conversation for mentions of the institution
|
|
and extracts surrounding context to populate enrichment fields.
|
|
"""
|
|
context = EnrichmentContext(institution_name=institution_name)
|
|
|
|
# Search for institution name mentions (case-insensitive)
|
|
pattern = re.compile(re.escape(institution_name), re.IGNORECASE)
|
|
matches = list(pattern.finditer(self.conversation_text))
|
|
|
|
if not matches:
|
|
# Try partial name matching (first 3 significant words)
|
|
words = institution_name.split()
|
|
significant_words = [w for w in words if len(w) > 3][:3]
|
|
if significant_words:
|
|
partial_pattern = '.*'.join(re.escape(w) for w in significant_words)
|
|
pattern = re.compile(partial_pattern, re.IGNORECASE)
|
|
matches = list(pattern.finditer(self.conversation_text))
|
|
|
|
# Extract context around each match
|
|
for match in matches:
|
|
start = max(0, match.start() - 500) # 500 chars before
|
|
end = min(len(self.conversation_text), match.end() + 500) # 500 chars after
|
|
context_text = self.conversation_text[start:end]
|
|
|
|
# Look for URLs in context
|
|
url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
|
|
urls = re.findall(url_pattern, context_text)
|
|
context.urls.extend(urls)
|
|
|
|
# Look for ISIL codes (CL-XXXXX format)
|
|
isil_pattern = r'\bCL-[A-Za-z0-9]+'
|
|
isil_codes = re.findall(isil_pattern, context_text)
|
|
context.isil_codes.extend(isil_codes)
|
|
|
|
# Look for Wikidata IDs
|
|
wikidata_pattern = r'\bQ\d{4,}'
|
|
wikidata_ids = re.findall(wikidata_pattern, context_text)
|
|
context.wikidata_ids.extend(wikidata_ids)
|
|
|
|
# Look for city names (Chilean cities - common ones)
|
|
chilean_cities = [
|
|
'Santiago', 'Valparaíso', 'Concepción', 'Temuco', 'Antofagasta',
|
|
'Iquique', 'Arica', 'Talca', 'La Serena', 'Punta Arenas',
|
|
'Rancagua', 'Osorno', 'Valdivia', 'Puerto Montt', 'Chillán',
|
|
'Copiapó', 'Calama', 'Coyhaique', 'Quillota', 'Curicó'
|
|
]
|
|
for city in chilean_cities:
|
|
if city.lower() in context_text.lower():
|
|
context.cities.append(city)
|
|
|
|
# Look for founding/establishment dates
|
|
date_patterns = [
|
|
r'fundad[oa] en (\d{4})',
|
|
r'establecid[oa] en (\d{4})',
|
|
r'cread[oa] en (\d{4})',
|
|
r'inaugurad[oa] en (\d{4})',
|
|
r'desde (\d{4})',
|
|
]
|
|
for date_pattern in date_patterns:
|
|
dates = re.findall(date_pattern, context_text, re.IGNORECASE)
|
|
context.founding_dates.extend(dates)
|
|
|
|
# Check for platform mentions
|
|
for platform_name, platform_url in self.national_platforms.items():
|
|
if platform_name.lower() in self.conversation_text.lower():
|
|
context.platforms.append(platform_name)
|
|
|
|
# Boost confidence if explicit mentions found
|
|
if matches:
|
|
context.confidence_boost = min(0.15, len(matches) * 0.05)
|
|
|
|
# Deduplicate lists
|
|
context.urls = list(set(context.urls))
|
|
context.isil_codes = list(set(context.isil_codes))
|
|
context.wikidata_ids = list(set(context.wikidata_ids))
|
|
context.cities = list(set(context.cities))
|
|
context.founding_dates = list(set(context.founding_dates))
|
|
context.platforms = list(set(context.platforms))
|
|
|
|
return context
|
|
|
|
def enrich_record(self, record: HeritageCustodian) -> HeritageCustodian:
|
|
"""
|
|
Enrich a single institution record with contextual information.
|
|
|
|
Returns an enriched HeritageCustodian with:
|
|
- Enhanced descriptions
|
|
- Additional locations
|
|
- Extracted identifiers
|
|
- Digital platform links
|
|
- Collection metadata (if available)
|
|
- Change history (founding events)
|
|
- Updated provenance with higher confidence scores
|
|
"""
|
|
print(f"\nEnriching: {record.name}")
|
|
|
|
# Extract context from conversation
|
|
context = self.extract_enrichment_context(record.name)
|
|
|
|
# Build enriched description
|
|
description_parts = []
|
|
if context.cities:
|
|
city_list = ', '.join(context.cities[:3])
|
|
description_parts.append(f"Heritage institution located in {city_list}, Chile.")
|
|
|
|
if context.platforms:
|
|
platform_list = ', '.join(context.platforms)
|
|
description_parts.append(f"Participates in national platforms: {platform_list}.")
|
|
|
|
if context.founding_dates:
|
|
earliest_date = min(context.founding_dates)
|
|
description_parts.append(f"Established in {earliest_date}.")
|
|
|
|
# Add generic description based on institution type
|
|
type_descriptions = {
|
|
InstitutionType.MUSEUM: "Museum institution preserving and exhibiting cultural heritage.",
|
|
InstitutionType.LIBRARY: "Library providing access to published materials and information resources.",
|
|
InstitutionType.ARCHIVE: "Archive preserving historical documents and records.",
|
|
InstitutionType.EDUCATION_PROVIDER: "Educational institution with heritage collections.",
|
|
InstitutionType.RESEARCH_CENTER: "Research center focusing on heritage documentation.",
|
|
InstitutionType.OFFICIAL_INSTITUTION: "Official government heritage institution.",
|
|
InstitutionType.MIXED: "Multi-purpose cultural heritage institution.",
|
|
}
|
|
|
|
if record.institution_type in type_descriptions:
|
|
description_parts.append(type_descriptions[record.institution_type])
|
|
|
|
enriched_description = ' '.join(description_parts) if description_parts else None
|
|
|
|
# Enrich locations
|
|
enriched_locations = record.locations or []
|
|
if context.cities and enriched_locations:
|
|
# Update first location with city if missing
|
|
if not enriched_locations[0].city and context.cities:
|
|
enriched_locations[0].city = context.cities[0]
|
|
|
|
# Build identifiers list
|
|
identifiers = record.identifiers or []
|
|
for isil_code in context.isil_codes:
|
|
identifiers.append(Identifier(
|
|
identifier_scheme='ISIL',
|
|
identifier_value=isil_code,
|
|
|
|
))
|
|
|
|
for wikidata_id in context.wikidata_ids:
|
|
identifiers.append(Identifier(
|
|
identifier_scheme='Wikidata',
|
|
identifier_value=wikidata_id,
|
|
identifier_url=f'https://www.wikidata.org/wiki/{wikidata_id}'
|
|
))
|
|
|
|
for url in context.urls[:2]: # Limit to 2 URLs
|
|
identifiers.append(Identifier(
|
|
identifier_scheme='Website',
|
|
identifier_value=url,
|
|
identifier_url=url
|
|
))
|
|
|
|
# Build digital platforms list
|
|
digital_platforms = record.digital_platforms or []
|
|
for platform_name in context.platforms:
|
|
if platform_name in self.national_platforms:
|
|
digital_platforms.append(DigitalPlatform(
|
|
platform_name=platform_name,
|
|
platform_url=self.national_platforms[platform_name],
|
|
platform_type=DigitalPlatformType.DISCOVERY_PORTAL,
|
|
))
|
|
|
|
# Build change history (founding events)
|
|
change_history = record.change_history or []
|
|
for founding_date in context.founding_dates:
|
|
change_history.append(ChangeEvent(
|
|
event_id=f"https://w3id.org/heritage/custodian/event/{record.id.split('/')[-1]}-founding",
|
|
change_type=ChangeType.FOUNDING,
|
|
event_date=f"{founding_date}-01-01",
|
|
event_description=f"Institution founded in {founding_date} (extracted from conversation context).",
|
|
))
|
|
|
|
# Update provenance with enhanced confidence
|
|
base_confidence = record.provenance.confidence_score if record.provenance else 0.85
|
|
new_confidence = min(1.0, base_confidence + context.confidence_boost)
|
|
|
|
enriched_provenance = Provenance(
|
|
data_source=DataSource.CONVERSATION_NLP,
|
|
data_tier=DataTier.TIER_4_INFERRED,
|
|
extraction_date=datetime.now(timezone.utc).isoformat(),
|
|
extraction_method="Comprehensive AI-driven enrichment from conversation context",
|
|
confidence_score=new_confidence,
|
|
conversation_id=self.conversation_id,
|
|
source_url=None, # file:// URLs not supported by Pydantic HttpUrl
|
|
)
|
|
|
|
# Create enriched record (only include optional fields if they have content)
|
|
enriched_data = {
|
|
'id': record.id,
|
|
'name': record.name,
|
|
'institution_type': record.institution_type,
|
|
'provenance': enriched_provenance,
|
|
}
|
|
|
|
# Add optional fields only if they have values
|
|
if record.alternative_names:
|
|
enriched_data['alternative_names'] = record.alternative_names
|
|
|
|
if enriched_description or record.description:
|
|
enriched_data['description'] = enriched_description or record.description
|
|
|
|
if enriched_locations or record.locations:
|
|
enriched_data['locations'] = enriched_locations if enriched_locations else record.locations
|
|
|
|
if identifiers:
|
|
enriched_data['identifiers'] = identifiers
|
|
|
|
if digital_platforms:
|
|
enriched_data['digital_platforms'] = digital_platforms
|
|
|
|
if record.collections:
|
|
enriched_data['collections'] = record.collections
|
|
|
|
if change_history:
|
|
enriched_data['change_history'] = change_history
|
|
|
|
enriched = HeritageCustodian(**enriched_data)
|
|
|
|
print(f" ✓ Description: {'Added' if enriched_description else 'None'}")
|
|
print(f" ✓ Identifiers: {len(identifiers)}")
|
|
print(f" ✓ Platforms: {len(digital_platforms)}")
|
|
print(f" ✓ Change Events: {len(change_history)}")
|
|
print(f" ✓ Confidence: {new_confidence:.2f} (boost: +{context.confidence_boost:.2f})")
|
|
|
|
return enriched
|
|
|
|
def enrich_all_records(self):
|
|
"""Enrich all existing records with conversation context."""
|
|
print(f"\n{'='*60}")
|
|
print(f"ENRICHING {len(self.existing_records)} CHILEAN INSTITUTIONS")
|
|
print(f"{'='*60}")
|
|
|
|
for record in self.existing_records:
|
|
enriched = self.enrich_record(record)
|
|
self.enriched_records.append(enriched)
|
|
|
|
print(f"\n{'='*60}")
|
|
print(f"ENRICHMENT COMPLETE: {len(self.enriched_records)} records")
|
|
print(f"{'='*60}")
|
|
|
|
def save_enriched_records(self):
|
|
"""Save enriched records to YAML file."""
|
|
print(f"\nSaving enriched records to: {OUTPUT_YAML_PATH}")
|
|
|
|
# Convert HeritageCustodian objects to dicts
|
|
records_dicts = []
|
|
for record in self.enriched_records:
|
|
# Use dict() with mode='json' to convert HttpUrl to str
|
|
record_dict = json.loads(record.json(exclude_none=True, exclude_unset=True))
|
|
records_dicts.append(record_dict)
|
|
|
|
# Write YAML
|
|
with open(OUTPUT_YAML_PATH, 'w', encoding='utf-8') as f:
|
|
f.write("---\n")
|
|
f.write("# Chilean GLAM Institutions - Curated Edition\n")
|
|
f.write(f"# Enriched from conversation: {self.conversation_name}\n")
|
|
f.write(f"# Conversation ID: {self.conversation_id}\n")
|
|
f.write(f"# Total institutions: {len(self.enriched_records)}\n")
|
|
f.write(f"# Curation date: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}\n")
|
|
f.write("\n")
|
|
yaml.safe_dump(records_dicts, f, allow_unicode=True, sort_keys=False, width=120)
|
|
|
|
print(f"✓ Saved {len(self.enriched_records)} enriched records")
|
|
|
|
def generate_report(self):
|
|
"""Generate a data completeness and curation report."""
|
|
print(f"\nGenerating curation report: {REPORT_PATH}")
|
|
|
|
# Calculate statistics
|
|
total_records = len(self.enriched_records)
|
|
records_with_descriptions = sum(1 for r in self.enriched_records if r.description)
|
|
records_with_identifiers = sum(1 for r in self.enriched_records if r.identifiers)
|
|
records_with_platforms = sum(1 for r in self.enriched_records if r.digital_platforms)
|
|
records_with_change_history = sum(1 for r in self.enriched_records if r.change_history)
|
|
|
|
total_identifiers = sum(len(r.identifiers or []) for r in self.enriched_records)
|
|
total_platforms = sum(len(r.digital_platforms or []) for r in self.enriched_records)
|
|
total_events = sum(len(r.change_history or []) for r in self.enriched_records)
|
|
|
|
# Find top 5 most complete records
|
|
def completeness_score(record: HeritageCustodian) -> int:
|
|
score = 0
|
|
if record.description: score += 2
|
|
score += len(record.identifiers or []) * 2
|
|
score += len(record.digital_platforms or [])
|
|
score += len(record.change_history or [])
|
|
if record.locations and record.locations[0].city: score += 1
|
|
return score
|
|
|
|
sorted_records = sorted(self.enriched_records, key=completeness_score, reverse=True)
|
|
top_5 = sorted_records[:5]
|
|
bottom_5 = sorted_records[-5:]
|
|
|
|
# Write report
|
|
with open(REPORT_PATH, 'w', encoding='utf-8') as f:
|
|
f.write("# Chilean GLAM Institutions Curation Report\n\n")
|
|
f.write(f"**Curation Date**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}\n\n")
|
|
f.write(f"**Source Conversation**: {self.conversation_name}\n\n")
|
|
f.write(f"**Conversation ID**: `{self.conversation_id}`\n\n")
|
|
|
|
f.write("## Summary Statistics\n\n")
|
|
f.write(f"- **Total Institutions**: {total_records}\n")
|
|
f.write(f"- **Records with Descriptions**: {records_with_descriptions} ({records_with_descriptions/total_records*100:.1f}%)\n")
|
|
f.write(f"- **Records with Identifiers**: {records_with_identifiers} ({records_with_identifiers/total_records*100:.1f}%)\n")
|
|
f.write(f"- **Records with Digital Platforms**: {records_with_platforms} ({records_with_platforms/total_records*100:.1f}%)\n")
|
|
f.write(f"- **Records with Change History**: {records_with_change_history} ({records_with_change_history/total_records*100:.1f}%)\n")
|
|
f.write(f"- **Total Identifiers Extracted**: {total_identifiers}\n")
|
|
f.write(f"- **Total Digital Platforms**: {total_platforms}\n")
|
|
f.write(f"- **Total Change Events**: {total_events}\n\n")
|
|
|
|
f.write("## Top 5 Most Complete Records\n\n")
|
|
for i, record in enumerate(top_5, 1):
|
|
f.write(f"{i}. **{record.name}** (Score: {completeness_score(record)})\n")
|
|
f.write(f" - Type: {record.institution_type}\n")
|
|
if record.description:
|
|
f.write(f" - Description: {record.description[:100]}...\n")
|
|
if record.identifiers:
|
|
f.write(f" - Identifiers: {len(record.identifiers)}\n")
|
|
if record.digital_platforms:
|
|
f.write(f" - Platforms: {len(record.digital_platforms)}\n")
|
|
if record.change_history:
|
|
f.write(f" - Events: {len(record.change_history)}\n")
|
|
f.write("\n")
|
|
|
|
f.write("## Bottom 5 Records (Need Further Research)\n\n")
|
|
for i, record in enumerate(bottom_5, 1):
|
|
f.write(f"{i}. **{record.name}** (Score: {completeness_score(record)})\n")
|
|
f.write(f" - Type: {record.institution_type}\n")
|
|
if record.locations:
|
|
f.write(f" - Region: {record.locations[0].region}\n")
|
|
f.write(f" - **Status**: Minimal data available in conversation - requires additional sources\n")
|
|
f.write("\n")
|
|
|
|
f.write("## Institution Type Distribution\n\n")
|
|
type_counts = {}
|
|
for record in self.enriched_records:
|
|
inst_type = record.institution_type
|
|
type_counts[inst_type] = type_counts.get(inst_type, 0) + 1
|
|
|
|
for inst_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
|
|
f.write(f"- **{inst_type}**: {count}\n")
|
|
|
|
f.write("\n## Next Steps\n\n")
|
|
f.write("1. **Manual Review**: Review bottom 5 records and search for additional sources\n")
|
|
f.write("2. **Geocoding**: Use geocoding service to add coordinates for all locations\n")
|
|
f.write("3. **Identifier Lookup**: Query Wikidata and VIAF for missing identifiers\n")
|
|
f.write("4. **Platform Verification**: Verify institutional websites and digital platforms\n")
|
|
f.write("5. **LinkML Validation**: Run `linkml-validate` to ensure schema compliance\n")
|
|
f.write("6. **Export Formats**: Generate JSON-LD, RDF/Turtle, and CSV exports\n")
|
|
|
|
print(f"✓ Report generated")
|
|
|
|
def run(self):
|
|
"""Execute the full enrichment pipeline."""
|
|
print("="*60)
|
|
print("CHILEAN GLAM INSTITUTIONS ENRICHMENT PIPELINE")
|
|
print("="*60)
|
|
|
|
self.load_conversation()
|
|
self.load_existing_records()
|
|
self.enrich_all_records()
|
|
self.save_enriched_records()
|
|
self.generate_report()
|
|
|
|
print("\n" + "="*60)
|
|
print("ENRICHMENT PIPELINE COMPLETE")
|
|
print("="*60)
|
|
print(f"\nOutput files:")
|
|
print(f" - Enriched YAML: {OUTPUT_YAML_PATH}")
|
|
print(f" - Curation Report: {REPORT_PATH}")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
enricher = ChileanInstitutionEnricher()
|
|
enricher.run()
|