glam/process_chilean_institutions.py
2025-11-19 23:25:22 +01:00

357 lines
14 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Process Chilean GLAM institutions from conversation file.
Extracts institutions from comprehensive provincial directory format.
"""
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List
from collections import defaultdict
# Reuse non-institution filter from Mexican script.
# Lowercase phrases that appear as bold text in the source reports but name
# metadata standards, file formats, infrastructure, or generic vocabulary —
# never actual institutions.  (The original literal listed "digital objects"
# twice; the duplicate entry is removed here.)
NON_INSTITUTION_TERMS = {
    "dublin core", "mods", "mets", "vra core", "object id", "marc21", "marc 21",
    "handle system", "orcid", "doi", "master formats", "documents", "audio/video",
    "storage", "climate control", "digital objects", "photography", "tiff", "jpeg",
    "pdf/a", "api", "oai-pmh", "rest", "iiif", "current status", "museums", "archives",
    "libraries", "archaeological sites", "access", "platform",
    "network", "system", "standard", "format", "preservation", "digitization", "metadata"
}
def is_real_institution(name: str) -> bool:
    """Return True when *name* plausibly denotes a real institution.

    A candidate is rejected when it is shorter than five characters, is a
    known non-institution term, or consists entirely of such terms.
    """
    cleaned = name.lower().strip()
    if len(cleaned) < 5:
        return False
    if cleaned in NON_INSTITUTION_TERMS:
        return False
    # Reject names whose every word is filtered vocabulary (subset test).
    return not set(cleaned.split()) <= NON_INSTITUTION_TERMS
def extract_institutions_from_report(report_text: str) -> List[Dict]:
    """Extract Chilean institutions from the provincial directory report.

    The Chilean report format uses inline bold text within paragraphs:
    Example: "**Copiapó Province** anticipates... the new **Museo Regional de Atacama**..."

    Bold "** ... Province**" headers update the location context attached to
    subsequently extracted institutions.

    Returns:
        A list of dicts with keys: name, urls, emails, province,
        institution_type.
    """
    # Chilean-specific filter terms (in addition to NON_INSTITUTION_TERMS)
    chilean_skip_terms = {
        'province', 'region', 'unesco', 'serpat', 'dibam', 'national monuments',
        'cultural heritage', 'digital preservation', 'atacama province',
        'copiapó province', 'huasco province', 'chañaral province',
        'antofagasta region', 'metropolitan region', 'maule region',
        'valparaíso region', 'biobío region', 'los lagos region',
        'araucanía region', 'tarapacá region', 'arica and parinacota region',
        'aysén region', 'magallanes region', 'coquimbo region',
        'atacama region', "o'higgins region", 'los ríos region', 'ñuble region'
    }
    # Institution keywords in Spanish/English; a real institution name must
    # contain at least one of these.
    institution_keywords = [
        'museo', 'museum', 'biblioteca', 'library', 'archivo', 'archiv',
        'universidad', 'university', 'instituto', 'institute', 'fundación',
        'foundation', 'centro', 'center', 'galería', 'gallery', 'servicio',
        'consejo', 'dirección', 'departamento', 'academia', 'sociedad'
    ]
    # Find all bold text instances: **something**
    bold_pattern = re.compile(r'\*\*([^*]+?)\*\*')
    institutions: List[Dict] = []
    current_province = None  # most recent "**X Province**" header seen
    for match in bold_pattern.findall(report_text):
        candidate = match.strip()
        candidate_lower = candidate.lower()
        # Skip empty or very short
        if len(candidate) < 5:
            continue
        # Skip section headers (Province, Region); province headers also
        # update the location context for subsequent institutions.
        if 'province' in candidate_lower or 'region' in candidate_lower:
            if 'province' in candidate_lower:
                current_province = candidate.replace(' Province', '').strip()
            continue
        # Skip generic Chilean terms
        if candidate_lower in chilean_skip_terms:
            continue
        # Skip metadata standards and technical terms
        if not is_real_institution(candidate):
            continue
        # Must contain at least one institution keyword
        if not any(keyword in candidate_lower for keyword in institution_keywords):
            continue
        # Skip if it's just a keyword by itself
        if candidate_lower in institution_keywords:
            continue
        urls, emails = _extract_contacts(report_text, candidate)
        institutions.append({
            'name': candidate,
            'urls': urls,
            'emails': emails,
            'province': current_province,
            'institution_type': _classify_institution_type(candidate_lower),
        })
    return institutions


def _classify_institution_type(candidate_lower: str) -> str:
    """Map keywords in a lowercased name to an institution type (default MIXED)."""
    if 'museo' in candidate_lower or 'museum' in candidate_lower:
        return 'MUSEUM'
    if 'archivo' in candidate_lower or 'archiv' in candidate_lower:
        return 'ARCHIVE'
    if 'biblioteca' in candidate_lower or 'library' in candidate_lower:
        return 'LIBRARY'
    if 'universidad' in candidate_lower or 'university' in candidate_lower:
        return 'EDUCATION_PROVIDER'
    if 'servicio nacional' in candidate_lower or 'consejo' in candidate_lower or 'dirección' in candidate_lower:
        return 'OFFICIAL_INSTITUTION'
    if 'fundación' in candidate_lower or 'foundation' in candidate_lower:
        return 'RESEARCH_CENTER'
    return 'MIXED'


def _extract_contacts(report_text: str, candidate: str) -> tuple:
    """Collect (urls, emails) from the 500 characters after the bold mention.

    Returns empty lists when the bold form of *candidate* is not found.
    """
    urls: List[str] = []
    emails: List[str] = []
    context_start = report_text.find(f'**{candidate}**')
    if context_start == -1:
        return urls, emails
    # Get 500 characters after the mention
    context = report_text[context_start:context_start + 500]
    # Extract URLs, trimming trailing punctuation the regex may capture.
    for url in re.findall(r'https?://[^\s\)\],]+', context):
        url = url.rstrip(',.;:')
        if url not in urls:
            urls.append(url)
    # Extract emails.  NOTE: the original TLD character class was
    # [A-Z|a-z], which wrongly admits a literal '|'; fixed to [A-Za-z].
    for email in re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', context):
        if email not in emails:
            emails.append(email)
    return urls, emails
def normalize_name(name: str) -> str:
    """Normalize an institution name for deduplication.

    Drops parenthesized segments, lowercases, trims, and collapses internal
    whitespace runs to single spaces.
    """
    without_parens = re.sub(r'\([^)]*\)', '', name)
    return re.sub(r'\s+', ' ', without_parens.lower().strip())
def deduplicate_institutions(institutions: List[Dict]) -> List[Dict]:
    """Deduplicate by normalized name, merging metadata.

    URLs and emails of duplicate records are merged in first-seen order.
    The original used ``list(set(...))``, whose ordering varies between
    runs due to string-hash randomization; ``dict.fromkeys`` keeps the
    merge deterministic.  A MIXED institution_type is upgraded to the
    first more specific type encountered.
    """
    name_map: Dict[str, Dict] = {}
    for inst in institutions:
        norm_name = normalize_name(inst['name'])
        existing = name_map.get(norm_name)
        if existing is None:
            name_map[norm_name] = inst
            continue
        # Merge URLs and emails, dropping duplicates, preserving order.
        existing['urls'] = list(dict.fromkeys(existing.get('urls', []) + inst.get('urls', [])))
        existing['emails'] = list(dict.fromkeys(existing.get('emails', []) + inst.get('emails', [])))
        # Prefer more specific type over the MIXED placeholder.
        if existing.get('institution_type') == 'MIXED' and inst.get('institution_type') != 'MIXED':
            existing['institution_type'] = inst['institution_type']
    return list(name_map.values())
def generate_ghcid(country_code: str, inst_type: str, name: str, index: int) -> str:
    """Build a GHCID identifier URI for one institution.

    Combines the lowercased country code, a one-letter type code (U for
    unknown types), a name slug capped at 30 characters, and a zero-padded
    four-digit index.
    """
    code_by_type = {
        'MUSEUM': 'M', 'ARCHIVE': 'A', 'LIBRARY': 'L', 'GALLERY': 'G',
        'OFFICIAL_INSTITUTION': 'O', 'RESEARCH_CENTER': 'R',
        'EDUCATION_PROVIDER': 'E', 'MIXED': 'M', 'UNKNOWN': 'U',
    }
    letter = code_by_type.get(inst_type, 'U').lower()
    # Slug: non-alphanumeric runs become single hyphens, outer hyphens trimmed.
    slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')[:30]
    return (
        "https://w3id.org/heritage/custodian/"
        f"{country_code.lower()}/{letter}-{slug}-{index:04d}"
    )
def convert_to_linkml_yaml(institutions: List[Dict], conversation_id: str, source_file_path: str) -> str:
    """Convert institutions to a hand-built LinkML YAML document.

    Args:
        institutions: deduplicated institution dicts.
        conversation_id: UUID of the source conversation (recorded in provenance).
        source_file_path: path of the conversation file (recorded in provenance).

    Returns:
        The YAML document as a single string.
    """
    # One timestamp per run: the extraction happened once, and hoisting the
    # datetime.now() call out of the loop keeps it loop-invariant.
    extraction_date = datetime.now(timezone.utc).isoformat()
    yaml_lines = [
        "---",
        "# Chilean GLAM Institutions",
        f"# Extracted from: {source_file_path}",
        f"# Conversation ID: {conversation_id}",
        ""
    ]
    for i, inst in enumerate(institutions, 1):
        ghcid = generate_ghcid('CL', inst.get('institution_type', 'MIXED'), inst['name'], i)
        name = inst['name']
        # YAML quoting: names containing double quotes are emitted
        # single-quoted with embedded single quotes doubled (YAML rule);
        # all other names are double-quoted.  NOTE(review): a backslash in
        # a double-quoted name would not be escaped here — confirm none
        # occur if the source data changes.
        if '"' in name:
            name_escaped = name.replace("'", "''")
            name_field = f" name: '{name_escaped}'"
        else:
            name_field = f" name: \"{name}\""
        yaml_lines.append(f"- id: {ghcid}")
        yaml_lines.append(name_field)
        yaml_lines.append(f" institution_type: {inst.get('institution_type', 'MIXED')}")
        # Locations (only when a province context was captured).
        if inst.get('province'):
            yaml_lines.append(" locations:")
            yaml_lines.append(f" - region: {inst['province']}")
            yaml_lines.append(" country: CL")
        # Identifiers: one entry per URL (Website) and per email (Email).
        if inst.get('urls') or inst.get('emails'):
            yaml_lines.append(" identifiers:")
            for url in inst.get('urls', []):
                yaml_lines.append(" - identifier_scheme: Website")
                yaml_lines.append(f" identifier_value: {url}")
                yaml_lines.append(f" identifier_url: {url}")
            for email in inst.get('emails', []):
                yaml_lines.append(" - identifier_scheme: Email")
                yaml_lines.append(f" identifier_value: {email}")
        # Provenance with source reference
        yaml_lines.append(" provenance:")
        yaml_lines.append(" data_source: CONVERSATION_NLP")
        yaml_lines.append(" data_tier: TIER_4_INFERRED")
        yaml_lines.append(f" extraction_date: \"{extraction_date}\"")
        yaml_lines.append(" extraction_method: \"Inline bold text extraction from provincial directory\"")
        yaml_lines.append(" confidence_score: 0.85")
        yaml_lines.append(f" conversation_id: \"{conversation_id}\"")
        yaml_lines.append(f" source_url: \"file://{source_file_path}\"")
        yaml_lines.append("")
    return '\n'.join(yaml_lines)
def main():
    """Run the Chilean GLAM extraction pipeline end to end.

    Loads the exported conversation JSON, locates the longest artifact
    (assumed to be the comprehensive provincial directory), extracts and
    deduplicates institutions, prints summary statistics, and writes the
    LinkML YAML output plus a JSON sidecar with statistics.
    """
    # Load conversation file (hard-coded path to the exported conversation).
    file_path = "/Users/kempersc/Documents/claude/data-2025-11-02-18-13-26-batch-0000/conversations/2025-09-22T14-43-14-edc75d66-ee42-4199-8e22-65b0d2347922-Chilean_GLAM_inventories_research.json"
    print("Loading Chilean GLAM conversation...")
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    conversation_id = data['uuid']
    # Find the comprehensive provincial directory (usually the longest artifact)
    report_text = None
    max_length = 0
    for msg in data.get('chat_messages', []):
        for content in msg.get('content', []):
            if content.get('type') == 'tool_use' and content.get('name') == 'artifacts':
                artifact = content.get('input', {}).get('content', '')
                if len(artifact) > max_length:
                    max_length = len(artifact)
                    report_text = artifact
    if not report_text:
        print("ERROR: No artifact found in conversation!")
        return
    print(f"Found report with {len(report_text)} characters")
    # Extract institutions
    print("\nExtracting institutions from provincial directory...")
    institutions = extract_institutions_from_report(report_text)
    print(f"Extracted: {len(institutions)} institutions")
    # Deduplicate
    print("\nDeduplicating by normalized name...")
    final_institutions = deduplicate_institutions(institutions)
    print(f"Final: {len(final_institutions)} unique institutions")
    # Guard: the percentage statistics below divide by the institution
    # count, which would raise ZeroDivisionError on an empty result.
    if not final_institutions:
        print("ERROR: No institutions extracted from report!")
        return
    # Statistics
    print("\n" + "="*60)
    print("FINAL STATISTICS")
    print("="*60)
    type_counts = defaultdict(int)
    province_counts = defaultdict(int)
    with_urls = 0
    with_emails = 0
    for inst in final_institutions:
        type_counts[inst.get('institution_type', 'MIXED')] += 1
        if inst.get('province'):
            province_counts[inst['province']] += 1
        if inst.get('urls'):
            with_urls += 1
        if inst.get('emails'):
            with_emails += 1
    total = len(final_institutions)
    print(f"\nTotal institutions: {total}")
    print("\nBy type:")
    for itype, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {itype}: {count}")
    print("\nTop 15 provinces by institution count:")
    for province, count in sorted(province_counts.items(), key=lambda x: -x[1])[:15]:
        print(f" {province}: {count}")
    print("\nIdentifiers:")
    print(f" Institutions with URLs: {with_urls} ({with_urls/total*100:.1f}%)")
    print(f" Institutions with emails: {with_emails} ({with_emails/total*100:.1f}%)")
    # Convert to LinkML YAML
    print("\nConverting to LinkML YAML format...")
    yaml_output = convert_to_linkml_yaml(final_institutions, conversation_id, file_path)
    # Write YAML output, creating the instances directory if needed.
    output_path = Path('/Users/kempersc/apps/glam/data/instances/chilean_institutions.yaml')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(yaml_output)
    print(f"\nOutput written to: {output_path}")
    # Save a JSON sidecar with the raw records and run statistics.
    json_output = {
        'institutions': final_institutions,
        'statistics': {
            'total': total,
            'by_type': dict(type_counts),
            'by_province': dict(province_counts),
            'with_urls': with_urls,
            'with_emails': with_emails
        }
    }
    json_path = '/tmp/chilean_institutions_final.json'
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_output, f, indent=2, ensure_ascii=False)
    print(f"JSON version saved to: {json_path}")
if __name__ == '__main__':
main()