glam/extract_brazilian_institutions.py
2025-11-19 23:25:22 +01:00

409 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Brazilian GLAM Institution Extractor
Extracts ALL heritage institutions from Brazilian conversation JSON and creates
LinkML-compliant YAML records following schema v0.2.0.
Expected output: 200+ institutions covering all 27 Brazilian federative units.
"""
import json
import re
from datetime import datetime, timezone
from typing import List, Dict, Any, Optional
from pathlib import Path
import yaml
def slugify(text: str) -> str:
    """Build a URL-safe ASCII slug (max 50 chars) from an institution name."""
    # Fold the Portuguese accented characters to their ASCII base letters.
    folded = text.lower().translate(str.maketrans(
        'àáâãäåèéêëìíîïòóôõöùúûüç',
        'aaaaaaeeeeiiiiooooouuuuc',
    ))
    # Collapse every run of non-alphanumerics into a single hyphen.
    slug = re.sub(r'[^a-z0-9]+', '-', folded).strip('-')
    return slug[:50]  # Keep slugs to a bounded length.
def classify_institution(name: str, description: str = "") -> str:
    """Map an institution name/description to a coarse type code.

    Keyword groups are tried in priority order; the first group with a
    hit wins, and 'MIXED' is the fallback when nothing matches.
    """
    haystack = (name + " " + description).lower()
    rules = (
        ('MUSEUM', ('museu', 'museum', 'memorial', 'pinacoteca')),
        ('LIBRARY', ('biblioteca', 'library', 'bibliotheca')),
        ('ARCHIVE', ('arquivo', 'archive', 'archiv')),
        ('OFFICIAL_INSTITUTION', ('ibram', 'iphan', 'secult', 'secretaria',
                                  'fundação de cultura', 'instituto brasileiro')),
        ('EDUCATION_PROVIDER', ('universidade', 'university', 'usp', 'ufmg',
                                'unicamp', 'ufrj', 'ufba')),
        ('RESEARCH_CENTER', ('centro de pesquisa', 'research center',
                             'laboratório', 'documentation')),
    )
    for inst_type, keywords in rules:
        if any(keyword in haystack for keyword in keywords):
            return inst_type
    return 'MIXED'  # Default when unclear.
def extract_institutions_from_conversation(json_path: Path) -> List[Dict[str, Any]]:
    """Load a Brazilian GLAM conversation JSON and extract every institution.

    Combines institutions parsed from tool-use markdown artifacts with
    records derived from citation metadata. A single `seen_names` set is
    shared so the two sources deduplicate against each other.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    conversation_id = data.get('uuid', '')
    extraction_date = datetime.now(timezone.utc).isoformat()

    # Pull artifact markdown and citation metadata out of every message.
    markdown_content: List[str] = []
    citations: List[Dict[str, Any]] = []
    for message in data.get('chat_messages', []):
        for content_item in message.get('content', []):
            if content_item.get('type') != 'tool_use':
                continue
            tool_input = content_item.get('input', {})
            if 'content' in tool_input:
                markdown_content.append(tool_input['content'])
            if 'md_citations' in tool_input:
                citations.extend(tool_input['md_citations'])

    seen_names: set = set()  # Dedupe across artifacts AND citations.
    institutions: List[Dict[str, Any]] = []
    for artifact in markdown_content:
        institutions.extend(
            parse_artifact(artifact, conversation_id, extraction_date, seen_names))
    institutions.extend(
        parse_citations(citations, conversation_id, extraction_date, seen_names))
    return institutions
def parse_artifact(content: str, conversation_id: str, extraction_date: str, seen_names: set) -> List[Dict[str, Any]]:
    """Parse markdown artifact content to extract institutions.

    Runs a prioritized sequence of bold-text regexes over the artifact
    (federal bodies, museums, libraries, archives, universities, digital
    platforms). Each unseen match that survives its pattern's minimum
    length check becomes a record via create_institution_record().
    `seen_names` is mutated in place so later patterns and later
    artifacts do not re-emit the same institution.
    """
    # (regex, minimum accepted name length) — previously six copy-pasted
    # loops; kept in the original order because earlier patterns win the
    # dedupe. The federal pattern had no explicit length check (min 0),
    # universities accepted short acronyms (min > 3), the rest require > 5.
    pattern_specs = (
        # Federal institutions (IBRAM, IPHAN, Biblioteca Nacional, etc.)
        (r'\*\*([^*]+(?:Instituto Brasileiro de Museus|IBRAM|IPHAN|Biblioteca Nacional|Arquivo Nacional|Fundação Cultural Palmares)[^*]*)\*\*', 0),
        # Museums
        (r'\*\*([^*]*(?:Museu|Museum|Memorial|Pinacoteca)[^*]*?)\*\*', 5),
        # Libraries
        (r'\*\*([^*]*(?:Biblioteca|Library)[^*]*?)\*\*', 5),
        # Archives
        (r'\*\*([^*]*(?:Arquivo|Archive)[^*]*?)\*\*', 5),
        # Universities
        (r'\*\*([^*]*(?:Universidade|University|USP|UFMG|UNICAMP|UFRJ|UFBA|UNIFAP|UFAC)[^*]*?)\*\*', 3),
        # Digital platforms
        (r'\*\*([^*]*(?:Digital|Brasiliana|Hemeroteca|Tainacan|BNDigital)[^*]*?)\*\*', 5),
    )

    institutions: List[Dict[str, Any]] = []
    for pattern, min_len in pattern_specs:
        for match in re.finditer(pattern, content, re.IGNORECASE):
            name = match.group(1).strip()
            if not name or len(name) <= min_len or name in seen_names:
                continue
            inst = create_institution_record(name, content, conversation_id, extraction_date)
            if inst:
                institutions.append(inst)
                seen_names.add(name)
    return institutions
def create_institution_record(name: str, context: str, conversation_id: str, extraction_date: str) -> Optional[Dict[str, Any]]:
    """Assemble one LinkML-compliant institution record.

    Returns None when the (whitespace-normalized) name is too short or
    looks like a generic section heading rather than an institution.
    Location and website identifiers are attached only when found.
    """
    name = ' '.join(name.split())  # Collapse internal whitespace runs.
    if len(name) < 4:
        return None

    # Generic artifact headings are not institutions — skip them.
    headings = ('Executive Summary', 'State Infrastructure', 'Digital Systems',
                'Collections', 'Federal', 'Contact')
    if any(heading in name for heading in headings):
        return None

    description = extract_description(name, context)
    urls = extract_urls_for_institution(name, context)
    location = extract_location(name, context)

    record: Dict[str, Any] = {
        'id': f'https://w3id.org/heritage/custodian/br/{slugify(name)}',
        'name': name,
        'institution_type': classify_institution(name, context),
        'description': description or f'Brazilian heritage institution: {name}',
        'provenance': {
            'data_source': 'CONVERSATION_NLP',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': extraction_date,
            'extraction_method': 'Python NLP extraction from Brazilian GLAM conversation artifact',
            'confidence_score': calculate_confidence(name, context, urls, location),
            'conversation_id': conversation_id,
        },
    }
    if location:
        record['locations'] = [location]
    if urls:
        # Keep at most five website identifiers per institution.
        record['identifiers'] = [
            {'identifier_scheme': 'Website',
             'identifier_value': url,
             'identifier_url': url}
            for url in urls[:5]
        ]
    return record
def extract_description(name: str, context: str) -> Optional[str]:
    """Pull up to two descriptive sentences about *name* out of *context*.

    Scans every blank-line-separated paragraph that contains the first 20
    characters of the name (partial match tolerates trailing qualifiers),
    keeps mid-length sentences from the start of each such paragraph, and
    joins the first two unique ones. Returns None when nothing qualifies.
    """
    key = name[:20]
    collected: List[str] = []
    for paragraph in context.split('\n\n'):
        if key not in paragraph:
            continue
        # Only the first three sentences of a matching paragraph count.
        for sentence in re.split(r'[.!?]+\s+', paragraph)[:3]:
            if not (30 < len(sentence) < 500):
                continue  # Too short to describe / too long to be a sentence.
            cleaned = sentence.replace('**', '').strip()
            if cleaned and cleaned not in collected:
                collected.append(cleaned)
    return ' '.join(collected[:2]) if collected else None
def extract_urls_for_institution(name: str, context: str) -> List[str]:
    """Collect URLs appearing on lines of *context* that mention *name*.

    A line matches when it contains either the first 15 characters of the
    name or the name's first word. URLs are deduplicated preserving
    first-seen order, so the caller's ``urls[:5]`` cut and the emitted
    YAML are deterministic (the old ``list(set(urls))`` shuffled them
    per-process, and ``name.split()[0]`` crashed on whitespace-only names).
    """
    url_pattern = r'https?://[^\s<>"\')]+(?:\.[^\s<>"\')\]]+)+'
    words = name.split()
    first_word = words[0] if words else ''  # Guard: blank name must not crash.
    prefix = name[:15]

    urls: List[str] = []
    for line in context.split('\n'):
        if prefix in line or (first_word and first_word in line):
            urls.extend(re.findall(url_pattern, line))
    # dict.fromkeys dedupes while keeping first-seen order (unlike set()).
    return list(dict.fromkeys(urls))
def extract_location(name: str, context: str) -> Optional[Dict[str, Any]]:
    """Infer a city (and, when possible, state) for *name* from *context*.

    Scans lines containing the first 15 characters of the name for a
    known Brazilian capital city; on the first hit, also tries to resolve
    the state from the state name or its ``(XX)`` code on the same line.
    Returns ``{'city', 'country', optionally 'region'}`` or None.
    """
    # Brazilian cities commonly mentioned
    cities = ['São Paulo', 'Rio de Janeiro', 'Brasília', 'Salvador', 'Belo Horizonte',
              'Curitiba', 'Recife', 'Porto Alegre', 'Manaus', 'Fortaleza',
              'Rio Branco', 'Maceió', 'Macapá', 'Belém', 'Goiânia', 'Campo Grande',
              'Cuiabá', 'João Pessoa', 'Teresina', 'Natal', 'Florianópolis',
              'Aracaju', 'Palmas', 'Boa Vista', 'Vitória']
    # Brazilian states and their two-letter codes.
    states = {
        'Acre': 'AC', 'Alagoas': 'AL', 'Amapá': 'AP', 'Amazonas': 'AM',
        'Bahia': 'BA', 'Ceará': 'CE', 'Distrito Federal': 'DF', 'Espírito Santo': 'ES',
        'Goiás': 'GO', 'Maranhão': 'MA', 'Mato Grosso': 'MT', 'Mato Grosso do Sul': 'MS',
        'Minas Gerais': 'MG', 'Pará': 'PA', 'Paraíba': 'PB', 'Paraná': 'PR',
        'Pernambuco': 'PE', 'Piauí': 'PI', 'Rio de Janeiro': 'RJ', 'Rio Grande do Norte': 'RN',
        'Rio Grande do Sul': 'RS', 'Rondônia': 'RO', 'Roraima': 'RR', 'Santa Catarina': 'SC',
        'São Paulo': 'SP', 'Sergipe': 'SE', 'Tocantins': 'TO'
    }
    for line in context.split('\n'):
        if name[:15] not in line:
            continue
        for city in cities:
            if city not in line:
                continue
            # Match longest state names first so 'Mato Grosso do Sul' is
            # not shadowed by its prefix 'Mato Grosso' (dict order used
            # to pick the shorter name whenever both substrings matched).
            region = next(
                (state for state in sorted(states, key=len, reverse=True)
                 if state in line or f'({states[state]})' in line),
                None)
            location: Dict[str, Any] = {'city': city, 'country': 'BR'}
            if region:
                location['region'] = region
            return location
    return None
def calculate_confidence(name: str, context: str, urls: List[str], location: Optional[Dict]) -> float:
    """Score extraction confidence in [0.5, 0.95] from simple heuristics.

    Starts at 0.5 and adds fixed bonuses for corroborating signals:
    plausible name length, a website, a location, surrounding prose,
    and repeated mentions.
    """
    score = 0.5  # Base score for any accepted extraction.
    if 10 < len(name) < 100:  # Plausible institution-name length.
        score += 0.1
    if urls:  # Corroborated by at least one website.
        score += 0.15
    if location:  # Corroborated by a geographic match.
        score += 0.1
    if name in context and len(context) > 100:  # Embedded in real prose.
        score += 0.1
    if context.lower().count(name[:20].lower()) > 2:  # Mentioned repeatedly.
        score += 0.05
    return min(score, 0.95)  # Never claim near-certainty from NLP alone.
def parse_citations(citations: List[Dict], conversation_id: str, extraction_date: str, seen_names: set) -> List[Dict[str, Any]]:
    """Create minimal institution records from citation source metadata.

    Each unseen citation source longer than five characters becomes a
    fixed-confidence (0.6) record pointing at the cited URL.
    `seen_names` is mutated in place for cross-source deduplication.
    """
    institutions: List[Dict[str, Any]] = []
    for citation in citations:
        title = citation.get('title', '')
        url = citation.get('url', '')
        for source_item in citation.get('sources', []):
            source = source_item.get('source', '')
            if not source or source in seen_names or len(source) <= 5:
                continue
            institutions.append({
                'id': f'https://w3id.org/heritage/custodian/br/{slugify(source)}',
                'name': source,
                'institution_type': classify_institution(source, title),
                'description': f'Institution identified from web citation: {title}',
                'identifiers': [{
                    'identifier_scheme': 'Website',
                    'identifier_value': url,
                    'identifier_url': url,
                }],
                'locations': [{
                    'country': 'BR',
                }],
                'provenance': {
                    'data_source': 'CONVERSATION_NLP',
                    'data_tier': 'TIER_4_INFERRED',
                    'extraction_date': extraction_date,
                    'extraction_method': 'Python extraction from conversation citation metadata',
                    # Citation-only evidence gets a lower fixed confidence.
                    'confidence_score': 0.6,
                    'conversation_id': conversation_id,
                },
            })
            seen_names.add(source)
    return institutions
def main():
    """Main extraction workflow: read the conversation JSON, extract all
    institutions, print a per-type summary, and write the YAML output.

    NOTE(review): input/output paths are hard-coded absolute paths —
    presumably intentional for this one-off extraction run.
    """
    input_file = Path('/Users/kempersc/apps/glam/2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5-Brazilian_GLAM_collection_inventories.json')
    output_file = Path('/Users/kempersc/apps/glam/data/instances/brazilian_institutions_comprehensive.yaml')

    print(f"Extracting Brazilian institutions from: {input_file.name}")
    print(f"Output file: {output_file}")
    print()

    institutions = extract_institutions_from_conversation(input_file)
    print(f"✓ Extracted {len(institutions)} institutions")

    # Tally records per institution type for the summary report.
    type_counts: Dict[str, int] = {}
    for inst in institutions:
        type_counts[inst['institution_type']] = type_counts.get(inst['institution_type'], 0) + 1

    print(f"\nInstitution type breakdown:")
    for inst_type, count in sorted(type_counts.items(), key=lambda item: item[1], reverse=True):
        print(f" {inst_type}: {count}")

    # Write the YAML output, creating the target directory if needed.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False, width=100)

    print(f"\n✓ Wrote {len(institutions)} records to {output_file}")
    print(f"\nSample institutions:")
    for inst in institutions[:5]:
        print(f" - {inst['name']} ({inst['institution_type']})")


if __name__ == '__main__':
    main()