#!/usr/bin/env python3
"""
Brazilian GLAM Institution Extractor v2.0

Improved extraction focusing on structured bullet-point lists from the
state-by-state artifact section. Extracts actual institution names (not
sentence fragments).

Expected output: 150-200 quality institutions with proper names, types,
and metadata.

Source file: /Users/kempersc/apps/glam/2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5-Brazilian_GLAM_collection_inventories.json
"""

import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

# ---------------------------------------------------------------------------
# Module-level constants (regexes hoisted so they compile once, not per call)
# ---------------------------------------------------------------------------

# ASCII fold for the Portuguese accented characters seen in institution names.
_ACCENT_MAP = str.maketrans(
    'àáâãäåèéêëìíîïòóôõöùúûüç',
    'aaaaaa' + 'eeee' + 'iiii' + 'ooooo' + 'uuuu' + 'c',
)

# ``## STATE NAME (XX)`` section headers in the markdown artifact.
_STATE_HEADER_RE = re.compile(
    r'## ([A-ZÀÁÂÃÄÅÈÉÊËÌÍÎÏÒÓÔÕÖÙÚÛÜ\s]+) \(([A-Z]{2})\)'
)

# Bullet lines of the form ``- **Institution Name**: details``; an optional
# parenthesised acronym right after the name is dropped.
_BULLET_RE = re.compile(
    r'^[\s]*[-•]\s*\*\*([^*:]+?)(?:\s*\([^)]+\))?\*\*:?\s*(.*)$'
)

_URL_RE = re.compile(r'https?://[^\s,]+')

# Brazilian phone prefix such as ``(61) 3...`` — used to skip contact lines.
_PHONE_RE = re.compile(r'\(\d{2}\)\s*\d')

# Generic section labels that look like bullet names but are not institutions.
_SKIP_WORDS = (
    'State Infrastructure', 'Digital Systems', 'Collections',
    'Digital Initiatives', 'Contact', 'Federal', 'Technical', 'Metadata',
    'Notable', 'Key Features', 'Preservation', 'Ongoing', 'Major',
    'Systems', 'Coverage', 'Database',
)

# Common Brazilian cities, checked in order; the first substring match wins.
_CITIES = (
    'São Paulo', 'Rio de Janeiro', 'Brasília', 'Salvador', 'Belo Horizonte',
    'Curitiba', 'Recife', 'Porto Alegre', 'Manaus', 'Fortaleza',
    'Rio Branco', 'Maceió', 'Macapá', 'Belém', 'Goiânia', 'Campo Grande',
    'Cuiabá', 'João Pessoa', 'Teresina', 'Natal', 'Florianópolis',
    'Aracaju', 'Palmas', 'Boa Vista', 'Vitória', 'São Luís', 'Campinas',
    'Santos', 'Niterói', 'Ouro Preto', 'Petrópolis', 'Paraty', 'Olinda',
    'Tiradentes', 'Diamantina', 'Cachoeira', 'São Cristóvão',
)


def slugify(text: str) -> str:
    """Create a URL-safe slug (max 50 chars) from an institution name."""
    text = text.lower().translate(_ACCENT_MAP)
    text = re.sub(r'[^a-z0-9]+', '-', text).strip('-')
    # Strip again AFTER truncation: the original stripped first, so a name
    # longer than 50 chars could yield a slug ending in '-'.
    return text[:50].rstrip('-')


def classify_institution(name: str, description: str = "") -> str:
    """Classify institution type from name and description.

    Priority order:
    1. Check the name first (most reliable).
    2. Then the combined name + description text (less reliable, may contain
       misleading keywords).
    """
    name_lower = name.lower()

    # Strong indicators in the NAME itself.
    if any(w in name_lower for w in ('museu', 'museum', 'memorial', 'pinacoteca')):
        return 'MUSEUM'
    if 'arquivo' in name_lower or 'archiv' in name_lower:
        return 'ARCHIVE'
    if any(w in name_lower for w in ('biblioteca', 'library', 'bibliotheca')):
        return 'LIBRARY'

    # For acronyms and ambiguous names, fall back to keyword matching over
    # the combined name + description text.
    text = name_lower + " " + description.lower()
    if any(w in text for w in ('museu', 'museum', 'memorial', 'pinacoteca',
                               'casa de cultura', 'mam-', 'mam ', 'marco',
                               'musear')):
        return 'MUSEUM'
    if 'arquivo' in text or 'archiv' in text:
        return 'ARCHIVE'
    if 'biblioteca' in text or 'bce' in text:
        return 'LIBRARY'
    if any(w in text for w in ('ibram', 'iphan', 'secult', 'secretaria',
                               'fundação de cultura', 'instituto brasileiro',
                               'fpc/', 'ipac', 'unesco')):
        return 'OFFICIAL_INSTITUTION'
    if any(w in text for w in ('universidade', 'university', 'ufac', 'ufal',
                               'unifap', 'ufpa', 'usp', 'ufmg', 'unicamp',
                               'ufrj', 'ufba', 'ufam', 'ufc', 'unb', 'ufg',
                               'ufma', 'ufmt', 'ufms', 'ufpe', 'ufpi',
                               'ufrn', 'ufpb', 'ufrgs', 'ufpr', 'ufsc',
                               'ufes', 'repository', 'repositories')):
        return 'EDUCATION_PROVIDER'
    if any(w in text for w in ('centro de pesquisa', 'research center',
                               'laboratório', 'documentation', 'cepap',
                               'instituto histórico', 'instituto geográfico')):
        return 'RESEARCH_CENTER'
    # Cultural centres, theatres, etc. and everything unclassified fall
    # through to MIXED (the original had a redundant elif branch returning
    # the same value).
    return 'MIXED'


def parse_state_sections(content: str, source_file_path: str) -> List[Dict[str, Any]]:
    """Parse state-by-state sections to extract institution records.

    ``re.split`` with a two-group header pattern yields a flat list shaped
    ``[preamble, name, code, body, name, code, body, ...]``: state names sit
    at indices 1, 4, 7, ..., state codes at 2, 5, 8, ..., and section bodies
    at 3, 6, 9, ...
    """
    institutions: List[Dict[str, Any]] = []
    current_state: Optional[str] = None
    current_state_code: Optional[str] = None

    for i, chunk in enumerate(_STATE_HEADER_RE.split(content)):
        if i % 3 == 1:          # state name capture group
            current_state = chunk.strip()
        elif i % 3 == 2:        # two-letter state code capture group
            current_state_code = chunk.strip()
        elif i > 0:             # section body (index 0 is the preamble)
            if current_state and current_state_code:
                institutions.extend(parse_institutions_from_section(
                    chunk, current_state, current_state_code, source_file_path
                ))
    return institutions


def parse_institutions_from_section(section: str, state_name: str,
                                    state_code: str,
                                    source_file_path: str) -> List[Dict[str, Any]]:
    """Parse individual institutions from one state's markdown section.

    Expects bullet points of the form ``- **Institution Name**: details``.
    """
    institutions: List[Dict[str, Any]] = []
    for line in section.split('\n'):
        match = _BULLET_RE.match(line)
        if not match:
            continue
        name = match.group(1).strip()
        details = match.group(2).strip()

        # Skip section headers / generic labels masquerading as bullets.
        if any(skip in name for skip in _SKIP_WORDS):
            continue
        # Skip contact entries (Brazilian phone format).
        if _PHONE_RE.search(name):
            continue
        # Skip implausibly short or long names.
        if not 3 <= len(name) <= 150:
            continue

        url_match = _URL_RE.search(details)
        url = url_match.group(0) if url_match else None
        city = extract_city_from_details(details, name)

        inst = create_institution_record_v2(
            name=name,
            description=details,
            state_name=state_name,
            state_code=state_code,
            city=city,
            url=url,
            source_file_path=source_file_path,
        )
        if inst:
            institutions.append(inst)
    return institutions


def extract_city_from_details(details: str, name: str) -> Optional[str]:
    """Return the first known Brazilian city mentioned in details or name."""
    for city in _CITIES:
        if city in details or city in name:
            return city
    return None


def create_institution_record_v2(
    name: str,
    description: str,
    state_name: str,
    state_code: str,
    city: Optional[str],
    url: Optional[str],
    source_file_path: str
) -> Optional[Dict[str, Any]]:
    """Create a LinkML-compliant institution record from structured data.

    Confidence starts at 0.7 (structured extraction) and is boosted by a URL
    (+0.1), a city (+0.05) and a >50-char description (+0.1), capped at 0.95.
    ``state_code`` is accepted for interface compatibility but is not stored
    in the record (matching the original behaviour).
    """
    name = re.sub(r'\s+', ' ', name).strip()
    slug = slugify(name)
    inst_type = classify_institution(name, description)

    confidence = 0.7  # base for structured extraction
    if url:
        confidence += 0.1
    if city:
        confidence += 0.05
    if len(description) > 50:
        confidence += 0.1
    confidence = min(confidence, 0.95)

    location: Dict[str, str] = {'country': 'BR', 'region': state_name}
    if city:
        location['city'] = city

    record: Dict[str, Any] = {
        'id': f'https://w3id.org/heritage/custodian/br/{slug}',
        'name': name,
        'institution_type': inst_type,
        'locations': [location],
        'provenance': {
            'data_source': 'CONVERSATION_NLP',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Python structured extraction from Brazilian state-by-state artifact v2.0',
            'confidence_score': confidence,
            'conversation_id': '2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5',
            'source_url': f'file://{source_file_path}'
        }
    }

    # Attach a cleaned description when it carries real content.
    if description and len(description) > 10:
        desc = _URL_RE.sub('', description).strip()
        if len(desc) > 500:
            desc = desc[:497] + '...'
        if desc:
            record['description'] = desc

    # Add the URL as a Website identifier.
    if url:
        record['identifiers'] = [{
            'identifier_scheme': 'Website',
            'identifier_value': url,
            'identifier_url': url
        }]
    return record


def extract_institutions_from_conversation(json_path: Path) -> List[Dict[str, Any]]:
    """Load the conversation JSON and extract deduplicated institutions."""
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Markdown artifacts live in the ``content`` field of tool_use inputs.
    markdown_artifacts: List[str] = []
    for message in data.get('chat_messages', []):
        for content_item in message.get('content', []):
            if content_item.get('type') != 'tool_use':
                continue
            tool_input = content_item.get('input', {})
            if 'content' in tool_input:
                markdown_artifacts.append(tool_input['content'])

    # Absolute source path goes into each record's provenance.source_url.
    source_file_path = str(json_path.resolve())

    # Deduplicate by institution name across all artifacts.
    all_institutions: List[Dict[str, Any]] = []
    seen_names: set = set()
    for artifact in markdown_artifacts:
        for inst in parse_state_sections(artifact, source_file_path):
            if inst['name'] not in seen_names:
                seen_names.add(inst['name'])
                all_institutions.append(inst)
    return all_institutions


def main() -> None:
    """Main extraction workflow: extract, report statistics, write YAML."""
    # Imported here so the module stays importable (e.g. to reuse the parsing
    # helpers) even when PyYAML is not installed; only main() needs it.
    import yaml

    input_file = Path('/Users/kempersc/apps/glam/2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5-Brazilian_GLAM_collection_inventories.json')
    output_file = Path('/Users/kempersc/apps/glam/data/instances/brazilian_institutions_v2.yaml')

    print("Brazilian GLAM Institution Extractor v2.0")
    print("=" * 60)
    print(f"Input: {input_file.name}")
    print(f"Output: {output_file}")
    print()

    institutions = extract_institutions_from_conversation(input_file)
    print(f"✓ Extracted {len(institutions)} institutions")
    print()

    # Guard against an empty result: the original crashed with
    # ZeroDivisionError when computing percentages over zero institutions.
    if not institutions:
        print("No institutions extracted; nothing to write.")
        return

    # Type breakdown
    type_counts: Dict[str, int] = {}
    for inst in institutions:
        inst_type = inst['institution_type']
        type_counts[inst_type] = type_counts.get(inst_type, 0) + 1
    print(f"Institution Type Distribution:")
    for inst_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"  {inst_type:25s}: {count:3d}")
    print()

    # Geographic coverage
    states_covered = {
        inst['locations'][0].get('region')
        for inst in institutions
        if inst.get('locations') and inst['locations'][0].get('region')
    }
    print(f"Geographic Coverage:")
    print(f"  States covered: {len(states_covered)}/27")
    print()

    # URL coverage
    with_urls = sum(1 for inst in institutions if inst.get('identifiers'))
    print(f"Data Quality:")
    print(f"  Institutions with URLs: {with_urls}/{len(institutions)} ({100*with_urls/len(institutions):.1f}%)")

    # Average confidence
    avg_confidence = sum(inst['provenance']['confidence_score'] for inst in institutions) / len(institutions)
    print(f"  Average confidence: {avg_confidence:.3f}")
    print()

    # Write output
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False, width=120)

    print(f"✓ Wrote {len(institutions)} records to {output_file}")
    print()
    print(f"Sample Institutions:")
    for inst in institutions[:10]:
        city = inst['locations'][0].get('city', '(state level)')
        print(f"  - {inst['name'][:50]:50s} | {inst['institution_type']:12s} | {city}")


if __name__ == '__main__':
    main()