# glam/extract_mexican_glams.py
# Retrieved 2025-11-19 23:25:22 +01:00 — 451 lines, 19 KiB, Python
#!/usr/bin/env python3
"""
Extract Mexican GLAM institutions from conversation JSON files.
Follows GLAM Data Extraction project specifications.
"""
import json
import re
from typing import List, Dict, Any, Set
from datetime import datetime, timezone
from collections import defaultdict
class MexicanGLAMExtractor:
    """Extract heritage institutions from Mexican GLAM conversation artifacts."""

    # Comprehensive institution type patterns - capture up to natural boundaries.
    # Each value is a list of regexes; the lookaheads stop the capture at
    # sentence punctuation or at common English connector/verb words so the
    # match does not swallow the rest of the sentence.
    INSTITUTION_PATTERNS = {
        'MUSEUM': [
            r'Museo\s+Nacional\s+de\s+(?:Antropología|Arte|Historia|Culturas\s+Populares)[A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located|operates|provides|offers|has|was|is))',
            r'Museo\s+Regional\s+de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located|INAH|\())',
            r'Museo\s+de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]{3,50}?(?=\s*[,.\n]|\s+(?:in|at|located|operates|provides))',
            r'Casa\s+Museo\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
        ],
        'LIBRARY': [
            r'Biblioteca\s+Nacional\s+de\s+(?:México|Antropología)[A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
            r'Biblioteca\s+(?:Central|Pública)\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
            r'Hemeroteca\s+Nacional\s+(?:Digital\s+)?de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
            r'Red\s+Nacional\s+de\s+Bibliotecas\s+Públicas',
        ],
        'ARCHIVE': [
            r'Archivo\s+General\s+de\s+(?:la\s+Nación|[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?)(?=\s*[,.\n]|\s+(?:in|at|located|holds|manages))',
            r'Archivo\s+(?:Histórico|Municipal|del\s+Estado)\s+de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
        ],
        'GALLERY': [
            r'Galería\s+(?:de\s+Arte\s+|Nacional\s+)?[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
        ],
        'RESEARCH_CENTER': [
            r'Instituto\s+Nacional\s+de\s+Antropología\s+e\s+Historia(?:\s+\(INAH\))?',
            r'Instituto\s+Nacional\s+de\s+Bellas\s+Artes\s+y\s+Literatura(?:\s+\(INBAL\))?',
            r'Instituto\s+Nacional\s+de\s+Lenguas\s+Indígenas(?:\s+\(INALI\))?',
            r'Instituto\s+Nacional\s+de\s+Estudios\s+Históricos(?:\s+\(INEHRM\))?',
        ],
        'OFFICIAL_INSTITUTION': [
            r'Secretaría\s+de\s+Cultura',
            r'Sistema\s+de\s+Información\s+Cultural(?:\s+\(SIC\))?',
            r'Fonoteca\s+Nacional',
            r'IMCINE',
        ],
    }

    # URL patterns: capture until whitespace or a closing parenthesis.
    URL_PATTERN = re.compile(r'https?://[^\s\)]+')

    # Email patterns.
    # FIX: the original TLD class was [A-Z|a-z], which makes '|' a literal
    # character inside the class and let garbage such as "x@y.z|q" match.
    EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # Mexican states (upper-case) for location matching in section headers.
    MEXICAN_STATES = {
        'AGUASCALIENTES', 'BAJA CALIFORNIA', 'BAJA CALIFORNIA SUR', 'CAMPECHE',
        'CHIAPAS', 'CHIHUAHUA', 'COAHUILA', 'COLIMA', 'DURANGO', 'GUANAJUATO',
        'GUERRERO', 'HIDALGO', 'JALISCO', 'MÉXICO', 'MICHOACÁN', 'MORELOS',
        'NAYARIT', 'NUEVO LEÓN', 'OAXACA', 'PUEBLA', 'QUERÉTARO', 'QUINTANA ROO',
        'SAN LUIS POTOSÍ', 'SINALOA', 'SONORA', 'TABASCO', 'TAMAULIPAS',
        'TLAXCALA', 'VERACRUZ', 'YUCATÁN', 'ZACATECAS', 'CIUDAD DE MÉXICO'
    }

    def __init__(self):
        # Maps normalized name -> institution record (used for deduplication).
        self.extracted_institutions = {}
        # Sequential counter used to mint ids such as "mx-glam-0001".
        self.institution_id_counter = 1
def extract_from_conversation_file(self, filepath: str) -> Dict[str, Any]:
"""Extract institutions from a single conversation JSON file."""
with open(filepath, 'r', encoding='utf-8') as f:
data = json.load(f)
conversation_id = data.get('uuid', '')
conversation_name = data.get('name', '')
# Extract from all artifact content in the conversation
for msg in data.get('chat_messages', []):
for content in msg.get('content', []):
if content.get('type') == 'tool_use' and content.get('name') == 'artifacts':
artifact_text = content.get('input', {}).get('content', '')
if artifact_text:
self._extract_from_text(artifact_text, conversation_id, conversation_name)
return {
'conversation_id': conversation_id,
'conversation_name': conversation_name,
'institutions_found': len(self.extracted_institutions)
}
def _extract_from_text(self, text: str, conversation_id: str, conversation_name: str):
"""Extract institutions from artifact text content."""
# Split text into sections for better context tracking
sections = self._split_into_sections(text)
for section in sections:
# Extract state context from section header
state = self._extract_state_from_section(section)
# Extract institutions using pattern matching
for inst_type, patterns in self.INSTITUTION_PATTERNS.items():
for pattern in patterns:
matches = re.finditer(pattern, section, re.IGNORECASE)
for match in matches:
institution_name = self._clean_institution_name(match.group(0).strip())
# Skip if it's just a generic term
if self._is_generic_term(institution_name):
continue
# Extract context around the match
context = self._get_context(section, match.start(), match.end())
# Create or update institution record
self._add_institution(
name=institution_name,
institution_type=inst_type,
state=state,
context=context,
conversation_id=conversation_id,
conversation_name=conversation_name
)
def _split_into_sections(self, text: str) -> List[str]:
"""Split text into logical sections based on headers."""
# Split on ### headers or state names in ALL CAPS
section_pattern = re.compile(r'(?:^|\n)(?:###?\s+[A-ZÁÉÍÓÚÑ]|[A-ZÁÉÍÓÚÑ]{5,})', re.MULTILINE)
sections = section_pattern.split(text)
return [s for s in sections if len(s.strip()) > 50] # Filter out tiny sections
def _extract_state_from_section(self, section: str) -> str:
"""Extract Mexican state name from section text."""
# Look for state names in first 200 characters of section
header = section[:200].upper()
for state in self.MEXICAN_STATES:
if state in header:
return state
return ""
def _is_generic_term(self, name: str) -> bool:
"""Check if extracted name is just a generic term."""
generic_terms = [
'Museo de', 'Museo del', 'Museo Nacional', 'Biblioteca de',
'Archivo de', 'Instituto Nacional', 'INAH', 'INBAL',
'Museo Regional', 'Biblioteca Nacional', 'Archivo General'
]
# Must be longer than just the generic prefix
name_clean = name.strip()
if len(name_clean) < 15: # Too short to be a full institution name
return True
return False
def _clean_institution_name(self, name: str) -> str:
"""Clean institution name by removing sentence fragments."""
# Remove trailing verbs and common sentence starters
stop_words = [
r'\s+stands?\s+as\b.*',
r'\s+operates?\s+.*',
r'\s+provides?\s+.*',
r'\s+offers?\s+.*',
r'\s+manages?\s+.*',
r'\s+holds?\s+.*',
r'\s+contains?\s+.*',
r'\s+includes?\s+.*',
r'\s+features?\s+.*',
r'\s+was\s+.*',
r'\s+is\s+.*',
r'\s+has\s+.*',
r'\s+serves?\s+.*',
r'\s+at\s+.*',
r'\s+in\s+.*',
r'\s+located\s+.*',
]
cleaned = name
for pattern in stop_words:
cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
# Remove trailing punctuation and whitespace
cleaned = re.sub(r'[,.\s]+$', '', cleaned)
return cleaned.strip()
def _get_context(self, text: str, start: int, end: int, window: int = 500) -> str:
"""Extract context around a match for additional metadata."""
context_start = max(0, start - window)
context_end = min(len(text), end + window)
return text[context_start:context_end]
def _add_institution(self, name: str, institution_type: str, state: str,
context: str, conversation_id: str, conversation_name: str):
"""Add or update an institution record."""
# Normalize name for deduplication
name_normalized = self._normalize_name(name)
# Check if already exists
if name_normalized in self.extracted_institutions:
# Update existing record with additional context
existing = self.extracted_institutions[name_normalized]
existing['provenance']['confidence_score'] = min(1.0, existing['provenance']['confidence_score'] + 0.1)
if state and not existing['locations']:
if state:
existing['locations'].append({'city': '', 'region': state, 'country': 'MX'})
return
# Extract metadata from context
urls = self._extract_urls(context)
emails = self._extract_emails(context)
address = self._extract_address(context)
city = self._extract_city(context)
description = self._extract_description(context, name)
# Create new record
institution_id = f"mx-glam-{self.institution_id_counter:04d}"
self.institution_id_counter += 1
record = {
'id': institution_id,
'name': name,
'name_normalized': name_normalized,
'institution_type': institution_type,
'alternative_names': [],
'description': description,
'locations': [],
'identifiers': [],
'digital_platforms': [],
'provenance': {
'data_source': 'CONVERSATION_NLP',
'data_tier': 'TIER_4_INFERRED',
'extraction_date': datetime.now(timezone.utc).isoformat(),
'extraction_method': 'Pattern-based NER from Mexican GLAM conversations',
'confidence_score': self._calculate_confidence(name, context, urls),
'conversation_id': conversation_id,
'source_url': None,
}
}
# Add location if available
if city or state:
location = {
'city': city,
'region': state,
'country': 'MX',
}
if address:
location['street_address'] = address
record['locations'].append(location)
# Add identifiers (URLs, emails)
for url in urls:
record['identifiers'].append({
'identifier_scheme': 'Website',
'identifier_value': url,
'identifier_url': url,
})
if emails:
record['identifiers'].append({
'identifier_scheme': 'Email',
'identifier_value': emails[0], # Take first email
})
self.extracted_institutions[name_normalized] = record
def _normalize_name(self, name: str) -> str:
"""Normalize institution name for deduplication."""
# Remove extra whitespace, lowercase, remove punctuation
normalized = re.sub(r'\s+', ' ', name.strip().lower())
normalized = re.sub(r'[^\w\s]', '', normalized)
return normalized
def _extract_urls(self, context: str) -> List[str]:
"""Extract URLs from context."""
urls = self.URL_PATTERN.findall(context)
return [url.rstrip('.,;)') for url in urls[:3]] # Take up to 3 URLs
def _extract_emails(self, context: str) -> List[str]:
"""Extract email addresses from context."""
return self.EMAIL_PATTERN.findall(context)[:2] # Take up to 2 emails
def _extract_address(self, context: str) -> str:
"""Extract street address from context."""
# Look for "Address:" or address patterns
address_match = re.search(r'(?:Address|Dirección):\s*([^\n]+)', context, re.IGNORECASE)
if address_match:
return address_match.group(1).strip()
# Look for Mexican address patterns (street number, colony, postal code)
address_match = re.search(r'([A-Z][a-záéíóúñ]+\s+\d+[^,\n]+,?\s*\d{5})', context)
if address_match:
return address_match.group(1).strip()
return ""
def _extract_city(self, context: str) -> str:
"""Extract city name from context."""
# Common Mexican cities
cities = [
'México', 'Guadalajara', 'Monterrey', 'Puebla', 'Toluca',
'Tijuana', 'León', 'Ciudad Juárez', 'Zapopan', 'Mérida',
'Aguascalientes', 'Querétaro', 'Morelia', 'Hermosillo',
'Saltillo', 'Mexicali', 'Culiacán', 'Chihuahua', 'Oaxaca',
'Veracruz', 'Acapulco', 'Cancún', 'Cuernavaca', 'Pachuca',
'Durango', 'Tepic', 'Tuxtla Gutiérrez', 'Villahermosa',
'Campeche', 'Chetumal', 'Zacatecas', 'Colima', 'Guanajuato',
'Haarlem', # In case of Dutch crossover
]
context_lower = context.lower()
for city in cities:
if city.lower() in context_lower:
return city
return ""
def _extract_description(self, context: str, institution_name: str) -> str:
"""Extract description from context around institution mention."""
# Look for sentences containing the institution name
sentences = re.split(r'[.!?]\s+', context)
relevant_sentences = []
for sentence in sentences:
if institution_name[:20] in sentence: # Match on first 20 chars
relevant_sentences.append(sentence.strip())
if relevant_sentences:
return '. '.join(relevant_sentences[:2]) # Take up to 2 sentences
return ""
def _calculate_confidence(self, name: str, context: str, urls: List[str]) -> float:
"""Calculate confidence score for extraction."""
confidence = 0.6 # Base confidence for pattern match
# Increase confidence based on available metadata
if urls:
confidence += 0.15
if len(context) > 300:
confidence += 0.1
if len(name) > 25: # Longer, more specific name
confidence += 0.1
if 'museo' in name.lower() or 'biblioteca' in name.lower() or 'archivo' in name.lower():
confidence += 0.05
return min(1.0, confidence)
def get_results(self) -> Dict[str, Any]:
"""Get extraction results with statistics."""
institutions_list = list(self.extracted_institutions.values())
# Calculate statistics
type_counts = defaultdict(int)
state_counts = defaultdict(int)
for inst in institutions_list:
type_counts[inst['institution_type']] += 1
for loc in inst.get('locations', []):
if loc.get('region'):
state_counts[loc['region']] += 1
return {
'total_institutions': len(institutions_list),
'institutions': institutions_list,
'statistics': {
'by_type': dict(type_counts),
'by_state': dict(state_counts),
'with_urls': sum(1 for i in institutions_list if i.get('identifiers')),
'with_locations': sum(1 for i in institutions_list if i.get('locations')),
},
'extraction_metadata': {
'extraction_date': datetime.now(timezone.utc).isoformat(),
'data_tier': 'TIER_4_INFERRED',
'data_source': 'CONVERSATION_NLP',
}
}
def main():
    """Main extraction workflow: process both files, report, and save JSON."""
    print("Mexican GLAM Institution Extractor")
    print("=" * 60)

    extractor = MexicanGLAMExtractor()

    # The two conversation exports this script is hard-wired to process.
    source_files = ['mexican_glam_1.json', 'mexican_glam_2.json']
    for path in source_files:
        print(f"\nProcessing: {path}")
        summary = extractor.extract_from_conversation_file(path)
        print(f" Conversation: {summary['conversation_name']}")
        print(f" UUID: {summary['conversation_id']}")
        print(f" Running total: {summary['institutions_found']} institutions")

    results = extractor.get_results()

    # Report aggregate statistics.
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)
    print(f"Total institutions extracted: {results['total_institutions']}")
    print(f"\nBy institution type:")
    for kind, total in sorted(results['statistics']['by_type'].items()):
        print(f" {kind}: {total}")
    print(f"\nTop 10 states by institution count:")
    ranked = sorted(results['statistics']['by_state'].items(),
                    key=lambda item: item[1], reverse=True)
    for region, total in ranked[:10]:
        print(f" {region}: {total}")
    print(f"\nMetadata completeness:")
    print(f" With URLs: {results['statistics']['with_urls']}")
    print(f" With locations: {results['statistics']['with_locations']}")

    # Persist the full result set.
    output_file = 'mexican_glam_extracted.json'
    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(results, sink, ensure_ascii=False, indent=2)
    print(f"\nResults saved to: {output_file}")

    # Spot-check: show the first few extracted records.
    print("\n" + "=" * 60)
    print("SAMPLE INSTITUTIONS (first 10):")
    print("=" * 60)
    for index, record in enumerate(results['institutions'][:10], 1):
        print(f"\n{index}. {record['name']}")
        print(f" Type: {record['institution_type']}")
        print(f" Confidence: {record['provenance']['confidence_score']:.2f}")
        if record.get('locations'):
            place = record['locations'][0]
            print(f" Location: {place.get('city', '')}, {place.get('region', '')}")
        if record.get('identifiers'):
            sites = [entry['identifier_value'] for entry in record['identifiers']
                     if entry['identifier_scheme'] == 'Website']
            if sites:
                print(f" URL: {sites[0]}")


if __name__ == '__main__':
    main()