# glam/extract_conversations_batch.py
# Snapshot: 2025-11-19 23:25:22 +01:00 — 449 lines, 18 KiB, Python
#!/usr/bin/env python3
"""
Batch extract heritage institutions from conversation JSON files.
This script processes all conversation files in docs/reflection/ and extracts
heritage institution data (TIER_4) using NLP pattern matching.
Usage:
python extract_conversations_batch.py [--limit N] [--country CODE]
"""
import json
import re
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any, Set, Tuple
from collections import defaultdict
import argparse
# Institution type keywords (from AGENTS.md taxonomy).
# Maps each institution-type code to lowercase keyword variants in several
# languages; matched as plain substrings against lowercased text, and (via
# .title()) used to anchor the "The <Name> <Keyword>" mention regex.
INSTITUTION_KEYWORDS = {
    'MUSEUM': [
        'museum', 'museo', 'museu', 'musée', 'muzeum', 'muzej',
        'art gallery', 'kunstmuseum', 'kunsthal'
    ],
    'LIBRARY': [
        'library', 'biblioteca', 'bibliothek', 'bibliotheek', 'bibliothèque',
        'biblioteka', 'national library', 'public library', 'university library'
    ],
    'ARCHIVE': [
        'archive', 'archiv', 'archivo', 'arquivo', 'archief',
        'national archive', 'state archive', 'regional archive'
    ],
    # NOTE: 'kunsthal' appears under both MUSEUM and GALLERY, so it scores
    # one point for each type during classification.
    'GALLERY': [
        'gallery', 'galerie', 'galería', 'kunsthal', 'art center'
    ],
    'RESEARCH_CENTER': [
        'research center', 'research centre', 'research institute',
        'documentation center', 'knowledge center'
    ],
    'BOTANICAL_ZOO': [
        'botanical garden', 'botanic garden', 'arboretum',
        'zoo', 'zoological garden', 'zoological park'
    ],
    'EDUCATION_PROVIDER': [
        'university', 'universidad', 'universiteit', 'université',
        'college', 'school', 'institute', 'academy'
    ],
    'HOLY_SITES': [
        'church', 'cathedral', 'mosque', 'temple', 'synagogue',
        'monastery', 'abbey', 'shrine'
    ]
}
# ISIL code pattern (e.g., NL-AsdRM, US-MBMM, BR-RjBN): two uppercase letters,
# a hyphen, then an alphanumeric suffix.
# NOTE(review): real ISIL suffixes may also contain '/' and '-' — TODO confirm
# whether this narrower pattern is intentional.
ISIL_PATTERN = re.compile(r'\b([A-Z]{2}-[A-Za-z0-9]+)\b')
# Website URL pattern: http(s) scheme up to the next whitespace, '<', '>' or '"'.
URL_PATTERN = re.compile(r'https?://[^\s<>"]+')
# Country codes (ISO 3166-1 alpha-2). Used to validate the two-letter prefix
# of candidate ISIL codes; a prefix outside this set is discarded.
COUNTRY_CODES = {
    'NL', 'US', 'BR', 'GB', 'FR', 'DE', 'ES', 'IT', 'PT', 'BE',
    'AR', 'MX', 'CL', 'CO', 'PE', 'VE', 'EC', 'BO', 'PY', 'UY',
    'JP', 'CN', 'IN', 'KR', 'TH', 'VN', 'ID', 'PH', 'MY', 'SG',
    'EG', 'MA', 'DZ', 'TN', 'LY', 'ZA', 'NG', 'KE', 'GH', 'ET',
    'AU', 'NZ', 'CA', 'RU', 'TR', 'SA', 'AE', 'QA', 'KW', 'OM',
    'PL', 'CZ', 'HU', 'RO', 'BG', 'GR', 'HR', 'RS', 'UA', 'BY'
}
class ConversationExtractor:
"""Extract heritage institutions from Claude conversation files."""
def __init__(self, verbose: bool = True):
self.verbose = verbose
self.stats = defaultdict(int)
self.extracted_institutions = []
self.seen_names = set()
def log(self, message: str):
"""Print log message if verbose."""
if self.verbose:
print(message)
def extract_country_from_filename(self, filename: str) -> str:
"""Extract country name from conversation filename."""
# Examples:
# - Brazilian_GLAM_collection_inventories.json → Brazil
# - Mexican_GLAM_inventories_and_catalogues.json → Mexico
# - Panamanian_cultural_heritage_resources.json → Panama
country_map = {
'brazilian': 'BR', 'brazil': 'BR',
'mexican': 'MX', 'mexico': 'MX',
'panamanian': 'PA', 'panama': 'PA',
'argentine': 'AR', 'argentina': 'AR', 'argentinian': 'AR',
'chilean': 'CL', 'chile': 'CL',
'colombian': 'CO', 'colombia': 'CO',
'canadian': 'CA', 'canada': 'CA',
'american': 'US', 'united states': 'US',
'dutch': 'NL', 'netherlands': 'NL', 'holland': 'NL',
'german': 'DE', 'germany': 'DE',
'french': 'FR', 'france': 'FR',
'spanish': 'ES', 'spain': 'ES',
'italian': 'IT', 'italy': 'IT',
'portuguese': 'PT', 'portugal': 'PT',
'belgian': 'BE', 'belgium': 'BE',
'austrian': 'AT', 'austria': 'AT',
'japanese': 'JP', 'japan': 'JP',
'chinese': 'CN', 'china': 'CN',
'indian': 'IN', 'india': 'IN',
'egyptian': 'EG', 'egypt': 'EG',
'moroccan': 'MA', 'morocco': 'MA',
'algerian': 'DZ', 'algeria': 'DZ',
'tunisian': 'TN', 'tunisia': 'TN',
'libyan': 'LY', 'libya': 'LY',
'south african': 'ZA', 'south africa': 'ZA',
'nigerian': 'NG', 'nigeria': 'NG',
'kenyan': 'KE', 'kenya': 'KE',
'ghanaian': 'GH', 'ghana': 'GH',
'ethiopian': 'ET', 'ethiopia': 'ET',
'pakistani': 'PK', 'pakistan': 'PK',
'afghan': 'AF', 'afghanistan': 'AF',
'iraqi': 'IQ', 'iraq': 'IQ',
'hungarian': 'HU', 'hungary': 'HU',
'polish': 'PL', 'poland': 'PL',
'czech': 'CZ', 'czech republic': 'CZ',
'romanian': 'RO', 'romania': 'RO',
'bulgarian': 'BG', 'bulgaria': 'BG',
'greek': 'GR', 'greece': 'GR',
'croatian': 'HR', 'croatia': 'HR',
'serbian': 'RS', 'serbia': 'RS',
'ukrainian': 'UA', 'ukraine': 'UA',
'belarusian': 'BY', 'belarus': 'BY',
'thai': 'TH', 'thailand': 'TH',
'vietnamese': 'VN', 'vietnam': 'VN',
'indonesian': 'ID', 'indonesia': 'ID',
'filipino': 'PH', 'philippines': 'PH',
'malaysian': 'MY', 'malaysia': 'MY',
'singaporean': 'SG', 'singapore': 'SG',
'australian': 'AU', 'australia': 'AU',
'new zealand': 'NZ',
'russian': 'RU', 'russia': 'RU',
'turkish': 'TR', 'turkey': 'TR',
'saudi': 'SA', 'saudi arabia': 'SA',
'emirati': 'AE', 'uae': 'AE', 'emirates': 'AE',
'qatari': 'QA', 'qatar': 'QA',
'kuwaiti': 'KW', 'kuwait': 'KW',
'omani': 'OM', 'oman': 'OM',
'cuban': 'CU', 'cuba': 'CU',
'madagascan': 'MG', 'madagascar': 'MG',
'togolese': 'TG', 'togo': 'TG',
'zeeland': 'NL', # Dutch province
'limburg': 'NL', # Dutch province
}
filename_lower = filename.lower()
for country_name, code in country_map.items():
if country_name in filename_lower:
return code
return 'XX' # Unknown country
def classify_institution_type(self, text: str) -> str:
"""Classify institution type based on keywords in text."""
text_lower = text.lower()
# Count keyword matches per type
scores = defaultdict(int)
for inst_type, keywords in INSTITUTION_KEYWORDS.items():
for keyword in keywords:
if keyword in text_lower:
scores[inst_type] += 1
# Return type with most matches
if scores:
return max(scores.items(), key=lambda x: x[1])[0]
return 'UNKNOWN'
def extract_identifiers(self, text: str) -> List[Dict[str, str]]:
"""Extract identifiers (ISIL, URLs) from text."""
identifiers = []
# Extract ISIL codes
for match in ISIL_PATTERN.finditer(text):
isil_code = match.group(1)
# Verify it's a real ISIL (starts with valid country code)
country_prefix = isil_code.split('-')[0]
if country_prefix in COUNTRY_CODES:
identifiers.append({
'identifier_scheme': 'ISIL',
'identifier_value': isil_code,
'identifier_url': f'https://isil.org/{isil_code}'
})
# Extract website URLs
for match in URL_PATTERN.finditer(text):
url = match.group(0)
# Clean trailing punctuation
url = url.rstrip('.,;:)')
identifiers.append({
'identifier_scheme': 'Website',
'identifier_value': url,
'identifier_url': url
})
return identifiers
def extract_location(self, text: str, country_code: str) -> Dict[str, str]:
"""Extract location information from text."""
# Simple city extraction (look for "in CITY", "located in CITY", etc.)
location_patterns = [
r'(?:in|at|located in|based in)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)',
r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?),\s*(?:[A-Z]{2}|[A-Z][a-z]+)'
]
for pattern in location_patterns:
match = re.search(pattern, text)
if match:
city = match.group(1).strip()
return {
'city': city,
'country': country_code
}
return {'country': country_code}
def extract_institutions_from_text(self, text: str, country_code: str) -> List[Dict[str, Any]]:
"""Extract institution mentions from a text block."""
institutions = []
# Split text into sentences
sentences = re.split(r'[.!?]\s+', text)
for sentence in sentences:
# Look for institution mentions (patterns like "The X Museum", "X Library", etc.)
# This is a simplified approach - could be enhanced with NER
# Pattern 1: "The [Institution Name]" followed by institution keyword
pattern1 = r'(?:The|the)\s+([A-Z][^.!?]{5,80}?(?:' + '|'.join(
[kw.title() for keywords in INSTITUTION_KEYWORDS.values() for kw in keywords]
) + r'))'
for match in re.finditer(pattern1, sentence):
name = match.group(1).strip()
# Skip if already seen (deduplicate within file)
name_normalized = name.lower()
if name_normalized in self.seen_names:
continue
# Extract additional info
inst_type = self.classify_institution_type(sentence)
identifiers = self.extract_identifiers(sentence)
location = self.extract_location(sentence, country_code)
# Only add if we have a meaningful name
if len(name) > 5 and inst_type != 'UNKNOWN':
self.seen_names.add(name_normalized)
institutions.append({
'name': name,
'institution_type': inst_type,
'identifiers': identifiers,
'location': location,
'source_text': sentence[:200] # Keep snippet for verification
})
return institutions
def process_conversation(self, filepath: Path) -> List[Dict[str, Any]]:
"""Process a single conversation file and extract institutions."""
self.log(f"📄 Processing: {filepath.name}")
try:
with open(filepath, 'r', encoding='utf-8') as f:
conversation = json.load(f)
except Exception as e:
self.log(f" ❌ Error reading file: {e}")
self.stats['errors'] += 1
return []
# Extract country from filename
country_code = self.extract_country_from_filename(filepath.name)
# Collect all text from assistant messages
full_text = []
for message in conversation.get('chat_messages', []):
if message.get('sender') == 'assistant':
text = message.get('text', '')
if text:
full_text.append(text)
if not full_text:
self.log(f" ⚠️ No assistant messages found")
self.stats['empty'] += 1
return []
# Extract institutions from combined text
combined_text = '\n\n'.join(full_text)
institutions = self.extract_institutions_from_text(combined_text, country_code)
# Add provenance metadata
for inst in institutions:
inst['provenance'] = {
'data_source': 'CONVERSATION_NLP',
'data_tier': 'TIER_4_INFERRED',
'extraction_date': datetime.now(timezone.utc).isoformat(),
'extraction_method': 'Pattern-based NLP extraction from conversation',
'confidence_score': 0.6, # Lower confidence for conversational data
'conversation_id': conversation.get('uuid', 'unknown'),
'conversation_name': conversation.get('name', ''),
'source_file': filepath.name
}
self.log(f" ✅ Extracted {len(institutions)} institutions")
self.stats['processed'] += 1
self.stats['institutions_found'] += len(institutions)
return institutions
def process_all_conversations(self, conversations_dir: Path, limit: int | None = None,
country_filter: str | None = None) -> List[Dict[str, Any]]:
"""Process all conversation files in a directory."""
conversation_files = sorted(conversations_dir.glob('*.json'))
if country_filter:
conversation_files = [f for f in conversation_files
if country_filter.lower() in f.name.lower()]
if limit:
conversation_files = conversation_files[:limit]
self.log(f"\n🔍 Found {len(conversation_files)} conversation files to process")
if country_filter:
self.log(f" Filtering by country: {country_filter}")
if limit:
self.log(f" Limited to first {limit} files")
self.log("")
all_institutions = []
for filepath in conversation_files:
institutions = self.process_conversation(filepath)
all_institutions.extend(institutions)
# Brief progress update every 10 files
if self.stats['processed'] % 10 == 0:
self.log(f" Progress: {self.stats['processed']}/{len(conversation_files)} files, "
f"{self.stats['institutions_found']} institutions")
return all_institutions
def convert_to_linkml(self, institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Convert extracted institutions to LinkML format."""
linkml_institutions = []
for idx, inst in enumerate(institutions, 1):
# Generate ID
name_slug = re.sub(r'[^a-z0-9]+', '-', inst['name'].lower())[:50]
country = inst['location'].get('country', 'xx')
inst_id = f"https://w3id.org/heritage/custodian/{country.lower()}/{name_slug}-conv{idx}"
linkml_inst = {
'id': inst_id,
'name': inst['name'],
'institution_type': inst['institution_type'],
'provenance': inst['provenance']
}
# Add identifiers if present
if inst.get('identifiers'):
linkml_inst['identifiers'] = inst['identifiers']
# Add location if present
if inst.get('location'):
linkml_inst['locations'] = [inst['location']]
# Add description (source text snippet)
if inst.get('source_text'):
linkml_inst['description'] = f"Extracted from conversation: {inst['source_text'][:150]}..."
linkml_institutions.append(linkml_inst)
return linkml_institutions
def main():
    """CLI entry point: extract institutions from conversations to a YAML file."""
    parser = argparse.ArgumentParser(description='Extract institutions from conversation files')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--country', type=str, help='Filter by country name (e.g., "Brazil")')
    parser.add_argument('--output', type=str, default='data/instances/conversations_extracted.yaml',
                        help='Output YAML file path')
    parser.add_argument('--quiet', action='store_true', help='Suppress progress messages')
    args = parser.parse_args()

    # All paths are resolved relative to this script's directory.
    project_root = Path(__file__).parent
    conversations_dir = project_root / 'docs' / 'reflection'
    output_path = project_root / args.output

    # Fail fast instead of silently writing an empty YAML file when the
    # conversations directory does not exist.
    if not conversations_dir.is_dir():
        parser.error(f'Conversations directory not found: {conversations_dir}')

    output_path.parent.mkdir(parents=True, exist_ok=True)

    extractor = ConversationExtractor(verbose=not args.quiet)
    institutions = extractor.process_all_conversations(
        conversations_dir,
        limit=args.limit,
        country_filter=args.country
    )

    print("\n📝 Converting to LinkML format...")
    linkml_institutions = extractor.convert_to_linkml(institutions)

    print(f"💾 Saving to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(linkml_institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)

    # Summary statistics.
    print("\n📊 Extraction Summary:")
    print(f" Files processed: {extractor.stats['processed']}")
    print(f" Files with errors: {extractor.stats['errors']}")
    print(f" Empty files: {extractor.stats['empty']}")
    print(f" Total institutions extracted: {len(linkml_institutions)}")
    print(f" Unique institution names: {len(extractor.seen_names)}")
    print(f" Output file: {output_path}")
    print(f" File size: {output_path.stat().st_size / 1024:.1f} KB")

    # Per-country distribution of the extracted records.
    country_dist = defaultdict(int)
    for inst in linkml_institutions:
        country = inst.get('locations', [{}])[0].get('country', 'XX')
        country_dist[country] += 1
    print("\n🌍 Country Distribution (Top 10):")
    for country, count in sorted(country_dist.items(), key=lambda x: -x[1])[:10]:
        print(f" {country}: {count}")

    print("\n✅ Extraction complete!")


if __name__ == '__main__':
    main()