glam/scripts/extract_egypt_institutions.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

294 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Extract Egyptian heritage institutions from conversation JSON file.
This script reads the Egyptian GLAM inventory conversation and extracts
heritage custodian records in LinkML-compliant format.
"""
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional
import yaml
# Import models
import sys
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
from glam_extractor.models import (
HeritageCustodian,
Location,
Identifier,
DigitalPlatform,
Provenance,
DataSource,
DataTier,
InstitutionType,
DigitalPlatformType
)
# Conversation file path
CONVERSATION_PATH = Path("/Users/kempersc/Documents/claude/data-2025-11-02-18-13-26-batch-0000/conversations/2025-09-22T14-50-31-39e11630-a2af-407c-a365-d485eb8257b0-Egyptian_GLAM_resources_inventory.json")
def extract_markdown_content(json_path: Path) -> str:
    """
    Extract the Egyptian GLAM markdown artifact from a conversation JSON export.

    Scans every chat message's content entries for an 'artifacts' tool_use
    whose input payload contains the inventory markdown.

    Args:
        json_path: Path to the exported conversation JSON file.

    Returns:
        The markdown text of the first matching artifact, or "" if none found.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Robustness fix: .get() instead of data['chat_messages'] so a malformed
    # or partial export yields "" rather than raising KeyError.
    for msg in data.get('chat_messages', []):
        for content in msg.get('content', []):
            if content.get('type') == 'tool_use' and content.get('name') == 'artifacts':
                markdown = content.get('input', {}).get('content', '')
                # Require the marker phrase so unrelated artifacts are skipped.
                if markdown and 'Egyptian GLAM' in markdown:
                    return markdown
    return ""
def parse_institution_section(text: str, institution_type: InstitutionType) -> Optional[HeritageCustodian]:
    """
    Parse a single institution section from markdown text.

    Args:
        text: Markdown text for one institution
        institution_type: Type of institution (LIBRARY, MUSEUM, etc.)

    Returns:
        HeritageCustodian object or None if parsing fails
    """
    # Institution name is the first bolded "##"/"###" heading.
    name_match = re.search(r'###?\s+\*\*(.+?)\*\*', text)
    if not name_match:
        return None
    name = name_match.group(1).strip()

    # Alternative (typically Arabic) name. Two layouts occur in the markdown:
    #   **<other name>** (<gloss>)       -> alternative name is group(1)
    #   **<same name>** (<Arabic name>)  -> alternative name is group(2)
    # BUG FIX: the original only appended group(1) when it differed from the
    # heading name, silently discarding the parenthesized Arabic name in the
    # second (heading-attached) layout.
    alternative_names = []
    arabic_match = re.search(r'\*\*(.+?)\*\*\s+\((.+?)\)', text)
    if arabic_match:
        if arabic_match.group(1) != name:
            alternative_names.append(arabic_match.group(1))
        else:
            alternative_names.append(arabic_match.group(2))

    # Location: pull the Address/Location line and derive a city from it.
    location = None
    city_match = re.search(r'\*\*(?:Address|Location)\*\*:\s*(.+?)(?:\n|$)', text)
    if city_match:
        address_text = city_match.group(1).strip()
        # Prefer a known major city mentioned anywhere in the address
        # (order matters: matches the original if/elif chain).
        for known_city in ('Cairo', 'Alexandria', 'Aswan', 'Luxor'):
            if known_city in address_text:
                city = known_city
                break
        else:
            # Fall back to the last comma-separated token; default to Cairo
            # when the address has no comma at all.
            parts = address_text.split(',')
            city = parts[-1].strip() if len(parts) > 1 else 'Cairo'
        location = Location(
            city=city,
            country="EG",
            street_address=address_text
        )

    # Website URL, recorded as an Identifier in scheme "Website".
    identifiers = []
    website_match = re.search(r'\*\*Website\*\*:\s+(https?://[^\s\)]+)', text)
    if website_match:
        url = website_match.group(1).strip()
        identifiers.append(Identifier(
            identifier_scheme="Website",
            identifier_value=url,
            identifier_url=url
        ))

    # Description: first Collections / Digital Infrastructure section, if any.
    collection_match = re.search(
        r'\*\*(?:Collections?|Digital Infrastructure)\*\*:(.+?)(?=\n\*\*|\Z)',
        text, re.DOTALL)
    if collection_match:
        description = collection_match.group(1).strip()
    else:
        # Fix: dropped a pointless f-string prefix on this constant literal.
        description = "Heritage institution in Egypt."
    # Truncate overly long descriptions for readability.
    if len(description) > 500:
        description = description[:500] + "..."

    # Provenance metadata: marks this record as NLP-inferred (tier 4) from the
    # source conversation.
    provenance = Provenance(
        data_source=DataSource.CONVERSATION_NLP,
        data_tier=DataTier.TIER_4_INFERRED,
        extraction_date=datetime.now(timezone.utc),
        extraction_method="Python NLP extraction from Egyptian GLAM conversation",
        confidence_score=0.85,
        conversation_id="39e11630-a2af-407c-a365-d485eb8257b0"
    )

    # Stable w3id identifier derived from a lowercase slug of the name.
    name_slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
    institution_id = f"https://w3id.org/heritage/custodian/eg/{name_slug}"

    custodian = HeritageCustodian(
        id=institution_id,
        name=name,
        institution_type=institution_type,
        alternative_names=alternative_names if alternative_names else None,
        description=description,
        locations=[location] if location else [],
        identifiers=identifiers if identifiers else [],
        provenance=provenance
    )
    return custodian
def _slice_section(markdown: str, start_marker: str, end_marker: str) -> str:
    """Return the markdown between start_marker (inclusive) and end_marker,
    or "" if either marker is missing or they are out of order."""
    start = markdown.find(start_marker)
    end = markdown.find(end_marker)
    if start == -1 or end == -1 or end <= start:
        return ""
    return markdown[start:end]


def extract_institutions_step1() -> List[HeritageCustodian]:
    """
    Step 1: Extract major national institutions (libraries and museums).

    Returns:
        List of HeritageCustodian objects
    """
    print("Step 1: Extracting markdown content...")
    markdown = extract_markdown_content(CONVERSATION_PATH)
    if not markdown:
        print("ERROR: Could not extract markdown content from conversation")
        return []
    print(f" Found {len(markdown)} characters of content")

    institutions = []
    # (progress label, start marker, end marker, institution type) per section.
    sections = [
        ("Step 2: Extracting Bibliotheca Alexandrina...",
         "### **Bibliotheca Alexandrina**",
         "### **Egyptian National Library",
         InstitutionType.LIBRARY),
        ("Step 3: Extracting Egyptian National Library...",
         "### **Egyptian National Library",
         "### **National Archives",
         InstitutionType.LIBRARY),
        ("Step 4: Extracting National Archives...",
         "### **National Archives",
         "# Part II",
         InstitutionType.ARCHIVE),
    ]
    for label, start_marker, end_marker, inst_type in sections:
        print(f"\n{label}")
        # BUG FIX: str.find() returns -1 when a marker is absent; the original
        # sliced with the raw indices, so a missing marker silently produced a
        # wrong (or empty) section. _slice_section guards both indices, making
        # this consistent with the -1 checks already done in step 2.
        section = _slice_section(markdown, start_marker, end_marker)
        if not section:
            continue
        inst = parse_institution_section(section, inst_type)
        if inst:
            institutions.append(inst)
            print(f" ✓ Extracted: {inst.name}")
    return institutions
def extract_institutions_step2() -> List[HeritageCustodian]:
    """
    Step 2: Extract major museums.

    Returns:
        List of HeritageCustodian objects
    """
    print("Step 2: Extracting museums...")
    markdown = extract_markdown_content(CONVERSATION_PATH)
    if not markdown:
        print("ERROR: Could not extract markdown content from conversation")
        return []
    print(f" Found {len(markdown)} characters of content")

    # (display name, start marker, end marker) for each museum section.
    museum_sections = [
        ("Egyptian Museum Cairo (EMC)", "### **Egyptian Museum Cairo", "### **Grand Egyptian Museum"),
        ("Grand Egyptian Museum (GEM)", "### **Grand Egyptian Museum", "### **National Museum of Egyptian Civilization"),
        ("National Museum of Egyptian Civilization (NMEC)", "### **National Museum of Egyptian Civilization", "### **Regional Archaeological Museums"),
        ("Museum of Islamic Art Cairo", "### **Museum of Islamic Art Cairo**", "### **Coptic Museum"),
        ("Coptic Museum", "### **Coptic Museum**", "### **Greco-Roman Museum"),
        ("Greco-Roman Museum Alexandria", "### **Greco-Roman Museum Alexandria**", "### **Art Museums"),
    ]

    results = []
    for name, start_marker, end_marker in museum_sections:
        print(f"\n Extracting {name}...")
        try:
            bounds = (markdown.find(start_marker), markdown.find(end_marker))
            # A missing marker means the conversation layout changed; skip.
            if -1 in bounds:
                print(f" ⚠ Could not find section markers for {name}")
                continue
            record = parse_institution_section(
                markdown[bounds[0]:bounds[1]], InstitutionType.MUSEUM)
            if record:
                results.append(record)
                print(f" ✓ Extracted: {record.name}")
        except Exception as e:
            # Best-effort extraction: one bad section must not abort the rest.
            print(f" ✗ Error extracting {name}: {e}")
    return results
def main():
    """Main extraction workflow - Steps 1 and 2."""
    banner = "=" * 60
    print(banner)
    print("Egyptian GLAM Institution Extraction - STEPS 1 & 2")
    print(banner)

    # Batch 1: national libraries and archives.
    print("\n" + banner)
    print("STEP 1: National Libraries and Archives")
    print(banner)
    all_records = extract_institutions_step1()
    print(f"\nStep 1 Complete: Extracted {len(all_records)} institutions")

    # Batch 2: museums, appended to the same result list.
    print("\n" + banner)
    print("STEP 2: Museums")
    print(banner)
    museum_records = extract_institutions_step2()
    all_records.extend(museum_records)
    print(f"\nStep 2 Complete: Extracted {len(museum_records)} museums")

    print(f"\n{banner}")
    print(f"Total Extracted: {len(all_records)} institutions")
    print(f"{banner}")

    # Persist as LinkML instance YAML under data/instances/.
    output_path = Path(__file__).parent.parent / "data" / "instances" / "egypt_step1_2.yaml"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nSaving to: {output_path}")
    # Pydantic models -> plain JSON-mode dicts so PyYAML can serialize them.
    dumped = [rec.model_dump(exclude_none=True, mode='json') for rec in all_records]
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(dumped, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"✓ Saved {len(all_records)} institutions to {output_path.name}")

    # Human-readable summary of everything extracted.
    print("\nInstitutions extracted:")
    for idx, rec in enumerate(all_records, 1):
        print(f" {idx}. {rec.name} ({rec.institution_type})")
        if rec.locations:
            print(f" Location: {rec.locations[0].city}")
        if rec.identifiers:
            print(f" Website: {rec.identifiers[0].identifier_url}")


if __name__ == "__main__":
    main()