# Changelog:
# - Add emic_name, name_language, standardized_name to CustodianName
# - Add scripts for enriching custodian emic names from Wikidata
# - Add YouTube and Google Maps enrichment scripts
# - Update DuckLake loader for new schema fields
#!/usr/bin/env python3
"""
Enrich custodian descriptions using available data sources and GLM-4.6.

This script:
1. Finds custodian files with placeholder descriptions
2. Gathers available data (Wikidata, Google Maps, UNESCO MoW, etc.)
3. Uses GLM-4.6 to generate a rich description
4. Updates the file with the new description

Usage:
    python enrich_descriptions.py --limit 10   # Process 10 files
    python enrich_descriptions.py --dry-run    # Show what would be done
    python enrich_descriptions.py --all        # Process all files
"""
|
|
|
|
import asyncio
import argparse
import os
import re  # NOTE(review): unused in the visible code — confirm before removing
import json  # NOTE(review): unused in the visible code (httpx handles JSON) — confirm
from pathlib import Path
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import httpx
from ruamel.yaml import YAML

# Load environment variables (e.g. ZAI_API_TOKEN) from a local .env file.
from dotenv import load_dotenv
load_dotenv()

# Constants
# Directory holding one YAML file per custodian institution.
DATA_DIR = Path(__file__).parent.parent / "data" / "custodian"
# Generic description that marks an entry as not yet enriched.
PLACEHOLDER_DESCRIPTION = "Heritage institution holding UNESCO Memory of the World inscribed documents"

# Z.AI GLM API configuration
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
|
|
|
|
|
|
class DescriptionEnricher:
    """Enrich custodian descriptions using GLM-4.6.

    Scans custodian YAML files for placeholder descriptions, assembles the
    available enrichment context (Wikidata, Google Maps, UNESCO MoW), asks
    the GLM chat API for a concise description, and writes the result back
    into each file. Tracks processed/enriched/skipped/error counts in
    ``self.stats``.
    """

    # System prompt sent verbatim with every request; instructs the model
    # to stay factual and return plain description text only.
    SYSTEM_PROMPT = """You are a cultural heritage expert writing descriptions for heritage institutions.

Your task is to create a concise, informative description (2-4 sentences) for a heritage institution based on the available data.

## Guidelines
- Focus on what makes the institution significant
- Include the type of collections if known (manuscripts, archives, art, etc.)
- Mention UNESCO Memory of the World inscriptions if present
- Include location context when relevant
- Use formal, encyclopedic tone
- Do NOT invent information not present in the data
- Keep descriptions under 100 words

## Output Format
Provide ONLY the description text, no quotes or formatting.
"""
|
|
|
|
def __init__(self, model: str = "glm-4.6", dry_run: bool = False):
|
|
self.api_key = os.environ.get("ZAI_API_TOKEN")
|
|
if not self.api_key:
|
|
raise ValueError("ZAI_API_TOKEN not found in environment. See docs/GLM_API_SETUP.md")
|
|
|
|
self.model = model
|
|
self.dry_run = dry_run
|
|
self.yaml = YAML()
|
|
self.yaml.preserve_quotes = True
|
|
self.yaml.default_flow_style = False
|
|
self.yaml.width = 4096 # Prevent line wrapping
|
|
|
|
self.client = httpx.AsyncClient(
|
|
timeout=60.0,
|
|
headers={
|
|
"Authorization": f"Bearer {self.api_key}",
|
|
"Content-Type": "application/json",
|
|
}
|
|
)
|
|
|
|
self.stats = {
|
|
"processed": 0,
|
|
"enriched": 0,
|
|
"skipped": 0,
|
|
"errors": 0,
|
|
}
|
|
|
|
async def close(self):
|
|
"""Close the HTTP client."""
|
|
await self.client.aclose()
|
|
|
|
def find_files_with_placeholder(self, limit: Optional[int] = None) -> List[Path]:
|
|
"""Find custodian files with placeholder descriptions."""
|
|
files = []
|
|
|
|
for yaml_file in DATA_DIR.glob("*.yaml"):
|
|
try:
|
|
with open(yaml_file, 'r', encoding='utf-8') as f:
|
|
data = self.yaml.load(f)
|
|
|
|
if not data:
|
|
continue
|
|
|
|
# Check for placeholder in wikidata_enrichment.wikidata_description_en
|
|
wd_desc = data.get('wikidata_enrichment', {}).get('wikidata_description_en', '')
|
|
if PLACEHOLDER_DESCRIPTION in str(wd_desc):
|
|
files.append(yaml_file)
|
|
if limit and len(files) >= limit:
|
|
break
|
|
|
|
except Exception as e:
|
|
print(f"Error reading {yaml_file}: {e}")
|
|
|
|
return files
|
|
|
|
def gather_context(self, data: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""Gather all available context from the entry."""
|
|
context = {
|
|
"name": None,
|
|
"type": None,
|
|
"location": {},
|
|
"wikidata": {},
|
|
"google_maps": {},
|
|
"unesco_mow": {},
|
|
"collections": [],
|
|
}
|
|
|
|
# Name from various sources
|
|
if 'custodian_name' in data:
|
|
context['name'] = data['custodian_name'].get('claim_value')
|
|
elif 'wikidata_enrichment' in data:
|
|
context['name'] = data['wikidata_enrichment'].get('wikidata_label_en')
|
|
elif 'original_entry' in data:
|
|
context['name'] = data['original_entry'].get('name') or data['original_entry'].get('organisatie')
|
|
|
|
# Institution type
|
|
if 'wikidata_enrichment' in data:
|
|
context['type'] = data['wikidata_enrichment'].get('instance_of')
|
|
|
|
# Location from GHCID
|
|
if 'ghcid' in data:
|
|
loc_res = data['ghcid'].get('location_resolution', {})
|
|
context['location'] = {
|
|
"city": loc_res.get('city_label'),
|
|
"country": loc_res.get('country_label'),
|
|
"region": loc_res.get('region_code'),
|
|
}
|
|
|
|
# Wikidata data
|
|
if 'wikidata_enrichment' in data:
|
|
wd = data['wikidata_enrichment']
|
|
context['wikidata'] = {
|
|
"qid": wd.get('wikidata_entity_id'),
|
|
"instance_of": wd.get('instance_of'),
|
|
}
|
|
|
|
# Google Maps data
|
|
if 'google_maps_enrichment' in data:
|
|
gm = data['google_maps_enrichment']
|
|
context['google_maps'] = {
|
|
"name": gm.get('name'),
|
|
"types": gm.get('google_place_types', []),
|
|
"address": gm.get('formatted_address'),
|
|
"primary_type": gm.get('primary_type'),
|
|
}
|
|
|
|
# UNESCO Memory of the World
|
|
if 'unesco_mow_enrichment' in data:
|
|
mow = data['unesco_mow_enrichment']
|
|
context['unesco_mow'] = {
|
|
"is_custodian": mow.get('is_mow_custodian', False),
|
|
"inscription_count": mow.get('inscription_count', 0),
|
|
"inscriptions": [
|
|
{"name": i.get('name'), "country": i.get('inscription_country')}
|
|
for i in mow.get('inscriptions', [])
|
|
],
|
|
}
|
|
|
|
return context
|
|
|
|
def build_prompt(self, context: Dict[str, Any]) -> str:
|
|
"""Build a prompt for GLM based on available context."""
|
|
parts = [f"Institution: {context['name']}"]
|
|
|
|
if context['type']:
|
|
parts.append(f"Type: {context['type']}")
|
|
|
|
if context['location'].get('city'):
|
|
loc = context['location']
|
|
loc_str = f"Location: {loc['city']}"
|
|
if loc.get('country'):
|
|
loc_str += f", {loc['country']}"
|
|
parts.append(loc_str)
|
|
|
|
if context['google_maps'].get('types'):
|
|
parts.append(f"Google Maps Types: {', '.join(context['google_maps']['types'])}")
|
|
|
|
if context['unesco_mow'].get('is_custodian'):
|
|
mow = context['unesco_mow']
|
|
inscriptions = mow.get('inscriptions', [])
|
|
if inscriptions:
|
|
inscription_names = [i['name'] for i in inscriptions[:3]] # Limit to 3
|
|
parts.append(f"UNESCO Memory of the World inscriptions held: {', '.join(inscription_names)}")
|
|
if mow['inscription_count'] > 3:
|
|
parts.append(f"(Total: {mow['inscription_count']} inscriptions)")
|
|
|
|
if context['wikidata'].get('qid'):
|
|
parts.append(f"Wikidata ID: {context['wikidata']['qid']}")
|
|
|
|
return "\n".join(parts)
|
|
|
|
async def generate_description(self, context: Dict[str, Any]) -> Optional[str]:
|
|
"""Generate a description using GLM-4.6."""
|
|
prompt = self.build_prompt(context)
|
|
|
|
try:
|
|
response = await self.client.post(
|
|
ZAI_API_URL,
|
|
json={
|
|
"model": self.model,
|
|
"messages": [
|
|
{"role": "system", "content": self.SYSTEM_PROMPT},
|
|
{"role": "user", "content": prompt}
|
|
],
|
|
"temperature": 0.3,
|
|
"max_tokens": 1024, # GLM-4.6 needs room for reasoning + content
|
|
}
|
|
)
|
|
|
|
if response.status_code != 200:
|
|
print(f" API Error: {response.status_code}")
|
|
print(f" Response: {response.text[:500]}")
|
|
return None
|
|
|
|
result = response.json()
|
|
|
|
if "choices" not in result or len(result["choices"]) == 0:
|
|
print(f" No choices in response")
|
|
return None
|
|
|
|
content = result["choices"][0]["message"]["content"]
|
|
|
|
if not content or content.strip() == "":
|
|
# GLM-4.6 sometimes puts content in reasoning_content
|
|
reasoning = result["choices"][0]["message"].get("reasoning_content", "")
|
|
if reasoning:
|
|
print(f" Warning: Content was empty, model only provided reasoning")
|
|
return None
|
|
|
|
# Clean up the response
|
|
content = content.strip().strip('"').strip("'")
|
|
|
|
return content
|
|
|
|
except httpx.HTTPStatusError as e:
|
|
print(f" HTTP Error: {e.response.status_code}")
|
|
return None
|
|
except Exception as e:
|
|
print(f" Error calling GLM API: {type(e).__name__}: {e}")
|
|
return None
|
|
|
|
    async def enrich_file(self, file_path: Path) -> bool:
        """Enrich a single file with a better description.

        Loads the YAML entry, gathers context, asks GLM for a new
        description, and writes it back under
        ``wikidata_enrichment.wikidata_description_en`` together with
        provenance metadata. Updates ``self.stats`` as a side effect.

        Returns:
            True on success (or on a dry run that found a usable entry),
            False when the file is empty, unnamed, or generation failed.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = self.yaml.load(f)

            if not data:
                return False

            # Gather context
            context = self.gather_context(data)

            if not context['name']:
                print(f" Skipping {file_path.name}: No name found")
                self.stats['skipped'] += 1
                return False

            print(f" Processing: {context['name']}")

            if self.dry_run:
                # Report what would happen without calling the API or writing.
                print(f" [DRY RUN] Would generate description from context:")
                print(f" - Type: {context['type']}")
                print(f" - Location: {context['location'].get('city')}, {context['location'].get('country')}")
                if context['unesco_mow'].get('is_custodian'):
                    print(f" - UNESCO MoW inscriptions: {context['unesco_mow']['inscription_count']}")
                return True

            # Generate new description
            new_description = await self.generate_description(context)

            if not new_description:
                print(f" Failed to generate description")
                self.stats['errors'] += 1
                return False

            print(f" Generated: {new_description[:80]}...")

            # Update the file
            if 'wikidata_enrichment' not in data:
                data['wikidata_enrichment'] = {}

            data['wikidata_enrichment']['wikidata_description_en'] = new_description
            # Provenance: record how and when this description was produced.
            data['wikidata_enrichment']['description_enrichment'] = {
                'method': 'glm-4.6',
                'timestamp': datetime.now(timezone.utc).isoformat(),
                'source_data': ['wikidata', 'google_maps', 'unesco_mow'],
            }

            # Write back (ruamel round-trip preserves the file's formatting)
            with open(file_path, 'w', encoding='utf-8') as f:
                self.yaml.dump(data, f)

            self.stats['enriched'] += 1
            return True

        except Exception as e:
            print(f" Error processing {file_path.name}: {e}")
            self.stats['errors'] += 1
            return False
|
|
|
|
async def run(self, limit: Optional[int] = None):
|
|
"""Run the enrichment process."""
|
|
print(f"Finding files with placeholder descriptions...")
|
|
files = self.find_files_with_placeholder(limit)
|
|
print(f"Found {len(files)} files to process")
|
|
|
|
if not files:
|
|
print("No files need enrichment.")
|
|
return
|
|
|
|
for i, file_path in enumerate(files, 1):
|
|
print(f"\n[{i}/{len(files)}] {file_path.name}")
|
|
await self.enrich_file(file_path)
|
|
self.stats['processed'] += 1
|
|
|
|
# Small delay between API calls
|
|
if not self.dry_run:
|
|
await asyncio.sleep(0.5)
|
|
|
|
await self.close()
|
|
|
|
# Print summary
|
|
print("\n" + "=" * 50)
|
|
print("SUMMARY")
|
|
print("=" * 50)
|
|
print(f"Processed: {self.stats['processed']}")
|
|
print(f"Enriched: {self.stats['enriched']}")
|
|
print(f"Skipped: {self.stats['skipped']}")
|
|
print(f"Errors: {self.stats['errors']}")
|
|
|
|
|
|
async def main():
    """CLI entry point: parse arguments and drive the enricher."""
    parser = argparse.ArgumentParser(
        description="Enrich custodian descriptions using GLM-4.6"
    )
    parser.add_argument("--limit", "-n", type=int, default=10,
                        help="Maximum number of files to process (default: 10)")
    parser.add_argument("--dry-run", "-d", action="store_true",
                        help="Show what would be done without making changes")
    parser.add_argument("--all", "-a", action="store_true",
                        help="Process all files (ignores --limit)")
    parser.add_argument("--model", "-m", type=str, default="glm-4.6",
                        help="GLM model to use (default: glm-4.6)")

    args = parser.parse_args()

    # --all overrides --limit by passing no cap at all.
    enricher = DescriptionEnricher(model=args.model, dry_run=args.dry_run)
    await enricher.run(limit=None if args.all else args.limit)
|
|
|
|
|
|
# Script entry point: run the async workflow on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|