#!/usr/bin/env python3
"""
Enrich custodian descriptions using available data sources and GLM-4.6.

This script:
1. Finds custodian files with placeholder descriptions
2. Gathers available data (Wikidata, Google Maps, UNESCO MoW, etc.)
3. Uses GLM-4.6 to generate a rich description
4. Updates the file with the new description

Usage:
    python enrich_descriptions.py --limit 10   # Process 10 files
    python enrich_descriptions.py --dry-run    # Show what would be done
    python enrich_descriptions.py --all        # Process all files
"""
import asyncio
import argparse
import os
import re
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import httpx
from ruamel.yaml import YAML

# Load environment (e.g. ZAI_API_TOKEN) from a local .env file, if present.
from dotenv import load_dotenv
load_dotenv()

# Constants
DATA_DIR = Path(__file__).parent.parent / "data" / "custodian"
PLACEHOLDER_DESCRIPTION = "Heritage institution holding UNESCO Memory of the World inscribed documents"

# Z.AI GLM API configuration
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"


class DescriptionEnricher:
    """Enrich custodian descriptions using GLM-4.6."""

    SYSTEM_PROMPT = """You are a cultural heritage expert writing descriptions for heritage institutions.

Your task is to create a concise, informative description (2-4 sentences) for a heritage institution based on the available data.

## Guidelines
- Focus on what makes the institution significant
- Include the type of collections if known (manuscripts, archives, art, etc.)
- Mention UNESCO Memory of the World inscriptions if present
- Include location context when relevant
- Use formal, encyclopedic tone
- Do NOT invent information not present in the data
- Keep descriptions under 100 words

## Output Format
Provide ONLY the description text, no quotes or formatting.
"""

    def __init__(self, model: str = "glm-4.6", dry_run: bool = False):
        """Set up the GLM client and a round-trip YAML handler.

        Args:
            model: GLM model identifier sent to the Z.AI API.
            dry_run: If True, report what would be done without calling the
                API or writing files.

        Raises:
            ValueError: If ZAI_API_TOKEN is not set in the environment.
        """
        self.api_key = os.environ.get("ZAI_API_TOKEN")
        if not self.api_key:
            raise ValueError("ZAI_API_TOKEN not found in environment. See docs/GLM_API_SETUP.md")
        self.model = model
        self.dry_run = dry_run
        # Round-trip YAML so untouched fields keep their original formatting.
        self.yaml = YAML()
        self.yaml.preserve_quotes = True
        self.yaml.default_flow_style = False
        self.yaml.width = 4096  # Prevent line wrapping
        self.client = httpx.AsyncClient(
            timeout=60.0,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
        )
        self.stats = {
            "processed": 0,
            "enriched": 0,
            "skipped": 0,
            "errors": 0,
        }

    async def close(self):
        """Close the HTTP client."""
        await self.client.aclose()

    def find_files_with_placeholder(self, limit: Optional[int] = None) -> List[Path]:
        """Find custodian YAML files whose description is still the placeholder.

        Args:
            limit: Stop after collecting this many files (None = no limit).

        Returns:
            Paths of files whose wikidata_enrichment.wikidata_description_en
            contains PLACEHOLDER_DESCRIPTION.
        """
        files = []
        for yaml_file in DATA_DIR.glob("*.yaml"):
            try:
                with open(yaml_file, 'r', encoding='utf-8') as f:
                    data = self.yaml.load(f)
                if not data:
                    continue
                # Check for placeholder in wikidata_enrichment.wikidata_description_en.
                # `or {}` guards against a present-but-null mapping in the YAML.
                wd_desc = (data.get('wikidata_enrichment') or {}).get('wikidata_description_en', '')
                if PLACEHOLDER_DESCRIPTION in str(wd_desc):
                    files.append(yaml_file)
                    if limit and len(files) >= limit:
                        break
            except Exception as e:
                print(f"Error reading {yaml_file}: {e}")
        return files

    def gather_context(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Gather all available context from the entry.

        Pulls the institution name, type, location, and enrichment data
        (Wikidata, Google Maps, UNESCO MoW) from the parsed YAML mapping.
        """
        context = {
            "name": None,
            "type": None,
            "location": {},
            "wikidata": {},
            "google_maps": {},
            "unesco_mow": {},
            "collections": [],
        }

        # Name: prefer the explicit claim, then Wikidata label, then raw entry.
        if 'custodian_name' in data:
            context['name'] = data['custodian_name'].get('claim_value')
        elif 'wikidata_enrichment' in data:
            context['name'] = data['wikidata_enrichment'].get('wikidata_label_en')
        elif 'original_entry' in data:
            context['name'] = data['original_entry'].get('name') or data['original_entry'].get('organisatie')

        # Institution type
        if 'wikidata_enrichment' in data:
            context['type'] = data['wikidata_enrichment'].get('instance_of')

        # Location from GHCID
        if 'ghcid' in data:
            loc_res = data['ghcid'].get('location_resolution', {})
            context['location'] = {
                "city": loc_res.get('city_label'),
                "country": loc_res.get('country_label'),
                "region": loc_res.get('region_code'),
            }

        # Wikidata data
        if 'wikidata_enrichment' in data:
            wd = data['wikidata_enrichment']
            context['wikidata'] = {
                "qid": wd.get('wikidata_entity_id'),
                "instance_of": wd.get('instance_of'),
            }

        # Google Maps data
        if 'google_maps_enrichment' in data:
            gm = data['google_maps_enrichment']
            context['google_maps'] = {
                "name": gm.get('name'),
                "types": gm.get('google_place_types', []),
                "address": gm.get('formatted_address'),
                "primary_type": gm.get('primary_type'),
            }

        # UNESCO Memory of the World
        if 'unesco_mow_enrichment' in data:
            mow = data['unesco_mow_enrichment']
            context['unesco_mow'] = {
                "is_custodian": mow.get('is_mow_custodian', False),
                "inscription_count": mow.get('inscription_count', 0),
                "inscriptions": [
                    {"name": i.get('name'), "country": i.get('inscription_country')}
                    for i in mow.get('inscriptions', [])
                ],
            }

        return context

    def build_prompt(self, context: Dict[str, Any]) -> str:
        """Build the user prompt for GLM from the gathered context.

        Only includes lines for fields that are actually present, so the
        model is never shown empty/None values.
        """
        parts = [f"Institution: {context['name']}"]

        if context['type']:
            parts.append(f"Type: {context['type']}")

        if context['location'].get('city'):
            loc = context['location']
            loc_str = f"Location: {loc['city']}"
            if loc.get('country'):
                loc_str += f", {loc['country']}"
            parts.append(loc_str)

        if context['google_maps'].get('types'):
            parts.append(f"Google Maps Types: {', '.join(context['google_maps']['types'])}")

        if context['unesco_mow'].get('is_custodian'):
            mow = context['unesco_mow']
            inscriptions = mow.get('inscriptions', [])
            if inscriptions:
                inscription_names = [i['name'] for i in inscriptions[:3]]  # Limit to 3
                parts.append(f"UNESCO Memory of the World inscriptions held: {', '.join(inscription_names)}")
                if mow['inscription_count'] > 3:
                    parts.append(f"(Total: {mow['inscription_count']} inscriptions)")

        if context['wikidata'].get('qid'):
            parts.append(f"Wikidata ID: {context['wikidata']['qid']}")

        return "\n".join(parts)

    async def generate_description(self, context: Dict[str, Any]) -> Optional[str]:
        """Generate a description using GLM-4.6.

        Returns:
            The cleaned description text, or None on any API / parsing failure
            (failures are printed, never raised).
        """
        prompt = self.build_prompt(context)

        try:
            response = await self.client.post(
                ZAI_API_URL,
                json={
                    "model": self.model,
                    "messages": [
                        {"role": "system", "content": self.SYSTEM_PROMPT},
                        {"role": "user", "content": prompt},
                    ],
                    "temperature": 0.3,
                    "max_tokens": 1024,  # GLM-4.6 needs room for reasoning + content
                },
            )

            if response.status_code != 200:
                print(f" API Error: {response.status_code}")
                print(f" Response: {response.text[:500]}")
                return None

            result = response.json()
            if "choices" not in result or len(result["choices"]) == 0:
                print(" No choices in response")
                return None

            content = result["choices"][0]["message"]["content"]
            if not content or content.strip() == "":
                # GLM-4.6 sometimes puts content in reasoning_content
                reasoning = result["choices"][0]["message"].get("reasoning_content", "")
                if reasoning:
                    print(" Warning: Content was empty, model only provided reasoning")
                return None

            # Clean up the response: trim whitespace and surrounding quotes.
            content = content.strip().strip('"').strip("'")
            return content

        except httpx.HTTPStatusError as e:
            print(f" HTTP Error: {e.response.status_code}")
            return None
        except Exception as e:
            print(f" Error calling GLM API: {type(e).__name__}: {e}")
            return None

    async def enrich_file(self, file_path: Path) -> bool:
        """Enrich a single file with a better description.

        Returns:
            True if the file was (or, in dry-run mode, would be) enriched.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = self.yaml.load(f)

            if not data:
                return False

            # Gather context
            context = self.gather_context(data)
            if not context['name']:
                print(f" Skipping {file_path.name}: No name found")
                self.stats['skipped'] += 1
                return False

            print(f" Processing: {context['name']}")

            if self.dry_run:
                print(" [DRY RUN] Would generate description from context:")
                print(f" - Type: {context['type']}")
                print(f" - Location: {context['location'].get('city')}, {context['location'].get('country')}")
                if context['unesco_mow'].get('is_custodian'):
                    print(f" - UNESCO MoW inscriptions: {context['unesco_mow']['inscription_count']}")
                return True

            # Generate new description
            new_description = await self.generate_description(context)
            if not new_description:
                print(" Failed to generate description")
                self.stats['errors'] += 1
                return False

            print(f" Generated: {new_description[:80]}...")

            # Update the file: replace the placeholder and record provenance.
            if 'wikidata_enrichment' not in data:
                data['wikidata_enrichment'] = {}
            data['wikidata_enrichment']['wikidata_description_en'] = new_description
            data['wikidata_enrichment']['description_enrichment'] = {
                'method': 'glm-4.6',
                'timestamp': datetime.now(timezone.utc).isoformat(),
                'source_data': ['wikidata', 'google_maps', 'unesco_mow'],
            }

            # Write back
            with open(file_path, 'w', encoding='utf-8') as f:
                self.yaml.dump(data, f)

            self.stats['enriched'] += 1
            return True

        except Exception as e:
            print(f" Error processing {file_path.name}: {e}")
            self.stats['errors'] += 1
            return False

    async def run(self, limit: Optional[int] = None):
        """Run the enrichment process over files with placeholder descriptions.

        Args:
            limit: Maximum number of files to process (None = all).
        """
        print("Finding files with placeholder descriptions...")
        files = self.find_files_with_placeholder(limit)
        print(f"Found {len(files)} files to process")

        if not files:
            print("No files need enrichment.")
            return

        try:
            for i, file_path in enumerate(files, 1):
                print(f"\n[{i}/{len(files)}] {file_path.name}")
                await self.enrich_file(file_path)
                self.stats['processed'] += 1
                # Small delay between API calls to be gentle on the endpoint.
                if not self.dry_run:
                    await asyncio.sleep(0.5)
        finally:
            # Always release the HTTP client, even if a file blows up mid-run.
            await self.close()

        # Print summary
        print("\n" + "=" * 50)
        print("SUMMARY")
        print("=" * 50)
        print(f"Processed: {self.stats['processed']}")
        print(f"Enriched: {self.stats['enriched']}")
        print(f"Skipped: {self.stats['skipped']}")
        print(f"Errors: {self.stats['errors']}")


async def main():
    """Parse CLI arguments and run the enricher."""
    parser = argparse.ArgumentParser(
        description="Enrich custodian descriptions using GLM-4.6"
    )
    parser.add_argument(
        "--limit", "-n",
        type=int,
        default=10,
        help="Maximum number of files to process (default: 10)"
    )
    parser.add_argument(
        "--dry-run", "-d",
        action="store_true",
        help="Show what would be done without making changes"
    )
    parser.add_argument(
        "--all", "-a",
        action="store_true",
        help="Process all files (ignores --limit)"
    )
    parser.add_argument(
        "--model", "-m",
        type=str,
        default="glm-4.6",
        help="GLM model to use (default: glm-4.6)"
    )
    args = parser.parse_args()

    limit = None if args.all else args.limit

    enricher = DescriptionEnricher(
        model=args.model,
        dry_run=args.dry_run,
    )
    await enricher.run(limit=limit)


if __name__ == "__main__":
    asyncio.run(main())