glam/scripts/enrich_descriptions.py
kempersc 6a6557bbe8 feat(enrichment): add emic name enrichment and update CustodianName schema
- Add emic_name, name_language, standardized_name to CustodianName
- Add scripts for enriching custodian emic names from Wikidata
- Add YouTube and Google Maps enrichment scripts
- Update DuckLake loader for new schema fields
2025-12-08 14:58:50 +01:00

386 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Enrich custodian descriptions using available data sources and GLM-4.6.
This script:
1. Finds custodian files with placeholder descriptions
2. Gathers available data (Wikidata, Google Maps, UNESCO MoW, etc.)
3. Uses GLM-4.6 to generate a rich description
4. Updates the file with the new description
Usage:
python enrich_descriptions.py --limit 10 # Process 10 files
python enrich_descriptions.py --dry-run # Show what would be done
python enrich_descriptions.py --all # Process all files
"""
import asyncio
import argparse
import os
import re
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
import httpx
from ruamel.yaml import YAML
# Load environment
from dotenv import load_dotenv
load_dotenv()
# Constants
DATA_DIR = Path(__file__).parent.parent / "data" / "custodian"
PLACEHOLDER_DESCRIPTION = "Heritage institution holding UNESCO Memory of the World inscribed documents"
# Z.AI GLM API configuration
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
class DescriptionEnricher:
    """Enrich custodian descriptions using GLM-4.6.

    For each custodian YAML file that still carries the generic placeholder
    description, this class gathers whatever context the file already
    contains (Wikidata, Google Maps, UNESCO Memory of the World, GHCID
    location data), asks the GLM chat API for a short encyclopedic
    description, and writes the result back into
    ``wikidata_enrichment.wikidata_description_en``.
    """

    SYSTEM_PROMPT = """You are a cultural heritage expert writing descriptions for heritage institutions.
Your task is to create a concise, informative description (2-4 sentences) for a heritage institution based on the available data.
## Guidelines
- Focus on what makes the institution significant
- Include the type of collections if known (manuscripts, archives, art, etc.)
- Mention UNESCO Memory of the World inscriptions if present
- Include location context when relevant
- Use formal, encyclopedic tone
- Do NOT invent information not present in the data
- Keep descriptions under 100 words
## Output Format
Provide ONLY the description text, no quotes or formatting.
"""

    def __init__(self, model: str = "glm-4.6", dry_run: bool = False):
        """Create the YAML round-tripper and the async HTTP client.

        Args:
            model: GLM model identifier sent with each API request.
            dry_run: When True, report what would be done without calling
                the API or modifying any files.

        Raises:
            ValueError: If ZAI_API_TOKEN is missing from the environment.
        """
        self.api_key = os.environ.get("ZAI_API_TOKEN")
        if not self.api_key:
            raise ValueError("ZAI_API_TOKEN not found in environment. See docs/GLM_API_SETUP.md")
        self.model = model
        self.dry_run = dry_run
        # Round-trip YAML loader so unrelated quoting/formatting in the
        # custodian files survives the rewrite.
        self.yaml = YAML()
        self.yaml.preserve_quotes = True
        self.yaml.default_flow_style = False
        self.yaml.width = 4096  # Prevent line wrapping
        self.client = httpx.AsyncClient(
            timeout=60.0,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            }
        )
        # Run counters, printed in the summary at the end of run().
        self.stats = {
            "processed": 0,
            "enriched": 0,
            "skipped": 0,
            "errors": 0,
        }

    async def close(self):
        """Close the HTTP client."""
        await self.client.aclose()

    def find_files_with_placeholder(self, limit: Optional[int] = None) -> List[Path]:
        """Find custodian files with placeholder descriptions.

        Args:
            limit: Stop after collecting this many files (None = no cap).

        Returns:
            Paths of YAML files whose Wikidata description still contains
            the generic placeholder text, in deterministic sorted order.
        """
        files: List[Path] = []
        # Sorted so repeated runs with --limit process the same files.
        for yaml_file in sorted(DATA_DIR.glob("*.yaml")):
            try:
                with open(yaml_file, 'r', encoding='utf-8') as f:
                    data = self.yaml.load(f)
                if not data:
                    continue
                # Check for placeholder in wikidata_enrichment.wikidata_description_en.
                # "or {}" guards against the key existing with an explicit null value.
                wd_desc = (data.get('wikidata_enrichment') or {}).get('wikidata_description_en', '')
                if PLACEHOLDER_DESCRIPTION in str(wd_desc):
                    files.append(yaml_file)
                    if limit and len(files) >= limit:
                        break
            except Exception as e:
                print(f"Error reading {yaml_file}: {e}")
        return files

    def gather_context(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Gather all available context from the entry.

        Args:
            data: The parsed custodian YAML document.

        Returns:
            A flat context dict (name, type, location, wikidata,
            google_maps, unesco_mow, collections) consumed by
            :meth:`build_prompt`.
        """
        context = {
            "name": None,
            "type": None,
            "location": {},
            "wikidata": {},
            "google_maps": {},
            "unesco_mow": {},
            "collections": [],
        }
        # Name from various sources, in order of preference.
        if 'custodian_name' in data:
            context['name'] = data['custodian_name'].get('claim_value')
        elif 'wikidata_enrichment' in data:
            context['name'] = data['wikidata_enrichment'].get('wikidata_label_en')
        elif 'original_entry' in data:
            context['name'] = data['original_entry'].get('name') or data['original_entry'].get('organisatie')
        # Institution type
        if 'wikidata_enrichment' in data:
            context['type'] = data['wikidata_enrichment'].get('instance_of')
        # Location from GHCID
        if 'ghcid' in data:
            loc_res = data['ghcid'].get('location_resolution', {})
            context['location'] = {
                "city": loc_res.get('city_label'),
                "country": loc_res.get('country_label'),
                "region": loc_res.get('region_code'),
            }
        # Wikidata data
        if 'wikidata_enrichment' in data:
            wd = data['wikidata_enrichment']
            context['wikidata'] = {
                "qid": wd.get('wikidata_entity_id'),
                "instance_of": wd.get('instance_of'),
            }
        # Google Maps data
        if 'google_maps_enrichment' in data:
            gm = data['google_maps_enrichment']
            context['google_maps'] = {
                "name": gm.get('name'),
                "types": gm.get('google_place_types', []),
                "address": gm.get('formatted_address'),
                "primary_type": gm.get('primary_type'),
            }
        # UNESCO Memory of the World
        if 'unesco_mow_enrichment' in data:
            mow = data['unesco_mow_enrichment']
            context['unesco_mow'] = {
                "is_custodian": mow.get('is_mow_custodian', False),
                "inscription_count": mow.get('inscription_count', 0),
                "inscriptions": [
                    {"name": i.get('name'), "country": i.get('inscription_country')}
                    for i in mow.get('inscriptions', [])
                ],
            }
        return context

    def build_prompt(self, context: Dict[str, Any]) -> str:
        """Build a user prompt for GLM based on available context.

        Only includes facts that are actually present, so the model is
        never prompted with empty fields.
        """
        parts = [f"Institution: {context['name']}"]
        if context['type']:
            parts.append(f"Type: {context['type']}")
        if context['location'].get('city'):
            loc = context['location']
            loc_str = f"Location: {loc['city']}"
            if loc.get('country'):
                loc_str += f", {loc['country']}"
            parts.append(loc_str)
        if context['google_maps'].get('types'):
            parts.append(f"Google Maps Types: {', '.join(context['google_maps']['types'])}")
        if context['unesco_mow'].get('is_custodian'):
            mow = context['unesco_mow']
            inscriptions = mow.get('inscriptions', [])
            if inscriptions:
                inscription_names = [i['name'] for i in inscriptions[:3]]  # Limit to 3
                parts.append(f"UNESCO Memory of the World inscriptions held: {', '.join(inscription_names)}")
                if mow['inscription_count'] > 3:
                    parts.append(f"(Total: {mow['inscription_count']} inscriptions)")
        if context['wikidata'].get('qid'):
            parts.append(f"Wikidata ID: {context['wikidata']['qid']}")
        return "\n".join(parts)

    async def generate_description(self, context: Dict[str, Any]) -> Optional[str]:
        """Generate a description using GLM-4.6.

        Returns:
            The cleaned description text, or None on any API failure
            (errors are printed, not raised, so one bad file does not
            abort the run).
        """
        prompt = self.build_prompt(context)
        try:
            response = await self.client.post(
                ZAI_API_URL,
                json={
                    "model": self.model,
                    "messages": [
                        {"role": "system", "content": self.SYSTEM_PROMPT},
                        {"role": "user", "content": prompt}
                    ],
                    "temperature": 0.3,
                    "max_tokens": 1024,  # GLM-4.6 needs room for reasoning + content
                }
            )
            if response.status_code != 200:
                print(f" API Error: {response.status_code}")
                print(f" Response: {response.text[:500]}")
                return None
            result = response.json()
            if "choices" not in result or len(result["choices"]) == 0:
                print(f" No choices in response")
                return None
            content = result["choices"][0]["message"]["content"]
            if not content or content.strip() == "":
                # GLM-4.6 sometimes puts content in reasoning_content
                reasoning = result["choices"][0]["message"].get("reasoning_content", "")
                if reasoning:
                    print(f" Warning: Content was empty, model only provided reasoning")
                return None
            # Clean up the response: strip whitespace and stray wrapping quotes.
            content = content.strip().strip('"').strip("'")
            return content
        except httpx.HTTPStatusError as e:
            print(f" HTTP Error: {e.response.status_code}")
            return None
        except Exception as e:
            print(f" Error calling GLM API: {type(e).__name__}: {e}")
            return None

    async def enrich_file(self, file_path: Path) -> bool:
        """Enrich a single file with a better description.

        Returns:
            True when the file was updated (or would be, in dry-run mode).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = self.yaml.load(f)
            if not data:
                return False
            # Gather context
            context = self.gather_context(data)
            if not context['name']:
                print(f" Skipping {file_path.name}: No name found")
                self.stats['skipped'] += 1
                return False
            print(f" Processing: {context['name']}")
            if self.dry_run:
                print(f" [DRY RUN] Would generate description from context:")
                print(f" - Type: {context['type']}")
                print(f" - Location: {context['location'].get('city')}, {context['location'].get('country')}")
                if context['unesco_mow'].get('is_custodian'):
                    print(f" - UNESCO MoW inscriptions: {context['unesco_mow']['inscription_count']}")
                return True
            # Generate new description
            new_description = await self.generate_description(context)
            if not new_description:
                print(f" Failed to generate description")
                self.stats['errors'] += 1
                return False
            print(f" Generated: {new_description[:80]}...")
            # Update the file, recording provenance of the generated text.
            if 'wikidata_enrichment' not in data:
                data['wikidata_enrichment'] = {}
            data['wikidata_enrichment']['wikidata_description_en'] = new_description
            data['wikidata_enrichment']['description_enrichment'] = {
                'method': 'glm-4.6',
                'timestamp': datetime.now(timezone.utc).isoformat(),
                'source_data': ['wikidata', 'google_maps', 'unesco_mow'],
            }
            # Write back
            with open(file_path, 'w', encoding='utf-8') as f:
                self.yaml.dump(data, f)
            self.stats['enriched'] += 1
            return True
        except Exception as e:
            print(f" Error processing {file_path.name}: {e}")
            self.stats['errors'] += 1
            return False

    async def run(self, limit: Optional[int] = None):
        """Run the enrichment process over all matching files.

        Args:
            limit: Maximum number of files to process (None = all).
        """
        print(f"Finding files with placeholder descriptions...")
        files = self.find_files_with_placeholder(limit)
        print(f"Found {len(files)} files to process")
        # try/finally guarantees the HTTP client is closed even on the
        # early "nothing to do" return or an unexpected exception
        # (the original leaked the client on the early return).
        try:
            if not files:
                print("No files need enrichment.")
                return
            for i, file_path in enumerate(files, 1):
                print(f"\n[{i}/{len(files)}] {file_path.name}")
                await self.enrich_file(file_path)
                self.stats['processed'] += 1
                # Small delay between API calls
                if not self.dry_run:
                    await asyncio.sleep(0.5)
        finally:
            await self.close()
        # Print summary
        print("\n" + "=" * 50)
        print("SUMMARY")
        print("=" * 50)
        print(f"Processed: {self.stats['processed']}")
        print(f"Enriched: {self.stats['enriched']}")
        print(f"Skipped: {self.stats['skipped']}")
        print(f"Errors: {self.stats['errors']}")
async def main():
    """CLI entry point: parse command-line options and drive one enrichment run."""
    arg_parser = argparse.ArgumentParser(
        description="Enrich custodian descriptions using GLM-4.6"
    )
    arg_parser.add_argument(
        "--limit", "-n", type=int, default=10,
        help="Maximum number of files to process (default: 10)"
    )
    arg_parser.add_argument(
        "--dry-run", "-d", action="store_true",
        help="Show what would be done without making changes"
    )
    arg_parser.add_argument(
        "--all", "-a", action="store_true",
        help="Process all files (ignores --limit)"
    )
    arg_parser.add_argument(
        "--model", "-m", type=str, default="glm-4.6",
        help="GLM model to use (default: glm-4.6)"
    )
    opts = arg_parser.parse_args()
    # --all removes the per-run cap entirely.
    cap = opts.limit if not opts.all else None
    enricher = DescriptionEnricher(
        model=opts.model,
        dry_run=opts.dry_run,
    )
    await enricher.run(limit=cap)


if __name__ == "__main__":
    asyncio.run(main())