# Changelog:
# - Add emic_name, name_language, standardized_name to CustodianName
# - Add scripts for enriching custodian emic names from Wikidata
# - Add YouTube and Google Maps enrichment scripts
# - Update DuckLake loader for new schema fields
#!/usr/bin/env python3
"""
Enrich custodian descriptions using available data sources and GLM-4.6.

This script:
1. Finds custodian files with placeholder descriptions
2. Gathers available data (Wikidata, Google Maps, UNESCO MoW, etc.)
3. Uses GLM-4.6 to generate a rich description
4. Updates the file with the new description

Usage:
    python enrich_descriptions.py --limit 10   # Process 10 files
    python enrich_descriptions.py --dry-run    # Show what would be done
    python enrich_descriptions.py --all        # Process all files
"""
|
|
|
|
import asyncio
import argparse
import os
import re  # NOTE(review): unused in the visible code — confirm before removing
import json  # NOTE(review): unused in the visible code (httpx handles JSON) — confirm
from pathlib import Path
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import httpx
from ruamel.yaml import YAML

# Load environment variables (e.g. ZAI_API_TOKEN) from a local .env file.
from dotenv import load_dotenv
load_dotenv()

# Constants
# Directory holding one YAML file per custodian institution.
DATA_DIR = Path(__file__).parent.parent / "data" / "custodian"
# Generic description that marks an entry as not yet enriched.
PLACEHOLDER_DESCRIPTION = "Heritage institution holding UNESCO Memory of the World inscribed documents"

# Z.AI GLM API configuration
ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
|
|
|
|
|
|
class DescriptionEnricher:
    """Enrich custodian descriptions using GLM-4.6.

    Scans custodian YAML files for placeholder descriptions, assembles the
    available enrichment context (Wikidata, Google Maps, UNESCO MoW), asks
    the GLM chat API for a concise description, and writes the result back
    into each file. Tracks processed/enriched/skipped/error counts in
    ``self.stats``.
    """

    # System prompt sent verbatim with every request; instructs the model
    # to stay factual and return plain description text only.
    SYSTEM_PROMPT = """You are a cultural heritage expert writing descriptions for heritage institutions.

Your task is to create a concise, informative description (2-4 sentences) for a heritage institution based on the available data.

## Guidelines
- Focus on what makes the institution significant
- Include the type of collections if known (manuscripts, archives, art, etc.)
- Mention UNESCO Memory of the World inscriptions if present
- Include location context when relevant
- Use formal, encyclopedic tone
- Do NOT invent information not present in the data
- Keep descriptions under 100 words

## Output Format
Provide ONLY the description text, no quotes or formatting.
"""
|
|
|
|
def __init__(self, model: str = "glm-4.6", dry_run: bool = False):
|
|
self.api_key = os.environ.get("ZAI_API_TOKEN")
|
|
if not self.api_key:
|
|
raise ValueError("ZAI_API_TOKEN not found in environment. See docs/GLM_API_SETUP.md")
|
|
|
|
self.model = model
|
|
self.dry_run = dry_run
|
|
self.yaml = YAML()
|
|
self.yaml.preserve_quotes = True
|
|
self.yaml.default_flow_style = False
|
|
self.yaml.width = 4096 # Prevent line wrapping
|
|
|
|
self.client = httpx.AsyncClient(
|
|
timeout=60.0,
|
|
headers={
|
|
"Authorization": f"Bearer {self.api_key}",
|
|
"Content-Type": "application/json",
|
|
}
|
|
)
|
|
|
|
self.stats = {
|
|
"processed": 0,
|
|
"enriched": 0,
|
|
"skipped": 0,
|
|
"errors": 0,
|
|
}
|
|
|
|
async def close(self):
|
|
"""Close the HTTP client."""
|
|
await self.client.aclose()
|
|
|
|
def find_files_with_placeholder(self, limit: Optional[int] = None) -> List[Path]:
|
|
"""Find custodian files with placeholder descriptions."""
|
|
files = []
|
|
|
|
for yaml_file in DATA_DIR.glob("*.yaml"):
|
|
try:
|
|
with open(yaml_file, 'r', encoding='utf-8') as f:
|
|
data = self.yaml.load(f)
|
|
|
|
if not data:
|
|
continue
|
|
|
|
# Check for placeholder in wikidata_enrichment.wikidata_description_en
|
|
wd_desc = data.get('wikidata_enrichment', {}).get('wikidata_description_en', '')
|
|
if PLACEHOLDER_DESCRIPTION in str(wd_desc):
|
|
files.append(yaml_file)
|
|
if limit and len(files) >= limit:
|
|
break
|
|
|
|
except Exception as e:
|
|
print(f"Error reading {yaml_file}: {e}")
|
|
|
|
return files
|
|
|
|
def gather_context(self, data: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""Gather all available context from the entry."""
|
|
context = {
|
|
"name": None,
|
|
"type": None,
|
|
"location": {},
|
|
"wikidata": {},
|
|
"google_maps": {},
|
|
"unesco_mow": {},
|
|
"collections": [],
|
|
}
|
|
|
|
# Name from various sources
|
|
if 'custodian_name' in data:
|
|
context['name'] = data['custodian_name'].get('claim_value')
|
|
elif 'wikidata_enrichment' in data:
|
|
context['name'] = data['wikidata_enrichment'].get('wikidata_label_en')
|
|
elif 'original_entry' in data:
|
|
context['name'] = data['original_entry'].get('name') or data['original_entry'].get('organisatie')
|
|
|
|
# Institution type
|
|
if 'wikidata_enrichment' in data:
|
|
context['type'] = data['wikidata_enrichment'].get('instance_of')
|
|
|
|
# Location from GHCID
|
|
if 'ghcid' in data:
|
|
loc_res = data['ghcid'].get('location_resolution', {})
|
|
context['location'] = {
|
|
"city": loc_res.get('city_label'),
|
|
"country": loc_res.get('country_label'),
|
|
"region": loc_res.get('region_code'),
|
|
}
|
|
|
|
# Wikidata data
|
|
if 'wikidata_enrichment' in data:
|
|
wd = data['wikidata_enrichment']
|
|
context['wikidata'] = {
|
|
"qid": wd.get('wikidata_entity_id'),
|
|
"instance_of": wd.get('instance_of'),
|
|
}
|
|
|
|
# Google Maps data
|
|
if 'google_maps_enrichment' in data:
|
|
gm = data['google_maps_enrichment']
|
|
context['google_maps'] = {
|
|
"name": gm.get('name'),
|
|
"types": gm.get('google_place_types', []),
|
|
"address": gm.get('formatted_address'),
|
|
"primary_type": gm.get('primary_type'),
|
|
}
|
|
|
|
# UNESCO Memory of the World
|
|
if 'unesco_mow_enrichment' in data:
|
|
mow = data['unesco_mow_enrichment']
|
|
context['unesco_mow'] = {
|
|
"is_custodian": mow.get('is_mow_custodian', False),
|
|
"inscription_count": mow.get('inscription_count', 0),
|
|
"inscriptions": [
|
|
{"name": i.get('name'), "country": i.get('inscription_country')}
|
|
for i in mow.get('inscriptions', [])
|
|
],
|
|
}
|
|
|
|
return context
|
|
|
|
def build_prompt(self, context: Dict[str, Any]) -> str:
|
|
"""Build a prompt for GLM based on available context."""
|
|
parts = [f"Institution: {context['name']}"]
|
|
|
|
if context['type']:
|
|
parts.append(f"Type: {context['type']}")
|
|
|
|
if context['location'].get('city'):
|
|
loc = context['location']
|
|
loc_str = f"Location: {loc['city']}"
|
|
if loc.get('country'):
|
|
loc_str += f", {loc['country']}"
|
|
parts.append(loc_str)
|
|
|
|
if context['google_maps'].get('types'):
|
|
parts.append(f"Google Maps Types: {', '.join(context['google_maps']['types'])}")
|
|
|
|
if context['unesco_mow'].get('is_custodian'):
|
|
mow = context['unesco_mow']
|
|
inscriptions = mow.get('inscriptions', [])
|
|
if inscriptions:
|
|
inscription_names = [i['name'] for i in inscriptions[:3]] # Limit to 3
|
|
parts.append(f"UNESCO Memory of the World inscriptions held: {', '.join(inscription_names)}")
|
|
if mow['inscription_count'] > 3:
|
|
parts.append(f"(Total: {mow['inscription_count']} inscriptions)")
|
|
|
|
if context['wikidata'].get('qid'):
|
|
parts.append(f"Wikidata ID: {context['wikidata']['qid']}")
|
|
|
|
return "\n".join(parts)
|
|
|
|
async def generate_description(self, context: Dict[str, Any]) -> Optional[str]:
|
|
"""Generate a description using GLM-4.6."""
|
|
prompt = self.build_prompt(context)
|
|
|
|
try:
|
|
response = await self.client.post(
|
|
ZAI_API_URL,
|
|
json={
|
|
"model": self.model,
|
|
"messages": [
|
|
{"role": "system", "content": self.SYSTEM_PROMPT},
|
|
{"role": "user", "content": prompt}
|
|
],
|
|
"temperature": 0.3,
|
|
"max_tokens": 1024, # GLM-4.6 needs room for reasoning + content
|
|
}
|
|
)
|
|
|
|
if response.status_code != 200:
|
|
print(f" API Error: {response.status_code}")
|
|
print(f" Response: {response.text[:500]}")
|
|
return None
|
|
|
|
result = response.json()
|
|
|
|
if "choices" not in result or len(result["choices"]) == 0:
|
|
print(f" No choices in response")
|
|
return None
|
|
|
|
content = result["choices"][0]["message"]["content"]
|
|
|
|
if not content or content.strip() == "":
|
|
# GLM-4.6 sometimes puts content in reasoning_content
|
|
reasoning = result["choices"][0]["message"].get("reasoning_content", "")
|
|
if reasoning:
|
|
print(f" Warning: Content was empty, model only provided reasoning")
|
|
return None
|
|
|
|
# Clean up the response
|
|
content = content.strip().strip('"').strip("'")
|
|
|
|
return content
|
|
|
|
except httpx.HTTPStatusError as e:
|
|
print(f" HTTP Error: {e.response.status_code}")
|
|
return None
|
|
except Exception as e:
|
|
print(f" Error calling GLM API: {type(e).__name__}: {e}")
|
|
return None
|
|
|
|
    async def enrich_file(self, file_path: Path) -> bool:
        """Enrich a single file with a better description.

        Loads the YAML entry, gathers context, asks GLM for a new
        description, and writes it back under
        ``wikidata_enrichment.wikidata_description_en`` together with
        provenance metadata. Updates ``self.stats`` as a side effect.

        Returns:
            True on success (or on a dry run that found a usable entry),
            False when the file is empty, unnamed, or generation failed.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = self.yaml.load(f)

            if not data:
                return False

            # Gather context
            context = self.gather_context(data)

            if not context['name']:
                print(f" Skipping {file_path.name}: No name found")
                self.stats['skipped'] += 1
                return False

            print(f" Processing: {context['name']}")

            if self.dry_run:
                # Report what would happen without calling the API or writing.
                print(f" [DRY RUN] Would generate description from context:")
                print(f" - Type: {context['type']}")
                print(f" - Location: {context['location'].get('city')}, {context['location'].get('country')}")
                if context['unesco_mow'].get('is_custodian'):
                    print(f" - UNESCO MoW inscriptions: {context['unesco_mow']['inscription_count']}")
                return True

            # Generate new description
            new_description = await self.generate_description(context)

            if not new_description:
                print(f" Failed to generate description")
                self.stats['errors'] += 1
                return False

            print(f" Generated: {new_description[:80]}...")

            # Update the file
            if 'wikidata_enrichment' not in data:
                data['wikidata_enrichment'] = {}

            data['wikidata_enrichment']['wikidata_description_en'] = new_description
            # Provenance: record how and when this description was produced.
            data['wikidata_enrichment']['description_enrichment'] = {
                'method': 'glm-4.6',
                'timestamp': datetime.now(timezone.utc).isoformat(),
                'source_data': ['wikidata', 'google_maps', 'unesco_mow'],
            }

            # Write back (ruamel round-trip preserves the file's formatting)
            with open(file_path, 'w', encoding='utf-8') as f:
                self.yaml.dump(data, f)

            self.stats['enriched'] += 1
            return True

        except Exception as e:
            print(f" Error processing {file_path.name}: {e}")
            self.stats['errors'] += 1
            return False
|
|
|
|
async def run(self, limit: Optional[int] = None):
|
|
"""Run the enrichment process."""
|
|
print(f"Finding files with placeholder descriptions...")
|
|
files = self.find_files_with_placeholder(limit)
|
|
print(f"Found {len(files)} files to process")
|
|
|
|
if not files:
|
|
print("No files need enrichment.")
|
|
return
|
|
|
|
for i, file_path in enumerate(files, 1):
|
|
print(f"\n[{i}/{len(files)}] {file_path.name}")
|
|
await self.enrich_file(file_path)
|
|
self.stats['processed'] += 1
|
|
|
|
# Small delay between API calls
|
|
if not self.dry_run:
|
|
await asyncio.sleep(0.5)
|
|
|
|
await self.close()
|
|
|
|
# Print summary
|
|
print("\n" + "=" * 50)
|
|
print("SUMMARY")
|
|
print("=" * 50)
|
|
print(f"Processed: {self.stats['processed']}")
|
|
print(f"Enriched: {self.stats['enriched']}")
|
|
print(f"Skipped: {self.stats['skipped']}")
|
|
print(f"Errors: {self.stats['errors']}")
|
|
|
|
|
|
async def main():
    """CLI entry point: parse arguments and drive the enricher."""
    parser = argparse.ArgumentParser(
        description="Enrich custodian descriptions using GLM-4.6"
    )
    parser.add_argument("--limit", "-n", type=int, default=10,
                        help="Maximum number of files to process (default: 10)")
    parser.add_argument("--dry-run", "-d", action="store_true",
                        help="Show what would be done without making changes")
    parser.add_argument("--all", "-a", action="store_true",
                        help="Process all files (ignores --limit)")
    parser.add_argument("--model", "-m", type=str, default="glm-4.6",
                        help="GLM model to use (default: glm-4.6)")

    args = parser.parse_args()

    # --all overrides --limit by passing no cap at all.
    enricher = DescriptionEnricher(model=args.model, dry_run=args.dry_run)
    await enricher.run(limit=None if args.all else args.limit)
|
|
|
|
|
|
# Script entry point: run the async workflow on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|