#!/usr/bin/env python3 """ Re-enrich heritage institutions with Wikidata using GLM-4.6 CH Annotator verification. This script: 1. Finds files marked for re-enrichment (after duplicate cleanup) 2. Queries Wikidata API for candidates by institution name 3. Uses GLM-4.6 to verify matches based on CH Annotator entity types (GRP.HER) 4. Only adds Wikidata enrichment if entity is verified as heritage institution 5. Updates files with verified Wikidata data CH Annotator Convention (v1.7.0): - Heritage institutions are type GRP.HER (glam:HeritageCustodian) - Maps to: org:FormalOrganization, rov:RegisteredOrganization, schema:Museum, schema:Library, schema:ArchiveOrganization - Subtypes: GRP.HER.GAL (Gallery), GRP.HER.LIB (Library), GRP.HER.ARC (Archive), GRP.HER.MUS (Museum) Wikidata "instance of" (P31) values for heritage institutions: - Q33506 (museum) - Q7075 (library) - Q166118 (archive) - Q1007870 (art gallery) - Q207694 (art museum) - Q1970365 (natural history museum) - Q18388277 (history museum) - Q23413 (castle) - when used as museum - Q839954 (archaeological site) - Q174782 (town square) - NOT heritage institution - Q515 (city) - NOT heritage institution """ import asyncio import json import os import re import sys import time from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Optional, Tuple import yaml import httpx import logging # Load environment variables from .env file from dotenv import load_dotenv env_path = Path(__file__).parent.parent / ".env" load_dotenv(env_path) # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # Add src to path sys.path.insert(0, str(Path(__file__).parent.parent / "src")) # ============================================================================= # WIKIDATA HERITAGE INSTITUTION TYPE CLASSES (P31 values) # ============================================================================= 
# These are valid Wikidata "instance of" values for heritage institutions.
# Keys are Wikidata QIDs, values are human-readable labels used in reasoning
# strings shown to the verifier LLM and in log messages.
HERITAGE_P31_TYPES = {
    # Museums
    "Q33506": "museum",
    "Q207694": "art museum",
    "Q1970365": "natural history museum",
    "Q18388277": "history museum",
    "Q2087181": "university museum",
    "Q17431399": "national museum",
    "Q16735822": "museum building",
    "Q1788742": "war museum",
    "Q4989906": "monuments and memorials",
    "Q57660343": "maritime museum",
    "Q15206070": "transport museum",
    "Q214090": "ethnographic museum",
    "Q2522387": "aviation museum",
    "Q841573": "archaeological museum",
    "Q28737012": "memorial museum",
    "Q588140": "railway museum",
    "Q515034": "science museum",
    "Q4287745": "local museum",
    # Libraries
    "Q7075": "library",
    "Q856234": "national library",
    "Q1078570": "academic library",
    "Q11294": "public library",
    "Q13226383": "research library",
    # Archives
    "Q166118": "archive",
    "Q473972": "national archives",
    "Q1423895": "film archive",
    "Q2066131": "regional archive",
    "Q63400100": "historical archive",
    "Q63400127": "municipal archive",
    "Q1026954": "photo archive",
    # Galleries (previously also listed under Museums — duplicate keys removed)
    "Q1007870": "art gallery",
    "Q7889618": "gallery of art",
    # Research centers
    "Q31855": "research institute",
    "Q327333": "heritage organisation",
    # Botanical/Zoo
    # NOTE(review): Q43229 also appears in NON_HERITAGE_P31_TYPES below (as
    # "organization"); entities carrying this P31 match both sets and fall
    # through to LLM verification. On Wikidata, Q43229 is "organization" and
    # botanical garden is Q167346 — confirm intended before changing, since
    # _infer_subtype_from_p31 also checks Q43229 for the BOT subtype.
    "Q43229": "botanical garden",
    "Q45585": "botanical garden",
    "Q43501": "zoo",
    # Holy sites (when managing heritage collections)
    "Q317557": "monastery",
    "Q83405": "abbey",
    "Q1088552": "cathedral chapter",
    # Educational (with collections)
    "Q3918": "university",
    "Q875538": "public university",
}

# These P31 values indicate NOT a heritage institution
NON_HERITAGE_P31_TYPES = {
    "Q515": "city",
    "Q174782": "square",
    "Q5": "human",
    "Q4830453": "business",
    "Q891723": "public company",
    "Q783794": "company",
    "Q6881511": "enterprise",
    "Q43229": "organization",  # Too generic
    "Q55678": "movie",
    "Q7366": "song",
    "Q5398426": "television series",
}

# =============================================================================
# =============================================================================
# WIKIDATA API CLIENT
# =============================================================================


class WikidataSearchClient:
    """Client for Wikidata search and entity API."""

    SEARCH_URL = "https://www.wikidata.org/w/api.php"
    ENTITY_URL = "https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"

    def __init__(self, contact_email: Optional[str] = None):
        """Create an async client; Wikimedia etiquette requires a contact in User-Agent."""
        self.contact_email = contact_email or os.environ.get(
            "WIKIMEDIA_CONTACT_EMAIL", "glam@example.org"
        )
        self.client = httpx.AsyncClient(
            timeout=30.0,
            headers={
                "User-Agent": f"GLAMBot/1.0 ({self.contact_email})",
            },
        )

    async def search_entity(
        self, name: str, language: str = "en", limit: int = 5
    ) -> List[Dict[str, Any]]:
        """
        Search Wikidata for entities matching a name.

        Returns a list of candidate dicts with qid, label, description, url.
        On any error, logs and returns an empty list (best-effort).
        """
        params = {
            "action": "wbsearchentities",
            "format": "json",
            "language": language,
            "type": "item",
            "limit": limit,
            "search": name,
        }
        try:
            response = await self.client.get(self.SEARCH_URL, params=params)
            response.raise_for_status()
            data = response.json()
            return [
                {
                    "qid": item.get("id"),
                    "label": item.get("label"),
                    "description": item.get("description", ""),
                    "url": item.get("concepturi"),
                }
                for item in data.get("search", [])
            ]
        except Exception as e:
            logger.error(f"Wikidata search error for '{name}': {e}")
            return []

    @staticmethod
    def _entity_id_values(claims: Dict[str, Any], prop: str) -> List[str]:
        """Collect all wikibase-entityid QIDs asserted for *prop* in a claims dict."""
        values: List[str] = []
        for claim in claims.get(prop, []):
            mainsnak = claim.get("mainsnak", {})
            if mainsnak.get("snaktype") != "value":
                continue
            datavalue = mainsnak.get("datavalue", {})
            if datavalue.get("type") == "wikibase-entityid":
                value_qid = datavalue.get("value", {}).get("id")
                if value_qid:
                    values.append(value_qid)
        return values

    async def get_entity_claims(self, qid: str) -> Dict[str, Any]:
        """
        Get entity claims (properties) from Wikidata.

        Returns a dict with p31 (instance of), p131 (located in), p17_country,
        coordinates (P625), plus labels/descriptions. Empty dict on error.
        """
        url = self.ENTITY_URL.format(qid=qid)
        try:
            response = await self.client.get(url)
            response.raise_for_status()
            data = response.json()
            entity = data.get("entities", {}).get(qid, {})
            claims = entity.get("claims", {})
            labels = entity.get("labels", {})
            descriptions = entity.get("descriptions", {})

            # P31 (instance of) and P131 (located in administrative entity)
            p31_values = self._entity_id_values(claims, "P31")
            p131_values = self._entity_id_values(claims, "P131")

            # P625 (coordinates): first globe-coordinate value wins
            coordinates = None
            for claim in claims.get("P625", []):
                mainsnak = claim.get("mainsnak", {})
                if mainsnak.get("snaktype") == "value":
                    datavalue = mainsnak.get("datavalue", {})
                    if datavalue.get("type") == "globecoordinate":
                        value = datavalue.get("value", {})
                        coordinates = {
                            "latitude": value.get("latitude"),
                            "longitude": value.get("longitude"),
                        }
                        break

            # P17 (country): first entity-id value wins
            p17_values = self._entity_id_values(claims, "P17")
            country = p17_values[0] if p17_values else None

            return {
                "qid": qid,
                "labels": {k: v.get("value") for k, v in labels.items()},
                "descriptions": {k: v.get("value") for k, v in descriptions.items()},
                "p31": p31_values,
                "p131": p131_values,
                "p17_country": country,
                "coordinates": coordinates,
            }
        except Exception as e:
            logger.error(f"Wikidata entity fetch error for {qid}: {e}")
            return {}

    async def close(self):
        """Release the underlying HTTP connection pool."""
        await self.client.aclose()


# =============================================================================
# GLM-4.6 CH ANNOTATOR VERIFICATION
# =============================================================================


class GLMHeritageVerifier:
    """
    Verify Wikidata entity matches using GLM-4.6 CH Annotator.

    Uses CH Annotator v1.7.0 entity type GRP.HER to verify that
    a Wikidata entity is actually a heritage institution.
    """

    # Z.AI Coding Plan endpoint (different from regular BigModel API)
    ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
    # Anthropic Messages API endpoint + fast, cheap model for verification
    CLAUDE_API_URL = "https://api.anthropic.com/v1/messages"
    CLAUDE_MODEL = "claude-3-5-haiku-20241022"

    VERIFICATION_PROMPT = """You are a heritage institution classifier following CH-Annotator v1.7.0 convention.

Your task is to determine if a Wikidata entity is a heritage institution (type GRP.HER).

## CH-Annotator GRP.HER Definition
Heritage institutions are organizations that:
- Collect, preserve, and provide access to cultural heritage materials
- Include: museums (GRP.HER.MUS), libraries (GRP.HER.LIB), archives (GRP.HER.ARC), galleries (GRP.HER.GAL)
- May also include: research centers, botanical gardens, educational institutions WITH collections

## Entity Types That Are NOT Heritage Institutions
- Cities, towns, municipalities (these are places, not institutions)
- General businesses or companies (unless they manage heritage collections)
- People (individuals are AGT.PER, not GRP.HER)
- Events, festivals, exhibitions (temporary, not institutions)
- Buildings without institutional function (just architecture)

## Your Task
Analyze the Wikidata entity data and determine:
1. Is this entity a heritage institution (GRP.HER)?
2. If yes, what subtype? (MUS/LIB/ARC/GAL/OTHER)
3. Confidence score (0.0-1.0)

Respond in JSON format:
```json
{{
  "is_heritage_institution": true/false,
  "subtype": "MUS|LIB|ARC|GAL|RES|BOT|EDU|OTHER|null",
  "confidence": 0.95,
  "reasoning": "Brief explanation"
}}
```

## Entity to Analyze
Institution name from our data: {institution_name}
Location from our data: {institution_location}

Wikidata entity:
- QID: {qid}
- Label: {wd_label}
- Description: {wd_description}
- Instance of (P31): {p31_types}
- Located in (P131): {p131_location}
"""

    def __init__(self, api_key: Optional[str] = None, model: str = "glm-4.6",
                 use_claude: bool = False):
        """
        Configure either the Claude or the GLM/Z.AI backend.

        Raises ValueError when the relevant API key env var is missing.
        NOTE: when use_claude is True the *model* argument is ignored in
        favor of CLAUDE_MODEL (original behavior, preserved).
        """
        self.use_claude = use_claude
        if use_claude:
            self.api_key = api_key or os.environ.get("CLAUDE_API_KEY")
            self.model = self.CLAUDE_MODEL
            self.api_url = self.CLAUDE_API_URL
            if not self.api_key:
                raise ValueError("CLAUDE_API_KEY not found in environment")
            self.client = httpx.AsyncClient(
                timeout=60.0,
                headers={
                    "x-api-key": self.api_key,
                    "anthropic-version": "2023-06-01",
                    "Content-Type": "application/json",
                },
            )
        else:
            self.api_key = api_key or os.environ.get("ZAI_API_TOKEN")
            self.model = model
            # Use Z.AI Coding Plan endpoint (same as OpenCode)
            self.api_url = self.ZAI_API_URL
            if not self.api_key:
                raise ValueError("ZAI_API_TOKEN not found in environment")
            self.client = httpx.AsyncClient(
                timeout=60.0,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
            )

    @staticmethod
    def _extract_json_payload(content: str) -> str:
        """
        Best-effort isolation of a JSON object inside an LLM reply.

        Strips markdown code fences, then scans for a balanced {...} span;
        as a last resort wraps a bare JSON body in braces.
        """
        if "```json" in content:
            content = content.split("```json")[1].split("```")[0]
        elif "```" in content:
            content = content.split("```")[1].split("```")[0]
        content = content.strip()
        if not content.startswith("{"):
            start_idx = content.find("{")
            if start_idx != -1:
                # Walk forward to the matching closing brace
                brace_count = 0
                end_idx = start_idx
                for i, char in enumerate(content[start_idx:], start_idx):
                    if char == "{":
                        brace_count += 1
                    elif char == "}":
                        brace_count -= 1
                        if brace_count == 0:
                            end_idx = i
                            break
                content = content[start_idx:end_idx + 1]
            elif '"is_heritage_institution"' in content:
                # No braces at all but looks like a JSON body - wrap it
                content = "{" + content.rstrip().rstrip(",") + "}"
        return content

    @staticmethod
    def _regex_fallback(content: str) -> Optional[Dict[str, Any]]:
        """
        Extract verification fields from a non-JSON reply with regexes.

        FIX over original: the boolean is read from the actual
        `"is_heritage_institution": true|false` value instead of searching
        the whole reply for the substring 'true' (which misclassified
        replies whose reasoning merely contained that word).
        Returns None when the key/value pair is absent.
        """
        bool_match = re.search(
            r'"is_heritage_institution"\s*:\s*(true|false)', content, re.IGNORECASE
        )
        if not bool_match:
            return None
        is_heritage = bool_match.group(1).lower() == "true"

        subtype = None
        confidence = 0.5
        reasoning = "Parsed from non-JSON response"

        subtype_match = re.search(r'"subtype"\s*:\s*"([^"]+)"', content)
        if subtype_match:
            subtype = subtype_match.group(1)

        conf_match = re.search(r'"confidence"\s*:\s*([\d.]+)', content)
        if conf_match:
            try:
                confidence = float(conf_match.group(1))
            except ValueError:
                pass

        reason_match = re.search(r'"reasoning"\s*:\s*"([^"]+)"', content)
        if reason_match:
            reasoning = reason_match.group(1)

        return {
            "is_heritage_institution": is_heritage,
            "subtype": subtype,
            "confidence": confidence,
            "reasoning": reasoning,
        }

    async def verify_heritage_institution(
        self,
        institution_name: str,
        institution_location: str,
        qid: str,
        wd_label: str,
        wd_description: str,
        p31_types: List[str],
        p131_location: List[str],
    ) -> Dict[str, Any]:
        """
        Verify if a Wikidata entity matches a heritage institution.

        Fast path: decide from P31 values alone when they are unambiguous.
        Otherwise ask the configured LLM and parse its JSON reply.
        Returns a verification dict with confidence score and method tag.
        """
        # First, quick heuristic check using P31 types
        p31_set = set(p31_types)
        heritage_matches = p31_set & set(HERITAGE_P31_TYPES.keys())
        non_heritage_matches = p31_set & set(NON_HERITAGE_P31_TYPES.keys())

        # If clear non-heritage type, reject without LLM call
        if non_heritage_matches and not heritage_matches:
            logger.debug(f"Quick reject {qid}: P31 indicates non-heritage ({non_heritage_matches})")
            return {
                "is_heritage_institution": False,
                "subtype": None,
                "confidence": 0.95,
                "reasoning": f"P31 types indicate non-heritage: {[NON_HERITAGE_P31_TYPES.get(t, t) for t in non_heritage_matches]}",
                "verification_method": "p31_heuristic",
            }

        # If clear heritage type, high confidence without LLM
        if heritage_matches and not non_heritage_matches:
            subtype = self._infer_subtype_from_p31(p31_types)
            logger.debug(f"Quick accept {qid}: P31 indicates heritage ({heritage_matches})")
            return {
                "is_heritage_institution": True,
                "subtype": subtype,
                "confidence": 0.9,
                "reasoning": f"P31 types indicate heritage: {[HERITAGE_P31_TYPES.get(t, t) for t in heritage_matches]}",
                "verification_method": "p31_heuristic",
            }

        # Ambiguous case - use the LLM for verification
        p31_labels = [HERITAGE_P31_TYPES.get(t, NON_HERITAGE_P31_TYPES.get(t, t)) for t in p31_types]
        prompt = self.VERIFICATION_PROMPT.format(
            institution_name=institution_name,
            institution_location=institution_location,
            qid=qid,
            wd_label=wd_label,
            wd_description=wd_description,
            p31_types=", ".join(p31_labels) if p31_labels else "None specified",
            p131_location=", ".join(p131_location) if p131_location else "Not specified",
        )

        try:
            if self.use_claude:
                # Claude API request format (system prompt is a top-level field)
                response = await self.client.post(
                    self.api_url,
                    json={
                        "model": self.model,
                        "max_tokens": 512,
                        "messages": [
                            {"role": "user", "content": prompt},
                        ],
                        "system": "You are a heritage institution classifier. Respond only in valid JSON. Start your response with { and end with }.",
                    },
                )
                response.raise_for_status()
                data = response.json()
                content = data.get("content", [{}])[0].get("text", "")
                logger.debug(f"Claude raw response for {qid}: {content[:300]}")
                verification_method = "claude_ch_annotator"
            else:
                # GLM/Z.AI API request format (OpenAI-style chat completions)
                response = await self.client.post(
                    self.api_url,
                    json={
                        "model": self.model,
                        "messages": [
                            {"role": "system", "content": "You are a heritage institution classifier. Respond only in valid JSON."},
                            {"role": "user", "content": prompt},
                        ],
                        "temperature": 0.1,
                        "max_tokens": 512,
                    },
                )
                response.raise_for_status()
                data = response.json()
                content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
                verification_method = "glm_4.6_ch_annotator"

            # Parse JSON from response; fall back to regex extraction
            payload = self._extract_json_payload(content)
            try:
                result = json.loads(payload)
                result["verification_method"] = verification_method
                return result
            except json.JSONDecodeError as e:
                logger.debug(f"JSON parse failed for {qid}, trying regex fallback: {content[:200]}")
                fallback = self._regex_fallback(content)
                if fallback is not None:
                    fallback["verification_method"] = f"{verification_method}_regex_fallback"
                    return fallback
                logger.warning(f"Failed to parse LLM response for {qid}: {str(e)[:100]} - content: {content[:200]}")
                return {
                    "is_heritage_institution": False,
                    "subtype": None,
                    "confidence": 0.0,
                    "reasoning": f"Failed to parse LLM response: {str(e)}",
                    "verification_method": f"{verification_method}_parse_error",
                }
        except Exception as e:
            logger.error(f"LLM verification error for {qid}: {e}")
            return {
                "is_heritage_institution": False,
                "subtype": None,
                "confidence": 0.0,
                "reasoning": f"API error: {e}",
                "verification_method": "llm_api_error",
            }

    def _infer_subtype_from_p31(self, p31_types: List[str]) -> str:
        """Infer heritage institution subtype (MUS/LIB/ARC/GAL/RES/BOT/EDU/OTHER) from P31 values."""
        p31_set = set(p31_types)

        # Checked in priority order: first matching group wins.
        museum_types = {"Q33506", "Q207694", "Q1970365", "Q18388277", "Q2087181",
                        "Q17431399", "Q1788742", "Q57660343", "Q15206070", "Q214090",
                        "Q2522387", "Q841573", "Q28737012", "Q588140", "Q515034",
                        "Q4287745"}
        if p31_set & museum_types:
            return "MUS"

        library_types = {"Q7075", "Q856234", "Q1078570", "Q11294", "Q13226383"}
        if p31_set & library_types:
            return "LIB"

        archive_types = {"Q166118", "Q473972", "Q1423895", "Q2066131",
                         "Q63400100", "Q63400127", "Q1026954"}
        if p31_set & archive_types:
            return "ARC"

        gallery_types = {"Q1007870", "Q7889618"}
        if p31_set & gallery_types:
            return "GAL"

        if "Q31855" in p31_set or "Q327333" in p31_set:
            return "RES"

        if "Q43229" in p31_set or "Q45585" in p31_set or "Q43501" in p31_set:
            return "BOT"

        if "Q3918" in p31_set or "Q875538" in p31_set:
            return "EDU"

        return "OTHER"

    async def close(self):
        """Release the underlying HTTP connection pool."""
        await self.client.aclose()


# =============================================================================
# MAIN ENRICHMENT LOGIC
# =============================================================================


async def find_files_needing_reenrichment(custodian_dir: Path) -> List[Path]:
    """Find all YAML files in *custodian_dir* marked with the re-enrichment sentinel."""
    files = []
    for file_path in custodian_dir.glob("*.yaml"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            if "Re-enrichment required" in content:
                files.append(file_path)
        except Exception as e:
            logger.warning(f"Error reading {file_path}: {e}")
    return files


def _institution_name(entry: Dict[str, Any]) -> Optional[str]:
    """Pick the institution name: custodian_name > Google Maps name > original entry."""
    name = None
    if "custodian_name" in entry and isinstance(entry["custodian_name"], dict):
        name = entry["custodian_name"].get("claim_value")
    if not name and "google_maps_enrichment" in entry:
        name = entry["google_maps_enrichment"].get("name")
    if not name and "original_entry" in entry:
        name = entry["original_entry"].get("organisatie") or entry["original_entry"].get("instelling")
    return name


def _institution_location(entry: Dict[str, Any]) -> str:
    """Build a location hint string from Google Maps data, else the original entry."""
    if "google_maps_enrichment" in entry:
        gm = entry["google_maps_enrichment"]
        parts = []
        if gm.get("short_address"):
            parts.append(gm["short_address"])
        elif gm.get("formatted_address"):
            parts.append(gm["formatted_address"])
        return ", ".join(parts)
    if "original_entry" in entry:
        oe = entry["original_entry"]
        parts = []
        # NOTE: keys are Dutch ("plaatsnaam_bezoekadres" = visiting-address town,
        # "provincie" = province) — source data is NL-oriented.
        if oe.get("plaatsnaam_bezoekadres"):
            parts.append(oe["plaatsnaam_bezoekadres"])
        if oe.get("provincie"):
            parts.append(oe["provincie"])
        return ", ".join(parts)
    return ""


def _country_code(entry: Dict[str, Any]) -> str:
    """Derive a 2-letter country code from the GHC id prefix; defaults to NL."""
    if "ghcid" in entry:
        ghcid = entry["ghcid"].get("ghcid_current", "")
        if ghcid and len(ghcid) >= 2:
            return ghcid[:2]
    return "NL"


def _search_languages(country_code: str) -> List[str]:
    """Map a country code to the Wikidata search languages to try (English always included)."""
    if country_code == "NL":
        return ["nl", "en"]
    if country_code == "BE":
        return ["nl", "fr", "en"]
    if country_code == "DE":
        return ["de", "en"]
    if country_code == "FR":
        return ["fr", "en"]
    if country_code in ["BR", "PT"]:
        return ["pt", "en"]
    if country_code in ["ES", "MX", "AR", "CL", "CO"]:
        return ["es", "en"]
    return ["en"]


def _strip_reenrichment_note(entry: Dict[str, Any]) -> None:
    """Remove the 'Re-enrichment required' marker (and anything after it) from provenance notes."""
    if "provenance" in entry and "notes" in entry["provenance"]:
        notes = entry["provenance"]["notes"]
        if "Re-enrichment required" in notes:
            entry["provenance"]["notes"] = notes.split("Re-enrichment required")[0].strip()


def _save_entry(file_path: Path, entry: Dict[str, Any]) -> None:
    """Write the entry back to disk as block-style YAML, preserving key order."""
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)


async def enrich_file_with_wikidata(
    file_path: Path,
    wd_client: WikidataSearchClient,
    verifier: GLMHeritageVerifier,
) -> Dict[str, Any]:
    """
    Enrich a single file with verified Wikidata data.

    Searches Wikidata in the country-appropriate languages, verifies each
    candidate via the P31 heuristic / LLM, and writes the best match (if
    confidence >= 0.5) back into the YAML file. Returns a result dict whose
    "status" is one of: empty_file, no_name, not_found, no_verified_match,
    verified_match.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    if not entry:
        return {"status": "empty_file", "file": str(file_path)}

    name = _institution_name(entry)
    if not name:
        return {"status": "no_name", "file": str(file_path)}

    location = _institution_location(entry)
    search_langs = _search_languages(_country_code(entry))

    # Search Wikidata for candidates in each language
    all_candidates = []
    for lang in search_langs:
        candidates = await wd_client.search_entity(name, language=lang, limit=5)
        all_candidates.extend(candidates)
        await asyncio.sleep(0.2)  # Rate limiting

    # Deduplicate by QID, keeping first-seen order
    seen_qids = set()
    unique_candidates = []
    for c in all_candidates:
        if c["qid"] not in seen_qids:
            seen_qids.add(c["qid"])
            unique_candidates.append(c)

    if not unique_candidates:
        # Update file to mark as not found
        entry["wikidata_enrichment_status"] = "NOT_FOUND"
        entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()
        _strip_reenrichment_note(entry)
        _save_entry(file_path, entry)
        return {"status": "not_found", "file": str(file_path), "name": name}

    # Verify each candidate, keeping the highest-confidence heritage match
    best_match = None
    best_confidence = 0.0
    for candidate in unique_candidates[:5]:  # Limit to top 5
        qid = candidate["qid"]
        entity_data = await wd_client.get_entity_claims(qid)
        await asyncio.sleep(0.2)
        if not entity_data:
            continue

        verification = await verifier.verify_heritage_institution(
            institution_name=name,
            institution_location=location,
            qid=qid,
            wd_label=candidate.get("label", ""),
            wd_description=candidate.get("description", ""),
            p31_types=entity_data.get("p31", []),
            p131_location=[str(x) for x in entity_data.get("p131", [])],
        )

        if verification.get("is_heritage_institution") and verification.get("confidence", 0) > best_confidence:
            best_match = {
                "qid": qid,
                "label": candidate.get("label"),
                "description": candidate.get("description"),
                "entity_data": entity_data,
                "verification": verification,
            }
            best_confidence = verification.get("confidence", 0)

    if not best_match or best_confidence < 0.5:
        # No verified match found
        entry["wikidata_enrichment_status"] = "NO_VERIFIED_MATCH"
        entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()
        entry["wikidata_candidates_checked"] = len(unique_candidates)
        _strip_reenrichment_note(entry)
        _save_entry(file_path, entry)
        return {"status": "no_verified_match", "file": str(file_path), "name": name,
                "candidates": len(unique_candidates)}

    # Add verified Wikidata enrichment
    qid = best_match["qid"]
    entity_data = best_match["entity_data"]
    verification = best_match["verification"]

    entry["wikidata_enrichment"] = {
        "wikidata_id": qid,
        "wikidata_url": f"https://www.wikidata.org/wiki/{qid}",
        "wikidata_label": best_match.get("label"),
        "wikidata_description": best_match.get("description"),
        "labels": entity_data.get("labels", {}),
        "descriptions": entity_data.get("descriptions", {}),
        "instance_of": entity_data.get("p31", []),
        "located_in": entity_data.get("p131", []),
        "country": entity_data.get("p17_country"),
        "coordinates": entity_data.get("coordinates"),
        "enrichment_timestamp": datetime.now(timezone.utc).isoformat(),
        "verification": {
            "method": verification.get("verification_method"),
            "confidence": verification.get("confidence"),
            "subtype": verification.get("subtype"),
            "reasoning": verification.get("reasoning"),
            "ch_annotator_version": "v1.7.0",
        },
    }
    entry["wikidata_enrichment_status"] = "VERIFIED"
    entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()

    # Add Wikidata ID to identifiers (only once)
    if "identifiers" not in entry:
        entry["identifiers"] = []
    existing_schemes = {i.get("identifier_scheme") for i in entry["identifiers"] if isinstance(i, dict)}
    if "Wikidata" not in existing_schemes:
        entry["identifiers"].append({
            "identifier_scheme": "Wikidata",
            "identifier_value": qid,
            "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
        })

    _strip_reenrichment_note(entry)
    _save_entry(file_path, entry)

    logger.info(f"✓ Enriched {file_path.name} with {qid} ({best_match.get('label')}) - confidence: {best_confidence:.2f}")

    return {
        "status": "verified_match",
        "file": str(file_path),
        "name": name,
        "qid": qid,
        "label": best_match.get("label"),
        "confidence": best_confidence,
        "subtype": verification.get("subtype"),
    }


async def main():
    """Main entry point: find marked files, enrich them, print and save a summary."""
    import argparse

    parser = argparse.ArgumentParser(description="Re-enrich Wikidata with LLM verification")
    parser.add_argument("--limit", type=int, default=100, help="Max files to process")
    parser.add_argument("--dry-run", action="store_true", help="Don't modify files")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--use-claude", action="store_true", help="Use Claude instead of GLM-4.6")
    # Paths were previously hardcoded; now overridable (defaults unchanged)
    parser.add_argument("--custodian-dir", default="/Users/kempersc/apps/glam/data/custodian",
                        help="Directory containing custodian YAML files")
    parser.add_argument("--reports-dir", default="/Users/kempersc/apps/glam/reports",
                        help="Directory to write the results report to")
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    custodian_dir = Path(args.custodian_dir)

    llm_name = "Claude" if args.use_claude else "GLM-4.6"
    print("=" * 60)
    print(f"Wikidata Re-enrichment with {llm_name} CH Annotator Verification")
    print("=" * 60)
    print()

    # Find files needing re-enrichment
    print("Finding files needing re-enrichment...")
    files = await find_files_needing_reenrichment(custodian_dir)
    print(f"Found {len(files)} files needing re-enrichment")

    if not files:
        print("No files to process!")
        return

    # Limit files
    files = files[:args.limit]
    print(f"Processing {len(files)} files (limit: {args.limit})")
    print()

    if args.dry_run:
        print("DRY RUN - no files will be modified")
        for f in files[:20]:
            print(f"  Would process: {f.name}")
        return

    # Initialize clients
    wd_client = WikidataSearchClient()
    verifier = GLMHeritageVerifier(use_claude=args.use_claude)

    # Process files, bucketing results by status
    results = {
        "verified_match": [],
        "no_verified_match": [],
        "not_found": [],
        "no_name": [],
        "error": [],
    }

    try:
        for i, file_path in enumerate(files, 1):
            print(f"\n[{i}/{len(files)}] Processing {file_path.name}...")
            try:
                result = await enrich_file_with_wikidata(file_path, wd_client, verifier)
                status = result.get("status", "error")
                results.setdefault(status, []).append(result)

                if status == "verified_match":
                    print(f"  ✓ {result.get('qid')} ({result.get('label')}) - {result.get('confidence', 0):.2f}")
                elif status == "no_verified_match":
                    print(f"  ✗ No verified match (checked {result.get('candidates', 0)} candidates)")
                elif status == "not_found":
                    print(f"  ✗ No Wikidata candidates found")
                elif status == "no_name":
                    print(f"  ⚠ No institution name found")
            except Exception as e:
                logger.error(f"Error processing {file_path}: {e}")
                results["error"].append({"file": str(file_path), "error": str(e)})

            # Rate limiting
            await asyncio.sleep(0.5)
    finally:
        await wd_client.close()
        await verifier.close()

    # Print summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Verified matches:    {len(results.get('verified_match', []))}")
    print(f"No verified match:   {len(results.get('no_verified_match', []))}")
    print(f"Not found:           {len(results.get('not_found', []))}")
    print(f"No name:             {len(results.get('no_name', []))}")
    print(f"Errors:              {len(results.get('error', []))}")
    print()

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = Path(args.reports_dir) / f"wikidata_reenrichment_{timestamp}.yaml"
    results_file.parent.mkdir(parents=True, exist_ok=True)
    with open(results_file, 'w', encoding='utf-8') as f:
        yaml.dump({
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "files_processed": len(files),
            "results": results,
        }, f, allow_unicode=True, default_flow_style=False)

    print(f"Results saved to: {results_file}")


if __name__ == "__main__":
    asyncio.run(main())