feat(wikidata): add re-enrichment and duplicate removal scripts
- Add reenrich_wikidata_with_verification.py for re-running enrichment - Add remove_wikidata_duplicates.py for deduplication
This commit is contained in:
parent
891692a4d6
commit
486bbee813
2 changed files with 1103 additions and 0 deletions
958
scripts/reenrich_wikidata_with_verification.py
Executable file
958
scripts/reenrich_wikidata_with_verification.py
Executable file
|
|
@ -0,0 +1,958 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Re-enrich heritage institutions with Wikidata using GLM-4.6 CH Annotator verification.
|
||||
|
||||
This script:
|
||||
1. Finds files marked for re-enrichment (after duplicate cleanup)
|
||||
2. Queries Wikidata API for candidates by institution name
|
||||
3. Uses GLM-4.6 to verify matches based on CH Annotator entity types (GRP.HER)
|
||||
4. Only adds Wikidata enrichment if entity is verified as heritage institution
|
||||
5. Updates files with verified Wikidata data
|
||||
|
||||
CH Annotator Convention (v1.7.0):
|
||||
- Heritage institutions are type GRP.HER (glam:HeritageCustodian)
|
||||
- Maps to: org:FormalOrganization, rov:RegisteredOrganization, schema:Museum, schema:Library, schema:ArchiveOrganization
|
||||
- Subtypes: GRP.HER.GAL (Gallery), GRP.HER.LIB (Library), GRP.HER.ARC (Archive), GRP.HER.MUS (Museum)
|
||||
|
||||
Wikidata "instance of" (P31) values for heritage institutions:
|
||||
- Q33506 (museum)
|
||||
- Q7075 (library)
|
||||
- Q166118 (archive)
|
||||
- Q1007870 (art gallery)
|
||||
- Q207694 (art museum)
|
||||
- Q1970365 (natural history museum)
|
||||
- Q18388277 (history museum)
|
||||
- Q23413 (castle) - when used as museum
|
||||
- Q839954 (archaeological site)
|
||||
- Q174782 (town square) - NOT heritage institution
|
||||
- Q515 (city) - NOT heritage institution
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
import yaml
|
||||
import httpx
|
||||
import logging
|
||||
|
||||
# Load environment variables from .env file
|
||||
from dotenv import load_dotenv
|
||||
env_path = Path(__file__).parent.parent / ".env"
|
||||
load_dotenv(env_path)
|
||||
|
||||
# Configure logging
# Root handler prints timestamped INFO-level messages; the --verbose flag in
# main() lowers this to DEBUG at runtime.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by every class and function in this script.
logger = logging.getLogger(__name__)

# Add src to path so project-local modules are importable when this script is
# run directly from the repository checkout.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
# =============================================================================
|
||||
# WIKIDATA HERITAGE INSTITUTION TYPE CLASSES (P31 values)
|
||||
# =============================================================================
|
||||
|
||||
# These are valid Wikidata "instance of" (P31) values for heritage
# institutions.  An entity whose P31 set intersects this table (and not
# NON_HERITAGE_P31_TYPES) is accepted as GRP.HER without an LLM call.
HERITAGE_P31_TYPES = {
    # Museums
    "Q33506": "museum",
    "Q207694": "art museum",
    "Q1970365": "natural history museum",
    "Q18388277": "history museum",
    "Q2087181": "university museum",
    "Q17431399": "national museum",
    "Q16735822": "museum building",
    "Q1788742": "war museum",
    "Q4989906": "monuments and memorials",
    "Q57660343": "maritime museum",
    "Q15206070": "transport museum",
    "Q214090": "ethnographic museum",
    "Q2522387": "aviation museum",
    "Q841573": "archaeological museum",
    "Q28737012": "memorial museum",
    "Q588140": "railway museum",
    "Q515034": "science museum",
    "Q4287745": "local museum",

    # Libraries
    "Q7075": "library",
    "Q856234": "national library",
    "Q1078570": "academic library",
    "Q11294": "public library",
    "Q13226383": "research library",

    # Archives
    "Q166118": "archive",
    "Q473972": "national archives",
    "Q1423895": "film archive",
    "Q2066131": "regional archive",
    "Q63400100": "historical archive",
    "Q63400127": "municipal archive",
    "Q1026954": "photo archive",

    # Galleries (previously ALSO listed under Museums; duplicate dict keys
    # with identical values were silently collapsed by Python, so listing
    # them once here is behavior-neutral)
    "Q1007870": "art gallery",
    "Q7889618": "gallery of art",

    # Research centers
    "Q31855": "research institute",
    "Q327333": "heritage organisation",

    # Botanical/Zoo
    # NOTE(review): "Q43229" also appears in NON_HERITAGE_P31_TYPES below as
    # the generic "organization" class; entities carrying it match both
    # tables and are therefore escalated to LLM verification rather than
    # auto-accepted.  Confirm the intended botanical-garden QID (commonly
    # Q167346) before removing either entry.
    "Q43229": "botanical garden",
    "Q45585": "botanical garden",
    "Q43501": "zoo",

    # Holy sites (when managing heritage collections)
    "Q317557": "monastery",
    "Q83405": "abbey",
    "Q1088552": "cathedral chapter",

    # Educational (with collections)
    "Q3918": "university",
    "Q875538": "public university",
}
|
||||
|
||||
# These P31 values positively identify an entity as NOT a heritage
# institution.  Used as a fast reject heuristic: when an entity's P31 set
# intersects only this table (and not HERITAGE_P31_TYPES) it is rejected
# without an LLM call.
NON_HERITAGE_P31_TYPES = {
    "Q515": "city",
    "Q174782": "square",
    "Q5": "human",
    "Q4830453": "business",
    "Q891723": "public company",
    "Q783794": "company",
    "Q6881511": "enterprise",
    # Too generic.  NOTE(review): "Q43229" is also listed in
    # HERITAGE_P31_TYPES as "botanical garden"; the overlap means entities
    # carrying it match both tables and fall through to LLM verification.
    "Q43229": "organization",
    "Q55678": "movie",
    "Q7366": "song",
    "Q5398426": "television series",
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# WIKIDATA API CLIENT
|
||||
# =============================================================================
|
||||
|
||||
class WikidataSearchClient:
    """Async client for the Wikidata search and entity-data APIs."""

    SEARCH_URL = "https://www.wikidata.org/w/api.php"
    ENTITY_URL = "https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"

    def __init__(self, contact_email: Optional[str] = None):
        """Create the underlying HTTP client.

        contact_email: address advertised in the User-Agent header per
        Wikimedia API etiquette; falls back to the WIKIMEDIA_CONTACT_EMAIL
        environment variable, then a placeholder.
        """
        self.contact_email = contact_email or os.environ.get("WIKIMEDIA_CONTACT_EMAIL", "glam@example.org")
        self.client = httpx.AsyncClient(
            timeout=30.0,
            headers={
                "User-Agent": f"GLAMBot/1.0 ({self.contact_email})",
            }
        )

    @staticmethod
    def _entity_id_values(claims: Dict[str, Any], prop: str) -> List[str]:
        """Return the QIDs of all entity-id value statements for *prop*.

        Skips "novalue"/"somevalue" snaks, non-entity datatypes, and
        malformed statements missing an "id" key.
        """
        qids: List[str] = []
        for claim in claims.get(prop, []):
            mainsnak = claim.get("mainsnak", {})
            if mainsnak.get("snaktype") == "value":
                datavalue = mainsnak.get("datavalue", {})
                if datavalue.get("type") == "wikibase-entityid":
                    value_qid = datavalue.get("value", {}).get("id")
                    if value_qid:
                        qids.append(value_qid)
        return qids

    async def search_entity(self, name: str, language: str = "en", limit: int = 5) -> List[Dict[str, Any]]:
        """
        Search Wikidata for entities matching a name (wbsearchentities).

        Returns a list of candidate dicts with qid, label, description and
        concept URL; an empty list on any API error (logged, never raised).
        """
        params = {
            "action": "wbsearchentities",
            "format": "json",
            "language": language,
            "type": "item",
            "limit": limit,
            "search": name,
        }

        try:
            response = await self.client.get(self.SEARCH_URL, params=params)
            response.raise_for_status()
            data = response.json()
        except Exception as e:
            logger.error(f"Wikidata search error for '{name}': {e}")
            return []

        return [
            {
                "qid": item.get("id"),
                "label": item.get("label"),
                "description": item.get("description", ""),
                "url": item.get("concepturi"),
            }
            for item in data.get("search", [])
        ]

    async def get_entity_claims(self, qid: str) -> Dict[str, Any]:
        """
        Fetch an entity and distill the claims this pipeline uses.

        Returns a dict with labels, descriptions, P31 (instance of),
        P131 (located in), P17 (country, first value) and P625
        (coordinates, first value); {} on any error (logged, never raised).
        """
        url = self.ENTITY_URL.format(qid=qid)

        try:
            response = await self.client.get(url)
            response.raise_for_status()
            data = response.json()

            entity = data.get("entities", {}).get(qid, {})
            claims = entity.get("claims", {})
            labels = entity.get("labels", {})
            descriptions = entity.get("descriptions", {})

            # P625 (coordinates): first "value" statement wins.
            coordinates = None
            for claim in claims.get("P625", []):
                mainsnak = claim.get("mainsnak", {})
                if mainsnak.get("snaktype") == "value":
                    datavalue = mainsnak.get("datavalue", {})
                    if datavalue.get("type") == "globecoordinate":
                        value = datavalue.get("value", {})
                        coordinates = {
                            "latitude": value.get("latitude"),
                            "longitude": value.get("longitude"),
                        }
                        break

            # P17 (country): first well-formed entity-id statement wins.
            countries = self._entity_id_values(claims, "P17")

            return {
                "qid": qid,
                "labels": {k: v.get("value") for k, v in labels.items()},
                "descriptions": {k: v.get("value") for k, v in descriptions.items()},
                "p31": self._entity_id_values(claims, "P31"),
                "p131": self._entity_id_values(claims, "P131"),
                "p17_country": countries[0] if countries else None,
                "coordinates": coordinates,
            }

        except Exception as e:
            logger.error(f"Wikidata entity fetch error for {qid}: {e}")
            return {}

    async def close(self):
        """Dispose of the underlying HTTP connection pool."""
        await self.client.aclose()
||||
|
||||
|
||||
# =============================================================================
|
||||
# GLM-4.6 CH ANNOTATOR VERIFICATION
|
||||
# =============================================================================
|
||||
|
||||
class GLMHeritageVerifier:
    """
    Verify Wikidata entity matches using GLM-4.6 (or Claude) following the
    CH Annotator v1.7.0 convention (entity type GRP.HER).

    Clear-cut cases are decided by a cheap P31 table lookup against
    HERITAGE_P31_TYPES / NON_HERITAGE_P31_TYPES; only ambiguous entities
    incur an LLM call.
    """

    # Z.AI Coding Plan endpoint (different from regular BigModel API; same
    # endpoint OpenCode uses).
    ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
    # Anthropic Messages API endpoint and a fast, cheap Claude model.
    CLAUDE_API_URL = "https://api.anthropic.com/v1/messages"
    CLAUDE_MODEL = "claude-3-5-haiku-20241022"

    VERIFICATION_PROMPT = """You are a heritage institution classifier following CH-Annotator v1.7.0 convention.

Your task is to determine if a Wikidata entity is a heritage institution (type GRP.HER).

## CH-Annotator GRP.HER Definition
Heritage institutions are organizations that:
- Collect, preserve, and provide access to cultural heritage materials
- Include: museums (GRP.HER.MUS), libraries (GRP.HER.LIB), archives (GRP.HER.ARC), galleries (GRP.HER.GAL)
- May also include: research centers, botanical gardens, educational institutions WITH collections

## Entity Types That Are NOT Heritage Institutions
- Cities, towns, municipalities (these are places, not institutions)
- General businesses or companies (unless they manage heritage collections)
- People (individuals are AGT.PER, not GRP.HER)
- Events, festivals, exhibitions (temporary, not institutions)
- Buildings without institutional function (just architecture)

## Your Task
Analyze the Wikidata entity data and determine:
1. Is this entity a heritage institution (GRP.HER)?
2. If yes, what subtype? (MUS/LIB/ARC/GAL/OTHER)
3. Confidence score (0.0-1.0)

Respond in JSON format:
```json
{{
  "is_heritage_institution": true/false,
  "subtype": "MUS|LIB|ARC|GAL|RES|BOT|EDU|OTHER|null",
  "confidence": 0.95,
  "reasoning": "Brief explanation"
}}
```

## Entity to Analyze
Institution name from our data: {institution_name}
Location from our data: {institution_location}

Wikidata entity:
- QID: {qid}
- Label: {wd_label}
- Description: {wd_description}
- Instance of (P31): {p31_types}
- Located in (P131): {p131_location}
"""

    def __init__(self, api_key: Optional[str] = None, model: str = "glm-4.6", use_claude: bool = False):
        """Configure the LLM backend.

        api_key: explicit key; falls back to CLAUDE_API_KEY or ZAI_API_TOKEN
            from the environment depending on *use_claude*.
        model: GLM model name (ignored when use_claude=True).
        use_claude: route verification through the Anthropic Messages API
            instead of the Z.AI endpoint.

        Raises ValueError when no API key can be found.
        """
        self.use_claude = use_claude

        if use_claude:
            self.api_key = api_key or os.environ.get("CLAUDE_API_KEY")
            if not self.api_key:
                raise ValueError("CLAUDE_API_KEY not found in environment")
            self.model = self.CLAUDE_MODEL
            self.api_url = self.CLAUDE_API_URL
            headers = {
                "x-api-key": self.api_key,
                "anthropic-version": "2023-06-01",
                "Content-Type": "application/json",
            }
        else:
            self.api_key = api_key or os.environ.get("ZAI_API_TOKEN")
            if not self.api_key:
                raise ValueError("ZAI_API_TOKEN not found in environment")
            self.model = model
            # Use the class constant (was previously duplicated as a literal).
            self.api_url = self.ZAI_API_URL
            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            }

        self.client = httpx.AsyncClient(timeout=60.0, headers=headers)

    def _p31_heuristic(self, qid: str, p31_types: List[str]) -> Optional[Dict[str, Any]]:
        """Decide clear-cut cases from P31 alone; None when ambiguous.

        Reject when P31 intersects only the non-heritage table; accept when
        it intersects only the heritage table.  Any overlap (or no match at
        all) is ambiguous and must go to the LLM.
        """
        p31_set = set(p31_types)
        heritage_matches = p31_set & set(HERITAGE_P31_TYPES.keys())
        non_heritage_matches = p31_set & set(NON_HERITAGE_P31_TYPES.keys())

        if non_heritage_matches and not heritage_matches:
            logger.debug(f"Quick reject {qid}: P31 indicates non-heritage ({non_heritage_matches})")
            return {
                "is_heritage_institution": False,
                "subtype": None,
                "confidence": 0.95,
                "reasoning": f"P31 types indicate non-heritage: {[NON_HERITAGE_P31_TYPES.get(t, t) for t in non_heritage_matches]}",
                "verification_method": "p31_heuristic",
            }

        if heritage_matches and not non_heritage_matches:
            subtype = self._infer_subtype_from_p31(p31_types)
            logger.debug(f"Quick accept {qid}: P31 indicates heritage ({heritage_matches})")
            return {
                "is_heritage_institution": True,
                "subtype": subtype,
                "confidence": 0.9,
                "reasoning": f"P31 types indicate heritage: {[HERITAGE_P31_TYPES.get(t, t) for t in heritage_matches]}",
                "verification_method": "p31_heuristic",
            }

        return None

    async def _call_llm(self, prompt: str, qid: str) -> Tuple[str, str]:
        """POST *prompt* to the configured backend.

        Returns (response text, verification-method tag).  Raises on HTTP
        errors; the caller converts those into an error result.
        """
        if self.use_claude:
            # Claude API request format (system prompt is a top-level field).
            response = await self.client.post(
                self.api_url,
                json={
                    "model": self.model,
                    "max_tokens": 512,
                    "messages": [
                        {"role": "user", "content": prompt},
                    ],
                    "system": "You are a heritage institution classifier. Respond only in valid JSON. Start your response with { and end with }.",
                }
            )
            response.raise_for_status()
            data = response.json()
            content = data.get("content", [{}])[0].get("text", "")
            logger.debug(f"Claude raw response for {qid}: {content[:300]}")
            return content, "claude_ch_annotator"

        # GLM/Z.AI API request format (OpenAI-style chat completions).
        response = await self.client.post(
            self.api_url,
            json={
                "model": self.model,
                "messages": [
                    {"role": "system", "content": "You are a heritage institution classifier. Respond only in valid JSON."},
                    {"role": "user", "content": prompt},
                ],
                "temperature": 0.1,
                "max_tokens": 512,
            }
        )
        response.raise_for_status()
        data = response.json()
        content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
        return content, "glm_4.6_ch_annotator"

    @staticmethod
    def _extract_json(content: str) -> str:
        """Best-effort extraction of a JSON object from an LLM reply.

        Strips markdown code fences, then falls back to scanning for the
        first balanced {...} span; finally wraps a bare JSON body in braces
        when the expected key is present.  May still return non-JSON text —
        the caller handles json.JSONDecodeError.
        """
        if "```json" in content:
            content = content.split("```json")[1].split("```")[0]
        elif "```" in content:
            content = content.split("```")[1].split("```")[0]

        content = content.strip()

        if not content.startswith("{"):
            start_idx = content.find("{")
            if start_idx != -1:
                # Find the matching closing brace for the first opening one.
                brace_count = 0
                end_idx = start_idx
                for i, char in enumerate(content[start_idx:], start_idx):
                    if char == "{":
                        brace_count += 1
                    elif char == "}":
                        brace_count -= 1
                        if brace_count == 0:
                            end_idx = i
                            break
                content = content[start_idx:end_idx + 1]
            elif '"is_heritage_institution"' in content:
                # No brace found - wrap what looks like a bare JSON body.
                content = "{" + content.rstrip().rstrip(",") + "}"

        return content

    @staticmethod
    def _regex_fallback(qid: str, content: str) -> Optional[Dict[str, Any]]:
        """Scrape the expected fields out of a non-JSON reply.

        Returns None when even the boolean verdict cannot be recovered.
        FIX: the verdict is now read from the actual field value instead of
        matching the substring 'true' anywhere in the reply (which could
        mislabel a 'false' answer whose reasoning text contained 'true').
        """
        logger.debug(f"JSON parse failed for {qid}, trying regex fallback: {content[:200]}")

        is_heritage = None
        subtype = None
        confidence = 0.5
        reasoning = "Parsed from non-JSON response"

        bool_match = re.search(r'"is_heritage_institution"\s*:\s*(true|false)', content, re.IGNORECASE)
        if bool_match:
            is_heritage = bool_match.group(1).lower() == "true"

        subtype_match = re.search(r'"subtype"\s*:\s*"([^"]+)"', content)
        if subtype_match:
            subtype = subtype_match.group(1)

        conf_match = re.search(r'"confidence"\s*:\s*([\d.]+)', content)
        if conf_match:
            try:
                confidence = float(conf_match.group(1))
            except ValueError:
                pass

        reason_match = re.search(r'"reasoning"\s*:\s*"([^"]+)"', content)
        if reason_match:
            reasoning = reason_match.group(1)

        if is_heritage is None:
            return None

        return {
            "is_heritage_institution": is_heritage,
            "subtype": subtype,
            "confidence": confidence,
            "reasoning": reasoning,
        }

    async def verify_heritage_institution(
        self,
        institution_name: str,
        institution_location: str,
        qid: str,
        wd_label: str,
        wd_description: str,
        p31_types: List[str],
        p131_location: List[str],
    ) -> Dict[str, Any]:
        """
        Verify if a Wikidata entity matches a heritage institution.

        Returns a verification dict with is_heritage_institution, subtype,
        confidence, reasoning and verification_method keys.  Never raises:
        API and parse failures are returned as low-confidence rejections.
        """
        # Cheap P31-based decision first; skips the LLM for clear cases.
        heuristic = self._p31_heuristic(qid, p31_types)
        if heuristic is not None:
            return heuristic

        # Ambiguous case - build the prompt and ask the LLM.
        p31_labels = [HERITAGE_P31_TYPES.get(t, NON_HERITAGE_P31_TYPES.get(t, t)) for t in p31_types]
        prompt = self.VERIFICATION_PROMPT.format(
            institution_name=institution_name,
            institution_location=institution_location,
            qid=qid,
            wd_label=wd_label,
            wd_description=wd_description,
            p31_types=", ".join(p31_labels) if p31_labels else "None specified",
            p131_location=", ".join(p131_location) if p131_location else "Not specified",
        )

        try:
            content, verification_method = await self._call_llm(prompt, qid)

            try:
                result = json.loads(self._extract_json(content))
                result["verification_method"] = verification_method
                return result
            except json.JSONDecodeError as e:
                fallback = self._regex_fallback(qid, content)
                if fallback is not None:
                    fallback["verification_method"] = f"{verification_method}_regex_fallback"
                    return fallback

                logger.warning(f"Failed to parse LLM response for {qid}: {str(e)[:100]} - content: {content[:200]}")
                return {
                    "is_heritage_institution": False,
                    "subtype": None,
                    "confidence": 0.0,
                    "reasoning": f"Failed to parse LLM response: {str(e)}",
                    "verification_method": f"{verification_method}_parse_error",
                }

        except Exception as e:
            logger.error(f"LLM verification error for {qid}: {e}")
            return {
                "is_heritage_institution": False,
                "subtype": None,
                "confidence": 0.0,
                "reasoning": f"API error: {e}",
                "verification_method": "llm_api_error",
            }

    def _infer_subtype_from_p31(self, p31_types: List[str]) -> str:
        """Map P31 QIDs to a CH-Annotator GRP.HER subtype code.

        Precedence mirrors the checks below: MUS, LIB, ARC, GAL, RES, BOT,
        EDU, then OTHER.
        """
        p31_set = set(p31_types)

        museum_types = {"Q33506", "Q207694", "Q1970365", "Q18388277", "Q2087181", "Q17431399",
                        "Q1788742", "Q57660343", "Q15206070", "Q214090", "Q2522387",
                        "Q841573", "Q28737012", "Q588140", "Q515034", "Q4287745"}
        if p31_set & museum_types:
            return "MUS"

        library_types = {"Q7075", "Q856234", "Q1078570", "Q11294", "Q13226383"}
        if p31_set & library_types:
            return "LIB"

        archive_types = {"Q166118", "Q473972", "Q1423895", "Q2066131", "Q63400100", "Q63400127", "Q1026954"}
        if p31_set & archive_types:
            return "ARC"

        gallery_types = {"Q1007870", "Q7889618"}
        if p31_set & gallery_types:
            return "GAL"

        if "Q31855" in p31_set or "Q327333" in p31_set:
            return "RES"

        if "Q43229" in p31_set or "Q45585" in p31_set or "Q43501" in p31_set:
            return "BOT"

        if "Q3918" in p31_set or "Q875538" in p31_set:
            return "EDU"

        return "OTHER"

    async def close(self):
        """Dispose of the underlying HTTP connection pool."""
        await self.client.aclose()
||||
|
||||
|
||||
# =============================================================================
|
||||
# MAIN ENRICHMENT LOGIC
|
||||
# =============================================================================
|
||||
|
||||
async def find_files_needing_reenrichment(custodian_dir: Path) -> List[Path]:
    """Return every custodian YAML file carrying the re-enrichment marker.

    Scans *custodian_dir* (non-recursively) for ``*.yaml`` files whose raw
    text contains the literal marker "Re-enrichment required".  Unreadable
    files are logged and skipped, never raised.
    """
    marked: List[Path] = []

    for candidate in custodian_dir.glob("*.yaml"):
        try:
            text = candidate.read_text(encoding='utf-8')
        except Exception as exc:
            logger.warning(f"Error reading {candidate}: {exc}")
            continue

        if "Re-enrichment required" in text:
            marked.append(candidate)

    return marked
|
||||
|
||||
|
||||
def _strip_reenrichment_note(entry: Dict[str, Any]) -> None:
    """Remove the 'Re-enrichment required' marker from provenance notes, in place.

    Was previously copy-pasted three times inside enrich_file_with_wikidata.
    """
    if "provenance" in entry and "notes" in entry["provenance"]:
        notes = entry["provenance"]["notes"]
        if "Re-enrichment required" in notes:
            entry["provenance"]["notes"] = notes.split("Re-enrichment required")[0].strip()


def _save_entry(file_path: Path, entry: Dict[str, Any]) -> None:
    """Write *entry* back to *file_path* as YAML, preserving key order."""
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)


def _institution_name(entry: Dict[str, Any]) -> Optional[str]:
    """Best-effort institution name: curated claim, then Google Maps, then source row."""
    name = None
    if "custodian_name" in entry and isinstance(entry["custodian_name"], dict):
        name = entry["custodian_name"].get("claim_value")
    if not name and "google_maps_enrichment" in entry:
        name = entry["google_maps_enrichment"].get("name")
    if not name and "original_entry" in entry:
        name = entry["original_entry"].get("organisatie") or entry["original_entry"].get("instelling")
    return name


def _institution_location(entry: Dict[str, Any]) -> str:
    """Human-readable location string, used only as context for LLM verification."""
    if "google_maps_enrichment" in entry:
        gm = entry["google_maps_enrichment"]
        parts = []
        if gm.get("short_address"):
            parts.append(gm["short_address"])
        elif gm.get("formatted_address"):
            parts.append(gm["formatted_address"])
        return ", ".join(parts)
    if "original_entry" in entry:
        oe = entry["original_entry"]
        parts = []
        if oe.get("plaatsnaam_bezoekadres"):
            parts.append(oe["plaatsnaam_bezoekadres"])
        if oe.get("provincie"):
            parts.append(oe["provincie"])
        return ", ".join(parts)
    return ""


def _search_languages(entry: Dict[str, Any]) -> List[str]:
    """Wikidata search languages, keyed off the ghcid country prefix.

    Defaults to NL when no ghcid is present; English is always included.
    """
    country_code = "NL"  # Default
    if "ghcid" in entry:
        ghcid = entry["ghcid"].get("ghcid_current", "")
        if ghcid and len(ghcid) >= 2:
            country_code = ghcid[:2]

    if country_code == "NL":
        return ["nl", "en"]
    if country_code == "BE":
        return ["nl", "fr", "en"]
    if country_code == "DE":
        return ["de", "en"]
    if country_code == "FR":
        return ["fr", "en"]
    if country_code in ["BR", "PT"]:
        return ["pt", "en"]
    if country_code in ["ES", "MX", "AR", "CL", "CO"]:
        return ["es", "en"]
    return ["en"]


async def enrich_file_with_wikidata(
    file_path: Path,
    wd_client: WikidataSearchClient,
    verifier: GLMHeritageVerifier,
) -> Dict[str, Any]:
    """
    Enrich a single custodian file with verified Wikidata data.

    Searches Wikidata for the institution name in each relevant language,
    verifies candidates with the LLM/heuristic verifier, and writes the
    best match (confidence >= 0.5) back into the YAML file.  The file is
    updated and its re-enrichment marker removed in every outcome except
    empty_file/no_name.

    Returns a result dict whose "status" is one of: empty_file, no_name,
    not_found, no_verified_match, verified_match.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    if not entry:
        return {"status": "empty_file", "file": str(file_path)}

    name = _institution_name(entry)
    if not name:
        return {"status": "no_name", "file": str(file_path)}

    location = _institution_location(entry)

    # Search Wikidata per language, deduplicating candidates by QID as we go.
    seen_qids = set()
    unique_candidates = []
    for lang in _search_languages(entry):
        for candidate in await wd_client.search_entity(name, language=lang, limit=5):
            if candidate["qid"] not in seen_qids:
                seen_qids.add(candidate["qid"])
                unique_candidates.append(candidate)
        await asyncio.sleep(0.2)  # Rate limiting

    if not unique_candidates:
        # Record the miss so the file is not re-processed on the next run.
        entry["wikidata_enrichment_status"] = "NOT_FOUND"
        entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()
        _strip_reenrichment_note(entry)
        _save_entry(file_path, entry)
        return {"status": "not_found", "file": str(file_path), "name": name}

    # Verify each candidate (top 5) and keep the highest-confidence
    # heritage-institution match.
    best_match = None
    best_confidence = 0.0

    for candidate in unique_candidates[:5]:
        qid = candidate["qid"]

        entity_data = await wd_client.get_entity_claims(qid)
        await asyncio.sleep(0.2)
        if not entity_data:
            continue

        verification = await verifier.verify_heritage_institution(
            institution_name=name,
            institution_location=location,
            qid=qid,
            wd_label=candidate.get("label", ""),
            wd_description=candidate.get("description", ""),
            p31_types=entity_data.get("p31", []),
            p131_location=[str(x) for x in entity_data.get("p131", [])],
        )

        if verification.get("is_heritage_institution") and verification.get("confidence", 0) > best_confidence:
            best_match = {
                "qid": qid,
                "label": candidate.get("label"),
                "description": candidate.get("description"),
                "entity_data": entity_data,
                "verification": verification,
            }
            best_confidence = verification.get("confidence", 0)

    if not best_match or best_confidence < 0.5:
        # No candidate cleared the confidence threshold.
        entry["wikidata_enrichment_status"] = "NO_VERIFIED_MATCH"
        entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()
        entry["wikidata_candidates_checked"] = len(unique_candidates)
        _strip_reenrichment_note(entry)
        _save_entry(file_path, entry)
        return {"status": "no_verified_match", "file": str(file_path), "name": name, "candidates": len(unique_candidates)}

    # Add verified Wikidata enrichment.
    qid = best_match["qid"]
    entity_data = best_match["entity_data"]
    verification = best_match["verification"]

    entry["wikidata_enrichment"] = {
        "wikidata_id": qid,
        "wikidata_url": f"https://www.wikidata.org/wiki/{qid}",
        "wikidata_label": best_match.get("label"),
        "wikidata_description": best_match.get("description"),
        "labels": entity_data.get("labels", {}),
        "descriptions": entity_data.get("descriptions", {}),
        "instance_of": entity_data.get("p31", []),
        "located_in": entity_data.get("p131", []),
        "country": entity_data.get("p17_country"),
        "coordinates": entity_data.get("coordinates"),
        "enrichment_timestamp": datetime.now(timezone.utc).isoformat(),
        "verification": {
            "method": verification.get("verification_method"),
            "confidence": verification.get("confidence"),
            "subtype": verification.get("subtype"),
            "reasoning": verification.get("reasoning"),
            "ch_annotator_version": "v1.7.0",
        },
    }
    entry["wikidata_enrichment_status"] = "VERIFIED"
    entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()

    # Record the QID in the identifiers list (once only).
    if "identifiers" not in entry:
        entry["identifiers"] = []
    existing_schemes = {i.get("identifier_scheme") for i in entry["identifiers"] if isinstance(i, dict)}
    if "Wikidata" not in existing_schemes:
        entry["identifiers"].append({
            "identifier_scheme": "Wikidata",
            "identifier_value": qid,
            "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
        })

    _strip_reenrichment_note(entry)
    _save_entry(file_path, entry)

    logger.info(f"✓ Enriched {file_path.name} with {qid} ({best_match.get('label')}) - confidence: {best_confidence:.2f}")

    return {
        "status": "verified_match",
        "file": str(file_path),
        "name": name,
        "qid": qid,
        "label": best_match.get("label"),
        "confidence": best_confidence,
        "subtype": verification.get("subtype"),
    }
|
||||
|
||||
|
||||
async def main():
    """CLI entry point: re-enrich flagged custodian files with verified Wikidata data."""
    import argparse

    parser = argparse.ArgumentParser(description="Re-enrich Wikidata with LLM verification")
    parser.add_argument("--limit", type=int, default=100, help="Max files to process")
    parser.add_argument("--dry-run", action="store_true", help="Don't modify files")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--use-claude", action="store_true", help="Use Claude instead of GLM-4.6")
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    custodian_dir = Path("/Users/kempersc/apps/glam/data/custodian")

    banner = "=" * 60
    llm_name = "Claude" if args.use_claude else "GLM-4.6"
    print(banner)
    print(f"Wikidata Re-enrichment with {llm_name} CH Annotator Verification")
    print(banner)
    print()

    # Locate files flagged for re-enrichment after the duplicate cleanup pass.
    print("Finding files needing re-enrichment...")
    pending = await find_files_needing_reenrichment(custodian_dir)
    print(f"Found {len(pending)} files needing re-enrichment")

    if not pending:
        print("No files to process!")
        return

    # Cap the batch size at the user-requested limit.
    pending = pending[:args.limit]
    print(f"Processing {len(pending)} files (limit: {args.limit})")
    print()

    if args.dry_run:
        print("DRY RUN - no files will be modified")
        for candidate in pending[:20]:
            print(f" Would process: {candidate.name}")
        return

    wd_client = WikidataSearchClient()
    verifier = GLMHeritageVerifier(use_claude=args.use_claude)

    # Buckets for per-file outcomes, keyed by status string.
    results = {key: [] for key in ("verified_match", "no_verified_match", "not_found", "no_name", "error")}

    try:
        for idx, file_path in enumerate(pending, 1):
            print(f"\n[{idx}/{len(pending)}] Processing {file_path.name}...")

            try:
                outcome = await enrich_file_with_wikidata(file_path, wd_client, verifier)
                status = outcome.get("status", "error")
                results.setdefault(status, []).append(outcome)

                if status == "verified_match":
                    print(f" ✓ {outcome.get('qid')} ({outcome.get('label')}) - {outcome.get('confidence', 0):.2f}")
                elif status == "no_verified_match":
                    print(f" ✗ No verified match (checked {outcome.get('candidates', 0)} candidates)")
                elif status == "not_found":
                    print(f" ✗ No Wikidata candidates found")
                elif status == "no_name":
                    print(f" ⚠ No institution name found")

            except Exception as e:
                logger.error(f"Error processing {file_path}: {e}")
                results["error"].append({"file": str(file_path), "error": str(e)})

            # Be polite to the Wikidata API between files.
            await asyncio.sleep(0.5)

    finally:
        # Always release HTTP clients, even on interruption.
        await wd_client.close()
        await verifier.close()

    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    print(f"Verified matches: {len(results.get('verified_match', []))}")
    print(f"No verified match: {len(results.get('no_verified_match', []))}")
    print(f"Not found: {len(results.get('not_found', []))}")
    print(f"No name: {len(results.get('no_name', []))}")
    print(f"Errors: {len(results.get('error', []))}")
    print()

    # Persist the run report alongside other GLAM reports.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = Path(f"/Users/kempersc/apps/glam/reports/wikidata_reenrichment_{run_stamp}.yaml")
    results_file.parent.mkdir(parents=True, exist_ok=True)

    with open(results_file, 'w', encoding='utf-8') as fh:
        yaml.dump({
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "files_processed": len(pending),
            "results": results,
        }, fh, allow_unicode=True, default_flow_style=False)

    print(f"Results saved to: {results_file}")
|
||||
|
||||
# Script entry point: run the async re-enrichment pipeline.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
145
scripts/remove_wikidata_duplicates.py
Normal file
145
scripts/remove_wikidata_duplicates.py
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Remove wikidata_enrichment from files with duplicate Wikidata entity IDs.
|
||||
|
||||
These files have incorrect Wikidata entity ID assignments where the same Q-number
|
||||
was incorrectly assigned to multiple different institutions.
|
||||
|
||||
The script:
|
||||
1. Reads the list of affected files from /tmp/wikidata_duplicates_to_clean.txt
|
||||
2. For each file, removes the wikidata_enrichment section
|
||||
3. Adds a provenance note documenting the removal
|
||||
4. Preserves all other data
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
import yaml
|
||||
|
||||
# Preserve order in YAML output
|
||||
# SafeDumper subclass used as an attachment point for custom representers,
# so the global yaml.SafeDumper configuration is left untouched.
class OrderedDumper(yaml.SafeDumper):
    pass
|
||||
|
||||
def represent_ordereddict(dumper, data):
    """Emit an OrderedDict as an ordinary YAML mapping, keeping key order.

    NOTE(review): this representer is never registered via add_representer in
    the visible portion of this file -- confirm it is wired up elsewhere.
    """
    mapping_tag = 'tag:yaml.org,2002:map'
    return dumper.represent_mapping(mapping_tag, data.items())
|
||||
|
||||
def str_representer(dumper, data):
    """Represent str values; multi-line strings are dumped in literal block style.

    The '|' literal style keeps embedded newlines readable in the output YAML.
    """
    literal = '|' if '\n' in data else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=literal)
|
||||
|
||||
# Register the custom string representer so multi-line notes dump as literal blocks.
OrderedDumper.add_representer(str, str_representer)
|
||||
|
||||
def remove_wikidata_enrichment(file_path: Path) -> tuple[bool, str]:
    """
    Remove the wikidata_enrichment section from a custodian YAML file.

    Also appends a provenance note recording which (incorrect) Wikidata ID
    was removed, so the file can later be found for re-enrichment.

    Args:
        file_path: Path to the custodian YAML file to clean.

    Returns:
        tuple of (success: bool, message: str). ``success`` is False when the
        file is empty/invalid YAML, has no wikidata_enrichment section, or an
        exception occurred; ``message`` explains the outcome.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse YAML
        data = yaml.safe_load(content)

        if data is None:
            return False, "Empty or invalid YAML"

        # Check if wikidata_enrichment exists
        if 'wikidata_enrichment' not in data:
            return False, "No wikidata_enrichment found"

        # Capture the old Wikidata ID (if any) for the provenance note.
        old_wikidata_id = None
        if isinstance(data.get('wikidata_enrichment'), dict):
            old_wikidata_id = data['wikidata_enrichment'].get('wikidata_id')

        # Remove wikidata_enrichment
        del data['wikidata_enrichment']

        # Build the provenance note documenting the removal.
        timestamp = datetime.now(timezone.utc).isoformat()
        note = f"Removed incorrect wikidata_enrichment on {timestamp}. "
        if old_wikidata_id:
            note += f"Previous Wikidata ID {old_wikidata_id} was incorrectly assigned (duplicate across multiple institutions). "
        note += "Re-enrichment required with proper matching."

        # Bug fix: a non-dict provenance value (e.g. a plain string) used to
        # cause the note to be silently skipped. Coerce provenance to a dict,
        # preserving any prior scalar value under 'notes'.
        prov = data.get('provenance')
        if not isinstance(prov, dict):
            data['provenance'] = {'notes': str(prov)} if prov else {}

        existing_notes = data['provenance'].get('notes', '')
        if isinstance(existing_notes, list):
            # Notes stored as a list: append the new note as another entry.
            existing_notes.append(note)
            data['provenance']['notes'] = existing_notes
        elif existing_notes:
            data['provenance']['notes'] = existing_notes + '\n\n' + note
        else:
            data['provenance']['notes'] = note

        # Write back, preserving key order and unicode.
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, Dumper=OrderedDumper, allow_unicode=True,
                      default_flow_style=False, sort_keys=False, width=120)

        return True, f"Removed wikidata_enrichment (was {old_wikidata_id})"

    except Exception as e:
        # Broad catch is deliberate: the caller treats any failure as a
        # per-file error and continues with the remaining files.
        return False, f"Error: {e}"
|
||||
|
||||
|
||||
def main():
    """Clean wikidata_enrichment from every file listed in the duplicates report."""
    # Read the list of files to clean (produced by the duplicate-detection step).
    list_file = Path('/tmp/wikidata_duplicates_to_clean.txt')
    if not list_file.exists():
        print(f"Error: {list_file} not found")
        sys.exit(1)

    with open(list_file, 'r') as f:
        filenames = [line.strip() for line in f if line.strip()]

    print(f"Found {len(filenames)} files to clean")

    # Base directory for custodian files
    base_dir = Path('/Users/kempersc/apps/glam/data/custodian')

    success_count = 0
    skip_count = 0
    error_count = 0

    for i, filename in enumerate(filenames, 1):
        file_path = base_dir / filename

        if not file_path.exists():
            # Bug fix: progress messages previously printed the literal
            # "(unknown)" instead of the filename being processed.
            print(f"[{i}/{len(filenames)}] SKIP (not found): {filename}")
            skip_count += 1
            continue

        success, message = remove_wikidata_enrichment(file_path)

        if success:
            print(f"[{i}/{len(filenames)}] OK: {filename} - {message}")
            success_count += 1
        elif "No wikidata_enrichment" in message:
            print(f"[{i}/{len(filenames)}] SKIP (no wikidata): {filename}")
            skip_count += 1
        else:
            print(f"[{i}/{len(filenames)}] ERROR: {filename} - {message}")
            error_count += 1

    print(f"\n=== Summary ===")
    print(f"Total files: {len(filenames)}")
    print(f"Successfully cleaned: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Errors: {error_count}")
|
||||
|
||||
|
||||
# Script entry point.
if __name__ == '__main__':
    main()
|
||||
Loading…
Reference in a new issue