#!/usr/bin/env python3
"""
Re-enrich heritage institutions with Wikidata using GLM-4.6 CH Annotator verification.

This script:
1. Finds files marked for re-enrichment (after duplicate cleanup)
2. Queries Wikidata API for candidates by institution name
3. Uses GLM-4.6 to verify matches based on CH Annotator entity types (GRP.HER)
4. Only adds Wikidata enrichment if entity is verified as heritage institution
5. Updates files with verified Wikidata data

CH Annotator Convention (v1.7.0):
- Heritage institutions are type GRP.HER (glam:HeritageCustodian)
- Maps to: org:FormalOrganization, rov:RegisteredOrganization, schema:Museum,
  schema:Library, schema:ArchiveOrganization
- Subtypes: GRP.HER.GAL (Gallery), GRP.HER.LIB (Library), GRP.HER.ARC (Archive),
  GRP.HER.MUS (Museum)
"""

import asyncio
import json
import logging
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import httpx
import yaml

# Load environment variables from .env file
from dotenv import load_dotenv

env_path = Path(__file__).parent.parent / ".env"
load_dotenv(env_path)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

# =============================================================================
# WIKIDATA HERITAGE INSTITUTION TYPE CLASSES (P31 values)
# =============================================================================

# Valid Wikidata "instance of" (P31) values for heritage institutions.
# NOTE: this table previously contained duplicate literal keys (Q1007870 and
# Q7889618 appeared in both the Museums and Galleries sections — Python keeps
# only the last occurrence) and listed Q43229 as "botanical garden" even though
# Q43229 is Wikidata's generic "organization" (also listed in the non-heritage
# table below). Both issues are fixed here.
HERITAGE_P31_TYPES = {
    # Museums
    "Q33506": "museum",
    "Q207694": "art museum",
    "Q1970365": "natural history museum",
    "Q18388277": "history museum",
    "Q2087181": "university museum",
    "Q17431399": "national museum",
    "Q16735822": "museum building",
    "Q1788742": "war museum",
    "Q4989906": "monuments and memorials",
    "Q57660343": "maritime museum",
    "Q15206070": "transport museum",
    "Q214090": "ethnographic museum",
    "Q2522387": "aviation museum",
    "Q841573": "archaeological museum",
    "Q28737012": "memorial museum",
    "Q588140": "railway museum",
    "Q515034": "science museum",
    "Q4287745": "local museum",

    # Libraries
    "Q7075": "library",
    "Q856234": "national library",
    "Q1078570": "academic library",
    "Q11294": "public library",
    "Q13226383": "research library",

    # Archives
    "Q166118": "archive",
    "Q473972": "national archives",
    "Q1423895": "film archive",
    "Q2066131": "regional archive",
    "Q63400100": "historical archive",
    "Q63400127": "municipal archive",
    "Q1026954": "photo archive",

    # Galleries
    "Q1007870": "art gallery",
    "Q7889618": "gallery of art",

    # Research centers
    "Q31855": "research institute",
    "Q327333": "heritage organisation",

    # Botanical/Zoo
    "Q167346": "botanical garden",  # was mis-keyed as Q43229 ("organization")
    "Q45585": "botanical garden",   # TODO(review): confirm this QID against Wikidata
    "Q43501": "zoo",

    # Holy sites (when managing heritage collections)
    "Q317557": "monastery",
    "Q83405": "abbey",
    "Q1088552": "cathedral chapter",

    # Educational (with collections)
    "Q3918": "university",
    "Q875538": "public university",
}

# These P31 values indicate NOT a heritage institution.
# Q43229 ("organization" — too generic to accept OR reject on its own) is
# deliberately listed in NEITHER table: entities typed only as Q43229 fall
# through both quick-decision heuristics and are routed to LLM verification,
# exactly as the old conflicting double entry effectively did.
NON_HERITAGE_P31_TYPES = {
    "Q515": "city",
    "Q174782": "square",
    "Q5": "human",
    "Q4830453": "business",
    "Q891723": "public company",
    "Q783794": "company",
    "Q6881511": "enterprise",
    "Q55678": "movie",
    "Q7366": "song",
    "Q5398426": "television series",
}
+ """ + params = { + "action": "wbsearchentities", + "format": "json", + "language": language, + "type": "item", + "limit": limit, + "search": name, + } + + try: + response = await self.client.get(self.SEARCH_URL, params=params) + response.raise_for_status() + data = response.json() + + results = [] + for item in data.get("search", []): + results.append({ + "qid": item.get("id"), + "label": item.get("label"), + "description": item.get("description", ""), + "url": item.get("concepturi"), + }) + + return results + + except Exception as e: + logger.error(f"Wikidata search error for '{name}': {e}") + return [] + + async def get_entity_claims(self, qid: str) -> Dict[str, Any]: + """ + Get entity claims (properties) from Wikidata. + + Returns dict with P31 (instance of), P131 (located in), P625 (coordinates), etc. + """ + url = self.ENTITY_URL.format(qid=qid) + + try: + response = await self.client.get(url) + response.raise_for_status() + data = response.json() + + entity = data.get("entities", {}).get(qid, {}) + claims = entity.get("claims", {}) + labels = entity.get("labels", {}) + descriptions = entity.get("descriptions", {}) + + # Extract P31 values (instance of) + p31_values = [] + for claim in claims.get("P31", []): + mainsnak = claim.get("mainsnak", {}) + if mainsnak.get("snaktype") == "value": + datavalue = mainsnak.get("datavalue", {}) + if datavalue.get("type") == "wikibase-entityid": + p31_qid = datavalue.get("value", {}).get("id") + if p31_qid: + p31_values.append(p31_qid) + + # Extract P131 (located in administrative entity) + p131_values = [] + for claim in claims.get("P131", []): + mainsnak = claim.get("mainsnak", {}) + if mainsnak.get("snaktype") == "value": + datavalue = mainsnak.get("datavalue", {}) + if datavalue.get("type") == "wikibase-entityid": + p131_qid = datavalue.get("value", {}).get("id") + if p131_qid: + p131_values.append(p131_qid) + + # Extract P625 (coordinates) + coordinates = None + for claim in claims.get("P625", []): + mainsnak = 
claim.get("mainsnak", {}) + if mainsnak.get("snaktype") == "value": + datavalue = mainsnak.get("datavalue", {}) + if datavalue.get("type") == "globecoordinate": + value = datavalue.get("value", {}) + coordinates = { + "latitude": value.get("latitude"), + "longitude": value.get("longitude"), + } + break + + # Extract P17 (country) + country = None + for claim in claims.get("P17", []): + mainsnak = claim.get("mainsnak", {}) + if mainsnak.get("snaktype") == "value": + datavalue = mainsnak.get("datavalue", {}) + if datavalue.get("type") == "wikibase-entityid": + country = datavalue.get("value", {}).get("id") + break + + return { + "qid": qid, + "labels": {k: v.get("value") for k, v in labels.items()}, + "descriptions": {k: v.get("value") for k, v in descriptions.items()}, + "p31": p31_values, + "p131": p131_values, + "p17_country": country, + "coordinates": coordinates, + } + + except Exception as e: + logger.error(f"Wikidata entity fetch error for {qid}: {e}") + return {} + + async def close(self): + await self.client.aclose() + + +# ============================================================================= +# GLM-4.6 CH ANNOTATOR VERIFICATION +# ============================================================================= + +class GLMHeritageVerifier: + """ + Verify Wikidata entity matches using GLM-4.6 CH Annotator. + + Uses CH Annotator v1.7.0 entity type GRP.HER to verify that + a Wikidata entity is actually a heritage institution. + """ + + # Z.AI Coding Plan endpoint (different from regular BigModel API) + ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions" + + VERIFICATION_PROMPT = """You are a heritage institution classifier following CH-Annotator v1.7.0 convention. + +Your task is to determine if a Wikidata entity is a heritage institution (type GRP.HER). 
class GLMHeritageVerifier:
    """
    Verify Wikidata entity matches using GLM-4.6 CH Annotator.

    Uses CH Annotator v1.7.0 entity type GRP.HER to verify that
    a Wikidata entity is actually a heritage institution.

    Clear-cut cases are decided cheaply from P31 values alone; only
    ambiguous entities are sent to the LLM.
    """

    # Z.AI Coding Plan endpoint (different from regular BigModel API)
    ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"

    VERIFICATION_PROMPT = """You are a heritage institution classifier following CH-Annotator v1.7.0 convention.

Your task is to determine if a Wikidata entity is a heritage institution (type GRP.HER).

## CH-Annotator GRP.HER Definition
Heritage institutions are organizations that:
- Collect, preserve, and provide access to cultural heritage materials
- Include: museums (GRP.HER.MUS), libraries (GRP.HER.LIB), archives (GRP.HER.ARC), galleries (GRP.HER.GAL)
- May also include: research centers, botanical gardens, educational institutions WITH collections

## Entity Types That Are NOT Heritage Institutions
- Cities, towns, municipalities (these are places, not institutions)
- General businesses or companies (unless they manage heritage collections)
- People (individuals are AGT.PER, not GRP.HER)
- Events, festivals, exhibitions (temporary, not institutions)
- Buildings without institutional function (just architecture)

## Your Task
Analyze the Wikidata entity data and determine:
1. Is this entity a heritage institution (GRP.HER)?
2. If yes, what subtype? (MUS/LIB/ARC/GAL/OTHER)
3. Confidence score (0.0-1.0)

Respond in JSON format:
```json
{{
  "is_heritage_institution": true/false,
  "subtype": "MUS|LIB|ARC|GAL|RES|BOT|EDU|OTHER|null",
  "confidence": 0.95,
  "reasoning": "Brief explanation"
}}
```

## Entity to Analyze
Institution name from our data: {institution_name}
Location from our data: {institution_location}

Wikidata entity:
- QID: {qid}
- Label: {wd_label}
- Description: {wd_description}
- Instance of (P31): {p31_types}
- Located in (P131): {p131_location}
"""

    def __init__(self, api_key: Optional[str] = None, model: str = "glm-4.6", use_claude: bool = False):
        """Configure either the Anthropic (Claude) or Z.AI (GLM) backend.

        Raises ValueError when the matching API key env var is missing.
        """
        self.use_claude = use_claude

        if use_claude:
            self.api_key = api_key or os.environ.get("CLAUDE_API_KEY")
            self.model = "claude-3-5-haiku-20241022"  # Fast, cheap model
            self.api_url = "https://api.anthropic.com/v1/messages"
            if not self.api_key:
                raise ValueError("CLAUDE_API_KEY not found in environment")
            self.client = httpx.AsyncClient(
                timeout=60.0,
                headers={
                    "x-api-key": self.api_key,
                    "anthropic-version": "2023-06-01",
                    "Content-Type": "application/json",
                }
            )
        else:
            self.api_key = api_key or os.environ.get("ZAI_API_TOKEN")
            self.model = model
            # Use Z.AI Coding Plan endpoint (same as OpenCode); reuse the
            # class constant instead of repeating the URL literal.
            self.api_url = self.ZAI_API_URL
            if not self.api_key:
                raise ValueError("ZAI_API_TOKEN not found in environment")
            self.client = httpx.AsyncClient(
                timeout=60.0,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                }
            )

    async def verify_heritage_institution(
        self,
        institution_name: str,
        institution_location: str,
        qid: str,
        wd_label: str,
        wd_description: str,
        p31_types: List[str],
        p131_location: List[str],
    ) -> Dict[str, Any]:
        """
        Verify if a Wikidata entity matches a heritage institution.

        Returns verification result with confidence score. Decides cheap
        cases from P31 alone; ambiguous cases go to the LLM backend.
        """
        # First, quick heuristic check using P31 types
        p31_set = set(p31_types)
        heritage_matches = p31_set & set(HERITAGE_P31_TYPES.keys())
        non_heritage_matches = p31_set & set(NON_HERITAGE_P31_TYPES.keys())

        # If clear non-heritage type, reject without LLM call
        if non_heritage_matches and not heritage_matches:
            logger.debug(f"Quick reject {qid}: P31 indicates non-heritage ({non_heritage_matches})")
            return {
                "is_heritage_institution": False,
                "subtype": None,
                "confidence": 0.95,
                "reasoning": f"P31 types indicate non-heritage: {[NON_HERITAGE_P31_TYPES.get(t, t) for t in non_heritage_matches]}",
                "verification_method": "p31_heuristic",
            }

        # If clear heritage type, high confidence without LLM
        if heritage_matches and not non_heritage_matches:
            subtype = self._infer_subtype_from_p31(p31_types)
            logger.debug(f"Quick accept {qid}: P31 indicates heritage ({heritage_matches})")
            return {
                "is_heritage_institution": True,
                "subtype": subtype,
                "confidence": 0.9,
                "reasoning": f"P31 types indicate heritage: {[HERITAGE_P31_TYPES.get(t, t) for t in heritage_matches]}",
                "verification_method": "p31_heuristic",
            }

        # Ambiguous case - use the LLM for verification
        p31_labels = [HERITAGE_P31_TYPES.get(t, NON_HERITAGE_P31_TYPES.get(t, t)) for t in p31_types]

        prompt = self.VERIFICATION_PROMPT.format(
            institution_name=institution_name,
            institution_location=institution_location,
            qid=qid,
            wd_label=wd_label,
            wd_description=wd_description,
            p31_types=", ".join(p31_labels) if p31_labels else "None specified",
            p131_location=", ".join(p131_location) if p131_location else "Not specified",
        )

        try:
            content, verification_method = await self._call_llm(prompt, qid)
            return self._parse_llm_result(content, qid, verification_method)
        except Exception as e:
            logger.error(f"LLM verification error for {qid}: {e}")
            return {
                "is_heritage_institution": False,
                "subtype": None,
                "confidence": 0.0,
                "reasoning": f"API error: {e}",
                "verification_method": "llm_api_error",
            }

    async def _call_llm(self, prompt: str, qid: str) -> Tuple[str, str]:
        """Send the prompt to the configured backend.

        Returns (raw response text, verification-method tag). Raises on any
        HTTP error; the caller converts that to an error result.
        """
        if self.use_claude:
            # Claude API request format
            response = await self.client.post(
                self.api_url,
                json={
                    "model": self.model,
                    "max_tokens": 512,
                    "messages": [
                        {"role": "user", "content": prompt},
                    ],
                    "system": "You are a heritage institution classifier. Respond only in valid JSON. Start your response with { and end with }.",
                }
            )
            response.raise_for_status()
            data = response.json()
            content = data.get("content", [{}])[0].get("text", "")
            logger.debug(f"Claude raw response for {qid}: {content[:300]}")
            return content, "claude_ch_annotator"

        # GLM/Z.AI API request format
        response = await self.client.post(
            self.api_url,
            json={
                "model": self.model,
                "messages": [
                    {"role": "system", "content": "You are a heritage institution classifier. Respond only in valid JSON."},
                    {"role": "user", "content": prompt},
                ],
                "temperature": 0.1,
                "max_tokens": 512,
            }
        )
        response.raise_for_status()
        data = response.json()
        content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
        return content, "glm_4.6_ch_annotator"

    def _parse_llm_result(self, content: str, qid: str, verification_method: str) -> Dict[str, Any]:
        """Parse the LLM response into a verification dict.

        Strips markdown code fences, extracts the first balanced JSON object,
        and falls back to regex field extraction when json.loads fails.
        """
        try:
            # Extract JSON from markdown code blocks if present
            if "```json" in content:
                content = content.split("```json")[1].split("```")[0]
            elif "```" in content:
                content = content.split("```")[1].split("```")[0]

            content = content.strip()

            # If content doesn't start with {, try to find first balanced {...}
            if not content.startswith("{"):
                start_idx = content.find("{")
                if start_idx != -1:
                    brace_count = 0
                    end_idx = start_idx
                    for i, char in enumerate(content[start_idx:], start_idx):
                        if char == "{":
                            brace_count += 1
                        elif char == "}":
                            brace_count -= 1
                            if brace_count == 0:
                                end_idx = i
                                break
                    content = content[start_idx:end_idx + 1]
                elif '"is_heritage_institution"' in content:
                    # No { found - wrap content in braces if it looks like a JSON body
                    content = "{" + content.rstrip().rstrip(",") + "}"

            result = json.loads(content)
            result["verification_method"] = verification_method
            return result

        except json.JSONDecodeError as e:
            # Fallback: try to extract individual fields with regex
            logger.debug(f"JSON parse failed for {qid}, trying regex fallback: {content[:200]}")

            is_heritage = None
            subtype = None
            confidence = 0.5
            reasoning = "Parsed from non-JSON response"

            if '"is_heritage_institution"' in content:
                if 'true' in content.lower():
                    is_heritage = True
                elif 'false' in content.lower():
                    is_heritage = False

            subtype_match = re.search(r'"subtype"\s*:\s*"([^"]+)"', content)
            if subtype_match:
                subtype = subtype_match.group(1)

            conf_match = re.search(r'"confidence"\s*:\s*([\d.]+)', content)
            if conf_match:
                try:
                    confidence = float(conf_match.group(1))
                except ValueError:
                    pass

            reason_match = re.search(r'"reasoning"\s*:\s*"([^"]+)"', content)
            if reason_match:
                reasoning = reason_match.group(1)

            if is_heritage is not None:
                return {
                    "is_heritage_institution": is_heritage,
                    "subtype": subtype,
                    "confidence": confidence,
                    "reasoning": reasoning,
                    "verification_method": f"{verification_method}_regex_fallback",
                }

            logger.warning(f"Failed to parse LLM response for {qid}: {str(e)[:100]} - content: {content[:200]}")
            return {
                "is_heritage_institution": False,
                "subtype": None,
                "confidence": 0.0,
                "reasoning": f"Failed to parse LLM response: {str(e)}",
                "verification_method": f"{verification_method}_parse_error",
            }

    def _infer_subtype_from_p31(self, p31_types: List[str]) -> str:
        """Infer heritage institution subtype (MUS/LIB/ARC/GAL/RES/BOT/EDU/OTHER) from P31 values."""
        p31_set = set(p31_types)

        museum_types = {"Q33506", "Q207694", "Q1970365", "Q18388277", "Q2087181", "Q17431399",
                        "Q1788742", "Q57660343", "Q15206070", "Q214090", "Q2522387",
                        "Q841573", "Q28737012", "Q588140", "Q515034", "Q4287745"}
        if p31_set & museum_types:
            return "MUS"

        library_types = {"Q7075", "Q856234", "Q1078570", "Q11294", "Q13226383"}
        if p31_set & library_types:
            return "LIB"

        archive_types = {"Q166118", "Q473972", "Q1423895", "Q2066131", "Q63400100", "Q63400127", "Q1026954"}
        if p31_set & archive_types:
            return "ARC"

        gallery_types = {"Q1007870", "Q7889618"}
        if p31_set & gallery_types:
            return "GAL"

        # Research centers
        if "Q31855" in p31_set or "Q327333" in p31_set:
            return "RES"

        # Botanical/Zoo (Q167346 is the correct "botanical garden" QID;
        # Q43229/Q45585 kept for backward compatibility with older data)
        if p31_set & {"Q167346", "Q43229", "Q45585", "Q43501"}:
            return "BOT"

        # Educational
        if "Q3918" in p31_set or "Q875538" in p31_set:
            return "EDU"

        return "OTHER"

    async def close(self):
        """Release the underlying HTTP connection pool."""
        await self.client.aclose()


# =============================================================================
# MAIN ENRICHMENT LOGIC
# =============================================================================

async def find_files_needing_reenrichment(custodian_dir: Path) -> List[Path]:
    """Find all custodian YAML files whose raw text carries the re-enrichment marker."""
    files = []

    for file_path in custodian_dir.glob("*.yaml"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            if "Re-enrichment required" in content:
                files.append(file_path)

        except Exception as e:
            # Best-effort scan: unreadable files are skipped, not fatal.
            logger.warning(f"Error reading {file_path}: {e}")

    return files
+ """ + with open(file_path, 'r', encoding='utf-8') as f: + entry = yaml.safe_load(f) + + if not entry: + return {"status": "empty_file", "file": str(file_path)} + + # Get institution name + name = None + if "custodian_name" in entry and isinstance(entry["custodian_name"], dict): + name = entry["custodian_name"].get("claim_value") + if not name and "google_maps_enrichment" in entry: + name = entry["google_maps_enrichment"].get("name") + if not name and "original_entry" in entry: + name = entry["original_entry"].get("organisatie") or entry["original_entry"].get("instelling") + + if not name: + return {"status": "no_name", "file": str(file_path)} + + # Get location for verification + location = "" + if "google_maps_enrichment" in entry: + gm = entry["google_maps_enrichment"] + parts = [] + if gm.get("short_address"): + parts.append(gm["short_address"]) + elif gm.get("formatted_address"): + parts.append(gm["formatted_address"]) + location = ", ".join(parts) + elif "original_entry" in entry: + oe = entry["original_entry"] + parts = [] + if oe.get("plaatsnaam_bezoekadres"): + parts.append(oe["plaatsnaam_bezoekadres"]) + if oe.get("provincie"): + parts.append(oe["provincie"]) + location = ", ".join(parts) + + # Get country for search language + country_code = "NL" # Default + if "ghcid" in entry: + ghcid = entry["ghcid"].get("ghcid_current", "") + if ghcid and len(ghcid) >= 2: + country_code = ghcid[:2] + + # Determine search language based on country + search_langs = ["en"] # Always search English + if country_code == "NL": + search_langs = ["nl", "en"] + elif country_code == "BE": + search_langs = ["nl", "fr", "en"] + elif country_code == "DE": + search_langs = ["de", "en"] + elif country_code == "FR": + search_langs = ["fr", "en"] + elif country_code in ["BR", "PT"]: + search_langs = ["pt", "en"] + elif country_code in ["ES", "MX", "AR", "CL", "CO"]: + search_langs = ["es", "en"] + + # Search Wikidata for candidates + all_candidates = [] + for lang in search_langs: + 
candidates = await wd_client.search_entity(name, language=lang, limit=5) + all_candidates.extend(candidates) + await asyncio.sleep(0.2) # Rate limiting + + # Deduplicate by QID + seen_qids = set() + unique_candidates = [] + for c in all_candidates: + if c["qid"] not in seen_qids: + seen_qids.add(c["qid"]) + unique_candidates.append(c) + + if not unique_candidates: + # Update file to mark as not found + entry["wikidata_enrichment_status"] = "NOT_FOUND" + entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat() + + # Remove re-enrichment note from provenance + if "provenance" in entry and "notes" in entry["provenance"]: + notes = entry["provenance"]["notes"] + if "Re-enrichment required" in notes: + entry["provenance"]["notes"] = notes.split("Re-enrichment required")[0].strip() + + with open(file_path, 'w', encoding='utf-8') as f: + yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False) + + return {"status": "not_found", "file": str(file_path), "name": name} + + # Verify each candidate + best_match = None + best_confidence = 0.0 + + for candidate in unique_candidates[:5]: # Limit to top 5 + qid = candidate["qid"] + + # Get entity details + entity_data = await wd_client.get_entity_claims(qid) + await asyncio.sleep(0.2) + + if not entity_data: + continue + + # Verify with GLM-4.6 + verification = await verifier.verify_heritage_institution( + institution_name=name, + institution_location=location, + qid=qid, + wd_label=candidate.get("label", ""), + wd_description=candidate.get("description", ""), + p31_types=entity_data.get("p31", []), + p131_location=[str(x) for x in entity_data.get("p131", [])], + ) + + if verification.get("is_heritage_institution") and verification.get("confidence", 0) > best_confidence: + best_match = { + "qid": qid, + "label": candidate.get("label"), + "description": candidate.get("description"), + "entity_data": entity_data, + "verification": verification, + } + best_confidence = 
verification.get("confidence", 0) + + if not best_match or best_confidence < 0.5: + # No verified match found + entry["wikidata_enrichment_status"] = "NO_VERIFIED_MATCH" + entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat() + entry["wikidata_candidates_checked"] = len(unique_candidates) + + # Remove re-enrichment note + if "provenance" in entry and "notes" in entry["provenance"]: + notes = entry["provenance"]["notes"] + if "Re-enrichment required" in notes: + entry["provenance"]["notes"] = notes.split("Re-enrichment required")[0].strip() + + with open(file_path, 'w', encoding='utf-8') as f: + yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False) + + return {"status": "no_verified_match", "file": str(file_path), "name": name, "candidates": len(unique_candidates)} + + # Add verified Wikidata enrichment + qid = best_match["qid"] + entity_data = best_match["entity_data"] + verification = best_match["verification"] + + entry["wikidata_enrichment"] = { + "wikidata_id": qid, + "wikidata_url": f"https://www.wikidata.org/wiki/{qid}", + "wikidata_label": best_match.get("label"), + "wikidata_description": best_match.get("description"), + "labels": entity_data.get("labels", {}), + "descriptions": entity_data.get("descriptions", {}), + "instance_of": entity_data.get("p31", []), + "located_in": entity_data.get("p131", []), + "country": entity_data.get("p17_country"), + "coordinates": entity_data.get("coordinates"), + "enrichment_timestamp": datetime.now(timezone.utc).isoformat(), + "verification": { + "method": verification.get("verification_method"), + "confidence": verification.get("confidence"), + "subtype": verification.get("subtype"), + "reasoning": verification.get("reasoning"), + "ch_annotator_version": "v1.7.0", + }, + } + + entry["wikidata_enrichment_status"] = "VERIFIED" + entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat() + + # Add Wikidata ID to identifiers + if "identifiers" not in entry: 
+ entry["identifiers"] = [] + + # Check if Wikidata ID already exists + existing_schemes = {i.get("identifier_scheme") for i in entry["identifiers"] if isinstance(i, dict)} + if "Wikidata" not in existing_schemes: + entry["identifiers"].append({ + "identifier_scheme": "Wikidata", + "identifier_value": qid, + "identifier_url": f"https://www.wikidata.org/wiki/{qid}", + }) + + # Remove re-enrichment note + if "provenance" in entry and "notes" in entry["provenance"]: + notes = entry["provenance"]["notes"] + if "Re-enrichment required" in notes: + entry["provenance"]["notes"] = notes.split("Re-enrichment required")[0].strip() + + # Save updated file + with open(file_path, 'w', encoding='utf-8') as f: + yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False) + + logger.info(f"✓ Enriched {file_path.name} with {qid} ({best_match.get('label')}) - confidence: {best_confidence:.2f}") + + return { + "status": "verified_match", + "file": str(file_path), + "name": name, + "qid": qid, + "label": best_match.get("label"), + "confidence": best_confidence, + "subtype": verification.get("subtype"), + } + + +async def main(): + """Main entry point.""" + import argparse + + parser = argparse.ArgumentParser(description="Re-enrich Wikidata with LLM verification") + parser.add_argument("--limit", type=int, default=100, help="Max files to process") + parser.add_argument("--dry-run", action="store_true", help="Don't modify files") + parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output") + parser.add_argument("--use-claude", action="store_true", help="Use Claude instead of GLM-4.6") + args = parser.parse_args() + + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + custodian_dir = Path("/Users/kempersc/apps/glam/data/custodian") + + llm_name = "Claude" if args.use_claude else "GLM-4.6" + print("=" * 60) + print(f"Wikidata Re-enrichment with {llm_name} CH Annotator Verification") + print("=" * 60) + print() + + # Find files 
async def main():
    """Main entry point: find flagged files, enrich each, report and save results."""
    import argparse

    parser = argparse.ArgumentParser(description="Re-enrich Wikidata with LLM verification")
    parser.add_argument("--limit", type=int, default=100, help="Max files to process")
    parser.add_argument("--dry-run", action="store_true", help="Don't modify files")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--use-claude", action="store_true", help="Use Claude instead of GLM-4.6")
    # New, backward-compatible: the custodian directory used to be hard-coded.
    parser.add_argument(
        "--custodian-dir", type=Path,
        default=Path("/Users/kempersc/apps/glam/data/custodian"),
        help="Directory containing custodian YAML files",
    )
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    custodian_dir = args.custodian_dir

    llm_name = "Claude" if args.use_claude else "GLM-4.6"
    print("=" * 60)
    print(f"Wikidata Re-enrichment with {llm_name} CH Annotator Verification")
    print("=" * 60)
    print()

    # Find files needing re-enrichment
    print("Finding files needing re-enrichment...")
    files = await find_files_needing_reenrichment(custodian_dir)
    print(f"Found {len(files)} files needing re-enrichment")

    if not files:
        print("No files to process!")
        return

    # Limit files
    files = files[:args.limit]
    print(f"Processing {len(files)} files (limit: {args.limit})")
    print()

    if args.dry_run:
        print("DRY RUN - no files will be modified")
        for f in files[:20]:
            print(f"  Would process: {f.name}")
        return

    # Initialize clients
    wd_client = WikidataSearchClient()
    verifier = GLMHeritageVerifier(use_claude=args.use_claude)

    # Process files, bucketing results by status
    results = {
        "verified_match": [],
        "no_verified_match": [],
        "not_found": [],
        "no_name": [],
        "error": [],
    }

    try:
        for i, file_path in enumerate(files, 1):
            print(f"\n[{i}/{len(files)}] Processing {file_path.name}...")

            try:
                result = await enrich_file_with_wikidata(file_path, wd_client, verifier)
                status = result.get("status", "error")
                results.setdefault(status, []).append(result)

                if status == "verified_match":
                    print(f"  ✓ {result.get('qid')} ({result.get('label')}) - {result.get('confidence', 0):.2f}")
                elif status == "no_verified_match":
                    print(f"  ✗ No verified match (checked {result.get('candidates', 0)} candidates)")
                elif status == "not_found":
                    print(f"  ✗ No Wikidata candidates found")
                elif status == "no_name":
                    print(f"  ⚠ No institution name found")

            except Exception as e:
                logger.error(f"Error processing {file_path}: {e}")
                results["error"].append({"file": str(file_path), "error": str(e)})

            # Rate limiting between files
            await asyncio.sleep(0.5)

    finally:
        # Always release both HTTP clients, even on interrupt/error.
        await wd_client.close()
        await verifier.close()

    # Print summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Verified matches:    {len(results.get('verified_match', []))}")
    print(f"No verified match:   {len(results.get('no_verified_match', []))}")
    print(f"Not found:           {len(results.get('not_found', []))}")
    print(f"No name:             {len(results.get('no_name', []))}")
    print(f"Errors:              {len(results.get('error', []))}")
    print()

    # Save results report
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = Path(f"/Users/kempersc/apps/glam/reports/wikidata_reenrichment_{timestamp}.yaml")
    results_file.parent.mkdir(parents=True, exist_ok=True)

    with open(results_file, 'w', encoding='utf-8') as f:
        yaml.dump({
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "files_processed": len(files),
            "results": results,
        }, f, allow_unicode=True, default_flow_style=False)

    print(f"Results saved to: {results_file}")


if __name__ == "__main__":
    asyncio.run(main())
#!/usr/bin/env python3
"""
Remove wikidata_enrichment from files with duplicate Wikidata entity IDs.

These files have incorrect Wikidata entity ID assignments where the same Q-number
was incorrectly assigned to multiple different institutions.

The script:
1. Reads the list of affected files from /tmp/wikidata_duplicates_to_clean.txt
2. For each file, removes the wikidata_enrichment section
3. Adds a provenance note documenting the removal
4. Preserves all other data
"""

import os
import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml


# Preserve order in YAML output
class OrderedDumper(yaml.SafeDumper):
    pass


# NOTE(review): defined but never registered on OrderedDumper — dicts are
# already dumped in insertion order with sort_keys=False. Confirm whether
# OrderedDict input is expected before wiring this up or deleting it.
def represent_ordereddict(dumper, data):
    return dumper.represent_mapping('tag:yaml.org,2002:map', data.items())


def str_representer(dumper, data):
    """Emit multi-line strings as literal blocks (|) for readable diffs."""
    if '\n' in data:
        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
    return dumper.represent_scalar('tag:yaml.org,2002:str', data)


OrderedDumper.add_representer(str, str_representer)


def remove_wikidata_enrichment(file_path: Path) -> tuple[bool, str]:
    """
    Remove wikidata_enrichment from a file and record a provenance note.

    Returns:
        tuple of (success: bool, message: str) — False with an explanatory
        message when the file is empty, has no wikidata_enrichment, or
        cannot be processed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        data = yaml.safe_load(content)

        if data is None:
            return False, "Empty or invalid YAML"

        if 'wikidata_enrichment' not in data:
            return False, "No wikidata_enrichment found"

        # Keep the old wikidata ID for the provenance note
        old_wikidata_id = None
        if isinstance(data.get('wikidata_enrichment'), dict):
            old_wikidata_id = data['wikidata_enrichment'].get('wikidata_id')

        del data['wikidata_enrichment']

        # Build the provenance note
        timestamp = datetime.now(timezone.utc).isoformat()
        note = f"Removed incorrect wikidata_enrichment on {timestamp}. "
        if old_wikidata_id:
            note += f"Previous Wikidata ID {old_wikidata_id} was incorrectly assigned (duplicate across multiple institutions). "
        note += "Re-enrichment required with proper matching."

        # Update or create provenance
        if 'provenance' not in data:
            data['provenance'] = {}

        if isinstance(data['provenance'], dict):
            existing_notes = data['provenance'].get('notes', '')
            # Handle case where notes is a list
            if isinstance(existing_notes, list):
                existing_notes.append(note)
                data['provenance']['notes'] = existing_notes
            elif existing_notes:
                data['provenance']['notes'] = existing_notes + '\n\n' + note
            else:
                data['provenance']['notes'] = note

        # Write back, preserving key order and readable multi-line strings
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, Dumper=OrderedDumper, allow_unicode=True,
                      default_flow_style=False, sort_keys=False, width=120)

        return True, f"Removed wikidata_enrichment (was {old_wikidata_id})"

    except Exception as e:
        return False, f"Error: {e}"


def main():
    """Read the duplicate list and clean each listed custodian file."""
    list_file = Path('/tmp/wikidata_duplicates_to_clean.txt')
    if not list_file.exists():
        print(f"Error: {list_file} not found")
        sys.exit(1)

    with open(list_file, 'r') as f:
        filenames = [line.strip() for line in f if line.strip()]

    print(f"Found {len(filenames)} files to clean")

    # Base directory for custodian files
    base_dir = Path('/Users/kempersc/apps/glam/data/custodian')

    success_count = 0
    skip_count = 0
    error_count = 0

    for i, filename in enumerate(filenames, 1):
        file_path = base_dir / filename

        # Progress lines now include the file name (they previously printed
        # a literal "(unknown)" placeholder, making the log useless).
        if not file_path.exists():
            print(f"[{i}/{len(filenames)}] SKIP (not found): {filename}")
            skip_count += 1
            continue

        success, message = remove_wikidata_enrichment(file_path)

        if success:
            print(f"[{i}/{len(filenames)}] OK: {filename} - {message}")
            success_count += 1
        elif "No wikidata_enrichment" in message:
            print(f"[{i}/{len(filenames)}] SKIP (no wikidata): {filename}")
            skip_count += 1
        else:
            print(f"[{i}/{len(filenames)}] ERROR: {filename} - {message}")
            error_count += 1

    print(f"\n=== Summary ===")
    print(f"Total files: {len(filenames)}")
    print(f"Successfully cleaned: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Errors: {error_count}")


if __name__ == '__main__':
    main()