feat(wikidata): add re-enrichment and duplicate removal scripts
- Add reenrich_wikidata_with_verification.py for re-running enrichment - Add remove_wikidata_duplicates.py for deduplication
This commit is contained in:
parent
891692a4d6
commit
486bbee813
2 changed files with 1103 additions and 0 deletions
958
scripts/reenrich_wikidata_with_verification.py
Executable file
958
scripts/reenrich_wikidata_with_verification.py
Executable file
|
|
@ -0,0 +1,958 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Re-enrich heritage institutions with Wikidata using GLM-4.6 CH Annotator verification.
|
||||
|
||||
This script:
|
||||
1. Finds files marked for re-enrichment (after duplicate cleanup)
|
||||
2. Queries Wikidata API for candidates by institution name
|
||||
3. Uses GLM-4.6 to verify matches based on CH Annotator entity types (GRP.HER)
|
||||
4. Only adds Wikidata enrichment if entity is verified as heritage institution
|
||||
5. Updates files with verified Wikidata data
|
||||
|
||||
CH Annotator Convention (v1.7.0):
|
||||
- Heritage institutions are type GRP.HER (glam:HeritageCustodian)
|
||||
- Maps to: org:FormalOrganization, rov:RegisteredOrganization, schema:Museum, schema:Library, schema:ArchiveOrganization
|
||||
- Subtypes: GRP.HER.GAL (Gallery), GRP.HER.LIB (Library), GRP.HER.ARC (Archive), GRP.HER.MUS (Museum)
|
||||
|
||||
Wikidata "instance of" (P31) values for heritage institutions:
|
||||
- Q33506 (museum)
|
||||
- Q7075 (library)
|
||||
- Q166118 (archive)
|
||||
- Q1007870 (art gallery)
|
||||
- Q207694 (art museum)
|
||||
- Q1970365 (natural history museum)
|
||||
- Q18388277 (history museum)
|
||||
- Q23413 (castle) - when used as museum
|
||||
- Q839954 (archaeological site)
|
||||
- Q174782 (town square) - NOT heritage institution
|
||||
- Q515 (city) - NOT heritage institution
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
import yaml
|
||||
import httpx
|
||||
import logging
|
||||
|
||||
# Load environment variables from .env file
|
||||
from dotenv import load_dotenv
|
||||
env_path = Path(__file__).parent.parent / ".env"
|
||||
load_dotenv(env_path)
|
||||
|
||||
# Configure logging
# Root handler prints timestamped INFO-level messages; the --verbose flag in
# main() lowers this to DEBUG at runtime.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by every class and function in this script.
logger = logging.getLogger(__name__)

# Add src to path so project-local modules are importable when this script is
# run directly from the repository checkout.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
# =============================================================================
|
||||
# WIKIDATA HERITAGE INSTITUTION TYPE CLASSES (P31 values)
|
||||
# =============================================================================
|
||||
|
||||
# These are valid Wikidata "instance of" (P31) values for heritage
# institutions.  An entity whose P31 set intersects this table (and not
# NON_HERITAGE_P31_TYPES) is accepted as GRP.HER without an LLM call.
HERITAGE_P31_TYPES = {
    # Museums
    "Q33506": "museum",
    "Q207694": "art museum",
    "Q1970365": "natural history museum",
    "Q18388277": "history museum",
    "Q2087181": "university museum",
    "Q17431399": "national museum",
    "Q16735822": "museum building",
    "Q1788742": "war museum",
    "Q4989906": "monuments and memorials",
    "Q57660343": "maritime museum",
    "Q15206070": "transport museum",
    "Q214090": "ethnographic museum",
    "Q2522387": "aviation museum",
    "Q841573": "archaeological museum",
    "Q28737012": "memorial museum",
    "Q588140": "railway museum",
    "Q515034": "science museum",
    "Q4287745": "local museum",

    # Libraries
    "Q7075": "library",
    "Q856234": "national library",
    "Q1078570": "academic library",
    "Q11294": "public library",
    "Q13226383": "research library",

    # Archives
    "Q166118": "archive",
    "Q473972": "national archives",
    "Q1423895": "film archive",
    "Q2066131": "regional archive",
    "Q63400100": "historical archive",
    "Q63400127": "municipal archive",
    "Q1026954": "photo archive",

    # Galleries (previously ALSO listed under Museums; duplicate dict keys
    # with identical values were silently collapsed by Python, so listing
    # them once here is behavior-neutral)
    "Q1007870": "art gallery",
    "Q7889618": "gallery of art",

    # Research centers
    "Q31855": "research institute",
    "Q327333": "heritage organisation",

    # Botanical/Zoo
    # NOTE(review): "Q43229" also appears in NON_HERITAGE_P31_TYPES below as
    # the generic "organization" class; entities carrying it match both
    # tables and are therefore escalated to LLM verification rather than
    # auto-accepted.  Confirm the intended botanical-garden QID (commonly
    # Q167346) before removing either entry.
    "Q43229": "botanical garden",
    "Q45585": "botanical garden",
    "Q43501": "zoo",

    # Holy sites (when managing heritage collections)
    "Q317557": "monastery",
    "Q83405": "abbey",
    "Q1088552": "cathedral chapter",

    # Educational (with collections)
    "Q3918": "university",
    "Q875538": "public university",
}
|
||||
|
||||
# These P31 values positively identify an entity as NOT a heritage
# institution.  Used as a fast reject heuristic: when an entity's P31 set
# intersects only this table (and not HERITAGE_P31_TYPES) it is rejected
# without an LLM call.
NON_HERITAGE_P31_TYPES = {
    "Q515": "city",
    "Q174782": "square",
    "Q5": "human",
    "Q4830453": "business",
    "Q891723": "public company",
    "Q783794": "company",
    "Q6881511": "enterprise",
    # Too generic.  NOTE(review): "Q43229" is also listed in
    # HERITAGE_P31_TYPES as "botanical garden"; the overlap means entities
    # carrying it match both tables and fall through to LLM verification.
    "Q43229": "organization",
    "Q55678": "movie",
    "Q7366": "song",
    "Q5398426": "television series",
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# WIKIDATA API CLIENT
|
||||
# =============================================================================
|
||||
|
||||
class WikidataSearchClient:
    """Async client for the Wikidata search and entity-data APIs."""

    SEARCH_URL = "https://www.wikidata.org/w/api.php"
    ENTITY_URL = "https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"

    def __init__(self, contact_email: Optional[str] = None):
        """Create the underlying HTTP client.

        contact_email: address advertised in the User-Agent header per
        Wikimedia API etiquette; falls back to the WIKIMEDIA_CONTACT_EMAIL
        environment variable, then a placeholder.
        """
        self.contact_email = contact_email or os.environ.get("WIKIMEDIA_CONTACT_EMAIL", "glam@example.org")
        self.client = httpx.AsyncClient(
            timeout=30.0,
            headers={
                "User-Agent": f"GLAMBot/1.0 ({self.contact_email})",
            }
        )

    @staticmethod
    def _entity_id_values(claims: Dict[str, Any], prop: str) -> List[str]:
        """Return the QIDs of all entity-id value statements for *prop*.

        Skips "novalue"/"somevalue" snaks, non-entity datatypes, and
        malformed statements missing an "id" key.
        """
        qids: List[str] = []
        for claim in claims.get(prop, []):
            mainsnak = claim.get("mainsnak", {})
            if mainsnak.get("snaktype") == "value":
                datavalue = mainsnak.get("datavalue", {})
                if datavalue.get("type") == "wikibase-entityid":
                    value_qid = datavalue.get("value", {}).get("id")
                    if value_qid:
                        qids.append(value_qid)
        return qids

    async def search_entity(self, name: str, language: str = "en", limit: int = 5) -> List[Dict[str, Any]]:
        """
        Search Wikidata for entities matching a name (wbsearchentities).

        Returns a list of candidate dicts with qid, label, description and
        concept URL; an empty list on any API error (logged, never raised).
        """
        params = {
            "action": "wbsearchentities",
            "format": "json",
            "language": language,
            "type": "item",
            "limit": limit,
            "search": name,
        }

        try:
            response = await self.client.get(self.SEARCH_URL, params=params)
            response.raise_for_status()
            data = response.json()
        except Exception as e:
            logger.error(f"Wikidata search error for '{name}': {e}")
            return []

        return [
            {
                "qid": item.get("id"),
                "label": item.get("label"),
                "description": item.get("description", ""),
                "url": item.get("concepturi"),
            }
            for item in data.get("search", [])
        ]

    async def get_entity_claims(self, qid: str) -> Dict[str, Any]:
        """
        Fetch an entity and distill the claims this pipeline uses.

        Returns a dict with labels, descriptions, P31 (instance of),
        P131 (located in), P17 (country, first value) and P625
        (coordinates, first value); {} on any error (logged, never raised).
        """
        url = self.ENTITY_URL.format(qid=qid)

        try:
            response = await self.client.get(url)
            response.raise_for_status()
            data = response.json()

            entity = data.get("entities", {}).get(qid, {})
            claims = entity.get("claims", {})
            labels = entity.get("labels", {})
            descriptions = entity.get("descriptions", {})

            # P625 (coordinates): first "value" statement wins.
            coordinates = None
            for claim in claims.get("P625", []):
                mainsnak = claim.get("mainsnak", {})
                if mainsnak.get("snaktype") == "value":
                    datavalue = mainsnak.get("datavalue", {})
                    if datavalue.get("type") == "globecoordinate":
                        value = datavalue.get("value", {})
                        coordinates = {
                            "latitude": value.get("latitude"),
                            "longitude": value.get("longitude"),
                        }
                        break

            # P17 (country): first well-formed entity-id statement wins.
            countries = self._entity_id_values(claims, "P17")

            return {
                "qid": qid,
                "labels": {k: v.get("value") for k, v in labels.items()},
                "descriptions": {k: v.get("value") for k, v in descriptions.items()},
                "p31": self._entity_id_values(claims, "P31"),
                "p131": self._entity_id_values(claims, "P131"),
                "p17_country": countries[0] if countries else None,
                "coordinates": coordinates,
            }

        except Exception as e:
            logger.error(f"Wikidata entity fetch error for {qid}: {e}")
            return {}

    async def close(self):
        """Dispose of the underlying HTTP connection pool."""
        await self.client.aclose()
||||
|
||||
|
||||
# =============================================================================
|
||||
# GLM-4.6 CH ANNOTATOR VERIFICATION
|
||||
# =============================================================================
|
||||
|
||||
class GLMHeritageVerifier:
    """
    Verify Wikidata entity matches using GLM-4.6 (or Claude) following the
    CH Annotator v1.7.0 convention (entity type GRP.HER).

    Clear-cut cases are decided by a cheap P31 table lookup against
    HERITAGE_P31_TYPES / NON_HERITAGE_P31_TYPES; only ambiguous entities
    incur an LLM call.
    """

    # Z.AI Coding Plan endpoint (different from regular BigModel API; same
    # endpoint OpenCode uses).
    ZAI_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions"
    # Anthropic Messages API endpoint and a fast, cheap Claude model.
    CLAUDE_API_URL = "https://api.anthropic.com/v1/messages"
    CLAUDE_MODEL = "claude-3-5-haiku-20241022"

    VERIFICATION_PROMPT = """You are a heritage institution classifier following CH-Annotator v1.7.0 convention.

Your task is to determine if a Wikidata entity is a heritage institution (type GRP.HER).

## CH-Annotator GRP.HER Definition
Heritage institutions are organizations that:
- Collect, preserve, and provide access to cultural heritage materials
- Include: museums (GRP.HER.MUS), libraries (GRP.HER.LIB), archives (GRP.HER.ARC), galleries (GRP.HER.GAL)
- May also include: research centers, botanical gardens, educational institutions WITH collections

## Entity Types That Are NOT Heritage Institutions
- Cities, towns, municipalities (these are places, not institutions)
- General businesses or companies (unless they manage heritage collections)
- People (individuals are AGT.PER, not GRP.HER)
- Events, festivals, exhibitions (temporary, not institutions)
- Buildings without institutional function (just architecture)

## Your Task
Analyze the Wikidata entity data and determine:
1. Is this entity a heritage institution (GRP.HER)?
2. If yes, what subtype? (MUS/LIB/ARC/GAL/OTHER)
3. Confidence score (0.0-1.0)

Respond in JSON format:
```json
{{
  "is_heritage_institution": true/false,
  "subtype": "MUS|LIB|ARC|GAL|RES|BOT|EDU|OTHER|null",
  "confidence": 0.95,
  "reasoning": "Brief explanation"
}}
```

## Entity to Analyze
Institution name from our data: {institution_name}
Location from our data: {institution_location}

Wikidata entity:
- QID: {qid}
- Label: {wd_label}
- Description: {wd_description}
- Instance of (P31): {p31_types}
- Located in (P131): {p131_location}
"""

    def __init__(self, api_key: Optional[str] = None, model: str = "glm-4.6", use_claude: bool = False):
        """Configure the LLM backend.

        api_key: explicit key; falls back to CLAUDE_API_KEY or ZAI_API_TOKEN
            from the environment depending on *use_claude*.
        model: GLM model name (ignored when use_claude=True).
        use_claude: route verification through the Anthropic Messages API
            instead of the Z.AI endpoint.

        Raises ValueError when no API key can be found.
        """
        self.use_claude = use_claude

        if use_claude:
            self.api_key = api_key or os.environ.get("CLAUDE_API_KEY")
            if not self.api_key:
                raise ValueError("CLAUDE_API_KEY not found in environment")
            self.model = self.CLAUDE_MODEL
            self.api_url = self.CLAUDE_API_URL
            headers = {
                "x-api-key": self.api_key,
                "anthropic-version": "2023-06-01",
                "Content-Type": "application/json",
            }
        else:
            self.api_key = api_key or os.environ.get("ZAI_API_TOKEN")
            if not self.api_key:
                raise ValueError("ZAI_API_TOKEN not found in environment")
            self.model = model
            # Use the class constant (was previously duplicated as a literal).
            self.api_url = self.ZAI_API_URL
            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            }

        self.client = httpx.AsyncClient(timeout=60.0, headers=headers)

    def _p31_heuristic(self, qid: str, p31_types: List[str]) -> Optional[Dict[str, Any]]:
        """Decide clear-cut cases from P31 alone; None when ambiguous.

        Reject when P31 intersects only the non-heritage table; accept when
        it intersects only the heritage table.  Any overlap (or no match at
        all) is ambiguous and must go to the LLM.
        """
        p31_set = set(p31_types)
        heritage_matches = p31_set & set(HERITAGE_P31_TYPES.keys())
        non_heritage_matches = p31_set & set(NON_HERITAGE_P31_TYPES.keys())

        if non_heritage_matches and not heritage_matches:
            logger.debug(f"Quick reject {qid}: P31 indicates non-heritage ({non_heritage_matches})")
            return {
                "is_heritage_institution": False,
                "subtype": None,
                "confidence": 0.95,
                "reasoning": f"P31 types indicate non-heritage: {[NON_HERITAGE_P31_TYPES.get(t, t) for t in non_heritage_matches]}",
                "verification_method": "p31_heuristic",
            }

        if heritage_matches and not non_heritage_matches:
            subtype = self._infer_subtype_from_p31(p31_types)
            logger.debug(f"Quick accept {qid}: P31 indicates heritage ({heritage_matches})")
            return {
                "is_heritage_institution": True,
                "subtype": subtype,
                "confidence": 0.9,
                "reasoning": f"P31 types indicate heritage: {[HERITAGE_P31_TYPES.get(t, t) for t in heritage_matches]}",
                "verification_method": "p31_heuristic",
            }

        return None

    async def _call_llm(self, prompt: str, qid: str) -> Tuple[str, str]:
        """POST *prompt* to the configured backend.

        Returns (response text, verification-method tag).  Raises on HTTP
        errors; the caller converts those into an error result.
        """
        if self.use_claude:
            # Claude API request format (system prompt is a top-level field).
            response = await self.client.post(
                self.api_url,
                json={
                    "model": self.model,
                    "max_tokens": 512,
                    "messages": [
                        {"role": "user", "content": prompt},
                    ],
                    "system": "You are a heritage institution classifier. Respond only in valid JSON. Start your response with { and end with }.",
                }
            )
            response.raise_for_status()
            data = response.json()
            content = data.get("content", [{}])[0].get("text", "")
            logger.debug(f"Claude raw response for {qid}: {content[:300]}")
            return content, "claude_ch_annotator"

        # GLM/Z.AI API request format (OpenAI-style chat completions).
        response = await self.client.post(
            self.api_url,
            json={
                "model": self.model,
                "messages": [
                    {"role": "system", "content": "You are a heritage institution classifier. Respond only in valid JSON."},
                    {"role": "user", "content": prompt},
                ],
                "temperature": 0.1,
                "max_tokens": 512,
            }
        )
        response.raise_for_status()
        data = response.json()
        content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
        return content, "glm_4.6_ch_annotator"

    @staticmethod
    def _extract_json(content: str) -> str:
        """Best-effort extraction of a JSON object from an LLM reply.

        Strips markdown code fences, then falls back to scanning for the
        first balanced {...} span; finally wraps a bare JSON body in braces
        when the expected key is present.  May still return non-JSON text —
        the caller handles json.JSONDecodeError.
        """
        if "```json" in content:
            content = content.split("```json")[1].split("```")[0]
        elif "```" in content:
            content = content.split("```")[1].split("```")[0]

        content = content.strip()

        if not content.startswith("{"):
            start_idx = content.find("{")
            if start_idx != -1:
                # Find the matching closing brace for the first opening one.
                brace_count = 0
                end_idx = start_idx
                for i, char in enumerate(content[start_idx:], start_idx):
                    if char == "{":
                        brace_count += 1
                    elif char == "}":
                        brace_count -= 1
                        if brace_count == 0:
                            end_idx = i
                            break
                content = content[start_idx:end_idx + 1]
            elif '"is_heritage_institution"' in content:
                # No brace found - wrap what looks like a bare JSON body.
                content = "{" + content.rstrip().rstrip(",") + "}"

        return content

    @staticmethod
    def _regex_fallback(qid: str, content: str) -> Optional[Dict[str, Any]]:
        """Scrape the expected fields out of a non-JSON reply.

        Returns None when even the boolean verdict cannot be recovered.
        FIX: the verdict is now read from the actual field value instead of
        matching the substring 'true' anywhere in the reply (which could
        mislabel a 'false' answer whose reasoning text contained 'true').
        """
        logger.debug(f"JSON parse failed for {qid}, trying regex fallback: {content[:200]}")

        is_heritage = None
        subtype = None
        confidence = 0.5
        reasoning = "Parsed from non-JSON response"

        bool_match = re.search(r'"is_heritage_institution"\s*:\s*(true|false)', content, re.IGNORECASE)
        if bool_match:
            is_heritage = bool_match.group(1).lower() == "true"

        subtype_match = re.search(r'"subtype"\s*:\s*"([^"]+)"', content)
        if subtype_match:
            subtype = subtype_match.group(1)

        conf_match = re.search(r'"confidence"\s*:\s*([\d.]+)', content)
        if conf_match:
            try:
                confidence = float(conf_match.group(1))
            except ValueError:
                pass

        reason_match = re.search(r'"reasoning"\s*:\s*"([^"]+)"', content)
        if reason_match:
            reasoning = reason_match.group(1)

        if is_heritage is None:
            return None

        return {
            "is_heritage_institution": is_heritage,
            "subtype": subtype,
            "confidence": confidence,
            "reasoning": reasoning,
        }

    async def verify_heritage_institution(
        self,
        institution_name: str,
        institution_location: str,
        qid: str,
        wd_label: str,
        wd_description: str,
        p31_types: List[str],
        p131_location: List[str],
    ) -> Dict[str, Any]:
        """
        Verify if a Wikidata entity matches a heritage institution.

        Returns a verification dict with is_heritage_institution, subtype,
        confidence, reasoning and verification_method keys.  Never raises:
        API and parse failures are returned as low-confidence rejections.
        """
        # Cheap P31-based decision first; skips the LLM for clear cases.
        heuristic = self._p31_heuristic(qid, p31_types)
        if heuristic is not None:
            return heuristic

        # Ambiguous case - build the prompt and ask the LLM.
        p31_labels = [HERITAGE_P31_TYPES.get(t, NON_HERITAGE_P31_TYPES.get(t, t)) for t in p31_types]
        prompt = self.VERIFICATION_PROMPT.format(
            institution_name=institution_name,
            institution_location=institution_location,
            qid=qid,
            wd_label=wd_label,
            wd_description=wd_description,
            p31_types=", ".join(p31_labels) if p31_labels else "None specified",
            p131_location=", ".join(p131_location) if p131_location else "Not specified",
        )

        try:
            content, verification_method = await self._call_llm(prompt, qid)

            try:
                result = json.loads(self._extract_json(content))
                result["verification_method"] = verification_method
                return result
            except json.JSONDecodeError as e:
                fallback = self._regex_fallback(qid, content)
                if fallback is not None:
                    fallback["verification_method"] = f"{verification_method}_regex_fallback"
                    return fallback

                logger.warning(f"Failed to parse LLM response for {qid}: {str(e)[:100]} - content: {content[:200]}")
                return {
                    "is_heritage_institution": False,
                    "subtype": None,
                    "confidence": 0.0,
                    "reasoning": f"Failed to parse LLM response: {str(e)}",
                    "verification_method": f"{verification_method}_parse_error",
                }

        except Exception as e:
            logger.error(f"LLM verification error for {qid}: {e}")
            return {
                "is_heritage_institution": False,
                "subtype": None,
                "confidence": 0.0,
                "reasoning": f"API error: {e}",
                "verification_method": "llm_api_error",
            }

    def _infer_subtype_from_p31(self, p31_types: List[str]) -> str:
        """Map P31 QIDs to a CH-Annotator GRP.HER subtype code.

        Precedence mirrors the checks below: MUS, LIB, ARC, GAL, RES, BOT,
        EDU, then OTHER.
        """
        p31_set = set(p31_types)

        museum_types = {"Q33506", "Q207694", "Q1970365", "Q18388277", "Q2087181", "Q17431399",
                        "Q1788742", "Q57660343", "Q15206070", "Q214090", "Q2522387",
                        "Q841573", "Q28737012", "Q588140", "Q515034", "Q4287745"}
        if p31_set & museum_types:
            return "MUS"

        library_types = {"Q7075", "Q856234", "Q1078570", "Q11294", "Q13226383"}
        if p31_set & library_types:
            return "LIB"

        archive_types = {"Q166118", "Q473972", "Q1423895", "Q2066131", "Q63400100", "Q63400127", "Q1026954"}
        if p31_set & archive_types:
            return "ARC"

        gallery_types = {"Q1007870", "Q7889618"}
        if p31_set & gallery_types:
            return "GAL"

        if "Q31855" in p31_set or "Q327333" in p31_set:
            return "RES"

        if "Q43229" in p31_set or "Q45585" in p31_set or "Q43501" in p31_set:
            return "BOT"

        if "Q3918" in p31_set or "Q875538" in p31_set:
            return "EDU"

        return "OTHER"

    async def close(self):
        """Dispose of the underlying HTTP connection pool."""
        await self.client.aclose()
||||
|
||||
|
||||
# =============================================================================
|
||||
# MAIN ENRICHMENT LOGIC
|
||||
# =============================================================================
|
||||
|
||||
async def find_files_needing_reenrichment(custodian_dir: Path) -> List[Path]:
    """Return every custodian YAML file carrying the re-enrichment marker.

    Scans *custodian_dir* (non-recursively) for ``*.yaml`` files whose raw
    text contains the literal marker "Re-enrichment required".  Unreadable
    files are logged and skipped, never raised.
    """
    marked: List[Path] = []

    for candidate in custodian_dir.glob("*.yaml"):
        try:
            text = candidate.read_text(encoding='utf-8')
        except Exception as exc:
            logger.warning(f"Error reading {candidate}: {exc}")
            continue

        if "Re-enrichment required" in text:
            marked.append(candidate)

    return marked
|
||||
|
||||
|
||||
def _strip_reenrichment_note(entry: Dict[str, Any]) -> None:
    """Remove the 'Re-enrichment required' marker from provenance notes, in place.

    Was previously copy-pasted three times inside enrich_file_with_wikidata.
    """
    if "provenance" in entry and "notes" in entry["provenance"]:
        notes = entry["provenance"]["notes"]
        if "Re-enrichment required" in notes:
            entry["provenance"]["notes"] = notes.split("Re-enrichment required")[0].strip()


def _save_entry(file_path: Path, entry: Dict[str, Any]) -> None:
    """Write *entry* back to *file_path* as YAML, preserving key order."""
    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(entry, f, allow_unicode=True, default_flow_style=False, sort_keys=False)


def _institution_name(entry: Dict[str, Any]) -> Optional[str]:
    """Best-effort institution name: curated claim, then Google Maps, then source row."""
    name = None
    if "custodian_name" in entry and isinstance(entry["custodian_name"], dict):
        name = entry["custodian_name"].get("claim_value")
    if not name and "google_maps_enrichment" in entry:
        name = entry["google_maps_enrichment"].get("name")
    if not name and "original_entry" in entry:
        name = entry["original_entry"].get("organisatie") or entry["original_entry"].get("instelling")
    return name


def _institution_location(entry: Dict[str, Any]) -> str:
    """Human-readable location string, used only as context for LLM verification."""
    if "google_maps_enrichment" in entry:
        gm = entry["google_maps_enrichment"]
        parts = []
        if gm.get("short_address"):
            parts.append(gm["short_address"])
        elif gm.get("formatted_address"):
            parts.append(gm["formatted_address"])
        return ", ".join(parts)
    if "original_entry" in entry:
        oe = entry["original_entry"]
        parts = []
        if oe.get("plaatsnaam_bezoekadres"):
            parts.append(oe["plaatsnaam_bezoekadres"])
        if oe.get("provincie"):
            parts.append(oe["provincie"])
        return ", ".join(parts)
    return ""


def _search_languages(entry: Dict[str, Any]) -> List[str]:
    """Wikidata search languages, keyed off the ghcid country prefix.

    Defaults to NL when no ghcid is present; English is always included.
    """
    country_code = "NL"  # Default
    if "ghcid" in entry:
        ghcid = entry["ghcid"].get("ghcid_current", "")
        if ghcid and len(ghcid) >= 2:
            country_code = ghcid[:2]

    if country_code == "NL":
        return ["nl", "en"]
    if country_code == "BE":
        return ["nl", "fr", "en"]
    if country_code == "DE":
        return ["de", "en"]
    if country_code == "FR":
        return ["fr", "en"]
    if country_code in ["BR", "PT"]:
        return ["pt", "en"]
    if country_code in ["ES", "MX", "AR", "CL", "CO"]:
        return ["es", "en"]
    return ["en"]


async def enrich_file_with_wikidata(
    file_path: Path,
    wd_client: WikidataSearchClient,
    verifier: GLMHeritageVerifier,
) -> Dict[str, Any]:
    """
    Enrich a single custodian file with verified Wikidata data.

    Searches Wikidata for the institution name in each relevant language,
    verifies candidates with the LLM/heuristic verifier, and writes the
    best match (confidence >= 0.5) back into the YAML file.  The file is
    updated and its re-enrichment marker removed in every outcome except
    empty_file/no_name.

    Returns a result dict whose "status" is one of: empty_file, no_name,
    not_found, no_verified_match, verified_match.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        entry = yaml.safe_load(f)

    if not entry:
        return {"status": "empty_file", "file": str(file_path)}

    name = _institution_name(entry)
    if not name:
        return {"status": "no_name", "file": str(file_path)}

    location = _institution_location(entry)

    # Search Wikidata per language, deduplicating candidates by QID as we go.
    seen_qids = set()
    unique_candidates = []
    for lang in _search_languages(entry):
        for candidate in await wd_client.search_entity(name, language=lang, limit=5):
            if candidate["qid"] not in seen_qids:
                seen_qids.add(candidate["qid"])
                unique_candidates.append(candidate)
        await asyncio.sleep(0.2)  # Rate limiting

    if not unique_candidates:
        # Record the miss so the file is not re-processed on the next run.
        entry["wikidata_enrichment_status"] = "NOT_FOUND"
        entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()
        _strip_reenrichment_note(entry)
        _save_entry(file_path, entry)
        return {"status": "not_found", "file": str(file_path), "name": name}

    # Verify each candidate (top 5) and keep the highest-confidence
    # heritage-institution match.
    best_match = None
    best_confidence = 0.0

    for candidate in unique_candidates[:5]:
        qid = candidate["qid"]

        entity_data = await wd_client.get_entity_claims(qid)
        await asyncio.sleep(0.2)
        if not entity_data:
            continue

        verification = await verifier.verify_heritage_institution(
            institution_name=name,
            institution_location=location,
            qid=qid,
            wd_label=candidate.get("label", ""),
            wd_description=candidate.get("description", ""),
            p31_types=entity_data.get("p31", []),
            p131_location=[str(x) for x in entity_data.get("p131", [])],
        )

        if verification.get("is_heritage_institution") and verification.get("confidence", 0) > best_confidence:
            best_match = {
                "qid": qid,
                "label": candidate.get("label"),
                "description": candidate.get("description"),
                "entity_data": entity_data,
                "verification": verification,
            }
            best_confidence = verification.get("confidence", 0)

    if not best_match or best_confidence < 0.5:
        # No candidate cleared the confidence threshold.
        entry["wikidata_enrichment_status"] = "NO_VERIFIED_MATCH"
        entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()
        entry["wikidata_candidates_checked"] = len(unique_candidates)
        _strip_reenrichment_note(entry)
        _save_entry(file_path, entry)
        return {"status": "no_verified_match", "file": str(file_path), "name": name, "candidates": len(unique_candidates)}

    # Add verified Wikidata enrichment.
    qid = best_match["qid"]
    entity_data = best_match["entity_data"]
    verification = best_match["verification"]

    entry["wikidata_enrichment"] = {
        "wikidata_id": qid,
        "wikidata_url": f"https://www.wikidata.org/wiki/{qid}",
        "wikidata_label": best_match.get("label"),
        "wikidata_description": best_match.get("description"),
        "labels": entity_data.get("labels", {}),
        "descriptions": entity_data.get("descriptions", {}),
        "instance_of": entity_data.get("p31", []),
        "located_in": entity_data.get("p131", []),
        "country": entity_data.get("p17_country"),
        "coordinates": entity_data.get("coordinates"),
        "enrichment_timestamp": datetime.now(timezone.utc).isoformat(),
        "verification": {
            "method": verification.get("verification_method"),
            "confidence": verification.get("confidence"),
            "subtype": verification.get("subtype"),
            "reasoning": verification.get("reasoning"),
            "ch_annotator_version": "v1.7.0",
        },
    }
    entry["wikidata_enrichment_status"] = "VERIFIED"
    entry["wikidata_search_timestamp"] = datetime.now(timezone.utc).isoformat()

    # Record the QID in the identifiers list (once only).
    if "identifiers" not in entry:
        entry["identifiers"] = []
    existing_schemes = {i.get("identifier_scheme") for i in entry["identifiers"] if isinstance(i, dict)}
    if "Wikidata" not in existing_schemes:
        entry["identifiers"].append({
            "identifier_scheme": "Wikidata",
            "identifier_value": qid,
            "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
        })

    _strip_reenrichment_note(entry)
    _save_entry(file_path, entry)

    logger.info(f"✓ Enriched {file_path.name} with {qid} ({best_match.get('label')}) - confidence: {best_confidence:.2f}")

    return {
        "status": "verified_match",
        "file": str(file_path),
        "name": name,
        "qid": qid,
        "label": best_match.get("label"),
        "confidence": best_confidence,
        "subtype": verification.get("subtype"),
    }
|
||||
|
||||
|
||||
async def main():
    """CLI entry point: re-enrich flagged custodian files with verified Wikidata data."""
    import argparse

    parser = argparse.ArgumentParser(description="Re-enrich Wikidata with LLM verification")
    parser.add_argument("--limit", type=int, default=100, help="Max files to process")
    parser.add_argument("--dry-run", action="store_true", help="Don't modify files")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--use-claude", action="store_true", help="Use Claude instead of GLM-4.6")
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    custodian_dir = Path("/Users/kempersc/apps/glam/data/custodian")

    banner = "=" * 60
    llm_name = "Claude" if args.use_claude else "GLM-4.6"
    print(banner)
    print(f"Wikidata Re-enrichment with {llm_name} CH Annotator Verification")
    print(banner)
    print()

    # Locate files flagged for re-enrichment after the duplicate cleanup pass.
    print("Finding files needing re-enrichment...")
    pending = await find_files_needing_reenrichment(custodian_dir)
    print(f"Found {len(pending)} files needing re-enrichment")

    if not pending:
        print("No files to process!")
        return

    # Cap the batch size at the user-requested limit.
    pending = pending[:args.limit]
    print(f"Processing {len(pending)} files (limit: {args.limit})")
    print()

    if args.dry_run:
        print("DRY RUN - no files will be modified")
        for candidate in pending[:20]:
            print(f" Would process: {candidate.name}")
        return

    wd_client = WikidataSearchClient()
    verifier = GLMHeritageVerifier(use_claude=args.use_claude)

    # Buckets for per-file outcomes, keyed by status string.
    results = {key: [] for key in ("verified_match", "no_verified_match", "not_found", "no_name", "error")}

    try:
        for idx, file_path in enumerate(pending, 1):
            print(f"\n[{idx}/{len(pending)}] Processing {file_path.name}...")

            try:
                outcome = await enrich_file_with_wikidata(file_path, wd_client, verifier)
                status = outcome.get("status", "error")
                results.setdefault(status, []).append(outcome)

                if status == "verified_match":
                    print(f" ✓ {outcome.get('qid')} ({outcome.get('label')}) - {outcome.get('confidence', 0):.2f}")
                elif status == "no_verified_match":
                    print(f" ✗ No verified match (checked {outcome.get('candidates', 0)} candidates)")
                elif status == "not_found":
                    print(f" ✗ No Wikidata candidates found")
                elif status == "no_name":
                    print(f" ⚠ No institution name found")

            except Exception as e:
                logger.error(f"Error processing {file_path}: {e}")
                results["error"].append({"file": str(file_path), "error": str(e)})

            # Be polite to the Wikidata API between files.
            await asyncio.sleep(0.5)

    finally:
        # Always release HTTP clients, even on interruption.
        await wd_client.close()
        await verifier.close()

    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    print(f"Verified matches: {len(results.get('verified_match', []))}")
    print(f"No verified match: {len(results.get('no_verified_match', []))}")
    print(f"Not found: {len(results.get('not_found', []))}")
    print(f"No name: {len(results.get('no_name', []))}")
    print(f"Errors: {len(results.get('error', []))}")
    print()

    # Persist the run report alongside other GLAM reports.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = Path(f"/Users/kempersc/apps/glam/reports/wikidata_reenrichment_{run_stamp}.yaml")
    results_file.parent.mkdir(parents=True, exist_ok=True)

    with open(results_file, 'w', encoding='utf-8') as fh:
        yaml.dump({
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "files_processed": len(pending),
            "results": results,
        }, fh, allow_unicode=True, default_flow_style=False)

    print(f"Results saved to: {results_file}")
|
||||
|
||||
# Script entry point: run the async re-enrichment pipeline.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
145
scripts/remove_wikidata_duplicates.py
Normal file
145
scripts/remove_wikidata_duplicates.py
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Remove wikidata_enrichment from files with duplicate Wikidata entity IDs.
|
||||
|
||||
These files have incorrect Wikidata entity ID assignments where the same Q-number
|
||||
was incorrectly assigned to multiple different institutions.
|
||||
|
||||
The script:
|
||||
1. Reads the list of affected files from /tmp/wikidata_duplicates_to_clean.txt
|
||||
2. For each file, removes the wikidata_enrichment section
|
||||
3. Adds a provenance note documenting the removal
|
||||
4. Preserves all other data
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
import yaml
|
||||
|
||||
# Preserve order in YAML output
|
||||
# SafeDumper subclass used as an attachment point for custom representers,
# so the global yaml.SafeDumper configuration is left untouched.
class OrderedDumper(yaml.SafeDumper):
    pass
|
||||
|
||||
def represent_ordereddict(dumper, data):
    """Emit an OrderedDict as an ordinary YAML mapping, keeping key order.

    NOTE(review): this representer is never registered via add_representer in
    the visible portion of this file -- confirm it is wired up elsewhere.
    """
    mapping_tag = 'tag:yaml.org,2002:map'
    return dumper.represent_mapping(mapping_tag, data.items())
|
||||
|
||||
def str_representer(dumper, data):
    """Represent str values; multi-line strings are dumped in literal block style.

    The '|' literal style keeps embedded newlines readable in the output YAML.
    """
    literal = '|' if '\n' in data else None
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style=literal)
|
||||
|
||||
# Register the custom string representer so multi-line notes dump as literal blocks.
OrderedDumper.add_representer(str, str_representer)
|
||||
|
||||
def remove_wikidata_enrichment(file_path: Path) -> tuple[bool, str]:
    """
    Remove the wikidata_enrichment section from a custodian YAML file.

    Also appends a provenance note recording which (incorrect) Wikidata ID
    was removed, so the file can later be found for re-enrichment.

    Args:
        file_path: Path to the custodian YAML file to clean.

    Returns:
        tuple of (success: bool, message: str). ``success`` is False when the
        file is empty/invalid YAML, has no wikidata_enrichment section, or an
        exception occurred; ``message`` explains the outcome.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse YAML
        data = yaml.safe_load(content)

        if data is None:
            return False, "Empty or invalid YAML"

        # Check if wikidata_enrichment exists
        if 'wikidata_enrichment' not in data:
            return False, "No wikidata_enrichment found"

        # Capture the old Wikidata ID (if any) for the provenance note.
        old_wikidata_id = None
        if isinstance(data.get('wikidata_enrichment'), dict):
            old_wikidata_id = data['wikidata_enrichment'].get('wikidata_id')

        # Remove wikidata_enrichment
        del data['wikidata_enrichment']

        # Build the provenance note documenting the removal.
        timestamp = datetime.now(timezone.utc).isoformat()
        note = f"Removed incorrect wikidata_enrichment on {timestamp}. "
        if old_wikidata_id:
            note += f"Previous Wikidata ID {old_wikidata_id} was incorrectly assigned (duplicate across multiple institutions). "
        note += "Re-enrichment required with proper matching."

        # Bug fix: a non-dict provenance value (e.g. a plain string) used to
        # cause the note to be silently skipped. Coerce provenance to a dict,
        # preserving any prior scalar value under 'notes'.
        prov = data.get('provenance')
        if not isinstance(prov, dict):
            data['provenance'] = {'notes': str(prov)} if prov else {}

        existing_notes = data['provenance'].get('notes', '')
        if isinstance(existing_notes, list):
            # Notes stored as a list: append the new note as another entry.
            existing_notes.append(note)
            data['provenance']['notes'] = existing_notes
        elif existing_notes:
            data['provenance']['notes'] = existing_notes + '\n\n' + note
        else:
            data['provenance']['notes'] = note

        # Write back, preserving key order and unicode.
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, Dumper=OrderedDumper, allow_unicode=True,
                      default_flow_style=False, sort_keys=False, width=120)

        return True, f"Removed wikidata_enrichment (was {old_wikidata_id})"

    except Exception as e:
        # Broad catch is deliberate: the caller treats any failure as a
        # per-file error and continues with the remaining files.
        return False, f"Error: {e}"
|
||||
|
||||
|
||||
def main():
    """Clean wikidata_enrichment from every file listed in the duplicates report."""
    # Read the list of files to clean (produced by the duplicate-detection step).
    list_file = Path('/tmp/wikidata_duplicates_to_clean.txt')
    if not list_file.exists():
        print(f"Error: {list_file} not found")
        sys.exit(1)

    with open(list_file, 'r') as f:
        filenames = [line.strip() for line in f if line.strip()]

    print(f"Found {len(filenames)} files to clean")

    # Base directory for custodian files
    base_dir = Path('/Users/kempersc/apps/glam/data/custodian')

    success_count = 0
    skip_count = 0
    error_count = 0

    for i, filename in enumerate(filenames, 1):
        file_path = base_dir / filename

        if not file_path.exists():
            # Bug fix: progress messages previously printed the literal
            # "(unknown)" instead of the filename being processed.
            print(f"[{i}/{len(filenames)}] SKIP (not found): {filename}")
            skip_count += 1
            continue

        success, message = remove_wikidata_enrichment(file_path)

        if success:
            print(f"[{i}/{len(filenames)}] OK: {filename} - {message}")
            success_count += 1
        elif "No wikidata_enrichment" in message:
            print(f"[{i}/{len(filenames)}] SKIP (no wikidata): {filename}")
            skip_count += 1
        else:
            print(f"[{i}/{len(filenames)}] ERROR: {filename} - {message}")
            error_count += 1

    print(f"\n=== Summary ===")
    print(f"Total files: {len(filenames)}")
    print(f"Successfully cleaned: {success_count}")
    print(f"Skipped: {skip_count}")
    print(f"Errors: {error_count}")
|
||||
|
||||
|
||||
# Script entry point.
if __name__ == '__main__':
    main()
|
||||
Loading…
Reference in a new issue