glam/scripts/import_argentina_wikidata_institutions.py
2025-12-21 00:01:54 +01:00

658 lines
20 KiB
Python

#!/usr/bin/env python3
"""
Import Argentina heritage institutions from Wikidata into custodian YAML files.
Queries Wikidata for museums and archives in Argentina, filters out institutions
that already exist in custodian files, and creates new YAML files with complete
GHCID metadata.
GLAM Data Extraction Project
Schema: LinkML v0.2.1
Country: Argentina (AR)
Source: Wikidata SPARQL queries
Usage:
python scripts/import_argentina_wikidata_institutions.py [--dry-run]
"""
import argparse
import json
import re
import sqlite3
import sys
import time
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

import requests
import yaml
# Add project root to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.identifiers.ghcid import GHCIDComponents
# Constants
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
# Identifying User-Agent, as requested by the Wikidata Query Service policy
USER_AGENT = "GLAM-Argentina-Wikidata-Import/1.0 (https://github.com/glam-project)"
BASE_DIR = Path(__file__).parent.parent
CUSTODIAN_DIR = BASE_DIR / "data" / "custodian"
GEONAMES_DB = BASE_DIR / "data" / "reference" / "geonames.db"
ISO_3166_2_AR = BASE_DIR / "data" / "reference" / "iso_3166_2_ar.json"
# Argentina ISO 3166-2 region codes, keyed by province/city label as it may
# appear in Wikidata (Spanish, English, and unaccented spelling variants).
AR_REGION_CODES = {
    "Salta": "A",
    "Buenos Aires": "B",
    "Buenos Aires Province": "B",
    "Provincia de Buenos Aires": "B",
    "Ciudad Autónoma de Buenos Aires": "C",
    "Ciudad de Buenos Aires": "C",
    "Autonomous City of Buenos Aires": "C",
    "Capital Federal": "C",
    "CABA": "C",
    "San Luis": "D",
    "Entre Ríos": "E",
    "Entre Rios": "E",
    "La Rioja": "F",
    "Santiago del Estero": "G",
    "Chaco": "H",
    "San Juan": "J",
    "Catamarca": "K",
    "La Pampa": "L",
    "Mendoza": "M",
    "Misiones": "N",
    "Formosa": "P",
    "Neuquén": "Q",
    "Neuquen": "Q",
    "Río Negro": "R",
    "Rio Negro": "R",
    "Santa Fe": "S",
    "Tucumán": "T",
    "Tucuman": "T",
    "Chubut": "U",
    "Tierra del Fuego": "V",
    "Corrientes": "W",
    "Córdoba": "X",
    "Cordoba": "X",
    "Jujuy": "Y",
    "Santa Cruz": "Z",
}
# GeoNames admin1 code to ISO 3166-2 mapping for Argentina
# (GeoNames uses its own two-digit first-level admin codes for AR).
GEONAMES_ADMIN1_TO_ISO = {
    "01": "B",  # Buenos Aires Province
    "02": "K",  # Catamarca
    "03": "H",  # Chaco
    "04": "U",  # Chubut
    "05": "X",  # Córdoba
    "06": "W",  # Corrientes
    "07": "C",  # Ciudad de Buenos Aires (CABA)
    "08": "E",  # Entre Ríos
    "09": "P",  # Formosa
    "10": "Y",  # Jujuy
    "11": "L",  # La Pampa
    "12": "F",  # La Rioja
    "13": "M",  # Mendoza
    "14": "N",  # Misiones
    "15": "Q",  # Neuquén
    "16": "R",  # Río Negro
    "17": "A",  # Salta
    "18": "J",  # San Juan
    "19": "D",  # San Luis
    "20": "Z",  # Santa Cruz
    "21": "S",  # Santa Fe
    "22": "G",  # Santiago del Estero
    "23": "V",  # Tierra del Fuego
    "24": "T",  # Tucumán
}
def normalize_to_ascii(text: str) -> str:
    """Strip diacritics from *text*, returning a plain-ASCII approximation."""
    # NFD splits each accented character into its base character plus
    # combining marks; dropping the nonspacing marks ('Mn') leaves the base.
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
def generate_city_code(city_name: str) -> str:
    """
    Generate a 3-letter city code from a city name.

    Rules:
    - Empty input: placeholder "XXX"
    - Single word: first 3 letters, uppercased
    - Name starting with a Spanish article/connector (la, el, los, las,
      de, del): drop all such words and code the remaining word(s)
    - Multi-word otherwise: initials of the first 3 words

    (The previous docstring described Dutch articles; the implementation
    handles Spanish article words, as this importer targets Argentina.)
    """
    if not city_name:
        return "XXX"

    # ASCII-fold so accented names (e.g. "Córdoba") code cleanly
    words = normalize_to_ascii(city_name).split()

    if len(words) == 1:
        # Single word: first 3 letters
        return words[0][:3].upper()

    # Spanish articles/connectors carry no information for the code
    spanish_articles = {"la", "el", "los", "las", "de", "del"}
    if words[0].lower() in spanish_articles:
        remaining = [w for w in words if w.lower() not in spanish_articles]
        if remaining:
            if len(remaining) == 1:
                return remaining[0][:3].upper()
            # Initials of the remaining significant words
            return "".join(w[0] for w in remaining[:3]).upper()

    # General multi-word case: initials of the first 3 words
    return "".join(w[0] for w in words[:3]).upper()
def extract_abbreviation_from_name(name: str) -> str:
    """
    Derive an institution abbreviation from its emic (local) name.

    Takes the first letter of every significant word — Spanish articles,
    prepositions, and conjunctions are skipped — capped at 10 characters.
    Returns "UNK" when no usable word remains.
    """
    if not name:
        return "UNK"

    # Spanish function words (articles, prepositions, conjunctions)
    stopwords = {
        "el", "la", "los", "las", "un", "una", "unos", "unas",
        "de", "del", "a", "al", "en", "con", "por", "para",
        "sobre", "bajo", "y", "o", "e", "u",
    }

    # ASCII-fold, then strip everything except letters, digits, whitespace
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", normalize_to_ascii(name))
    tokens = cleaned.split()

    significant = [t for t in tokens if t.lower() not in stopwords]
    if not significant:
        # Every word was a stopword: fall back to all words
        significant = tokens
    if not significant:
        return "UNK"

    # Acronym from initials, limited to 10 characters
    return "".join(t[0].upper() for t in significant)[:10]
# Wikidata QIDs for the GLAM institution classes queried by this importer
_WD_CLASS_MUSEUM = "Q33506"
_WD_CLASS_ARCHIVE = "Q166118"
_WD_CLASS_GALLERY = "Q1007870"


def _build_institution_query(class_qid: str) -> str:
    """Build the shared SPARQL query for Argentine institutions of one class.

    The three public query functions previously duplicated this template
    verbatim; only the class QID differs.
    """
    return f"""
    SELECT DISTINCT ?item ?itemLabel ?coords ?cityLabel ?websiteUrl WHERE {{
      ?item wdt:P31/wdt:P279* wd:{class_qid} .  # instance of class (or subclass)
      ?item wdt:P17 wd:Q414 .                   # country: Argentina
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P131 ?city . }}
      OPTIONAL {{ ?item wdt:P856 ?websiteUrl . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" . }}
    }}
    ORDER BY ?itemLabel
    """


def query_wikidata_museums() -> list[dict]:
    """Query Wikidata for museums (Q33506) in Argentina."""
    return _execute_sparql(_build_institution_query(_WD_CLASS_MUSEUM))


def query_wikidata_archives() -> list[dict]:
    """Query Wikidata for archives (Q166118) in Argentina."""
    return _execute_sparql(_build_institution_query(_WD_CLASS_ARCHIVE))


def query_wikidata_galleries() -> list[dict]:
    """Query Wikidata for art galleries (Q1007870) in Argentina."""
    return _execute_sparql(_build_institution_query(_WD_CLASS_GALLERY))
def _execute_sparql(query: str) -> list[dict]:
    """
    Execute a SPARQL query against the Wikidata endpoint.

    Returns the list of result bindings, or [] on any network/HTTP/parse
    error (the error is printed, never raised, so one failed query does not
    abort the whole import).
    """
    headers = {
        "User-Agent": USER_AGENT,
        "Accept": "application/sparql-results+json",
    }
    params = {"query": query, "format": "json"}
    time.sleep(1.0)  # Courtesy rate limiting for the public WDQS endpoint
    try:
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=60)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout/HTTP-status errors;
        # ValueError covers malformed JSON in the response body.
        # (Previously a blanket `except Exception` that could hide bugs.)
        print(f" ❌ SPARQL query failed: {e}")
        return []
    return data.get("results", {}).get("bindings", [])
def parse_wikidata_result(binding: dict, institution_type: str) -> Optional[dict]:
    """
    Convert one SPARQL result binding into a normalized institution dict.

    Returns None when the binding lacks a usable QID or a human-readable
    label (Wikidata echoes the QID as the label when no label exists).
    Coordinates, city, and website are included only when present and valid.
    """
    def _value(key: str) -> str:
        # SPARQL JSON bindings nest the actual value under "value"
        return binding.get(key, {}).get("value", "")

    item_uri = _value("item")
    qid = item_uri.rsplit("/", 1)[-1] if item_uri else None
    if not qid or not qid.startswith("Q"):
        return None

    label = _value("itemLabel")
    if not label or label == qid:
        # Label equal to the QID means no real label was found
        return None

    parsed: dict = {
        "qid": qid,
        "name": label,
        "institution_type": institution_type,
    }

    # Coordinates arrive as WKT: "Point(lon lat)"
    coords = _value("coords")
    if coords.startswith("Point("):
        try:
            lon_str, lat_str = coords[6:-1].split()
            parsed["latitude"] = float(lat_str)
            parsed["longitude"] = float(lon_str)
        except (ValueError, IndexError):
            pass

    city = _value("cityLabel")
    if city and not city.startswith("Q"):  # Skip unlabeled cities (bare QID)
        parsed["city"] = city

    website = _value("websiteUrl")
    if website:
        parsed["website"] = website

    return parsed
def get_existing_qids() -> set[str]:
    """
    Collect the Wikidata QIDs already recorded in Argentina custodian files.

    Scans every AR-*.yaml under CUSTODIAN_DIR and reads the QID from both
    the wikidata_enrichment and original_entry sections. Unreadable or
    malformed files are skipped (best-effort scan).

    Fixes: yaml.safe_load returns None for empty files and a section value
    may itself be None; previously either case raised AttributeError and
    the blanket `except Exception` silently dropped the whole file —
    including a QID already read from its first section.
    """
    qids: set[str] = set()
    for filepath in CUSTODIAN_DIR.glob("AR-*.yaml"):
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f)
        except (OSError, yaml.YAMLError):
            continue
        # Empty or non-mapping YAML documents carry no QIDs
        if not isinstance(data, dict):
            continue
        for section, key in (
            ("wikidata_enrichment", "wikidata_entity_id"),
            ("original_entry", "wikidata_id"),
        ):
            # `or {}` guards against an explicit null section value
            wd_id = (data.get(section) or {}).get(key)
            if wd_id:
                qids.add(wd_id)
    return qids
def reverse_geocode_to_region(lat: float, lon: float) -> Optional[tuple[str, str, str]]:
    """
    Reverse geocode coordinates to an Argentine region and city.

    Finds the nearest populated place in the GeoNames SQLite database using
    a squared-degree distance (adequate for nearest-match at this scale) and
    maps its GeoNames admin1 code to an ISO 3166-2 region letter.

    Returns:
        (region_code, city_name, city_code), or None when the database is
        missing, the lookup fails, or no city matches.
    """
    if not GEONAMES_DB.exists():
        return None
    conn = None
    try:
        conn = sqlite3.connect(GEONAMES_DB)
        cursor = conn.cursor()
        # Restrict to populated-place feature codes so neighborhoods and
        # administrative areas are not returned as the "city"
        cursor.execute("""
            SELECT name, ascii_name, admin1_code, admin1_name,
                   latitude, longitude,
                   ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as distance_sq
            FROM cities
            WHERE country_code = 'AR'
              AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
            ORDER BY distance_sq
            LIMIT 1
        """, (lat, lat, lon, lon))
        row = cursor.fetchone()
        if row:
            city_name, ascii_name, admin1_code = row[0], row[1], row[2]
            # Map GeoNames admin1 to ISO 3166-2; "XX" = unknown region
            region_code = GEONAMES_ADMIN1_TO_ISO.get(admin1_code, "XX")
            city_code = generate_city_code(ascii_name or city_name)
            return (region_code, city_name, city_code)
    except Exception as e:
        # Best-effort: report and fall through to None
        print(f" ⚠️ GeoNames lookup failed: {e}")
    finally:
        # Previously the connection leaked when execute() raised; always close
        if conn is not None:
            conn.close()
    return None
def city_label_to_region(city_label: str) -> Optional[str]:
    """
    Best-effort mapping from a Wikidata city/admin label to a region code.

    Tries the province-name table first (exact or substring match), then a
    small table of well-known cities. Returns None when nothing matches.
    """
    if not city_label:
        return None

    label_lower = city_label.lower()

    # Province names: exact match, or province name contained in the label
    for province, code in AR_REGION_CODES.items():
        province_lower = province.lower()
        if province_lower == label_lower:
            return code
        if province_lower in label_lower:
            return code

    # Well-known cities whose label does not contain the province name
    known_cities = {
        "la plata": "B",
        "mar del plata": "B",
        "bahía blanca": "B",
        "bahia blanca": "B",
        "rosario": "S",
        "ushuaia": "V",
        "resistencia": "H",
        "posadas": "N",
        "paraná": "E",
        "parana": "E",
        "san salvador de jujuy": "Y",
        "san miguel de tucumán": "T",
        "san miguel de tucuman": "T",
    }
    for city_key, code in known_cities.items():
        if city_key in label_lower:
            return code

    return None
def create_custodian_yaml(inst: dict, dry_run: bool = False) -> Optional[Path]:
    """
    Create a custodian YAML file for one Wikidata institution.

    Resolves region/city (reverse geocoding first, Wikidata city label as
    fallback), generates GHCID identifiers, and writes the YAML record.
    On a filename collision the numeric Wikidata QID is folded into the
    GHCID to disambiguate.

    Args:
        inst: Normalized institution dict from parse_wikidata_result().
        dry_run: When True, report the target file but do not write it.

    Returns:
        Path of the (to-be-)created file, or None when GHCID generation
        fails.
    """
    qid = inst["qid"]
    name = inst["name"]
    inst_type = inst["institution_type"]

    # Determine location; "XX"/"XXX" are unknown-region/city placeholders
    region_code = "XX"
    city_code = "XXX"
    city_name = inst.get("city", "")

    # Try reverse geocoding first (most accurate)
    if "latitude" in inst and "longitude" in inst:
        geo_result = reverse_geocode_to_region(inst["latitude"], inst["longitude"])
        if geo_result:
            region_code, city_name, city_code = geo_result

    # Fallback: derive region and city code from the Wikidata city label
    if region_code == "XX" and city_name:
        region = city_label_to_region(city_name)
        if region:
            region_code = region
            city_code = generate_city_code(city_name)

    # Generate abbreviation from the institution name
    abbreviation = extract_abbreviation_from_name(name)

    # Create GHCID components and derived identifiers
    try:
        components = GHCIDComponents(
            country_code="AR",
            region_code=region_code,
            city_locode=city_code,
            institution_type=inst_type,
            abbreviation=abbreviation,
        )
        ghcid_current = components.to_string()
        ghcid_uuid = str(components.to_uuid())
        ghcid_uuid_sha256 = str(components.to_uuid_sha256())
        ghcid_numeric = components.to_numeric()
    except Exception as e:
        print(f" ❌ GHCID generation failed for {name}: {e}")
        return None

    # On filename collision, append the numeric Wikidata QID and regenerate
    filename = f"{ghcid_current}.yaml"
    filepath = CUSTODIAN_DIR / filename
    if filepath.exists():
        components.wikidata_qid = qid.replace("Q", "")
        ghcid_current = components.to_string()
        ghcid_uuid = str(components.to_uuid())
        ghcid_uuid_sha256 = str(components.to_uuid_sha256())
        ghcid_numeric = components.to_numeric()
        filename = f"{ghcid_current}.yaml"
        filepath = CUSTODIAN_DIR / filename

    timestamp = datetime.now(timezone.utc).isoformat()

    # Build YAML structure
    data = {
        "original_entry": {
            "name": name,
            "source": "Wikidata SPARQL import",
            "wikidata_id": qid,
        },
        "processing_timestamp": timestamp,
        "ghcid": {
            "ghcid_current": ghcid_current,
            "ghcid_uuid": ghcid_uuid,
            "ghcid_uuid_sha256": ghcid_uuid_sha256,
            "ghcid_numeric": ghcid_numeric,
            # uuid imported at module level (was an inline __import__ hack)
            "record_id": str(uuid.uuid4()),
            "generation_timestamp": timestamp,
            "location_resolution": {
                "method": "WIKIDATA_IMPORT",
                "country_code": "AR",
                "region_code": region_code,
                "city_code": city_code,
                "city_label": city_name or None,
            },
        },
        "custodian_name": {
            "claim_type": "custodian_name",
            "claim_value": name,
            "source_type": "wikidata",
            "emic_name": name,
            "name_language": "es",
        },
        "institution_type": {
            "M": "MUSEUM",
            "A": "ARCHIVE",
            "G": "GALLERY",
            "L": "LIBRARY",
        }.get(inst_type, "UNKNOWN"),
        "location": {
            "country": "AR",
            "region_code": region_code,
        },
        "wikidata_enrichment": {
            "wikidata_entity_id": qid,
            "enrichment_date": timestamp,
            "source": "Wikidata SPARQL import",
        },
    }

    # Add optional fields
    if city_name:
        data["location"]["city"] = city_name
    if "latitude" in inst and "longitude" in inst:
        data["location"]["latitude"] = inst["latitude"]
        data["location"]["longitude"] = inst["longitude"]
    if "website" in inst:
        data["website"] = inst["website"]

    if dry_run:
        # Fixed: previously printed a literal "(unknown)" placeholder
        # instead of the target filename
        print(f" [DRY RUN] Would create: {filename}")
        return filepath

    # Write YAML file
    with open(filepath, "w", encoding="utf-8") as f:
        yaml.dump(data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    return filepath
def main():
    """Entry point: query Wikidata, skip known QIDs, write custodian YAMLs."""
    parser = argparse.ArgumentParser(description="Import Argentina institutions from Wikidata")
    parser.add_argument("--dry-run", action="store_true", help="Don't create files, just report what would be done")
    args = parser.parse_args()

    print("=" * 80)
    print("ARGENTINA WIKIDATA INSTITUTION IMPORT")
    print("=" * 80)
    print()

    # Get existing QIDs so already-imported institutions are skipped
    print("📂 Scanning existing custodian files...")
    existing_qids = get_existing_qids()
    print(f" Found {len(existing_qids)} existing Wikidata QIDs")
    print()

    # Query Wikidata, one request per institution class
    print("🔍 Querying Wikidata for Argentina institutions...")
    print(" Museums...", end=" ", flush=True)
    museum_results = query_wikidata_museums()
    print(f"found {len(museum_results)} raw results")
    print(" Archives...", end=" ", flush=True)
    archive_results = query_wikidata_archives()
    print(f"found {len(archive_results)} raw results")
    print(" Galleries...", end=" ", flush=True)
    gallery_results = query_wikidata_galleries()
    print(f"found {len(gallery_results)} raw results")
    print()

    # Parse and deduplicate by QID. Museums are processed first, so when a
    # QID appears under several classes the museum typing wins.
    # (Previously three copy-pasted loops.)
    institutions: dict[str, dict] = {}
    for results, type_code in (
        (museum_results, "M"),
        (archive_results, "A"),
        (gallery_results, "G"),
    ):
        for binding in results:
            inst = parse_wikidata_result(binding, type_code)
            if inst and inst["qid"] not in institutions:
                institutions[inst["qid"]] = inst
    print(f"📊 Total unique institutions: {len(institutions)}")

    # Filter out institutions already present in custodian files
    new_institutions = {qid: inst for qid, inst in institutions.items() if qid not in existing_qids}
    print(f" After filtering existing: {len(new_institutions)} new institutions")
    print()

    # Create custodian files (the unused "collisions" counter was removed)
    stats = {
        "created": 0,
        "errors": 0,
        "by_type": {"M": 0, "A": 0, "G": 0},
    }
    if args.dry_run:
        print("🔄 [DRY RUN] Would create the following files:")
    else:
        print("🔄 Creating custodian YAML files...")
    print()
    for qid, inst in sorted(new_institutions.items(), key=lambda x: x[1]["name"]):
        name = inst["name"]
        inst_type = inst["institution_type"]
        city = inst.get("city", "Unknown")
        print(f" [{inst_type}] {name}")
        print(f" 📍 {city}, QID: {qid}")
        filepath = create_custodian_yaml(inst, dry_run=args.dry_run)
        if filepath:
            if not args.dry_run:
                print(f" ✅ Created: {filepath.name}")
            stats["created"] += 1
            stats["by_type"][inst_type] += 1
        else:
            stats["errors"] += 1
    print()

    # Summary
    print("=" * 80)
    print("IMPORT COMPLETE")
    print("=" * 80)
    print(f"✅ Created: {stats['created']} custodian files")
    print(f" - Museums: {stats['by_type']['M']}")
    print(f" - Archives: {stats['by_type']['A']}")
    print(f" - Galleries: {stats['by_type']['G']}")
    print(f"❌ Errors: {stats['errors']}")
    print()

    # Final count of Argentina custodian files on disk
    if not args.dry_run:
        final_count = len(list(CUSTODIAN_DIR.glob("AR-*.yaml")))
        print(f"📁 Total Argentina custodian files: {final_count}")


if __name__ == "__main__":
    main()