#!/usr/bin/env python3
|
|
"""
|
|
Resolve NL-*-XXX-* files by looking up institutions in Wikidata and GeoNames.
|
|
|
|
This script:
|
|
1. Reads all NL-*-XXX-*.yaml files
|
|
2. Searches Wikidata for each institution
|
|
3. Gets coordinates and city from Wikidata
|
|
4. Looks up city code from GeoNames
|
|
5. Generates new GHCID with proper city code
|
|
6. Optionally renames files to new GHCID
|
|
|
|
Usage:
|
|
python scripts/resolve_nl_xxx_locations.py --dry-run # Preview changes
|
|
python scripts/resolve_nl_xxx_locations.py # Apply changes
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
import yaml
|
|
import sqlite3
|
|
import argparse
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Optional, Dict, Any, Tuple
|
|
import requests
|
|
import time
|
|
|
|
# Load environment variables from a local .env file when python-dotenv is
# installed; otherwise silently fall back to the process environment.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass  # dotenv not installed, rely on environment
|
|
|
|
# Configuration: paths are relative to the repository root, so the script
# must be run from there.
CUSTODIAN_DIR = Path("data/custodian")  # institution YAML files live here
GEONAMES_DB = Path("data/reference/geonames.db")  # SQLite extract of GeoNames

# Wikidata API credentials from environment (.env supported via python-dotenv).
# The token is optional; without it requests are unauthenticated and may be
# rate-limited by Wikimedia.
WIKIDATA_API_TOKEN = os.getenv("WIKIDATA_API_TOKEN")
WIKIMEDIA_CONTACT_EMAIL = os.getenv("WIKIMEDIA_CONTACT_EMAIL", "glam-ontology-bot@example.com")

# Wikidata API endpoints: the MediaWiki action API for entity search, and the
# public SPARQL endpoint for coordinate/city queries.
WIKIDATA_SEARCH_URL = "https://www.wikidata.org/w/api.php"
WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"
|
|
|
|
# Wikimedia policy requires a descriptive User-Agent with contact info on
# every API request.
def get_headers(include_auth: bool = True) -> Dict[str, str]:
    """Build HTTP headers: User-Agent, Accept, and an optional OAuth bearer.

    The Authorization header is added only when authentication is requested
    AND a token is actually configured in the environment.
    """
    result = {
        "User-Agent": f"GLAM-Ontology-Bot/1.0 ({WIKIMEDIA_CONTACT_EMAIL})",
        "Accept": "application/json",
    }
    token = WIKIDATA_API_TOKEN if include_auth else None
    if token:
        result["Authorization"] = f"Bearer {token}"
    return result
|
|
|
|
# Netherlands province code mapping (GeoNames admin1 -> ISO 3166-2).
# NOTE: GeoNames assigns no "08" code for NL; callers fall back to "XX"
# for any admin1 code missing from this table.
NL_PROVINCE_CODES = {
    "01": "DR",  # Drenthe
    "02": "FR",  # Friesland
    "03": "GE",  # Gelderland
    "04": "GR",  # Groningen
    "05": "LI",  # Limburg
    "06": "NB",  # Noord-Brabant
    "07": "NH",  # Noord-Holland
    "09": "UT",  # Utrecht
    "10": "ZE",  # Zeeland
    "11": "ZH",  # Zuid-Holland
    "15": "OV",  # Overijssel
    "16": "FL",  # Flevoland
}
|
|
|
|
|
|
def search_wikidata(name: str) -> Optional[str]:
    """Search Wikidata for *name* (Dutch labels) and return the top hit's QID.

    Returns None when nothing matches or the request fails; failures are
    reported as a warning rather than raised, so callers can skip the file.
    """
    query = {
        "action": "wbsearchentities",
        "search": name,
        "language": "nl",
        "format": "json",
        "limit": 5,
    }
    try:
        response = requests.get(
            WIKIDATA_SEARCH_URL,
            params=query,
            headers=get_headers(include_auth=True),
            timeout=10
        )
        response.raise_for_status()
        hits = response.json().get("search")
        if hits:
            # Trust the search ranking: take the first candidate.
            return hits[0]["id"]
    except Exception as e:
        print(f" Warning: Wikidata search failed for '{name}': {e}")
    return None
|
|
|
|
|
|
def get_wikidata_location(qid: str) -> Optional[Dict[str, Any]]:
    """Fetch coordinates (P625) and containing city (P131) for *qid* via SPARQL.

    Returns a dict with qid, latitude, longitude, city_label and city_qid,
    or None when the entity has no coordinates or the query fails.
    """
    sparql = f"""
    SELECT ?coords ?cityLabel ?city ?regionLabel ?region WHERE {{
      wd:{qid} wdt:P625 ?coords .
      OPTIONAL {{ wd:{qid} wdt:P131 ?city . }}
      OPTIONAL {{ ?city wdt:P131* ?region . ?region wdt:P31 wd:Q134390 . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "nl,en". }}
    }}
    LIMIT 1
    """
    try:
        response = requests.get(
            WIKIDATA_SPARQL_URL,
            params={"query": sparql, "format": "json"},
            # The SPARQL endpoint does not take the OAuth bearer token.
            headers=get_headers(include_auth=False),
            timeout=30,
        )
        response.raise_for_status()
        bindings = response.json().get("results", {}).get("bindings")
        if bindings:
            first = bindings[0]
            wkt = first.get("coords", {}).get("value", "")
            # WKT literal looks like "Point(lon lat)" — longitude comes first.
            point = re.search(r"Point\(([-\d.]+)\s+([-\d.]+)\)", wkt)
            if point:
                return {
                    "qid": qid,
                    "latitude": float(point.group(2)),
                    "longitude": float(point.group(1)),
                    "city_label": first.get("cityLabel", {}).get("value"),
                    "city_qid": first.get("city", {}).get("value", "").split("/")[-1],
                }
    except Exception as e:
        print(f" Warning: SPARQL query failed for {qid}: {e}")
    return None
|
|
|
|
|
|
def get_city_code_from_geonames(
    lat: float, lon: float, country_code: str = "NL"
) -> Optional[Dict[str, Any]]:
    """Reverse geocode coordinates to get city code from GeoNames database.

    Strategy: Find the largest settlement within ~10km. If no settlement found,
    fall back to nearest settlement within ~20km.

    This prevents small villages from being selected over nearby major cities
    (e.g., Apenheul should map to Apeldoorn, not Ugchelen).

    Returns a dict with city_name, city_code, geonames_id, admin1_code,
    province_code, feature_code, latitude and longitude (mixed value types),
    or None when the database is missing, nothing is in range, or the
    query fails.
    """
    if not GEONAMES_DB.exists():
        print(f" Warning: GeoNames database not found: {GEONAMES_DB}")
        return None

    conn = sqlite3.connect(GEONAMES_DB)
    cursor = conn.cursor()

    # Distances below are squared Euclidean distances in *degrees*, not km:
    # 0.01 deg² is a ~0.1° radius (~11 km north-south).
    # NOTE(review): longitude degrees are shorter than latitude degrees at
    # ~52°N, so the search area is slightly elliptical — acceptable here.

    # First try: Find largest settlement within ~10km (0.01 degree² ≈ 10km at NL latitude)
    # This prefers major cities over small villages
    query_largest = """
        SELECT
            name, ascii_name, admin1_code, admin1_name,
            latitude, longitude, geonames_id, population, feature_code,
            ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as distance_sq
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
          AND ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) < 0.01
        ORDER BY population DESC
        LIMIT 1
    """

    # Fallback: Find nearest settlement within ~20km
    query_nearest = """
        SELECT
            name, ascii_name, admin1_code, admin1_name,
            latitude, longitude, geonames_id, population, feature_code,
            ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) as distance_sq
        FROM cities
        WHERE country_code = ?
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
          AND ((latitude - ?) * (latitude - ?) + (longitude - ?) * (longitude - ?)) < 0.04
        ORDER BY distance_sq
        LIMIT 1
    """

    try:
        # Try largest first. The same lat/lon values are passed twice because
        # SQLite's positional placeholders cannot be referenced more than once.
        cursor.execute(query_largest, (
            lat, lat, lon, lon,  # distance_sq (SELECT column)
            country_code,
            lat, lat, lon, lon,  # distance filter (WHERE clause)
        ))
        row = cursor.fetchone()

        # Fallback to nearest if no large city found
        if not row:
            cursor.execute(query_nearest, (
                lat, lat, lon, lon,  # distance_sq
                country_code,
                lat, lat, lon, lon,  # distance filter
            ))
            row = cursor.fetchone()

        if row:
            # row[:9] drops the trailing computed distance_sq column.
            name, ascii_name, admin1_code, admin1_name, g_lat, g_lon, geonames_id, pop, feature_code = row[:9]

            # Generate city code (first 3 letters of ASCII name, uppercase)
            city_code = ascii_name[:3].upper() if ascii_name else name[:3].upper()

            # Get province code ("XX" for admin1 codes not in the mapping)
            province_code = NL_PROVINCE_CODES.get(admin1_code, "XX")

            return {
                "city_name": name,
                "city_code": city_code,
                "geonames_id": geonames_id,
                "admin1_code": admin1_code,
                "province_code": province_code,
                "feature_code": feature_code,
                "latitude": g_lat,
                "longitude": g_lon,
            }
    except Exception as e:
        print(f" Warning: GeoNames lookup failed: {e}")
    finally:
        conn.close()

    return None
|
|
|
|
|
|
def generate_abbreviation(name: str) -> str:
    """Generate an institution abbreviation from its name.

    Takes the first letter of each significant word — skipping Dutch and
    English articles/prepositions — and returns at most 10 uppercase
    initials. Empty or all-skip-word names yield "".
    """
    # Skip words (Dutch/English articles, prepositions)
    skip_words = {
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des', "'s",
        'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder', 'door', 'en', 'of',
        'a', 'an', 'the', 'at', 'on', 'to', 'for', 'with', 'from', 'by', 'as', 'and', 'or'
    }

    initials = []
    # Split on whitespace and hyphens so compound names contribute one
    # initial per component.
    for word in re.split(r'[\s\-]+', name):
        # Strip punctuation, keeping word characters only.
        clean = re.sub(r'[^\w]', '', word)
        # Bug fix: test BOTH the raw and the cleaned form against the skip
        # list. "'s" (as in 's-Gravenhage) only matches before the apostrophe
        # is stripped; checking the cleaned form alone could never skip it.
        if clean and word.lower() not in skip_words and clean.lower() not in skip_words:
            initials.append(clean[0].upper())

    return ''.join(initials[:10])  # Max 10 characters
|
|
|
|
|
|
def load_yaml(filepath: Path) -> Dict[str, Any]:
    """Parse a YAML file and return its contents."""
    with filepath.open(encoding='utf-8') as fh:
        return yaml.safe_load(fh)
|
|
|
|
|
|
def save_yaml(filepath: Path, data: Dict[str, Any]):
    """Serialize *data* to a YAML file, keeping key order and unicode intact."""
    with filepath.open('w', encoding='utf-8') as fh:
        yaml.dump(data, fh, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
|
|
|
|
|
def resolve_institution(filepath: Path, dry_run: bool = True) -> Optional[Dict[str, Any]]:
    """Resolve location for a single institution file.

    Pipeline: YAML -> Wikidata search (QID) -> SPARQL coordinates ->
    GeoNames reverse geocode -> new GHCID. Returns a summary dict for
    reporting, or None when any step fails. When dry_run is False the YAML
    file is rewritten in place and renamed to the new GHCID.
    """
    data = load_yaml(filepath)

    # Get institution name (the "emic" name, i.e. the institution's own name)
    name = data.get("custodian_name", {}).get("emic_name", "")
    if not name:
        print(f" Skipping: No emic_name found")
        return None

    print(f"\n Institution: {name}")

    # Search Wikidata
    time.sleep(0.5)  # Rate limiting — be polite to the Wikimedia APIs
    qid = search_wikidata(name)
    if not qid:
        print(f" Warning: Not found in Wikidata")
        return None

    print(f" Wikidata: {qid}")

    # Get location from Wikidata
    time.sleep(0.5)
    location = get_wikidata_location(qid)
    if not location:
        print(f" Warning: No coordinates in Wikidata")
        return None

    print(f" Coords: ({location['latitude']}, {location['longitude']})")
    if location.get("city_label"):
        print(f" Wikidata city: {location['city_label']}")

    # Reverse geocode the Wikidata coordinates against the GeoNames database
    geonames = get_city_code_from_geonames(location["latitude"], location["longitude"])
    if not geonames:
        print(f" Warning: GeoNames lookup failed")
        return None

    print(f" GeoNames city: {geonames['city_name']} ({geonames['city_code']})")
    print(f" Province: {geonames['province_code']} (admin1: {geonames['admin1_code']})")

    # Get institution type and abbreviation; "U" (presumably "unknown") is
    # the default type — TODO confirm against the GHCID spec.
    inst_types = data.get("institution_type", ["U"])
    inst_type = inst_types[0] if isinstance(inst_types, list) else inst_types
    abbrev = generate_abbreviation(name)

    # Generate new GHCID: NL-<province>-<city>-<type>-<abbrev>
    old_ghcid = data.get("ghcid", {}).get("ghcid_current", filepath.stem)
    new_ghcid = f"NL-{geonames['province_code']}-{geonames['city_code']}-{inst_type}-{abbrev}"

    print(f" Old GHCID: {old_ghcid}")
    print(f" New GHCID: {new_ghcid}")

    # Check if province changed (i.e. the originally-assigned location was wrong)
    old_province = old_ghcid.split("-")[1] if len(old_ghcid.split("-")) > 1 else "XX"
    if old_province != geonames['province_code']:
        print(f" ⚠️ PROVINCE MISMATCH: Was {old_province}, should be {geonames['province_code']}")

    # Summary of the resolution, returned for aggregation in main()
    result = {
        "filepath": filepath,
        "name": name,
        "qid": qid,
        "old_ghcid": old_ghcid,
        "new_ghcid": new_ghcid,
        "city_name": geonames["city_name"],
        "city_code": geonames["city_code"],
        "province_code": geonames["province_code"],
        "geonames_id": geonames["geonames_id"],
        "latitude": location["latitude"],
        "longitude": location["longitude"],
        "province_changed": old_province != geonames['province_code'],
    }

    if not dry_run:
        # Update the YAML file
        timestamp = datetime.now(timezone.utc).isoformat()

        # Update location (overwrites any existing location block)
        data["location"] = {
            "city": geonames["city_name"],
            "region": geonames["province_code"],
            "country": "NL",
            "coordinates": {
                "latitude": location["latitude"],
                "longitude": location["longitude"],
            }
        }

        # Update GHCID: replace the block wholesale, prepending a new history
        # entry while preserving any prior history.
        old_history = data.get("ghcid", {}).get("ghcid_history", [])
        data["ghcid"] = {
            "ghcid_current": new_ghcid,
            "ghcid_original": old_ghcid,
            "location_resolution": {
                "method": "WIKIDATA_GEONAMES_LOOKUP",
                "wikidata_id": qid,
                "geonames_id": geonames["geonames_id"],
                "city_name": geonames["city_name"],
                "city_code": geonames["city_code"],
                "region_code": geonames["province_code"],
                "country_code": "NL",
                "resolution_date": timestamp,
            },
            "ghcid_history": [
                {
                    "ghcid": new_ghcid,
                    "valid_from": timestamp,
                    "valid_to": None,
                    "reason": f"Location resolved via Wikidata ({qid}) + GeoNames reverse geocoding"
                }
            ] + old_history
        }

        # Add Wikidata identifier if not present
        identifiers = data.get("identifiers", [])
        has_wikidata = any(i.get("identifier_scheme") == "Wikidata" for i in identifiers)
        if not has_wikidata:
            identifiers.append({
                "identifier_scheme": "Wikidata",
                "identifier_value": qid,
                "identifier_url": f"https://www.wikidata.org/wiki/{qid}"
            })
        data["identifiers"] = identifiers

        # Save updated file
        save_yaml(filepath, data)
        print(f" ✓ Updated file")

        # Rename file if GHCID changed; never clobber an existing file
        if new_ghcid != old_ghcid:
            new_filepath = filepath.parent / f"{new_ghcid}.yaml"
            if new_filepath.exists():
                print(f" ⚠️ Cannot rename: {new_filepath} already exists!")
            else:
                filepath.rename(new_filepath)
                print(f" ✓ Renamed to {new_filepath.name}")
                result["new_filepath"] = new_filepath

    return result
|
|
|
|
|
|
def main():
    """CLI entry point: resolve every NL-*-XXX-*.yaml custodian file."""
    parser = argparse.ArgumentParser(description="Resolve NL-*-XXX-* location files")
    parser.add_argument("--dry-run", action="store_true", help="Preview changes without modifying files")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of files to process")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    args = parser.parse_args()

    # Report authentication status up front so rate-limit failures later on
    # are explainable from the log.
    if WIKIDATA_API_TOKEN:
        print(f"✓ Wikidata API token loaded ({len(WIKIDATA_API_TOKEN)} chars)")
    else:
        print("⚠️ Warning: No WIKIDATA_API_TOKEN found in environment")
        print(" The script will use unauthenticated requests (may hit rate limits)")

    if args.verbose:
        print(f" User-Agent: GLAM-Ontology-Bot/1.0 ({WIKIMEDIA_CONTACT_EMAIL})")

    # Unresolved files carry the XXX placeholder city code in their name.
    files = sorted(CUSTODIAN_DIR.glob("NL-*-XXX-*.yaml"))

    print(f"Found {len(files)} NL-*-XXX-*.yaml files")
    if args.dry_run:
        print("DRY RUN - No changes will be made")

    if args.limit:
        files = files[:args.limit]
        print(f"Processing first {args.limit} files")

    banner = "=" * 60
    results = []
    resolved = 0
    failed = 0

    for filepath in files:
        print("\n" + banner)
        print(f"Processing: {filepath.name}")
        try:
            outcome = resolve_institution(filepath, dry_run=args.dry_run)
        except Exception as e:
            print(f" ERROR: {e}")
            failed += 1
            continue
        if outcome:
            results.append(outcome)
            resolved += 1
        else:
            failed += 1

    province_mismatches = sum(1 for r in results if r.get("province_changed"))

    # Summary
    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    print(f"Total files: {len(files)}")
    print(f"Resolved: {resolved}")
    print(f"Failed/Skipped: {failed}")
    print(f"Province mismatches: {province_mismatches}")

    if province_mismatches > 0:
        print(f"\n⚠️ Province mismatches found - these institutions were assigned wrong province in LinkedIn import:")
        for r in results:
            if r.get("province_changed"):
                print(f" - {r['name']}: {r['old_ghcid']} → {r['new_ghcid']}")
|
|
|
|
|
|
# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|