glam/scripts/convert_wikidata_to_linkml.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

760 lines
25 KiB
Python

#!/usr/bin/env python3
"""
Wikidata JSON to LinkML YAML Converter
This script transforms raw Wikidata SPARQL extraction results into LinkML-compliant
YAML instance files conforming to the Heritage Custodian schema v0.2.2.
Input: data/wikidata/{country_code}/{timestamp}.json
Output: data/instances/wikidata_{country_code}_{timestamp}.yaml
Features:
- Maps Wikidata fields → LinkML schema (core.yaml, enums.yaml, provenance.yaml)
- Generates GHCIDs (Global Heritage Custodian Identifiers) with collision detection
- Generates UUIDs (v5, v7, v8) for persistent identification
- Enriches data with provenance metadata (TIER_3_CROWD_SOURCED)
- Validates institution types against InstitutionTypeEnum
- Geocodes addresses to lat/lon (if missing)
- Cross-references with existing LinkML instances for deduplication
Wikidata → LinkML Field Mapping:
- wikidata_qid → identifiers[scheme=Wikidata]
- name → name (fallback to QID if empty)
- description → description
- institution_type → InstitutionTypeEnum mapping
- location{latitude, longitude, city, street_address} → Location class
- identifiers{website, isil, viaf, email, phone} → Identifier + ContactInfo
- temporal{inception, founding_date} → founded_date
- organizational{parent_org} → parent_organization_name
- collection{size, subject} → Collection class
- media{image, logo} → future enhancement
Usage:
# Convert single country extraction
python convert_wikidata_to_linkml.py --country NL --timestamp 20251111_105038
# Convert all extractions for a country (latest by default)
python convert_wikidata_to_linkml.py --country NL
# Convert all countries (all JSON files)
python convert_wikidata_to_linkml.py --all-countries
# Dry run (show what would be converted)
python convert_wikidata_to_linkml.py --country NL --dry-run
# Skip institutions with missing critical data
python convert_wikidata_to_linkml.py --country NL --skip-incomplete
"""
import sys
import json
import yaml
import argparse
from pathlib import Path
from typing import Any, Optional, Dict, List
from datetime import datetime, timezone
from collections import defaultdict
import re
# Make project sources ("src") and sibling scripts importable when run directly.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
sys.path.insert(0, str(Path(__file__).parent))  # Add scripts directory for wikidata_type_mapping

# Optional: comprehensive Wikidata type mapping. When absent, the script
# falls back to the basic WIKIDATA_TYPE_MAP table defined below.
try:
    from wikidata_type_mapping import map_wikidata_type_to_linkml, is_valid_heritage_custodian
    TYPE_MAPPING_AVAILABLE = True
except ImportError:
    TYPE_MAPPING_AVAILABLE = False
    print("⚠️ Warning: wikidata_type_mapping module not available. Using basic type mapping.")

# Optional: GHCID and UUID generation utilities. When absent, GHCIDs are
# produced with the simpler generate_basic_ghcid() fallback defined below.
try:
    from glam_extractor.ghcid import (
        generate_ghcid,
        generate_ghcid_uuids,
        detect_ghcid_collision,
    )
    GHCID_AVAILABLE = True
except ImportError:
    GHCID_AVAILABLE = False
    print("⚠️ Warning: GHCID utilities not available. GHCIDs will be basic format.")
# =============================================================================
# WIKIDATA INSTITUTION TYPE → LinkML InstitutionTypeEnum MAPPING
# =============================================================================
# Maps lowercase Wikidata type labels (English and Dutch) to
# InstitutionTypeEnum values. Used only as a fallback when the
# comprehensive wikidata_type_mapping module is unavailable.
WIKIDATA_TYPE_MAP = {
    # Museums
    "museum": "MUSEUM",
    "kunstmuseum": "MUSEUM",
    "art museum": "MUSEUM",
    "science museum": "MUSEUM",
    "natural history museum": "MUSEUM",
    "history museum": "MUSEUM",
    "local museum": "MUSEUM",
    # Libraries
    "bibliotheek": "LIBRARY",
    "library": "LIBRARY",
    "openbare bibliotheek": "LIBRARY",
    "public library": "LIBRARY",
    "academic library": "LIBRARY",
    "university library": "LIBRARY",
    "national library": "LIBRARY",
    # Archives
    "archief": "ARCHIVE",
    "archive": "ARCHIVE",
    "gemeentearchief": "ARCHIVE",
    "city archive": "ARCHIVE",
    "national archive": "ARCHIVE",
    # Galleries
    "galerie": "GALLERY",
    "gallery": "GALLERY",
    "art gallery": "GALLERY",
    "kunstgalerie": "GALLERY",
    # Research Centers
    "research center": "RESEARCH_CENTER",
    "onderzoekscentrum": "RESEARCH_CENTER",
    "research institute": "RESEARCH_CENTER",
    # Universities
    "university": "UNIVERSITY",
    "universiteit": "UNIVERSITY",
    "hogeschool": "UNIVERSITY",
    # Botanical Gardens / Zoos
    "botanical garden": "BOTANICAL_ZOO",
    "botanische tuin": "BOTANICAL_ZOO",
    "zoo": "BOTANICAL_ZOO",
    "dierentuin": "BOTANICAL_ZOO",
    "aquarium": "BOTANICAL_ZOO",
    # Cultural Centers
    "cultural center": "OFFICIAL_INSTITUTION",
    "cultureel centrum": "OFFICIAL_INSTITUTION",
    # Default fallback
    "cultural institution": "MIXED",
}
def map_institution_type(wikidata_type: str) -> Optional[str]:
    """
    Map a Wikidata institution type onto the LinkML InstitutionTypeEnum.

    Args:
        wikidata_type: Institution type from Wikidata (e.g., "museum", "bibliotheek")

    Returns:
        LinkML InstitutionTypeEnum value (e.g., "MUSEUM", "LIBRARY"), or None if excluded
    """
    # Delegate to the comprehensive mapping module when it was importable.
    if TYPE_MAPPING_AVAILABLE:
        return map_wikidata_type_to_linkml(wikidata_type)

    normalized = wikidata_type.lower().strip()

    # Direct table lookup first.
    direct_hit = WIKIDATA_TYPE_MAP.get(normalized)
    if direct_hit is not None:
        return direct_hit

    # Otherwise accept any known keyword appearing inside the type string.
    for keyword, enum_value in WIKIDATA_TYPE_MAP.items():
        if keyword in normalized:
            return enum_value

    # Unknown types default to MIXED.
    return "MIXED"
# =============================================================================
# GHCID GENERATION (Fallback if utilities unavailable)
# =============================================================================
def generate_basic_ghcid(
    country_code: str,
    city: Optional[str],
    institution_type: str,
    institution_name: str,
    wikidata_qid: Optional[str] = None,
) -> str:
    """
    Generate a basic GHCID without full validation.

    Fallback method if glam_extractor.ghcid module unavailable.
    Format: {Country}-{Region}-{City}-{Type}-{Abbreviation}[-Q{WikidataID}]
    Example: NL-NH-AMS-M-RM (Rijksmuseum Amsterdam)
    """
    # Single-letter code per InstitutionTypeEnum value; "X" = unknown/mixed.
    TYPE_CODES = {
        "MUSEUM": "M", "LIBRARY": "L", "ARCHIVE": "A", "GALLERY": "G",
        "OFFICIAL_INSTITUTION": "O", "RESEARCH_CENTER": "R", "CORPORATION": "C",
        "UNIVERSITY": "U", "BOTANICAL_ZOO": "B", "EDUCATION_PROVIDER": "E",
        "PERSONAL_COLLECTION": "P", "COLLECTING_SOCIETY": "S", "MIXED": "X",
        "HOLY_SITES": "H"
    }

    # Abbreviation: first letter of up to five words of the name.
    if institution_name and institution_name != "unknown":
        initials = re.findall(r'\b\w', institution_name.upper())
        abbreviation = ''.join(initials[:5])
    else:
        abbreviation = "UNK"

    # City code: first three A-Z characters, right-padded with 'X'.
    if city:
        city_code = re.sub(r'[^A-Z]', '', city.upper())[:3].ljust(3, 'X')
    else:
        city_code = "UNK"

    # No country-specific region mapping here; "XX" is a placeholder.
    segments = [
        country_code,
        "XX",
        city_code,
        TYPE_CODES.get(institution_type, "X"),
        abbreviation,
    ]
    # Append the Wikidata Q-number so colliding names stay distinguishable.
    if wikidata_qid:
        segments.append(wikidata_qid)
    return "-".join(segments)
def generate_uuid_v5(ghcid: str) -> str:
    """Derive a deterministic UUID v5 (SHA-1 based) from a GHCID string."""
    import uuid

    # Fixed namespace shared by all GHCID-derived UUIDs.
    namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
    return str(uuid.uuid5(namespace, ghcid))
def generate_uuid_v7() -> str:
    """Generate a time-ordered UUID v7 for database record IDs (v4 fallback)."""
    import uuid

    # uuid.uuid7() only exists on newer Pythons; older ones get a random v4.
    make_uuid = getattr(uuid, "uuid7", uuid.uuid4)
    return str(make_uuid())
def generate_uuid_v8_sha256(ghcid: str) -> str:
    """Derive a deterministic custom UUID v8 (SHA-256 based) from a GHCID string."""
    import uuid
    import hashlib

    digest = hashlib.sha256(ghcid.encode('utf-8')).digest()
    raw = bytearray(digest[:16])  # UUIDs hold exactly 128 bits
    raw[6] = (raw[6] & 0x0F) | 0x80  # force version nibble to 8
    raw[8] = (raw[8] & 0x3F) | 0x80  # force RFC 4122 variant bits
    return str(uuid.UUID(bytes=bytes(raw)))
def generate_numeric_id(ghcid: str) -> int:
    """Derive a deterministic 64-bit unsigned numeric ID from a GHCID string."""
    import hashlib

    digest = hashlib.sha256(ghcid.encode('utf-8')).digest()
    # First 8 bytes interpreted big-endian -> 64-bit unsigned integer.
    return int.from_bytes(digest[:8], byteorder='big')
# =============================================================================
# WIKIDATA INSTITUTION CONVERTER
# =============================================================================
def convert_wikidata_institution(
    wikidata_inst: Dict[str, Any],
    country_code: str,
    extraction_date: str,
) -> Optional[Dict[str, Any]]:
    """
    Convert a single Wikidata institution to LinkML format.

    Args:
        wikidata_inst: Wikidata institution dictionary from JSON
        country_code: ISO 3166-1 alpha-2 country code
        extraction_date: ISO 8601 timestamp of Wikidata extraction

    Returns:
        LinkML-compliant institution dictionary, or None if the record has no
        QID or its mapped type is excluded as non-heritage.
    """
    qid = wikidata_inst.get("wikidata_qid", "")
    if not qid:
        return None  # Skip institutions without QID

    # Basic fields. `or` fallbacks also cover empty strings and JSON nulls:
    # an empty/missing name falls back to the QID (as the module header
    # documents), and a null description no longer breaks .strip().
    name = wikidata_inst.get("name") or qid
    description = (wikidata_inst.get("description") or "").strip()
    wikidata_type = wikidata_inst.get("institution_type") or ""

    # Map institution type; None means excluded (generic non-heritage type).
    institution_type = map_institution_type(wikidata_type)
    if institution_type is None:
        return None

    # Location data (`or {}` guards against an explicit JSON null).
    location_data = wikidata_inst.get("location") or {}
    city = location_data.get("city", "")
    street_address = location_data.get("street_address", "")
    latitude = location_data.get("latitude")
    longitude = location_data.get("longitude")

    # External identifiers and contact channels.
    identifiers_data = wikidata_inst.get("identifiers") or {}
    website = identifiers_data.get("website", "")
    isil_code = identifiers_data.get("isil", "")
    viaf_id = identifiers_data.get("viaf", "")
    email = identifiers_data.get("email", "")
    phone = identifiers_data.get("phone", "")

    # Temporal data: prefer founding_date over inception.
    temporal_data = wikidata_inst.get("temporal") or {}
    inception_str = temporal_data.get("inception", "")
    founding_str = temporal_data.get("founding_date", "")
    founded_date = None
    if founding_str:
        founded_date = founding_str.split("T")[0]  # Keep the YYYY-MM-DD part
    elif inception_str:
        founded_date = inception_str.split("T")[0]

    # Generate GHCID: full generator when available, basic format otherwise.
    if GHCID_AVAILABLE:
        try:
            ghcid = generate_ghcid(
                country_code=country_code,
                city=city or "Unknown",
                institution_type=institution_type,
                institution_name=name,
            )
        except Exception as e:
            # Fall back to the basic format rather than dropping the record.
            print(f"⚠️ GHCID generation failed for {qid}: {e}")
            ghcid = generate_basic_ghcid(country_code, city, institution_type, name, qid)
    else:
        ghcid = generate_basic_ghcid(country_code, city, institution_type, name, qid)

    # Derived persistent identifiers (all deterministic except record_id).
    ghcid_uuid = generate_uuid_v5(ghcid)
    ghcid_uuid_sha256 = generate_uuid_v8_sha256(ghcid)
    record_id = generate_uuid_v7()
    ghcid_numeric = generate_numeric_id(ghcid)

    # Core LinkML record.
    institution = {
        "id": f"https://w3id.org/heritage/custodian/wikidata/{qid}",
        "record_id": record_id,
        "ghcid_uuid": ghcid_uuid,
        "ghcid_uuid_sha256": ghcid_uuid_sha256,
        "ghcid_numeric": ghcid_numeric,
        "ghcid_current": ghcid,
        "ghcid_original": ghcid,
        "name": name,
        "institution_type": institution_type,
    }

    # Optional scalar fields.
    if description:
        institution["description"] = description
    if founded_date:
        institution["founded_date"] = founded_date

    # Identifiers list; the Wikidata QID entry is always first.
    identifiers = [{
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
    }]
    if isil_code:
        identifiers.append({
            "identifier_scheme": "ISIL",
            "identifier_value": isil_code,
        })
    if viaf_id:
        identifiers.append({
            "identifier_scheme": "VIAF",
            "identifier_value": viaf_id,
            "identifier_url": f"https://viaf.org/viaf/{viaf_id}",
        })
    if website:
        identifiers.append({
            "identifier_scheme": "Website",
            "identifier_value": website,
            "identifier_url": website,
        })
    institution["identifiers"] = identifiers

    # Locations. 0.0 is a valid coordinate (equator / prime meridian), so
    # compare against None instead of relying on truthiness.
    if city or street_address or latitude is not None or longitude is not None:
        location = {}
        if city:
            location["city"] = city
        if street_address:
            location["street_address"] = street_address
        if latitude is not None:
            location["latitude"] = float(latitude)
        if longitude is not None:
            location["longitude"] = float(longitude)
        location["country"] = country_code
        location["is_primary"] = True
        institution["locations"] = [location]

    # Contact info (only emitted when at least one channel exists).
    if email or phone:
        contact_info = {}
        if email:
            contact_info["email"] = email
        if phone:
            contact_info["phone"] = phone
        contact_info["contact_type"] = "general"
        institution["contact_info"] = contact_info

    # Provenance metadata (REQUIRED by the schema).
    institution["provenance"] = {
        "data_source": "WIKIDATA",
        "data_tier": "TIER_3_CROWD_SOURCED",
        "extraction_date": extraction_date,
        "extraction_method": "Wikidata SPARQL extraction via extract_global_wikidata.py, converted by convert_wikidata_to_linkml.py",
        "confidence_score": 0.85,  # Wikidata is crowd-sourced but generally reliable
    }
    return institution
# =============================================================================
# BATCH CONVERSION
# =============================================================================
def convert_wikidata_file(
    json_path: Path,
    output_dir: Path,
    skip_incomplete: bool = False,
    dry_run: bool = False,
) -> Dict[str, Any]:
    """
    Convert a Wikidata JSON file to LinkML YAML.

    Args:
        json_path: Path to Wikidata JSON file
        output_dir: Output directory for YAML files
        skip_incomplete: Skip institutions with missing critical data
        dry_run: Don't write files, just show statistics

    Returns:
        Dictionary with conversion statistics (country_code, country_name,
        total_wikidata, converted, skipped, excluded, errors)
    """
    print(f"\n{'='*80}")
    print(f"📂 Processing: {json_path.name}")
    print(f"{'='*80}\n")
    # Load Wikidata JSON
    with open(json_path, 'r', encoding='utf-8') as f:
        wikidata_data = json.load(f)
    # File-level metadata; extraction_date falls back to "now" in UTC.
    country_code = wikidata_data.get("country_code", "XX")
    country_name = wikidata_data.get("country_name", "Unknown")
    extraction_date = wikidata_data.get("extraction_date", datetime.now(timezone.utc).isoformat())
    wikidata_institutions = wikidata_data.get("institutions", [])
    print(f"🌍 Country: {country_name} ({country_code})")
    print(f"📅 Extraction Date: {extraction_date}")
    print(f"🏛️ Total Wikidata Institutions: {len(wikidata_institutions)}")
    # Convert institutions one by one, tracking outcomes per category.
    converted_institutions = []
    skipped_count = 0
    excluded_count = 0  # Non-heritage types (generic organizations)
    error_count = 0
    for wikidata_inst in wikidata_institutions:
        try:
            institution = convert_wikidata_institution(
                wikidata_inst,
                country_code,
                extraction_date,
            )
            if institution is None:
                # Re-map the type to distinguish "excluded by type filter"
                # from "skipped for missing data" in the statistics.
                wikidata_type = wikidata_inst.get("institution_type", "")
                mapped_type = map_institution_type(wikidata_type)
                if mapped_type is None:
                    excluded_count += 1
                else:
                    skipped_count += 1
                continue
            # Skip incomplete records if requested
            if skip_incomplete:
                # "Has a name" means the name differs from the first identifier
                # value (the QID fallback used when Wikidata had no label).
                has_name = institution.get("name") and institution["name"] != institution.get("identifiers", [{}])[0].get("identifier_value")
                has_location = bool(institution.get("locations"))
                if not (has_name or has_location):
                    skipped_count += 1
                    continue
            converted_institutions.append(institution)
        except Exception as e:
            # Keep converting the remaining records; just report the failure.
            error_count += 1
            qid = wikidata_inst.get("wikidata_qid", "unknown")
            print(f"❌ Error converting {qid}: {e}")
    # Statistics
    stats = {
        "country_code": country_code,
        "country_name": country_name,
        "total_wikidata": len(wikidata_institutions),
        "converted": len(converted_institutions),
        "skipped": skipped_count,
        "excluded": excluded_count,
        "errors": error_count,
    }
    print(f"\n📊 Conversion Results:")
    print(f" ✅ Converted: {stats['converted']}")
    print(f" ⏭️ Skipped (incomplete data): {stats['skipped']}")
    print(f" 🚫 Excluded (non-heritage types): {stats['excluded']}")
    print(f" ❌ Errors: {stats['errors']}")
    # Write YAML file (skipped on --dry-run or when nothing was converted)
    if not dry_run and converted_institutions:
        timestamp = json_path.stem  # Use same timestamp as input file
        output_file = output_dir / f"wikidata_{country_code.lower()}_{timestamp}.yaml"
        output_dir.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            # Write YAML header (comments + document start marker)
            f.write(f"# Wikidata Heritage Institutions - {country_name}\n")
            f.write(f"# Extracted: {extraction_date}\n")
            f.write(f"# Converted: {datetime.now(timezone.utc).isoformat()}\n")
            f.write(f"# Total institutions: {len(converted_institutions)}\n")
            f.write(f"# Schema: Heritage Custodian v0.2.2 (LinkML modular schema)\n")
            f.write(f"---\n")
            # Write institutions as YAML list
            yaml.dump(
                converted_institutions,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=100,
            )
        print(f"\n💾 Output: {output_file}")
        print(f" Size: {output_file.stat().st_size / 1024:.1f} KB")
    return stats
def convert_all_countries(
    wikidata_dir: Path,
    output_dir: Path,
    skip_incomplete: bool = False,
    dry_run: bool = False,
) -> None:
    """
    Convert all Wikidata JSON files to LinkML YAML.

    Args:
        wikidata_dir: Directory containing Wikidata JSON files (organized by country)
        output_dir: Output directory for YAML files
        skip_incomplete: Skip institutions with missing critical data
        dry_run: Don't write files, just show statistics
    """
    # Find all JSON files (recursive search in country subdirectories)
    json_files = sorted(wikidata_dir.glob("**/*.json"))
    if not json_files:
        print(f"❌ No Wikidata JSON files found in {wikidata_dir}")
        return
    print(f"\n🌍 Found {len(json_files)} Wikidata JSON files")
    # Convert each file
    all_stats = []
    for json_file in json_files:
        stats = convert_wikidata_file(
            json_file,
            output_dir,
            skip_incomplete=skip_incomplete,
            dry_run=dry_run,
        )
        all_stats.append(stats)

    def _pct(part: int, whole: int) -> float:
        """Percentage of part in whole; 0.0 when whole is 0 (empty files)."""
        return part / whole * 100 if whole else 0.0

    # Global statistics
    print(f"\n{'='*80}")
    print(f"🌍 GLOBAL CONVERSION SUMMARY")
    print(f"{'='*80}\n")
    total_wikidata = sum(s["total_wikidata"] for s in all_stats)
    total_converted = sum(s["converted"] for s in all_stats)
    total_skipped = sum(s["skipped"] for s in all_stats)
    total_errors = sum(s["errors"] for s in all_stats)
    print(f"📊 Total Institutions:")
    print(f" 🌍 Wikidata: {total_wikidata}")
    print(f" ✅ Converted: {total_converted} ({_pct(total_converted, total_wikidata):.1f}%)")
    print(f" ⏭️ Skipped: {total_skipped} ({_pct(total_skipped, total_wikidata):.1f}%)")
    print(f" ❌ Errors: {total_errors}")
    # Per-country breakdown (guarded against zero-institution files, which
    # previously raised ZeroDivisionError here)
    print(f"\n📍 Per-Country Breakdown:")
    for stats in all_stats:
        print(f" {stats['country_name']:20s} ({stats['country_code']}): "
              f"{stats['converted']:4d} / {stats['total_wikidata']:4d} "
              f"({_pct(stats['converted'], stats['total_wikidata']):5.1f}%)")
# =============================================================================
# CLI
# =============================================================================
def _build_cli_parser() -> argparse.ArgumentParser:
    """Construct the command-line parser for the converter CLI."""
    parser = argparse.ArgumentParser(
        description="Convert Wikidata SPARQL extractions to LinkML YAML instances",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--country",
        type=str,
        help="Country code to convert (e.g., NL, CL, BE)",
    )
    parser.add_argument(
        "--timestamp",
        type=str,
        help="Specific timestamp file to convert (e.g., 20251111_105038)",
    )
    parser.add_argument(
        "--all-countries",
        action="store_true",
        help="Convert all Wikidata JSON files (all countries)",
    )
    parser.add_argument(
        "--skip-incomplete",
        action="store_true",
        help="Skip institutions with missing critical data (name, location)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show statistics without writing files",
    )
    parser.add_argument(
        "--wikidata-dir",
        type=Path,
        default=Path("data/wikidata"),
        help="Directory containing Wikidata JSON files (default: data/wikidata)",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("data/instances"),
        help="Output directory for LinkML YAML files (default: data/instances)",
    )
    return parser


def main():
    """CLI entry point: dispatch to single-country or all-country conversion."""
    cli = _build_cli_parser()
    opts = cli.parse_args()

    # Resolve input/output locations to absolute paths.
    wikidata_dir = Path(opts.wikidata_dir).resolve()
    output_dir = Path(opts.output_dir).resolve()
    if not wikidata_dir.exists():
        print(f"❌ Wikidata directory not found: {wikidata_dir}")
        sys.exit(1)

    # Mode 1: convert every country's extraction files.
    if opts.all_countries:
        convert_all_countries(
            wikidata_dir,
            output_dir,
            skip_incomplete=opts.skip_incomplete,
            dry_run=opts.dry_run,
        )
        return

    # Mode 2: convert one country (a specific timestamp, or the latest file).
    if opts.country:
        country_code = opts.country.upper()
        country_dir = wikidata_dir / country_code.lower()
        if not country_dir.exists():
            print(f"❌ Country directory not found: {country_dir}")
            sys.exit(1)

        if opts.timestamp:
            candidate = country_dir / f"{opts.timestamp}.json"
            if not candidate.exists():
                print(f"❌ JSON file not found: {candidate}")
                sys.exit(1)
            selected_files = [candidate]
        else:
            # Sorted newest-first by filename timestamp; keep only the latest.
            found = sorted(country_dir.glob("*.json"), reverse=True)
            if not found:
                print(f"❌ No JSON files found in {country_dir}")
                sys.exit(1)
            selected_files = found[:1]

        for json_file in selected_files:
            convert_wikidata_file(
                json_file,
                output_dir,
                skip_incomplete=opts.skip_incomplete,
                dry_run=opts.dry_run,
            )
        return

    # No mode selected: show usage.
    cli.print_help()


if __name__ == "__main__":
    main()