- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
760 lines · 25 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Wikidata JSON to LinkML YAML Converter
|
|
|
|
This script transforms raw Wikidata SPARQL extraction results into LinkML-compliant
|
|
YAML instance files conforming to the Heritage Custodian schema v0.2.2.
|
|
|
|
Input: data/wikidata/{country_code}/{timestamp}.json
|
|
Output: data/instances/wikidata_{country_code}_{timestamp}.yaml
|
|
|
|
Features:
|
|
- Maps Wikidata fields → LinkML schema (core.yaml, enums.yaml, provenance.yaml)
|
|
- Generates GHCIDs (Global Heritage Custodian Identifiers) with collision detection
|
|
- Generates UUIDs (v5, v7, v8) for persistent identification
|
|
- Enriches data with provenance metadata (TIER_3_CROWD_SOURCED)
|
|
- Validates institution types against InstitutionTypeEnum
|
|
- Geocodes addresses to lat/lon (if missing)
|
|
- Cross-references with existing LinkML instances for deduplication
|
|
|
|
Wikidata → LinkML Field Mapping:
|
|
- wikidata_qid → identifiers[scheme=Wikidata]
|
|
- name → name (fallback to QID if empty)
|
|
- description → description
|
|
- institution_type → InstitutionTypeEnum mapping
|
|
- location{latitude, longitude, city, street_address} → Location class
|
|
- identifiers{website, isil, viaf, email, phone} → Identifier + ContactInfo
|
|
- temporal{inception, founding_date} → founded_date
|
|
- organizational{parent_org} → parent_organization_name
|
|
- collection{size, subject} → Collection class
|
|
- media{image, logo} → future enhancement
|
|
|
|
Usage:
|
|
# Convert single country extraction
|
|
python convert_wikidata_to_linkml.py --country NL --timestamp 20251111_105038
|
|
|
|
# Convert all extractions for a country (latest by default)
|
|
python convert_wikidata_to_linkml.py --country NL
|
|
|
|
# Convert all countries (all JSON files)
|
|
python convert_wikidata_to_linkml.py --all-countries
|
|
|
|
# Dry run (show what would be converted)
|
|
python convert_wikidata_to_linkml.py --country NL --dry-run
|
|
|
|
# Skip institutions with missing critical data
|
|
python convert_wikidata_to_linkml.py --country NL --skip-incomplete
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import yaml
|
|
import argparse
|
|
from pathlib import Path
|
|
from typing import Any, Optional, Dict, List
|
|
from datetime import datetime, timezone
|
|
from collections import defaultdict
|
|
import re
|
|
|
|
# Make the project sources and sibling scripts importable when run from the repo.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
sys.path.insert(0, str(Path(__file__).parent))  # scripts dir holds wikidata_type_mapping

# The comprehensive Wikidata→LinkML type mapping is optional; degrade gracefully.
try:
    from wikidata_type_mapping import map_wikidata_type_to_linkml, is_valid_heritage_custodian
except ImportError:
    TYPE_MAPPING_AVAILABLE = False
    print("⚠️ Warning: wikidata_type_mapping module not available. Using basic type mapping.")
else:
    TYPE_MAPPING_AVAILABLE = True

# GHCID/UUID utilities are optional; a simplified generator is used when absent.
try:
    from glam_extractor.ghcid import (
        generate_ghcid,
        generate_ghcid_uuids,
        detect_ghcid_collision,
    )
except ImportError:
    GHCID_AVAILABLE = False
    print("⚠️ Warning: GHCID utilities not available. GHCIDs will be basic format.")
else:
    GHCID_AVAILABLE = True
|
|
|
|
|
|
# =============================================================================
|
|
# WIKIDATA INSTITUTION TYPE → LinkML InstitutionTypeEnum MAPPING
|
|
# =============================================================================
|
|
|
|
# Basic fallback lookup: lowercase Wikidata type label -> InstitutionTypeEnum value.
# Used only when the comprehensive wikidata_type_mapping module is unavailable.
# NOTE: insertion order matters — the fuzzy (substring) fallback in
# map_institution_type scans entries in this order.
WIKIDATA_TYPE_MAP = {
    # Museums
    **dict.fromkeys(
        ("museum", "kunstmuseum", "art museum", "science museum",
         "natural history museum", "history museum", "local museum"),
        "MUSEUM",
    ),
    # Libraries
    **dict.fromkeys(
        ("bibliotheek", "library", "openbare bibliotheek", "public library",
         "academic library", "university library", "national library"),
        "LIBRARY",
    ),
    # Archives
    **dict.fromkeys(
        ("archief", "archive", "gemeentearchief", "city archive", "national archive"),
        "ARCHIVE",
    ),
    # Galleries
    **dict.fromkeys(
        ("galerie", "gallery", "art gallery", "kunstgalerie"),
        "GALLERY",
    ),
    # Research Centers
    **dict.fromkeys(
        ("research center", "onderzoekscentrum", "research institute"),
        "RESEARCH_CENTER",
    ),
    # Universities
    **dict.fromkeys(
        ("university", "universiteit", "hogeschool"),
        "UNIVERSITY",
    ),
    # Botanical Gardens / Zoos
    **dict.fromkeys(
        ("botanical garden", "botanische tuin", "zoo", "dierentuin", "aquarium"),
        "BOTANICAL_ZOO",
    ),
    # Cultural Centers
    **dict.fromkeys(
        ("cultural center", "cultureel centrum"),
        "OFFICIAL_INSTITUTION",
    ),
    # Default fallback
    "cultural institution": "MIXED",
}
|
|
|
|
|
|
def map_institution_type(wikidata_type: str) -> Optional[str]:
    """
    Map a Wikidata institution type to a LinkML InstitutionTypeEnum value.

    Args:
        wikidata_type: Institution type from Wikidata (e.g., "museum", "bibliotheek")

    Returns:
        LinkML InstitutionTypeEnum value (e.g., "MUSEUM", "LIBRARY"), or None if excluded
    """
    # Delegate to the comprehensive mapping module whenever it was importable.
    if TYPE_MAPPING_AVAILABLE:
        return map_wikidata_type_to_linkml(wikidata_type)

    normalized = wikidata_type.lower().strip()

    # Exact match takes priority over the fuzzy fallback below.
    exact = WIKIDATA_TYPE_MAP.get(normalized)
    if exact is not None:
        return exact

    # Fuzzy match: first mapped keyword contained in the normalized label
    # (dict insertion order decides ties); otherwise default to MIXED.
    return next(
        (enum_value
         for keyword, enum_value in WIKIDATA_TYPE_MAP.items()
         if keyword in normalized),
        "MIXED",
    )
|
|
|
|
|
|
# =============================================================================
|
|
# GHCID GENERATION (Fallback if utilities unavailable)
|
|
# =============================================================================
|
|
|
|
def generate_basic_ghcid(
    country_code: str,
    city: Optional[str],
    institution_type: str,
    institution_name: str,
    wikidata_qid: Optional[str] = None,
) -> str:
    """
    Generate a basic GHCID without full validation.
    Fallback method if glam_extractor.ghcid module unavailable.

    Format: {Country}-{Region}-{City}-{Type}-{Abbreviation}[-Q{WikidataID}]
    Example: NL-NH-AMS-M-RM (Rijksmuseum Amsterdam)
    """
    # One-letter code per InstitutionTypeEnum value; "X" (MIXED) is the default.
    type_codes = {
        "MUSEUM": "M", "LIBRARY": "L", "ARCHIVE": "A", "GALLERY": "G",
        "OFFICIAL_INSTITUTION": "O", "RESEARCH_CENTER": "R", "CORPORATION": "C",
        "UNIVERSITY": "U", "BOTANICAL_ZOO": "B", "EDUCATION_PROVIDER": "E",
        "PERSONAL_COLLECTION": "P", "COLLECTING_SOCIETY": "S", "MIXED": "X",
        "HOLY_SITES": "H",
    }
    type_code = type_codes.get(institution_type, "X")

    # Abbreviation: the initial of each word of the name, capped at 5 letters.
    if institution_name and institution_name != "unknown":
        initials = re.findall(r'\b\w', institution_name.upper())
        abbreviation = ''.join(initials[:5])
    else:
        abbreviation = "UNK"

    # City code: first three A-Z characters, right-padded with 'X'.
    city_code = re.sub(r'[^A-Z]', '', city.upper())[:3].ljust(3, 'X') if city else "UNK"

    # Region mapping requires country-specific logic; "XX" is a placeholder.
    segments = [country_code, "XX", city_code, type_code, abbreviation]

    # Append the Wikidata Q-number so colliding base GHCIDs stay distinct.
    if wikidata_qid:
        segments.append(wikidata_qid)

    return "-".join(segments)
|
|
|
|
|
|
def generate_uuid_v5(ghcid: str) -> str:
    """Generate UUID v5 from GHCID string (SHA-1 deterministic)."""
    import uuid

    # Fixed namespace shared by every GHCID-derived UUID.
    # NOTE(review): this literal equals uuid.NAMESPACE_DNS — presumably
    # intentional, but confirm it was not meant to be a project-specific UUID.
    namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
    return str(uuid.uuid5(namespace, ghcid))
|
|
|
|
|
|
def generate_uuid_v7() -> str:
    """
    Generate a UUID v7 (time-ordered) for use as a database record ID.

    Returns:
        Canonical string form of a UUIDv7, or of a random UUIDv4 on Python
        versions without ``uuid.uuid7()`` (which was added in Python 3.14,
        not 3.11 as previously documented here).
    """
    import uuid

    # getattr avoids raising and catching AttributeError on every call.
    uuid7 = getattr(uuid, "uuid7", None)
    if uuid7 is not None:
        return str(uuid7())
    return str(uuid.uuid4())  # Fallback: still unique, but not time-ordered
|
|
|
|
|
|
def generate_uuid_v8_sha256(ghcid: str) -> str:
|
|
"""Generate custom UUID v8 from GHCID string (SHA-256 deterministic)."""
|
|
import uuid
|
|
import hashlib
|
|
|
|
# SHA-256 hash of GHCID
|
|
hash_bytes = hashlib.sha256(ghcid.encode('utf-8')).digest()
|
|
|
|
# Take first 16 bytes for UUID
|
|
uuid_bytes = bytearray(hash_bytes[:16])
|
|
|
|
# Set version bits (8) and variant bits (RFC 4122)
|
|
uuid_bytes[6] = (uuid_bytes[6] & 0x0F) | 0x80 # Version 8
|
|
uuid_bytes[8] = (uuid_bytes[8] & 0x3F) | 0x80 # Variant RFC 4122
|
|
|
|
return str(uuid.UUID(bytes=bytes(uuid_bytes)))
|
|
|
|
|
|
def generate_numeric_id(ghcid: str) -> int:
    """Generate 64-bit numeric ID from GHCID (SHA-256 hash)."""
    import hashlib

    # Interpret the leading 8 digest bytes as an unsigned big-endian integer.
    digest = hashlib.sha256(ghcid.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], byteorder='big')
|
|
|
|
|
|
# =============================================================================
|
|
# WIKIDATA INSTITUTION CONVERTER
|
|
# =============================================================================
|
|
|
|
def convert_wikidata_institution(
    wikidata_inst: Dict[str, Any],
    country_code: str,
    extraction_date: str,
) -> Optional[Dict[str, Any]]:
    """
    Convert a single Wikidata institution to LinkML format.

    Args:
        wikidata_inst: Wikidata institution dictionary from JSON
        country_code: ISO 3166-1 alpha-2 country code
        extraction_date: ISO 8601 timestamp of Wikidata extraction

    Returns:
        LinkML-compliant institution dictionary, or None if the record has no
        QID or its type is excluded (generic non-heritage organization).
    """
    qid = wikidata_inst.get("wikidata_qid", "")
    if not qid:
        return None  # Skip institutions without QID

    # Basic fields. An empty-string name must also fall back to the QID (the
    # module docs promise "fallback to QID if empty"), so use `or` rather than
    # dict.get's default, which only covers a missing key.
    name = wikidata_inst.get("name") or qid
    description = wikidata_inst.get("description", "").strip()
    wikidata_type = wikidata_inst.get("institution_type", "")

    # Map institution type; None means excluded (generic non-heritage type).
    institution_type = map_institution_type(wikidata_type)
    if institution_type is None:
        return None

    # Extract location data
    location_data = wikidata_inst.get("location", {})
    city = location_data.get("city", "")
    street_address = location_data.get("street_address", "")
    latitude = location_data.get("latitude")
    longitude = location_data.get("longitude")

    # Extract external identifiers and contact channels
    identifiers_data = wikidata_inst.get("identifiers", {})
    website = identifiers_data.get("website", "")
    isil_code = identifiers_data.get("isil", "")
    viaf_id = identifiers_data.get("viaf", "")
    email = identifiers_data.get("email", "")
    phone = identifiers_data.get("phone", "")

    # Founding date: prefer founding_date over inception; keep YYYY-MM-DD only.
    temporal_data = wikidata_inst.get("temporal", {})
    inception_str = temporal_data.get("inception", "")
    founding_str = temporal_data.get("founding_date", "")

    founded_date = None
    if founding_str:
        founded_date = founding_str.split("T")[0]
    elif inception_str:
        founded_date = inception_str.split("T")[0]

    # Generate GHCID: full generator (proper region/city codes) when available,
    # basic format otherwise. Failures in the full generator also fall back.
    if GHCID_AVAILABLE:
        try:
            ghcid = generate_ghcid(
                country_code=country_code,
                city=city or "Unknown",
                institution_type=institution_type,
                institution_name=name,
            )
        except Exception as e:
            print(f"⚠️ GHCID generation failed for {qid}: {e}")
            ghcid = generate_basic_ghcid(country_code, city, institution_type, name, qid)
    else:
        ghcid = generate_basic_ghcid(country_code, city, institution_type, name, qid)

    # Derived persistent identifiers (all deterministic except record_id).
    ghcid_uuid = generate_uuid_v5(ghcid)
    ghcid_uuid_sha256 = generate_uuid_v8_sha256(ghcid)
    record_id = generate_uuid_v7()
    ghcid_numeric = generate_numeric_id(ghcid)

    # Core LinkML institution record
    institution = {
        "id": f"https://w3id.org/heritage/custodian/wikidata/{qid}",
        "record_id": record_id,
        "ghcid_uuid": ghcid_uuid,
        "ghcid_uuid_sha256": ghcid_uuid_sha256,
        "ghcid_numeric": ghcid_numeric,
        "ghcid_current": ghcid,
        "ghcid_original": ghcid,
        "name": name,
        "institution_type": institution_type,
    }

    # Optional scalar fields
    if description:
        institution["description"] = description
    if founded_date:
        institution["founded_date"] = founded_date

    # Identifiers list; the Wikidata QID is always present and listed first.
    identifiers = [{
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
    }]

    if isil_code:
        identifiers.append({
            "identifier_scheme": "ISIL",
            "identifier_value": isil_code,
        })

    if viaf_id:
        identifiers.append({
            "identifier_scheme": "VIAF",
            "identifier_value": viaf_id,
            "identifier_url": f"https://viaf.org/viaf/{viaf_id}",
        })

    if website:
        identifiers.append({
            "identifier_scheme": "Website",
            "identifier_value": website,
            "identifier_url": website,
        })

    institution["identifiers"] = identifiers

    # Location: compare coordinates against None explicitly so that 0.0
    # (equator / prime meridian) is not dropped by truthiness.
    if city or street_address or latitude is not None or longitude is not None:
        location = {}

        if city:
            location["city"] = city
        if street_address:
            location["street_address"] = street_address
        if latitude is not None:
            location["latitude"] = float(latitude)
        if longitude is not None:
            location["longitude"] = float(longitude)

        location["country"] = country_code
        location["is_primary"] = True

        institution["locations"] = [location]

    # Contact info (only when at least one channel is present)
    if email or phone:
        contact_info = {}

        if email:
            contact_info["email"] = email
        if phone:
            contact_info["phone"] = phone

        contact_info["contact_type"] = "general"
        institution["contact_info"] = contact_info

    # Provenance metadata (REQUIRED by the schema)
    institution["provenance"] = {
        "data_source": "WIKIDATA",
        "data_tier": "TIER_3_CROWD_SOURCED",
        "extraction_date": extraction_date,
        "extraction_method": "Wikidata SPARQL extraction via extract_global_wikidata.py, converted by convert_wikidata_to_linkml.py",
        "confidence_score": 0.85,  # Wikidata is crowd-sourced but generally reliable
    }

    return institution
|
|
|
|
|
|
# =============================================================================
|
|
# BATCH CONVERSION
|
|
# =============================================================================
|
|
|
|
def convert_wikidata_file(
    json_path: Path,
    output_dir: Path,
    skip_incomplete: bool = False,
    dry_run: bool = False,
) -> Dict[str, Any]:
    """
    Convert a Wikidata JSON file to LinkML YAML.

    Args:
        json_path: Path to Wikidata JSON file
        output_dir: Output directory for YAML files
        skip_incomplete: Skip institutions with missing critical data
        dry_run: Don't write files, just show statistics

    Returns:
        Dictionary with conversion statistics
    """
    banner = "=" * 80
    print(f"\n{banner}")
    print(f"📂 Processing: {json_path.name}")
    print(f"{banner}\n")

    # Load the raw Wikidata extraction payload
    payload = json.loads(json_path.read_text(encoding='utf-8'))

    country_code = payload.get("country_code", "XX")
    country_name = payload.get("country_name", "Unknown")
    extraction_date = payload.get("extraction_date", datetime.now(timezone.utc).isoformat())
    raw_institutions = payload.get("institutions", [])

    print(f"🌍 Country: {country_name} ({country_code})")
    print(f"📅 Extraction Date: {extraction_date}")
    print(f"🏛️ Total Wikidata Institutions: {len(raw_institutions)}")

    converted = []
    skipped = 0
    excluded = 0  # generic organizations filtered out by type mapping
    errors = 0

    for raw in raw_institutions:
        try:
            record = convert_wikidata_institution(raw, country_code, extraction_date)

            if record is None:
                # Distinguish type-excluded records from otherwise-skipped ones.
                if map_institution_type(raw.get("institution_type", "")) is None:
                    excluded += 1
                else:
                    skipped += 1
                continue

            if skip_incomplete:
                # "Critical data" means a real name (not just the QID echoed
                # back as the name) or at least one location.
                first_id_value = record.get("identifiers", [{}])[0].get("identifier_value")
                named = record.get("name") and record["name"] != first_id_value
                located = bool(record.get("locations"))
                if not (named or located):
                    skipped += 1
                    continue

            converted.append(record)

        except Exception as e:
            errors += 1
            qid = raw.get("wikidata_qid", "unknown")
            print(f"❌ Error converting {qid}: {e}")

    stats = {
        "country_code": country_code,
        "country_name": country_name,
        "total_wikidata": len(raw_institutions),
        "converted": len(converted),
        "skipped": skipped,
        "excluded": excluded,
        "errors": errors,
    }

    print(f"\n📊 Conversion Results:")
    print(f" ✅ Converted: {stats['converted']}")
    print(f" ⏭️ Skipped (incomplete data): {stats['skipped']}")
    print(f" 🚫 Excluded (non-heritage types): {stats['excluded']}")
    print(f" ❌ Errors: {stats['errors']}")

    # Persist as YAML unless this is a dry run or nothing survived conversion.
    if not dry_run and converted:
        timestamp = json_path.stem  # mirror the input file's timestamp
        output_file = output_dir / f"wikidata_{country_code.lower()}_{timestamp}.yaml"

        output_dir.mkdir(parents=True, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as f:
            # Human-readable header, then the institutions as a YAML list.
            f.write(f"# Wikidata Heritage Institutions - {country_name}\n")
            f.write(f"# Extracted: {extraction_date}\n")
            f.write(f"# Converted: {datetime.now(timezone.utc).isoformat()}\n")
            f.write(f"# Total institutions: {len(converted)}\n")
            f.write(f"# Schema: Heritage Custodian v0.2.2 (LinkML modular schema)\n")
            f.write(f"---\n")

            yaml.dump(
                converted,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=100,
            )

        print(f"\n💾 Output: {output_file}")
        print(f" Size: {output_file.stat().st_size / 1024:.1f} KB")

    return stats
|
|
|
|
|
|
def convert_all_countries(
    wikidata_dir: Path,
    output_dir: Path,
    skip_incomplete: bool = False,
    dry_run: bool = False,
) -> None:
    """
    Convert all Wikidata JSON files to LinkML YAML.

    Args:
        wikidata_dir: Directory containing Wikidata JSON files (organized by country)
        output_dir: Output directory for YAML files
        skip_incomplete: Skip institutions with missing critical data
        dry_run: Don't write files, just show statistics
    """
    # Find all JSON files (recursive search in country subdirectories)
    json_files = sorted(wikidata_dir.glob("**/*.json"))

    if not json_files:
        print(f"❌ No Wikidata JSON files found in {wikidata_dir}")
        return

    print(f"\n🌍 Found {len(json_files)} Wikidata JSON files")

    # Convert each file, collecting per-file statistics
    all_stats = []
    for json_file in json_files:
        stats = convert_wikidata_file(
            json_file,
            output_dir,
            skip_incomplete=skip_incomplete,
            dry_run=dry_run,
        )
        all_stats.append(stats)

    # Global statistics
    print(f"\n{'='*80}")
    print(f"🌍 GLOBAL CONVERSION SUMMARY")
    print(f"{'='*80}\n")

    total_wikidata = sum(s["total_wikidata"] for s in all_stats)
    total_converted = sum(s["converted"] for s in all_stats)
    total_skipped = sum(s["skipped"] for s in all_stats)
    total_errors = sum(s["errors"] for s in all_stats)

    def _pct(part: int, whole: int) -> float:
        # Guard against empty extractions (whole == 0): previously this
        # raised ZeroDivisionError when a file reported zero institutions.
        return part / whole * 100 if whole else 0.0

    print(f"📊 Total Institutions:")
    print(f" 🌍 Wikidata: {total_wikidata}")
    print(f" ✅ Converted: {total_converted} ({_pct(total_converted, total_wikidata):.1f}%)")
    print(f" ⏭️ Skipped: {total_skipped} ({_pct(total_skipped, total_wikidata):.1f}%)")
    print(f" ❌ Errors: {total_errors}")

    # Per-country breakdown
    print(f"\n📍 Per-Country Breakdown:")
    for stats in all_stats:
        print(f" {stats['country_name']:20s} ({stats['country_code']}): "
              f"{stats['converted']:4d} / {stats['total_wikidata']:4d} "
              f"({_pct(stats['converted'], stats['total_wikidata']):5.1f}%)")
|
|
|
|
|
|
# =============================================================================
|
|
# CLI
|
|
# =============================================================================
|
|
|
|
def main():
    """CLI entry point: parse arguments and dispatch the requested conversion."""
    parser = argparse.ArgumentParser(
        description="Convert Wikidata SPARQL extractions to LinkML YAML instances",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--country", type=str,
                        help="Country code to convert (e.g., NL, CL, BE)")
    parser.add_argument("--timestamp", type=str,
                        help="Specific timestamp file to convert (e.g., 20251111_105038)")
    parser.add_argument("--all-countries", action="store_true",
                        help="Convert all Wikidata JSON files (all countries)")
    parser.add_argument("--skip-incomplete", action="store_true",
                        help="Skip institutions with missing critical data (name, location)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show statistics without writing files")
    parser.add_argument("--wikidata-dir", type=Path, default=Path("data/wikidata"),
                        help="Directory containing Wikidata JSON files (default: data/wikidata)")
    parser.add_argument("--output-dir", type=Path, default=Path("data/instances"),
                        help="Output directory for LinkML YAML files (default: data/instances)")
    args = parser.parse_args()

    # Resolve paths up front so error messages show absolute locations.
    wikidata_dir = Path(args.wikidata_dir).resolve()
    output_dir = Path(args.output_dir).resolve()

    if not wikidata_dir.exists():
        print(f"❌ Wikidata directory not found: {wikidata_dir}")
        sys.exit(1)

    # Batch mode: every JSON file under every country subdirectory.
    if args.all_countries:
        convert_all_countries(
            wikidata_dir,
            output_dir,
            skip_incomplete=args.skip_incomplete,
            dry_run=args.dry_run,
        )
        return

    # Single-country mode
    if args.country:
        country_code = args.country.upper()
        country_dir = wikidata_dir / country_code.lower()

        if not country_dir.exists():
            print(f"❌ Country directory not found: {country_dir}")
            sys.exit(1)

        if args.timestamp:
            # A specific extraction file was requested.
            candidate = country_dir / f"{args.timestamp}.json"
            if not candidate.exists():
                print(f"❌ JSON file not found: {candidate}")
                sys.exit(1)
            targets = [candidate]
        else:
            # Default to the most recent extraction; timestamped filenames
            # sort chronologically, so reverse order puts the latest first.
            found = sorted(country_dir.glob("*.json"), reverse=True)
            if not found:
                print(f"❌ No JSON files found in {country_dir}")
                sys.exit(1)
            targets = [found[0]]

        for target in targets:
            convert_wikidata_file(
                target,
                output_dir,
                skip_incomplete=args.skip_incomplete,
                dry_run=args.dry_run,
            )
        return

    # Neither --all-countries nor --country was given: show usage.
    parser.print_help()


if __name__ == "__main__":
    main()
|