#!/usr/bin/env python3
"""
Wikidata JSON to LinkML YAML Converter

This script transforms raw Wikidata SPARQL extraction results into LinkML-compliant
YAML instance files conforming to the Heritage Custodian schema v0.2.2.

Input:  data/wikidata/{country_code}/{timestamp}.json
Output: data/instances/wikidata_{country_code}_{timestamp}.yaml

Features:
- Maps Wikidata fields → LinkML schema (core.yaml, enums.yaml, provenance.yaml)
- Generates GHCIDs (Global Heritage Custodian Identifiers) with collision detection
- Generates UUIDs (v5, v7, v8) for persistent identification
- Enriches data with provenance metadata (TIER_3_CROWD_SOURCED)
- Validates institution types against InstitutionTypeEnum
- Geocodes addresses to lat/lon (if missing)
- Cross-references with existing LinkML instances for deduplication

Wikidata → LinkML Field Mapping:
- wikidata_qid → identifiers[scheme=Wikidata]
- name → name (fallback to QID if empty)
- description → description
- institution_type → InstitutionTypeEnum mapping
- location{latitude, longitude, city, street_address} → Location class
- identifiers{website, isil, viaf, email, phone} → Identifier + ContactInfo
- temporal{inception, founding_date} → founded_date
- organizational{parent_org} → parent_organization_name
- collection{size, subject} → Collection class
- media{image, logo} → future enhancement

Usage:
    # Convert a single country extraction
    python convert_wikidata_to_linkml.py --country NL --timestamp 20251111_105038

    # Convert the latest extraction for a country
    python convert_wikidata_to_linkml.py --country NL

    # Convert all countries (all JSON files)
    python convert_wikidata_to_linkml.py --all-countries

    # Dry run (show what would be converted)
    python convert_wikidata_to_linkml.py --country NL --dry-run

    # Skip institutions with missing critical data
    python convert_wikidata_to_linkml.py --country NL --skip-incomplete
"""

import sys
import json
import yaml
import argparse
from pathlib import Path
from typing import Any, Optional, Dict, List
from datetime import datetime, timezone
from collections import defaultdict
import re

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
sys.path.insert(0, str(Path(__file__).parent))  # Add scripts directory for wikidata_type_mapping

# Import comprehensive Wikidata type mapping
try:
    from wikidata_type_mapping import map_wikidata_type_to_linkml, is_valid_heritage_custodian
    TYPE_MAPPING_AVAILABLE = True
except ImportError:
    TYPE_MAPPING_AVAILABLE = False
    print("⚠️ Warning: wikidata_type_mapping module not available. Using basic type mapping.")

# Optional: Import GHCID and UUID generation utilities if available
try:
    from glam_extractor.ghcid import (
        generate_ghcid,
        generate_ghcid_uuids,
        detect_ghcid_collision,
    )
    GHCID_AVAILABLE = True
except ImportError:
    GHCID_AVAILABLE = False
    print("⚠️ Warning: GHCID utilities not available. GHCIDs will be basic format.")
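
# Illustrative input sketch (assumption: keys inferred from the field accesses in
# convert_wikidata_institution() below; the authoritative shape is whatever
# extract_global_wikidata.py emits, and all values here are examples only).
# A single record in the input JSON's "institutions" list looks roughly like:
#
#   {
#       "wikidata_qid": "Q190804",
#       "name": "Rijksmuseum",
#       "description": "national museum of the Netherlands",
#       "institution_type": "art museum",
#       "location": {"city": "Amsterdam", "latitude": 52.36, "longitude": 4.885},
#       "identifiers": {"website": "https://www.rijksmuseum.nl", "isil": "...", "viaf": "..."},
#       "temporal": {"inception": "1800-01-01T00:00:00Z"},
#   }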


# =============================================================================
# WIKIDATA INSTITUTION TYPE → LinkML InstitutionTypeEnum MAPPING
# =============================================================================

WIKIDATA_TYPE_MAP = {
    # Museums
    "museum": "MUSEUM",
    "kunstmuseum": "MUSEUM",
    "art museum": "MUSEUM",
    "science museum": "MUSEUM",
    "natural history museum": "MUSEUM",
    "history museum": "MUSEUM",
    "local museum": "MUSEUM",
    # Libraries
    "bibliotheek": "LIBRARY",
    "library": "LIBRARY",
    "openbare bibliotheek": "LIBRARY",
    "public library": "LIBRARY",
    "academic library": "LIBRARY",
    "university library": "LIBRARY",
    "national library": "LIBRARY",
    # Archives
    "archief": "ARCHIVE",
    "archive": "ARCHIVE",
    "gemeentearchief": "ARCHIVE",
    "city archive": "ARCHIVE",
    "national archive": "ARCHIVE",
    # Galleries
    "galerie": "GALLERY",
    "gallery": "GALLERY",
    "art gallery": "GALLERY",
    "kunstgalerie": "GALLERY",
    # Research Centers
    "research center": "RESEARCH_CENTER",
    "onderzoekscentrum": "RESEARCH_CENTER",
    "research institute": "RESEARCH_CENTER",
    # Universities
    "university": "UNIVERSITY",
    "universiteit": "UNIVERSITY",
    "hogeschool": "UNIVERSITY",
    # Botanical Gardens / Zoos
    "botanical garden": "BOTANICAL_ZOO",
    "botanische tuin": "BOTANICAL_ZOO",
    "zoo": "BOTANICAL_ZOO",
    "dierentuin": "BOTANICAL_ZOO",
    "aquarium": "BOTANICAL_ZOO",
    # Cultural Centers
    "cultural center": "OFFICIAL_INSTITUTION",
    "cultureel centrum": "OFFICIAL_INSTITUTION",
    # Default fallback
    "cultural institution": "MIXED",
}


def map_institution_type(wikidata_type: str) -> Optional[str]:
    """
    Map Wikidata institution type to LinkML InstitutionTypeEnum.

    Args:
        wikidata_type: Institution type from Wikidata (e.g., "museum", "bibliotheek")

    Returns:
        LinkML InstitutionTypeEnum value (e.g., "MUSEUM", "LIBRARY"), or None if excluded
    """
    # Use comprehensive mapping if available
    if TYPE_MAPPING_AVAILABLE:
        return map_wikidata_type_to_linkml(wikidata_type)

    # Fallback to basic mapping
    wikidata_type_lower = wikidata_type.lower().strip()

    # Exact match
    if wikidata_type_lower in WIKIDATA_TYPE_MAP:
        return WIKIDATA_TYPE_MAP[wikidata_type_lower]

    # Fuzzy match (contains keyword)
    for key, value in WIKIDATA_TYPE_MAP.items():
        if key in wikidata_type_lower:
            return value

    # Fallback to MIXED if unknown
    return "MIXED"
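
# Fallback-mapping behaviour, sketched (only applies when wikidata_type_mapping
# cannot be imported; the comprehensive mapper may return different values or None):
#
#   map_institution_type("Kunstmuseum")               -> "MUSEUM"   (exact match, lowercased)
#   map_institution_type("municipal public library")  -> "LIBRARY"  (fuzzy "library" match)
#   map_institution_type("heritage railway")          -> "MIXED"    (no keyword match)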


# =============================================================================
# GHCID GENERATION (Fallback if utilities unavailable)
# =============================================================================

def generate_basic_ghcid(
    country_code: str,
    city: Optional[str],
    institution_type: str,
    institution_name: str,
    wikidata_qid: Optional[str] = None,
) -> str:
    """
    Generate a basic GHCID without full validation.

    Fallback method if the glam_extractor.ghcid module is unavailable.

    Format:  {Country}-{Region}-{City}-{Type}-{Abbreviation}[-Q{WikidataID}]
    Example: NL-NH-AMS-M-RM (Rijksmuseum Amsterdam)
    """
    # Type code mapping
    TYPE_CODES = {
        "MUSEUM": "M",
        "LIBRARY": "L",
        "ARCHIVE": "A",
        "GALLERY": "G",
        "OFFICIAL_INSTITUTION": "O",
        "RESEARCH_CENTER": "R",
        "CORPORATION": "C",
        "UNIVERSITY": "U",
        "BOTANICAL_ZOO": "B",
        "EDUCATION_PROVIDER": "E",
        "PERSONAL_COLLECTION": "P",
        "COLLECTING_SOCIETY": "S",
        "MIXED": "X",
        "HOLY_SITES": "H",
    }
    type_code = TYPE_CODES.get(institution_type, "X")

    # Extract abbreviation from name (first letters of each word)
    if institution_name and institution_name != "unknown":
        words = re.findall(r'\b\w', institution_name.upper())
        abbreviation = ''.join(words[:5])  # Max 5 letters
    else:
        abbreviation = "UNK"

    # City code (first 3 letters uppercase)
    if city:
        city_code = re.sub(r'[^A-Z]', '', city.upper())[:3].ljust(3, 'X')
    else:
        city_code = "UNK"

    # Region code (use "XX" as placeholder - proper mapping requires country-specific logic)
    region_code = "XX"

    # Base GHCID
    ghcid = f"{country_code}-{region_code}-{city_code}-{type_code}-{abbreviation}"

    # Add Wikidata Q-number for collision resolution
    if wikidata_qid:
        ghcid += f"-{wikidata_qid}"

    return ghcid


def generate_uuid_v5(ghcid: str) -> str:
    """Generate UUID v5 from GHCID string (SHA-1, deterministic)."""
    import uuid

    # GHCID namespace UUID (fixed for all GHCIDs; same value as uuid.NAMESPACE_DNS)
    GHCID_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
    return str(uuid.uuid5(GHCID_NAMESPACE, ghcid))


def generate_uuid_v7() -> str:
    """Generate UUID v7 (time-ordered, random) for database record ID."""
    import uuid

    # Python 3.14+ has uuid.uuid7(); fall back to uuid4() on older versions
    try:
        return str(uuid.uuid7())
    except AttributeError:
        return str(uuid.uuid4())  # Fallback to v4 if v7 not available


def generate_uuid_v8_sha256(ghcid: str) -> str:
    """Generate custom UUID v8 from GHCID string (SHA-256, deterministic)."""
    import uuid
    import hashlib

    # SHA-256 hash of GHCID
    hash_bytes = hashlib.sha256(ghcid.encode('utf-8')).digest()

    # Take first 16 bytes for UUID
    uuid_bytes = bytearray(hash_bytes[:16])

    # Set version bits (8) and variant bits (RFC 4122)
    uuid_bytes[6] = (uuid_bytes[6] & 0x0F) | 0x80  # Version 8
    uuid_bytes[8] = (uuid_bytes[8] & 0x3F) | 0x80  # Variant RFC 4122

    return str(uuid.UUID(bytes=bytes(uuid_bytes)))


def generate_numeric_id(ghcid: str) -> int:
    """Generate 64-bit numeric ID from GHCID (SHA-256 hash)."""
    import hashlib

    hash_bytes = hashlib.sha256(ghcid.encode('utf-8')).digest()
    # Convert first 8 bytes to 64-bit unsigned integer
    return int.from_bytes(hash_bytes[:8], byteorder='big')
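
# Sanity sketch (illustrative only, not called by the pipeline): identifiers
# derived from the GHCID are deterministic, while the record ID is not.
#
#   ghcid = "NL-XX-AMS-M-RA-Q190804"                                   # hypothetical GHCID
#   generate_uuid_v5(ghcid) == generate_uuid_v5(ghcid)                 # True
#   generate_uuid_v8_sha256(ghcid) == generate_uuid_v8_sha256(ghcid)   # True
#   generate_numeric_id(ghcid) == generate_numeric_id(ghcid)           # True
#   generate_uuid_v7() == generate_uuid_v7()                           # False (time-ordered/random)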


# =============================================================================
# WIKIDATA INSTITUTION CONVERTER
# =============================================================================

def convert_wikidata_institution(
    wikidata_inst: Dict[str, Any],
    country_code: str,
    extraction_date: str,
) -> Optional[Dict[str, Any]]:
    """
    Convert a single Wikidata institution to LinkML format.

    Args:
        wikidata_inst: Wikidata institution dictionary from JSON
        country_code: ISO 3166-1 alpha-2 country code
        extraction_date: ISO 8601 timestamp of Wikidata extraction

    Returns:
        LinkML-compliant institution dictionary, or None if invalid
    """
    qid = wikidata_inst.get("wikidata_qid", "")
    if not qid:
        return None  # Skip institutions without QID

    # Extract basic fields
    name = wikidata_inst.get("name", qid)  # Fallback to QID if name missing
    description = wikidata_inst.get("description", "").strip()
    wikidata_type = wikidata_inst.get("institution_type", "")

    # Map institution type
    institution_type = map_institution_type(wikidata_type)

    # Skip if type is excluded (generic non-heritage types)
    if institution_type is None:
        return None

    # Extract location data
    location_data = wikidata_inst.get("location", {})
    city = location_data.get("city", "")
    street_address = location_data.get("street_address", "")
    latitude = location_data.get("latitude")
    longitude = location_data.get("longitude")

    # Extract identifiers
    identifiers_data = wikidata_inst.get("identifiers", {})
    website = identifiers_data.get("website", "")
    isil_code = identifiers_data.get("isil", "")
    viaf_id = identifiers_data.get("viaf", "")
    email = identifiers_data.get("email", "")
    phone = identifiers_data.get("phone", "")

    # Extract temporal data
    temporal_data = wikidata_inst.get("temporal", {})
    inception_str = temporal_data.get("inception", "")
    founding_str = temporal_data.get("founding_date", "")

    # Parse founding date (prefer founding_date over inception)
    founded_date = None
    if founding_str:
        founded_date = founding_str.split("T")[0]  # Extract YYYY-MM-DD
    elif inception_str:
        founded_date = inception_str.split("T")[0]

    # Generate GHCID
    if GHCID_AVAILABLE:
        # Use full GHCID generation with proper region/city codes
        try:
            ghcid = generate_ghcid(
                country_code=country_code,
                city=city or "Unknown",
                institution_type=institution_type,
                institution_name=name,
            )
        except Exception as e:
            print(f"⚠️ GHCID generation failed for {qid}: {e}")
            ghcid = generate_basic_ghcid(country_code, city, institution_type, name, qid)
    else:
        # Fallback to basic GHCID
        ghcid = generate_basic_ghcid(country_code, city, institution_type, name, qid)

    # Generate UUIDs
    ghcid_uuid = generate_uuid_v5(ghcid)
    ghcid_uuid_sha256 = generate_uuid_v8_sha256(ghcid)
    record_id = generate_uuid_v7()
    ghcid_numeric = generate_numeric_id(ghcid)

    # Build LinkML institution record
    institution = {
        "id": f"https://w3id.org/heritage/custodian/wikidata/{qid}",
        "record_id": record_id,
        "ghcid_uuid": ghcid_uuid,
        "ghcid_uuid_sha256": ghcid_uuid_sha256,
        "ghcid_numeric": ghcid_numeric,
        "ghcid_current": ghcid,
        "ghcid_original": ghcid,
        "name": name,
        "institution_type": institution_type,
    }

    # Add optional fields
    if description:
        institution["description"] = description
    if founded_date:
        institution["founded_date"] = founded_date

    # Build identifiers list
    identifiers = []

    # Always add Wikidata QID
    identifiers.append({
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
    })

    if isil_code:
        identifiers.append({
            "identifier_scheme": "ISIL",
            "identifier_value": isil_code,
        })

    if viaf_id:
        identifiers.append({
            "identifier_scheme": "VIAF",
            "identifier_value": viaf_id,
            "identifier_url": f"https://viaf.org/viaf/{viaf_id}",
        })

    if website:
        identifiers.append({
            "identifier_scheme": "Website",
            "identifier_value": website,
            "identifier_url": website,
        })

    institution["identifiers"] = identifiers
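
    # Serialization sketch (assumption: values are examples only): each identifier
    # entry built above ends up in the output YAML roughly as
    #   - identifier_scheme: Wikidata
    #     identifier_value: Q190804
    #     identifier_url: https://www.wikidata.org/wiki/Q190804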
location["city"] = city if street_address: location["street_address"] = street_address if latitude is not None: location["latitude"] = float(latitude) if longitude is not None: location["longitude"] = float(longitude) location["country"] = country_code location["is_primary"] = True institution["locations"] = [location] # Build contact_info if email or phone: contact_info = {} if email: contact_info["email"] = email if phone: contact_info["phone"] = phone contact_info["contact_type"] = "general" institution["contact_info"] = contact_info # Build provenance metadata (REQUIRED) institution["provenance"] = { "data_source": "WIKIDATA", "data_tier": "TIER_3_CROWD_SOURCED", "extraction_date": extraction_date, "extraction_method": f"Wikidata SPARQL extraction via extract_global_wikidata.py, converted by convert_wikidata_to_linkml.py", "confidence_score": 0.85, # Wikidata is crowd-sourced but generally reliable } return institution # ============================================================================= # BATCH CONVERSION # ============================================================================= def convert_wikidata_file( json_path: Path, output_dir: Path, skip_incomplete: bool = False, dry_run: bool = False, ) -> Dict[str, Any]: """ Convert a Wikidata JSON file to LinkML YAML. Args: json_path: Path to Wikidata JSON file output_dir: Output directory for YAML files skip_incomplete: Skip institutions with missing critical data dry_run: Don't write files, just show statistics Returns: Dictionary with conversion statistics """ print(f"\n{'='*80}") print(f"📂 Processing: {json_path.name}") print(f"{'='*80}\n") # Load Wikidata JSON with open(json_path, 'r', encoding='utf-8') as f: wikidata_data = json.load(f) country_code = wikidata_data.get("country_code", "XX") country_name = wikidata_data.get("country_name", "Unknown") extraction_date = wikidata_data.get("extraction_date", datetime.now(timezone.utc).isoformat()) wikidata_institutions = wikidata_data.get("institutions", []) print(f"🌍 Country: {country_name} ({country_code})") print(f"📅 Extraction Date: {extraction_date}") print(f"🏛️ Total Wikidata Institutions: {len(wikidata_institutions)}") # Convert institutions converted_institutions = [] skipped_count = 0 excluded_count = 0 # Non-heritage types (generic organizations) error_count = 0 for wikidata_inst in wikidata_institutions: try: institution = convert_wikidata_institution( wikidata_inst, country_code, extraction_date, ) if institution is None: # Check if excluded due to type filtering wikidata_type = wikidata_inst.get("institution_type", "") mapped_type = map_institution_type(wikidata_type) if mapped_type is None: excluded_count += 1 else: skipped_count += 1 continue # Skip incomplete records if requested if skip_incomplete: # Check for critical data has_name = institution.get("name") and institution["name"] != institution.get("identifiers", [{}])[0].get("identifier_value") has_location = bool(institution.get("locations")) if not (has_name or has_location): skipped_count += 1 continue converted_institutions.append(institution) except Exception as e: error_count += 1 qid = wikidata_inst.get("wikidata_qid", "unknown") print(f"❌ Error converting {qid}: {e}") # Statistics stats = { "country_code": country_code, "country_name": country_name, "total_wikidata": len(wikidata_institutions), "converted": len(converted_institutions), "skipped": skipped_count, "excluded": excluded_count, "errors": error_count, } print(f"\n📊 Conversion Results:") print(f" ✅ Converted: {stats['converted']}") 
print(f" ⏭️ Skipped (incomplete data): {stats['skipped']}") print(f" 🚫 Excluded (non-heritage types): {stats['excluded']}") print(f" ❌ Errors: {stats['errors']}") # Write YAML file if not dry_run and converted_institutions: timestamp = json_path.stem # Use same timestamp as input file output_file = output_dir / f"wikidata_{country_code.lower()}_{timestamp}.yaml" output_dir.mkdir(parents=True, exist_ok=True) with open(output_file, 'w', encoding='utf-8') as f: # Write YAML header f.write(f"# Wikidata Heritage Institutions - {country_name}\n") f.write(f"# Extracted: {extraction_date}\n") f.write(f"# Converted: {datetime.now(timezone.utc).isoformat()}\n") f.write(f"# Total institutions: {len(converted_institutions)}\n") f.write(f"# Schema: Heritage Custodian v0.2.2 (LinkML modular schema)\n") f.write(f"---\n") # Write institutions as YAML list yaml.dump( converted_institutions, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=100, ) print(f"\n💾 Output: {output_file}") print(f" Size: {output_file.stat().st_size / 1024:.1f} KB") return stats def convert_all_countries( wikidata_dir: Path, output_dir: Path, skip_incomplete: bool = False, dry_run: bool = False, ) -> None: """ Convert all Wikidata JSON files to LinkML YAML. Args: wikidata_dir: Directory containing Wikidata JSON files (organized by country) output_dir: Output directory for YAML files skip_incomplete: Skip institutions with missing critical data dry_run: Don't write files, just show statistics """ # Find all JSON files (recursive search in country subdirectories) json_files = sorted(wikidata_dir.glob("**/*.json")) if not json_files: print(f"❌ No Wikidata JSON files found in {wikidata_dir}") return print(f"\n🌍 Found {len(json_files)} Wikidata JSON files") # Convert each file all_stats = [] for json_file in json_files: stats = convert_wikidata_file( json_file, output_dir, skip_incomplete=skip_incomplete, dry_run=dry_run, ) all_stats.append(stats) # Global statistics print(f"\n{'='*80}") print(f"🌍 GLOBAL CONVERSION SUMMARY") print(f"{'='*80}\n") total_wikidata = sum(s["total_wikidata"] for s in all_stats) total_converted = sum(s["converted"] for s in all_stats) total_skipped = sum(s["skipped"] for s in all_stats) total_errors = sum(s["errors"] for s in all_stats) print(f"📊 Total Institutions:") print(f" 🌍 Wikidata: {total_wikidata}") print(f" ✅ Converted: {total_converted} ({total_converted/total_wikidata*100:.1f}%)") print(f" ⏭️ Skipped: {total_skipped} ({total_skipped/total_wikidata*100:.1f}%)") print(f" ❌ Errors: {total_errors}") # Per-country breakdown print(f"\n📍 Per-Country Breakdown:") for stats in all_stats: print(f" {stats['country_name']:20s} ({stats['country_code']}): " f"{stats['converted']:4d} / {stats['total_wikidata']:4d} " f"({stats['converted']/stats['total_wikidata']*100:5.1f}%)") # ============================================================================= # CLI # ============================================================================= def main(): parser = argparse.ArgumentParser( description="Convert Wikidata SPARQL extractions to LinkML YAML instances", formatter_class=argparse.RawDescriptionHelpFormatter, ) parser.add_argument( "--country", type=str, help="Country code to convert (e.g., NL, CL, BE)", ) parser.add_argument( "--timestamp", type=str, help="Specific timestamp file to convert (e.g., 20251111_105038)", ) parser.add_argument( "--all-countries", action="store_true", help="Convert all Wikidata JSON files (all countries)", ) parser.add_argument( "--skip-incomplete", 
action="store_true", help="Skip institutions with missing critical data (name, location)", ) parser.add_argument( "--dry-run", action="store_true", help="Show statistics without writing files", ) parser.add_argument( "--wikidata-dir", type=Path, default=Path("data/wikidata"), help="Directory containing Wikidata JSON files (default: data/wikidata)", ) parser.add_argument( "--output-dir", type=Path, default=Path("data/instances"), help="Output directory for LinkML YAML files (default: data/instances)", ) args = parser.parse_args() # Resolve paths wikidata_dir = Path(args.wikidata_dir).resolve() output_dir = Path(args.output_dir).resolve() if not wikidata_dir.exists(): print(f"❌ Wikidata directory not found: {wikidata_dir}") sys.exit(1) # Convert all countries if args.all_countries: convert_all_countries( wikidata_dir, output_dir, skip_incomplete=args.skip_incomplete, dry_run=args.dry_run, ) return # Convert specific country if args.country: country_code = args.country.upper() country_dir = wikidata_dir / country_code.lower() if not country_dir.exists(): print(f"❌ Country directory not found: {country_dir}") sys.exit(1) # Find JSON files for this country if args.timestamp: json_file = country_dir / f"{args.timestamp}.json" if not json_file.exists(): print(f"❌ JSON file not found: {json_file}") sys.exit(1) json_files = [json_file] else: # Get latest JSON file json_files = sorted(country_dir.glob("*.json"), reverse=True) if not json_files: print(f"❌ No JSON files found in {country_dir}") sys.exit(1) json_files = [json_files[0]] # Latest file # Convert for json_file in json_files: convert_wikidata_file( json_file, output_dir, skip_incomplete=args.skip_incomplete, dry_run=args.dry_run, ) return # No arguments provided parser.print_help() if __name__ == "__main__": main()