glam/scripts/convert_wikidata_to_linkml.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

760 lines
25 KiB
Python

#!/usr/bin/env python3
"""
Wikidata JSON to LinkML YAML Converter
This script transforms raw Wikidata SPARQL extraction results into LinkML-compliant
YAML instance files conforming to the Heritage Custodian schema v0.2.2.
Input: data/wikidata/{country_code}/{timestamp}.json
Output: data/instances/wikidata_{country_code}_{timestamp}.yaml
Features:
- Maps Wikidata fields → LinkML schema (core.yaml, enums.yaml, provenance.yaml)
- Generates GHCIDs (Global Heritage Custodian Identifiers) with collision detection
- Generates UUIDs (v5, v7, v8) for persistent identification
- Enriches data with provenance metadata (TIER_3_CROWD_SOURCED)
- Validates institution types against InstitutionTypeEnum
- Geocodes addresses to lat/lon (if missing)
- Cross-references with existing LinkML instances for deduplication
Wikidata → LinkML Field Mapping:
- wikidata_qid → identifiers[scheme=Wikidata]
- name → name (fallback to QID if empty)
- description → description
- institution_type → InstitutionTypeEnum mapping
- location{latitude, longitude, city, street_address} → Location class
- identifiers{website, isil, viaf, email, phone} → Identifier + ContactInfo
- temporal{inception, founding_date} → founded_date
- organizational{parent_org} → parent_organization_name
- collection{size, subject} → Collection class
- media{image, logo} → future enhancement
Usage:
# Convert single country extraction
python convert_wikidata_to_linkml.py --country NL --timestamp 20251111_105038
# Convert all extractions for a country (latest by default)
python convert_wikidata_to_linkml.py --country NL
# Convert all countries (all JSON files)
python convert_wikidata_to_linkml.py --all-countries
# Dry run (show what would be converted)
python convert_wikidata_to_linkml.py --country NL --dry-run
# Skip institutions with missing critical data
python convert_wikidata_to_linkml.py --country NL --skip-incomplete
"""
import sys
import json
import yaml
import argparse
from pathlib import Path
from typing import Any, Optional, Dict, List
from datetime import datetime, timezone
from collections import defaultdict
import re
# Make project sources ("src") and sibling scripts importable when run directly.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
sys.path.insert(0, str(Path(__file__).parent))  # Add scripts directory for wikidata_type_mapping

# Optional: comprehensive Wikidata type mapping. When absent, the script
# falls back to the basic WIKIDATA_TYPE_MAP table defined below.
try:
    from wikidata_type_mapping import map_wikidata_type_to_linkml, is_valid_heritage_custodian
    TYPE_MAPPING_AVAILABLE = True
except ImportError:
    TYPE_MAPPING_AVAILABLE = False
    print("⚠️ Warning: wikidata_type_mapping module not available. Using basic type mapping.")

# Optional: GHCID and UUID generation utilities. When absent, GHCIDs are
# produced with the simpler generate_basic_ghcid() fallback defined below.
try:
    from glam_extractor.ghcid import (
        generate_ghcid,
        generate_ghcid_uuids,
        detect_ghcid_collision,
    )
    GHCID_AVAILABLE = True
except ImportError:
    GHCID_AVAILABLE = False
    print("⚠️ Warning: GHCID utilities not available. GHCIDs will be basic format.")
# =============================================================================
# WIKIDATA INSTITUTION TYPE → LinkML InstitutionTypeEnum MAPPING
# =============================================================================
# Maps lowercase Wikidata type labels (English and Dutch) to
# InstitutionTypeEnum values. Used only as a fallback when the
# comprehensive wikidata_type_mapping module is unavailable.
WIKIDATA_TYPE_MAP = {
    # Museums
    "museum": "MUSEUM",
    "kunstmuseum": "MUSEUM",
    "art museum": "MUSEUM",
    "science museum": "MUSEUM",
    "natural history museum": "MUSEUM",
    "history museum": "MUSEUM",
    "local museum": "MUSEUM",
    # Libraries
    "bibliotheek": "LIBRARY",
    "library": "LIBRARY",
    "openbare bibliotheek": "LIBRARY",
    "public library": "LIBRARY",
    "academic library": "LIBRARY",
    "university library": "LIBRARY",
    "national library": "LIBRARY",
    # Archives
    "archief": "ARCHIVE",
    "archive": "ARCHIVE",
    "gemeentearchief": "ARCHIVE",
    "city archive": "ARCHIVE",
    "national archive": "ARCHIVE",
    # Galleries
    "galerie": "GALLERY",
    "gallery": "GALLERY",
    "art gallery": "GALLERY",
    "kunstgalerie": "GALLERY",
    # Research Centers
    "research center": "RESEARCH_CENTER",
    "onderzoekscentrum": "RESEARCH_CENTER",
    "research institute": "RESEARCH_CENTER",
    # Universities
    "university": "UNIVERSITY",
    "universiteit": "UNIVERSITY",
    "hogeschool": "UNIVERSITY",
    # Botanical Gardens / Zoos
    "botanical garden": "BOTANICAL_ZOO",
    "botanische tuin": "BOTANICAL_ZOO",
    "zoo": "BOTANICAL_ZOO",
    "dierentuin": "BOTANICAL_ZOO",
    "aquarium": "BOTANICAL_ZOO",
    # Cultural Centers
    "cultural center": "OFFICIAL_INSTITUTION",
    "cultureel centrum": "OFFICIAL_INSTITUTION",
    # Default fallback
    "cultural institution": "MIXED",
}
def map_institution_type(wikidata_type: str) -> Optional[str]:
    """
    Map a Wikidata institution type onto the LinkML InstitutionTypeEnum.

    Args:
        wikidata_type: Institution type from Wikidata (e.g., "museum", "bibliotheek")

    Returns:
        LinkML InstitutionTypeEnum value (e.g., "MUSEUM", "LIBRARY"), or None if excluded
    """
    # Delegate to the comprehensive mapping module when it was importable.
    if TYPE_MAPPING_AVAILABLE:
        return map_wikidata_type_to_linkml(wikidata_type)

    normalized = wikidata_type.lower().strip()

    # Direct table lookup first.
    direct_hit = WIKIDATA_TYPE_MAP.get(normalized)
    if direct_hit is not None:
        return direct_hit

    # Otherwise accept any known keyword appearing inside the type string.
    for keyword, enum_value in WIKIDATA_TYPE_MAP.items():
        if keyword in normalized:
            return enum_value

    # Unknown types default to MIXED.
    return "MIXED"
# =============================================================================
# GHCID GENERATION (Fallback if utilities unavailable)
# =============================================================================
def generate_basic_ghcid(
    country_code: str,
    city: Optional[str],
    institution_type: str,
    institution_name: str,
    wikidata_qid: Optional[str] = None,
) -> str:
    """
    Generate a basic GHCID without full validation.

    Fallback method if glam_extractor.ghcid module unavailable.
    Format: {Country}-{Region}-{City}-{Type}-{Abbreviation}[-Q{WikidataID}]
    Example: NL-NH-AMS-M-RM (Rijksmuseum Amsterdam)
    """
    # Single-letter code per InstitutionTypeEnum value; "X" = unknown/mixed.
    TYPE_CODES = {
        "MUSEUM": "M", "LIBRARY": "L", "ARCHIVE": "A", "GALLERY": "G",
        "OFFICIAL_INSTITUTION": "O", "RESEARCH_CENTER": "R", "CORPORATION": "C",
        "UNIVERSITY": "U", "BOTANICAL_ZOO": "B", "EDUCATION_PROVIDER": "E",
        "PERSONAL_COLLECTION": "P", "COLLECTING_SOCIETY": "S", "MIXED": "X",
        "HOLY_SITES": "H"
    }

    # Abbreviation: first letter of up to five words of the name.
    if institution_name and institution_name != "unknown":
        initials = re.findall(r'\b\w', institution_name.upper())
        abbreviation = ''.join(initials[:5])
    else:
        abbreviation = "UNK"

    # City code: first three A-Z characters, right-padded with 'X'.
    if city:
        city_code = re.sub(r'[^A-Z]', '', city.upper())[:3].ljust(3, 'X')
    else:
        city_code = "UNK"

    # No country-specific region mapping here; "XX" is a placeholder.
    segments = [
        country_code,
        "XX",
        city_code,
        TYPE_CODES.get(institution_type, "X"),
        abbreviation,
    ]
    # Append the Wikidata Q-number so colliding names stay distinguishable.
    if wikidata_qid:
        segments.append(wikidata_qid)
    return "-".join(segments)
def generate_uuid_v5(ghcid: str) -> str:
    """Derive a deterministic UUID v5 (SHA-1 based) from a GHCID string."""
    import uuid

    # Fixed namespace shared by all GHCID-derived UUIDs.
    namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
    return str(uuid.uuid5(namespace, ghcid))
def generate_uuid_v7() -> str:
    """Generate a time-ordered UUID v7 for database record IDs (v4 fallback)."""
    import uuid

    # uuid.uuid7() only exists on newer Pythons; older ones get a random v4.
    make_uuid = getattr(uuid, "uuid7", uuid.uuid4)
    return str(make_uuid())
def generate_uuid_v8_sha256(ghcid: str) -> str:
    """Derive a deterministic custom UUID v8 (SHA-256 based) from a GHCID string."""
    import uuid
    import hashlib

    digest = hashlib.sha256(ghcid.encode('utf-8')).digest()
    raw = bytearray(digest[:16])  # UUIDs hold exactly 128 bits
    raw[6] = (raw[6] & 0x0F) | 0x80  # force version nibble to 8
    raw[8] = (raw[8] & 0x3F) | 0x80  # force RFC 4122 variant bits
    return str(uuid.UUID(bytes=bytes(raw)))
def generate_numeric_id(ghcid: str) -> int:
    """Derive a deterministic 64-bit unsigned numeric ID from a GHCID string."""
    import hashlib

    digest = hashlib.sha256(ghcid.encode('utf-8')).digest()
    # First 8 bytes interpreted big-endian -> 64-bit unsigned integer.
    return int.from_bytes(digest[:8], byteorder='big')
# =============================================================================
# WIKIDATA INSTITUTION CONVERTER
# =============================================================================
def convert_wikidata_institution(
    wikidata_inst: Dict[str, Any],
    country_code: str,
    extraction_date: str,
) -> Optional[Dict[str, Any]]:
    """
    Convert a single Wikidata institution to LinkML format.

    Args:
        wikidata_inst: Wikidata institution dictionary from JSON
        country_code: ISO 3166-1 alpha-2 country code
        extraction_date: ISO 8601 timestamp of Wikidata extraction

    Returns:
        LinkML-compliant institution dictionary, or None if the record has no
        QID or its mapped type is excluded as non-heritage.
    """
    qid = wikidata_inst.get("wikidata_qid", "")
    if not qid:
        return None  # Skip institutions without QID

    # Basic fields. `or` fallbacks also cover empty strings and JSON nulls:
    # an empty/missing name falls back to the QID (as the module header
    # documents), and a null description no longer breaks .strip().
    name = wikidata_inst.get("name") or qid
    description = (wikidata_inst.get("description") or "").strip()
    wikidata_type = wikidata_inst.get("institution_type") or ""

    # Map institution type; None means excluded (generic non-heritage type).
    institution_type = map_institution_type(wikidata_type)
    if institution_type is None:
        return None

    # Location data (`or {}` guards against an explicit JSON null).
    location_data = wikidata_inst.get("location") or {}
    city = location_data.get("city", "")
    street_address = location_data.get("street_address", "")
    latitude = location_data.get("latitude")
    longitude = location_data.get("longitude")

    # External identifiers and contact channels.
    identifiers_data = wikidata_inst.get("identifiers") or {}
    website = identifiers_data.get("website", "")
    isil_code = identifiers_data.get("isil", "")
    viaf_id = identifiers_data.get("viaf", "")
    email = identifiers_data.get("email", "")
    phone = identifiers_data.get("phone", "")

    # Temporal data: prefer founding_date over inception.
    temporal_data = wikidata_inst.get("temporal") or {}
    inception_str = temporal_data.get("inception", "")
    founding_str = temporal_data.get("founding_date", "")
    founded_date = None
    if founding_str:
        founded_date = founding_str.split("T")[0]  # Keep the YYYY-MM-DD part
    elif inception_str:
        founded_date = inception_str.split("T")[0]

    # Generate GHCID: full generator when available, basic format otherwise.
    if GHCID_AVAILABLE:
        try:
            ghcid = generate_ghcid(
                country_code=country_code,
                city=city or "Unknown",
                institution_type=institution_type,
                institution_name=name,
            )
        except Exception as e:
            # Fall back to the basic format rather than dropping the record.
            print(f"⚠️ GHCID generation failed for {qid}: {e}")
            ghcid = generate_basic_ghcid(country_code, city, institution_type, name, qid)
    else:
        ghcid = generate_basic_ghcid(country_code, city, institution_type, name, qid)

    # Derived persistent identifiers (all deterministic except record_id).
    ghcid_uuid = generate_uuid_v5(ghcid)
    ghcid_uuid_sha256 = generate_uuid_v8_sha256(ghcid)
    record_id = generate_uuid_v7()
    ghcid_numeric = generate_numeric_id(ghcid)

    # Core LinkML record.
    institution = {
        "id": f"https://w3id.org/heritage/custodian/wikidata/{qid}",
        "record_id": record_id,
        "ghcid_uuid": ghcid_uuid,
        "ghcid_uuid_sha256": ghcid_uuid_sha256,
        "ghcid_numeric": ghcid_numeric,
        "ghcid_current": ghcid,
        "ghcid_original": ghcid,
        "name": name,
        "institution_type": institution_type,
    }

    # Optional scalar fields.
    if description:
        institution["description"] = description
    if founded_date:
        institution["founded_date"] = founded_date

    # Identifiers list; the Wikidata QID entry is always first.
    identifiers = [{
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}",
    }]
    if isil_code:
        identifiers.append({
            "identifier_scheme": "ISIL",
            "identifier_value": isil_code,
        })
    if viaf_id:
        identifiers.append({
            "identifier_scheme": "VIAF",
            "identifier_value": viaf_id,
            "identifier_url": f"https://viaf.org/viaf/{viaf_id}",
        })
    if website:
        identifiers.append({
            "identifier_scheme": "Website",
            "identifier_value": website,
            "identifier_url": website,
        })
    institution["identifiers"] = identifiers

    # Locations. 0.0 is a valid coordinate (equator / prime meridian), so
    # compare against None instead of relying on truthiness.
    if city or street_address or latitude is not None or longitude is not None:
        location = {}
        if city:
            location["city"] = city
        if street_address:
            location["street_address"] = street_address
        if latitude is not None:
            location["latitude"] = float(latitude)
        if longitude is not None:
            location["longitude"] = float(longitude)
        location["country"] = country_code
        location["is_primary"] = True
        institution["locations"] = [location]

    # Contact info (only emitted when at least one channel exists).
    if email or phone:
        contact_info = {}
        if email:
            contact_info["email"] = email
        if phone:
            contact_info["phone"] = phone
        contact_info["contact_type"] = "general"
        institution["contact_info"] = contact_info

    # Provenance metadata (REQUIRED by the schema).
    institution["provenance"] = {
        "data_source": "WIKIDATA",
        "data_tier": "TIER_3_CROWD_SOURCED",
        "extraction_date": extraction_date,
        "extraction_method": "Wikidata SPARQL extraction via extract_global_wikidata.py, converted by convert_wikidata_to_linkml.py",
        "confidence_score": 0.85,  # Wikidata is crowd-sourced but generally reliable
    }
    return institution
# =============================================================================
# BATCH CONVERSION
# =============================================================================
def convert_wikidata_file(
    json_path: Path,
    output_dir: Path,
    skip_incomplete: bool = False,
    dry_run: bool = False,
) -> Dict[str, Any]:
    """
    Convert a Wikidata JSON file to LinkML YAML.

    Args:
        json_path: Path to Wikidata JSON file
        output_dir: Output directory for YAML files
        skip_incomplete: Skip institutions with missing critical data
        dry_run: Don't write files, just show statistics

    Returns:
        Dictionary with conversion statistics (country_code, country_name,
        total_wikidata, converted, skipped, excluded, errors)
    """
    print(f"\n{'='*80}")
    print(f"📂 Processing: {json_path.name}")
    print(f"{'='*80}\n")
    # Load Wikidata JSON
    with open(json_path, 'r', encoding='utf-8') as f:
        wikidata_data = json.load(f)
    # File-level metadata; extraction_date falls back to "now" in UTC.
    country_code = wikidata_data.get("country_code", "XX")
    country_name = wikidata_data.get("country_name", "Unknown")
    extraction_date = wikidata_data.get("extraction_date", datetime.now(timezone.utc).isoformat())
    wikidata_institutions = wikidata_data.get("institutions", [])
    print(f"🌍 Country: {country_name} ({country_code})")
    print(f"📅 Extraction Date: {extraction_date}")
    print(f"🏛️ Total Wikidata Institutions: {len(wikidata_institutions)}")
    # Convert institutions one by one, tracking outcomes per category.
    converted_institutions = []
    skipped_count = 0
    excluded_count = 0  # Non-heritage types (generic organizations)
    error_count = 0
    for wikidata_inst in wikidata_institutions:
        try:
            institution = convert_wikidata_institution(
                wikidata_inst,
                country_code,
                extraction_date,
            )
            if institution is None:
                # Re-map the type to distinguish "excluded by type filter"
                # from "skipped for missing data" in the statistics.
                wikidata_type = wikidata_inst.get("institution_type", "")
                mapped_type = map_institution_type(wikidata_type)
                if mapped_type is None:
                    excluded_count += 1
                else:
                    skipped_count += 1
                continue
            # Skip incomplete records if requested
            if skip_incomplete:
                # "Has a name" means the name differs from the first identifier
                # value (the QID fallback used when Wikidata had no label).
                has_name = institution.get("name") and institution["name"] != institution.get("identifiers", [{}])[0].get("identifier_value")
                has_location = bool(institution.get("locations"))
                if not (has_name or has_location):
                    skipped_count += 1
                    continue
            converted_institutions.append(institution)
        except Exception as e:
            # Keep converting the remaining records; just report the failure.
            error_count += 1
            qid = wikidata_inst.get("wikidata_qid", "unknown")
            print(f"❌ Error converting {qid}: {e}")
    # Statistics
    stats = {
        "country_code": country_code,
        "country_name": country_name,
        "total_wikidata": len(wikidata_institutions),
        "converted": len(converted_institutions),
        "skipped": skipped_count,
        "excluded": excluded_count,
        "errors": error_count,
    }
    print(f"\n📊 Conversion Results:")
    print(f" ✅ Converted: {stats['converted']}")
    print(f" ⏭️ Skipped (incomplete data): {stats['skipped']}")
    print(f" 🚫 Excluded (non-heritage types): {stats['excluded']}")
    print(f" ❌ Errors: {stats['errors']}")
    # Write YAML file (skipped on --dry-run or when nothing was converted)
    if not dry_run and converted_institutions:
        timestamp = json_path.stem  # Use same timestamp as input file
        output_file = output_dir / f"wikidata_{country_code.lower()}_{timestamp}.yaml"
        output_dir.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            # Write YAML header (comments + document start marker)
            f.write(f"# Wikidata Heritage Institutions - {country_name}\n")
            f.write(f"# Extracted: {extraction_date}\n")
            f.write(f"# Converted: {datetime.now(timezone.utc).isoformat()}\n")
            f.write(f"# Total institutions: {len(converted_institutions)}\n")
            f.write(f"# Schema: Heritage Custodian v0.2.2 (LinkML modular schema)\n")
            f.write(f"---\n")
            # Write institutions as YAML list
            yaml.dump(
                converted_institutions,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=100,
            )
        print(f"\n💾 Output: {output_file}")
        print(f" Size: {output_file.stat().st_size / 1024:.1f} KB")
    return stats
def convert_all_countries(
    wikidata_dir: Path,
    output_dir: Path,
    skip_incomplete: bool = False,
    dry_run: bool = False,
) -> None:
    """
    Convert all Wikidata JSON files to LinkML YAML.

    Args:
        wikidata_dir: Directory containing Wikidata JSON files (organized by country)
        output_dir: Output directory for YAML files
        skip_incomplete: Skip institutions with missing critical data
        dry_run: Don't write files, just show statistics
    """
    # Find all JSON files (recursive search in country subdirectories)
    json_files = sorted(wikidata_dir.glob("**/*.json"))
    if not json_files:
        print(f"❌ No Wikidata JSON files found in {wikidata_dir}")
        return
    print(f"\n🌍 Found {len(json_files)} Wikidata JSON files")
    # Convert each file
    all_stats = []
    for json_file in json_files:
        stats = convert_wikidata_file(
            json_file,
            output_dir,
            skip_incomplete=skip_incomplete,
            dry_run=dry_run,
        )
        all_stats.append(stats)

    def _pct(part: int, whole: int) -> float:
        """Percentage of part in whole; 0.0 when whole is 0 (empty files)."""
        return part / whole * 100 if whole else 0.0

    # Global statistics
    print(f"\n{'='*80}")
    print(f"🌍 GLOBAL CONVERSION SUMMARY")
    print(f"{'='*80}\n")
    total_wikidata = sum(s["total_wikidata"] for s in all_stats)
    total_converted = sum(s["converted"] for s in all_stats)
    total_skipped = sum(s["skipped"] for s in all_stats)
    total_errors = sum(s["errors"] for s in all_stats)
    print(f"📊 Total Institutions:")
    print(f" 🌍 Wikidata: {total_wikidata}")
    print(f" ✅ Converted: {total_converted} ({_pct(total_converted, total_wikidata):.1f}%)")
    print(f" ⏭️ Skipped: {total_skipped} ({_pct(total_skipped, total_wikidata):.1f}%)")
    print(f" ❌ Errors: {total_errors}")
    # Per-country breakdown (guarded against zero-institution files, which
    # previously raised ZeroDivisionError here)
    print(f"\n📍 Per-Country Breakdown:")
    for stats in all_stats:
        print(f" {stats['country_name']:20s} ({stats['country_code']}): "
              f"{stats['converted']:4d} / {stats['total_wikidata']:4d} "
              f"({_pct(stats['converted'], stats['total_wikidata']):5.1f}%)")
# =============================================================================
# CLI
# =============================================================================
def _build_cli_parser() -> argparse.ArgumentParser:
    """Construct the command-line parser for the converter CLI."""
    parser = argparse.ArgumentParser(
        description="Convert Wikidata SPARQL extractions to LinkML YAML instances",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--country",
        type=str,
        help="Country code to convert (e.g., NL, CL, BE)",
    )
    parser.add_argument(
        "--timestamp",
        type=str,
        help="Specific timestamp file to convert (e.g., 20251111_105038)",
    )
    parser.add_argument(
        "--all-countries",
        action="store_true",
        help="Convert all Wikidata JSON files (all countries)",
    )
    parser.add_argument(
        "--skip-incomplete",
        action="store_true",
        help="Skip institutions with missing critical data (name, location)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show statistics without writing files",
    )
    parser.add_argument(
        "--wikidata-dir",
        type=Path,
        default=Path("data/wikidata"),
        help="Directory containing Wikidata JSON files (default: data/wikidata)",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("data/instances"),
        help="Output directory for LinkML YAML files (default: data/instances)",
    )
    return parser


def main():
    """CLI entry point: dispatch to single-country or all-country conversion."""
    cli = _build_cli_parser()
    opts = cli.parse_args()

    # Resolve input/output locations to absolute paths.
    wikidata_dir = Path(opts.wikidata_dir).resolve()
    output_dir = Path(opts.output_dir).resolve()
    if not wikidata_dir.exists():
        print(f"❌ Wikidata directory not found: {wikidata_dir}")
        sys.exit(1)

    # Mode 1: convert every country's extraction files.
    if opts.all_countries:
        convert_all_countries(
            wikidata_dir,
            output_dir,
            skip_incomplete=opts.skip_incomplete,
            dry_run=opts.dry_run,
        )
        return

    # Mode 2: convert one country (a specific timestamp, or the latest file).
    if opts.country:
        country_code = opts.country.upper()
        country_dir = wikidata_dir / country_code.lower()
        if not country_dir.exists():
            print(f"❌ Country directory not found: {country_dir}")
            sys.exit(1)

        if opts.timestamp:
            candidate = country_dir / f"{opts.timestamp}.json"
            if not candidate.exists():
                print(f"❌ JSON file not found: {candidate}")
                sys.exit(1)
            selected_files = [candidate]
        else:
            # Sorted newest-first by filename timestamp; keep only the latest.
            found = sorted(country_dir.glob("*.json"), reverse=True)
            if not found:
                print(f"❌ No JSON files found in {country_dir}")
                sys.exit(1)
            selected_files = found[:1]

        for json_file in selected_files:
            convert_wikidata_file(
                json_file,
                output_dir,
                skip_incomplete=opts.skip_incomplete,
                dry_run=opts.dry_run,
            )
        return

    # No mode selected: show usage.
    cli.print_help()


if __name__ == "__main__":
    main()