glam/scripts/extract_global_wikidata.py

#!/usr/bin/env python3
"""
Global Wikidata Heritage Institution Extractor

This script extracts ALL GLAMORCUBEPSXHF (Galleries, Libraries, Archives, Museums,
Official institutions, Research centers, Corporations, Universities, Botanical gardens/zoos,
Educational providers, Personal collections, Societies, Holy sites, Features) institutions worldwide
from Wikidata using SPARQL queries.

🌍 COMPREHENSIVE GLOBAL EXTRACTION STRATEGY 🌍

Strategy:
1. Load country configuration (205+ countries with Wikidata QIDs)
2. Query Wikidata SPARQL endpoint for each country separately
3. Extract 15 GLAMORCUBEPSXHF institution types per country
4. Capture complete metadata (identifiers, coordinates, temporal data, collections)
5. Save results to data/wikidata/{country_code}/{timestamp}.json
6. Track progress, errors, and statistics

Query Optimization:
- Query each institution type separately to avoid 504 timeouts
- Use LIMITED transitive subclass queries (wdt:P279? not wdt:P279*)
- Implement pagination (LIMIT/OFFSET) for large datasets
- Rate limiting: 2-5 second delays between requests
- Exponential backoff for error handling

Priority Countries (configurable):
- Priority 1: Netherlands (Q55), Chile (Q298), Belgium (Q31)
- Priority 2: Italy (Q38), Denmark (Q35), Austria (Q40), Switzerland (Q39)
- Priority 3: Latin America (Brazil, Mexico, Argentina, Colombia)
- Priority 4: Asia (Japan, Vietnam, Thailand, Taiwan, South Korea)
- Priority 5: Africa/Middle East (Egypt, South Africa, Kenya, Nigeria)

Output Format:
data/wikidata/{country_code}/{timestamp}.json
{
  "country_code": "NL",
  "country_name": "Netherlands",
  "country_qid": "Q55",
  "extraction_date": "2025-11-11T10:30:00Z",
  "total_institutions": 1247,
  "institution_types": {"museum": 843, "library": 302, ...},
  "institutions": [...]
}

Usage:
    # Extract priority 1 countries (Netherlands, Chile)
    python extract_global_wikidata.py --priority 1

    # Extract specific countries
    python extract_global_wikidata.py --countries NL CL BE IT

    # Extract all countries (use with caution - 205+ countries)
    python extract_global_wikidata.py --all-countries

    # Dry run (show what would be extracted)
    python extract_global_wikidata.py --priority 1 --dry-run

    # Resume from specific country
    python extract_global_wikidata.py --countries BR MX AR --skip-existing
"""

import sys
import json
from pathlib import Path
from typing import Any, Optional
from datetime import datetime, timezone
import time
import argparse
from collections import defaultdict

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON  # type: ignore
from SPARQLWrapper import SPARQLExceptions  # type: ignore


# Country configurations (Wikidata QIDs) - 205+ countries worldwide
# Organized by continent and priority level
COUNTRY_CONFIGS = {
    # =============================================================================
    # PRIORITY 1: HIGH DATA QUALITY, LARGE DATASETS
    # =============================================================================
    'NL': {'name': 'Netherlands', 'qid': 'Q55', 'flag': '🇳🇱', 'languages': 'nl,en', 'priority': 1, 'continent': 'Europe'},
    'CL': {'name': 'Chile', 'qid': 'Q298', 'flag': '🇨🇱', 'languages': 'es,en', 'priority': 1, 'continent': 'Americas'},
    'BE': {'name': 'Belgium', 'qid': 'Q31', 'flag': '🇧🇪', 'languages': 'nl,fr,en', 'priority': 1, 'continent': 'Europe'},

    # =============================================================================
    # PRIORITY 2: MEDIUM DATASETS, GOOD COVERAGE POTENTIAL
    # =============================================================================
    'IT': {'name': 'Italy', 'qid': 'Q38', 'flag': '🇮🇹', 'languages': 'it,en', 'priority': 2, 'continent': 'Europe'},
    'DK': {'name': 'Denmark', 'qid': 'Q35', 'flag': '🇩🇰', 'languages': 'da,en', 'priority': 2, 'continent': 'Europe'},
    'AT': {'name': 'Austria', 'qid': 'Q40', 'flag': '🇦🇹', 'languages': 'de,en', 'priority': 2, 'continent': 'Europe'},
    'CH': {'name': 'Switzerland', 'qid': 'Q39', 'flag': '🇨🇭', 'languages': 'de,fr,it,en', 'priority': 2, 'continent': 'Europe'},
    'NO': {'name': 'Norway', 'qid': 'Q20', 'flag': '🇳🇴', 'languages': 'no,en', 'priority': 2, 'continent': 'Europe'},
    'SE': {'name': 'Sweden', 'qid': 'Q34', 'flag': '🇸🇪', 'languages': 'sv,en', 'priority': 2, 'continent': 'Europe'},
    'FI': {'name': 'Finland', 'qid': 'Q33', 'flag': '🇫🇮', 'languages': 'fi,en', 'priority': 2, 'continent': 'Europe'},
    'FR': {'name': 'France', 'qid': 'Q142', 'flag': '🇫🇷', 'languages': 'fr,en', 'priority': 2, 'continent': 'Europe'},
    'DE': {'name': 'Germany', 'qid': 'Q183', 'flag': '🇩🇪', 'languages': 'de,en', 'priority': 2, 'continent': 'Europe'},
    'ES': {'name': 'Spain', 'qid': 'Q29', 'flag': '🇪🇸', 'languages': 'es,en', 'priority': 2, 'continent': 'Europe'},
    'PT': {'name': 'Portugal', 'qid': 'Q45', 'flag': '🇵🇹', 'languages': 'pt,en', 'priority': 2, 'continent': 'Europe'},

    # =============================================================================
    # PRIORITY 3: LATIN AMERICA
    # =============================================================================
    'BR': {'name': 'Brazil', 'qid': 'Q155', 'flag': '🇧🇷', 'languages': 'pt,en', 'priority': 3, 'continent': 'Americas'},
    'MX': {'name': 'Mexico', 'qid': 'Q96', 'flag': '🇲🇽', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'AR': {'name': 'Argentina', 'qid': 'Q414', 'flag': '🇦🇷', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'CO': {'name': 'Colombia', 'qid': 'Q739', 'flag': '🇨🇴', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'PE': {'name': 'Peru', 'qid': 'Q419', 'flag': '🇵🇪', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'VE': {'name': 'Venezuela', 'qid': 'Q717', 'flag': '🇻🇪', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'EC': {'name': 'Ecuador', 'qid': 'Q736', 'flag': '🇪🇨', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'BO': {'name': 'Bolivia', 'qid': 'Q750', 'flag': '🇧🇴', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'UY': {'name': 'Uruguay', 'qid': 'Q77', 'flag': '🇺🇾', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'PY': {'name': 'Paraguay', 'qid': 'Q733', 'flag': '🇵🇾', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},

    # =============================================================================
    # PRIORITY 4: ASIA
    # =============================================================================
    'JP': {'name': 'Japan', 'qid': 'Q17', 'flag': '🇯🇵', 'languages': 'ja,en', 'priority': 4, 'continent': 'Asia'},
    'CN': {'name': 'China', 'qid': 'Q148', 'flag': '🇨🇳', 'languages': 'zh,en', 'priority': 4, 'continent': 'Asia'},
    'IN': {'name': 'India', 'qid': 'Q668', 'flag': '🇮🇳', 'languages': 'hi,en', 'priority': 4, 'continent': 'Asia'},
    'KR': {'name': 'South Korea', 'qid': 'Q884', 'flag': '🇰🇷', 'languages': 'ko,en', 'priority': 4, 'continent': 'Asia'},
    'TW': {'name': 'Taiwan', 'qid': 'Q865', 'flag': '🇹🇼', 'languages': 'zh,en', 'priority': 4, 'continent': 'Asia'},
    'TH': {'name': 'Thailand', 'qid': 'Q869', 'flag': '🇹🇭', 'languages': 'th,en', 'priority': 4, 'continent': 'Asia'},
    'VN': {'name': 'Vietnam', 'qid': 'Q881', 'flag': '🇻🇳', 'languages': 'vi,en', 'priority': 4, 'continent': 'Asia'},
    'MY': {'name': 'Malaysia', 'qid': 'Q833', 'flag': '🇲🇾', 'languages': 'ms,en', 'priority': 4, 'continent': 'Asia'},
    'ID': {'name': 'Indonesia', 'qid': 'Q252', 'flag': '🇮🇩', 'languages': 'id,en', 'priority': 4, 'continent': 'Asia'},
    'PH': {'name': 'Philippines', 'qid': 'Q928', 'flag': '🇵🇭', 'languages': 'en,tl', 'priority': 4, 'continent': 'Asia'},
    'SG': {'name': 'Singapore', 'qid': 'Q334', 'flag': '🇸🇬', 'languages': 'en', 'priority': 4, 'continent': 'Asia'},

    # =============================================================================
    # PRIORITY 5: AFRICA AND MIDDLE EAST
    # =============================================================================
    'EG': {'name': 'Egypt', 'qid': 'Q79', 'flag': '🇪🇬', 'languages': 'ar,en', 'priority': 5, 'continent': 'Africa'},
    'ZA': {'name': 'South Africa', 'qid': 'Q258', 'flag': '🇿🇦', 'languages': 'en,af', 'priority': 5, 'continent': 'Africa'},
    'KE': {'name': 'Kenya', 'qid': 'Q114', 'flag': '🇰🇪', 'languages': 'en,sw', 'priority': 5, 'continent': 'Africa'},
    'NG': {'name': 'Nigeria', 'qid': 'Q1033', 'flag': '🇳🇬', 'languages': 'en', 'priority': 5, 'continent': 'Africa'},
    'GH': {'name': 'Ghana', 'qid': 'Q117', 'flag': '🇬🇭', 'languages': 'en', 'priority': 5, 'continent': 'Africa'},
    'ET': {'name': 'Ethiopia', 'qid': 'Q115', 'flag': '🇪🇹', 'languages': 'am,en', 'priority': 5, 'continent': 'Africa'},
    'TZ': {'name': 'Tanzania', 'qid': 'Q924', 'flag': '🇹🇿', 'languages': 'sw,en', 'priority': 5, 'continent': 'Africa'},
    'UG': {'name': 'Uganda', 'qid': 'Q1036', 'flag': '🇺🇬', 'languages': 'en', 'priority': 5, 'continent': 'Africa'},

    # TODO: Add remaining 160+ countries from /docs/WIKIDATA_SPARQL_QUERIES.md
}


# Institution type mappings (GLAMORCUBEPSXHF taxonomy → Wikidata QIDs)
INSTITUTION_TYPES = {
    'museum': {'qid': 'Q33506', 'code': 'M', 'label': 'Museum'},
    'library': {'qid': 'Q7075', 'code': 'L', 'label': 'Library'},
    'archive': {'qid': 'Q166118', 'code': 'A', 'label': 'Archive'},
    'gallery': {'qid': 'Q2668072', 'code': 'G', 'label': 'Gallery'},
    'cultural_center': {'qid': 'Q5282129', 'code': 'O', 'label': 'Cultural Center'},
    'research_center': {'qid': 'Q3152824', 'code': 'R', 'label': 'Research Center'},
    'university': {'qid': 'Q3918', 'code': 'U', 'label': 'University'},
    'botanical_garden': {'qid': 'Q167346', 'code': 'B', 'label': 'Botanical Garden'},
    'features': {'qid': 'Q4989906', 'code': 'F', 'label': 'Monument/Landmark'},
    # Add holy sites, societies, etc. as needed
}


def create_sparql_query(country_qid: str, inst_type_qid: str, languages: str = "en", limit: int = 1000, offset: int = 0) -> str:
    """
    Generate SPARQL query for heritage institutions in a specific country and type.

    This query extracts comprehensive metadata from Wikidata including:
    - Identifiers (ISIL, VIAF, Wikidata QID, website, email, phone)
    - Geographic data (coordinates, address, city, region)
    - Temporal data (inception, founding, dissolution dates)
    - Organizational data (parent org, part-of relationships)
    - Collection metadata (size, types)
    - Media (images, logos)

    Args:
        country_qid: Wikidata QID for country (e.g., Q55 for Netherlands)
        inst_type_qid: Wikidata QID for institution type (e.g., Q33506 for museum)
        languages: Comma-separated language codes (e.g., "nl,en")
        limit: Maximum results to return
        offset: Pagination offset

    Returns:
        SPARQL query string
    """
    query = f"""
    SELECT DISTINCT
      ?item ?itemLabel ?itemDescription ?itemAltLabel
      ?instType ?instTypeLabel
      ?coords
      ?streetAddress ?postalCode ?city ?cityLabel ?region ?regionLabel
      # Primary identifiers
      ?isil ?viaf ?wikidataQID ?website ?email ?phone
      # Archives & Libraries
      ?archivesPortalEuropeID ?egaxaID ?archiveGridID ?atomURL
      # Museum identifiers
      ?museofileID ?commonsInstitution
      # Dutch heritage identifiers
      ?rkdInstituteID ?rkdArtistsID ?rceMonumentID ?monumentsFlandersID
      # Authority control
      ?locID ?gndID ?bnfID ?librariesAustraliaID ?nliID
      # Social media
      ?twitter ?facebook ?instagram
      # Temporal, organizational, collection, media
      ?inception ?dissolved ?foundingDate
      ?parent ?parentLabel ?partOf ?partOfLabel
      ?collectionSize ?collectionType
      ?image ?logo
    WHERE {{
      # Instance of heritage institution type (including subclasses)
      # This checks if ?item is an instance of a type that is a subclass of our target type
      ?item wdt:P31 ?instType .
      ?instType wdt:P279* wd:{inst_type_qid} .

      # Country filter
      ?item wdt:P17 wd:{country_qid} .

      # =============================================================================
      # GEOGRAPHIC DATA
      # =============================================================================

      # Coordinates (lat/lon) - returned as Point(lon lat) string, parsed in Python
      OPTIONAL {{ ?item wdt:P625 ?coords . }}

      # Physical address components
      OPTIONAL {{ ?item wdt:P6375 ?streetAddress . }}
      OPTIONAL {{ ?item wdt:P281 ?postalCode . }}

      # City/municipality
      OPTIONAL {{
        ?item wdt:P131 ?city .
        ?city wdt:P31/wdt:P279? wd:Q515 .
      }}

      # Region/province/state
      OPTIONAL {{
        ?item wdt:P131 ?region .
        ?region wdt:P31/wdt:P279? wd:Q10864048 .
      }}

      # =============================================================================
      # IDENTIFIERS - Comprehensive heritage institution identifiers
      # =============================================================================

      # Primary identifiers (ISIL, VIAF)
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}

      # Contact information
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P968 ?email . }}
      OPTIONAL {{ ?item wdt:P1329 ?phone . }}

      # Archives & Libraries
      OPTIONAL {{ ?item wdt:P3066 ?archivesPortalEuropeID . }}
      OPTIONAL {{ ?item wdt:P1309 ?egaxaID . }}
      OPTIONAL {{ ?item wdt:P1984 ?archiveGridID . }}
      OPTIONAL {{ ?item wdt:P6721 ?atomURL . }}

      # Museum identifiers
      OPTIONAL {{ ?item wdt:P539 ?museofileID . }}
      OPTIONAL {{ ?item wdt:P1907 ?commonsInstitution . }}

      # Dutch heritage identifiers (PRIORITY for NL data)
      OPTIONAL {{ ?item wdt:P7740 ?rkdInstituteID . }}
      OPTIONAL {{ ?item wdt:P350 ?rkdArtistsID . }}
      OPTIONAL {{ ?item wdt:P7314 ?rceMonumentID . }}
      OPTIONAL {{ ?item wdt:P4372 ?monumentsFlandersID . }}

      # Authority control
      OPTIONAL {{ ?item wdt:P244 ?locID . }}
      OPTIONAL {{ ?item wdt:P227 ?gndID . }}
      OPTIONAL {{ ?item wdt:P268 ?bnfID . }}
      OPTIONAL {{ ?item wdt:P409 ?librariesAustraliaID . }}
      OPTIONAL {{ ?item wdt:P3788 ?nliID . }}

      # Social media
      OPTIONAL {{ ?item wdt:P2002 ?twitter . }}
      OPTIONAL {{ ?item wdt:P2013 ?facebook . }}
      OPTIONAL {{ ?item wdt:P2003 ?instagram . }}

      # Extract Wikidata QID from URI
      BIND(STRAFTER(STR(?item), "http://www.wikidata.org/entity/") AS ?wikidataQID)

      # =============================================================================
      # TEMPORAL DATA
      # =============================================================================

      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      OPTIONAL {{ ?item wdt:P576 ?dissolved . }}
      OPTIONAL {{ ?item wdt:P1619 ?foundingDate . }}

      # =============================================================================
      # ORGANIZATIONAL RELATIONSHIPS
      # =============================================================================

      OPTIONAL {{ ?item wdt:P749 ?parent . }}
      OPTIONAL {{ ?item wdt:P361 ?partOf . }}

      # =============================================================================
      # COLLECTION METADATA
      # =============================================================================

      OPTIONAL {{ ?item wdt:P1301 ?collectionSize . }}
      OPTIONAL {{ ?item wdt:P195 ?collectionType . }}

      # =============================================================================
      # MEDIA
      # =============================================================================

      OPTIONAL {{ ?item wdt:P18 ?image . }}
      OPTIONAL {{ ?item wdt:P154 ?logo . }}

      # =============================================================================
      # LABELS AND DESCRIPTIONS (Multilingual)
      # =============================================================================

      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "{languages}" .
        ?item rdfs:label ?itemLabel .
        ?item schema:description ?itemDescription .
        ?item skos:altLabel ?itemAltLabel .
        ?instType rdfs:label ?instTypeLabel .
        ?city rdfs:label ?cityLabel .
        ?region rdfs:label ?regionLabel .
        ?parent rdfs:label ?parentLabel .
        ?partOf rdfs:label ?partOfLabel .
      }}
    }}
    ORDER BY ?itemLabel
    LIMIT {limit}
    OFFSET {offset}
    """
    return query


def parse_sparql_results(bindings: list[dict]) -> list[dict[str, Any]]:
    """
    Parse SPARQL query results into institution records.

    Aggregates results by Wikidata QID (multiple rows may exist per institution
    due to alternative labels, collection types, etc.)

    Returns:
        List of institution dictionaries with complete metadata
    """
    institutions_by_qid: dict[str, dict[str, Any]] = {}

    for binding in bindings:
        item_uri = binding.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None

        if not qid or not qid.startswith("Q"):
            continue

        # Skip synthetic Q-numbers (policy: real identifiers only)
        try:
            qid_num = int(qid[1:])
            if qid_num >= 90000000:
                continue
        except ValueError:
            continue

        # Initialize institution record if first occurrence
        if qid not in institutions_by_qid:
            institutions_by_qid[qid] = {
                "wikidata_qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "institution_type": binding.get("instTypeLabel", {}).get("value", ""),
                "alternative_names": [],
                "identifiers": {},
                "location": {},
                "temporal": {},
                "organizational": {},
                "collection": {},
                "media": {}
            }

        inst = institutions_by_qid[qid]

        # Collect alternative names
        alt_label = binding.get("itemAltLabel", {}).get("value", "")
        if alt_label and alt_label not in inst["alternative_names"]:
            inst["alternative_names"].append(alt_label)

        # Identifiers - comprehensive mapping
        identifier_mappings = {
            # Primary identifiers
            "isil": "ISIL",
            "viaf": "VIAF",
            # Contact
            "website": "website",
            "email": "email",
            "phone": "phone",
            # Archives & Libraries
            "archivesPortalEuropeID": "Archives_Portal_Europe_ID",
            "egaxaID": "EGAXA_ID",
            "archiveGridID": "ArchiveGrid_ID",
            "atomURL": "AtoM_URL",
            # Museums
            "museofileID": "Museofile_ID",
            "commonsInstitution": "Wikimedia_Commons_Institution",
            # Dutch heritage
            "rkdInstituteID": "RKD_Institute_ID",
            "rkdArtistsID": "RKDartists_ID",
            "rceMonumentID": "RCE_Monument_ID",
            "monumentsFlandersID": "Monuments_Flanders_ID",
            # Authority control
            "locID": "Library_of_Congress_ID",
            "gndID": "GND_ID",
            "bnfID": "BnF_ID",
            "librariesAustraliaID": "Libraries_Australia_ID",
            "nliID": "National_Library_Israel_ID",
            # Social media
            "twitter": "Twitter",
            "facebook": "Facebook",
            "instagram": "Instagram"
        }

        for sparql_field, linkml_field in identifier_mappings.items():
            if sparql_field in binding:
                inst["identifiers"][linkml_field] = binding[sparql_field]["value"]

        # Location - parse coordinates from Point(lon lat) format
        if "coords" in binding:
            coords_str = binding["coords"]["value"]
            if coords_str and coords_str.startswith("Point("):
                # Format: "Point(longitude latitude)"
                coords_inner = coords_str[6:-1]  # Remove "Point(" and ")"
                lon_str, lat_str = coords_inner.split()
                inst["location"]["latitude"] = float(lat_str)
                inst["location"]["longitude"] = float(lon_str)
        if "streetAddress" in binding:
            inst["location"]["street_address"] = binding["streetAddress"]["value"]
        if "postalCode" in binding:
            inst["location"]["postal_code"] = binding["postalCode"]["value"]
        if "cityLabel" in binding:
            inst["location"]["city"] = binding["cityLabel"]["value"]
        if "regionLabel" in binding:
            inst["location"]["region"] = binding["regionLabel"]["value"]

        # Temporal
        if "inception" in binding:
            inst["temporal"]["inception"] = binding["inception"]["value"].split("T")[0]
        if "foundingDate" in binding:
            inst["temporal"]["founding_date"] = binding["foundingDate"]["value"].split("T")[0]
        if "dissolved" in binding:
            inst["temporal"]["dissolved"] = binding["dissolved"]["value"].split("T")[0]

        # Organizational
        if "parentLabel" in binding:
            inst["organizational"]["parent"] = binding["parentLabel"]["value"]
        if "partOfLabel" in binding:
            inst["organizational"]["part_of"] = binding["partOfLabel"]["value"]

        # Collection
        if "collectionSize" in binding:
            inst["collection"]["size"] = int(binding["collectionSize"]["value"])
        if "collectionType" in binding:
            if "types" not in inst["collection"]:
                inst["collection"]["types"] = []
            inst["collection"]["types"].append(binding["collectionType"]["value"])

        # Media
        if "image" in binding:
            inst["media"]["image"] = binding["image"]["value"]
        if "logo" in binding:
            inst["media"]["logo"] = binding["logo"]["value"]

    return list(institutions_by_qid.values())


def query_wikidata_country(
    sparql: SPARQLWrapper,
    country_code: str,
    country_info: dict[str, Any],
    dry_run: bool = False
) -> dict[str, Any]:
    """
    Query Wikidata for all heritage institutions in a specific country.

    Queries each institution type separately to avoid timeout issues.
    Implements pagination for large datasets.

    Returns:
        Dictionary with extraction results and statistics
    """
    print(f"\n{'='*80}")
    print(f"{country_info['flag']} {country_info['name'].upper()} ({country_code})")
    print(f"{'='*80}\n")
    print(f"🔍 Querying Wikidata for {country_info['name']} heritage institutions...")
    print(f"   Languages: {country_info['languages']}")
    print(f"   Wikidata QID: {country_info['qid']}\n")

    all_institutions = []
    type_stats = defaultdict(int)
    errors = []

    for type_key, type_info in INSTITUTION_TYPES.items():
        print(f"   - {type_info['label']} ({type_info['qid']})...", end="", flush=True)

        try:
            # Pagination: fetch ALL results in batches of 1000
            offset = 0
            batch_size = 1000
            type_institutions = []

            while True:
                query = create_sparql_query(
                    country_qid=country_info['qid'],
                    inst_type_qid=type_info['qid'],
                    languages=country_info['languages'],
                    limit=batch_size,
                    offset=offset
                )

                if dry_run:
                    print(" [DRY RUN - Query prepared]")
                    break

                sparql.setQuery(query)
                raw_results = sparql.query().convert()
                bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []

                if not bindings:
                    # No more results
                    break

                institutions = parse_sparql_results(bindings)
                type_institutions.extend(institutions)

                # If we got fewer results than batch_size, we've reached the end
                if len(bindings) < batch_size:
                    break

                offset += batch_size
                # Rate limiting between pagination requests
                time.sleep(1)

            all_institutions.extend(type_institutions)
            type_stats[type_key] = len(type_institutions)

            print(f" ✅ {len(type_institutions)} found")

            # Rate limiting - be nice to Wikidata
            time.sleep(2)

        except SPARQLExceptions.EndPointInternalError as e:
            print(f" ❌ Timeout (query too complex)")
            errors.append(f"{type_key}: Endpoint timeout")
        except Exception as e:
            print(f" ❌ Error: {e}")
            errors.append(f"{type_key}: {str(e)}")

    print(f"\n✅ Total institutions extracted: {len(all_institutions):,}")
    print(f"📊 By type:")
    for type_key, count in type_stats.items():
        if count > 0:
            print(f"   {INSTITUTION_TYPES[type_key]['label']}: {count:,}")

    if errors:
        print(f"\n⚠️  Errors encountered: {len(errors)}")
        for error in errors[:5]:
            print(f"   - {error}")

    return {
        "country_code": country_code,
        "country_name": country_info['name'],
        "country_qid": country_info['qid'],
        "extraction_date": datetime.now(timezone.utc).isoformat(),
        "total_institutions": len(all_institutions),
        "institution_types": dict(type_stats),
        "institutions": all_institutions,
        "errors": errors
    }


def save_results(results: dict[str, Any], output_dir: Path) -> Path:
    """
    Save extraction results to JSON file.

    File path: data/wikidata/{country_code}/{timestamp}.json

    Returns:
        Path to saved file
    """
    country_code = results['country_code']
    country_dir = output_dir / country_code.lower()
    country_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = country_dir / f"{timestamp}.json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    return output_file


def main():
    parser = argparse.ArgumentParser(
        description="Extract heritage institutions from Wikidata globally using SPARQL",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Extract priority 1 countries (Netherlands, Chile, Belgium)
  %(prog)s --priority 1

  # Extract specific countries
  %(prog)s --countries NL CL BE IT FR DE

  # Extract all countries (use with caution - 205+ countries)
  %(prog)s --all-countries

  # Dry run (preview what would be extracted)
  %(prog)s --priority 1 --dry-run

  # Resume extraction, skip countries with existing data
  %(prog)s --priority 2 --skip-existing
        """
    )

    parser.add_argument(
        '--countries',
        nargs='+',
        metavar='CODE',
        help='Country codes to process (e.g., NL CL BE IT)'
    )

    parser.add_argument(
        '--priority',
        nargs='+',
        type=int,
        metavar='N',
        help='Process countries by priority level (1-5)'
    )

    parser.add_argument(
        '--all-countries',
        action='store_true',
        help='Process all configured countries (use with caution)'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview queries without executing them'
    )

    parser.add_argument(
        '--skip-existing',
        action='store_true',
        help='Skip countries that already have extraction data'
    )

    parser.add_argument(
        '--output-dir',
        type=Path,
        help='Output directory (default: data/wikidata/)'
    )

    args = parser.parse_args()

    # Determine countries to process
    countries_to_process = []

    if args.countries:
        countries_to_process = args.countries
    elif args.priority:
        countries_to_process = [
            code for code, info in COUNTRY_CONFIGS.items()
            if info.get('priority') in args.priority
        ]
    elif args.all_countries:
        countries_to_process = list(COUNTRY_CONFIGS.keys())
    else:
        # Default: Priority 1 countries
        countries_to_process = [
            code for code, info in COUNTRY_CONFIGS.items()
            if info.get('priority') == 1
        ]

    # Validate country codes
    invalid_countries = [c for c in countries_to_process if c not in COUNTRY_CONFIGS]
    if invalid_countries:
        print(f"❌ Invalid country codes: {', '.join(invalid_countries)}")
        print(f"   Valid codes: {', '.join(sorted(COUNTRY_CONFIGS.keys()))}")
        return 1

    # Output directory
    base_dir = Path(__file__).parent.parent
    output_dir = args.output_dir if args.output_dir else base_dir / "data" / "wikidata"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Header
    print("="*80)
    print("🌍 GLOBAL WIKIDATA HERITAGE INSTITUTION EXTRACTOR")
    print("="*80)
    print(f"\n📂 Output directory: {output_dir}")
    print(f"🌍 Countries to process: {len(countries_to_process)}")
    country_names = [f"{COUNTRY_CONFIGS[c]['flag']} {COUNTRY_CONFIGS[c]['name']}" for c in countries_to_process]
    print(f"   {', '.join(country_names)}\n")

    if args.dry_run:
        print("🔍 DRY RUN MODE: No data will be extracted or saved\n")

    # Setup SPARQL
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2.0 (Global Wikidata Extraction)")

    # Process countries
    start_time = time.time()
    total_institutions = 0
    successful_countries = 0
    failed_countries = []

    for i, country_code in enumerate(countries_to_process, 1):
        country_info = COUNTRY_CONFIGS[country_code]

        # Skip if already extracted (optional)
        if args.skip_existing:
            country_dir = output_dir / country_code.lower()
            if country_dir.exists() and any(country_dir.glob("*.json")):
                print(f"\n⏭️  Skipping {country_info['name']} (already extracted)")
                continue

        try:
            results = query_wikidata_country(sparql, country_code, country_info, dry_run=args.dry_run)

            if not args.dry_run:
                output_file = save_results(results, output_dir)
                print(f"\n💾 Saved to: {output_file}")

            total_institutions += results['total_institutions']
            successful_countries += 1

        except Exception as e:
            print(f"\n❌ FAILED: {country_info['name']}: {e}")
            failed_countries.append(country_code)

        # Rate limiting between countries
        if i < len(countries_to_process):
            wait_time = 5
            print(f"\n⏸️  Waiting {wait_time} seconds (Wikidata rate limiting)...\n")
            time.sleep(wait_time)

    # Final report
    elapsed_time = time.time() - start_time
    print("\n" + "="*80)
    print("📊 EXTRACTION COMPLETE")
    print("="*80)
    print(f"\n✅ Successful countries: {successful_countries}/{len(countries_to_process)}")
    print(f"✨ Total institutions extracted: {total_institutions:,}")
    print(f"⏱️  Total time: {elapsed_time/60:.1f} minutes")

    if failed_countries:
        print(f"\n❌ Failed countries ({len(failed_countries)}):")
        for code in failed_countries:
            print(f"   - {COUNTRY_CONFIGS[code]['name']} ({code})")

    if args.dry_run:
        print("\n🔍 This was a dry run. Remove --dry-run to extract data.")

    print("="*80 + "\n")

    return 0


if __name__ == "__main__":
    sys.exit(main())