# glam/scripts/extract_wikidata_geography.py

#!/usr/bin/env python3
"""
Extract geographic metadata from Wikidata hyponyms_curated.yaml.

This script:
1. Parses data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml
2. Extracts country, subregion, settlement fields from each hypernym entry
3. Maps human-readable names to ISO codes:
   - Country names → ISO 3166-1 alpha-2 codes (e.g., "Netherlands" → "NL")
   - Subregion names → ISO 3166-2 codes (e.g., "Pennsylvania" → "US-PA")
   - Settlement names → GeoNames IDs (e.g., "Pittsburgh" → 5206379)
4. Generates annotations for FeatureTypeEnum.yaml

Output:
- data/extracted/wikidata_geography_mapping.yaml (intermediate mapping)
- data/extracted/feature_type_geographic_annotations.yaml (for schema integration)

Usage:
    python3 scripts/extract_wikidata_geography.py

Author: OpenCODE AI Assistant
Date: 2025-11-22
"""

import yaml
import sys
from pathlib import Path
from typing import Dict, List, Set, Optional
from collections import defaultdict

# Add project root to path.
# resolve() makes the path absolute first; without it a relative __file__
# (e.g. "extract_wikidata_geography.py" when launched from inside scripts/ on
# older interpreters) would make .parent.parent collapse to "." and point the
# data paths at the wrong directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Country name -> ISO 3166-1 alpha-2 code lookup.
# Keys are the exact spellings found in the source data, so known typos and
# variants (e.g. "Bardbados", "Norwegian") appear as additional keys.
# Source: Wikidata, ISO 3166 Maintenance Agency
# NOTE: not every value is an alpha-2 code. "England" and "Scotland" carry
# ISO 3166-2 subdivision codes, and historical entities use "HIST-" prefixed
# placeholder codes.
COUNTRY_NAME_TO_ISO = {
    # Modern countries (alphabetical)
    "Albania": "AL",
    "Argentina": "AR",
    "Armenia": "AM",
    "Aruba": "AW",
    "Australia": "AU",
    "Austria": "AT",
    "Azerbaijan": "AZ",
    "Bangladesh": "BD",
    "Barbados": "BB",
    "Bardbados": "BB",  # Typo in source data
    "Belarus": "BY",
    "Belgium": "BE",
    "Bolivia": "BO",
    "Bosnia and Herzegovina": "BA",
    "Brazil": "BR",
    "Bulgaria": "BG",
    "Cameroon": "CM",
    "Canada": "CA",
    "Chile": "CL",
    "China": "CN",
    "Colombia": "CO",
    "Costa Rica": "CR",
    "Croatia": "HR",
    "Curaçao": "CW",
    "Czech Republic": "CZ",
    "Denmark": "DK",
    "Dominica": "DM",
    "Ecuador": "EC",
    "El Salvador": "SV",
    "England": "GB-ENG",  # ISO 3166-2 for England (subdivision, not alpha-2)
    "Estonia": "EE",
    "Finland": "FI",
    "France": "FR",
    "Gabon": "GA",
    "Germany": "DE",
    "Ghana": "GH",
    "Greece": "GR",
    "Guatemala": "GT",
    "Guinea": "GN",
    "Hungary": "HU",
    "Iceland": "IS",
    "India": "IN",
    "Indonesia": "ID",
    "Iran": "IR",
    "Ireland": "IE",
    "Israel": "IL",
    "Italy": "IT",
    "Ivory Coast": "CI",
    "Japan": "JP",
    "Kazakhstan": "KZ",
    "Kenya": "KE",
    "Kosovo": "XK",  # User-assigned code
    "Kyrgyzstan": "KG",
    "Latvia": "LV",
    "Lesotho": "LS",
    "Libya": "LY",
    "Lithuania": "LT",
    "Luxembourg": "LU",
    "Madagascar": "MG",
    "Malaysia": "MY",
    "Mauritius": "MU",
    "Mexico": "MX",
    "Moldova": "MD",
    "Mongolia": "MN",
    "Montenegro": "ME",
    "Morocco": "MA",
    "Mozambique": "MZ",
    "Namibia": "NA",
    "Nepal": "NP",
    "Netherlands": "NL",
    "New Zealand": "NZ",
    "Nicaragua": "NI",
    "Nigeria": "NG",
    "North Korea": "KP",
    "North Macedonia": "MK",
    "Norway": "NO",
    "Norwegian": "NO",  # Language/nationality in source data
    "Oman": "OM",
    "Pakistan": "PK",
    "Panama": "PA",
    "Paraguay": "PY",
    "Peru": "PE",
    "Philippines": "PH",
    "Poland": "PL",
    "Portugal": "PT",
    "Romania": "RO",
    "Russia": "RU",
    "Scotland": "GB-SCT",  # ISO 3166-2 for Scotland (subdivision, not alpha-2)
    "Senegal": "SN",
    "Serbia": "RS",
    "Seychelles": "SC",
    "Singapore": "SG",
    "Sint Maarten": "SX",
    "Slovakia": "SK",
    "Slovenia": "SI",
    "Somalia": "SO",
    "South Africa": "ZA",
    "South Korea": "KR",
    "Spain": "ES",
    "Sri Lanka": "LK",
    "Suriname": "SR",
    "Swaziland": "SZ",
    "Sweden": "SE",
    "Switzerland": "CH",
    "Taiwan": "TW",
    "Tanzania": "TZ",
    "Thailand": "TH",
    "Turkiye": "TR",
    "Turkmenistan": "TM",
    "UK": "GB",
    "USA": "US",
    "Uganda": "UG",
    "Ukraine": "UA",
    "Venezuela": "VE",
    "Vietnam": "VN",
    "Yemen": "YE",

    # Historical entities: mapped to "HIST-" prefixed placeholder codes so
    # they stay distinguishable from the modern states listed above.
    "Byzantine Empire": "HIST-BYZ",  # Historical entity
    "Czechoslovakia": "HIST-CS",     # Dissolved 1993 → CZ + SK
    "Japanese Empire": "HIST-JP",    # Historical Japan
    "Russian Empire": "HIST-RU",     # Historical Russia
    "Soviet Union": "HIST-SU",       # Dissolved 1991
}

# Subregion name -> ISO 3166-2 code lookup.
# Format: {country_alpha2}-{subdivision_code}
# Keys are the exact spellings found in the source data; where the source
# contains a typo, both the typo and the correct spelling are listed so the
# mapping keeps working if the source is later corrected.
SUBREGION_NAME_TO_ISO = {
    # United States (US-XX format)
    "Alabama": "US-AL",
    "Alaska": "US-AK",
    "Arizona": "US-AZ",
    "Arkansas": "US-AR",
    "California": "US-CA",
    "Colorado": "US-CO",
    "Connecticut": "US-CT",
    "Delaware": "US-DE",
    "Florida": "US-FL",
    "Georgia": "US-GA",
    "Hawaii": "US-HI",
    "Idaho": "US-ID",
    "Illinois": "US-IL",
    "Indiana": "US-IN",
    "Iowa": "US-IA",
    "Kansas": "US-KS",
    "Kentucky": "US-KY",
    "Louisiana": "US-LA",
    "Maine": "US-ME",
    "Maryland": "US-MD",
    "Massachusetts": "US-MA",
    "Michigan": "US-MI",
    "Minnesota": "US-MN",
    "Mississippi": "US-MS",
    "Missouri": "US-MO",
    "Montana": "US-MT",
    "Nebraska": "US-NE",
    "Nevada": "US-NV",
    "New Hampshire": "US-NH",
    "New Jersey": "US-NJ",
    "New Mexico": "US-NM",
    "New York": "US-NY",
    "North Carolina": "US-NC",
    "North Dakota": "US-ND",
    "Ohio": "US-OH",
    "Oklahoma": "US-OK",
    "Oregon": "US-OR",
    "Pennsylvania": "US-PA",
    "Rhode Island": "US-RI",
    "South Carolina": "US-SC",
    "South Dakota": "US-SD",
    "Tennessee": "US-TN",
    "Texas": "US-TX",
    "Utah": "US-UT",
    "Vermont": "US-VT",
    "Virginia": "US-VA",
    "Washington": "US-WA",
    "West Virginia": "US-WV",
    "Wisconsin": "US-WI",
    "Wyoming": "US-WY",

    # Germany (DE-XX format)
    "Baden-Württemberg": "DE-BW",
    "Bavaria": "DE-BY",
    "Brandenburg": "DE-BB",
    "Hesse": "DE-HE",
    "Mecklenburg-Western Pomerania": "DE-MV",
    "North-Rhine Westphalia": "DE-NW",
    "Saxony": "DE-SN",
    "Saxony-Anhalt": "DE-ST",
    "Schleswig-Holstein": "DE-SH",
    "Thuringia": "DE-TH",

    # Austria (AT-X format)
    "Burgenland": "AT-1",
    "Carinthia": "AT-2",
    "Lower Austria": "AT-3",
    "Salzburg": "AT-5",
    "Styria": "AT-6",
    "Tyrol": "AT-7",
    "Upper Austria": "AT-4",
    "Vienna": "AT-9",
    "Vorarlberg": "AT-8",

    # Netherlands (NL-XX format)
    "Limburg": "NL-LI",

    # Belgium (BE-XXX format)
    "Brussels": "BE-BRU",
    "Flanders": "BE-VLG",
    "Wallonia": "BE-WAL",

    # Indonesia (ID-XX format)
    "Bali": "ID-BA",

    # Malaysia (MY-XX format)
    "Sabah": "MY-12",

    # Australia (AU-XXX format)
    "Australian Capital Territory": "AU-ACT",
    "New South Wales": "AU-NSW",
    "Northern Territory": "AU-NT",
    "Queensland": "AU-QLD",
    "South Australia": "AU-SA",
    "Tasmania": "AU-TAS",
    "Victoria": "AU-VIC",
    "Western Australia": "AU-WA",

    # Canada (CA-XX format)
    "Alberta": "CA-AB",
    "Manitoba": "CA-MB",
    "New Brunswick": "CA-NB",
    "Newfoundland and Labrador": "CA-NL",
    "Nova Scotia": "CA-NS",
    "Ontario": "CA-ON",
    "Quebec": "CA-QC",
    "Saskatchewan": "CA-SK",

    # Spain (ES-XX format)
    "Andalusia": "ES-AN",
    "Balearic Islands": "ES-IB",
    "Basque Country": "ES-PV",
    "Catalonia": "ES-CT",
    "Galicia": "ES-GA",
    "Madrid": "ES-MD",
    "Valencia": "ES-VC",

    # India (IN-XX format)
    "Assam": "IN-AS",
    "Bihar": "IN-BR",
    "Kerala": "IN-KL",
    "West Bengal": "IN-WB",

    # Japan (JP-XX format)
    "Hokkaido": "JP-01",
    "Hoikkaido": "JP-01",  # Typo in source data (Hokkaido)
    "Kanagawa": "JP-14",
    "Okayama": "JP-33",

    # United Kingdom subdivisions
    "England": "GB-ENG",
    "Scotland": "GB-SCT",
    "Northern Ireland": "GB-NIR",
    "Wales": "GB-WLS",

    # Other countries
    "Canton": "CH-ZH",  # Switzerland (Zürich)
    # NOTE(review): FR-H looks like the pre-2016 region code for Corse; the
    # current ISO 3166-2 list appears to use FR-20R — confirm before changing.
    "Corsica": "FR-H",  # France (Corse)
    "Hong Kong": "HK",  # Special Administrative Region
    "Madeira": "PT-30",  # Portugal
    "Tuscany": "IT-52",  # Italy

    # Special cases
    "Caribbean Netherlands": "BQ",  # Special ISO code
    "Pittsburgh": "US-PA",  # City listed as subregion (should be settlement)
    "Somerset": "GB-SOM",  # UK county

    # Unknown/incomplete mappings (placeholder codes, not official ISO 3166-2)
    "Arua": "UG-ARUA",  # Uganda (district code needed)
    "Nagorno-Karabakh": "AZ-NKR",  # Disputed territory
    "Przysłup": "PL-PRZYS",  # Poland (locality code needed)
}

# Settlement name -> GeoNames ID lookup.
# Format: numeric GeoNames ID (stored as int).
# Names missing from this table are not treated as errors by
# extract_geographic_metadata; it only prints an informational message.
SETTLEMENT_NAME_TO_GEONAMES = {
    "Amsterdam": 2759794,
    "Delft": 2757345,
    "Dresden": 2935022,
    "Ostend": 2789786,
    "Pittsburgh": 5206379,
    "Rio de Janeiro": 3451190,
    "Seattle": 5809844,
    "Warlubie": 3083271,
}


def _as_name_list(raw) -> List:
    """Coerce a raw YAML field value into a list (None -> [], scalar -> [scalar])."""
    if raw is None:
        return []
    if isinstance(raw, (list, tuple)):
        return list(raw)
    # A bare scalar such as `country: Netherlands` must be wrapped; iterating
    # the string directly would yield single characters.
    return [raw]


def _usable_names(raw_names: List) -> List[str]:
    """Return the stripped, non-blank string entries of *raw_names*."""
    return [
        name.strip()
        for name in raw_names
        if isinstance(name, str) and name.strip()
    ]


def extract_geographic_metadata(yaml_path: Path) -> Dict:
    """
    Parse Wikidata hyponyms_curated.yaml and extract geographic metadata.

    Each entry under the top-level ``hypernym`` key is inspected for
    ``country``, ``subregion`` and ``settlement`` fields. A field may be a
    list of names, a single name, or missing/null. Names are stripped of
    surrounding whitespace and blank or non-string values are ignored before
    being looked up in COUNTRY_NAME_TO_ISO, SUBREGION_NAME_TO_ISO and
    SETTLEMENT_NAME_TO_GEONAMES. Progress and warnings are printed to stdout.

    Args:
        yaml_path: Path of the YAML file to read (UTF-8).

    Returns:
        Dict with keys:
        - entities_with_geography: List of dicts, one per entry that mapped to
          at least one code, with keys ``q_number`` (the entry's ``label``
          value, or 'UNKNOWN'), ``countries``, ``subregions``, ``settlements``
          (mapped codes/IDs) and ``raw_country_names``, ``raw_subregion_names``,
          ``raw_settlement_names`` (the field values as lists)
        - countries: Sorted list of distinct country codes
        - subregions: Sorted list of distinct subregion codes
        - settlements: Sorted list of distinct GeoNames IDs
        - unmapped_countries: List of (q_number, name) tuples without a mapping
        - unmapped_subregions: List of (q_number, name) tuples without a mapping

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    print(f"📖 Reading {yaml_path}...")

    with open(yaml_path, 'r', encoding='utf-8') as f:
        # An empty document parses to None; treat it as "no entries".
        data = yaml.safe_load(f) or {}

    entities_with_geography = []
    countries_found = set()
    subregions_found = set()
    settlements_found = set()
    unmapped_countries = []
    unmapped_subregions = []

    # `hypernym:` with no value parses to None, so fall back to an empty list.
    hypernyms = data.get('hypernym') or []
    print(f"📊 Processing {len(hypernyms)} hypernym entries...")

    for item in hypernyms:
        if not isinstance(item, dict):
            print(f"⚠️  Skipping malformed hypernym entry: {item!r}")
            continue

        q_number = item.get('label', 'UNKNOWN')

        raw_country_names = _as_name_list(item.get('country'))
        raw_subregion_names = _as_name_list(item.get('subregion'))
        raw_settlement_names = _as_name_list(item.get('settlement'))

        # Extract country
        country_codes = []
        for country_name in _usable_names(raw_country_names):
            iso_code = COUNTRY_NAME_TO_ISO.get(country_name)
            if iso_code:
                country_codes.append(iso_code)
                countries_found.add(iso_code)
            elif len(country_name) == 1:
                # Single letters are treated as source typos, not as
                # countries that still need a mapping.
                print(f"⚠️  Skipping single-letter country '{country_name}' for {q_number}")
            else:
                unmapped_countries.append((q_number, country_name))
                print(f"⚠️  Unmapped country: '{country_name}' for {q_number}")

        # Extract subregion
        subregion_codes = []
        for subregion_name in _usable_names(raw_subregion_names):
            iso_code = SUBREGION_NAME_TO_ISO.get(subregion_name)
            if iso_code:
                subregion_codes.append(iso_code)
                subregions_found.add(iso_code)
            else:
                unmapped_subregions.append((q_number, subregion_name))
                print(f"⚠️  Unmapped subregion: '{subregion_name}' for {q_number}")

        # Extract settlement
        settlement_ids = []
        for settlement_name in _usable_names(raw_settlement_names):
            geonames_id = SETTLEMENT_NAME_TO_GEONAMES.get(settlement_name)
            if geonames_id:
                settlement_ids.append(geonames_id)
                settlements_found.add(geonames_id)
            else:
                # Settlements without GeoNames IDs are acceptable (can be resolved later)
                print(f"ℹ️  Settlement without GeoNames ID: '{settlement_name}' for {q_number}")

        # Store entity if it has any geographic metadata
        if country_codes or subregion_codes or settlement_ids:
            entities_with_geography.append({
                'q_number': q_number,
                'countries': country_codes,
                'subregions': subregion_codes,
                'settlements': settlement_ids,
                'raw_country_names': raw_country_names,
                'raw_subregion_names': raw_subregion_names,
                'raw_settlement_names': raw_settlement_names,
            })

    print(f"\n✅ Extraction complete!")
    print(f"   - {len(entities_with_geography)} entities with geographic metadata")
    print(f"   - {len(countries_found)} unique country codes")
    print(f"   - {len(subregions_found)} unique subregion codes")
    print(f"   - {len(settlements_found)} unique settlement IDs")
    print(f"   - {len(unmapped_countries)} unmapped country names")
    print(f"   - {len(unmapped_subregions)} unmapped subregion names")

    return {
        'entities_with_geography': entities_with_geography,
        'countries': sorted(countries_found),
        'subregions': sorted(subregions_found),
        'settlements': sorted(settlements_found),
        'unmapped_countries': unmapped_countries,
        'unmapped_subregions': unmapped_subregions,
    }


def generate_feature_type_annotations(
    geographic_data: Dict,
    output_path: Path,
    extraction_date: Optional[str] = None,
):
    """
    Generate dcterms:spatial annotations for FeatureTypeEnum.yaml.

    Creates a YAML snippet that can be manually integrated into
    FeatureTypeEnum. For every entity the first country, subregion and
    settlement code is written under ``dcterms:spatial``, ``iso_3166_2`` and
    ``geonames_id``; when an entity has more than one code of a kind, the
    full list is also written under the matching ``*_all`` key.

    Args:
        geographic_data: Dict whose ``entities_with_geography`` entry is a
            list of dicts with the keys ``q_number``, ``countries``,
            ``subregions``, ``settlements``, ``raw_country_names``,
            ``raw_subregion_names`` and ``raw_settlement_names``.
        output_path: File to write; parent directories are created if needed.
        extraction_date: Date string recorded in the output. Defaults to
            today's date in ISO format, so re-runs are stamped with the day
            they actually ran rather than a fixed date.
    """
    from datetime import date  # local import keeps this block self-contained

    print(f"\n📝 Generating FeatureTypeEnum annotations...")

    if extraction_date is None:
        extraction_date = date.today().isoformat()

    annotations = []

    for entity in geographic_data['entities_with_geography']:
        annotation = {
            'wikidata_id': entity['q_number'],
        }

        # Each kind of code gets a "primary" key holding the first value and,
        # only when there are several values, an "<key>_all" list.
        code_fields = (
            ('dcterms:spatial', entity['countries']),
            ('iso_3166_2', entity['subregions']),
            ('geonames_id', entity['settlements']),
        )
        for key, codes in code_fields:
            if not codes:
                continue
            annotation[key] = codes[0]
            if len(codes) > 1:
                annotation[f'{key}_all'] = codes

        # Add raw names for documentation
        annotation['raw_data'] = {
            'country': entity['raw_country_names'],
            'subregion': entity['raw_subregion_names'],
            'settlement': entity['raw_settlement_names'],
        }

        annotations.append(annotation)

    # Write to output file
    output_data = {
        'description': 'Geographic annotations for FeatureTypeEnum entries',
        'source': 'data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml',
        'extraction_date': extraction_date,
        'annotations': annotations,
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(output_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"✅ Annotations written to {output_path}")
    print(f"   - {len(annotations)} annotated entries")


def main():
    """
    Run the extraction end to end.

    Reads the curated Wikidata YAML under PROJECT_ROOT, writes the
    intermediate mapping file and the FeatureTypeEnum annotation file under
    data/extracted/, then prints a summary including any country or subregion
    names that still lack a mapping.
    """
    print("🌍 Wikidata Geographic Metadata Extraction")
    print("=" * 60)

    # Paths
    wikidata_yaml = PROJECT_ROOT / "data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml"
    output_mapping = PROJECT_ROOT / "data/extracted/wikidata_geography_mapping.yaml"
    output_annotations = PROJECT_ROOT / "data/extracted/feature_type_geographic_annotations.yaml"

    # Extract geographic metadata
    geographic_data = extract_geographic_metadata(wikidata_yaml)

    # Write intermediate mapping file.
    # The unmapped entries are (q_number, name) tuples; the default dumper
    # would serialise them with a Python-specific !!python/tuple tag that
    # yaml.safe_load cannot read back. Convert them to plain lists and use
    # safe_dump so the file stays portable YAML.
    serializable = dict(geographic_data)
    for key in ('unmapped_countries', 'unmapped_subregions'):
        serializable[key] = [list(pair) for pair in geographic_data[key]]

    print(f"\n📝 Writing intermediate mapping to {output_mapping}...")
    output_mapping.parent.mkdir(parents=True, exist_ok=True)
    with open(output_mapping, 'w', encoding='utf-8') as f:
        yaml.safe_dump(serializable, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print(f"✅ Mapping written to {output_mapping}")

    # Generate FeatureTypeEnum annotations
    generate_feature_type_annotations(geographic_data, output_annotations)

    # Summary report
    print("\n" + "=" * 60)
    print("📊 SUMMARY")
    print("=" * 60)
    print(f"Countries mapped: {len(geographic_data['countries'])}")
    print(f"Subregions mapped: {len(geographic_data['subregions'])}")
    print(f"Settlements mapped: {len(geographic_data['settlements'])}")
    print(f"Entities with geography: {len(geographic_data['entities_with_geography'])}")

    # Report each unmapped name once, in a stable order. De-duplicating on the
    # name (not the (q_number, name) pair) avoids printing the same name once
    # per entity that uses it.
    unmapped_country_names = sorted({name for _, name in geographic_data['unmapped_countries']})
    if unmapped_country_names:
        print(f"\n⚠️  UNMAPPED COUNTRIES ({len(unmapped_country_names)}):")
        for country in unmapped_country_names:
            print(f"   - {country}")

    unmapped_subregion_names = sorted({name for _, name in geographic_data['unmapped_subregions']})
    if unmapped_subregion_names:
        print(f"\n⚠️  UNMAPPED SUBREGIONS ({len(unmapped_subregion_names)}):")
        for subregion in unmapped_subregion_names:
            print(f"   - {subregion}")

    print("\n✅ Done! Next steps:")
    print("   1. Review unmapped countries/subregions above")
    print("   2. Update COUNTRY_NAME_TO_ISO / SUBREGION_NAME_TO_ISO dictionaries")
    print("   3. Re-run this script")
    print(f"   4. Integrate {output_annotations} into FeatureTypeEnum.yaml")


if __name__ == '__main__':
    main()