#!/usr/bin/env python3
"""
Extract geographic metadata from Wikidata hyponyms_curated.yaml.

This script:
1. Parses data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml
2. Extracts country, subregion, settlement fields from each hypernym entry
3. Maps human-readable names to ISO codes:
   - Country names → ISO 3166-1 alpha-2 codes (e.g., "Netherlands" → "NL")
   - Subregion names → ISO 3166-2 codes (e.g., "Pennsylvania" → "US-PA")
   - Settlement names → GeoNames IDs (e.g., "Pittsburgh" → 5206379)
4. Generates annotations for FeatureTypeEnum.yaml

Output:
- data/extracted/wikidata_geography_mapping.yaml (intermediate mapping)
- data/extracted/feature_type_geographic_annotations.yaml (for schema integration)

Usage:
    python3 scripts/extract_wikidata_geography.py

Author: OpenCODE AI Assistant
Date: 2025-11-22
"""

import sys
from collections import defaultdict
from datetime import date
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple

import yaml

# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Country name to ISO 3166-1 alpha-2 mapping
# Source: Wikidata, ISO 3166 Maintenance Agency
COUNTRY_NAME_TO_ISO = {
    # Modern countries (alphabetical)
    "Albania": "AL",
    "Argentina": "AR",
    "Armenia": "AM",
    "Aruba": "AW",
    "Australia": "AU",
    "Austria": "AT",
    "Azerbaijan": "AZ",
    "Bangladesh": "BD",
    "Barbados": "BB",
    "Bardbados": "BB",  # Typo in source data
    "Belarus": "BY",
    "Belgium": "BE",
    "Bolivia": "BO",
    "Bosnia and Herzegovina": "BA",
    "Brazil": "BR",
    "Bulgaria": "BG",
    "Cameroon": "CM",
    "Canada": "CA",
    "Chile": "CL",
    "China": "CN",
    "Colombia": "CO",
    "Costa Rica": "CR",
    "Croatia": "HR",
    "Curaçao": "CW",
    "Czech Republic": "CZ",
    "Denmark": "DK",
    "Dominica": "DM",
    "Ecuador": "EC",
    "El Salvador": "SV",
    "England": "GB-ENG",  # ISO 3166-2 for England
    "Estonia": "EE",
    "Finland": "FI",
    "France": "FR",
    "Gabon": "GA",
    "Germany": "DE",
    "Ghana": "GH",
    "Greece": "GR",
    "Guatemala": "GT",
    "Guinea": "GN",
    "Hungary": "HU",
    "Iceland": "IS",
    "India": "IN",
    "Indonesia": "ID",
    "Iran": "IR",
    "Ireland": "IE",
    "Israel": "IL",
    "Italy": "IT",
    "Ivory Coast": "CI",
    "Japan": "JP",
    "Kazakhstan": "KZ",
    "Kenya": "KE",
    "Kosovo": "XK",  # User-assigned code
    "Kyrgyzstan": "KG",
    "Latvia": "LV",
    "Lesotho": "LS",
    "Libya": "LY",
    "Lithuania": "LT",
    "Luxembourg": "LU",
    "Madagascar": "MG",
    "Malaysia": "MY",
    "Mauritius": "MU",
    "Mexico": "MX",
    "Moldova": "MD",
    "Mongolia": "MN",
    "Montenegro": "ME",
    "Morocco": "MA",
    "Mozambique": "MZ",
    "Namibia": "NA",
    "Nepal": "NP",
    "Netherlands": "NL",
    "New Zealand": "NZ",
    "Nicaragua": "NI",
    "Nigeria": "NG",
    "North Korea": "KP",
    "North Macedonia": "MK",
    "Norway": "NO",
    "Norwegian": "NO",  # Language/nationality in source data
    "Oman": "OM",
    "Pakistan": "PK",
    "Panama": "PA",
    "Paraguay": "PY",
    "Peru": "PE",
    "Philippines": "PH",
    "Poland": "PL",
    "Portugal": "PT",
    "Romania": "RO",
    "Russia": "RU",
    "Scotland": "GB-SCT",  # ISO 3166-2 for Scotland
    "Senegal": "SN",
    "Serbia": "RS",
    "Seychelles": "SC",
    "Singapore": "SG",
    "Sint Maarten": "SX",
    "Slovakia": "SK",
    "Slovenia": "SI",
    "Somalia": "SO",
    "South Africa": "ZA",
    "South Korea": "KR",
    "Spain": "ES",
    "Sri Lanka": "LK",
    "Suriname": "SR",
    "Swaziland": "SZ",
    "Sweden": "SE",
    "Switzerland": "CH",
    "Taiwan": "TW",
    "Tanzania": "TZ",
    "Thailand": "TH",
    "Turkiye": "TR",
    "Turkmenistan": "TM",
    "UK": "GB",
    "USA": "US",
    "Uganda": "UG",
    "Ukraine": "UA",
    "Venezuela": "VE",
    "Vietnam": "VN",
    "Yemen": "YE",
    # Historical entities (use modern successor codes or special codes)
    "Byzantine Empire": "HIST-BYZ",  # Historical entity
    "Czechoslovakia": "HIST-CS",  # Dissolved 1993 → CZ + SK
    "Japanese Empire": "HIST-JP",  # Historical Japan
    "Russian Empire": "HIST-RU",  # Historical Russia
    "Soviet Union": "HIST-SU",  # Dissolved 1991
}

# Subregion name to ISO 3166-2 code mapping
# Format: {country_alpha2}-{subdivision_code}
SUBREGION_NAME_TO_ISO = {
    # United States (US-XX format)
    "Alabama": "US-AL",
    "Alaska": "US-AK",
    "Arizona": "US-AZ",
    "Arkansas": "US-AR",
    "California": "US-CA",
    "Colorado": "US-CO",
    "Connecticut": "US-CT",
    "Delaware": "US-DE",
    "Florida": "US-FL",
    "Georgia": "US-GA",
    "Hawaii": "US-HI",
    "Idaho": "US-ID",
    "Illinois": "US-IL",
    "Indiana": "US-IN",
    "Iowa": "US-IA",
    "Kansas": "US-KS",
    "Kentucky": "US-KY",
    "Louisiana": "US-LA",
    "Maine": "US-ME",
    "Maryland": "US-MD",
    "Massachusetts": "US-MA",
    "Michigan": "US-MI",
    "Minnesota": "US-MN",
    "Mississippi": "US-MS",
    "Missouri": "US-MO",
    "Montana": "US-MT",
    "Nebraska": "US-NE",
    "Nevada": "US-NV",
    "New Hampshire": "US-NH",
    "New Jersey": "US-NJ",
    "New Mexico": "US-NM",
    "New York": "US-NY",
    "North Carolina": "US-NC",
    "North Dakota": "US-ND",
    "Ohio": "US-OH",
    "Oklahoma": "US-OK",
    "Oregon": "US-OR",
    "Pennsylvania": "US-PA",
    "Rhode Island": "US-RI",
    "South Carolina": "US-SC",
    "South Dakota": "US-SD",
    "Tennessee": "US-TN",
    "Texas": "US-TX",
    "Utah": "US-UT",
    "Vermont": "US-VT",
    "Virginia": "US-VA",
    "Washington": "US-WA",
    "West Virginia": "US-WV",
    "Wisconsin": "US-WI",
    "Wyoming": "US-WY",
    # Germany (DE-XX format)
    "Baden-Württemberg": "DE-BW",
    "Bavaria": "DE-BY",
    "Brandenburg": "DE-BB",
    "Hesse": "DE-HE",
    "Mecklenburg-Western Pomerania": "DE-MV",
    "North-Rhine Westphalia": "DE-NW",
    "Saxony": "DE-SN",
    "Saxony-Anhalt": "DE-ST",
    "Schleswig-Holstein": "DE-SH",
    "Thuringia": "DE-TH",
    # Austria (AT-X format)
    "Burgenland": "AT-1",
    "Carinthia": "AT-2",
    "Lower Austria": "AT-3",
    "Salzburg": "AT-5",
    "Styria": "AT-6",
    "Tyrol": "AT-7",
    "Upper Austria": "AT-4",
    "Vienna": "AT-9",
    "Vorarlberg": "AT-8",
    # Netherlands (NL-XX format)
    "Limburg": "NL-LI",
    # Belgium (BE-XXX format)
    "Brussels": "BE-BRU",
    "Flanders": "BE-VLG",
    "Wallonia": "BE-WAL",
    # Indonesia (ID-XX format)
    "Bali": "ID-BA",
    "Sabah": "MY-12",  # Malaysia, not Indonesia
    # Australia (AU-XXX format)
    "Australian Capital Territory": "AU-ACT",
    "New South Wales": "AU-NSW",
    "Northern Territory": "AU-NT",
    "Queensland": "AU-QLD",
    "South Australia": "AU-SA",
    "Tasmania": "AU-TAS",
    "Victoria": "AU-VIC",
    "Western Australia": "AU-WA",
    # Canada (CA-XX format)
    "Alberta": "CA-AB",
    "Manitoba": "CA-MB",
    "New Brunswick": "CA-NB",
    "Newfoundland and Labrador": "CA-NL",
    "Nova Scotia": "CA-NS",
    "Ontario": "CA-ON",
    "Quebec": "CA-QC",
    "Saskatchewan": "CA-SK",
    # Spain (ES-XX format)
    "Andalusia": "ES-AN",
    "Balearic Islands": "ES-IB",
    "Basque Country": "ES-PV",
    "Catalonia": "ES-CT",
    "Galicia": "ES-GA",
    "Madrid": "ES-MD",
    "Valencia": "ES-VC",
    # India (IN-XX format)
    "Assam": "IN-AS",
    "Bihar": "IN-BR",
    "Kerala": "IN-KL",
    "West Bengal": "IN-WB",
    # Japan (JP-XX format)
    "Hoikkaido": "JP-01",  # Typo in source data (Hokkaido)
    "Kanagawa": "JP-14",
    "Okayama": "JP-33",
    # United Kingdom subdivisions
    "England": "GB-ENG",
    "Scotland": "GB-SCT",
    "Northern Ireland": "GB-NIR",
    "Wales": "GB-WLS",
    # Other countries
    "Canton": "CH-ZH",  # Switzerland (Zürich)
    "Corsica": "FR-H",  # France (Corse)
    "Hong Kong": "HK",  # Special Administrative Region
    "Madeira": "PT-30",  # Portugal
    "Tuscany": "IT-52",  # Italy
    # Special cases
    "Caribbean Netherlands": "BQ",  # Special ISO code
    "Pittsburgh": "US-PA",  # City listed as subregion (should be settlement)
    "Somerset": "GB-SOM",  # UK county
    # Unknown/incomplete mappings (codes below are stand-ins until the
    # proper subdivision codes are looked up)
    "Arua": "UG-ARUA",  # Uganda (district code needed)
    "Nagorno-Karabakh": "AZ-NKR",  # Disputed territory
    "Przysłup": "PL-PRZYS",  # Poland (locality code needed)
}

# Settlement name to GeoNames ID mapping
# Format: numeric GeoNames ID
SETTLEMENT_NAME_TO_GEONAMES = {
    "Amsterdam": 2759794,
    "Delft": 2757345,
    "Dresden": 2935022,
    "Ostend": 2789786,
    "Pittsburgh": 5206379,
    "Rio de Janeiro": 3451190,
    "Seattle": 5809844,
    "Warlubie": 3083271,
}


def _as_name_list(raw_value: Any) -> List[Any]:
    """Normalize a raw YAML field value to a list of names.

    A missing/null field becomes an empty list and a scalar becomes a
    one-element list. Without this, iterating a plain string would walk
    it character by character and a null value would raise ``TypeError``.
    """
    if raw_value is None:
        return []
    if isinstance(raw_value, (list, tuple)):
        return list(raw_value)
    return [raw_value]


def _is_blank(name: Any) -> bool:
    """Return True for falsy values and whitespace-only strings."""
    return not name or (isinstance(name, str) and not name.strip())


def _resolve_names(
    raw_value: Any, lookup: Dict[str, Any]
) -> Tuple[List[Any], List[Any], List[Any]]:
    """Map the names in one raw field through *lookup*.

    Returns:
        ``(names, codes, unmapped)`` where ``names`` is the normalized raw
        list, ``codes`` holds the lookup results in input order, and
        ``unmapped`` holds the non-blank names that have no entry.
    """
    names = _as_name_list(raw_value)
    codes: List[Any] = []
    unmapped: List[Any] = []
    for name in names:
        if _is_blank(name):
            continue  # Skip empty strings
        # Only strings can be keys of the lookup tables; anything else
        # (e.g. a nested list) is reported as unmapped instead of crashing.
        code = lookup.get(name) if isinstance(name, str) else None
        if code is None:
            unmapped.append(name)
        else:
            codes.append(code)
    return names, codes, unmapped


def extract_geographic_metadata(yaml_path: Path) -> Dict:
    """
    Parse Wikidata hyponyms_curated.yaml and extract geographic metadata.

    Each entry under the top-level ``hypernym`` key is inspected for
    ``country``, ``subregion`` and ``settlement`` fields. Each field may be
    a list, a single scalar, or absent/null. An entry is recorded only if
    at least one of its names could be mapped to a code.

    Args:
        yaml_path: Path of the YAML file to read.

    Returns:
        Dict with keys:
        - entities_with_geography: List of dicts with keys ``q_number``
          (the entry's ``label`` value, or 'UNKNOWN'), ``countries``,
          ``subregions``, ``settlements`` (mapped codes) and
          ``raw_country_names``, ``raw_subregion_names``,
          ``raw_settlement_names`` (normalized raw lists)
        - countries: Sorted list of country codes
        - subregions: Sorted list of ISO 3166-2 codes
        - settlements: Sorted list of GeoNames IDs
        - unmapped_countries: List of (q_number, name) tuples without ISO mapping
        - unmapped_subregions: List of (q_number, name) tuples without ISO mapping

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    print(f"📖 Reading {yaml_path}...")

    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty file loads as None; any non-mapping root has no 'hypernym' key.
    if not isinstance(data, dict):
        if data is not None:
            print(f"⚠️ Expected a mapping at the top level of {yaml_path}, "
                  f"got {type(data).__name__}")
        data = {}

    entities_with_geography: List[Dict[str, Any]] = []
    countries_found: Set[str] = set()
    subregions_found: Set[str] = set()
    settlements_found: Set[int] = set()
    unmapped_countries: List[Tuple[Any, Any]] = []
    unmapped_subregions: List[Tuple[Any, Any]] = []

    # `or []` also covers an explicit `hypernym: null`.
    hypernyms = data.get('hypernym') or []
    print(f"📊 Processing {len(hypernyms)} hypernym entries...")

    for item in hypernyms:
        if not isinstance(item, dict):
            print(f"⚠️ Skipping malformed hypernym entry: {item!r}")
            continue

        q_number = item.get('label', 'UNKNOWN')

        # Extract country
        country_names, country_codes, unknown_countries = _resolve_names(
            item.get('country'), COUNTRY_NAME_TO_ISO
        )
        countries_found.update(country_codes)
        for country_name in unknown_countries:
            # Check if it's a single letter typo
            if isinstance(country_name, str) and len(country_name) == 1:
                print(f"⚠️ Skipping single-letter country '{country_name}' for {q_number}")
                continue
            unmapped_countries.append((q_number, country_name))
            print(f"⚠️ Unmapped country: '{country_name}' for {q_number}")

        # Extract subregion
        subregion_names, subregion_codes, unknown_subregions = _resolve_names(
            item.get('subregion'), SUBREGION_NAME_TO_ISO
        )
        subregions_found.update(subregion_codes)
        for subregion_name in unknown_subregions:
            unmapped_subregions.append((q_number, subregion_name))
            print(f"⚠️ Unmapped subregion: '{subregion_name}' for {q_number}")

        # Extract settlement
        settlement_names, settlement_ids, unknown_settlements = _resolve_names(
            item.get('settlement'), SETTLEMENT_NAME_TO_GEONAMES
        )
        settlements_found.update(settlement_ids)
        for settlement_name in unknown_settlements:
            # Settlements without GeoNames IDs are acceptable (can be resolved later)
            print(f"ℹ️ Settlement without GeoNames ID: '{settlement_name}' for {q_number}")

        # Store entity if it has any geographic metadata
        if country_codes or subregion_codes or settlement_ids:
            entities_with_geography.append({
                'q_number': q_number,
                'countries': country_codes,
                'subregions': subregion_codes,
                'settlements': settlement_ids,
                'raw_country_names': country_names,
                'raw_subregion_names': subregion_names,
                'raw_settlement_names': settlement_names,
            })

    print("\n✅ Extraction complete!")
    print(f" - {len(entities_with_geography)} entities with geographic metadata")
    print(f" - {len(countries_found)} unique country codes")
    print(f" - {len(subregions_found)} unique subregion codes")
    print(f" - {len(settlements_found)} unique settlement IDs")
    print(f" - {len(unmapped_countries)} unmapped country names")
    print(f" - {len(unmapped_subregions)} unmapped subregion names")

    return {
        'entities_with_geography': entities_with_geography,
        'countries': sorted(countries_found),
        'subregions': sorted(subregions_found),
        'settlements': sorted(settlements_found),
        'unmapped_countries': unmapped_countries,
        'unmapped_subregions': unmapped_subregions,
    }


def generate_feature_type_annotations(
    geographic_data: Dict,
    output_path: Path,
    extraction_date: Optional[str] = None,
) -> None:
    """
    Generate dcterms:spatial annotations for FeatureTypeEnum.yaml.

    Creates YAML snippet that can be manually integrated into FeatureTypeEnum.

    Args:
        geographic_data: Result of ``extract_geographic_metadata``; only the
            ``entities_with_geography`` list is read.
        output_path: File to write; parent directories are created.
        extraction_date: Value of the ``extraction_date`` field in the
            output. Defaults to today's date in ISO format, so a re-run
            records when the extraction actually happened.
    """
    print("\n📝 Generating FeatureTypeEnum annotations...")

    if extraction_date is None:
        extraction_date = date.today().isoformat()

    annotations = []

    for entity in geographic_data['entities_with_geography']:
        # Build annotation entry
        annotation: Dict[str, Any] = {
            'wikidata_id': entity['q_number'],
        }

        # For each kind of code the first value is the primary one; the
        # full list is added under '<key>_all' only when there are several.
        code_fields = (
            ('dcterms:spatial', entity['countries']),
            ('iso_3166_2', entity['subregions']),
            ('geonames_id', entity['settlements']),
        )
        for key, codes in code_fields:
            if not codes:
                continue
            annotation[key] = codes[0]
            if len(codes) > 1:
                annotation[f'{key}_all'] = list(codes)

        # Add raw names for documentation
        annotation['raw_data'] = {
            'country': entity['raw_country_names'],
            'subregion': entity['raw_subregion_names'],
            'settlement': entity['raw_settlement_names'],
        }

        annotations.append(annotation)

    # Write to output file
    output_data = {
        'description': 'Geographic annotations for FeatureTypeEnum entries',
        'source': 'data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml',
        'extraction_date': extraction_date,
        'annotations': annotations,
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.safe_dump(output_data, f, default_flow_style=False,
                       allow_unicode=True, sort_keys=False)

    print(f"✅ Annotations written to {output_path}")
    print(f" - {len(annotations)} annotated entries")


def main():
    """Main execution function."""
    print("🌍 Wikidata Geographic Metadata Extraction")
    print("=" * 60)

    # Paths
    wikidata_yaml = PROJECT_ROOT / "data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml"
    output_mapping = PROJECT_ROOT / "data/extracted/wikidata_geography_mapping.yaml"
    output_annotations = PROJECT_ROOT / "data/extracted/feature_type_geographic_annotations.yaml"

    # Extract geographic metadata
    geographic_data = extract_geographic_metadata(wikidata_yaml)

    # Write intermediate mapping file.
    # The (q_number, name) pairs are tuples; convert them to lists so the
    # file contains plain YAML sequences rather than Python-specific tags
    # and can be read back with yaml.safe_load.
    serializable = dict(geographic_data)
    for key in ('unmapped_countries', 'unmapped_subregions'):
        serializable[key] = [list(pair) for pair in geographic_data[key]]

    print(f"\n📝 Writing intermediate mapping to {output_mapping}...")
    output_mapping.parent.mkdir(parents=True, exist_ok=True)
    with open(output_mapping, 'w', encoding='utf-8') as f:
        yaml.safe_dump(serializable, f, default_flow_style=False,
                       allow_unicode=True, sort_keys=False)
    print(f"✅ Mapping written to {output_mapping}")

    # Generate FeatureTypeEnum annotations
    generate_feature_type_annotations(geographic_data, output_annotations)

    # Summary report
    print("\n" + "=" * 60)
    print("📊 SUMMARY")
    print("=" * 60)
    print(f"Countries mapped: {len(geographic_data['countries'])}")
    print(f"Subregions mapped: {len(geographic_data['subregions'])}")
    print(f"Settlements mapped: {len(geographic_data['settlements'])}")
    print(f"Entities with geography: {len(geographic_data['entities_with_geography'])}")

    # Report each unmapped name once, in a stable order, regardless of how
    # many entries it occurred in.
    unique_countries = sorted({str(name) for _, name in geographic_data['unmapped_countries']})
    if unique_countries:
        print(f"\n⚠️ UNMAPPED COUNTRIES ({len(unique_countries)}):")
        for country in unique_countries:
            print(f" - {country}")

    unique_subregions = sorted({str(name) for _, name in geographic_data['unmapped_subregions']})
    if unique_subregions:
        print(f"\n⚠️ UNMAPPED SUBREGIONS ({len(unique_subregions)}):")
        for subregion in unique_subregions:
            print(f" - {subregion}")

    print("\n✅ Done! Next steps:")
    print(" 1. Review unmapped countries/subregions above")
    print(" 2. Update COUNTRY_NAME_TO_ISO / SUBREGION_NAME_TO_ISO dictionaries")
    print(" 3. Re-run this script")
    print(f" 4. Integrate {output_annotations} into FeatureTypeEnum.yaml")


if __name__ == '__main__':
    main()