glam/scripts/extract_wikidata_geography.py
kempersc 67657c39b6 feat: Complete Country Class Implementation and Hypernyms Removal
- Created the Country class with ISO 3166-1 alpha-2 and alpha-3 codes, ensuring minimal design without additional metadata.
- Integrated the Country class into CustodianPlace and LegalForm schemas to support country-specific feature types and legal forms.
- Removed duplicate keys in FeatureTypeEnum.yaml, resulting in 294 unique feature types.
- Eliminated "Hypernyms:" text from FeatureTypeEnum descriptions, verifying that semantic relationships are now conveyed through ontology mappings.
- Created example instance file demonstrating integration of Country with CustodianPlace and LegalForm.
- Updated documentation to reflect the completion of the Country class implementation and hypernyms removal.
2025-11-23 13:09:38 +01:00

557 lines
18 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Extract geographic metadata from Wikidata hyponyms_curated.yaml.
This script:
1. Parses data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml
2. Extracts country, subregion, settlement fields from each hypernym entry
3. Maps human-readable names to ISO codes:
- Country names → ISO 3166-1 alpha-2 codes (e.g., "Netherlands" → "NL")
- Subregion names → ISO 3166-2 codes (e.g., "Pennsylvania" → "US-PA")
- Settlement names → GeoNames IDs (e.g., "Pittsburgh" → 5206379)
4. Generates annotations for FeatureTypeEnum.yaml
Output:
- data/extracted/wikidata_geography_mapping.yaml (intermediate mapping)
- data/extracted/feature_type_geographic_annotations.yaml (for schema integration)
Usage:
python3 scripts/extract_wikidata_geography.py
Author: OpenCODE AI Assistant
Date: 2025-11-22
"""
import yaml
import sys
from pathlib import Path
from typing import Dict, List, Set, Optional
from collections import defaultdict
# Project root = two directory levels above this file (the parent of scripts/).
# resolve() makes the path absolute first: when __file__ is a bare relative
# filename (e.g. the script is started from inside scripts/ on an interpreter
# that does not absolutise __file__), Path("x.py").parent.parent collapses to
# "." and would silently point at the wrong directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Add project root to the front of the module search path.
sys.path.insert(0, str(PROJECT_ROOT))
# Country name (exactly as spelled in the source data) -> country code.
# Most values are ISO 3166-1 alpha-2 codes. Exceptions, all marked inline:
#   - "England" / "Scotland" map to ISO 3166-2 subdivision codes (GB-ENG, GB-SCT)
#   - "Kosovo" uses the user-assigned code XK
#   - historical entities use non-ISO "HIST-*" placeholder codes
# Lookups are exact dict lookups, so known misspellings and variants found in
# the source data are listed as their own keys.
# Source: Wikidata, ISO 3166 Maintenance Agency
COUNTRY_NAME_TO_ISO = {
    # Modern countries (alphabetical)
    "Albania": "AL",
    "Argentina": "AR",
    "Armenia": "AM",
    "Aruba": "AW",
    "Australia": "AU",
    "Austria": "AT",
    "Azerbaijan": "AZ",
    "Bangladesh": "BD",
    "Barbados": "BB",
    "Bardbados": "BB",  # Typo in source data
    "Belarus": "BY",
    "Belgium": "BE",
    "Bolivia": "BO",
    "Bosnia and Herzegovina": "BA",
    "Brazil": "BR",
    "Bulgaria": "BG",
    "Cameroon": "CM",
    "Canada": "CA",
    "Chile": "CL",
    "China": "CN",
    "Colombia": "CO",
    "Costa Rica": "CR",
    "Croatia": "HR",
    "Curaçao": "CW",
    "Czech Republic": "CZ",
    "Denmark": "DK",
    "Dominica": "DM",
    "Ecuador": "EC",
    "El Salvador": "SV",
    "England": "GB-ENG",  # ISO 3166-2 for England
    "Estonia": "EE",
    "Finland": "FI",
    "France": "FR",
    "Gabon": "GA",
    "Germany": "DE",
    "Ghana": "GH",
    "Greece": "GR",
    "Guatemala": "GT",
    "Guinea": "GN",
    "Hungary": "HU",
    "Iceland": "IS",
    "India": "IN",
    "Indonesia": "ID",
    "Iran": "IR",
    "Ireland": "IE",
    "Israel": "IL",
    "Italy": "IT",
    "Ivory Coast": "CI",
    "Japan": "JP",
    "Kazakhstan": "KZ",
    "Kenya": "KE",
    "Kosovo": "XK",  # User-assigned code
    "Kyrgyzstan": "KG",
    "Latvia": "LV",
    "Lesotho": "LS",
    "Libya": "LY",
    "Lithuania": "LT",
    "Luxembourg": "LU",
    "Madagascar": "MG",
    "Malaysia": "MY",
    "Mauritius": "MU",
    "Mexico": "MX",
    "Moldova": "MD",
    "Mongolia": "MN",
    "Montenegro": "ME",
    "Morocco": "MA",
    "Mozambique": "MZ",
    "Namibia": "NA",
    "Nepal": "NP",
    "Netherlands": "NL",
    "New Zealand": "NZ",
    "Nicaragua": "NI",
    "Nigeria": "NG",
    "North Korea": "KP",
    "North Macedonia": "MK",
    "Norway": "NO",
    "Norwegian": "NO",  # Language/nationality in source data
    "Oman": "OM",
    "Pakistan": "PK",
    "Panama": "PA",
    "Paraguay": "PY",
    "Peru": "PE",
    "Philippines": "PH",
    "Poland": "PL",
    "Portugal": "PT",
    "Romania": "RO",
    "Russia": "RU",
    "Scotland": "GB-SCT",  # ISO 3166-2 for Scotland
    "Senegal": "SN",
    "Serbia": "RS",
    "Seychelles": "SC",
    "Singapore": "SG",
    "Sint Maarten": "SX",
    "Slovakia": "SK",
    "Slovenia": "SI",
    "Somalia": "SO",
    "South Africa": "ZA",
    "South Korea": "KR",
    "Spain": "ES",
    "Sri Lanka": "LK",
    "Suriname": "SR",
    "Swaziland": "SZ",
    "Sweden": "SE",
    "Switzerland": "CH",
    "Taiwan": "TW",
    "Tanzania": "TZ",
    "Thailand": "TH",
    "Turkiye": "TR",
    "Turkmenistan": "TM",
    "UK": "GB",
    "USA": "US",
    "Uganda": "UG",
    "Ukraine": "UA",
    "Venezuela": "VE",
    "Vietnam": "VN",
    "Yemen": "YE",
    # Historical entities (use modern successor codes or special codes)
    "Byzantine Empire": "HIST-BYZ",  # Historical entity
    "Czechoslovakia": "HIST-CS",  # Dissolved 1993 → CZ + SK
    "Japanese Empire": "HIST-JP",  # Historical Japan
    "Russian Empire": "HIST-RU",  # Historical Russia
    "Soviet Union": "HIST-SU",  # Dissolved 1991
}
# Subregion name (exactly as spelled in the source data) -> ISO 3166-2 code.
# Format: {country_alpha2}-{subdivision_code}
# Keys are bare names with no country qualifier, so a name shared by several
# countries can only map to one of them. A few values at the end of the table
# are not subdivision codes at all (see the inline notes).
SUBREGION_NAME_TO_ISO = {
    # United States (US-XX format)
    "Alabama": "US-AL",
    "Alaska": "US-AK",
    "Arizona": "US-AZ",
    "Arkansas": "US-AR",
    "California": "US-CA",
    "Colorado": "US-CO",
    "Connecticut": "US-CT",
    "Delaware": "US-DE",
    "Florida": "US-FL",
    "Georgia": "US-GA",
    "Hawaii": "US-HI",
    "Idaho": "US-ID",
    "Illinois": "US-IL",
    "Indiana": "US-IN",
    "Iowa": "US-IA",
    "Kansas": "US-KS",
    "Kentucky": "US-KY",
    "Louisiana": "US-LA",
    "Maine": "US-ME",
    "Maryland": "US-MD",
    "Massachusetts": "US-MA",
    "Michigan": "US-MI",
    "Minnesota": "US-MN",
    "Mississippi": "US-MS",
    "Missouri": "US-MO",
    "Montana": "US-MT",
    "Nebraska": "US-NE",
    "Nevada": "US-NV",
    "New Hampshire": "US-NH",
    "New Jersey": "US-NJ",
    "New Mexico": "US-NM",
    "New York": "US-NY",
    "North Carolina": "US-NC",
    "North Dakota": "US-ND",
    "Ohio": "US-OH",
    "Oklahoma": "US-OK",
    "Oregon": "US-OR",
    "Pennsylvania": "US-PA",
    "Rhode Island": "US-RI",
    "South Carolina": "US-SC",
    "South Dakota": "US-SD",
    "Tennessee": "US-TN",
    "Texas": "US-TX",
    "Utah": "US-UT",
    "Vermont": "US-VT",
    "Virginia": "US-VA",
    "Washington": "US-WA",
    "West Virginia": "US-WV",
    "Wisconsin": "US-WI",
    "Wyoming": "US-WY",
    # Germany (DE-XX format)
    "Baden-Württemberg": "DE-BW",
    "Bavaria": "DE-BY",
    "Brandenburg": "DE-BB",
    "Hesse": "DE-HE",
    "Mecklenburg-Western Pomerania": "DE-MV",
    "North-Rhine Westphalia": "DE-NW",
    "Saxony": "DE-SN",
    "Saxony-Anhalt": "DE-ST",
    "Schleswig-Holstein": "DE-SH",
    "Thuringia": "DE-TH",
    # Austria (AT-X format)
    "Burgenland": "AT-1",
    "Carinthia": "AT-2",
    "Lower Austria": "AT-3",
    "Salzburg": "AT-5",
    "Styria": "AT-6",
    "Tyrol": "AT-7",
    "Upper Austria": "AT-4",
    "Vienna": "AT-9",
    "Vorarlberg": "AT-8",
    # Netherlands (NL-XX format)
    "Limburg": "NL-LI",
    # Belgium (BE-XXX format)
    "Brussels": "BE-BRU",
    "Flanders": "BE-VLG",
    "Wallonia": "BE-WAL",
    # Indonesia (ID-XX format)
    "Bali": "ID-BA",
    # Malaysia (MY-XX format)
    "Sabah": "MY-12",  # Malaysia, not Indonesia
    # Australia (AU-XXX format)
    "Australian Capital Territory": "AU-ACT",
    "New South Wales": "AU-NSW",
    "Northern Territory": "AU-NT",
    "Queensland": "AU-QLD",
    "South Australia": "AU-SA",
    "Tasmania": "AU-TAS",
    "Victoria": "AU-VIC",
    "Western Australia": "AU-WA",
    # Canada (CA-XX format)
    "Alberta": "CA-AB",
    "Manitoba": "CA-MB",
    "New Brunswick": "CA-NB",
    "Newfoundland and Labrador": "CA-NL",
    "Nova Scotia": "CA-NS",
    "Ontario": "CA-ON",
    "Quebec": "CA-QC",
    "Saskatchewan": "CA-SK",
    # Spain (ES-XX format)
    "Andalusia": "ES-AN",
    "Balearic Islands": "ES-IB",
    "Basque Country": "ES-PV",
    "Catalonia": "ES-CT",
    "Galicia": "ES-GA",
    "Madrid": "ES-MD",
    "Valencia": "ES-VC",
    # India (IN-XX format)
    "Assam": "IN-AS",
    "Bihar": "IN-BR",
    "Kerala": "IN-KL",
    "West Bengal": "IN-WB",
    # Japan (JP-XX format)
    "Hoikkaido": "JP-01",  # Typo in source data (Hokkaido)
    "Kanagawa": "JP-14",
    "Okayama": "JP-33",
    # United Kingdom subdivisions
    "England": "GB-ENG",
    "Scotland": "GB-SCT",
    "Northern Ireland": "GB-NIR",
    "Wales": "GB-WLS",
    # Other countries
    # NOTE(review): "Canton" is a generic term rather than a place name;
    # confirm that this source value really refers to Zürich.
    "Canton": "CH-ZH",  # Switzerland (Zürich)
    # NOTE(review): FR-H looks like the older single-letter region code for
    # Corse; verify against the current ISO 3166-2:FR list before relying on it.
    "Corsica": "FR-H",  # France (Corse)
    "Hong Kong": "HK",  # Special Administrative Region (bare two-letter code, no subdivision part)
    "Madeira": "PT-30",  # Portugal
    "Tuscany": "IT-52",  # Italy
    # Special cases
    "Caribbean Netherlands": "BQ",  # Special ISO code (bare two-letter code, no subdivision part)
    "Pittsburgh": "US-PA",  # City listed as subregion (should be settlement)
    "Somerset": "GB-SOM",  # UK county
    # Unknown/incomplete mappings
    # NOTE(review): per the inline notes these values are stand-ins that still
    # need real codes; presumably they are not assigned ISO 3166-2 codes.
    "Arua": "UG-ARUA",  # Uganda (district code needed)
    "Nagorno-Karabakh": "AZ-NKR",  # Disputed territory
    "Przysłup": "PL-PRZYS",  # Poland (locality code needed)
}
# Settlement name (exactly as spelled in the source data) -> GeoNames ID.
# Format: numeric GeoNames ID (stored as int, not str).
# Settlements missing from this table are only reported on stdout by
# extract_geographic_metadata(); they are not recorded as unmapped.
SETTLEMENT_NAME_TO_GEONAMES = {
    "Amsterdam": 2759794,
    "Delft": 2757345,
    "Dresden": 2935022,
    "Ostend": 2789786,
    "Pittsburgh": 5206379,
    "Rio de Janeiro": 3451190,
    "Seattle": 5809844,
    "Warlubie": 3083271,
}
def _as_name_list(value) -> List:
    """Normalise a raw geography field of a YAML entry to a list of names.

    A missing/null field becomes an empty list and a scalar (e.g. a bare
    string) becomes a one-element list, so callers never fail on None and
    never iterate a string character by character.
    """
    if value is None:
        return []
    if isinstance(value, (list, tuple, set)):
        return list(value)
    return [value]


def _is_blank_name(name) -> bool:
    """Return True for None, empty, or whitespace-only names."""
    return not name or (isinstance(name, str) and not name.strip())


def extract_geographic_metadata(yaml_path: Path) -> Dict:
    """
    Parse Wikidata hyponyms_curated.yaml and extract geographic metadata.

    Reads the list stored under the top-level 'hypernym' key. For every entry
    the 'country', 'subregion' and 'settlement' fields are translated through
    COUNTRY_NAME_TO_ISO, SUBREGION_NAME_TO_ISO and SETTLEMENT_NAME_TO_GEONAMES.
    The entry's 'label' field is used as its identifier ('UNKNOWN' if absent).
    Progress and warnings are printed to stdout.

    Args:
        yaml_path: Path of the YAML file to read.

    Returns:
        Dict with keys:
        - entities_with_geography: list of dicts, one per entry for which at
          least one code/ID was resolved, with keys 'q_number', 'countries',
          'subregions', 'settlements', 'raw_country_names',
          'raw_subregion_names', 'raw_settlement_names'
        - countries: sorted list of country codes found
        - subregions: sorted list of ISO 3166-2 codes found
        - settlements: sorted list of GeoNames IDs found
        - unmapped_countries: list of (identifier, country name) tuples
        - unmapped_subregions: list of (identifier, subregion name) tuples

    Raises:
        OSError: if yaml_path cannot be opened.
    """
    print(f"📖 Reading {yaml_path}...")
    with open(yaml_path, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None; treat it as "no entries".
        data = yaml.safe_load(f) or {}

    entities_with_geography = []
    countries_found = set()
    subregions_found = set()
    settlements_found = set()
    unmapped_countries = []
    unmapped_subregions = []

    # `hypernym: null` yields None rather than the .get() default.
    hypernyms = data.get('hypernym') or []
    print(f"📊 Processing {len(hypernyms)} hypernym entries...")

    for item in hypernyms:
        if not isinstance(item, dict):
            print(f"⚠️ Skipping malformed hypernym entry: {item!r}")
            continue
        q_number = item.get('label', 'UNKNOWN')

        # Extract country
        country_names = _as_name_list(item.get('country'))
        country_codes = []
        for country_name in country_names:
            if _is_blank_name(country_name):
                continue  # Skip empty strings
            iso_code = COUNTRY_NAME_TO_ISO.get(country_name)
            if iso_code:
                country_codes.append(iso_code)
                countries_found.add(iso_code)
                continue
            # Single letters are treated as data-entry noise, not as countries.
            if isinstance(country_name, str) and len(country_name) == 1:
                print(f"⚠️ Skipping single-letter country '{country_name}' for {q_number}")
                continue
            unmapped_countries.append((q_number, country_name))
            print(f"⚠️ Unmapped country: '{country_name}' for {q_number}")

        # Extract subregion
        subregion_names = _as_name_list(item.get('subregion'))
        subregion_codes = []
        for subregion_name in subregion_names:
            if _is_blank_name(subregion_name):
                continue
            iso_code = SUBREGION_NAME_TO_ISO.get(subregion_name)
            if iso_code:
                subregion_codes.append(iso_code)
                subregions_found.add(iso_code)
            else:
                unmapped_subregions.append((q_number, subregion_name))
                print(f"⚠️ Unmapped subregion: '{subregion_name}' for {q_number}")

        # Extract settlement
        settlement_names = _as_name_list(item.get('settlement'))
        settlement_ids = []
        for settlement_name in settlement_names:
            if _is_blank_name(settlement_name):
                continue
            geonames_id = SETTLEMENT_NAME_TO_GEONAMES.get(settlement_name)
            if geonames_id:
                settlement_ids.append(geonames_id)
                settlements_found.add(geonames_id)
            else:
                # Settlements without GeoNames IDs are acceptable (can be resolved later)
                print(f" Settlement without GeoNames ID: '{settlement_name}' for {q_number}")

        # Store entity if it has any geographic metadata
        if country_codes or subregion_codes or settlement_ids:
            entities_with_geography.append({
                'q_number': q_number,
                'countries': country_codes,
                'subregions': subregion_codes,
                'settlements': settlement_ids,
                'raw_country_names': country_names,
                'raw_subregion_names': subregion_names,
                'raw_settlement_names': settlement_names,
            })

    print("\n✅ Extraction complete!")
    print(f" - {len(entities_with_geography)} entities with geographic metadata")
    print(f" - {len(countries_found)} unique country codes")
    print(f" - {len(subregions_found)} unique subregion codes")
    print(f" - {len(settlements_found)} unique settlement IDs")
    print(f" - {len(unmapped_countries)} unmapped country names")
    print(f" - {len(unmapped_subregions)} unmapped subregion names")

    return {
        'entities_with_geography': entities_with_geography,
        'countries': sorted(countries_found),
        'subregions': sorted(subregions_found),
        'settlements': sorted(settlements_found),
        'unmapped_countries': unmapped_countries,
        'unmapped_subregions': unmapped_subregions,
    }
def generate_feature_type_annotations(
    geographic_data: Dict,
    output_path: Path,
    extraction_date: Optional[str] = None,
):
    """
    Generate dcterms:spatial annotations for FeatureTypeEnum.yaml.

    Creates YAML snippet that can be manually integrated into FeatureTypeEnum.

    Args:
        geographic_data: Dict with an 'entities_with_geography' list whose
            items carry 'q_number', 'countries', 'subregions', 'settlements'
            and the three 'raw_*_names' keys.
        output_path: File to write; missing parent directories are created.
        extraction_date: Value stored in the output's 'extraction_date'
            field. Defaults to today's date in ISO format, so a regenerated
            file no longer carries a stale hard-coded date.
    """
    # Local import keeps the file's top-level import block untouched.
    from datetime import date

    print("\n📝 Generating FeatureTypeEnum annotations...")
    if extraction_date is None:
        extraction_date = date.today().isoformat()

    annotations = []
    for entity in geographic_data['entities_with_geography']:
        # Build annotation entry
        annotation = {
            'wikidata_id': entity['q_number'],
        }
        # For each kind of code: the first value is the primary one; the full
        # list is added under the "*_all" key only when there is more than one.
        # The tuple order fixes the key order in the emitted YAML.
        code_fields = (
            ('dcterms:spatial', 'dcterms:spatial_all', entity['countries']),
            ('iso_3166_2', 'iso_3166_2_all', entity['subregions']),
            ('geonames_id', 'geonames_id_all', entity['settlements']),
        )
        for primary_key, all_key, codes in code_fields:
            if not codes:
                continue
            annotation[primary_key] = codes[0]
            if len(codes) > 1:
                annotation[all_key] = codes
        # Add raw names for documentation
        annotation['raw_data'] = {
            'country': entity['raw_country_names'],
            'subregion': entity['raw_subregion_names'],
            'settlement': entity['raw_settlement_names'],
        }
        annotations.append(annotation)

    # Write to output file
    output_data = {
        'description': 'Geographic annotations for FeatureTypeEnum entries',
        'source': 'data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml',
        'extraction_date': extraction_date,
        'annotations': annotations,
    }
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(output_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"✅ Annotations written to {output_path}")
    print(f" - {len(annotations)} annotated entries")
def main():
    """Run the extraction, write both output files and print a summary.

    Reads the curated hyponyms YAML under PROJECT_ROOT, writes the
    intermediate mapping and the FeatureTypeEnum annotation file under
    data/extracted/, then lists the names that could not be mapped.
    """
    print("🌍 Wikidata Geographic Metadata Extraction")
    print("=" * 60)

    # Paths
    wikidata_yaml = PROJECT_ROOT / "data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml"
    output_mapping = PROJECT_ROOT / "data/extracted/wikidata_geography_mapping.yaml"
    output_annotations = PROJECT_ROOT / "data/extracted/feature_type_geographic_annotations.yaml"

    # Extract geographic metadata
    geographic_data = extract_geographic_metadata(wikidata_yaml)

    # The unmapped lists hold (identifier, name) tuples. yaml.dump() writes
    # tuples with a Python-specific `!!python/tuple` tag that yaml.safe_load()
    # and non-Python tools cannot read, so dump a copy that uses plain lists.
    serializable = dict(geographic_data)
    for key in ('unmapped_countries', 'unmapped_subregions'):
        serializable[key] = [list(pair) for pair in geographic_data[key]]

    # Write intermediate mapping file
    print(f"\n📝 Writing intermediate mapping to {output_mapping}...")
    output_mapping.parent.mkdir(parents=True, exist_ok=True)
    with open(output_mapping, 'w', encoding='utf-8') as f:
        yaml.dump(serializable, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print(f"✅ Mapping written to {output_mapping}")

    # Generate FeatureTypeEnum annotations
    generate_feature_type_annotations(geographic_data, output_annotations)

    # Summary report
    print("\n" + "=" * 60)
    print("📊 SUMMARY")
    print("=" * 60)
    print(f"Countries mapped: {len(geographic_data['countries'])}")
    print(f"Subregions mapped: {len(geographic_data['subregions'])}")
    print(f"Settlements mapped: {len(geographic_data['settlements'])}")
    print(f"Entities with geography: {len(geographic_data['entities_with_geography'])}")

    # List each unmapped name once, in a stable order: the raw lists contain
    # one pair per entity, so the same name can occur many times.
    unmapped_country_names = sorted(
        {name for _, name in geographic_data['unmapped_countries']}, key=str
    )
    if unmapped_country_names:
        print(f"\n⚠️ UNMAPPED COUNTRIES ({len(unmapped_country_names)}):")
        for country in unmapped_country_names:
            print(f" - {country}")

    unmapped_subregion_names = sorted(
        {name for _, name in geographic_data['unmapped_subregions']}, key=str
    )
    if unmapped_subregion_names:
        print(f"\n⚠️ UNMAPPED SUBREGIONS ({len(unmapped_subregion_names)}):")
        for subregion in unmapped_subregion_names:
            print(f" - {subregion}")

    print("\n✅ Done! Next steps:")
    print(" 1. Review unmapped countries/subregions above")
    print(" 2. Update COUNTRY_NAME_TO_ISO / SUBREGION_NAME_TO_ISO dictionaries")
    print(" 3. Re-run this script")
    print(f" 4. Integrate {output_annotations} into FeatureTypeEnum.yaml")


if __name__ == '__main__':
    main()