- Created the Country class with ISO 3166-1 alpha-2 and alpha-3 codes, ensuring minimal design without additional metadata. - Integrated the Country class into CustodianPlace and LegalForm schemas to support country-specific feature types and legal forms. - Removed duplicate keys in FeatureTypeEnum.yaml, resulting in 294 unique feature types. - Eliminated "Hypernyms:" text from FeatureTypeEnum descriptions, verifying that semantic relationships are now conveyed through ontology mappings. - Created example instance file demonstrating integration of Country with CustodianPlace and LegalForm. - Updated documentation to reflect the completion of the Country class implementation and hypernyms removal.
557 lines
18 KiB
Python
557 lines
18 KiB
Python
#!/usr/bin/env python3
"""
Extract geographic metadata from Wikidata hyponyms_curated.yaml.

This script:
1. Parses data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml
2. Extracts the country, subregion, and settlement fields from each hypernym entry
3. Maps human-readable names to standard identifiers:
   - Country names → ISO 3166-1 alpha-2 codes (e.g., "Netherlands" → "NL")
   - Subregion names → ISO 3166-2 codes (e.g., "Pennsylvania" → "US-PA")
   - Settlement names → GeoNames IDs (e.g., "Pittsburgh" → 5206379)
4. Generates annotations for FeatureTypeEnum.yaml

Output:
    - data/extracted/wikidata_geography_mapping.yaml (intermediate mapping)
    - data/extracted/feature_type_geographic_annotations.yaml (for schema integration)

Usage:
    python3 scripts/extract_wikidata_geography.py

Author: OpenCODE AI Assistant
Date: 2025-11-22
"""
|
||
|
||
import yaml
|
||
import sys
|
||
from pathlib import Path
|
||
from typing import Dict, List, Set, Optional
|
||
from collections import defaultdict
|
||
|
||
# Add the project root (the directory above this script's directory) to sys.path.
# Resolve to an absolute path first: when __file__ is a bare relative filename
# (e.g. the script is started from inside its own directory on Python < 3.9),
# Path("x.py").parent.parent collapses to "." and would point at the script
# directory instead of the project root.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
|
||
|
||
# Country name to country-code mapping.
# Source: Wikidata, ISO 3166 Maintenance Agency
#
# Most values are ISO 3166-1 alpha-2 codes, but the table is not purely alpha-2:
#   - "England" / "Scotland" map to ISO 3166-2 subdivision codes (GB-ENG, GB-SCT)
#   - "Kosovo" uses the user-assigned code XK
#   - historical entities use project-specific "HIST-*" pseudo-codes
# Keys are matched exactly (case-sensitive), so misspellings that occur in the
# source data are listed alongside the correct spelling.
COUNTRY_NAME_TO_ISO: Dict[str, str] = {
    # Modern countries (alphabetical)
    "Albania": "AL",
    "Argentina": "AR",
    "Armenia": "AM",
    "Aruba": "AW",
    "Australia": "AU",
    "Austria": "AT",
    "Azerbaijan": "AZ",
    "Bangladesh": "BD",
    "Barbados": "BB",
    "Bardbados": "BB",  # Typo in source data
    "Belarus": "BY",
    "Belgium": "BE",
    "Bolivia": "BO",
    "Bosnia and Herzegovina": "BA",
    "Brazil": "BR",
    "Bulgaria": "BG",
    "Cameroon": "CM",
    "Canada": "CA",
    "Chile": "CL",
    "China": "CN",
    "Colombia": "CO",
    "Costa Rica": "CR",
    "Croatia": "HR",
    "Curaçao": "CW",
    "Czech Republic": "CZ",
    "Denmark": "DK",
    "Dominica": "DM",
    "Ecuador": "EC",
    "El Salvador": "SV",
    "England": "GB-ENG",  # ISO 3166-2 for England
    "Estonia": "EE",
    "Finland": "FI",
    "France": "FR",
    "Gabon": "GA",
    "Germany": "DE",
    "Ghana": "GH",
    "Greece": "GR",
    "Guatemala": "GT",
    "Guinea": "GN",
    "Hungary": "HU",
    "Iceland": "IS",
    "India": "IN",
    "Indonesia": "ID",
    "Iran": "IR",
    "Ireland": "IE",
    "Israel": "IL",
    "Italy": "IT",
    "Ivory Coast": "CI",
    "Japan": "JP",
    "Kazakhstan": "KZ",
    "Kenya": "KE",
    "Kosovo": "XK",  # User-assigned code
    "Kyrgyzstan": "KG",
    "Latvia": "LV",
    "Lesotho": "LS",
    "Libya": "LY",
    "Lithuania": "LT",
    "Luxembourg": "LU",
    "Madagascar": "MG",
    "Malaysia": "MY",
    "Mauritius": "MU",
    "Mexico": "MX",
    "Moldova": "MD",
    "Mongolia": "MN",
    "Montenegro": "ME",
    "Morocco": "MA",
    "Mozambique": "MZ",
    "Namibia": "NA",
    "Nepal": "NP",
    "Netherlands": "NL",
    "New Zealand": "NZ",
    "Nicaragua": "NI",
    "Nigeria": "NG",
    "North Korea": "KP",
    "North Macedonia": "MK",
    "Norway": "NO",
    "Norwegian": "NO",  # Language/nationality in source data
    "Oman": "OM",
    "Pakistan": "PK",
    "Panama": "PA",
    "Paraguay": "PY",
    "Peru": "PE",
    "Philippines": "PH",
    "Poland": "PL",
    "Portugal": "PT",
    "Romania": "RO",
    "Russia": "RU",
    "Scotland": "GB-SCT",  # ISO 3166-2 for Scotland
    "Senegal": "SN",
    "Serbia": "RS",
    "Seychelles": "SC",
    "Singapore": "SG",
    "Sint Maarten": "SX",
    "Slovakia": "SK",
    "Slovenia": "SI",
    "Somalia": "SO",
    "South Africa": "ZA",
    "South Korea": "KR",
    "Spain": "ES",
    "Sri Lanka": "LK",
    "Suriname": "SR",
    "Swaziland": "SZ",
    "Sweden": "SE",
    "Switzerland": "CH",
    "Taiwan": "TW",
    "Tanzania": "TZ",
    "Thailand": "TH",
    "Turkiye": "TR",
    "Turkmenistan": "TM",
    "UK": "GB",
    "USA": "US",
    "Uganda": "UG",
    "Ukraine": "UA",
    "Venezuela": "VE",
    "Vietnam": "VN",
    "Yemen": "YE",

    # Canonical / alternative spellings of countries that are otherwise only
    # keyed under an abbreviation or a single variant above, so the lookup
    # keeps working if the source data is normalized.
    "Czechia": "CZ",
    "Eswatini": "SZ",
    "Turkey": "TR",
    "Türkiye": "TR",
    "United Kingdom": "GB",
    "United States": "US",

    # Historical entities (project-specific pseudo-codes, not ISO codes)
    "Byzantine Empire": "HIST-BYZ",  # Historical entity
    "Czechoslovakia": "HIST-CS",  # Dissolved 1993 → CZ + SK
    "Japanese Empire": "HIST-JP",  # Historical Japan
    "Russian Empire": "HIST-RU",  # Historical Russia
    "Soviet Union": "HIST-SU",  # Dissolved 1991
}
|
||
|
||
# Subregion name to ISO 3166-2 code mapping
# Format: {country_alpha2}-{subdivision_code}
#
# Keys are matched exactly (case-sensitive). A few values are NOT valid
# ISO 3166-2 codes: bare alpha-2 codes ("HK", "BQ") and the placeholder codes
# in the "Unknown/incomplete mappings" section at the bottom.
SUBREGION_NAME_TO_ISO: Dict[str, str] = {
    # United States (US-XX format)
    "Alabama": "US-AL",
    "Alaska": "US-AK",
    "Arizona": "US-AZ",
    "Arkansas": "US-AR",
    "California": "US-CA",
    "Colorado": "US-CO",
    "Connecticut": "US-CT",
    "Delaware": "US-DE",
    "Florida": "US-FL",
    "Georgia": "US-GA",
    "Hawaii": "US-HI",
    "Idaho": "US-ID",
    "Illinois": "US-IL",
    "Indiana": "US-IN",
    "Iowa": "US-IA",
    "Kansas": "US-KS",
    "Kentucky": "US-KY",
    "Louisiana": "US-LA",
    "Maine": "US-ME",
    "Maryland": "US-MD",
    "Massachusetts": "US-MA",
    "Michigan": "US-MI",
    "Minnesota": "US-MN",
    "Mississippi": "US-MS",
    "Missouri": "US-MO",
    "Montana": "US-MT",
    "Nebraska": "US-NE",
    "Nevada": "US-NV",
    "New Hampshire": "US-NH",
    "New Jersey": "US-NJ",
    "New Mexico": "US-NM",
    "New York": "US-NY",
    "North Carolina": "US-NC",
    "North Dakota": "US-ND",
    "Ohio": "US-OH",
    "Oklahoma": "US-OK",
    "Oregon": "US-OR",
    "Pennsylvania": "US-PA",
    "Rhode Island": "US-RI",
    "South Carolina": "US-SC",
    "South Dakota": "US-SD",
    "Tennessee": "US-TN",
    "Texas": "US-TX",
    "Utah": "US-UT",
    "Vermont": "US-VT",
    "Virginia": "US-VA",
    "Washington": "US-WA",
    "West Virginia": "US-WV",
    "Wisconsin": "US-WI",
    "Wyoming": "US-WY",

    # Germany (DE-XX format)
    "Baden-Württemberg": "DE-BW",
    "Bavaria": "DE-BY",
    "Brandenburg": "DE-BB",
    "Hesse": "DE-HE",
    "Mecklenburg-Western Pomerania": "DE-MV",
    "North-Rhine Westphalia": "DE-NW",  # Hyphenation as found in source data
    "North Rhine-Westphalia": "DE-NW",  # Usual English spelling
    "Saxony": "DE-SN",
    "Saxony-Anhalt": "DE-ST",
    "Schleswig-Holstein": "DE-SH",
    "Thuringia": "DE-TH",

    # Austria (AT-X format)
    "Burgenland": "AT-1",
    "Carinthia": "AT-2",
    "Lower Austria": "AT-3",
    "Salzburg": "AT-5",
    "Styria": "AT-6",
    "Tyrol": "AT-7",
    "Upper Austria": "AT-4",
    "Vienna": "AT-9",
    "Vorarlberg": "AT-8",

    # Netherlands (NL-XX format)
    "Limburg": "NL-LI",

    # Belgium (BE-XXX format)
    "Brussels": "BE-BRU",
    "Flanders": "BE-VLG",
    "Wallonia": "BE-WAL",

    # Indonesia (ID-XX format)
    "Bali": "ID-BA",

    # Malaysia (MY-XX format)
    "Sabah": "MY-12",

    # Australia (AU-XXX format)
    "Australian Capital Territory": "AU-ACT",
    "New South Wales": "AU-NSW",
    "Northern Territory": "AU-NT",
    "Queensland": "AU-QLD",
    "South Australia": "AU-SA",
    "Tasmania": "AU-TAS",
    "Victoria": "AU-VIC",
    "Western Australia": "AU-WA",

    # Canada (CA-XX format)
    "Alberta": "CA-AB",
    "Manitoba": "CA-MB",
    "New Brunswick": "CA-NB",
    "Newfoundland and Labrador": "CA-NL",
    "Nova Scotia": "CA-NS",
    "Ontario": "CA-ON",
    "Quebec": "CA-QC",
    "Saskatchewan": "CA-SK",

    # Spain (ES-XX format)
    "Andalusia": "ES-AN",
    "Balearic Islands": "ES-IB",
    "Basque Country": "ES-PV",
    "Catalonia": "ES-CT",
    "Galicia": "ES-GA",
    "Madrid": "ES-MD",
    "Valencia": "ES-VC",

    # India (IN-XX format)
    "Assam": "IN-AS",
    "Bihar": "IN-BR",
    "Kerala": "IN-KL",
    "West Bengal": "IN-WB",

    # Japan (JP-XX format)
    "Hoikkaido": "JP-01",  # Typo in source data (Hokkaido)
    "Hokkaido": "JP-01",  # Correct spelling, so a fixed source still maps
    "Kanagawa": "JP-14",
    "Okayama": "JP-33",

    # United Kingdom subdivisions
    "England": "GB-ENG",
    "Scotland": "GB-SCT",
    "Northern Ireland": "GB-NIR",
    "Wales": "GB-WLS",

    # Other countries
    # NOTE(review): "Canton" is a generic term for a Swiss subdivision; mapping
    # it to Zürich looks like a guess — confirm against the source entry.
    "Canton": "CH-ZH",  # Switzerland (Zürich)
    # NOTE(review): FR-H may have been superseded in newer ISO 3166-2 editions
    # — verify before publishing.
    "Corsica": "FR-H",  # France (Corse)
    "Hong Kong": "HK",  # Special Administrative Region (alpha-2, not 3166-2)
    "Madeira": "PT-30",  # Portugal
    "Tuscany": "IT-52",  # Italy

    # Special cases
    "Caribbean Netherlands": "BQ",  # Special ISO code (alpha-2, not 3166-2)
    "Pittsburgh": "US-PA",  # City listed as subregion (should be settlement)
    "Somerset": "GB-SOM",  # UK county

    # Unknown/incomplete mappings (placeholder codes, not valid ISO 3166-2)
    "Arua": "UG-ARUA",  # Uganda (district code needed)
    "Nagorno-Karabakh": "AZ-NKR",  # Disputed territory
    "Przysłup": "PL-PRZYS",  # Poland (locality code needed)
}
|
||
|
||
# Lookup table: settlement name (exact, case-sensitive) → numeric GeoNames ID.
# Settlements missing from this table are reported by the extractor but are
# not treated as errors.
SETTLEMENT_NAME_TO_GEONAMES: Dict[str, int] = dict(
    [
        ("Amsterdam", 2759794),
        ("Delft", 2757345),
        ("Dresden", 2935022),
        ("Ostend", 2789786),
        ("Pittsburgh", 5206379),
        ("Rio de Janeiro", 3451190),
        ("Seattle", 5809844),
        ("Warlubie", 3083271),
    ]
)
|
||
|
||
|
||
def _as_list(raw_value) -> List:
    """Return a raw YAML field as a list.

    ``None`` (missing or null-valued key) becomes ``[]``, a list/tuple is
    copied, and any other scalar — notably a bare string — is wrapped in a
    one-element list so that it is never iterated character by character.
    """
    if raw_value is None:
        return []
    if isinstance(raw_value, (list, tuple)):
        return list(raw_value)
    return [raw_value]


def _clean_names(raw_names: List) -> List:
    """Return the non-blank entries of *raw_names*, with strings stripped."""
    cleaned = []
    for name in raw_names:
        if isinstance(name, str):
            name = name.strip()
        if name:
            cleaned.append(name)
    return cleaned


def extract_geographic_metadata(yaml_path: Path) -> Dict:
    """
    Parse Wikidata hyponyms_curated.yaml and extract geographic metadata.

    Each entry under the top-level ``hypernym`` key is inspected for the
    ``country``, ``subregion`` and ``settlement`` fields. Every name is looked
    up in COUNTRY_NAME_TO_ISO, SUBREGION_NAME_TO_ISO and
    SETTLEMENT_NAME_TO_GEONAMES respectively. Fields may be missing, null,
    a single scalar, or a list; blank names are ignored and surrounding
    whitespace is stripped before lookup.

    Args:
        yaml_path: Path of the curated YAML file to read.

    Returns:
        Dict with keys:
        - entities_with_geography: List of dicts, one per entry that yielded
          at least one code, with keys q_number, countries, subregions,
          settlements and raw_country_names / raw_subregion_names /
          raw_settlement_names
        - countries: Sorted list of country codes found
        - subregions: Sorted list of subregion codes found
        - settlements: Sorted list of GeoNames IDs found
        - unmapped_countries: List of (q_number, country name) tuples
          without a mapping
        - unmapped_subregions: List of (q_number, subregion name) tuples
          without a mapping
    """
    print(f"📖 Reading {yaml_path}...")

    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    entities_with_geography = []
    countries_found = set()
    subregions_found = set()
    settlements_found = set()
    unmapped_countries = []
    unmapped_subregions = []

    # An empty YAML document loads as None and a null 'hypernym' key yields
    # None as well; treat both as "no entries" instead of crashing.
    hypernyms = _as_list(data.get('hypernym') if isinstance(data, dict) else None)
    print(f"📊 Processing {len(hypernyms)} hypernym entries...")

    for item in hypernyms:
        if not isinstance(item, dict):
            print(f"⚠️ Skipping malformed hypernym entry: {item!r}")
            continue

        # The value stored under 'label' is used as the entity identifier.
        q_number = item.get('label', 'UNKNOWN')

        # Extract country
        country_names = _as_list(item.get('country'))
        country_codes = []
        for country_name in _clean_names(country_names):
            iso_code = COUNTRY_NAME_TO_ISO.get(country_name)
            if iso_code:
                country_codes.append(iso_code)
                countries_found.add(iso_code)
                continue

            # A one-character "country" is data noise, not a real name.
            if isinstance(country_name, str) and len(country_name) == 1:
                print(f"⚠️ Skipping single-letter country '{country_name}' for {q_number}")
                continue

            unmapped_countries.append((q_number, country_name))
            print(f"⚠️ Unmapped country: '{country_name}' for {q_number}")

        # Extract subregion
        subregion_names = _as_list(item.get('subregion'))
        subregion_codes = []
        for subregion_name in _clean_names(subregion_names):
            iso_code = SUBREGION_NAME_TO_ISO.get(subregion_name)
            if iso_code:
                subregion_codes.append(iso_code)
                subregions_found.add(iso_code)
            else:
                unmapped_subregions.append((q_number, subregion_name))
                print(f"⚠️ Unmapped subregion: '{subregion_name}' for {q_number}")

        # Extract settlement
        settlement_names = _as_list(item.get('settlement'))
        settlement_ids = []
        for settlement_name in _clean_names(settlement_names):
            geonames_id = SETTLEMENT_NAME_TO_GEONAMES.get(settlement_name)
            if geonames_id:
                settlement_ids.append(geonames_id)
                settlements_found.add(geonames_id)
            else:
                # Settlements without GeoNames IDs are acceptable (can be resolved later)
                print(f"ℹ️ Settlement without GeoNames ID: '{settlement_name}' for {q_number}")

        # Store entity if it has any geographic metadata
        if country_codes or subregion_codes or settlement_ids:
            entities_with_geography.append({
                'q_number': q_number,
                'countries': country_codes,
                'subregions': subregion_codes,
                'settlements': settlement_ids,
                'raw_country_names': country_names,
                'raw_subregion_names': subregion_names,
                'raw_settlement_names': settlement_names,
            })

    print(f"\n✅ Extraction complete!")
    print(f" - {len(entities_with_geography)} entities with geographic metadata")
    print(f" - {len(countries_found)} unique country codes")
    print(f" - {len(subregions_found)} unique subregion codes")
    print(f" - {len(settlements_found)} unique settlement IDs")
    print(f" - {len(unmapped_countries)} unmapped country names")
    print(f" - {len(unmapped_subregions)} unmapped subregion names")

    return {
        'entities_with_geography': entities_with_geography,
        'countries': sorted(countries_found),
        'subregions': sorted(subregions_found),
        'settlements': sorted(settlements_found),
        'unmapped_countries': unmapped_countries,
        'unmapped_subregions': unmapped_subregions,
    }
|
||
|
||
|
||
def generate_feature_type_annotations(geographic_data: Dict, output_path: Path):
    """
    Write geographic annotations for FeatureTypeEnum entries to a YAML file.

    For every entity in ``geographic_data['entities_with_geography']`` one
    annotation is built: the first code of each kind is stored under its
    primary key (``dcterms:spatial``, ``iso_3166_2``, ``geonames_id``) and,
    when there is more than one code, the full list is stored under the
    matching ``*_all`` key. The raw names are kept under ``raw_data``.

    Args:
        geographic_data: Result dict of extract_geographic_metadata().
        output_path: Destination YAML file; parent directories are created.
    """
    print(f"\n📝 Generating FeatureTypeEnum annotations...")

    annotations = []
    for record in geographic_data['entities_with_geography']:
        entry = {'wikidata_id': record['q_number']}

        # (primary key, "all values" key, codes) — processed in this order so
        # the keys appear in the same order in the emitted YAML.
        code_fields = (
            ('dcterms:spatial', 'dcterms:spatial_all', record['countries']),
            ('iso_3166_2', 'iso_3166_2_all', record['subregions']),
            ('geonames_id', 'geonames_id_all', record['settlements']),
        )
        for primary_key, all_key, codes in code_fields:
            if not codes:
                continue
            entry[primary_key] = codes[0]  # first code is treated as primary
            if len(codes) > 1:
                entry[all_key] = codes

        # Keep the untranslated names alongside the codes for documentation.
        entry['raw_data'] = {
            'country': record['raw_country_names'],
            'subregion': record['raw_subregion_names'],
            'settlement': record['raw_settlement_names'],
        }
        annotations.append(entry)

    document = {
        'description': 'Geographic annotations for FeatureTypeEnum entries',
        'source': 'data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml',
        'extraction_date': '2025-11-22',
        'annotations': annotations,
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(document, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"✅ Annotations written to {output_path}")
    print(f" - {len(annotations)} annotated entries")
|
||
|
||
|
||
def main():
    """Run the extraction, write both YAML outputs and print a summary.

    Reads the curated Wikidata YAML below PROJECT_ROOT, writes the
    intermediate mapping and the FeatureTypeEnum annotation file under
    data/extracted/, then reports counts and any names that still lack a
    mapping.
    """
    print("🌍 Wikidata Geographic Metadata Extraction")
    print("=" * 60)

    # Paths
    wikidata_yaml = PROJECT_ROOT / "data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml"
    output_mapping = PROJECT_ROOT / "data/extracted/wikidata_geography_mapping.yaml"
    output_annotations = PROJECT_ROOT / "data/extracted/feature_type_geographic_annotations.yaml"

    # Extract geographic metadata
    geographic_data = extract_geographic_metadata(wikidata_yaml)

    # Write intermediate mapping file
    print(f"\n📝 Writing intermediate mapping to {output_mapping}...")
    output_mapping.parent.mkdir(parents=True, exist_ok=True)
    with open(output_mapping, 'w', encoding='utf-8') as f:
        # safe_dump, not dump: the unmapped_* entries are (q_number, name)
        # tuples, which yaml.dump would serialize with a "!!python/tuple" tag
        # that yaml.safe_load cannot read back. safe_dump writes plain lists.
        yaml.safe_dump(geographic_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print(f"✅ Mapping written to {output_mapping}")

    # Generate FeatureTypeEnum annotations
    generate_feature_type_annotations(geographic_data, output_annotations)

    # Summary report
    print("\n" + "=" * 60)
    print("📊 SUMMARY")
    print("=" * 60)
    print(f"Countries mapped: {len(geographic_data['countries'])}")
    print(f"Subregions mapped: {len(geographic_data['subregions'])}")
    print(f"Settlements mapped: {len(geographic_data['settlements'])}")
    print(f"Entities with geography: {len(geographic_data['entities_with_geography'])}")

    # De-duplicate on the name alone (the same name is usually reported for
    # several entities) and sort so the report is stable between runs.
    unmapped_country_names = sorted(
        {name for _q_num, name in geographic_data['unmapped_countries']}, key=str
    )
    if unmapped_country_names:
        print(f"\n⚠️ UNMAPPED COUNTRIES ({len(unmapped_country_names)}):")
        for country in unmapped_country_names:
            print(f" - {country}")

    unmapped_subregion_names = sorted(
        {name for _q_num, name in geographic_data['unmapped_subregions']}, key=str
    )
    if unmapped_subregion_names:
        print(f"\n⚠️ UNMAPPED SUBREGIONS ({len(unmapped_subregion_names)}):")
        for subregion in unmapped_subregion_names:
            print(f" - {subregion}")

    print("\n✅ Done! Next steps:")
    print(" 1. Review unmapped countries/subregions above")
    print(" 2. Update COUNTRY_NAME_TO_ISO / SUBREGION_NAME_TO_ISO dictionaries")
    print(" 3. Re-run this script")
    print(f" 4. Integrate {output_annotations} into FeatureTypeEnum.yaml")


if __name__ == '__main__':
    main()
|