glam/scripts/add_geographic_annotations_to_enum.py
kempersc 67657c39b6 feat: Complete Country Class Implementation and Hypernyms Removal
- Created the Country class with ISO 3166-1 alpha-2 and alpha-3 codes, ensuring minimal design without additional metadata.
- Integrated the Country class into CustodianPlace and LegalForm schemas to support country-specific feature types and legal forms.
- Removed duplicate keys in FeatureTypeEnum.yaml, resulting in 294 unique feature types.
- Eliminated "Hypernyms:" text from FeatureTypeEnum descriptions, verifying that semantic relationships are now conveyed through ontology mappings.
- Created example instance file demonstrating integration of Country with CustodianPlace and LegalForm.
- Updated documentation to reflect the completion of the Country class implementation and hypernyms removal.
2025-11-23 13:09:38 +01:00

205 lines
7.1 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Add geographic annotations to FeatureTypeEnum.yaml.
This script:
1. Reads data/extracted/feature_type_geographic_annotations.yaml
2. Loads schemas/20251121/linkml/modules/enums/FeatureTypeEnum.yaml
3. Matches Q-numbers between annotation file and enum
4. Adds 'annotations' field to matching enum permissible_values
5. Writes updated FeatureTypeEnum.yaml
Geographic annotations added:
- dcterms:spatial: ISO 3166-1 alpha-2 country code (e.g., "NL")
- iso_3166_2: ISO 3166-2 subdivision code (e.g., "US-PA") [if available]
- geonames_id: GeoNames ID for settlements (e.g., 5206379) [if available]
Example output:
BUITENPLAATS:
meaning: wd:Q2927789
description: Dutch country estate
annotations:
dcterms:spatial: NL
wikidata_country: Netherlands
Usage:
python3 scripts/add_geographic_annotations_to_enum.py
Author: OpenCODE AI Assistant
Date: 2025-11-22
"""
import yaml
import sys
from pathlib import Path
from typing import Dict, List
# Make the project root (the directory that contains scripts/) importable.
# __file__ can be a relative path (e.g. when the script is started from inside
# scripts/ on interpreters older than 3.9); without resolve() the two .parent
# hops would then collapse to "." and point at the wrong directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
def load_annotations(yaml_path: Path) -> Dict[str, Dict]:
    """
    Read the geographic annotation file and key its records by Wikidata Q-number.

    Args:
        yaml_path: Location of the annotations YAML file. The document must
            have a top-level 'annotations' list whose items each carry a
            'wikidata_id' entry.

    Returns:
        Dict mapping each record's 'wikidata_id' to the complete record. When
        the same id appears more than once, the last record wins.
    """
    print(f"📖 Loading annotations from {yaml_path}...")
    with open(yaml_path, 'r', encoding='utf-8') as handle:
        parsed = yaml.safe_load(handle)

    # Index every record under its Q-number for constant-time lookup later.
    by_q_number = {record['wikidata_id']: record for record in parsed['annotations']}

    print(f"✅ Loaded {len(by_q_number)} annotations")
    return by_q_number
def _build_pv_annotations(annot: Dict) -> Dict:
    """Translate one annotation record into the dict stored on a permissible value."""
    pv_annotations = {}

    # Coded fields (country code, subdivision code, GeoNames ID) are copied
    # verbatim whenever the record carries them.
    for key in ('dcterms:spatial', 'iso_3166_2', 'geonames_id'):
        if key in annot:
            pv_annotations[key] = annot[key]

    # Human-readable names kept for documentation: the first entry of each
    # raw_data list. A record without raw_data (or without one of the lists)
    # simply contributes no name instead of raising KeyError.
    raw_data = annot.get('raw_data') or {}
    for raw_key, target_key in (
        ('country', 'wikidata_country'),
        ('subregion', 'wikidata_subregion'),
        ('settlement', 'wikidata_settlement'),
    ):
        names = raw_data.get(raw_key)
        if names:
            pv_annotations[target_key] = names[0]

    return pv_annotations


def add_annotations_to_enum(enum_path: Path, annotations: Dict[str, Dict], output_path: Path) -> None:
    """
    Add geographic annotations to FeatureTypeEnum permissible values.

    Each permissible value whose 'meaning' is a Wikidata CURIE ("wd:Q...") is
    looked up in ``annotations``; on a hit its 'annotations' entry is replaced
    by the geographic data built from the matching record. The result is
    written to ``output_path`` preceded by an explanatory comment header.

    Args:
        enum_path: Path to FeatureTypeEnum.yaml
        annotations: Dict mapping Q-number to annotation data
        output_path: Path to write updated enum (may be the same as enum_path)

    Raises:
        KeyError: If the enum file lacks enums -> FeatureTypeEnum ->
            permissible_values.
    """
    print(f"\n📖 Loading FeatureTypeEnum from {enum_path}...")
    with open(enum_path, 'r', encoding='utf-8') as f:
        enum_data = yaml.safe_load(f)
    permissible_values = enum_data['enums']['FeatureTypeEnum']['permissible_values']
    print(f"✅ Loaded {len(permissible_values)} permissible values")

    # Track statistics
    matched = 0
    updated = 0
    skipped_no_match = 0

    print("\n🔄 Processing permissible values...")
    for pv_name, pv_data in permissible_values.items():
        # A permissible value declared without a body loads as None, so guard
        # before calling .get() on it.
        meaning = pv_data.get('meaning') if isinstance(pv_data, dict) else None
        if not isinstance(meaning, str) or not meaning.startswith('wd:Q'):
            skipped_no_match += 1
            continue

        # Drop the "wd:" prefix to obtain the bare Q-number used as lookup key.
        q_num = meaning[len('wd:'):]
        if q_num not in annotations:
            continue
        matched += 1

        pv_annotations = _build_pv_annotations(annotations[q_num])
        if not pv_annotations:
            continue

        pv_data['annotations'] = pv_annotations
        updated += 1

        line = f"{pv_name}: {pv_annotations.get('dcterms:spatial', 'N/A')}"
        if 'iso_3166_2' in pv_annotations:
            line += f" [{pv_annotations['iso_3166_2']}]"
        print(line)

    print("\n📊 Statistics:")
    print(f" - Matched: {matched}")
    print(f" - Updated: {updated}")
    print(f" - Skipped (no Q-number): {skipped_no_match}")

    # Write updated enum
    print(f"\n📝 Writing updated enum to {output_path}...")

    # Header comment placed in front of the serialized YAML.
    header = """# FeatureTypeEnum - Heritage Feature Types with Geographic Restrictions
#
# This file has been automatically updated with geographic annotations
# extracted from data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml
#
# Geographic annotations:
# - dcterms:spatial: ISO 3166-1 alpha-2 country code (e.g., "NL" for Netherlands)
# - iso_3166_2: ISO 3166-2 subdivision code (e.g., "US-PA" for Pennsylvania)
# - geonames_id: GeoNames ID for settlements (e.g., 5206379 for Pittsburgh)
# - wikidata_country: Human-readable country name from Wikidata
# - wikidata_subregion: Human-readable subregion name from Wikidata (if available)
# - wikidata_settlement: Human-readable settlement name from Wikidata (if available)
#
# Validation:
# - Custom Python validator checks that CustodianPlace.country matches dcterms:spatial
# - Validator implemented in: scripts/validate_geographic_restrictions.py
#
# Generation date: 2025-11-22
# Generated by: scripts/add_geographic_annotations_to_enum.py
#
"""

    # Serialize BEFORE opening the destination: output_path may be the very
    # file that was just read, and opening it with 'w' truncates it at once.
    # If serialization failed after that point the enum would be lost.
    body = yaml.dump(enum_data, default_flow_style=False, allow_unicode=True, sort_keys=False, width=120)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(header)
        f.write(body)

    print("✅ Updated enum written successfully")
    print(f"\n {updated} permissible values now have geographic annotations")
def main():
    """Run the full update: load the annotations, then rewrite the enum in place."""
    rule = "=" * 60

    print("🌍 Add Geographic Annotations to FeatureTypeEnum")
    print(rule)

    # Input locations, relative to the project root.
    enum_yaml = PROJECT_ROOT / "schemas/20251121/linkml/modules/enums/FeatureTypeEnum.yaml"
    annotations_yaml = PROJECT_ROOT / "data/extracted/feature_type_geographic_annotations.yaml"

    # The enum file serves as both source and destination (overwritten in place).
    annotations = load_annotations(annotations_yaml)
    add_annotations_to_enum(enum_yaml, annotations, enum_yaml)

    print("\n" + rule)
    print("✅ COMPLETE!")
    print(rule)

    print("\nNext steps:")
    for step in (
        " 1. Review updated FeatureTypeEnum.yaml",
        " 2. Regenerate RDF/OWL schema",
        " 3. Create validation script (validate_geographic_restrictions.py)",
        " 4. Add test cases for geographic validation",
    ):
        print(step)


# Only run the update when executed as a script, not when imported.
if __name__ == '__main__':
    main()