- Created the Country class with ISO 3166-1 alpha-2 and alpha-3 codes, ensuring minimal design without additional metadata. - Integrated the Country class into CustodianPlace and LegalForm schemas to support country-specific feature types and legal forms. - Removed duplicate keys in FeatureTypeEnum.yaml, resulting in 294 unique feature types. - Eliminated "Hypernyms:" text from FeatureTypeEnum descriptions, verifying that semantic relationships are now conveyed through ontology mappings. - Created example instance file demonstrating integration of Country with CustodianPlace and LegalForm. - Updated documentation to reflect the completion of the Country class implementation and hypernyms removal.
205 lines
7.1 KiB
Python
205 lines
7.1 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Add geographic annotations to FeatureTypeEnum.yaml.
|
||
|
||
This script:
|
||
1. Reads data/extracted/feature_type_geographic_annotations.yaml
|
||
2. Loads schemas/20251121/linkml/modules/enums/FeatureTypeEnum.yaml
|
||
3. Matches Q-numbers between annotation file and enum
|
||
4. Adds 'annotations' field to matching enum permissible_values
|
||
5. Writes updated FeatureTypeEnum.yaml
|
||
|
||
Geographic annotations added:
|
||
- dcterms:spatial: ISO 3166-1 alpha-2 country code (e.g., "NL")
|
||
- iso_3166_2: ISO 3166-2 subdivision code (e.g., "US-PA") [if available]
|
||
- geonames_id: GeoNames ID for settlements (e.g., 5206379) [if available]
|
||
|
||
Example output:
|
||
BUITENPLAATS:
|
||
meaning: wd:Q2927789
|
||
description: Dutch country estate
|
||
annotations:
|
||
dcterms:spatial: NL
|
||
wikidata_country: Netherlands
|
||
|
||
Usage:
|
||
python3 scripts/add_geographic_annotations_to_enum.py
|
||
|
||
Author: OpenCODE AI Assistant
|
||
Date: 2025-11-22
|
||
"""
|
||
|
||
import yaml
|
||
import sys
|
||
from pathlib import Path
|
||
from typing import Dict, List
|
||
|
||
# Add project root to path.
# This file lives one directory below the project root, so the root is the
# parent of this file's directory. The path is resolved first so that a
# relative __file__ (e.g. when launched from inside the scripts directory)
# still yields the real project root instead of ".".
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
|
||
|
||
|
||
def load_annotations(yaml_path: Path) -> Dict[str, Dict]:
    """
    Load geographic annotations and index them by Wikidata Q-number.

    The YAML document is expected to hold a top-level ``annotations`` list
    whose entries each carry a ``wikidata_id`` key. An empty document or a
    missing/empty ``annotations`` list yields an empty mapping instead of a
    crash. Entries without a ``wikidata_id`` are skipped with a warning, and
    when the same Q-number appears more than once the last entry wins (a
    warning is printed).

    Args:
        yaml_path: Path to the annotations YAML file.

    Returns:
        Dict mapping Q-number to the full annotation entry.
    """
    print(f"📖 Loading annotations from {yaml_path}...")

    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty document; treat that, and a missing
    # or null 'annotations' key, as "no annotations".
    entries = (data or {}).get('annotations') or []

    annotations_by_q: Dict[str, Dict] = {}
    for annot in entries:
        q_num = annot.get('wikidata_id')
        if not q_num:
            print("⚠️ Skipping annotation entry without 'wikidata_id'")
            continue
        if q_num in annotations_by_q:
            print(f"⚠️ Duplicate annotation for {q_num}; keeping the last entry")
        annotations_by_q[q_num] = annot

    print(f"✅ Loaded {len(annotations_by_q)} annotations")
    return annotations_by_q
|
||
|
||
|
||
# Comment header written at the top of the regenerated enum file.
_ENUM_FILE_HEADER = """# FeatureTypeEnum - Heritage Feature Types with Geographic Restrictions
#
# This file has been automatically updated with geographic annotations
# extracted from data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml
#
# Geographic annotations:
# - dcterms:spatial: ISO 3166-1 alpha-2 country code (e.g., "NL" for Netherlands)
# - iso_3166_2: ISO 3166-2 subdivision code (e.g., "US-PA" for Pennsylvania)
# - geonames_id: GeoNames ID for settlements (e.g., 5206379 for Pittsburgh)
# - wikidata_country: Human-readable country name from Wikidata
# - wikidata_subregion: Human-readable subregion name from Wikidata (if available)
# - wikidata_settlement: Human-readable settlement name from Wikidata (if available)
#
# Validation:
# - Custom Python validator checks that CustodianPlace.country matches dcterms:spatial
# - Validator implemented in: scripts/validate_geographic_restrictions.py
#
# Generation date: 2025-11-22
# Generated by: scripts/add_geographic_annotations_to_enum.py
#
"""


def _build_geo_annotations(annot: Dict) -> Dict:
    """Flatten one annotation entry into the dict stored under 'annotations'."""
    pv_annotations = {}

    # Coded values (country code, subdivision code, GeoNames ID) are copied
    # verbatim whenever the entry carries them.
    for key in ('dcterms:spatial', 'iso_3166_2', 'geonames_id'):
        if key in annot:
            pv_annotations[key] = annot[key]

    # Raw Wikidata names are kept for documentation; only the first name of
    # each list is used. A missing 'raw_data' section or sub-key is tolerated.
    raw_data = annot.get('raw_data') or {}
    for raw_key in ('country', 'subregion', 'settlement'):
        names = raw_data.get(raw_key)
        if names:
            pv_annotations[f'wikidata_{raw_key}'] = names[0]

    return pv_annotations


def add_annotations_to_enum(enum_path: Path, annotations: Dict[str, Dict], output_path: Path):
    """
    Add geographic annotations to FeatureTypeEnum permissible values.

    Each permissible value whose ``meaning`` is a ``wd:Q...`` CURIE is looked
    up in ``annotations`` by its Q-number. On a hit, an ``annotations`` dict is
    attached to the value (replacing any existing one). The whole enum
    document is then written to ``output_path`` preceded by a comment header.

    The YAML is serialised to a string before the output file is opened, so a
    serialisation error cannot leave a truncated file behind (the output path
    may be the same as the input path).

    Args:
        enum_path: Path to FeatureTypeEnum.yaml
        annotations: Dict mapping Q-number to annotation data
        output_path: Path to write updated enum
    """
    print(f"\n📖 Loading FeatureTypeEnum from {enum_path}...")

    with open(enum_path, 'r', encoding='utf-8') as f:
        enum_data = yaml.safe_load(f)

    permissible_values = enum_data['enums']['FeatureTypeEnum']['permissible_values']

    print(f"✅ Loaded {len(permissible_values)} permissible values")

    # Track statistics
    matched = 0
    updated = 0
    skipped_no_match = 0
    skipped_no_annotation = 0

    print("\n🔄 Processing permissible values...")

    for pv_name, pv_data in permissible_values.items():
        # A permissible value with an empty body loads as None.
        meaning = (pv_data or {}).get('meaning')

        if not meaning or not meaning.startswith('wd:Q'):
            skipped_no_match += 1
            continue

        # Strip only the leading CURIE prefix to get the bare Q-number.
        q_num = meaning[len('wd:'):]

        annot = annotations.get(q_num)
        if annot is None:
            skipped_no_annotation += 1
            continue

        matched += 1
        pv_annotations = _build_geo_annotations(annot)

        # Only attach (and report) when there is something to record.
        if pv_annotations:
            pv_data['annotations'] = pv_annotations
            updated += 1

            subdivision = ""
            if 'iso_3166_2' in pv_annotations:
                subdivision = f" [{pv_annotations['iso_3166_2']}]"
            print(f" ✅ {pv_name}: {pv_annotations.get('dcterms:spatial', 'N/A')}{subdivision}")

    print("\n📊 Statistics:")
    print(f" - Matched: {matched}")
    print(f" - Updated: {updated}")
    print(f" - Skipped (no Q-number): {skipped_no_match}")
    print(f" - Skipped (no annotation): {skipped_no_annotation}")

    # Write updated enum
    print(f"\n📝 Writing updated enum to {output_path}...")

    # Serialise first: opening the file in 'w' mode truncates it, so any
    # failure in yaml.dump must happen before that point.
    enum_yaml_text = yaml.dump(
        enum_data,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
        width=120,
    )

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(_ENUM_FILE_HEADER)
        f.write(enum_yaml_text)

    print("✅ Updated enum written successfully")
    print(f"\nℹ️ {updated} permissible values now have geographic annotations")
|
||
|
||
|
||
def main():
    """Load the geographic annotations and rewrite FeatureTypeEnum.yaml in place."""
    rule = "=" * 60

    print("🌍 Add Geographic Annotations to FeatureTypeEnum")
    print(rule)

    # Input locations, both relative to the project root.
    annotations_yaml = PROJECT_ROOT / "data/extracted/feature_type_geographic_annotations.yaml"
    enum_yaml = PROJECT_ROOT / "schemas/20251121/linkml/modules/enums/FeatureTypeEnum.yaml"

    # The enum file is both the source and the destination of the update.
    annotations = load_annotations(annotations_yaml)
    add_annotations_to_enum(enum_yaml, annotations, enum_yaml)

    print("\n" + rule)
    print("✅ COMPLETE!")
    print(rule)

    print("\nNext steps:")
    next_steps = (
        " 1. Review updated FeatureTypeEnum.yaml",
        " 2. Regenerate RDF/OWL schema",
        " 3. Create validation script (validate_geographic_restrictions.py)",
        " 4. Add test cases for geographic validation",
    )
    for step in next_steps:
        print(step)
||
|
||
|
||
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()
|