#!/usr/bin/env python3
"""
Add geographic annotations to FeatureTypeEnum.yaml.

This script:
1. Reads data/extracted/feature_type_geographic_annotations.yaml
2. Loads schemas/20251121/linkml/modules/enums/FeatureTypeEnum.yaml
3. Matches Q-numbers between annotation file and enum
4. Adds 'annotations' field to matching enum permissible_values
5. Writes updated FeatureTypeEnum.yaml

Geographic annotations added:
- dcterms:spatial: ISO 3166-1 alpha-2 country code (e.g., "NL")
- iso_3166_2: ISO 3166-2 subdivision code (e.g., "US-PA") [if available]
- geonames_id: GeoNames ID for settlements (e.g., 5206379) [if available]

Example output:
    BUITENPLAATS:
      meaning: wd:Q2927789
      description: Dutch country estate
      annotations:
        dcterms:spatial: NL
        wikidata_country: Netherlands

Usage:
    python3 scripts/add_geographic_annotations_to_enum.py

Author: OpenCODE AI Assistant
Date: 2025-11-22
"""

import sys
from pathlib import Path
from typing import Any, Dict, List

import yaml

# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Prefix that marks a permissible value's `meaning` as a Wikidata entity.
_WIKIDATA_PREFIX = 'wd:'

# Annotation keys copied verbatim from an annotation record when present.
_DIRECT_KEYS = ('dcterms:spatial', 'iso_3166_2', 'geonames_id')

# (key inside `raw_data`, key written to the enum) pairs; only the first
# entry of each raw list is recorded, as a human-readable label.
_RAW_DATA_KEYS = (
    ('country', 'wikidata_country'),
    ('subregion', 'wikidata_subregion'),
    ('settlement', 'wikidata_settlement'),
)

# Comment block written at the top of the regenerated enum file.
_ENUM_FILE_HEADER = """\
# FeatureTypeEnum - Heritage Feature Types with Geographic Restrictions
#
# This file has been automatically updated with geographic annotations
# extracted from data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml
#
# Geographic annotations:
# - dcterms:spatial: ISO 3166-1 alpha-2 country code (e.g., "NL" for Netherlands)
# - iso_3166_2: ISO 3166-2 subdivision code (e.g., "US-PA" for Pennsylvania)
# - geonames_id: GeoNames ID for settlements (e.g., 5206379 for Pittsburgh)
# - wikidata_country: Human-readable country name from Wikidata
# - wikidata_subregion: Human-readable subregion name from Wikidata (if available)
# - wikidata_settlement: Human-readable settlement name from Wikidata (if available)
#
# Validation:
# - Custom Python validator checks that CustodianPlace.country matches dcterms:spatial
# - Validator implemented in: scripts/validate_geographic_restrictions.py
#
# Generation date: 2025-11-22
# Generated by: scripts/add_geographic_annotations_to_enum.py
#

"""


def load_annotations(yaml_path: Path) -> Dict[str, Dict]:
    """
    Load geographic annotations and index them by Wikidata Q-number.

    Args:
        yaml_path: Path to the annotation YAML file. Its top-level
            'annotations' key holds a list of records, each carrying a
            'wikidata_id'.

    Returns:
        Dict mapping Q-number to annotation data. Empty when the file is
        empty or has no 'annotations' entries.

    Raises:
        KeyError: If an annotation record has no 'wikidata_id'.
    """
    print(f"📖 Loading annotations from {yaml_path}...")

    with open(yaml_path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(f) or {}

    records = data.get('annotations') or []
    # Later records win when the same Q-number appears more than once.
    annotations_by_q = {annot['wikidata_id']: annot for annot in records}

    print(f"✅ Loaded {len(annotations_by_q)} annotations")
    return annotations_by_q


def _build_pv_annotations(annot: Dict[str, Any]) -> Dict[str, Any]:
    """Build the 'annotations' mapping for one permissible value from one annotation record."""
    pv_annotations = {key: annot[key] for key in _DIRECT_KEYS if key in annot}

    # 'raw_data' and its sub-keys are optional; a missing one must not abort the run.
    raw_data = annot.get('raw_data') or {}
    for raw_key, target_key in _RAW_DATA_KEYS:
        names = raw_data.get(raw_key)
        if names:
            pv_annotations[target_key] = names[0]

    return pv_annotations


def add_annotations_to_enum(enum_path: Path, annotations: Dict[str, Dict], output_path: Path) -> None:
    """
    Add geographic annotations to FeatureTypeEnum permissible values.

    Each permissible value whose 'meaning' is a Wikidata reference
    ('wd:Q...') and whose Q-number is present in `annotations` gets its
    'annotations' field replaced with the geographic data.

    Args:
        enum_path: Path to FeatureTypeEnum.yaml
        annotations: Dict mapping Q-number to annotation data
        output_path: Path to write updated enum (may equal enum_path)

    Raises:
        KeyError: If the enum file lacks
            enums -> FeatureTypeEnum -> permissible_values.
    """
    print(f"\n📖 Loading FeatureTypeEnum from {enum_path}...")

    with open(enum_path, 'r', encoding='utf-8') as f:
        enum_data = yaml.safe_load(f)

    permissible_values = enum_data['enums']['FeatureTypeEnum']['permissible_values']
    print(f"✅ Loaded {len(permissible_values)} permissible values")

    # Track statistics
    matched = 0
    updated = 0
    skipped_no_match = 0

    print("\n🔄 Processing permissible values...")

    for pv_name, pv_data in permissible_values.items():
        # A permissible value declared without a body ("NAME:") loads as None.
        meaning = pv_data.get('meaning') if isinstance(pv_data, dict) else None
        if not isinstance(meaning, str) or not meaning.startswith(_WIKIDATA_PREFIX + 'Q'):
            skipped_no_match += 1
            continue

        # Strip only the leading prefix, not every occurrence of it.
        q_num = meaning[len(_WIKIDATA_PREFIX):]
        if q_num not in annotations:
            continue

        matched += 1
        pv_annotations = _build_pv_annotations(annotations[q_num])
        if not pv_annotations:
            continue

        pv_data['annotations'] = pv_annotations
        updated += 1

        line = f" ✅ {pv_name}: {pv_annotations.get('dcterms:spatial', 'N/A')}"
        if 'iso_3166_2' in pv_annotations:
            line += f" [{pv_annotations['iso_3166_2']}]"
        print(line)

    print("\n📊 Statistics:")
    print(f" - Matched: {matched}")
    print(f" - Updated: {updated}")
    print(f" - Skipped (no Q-number): {skipped_no_match}")

    # Write updated enum
    print(f"\n📝 Writing updated enum to {output_path}...")

    # Serialize before opening the output: the enum is normally overwritten in
    # place, so a failure inside yaml.dump must not leave a truncated file.
    serialized = yaml.dump(
        enum_data,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
        width=120,
    )

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(_ENUM_FILE_HEADER)
        f.write(serialized)

    print("✅ Updated enum written successfully")
    print(f"\nℹ️ {updated} permissible values now have geographic annotations")


def main() -> None:
    """Main execution function."""
    print("🌍 Add Geographic Annotations to FeatureTypeEnum")
    print("=" * 60)

    # Paths
    annotations_yaml = PROJECT_ROOT / "data/extracted/feature_type_geographic_annotations.yaml"
    enum_yaml = PROJECT_ROOT / "schemas/20251121/linkml/modules/enums/FeatureTypeEnum.yaml"
    output_yaml = enum_yaml  # Overwrite in place

    # Load annotations
    annotations = load_annotations(annotations_yaml)

    # Add annotations to enum
    add_annotations_to_enum(enum_yaml, annotations, output_yaml)

    print("\n" + "=" * 60)
    print("✅ COMPLETE!")
    print("=" * 60)
    print("\nNext steps:")
    print(" 1. Review updated FeatureTypeEnum.yaml")
    print(" 2. Regenerate RDF/OWL schema")
    print(" 3. Create validation script (validate_geographic_restrictions.py)")
    print(" 4. Add test cases for geographic validation")


if __name__ == '__main__':
    main()