#!/usr/bin/env python3
"""
Validate geographic restrictions for FeatureTypeEnum.

This script validates that:
1. CustodianPlace.country matches FeaturePlace.feature_type.dcterms:spatial annotation
2. CustodianPlace.subregion matches FeaturePlace.feature_type.iso_3166_2 annotation (if present)
3. CustodianPlace.settlement matches FeaturePlace.feature_type.geonames_id annotation (if present)

Geographic restriction violations indicate data quality issues:
- Using BUITENPLAATS feature type outside Netherlands
- Using SACRED_SHRINE_BALI outside Bali, Indonesia
- Using CITY_OF_PITTSBURGH_HISTORIC_DESIGNATION outside USA

Usage:
    python3 scripts/validate_geographic_restrictions.py [--data DATA_FILE]

Options:
    --data DATA_FILE    Path to YAML/JSON data file with custodian instances
                        (a quoted glob pattern is also accepted)

Exit status is 0 when every validated instance passes and 1 otherwise.

Examples:
    # Validate single data file
    python3 scripts/validate_geographic_restrictions.py --data data/instances/netherlands_museums.yaml

    # Validate all instance files
    python3 scripts/validate_geographic_restrictions.py --data "data/instances/*.yaml"

Author: OpenCODE AI Assistant
Date: 2025-11-22
"""

import argparse
import json
import sys
from collections import defaultdict  # noqa: F401  (kept from the original import block)
from glob import glob
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

try:
    import yaml
except ImportError:  # pragma: no cover - depends on the environment
    # Defer the failure to the point where YAML is actually read, so the pure
    # validation logic stays importable and the user gets an actionable message.
    yaml = None

# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))


def _require_yaml():
    """Return the PyYAML module, or raise ImportError explaining how to get it."""
    if yaml is None:
        raise ImportError(
            "PyYAML is required to read YAML files (install it with: pip install pyyaml)"
        )
    return yaml


def _annotation_value(annotation: Any) -> Any:
    """Unwrap an annotation written in expanded ``{tag: ..., value: ...}`` form.

    Annotations given as plain scalars are returned unchanged.
    """
    if isinstance(annotation, dict) and 'value' in annotation:
        return annotation['value']
    return annotation


def _normalize_geonames_id(value: Any) -> Optional[str]:
    """Return a GeoNames ID as a stripped string, or None when absent/unusable.

    Schema annotations and instance data do not agree on whether the ID is an
    int or a string, so both sides are compared in string form.
    """
    if value is None or isinstance(value, bool):
        return None
    if isinstance(value, int):
        return str(value) if value else None
    if isinstance(value, str):
        return value.strip() or None
    return None


class GeographicRestrictionValidator:
    """Validator for geographic restrictions on FeatureTypeEnum."""

    # (annotation key in the enum schema, restriction key used by this class)
    _ANNOTATION_TO_RESTRICTION = (
        ('dcterms:spatial', 'country'),
        ('iso_3166_2', 'subregion'),
        ('geonames_id', 'settlement'),
    )

    def __init__(self, enum_path: Path):
        """
        Initialize validator with FeatureTypeEnum schema.

        Args:
            enum_path: Path to FeatureTypeEnum.yaml
        """
        self.enum_path = enum_path
        # Maps permissible-value name -> {'country'|'subregion'|'settlement': value}
        self.feature_type_restrictions = {}
        self._load_feature_type_restrictions()

    @classmethod
    def _extract_restrictions(cls, permissible_values: Optional[Dict]) -> Dict[str, Dict[str, Any]]:
        """Build the restriction table from a ``permissible_values`` mapping.

        Values without a body (``None``) or without geographic annotations are
        skipped; only values carrying at least one restriction are returned.
        """
        table = {}
        for pv_name, pv_data in (permissible_values or {}).items():
            # A bare ``NAME:`` entry loads as None, as does an empty ``annotations:``.
            annotations = (pv_data or {}).get('annotations') or {}
            restrictions = {
                restriction_key: _annotation_value(annotations[annotation_key])
                for annotation_key, restriction_key in cls._ANNOTATION_TO_RESTRICTION
                if annotation_key in annotations
            }
            if restrictions:
                table[pv_name] = restrictions
        return table

    def _load_feature_type_restrictions(self):
        """Load geographic restrictions from FeatureTypeEnum annotations.

        Raises:
            ImportError: if PyYAML is not installed.
            KeyError: if the file lacks ``enums.FeatureTypeEnum.permissible_values``.
        """
        yaml_module = _require_yaml()
        print(f"📖 Loading FeatureTypeEnum from {self.enum_path}...")

        with open(self.enum_path, 'r', encoding='utf-8') as f:
            enum_data = yaml_module.safe_load(f)

        permissible_values = enum_data['enums']['FeatureTypeEnum']['permissible_values']
        self.feature_type_restrictions.update(self._extract_restrictions(permissible_values))

        print(f"✅ Loaded {len(self.feature_type_restrictions)} feature types with geographic restrictions")

    @staticmethod
    def _extract_code(value: Any, key: str) -> Optional[str]:
        """Return a code given either directly as a string or nested under ``key``."""
        if isinstance(value, dict):
            return value.get(key)
        if isinstance(value, str):
            return value
        return None

    @staticmethod
    def _check_restriction(place_name, feature_type, noun, label, required, actual,
                           matches) -> List[Tuple[str, str]]:
        """Return the error (if any) for one restriction.

        ``noun`` names the field (``country``), ``label`` is how the value is
        described in messages, and ``matches`` says whether ``actual`` satisfies
        ``required``. A missing value takes precedence over a mismatch.
        """
        requirement = f"Place '{place_name}' uses {feature_type} (requires {label}={required}) "
        if not actual:
            return [(f'MISSING_{noun.upper()}', requirement + f"but has no {noun} specified")]
        if not matches:
            return [(f'{noun.upper()}_MISMATCH', requirement + f"but is in {label}={actual}")]
        return []

    def validate_custodian_place(self, custodian_place: Dict) -> List[Tuple[str, str]]:
        """
        Validate geographic restrictions for a CustodianPlace instance.

        Country, subregion and settlement may each be given directly or as a
        nested dict (keys ``alpha_2``, ``iso_3166_2_code``, ``geonames_id``).
        GeoNames IDs are compared as strings so ``123`` and ``"123"`` match.

        Args:
            custodian_place: Dict representing CustodianPlace instance

        Returns:
            List of (error_type, error_message) tuples. Empty list if valid,
            if there is no feature type, or if the feature type is unrestricted.
        """
        place_name = custodian_place.get('place_name', 'UNKNOWN')
        country_code = self._extract_code(custodian_place.get('country', {}), 'alpha_2')
        subregion_code = self._extract_code(
            custodian_place.get('subregion', {}), 'iso_3166_2_code'
        )

        settlement = custodian_place.get('settlement', {})
        if isinstance(settlement, dict):
            settlement = settlement.get('geonames_id')
        settlement_id = _normalize_geonames_id(settlement)

        # Get feature type (if present); it is either the enum value itself or
        # a dict carrying it under 'feature_type'.
        has_feature_type = custodian_place.get('has_feature_type')
        if not has_feature_type:
            return []  # No feature type, nothing to validate
        if isinstance(has_feature_type, dict):
            feature_type_enum = has_feature_type.get('feature_type')
        elif isinstance(has_feature_type, str):
            feature_type_enum = has_feature_type
        else:
            return []

        restrictions = self.feature_type_restrictions.get(feature_type_enum)
        if not restrictions:
            return []  # No restrictions, valid

        errors = []
        if 'country' in restrictions:
            required = restrictions['country']
            errors += self._check_restriction(
                place_name, feature_type_enum, 'country', 'country',
                required, country_code, country_code == required,
            )
        if 'subregion' in restrictions:
            required = restrictions['subregion']
            errors += self._check_restriction(
                place_name, feature_type_enum, 'subregion', 'subregion',
                required, subregion_code, subregion_code == required,
            )
        if 'settlement' in restrictions:
            required = restrictions['settlement']
            errors += self._check_restriction(
                place_name, feature_type_enum, 'settlement', 'settlement GeoNames ID',
                required, settlement_id,
                settlement_id == _normalize_geonames_id(required),
            )
        return errors

    def validate_data_file(self, data_path: Path) -> Tuple[int, int, List[Tuple[str, str]]]:
        """
        Validate all CustodianPlace instances in a data file.

        The file may hold a single instance or a list of instances. Only dict
        entries that have a ``place_name`` key are treated as CustodianPlace
        instances; everything else is skipped and counted as neither valid nor
        invalid.

        Args:
            data_path: Path to YAML or JSON data file

        Returns:
            Tuple of (valid_count, invalid_count, all_errors)
        """
        data_path = Path(data_path)
        print(f"\n📖 Validating {data_path}...")

        # Pick the parser from the extension; the comparison is case-insensitive
        # so that e.g. ".YAML" is accepted too.
        suffix = data_path.suffix.lower()
        if suffix in ('.yaml', '.yml'):
            load = _require_yaml().safe_load
        elif suffix == '.json':
            load = json.load
        else:
            print(f"❌ Unsupported file type: {data_path.suffix}")
            return 0, 0, []

        with open(data_path, 'r', encoding='utf-8') as f:
            data = load(f)

        # Handle both single instance and list of instances
        instances = data if isinstance(data, list) else [data]

        valid_count = 0
        invalid_count = 0
        all_errors = []

        for number, instance in enumerate(instances, start=1):
            if not isinstance(instance, dict) or 'place_name' not in instance:
                continue

            errors = self.validate_custodian_place(instance)
            if errors:
                invalid_count += 1
                all_errors.extend(errors)
                print(f"  ❌ Instance {number}: {len(errors)} error(s)")
                for error_type, error_msg in errors:
                    print(f"      [{error_type}] {error_msg}")
            else:
                valid_count += 1
                print(f"  ✅ Instance {number}: Valid")

        return valid_count, invalid_count, all_errors


def main(argv: Optional[List[str]] = None) -> int:
    """Main execution function.

    Args:
        argv: Command-line arguments; defaults to ``sys.argv[1:]`` when None.

    Returns:
        Process exit code: 0 when all instances pass (or there is nothing to
        validate by default), 1 when violations are found or ``--data``
        matches no file.
    """
    parser = argparse.ArgumentParser(
        description='Validate geographic restrictions for FeatureTypeEnum',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Validate single file
  python3 scripts/validate_geographic_restrictions.py --data data/instances/netherlands_museums.yaml

  # Validate all instances
  python3 scripts/validate_geographic_restrictions.py --data "data/instances/*.yaml"
        """
    )
    parser.add_argument(
        '--data',
        type=str,
        help='Path to YAML/JSON data file (or glob pattern)'
    )

    args = parser.parse_args(argv)

    print("🌍 Geographic Restriction Validator")
    print("=" * 60)

    # Paths
    enum_path = PROJECT_ROOT / "schemas/20251121/linkml/modules/enums/FeatureTypeEnum.yaml"

    # Initialize validator
    validator = GeographicRestrictionValidator(enum_path)

    # Find data files (sorted so the report order is deterministic)
    if args.data:
        data_files = sorted(Path(p) for p in glob(args.data))
        if not data_files:
            print(f"❌ No files found matching pattern: {args.data}")
            return 1
    else:
        # Default: look for test data
        data_files = sorted((PROJECT_ROOT / "data/instances").glob("*.yaml"))
        if not data_files:
            print("ℹ️ No data files found. Use --data to specify file path.")
            print("\n✅ Validator loaded successfully. Ready to validate data.")
            return 0

    # Validate all files
    total_valid = 0
    total_invalid = 0
    for data_file in data_files:
        valid, invalid, _ = validator.validate_data_file(data_file)
        total_valid += valid
        total_invalid += invalid

    # Summary
    print("\n" + "=" * 60)
    print("📊 VALIDATION SUMMARY")
    print("=" * 60)
    print(f"Files validated: {len(data_files)}")
    print(f"Valid instances: {total_valid}")
    print(f"Invalid instances: {total_invalid}")

    if total_invalid > 0:
        print(f"\n❌ {total_invalid} instances have geographic restriction violations")
        return 1

    print(f"\n✅ All {total_valid} instances pass geographic restriction validation")
    return 0


if __name__ == '__main__':
    sys.exit(main())