glam/scripts/validate_geographic_restrictions.py
kempersc 67657c39b6 feat: Complete Country Class Implementation and Hypernyms Removal
- Created the Country class with ISO 3166-1 alpha-2 and alpha-3 codes, ensuring minimal design without additional metadata.
- Integrated the Country class into CustodianPlace and LegalForm schemas to support country-specific feature types and legal forms.
- Removed duplicate keys in FeatureTypeEnum.yaml, resulting in 294 unique feature types.
- Eliminated "Hypernyms:" text from FeatureTypeEnum descriptions, verifying that semantic relationships are now conveyed through ontology mappings.
- Created example instance file demonstrating integration of Country with CustodianPlace and LegalForm.
- Updated documentation to reflect the completion of the Country class implementation and hypernyms removal.
2025-11-23 13:09:38 +01:00

329 lines
12 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Validate geographic restrictions for FeatureTypeEnum.
This script validates that:
1. CustodianPlace.country matches FeaturePlace.feature_type.dcterms:spatial annotation
2. CustodianPlace.subregion matches FeaturePlace.feature_type.iso_3166_2 annotation (if present)
3. CustodianPlace.settlement matches FeaturePlace.feature_type.geonames_id annotation (if present)
Geographic restriction violations indicate data quality issues:
- Using BUITENPLAATS feature type outside Netherlands
- Using SACRED_SHRINE_BALI outside Bali, Indonesia
- Using CITY_OF_PITTSBURGH_HISTORIC_DESIGNATION outside USA
Usage:
python3 scripts/validate_geographic_restrictions.py [--data DATA_FILE]
Options:
--data DATA_FILE Path to YAML/JSON data file with custodian instances
Examples:
# Validate single data file
python3 scripts/validate_geographic_restrictions.py --data data/instances/netherlands_museums.yaml
# Validate all instance files
python3 scripts/validate_geographic_restrictions.py --data "data/instances/*.yaml"
Author: OpenCODE AI Assistant
Date: 2025-11-22
"""
import yaml
import json
import sys
import argparse
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
# Resolve the directory two levels above this file (the parent of the
# directory that holds this script) and put it first on the import path.
# PROJECT_ROOT is also used by main() to locate the FeatureTypeEnum schema
# and the default data directory.
# NOTE(review): no project-local import is visible in this file — confirm
# the sys.path tweak is still needed.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
class GeographicRestrictionValidator:
    """Validator for geographic restrictions on FeatureTypeEnum.

    On construction the FeatureTypeEnum schema is read and, for every
    permissible value that carries geographic annotations, a restriction
    record is stored in ``feature_type_restrictions``::

        {feature_type_name: {'country': ..., 'subregion': ..., 'settlement': ...}}

    Only the keys whose annotation is present appear in a record.

    Attributes:
        enum_path: Path to the FeatureTypeEnum YAML file.
        feature_type_restrictions: Mapping of feature type name to its
            restriction record (see above).
    """

    # (annotation key in the schema, restriction key stored by this class).
    # Order matters only for the key order of the resulting dicts.
    _ANNOTATION_TO_RESTRICTION = (
        ('dcterms:spatial', 'country'),
        ('iso_3166_2', 'subregion'),
        ('geonames_id', 'settlement'),
    )

    def __init__(self, enum_path: Path):
        """
        Initialize validator with FeatureTypeEnum schema.

        Args:
            enum_path: Path to FeatureTypeEnum.yaml
        """
        self.enum_path = enum_path
        self.feature_type_restrictions = {}
        self._load_feature_type_restrictions()

    @classmethod
    def _extract_restrictions(cls, pv_data) -> Dict:
        """Return the restriction record for one permissible value.

        Tolerates a permissible value with an empty body (loaded as ``None``)
        and a missing or null ``annotations`` entry; both yield ``{}``.
        """
        if not isinstance(pv_data, dict):
            return {}
        annotations = pv_data.get('annotations')
        if not isinstance(annotations, dict):
            return {}
        return {
            restriction_key: annotations[annotation_key]
            for annotation_key, restriction_key in cls._ANNOTATION_TO_RESTRICTION
            if annotation_key in annotations
        }

    @staticmethod
    def _coerce_geonames_id(value) -> Optional[int]:
        """Return ``value`` as an int GeoNames ID, or ``None`` if it is not one.

        Accepts ints and strings made only of decimal digits, so that an ID
        written as ``5206379`` and one written as ``"5206379"`` compare equal.
        """
        if isinstance(value, bool):
            # bool is an int subclass but is never a meaningful ID.
            return None
        if isinstance(value, int):
            return value
        if isinstance(value, str):
            stripped = value.strip()
            if stripped.isdecimal():
                return int(stripped)
        return None

    @staticmethod
    def _code_from(value, field: str) -> Optional[str]:
        """Read a code given either directly as a string or nested in a dict."""
        if isinstance(value, dict):
            return value.get(field)
        if isinstance(value, str):
            return value
        return None

    @staticmethod
    def _restriction_errors(place_name, feature_type, noun, label,
                            required, actual, expected) -> List[Tuple[str, str]]:
        """Build the error list for one restriction level.

        Args:
            place_name: Name used in the message.
            feature_type: Feature type enum value used in the message.
            noun: 'country', 'subregion' or 'settlement'; drives the error
                type and the "has no ... specified" wording.
            label: Wording used in "requires <label>=" / "is in <label>=".
            required: Required value as shown in the message.
            actual: Value found on the place (falsy means missing).
            expected: Value ``actual`` is compared against.
        """
        prefix = f"Place '{place_name}' uses {feature_type} (requires {label}={required}) "
        if not actual:
            return [(f'MISSING_{noun.upper()}', prefix + f"but has no {noun} specified")]
        if actual != expected:
            return [(f'{noun.upper()}_MISMATCH', prefix + f"but is in {label}={actual}")]
        return []

    def _load_feature_type_restrictions(self):
        """Load geographic restrictions from FeatureTypeEnum annotations.

        Raises:
            OSError: If the schema file cannot be opened.
            KeyError: If the file lacks the
                ``enums -> FeatureTypeEnum -> permissible_values`` structure.
        """
        print(f"📖 Loading FeatureTypeEnum from {self.enum_path}...")
        with open(self.enum_path, 'r', encoding='utf-8') as f:
            enum_data = yaml.safe_load(f)
        # An empty ``permissible_values:`` key loads as None.
        permissible_values = enum_data['enums']['FeatureTypeEnum']['permissible_values'] or {}
        for pv_name, pv_data in permissible_values.items():
            restrictions = self._extract_restrictions(pv_data)
            if restrictions:
                self.feature_type_restrictions[pv_name] = restrictions
        print(f"✅ Loaded {len(self.feature_type_restrictions)} feature types with geographic restrictions")

    def validate_custodian_place(self, custodian_place: Dict) -> List[Tuple[str, str]]:
        """
        Validate geographic restrictions for a CustodianPlace instance.

        The country may be a dict with ``alpha_2`` or a plain string; the
        subregion a dict with ``iso_3166_2_code`` or a plain string; the
        settlement a dict with ``geonames_id`` or a bare ID (int or numeric
        string). The feature type may be a dict with ``feature_type`` or a
        plain string.

        Args:
            custodian_place: Dict representing CustodianPlace instance

        Returns:
            List of (error_type, error_message) tuples. Empty list if valid,
            if no feature type is given, or if the feature type has no
            geographic restrictions.
        """
        # Resolve the feature type first: without it nothing can be checked.
        has_feature_type = custodian_place.get('has_feature_type')
        if not has_feature_type:
            return []
        if isinstance(has_feature_type, dict):
            feature_type_enum = has_feature_type.get('feature_type')
        elif isinstance(has_feature_type, str):
            feature_type_enum = has_feature_type
        else:
            return []
        # Guard against unhashable values (e.g. a list), which would make
        # the dict lookup below raise TypeError.
        if not isinstance(feature_type_enum, str):
            return []

        restrictions = self.feature_type_restrictions.get(feature_type_enum)
        if not restrictions:
            return []

        place_name = custodian_place.get('place_name', 'UNKNOWN')
        errors = []

        if 'country' in restrictions:
            required_country = restrictions['country']
            place_country_code = self._code_from(
                custodian_place.get('country', {}), 'alpha_2')  # assumed ISO alpha-2
            errors.extend(self._restriction_errors(
                place_name, feature_type_enum, 'country', 'country',
                required_country, place_country_code, required_country))

        if 'subregion' in restrictions:
            required_subregion = restrictions['subregion']
            place_subregion_code = self._code_from(
                custodian_place.get('subregion', {}), 'iso_3166_2_code')
            errors.extend(self._restriction_errors(
                place_name, feature_type_enum, 'subregion', 'subregion',
                required_subregion, place_subregion_code, required_subregion))

        if 'settlement' in restrictions:
            required_settlement = restrictions['settlement']
            place_settlement = custodian_place.get('settlement', {})
            if isinstance(place_settlement, dict):
                raw_id = place_settlement.get('geonames_id')
                coerced = self._coerce_geonames_id(raw_id)
                # Keep an unparseable nested value so it is reported as a
                # mismatch rather than as missing.
                place_settlement_id = raw_id if coerced is None else coerced
            else:
                place_settlement_id = self._coerce_geonames_id(place_settlement)
            # Compare numerically when the schema value is an ID; otherwise
            # fall back to comparing against the raw schema value.
            expected_id = self._coerce_geonames_id(required_settlement)
            if expected_id is None:
                expected_id = required_settlement
            errors.extend(self._restriction_errors(
                place_name, feature_type_enum, 'settlement', 'settlement GeoNames ID',
                required_settlement, place_settlement_id, expected_id))

        return errors

    def validate_data_file(self, data_path: Path) -> Tuple[int, int, List[Tuple[str, str]]]:
        """
        Validate all CustodianPlace instances in a data file.

        The file may hold a single instance or a list of instances. Entries
        that are not dicts, or that lack a ``place_name`` key, are skipped
        and counted as neither valid nor invalid. Progress is printed per
        instance.

        Args:
            data_path: Path to YAML or JSON data file

        Returns:
            Tuple of (valid_count, invalid_count, all_errors). ``(0, 0, [])``
            for an unsupported file extension.
        """
        print(f"\n📖 Validating {data_path}...")

        # Pick the parser before touching the file so unsupported files are
        # never opened; the extension check is case-insensitive.
        suffix = data_path.suffix.lower()
        if suffix in ('.yaml', '.yml'):
            parse = yaml.safe_load
        elif suffix == '.json':
            parse = json.load
        else:
            print(f"❌ Unsupported file type: {data_path.suffix}")
            return 0, 0, []

        with open(data_path, 'r', encoding='utf-8') as f:
            data = parse(f)

        # Handle both single instance and list of instances
        instances = data if isinstance(data, list) else [data]

        valid_count = 0
        invalid_count = 0
        all_errors = []
        for i, instance in enumerate(instances):
            # Only dicts carrying 'place_name' are treated as CustodianPlace.
            if not isinstance(instance, dict) or 'place_name' not in instance:
                continue
            errors = self.validate_custodian_place(instance)
            if errors:
                invalid_count += 1
                all_errors.extend(errors)
                print(f" ❌ Instance {i+1}: {len(errors)} error(s)")
                for error_type, error_msg in errors:
                    print(f" [{error_type}] {error_msg}")
            else:
                valid_count += 1
                print(f" ✅ Instance {i+1}: Valid")
        return valid_count, invalid_count, all_errors
def main():
    """Main execution function.

    Parses ``--data`` (a file path or glob pattern), loads the
    FeatureTypeEnum schema from under PROJECT_ROOT, validates every matching
    data file and prints a summary. Without ``--data`` it falls back to
    ``data/instances/*.yaml`` under PROJECT_ROOT.

    Returns:
        Process exit status: 1 if the schema file is missing, if ``--data``
        matched no files, or if any instance is invalid; 0 otherwise
        (including when no default data files exist).
    """
    parser = argparse.ArgumentParser(
        description='Validate geographic restrictions for FeatureTypeEnum',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Validate single file
python3 scripts/validate_geographic_restrictions.py --data data/instances/netherlands_museums.yaml
# Validate all instances
python3 scripts/validate_geographic_restrictions.py --data "data/instances/*.yaml"
"""
    )
    parser.add_argument(
        '--data',
        type=str,
        help='Path to YAML/JSON data file (or glob pattern)'
    )
    args = parser.parse_args()

    print("🌍 Geographic Restriction Validator")
    print("=" * 60)

    # Paths
    enum_path = PROJECT_ROOT / "schemas/20251121/linkml/modules/enums/FeatureTypeEnum.yaml"
    if not enum_path.is_file():
        # Fail with a readable message instead of a traceback.
        print(f"❌ FeatureTypeEnum schema not found: {enum_path}")
        return 1

    # Initialize validator
    validator = GeographicRestrictionValidator(enum_path)

    # Find data files. Glob results come back in arbitrary order, so sort
    # them to keep the report stable between runs.
    if args.data:
        from glob import glob
        data_files = sorted(Path(p) for p in glob(args.data))
        if not data_files:
            print(f"❌ No files found matching pattern: {args.data}")
            return 1
    else:
        # Default: look for test data
        data_files = sorted((PROJECT_ROOT / "data/instances").glob("*.yaml"))
        if not data_files:
            print(" No data files found. Use --data to specify file path.")
            print("\n✅ Validator loaded successfully. Ready to validate data.")
            return 0

    # Validate all files; per-error details are already printed by the
    # validator, so only the counts are accumulated here.
    total_valid = 0
    total_invalid = 0
    for data_file in data_files:
        valid, invalid, _ = validator.validate_data_file(data_file)
        total_valid += valid
        total_invalid += invalid

    # Summary
    print("\n" + "=" * 60)
    print("📊 VALIDATION SUMMARY")
    print("=" * 60)
    print(f"Files validated: {len(data_files)}")
    print(f"Valid instances: {total_valid}")
    print(f"Invalid instances: {total_invalid}")
    if total_invalid > 0:
        print(f"\n{total_invalid} instances have geographic restriction violations")
        return 1
    print(f"\n✅ All {total_valid} instances pass geographic restriction validation")
    return 0
if __name__ == '__main__':
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    raise SystemExit(main())