- Created the Country class with ISO 3166-1 alpha-2 and alpha-3 codes, ensuring minimal design without additional metadata. - Integrated the Country class into CustodianPlace and LegalForm schemas to support country-specific feature types and legal forms. - Removed duplicate keys in FeatureTypeEnum.yaml, resulting in 294 unique feature types. - Eliminated "Hypernyms:" text from FeatureTypeEnum descriptions, verifying that semantic relationships are now conveyed through ontology mappings. - Created example instance file demonstrating integration of Country with CustodianPlace and LegalForm. - Updated documentation to reflect the completion of the Country class implementation and hypernyms removal.
329 lines
12 KiB
Python
329 lines
12 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Validate geographic restrictions for FeatureTypeEnum.
|
||
|
||
This script validates that:
|
||
1. CustodianPlace.country matches FeaturePlace.feature_type.dcterms:spatial annotation
|
||
2. CustodianPlace.subregion matches FeaturePlace.feature_type.iso_3166_2 annotation (if present)
|
||
3. CustodianPlace.settlement matches FeaturePlace.feature_type.geonames_id annotation (if present)
|
||
|
||
Geographic restriction violations indicate data quality issues:
|
||
- Using BUITENPLAATS feature type outside Netherlands
|
||
- Using SACRED_SHRINE_BALI outside Bali, Indonesia
|
||
- Using CITY_OF_PITTSBURGH_HISTORIC_DESIGNATION outside USA
|
||
|
||
Usage:
|
||
python3 scripts/validate_geographic_restrictions.py [--data DATA_FILE]
|
||
|
||
Options:
|
||
--data DATA_FILE Path to YAML/JSON data file with custodian instances
|
||
|
||
Examples:
|
||
# Validate single data file
|
||
python3 scripts/validate_geographic_restrictions.py --data data/instances/netherlands_museums.yaml
|
||
|
||
# Validate all instance files
|
||
python3 scripts/validate_geographic_restrictions.py --data "data/instances/*.yaml"
|
||
|
||
Author: OpenCODE AI Assistant
|
||
Date: 2025-11-22
|
||
"""
|
||
|
||
import yaml
|
||
import json
|
||
import sys
|
||
import argparse
|
||
from pathlib import Path
|
||
from typing import Dict, List, Tuple, Optional
|
||
from collections import defaultdict
|
||
|
||
# PROJECT_ROOT is the directory two levels above this file; it is placed at
# the front of sys.path.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path[:0] = [str(PROJECT_ROOT)]
|
||
|
||
|
||
class GeographicRestrictionValidator:
    """Validate CustodianPlace geography against FeatureTypeEnum restrictions.

    Restrictions are read from the ``annotations`` of each permissible value
    of ``FeatureTypeEnum``:

    * ``dcterms:spatial`` -> required country code
    * ``iso_3166_2``      -> required subregion code
    * ``geonames_id``     -> required settlement GeoNames ID

    After construction, ``feature_type_restrictions`` maps a permissible
    value name to a dict holding any of the keys ``country``, ``subregion``
    and ``settlement``.
    """

    # (annotation tag in the enum schema, key used in feature_type_restrictions)
    _ANNOTATION_TO_RESTRICTION = (
        ('dcterms:spatial', 'country'),
        ('iso_3166_2', 'subregion'),
        ('geonames_id', 'settlement'),
    )

    def __init__(self, enum_path: Path):
        """
        Initialize validator with FeatureTypeEnum schema.

        Args:
            enum_path: Path to FeatureTypeEnum.yaml
        """
        self.enum_path = enum_path
        self.feature_type_restrictions = {}
        self._load_feature_type_restrictions()

    @staticmethod
    def _annotation_value(raw):
        """Return the payload of an annotation.

        An annotation may be a bare value or a mapping carrying the payload
        under a ``value`` key; both shapes are accepted.
        """
        if isinstance(raw, dict) and 'value' in raw:
            return raw['value']
        return raw

    @staticmethod
    def _code_of(value, key):
        """Return a code given either a nested dict (``value[key]``) or a bare string."""
        if isinstance(value, dict):
            return value.get(key)
        if isinstance(value, str):
            return value
        return None

    @staticmethod
    def _settlement_id(settlement):
        """Return the GeoNames ID of a settlement, or None when there is none.

        Accepts a dict with a ``geonames_id`` key, a bare integer, or a bare
        string made only of digits. Other strings are not treated as IDs.
        """
        if isinstance(settlement, dict):
            return settlement.get('geonames_id')
        if isinstance(settlement, bool):
            # bool is a subclass of int but is never a GeoNames ID.
            return None
        if isinstance(settlement, int):
            return settlement
        if isinstance(settlement, str) and settlement.strip().isdigit():
            return settlement.strip()
        return None

    @staticmethod
    def _same_geonames_id(actual, required) -> bool:
        """Compare two GeoNames IDs, treating ``123`` and ``"123"`` as equal."""
        return actual == required or str(actual).strip() == str(required).strip()

    def _load_feature_type_restrictions(self):
        """Load geographic restrictions from FeatureTypeEnum annotations."""
        print(f"📖 Loading FeatureTypeEnum from {self.enum_path}...")

        with open(self.enum_path, 'r', encoding='utf-8') as f:
            enum_data = yaml.safe_load(f)

        permissible_values = enum_data['enums']['FeatureTypeEnum']['permissible_values']

        for pv_name, pv_data in (permissible_values or {}).items():
            # A permissible value written as a bare key ("NAME:") loads as
            # None, and so does an empty "annotations:" entry.
            annotations = (pv_data or {}).get('annotations') or {}

            restrictions = {}
            for tag, key in self._ANNOTATION_TO_RESTRICTION:
                if tag not in annotations:
                    continue
                value = self._annotation_value(annotations[tag])
                # An annotation without a value restricts nothing.
                if value is not None:
                    restrictions[key] = value

            if restrictions:
                self.feature_type_restrictions[pv_name] = restrictions

        print(f"✅ Loaded {len(self.feature_type_restrictions)} feature types with geographic restrictions")

    def validate_custodian_place(self, custodian_place: Dict) -> List[Tuple[str, str]]:
        """
        Validate geographic restrictions for a CustodianPlace instance.

        Args:
            custodian_place: Dict representing CustodianPlace instance

        Returns:
            List of (error_type, error_message) tuples. Empty list if valid,
            if the place has no feature type, or if its feature type carries
            no geographic restriction.
        """
        errors = []
        place_name = custodian_place.get('place_name', 'UNKNOWN')

        # Resolve the feature type enum value (nested dict or bare string).
        has_feature_type = custodian_place.get('has_feature_type')
        if not has_feature_type:
            return errors  # No feature type, nothing to validate
        if isinstance(has_feature_type, dict):
            feature_type_enum = has_feature_type.get('feature_type')
        elif isinstance(has_feature_type, str):
            feature_type_enum = has_feature_type
        else:
            return errors

        try:
            restrictions = self.feature_type_restrictions.get(feature_type_enum)
        except TypeError:
            # Unhashable value (list/dict) cannot name an enum member.
            return errors
        if not restrictions:
            return errors  # No restrictions, valid

        # Resolve the place's own geography (each may be nested or direct).
        country_code = self._code_of(custodian_place.get('country', {}), 'alpha_2')
        subregion_code = self._code_of(custodian_place.get('subregion', {}), 'iso_3166_2_code')
        settlement_id = self._settlement_id(custodian_place.get('settlement', {}))

        # (restriction key, label used in messages, value found on the place)
        checks = (
            ('country', 'country', country_code),
            ('subregion', 'subregion', subregion_code),
            ('settlement', 'settlement GeoNames ID', settlement_id),
        )

        for key, label, actual in checks:
            if key not in restrictions:
                continue
            required = restrictions[key]
            prefix = f"Place '{place_name}' uses {feature_type_enum} (requires {label}={required}) "

            if not actual:
                errors.append((f"MISSING_{key.upper()}", prefix + f"but has no {key} specified"))
                continue

            if key == 'settlement':
                # IDs may be int in the schema and str in the data, or vice versa.
                matches = self._same_geonames_id(actual, required)
            else:
                matches = actual == required

            if not matches:
                errors.append((f"{key.upper()}_MISMATCH", prefix + f"but is in {label}={actual}"))

        return errors

    def validate_data_file(self, data_path: Path) -> Tuple[int, int, List[Tuple[str, str]]]:
        """
        Validate all CustodianPlace instances in a data file.

        Only top-level dict instances that contain a ``place_name`` key are
        validated; everything else in the file is skipped.

        Args:
            data_path: Path to YAML or JSON data file

        Returns:
            Tuple of (valid_count, invalid_count, all_errors). An unsupported
            file extension yields (0, 0, []).
        """
        data_path = Path(data_path)
        print(f"\n📖 Validating {data_path}...")

        # Pick the parser before touching the file so an unsupported
        # extension is reported without opening anything.
        suffix = data_path.suffix.lower()
        if suffix in ('.yaml', '.yml'):
            parse = yaml.safe_load
        elif suffix == '.json':
            parse = json.load
        else:
            print(f"❌ Unsupported file type: {data_path.suffix}")
            return 0, 0, []

        with open(data_path, 'r', encoding='utf-8') as f:
            data = parse(f)

        # Handle both single instance and list of instances
        instances = data if isinstance(data, list) else [data]

        valid_count = 0
        invalid_count = 0
        all_errors = []

        for number, instance in enumerate(instances, start=1):
            # Treat only dicts carrying 'place_name' as CustodianPlace instances.
            if not isinstance(instance, dict) or 'place_name' not in instance:
                continue

            errors = self.validate_custodian_place(instance)
            if errors:
                invalid_count += 1
                all_errors.extend(errors)
                print(f" ❌ Instance {number}: {len(errors)} error(s)")
                for error_type, error_msg in errors:
                    print(f" [{error_type}] {error_msg}")
            else:
                valid_count += 1
                print(f" ✅ Instance {number}: Valid")

        return valid_count, invalid_count, all_errors
|
||
|
||
|
||
def main():
    """Run the geographic restriction validator from the command line.

    Loads the FeatureTypeEnum schema under PROJECT_ROOT, validates every data
    file selected by ``--data`` (a path or glob pattern; ``**`` recurses into
    subdirectories), or every ``*.yaml`` file in ``data/instances`` when
    ``--data`` is omitted, and prints a summary.

    Returns:
        Process exit status: 0 when every validated instance passes (or no
        default data files exist), 1 when any instance has a violation, when
        ``--data`` matches no file, or when the schema cannot be read.
    """
    from glob import glob

    parser = argparse.ArgumentParser(
        description='Validate geographic restrictions for FeatureTypeEnum',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Validate single file
  python3 scripts/validate_geographic_restrictions.py --data data/instances/netherlands_museums.yaml

  # Validate all instances
  python3 scripts/validate_geographic_restrictions.py --data "data/instances/*.yaml"
        """
    )
    parser.add_argument(
        '--data',
        type=str,
        help='Path to YAML/JSON data file (or glob pattern)'
    )

    args = parser.parse_args()

    print("🌍 Geographic Restriction Validator")
    print("=" * 60)

    # Paths
    enum_path = PROJECT_ROOT / "schemas/20251121/linkml/modules/enums/FeatureTypeEnum.yaml"

    # Initialize validator; report an unreadable schema instead of a traceback.
    try:
        validator = GeographicRestrictionValidator(enum_path)
    except OSError as exc:
        print(f"❌ Cannot read FeatureTypeEnum schema {enum_path}: {exc}")
        return 1

    # Find data files. Sorting keeps the report order reproducible, and
    # directories that happen to match the pattern are dropped because they
    # cannot be opened as data files.
    if args.data:
        data_files = sorted(
            path
            for path in (Path(match) for match in glob(args.data, recursive=True))
            if path.is_file()
        )

        if not data_files:
            print(f"❌ No files found matching pattern: {args.data}")
            return 1
    else:
        # Default: look for test data
        data_files = sorted(
            path
            for path in (PROJECT_ROOT / "data/instances").glob("*.yaml")
            if path.is_file()
        )

        if not data_files:
            print("ℹ️ No data files found. Use --data to specify file path.")
            print("\n✅ Validator loaded successfully. Ready to validate data.")
            return 0

    # Validate all files
    total_valid = 0
    total_invalid = 0

    for data_file in data_files:
        # Per-error details are already printed by validate_data_file.
        valid, invalid, _ = validator.validate_data_file(data_file)
        total_valid += valid
        total_invalid += invalid

    # Summary
    print("\n" + "=" * 60)
    print("📊 VALIDATION SUMMARY")
    print("=" * 60)
    print(f"Files validated: {len(data_files)}")
    print(f"Valid instances: {total_valid}")
    print(f"Invalid instances: {total_invalid}")

    if total_invalid > 0:
        print(f"\n❌ {total_invalid} instances have geographic restriction violations")
        return 1

    print(f"\n✅ All {total_valid} instances pass geographic restriction validation")
    return 0
|
||
|
||
|
||
# Script entry point: exit with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|