glam/scripts/validate_yaml_instance.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

133 lines
4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Validate YAML instance files against the Pydantic models.

Usage:
    python scripts/validate_yaml_instance.py <yaml_file>
"""
import sys
import yaml
from pathlib import Path
from datetime import date, datetime
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.models import (
HeritageCustodian,
Location,
Identifier,
ChangeEvent,
Provenance,
DigitalPlatform,
InstitutionType,
OrganizationStatus,
DataSource,
DataTier,
ChangeType,
)
def _coerce_temporal_fields(instance_data: dict) -> None:
    """Convert ISO-8601 strings in *instance_data* to date/datetime objects, in place.

    Only two fields are touched: ``event_date`` on each ``change_history``
    entry (parsed with ``date.fromisoformat``) and ``extraction_date`` on
    ``provenance`` (parsed with ``datetime.fromisoformat``). Values that are
    not strings are left untouched.

    Raises:
        ValueError: If a string value is not a valid ISO-8601 date/datetime.
    """
    for event in instance_data.get('change_history') or []:
        if 'event_date' in event and isinstance(event['event_date'], str):
            event['event_date'] = date.fromisoformat(event['event_date'])

    prov = instance_data.get('provenance')
    if prov and 'extraction_date' in prov and isinstance(prov['extraction_date'], str):
        # A trailing "Z" (UTC designator) is rewritten to an explicit offset
        # before parsing, since older fromisoformat() versions reject "Z".
        prov['extraction_date'] = datetime.fromisoformat(
            prov['extraction_date'].replace('Z', '+00:00')
        )


def validate_yaml_file(yaml_path: Path) -> bool:
    """Validate a YAML instance file against the Pydantic models.

    The file must contain a top-level list; each element is passed as keyword
    arguments to ``HeritageCustodian``. Progress and per-instance results are
    printed to stdout. A failure on one instance is reported and does not stop
    validation of the remaining instances.

    Args:
        yaml_path: Path to YAML file containing HeritageCustodian instances.

    Returns:
        True if all instances are valid, False otherwise (including when the
        top-level YAML value is not a list).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not well-formed YAML.
    """
    print(f"📄 Validating: {yaml_path.name}")
    print()

    # safe_load never constructs arbitrary Python objects from the document.
    with open(yaml_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    if not isinstance(data, list):
        print("❌ Error: YAML must contain a list of HeritageCustodian instances")
        return False

    total = len(data)
    print(f"Found {total} institution(s) to validate")
    print()

    all_valid = True
    for i, instance_data in enumerate(data, 1):
        is_mapping = isinstance(instance_data, dict)
        display_name = instance_data.get('name', 'Unknown') if is_mapping else 'Unknown'
        print(f"Validating institution {i}/{total}: {display_name}")

        try:
            if not is_mapping:
                # Report a clear reason instead of an opaque AttributeError
                # from calling .get() on a scalar or list entry.
                raise TypeError(
                    f"expected a mapping of fields, got {type(instance_data).__name__}"
                )
            _coerce_temporal_fields(instance_data)
            custodian = HeritageCustodian(**instance_data)
        except Exception as e:
            # Any failure while preparing or constructing the model counts as
            # a validation failure for this instance only.
            print(" ❌ Validation Error:")
            print(f" {type(e).__name__}: {e}")
            print()
            all_valid = False
            continue

        # Summary is printed outside the try so that a reporting problem can
        # never flip an already-validated instance to "invalid".
        # NOTE(review): provenance may be optional on the model — confirm;
        # getattr keeps the summary working if it is None.
        confidence = getattr(custodian.provenance, 'confidence_score', None)
        print(f" ✅ Valid: {custodian.name}")
        print(f" - Type: {custodian.institution_type}")
        print(f" - Locations: {len(custodian.locations or [])}")
        print(f" - Identifiers: {len(custodian.identifiers or [])}")
        print(f" - Events: {len(custodian.change_history or [])}")
        print(f" - Confidence: {confidence}")
        print()

    return all_valid
def main():
    """Command-line entry point: validate the YAML file named in ``sys.argv[1]``.

    Prints a usage message and exits with status 1 when no file argument is
    given or the path does not exist. Otherwise runs ``validate_yaml_file``
    and exits with status 0 if it returns a truthy value, 1 if it returns a
    falsy value or raises an exception (the traceback is printed).
    """
    banner = "=" * 80
    cli_args = sys.argv[1:]

    if not cli_args:
        usage_lines = (
            "Usage: python scripts/validate_yaml_instance.py <yaml_file>",
            "",
            "Example:",
            " python scripts/validate_yaml_instance.py data/instances/test_outputs/test_brazilian_institutions.yaml",
        )
        for line in usage_lines:
            print(line)
        sys.exit(1)

    yaml_path = Path(cli_args[0])
    if not yaml_path.exists():
        print(f"❌ Error: File not found: {yaml_path}")
        sys.exit(1)

    for line in (banner, "YAML INSTANCE VALIDATION", banner, ""):
        print(line)

    try:
        ok = validate_yaml_file(yaml_path)
        verdict = "✅ All instances are valid!" if ok else "❌ Some instances failed validation"
        print(banner)
        print(verdict)
        print(banner)
        # SystemExit is not an Exception subclass, so it passes through the
        # handler below untouched.
        sys.exit(0 if ok else 1)
    except Exception as exc:
        print(f"❌ Fatal Error: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
# Run the command-line entry point only when this file is executed as a
# script; importing it as a module does not trigger validation.
if __name__ == "__main__":
    main()