glam/scripts/batch_fix_validation_errors.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

209 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""
Batch fix common validation errors across all YAML instance files.
Fixes:
1. extent → item_count (Collection field alias)
2. metadata_standards → implemented_standards (DigitalPlatform field alias)
3. Temporal coverage format (ISO 8601: YYYY-MM-DD/YYYY-MM-DD)
4. Invalid platform_type enum values (CATALOG → COLLECTION_MANAGEMENT, REPOSITORY → DIGITAL_REPOSITORY)
5. subject_areas → subjects (Collection field alias)
"""
import sys
from pathlib import Path
import re
def load_yaml_content(filepath):
    """Return the raw UTF-8 text of *filepath*, preserving formatting."""
    # Raw text, not a YAML parse: downstream fixers are regex-based and
    # must see the file's original layout.
    return Path(filepath).read_text(encoding='utf-8')
def save_yaml_content(filepath, content):
    """Write *content* to *filepath* as UTF-8 text."""
    Path(filepath).write_text(content, encoding='utf-8')
def fix_alias_fields(content):
    """Rename alias YAML keys to their canonical field names.

    Handles three known aliases:
      - extent             -> item_count            (Collection)
      - metadata_standards -> implemented_standards (DigitalPlatform)
      - subject_areas      -> subjects              (Collection)

    Returns the (possibly modified) content and a list of human-readable
    change descriptions — one entry per alias that occurred at least once.
    """
    changes = []
    # (pattern, replacement, description) per alias. The captured
    # newline-plus-indent group keeps the YAML indentation intact.
    renames = (
        (r'(\n\s+)(extent:)', r'\1item_count:',
         "extent → item_count"),
        (r'(\n\s+)(metadata_standards:)', r'\1implemented_standards:',
         "metadata_standards → implemented_standards"),
        (r'(\n\s+)(subject_areas:)', r'\1subjects:',
         "subject_areas → subjects"),
    )
    for pattern, replacement, label in renames:
        content, hits = re.subn(pattern, replacement, content)
        if hits:
            changes.append(label)
    return content, changes
def fix_temporal_coverage(content):
    """Normalize temporal_coverage values to full ISO 8601 date ranges.

    Bare years are expanded to a full-year interval and zero-padded:
      - 1800/1930 -> 1800-01-01/1930-12-31
      - 800/1500  -> 0800-01-01/1500-12-31
    Values that are already complete dates (YYYY-MM-DD) are kept as-is;
    any other partial format is skipped untouched.

    Returns the (possibly modified) content and a list of change
    descriptions.
    """
    changes = []

    def replace_temporal(match):
        # group(1) is the full "\n<indent>temporal_coverage: " prefix,
        # not just the indent — see the bug-fix note on the return below.
        prefix = match.group(1)
        start = match.group(2)
        end = match.group(3)
        # Start date: bare year -> January 1st, padded to 4 digits.
        if re.match(r'^\d{1,4}$', start):
            start_fixed = f"{start.zfill(4)}-01-01"
        elif re.match(r'^-?\d{1,5}-\d{2}-\d{2}$', start):
            start_fixed = start  # already a full date
        else:
            return match.group(0)  # unrecognized format: leave untouched
        # End date: bare year -> December 31st.
        if re.match(r'^\d{1,4}$', end):
            end_fixed = f"{end.zfill(4)}-12-31"
        elif re.match(r'^-?\d{1,5}-\d{2}-\d{2}$', end):
            end_fixed = end  # already a full date
        else:
            return match.group(0)
        # BUG FIX: the old log message ran old and new values together
        # ("1800/19301800-01-01/..."); add the missing arrow separator.
        changes.append(
            f"temporal_coverage: {start}/{end} → {start_fixed}/{end_fixed}"
        )
        # BUG FIX: the old code returned
        # f"{prefix}temporal_coverage: ..." even though the captured
        # prefix already ends with "temporal_coverage: ", producing a
        # duplicated key ("temporal_coverage: temporal_coverage: ...").
        return f"{prefix}{start_fixed}/{end_fixed}"

    # NOTE(review): \s also matches newlines, so the prefix can span
    # blank lines; harmless here because the key name must follow.
    pattern = r'(\n\s+temporal_coverage: )([^\n]+?)/([^\n]+)'
    content = re.sub(pattern, replace_temporal, content)
    return content, changes
def fix_platform_types(content):
    """Map deprecated platform_type enum values to their current names.

      - CATALOG    -> COLLECTION_MANAGEMENT
      - REPOSITORY -> DIGITAL_REPOSITORY

    Both bare and single-quoted values are handled; quotes are dropped in
    the replacement (the new values are written as plain YAML scalars, as
    the original code did). Returns the (possibly modified) content and a
    list of change descriptions.
    """
    changes = []
    renames = (
        ("CATALOG", "COLLECTION_MANAGEMENT"),
        ("REPOSITORY", "DIGITAL_REPOSITORY"),
    )
    for old, new in renames:
        # BUG FIX: the previous plain str.replace also rewrote longer
        # tokens (e.g. "CATALOGUE" -> "COLLECTION_MANAGEMENTUE"). The
        # (?!\w) lookahead rejects matches followed by a word character.
        pattern = rf"platform_type: '?{old}'?(?!\w)"
        content, hits = re.subn(pattern, f"platform_type: {new}", content)
        if hits:
            changes.append(f"platform_type: {old} → {new}")
    return content, changes
def process_file(filepath):
    """Run every fixer over one YAML file.

    Returns a (modified, changes) pair: whether the file was rewritten,
    and the accumulated change descriptions from all fixers. Errors are
    reported to stdout and treated as "no changes" so the batch run can
    continue with the remaining files.
    """
    try:
        original = load_yaml_content(filepath)
        fixed = original
        all_changes = []
        # Apply each fixer in sequence, threading the content through.
        for fixer in (fix_alias_fields, fix_temporal_coverage,
                      fix_platform_types):
            fixed, changes = fixer(fixed)
            all_changes.extend(changes)
        if fixed == original:
            return False, []
        save_yaml_content(filepath, fixed)
        return True, all_changes
    except Exception as e:
        # Best-effort batch tool: report the failure and move on.
        print(f" ❌ Error processing {filepath}: {e}")
        return False, []
def get_active_yaml_files():
    """Collect the YAML instance files eligible for fixing.

    Scans data/instances under the current working directory and drops
    any path whose string form mentions archives, backups, test outputs,
    or caches. Returns the remaining paths in sorted order.
    """
    instance_dir = Path.cwd() / "data" / "instances"
    # Substring match on the full path, so these exclude both directory
    # names and file names containing the token.
    excluded = ("archive", "backups", "test_outputs", "cache")
    return sorted(
        path
        for path in instance_dir.rglob("*.yaml")
        if not any(token in str(path) for token in excluded)
    )
def main():
    """Fix every active YAML file and print a run summary.

    Returns 0 so the caller can pass the result to sys.exit().
    """
    print("🔧 Batch Fix Validation Errors")
    print("=" * 80)
    print()
    files = get_active_yaml_files()
    total = len(files)
    print(f"📄 Found {total} active YAML files")
    print()
    modified_files = []
    unchanged_files = []
    for index, path in enumerate(files, start=1):
        rel = path.relative_to(Path.cwd())
        print(f"[{index}/{total}] Processing {rel}...", end=" ")
        was_modified, changes = process_file(path)
        if was_modified:
            print(f"✅ Fixed ({len(changes)} changes)")
            modified_files.append((str(rel), changes))
        else:
            print("⏭️ No changes needed")
            unchanged_files.append(str(rel))
    print()
    print("=" * 80)
    print(f"✅ Files modified: {len(modified_files)}")
    print(f"⏭️ Files unchanged: {len(unchanged_files)}")
    print()
    if modified_files:
        print("📝 CHANGES MADE:")
        print("-" * 80)
        for name, changes in modified_files:
            print(f"\n{name}:")
            # Cap the per-file listing at 10 changes to keep output short.
            for change in changes[:10]:
                print(f" - {change}")
            if len(changes) > 10:
                print(f" ... and {len(changes) - 10} more changes")
    print()
    print("✅ Batch fix complete!")
    return 0
# Script entry point: propagate main()'s status code to the shell.
if __name__ == "__main__":
    sys.exit(main())