- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
209 lines · 7.2 KiB · Python
#!/usr/bin/env python3
"""
Batch fix common validation errors across all YAML instance files.

Fixes:
1. extent → item_count (Collection field alias)
2. metadata_standards → implemented_standards (DigitalPlatform field alias)
3. Temporal coverage format (ISO 8601: YYYY-MM-DD/YYYY-MM-DD)
4. Invalid platform_type enum values (CATALOG → COLLECTION_MANAGEMENT, REPOSITORY → DIGITAL_REPOSITORY)
5. subject_areas → subjects (Collection field alias)
"""

import re
import sys
from pathlib import Path

def load_yaml_content(filepath):
    """Return the raw text of *filepath* (UTF-8), preserving formatting.

    The file is read as plain text rather than parsed, so comments,
    ordering and indentation survive a later round-trip write.
    """
    return Path(filepath).read_text(encoding='utf-8')

def save_yaml_content(filepath, content):
    """Write *content* to *filepath* as UTF-8 text, replacing the file."""
    Path(filepath).write_text(content, encoding='utf-8')

def fix_alias_fields(content):
    """Rename alias field names to their canonical schema names.

    Handles three known aliases: ``extent`` → ``item_count`` and
    ``subject_areas`` → ``subjects`` (Collection), and
    ``metadata_standards`` → ``implemented_standards`` (DigitalPlatform).

    NOTE(review): the patterns match any indented occurrence of the key,
    not strictly keys inside the collections/digital_platforms sections —
    same behavior as before, just worth knowing.

    Returns:
        (content, changes): the possibly-rewritten text and a list of
        human-readable descriptions of the renames that were applied.
    """
    changes = []

    # (pattern, canonical key, change label) — patterns anchor on a
    # newline plus indentation so only YAML keys are touched.
    renames = (
        (r'(\n\s+)(extent:)', 'item_count:', "extent → item_count"),
        (r'(\n\s+)(metadata_standards:)', 'implemented_standards:',
         "metadata_standards → implemented_standards"),
        (r'(\n\s+)(subject_areas:)', 'subjects:', "subject_areas → subjects"),
    )

    for pattern, canonical, label in renames:
        if re.search(pattern, content):
            content = re.sub(pattern, r'\1' + canonical, content)
            changes.append(label)

    return content, changes

def fix_temporal_coverage(content):
    """Normalize ``temporal_coverage`` values to full ISO 8601 date ranges.

    Examples of normalizations:
      - 1800/1930       → 1800-01-01/1930-12-31
      - 800/1500        → 0800-01-01/1500-12-31
      - 1800-05/1930    → left untouched (unrecognized partial start)

    BUG FIX: the match prefix group already contains the literal
    ``temporal_coverage: `` key, but the old replacement emitted the key
    a second time, producing ``temporal_coverage: temporal_coverage: ...``
    in the output. The replacement now emits only the prefix plus the
    normalized dates.

    Returns:
        (content, changes): rewritten text and a list of descriptions of
        each range that was processed.
    """
    changes = []

    def normalize(date, month_day):
        """Return *date* as a full ISO date, or None if unrecognized."""
        # Bare year (1–4 digits): zero-pad and append the boundary day.
        if re.match(r'^\d{1,4}$', date):
            return f"{date.zfill(4)}{month_day}"
        # Already a complete (possibly negative-year) ISO date.
        if re.match(r'^-?\d{1,5}-\d{2}-\d{2}$', date):
            return date
        return None

    def replace_temporal(match):
        # prefix = "\n<indent>temporal_coverage: " — includes the key.
        prefix, start, end = match.groups()
        start_fixed = normalize(start, "-01-01")
        end_fixed = normalize(end, "-12-31")
        if start_fixed is None or end_fixed is None:
            # Unrecognized format: leave the original line untouched.
            return match.group(0)
        changes.append(f"temporal_coverage: {start}/{end} → {start_fixed}/{end_fixed}")
        return f"{prefix}{start_fixed}/{end_fixed}"

    # Match "temporal_coverage: <start>/<end>" on its own indented line.
    pattern = r'(\n\s+temporal_coverage: )([^\n]+?)/([^\n]+)'
    content = re.sub(pattern, replace_temporal, content)

    return content, changes

def fix_platform_types(content):
    """Fix invalid platform_type enum values.

    Rewrites:
      - platform_type: CATALOG    → platform_type: COLLECTION_MANAGEMENT
      - platform_type: REPOSITORY → platform_type: DIGITAL_REPOSITORY
    Quoted values ('CATALOG' / "CATALOG") are unquoted in the output,
    matching the previous behavior for single-quoted values.

    ROBUSTNESS FIX: the old bare ``str.replace`` matched the invalid value
    as a raw substring, so a longer token such as ``CATALOGING`` would be
    corrupted into ``COLLECTION_MANAGEMENTING``. The regex below anchors
    on a token boundary so only the exact enum value is rewritten.

    Returns:
        (content, changes): rewritten text and a list of descriptions of
        the substitutions that were applied.
    """
    changes = []

    enum_fixes = (
        ("CATALOG", "COLLECTION_MANAGEMENT"),
        ("REPOSITORY", "DIGITAL_REPOSITORY"),
    )

    for old, new in enum_fixes:
        # Optional matched quote pair, then the value, then a guard that
        # the value is not a prefix of a longer enum token.
        pattern = re.compile(
            r"platform_type: (['\"]?)" + re.escape(old) + r"\1(?![A-Z_])"
        )
        if pattern.search(content):
            content = pattern.sub(f"platform_type: {new}", content)
            changes.append(f"platform_type: {old} → {new}")

    return content, changes

def process_file(filepath):
    """Apply every known fix to one YAML file, saving only if it changed.

    Returns:
        (modified, changes): whether the file was rewritten on disk, and
        the accumulated list of change descriptions. Errors are reported
        to stdout and treated as "no changes" (best-effort batch run).
    """
    try:
        original = load_yaml_content(filepath)
        content = original
        collected = []

        # Run the fixers in a fixed order, threading the content through.
        for fixer in (fix_alias_fields, fix_temporal_coverage, fix_platform_types):
            content, changes = fixer(content)
            collected.extend(changes)

        # Avoid touching mtimes of files that needed no fixes.
        if content == original:
            return False, []

        save_yaml_content(filepath, content)
        return True, collected

    except Exception as e:
        print(f" ❌ Error processing {filepath}: {e}")
        return False, []

def get_active_yaml_files():
    """Return all active YAML instance files, sorted.

    Scans <cwd>/data/instances recursively and skips any path whose
    string form contains an exclusion marker (archives, backups, test
    outputs, caches).
    """
    instance_dir = Path.cwd() / "data" / "instances"
    excluded_markers = ("archive", "backups", "test_outputs", "cache")

    return sorted(
        path
        for path in instance_dir.rglob("*.yaml")
        if not any(marker in str(path) for marker in excluded_markers)
    )

def main():
    """Run the batch fixer over all active YAML files and report results.

    Prints a per-file progress line, then a summary of modified vs
    unchanged files and up to 10 change descriptions per modified file.
    Always returns 0 (errors are reported per-file, not fatal).
    """
    banner = "=" * 80
    print("🔧 Batch Fix Validation Errors")
    print(banner)
    print()

    yaml_files = get_active_yaml_files()
    total = len(yaml_files)
    print(f"📄 Found {total} active YAML files")
    print()

    modified = []        # (relative path string, change descriptions)
    unchanged_count = 0

    for index, path in enumerate(yaml_files, 1):
        rel = path.relative_to(Path.cwd())
        print(f"[{index}/{total}] Processing {rel}...", end=" ")

        was_fixed, changes = process_file(path)
        if was_fixed:
            print(f"✅ Fixed ({len(changes)} changes)")
            modified.append((str(rel), changes))
        else:
            print("⏭️ No changes needed")
            unchanged_count += 1

    print()
    print(banner)
    print(f"✅ Files modified: {len(modified)}")
    print(f"⏭️ Files unchanged: {unchanged_count}")
    print()

    if modified:
        print("📝 CHANGES MADE:")
        print("-" * 80)
        for rel_path, changes in modified:
            print(f"\n{rel_path}:")
            for change in changes[:10]:  # cap the per-file detail at 10
                print(f" - {change}")
            if len(changes) > 10:
                print(f" ... and {len(changes) - 10} more changes")

    print()
    print("✅ Batch fix complete!")
    return 0

if __name__ == "__main__":
    # Propagate main()'s status code to the shell.
    raise SystemExit(main())