glam/scripts/fix_remaining_validation_errors.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

229 lines
8.1 KiB
Python

#!/usr/bin/env python3
"""
Fix remaining validation errors in YAML files.
Common error patterns:
1. Invalid metadata standard enum values (Dublin Core → DUBLIN_CORE, DSpace, OAI-PMH, Z39.50, UNIMARC, ISAD(G))
2. 'notes' field in provenance (should be in HeritageCustodian.description)
3. 'description' field in collections/digital_platforms (not allowed)
4. 'enrichment_history' at root level (should be in provenance)
5. Temporal coverage with "present" or empty strings
6. Missing required fields in enrichment_history
"""
import yaml
import re
from pathlib import Path
from datetime import datetime
# Metadata standards mapping
STANDARDS_MAPPING = {
'Dublin Core': 'DUBLIN_CORE',
'dublin core': 'DUBLIN_CORE',
'DUBLIN-CORE': 'DUBLIN_CORE',
'DSpace': None, # DSpace is a platform, not a metadata standard - remove
'OAI-PMH': None, # OAI-PMH is a protocol, not a metadata standard - remove
'Z39.50': None, # Z39.50 is a protocol, not a metadata standard - remove
'UNIMARC': 'MARC21', # Map UNIMARC to MARC21 (closest equivalent)
'ISAD(G)': 'EAD', # ISAD(G) archival standard maps to EAD
}
def fix_metadata_standards(standards_list):
"""Map or remove invalid metadata standards."""
if not standards_list:
return standards_list
fixed = []
for std in standards_list:
if std in STANDARDS_MAPPING:
mapped = STANDARDS_MAPPING[std]
if mapped: # Only add if there's a valid mapping
fixed.append(mapped)
else:
fixed.append(std) # Keep valid standards as-is
return list(set(fixed)) if fixed else None # Remove duplicates
def fix_temporal_coverage(coverage_str):
    """Fix temporal coverage patterns like '1983-01-01/present' or empty strings.

    Returns None for empty/blank values; otherwise replaces the word
    "present" (any case) with December 31 of the current year so the range
    validates as a concrete date interval.
    """
    if not coverage_str or coverage_str.strip() == '':
        return None  # Remove empty temporal coverage
    if 'present' in coverage_str.lower():
        current_year = datetime.now().year
        # Anchored with \b (not '/present') so a bare "present" value is
        # fixed too; for the '1983-01-01/present' form the result is the
        # same '/YYYY-12-31' replacement as before.
        coverage_str = re.sub(
            r'\bpresent\b',
            f'{current_year}-12-31',
            coverage_str,
            flags=re.IGNORECASE
        )
    return coverage_str
def fix_enrichment_history(entry):
    """Ensure an enrichment_history entry carries all required fields.

    Mutates *entry* in place and returns it for convenience.
    """
    # Backfill required fields with conservative defaults.
    entry.setdefault('enrichment_type', 'MANUAL')  # default: manual enrichment
    entry.setdefault('verified', False)            # default: unverified
    # The schema expects a single source string; collapse an accidental list.
    source = entry.get('enrichment_source')
    if isinstance(source, list):
        entry['enrichment_source'] = source[0] if source else None
    return entry
def fix_institution(inst):
    """Fix a single institution record in place.

    Applies every known repair (provenance notes, misplaced
    enrichment_history, digital-platform and collection cleanups,
    change_history dates) and returns the list of human-readable change
    descriptions, one per applied fix.
    """
    changes = []
    _fix_provenance_fields(inst, changes)
    _fix_digital_platforms(inst, changes)
    _fix_collections(inst, changes)
    _fix_change_history(inst, changes)
    return changes


def _fix_provenance_fields(inst, changes):
    """Relocate misplaced provenance data and normalize enrichment_history."""
    # Move 'notes' from provenance into the top-level description.
    if 'provenance' in inst and 'notes' in inst['provenance']:
        notes = inst['provenance'].pop('notes')
        if 'description' not in inst or not inst['description']:
            inst['description'] = notes
        else:
            inst['description'] += f"\n\n{notes}"
        changes.append("Moved provenance.notes to description")
    # enrichment_history belongs under provenance, not at the record root.
    if 'enrichment_history' in inst:
        if 'provenance' not in inst:
            inst['provenance'] = {}
        inst['provenance']['enrichment_history'] = inst.pop('enrichment_history')
        changes.append("Moved enrichment_history to provenance")
    # Backfill required fields on each history entry.
    if 'provenance' in inst and 'enrichment_history' in inst['provenance']:
        for entry in inst['provenance']['enrichment_history']:
            fix_enrichment_history(entry)
        changes.append("Fixed enrichment_history entries")


def _fix_digital_platforms(inst, changes):
    """Strip disallowed fields and repair standards on digital platforms."""
    # `or []` also tolerates an explicit YAML null for the key.
    for platform in inst.get('digital_platforms') or []:
        # Remove 'description' field (not allowed by the schema).
        if 'description' in platform:
            platform.pop('description')
            changes.append("Removed description from digital_platform")
        # Remove 'notes' field (not allowed by the schema).
        if 'notes' in platform:
            platform.pop('notes')
            changes.append("Removed notes from digital_platform")
        # Map or drop invalid metadata standards.
        if 'implemented_standards' in platform:
            fixed_standards = fix_metadata_standards(platform['implemented_standards'])
            if fixed_standards != platform['implemented_standards']:
                platform['implemented_standards'] = fixed_standards
                changes.append("Fixed implemented_standards")


def _fix_collections(inst, changes):
    """Remove disallowed fields and repair temporal coverage on collections."""
    for collection in inst.get('collections') or []:
        # Remove 'description' field (not allowed by the schema).
        if 'description' in collection:
            collection.pop('description')
            changes.append("Removed description from collection")
        # Normalize "present"/empty temporal coverage values.
        if 'temporal_coverage' in collection:
            fixed = fix_temporal_coverage(collection['temporal_coverage'])
            if fixed != collection['temporal_coverage']:
                collection['temporal_coverage'] = fixed
                changes.append("Fixed temporal_coverage")


def _fix_change_history(inst, changes):
    """Backfill the required event_date on change_history events."""
    for event in inst.get('change_history') or []:
        if 'event_date' not in event:
            # No reliable way to infer the real date here; use a sentinel.
            event['event_date'] = '1900-01-01'  # Placeholder
            changes.append("Added missing event_date")
def process_file(file_path):
    """Process a single YAML file and return the number of fixes applied."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        if not data:
            return 0
        total_changes = 0
        # The files come in two shapes: a bare list of institution records,
        # or a single institution mapping.
        if isinstance(data, list):
            total_changes = sum(len(fix_institution(inst)) for inst in data)
        elif isinstance(data, dict):
            if 'institutions' in data:
                # Wrapper mappings with metadata are out of scope here.
                print(f"⚠️ Skipping {file_path.name} - wrapped structure not a valid HeritageCustodian")
                return 0
            total_changes = len(fix_institution(data))
        if total_changes > 0:
            # Persist the repaired document in a stable, readable layout.
            with open(file_path, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
            print(f"✅ Fixed {total_changes} issues in {file_path.relative_to(file_path.parent.parent.parent)}")
        return total_changes
    except Exception as e:
        # Best-effort batch tool: report the failure and move on.
        print(f"❌ Error processing {file_path.name}: {e}")
        return 0
def main():
    """Process all YAML files in data/instances."""
    repo_root = Path(__file__).parent.parent
    instances_dir = repo_root / 'data' / 'instances'
    # Metadata/backup files that must never be rewritten by this tool.
    skip_files = {
        'DATASET_STATISTICS.yaml',
        'ENRICHMENT_CANDIDATES.yaml',
        'tunisian_institutions_enhanced.yaml',
        'tunisian_institutions_enhanced.backup.yaml',
    }
    total_fixes, fixed_count = 0, 0
    # Walk the instance tree and repair every YAML file we are allowed to touch.
    for yaml_file in instances_dir.rglob('*.yaml'):
        if yaml_file.name in skip_files:
            print(f"⏭️ Skipping metadata file: {yaml_file.name}")
            continue
        changes = process_file(yaml_file)
        if changes:
            total_fixes += changes
            fixed_count += 1
    separator = '=' * 70
    print(f"\n{separator}")
    print(f"✅ Fixed {total_fixes} issues across {fixed_count} files")
    print(separator)


if __name__ == "__main__":
    main()