glam/scripts/fix_validation_errors_by_country.py
2025-11-19 23:25:22 +01:00

271 lines
8.7 KiB
Python

#!/usr/bin/env python3
"""
Fix remaining validation errors in YAML files, processing by country directory.
This optimized version:
- Processes files by country directory
- Shows progress indicators
- Skips large consolidated files and metadata files
- Provides detailed reporting per country
"""
import yaml
import re
from pathlib import Path
from datetime import datetime
# Metadata standards mapping: alias -> canonical name. A value of None marks
# entries that are platforms/protocols rather than metadata standards and
# should be dropped entirely.
STANDARDS_MAPPING = {
    'Dublin Core': 'DUBLIN_CORE',
    'dublin core': 'DUBLIN_CORE',
    'DUBLIN-CORE': 'DUBLIN_CORE',
    'DSpace': None,  # Platform, not a standard
    'OAI-PMH': None,  # Protocol, not a standard
    'Z39.50': None,  # Protocol, not a standard
    'UNIMARC': 'MARC21',
    'ISAD(G)': 'EAD',
}

# Individual files to skip (statistics, backups, large consolidated dumps)
SKIP_FILES = {
    'DATASET_STATISTICS.yaml',
    'ENRICHMENT_CANDIDATES.yaml',
    'tunisian_institutions_enhanced.yaml',
    'tunisian_institutions_enhanced.backup.yaml',
    'globalglam-20251111.yaml',  # Large consolidated file
}

# Directories to skip entirely
SKIP_DIRS = {
    'all',  # Contains large consolidated files
    'archive',
    'backups',
    'cache',
    'exports',
    'test_outputs',
}


def fix_metadata_standards(standards_list):
    """Map aliased metadata standards to canonical names and drop non-standards.

    Args:
        standards_list: List of standard names from the YAML, or None/empty.

    Returns:
        The input unchanged if it is empty/None; otherwise a deduplicated list
        (first-seen order preserved) with aliases mapped via STANDARDS_MAPPING
        and platform/protocol entries removed, or None if nothing survives.
    """
    if not standards_list:
        return standards_list
    fixed = []
    for std in standards_list:
        if std in STANDARDS_MAPPING:
            mapped = STANDARDS_MAPPING[std]
            # A None mapping means "not a metadata standard" -> drop it.
            if mapped is not None:
                fixed.append(mapped)
        else:
            fixed.append(std)
    # Deduplicate while preserving order. The previous list(set(...)) produced
    # a nondeterministic ordering, which made the caller's inequality check
    # flaky and caused spurious file rewrites.
    return list(dict.fromkeys(fixed)) if fixed else None
def fix_temporal_coverage(coverage_str):
    """Normalize a collection's temporal_coverage string.

    Blank or empty values become None. An open-ended '/present' suffix
    (any case) is pinned to December 31 of the current year so the value
    matches the expected 'start/end' date-range format. Anything else is
    returned untouched.
    """
    if not coverage_str or not coverage_str.strip():
        return None
    if 'present' not in coverage_str.lower():
        return coverage_str
    end_date = f'/{datetime.now().year}-12-31'
    return re.sub(r'/present', end_date, coverage_str, flags=re.IGNORECASE)
def fix_enrichment_history(entry):
    """Backfill required fields on one enrichment_history entry (mutates in place).

    Ensures 'enrichment_type' and 'verified' exist, and collapses a list-valued
    'enrichment_source' to its first element (None when the list is empty),
    since the schema expects a scalar. Returns the same entry for convenience.
    """
    entry.setdefault('enrichment_type', 'MANUAL')
    entry.setdefault('verified', False)
    source = entry.get('enrichment_source')
    if isinstance(source, list):
        entry['enrichment_source'] = source[0] if source else None
    return entry
def fix_institution(inst):
    """Normalize a single institution record in place.

    Applies a fixed sequence of schema repairs and returns a list of tags
    naming each fix applied (tags may repeat, one per occurrence).
    """
    applied = []

    # 1) Free-text 'notes' under provenance belong in the top-level description.
    if 'provenance' in inst and 'notes' in inst['provenance']:
        notes = inst['provenance'].pop('notes')
        if 'description' not in inst or not inst['description']:
            inst['description'] = notes
        else:
            inst['description'] = inst['description'] + f"\n\n{notes}"
        applied.append("moved_provenance_notes")

    # 2) enrichment_history lives under provenance, not at the record root.
    if 'enrichment_history' in inst:
        inst.setdefault('provenance', {})
        inst['provenance']['enrichment_history'] = inst.pop('enrichment_history')
        applied.append("moved_enrichment_history")

    # 3) Backfill required fields on every enrichment_history entry.
    if 'provenance' in inst and 'enrichment_history' in inst['provenance']:
        for item in inst['provenance']['enrichment_history']:
            fix_enrichment_history(item)
            applied.append("fixed_enrichment_entries")

    # 4) Digital platforms: strip disallowed keys, canonicalize standards.
    if 'digital_platforms' in inst:
        for platform in inst['digital_platforms']:
            for banned, tag in (('description', "removed_platform_description"),
                                ('notes', "removed_platform_notes")):
                if banned in platform:
                    del platform[banned]
                    applied.append(tag)
            if 'implemented_standards' in platform:
                replacement = fix_metadata_standards(platform['implemented_standards'])
                if replacement != platform['implemented_standards']:
                    platform['implemented_standards'] = replacement
                    applied.append("fixed_metadata_standards")

    # 5) Collections: strip descriptions, repair temporal coverage ranges.
    if 'collections' in inst:
        for collection in inst['collections']:
            if 'description' in collection:
                del collection['description']
                applied.append("removed_collection_description")
            if 'temporal_coverage' in collection:
                repaired = fix_temporal_coverage(collection['temporal_coverage'])
                if repaired != collection['temporal_coverage']:
                    collection['temporal_coverage'] = repaired
                    applied.append("fixed_temporal_coverage")

    # 6) change_history events require an event_date; use a sentinel date.
    if 'change_history' in inst:
        for event in inst['change_history']:
            if 'event_date' not in event:
                event['event_date'] = '1900-01-01'
                applied.append("added_event_date")

    return applied
def process_file(file_path):
    """Load one YAML file, apply institution fixes, and rewrite it if changed.

    Returns (change_count, change_tags). On any failure the file is left
    untouched and the tags contain a single 'error: ...' entry. Wrapper
    documents (dicts with 'institutions' or 'metadata' keys) are skipped.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            data = yaml.safe_load(handle)
        if not data:
            return 0, []
        collected = []
        if isinstance(data, list):
            # A bare list is a sequence of institution records.
            for record in data:
                collected.extend(fix_institution(record))
        elif isinstance(data, dict):
            if 'institutions' in data or 'metadata' in data:
                return 0, ['skipped_metadata_structure']
            collected.extend(fix_institution(data))
        if collected:
            # Only rewrite when something actually changed.
            with open(file_path, 'w', encoding='utf-8') as handle:
                yaml.dump(data, handle, default_flow_style=False,
                          allow_unicode=True, sort_keys=False)
        return len(collected), collected
    except Exception as exc:
        return 0, [f'error: {str(exc)}']
def process_country_directory(country_dir):
    """Run fixes over every *.yaml file in one country directory.

    Prints per-file progress and returns a tuple of
    (total_changes, files_modified, skipped_file_names).
    """
    yaml_files = list(country_dir.glob('*.yaml'))
    if not yaml_files:
        return 0, 0, []

    bar = '=' * 70
    print(f"\n{bar}")
    print(f"📁 Processing: {country_dir.name}")
    print(bar)

    total_changes = 0
    files_modified = 0
    skipped_files = []
    file_count = len(yaml_files)

    for index, yaml_file in enumerate(yaml_files, 1):
        # Explicit skip list (backups, consolidated dumps, etc.).
        if yaml_file.name in SKIP_FILES:
            print(f"  ⏭️ [{index}/{file_count}] Skipping: {yaml_file.name}")
            skipped_files.append(yaml_file.name)
            continue

        changes_count, changes = process_file(yaml_file)
        if 'skipped_metadata_structure' in changes:
            print(f"  ⏭️ [{index}/{file_count}] Skipping metadata: {yaml_file.name}")
            skipped_files.append(yaml_file.name)
        elif any(tag.startswith('error:') for tag in changes):
            print(f"  ❌ [{index}/{file_count}] Error: {yaml_file.name}")
        elif changes_count:
            print(f"  ✅ [{index}/{file_count}] Fixed {changes_count} issues: {yaml_file.name}")
            total_changes += changes_count
            files_modified += 1
        else:
            print(f"  ✓ [{index}/{file_count}] No changes: {yaml_file.name}")

    return total_changes, files_modified, skipped_files
def main():
    """Walk data/instances by country directory and fix validation errors."""
    repo_root = Path(__file__).parent.parent
    instances_dir = repo_root / 'data' / 'instances'

    # Every subdirectory is a country, minus the explicit skip list.
    country_dirs = sorted(
        child for child in instances_dir.iterdir()
        if child.is_dir() and child.name not in SKIP_DIRS
    )

    print(f"🎯 Processing {len(country_dirs)} country directories")
    print(f"⏭️ Skipping directories: {', '.join(sorted(SKIP_DIRS))}")

    grand_total_changes = 0
    grand_total_files = 0
    all_skipped = []

    for country_dir in country_dirs:
        changes, files, skipped = process_country_directory(country_dir)
        grand_total_changes += changes
        grand_total_files += files
        all_skipped.extend(skipped)
        if changes:
            print(f"  📊 Summary: Fixed {changes} issues in {files} files")

    bar = '=' * 70
    print(f"\n{bar}")
    print("🎉 GRAND TOTAL")
    print(bar)
    print(f"  ✅ Fixed {grand_total_changes} issues across {grand_total_files} files")
    print(f"  ⏭️ Skipped {len(all_skipped)} files")
    print(bar)


if __name__ == "__main__":
    main()