#!/usr/bin/env python3
|
|
"""
|
|
Fix remaining validation errors in YAML files, processing by country directory.
|
|
|
|
This optimized version:
|
|
- Processes files by country directory
|
|
- Shows progress indicators
|
|
- Skips large consolidated files and metadata files
|
|
- Provides detailed reporting per country
|
|
"""
|
|
|
|
import yaml
|
|
import re
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
|
|
# Metadata standards mapping: keys are names observed in the data, values
# are the canonical standard to substitute — or None when the entry is a
# platform/protocol rather than a metadata standard and should be dropped.
STANDARDS_MAPPING = {
    'Dublin Core': 'DUBLIN_CORE',
    'dublin core': 'DUBLIN_CORE',
    'DUBLIN-CORE': 'DUBLIN_CORE',
    'DSpace': None,  # Platform, not a standard
    'OAI-PMH': None,  # Protocol, not a standard
    'Z39.50': None,  # Protocol, not a standard
    # NOTE(review): UNIMARC is a distinct MARC flavor and ISAD(G) is a
    # description (not encoding) standard — these substitutions look like
    # approximations; confirm they are intended.
    'UNIMARC': 'MARC21',
    'ISAD(G)': 'EAD',
}

# Individual files to skip: metadata/statistics, backups, and consolidated
# exports that do not follow the per-institution schema this script fixes.
SKIP_FILES = {
    'DATASET_STATISTICS.yaml',
    'ENRICHMENT_CANDIDATES.yaml',
    'tunisian_institutions_enhanced.yaml',
    'tunisian_institutions_enhanced.backup.yaml',
    'globalglam-20251111.yaml',  # Large consolidated file
}

# Directories under data/instances to skip entirely.
SKIP_DIRS = {
    'all',  # Contains large consolidated files
    'archive',
    'backups',
    'cache',
    'exports',
    'test_outputs',
}
|
|
|
|
|
|
def fix_metadata_standards(standards_list):
    """Map or remove invalid metadata standards.

    Each entry is looked up in STANDARDS_MAPPING: mapped names are
    substituted, None-mapped names (platforms/protocols) are dropped, and
    unknown names pass through unchanged.

    Args:
        standards_list: List of standard names (may be None or empty).

    Returns:
        A de-duplicated list (first-seen order preserved), the original
        falsy value unchanged, or None when every entry was dropped.
    """
    if not standards_list:
        return standards_list

    fixed = []
    for std in standards_list:
        if std in STANDARDS_MAPPING:
            mapped = STANDARDS_MAPPING[std]
            if mapped:
                fixed.append(mapped)
        else:
            fixed.append(std)

    # dict.fromkeys de-duplicates while keeping first-seen order.  The
    # previous list(set(...)) ordering varied across runs (hash
    # randomization), which made callers see a spurious "changed" value
    # and rewrite files that had no real fixes.
    return list(dict.fromkeys(fixed)) if fixed else None
|
|
|
|
|
|
def fix_temporal_coverage(coverage_str):
    """Fix temporal coverage patterns like '1983-01-01/present' or empty strings.

    Empty/whitespace-only values become None.  Any case-insensitive
    occurrence of the word 'present' is replaced with December 31 of the
    current year, e.g. '1983-01-01/present' -> '1983-01-01/<year>-12-31'.

    Args:
        coverage_str: Raw coverage value (string or falsy).

    Returns:
        The normalized coverage string, or None for empty input.
    """
    if not coverage_str or not coverage_str.strip():
        return None

    if 'present' in coverage_str.lower():
        end_of_year = f'{datetime.now().year}-12-31'
        # Replace the bare word rather than only '/present': the old
        # pattern detected values like '1983 to present' (via the 'in'
        # check above) but then left them unmodified.
        coverage_str = re.sub(
            r'present',
            end_of_year,
            coverage_str,
            flags=re.IGNORECASE
        )

    return coverage_str
|
|
|
|
|
|
def fix_enrichment_history(entry):
    """Ensure an enrichment_history entry carries all required fields.

    Defaults 'enrichment_type' to 'MANUAL' and 'verified' to False when
    missing, and collapses a list-valued 'enrichment_source' to its first
    element (None when the list is empty).  Mutates the entry in place
    and also returns it for convenience.
    """
    entry.setdefault('enrichment_type', 'MANUAL')
    entry.setdefault('verified', False)

    source = entry.get('enrichment_source')
    if isinstance(source, list):
        entry['enrichment_source'] = source[0] if source else None

    return entry
|
|
|
|
|
|
def fix_institution(inst):
    """Apply every known schema repair to a single institution record.

    The record is mutated in place.  Each repair appends a tag string to
    the returned list (tags can repeat, e.g. once per platform or
    collection entry touched).

    Args:
        inst: Institution mapping loaded from YAML.

    Returns:
        List of change-tag strings; empty when nothing was modified.
    """
    changes = []

    # Free-text 'notes' under provenance belongs in 'description'.
    if 'provenance' in inst and 'notes' in inst['provenance']:
        notes = inst['provenance'].pop('notes')
        if inst.get('description'):
            inst['description'] += f"\n\n{notes}"
        else:
            inst['description'] = notes
        changes.append("moved_provenance_notes")

    # 'enrichment_history' lives under provenance, not at the root.
    if 'enrichment_history' in inst:
        inst.setdefault('provenance', {})
        inst['provenance']['enrichment_history'] = inst.pop('enrichment_history')
        changes.append("moved_enrichment_history")

    # Backfill required fields on every enrichment-history entry.
    if 'provenance' in inst and 'enrichment_history' in inst['provenance']:
        for record in inst['provenance']['enrichment_history']:
            fix_enrichment_history(record)
            changes.append("fixed_enrichment_entries")

    # Digital platforms: strip disallowed keys, normalize standards.
    for platform in inst.get('digital_platforms', []):
        if 'description' in platform:
            platform.pop('description')
            changes.append("removed_platform_description")

        if 'notes' in platform:
            platform.pop('notes')
            changes.append("removed_platform_notes")

        if 'implemented_standards' in platform:
            repaired = fix_metadata_standards(platform['implemented_standards'])
            if repaired != platform['implemented_standards']:
                platform['implemented_standards'] = repaired
                changes.append("fixed_metadata_standards")

    # Collections: strip descriptions, normalize temporal coverage.
    for collection in inst.get('collections', []):
        if 'description' in collection:
            collection.pop('description')
            changes.append("removed_collection_description")

        if 'temporal_coverage' in collection:
            repaired = fix_temporal_coverage(collection['temporal_coverage'])
            if repaired != collection['temporal_coverage']:
                collection['temporal_coverage'] = repaired
                changes.append("fixed_temporal_coverage")

    # Change history: every event needs an event_date; use a sentinel
    # date when the real one is unknown.
    for event in inst.get('change_history', []):
        if 'event_date' not in event:
            event['event_date'] = '1900-01-01'
            changes.append("added_event_date")

    return changes
|
|
|
|
|
|
def process_file(file_path):
    """Load one YAML file, repair its institution records, and rewrite it.

    The file is written back only when at least one change was made.

    Args:
        file_path: Path to the YAML file.

    Returns:
        Tuple of (change count, list of change-tag strings).  The tag
        list is ['skipped_metadata_structure'] for wrapper/metadata
        documents and ['error: ...'] when loading or fixing failed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            data = yaml.safe_load(handle)

        if not data:
            return 0, []

        all_changes = []

        if isinstance(data, list):
            # A plain list of institution records.
            for record in data:
                all_changes.extend(fix_institution(record))
        elif isinstance(data, dict):
            if 'institutions' in data or 'metadata' in data:
                # Wrapper/metadata documents use a different schema.
                return 0, ['skipped_metadata_structure']
            all_changes.extend(fix_institution(data))

        if all_changes:
            with open(file_path, 'w', encoding='utf-8') as handle:
                yaml.dump(data, handle, default_flow_style=False,
                          allow_unicode=True, sort_keys=False)

        return len(all_changes), all_changes

    except Exception as exc:
        # Best-effort batch tool: surface the failure as a tag so the
        # caller can report it without aborting the whole run.
        return 0, [f'error: {exc}']
|
|
|
|
|
|
def process_country_directory(country_dir):
    """Repair every YAML file directly inside one country directory.

    Prints per-file progress as it goes (non-recursive scan).

    Args:
        country_dir: Path of the country directory.

    Returns:
        Tuple of (total changes, files modified, skipped file names).
    """
    yaml_files = list(country_dir.glob('*.yaml'))
    if not yaml_files:
        return 0, 0, []

    banner = '=' * 70
    print(f"\n{banner}")
    print(f"📁 Processing: {country_dir.name}")
    print(banner)

    total_changes = 0
    files_modified = 0
    skipped_files = []
    file_count = len(yaml_files)

    for index, yaml_file in enumerate(yaml_files, 1):
        name = yaml_file.name

        if name in SKIP_FILES:
            print(f" ⏭️ [{index}/{file_count}] Skipping: {name}")
            skipped_files.append(name)
            continue

        changes_count, changes = process_file(yaml_file)

        if 'skipped_metadata_structure' in changes:
            print(f" ⏭️ [{index}/{file_count}] Skipping metadata: {name}")
            skipped_files.append(name)
        elif any(tag.startswith('error:') for tag in changes):
            print(f" ❌ [{index}/{file_count}] Error: {name}")
        elif changes_count > 0:
            print(f" ✅ [{index}/{file_count}] Fixed {changes_count} issues: {name}")
            total_changes += changes_count
            files_modified += 1
        else:
            print(f" ✓ [{index}/{file_count}] No changes: {name}")

    return total_changes, files_modified, skipped_files
|
|
|
|
|
|
def main():
    """Walk data/instances and repair YAML files country by country."""
    repo_root = Path(__file__).parent.parent
    instances_dir = repo_root / 'data' / 'instances'

    # Every subdirectory is treated as a country unless excluded.
    country_dirs = sorted(
        entry for entry in instances_dir.iterdir()
        if entry.is_dir() and entry.name not in SKIP_DIRS
    )

    print(f"🎯 Processing {len(country_dirs)} country directories")
    print(f"⏭️ Skipping directories: {', '.join(sorted(SKIP_DIRS))}")

    grand_total_changes = 0
    grand_total_files = 0
    all_skipped = []

    for country_dir in country_dirs:
        changes, files, skipped = process_country_directory(country_dir)
        grand_total_changes += changes
        grand_total_files += files
        all_skipped.extend(skipped)

        if changes > 0:
            print(f" 📊 Summary: Fixed {changes} issues in {files} files")

    banner = '=' * 70
    print(f"\n{banner}")
    print("🎉 GRAND TOTAL")
    print(banner)
    print(f" ✅ Fixed {grand_total_changes} issues across {grand_total_files} files")
    print(f" ⏭️ Skipped {len(all_skipped)} files")
    print(banner)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|