glam/scripts/fix_validation_errors.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

189 lines
6.3 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Fix validation errors in Norwegian heritage institution YAML files.
Fixes:
1. Temporal coverage format: "YYYY/YYYY" or "YYYY BCE/YYYY" → "YYYY-01-01/YYYY-12-31"
2. Remove access_rights from digital_platforms (move to collections if needed)
3. Remove description field from Location objects
4. Remove description field from Identifier objects
5. Add IDENTIFIER_SERVICE to PlatformTypeEnum (schema change needed)
6. Fix metadata_standards enum format: "Dublin Core" → "DUBLIN_CORE"
Usage:
python3 scripts/fix_validation_errors.py <input.yaml> <output.yaml>
"""
import sys
import re
from pathlib import Path
import ruamel.yaml
# One side of an already-normalized range, e.g. "1850-01-01" or "-8000-01-01".
# The optional leading "-" is what this function itself emits for BCE years,
# so accepting it keeps the conversion idempotent.
_ISO_DATE_RE = re.compile(r'-?\d{4,}-\d{2}-\d{2}')


def _part_to_iso_date(part: str, month_day: str) -> str:
    """
    Convert one side of a coverage range to a "YYYY-MM-DD" style string.

    Args:
        part: Text on one side of the "/" (e.g. "8000 BCE", "800",
            "1500 CE", or an already-complete "1850-01-01").
        month_day: The "MM-DD" suffix to attach to a bare year
            ("01-01" for a range start, "12-31" for a range end).

    Returns:
        The date string; BCE and negative years get a leading "-" and the
        year is zero-padded to at least four digits.

    Raises:
        ValueError: If no integer year can be read from ``part``.
    """
    text = part.strip()

    # A side that is already a full date is passed through untouched.
    if _ISO_DATE_RE.fullmatch(text):
        return text

    # 'BCE' must be tested before 'CE': every 'BCE' string also contains 'CE'.
    negative = 'BCE' in text
    if negative:
        text = text.replace('BCE', '')
    elif 'CE' in text:
        text = text.replace('CE', '')

    year = int(text.strip())  # ValueError here means "not a year"
    if year < 0:
        if negative:
            # "-500 BCE" is ambiguous; refuse rather than emit "--0500".
            raise ValueError(f"negative year combined with BCE: {part!r}")
        negative, year = True, -year

    sign = '-' if negative else ''
    return f"{sign}{year:04d}-{month_day}"


def parse_temporal_coverage(coverage_str: str) -> str:
    """
    Convert simplified temporal coverage to ISO format.

    Each side of the "start/end" range is converted independently: a bare
    year (optionally suffixed with "BCE" or "CE") becomes January 1st for the
    start and December 31st for the end; a side that is already a full date is
    kept as is. Input that cannot be parsed is returned unchanged after
    printing a warning, so one malformed value does not abort a whole run.

    Examples:
        "8000 BCE/2000" → "-8000-01-01/2000-12-31"
        "2000 BCE/1950" → "-2000-01-01/1950-12-31"
        "800/1950" → "0800-01-01/1950-12-31"
        "1850-01-01/2024-12-31" → "1850-01-01/2024-12-31" (unchanged)
        "-8000-01-01/2000-12-31" → "-8000-01-01/2000-12-31" (unchanged)

    Args:
        coverage_str: The raw ``temporal_coverage`` value.

    Returns:
        The normalized "start/end" string, or ``coverage_str`` itself when it
        is not a string, does not contain exactly one "/", or has a side from
        which no year can be read.
    """
    # YAML may deliver a bare year as an int (or a null); leave such values alone.
    if not isinstance(coverage_str, str):
        print(f"Warning: Cannot parse temporal coverage '{coverage_str}'")
        return coverage_str

    parts = coverage_str.split('/')
    if len(parts) != 2:
        print(f"Warning: Cannot parse temporal coverage '{coverage_str}'")
        return coverage_str

    try:
        start_iso = _part_to_iso_date(parts[0], '01-01')
        end_iso = _part_to_iso_date(parts[1], '12-31')
    except ValueError:
        print(f"Warning: Cannot parse temporal coverage '{coverage_str}'")
        return coverage_str

    return f"{start_iso}/{end_iso}"
def fix_metadata_standards_enum(standards_list: list) -> list:
    """
    Rewrite human-readable metadata standard names as enum identifiers.

    Known rewrites:
        "Dublin Core" → "DUBLIN_CORE"
        "MARC 21" → "MARC21"

    Any other entry is copied through untouched. A falsy argument (``None``
    or an empty list) is handed back as is; otherwise a new list is returned
    and the argument is not modified.
    """
    if not standards_list:
        return standards_list

    replacements = (
        ("Dublin Core", "DUBLIN_CORE"),
        ("MARC 21", "MARC21"),
    )

    def to_enum_name(entry):
        # Equality comparison (not hashing) so unhashable entries pass through.
        for label, enum_name in replacements:
            if entry == label:
                return enum_name
        return entry

    return [to_enum_name(entry) for entry in standards_list]
def _mapping_entries(institution: dict, key: str):
    """Yield the mapping entries of the list stored under ``key``.

    Tolerates a missing key, a null value (``key:`` with nothing after it in
    the YAML), a non-list value, and non-mapping list items: all of those are
    skipped instead of raising.
    """
    from collections.abc import MutableMapping

    entries = institution.get(key)
    if not isinstance(entries, list):
        return
    for entry in entries:
        if isinstance(entry, MutableMapping):
            yield entry


def fix_institution(institution: dict) -> list:
    """
    Fix validation errors in a single institution record, in place.

    Applied fixes, in this order (which is also the order of the returned
    messages):
        1. Normalize ``temporal_coverage`` of each collection.
        2. Remove ``access_rights`` from each digital platform.
        3. Remove ``description`` from each location.
        4. Remove ``description`` from each identifier.
        5. Normalize ``metadata_standards`` of each digital platform.

    Args:
        institution: The institution mapping; it is mutated.

    Returns:
        A list of human-readable descriptions of the changes made (empty when
        nothing needed fixing).
    """
    changes_made = []

    # 1. Temporal coverage in collections
    for collection in _mapping_entries(institution, 'collections'):
        if 'temporal_coverage' in collection:
            old_coverage = collection['temporal_coverage']
            new_coverage = parse_temporal_coverage(old_coverage)
            if old_coverage != new_coverage:
                collection['temporal_coverage'] = new_coverage
                changes_made.append(
                    f"Fixed temporal_coverage: {old_coverage} → {new_coverage}"
                )

    # 2. access_rights is not allowed on digital_platforms
    for platform in _mapping_entries(institution, 'digital_platforms'):
        if 'access_rights' in platform:
            access_value = platform.pop('access_rights')
            changes_made.append(
                f"Removed access_rights from digital_platform: {access_value}"
            )

    # 3. description is not allowed on locations
    for location in _mapping_entries(institution, 'locations'):
        if 'description' in location:
            desc = location.pop('description')
            changes_made.append(f"Removed description from location: {desc}")

    # 4. description is not allowed on identifiers
    for identifier in _mapping_entries(institution, 'identifiers'):
        if 'description' in identifier:
            desc = identifier.pop('description')
            changes_made.append(f"Removed description from identifier: {desc}")

    # 5. metadata_standards enum format (a second pass over the platforms so
    #    the reported change order stays: removals first, enum fixes last)
    for platform in _mapping_entries(institution, 'digital_platforms'):
        if 'metadata_standards' in platform:
            old_standards = platform['metadata_standards']
            new_standards = fix_metadata_standards_enum(old_standards)
            if old_standards != new_standards:
                platform['metadata_standards'] = new_standards
                changes_made.append(
                    f"Fixed metadata_standards enum: {old_standards} → {new_standards}"
                )

    return changes_made
def main():
    """
    Command-line entry point: fix one YAML file and write the result.

    Expects exactly two arguments (input path, output path). Loads the input
    with ruamel.yaml, runs ``fix_institution`` on every institution mapping in
    the top-level list, prints the changes per institution, writes the data to
    the output path, and prints a summary with follow-up validation commands.

    Exits with status 1 when the argument count is wrong, the input file does
    not exist, the input holds no YAML data, or its top level is not a list.
    """
    if len(sys.argv) != 3:
        print(__doc__)
        sys.exit(1)

    input_path = Path(sys.argv[1])
    output_path = Path(sys.argv[2])

    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}")
        sys.exit(1)

    # Load YAML with ruamel.yaml to preserve formatting
    yaml = ruamel.yaml.YAML()
    yaml.preserve_quotes = True
    yaml.default_flow_style = False
    yaml.width = 4096  # very wide, presumably so long scalars are not re-wrapped on dump

    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.load(f)

    # An empty document loads as None; a non-list root cannot be iterated as
    # institution records. Report both clearly instead of failing with a traceback.
    if data is None:
        print(f"Error: Input file contains no YAML data: {input_path}")
        sys.exit(1)
    if not isinstance(data, list):
        print(f"Error: Expected a top-level list of institutions in: {input_path}")
        sys.exit(1)

    # Process each institution
    total_changes = 0
    institutions_modified = 0

    for index, institution in enumerate(data):
        if not isinstance(institution, dict):
            # Leave stray scalars/nulls in place; they are written back untouched.
            print(f"Warning: Skipping entry {index}: not a mapping")
            continue

        institution_name = institution.get('name', 'Unknown')
        changes = fix_institution(institution)

        if changes:
            institutions_modified += 1
            total_changes += len(changes)
            print(f"\n✏️ {institution_name}:")
            for change in changes:
                print(f" - {change}")

    # Write output
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f)

    print(f"\n{'='*70}")
    print(f"✅ Fixed {total_changes} validation errors in {institutions_modified} institutions")
    print(f"✅ Output written to: {output_path}")
    print("\nValidate with:")
    print(f" python3 scripts/normalize_field_names.py {output_path} /tmp/test.yaml")
    print(" linkml-validate -s schemas/heritage_custodian.yaml -C HeritageCustodian /tmp/test.yaml")
# Run the fixer only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()