- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
189 lines
6.3 KiB
Python
Executable file
#!/usr/bin/env python3
"""
Fix validation errors in Norwegian heritage institution YAML files.

Fixes:
1. Temporal coverage format: "YYYY/YYYY" or "YYYY BCE/YYYY" → "YYYY-01-01/YYYY-12-31"
2. Remove access_rights from digital_platforms (move to collections if needed)
3. Remove description field from Location objects
4. Remove description field from Identifier objects
5. Add IDENTIFIER_SERVICE to PlatformTypeEnum (schema change needed)
6. Fix metadata_standards enum format: "Dublin Core" → "DUBLIN_CORE"

Usage:
    python3 scripts/fix_validation_errors.py <input.yaml> <output.yaml>
"""
|
|
|
|
import sys
|
|
import re
|
|
from pathlib import Path
|
|
import ruamel.yaml
|
|
|
|
|
|
def parse_temporal_coverage(coverage_str: str) -> str:
|
|
"""
|
|
Convert simplified temporal coverage to ISO format.
|
|
|
|
Examples:
|
|
"8000 BCE/2000" → "-8000-01-01/2000-12-31"
|
|
"2000 BCE/1950" → "-2000-01-01/1950-12-31"
|
|
"800/1950" → "0800-01-01/1950-12-31"
|
|
"1850-01-01/2024-12-31" → "1850-01-01/2024-12-31" (unchanged)
|
|
"""
|
|
# Already in correct format
|
|
if re.match(r'^\d{4}-\d{2}-\d{2}/\d{4}-\d{2}-\d{2}$', coverage_str):
|
|
return coverage_str
|
|
|
|
# Parse start and end dates
|
|
parts = coverage_str.split('/')
|
|
if len(parts) != 2:
|
|
print(f"Warning: Cannot parse temporal coverage '{coverage_str}'")
|
|
return coverage_str
|
|
|
|
start, end = parts
|
|
|
|
# Parse start date
|
|
if 'BCE' in start:
|
|
year = start.replace('BCE', '').strip()
|
|
start_iso = f"-{int(year):04d}-01-01"
|
|
elif 'CE' in start:
|
|
year = start.replace('CE', '').strip()
|
|
start_iso = f"{int(year):04d}-01-01"
|
|
else:
|
|
year = start.strip()
|
|
start_iso = f"{int(year):04d}-01-01"
|
|
|
|
# Parse end date
|
|
if 'BCE' in end:
|
|
year = end.replace('BCE', '').strip()
|
|
end_iso = f"-{int(year):04d}-12-31"
|
|
elif 'CE' in end:
|
|
year = end.replace('CE', '').strip()
|
|
end_iso = f"{int(year):04d}-12-31"
|
|
else:
|
|
year = end.strip()
|
|
end_iso = f"{int(year):04d}-12-31"
|
|
|
|
return f"{start_iso}/{end_iso}"
|
|
|
|
|
|
def fix_metadata_standards_enum(standards_list: list) -> list:
    """
    Fix metadata standards enum format.

    "Dublin Core" → "DUBLIN_CORE"
    "MARC 21" → "MARC21"

    Any other value passes through unchanged. Empty/None input is
    returned as-is.
    """
    if not standards_list:
        return standards_list

    return [
        "DUBLIN_CORE" if entry == "Dublin Core"
        else "MARC21" if entry == "MARC 21"
        else entry
        for entry in standards_list
    ]
|
|
|
|
|
|
def fix_institution(institution: dict) -> list:
    """Fix validation errors in a single institution record. Returns list of changes made."""
    changes_made = []

    # Normalize temporal coverage on every collection that declares one.
    for collection in institution.get('collections', []):
        if 'temporal_coverage' not in collection:
            continue
        before = collection['temporal_coverage']
        after = parse_temporal_coverage(before)
        if before != after:
            collection['temporal_coverage'] = after
            changes_made.append(f"Fixed temporal_coverage: {before} → {after}")

    # The schema does not allow access_rights on digital platforms — drop it.
    for platform in institution.get('digital_platforms', []):
        if 'access_rights' in platform:
            removed = platform.pop('access_rights')
            changes_made.append(f"Removed access_rights from digital_platform: {removed}")

    # Location objects may not carry a description field.
    for location in institution.get('locations', []):
        if 'description' in location:
            removed = location.pop('description')
            changes_made.append(f"Removed description from location: {removed}")

    # Identifier objects may not carry a description field either.
    for identifier in institution.get('identifiers', []):
        if 'description' in identifier:
            removed = identifier.pop('description')
            changes_made.append(f"Removed description from identifier: {removed}")

    # Map human-readable metadata standard names onto schema enum values.
    # (Separate pass over digital_platforms so change entries keep the same
    # order as before: all access_rights removals first, then enum fixes.)
    for platform in institution.get('digital_platforms', []):
        if 'metadata_standards' not in platform:
            continue
        before = platform['metadata_standards']
        after = fix_metadata_standards_enum(before)
        if before != after:
            platform['metadata_standards'] = after
            changes_made.append(f"Fixed metadata_standards enum: {before} → {after}")

    return changes_made
|
|
|
|
|
|
def main():
    """
    CLI entry point: load institutions from <input.yaml>, apply all fixes,
    and write the corrected records to <output.yaml>.

    Exits with status 1 on wrong argument count, a missing input file, or an
    input file that contains no records.
    """
    if len(sys.argv) != 3:
        print(__doc__)
        sys.exit(1)

    input_path = Path(sys.argv[1])
    output_path = Path(sys.argv[2])

    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}")
        sys.exit(1)

    # Load YAML with ruamel.yaml to preserve formatting; the large width
    # keeps long scalar values from being re-wrapped on output.
    yaml = ruamel.yaml.YAML()
    yaml.preserve_quotes = True
    yaml.default_flow_style = False
    yaml.width = 4096

    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.load(f)

    # An empty file loads as None; iterating it below would raise TypeError.
    if not data:
        print(f"Error: No institution records found in {input_path}")
        sys.exit(1)

    # Process each institution, collecting per-record change descriptions.
    total_changes = 0
    institutions_modified = 0

    for institution in data:
        institution_name = institution.get('name', 'Unknown')
        changes = fix_institution(institution)

        if changes:
            institutions_modified += 1
            total_changes += len(changes)
            print(f"\n✏️ {institution_name}:")
            for change in changes:
                print(f" - {change}")

    # Write output with the same YAML settings used for loading.
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f)

    print(f"\n{'='*70}")
    print(f"✅ Fixed {total_changes} validation errors in {institutions_modified} institutions")
    print(f"✅ Output written to: {output_path}")
    print("\nValidate with:")
    print(f" python3 scripts/normalize_field_names.py {output_path} /tmp/test.yaml")
    print(f" linkml-validate -s schemas/heritage_custodian.yaml -C HeritageCustodian /tmp/test.yaml")


if __name__ == '__main__':
    main()
|