glam/scripts/fix_alias_fields_v2.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

187 lines
5.5 KiB
Python

#!/usr/bin/env python3
"""
Fix alias field names in YAML instance files to use canonical schema names.
LinkML aliases don't work in data validation - they're only for documentation.
This script renames fields to use canonical names defined in the schema.
Mappings:
- description (in DigitalPlatform) → platform_description
- description (in Collection) → collection_description
- description (in Location) → removed for now (location_description is not in the schema yet)
- description (in Identifier) → removed for now (identifier_description is not in the schema yet)
- metadata_standards (in DigitalPlatform) → implemented_standards
- notes (in Provenance) → provenance_notes
- subject_areas (in Collection) → subjects
"""
import yaml
import sys
from pathlib import Path
def fix_digital_platform(platform):
    """Rename alias keys of a DigitalPlatform mapping to canonical names.

    The mapping is modified in place: ``description`` becomes
    ``platform_description`` and ``metadata_standards`` becomes
    ``implemented_standards``.

    Returns:
        True if at least one key was renamed, otherwise False.
    """
    renames = (
        ('description', 'platform_description'),
        ('metadata_standards', 'implemented_standards'),
    )
    touched = False
    for alias, canonical in renames:
        if alias in platform:
            # pop + assign moves the value under the canonical key.
            platform[canonical] = platform.pop(alias)
            touched = True
    return touched
def fix_collection(collection):
    """Rename alias keys of a Collection mapping to canonical names.

    The mapping is modified in place: ``description`` becomes
    ``collection_description`` and ``subject_areas`` becomes ``subjects``.

    Returns:
        True if at least one key was renamed, otherwise False.
    """
    canonical_names = {
        'description': 'collection_description',
        'subject_areas': 'subjects',
    }
    # Collect the aliases that are actually present, then move each value.
    present = [alias for alias in canonical_names if alias in collection]
    for alias in present:
        collection[canonical_names[alias]] = collection.pop(alias)
    return bool(present)
def fix_location(location):
    """Drop the ``description`` key from a Location mapping.

    The key is removed rather than renamed because there is no matching
    field in the schema yet. The mapping is modified in place.

    Returns:
        True if the key was present and removed, otherwise False.
    """
    if 'description' not in location:
        return False
    location.pop('description')
    return True
def fix_identifier(identifier):
    """Drop the ``description`` key from an Identifier mapping.

    The key is removed rather than renamed because there is no matching
    field in the schema yet. The mapping is modified in place.

    Returns:
        True if the key was present and removed, otherwise False.
    """
    has_description = 'description' in identifier
    if has_description:
        identifier.pop('description')
    return has_description
def fix_provenance(provenance):
    """Rename ``notes`` to ``provenance_notes`` on a Provenance mapping.

    The mapping is modified in place.

    Returns:
        True if the key was renamed, otherwise False.
    """
    if 'notes' not in provenance:
        return False
    provenance['provenance_notes'] = provenance.pop('notes')
    return True
def _dict_entries(value):
    """Return the dict items of *value* if it is a list, else an empty list."""
    if not isinstance(value, list):
        return []
    return [entry for entry in value if isinstance(entry, dict)]


def fix_institution(institution):
    """Fix field names in a single institution record.

    Applies the per-section fixers to the ``digital_platforms``,
    ``collections``, ``locations`` and ``identifiers`` lists and to the
    ``provenance`` mapping, modifying the record in place.

    A section whose value is not the expected shape is skipped instead of
    raising. A key with nothing under it loads from YAML as ``None``, and
    iterating it would otherwise raise ``TypeError`` and stop the rest of
    the record from being fixed. Non-dict records and non-dict list entries
    are skipped for the same reason.

    Args:
        institution: One loaded institution record, normally a dict.

    Returns:
        True if any field in the record was renamed or removed,
        otherwise False.
    """
    # A null or scalar record has no fields to rename.
    if not isinstance(institution, dict):
        return False

    changed = False

    # Fix digital platforms
    for platform in _dict_entries(institution.get('digital_platforms')):
        if fix_digital_platform(platform):
            changed = True

    # Fix collections
    for collection in _dict_entries(institution.get('collections')):
        if fix_collection(collection):
            changed = True

    # Fix locations
    for location in _dict_entries(institution.get('locations')):
        if fix_location(location):
            changed = True

    # Fix identifiers
    for identifier in _dict_entries(institution.get('identifiers')):
        if fix_identifier(identifier):
            changed = True

    # Fix provenance (a single mapping, not a list)
    provenance = institution.get('provenance')
    if isinstance(provenance, dict) and fix_provenance(provenance):
        changed = True

    return changed
def fix_yaml_file(file_path):
    """Load a YAML file, fix alias field names, and write it back if changed.

    The document may be a single institution record or a list of records.
    The file is rewritten only when at least one record was modified.

    Args:
        file_path: Path of the YAML file to process.

    Returns:
        A ``(changed, message)`` tuple. ``message`` is ``"Fixed"``,
        ``"No changes needed"``, ``"Empty file"`` or ``"Error: ..."``.
        Any exception raised while processing is turned into the error
        message rather than propagated.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as src:
            document = yaml.safe_load(src)

        if not document:
            return False, "Empty file"

        # Treat a single record as a one-element list.
        records = document if isinstance(document, list) else [document]

        # Build a list (not a generator) so every record is processed;
        # any() on a generator would stop at the first modified record.
        modified = any([fix_institution(record) for record in records])

        if not modified:
            return False, "No changes needed"

        # Write the updated document back over the original file.
        with open(file_path, 'w', encoding='utf-8') as dst:
            yaml.dump(document, dst, default_flow_style=False, allow_unicode=True, sort_keys=False)
        return True, "Fixed"
    except Exception as exc:
        return False, f"Error: {exc}"
def main():
    """Fix alias fields in every YAML file under ``data/instances``.

    The instances directory is resolved relative to the parent of this
    script's directory. Exits with status 1 if the directory does not
    exist and with status 0 if it contains no ``.yaml``/``.yml`` files.
    Otherwise each file is processed in sorted order, modified and failed
    files are reported, and a summary is printed at the end.
    """
    base_dir = Path(__file__).parent.parent
    instances_dir = base_dir / 'data' / 'instances'

    if not instances_dir.exists():
        print(f"❌ Instances directory not found: {instances_dir}")
        sys.exit(1)

    # Gather both common YAML extensions, searching recursively.
    yaml_files = [*instances_dir.rglob('*.yaml'), *instances_dir.rglob('*.yml')]
    if not yaml_files:
        print("⚠️ No YAML files found")
        sys.exit(0)

    print(f"🔍 Found {len(yaml_files)} YAML files\n")

    n_modified = n_unchanged = n_errors = 0
    for index, path in enumerate(sorted(yaml_files), start=1):
        shown = path.relative_to(base_dir)
        was_fixed, note = fix_yaml_file(path)
        if was_fixed:
            print(f"{index:3d}. ✅ {shown}: {note}")
            n_modified += 1
            continue
        if "Error" in note:
            print(f"{index:3d}. ❌ {shown}: {note}")
            n_errors += 1
            continue
        # Unchanged files are counted but not listed.
        n_unchanged += 1

    print(f"\n{'='*80}")
    print(f"✅ Modified: {n_modified} files")
    print(f"⏭️ Unchanged: {n_unchanged} files")
    print(f"❌ Errors: {n_errors} files")
    print(f"📊 Total: {len(yaml_files)} files")
# Run the fixer only when this file is executed as a script.
if __name__ == "__main__":
    main()