- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering the extraction patterns (ISIL, Wikidata, VIAF, city names) and verifying correct classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.

186 lines · 6.2 KiB · Python · Executable file
#!/usr/bin/env python3
"""
Fix alias field names in YAML instance files to use canonical schema names.

LinkML aliases don't work in data validation - they're only for documentation.
This script renames fields to use canonical names defined in the schema.

Mappings:
- description (in DigitalPlatform) → platform_description
- description (in Collection) → collection_description
- description (in Location) → location_description (needs to be added to schema)
- description (in Identifier) → identifier_description (needs to be added to schema)
- metadata_standards (in DigitalPlatform) → implemented_standards
- notes (in Provenance) → provenance_notes
- subject_areas (in Collection) → subjects
"""

import sys
from pathlib import Path
from typing import Any, Dict, List

import yaml

def fix_digital_platform(platform: Dict[str, Any]) -> Dict[str, Any]:
    """Rename DigitalPlatform alias fields to their canonical schema names.

    Mutates *platform* in place and returns it:
      - ``description``        -> ``platform_description``
      - ``metadata_standards`` -> ``implemented_standards``
    """
    renames = (
        ('description', 'platform_description'),
        ('metadata_standards', 'implemented_standards'),
    )
    for alias, canonical in renames:
        if alias in platform:
            platform[canonical] = platform.pop(alias)
    return platform

def fix_collection(collection: Dict[str, Any]) -> Dict[str, Any]:
    """Rename Collection alias fields to their canonical schema names.

    Mutates *collection* in place and returns it:
      - ``description``   -> ``collection_description``
      - ``subject_areas`` -> ``subjects``
    """
    for alias, canonical in (('description', 'collection_description'),
                             ('subject_areas', 'subjects')):
        if alias in collection:
            collection[canonical] = collection.pop(alias)
    return collection

def fix_location(location: Dict[str, Any]) -> Dict[str, Any]:
    """Drop the unsupported Location ``description`` field.

    The schema has no ``location_description`` slot yet, so the value is
    removed in place and a truncated warning is printed.
    """
    try:
        desc = location.pop('description')
    except KeyError:
        return location
    # TODO: Add location_description to schema
    print(f" ⚠️ Removed location description (not in schema yet): {desc[:50]}...")
    return location

def fix_identifier(identifier: Dict[str, Any]) -> Dict[str, Any]:
    """Drop the unsupported Identifier ``description`` field.

    The schema has no ``identifier_description`` slot yet, so the value is
    removed in place and a truncated warning is printed.
    """
    try:
        desc = identifier.pop('description')
    except KeyError:
        return identifier
    # TODO: Add identifier_description to schema
    print(f" ⚠️ Removed identifier description (not in schema yet): {desc[:50]}...")
    return identifier

def fix_provenance(provenance: Dict[str, Any]) -> Dict[str, Any]:
    """Rename the Provenance ``notes`` alias to ``provenance_notes``.

    Mutates *provenance* in place and returns it.
    """
    try:
        provenance['provenance_notes'] = provenance.pop('notes')
    except KeyError:
        pass
    return provenance

def fix_institution(institution: Dict[str, Any]) -> Dict[str, Any]:
    """Apply every alias-field fix to a single institution record.

    Mutates *institution* in place and returns it, delegating to the
    per-class fixers for each nested structure that is present.
    """
    # List-valued fields: each entry is fixed individually, in order.
    list_fixers = (
        ('digital_platforms', fix_digital_platform),
        ('collections', fix_collection),
        ('locations', fix_location),
        ('identifiers', fix_identifier),
    )
    for field, fixer in list_fixers:
        if field in institution:
            institution[field] = [fixer(item) for item in institution[field]]

    # Provenance is a single mapping, not a list.
    if 'provenance' in institution:
        institution['provenance'] = fix_provenance(institution['provenance'])

    return institution

def fix_yaml_file(file_path: Path) -> bool:
    """
    Fix alias field names in a YAML file.

    Loads the file, applies fix_institution() to every record (the file may
    hold a single institution mapping or a list of them), and rewrites the
    file only when the rename pass actually changed the data.

    Returns True if changes were made, False otherwise.
    """
    # Prefer a short, repo-relative path in the log line. file_path.parents[2]
    # raises IndexError for paths fewer than three levels deep, so fall back
    # to the full path instead of crashing before any work is done.
    try:
        display_path = file_path.relative_to(file_path.parents[2])
    except IndexError:
        display_path = file_path
    print(f"\n📄 Processing: {display_path}")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        if not data:
            print(" ⚠️ Empty file, skipping")
            return False

        # Snapshot the data as YAML *before* fixing: fix_institution mutates
        # the loaded dicts in place, so the dump must happen first.
        original_data = yaml.dump(data, default_flow_style=False, allow_unicode=True)

        # Handle both a single institution and a list of institutions.
        if isinstance(data, list):
            fixed_data = [fix_institution(inst) for inst in data]
        else:
            fixed_data = fix_institution(data)

        # Re-dump and compare to detect whether any field was renamed.
        new_data_yaml = yaml.dump(fixed_data, default_flow_style=False, allow_unicode=True)
        if original_data == new_data_yaml:
            print(" ✅ No changes needed")
            return False

        # Write back to file, preserving the original key order.
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(fixed_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        print(" ✅ Fixed and saved")
        return True

    except Exception as e:
        # Best-effort batch tool: report the failure and let the caller
        # continue with the remaining files instead of aborting the run.
        print(f" ❌ Error: {e}")
        return False

def main():
    """Process all YAML instance files."""

    # Find all YAML files in data/instances/
    # Assumes this script lives one directory below the repo root, so
    # base_dir resolves to the repo root — TODO confirm against the layout.
    base_dir = Path(__file__).parent.parent
    instances_dir = base_dir / 'data' / 'instances'

    if not instances_dir.exists():
        print(f"❌ Instances directory not found: {instances_dir}")
        sys.exit(1)

    # Collect both .yaml and .yml files, recursively.
    yaml_files = list(instances_dir.rglob('*.yaml'))
    yaml_files.extend(instances_dir.rglob('*.yml'))

    if not yaml_files:
        print("⚠️ No YAML files found")
        sys.exit(0)

    print(f"🔍 Found {len(yaml_files)} YAML files")
    print("=" * 80)

    changed_files = []

    # Process in sorted order for a stable, reproducible log; fix_yaml_file
    # returns True only when it actually rewrote the file.
    for yaml_file in sorted(yaml_files):
        if fix_yaml_file(yaml_file):
            changed_files.append(yaml_file)

    print("\n" + "=" * 80)
    print(f"✅ Processed {len(yaml_files)} files")
    print(f"📝 Modified {len(changed_files)} files")

    if changed_files:
        print("\nChanged files:")
        for f in changed_files:
            print(f" - {f.relative_to(base_dir)}")

        # NOTE(review): the source's indentation was mangled; this warning is
        # placed inside the `if changed_files:` branch — confirm original intent.
        print("\n⚠️ NOTE: Location and Identifier descriptions were removed.")
        print(" These fields need to be added to the schema before they can be used.")

# Script entry point: run the fixer over every YAML instance file.
if __name__ == '__main__':
    main()