glam/scripts/normalize_field_names.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

149 lines
4.8 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Normalize field names from aliases to canonical names for LinkML validation.
This script converts intuitive field names (aliases) used in extracted data
to the canonical field names expected by the LinkML schema.
Field Mappings:
- Collection.description → collection_description
- Collection.extent → item_count
- Collection.subject_areas → subjects
- DigitalPlatform.description → platform_description
- DigitalPlatform.metadata_standards → implemented_standards
- Provenance.notes → provenance_notes
"""
import yaml
import sys
from pathlib import Path
from typing import Any, Dict, List
def normalize_collection(collection: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *collection* with alias keys renamed to canonical ones.

    Renames: description → collection_description, extent → item_count,
    subject_areas → subjects. The input dict is not modified.
    """
    result = dict(collection)
    # Alias → canonical name table for Collection fields.
    renames = (
        ('description', 'collection_description'),
        ('extent', 'item_count'),
        ('subject_areas', 'subjects'),
    )
    for alias, canonical in renames:
        if alias in result:
            result[canonical] = result.pop(alias)
    return result
def normalize_digital_platform(platform: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *platform* with alias keys renamed to canonical ones.

    Renames: description → platform_description,
    metadata_standards → implemented_standards. The input dict is not modified.
    """
    result = dict(platform)
    for alias, canonical in (
        ('description', 'platform_description'),
        ('metadata_standards', 'implemented_standards'),
    ):
        if alias in result:
            result[canonical] = result.pop(alias)
    return result
def normalize_provenance(provenance: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *provenance* with the ``notes`` alias renamed.

    Renames: notes → provenance_notes. The input dict is not modified.
    """
    result = dict(provenance)
    if 'notes' in result:
        result['provenance_notes'] = result.pop('notes')
    return result
def normalize_heritage_custodian(custodian: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *custodian* with nested objects normalized.

    Applies the per-class normalizers to each entry of ``collections`` and
    ``digital_platforms`` and to the ``provenance`` mapping, when present
    and non-empty. The input dict is not modified.
    """
    result = dict(custodian)

    collections = result.get('collections')
    if collections:
        result['collections'] = [normalize_collection(c) for c in collections]

    platforms = result.get('digital_platforms')
    if platforms:
        result['digital_platforms'] = [
            normalize_digital_platform(p) for p in platforms
        ]

    provenance = result.get('provenance')
    if provenance:
        result['provenance'] = normalize_provenance(provenance)

    return result
def normalize_yaml_file(input_path: Path, output_path: Path) -> None:
    """
    Read a YAML file, normalize field names, and write the result.

    Args:
        input_path: Path to input YAML file with alias field names
        output_path: Path to output YAML file with canonical field names

    Raises:
        ValueError: If the input file is empty or contains no YAML data.
    """
    # Read input YAML
    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # yaml.safe_load returns None for an empty file; fail with a clear
    # message instead of crashing inside normalize_heritage_custodian.
    if data is None:
        raise ValueError(f"No YAML data found in {input_path}")
    # Input may hold one institution (mapping) or a list of them.
    if isinstance(data, list):
        normalized_data = [normalize_heritage_custodian(inst) for inst in data]
    else:
        normalized_data = normalize_heritage_custodian(data)
    # Write output YAML, keeping insertion order and Unicode intact.
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            normalized_data,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120
        )
    # Bug fix: the two paths were concatenated with no separator
    # ("...Normalized {input_path}{output_path}"), garbling the message.
    print(f"✅ Normalized {input_path} → {output_path}")
def main():
    """Parse command-line arguments and run the normalization."""
    args = sys.argv[1:]
    if not args:
        print("Usage: python normalize_field_names.py <input.yaml> [output.yaml]")
        print("\nNormalizes field names from aliases to canonical schema names.")
        print("If output path not specified, writes to <input>_normalized.yaml")
        sys.exit(1)

    input_path = Path(args[0])
    # Default output sits next to the input with a "_normalized" suffix.
    output_path = (
        Path(args[1])
        if len(args) > 1
        else input_path.parent / f"{input_path.stem}_normalized.yaml"
    )

    if not input_path.exists():
        print(f"❌ Error: Input file not found: {input_path}")
        sys.exit(1)

    normalize_yaml_file(input_path, output_path)
    print(f"\n✅ Validation-ready file created: {output_path}")
    print(f"\nValidate with:")
    print(f" linkml-validate -s schemas/heritage_custodian.yaml -C HeritageCustodian {output_path}")


if __name__ == "__main__":
    main()