- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
149 lines · 4.8 KiB · Python · Executable file
#!/usr/bin/env python3
"""
Normalize field names from aliases to canonical names for LinkML validation.

This script converts intuitive field names (aliases) used in extracted data
to the canonical field names expected by the LinkML schema.

Field Mappings:
- Collection.description → collection_description
- Collection.extent → item_count
- Collection.subject_areas → subjects
- DigitalPlatform.description → platform_description
- DigitalPlatform.metadata_standards → implemented_standards
- Provenance.notes → provenance_notes
"""

import yaml
import sys
from pathlib import Path
from typing import Any, Dict, List
def normalize_collection(collection: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *collection* with alias keys renamed to canonical ones.

    Renames: description → collection_description, extent → item_count,
    subject_areas → subjects. Keys not listed are passed through untouched.
    """
    result = dict(collection)
    renames = {
        'description': 'collection_description',
        'extent': 'item_count',
        'subject_areas': 'subjects',
    }
    for alias, canonical in renames.items():
        if alias in result:
            result[canonical] = result.pop(alias)
    return result
def normalize_digital_platform(platform: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *platform* with alias keys renamed to canonical ones.

    Renames: description → platform_description,
    metadata_standards → implemented_standards.
    """
    result = dict(platform)
    for alias, canonical in (
        ('description', 'platform_description'),
        ('metadata_standards', 'implemented_standards'),
    ):
        if alias in result:
            result[canonical] = result.pop(alias)
    return result
def normalize_provenance(provenance: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of *provenance* with ``notes`` renamed to ``provenance_notes``."""
    result = dict(provenance)
    try:
        result['provenance_notes'] = result.pop('notes')
    except KeyError:
        # No alias present — nothing to rename.
        pass
    return result
def normalize_heritage_custodian(custodian: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of a HeritageCustodian record with nested objects normalized.

    Applies normalize_collection / normalize_digital_platform /
    normalize_provenance to the corresponding nested entries, when present
    and non-empty; everything else is passed through unchanged.
    """
    result = dict(custodian)

    collections = result.get('collections')
    if collections:
        result['collections'] = [normalize_collection(item) for item in collections]

    platforms = result.get('digital_platforms')
    if platforms:
        result['digital_platforms'] = [normalize_digital_platform(item) for item in platforms]

    provenance = result.get('provenance')
    if provenance:
        result['provenance'] = normalize_provenance(provenance)

    return result
def normalize_yaml_file(input_path: Path, output_path: Path) -> None:
    """
    Read a YAML file, rename alias fields to canonical names, write the result.

    Args:
        input_path: Path to input YAML file with alias field names
        output_path: Path to output YAML file with canonical field names
    """
    # Load the source document (a single institution or a list of them).
    data = yaml.safe_load(input_path.read_text(encoding='utf-8'))

    # Normalize each institution.
    if isinstance(data, list):
        normalized_data = [normalize_heritage_custodian(item) for item in data]
    else:
        normalized_data = normalize_heritage_custodian(data)

    # Emit block-style YAML, preserving key order and non-ASCII characters.
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            normalized_data,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120,
        )

    print(f"✅ Normalized {input_path} → {output_path}")
def main():
    """Parse command-line arguments and run the normalization.

    Usage: normalize_field_names.py <input.yaml> [output.yaml]
    Exits with status 1 on missing arguments or a nonexistent input file.
    """
    args = sys.argv[1:]
    if not args:
        print("Usage: python normalize_field_names.py <input.yaml> [output.yaml]")
        print("\nNormalizes field names from aliases to canonical schema names.")
        print("If output path not specified, writes to <input>_normalized.yaml")
        sys.exit(1)

    input_path = Path(args[0])

    # Default output path: sibling file with a "_normalized" suffix.
    if len(args) > 1:
        output_path = Path(args[1])
    else:
        output_path = input_path.parent / f"{input_path.stem}_normalized.yaml"

    if not input_path.exists():
        print(f"❌ Error: Input file not found: {input_path}")
        sys.exit(1)

    normalize_yaml_file(input_path, output_path)
    print(f"\n✅ Validation-ready file created: {output_path}")
    print(f"\nValidate with:")
    print(f"  linkml-validate -s schemas/heritage_custodian.yaml -C HeritageCustodian {output_path}")
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()