glam/scripts/fix_metadata_standards.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

365 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Fix metadata_standards field values in Norwegian institution data files.
Issues to fix:
1. Technical formats (REST API, JSON, XML) → move to integration_method
2. Software names (Primus) → move to vendor or platform_name
3. Non-standard formats → convert to canonical enum values or remove
4. Add missing integration_method fields
Schema-compliant values for MetadataStandardEnum:
- DUBLIN_CORE, MARC21, EAD, BIBFRAME, LIDO, CIDOC_CRM, SCHEMA_ORG,
RIC_O, MODS, PREMIS, SPECTRUM, DACS
"""
import yaml
import sys
from pathlib import Path
from typing import Any, Dict, List, Set
# Valid metadata standard enum values (from schemas/enums.yaml)
VALID_STANDARDS: Set[str] = {
    'DUBLIN_CORE', 'MARC21', 'EAD', 'BIBFRAME', 'LIDO', 'CIDOC_CRM',
    'SCHEMA_ORG', 'RIC_O', 'MODS', 'PREMIS', 'SPECTRUM', 'DACS'
}
# Mapping from text to canonical enum values
# (covers case variants, abbreviations, and spelled-out names)
STANDARD_MAPPINGS: Dict[str, str] = {
    'Dublin Core': 'DUBLIN_CORE',
    'DUBLIN CORE': 'DUBLIN_CORE',
    'dublin core': 'DUBLIN_CORE',
    'DC': 'DUBLIN_CORE',
    'MARC21': 'MARC21',
    'MARC 21': 'MARC21',
    'marc21': 'MARC21',
    'EAD': 'EAD',
    'EAD (Encoded Archival Description)': 'EAD',
    'Encoded Archival Description': 'EAD',
    'BIBFRAME': 'BIBFRAME',
    'LIDO': 'LIDO',
    'CIDOC-CRM': 'CIDOC_CRM',
    'CIDOC CRM': 'CIDOC_CRM',
    'Schema.org': 'SCHEMA_ORG',
    'schema.org': 'SCHEMA_ORG',
    'RiC-O': 'RIC_O',
    'RIC-O': 'RIC_O',
    'Records in Contexts': 'RIC_O',
    'MODS': 'MODS',
    'PREMIS': 'PREMIS',
    'SPECTRUM': 'SPECTRUM',
    'DACS': 'DACS',
}
# Standards that need to be added to the schema enum
# These are valid but not yet in MetadataStandardEnum
# (normalize_standard records matches in UNKNOWN_STANDARDS and still
# returns None, so they are excluded from output until the schema is updated)
PENDING_SCHEMA_ADDITIONS: Dict[str, str] = {
    'Europeana Data Model (EDM)': 'EDM',
    'EDM': 'EDM',
    'IIIF': 'IIIF',
    'International Image Interoperability Framework': 'IIIF',
    'Noark 5': 'NOARK5',
    'NOARK 5': 'NOARK5',
    'Noark-5': 'NOARK5',
}
# Semantic web technologies (not metadata standards per se)
# Should be noted in platform description or removed
SEMANTIC_WEB_TECH: Set[str] = {
    'RDF', 'RDFS', 'OWL', 'SKOS', 'Linked Open Data', 'LOD',
    'Linked Data', 'SPARQL', 'Triple Store'
}
# Technical formats that should go in integration_method
# NOTE: 'SPARQL' appears here AND in SEMANTIC_WEB_TECH; because
# fix_digital_platform checks technical formats first, SPARQL entries
# are moved to integration_method rather than removed.
TECHNICAL_FORMATS: Set[str] = {
    'REST API', 'REST', 'JSON', 'XML', 'OAI-PMH', 'SPARQL', 'GraphQL',
    'SOAP', 'CSV', 'API', 'HTTP'
}
# Software/vendor names
SOFTWARE_NAMES: Set[str] = {
    'Primus', 'CollectiveAccess', 'Adlib', 'TMS', 'Axiell', 'MAIS',
    'Atlantis', 'DSpace', 'EPrints', 'Fedora', 'Islandora'
}
# Standards to investigate (not in enum but might be valid)
# Mutated during processing (normalize_standard / fix_digital_platform)
# and reported in main()'s summary.
UNKNOWN_STANDARDS: Set[str] = set()
def normalize_standard(value: str) -> str | None:
    """
    Map a raw metadata-standard string to its canonical enum value.

    Side effect: values matching PENDING_SCHEMA_ADDITIONS are recorded in
    UNKNOWN_STANDARDS (with a "pending schema update" note) so they appear
    in the final report, but None is returned until the schema gains them.

    Returns:
        Canonical enum value, or None if not a valid standard
    """
    cleaned = value.strip()

    # Already in canonical enum form -- return as-is.
    if cleaned in VALID_STANDARDS:
        return cleaned

    # Known alias / spelling variant of a canonical value.
    mapped = STANDARD_MAPPINGS.get(cleaned)
    if mapped is not None:
        return mapped

    # Recognized standard not yet in the schema enum: flag for the summary
    # report, but treat as invalid for now.
    pending = PENDING_SCHEMA_ADDITIONS.get(cleaned)
    if pending is not None:
        UNKNOWN_STANDARDS.add(f"{cleaned} [→ {pending}, pending schema update]")
        return None

    # Everything else -- including semantic-web technologies, which callers
    # detect and annotate themselves -- is not a metadata standard.
    return None
def is_technical_format(value: str) -> bool:
    """Check if value is a technical format (REST, JSON, etc.).

    Case-insensitive substring match against TECHNICAL_FORMATS, so e.g.
    'rest api v2' and 'JSON export' both count as technical formats.
    """
    haystack = value.upper()
    for fmt in TECHNICAL_FORMATS:
        if fmt in haystack:
            return True
    return False
def is_software_name(value: str) -> bool:
    """Check if value is a software/vendor name.

    Case-SENSITIVE substring match against SOFTWARE_NAMES ('Primus'
    matches 'KulturNav Primus' but not 'primus').
    """
    for software in SOFTWARE_NAMES:
        if software in value:
            return True
    return False
def fix_digital_platform(platform: Dict[str, Any], changes: List[str]) -> Dict[str, Any]:
    """
    Fix metadata_standards in a digital platform.

    Valid standards are normalized to canonical enum values (deduplicated,
    first occurrence wins); technical formats move to integration_method;
    software names move to vendor; semantic-web technologies are removed
    with a note; anything else is flagged for manual review.

    Args:
        platform: Digital platform mapping (not mutated; a shallow copy is returned).
        changes: Human-readable change log, appended to in place.

    Returns:
        Fixed platform dict, with changes logged
    """
    fixed = platform.copy()
    # Handle metadata_standards field
    if 'metadata_standards' in fixed:
        original_standards = fixed['metadata_standards']
        valid_standards = []
        integration_methods = []
        vendors = []
        for std in original_standards:
            if not isinstance(std, str):
                # BUGFIX: non-string entries used to vanish silently when the
                # list was rewritten; now they are at least logged.
                changes.append(f" - ⚠️ Dropped non-string entry {std!r} from metadata_standards")
                continue
            # Check if it's a valid standard
            canonical = normalize_standard(std)
            if canonical:
                # BUGFIX: dedupe -- e.g. 'Dublin Core' and 'DC' both map to
                # DUBLIN_CORE and previously produced a duplicate entry.
                if canonical not in valid_standards:
                    valid_standards.append(canonical)
                continue
            # Check if it's a technical format
            if is_technical_format(std):
                integration_methods.append(std)
                changes.append(f" - Moved '{std}' from metadata_standards to integration_method")
                continue
            # Check if it's software
            if is_software_name(std):
                vendors.append(std)
                changes.append(f" - Moved '{std}' from metadata_standards to vendor")
                continue
            # Check if it's semantic web technology (not a metadata standard)
            if std in SEMANTIC_WEB_TECH or any(tech in std for tech in SEMANTIC_WEB_TECH):
                changes.append(f" - Removed '{std}' (semantic web technology, not a metadata standard)")
                continue
            # Unknown - log it for the end-of-run summary
            UNKNOWN_STANDARDS.add(std)
            changes.append(f" - ⚠️ Unknown standard '{std}' - needs manual review")
        # Update platform with fixed values
        if valid_standards:
            fixed['metadata_standards'] = valid_standards
        else:
            # Remove empty metadata_standards
            del fixed['metadata_standards']
        # Add integration_method if we found technical formats
        if integration_methods:
            existing_method = fixed.get('integration_method', '')
            if existing_method:
                # Merge with existing (set drops duplicates; sorted for determinism)
                all_methods = set(integration_methods + [existing_method])
                fixed['integration_method'] = ', '.join(sorted(all_methods))
            else:
                # dict.fromkeys dedupes while preserving first-seen order
                fixed['integration_method'] = ', '.join(dict.fromkeys(integration_methods))
        # Add vendor if we found software names
        if vendors:
            existing_vendor = fixed.get('vendor', '')
            if existing_vendor:
                all_vendors = set(vendors + [existing_vendor])
                fixed['vendor'] = ', '.join(sorted(all_vendors))
            else:
                fixed['vendor'] = ', '.join(dict.fromkeys(vendors))
    return fixed
def fix_heritage_custodian(custodian: Dict[str, Any]) -> tuple[Dict[str, Any], List[str]]:
    """
    Fix metadata_standards across all digital platforms in a custodian.

    The custodian mapping itself is not mutated; a shallow copy is returned
    with a freshly built digital_platforms list when platforms are present.

    Returns:
        (fixed_custodian, list_of_changes)
    """
    fixed = custodian.copy()
    changes: List[str] = []
    platforms = fixed.get('digital_platforms')
    if platforms:
        repaired = []
        for index, platform in enumerate(platforms, start=1):
            platform_changes: List[str] = []
            repaired.append(fix_digital_platform(platform, platform_changes))
            if platform_changes:
                label = platform.get('platform_name', f'Platform {index}')
                changes.append(f"\n Platform: {label}")
                changes.extend(platform_changes)
        fixed['digital_platforms'] = repaired
    return fixed, changes
def fix_yaml_file(file_path: Path, dry_run: bool = False) -> tuple[int, List[str]]:
    """
    Fix metadata_standards in a YAML file.

    The file may contain either a single institution mapping or a list of
    them; the same shape is written back.

    Args:
        file_path: YAML file to process.
        dry_run: When True, compute and report changes without writing.

    Returns:
        (number_of_institutions_changed, list_of_all_changes)
    """
    # Read file
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    if not data:
        return 0, []
    # Process each institution
    all_changes = []
    institutions_changed = 0
    institutions = data if isinstance(data, list) else [data]
    for i, inst in enumerate(institutions):
        # (removed unused `inst_changes` local)
        fixed_inst, changes = fix_heritage_custodian(inst)
        if changes:
            institutions_changed += 1
            inst_name = inst.get('name', f'Institution {i+1}')
            all_changes.append(f"\n📍 {inst_name}:")
            all_changes.extend(changes)
            # Update in place
            if isinstance(data, list):
                data[i] = fixed_inst
            else:
                data = fixed_inst
    # Write back if not dry run
    if not dry_run and institutions_changed > 0:
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(
                data,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=120
            )
    return institutions_changed, all_changes
def main():
    """Main entry point: parse CLI args, fix each YAML file, print a summary.

    Returns:
        Process exit code: 0 on success, 1 when no YAML files were found.
        (Per-file processing errors are printed but do not change the code.)
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Fix metadata_standards field values in Norwegian institution YAML files'
    )
    parser.add_argument(
        'files',
        nargs='+',
        type=Path,
        help='YAML files to fix (or directory containing YAML files)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be changed without modifying files'
    )
    args = parser.parse_args()
    # Collect all YAML files (directories are expanded non-recursively)
    yaml_files = []
    for path in args.files:
        if path.is_dir():
            yaml_files.extend(path.glob('*.yaml'))
            yaml_files.extend(path.glob('*.yml'))
        else:
            yaml_files.append(path)
    # BUGFIX: de-duplicate (order-preserving) so a file passed explicitly
    # AND picked up from its parent directory is not processed twice.
    yaml_files = list(dict.fromkeys(yaml_files))
    if not yaml_files:
        print("❌ No YAML files found")
        return 1
    print(f"{'🔍 DRY RUN - ' if args.dry_run else ''}Processing {len(yaml_files)} files...\n")
    total_institutions_changed = 0
    total_files_changed = 0
    for file_path in sorted(yaml_files):
        print(f"\n{'='*80}")
        print(f"📄 {file_path.name}")
        print('='*80)
        try:
            num_changed, changes = fix_yaml_file(file_path, dry_run=args.dry_run)
            if num_changed > 0:
                total_files_changed += 1
                total_institutions_changed += num_changed
                print(f"\n✏️ {num_changed} institution(s) modified:")
                for change in changes:
                    print(change)
            else:
                print("✅ No changes needed")
        except Exception as e:
            # Keep going: one malformed file should not abort the batch.
            print(f"❌ Error processing {file_path}: {e}")
            import traceback
            traceback.print_exc()
    # Summary
    print(f"\n{'='*80}")
    print("📊 SUMMARY")
    print('='*80)
    print(f"Files processed: {len(yaml_files)}")
    print(f"Files changed: {total_files_changed}")
    print(f"Institutions modified: {total_institutions_changed}")
    if UNKNOWN_STANDARDS:
        # Populated as a side effect of normalize_standard / fix_digital_platform.
        print("\n⚠️ Unknown standards found (need manual review or schema update):")
        for std in sorted(UNKNOWN_STANDARDS):
            print(f" - {std}")
    if args.dry_run:
        print("\n🔍 This was a dry run. Run without --dry-run to apply changes.")
    else:
        print("\n✅ Changes applied successfully!")
    return 0
# Standard script entry point: propagate main()'s return code as the exit status.
if __name__ == "__main__":
    sys.exit(main())