- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.

365 lines · 12 KiB · Python · Executable file
#!/usr/bin/env python3
"""
Fix metadata_standards field values in Norwegian institution data files.

Issues to fix:
1. Technical formats (REST API, JSON, XML) → move to integration_method
2. Software names (Primus) → move to vendor or platform_name
3. Non-standard formats → convert to canonical enum values or remove
4. Add missing integration_method fields

Schema-compliant values for MetadataStandardEnum:
- DUBLIN_CORE, MARC21, EAD, BIBFRAME, LIDO, CIDOC_CRM, SCHEMA_ORG,
  RIC_O, MODS, PREMIS, SPECTRUM, DACS
"""
import sys
from pathlib import Path
from typing import Any, Dict, List, Set

import yaml
# Canonical MetadataStandardEnum values (mirrors schemas/enums.yaml).
VALID_STANDARDS = {
    'DUBLIN_CORE', 'MARC21', 'EAD', 'BIBFRAME', 'LIDO', 'CIDOC_CRM',
    'SCHEMA_ORG', 'RIC_O', 'MODS', 'PREMIS', 'SPECTRUM', 'DACS',
}

# Free-text spellings and aliases → canonical enum value.
STANDARD_MAPPINGS = {
    'Dublin Core': 'DUBLIN_CORE',
    'DUBLIN CORE': 'DUBLIN_CORE',
    'dublin core': 'DUBLIN_CORE',
    'DC': 'DUBLIN_CORE',
    'MARC21': 'MARC21',
    'MARC 21': 'MARC21',
    'marc21': 'MARC21',
    'EAD': 'EAD',
    'EAD (Encoded Archival Description)': 'EAD',
    'Encoded Archival Description': 'EAD',
    'BIBFRAME': 'BIBFRAME',
    'LIDO': 'LIDO',
    'CIDOC-CRM': 'CIDOC_CRM',
    'CIDOC CRM': 'CIDOC_CRM',
    'Schema.org': 'SCHEMA_ORG',
    'schema.org': 'SCHEMA_ORG',
    'RiC-O': 'RIC_O',
    'RIC-O': 'RIC_O',
    'Records in Contexts': 'RIC_O',
    'MODS': 'MODS',
    'PREMIS': 'PREMIS',
    'SPECTRUM': 'SPECTRUM',
    'DACS': 'DACS',
}

# Genuine standards not yet present in MetadataStandardEnum.
# They are reported (not converted) until the schema gains these members.
PENDING_SCHEMA_ADDITIONS = {
    'Europeana Data Model (EDM)': 'EDM',
    'EDM': 'EDM',
    'IIIF': 'IIIF',
    'International Image Interoperability Framework': 'IIIF',
    'Noark 5': 'NOARK5',
    'NOARK 5': 'NOARK5',
    'Noark-5': 'NOARK5',
}

# Semantic web technologies — not metadata standards per se;
# they are dropped from metadata_standards with a change-log note.
SEMANTIC_WEB_TECH = {
    'RDF', 'RDFS', 'OWL', 'SKOS', 'Linked Open Data', 'LOD',
    'Linked Data', 'SPARQL', 'Triple Store',
}

# Technical exchange formats — belong in integration_method instead.
TECHNICAL_FORMATS = {
    'REST API', 'REST', 'JSON', 'XML', 'OAI-PMH', 'SPARQL', 'GraphQL',
    'SOAP', 'CSV', 'API', 'HTTP',
}

# Collection-management software / vendor products — belong in vendor.
SOFTWARE_NAMES = {
    'Primus', 'CollectiveAccess', 'Adlib', 'TMS', 'Axiell', 'MAIS',
    'Atlantis', 'DSpace', 'EPrints', 'Fedora', 'Islandora',
}

# Accumulator for values needing manual review (filled during processing).
UNKNOWN_STANDARDS = set()
def normalize_standard(value: str) -> str | None:
    """
    Map a raw metadata-standard string to its canonical enum value.

    Returns:
        The canonical MetadataStandardEnum value, or None when the input
        is not (yet) a valid metadata standard.
    """
    candidate = value.strip()

    # Already in canonical enum form — nothing to do.
    if candidate in VALID_STANDARDS:
        return candidate

    # Known alias or alternate spelling of a canonical value.
    mapped = STANDARD_MAPPINGS.get(candidate)
    if mapped is not None:
        return mapped

    # A real standard that is not yet in MetadataStandardEnum: record it
    # for the summary report and drop it until the schema is updated.
    pending = PENDING_SCHEMA_ADDITIONS.get(candidate)
    if pending is not None:
        UNKNOWN_STANDARDS.add(f"{candidate} [→ {pending}, pending schema update]")
        return None

    # Semantic web technology (RDF, SKOS, ...): not a metadata standard.
    if candidate in SEMANTIC_WEB_TECH or any(tech in candidate for tech in SEMANTIC_WEB_TECH):
        return None

    return None
def is_technical_format(value: str) -> bool:
    """Return True when *value* mentions a technical format (REST, JSON, etc.)."""
    haystack = value.upper()
    for fmt in TECHNICAL_FORMATS:
        if fmt in haystack:
            return True
    return False
def is_software_name(value: str) -> bool:
    """Return True when *value* contains a known software/vendor name."""
    for software in SOFTWARE_NAMES:
        if software in value:
            return True
    return False
def fix_digital_platform(platform: Dict[str, Any], changes: List[str]) -> Dict[str, Any]:
    """
    Fix the metadata_standards list of a single digital platform.

    Non-standard entries are rerouted:
      * technical formats (REST, JSON, ...) -> integration_method
      * software/vendor names (Primus, ...) -> vendor
      * semantic web tech (RDF, SKOS, ...)  -> removed with a note
      * unknown values                      -> logged for manual review

    Args:
        platform: Platform dict; not mutated — a shallow copy is returned.
        changes: Human-readable change log, appended to in place.

    Returns:
        Fixed platform dict, with changes logged.
    """
    fixed = platform.copy()

    if 'metadata_standards' not in fixed:
        return fixed

    valid_standards: List[str] = []
    integration_methods: List[str] = []
    vendors: List[str] = []

    for std in fixed['metadata_standards']:
        # Non-string entries are silently dropped (as before).
        if not isinstance(std, str):
            continue

        # Valid standard (canonical or alias).
        canonical = normalize_standard(std)
        if canonical:
            # BUG FIX: de-duplicate while preserving first-seen order —
            # e.g. 'Dublin Core' and 'DC' both map to DUBLIN_CORE and
            # previously produced a duplicated enum value.
            if canonical not in valid_standards:
                valid_standards.append(canonical)
            continue

        # Technical format: belongs in integration_method.
        if is_technical_format(std):
            if std not in integration_methods:
                integration_methods.append(std)
            changes.append(f" - Moved '{std}' from metadata_standards to integration_method")
            continue

        # Software/vendor name: belongs in vendor.
        if is_software_name(std):
            if std not in vendors:
                vendors.append(std)
            changes.append(f" - Moved '{std}' from metadata_standards to vendor")
            continue

        # Semantic web technology: not a metadata standard, drop with note.
        if std in SEMANTIC_WEB_TECH or any(tech in std for tech in SEMANTIC_WEB_TECH):
            changes.append(f" - Removed '{std}' (semantic web technology, not a metadata standard)")
            continue

        # Unknown value: keep it out of the data, flag for manual review.
        UNKNOWN_STANDARDS.add(std)
        changes.append(f" - ⚠️ Unknown standard '{std}' - needs manual review")

    # Write back the cleaned standards, or remove the now-empty field.
    if valid_standards:
        fixed['metadata_standards'] = valid_standards
    else:
        del fixed['metadata_standards']

    # Merge rerouted technical formats into integration_method.
    if integration_methods:
        existing_method = fixed.get('integration_method', '')
        if existing_method:
            all_methods = set(integration_methods + [existing_method])
            fixed['integration_method'] = ', '.join(sorted(all_methods))
        else:
            fixed['integration_method'] = ', '.join(integration_methods)

    # Merge rerouted software names into vendor.
    if vendors:
        existing_vendor = fixed.get('vendor', '')
        if existing_vendor:
            all_vendors = set(vendors + [existing_vendor])
            fixed['vendor'] = ', '.join(sorted(all_vendors))
        else:
            fixed['vendor'] = ', '.join(vendors)

    return fixed
def fix_heritage_custodian(custodian: Dict[str, Any]) -> tuple[Dict[str, Any], List[str]]:
    """
    Fix metadata_standards across all digital platforms in a custodian.

    Returns:
        (fixed_custodian, list_of_changes)
    """
    fixed = custodian.copy()
    changes: List[str] = []

    platforms = fixed.get('digital_platforms')
    if platforms:
        repaired_platforms = []
        for index, platform in enumerate(platforms):
            platform_log: List[str] = []
            repaired_platforms.append(fix_digital_platform(platform, platform_log))

            # Only record a heading for platforms that actually changed.
            if platform_log:
                label = platform.get('platform_name', f'Platform {index+1}')
                changes.append(f"\n Platform: {label}")
                changes.extend(platform_log)

        fixed['digital_platforms'] = repaired_platforms

    return fixed, changes
def fix_yaml_file(file_path: Path, dry_run: bool = False) -> tuple[int, List[str]]:
    """
    Fix metadata_standards in a YAML file.

    Args:
        file_path: YAML file containing one institution mapping or a list
            of them.
        dry_run: When True, compute and report changes without writing the
            file back to disk.

    Returns:
        (number_of_institutions_changed, list_of_all_changes)
    """
    # Read file
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not data:
        return 0, []

    all_changes: List[str] = []
    institutions_changed = 0

    # Normalize to a list so single-institution files share one code path.
    institutions = data if isinstance(data, list) else [data]

    for i, inst in enumerate(institutions):
        # (Removed dead local `inst_changes` from the original.)
        fixed_inst, changes = fix_heritage_custodian(inst)

        if changes:
            institutions_changed += 1
            inst_name = inst.get('name', f'Institution {i+1}')
            all_changes.append(f"\n📍 {inst_name}:")
            all_changes.extend(changes)

            # Update in place; for a single-mapping file replace the root.
            if isinstance(data, list):
                data[i] = fixed_inst
            else:
                data = fixed_inst

    # Write back if not a dry run and something actually changed.
    if not dry_run and institutions_changed > 0:
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(
                data,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=120,
            )

    return institutions_changed, all_changes
def main():
    """CLI entry point.

    Returns:
        Process exit code: 0 on success, 1 when no files were found or any
        file failed to process.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Fix metadata_standards field values in Norwegian institution YAML files'
    )
    parser.add_argument(
        'files',
        nargs='+',
        type=Path,
        help='YAML files to fix (or directory containing YAML files)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be changed without modifying files'
    )

    args = parser.parse_args()

    # Expand directory arguments into the YAML files they contain.
    yaml_files = []
    for path in args.files:
        if path.is_dir():
            yaml_files.extend(path.glob('*.yaml'))
            yaml_files.extend(path.glob('*.yml'))
        else:
            yaml_files.append(path)

    if not yaml_files:
        print("❌ No YAML files found")
        return 1

    print(f"{'🔍 DRY RUN - ' if args.dry_run else ''}Processing {len(yaml_files)} files...\n")

    total_institutions_changed = 0
    total_files_changed = 0
    error_count = 0  # BUG FIX: failures were previously not tracked

    for file_path in sorted(yaml_files):
        print(f"\n{'='*80}")
        print(f"📄 {file_path.name}")
        print('='*80)

        try:
            num_changed, changes = fix_yaml_file(file_path, dry_run=args.dry_run)

            if num_changed > 0:
                total_files_changed += 1
                total_institutions_changed += num_changed
                print(f"\n✏️ {num_changed} institution(s) modified:")
                for change in changes:
                    print(change)
            else:
                print("✅ No changes needed")

        except Exception as e:
            error_count += 1
            print(f"❌ Error processing {file_path}: {e}")
            import traceback
            traceback.print_exc()

    # Summary
    print(f"\n{'='*80}")
    print("📊 SUMMARY")
    print('='*80)
    print(f"Files processed: {len(yaml_files)}")
    print(f"Files changed: {total_files_changed}")
    print(f"Institutions modified: {total_institutions_changed}")

    if UNKNOWN_STANDARDS:
        print("\n⚠️ Unknown standards found (need manual review or schema update):")
        for std in sorted(UNKNOWN_STANDARDS):
            print(f" - {std}")

    if args.dry_run:
        print("\n🔍 This was a dry run. Run without --dry-run to apply changes.")
    else:
        print("\n✅ Changes applied successfully!")

    # BUG FIX: previously always returned 0; propagate failures to the shell.
    return 1 if error_count else 0


if __name__ == "__main__":
    sys.exit(main())