glam/scripts/migrate_to_schema_v0.2.2_enrichment.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

456 lines
18 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Migration Script: Old Enrichment Format → Schema v0.2.2 EnrichmentHistoryEntry
Converts three different old enrichment formats to schema v0.2.2 compliant
enrichment_history structure.
OLD FORMATS:
1. Provenance flat fields (most common - 117 instances in Chile):
provenance:
enrichment_batch: 7
enrichment_method: SPARQL_BULK_QUERY
wikidata_verified: true
notes:
- "Batch 7: SPARQL match - exact name match"
2. Old enrichment_history (10 instances in Chile):
enrichment_history:
- enrichment_date: "2025-11-09T18:10:41.851904+00:00"
enrichment_method: "Chilean Batch 5 - University + museum Wikidata verification"
enrichment_batch: batch_5
q_number: Q3551323
verification: "Universidad Arturo Prat, public university..."
3. Unstructured notes (Tunisia, Algeria, Libya):
notes: "Wikidata enriched 2025-11-10 (Q549445, match: 84%)..."
NEW FORMAT (schema v0.2.2):
enrichment_history:
- enrichment_date: "2025-11-10T14:30:00+00:00"
enrichment_type: WIKIDATA_IDENTIFIER
enrichment_method: "Wikidata SPARQL query with fuzzy matching"
match_score: 0.84
verified: false
enrichment_source: "https://www.wikidata.org"
enrichment_notes: "Matched to '...' (Q549445)"
DATASETS TO MIGRATE:
- Chile: 90 institutions (71 with Wikidata)
- Tunisia: 68 institutions
- Algeria: 19 institutions
- Libya: 24 institutions
Total: 201 institutions
USAGE:
python scripts/migrate_to_schema_v0.2.2_enrichment.py --dry-run
python scripts/migrate_to_schema_v0.2.2_enrichment.py --apply
"""
import argparse
import re
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from dataclasses import dataclass
@dataclass
class EnrichmentEntry:
    """One schema v0.2.2 compliant ``enrichment_history`` entry.

    Required fields come first; the optional ones default to ``None`` and
    are dropped entirely when the entry is serialized to a plain dict.
    """
    enrichment_date: str                     # ISO-8601 timestamp of the enrichment
    enrichment_type: str                     # e.g. 'WIKIDATA_IDENTIFIER'
    enrichment_method: str                   # human-readable description of the method
    match_score: Optional[float] = None      # confidence in [0.0, 1.0], when known
    verified: bool = False                   # True only for manually verified matches
    enrichment_source: Optional[str] = None  # URL of the enrichment data source
    enrichment_notes: Optional[str] = None   # free-text notes about the match
class EnrichmentMigrator:
    """Migrates old enrichment formats to schema v0.2.2.

    Four migration paths are tried per institution (in order):
      1. Flat provenance fields (``enrichment_batch`` / ``wikidata_verified``)
      2. Old-style ``enrichment_history`` entries carrying a ``q_number``
      3. A standalone ``enrichment_method`` field (later batches)
      4. Unstructured "Wikidata enriched ..." notes

    Construct with ``dry_run=False`` to actually rewrite files; a one-time
    backup (``.pre_v0.2.2_backup``) is created next to each input file.
    """

    # Regex patterns for parsing unstructured notes
    # e.g. "Wikidata enriched 2025-11-10 (Q549445, match: 84%)"
    WIKIDATA_NOTE_PATTERN = re.compile(
        r'Wikidata enriched (\d{4}-\d{2}-\d{2})\s*\((Q\d+),\s*match:\s*(\d+)%\)'
    )
    # e.g. "Batch 7: SPARQL match - exact name match"
    BATCH_NOTE_PATTERN = re.compile(
        r'Batch (\d+):\s*SPARQL match\s*-\s*(.+)'
    )

    def __init__(self, dry_run: bool = True):
        self.dry_run = dry_run
        # Running counters reported in the final summary.
        self.stats = {
            'processed': 0,
            'migrated': 0,
            'skipped': 0,
            'errors': 0
        }

    def migrate_institution(self, inst: Dict[str, Any]) -> bool:
        """
        Migrate a single institution's enrichment data in place.

        Returns True if migration was performed, False if skipped
        (no provenance, already migrated, or no recognizable old format).
        """
        if 'provenance' not in inst:
            return False
        prov = inst['provenance']

        # Check if already has new format enrichment_history
        if 'enrichment_history' in prov and isinstance(prov['enrichment_history'], list):
            if prov['enrichment_history'] and 'enrichment_type' in prov['enrichment_history'][0]:
                # Already in new format
                return False

        # Initialize new enrichment_history list
        new_history: List["EnrichmentEntry"] = []

        # Migration Path 1: Flat provenance fields (enrichment_batch, wikidata_verified)
        # NOTE: if an institution has BOTH flat fields and an old-style history,
        # only Path 1 runs and the old history is replaced, matching the
        # original precedence — confirm against the datasets before changing.
        if 'enrichment_batch' in prov or 'wikidata_verified' in prov:
            new_history.extend(self._migrate_flat_provenance(prov, inst))
        # Migration Path 2: Old enrichment_history format
        elif 'enrichment_history' in prov and isinstance(prov['enrichment_history'], list):
            if prov['enrichment_history'] and 'q_number' in prov['enrichment_history'][0]:
                new_history.extend(self._migrate_old_enrichment_history(prov['enrichment_history']))
        # Migration Path 3: Standalone enrichment_method field (later batches)
        # NOTE: enrichment_date is optional - we use extraction_date as fallback
        elif 'enrichment_method' in prov:
            new_history.extend(self._migrate_standalone_enrichment(prov, inst))

        # Migration Path 4: Parse unstructured notes (only when nothing matched above)
        if 'notes' in prov and not new_history:
            new_history.extend(self._parse_notes(prov['notes']))

        # Apply migration if entries were created
        if new_history:
            # Remove old fields (their content has been folded into the entries)
            old_fields = ['enrichment_batch', 'enrichment_method', 'enrichment_confidence',
                          'wikidata_verified', 'wikidata_match_confidence', 'enrichment_date', 'notes']
            for field in old_fields:
                prov.pop(field, None)
            # Add new enrichment_history (convert EnrichmentEntry objects to dicts)
            prov['enrichment_history'] = [self._entry_to_dict(e) for e in new_history]
            return True
        return False

    def _migrate_flat_provenance(self, prov: Dict[str, Any], inst: Dict[str, Any]) -> List["EnrichmentEntry"]:
        """Migrate flat provenance fields to structured history.

        Produces at most one WIKIDATA_IDENTIFIER entry, and only when the
        institution actually carries a Wikidata identifier.
        """
        entries = []
        # Extract Wikidata Q-number from identifiers
        q_number = None
        if 'identifiers' in inst:
            for ident in inst['identifiers']:
                if ident.get('identifier_scheme') == 'Wikidata':
                    q_number = ident.get('identifier_value')
                    break
        # Use extraction_date as the enrichment date; fall back to "now"
        enrichment_date = prov.get('extraction_date', datetime.now(timezone.utc).isoformat())
        # Parse match confidence from notes
        match_score = None
        enrichment_notes = None
        if 'notes' in prov:
            notes_text = prov['notes']
            if isinstance(notes_text, list):
                notes_text = ' '.join(str(n) for n in notes_text)
            # Extract batch number and match type from "Batch N: SPARQL match - ..."
            batch_match = self.BATCH_NOTE_PATTERN.search(notes_text)
            if batch_match:
                batch_num = batch_match.group(1)
                match_type = batch_match.group(2)
                enrichment_notes = f"Batch {batch_num}: {match_type}"
                # Infer a numeric match score from the free-text match type
                if 'exact name match' in match_type:
                    match_score = 1.0
                elif 'partial name' in match_type:
                    match_score = 0.85
                elif 'includes full' in match_type:
                    match_score = 0.9
        # Create Wikidata enrichment entry
        if q_number:
            entries.append(EnrichmentEntry(
                enrichment_date=enrichment_date,
                enrichment_type='WIKIDATA_IDENTIFIER',
                enrichment_method=prov.get('enrichment_method', 'Wikidata SPARQL bulk query'),
                match_score=match_score,
                verified=prov.get('wikidata_verified', False),
                enrichment_source='https://www.wikidata.org',
                enrichment_notes=enrichment_notes or f"Matched to Wikidata entity {q_number}"
            ))
        return entries

    def _migrate_old_enrichment_history(self, old_history: List[Dict[str, Any]]) -> List["EnrichmentEntry"]:
        """Migrate old enrichment_history format (q_number + verification) to new schema."""
        entries = []
        for old_entry in old_history:
            q_number = old_entry.get('q_number')
            verification = old_entry.get('verification', '')
            entries.append(EnrichmentEntry(
                enrichment_date=old_entry.get('enrichment_date', datetime.now(timezone.utc).isoformat()),
                enrichment_type='WIKIDATA_IDENTIFIER',
                enrichment_method=old_entry.get('enrichment_method', 'Wikidata verification'),
                match_score=0.95,  # Manual verification implies high confidence
                verified=True,     # Old enrichment_history was manually verified
                enrichment_source='https://www.wikidata.org',
                enrichment_notes=f"Matched to {verification} ({q_number})" if q_number else verification
            ))
        return entries

    def _migrate_standalone_enrichment(self, prov: Dict[str, Any], inst: Dict[str, Any]) -> List["EnrichmentEntry"]:
        """Migrate a standalone enrichment_method field (later batches).

        Skips institutions that carry no Wikidata identifier.
        """
        entries = []
        # Extract Wikidata Q-number from identifiers
        q_number = None
        if 'identifiers' in inst:
            for ident in inst['identifiers']:
                if ident.get('identifier_scheme') == 'Wikidata':
                    q_number = ident.get('identifier_value')
                    break
        if not q_number:
            return entries
        # Map the textual confidence level to a numeric score
        match_confidence = prov.get('wikidata_match_confidence', 'unknown')
        match_score = None
        if match_confidence == 'high':
            match_score = 0.95
        elif match_confidence == 'partial':
            match_score = 0.80
        elif match_confidence == 'medium':
            match_score = 0.75
        # Determine if verified (high confidence = verified)
        verified = match_confidence == 'high'
        # Extract enrichment notes from notes field (list or scalar)
        enrichment_notes = None
        if 'notes' in prov:
            notes = prov['notes']
            if isinstance(notes, list):
                enrichment_notes = ' '.join(str(n) for n in notes)
            else:
                enrichment_notes = str(notes)
        # Use enrichment_date if available, otherwise fall back to extraction_date
        enrichment_date = prov.get('enrichment_date') or prov.get('extraction_date', datetime.now(timezone.utc).isoformat())
        entries.append(EnrichmentEntry(
            enrichment_date=enrichment_date,
            enrichment_type='WIKIDATA_IDENTIFIER',
            enrichment_method=prov.get('enrichment_method', 'Wikidata enrichment'),
            match_score=match_score,
            verified=verified,
            enrichment_source='https://www.wikidata.org',
            enrichment_notes=enrichment_notes or f"Matched to Wikidata entity {q_number}"
        ))
        return entries

    def _parse_notes(self, notes: Any) -> List["EnrichmentEntry"]:
        """Parse unstructured notes for enrichment information.

        FIX: uses finditer so that EVERY "Wikidata enriched ..." mention in
        the notes yields an entry; previously only the first match was
        converted and any further enrichments were silently dropped.
        """
        entries = []
        if not notes:
            return entries
        notes_text = notes if isinstance(notes, str) else ' '.join(str(n) for n in notes)
        # Pattern: "Wikidata enriched 2025-11-10 (Q549445, match: 84%)"
        for match in self.WIKIDATA_NOTE_PATTERN.finditer(notes_text):
            date_str, q_number, match_pct = match.groups()
            entries.append(EnrichmentEntry(
                enrichment_date=f"{date_str}T12:00:00+00:00",  # Assume midday UTC
                enrichment_type='WIKIDATA_IDENTIFIER',
                enrichment_method='Wikidata SPARQL query with fuzzy matching',
                match_score=int(match_pct) / 100.0,
                verified=False,
                enrichment_source='https://www.wikidata.org',
                enrichment_notes=f"Matched to Wikidata entity {q_number}"
            ))
        return entries

    def _entry_to_dict(self, entry: "EnrichmentEntry") -> Dict[str, Any]:
        """Convert EnrichmentEntry to dict, omitting None values (verified=False is kept)."""
        return {
            k: v for k, v in {
                'enrichment_date': entry.enrichment_date,
                'enrichment_type': entry.enrichment_type,
                'enrichment_method': entry.enrichment_method,
                'match_score': entry.match_score,
                'verified': entry.verified,
                'enrichment_source': entry.enrichment_source,
                'enrichment_notes': entry.enrichment_notes
            }.items() if v is not None
        }

    def migrate_file(self, input_path: Path, output_path: Optional[Path] = None) -> None:
        """Migrate all institutions in a YAML file.

        Supports two layouts: a mapping with an 'institutions' key (plus
        metadata), or a bare list of institutions. Unless dry_run is set,
        the result is written back in place (or to output_path) after a
        one-time backup is created.
        """
        print(f"\n{'[DRY RUN] ' if self.dry_run else ''}Processing: {input_path}")
        # Read file
        with open(input_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # Handle different file structures
        institutions = []
        metadata = None
        if isinstance(data, dict) and 'institutions' in data:
            # Tunisia format with metadata
            # FIX: guard against a present-but-null 'institutions:' key,
            # which yaml.safe_load returns as None and would raise TypeError
            institutions = data['institutions'] or []
            metadata = {k: v for k, v in data.items() if k != 'institutions'}
        elif isinstance(data, list):
            # Direct list format
            institutions = data
        else:
            print(f" ⚠️ Unknown file structure, skipping")
            self.stats['errors'] += 1
            return

        # Migrate each institution
        migrated_count = 0
        for inst in institutions:
            self.stats['processed'] += 1
            if self.migrate_institution(inst):
                migrated_count += 1
                self.stats['migrated'] += 1
                # Show one example per file so the operator can eyeball the result
                if migrated_count == 1:
                    print(f"\n ✓ Example migration:")
                    print(f" Institution: {inst.get('name', 'Unknown')}")
                    if 'enrichment_history' in inst['provenance']:
                        print(f" New enrichment_history entries: {len(inst['provenance']['enrichment_history'])}")
            else:
                self.stats['skipped'] += 1

        print(f"\n Processed: {len(institutions)} institutions")
        print(f" Migrated: {migrated_count}")
        print(f" Skipped: {len(institutions) - migrated_count}")

        # Write output
        if not self.dry_run:
            output_path = output_path or input_path
            # Create backup (only once; never overwrite an existing backup)
            backup_path = input_path.with_suffix('.yaml.pre_v0.2.2_backup')
            if not backup_path.exists():
                import shutil
                shutil.copy2(input_path, backup_path)
                print(f" 📦 Backup created: {backup_path.name}")
            # Write migrated data
            with open(output_path, 'w', encoding='utf-8') as f:
                if metadata:
                    # Update metadata to record the schema bump
                    if '_metadata' in metadata:
                        metadata['_metadata']['schema_version'] = '0.2.2'
                        metadata['_metadata']['generated'] = datetime.now(timezone.utc).isoformat()
                        if 'enhancements' not in metadata['_metadata']:
                            metadata['_metadata']['enhancements'] = []
                        metadata['_metadata']['enhancements'].append('Schema v0.2.2 enrichment_history migration')
                    output_data = {**metadata, 'institutions': institutions}
                else:
                    output_data = institutions
                yaml.dump(output_data, f, allow_unicode=True, sort_keys=False, width=120)
            print(f" ✅ Written: {output_path}")
def main():
    """CLI entry point: parse arguments, run the migration, print a summary."""
    parser = argparse.ArgumentParser(
        description='Migrate old enrichment formats to schema v0.2.2',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Preview changes without writing (default)')
    parser.add_argument('--apply', action='store_true',
                        help='Apply changes and write files')
    parser.add_argument('--files', nargs='+',
                        help='Specific files to migrate (default: all datasets)')
    opts = parser.parse_args()

    # Default dataset files
    instances_dir = Path(__file__).parent.parent / 'data' / 'instances'
    dataset_files = [
        instances_dir / 'chile' / 'chilean_institutions_batch19_enriched.yaml',
        instances_dir / 'tunisia' / 'tunisian_institutions_enhanced.yaml',
        instances_dir / 'algeria' / 'algerian_institutions.yaml',
        instances_dir / 'libya' / 'libyan_institutions.yaml'
    ]
    targets = [Path(f) for f in opts.files] if opts.files else dataset_files

    # Create migrator; passing --apply disables dry-run
    migrator = EnrichmentMigrator(dry_run=not opts.apply)

    banner = "=" * 80
    print(banner)
    print("SCHEMA v0.2.2 ENRICHMENT MIGRATION")
    print(banner)
    print(f"Mode: {'DRY RUN (preview only)' if migrator.dry_run else 'APPLY CHANGES'}")
    print(f"Files: {len(targets)}")

    # Migrate each target file, warning about missing ones
    for target in targets:
        if target.exists():
            migrator.migrate_file(target)
        else:
            print(f"\n⚠️ File not found: {target}")

    # Summary
    print("\n" + banner)
    print("MIGRATION SUMMARY")
    print(banner)
    print(f"Total institutions processed: {migrator.stats['processed']}")
    print(f"Migrated to v0.2.2: {migrator.stats['migrated']}")
    print(f"Skipped (already migrated or no enrichment): {migrator.stats['skipped']}")
    print(f"Errors: {migrator.stats['errors']}")

    if not migrator.dry_run:
        print("\n✅ Migration completed")
        print("Backups created with .pre_v0.2.2_backup extension")
    else:
        print("\n⚠️ DRY RUN MODE - No files were modified")
        print("Run with --apply to write changes")


if __name__ == '__main__':
    main()