#!/usr/bin/env python3
"""
Migration Script: Old Enrichment Format → Schema v0.2.2 EnrichmentHistoryEntry

Converts three different old enrichment formats to schema v0.2.2 compliant
enrichment_history structure.

OLD FORMATS:

1. Provenance flat fields (most common - 117 instances in Chile):
   provenance:
     enrichment_batch: 7
     enrichment_method: SPARQL_BULK_QUERY
     wikidata_verified: true
     notes:
       - "Batch 7: SPARQL match - exact name match"

2. Old enrichment_history (10 instances in Chile):
   enrichment_history:
     - enrichment_date: "2025-11-09T18:10:41.851904+00:00"
       enrichment_method: "Chilean Batch 5 - University + museum Wikidata verification"
       enrichment_batch: batch_5
       q_number: Q3551323
       verification: "Universidad Arturo Prat, public university..."

3. Unstructured notes (Tunisia, Algeria, Libya):
   notes: "Wikidata enriched 2025-11-10 (Q549445, match: 84%)..."

NEW FORMAT (schema v0.2.2):
   enrichment_history:
     - enrichment_date: "2025-11-10T14:30:00+00:00"
       enrichment_type: WIKIDATA_IDENTIFIER
       enrichment_method: "Wikidata SPARQL query with fuzzy matching"
       match_score: 0.84
       verified: false
       enrichment_source: "https://www.wikidata.org"
       enrichment_notes: "Matched to '...' (Q549445)"

DATASETS TO MIGRATE:
- Chile: 90 institutions (71 with Wikidata)
- Tunisia: 68 institutions
- Algeria: 19 institutions
- Libya: 24 institutions
Total: 201 institutions

USAGE:
    python scripts/migrate_to_schema_v0.2.2_enrichment.py --dry-run
    python scripts/migrate_to_schema_v0.2.2_enrichment.py --apply
"""

import argparse
import re
import shutil  # hoisted from migrate_file(): stdlib imports belong at module top
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml


@dataclass
class EnrichmentEntry:
    """Schema v0.2.2 compliant enrichment history entry.

    Optional fields are omitted from the serialized dict when None
    (see EnrichmentMigrator._entry_to_dict).
    """
    enrichment_date: str                       # ISO-8601 timestamp (UTC)
    enrichment_type: str                       # e.g. WIKIDATA_IDENTIFIER
    enrichment_method: str                     # human-readable method description
    match_score: Optional[float] = None        # 0.0-1.0 confidence, if known
    verified: bool = False                     # True when manually verified
    enrichment_source: Optional[str] = None    # source URL
    enrichment_notes: Optional[str] = None     # free-text details


class EnrichmentMigrator:
    """Migrates old enrichment formats to schema v0.2.2."""

    # Matches unstructured notes such as:
    #   "Wikidata enriched 2025-11-10 (Q549445, match: 84%)"
    WIKIDATA_NOTE_PATTERN = re.compile(
        r'Wikidata enriched (\d{4}-\d{2}-\d{2})\s*\((Q\d+),\s*match:\s*(\d+)%\)'
    )
    # Matches Chile-style batch notes such as:
    #   "Batch 7: SPARQL match - exact name match"
    BATCH_NOTE_PATTERN = re.compile(
        r'Batch (\d+):\s*SPARQL match\s*-\s*(.+)'
    )

    def __init__(self, dry_run: bool = True):
        """
        Args:
            dry_run: when True, migrate_file() only previews changes and
                     writes nothing to disk.
        """
        self.dry_run = dry_run
        self.stats = {
            'processed': 0,
            'migrated': 0,
            'skipped': 0,
            'errors': 0
        }

    def migrate_institution(self, inst: Dict[str, Any]) -> bool:
        """Migrate a single institution's enrichment data in place.

        Tries the four migration paths in priority order (flat provenance
        fields, old enrichment_history, standalone enrichment_method,
        unstructured notes). On success the old fields are removed and a
        schema v0.2.2 `enrichment_history` list is written into provenance.

        Returns:
            True if a migration was performed, False if skipped (no
            provenance, already migrated, or nothing to migrate).
        """
        if 'provenance' not in inst:
            return False

        prov = inst['provenance']

        # Skip records already carrying the new-format history
        # (new entries are distinguished by the 'enrichment_type' key).
        if 'enrichment_history' in prov and isinstance(prov['enrichment_history'], list):
            if prov['enrichment_history'] and 'enrichment_type' in prov['enrichment_history'][0]:
                return False

        new_history: List[EnrichmentEntry] = []

        # Migration Path 1: flat provenance fields (enrichment_batch, wikidata_verified)
        if 'enrichment_batch' in prov or 'wikidata_verified' in prov:
            new_history.extend(self._migrate_flat_provenance(prov, inst))

        # Migration Path 2: old enrichment_history format (identified by 'q_number')
        elif 'enrichment_history' in prov and isinstance(prov['enrichment_history'], list):
            if prov['enrichment_history'] and 'q_number' in prov['enrichment_history'][0]:
                new_history.extend(
                    self._migrate_old_enrichment_history(prov['enrichment_history'])
                )

        # Migration Path 3: standalone enrichment_method field (later batches)
        # NOTE: enrichment_date is optional - extraction_date is the fallback.
        elif 'enrichment_method' in prov:
            new_history.extend(self._migrate_standalone_enrichment(prov, inst))

        # Migration Path 4: parse unstructured notes (only if nothing matched yet)
        if 'notes' in prov and not new_history:
            new_history.extend(self._parse_notes(prov['notes']))

        if new_history:
            # Remove the superseded old-format fields.
            old_fields = ['enrichment_batch', 'enrichment_method', 'enrichment_confidence',
                          'wikidata_verified', 'wikidata_match_confidence',
                          'enrichment_date', 'notes']
            for field in old_fields:
                prov.pop(field, None)

            # Serialize entries as plain dicts for YAML output.
            prov['enrichment_history'] = [self._entry_to_dict(e) for e in new_history]
            return True

        return False

    def _migrate_flat_provenance(self, prov: Dict[str, Any],
                                 inst: Dict[str, Any]) -> List[EnrichmentEntry]:
        """Migrate flat provenance fields to structured history.

        Builds at most one WIKIDATA_IDENTIFIER entry, using the Wikidata
        Q-number from the institution's identifiers and inferring a match
        score from the batch note text when possible.
        """
        entries: List[EnrichmentEntry] = []

        # Extract Wikidata Q-number from identifiers.
        q_number = None
        if 'identifiers' in inst:
            for ident in inst['identifiers']:
                if ident.get('identifier_scheme') == 'Wikidata':
                    q_number = ident.get('identifier_value')
                    break

        # BUGFIX: prefer the old enrichment_date when present — it is popped
        # from provenance after migration, so ignoring it loses the original
        # date. Falls back to extraction_date, then to "now" (same chain as
        # _migrate_standalone_enrichment).
        enrichment_date = prov.get('enrichment_date') or prov.get(
            'extraction_date', datetime.now(timezone.utc).isoformat())

        # Parse match confidence from the batch note, if present.
        match_score = None
        enrichment_notes = None
        if 'notes' in prov:
            notes_text = prov['notes']
            if isinstance(notes_text, list):
                notes_text = ' '.join(str(n) for n in notes_text)

            batch_match = self.BATCH_NOTE_PATTERN.search(notes_text)
            if batch_match:
                batch_num = batch_match.group(1)
                match_type = batch_match.group(2)
                enrichment_notes = f"Batch {batch_num}: {match_type}"

                # Infer match score from the match type wording.
                if 'exact name match' in match_type:
                    match_score = 1.0
                elif 'partial name' in match_type:
                    match_score = 0.85
                elif 'includes full' in match_type:
                    match_score = 0.9

        # Only emit an entry when a Wikidata identifier actually exists.
        if q_number:
            entries.append(EnrichmentEntry(
                enrichment_date=enrichment_date,
                enrichment_type='WIKIDATA_IDENTIFIER',
                enrichment_method=prov.get('enrichment_method', 'Wikidata SPARQL bulk query'),
                match_score=match_score,
                verified=prov.get('wikidata_verified', False),
                enrichment_source='https://www.wikidata.org',
                enrichment_notes=enrichment_notes or f"Matched to Wikidata entity {q_number}"
            ))

        return entries

    def _migrate_old_enrichment_history(
            self, old_history: List[Dict[str, Any]]) -> List[EnrichmentEntry]:
        """Migrate old enrichment_history format to the new schema.

        Old-format entries were produced by manual verification, so each
        migrated entry is marked verified with a high match score.
        """
        entries: List[EnrichmentEntry] = []

        for old_entry in old_history:
            q_number = old_entry.get('q_number')
            verification = old_entry.get('verification', '')

            entries.append(EnrichmentEntry(
                enrichment_date=old_entry.get(
                    'enrichment_date', datetime.now(timezone.utc).isoformat()),
                enrichment_type='WIKIDATA_IDENTIFIER',
                enrichment_method=old_entry.get('enrichment_method', 'Wikidata verification'),
                match_score=0.95,  # Manual verification implies high confidence
                verified=True,     # Old enrichment_history was manually verified
                enrichment_source='https://www.wikidata.org',
                enrichment_notes=(f"Matched to {verification} ({q_number})"
                                  if q_number else verification)
            ))

        return entries

    def _migrate_standalone_enrichment(self, prov: Dict[str, Any],
                                       inst: Dict[str, Any]) -> List[EnrichmentEntry]:
        """Migrate a standalone enrichment_method field (later batches).

        Requires a Wikidata identifier on the institution; otherwise no
        entry is produced.
        """
        entries: List[EnrichmentEntry] = []

        # Extract Wikidata Q-number from identifiers.
        q_number = None
        if 'identifiers' in inst:
            for ident in inst['identifiers']:
                if ident.get('identifier_scheme') == 'Wikidata':
                    q_number = ident.get('identifier_value')
                    break

        if not q_number:
            return entries

        # Map the textual confidence rating onto a numeric score.
        match_confidence = prov.get('wikidata_match_confidence', 'unknown')
        match_score = None
        if match_confidence == 'high':
            match_score = 0.95
        elif match_confidence == 'partial':
            match_score = 0.80
        elif match_confidence == 'medium':
            match_score = 0.75

        # Only high-confidence matches count as verified.
        verified = match_confidence == 'high'

        # Carry any free-text notes through as enrichment_notes.
        enrichment_notes = None
        if 'notes' in prov:
            notes = prov['notes']
            if isinstance(notes, list):
                enrichment_notes = ' '.join(str(n) for n in notes)
            else:
                enrichment_notes = str(notes)

        # Use enrichment_date if available, otherwise fall back to extraction_date.
        enrichment_date = prov.get('enrichment_date') or prov.get(
            'extraction_date', datetime.now(timezone.utc).isoformat())

        entries.append(EnrichmentEntry(
            enrichment_date=enrichment_date,
            enrichment_type='WIKIDATA_IDENTIFIER',
            enrichment_method=prov.get('enrichment_method', 'Wikidata enrichment'),
            match_score=match_score,
            verified=verified,
            enrichment_source='https://www.wikidata.org',
            enrichment_notes=enrichment_notes or f"Matched to Wikidata entity {q_number}"
        ))

        return entries

    def _parse_notes(self, notes: Any) -> List[EnrichmentEntry]:
        """Parse unstructured notes for enrichment information.

        Recognizes the "Wikidata enriched YYYY-MM-DD (Qnnn, match: NN%)"
        pattern. Uses finditer so that notes mentioning several enrichment
        events yield one entry each (single-mention notes behave as before).
        """
        entries: List[EnrichmentEntry] = []

        if not notes:
            return entries

        notes_text = notes if isinstance(notes, str) else ' '.join(str(n) for n in notes)

        for match in self.WIKIDATA_NOTE_PATTERN.finditer(notes_text):
            date_str, q_number, match_pct = match.groups()
            entries.append(EnrichmentEntry(
                enrichment_date=f"{date_str}T12:00:00+00:00",  # Assume midday UTC
                enrichment_type='WIKIDATA_IDENTIFIER',
                enrichment_method='Wikidata SPARQL query with fuzzy matching',
                match_score=int(match_pct) / 100.0,
                verified=False,
                enrichment_source='https://www.wikidata.org',
                enrichment_notes=f"Matched to Wikidata entity {q_number}"
            ))

        return entries

    def _entry_to_dict(self, entry: EnrichmentEntry) -> Dict[str, Any]:
        """Convert EnrichmentEntry to a dict, omitting None values."""
        return {
            k: v for k, v in {
                'enrichment_date': entry.enrichment_date,
                'enrichment_type': entry.enrichment_type,
                'enrichment_method': entry.enrichment_method,
                'match_score': entry.match_score,
                'verified': entry.verified,
                'enrichment_source': entry.enrichment_source,
                'enrichment_notes': entry.enrichment_notes
            }.items() if v is not None
        }

    def migrate_file(self, input_path: Path, output_path: Optional[Path] = None) -> None:
        """Migrate all institutions in a YAML file.

        Supports two layouts: a mapping with an 'institutions' key (plus
        metadata) or a bare list of institutions. In apply mode, a one-time
        backup of the input is made before overwriting.

        Args:
            input_path: YAML file to migrate.
            output_path: where to write the result (defaults to input_path).
        """
        print(f"\n{'[DRY RUN] ' if self.dry_run else ''}Processing: {input_path}")

        with open(input_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # Handle the two known file structures.
        institutions = []
        metadata = None
        if isinstance(data, dict) and 'institutions' in data:
            # Tunisia format: metadata keys alongside the institutions list.
            institutions = data['institutions']
            metadata = {k: v for k, v in data.items() if k != 'institutions'}
        elif isinstance(data, list):
            # Direct list format.
            institutions = data
        else:
            print(f" ⚠️ Unknown file structure, skipping")
            self.stats['errors'] += 1
            return

        # Migrate each institution, printing one example for visibility.
        migrated_count = 0
        for inst in institutions:
            self.stats['processed'] += 1
            if self.migrate_institution(inst):
                migrated_count += 1
                self.stats['migrated'] += 1

                if migrated_count == 1:
                    print(f"\n ✓ Example migration:")
                    print(f" Institution: {inst.get('name', 'Unknown')}")
                    if 'enrichment_history' in inst['provenance']:
                        print(f" New enrichment_history entries: "
                              f"{len(inst['provenance']['enrichment_history'])}")
            else:
                self.stats['skipped'] += 1

        print(f"\n Processed: {len(institutions)} institutions")
        print(f" Migrated: {migrated_count}")
        print(f" Skipped: {len(institutions) - migrated_count}")

        if not self.dry_run:
            output_path = output_path or input_path

            # Create a backup once; never overwrite an existing backup.
            backup_path = input_path.with_suffix('.yaml.pre_v0.2.2_backup')
            if not backup_path.exists():
                shutil.copy2(input_path, backup_path)
                print(f" 📦 Backup created: {backup_path.name}")

            with open(output_path, 'w', encoding='utf-8') as f:
                if metadata:
                    # Stamp the metadata block with the new schema version.
                    if '_metadata' in metadata:
                        metadata['_metadata']['schema_version'] = '0.2.2'
                        metadata['_metadata']['generated'] = datetime.now(timezone.utc).isoformat()
                        if 'enhancements' not in metadata['_metadata']:
                            metadata['_metadata']['enhancements'] = []
                        metadata['_metadata']['enhancements'].append(
                            'Schema v0.2.2 enrichment_history migration')
                    output_data = {**metadata, 'institutions': institutions}
                else:
                    output_data = institutions

                yaml.dump(output_data, f, allow_unicode=True, sort_keys=False, width=120)

            print(f" ✅ Written: {output_path}")


def main():
    """CLI entry point: parse args, migrate the datasets, print a summary."""
    parser = argparse.ArgumentParser(
        description='Migrate old enrichment formats to schema v0.2.2',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    # NOTE: --dry-run is the default; only --apply changes behavior.
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Preview changes without writing (default)')
    parser.add_argument('--apply', action='store_true',
                        help='Apply changes and write files')
    parser.add_argument('--files', nargs='+',
                        help='Specific files to migrate (default: all datasets)')
    args = parser.parse_args()

    # Default dataset files.
    base_path = Path(__file__).parent.parent / 'data' / 'instances'
    default_files = [
        base_path / 'chile' / 'chilean_institutions_batch19_enriched.yaml',
        base_path / 'tunisia' / 'tunisian_institutions_enhanced.yaml',
        base_path / 'algeria' / 'algerian_institutions.yaml',
        base_path / 'libya' / 'libyan_institutions.yaml'
    ]

    files_to_migrate = [Path(f) for f in args.files] if args.files else default_files

    migrator = EnrichmentMigrator(dry_run=not args.apply)

    print("=" * 80)
    print("SCHEMA v0.2.2 ENRICHMENT MIGRATION")
    print("=" * 80)
    print(f"Mode: {'DRY RUN (preview only)' if migrator.dry_run else 'APPLY CHANGES'}")
    print(f"Files: {len(files_to_migrate)}")

    for file_path in files_to_migrate:
        if not file_path.exists():
            print(f"\n⚠️ File not found: {file_path}")
            continue
        migrator.migrate_file(file_path)

    print("\n" + "=" * 80)
    print("MIGRATION SUMMARY")
    print("=" * 80)
    print(f"Total institutions processed: {migrator.stats['processed']}")
    print(f"Migrated to v0.2.2: {migrator.stats['migrated']}")
    print(f"Skipped (already migrated or no enrichment): {migrator.stats['skipped']}")
    print(f"Errors: {migrator.stats['errors']}")

    if migrator.dry_run:
        print("\n⚠️ DRY RUN MODE - No files were modified")
        print("Run with --apply to write changes")
    else:
        print("\n✅ Migration completed")
        print("Backups created with .pre_v0.2.2_backup extension")


if __name__ == '__main__':
    main()