glam/scripts/migrate_to_schema_v0.2.2_enrichment.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

456 lines
18 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Migration Script: Old Enrichment Format → Schema v0.2.2 EnrichmentHistoryEntry
Converts three different old enrichment formats to schema v0.2.2 compliant
enrichment_history structure.
OLD FORMATS:
1. Provenance flat fields (most common - 117 instances in Chile):
provenance:
enrichment_batch: 7
enrichment_method: SPARQL_BULK_QUERY
wikidata_verified: true
notes:
- "Batch 7: SPARQL match - exact name match"
2. Old enrichment_history (10 instances in Chile):
enrichment_history:
- enrichment_date: "2025-11-09T18:10:41.851904+00:00"
enrichment_method: "Chilean Batch 5 - University + museum Wikidata verification"
enrichment_batch: batch_5
q_number: Q3551323
verification: "Universidad Arturo Prat, public university..."
3. Unstructured notes (Tunisia, Algeria, Libya):
notes: "Wikidata enriched 2025-11-10 (Q549445, match: 84%)..."
NEW FORMAT (schema v0.2.2):
enrichment_history:
- enrichment_date: "2025-11-10T14:30:00+00:00"
enrichment_type: WIKIDATA_IDENTIFIER
enrichment_method: "Wikidata SPARQL query with fuzzy matching"
match_score: 0.84
verified: false
enrichment_source: "https://www.wikidata.org"
enrichment_notes: "Matched to '...' (Q549445)"
DATASETS TO MIGRATE:
- Chile: 90 institutions (71 with Wikidata)
- Tunisia: 68 institutions
- Algeria: 19 institutions
- Libya: 24 institutions
Total: 201 institutions
USAGE:
python scripts/migrate_to_schema_v0.2.2_enrichment.py --dry-run
python scripts/migrate_to_schema_v0.2.2_enrichment.py --apply
"""
import argparse
import re
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from dataclasses import dataclass
@dataclass
class EnrichmentEntry:
    """One schema v0.2.2 compliant ``enrichment_history`` entry.

    Required fields come first; the optional ones default to ``None`` and
    are dropped entirely when the entry is serialized to a plain dict.
    """
    enrichment_date: str                     # ISO-8601 timestamp of the enrichment
    enrichment_type: str                     # e.g. 'WIKIDATA_IDENTIFIER'
    enrichment_method: str                   # human-readable description of the method
    match_score: Optional[float] = None      # confidence in [0.0, 1.0], when known
    verified: bool = False                   # True only for manually verified matches
    enrichment_source: Optional[str] = None  # URL of the enrichment data source
    enrichment_notes: Optional[str] = None   # free-text notes about the match
class EnrichmentMigrator:
    """Migrates old enrichment formats to schema v0.2.2.

    Four migration paths are tried per institution (in order):
      1. Flat provenance fields (``enrichment_batch`` / ``wikidata_verified``)
      2. Old-style ``enrichment_history`` entries carrying a ``q_number``
      3. A standalone ``enrichment_method`` field (later batches)
      4. Unstructured "Wikidata enriched ..." notes

    Construct with ``dry_run=False`` to actually rewrite files; a one-time
    backup (``.pre_v0.2.2_backup``) is created next to each input file.
    """

    # Regex patterns for parsing unstructured notes
    # e.g. "Wikidata enriched 2025-11-10 (Q549445, match: 84%)"
    WIKIDATA_NOTE_PATTERN = re.compile(
        r'Wikidata enriched (\d{4}-\d{2}-\d{2})\s*\((Q\d+),\s*match:\s*(\d+)%\)'
    )
    # e.g. "Batch 7: SPARQL match - exact name match"
    BATCH_NOTE_PATTERN = re.compile(
        r'Batch (\d+):\s*SPARQL match\s*-\s*(.+)'
    )

    def __init__(self, dry_run: bool = True):
        self.dry_run = dry_run
        # Running counters reported in the final summary.
        self.stats = {
            'processed': 0,
            'migrated': 0,
            'skipped': 0,
            'errors': 0
        }

    def migrate_institution(self, inst: Dict[str, Any]) -> bool:
        """
        Migrate a single institution's enrichment data in place.

        Returns True if migration was performed, False if skipped
        (no provenance, already migrated, or no recognizable old format).
        """
        if 'provenance' not in inst:
            return False
        prov = inst['provenance']

        # Check if already has new format enrichment_history
        if 'enrichment_history' in prov and isinstance(prov['enrichment_history'], list):
            if prov['enrichment_history'] and 'enrichment_type' in prov['enrichment_history'][0]:
                # Already in new format
                return False

        # Initialize new enrichment_history list
        new_history: List["EnrichmentEntry"] = []

        # Migration Path 1: Flat provenance fields (enrichment_batch, wikidata_verified)
        # NOTE: if an institution has BOTH flat fields and an old-style history,
        # only Path 1 runs and the old history is replaced, matching the
        # original precedence — confirm against the datasets before changing.
        if 'enrichment_batch' in prov or 'wikidata_verified' in prov:
            new_history.extend(self._migrate_flat_provenance(prov, inst))
        # Migration Path 2: Old enrichment_history format
        elif 'enrichment_history' in prov and isinstance(prov['enrichment_history'], list):
            if prov['enrichment_history'] and 'q_number' in prov['enrichment_history'][0]:
                new_history.extend(self._migrate_old_enrichment_history(prov['enrichment_history']))
        # Migration Path 3: Standalone enrichment_method field (later batches)
        # NOTE: enrichment_date is optional - we use extraction_date as fallback
        elif 'enrichment_method' in prov:
            new_history.extend(self._migrate_standalone_enrichment(prov, inst))

        # Migration Path 4: Parse unstructured notes (only when nothing matched above)
        if 'notes' in prov and not new_history:
            new_history.extend(self._parse_notes(prov['notes']))

        # Apply migration if entries were created
        if new_history:
            # Remove old fields (their content has been folded into the entries)
            old_fields = ['enrichment_batch', 'enrichment_method', 'enrichment_confidence',
                          'wikidata_verified', 'wikidata_match_confidence', 'enrichment_date', 'notes']
            for field in old_fields:
                prov.pop(field, None)
            # Add new enrichment_history (convert EnrichmentEntry objects to dicts)
            prov['enrichment_history'] = [self._entry_to_dict(e) for e in new_history]
            return True
        return False

    def _migrate_flat_provenance(self, prov: Dict[str, Any], inst: Dict[str, Any]) -> List["EnrichmentEntry"]:
        """Migrate flat provenance fields to structured history.

        Produces at most one WIKIDATA_IDENTIFIER entry, and only when the
        institution actually carries a Wikidata identifier.
        """
        entries = []
        # Extract Wikidata Q-number from identifiers
        q_number = None
        if 'identifiers' in inst:
            for ident in inst['identifiers']:
                if ident.get('identifier_scheme') == 'Wikidata':
                    q_number = ident.get('identifier_value')
                    break
        # Use extraction_date as the enrichment date; fall back to "now"
        enrichment_date = prov.get('extraction_date', datetime.now(timezone.utc).isoformat())
        # Parse match confidence from notes
        match_score = None
        enrichment_notes = None
        if 'notes' in prov:
            notes_text = prov['notes']
            if isinstance(notes_text, list):
                notes_text = ' '.join(str(n) for n in notes_text)
            # Extract batch number and match type from "Batch N: SPARQL match - ..."
            batch_match = self.BATCH_NOTE_PATTERN.search(notes_text)
            if batch_match:
                batch_num = batch_match.group(1)
                match_type = batch_match.group(2)
                enrichment_notes = f"Batch {batch_num}: {match_type}"
                # Infer a numeric match score from the free-text match type
                if 'exact name match' in match_type:
                    match_score = 1.0
                elif 'partial name' in match_type:
                    match_score = 0.85
                elif 'includes full' in match_type:
                    match_score = 0.9
        # Create Wikidata enrichment entry
        if q_number:
            entries.append(EnrichmentEntry(
                enrichment_date=enrichment_date,
                enrichment_type='WIKIDATA_IDENTIFIER',
                enrichment_method=prov.get('enrichment_method', 'Wikidata SPARQL bulk query'),
                match_score=match_score,
                verified=prov.get('wikidata_verified', False),
                enrichment_source='https://www.wikidata.org',
                enrichment_notes=enrichment_notes or f"Matched to Wikidata entity {q_number}"
            ))
        return entries

    def _migrate_old_enrichment_history(self, old_history: List[Dict[str, Any]]) -> List["EnrichmentEntry"]:
        """Migrate old enrichment_history format (q_number + verification) to new schema."""
        entries = []
        for old_entry in old_history:
            q_number = old_entry.get('q_number')
            verification = old_entry.get('verification', '')
            entries.append(EnrichmentEntry(
                enrichment_date=old_entry.get('enrichment_date', datetime.now(timezone.utc).isoformat()),
                enrichment_type='WIKIDATA_IDENTIFIER',
                enrichment_method=old_entry.get('enrichment_method', 'Wikidata verification'),
                match_score=0.95,  # Manual verification implies high confidence
                verified=True,     # Old enrichment_history was manually verified
                enrichment_source='https://www.wikidata.org',
                enrichment_notes=f"Matched to {verification} ({q_number})" if q_number else verification
            ))
        return entries

    def _migrate_standalone_enrichment(self, prov: Dict[str, Any], inst: Dict[str, Any]) -> List["EnrichmentEntry"]:
        """Migrate a standalone enrichment_method field (later batches).

        Skips institutions that carry no Wikidata identifier.
        """
        entries = []
        # Extract Wikidata Q-number from identifiers
        q_number = None
        if 'identifiers' in inst:
            for ident in inst['identifiers']:
                if ident.get('identifier_scheme') == 'Wikidata':
                    q_number = ident.get('identifier_value')
                    break
        if not q_number:
            return entries
        # Map the textual confidence level to a numeric score
        match_confidence = prov.get('wikidata_match_confidence', 'unknown')
        match_score = None
        if match_confidence == 'high':
            match_score = 0.95
        elif match_confidence == 'partial':
            match_score = 0.80
        elif match_confidence == 'medium':
            match_score = 0.75
        # Determine if verified (high confidence = verified)
        verified = match_confidence == 'high'
        # Extract enrichment notes from notes field (list or scalar)
        enrichment_notes = None
        if 'notes' in prov:
            notes = prov['notes']
            if isinstance(notes, list):
                enrichment_notes = ' '.join(str(n) for n in notes)
            else:
                enrichment_notes = str(notes)
        # Use enrichment_date if available, otherwise fall back to extraction_date
        enrichment_date = prov.get('enrichment_date') or prov.get('extraction_date', datetime.now(timezone.utc).isoformat())
        entries.append(EnrichmentEntry(
            enrichment_date=enrichment_date,
            enrichment_type='WIKIDATA_IDENTIFIER',
            enrichment_method=prov.get('enrichment_method', 'Wikidata enrichment'),
            match_score=match_score,
            verified=verified,
            enrichment_source='https://www.wikidata.org',
            enrichment_notes=enrichment_notes or f"Matched to Wikidata entity {q_number}"
        ))
        return entries

    def _parse_notes(self, notes: Any) -> List["EnrichmentEntry"]:
        """Parse unstructured notes for enrichment information.

        FIX: uses finditer so that EVERY "Wikidata enriched ..." mention in
        the notes yields an entry; previously only the first match was
        converted and any further enrichments were silently dropped.
        """
        entries = []
        if not notes:
            return entries
        notes_text = notes if isinstance(notes, str) else ' '.join(str(n) for n in notes)
        # Pattern: "Wikidata enriched 2025-11-10 (Q549445, match: 84%)"
        for match in self.WIKIDATA_NOTE_PATTERN.finditer(notes_text):
            date_str, q_number, match_pct = match.groups()
            entries.append(EnrichmentEntry(
                enrichment_date=f"{date_str}T12:00:00+00:00",  # Assume midday UTC
                enrichment_type='WIKIDATA_IDENTIFIER',
                enrichment_method='Wikidata SPARQL query with fuzzy matching',
                match_score=int(match_pct) / 100.0,
                verified=False,
                enrichment_source='https://www.wikidata.org',
                enrichment_notes=f"Matched to Wikidata entity {q_number}"
            ))
        return entries

    def _entry_to_dict(self, entry: "EnrichmentEntry") -> Dict[str, Any]:
        """Convert EnrichmentEntry to dict, omitting None values (verified=False is kept)."""
        return {
            k: v for k, v in {
                'enrichment_date': entry.enrichment_date,
                'enrichment_type': entry.enrichment_type,
                'enrichment_method': entry.enrichment_method,
                'match_score': entry.match_score,
                'verified': entry.verified,
                'enrichment_source': entry.enrichment_source,
                'enrichment_notes': entry.enrichment_notes
            }.items() if v is not None
        }

    def migrate_file(self, input_path: Path, output_path: Optional[Path] = None) -> None:
        """Migrate all institutions in a YAML file.

        Supports two layouts: a mapping with an 'institutions' key (plus
        metadata), or a bare list of institutions. Unless dry_run is set,
        the result is written back in place (or to output_path) after a
        one-time backup is created.
        """
        print(f"\n{'[DRY RUN] ' if self.dry_run else ''}Processing: {input_path}")
        # Read file
        with open(input_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # Handle different file structures
        institutions = []
        metadata = None
        if isinstance(data, dict) and 'institutions' in data:
            # Tunisia format with metadata
            # FIX: guard against a present-but-null 'institutions:' key,
            # which yaml.safe_load returns as None and would raise TypeError
            institutions = data['institutions'] or []
            metadata = {k: v for k, v in data.items() if k != 'institutions'}
        elif isinstance(data, list):
            # Direct list format
            institutions = data
        else:
            print(f" ⚠️ Unknown file structure, skipping")
            self.stats['errors'] += 1
            return

        # Migrate each institution
        migrated_count = 0
        for inst in institutions:
            self.stats['processed'] += 1
            if self.migrate_institution(inst):
                migrated_count += 1
                self.stats['migrated'] += 1
                # Show one example per file so the operator can eyeball the result
                if migrated_count == 1:
                    print(f"\n ✓ Example migration:")
                    print(f" Institution: {inst.get('name', 'Unknown')}")
                    if 'enrichment_history' in inst['provenance']:
                        print(f" New enrichment_history entries: {len(inst['provenance']['enrichment_history'])}")
            else:
                self.stats['skipped'] += 1

        print(f"\n Processed: {len(institutions)} institutions")
        print(f" Migrated: {migrated_count}")
        print(f" Skipped: {len(institutions) - migrated_count}")

        # Write output
        if not self.dry_run:
            output_path = output_path or input_path
            # Create backup (only once; never overwrite an existing backup)
            backup_path = input_path.with_suffix('.yaml.pre_v0.2.2_backup')
            if not backup_path.exists():
                import shutil
                shutil.copy2(input_path, backup_path)
                print(f" 📦 Backup created: {backup_path.name}")
            # Write migrated data
            with open(output_path, 'w', encoding='utf-8') as f:
                if metadata:
                    # Update metadata to record the schema bump
                    if '_metadata' in metadata:
                        metadata['_metadata']['schema_version'] = '0.2.2'
                        metadata['_metadata']['generated'] = datetime.now(timezone.utc).isoformat()
                        if 'enhancements' not in metadata['_metadata']:
                            metadata['_metadata']['enhancements'] = []
                        metadata['_metadata']['enhancements'].append('Schema v0.2.2 enrichment_history migration')
                    output_data = {**metadata, 'institutions': institutions}
                else:
                    output_data = institutions
                yaml.dump(output_data, f, allow_unicode=True, sort_keys=False, width=120)
            print(f" ✅ Written: {output_path}")
def main():
    """CLI entry point: parse arguments, run the migration, print a summary."""
    parser = argparse.ArgumentParser(
        description='Migrate old enrichment formats to schema v0.2.2',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument('--dry-run', action='store_true', default=True,
                        help='Preview changes without writing (default)')
    parser.add_argument('--apply', action='store_true',
                        help='Apply changes and write files')
    parser.add_argument('--files', nargs='+',
                        help='Specific files to migrate (default: all datasets)')
    opts = parser.parse_args()

    # Default dataset files
    instances_dir = Path(__file__).parent.parent / 'data' / 'instances'
    dataset_files = [
        instances_dir / 'chile' / 'chilean_institutions_batch19_enriched.yaml',
        instances_dir / 'tunisia' / 'tunisian_institutions_enhanced.yaml',
        instances_dir / 'algeria' / 'algerian_institutions.yaml',
        instances_dir / 'libya' / 'libyan_institutions.yaml'
    ]
    targets = [Path(f) for f in opts.files] if opts.files else dataset_files

    # Create migrator; passing --apply disables dry-run
    migrator = EnrichmentMigrator(dry_run=not opts.apply)

    banner = "=" * 80
    print(banner)
    print("SCHEMA v0.2.2 ENRICHMENT MIGRATION")
    print(banner)
    print(f"Mode: {'DRY RUN (preview only)' if migrator.dry_run else 'APPLY CHANGES'}")
    print(f"Files: {len(targets)}")

    # Migrate each target file, warning about missing ones
    for target in targets:
        if target.exists():
            migrator.migrate_file(target)
        else:
            print(f"\n⚠️ File not found: {target}")

    # Summary
    print("\n" + banner)
    print("MIGRATION SUMMARY")
    print(banner)
    print(f"Total institutions processed: {migrator.stats['processed']}")
    print(f"Migrated to v0.2.2: {migrator.stats['migrated']}")
    print(f"Skipped (already migrated or no enrichment): {migrator.stats['skipped']}")
    print(f"Errors: {migrator.stats['errors']}")

    if not migrator.dry_run:
        print("\n✅ Migration completed")
        print("Backups created with .pre_v0.2.2_backup extension")
    else:
        print("\n⚠️ DRY RUN MODE - No files were modified")
        print("Run with --apply to write changes")


if __name__ == '__main__':
    main()