#!/usr/bin/env python3
"""
Enrich Existing Thüringen Records with v4.0 Metadata

Updates the 140 existing Thüringen institutions in the German dataset
with rich metadata from the v4.0 harvest (95.6% completeness).

Strategy:
- Match by fuzzy name matching (>90% similarity)
- ENRICH existing records instead of skipping
- Add contact, administrative, collections, description fields
- Preserve existing ISIL codes and identifiers

Input:
- data/isil/germany/german_institutions_unified_v4_20251120_113920.json (20,944)
- data/isil/germany/thueringen_archives_100percent_20251120_095757.json (149)

Output:
- data/isil/germany/german_institutions_unified_v4_enriched_{timestamp}.json
- Enrichment statistics report

Expected Enrichment:
- 140 existing Thüringen records updated with v4.0 metadata
- Fields added: contact, administrative, collections, description
- 9 new records already added in previous merge

Author: OpenCode AI Agent
Date: 2025-11-20
"""

import copy
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

try:
    from rapidfuzz import fuzz
except ImportError:
    # Stdlib fallback so the module stays importable without rapidfuzz.
    # difflib's ratio() is 0..1; rapidfuzz's fuzz.ratio() is 0..100, so we
    # scale to keep FUZZY_MATCH_THRESHOLD semantics identical.
    from difflib import SequenceMatcher

    class fuzz:  # type: ignore[no-redef]
        """Minimal stand-in exposing rapidfuzz's ``fuzz.ratio`` (0-100 scale)."""

        @staticmethod
        def ratio(a: str, b: str) -> float:
            return SequenceMatcher(None, a, b).ratio() * 100.0


# Constants
FUZZY_MATCH_THRESHOLD = 90.0  # 90% similarity for matching


def load_json(filepath: Path) -> dict:
    """Load JSON file."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_json(data: dict, filepath: Path):
    """Save JSON file (UTF-8, human-readable indentation, umlauts kept literal)."""
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def find_matching_harvest_record(
    institution: dict,
    harvest_archives: List[dict],
    threshold: float = FUZZY_MATCH_THRESHOLD
) -> Optional[dict]:
    """
    Find matching archive in Thüringen harvest by fuzzy name matching.

    A candidate must clear ``threshold`` on name similarity AND either share
    the institution's city or score >= 95 (high-confidence name-only match).
    Among qualifying candidates the highest-scoring one wins.

    Args:
        institution: Institution from German dataset
        harvest_archives: List of Thüringen harvest archives
        threshold: Minimum similarity score (0-100)

    Returns:
        Matching harvest record or None
    """
    name = institution['name']
    city = None
    if institution.get('locations'):
        city = institution['locations'][0].get('city')

    best_match = None
    best_score = 0

    for harvest_arch in harvest_archives:
        # Fuzzy match on name (case-insensitive)
        name_similarity = fuzz.ratio(name.lower(), harvest_arch['name'].lower())

        # Bonus if city matches
        city_match = False
        if city and harvest_arch.get('city'):
            if city.lower() == harvest_arch['city'].lower():
                city_match = True

        # Match threshold
        if name_similarity >= threshold:
            if city_match or name_similarity >= 95:  # High confidence
                if name_similarity > best_score:
                    best_match = harvest_arch
                    best_score = name_similarity

    return best_match if best_score >= threshold else None


def enrich_institution_with_harvest_data(institution: dict, harvest_record: dict) -> dict:
    """
    Enrich existing institution record with v4.0 harvest metadata.

    Adds/updates:
    - Physical address (street, postal code)
    - Contact (email, phone, fax, website)
    - Administrative (director, opening hours)
    - Collections (size, temporal coverage)
    - Description (archive history)

    The input ``institution`` is NOT mutated: a deep copy is enriched and
    returned (a shallow copy would leak nested-dict mutations — e.g. the
    street address written into ``locations[0]`` — back into the caller's
    record).
    """
    enriched = copy.deepcopy(institution)

    # Update location with physical address; existing values are never
    # overwritten, only filled in when missing.
    physical_addr = harvest_record.get('physical_address') or harvest_record.get('postal_address')
    if physical_addr and enriched.get('locations'):
        location = enriched['locations'][0]
        if physical_addr.get('street') and not location.get('street_address'):
            location['street_address'] = physical_addr['street']
        if physical_addr.get('postal_code') and not location.get('postal_code'):
            location['postal_code'] = physical_addr['postal_code']

    # Add contact info (only keys present in the harvest record)
    contact = {}
    if harvest_record.get('email'):
        contact['email'] = harvest_record['email']
    if harvest_record.get('phone'):
        contact['phone'] = harvest_record['phone']
    if harvest_record.get('fax'):
        contact['fax'] = harvest_record['fax']
    if harvest_record.get('website'):
        contact['website'] = harvest_record['website']
    if contact:
        enriched['contact'] = contact

    # Add administrative info
    administrative = {}
    if harvest_record.get('director'):
        administrative['director'] = harvest_record['director']
    if harvest_record.get('opening_hours'):
        administrative['opening_hours'] = harvest_record['opening_hours']
    if administrative:
        enriched['administrative'] = administrative

    # Add collections metadata
    collections = []
    if harvest_record.get('collection_size') or harvest_record.get('temporal_coverage'):
        collection = {}
        if harvest_record.get('collection_size'):
            collection['collection_size'] = harvest_record['collection_size']
        if harvest_record.get('temporal_coverage'):
            collection['temporal_coverage'] = harvest_record['temporal_coverage']
        collections.append(collection)
    if collections:
        enriched['collections'] = collections

    # Add archive history as description (never overwrite an existing one);
    # cap at 2000 chars so one verbose history cannot bloat the dataset.
    if harvest_record.get('archive_history') and not enriched.get('description'):
        history = harvest_record['archive_history']
        enriched['description'] = (history[:2000] + '...') if len(history) > 2000 else history

    # Update provenance to reflect enrichment
    if enriched.get('provenance'):
        enriched['provenance']['enrichment_date'] = datetime.now(timezone.utc).isoformat()
        enriched['provenance']['enrichment_source'] = 'Thüringen archives v4.0 (95.6% completeness)'

    return enriched


def enrich_existing_thueringen_records():
    """Main enrichment logic: load both datasets, enrich Thüringen matches, save."""
    print("🔬 Enriching Existing Thüringen Records with v4.0 Metadata")
    print("=" * 70)
    print()

    # Load datasets
    print("📁 Loading datasets...")
    german_unified_path = Path("data/isil/germany/german_institutions_unified_v4_20251120_113920.json")
    thueringen_harvest_path = Path("data/isil/germany/thueringen_archives_100percent_20251120_095757.json")

    german_data = load_json(german_unified_path)
    thueringen_data = load_json(thueringen_harvest_path)

    print(f" German unified v4: {len(german_data['institutions'])} institutions")
    print(f" Thüringen harvest v4.0: {len(thueringen_data['archives'])} archives")
    print()

    # Statistics accumulated over the full pass
    stats = {
        "total_checked": 0,
        "matches_found": 0,
        "enriched_count": 0,
        "fields_added": {
            "contact": 0,
            "administrative": 0,
            "collections": 0,
            "description": 0
        }
    }

    # Process institutions
    print("🔍 Matching and enriching records...")
    print()

    enriched_institutions = []
    for institution in german_data['institutions']:
        stats["total_checked"] += 1

        # Check if institution might be from Thüringen (by region name,
        # accepting both the umlaut and ASCII spellings)
        is_thueringen = False
        if institution.get('locations'):
            region = institution['locations'][0].get('region', '')
            if 'thüringen' in region.lower() or 'thuringen' in region.lower():
                is_thueringen = True

        # Also check source portals
        if institution.get('source_portals'):
            if 'archive-in-thueringen.de' in institution['source_portals']:
                is_thueringen = True

        # If not Thüringen, keep as-is
        if not is_thueringen:
            enriched_institutions.append(institution)
            continue

        # Find matching harvest record
        harvest_match = find_matching_harvest_record(institution, thueringen_data['archives'])

        if harvest_match:
            stats["matches_found"] += 1

            # Snapshot which sections exist before enrichment so we only
            # count fields that were genuinely added
            has_contact_before = bool(institution.get('contact'))
            has_admin_before = bool(institution.get('administrative'))
            has_colls_before = bool(institution.get('collections'))
            has_desc_before = bool(institution.get('description'))

            # Enrich record
            enriched = enrich_institution_with_harvest_data(institution, harvest_match)

            # Track what was added
            if not has_contact_before and enriched.get('contact'):
                stats["fields_added"]["contact"] += 1
            if not has_admin_before and enriched.get('administrative'):
                stats["fields_added"]["administrative"] += 1
            if not has_colls_before and enriched.get('collections'):
                stats["fields_added"]["collections"] += 1
            if not has_desc_before and enriched.get('description'):
                stats["fields_added"]["description"] += 1

            enriched_institutions.append(enriched)
            stats["enriched_count"] += 1
            print(f" ✅ ENRICHED: {institution['name']}")
        else:
            enriched_institutions.append(institution)

    print()
    print("=" * 70)
    print("📊 Enrichment Statistics")
    print("=" * 70)
    print(f" Total institutions checked: {stats['total_checked']}")
    print(f" Thüringen matches found: {stats['matches_found']}")
    print(f" Records enriched: {stats['enriched_count']}")
    print()
    print(" Fields added:")
    print(f" Contact metadata: {stats['fields_added']['contact']} records")
    print(f" Administrative metadata: {stats['fields_added']['administrative']} records")
    print(f" Collections metadata: {stats['fields_added']['collections']} records")
    print(f" Descriptions: {stats['fields_added']['description']} records")
    print()

    # Update metadata
    german_data['institutions'] = enriched_institutions
    german_data['metadata'] = {
        "version": "v4.0-enriched",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "total_institutions": len(enriched_institutions),
        "sources": [
            "ISIL Registry",
            "DDB SPARQL",
            "NRW Archives Portal",
            "Thüringen Archives Portal v4.0 (95.6% metadata completeness)"
        ],
        "enrichment_statistics": stats,
        "thueringen_v4_features": {
            "physical_addresses": "100%",
            "directors": "96%",
            "opening_hours": "99.3%",
            "archive_histories": "84.6%",
            "overall_completeness": "95.6%"
        }
    }

    # Save enriched dataset (timestamped so reruns never clobber output)
    output_path = Path(f"data/isil/germany/german_institutions_unified_v4_enriched_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json")
    save_json(german_data, output_path)

    print(f"💾 Saved: {output_path}")
    print(f" File size: {output_path.stat().st_size / 1024 / 1024:.1f} MB")
    print()
    print("✅ Enrichment complete!")


if __name__ == '__main__':
    enrich_existing_thueringen_records()