# glam/scripts/scrapers/merge_thueringen_to_german_dataset.py
# 2025-11-21 22:12:33 +01:00
#
# 352 lines
# 13 KiB
# Python
# Executable file
#!/usr/bin/env python3
"""
Merge Thüringen Archives v4.0 (100% Metadata) with German Unified Dataset
Integrates the 149 Thüringen archives from archive-in-thueringen.de with the existing
German unified dataset (20,935 institutions from ISIL + DDB + NRW).
Features:
- Deduplication by name fuzzy matching (>90% similarity)
- Rich metadata extraction (physical addresses, directors, opening hours, archive histories)
- Uses pre-geocoded coordinates from harvest
- Preserves existing data quality (ISIL codes, coordinates)
- Adds Thüringen-specific detailed metadata
Input:
- data/isil/germany/german_institutions_unified_v3_20251120_091059.json (20,935)
- data/isil/germany/thueringen_archives_100percent_20251120_095757.json (149)
Output:
- data/isil/germany/german_institutions_unified_v4_{timestamp}.json
- Merge statistics report
Metadata Coverage v4.0:
- Physical addresses: 100% (vs 0% in v2.0)
- Directors: 96% (vs 0% in v2.0)
- Opening hours: 99.3% (vs 0% in v2.0)
- Archive histories: 84.6% (vs 0% in v2.0)
- Overall completeness: 95.6% (vs 60% in v2.0)
Author: OpenCode AI Agent
Date: 2025-11-20
"""
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from rapidfuzz import fuzz
# Constants
# Minimum rapidfuzz `fuzz.ratio` score (0-100) for two institution names to be
# considered the same institution during deduplication (see find_duplicate).
FUZZY_MATCH_THRESHOLD = 90.0 # 90% similarity for deduplication
def load_json(filepath: Path) -> dict:
    """Read *filepath* as UTF-8 and return the parsed JSON document."""
    return json.loads(filepath.read_text(encoding='utf-8'))
def save_json(data: dict, filepath: Path):
    """Write *data* to *filepath* as pretty-printed UTF-8 JSON (non-ASCII kept)."""
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    filepath.write_text(serialized, encoding='utf-8')
def normalize_name(name: str) -> str:
    """Lower-case *name* and strip archive-type words for fuzzy comparison.

    NOTE(review): not referenced anywhere in this script — candidate for
    removal or for use inside find_duplicate; confirm before deleting.
    """
    # Order matters: the compound words fall first, then the bare 'archiv'.
    result = name.lower()
    for token in ('stadtarchiv', 'kreisarchiv', 'staatsarchiv', 'archiv'):
        result = result.replace(token, '')
    return result.strip()
def find_duplicate(
    institution: dict,
    existing_institutions: List[dict],
    threshold: float = FUZZY_MATCH_THRESHOLD
) -> Optional[dict]:
    """Return the first existing institution whose name fuzzy-matches.

    Args:
        institution: Candidate record with a 'name' and optional 'city'.
        existing_institutions: Records already in the unified dataset.
        threshold: Minimum rapidfuzz ratio (0-100) to consider a match.

    Returns:
        The matching existing record, or None when nothing matches.
    """
    candidate_name = institution['name'].lower()
    candidate_city = (institution.get('city') or '').lower()
    for record in existing_institutions:
        score = fuzz.ratio(candidate_name, record['name'].lower())
        if score < threshold:
            continue
        # A near-perfect name match is accepted on its own ...
        if score >= 95:
            return record
        # ... otherwise the first listed city must agree too.
        if candidate_city:
            locations = record.get('locations')
            if locations:
                existing_city = locations[0].get('city')
                if existing_city and existing_city.lower() == candidate_city:
                    return record
    return None
def _build_location(archive: dict) -> dict:
    """Assemble the single location entry (city/region/address/coordinates)."""
    location = {
        "city": archive.get('city'),
        "region": archive.get('region', 'Thüringen'),
        "country": archive.get('country', 'DE')
    }
    # Prefer the physical address; fall back to the postal address (v4.0 fields).
    address = archive.get('physical_address') or archive.get('postal_address')
    if address:
        if address.get('street'):
            location['street_address'] = address['street']
        if address.get('postal_code'):
            location['postal_code'] = address['postal_code']
    # `is not None` (not truthiness) so a legitimate 0.0 coordinate is kept.
    if archive.get('latitude') is not None and archive.get('longitude') is not None:
        location['latitude'] = archive['latitude']
        location['longitude'] = archive['longitude']
    return location


def _build_identifiers(archive: dict) -> List[dict]:
    """Collect website and portal detail-page identifiers that are present."""
    identifiers = []
    if archive.get('website'):
        identifiers.append({
            "identifier_scheme": "Website",
            "identifier_value": archive['website'],
            "identifier_url": archive['website']
        })
    if archive.get('url'):
        identifiers.append({
            "identifier_scheme": "Portal",
            "identifier_value": archive['url'],
            "identifier_url": archive['url']
        })
    return identifiers


def _build_contact(archive: dict) -> dict:
    """Collect the contact channels (email/phone/fax/website) that are present."""
    contact = {}
    for key in ('email', 'phone', 'fax', 'website'):
        if archive.get(key):
            contact[key] = archive[key]
    return contact


def convert_thueringen_to_german_format(thueringen_archive: dict) -> dict:
    """Convert a Thüringen v4.0 harvest record to the German unified format.

    Input (harvest record, 95.6% metadata completeness), e.g.:
        {"id": "thueringen-81", "name": "Stadtarchiv Ohrdruf",
         "institution_type": "ARCHIVE", "city": "Ohrdruf",
         "latitude": 50.827008, "longitude": 10.731950,
         "email": ..., "phone": ..., "website": ..., "url": ...,
         "physical_address": {...}, "postal_address": {...},
         "opening_hours": ..., "director": ..., "collection_size": ...,
         "temporal_coverage": ..., "archive_history": ...,
         "source_portal": "archive-in-thueringen.de", "provenance": {...}}

    Output (unified record): W3ID id, 'locations' list with one entry,
    'identifiers', 'source_portals', 'provenance', plus optional 'contact',
    'administrative', 'collections', and 'description' sections that are
    added only when the harvest supplied data for them.

    Args:
        thueringen_archive: One record from the Thüringen harvest; must
            carry 'id', 'name', 'institution_type', and
            provenance['extraction_date'].

    Returns:
        A dict in the German unified dataset schema.
    """
    # NOTE(review): harvest ids already look like "thueringen-81", so this URI
    # ends in ".../thueringen-thueringen-81". Kept as-is to preserve existing
    # identifiers — confirm whether the doubled prefix is intended.
    w3id = f"https://w3id.org/heritage/custodian/de/thueringen-{thueringen_archive['id']}"

    location = _build_location(thueringen_archive)
    identifiers = _build_identifiers(thueringen_archive)
    contact = _build_contact(thueringen_archive)

    # Administrative info (NEW in v4.0): director and opening hours.
    administrative = {}
    if thueringen_archive.get('director'):
        administrative['director'] = thueringen_archive['director']
    if thueringen_archive.get('opening_hours'):
        administrative['opening_hours'] = thueringen_archive['opening_hours']

    # Collections metadata (NEW in v4.0): holdings size and time span.
    collections = []
    if thueringen_archive.get('collection_size') or thueringen_archive.get('temporal_coverage'):
        collection = {}
        if thueringen_archive.get('collection_size'):
            collection['collection_size'] = thueringen_archive['collection_size']
        if thueringen_archive.get('temporal_coverage'):
            collection['temporal_coverage'] = thueringen_archive['temporal_coverage']
        collections.append(collection)

    # Archive history becomes the description (84.6% coverage); truncate to
    # the first 2000 characters to keep records a manageable size.
    description = None
    if thueringen_archive.get('archive_history'):
        history = thueringen_archive['archive_history']
        description = history[:2000] + '...' if len(history) > 2000 else history

    record = {
        "id": w3id,
        "name": thueringen_archive['name'],
        "institution_type": thueringen_archive['institution_type'],
        "locations": [location],
        "identifiers": identifiers,
        "source_portals": [thueringen_archive.get('source_portal', 'archive-in-thueringen.de')],
        "provenance": {
            "data_source": "WEB_SCRAPING",
            "data_tier": "TIER_2_VERIFIED",
            "extraction_date": thueringen_archive['provenance']['extraction_date'],
            "extraction_method": "Thüringen archives portal v4.0 (100% metadata goal) + fuzzy deduplication",
            "source_url": thueringen_archive.get('url', ''),
            "confidence_score": 0.95
        }
    }

    # Optional sections are attached only when they carry data, so sparse
    # harvest records do not produce empty dicts/lists in the dataset.
    if contact:
        record['contact'] = contact
    if administrative:
        record['administrative'] = administrative
    if collections:
        record['collections'] = collections
    if description:
        record['description'] = description
    return record
def merge_thueringen_archives():
    """Merge the Thüringen harvest into the German unified dataset (v3 → v4).

    Loads the v3 unified dataset and the Thüringen v4.0 harvest from
    hard-coded paths, skips archives that fuzzy-match an existing
    institution, converts the rest to the unified schema, refreshes the
    dataset metadata, and saves a timestamped v4 output file while
    printing a statistics report.
    """
    print("🔀 Merging Thüringen Archives into German Unified Dataset")
    print("=" * 70)
    print()
    # Load datasets
    print("📁 Loading datasets...")
    german_unified_path = Path("data/isil/germany/german_institutions_unified_v3_20251120_091059.json")
    thueringen_harvest_path = Path("data/isil/germany/thueringen_archives_100percent_20251120_095757.json")
    german_data = load_json(german_unified_path)
    thueringen_data = load_json(thueringen_harvest_path)
    # Labels fixed: the input dataset is v3 and the output is v4
    # (earlier messages said v2/v3, contradicting the file paths above).
    print(f" German unified v3: {len(german_data['institutions'])} institutions")
    print(f" Thüringen harvest: {len(thueringen_data['archives'])} archives")
    print()
    # Statistics collected during the merge
    stats = {
        "duplicates_found": 0,
        "new_additions": 0,
        "geocoded_count": 0,
        "thueringen_total": len(thueringen_data['archives'])
    }
    # Process Thüringen archives
    print("🔍 Deduplicating and merging...")
    for thueringen_archive in thueringen_data['archives']:
        duplicate = find_duplicate(thueringen_archive, german_data['institutions'])
        if duplicate:
            stats["duplicates_found"] += 1
            # Separator added so the harvest name and the matched name are readable.
            print(f" ⏭️ SKIP (duplicate): {thueringen_archive['name']} → {duplicate['name']}")
        else:
            # Convert to the unified schema and append to the dataset
            german_record = convert_thueringen_to_german_format(thueringen_archive)
            german_data['institutions'].append(german_record)
            stats["new_additions"] += 1
            if german_record['locations'][0].get('latitude') is not None:
                stats["geocoded_count"] += 1
            print(f" ✅ ADD: {german_record['name']} ({german_record['locations'][0].get('city', 'no city')})")
    print()

    def pct(part: int, whole: int) -> float:
        # Guard against an empty harvest / zero additions (no ZeroDivisionError).
        return part / whole * 100 if whole else 0.0

    print("=" * 70)
    print("📊 Merge Statistics")
    print("=" * 70)
    total = stats['thueringen_total']
    print(f" Thüringen archives processed: {total}")
    print(f" Duplicates (skipped): {stats['duplicates_found']} ({pct(stats['duplicates_found'], total):.1f}%)")
    print(f" New additions: {stats['new_additions']} ({pct(stats['new_additions'], total):.1f}%)")
    print(f" With coordinates: {stats['geocoded_count']}/{stats['new_additions']} ({pct(stats['geocoded_count'], stats['new_additions']):.1f}%)")
    print()
    print(f" German dataset v3 (before): {len(german_data['institutions']) - stats['new_additions']}")
    print(f" German dataset v4 (after): {len(german_data['institutions'])}")
    print(f" Growth: +{stats['new_additions']} institutions")
    print()
    # Update metadata
    german_data['metadata'] = {
        "version": "v4.0",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "total_institutions": len(german_data['institutions']),
        "sources": ["ISIL Registry", "DDB SPARQL", "NRW Archives Portal", "Thüringen Archives Portal v4.0 (95.6% metadata completeness)"],
        "merge_statistics": stats,
        "thueringen_v4_features": {
            "physical_addresses": "100%",
            "directors": "96%",
            "opening_hours": "99.3%",
            "archive_histories": "84.6%",
            "overall_completeness": "95.6%"
        }
    }
    # Save merged dataset with a timestamped filename
    output_path = Path(f"data/isil/germany/german_institutions_unified_v4_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json")
    save_json(german_data, output_path)
    print(f"💾 Saved: {output_path}")
    print(f" File size: {output_path.stat().st_size / 1024 / 1024:.1f} MB")
    print()
    print("✅ Merge complete!")
# Script entry point: run the one-shot merge when executed directly.
if __name__ == '__main__':
    merge_thueringen_archives()