glam/scripts/merge_sachsen_anhalt_complete.py
2025-11-21 22:12:33 +01:00

178 lines
6.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Merge Sachsen-Anhalt Enriched Museums + Archives
Creates complete Sachsen-Anhalt dataset with full metadata
"""
import json
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
def convert_archive_to_linkml(archive: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a flat archive record into the LinkML institution format.

    Maps the flat scraper fields (name, city, address_text, url, email,
    phone, ...) onto the nested LinkML shape used for museums, so both
    sources can be merged into one dataset.

    Args:
        archive: Flat archive record; all fields are optional.

    Returns:
        A LinkML-shaped dict with name, institution_type, description,
        a single location entry, an identifiers list, and provenance.
    """
    record: Dict[str, Any] = {
        "name": archive.get("name", ""),
        "institution_type": archive.get("institution_type", "ARCHIVE"),
        # The free-text address doubles as the description for archives.
        "description": archive.get("address_text", ""),
        "locations": [
            {
                "city": archive.get("city", ""),
                "region": archive.get("region", "Sachsen-Anhalt"),
                "country": archive.get("country", "DE"),
            }
        ],
        "identifiers": [],
        "provenance": archive.get("provenance", {}),
    }
    # Contact details become identifier entries, in a fixed order
    # (Website, Email, Phone). The URL is used verbatim as the
    # identifier URL; email and phone get a mailto:/tel: URI prefix.
    contact_fields = (
        ("url", "Website", ""),
        ("email", "Email", "mailto:"),
        ("phone", "Phone", "tel:"),
    )
    for field, scheme, uri_prefix in contact_fields:
        value = archive.get(field)
        if value:
            record["identifiers"].append({
                "identifier_scheme": scheme,
                "identifier_value": value,
                "identifier_url": f"{uri_prefix}{value}",
            })
    return record
def _load_latest(data_dir: Path, pattern: str, loading_label: str,
                 missing_msg: str) -> Optional[Any]:
    """Load the newest JSON file in *data_dir* matching *pattern*.

    "Newest" relies on the timestamp suffix in the filenames sorting
    lexicographically (reverse sort, take the first).

    Returns the parsed JSON payload, or None (after printing
    *missing_msg*) when no file matches.
    """
    candidates = sorted(data_dir.glob(pattern), reverse=True)
    if not candidates:
        print(missing_msg)
        return None
    path = candidates[0]
    print(f"Loading {loading_label}: {path.name}")
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _normalize_archives(archive_data: Any) -> List[Dict[str, Any]]:
    """Convert the raw archive payload into LinkML records.

    Accepts either a dict wrapper with an 'archives' list or a bare
    list of records; anything else yields an empty list.
    """
    if isinstance(archive_data, dict) and 'archives' in archive_data:
        raw = archive_data['archives']
    elif isinstance(archive_data, list):
        raw = archive_data
    else:
        raw = []
    return [convert_archive_to_linkml(a) for a in raw]


def _print_completeness(merged: List[Dict[str, Any]], total: int) -> None:
    """Print per-field completeness counts and percentages.

    *total* must be > 0 (the caller guards against an empty merge).
    """
    def count(pred) -> int:
        return sum(1 for inst in merged if pred(inst))

    def loc_has(field: str) -> int:
        # A record counts if any of its locations has a truthy value
        # for *field*.
        return count(lambda inst: bool(inst.get('locations'))
                     and any(loc.get(field) for loc in inst['locations']))

    def ident_has(scheme: str) -> int:
        # .get() instead of ['identifier_scheme'] so a malformed
        # identifier entry cannot raise KeyError.
        return count(lambda inst: bool(inst.get('identifiers'))
                     and any(i.get('identifier_scheme') == scheme
                             for i in inst['identifiers']))

    rows = [
        (" Name:", count(lambda inst: bool(inst.get('name')))),
        (" Type:", count(lambda inst: bool(inst.get('institution_type')))),
        (" Description:", count(lambda inst: bool(inst.get('description')))),
        (" City:", loc_has('city')),
        (" Street Address:", loc_has('street_address')),
        (" Postal Code:", loc_has('postal_code')),
        (" Website:", ident_has('Website')),
        (" Phone:", ident_has('Phone')),
        (" Email:", ident_has('Email')),
    ]
    print("Data Completeness:")
    for label, n in rows:
        print(f"{label} {n:3d}/{total} ({n/total*100:5.1f}%)")


def _print_breakdowns(merged: List[Dict[str, Any]]) -> None:
    """Print the institution-type distribution and city coverage."""
    type_counts = Counter(
        inst.get('institution_type', 'UNKNOWN') for inst in merged
    )
    print("Institution Types:")
    for itype, count in type_counts.most_common():
        print(f" {itype:20s}: {count:3d}")
    print()
    city_counts = Counter(
        loc.get('city', '')
        for inst in merged
        for loc in (inst.get('locations') or [])
        if loc.get('city', '')
    )
    print(f"Geographic Coverage: {len(city_counts)} cities")
    print()
    print("Top 20 Cities:")
    for city, count in city_counts.most_common(20):
        print(f" {city:35s}: {count:2d}")
    print()


def _save(merged: List[Dict[str, Any]], data_dir: Path) -> None:
    """Write the merged dataset to a timestamped JSON file and report it."""
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = data_dir / f'sachsen_anhalt_complete_{timestamp}.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    file_size_kb = output_path.stat().st_size / 1024
    print(f"✅ Saved to: {output_path}")
    print(f" File size: {file_size_kb:.1f} KB")
    print(f" Total institutions: {len(merged)}")


def main():
    """Merge the newest enriched-museum and archive JSON files for
    Sachsen-Anhalt into one LinkML dataset, print coverage statistics,
    and save the result as a timestamped file.
    """
    print("=" * 80)
    print("Merge Sachsen-Anhalt Complete Dataset")
    print("=" * 80)
    print()
    data_dir = Path('data/isil/germany')
    museums = _load_latest(
        data_dir, 'sachsen_anhalt_museums_enriched_*.json',
        'museums', '❌ No enriched museum files found')
    if museums is None:
        return
    archive_data = _load_latest(
        data_dir, 'sachsen_anhalt_archives_*.json',
        'archives', '❌ No archive files found')
    if archive_data is None:
        return
    archives = _normalize_archives(archive_data)
    print()
    print("Loaded:")
    print(f" Museums: {len(museums)}")
    print(f" Archives: {len(archives)}")
    print()
    merged = museums + archives
    total = len(merged)
    print(f"Total institutions: {total}")
    print()
    # Bug fix: the percentage formatting divided by zero when both
    # inputs parsed to empty lists; bail out gracefully instead.
    if total == 0:
        print("❌ Nothing to merge; aborting")
        return
    _print_completeness(merged, total)
    print()
    _print_breakdowns(merged)
    _save(merged, data_dir)
    print()
    print("=" * 80)
    print("Sachsen-Anhalt Complete Dataset Ready!")
    print("=" * 80)
# Script entry point.
if __name__ == "__main__":
    main()