glam/scripts/enrich_luxembourg_manual.py

#!/usr/bin/env python3
"""
Manual enrichment for Luxembourg institutions (Phase 1 - Final Country)

Enriches Court of Justice of the European Union with:
- Wikidata Q-number (Q4951)
- VIAF identifier (124913422)
- Enhanced description
- Additional alternative names
"""

import yaml
from datetime import datetime, timezone
from pathlib import Path

# Dataset file that main() loads, enriches, and then overwrites in place.
# The path is relative, so it resolves against the current working directory.
UNIFIED_DATASET = Path("data/instances/all/globalglam-20251111.yaml")
# Destination of the copy main() takes of UNIFIED_DATASET before modifying it.
BACKUP_PATH = Path("data/instances/all/globalglam-20251111.yaml.backup")


def load_yaml(filepath: Path) -> list:
    """Read the UTF-8 YAML file at ``filepath`` and return the parsed document.

    Parsing uses ``yaml.safe_load``, so only plain YAML tags are constructed.
    """
    with open(filepath, encoding='utf-8') as stream:
        parsed = yaml.safe_load(stream)
    return parsed


def save_yaml(data: list, filepath: Path):
    """Serialize ``data`` as YAML into ``filepath`` (UTF-8), replacing its contents.

    Non-ASCII characters are written verbatim, mapping keys keep their
    insertion order, and lines are wrapped at 120 columns.
    """
    dump_options = {
        'allow_unicode': True,
        'sort_keys': False,
        'width': 120,
    }
    with open(filepath, mode='w', encoding='utf-8') as stream:
        yaml.dump(data, stream, **dump_options)


def enrich_luxembourg_institutions(data: list) -> tuple[list, int]:
    """
    Enrich Luxembourg institutions with Wikidata and VIAF identifiers.

    Every record in ``data`` whose ``id`` equals ``EUR-CURIA0001`` (Court of
    Justice of the European Union) is updated in place:

    - ``description`` and ``alternative_names`` are overwritten.
    - A Wikidata identifier is appended unless one with scheme ``Wikidata``
      is already present.
    - Both VIAF clusters are appended unless an identifier with scheme
      ``VIAF`` is already present.
    - ``provenance.enrichment_notes`` and ``provenance.last_enriched`` are
      set; other provenance keys are left untouched.

    Records without an ``identifiers`` or ``provenance`` entry (or with a
    null one) get a fresh list / mapping instead of raising.

    Args:
        data: List of institution mappings. The list and the matching
            records are mutated in place.

    Returns:
        Tuple of (enriched_data, count_enriched), where ``enriched_data`` is
        the same list object that was passed in.
    """
    enriched_count = 0

    # Enrichment data for Court of Justice of the European Union
    cjeu_enrichment = {
        'id': 'EUR-CURIA0001',
        'wikidata': 'Q4951',
        'viaf': '124913422',
        'viaf_alt': '140116137',  # Alternative VIAF cluster
        'alternative_names': [
            'CJEU',
            'CJUE',
            'CURIA',
            'Court of Justice of the European Communities',
            'CJEC',
            'Gerichtshof der Europäischen Union',
            'Cour de justice de l\'Union européenne'
        ],
        'description': (
            'The Court of Justice of the European Union (CJEU) is the highest '
            'judicial authority in the European Union, consisting of the Court '
            'of Justice and the General Court. Founded in 1952 as the Court of '
            'Justice of the European Communities. The CJEU ensures the uniform '
            'interpretation and application of EU law across all member states. '
            'The Court\'s library holds over 340,000 bibliographic records, '
            'including more than 80,000 concerning European Union law, making it '
            'one of the most complete law libraries in the world regarding EU law. '
            'Archives held at Historical Archives of the European Union (HAEU) in Florence, Italy.'
        )
    }

    def has_scheme(identifiers: list, scheme: str) -> bool:
        """Return True if any identifier entry uses the given scheme."""
        return any(i.get('identifier_scheme') == scheme for i in identifiers)

    for institution in data:
        if institution.get('id') != cjeu_enrichment['id']:
            continue

        # Fall back to the id so a record without a name does not abort the run.
        print(f"Enriching: {institution.get('name', cjeu_enrichment['id'])}")

        # Update description
        institution['description'] = cjeu_enrichment['description']

        # Update alternative names (copied so the record never shares a list
        # object with the enrichment template or with another record)
        institution['alternative_names'] = list(cjeu_enrichment['alternative_names'])

        # Make sure there is a list to append to; a missing or null
        # 'identifiers' entry would otherwise raise on the appends below.
        identifiers = institution.get('identifiers')
        if identifiers is None:
            identifiers = institution['identifiers'] = []

        # Add Wikidata identifier
        if not has_scheme(identifiers, 'Wikidata'):
            identifiers.append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': cjeu_enrichment['wikidata'],
                'identifier_url': f"https://www.wikidata.org/wiki/{cjeu_enrichment['wikidata']}"
            })
            print(f"  + Added Wikidata: {cjeu_enrichment['wikidata']}")

        # Add VIAF identifiers (both clusters)
        if not has_scheme(identifiers, 'VIAF'):
            # Primary VIAF cluster
            identifiers.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': cjeu_enrichment['viaf'],
                'identifier_url': f"https://viaf.org/viaf/{cjeu_enrichment['viaf']}"
            })
            print(f"  + Added VIAF: {cjeu_enrichment['viaf']}")

            # Alternative VIAF cluster (for merged records)
            identifiers.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': cjeu_enrichment['viaf_alt'],
                'identifier_url': f"https://viaf.org/viaf/{cjeu_enrichment['viaf_alt']}",
                'notes': 'Alternative VIAF cluster for earlier institutional form'
            })
            print(f"  + Added VIAF (alt): {cjeu_enrichment['viaf_alt']}")

        # Update provenance (create the mapping when missing or null)
        provenance = institution.get('provenance')
        if provenance is None:
            provenance = institution['provenance'] = {}

        # One timestamp for both fields so the note date and last_enriched
        # cannot disagree across a midnight boundary.
        now = datetime.now(timezone.utc)

        # The stored Wikidata value already carries its "Q" prefix, so it is
        # interpolated as-is (prefixing another "Q" produced "QQ4951").
        enrichment_note = (
            f"Wikidata {cjeu_enrichment['wikidata']} and VIAF {cjeu_enrichment['viaf']} "
            f"added via manual research (wikidata.org verification). Enhanced description "
            f"includes library holdings (340k+ records) and archival information (HAEU Florence). "
            f"Phase 1 final country enrichment completed {now.strftime('%Y-%m-%d')}."
        )

        provenance['enrichment_notes'] = enrichment_note
        provenance['last_enriched'] = now.isoformat()

        enriched_count += 1
        print("  ✓ Enrichment complete")

    return data, enriched_count


def main():
    """Main enrichment workflow.

    Steps: back up UNIFIED_DATASET to BACKUP_PATH, load the dataset, enrich
    the matching institution records, write the result back over
    UNIFIED_DATASET, and print a summary.

    Raises:
        SystemExit: If UNIFIED_DATASET does not exist, or if it does not
            contain a YAML list. Nothing is written in either case.
    """
    import shutil

    print("=" * 70)
    print("Luxembourg Institution Enrichment - Phase 1 Final Country")
    print("=" * 70)
    print()

    # Stop early with a clear message instead of announcing a backup that is
    # never taken and then failing with a traceback while loading.
    if not UNIFIED_DATASET.exists():
        raise SystemExit(f"Dataset not found: {UNIFIED_DATASET}")

    # Backup unified dataset
    # NOTE: BACKUP_PATH is fixed, so each run overwrites the previous backup.
    print(f"Creating backup: {BACKUP_PATH}")
    shutil.copy(UNIFIED_DATASET, BACKUP_PATH)
    print("✓ Backup created")

    # Load data
    print(f"\nLoading: {UNIFIED_DATASET}")
    data = load_yaml(UNIFIED_DATASET)
    # An empty file parses to None and other documents may parse to a mapping;
    # refuse both before the dataset gets overwritten below.
    if not isinstance(data, list):
        raise SystemExit(
            f"Expected a YAML list of institutions in {UNIFIED_DATASET}, "
            f"got {type(data).__name__}"
        )
    print(f"✓ Loaded {len(data):,} institutions")

    # Enrich Luxembourg institutions
    print("\n" + "-" * 70)
    print("Enriching Luxembourg Institutions")
    print("-" * 70)
    enriched_data, enriched_count = enrich_luxembourg_institutions(data)

    # Save enriched data
    print("\n" + "-" * 70)
    print(f"Saving enriched dataset: {UNIFIED_DATASET}")
    save_yaml(enriched_data, UNIFIED_DATASET)
    print(f"✓ Saved {len(enriched_data):,} institutions")

    # Summary
    print("\n" + "=" * 70)
    print("ENRICHMENT COMPLETE")
    print("=" * 70)
    print(f"Luxembourg institutions enriched: {enriched_count}")
    print(f"Total institutions in dataset: {len(enriched_data):,}")
    print()
    # The per-country figures below are fixed text, not computed from the data.
    print("Phase 1 Complete - All 5 countries enriched:")
    print("  ✓ Georgia (GE) - 14 institutions")
    print("  ✓ Great Britain (GB) - 4 institutions")
    print("  ✓ Belgium (BE) - 7 institutions")
    print("  ✓ United States (US) - 7 institutions")
    print("  ✓ Luxembourg (LU) - 1 institution")
    print()
    print("Total Phase 1 enriched: 33 institutions")
    print("=" * 70)


# Run the enrichment workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()