361 lines
12 KiB
Python
Executable file
361 lines
12 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Belgium EU institutions with Wikidata identifiers.
|
|
|
|
Belgium dataset consists of 7 EU institutions in Brussels (0% Wikidata coverage).
|
|
All are well-documented EU bodies with likely Wikidata entries.
|
|
|
|
Strategy:
|
|
1. Load Belgium institutions from master dataset
|
|
2. Query Wikidata for EU institutions (P31=Q43229, P17=Q29999)
|
|
3. Fuzzy match names
|
|
4. Apply high-confidence matches (>0.85)
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
from datetime import datetime, timezone
|
|
import time
|
|
import yaml
|
|
from difflib import SequenceMatcher
|
|
import re
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy comparison.

    Lowercases, strips common EU prefixes and trailing institution-type
    words, replaces punctuation with spaces, and collapses whitespace.
    """
    lowered = name.lower()

    # Drop a leading "european"/"eu" and a trailing institution-type word
    # so e.g. "European Parliament Library" and "Parliament" compare closely.
    lowered = re.sub(r'^(european|eu)\s+', '', lowered)
    lowered = re.sub(r'\s+(library|archive|archives|committee|commission|parliament|council)$', '', lowered)

    # Punctuation becomes whitespace, then runs of whitespace collapse.
    lowered = re.sub(r'[^\w\s]', ' ', lowered)
    return ' '.join(lowered.split())
|
|
|
|
|
|
def similarity_score(name1: str, name2: str) -> float:
    """Return a 0-1 fuzzy similarity ratio between two normalized names."""
    return SequenceMatcher(
        None, normalize_name(name1), normalize_name(name2)
    ).ratio()
|
|
|
|
|
|
def query_wikidata_eu_institutions(sparql: SPARQLWrapper) -> dict[str, dict[str, Any]]:
    """
    Query Wikidata for EU institutions and their heritage units.

    Queries for:
    - EU institutions (Q43229)
    - EU agencies (Q1338914)
    - Located in Belgium (Q31) or EU (Q458)

    Returns a mapping of QID -> institution record with optional ISIL,
    VIAF, coordinates, website and inception date fields.
    """

    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?isil ?viaf ?coords ?website ?inception ?typeLabel
    WHERE {
      VALUES ?type { wd:Q43229 wd:Q1338914 wd:Q7075 wd:Q166118 }

      ?item wdt:P31 ?type .  # instance of EU institution/library/archive

      # Must be located in Belgium or EU
      { ?item wdt:P17 wd:Q31 . }   # country: Belgium
      UNION
      { ?item wdt:P17 wd:Q458 . }  # country: European Union
      UNION
      { ?item wdt:P131 wd:Q239 . } # located in: Brussels

      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "en,fr,nl,de". }
    }
    ORDER BY ?itemLabel
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(SPARQL_JSON)

    print("🔍 Querying Wikidata for EU institutions...")
    response = sparql.query().convert()

    def opt(binding: dict[str, Any], key: str, default: Any = None) -> Any:
        # Pull an OPTIONAL binding's value, falling back when absent.
        return binding.get(key, {}).get('value', default)

    institutions: dict[str, dict[str, Any]] = {}
    for binding in response['results']['bindings']:
        # The entity URI ends with the QID, e.g. .../entity/Q8889.
        qid = binding['item']['value'].rsplit('/', 1)[-1]
        institutions[qid] = {
            'qid': qid,
            'name': binding['itemLabel']['value'],
            'description': opt(binding, 'itemDescription', ''),
            'isil': opt(binding, 'isil'),
            'viaf': opt(binding, 'viaf'),
            'website': opt(binding, 'website'),
            # Keep only the date part of the xsd:dateTime literal.
            'inception': opt(binding, 'inception', '').split('T')[0],
            'type': opt(binding, 'typeLabel', ''),
            'coords': opt(binding, 'coords'),
        }

    print(f"✅ Found {len(institutions)} EU institutions in Wikidata")
    return institutions
|
|
|
|
|
|
def match_institution(
    inst: dict[str, Any],
    wikidata_institutions: dict[str, dict[str, Any]],
    threshold: float = 0.85
) -> Optional[dict[str, Any]]:
    """
    Match a local institution to Wikidata using fuzzy name matching.

    Returns best match if score > threshold, else None.
    """
    local_name = inst.get('name', '')
    if not local_name:
        # Nothing to match against without a name.
        return None

    top_score = 0.0
    top_match: Optional[dict[str, Any]] = None

    # Track the single highest-scoring Wikidata candidate.
    for candidate in wikidata_institutions.values():
        score = similarity_score(local_name, candidate['name'])
        if score > top_score:
            top_score, top_match = score, candidate

    return top_match if top_score >= threshold else None
|
|
|
|
|
|
def enrich_institution(
    inst: dict[str, Any],
    wd_match: dict[str, Any]
) -> dict[str, Any]:
    """Merge Wikidata-derived data into an institution record (in place).

    Adds Wikidata/VIAF/ISIL/Website identifiers (skipping any scheme the
    record already carries), fills in missing coordinates and description,
    and appends an entry to the record's enrichment provenance history.

    Args:
        inst: Institution record from the master dataset; mutated in place.
        wd_match: Matched Wikidata record as produced by
            query_wikidata_eu_institutions().

    Returns:
        The same (mutated) institution record.
    """
    identifiers = inst.get('identifiers', [])
    added_schemes: list[str] = []  # schemes actually added, for provenance

    def _has_scheme(scheme: str) -> bool:
        # True when the record already has an identifier of this scheme.
        return any(
            entry.get('identifier_scheme') == scheme for entry in identifiers
        )

    def _add(scheme: str, value: Any, url: Optional[str]) -> None:
        # Append an identifier unless the scheme is already present.
        if _has_scheme(scheme):
            return
        entry: dict[str, Any] = {
            'identifier_scheme': scheme,
            'identifier_value': value,
        }
        if url is not None:
            entry['identifier_url'] = url
        identifiers.append(entry)
        added_schemes.append(scheme)

    _add('Wikidata', wd_match['qid'],
         f"https://www.wikidata.org/wiki/{wd_match['qid']}")
    if wd_match.get('viaf'):
        _add('VIAF', wd_match['viaf'],
             f"https://viaf.org/viaf/{wd_match['viaf']}")
    if wd_match.get('isil'):
        # ISIL codes have no universal resolver URL, so none is recorded.
        _add('ISIL', wd_match['isil'], None)
    if wd_match.get('website'):
        _add('Website', wd_match['website'], wd_match['website'])

    inst['identifiers'] = identifiers

    # Parse the WKT "Point(lon lat)" literal and fill in coordinates, but
    # only when a location exists and has no latitude yet.
    if wd_match.get('coords'):
        coords = wd_match['coords'].replace('Point(', '').replace(')', '').split()
        lon, lat = float(coords[0]), float(coords[1])

        locations = inst.get('locations', [])
        if locations and not locations[0].get('latitude'):
            locations[0]['latitude'] = lat
            locations[0]['longitude'] = lon

    # Prefer any existing description; only borrow Wikidata's when missing.
    if wd_match.get('description') and not inst.get('description'):
        inst['description'] = wd_match['description']

    # Record provenance. Unlike the previous version, identifiers_added now
    # reflects exactly which schemes were appended in this run (previously
    # 'Wikidata' was always claimed and 'Website' never listed).
    provenance = inst.get('provenance', {})
    provenance.setdefault('enrichment_history', []).append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Wikidata SPARQL query + fuzzy name matching',
        'identifiers_added': added_schemes,
        'verified': True
    })
    inst['provenance'] = provenance

    return inst
|
|
|
|
|
|
def main():
    """Main enrichment workflow.

    Loads the master dataset, selects Belgian institutions that lack a
    Wikidata identifier, fuzzy-matches them against EU institutions queried
    from Wikidata, then writes enriched records and unmatched records to
    separate YAML files under data/instances/belgium/.
    """

    # Paths — this script lives in a subdirectory of the project root.
    project_root = Path(__file__).parent.parent
    master_file = project_root / 'data' / 'instances' / 'all' / 'globalglam-20251111.yaml'
    output_dir = project_root / 'data' / 'instances' / 'belgium'
    output_dir.mkdir(parents=True, exist_ok=True)

    print("🇧🇪 Belgium EU Institutions Enrichment")
    print("=" * 70)

    # Load master dataset
    print(f"📖 Loading master dataset from {master_file.name}...")
    with open(master_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # The master file may be a bare list of records or a mapping with an
    # 'institutions' key — accept both shapes.
    institutions = data if isinstance(data, list) else data.get('institutions', [])
    print(f"✅ Loaded {len(institutions)} total institutions")

    # Filter Belgium institutions without Wikidata: at least one location
    # in BE and no existing identifier with scheme 'Wikidata'.
    be_institutions = [
        i for i in institutions
        if i.get('locations')
        and any(loc.get('country') == 'BE' for loc in i.get('locations', []))
        and not any(
            id.get('identifier_scheme') == 'Wikidata'
            for id in i.get('identifiers', [])
        )
    ]

    print(f"🎯 Found {len(be_institutions)} Belgium institutions without Wikidata")

    if not be_institutions:
        # Nothing to do — early exit also guards the percentage division below.
        print("✅ All Belgium institutions already have Wikidata IDs!")
        return

    # Initialize SPARQL endpoint (custom User-Agent per Wikimedia policy).
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Data-Extraction/0.2.1 (https://github.com/your-repo)")

    # Query Wikidata
    wd_institutions = query_wikidata_eu_institutions(sparql)
    time.sleep(1)  # Rate limiting

    # Match and enrich
    enriched = []
    unmatched = []

    print("\n🔗 Matching institutions...")
    print("-" * 70)

    for inst in be_institutions:
        name = inst.get('name', 'UNKNOWN')

        # Try fuzzy matching against the Wikidata candidates.
        match = match_institution(inst, wd_institutions, threshold=0.85)

        if match:
            print(f"✅ MATCHED: {name}")
            print(f"   → Wikidata: {match['name']} ({match['qid']})")
            # NOTE: recomputes the similarity purely for display.
            print(f"   → Confidence: {similarity_score(name, match['name']):.2%}")

            enriched_inst = enrich_institution(inst, match)
            enriched.append(enriched_inst)
        else:
            print(f"❌ NO MATCH: {name}")
            unmatched.append(inst)

    # Summary
    print("\n" + "=" * 70)
    print(f"📊 Enrichment Summary")
    print("=" * 70)
    print(f"✅ Matched: {len(enriched)}/{len(be_institutions)} ({len(enriched)/len(be_institutions)*100:.1f}%)")
    print(f"❌ Unmatched: {len(unmatched)}")

    # Save enriched dataset (only written when at least one match succeeded).
    if enriched:
        output_file = output_dir / 'belgium_institutions_enriched.yaml'

        output_data = {
            '_metadata': {
                'generated': datetime.now(timezone.utc).isoformat(),
                'project': 'GLAM Data Extraction',
                'schema_version': 'v0.2.1',
                'country': 'BE',
                'description': 'Belgium EU institutions enriched with Wikidata',
                'enrichment_method': 'Wikidata SPARQL + fuzzy matching',
                'total_institutions': len(enriched),
                # Count of enriched records that now carry a Wikidata identifier.
                'wikidata_coverage': sum(1 for i in enriched if any(
                    id.get('identifier_scheme') == 'Wikidata'
                    for id in i.get('identifiers', [])
                ))
            },
            'institutions': enriched
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(output_data, f, allow_unicode=True, sort_keys=False, width=120)

        print(f"\n💾 Saved {len(enriched)} enriched institutions to:")
        print(f"   {output_file}")

    # Save unmatched for manual review
    if unmatched:
        unmatched_file = output_dir / 'belgium_unmatched.yaml'

        with open(unmatched_file, 'w', encoding='utf-8') as f:
            yaml.dump(unmatched, f, allow_unicode=True, sort_keys=False, width=120)

        print(f"\n⚠️ Saved {len(unmatched)} unmatched institutions to:")
        print(f"   {unmatched_file}")
        print(f"   → Manual review recommended")


if __name__ == '__main__':
    main()
|