- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
244 lines
8.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Great Britain Heritage Institutions Enrichment - Batch 1
|
|
==========================================================
|
|
|
|
Strategy: Fuzzy name matching with Wikidata SPARQL queries
|
|
Threshold: 0.85 (same as Georgia Batch 1)
|
|
|
|
Target: 4 GB institutions (0% current coverage)
|
|
Goal: Achieve 50%+ Wikidata coverage
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from SPARQLWrapper import SPARQLWrapper, JSON
|
|
from rapidfuzz import fuzz
|
|
import time
|
|
|
|
# Wikidata SPARQL endpoint
|
|
WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"
|
|
|
|
def query_wikidata_gb_institutions():
    """Query the Wikidata SPARQL endpoint for British heritage institutions.

    Fetches archives, research databases/institutes/centers, and universities
    (via P31/P279* subclass closure) located in the United Kingdom (P17=Q145),
    together with optional enrichment fields: coordinates (P625), website
    (P856), VIAF (P214), ISIL (P791), and inception date (P571).

    Returns:
        list: SPARQL JSON result bindings (one dict per row), or an empty
        list if the query fails for any reason.
    """
    sparql = SPARQLWrapper(WIKIDATA_ENDPOINT)

    # NOTE(review): label languages "en,ar" look copied from another batch
    # script — Arabic is an odd fallback for GB institutions; "en" (perhaps
    # plus "cy,gd" for Welsh/Gaelic) would fit better. TODO confirm.
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemAltLabel ?coord ?website ?viaf ?isil ?inception WHERE {
      # Archives, research centers, databases in Great Britain
      VALUES ?type {
        wd:Q166118    # archives
        wd:Q21045422  # research database
        wd:Q31855     # research institute
        wd:Q7315155   # research center
        wd:Q3918      # university (for university-based archives/research centers)
      }

      ?item wdt:P31/wdt:P279* ?type .
      ?item wdt:P17 wd:Q145 .  # Country: United Kingdom

      OPTIONAL { ?item wdt:P625 ?coord }
      OPTIONAL { ?item wdt:P856 ?website }
      OPTIONAL { ?item wdt:P214 ?viaf }
      OPTIONAL { ?item wdt:P791 ?isil }
      OPTIONAL { ?item wdt:P571 ?inception }

      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en,ar" .
        ?item rdfs:label ?itemLabel .
        ?item skos:altLabel ?itemAltLabel .
      }
    }
    LIMIT 1000
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    print("🔍 Querying Wikidata for British heritage institutions...")
    try:
        results = sparql.query().convert()
        institutions = results['results']['bindings']
        print(f" ✅ Found {len(institutions)} British institutions in Wikidata\n")
        return institutions
    except Exception as e:
        # Broad catch is deliberate: any endpoint/network failure degrades to
        # an empty result so main() can still run and report zero matches.
        print(f" ❌ Query failed: {e}")
        return []
|
|
|
|
def fuzzy_match_institutions(our_institutions, wikidata_institutions, threshold=0.85):
    """Match our institutions to Wikidata entries via fuzzy name matching.

    Each of our institution names is compared against every Wikidata row's
    main label AND each alternative label. The Wikidata label service joins
    all alt labels into one comma-separated string; scoring the joined blob
    (as a naive implementation would) depresses similarity scores, so the
    string is split back into individual labels before scoring.

    Args:
        our_institutions: list of institution dicts, each expected to carry
            a 'name' key (missing names are treated as empty strings).
        wikidata_institutions: SPARQL JSON bindings as returned by
            query_wikidata_gb_institutions().
        threshold: minimum similarity (0-1 scale) to accept a match.

    Returns:
        list of dicts with keys 'institution' (our dict), 'wikidata'
        (the matched binding), and 'score' (0-1).
    """
    matches = []

    for our_inst in our_institutions:
        display_name = our_inst.get('name', '')
        our_name = display_name.lower()
        best_match = None
        best_score = 0

        for wd_inst in wikidata_institutions:
            # Candidate names: main label plus every individual alt label.
            candidates = [wd_inst.get('itemLabel', {}).get('value', '').lower()]
            if 'itemAltLabel' in wd_inst:
                # Label service concatenates alt labels with ", " — split so
                # each alternative is scored on its own.
                candidates.extend(
                    alt.strip().lower()
                    for alt in wd_inst['itemAltLabel']['value'].split(',')
                )

            for candidate in candidates:
                score = fuzz.ratio(our_name, candidate)
                if score > best_score:
                    best_score = score
                    best_match = wd_inst

        if best_score >= threshold * 100:  # rapidfuzz returns 0-100
            matches.append({
                'institution': our_inst,
                'wikidata': best_match,
                'score': best_score / 100
            })
            print(f" ✅ Match (score={best_score/100:.2f}): {display_name}")
            print(f" → {best_match['itemLabel']['value']} ({best_match['item']['value'].split('/')[-1]})")
        else:
            print(f" ❌ No match: {display_name} (best score: {best_score/100:.2f})")

    return matches
|
|
|
|
def _add_identifier(institution, scheme, value, url=None):
    """Append an identifier entry to institution unless the scheme exists.

    Creates the 'identifiers' list on first use; skips silently when an
    entry with the same identifier_scheme is already present.
    """
    identifiers = institution.setdefault('identifiers', [])
    if any(i.get('identifier_scheme') == scheme for i in identifiers):
        return
    entry = {'identifier_scheme': scheme, 'identifier_value': value}
    if url is not None:
        entry['identifier_url'] = url
    identifiers.append(entry)


def enrich_with_wikidata(institution, wikidata_data, match_score):
    """Add Wikidata identifiers and metadata to an institution (in place).

    Args:
        institution: institution dict from our dataset; mutated in place.
        wikidata_data: one SPARQL JSON binding for the matched Wikidata item
            (must contain 'item'; 'viaf'/'isil'/'coord'/'inception' optional).
        match_score: fuzzy-match score (0-1), recorded in provenance notes.

    Returns:
        dict: the same (mutated) institution, for chaining convenience.
    """
    q_id = wikidata_data['item']['value'].split('/')[-1]

    # Identifiers: Wikidata Q-id, then VIAF / ISIL when the query had them.
    # Duplicate-scheme checks live in _add_identifier.
    _add_identifier(institution, 'Wikidata', q_id,
                    f"https://www.wikidata.org/wiki/{q_id}")
    if 'viaf' in wikidata_data:
        viaf = wikidata_data['viaf']['value']
        _add_identifier(institution, 'VIAF', viaf,
                        f"https://viaf.org/viaf/{viaf}")
    if 'isil' in wikidata_data:
        _add_identifier(institution, 'ISIL', wikidata_data['isil']['value'])

    # Coordinates: Wikidata returns WKT "Point(lon lat)". Fill any GB
    # location that does not already carry a latitude (never overwrite).
    if 'coord' in wikidata_data:
        coord_str = wikidata_data['coord']['value']
        coord_str = coord_str.replace('Point(', '').replace(')', '')
        lon, lat = map(float, coord_str.split())
        for location in institution.get('locations', []):
            if location.get('country') == 'GB' and 'latitude' not in location:
                location['latitude'] = lat
                location['longitude'] = lon

    # Founding date: keep only the date part of the xsd:dateTime value.
    # Unlike the original, do not clobber a founding_date our dataset
    # already has (consistent with the coordinate handling above).
    if 'inception' in wikidata_data and not institution.get('founding_date'):
        institution['founding_date'] = wikidata_data['inception']['value'].split('T')[0]

    # Provenance trail for this enrichment batch.
    provenance = institution.setdefault('provenance', {})
    provenance.setdefault('notes', []).append(
        f"Batch 1: Fuzzy name match (score={match_score:.2f}) - Wikidata {q_id}"
    )
    provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
    provenance['wikidata_verified'] = True

    return institution
|
|
|
|
def main():
    """Run the Batch 1 GB enrichment pipeline end to end.

    Steps: load the unified dataset, filter GB institutions, query Wikidata,
    fuzzy-match, enrich the matches in place, save the GB subset to YAML,
    and print a coverage summary against the 50% goal.
    """
    import os

    print("=" * 80)
    print("🇬🇧 Great Britain Heritage Institutions Enrichment - Batch 1")
    print("=" * 80)
    print("\nStrategy: Fuzzy name matching (threshold 0.85)\n")

    # Load our dataset
    print("📂 Loading unified global dataset...")
    with open('data/instances/all/globalglam-20251111.yaml', 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)

    # Filter GB institutions
    gb_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'GB' for loc in inst.get('locations', []))
    ]
    print(f" ✅ Found {len(gb_institutions)} GB institutions\n")

    # Guard: with no GB institutions there is nothing to match, and the
    # summary below would divide by zero.
    if not gb_institutions:
        print("⚠️ No GB institutions found in the dataset; nothing to do.")
        return

    # Query Wikidata
    wikidata_institutions = query_wikidata_gb_institutions()
    time.sleep(1)  # Be nice to Wikidata

    # Fuzzy matching
    print("🔗 Matching institutions (threshold=0.85)...\n")
    matches = fuzzy_match_institutions(gb_institutions, wikidata_institutions, threshold=0.85)

    print(f"\n📊 Found {len(matches)} matches\n")

    # Enrich institutions
    if matches:
        print("✨ Enriching institutions with Wikidata metadata...\n")
        for match in matches:
            enrich_with_wikidata(
                match['institution'],
                match['wikidata'],
                match['score']
            )
            print(f" ✅ Enriched: {match['institution']['name']}")

    # Save results
    output_path = 'data/instances/great_britain/gb_institutions_enriched_batch1.yaml'
    print(f"\n💾 Saving Batch 1 results to {output_path}...")

    os.makedirs('data/instances/great_britain', exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(gb_institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print(" ✅ Saved\n")

    # Summary
    enriched_count = sum(
        1 for inst in gb_institutions
        if any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers', []))
    )

    print("=" * 80)
    print("📊 BATCH 1 RESULTS")
    print("=" * 80)
    print(f"Total institutions: {len(gb_institutions)}")
    print(f"Wikidata enriched: {enriched_count} ({enriched_count/len(gb_institutions)*100:.1f}%)")
    print(f"Still need enrichment: {len(gb_institutions) - enriched_count}")

    if enriched_count >= len(gb_institutions) * 0.5:
        print("\n✅ SUCCESS: Achieved 50%+ Wikidata coverage goal!")
    else:
        print("\n⚠️ Below 50% goal. Batch 2 (alternative names) recommended.")

    print("\n")


if __name__ == '__main__':
    main()
|