glam/scripts/enrich_latam_alternative_names.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

592 lines
23 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Wikidata enrichment for Latin American heritage institutions with alternative name matching.
This script applies the successful Tunisia enrichment strategy to Latin America:
- Entity type validation (museums must be museums, not banks)
- Geographic validation (institutions must be in correct city/country)
- Alternative name matching (Portuguese/Spanish ↔ English)
- Fuzzy matching with 70% threshold
- Prevents false positives through multiple validation layers
Target: 304 Latin American institutions (BR: 97, MX: 109, CL: 90, AR: 1, US: 7)
Current Wikidata coverage: 56/304 (18.4%)
Expected improvement: 60-75% coverage (similar to Tunisia's 76.5%)
GLAM Data Extraction Project
Schema: LinkML v0.2.1
"""
import yaml
import time
import requests
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List, Set
from rapidfuzz import fuzz
# Wikidata Query Service endpoint; USER_AGENT identifies this client as the
# WDQS usage policy requests.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-LatAm-Wikidata-Enrichment/1.0"
# Institution type mapping: LinkML enum -> Wikidata entity types (P31 QIDs).
# Candidates returned by the SPARQL query are accepted only if their
# instance-of QID appears in the set for the institution's type.
# NOTE(review): several QIDs recur across categories with *different* inline
# labels (e.g. Q2668072 is annotated "National museum", "National library"
# and "National archive"; Q7840289 as "Art gallery" and "Cultural center";
# Q7210356 appears twice inside OFFICIAL_INSTITUTION) — the labels cannot
# all be correct; verify each QID against Wikidata before relying on them.
INSTITUTION_TYPE_MAPPING = {
    'MUSEUM': {
        'Q33506',  # Museum
        'Q1030034',  # Archaeological museum
        'Q3329412',  # Archaeological museum (variant)
        'Q473972',  # Art museum
        'Q2668072',  # National museum
        'Q207694',  # History museum
        'Q7328910',  # Science museum
        'Q15243387',  # Cultural heritage site
        'Q3152824',  # Archaeological site
        'Q1153562',  # Open-air museum
        'Q1496967',  # Folk museum
        'Q17431399',  # Heritage museum
        'Q28835878',  # Heritage site
        'Q641635',  # Natural history museum
        'Q2142332',  # Contemporary art museum
    },
    'LIBRARY': {
        'Q7075',  # Library
        'Q2668072',  # National library
        'Q570116',  # Public library
        'Q5193377',  # University library
        'Q28564',  # Academic library
        'Q1479716',  # Regional library
        'Q1622062',  # Digital library
        'Q17297735',  # Diocesan library
        'Q105338594',  # Bibliothèque diocésaine
    },
    'ARCHIVE': {
        'Q166118',  # Archive
        'Q7840289',  # Art gallery (can have archival collections)
        'Q2668072',  # National archive
        'Q1497375',  # Historical archive
        'Q64578911',  # Regional archive
        'Q11396317',  # State archive
    },
    'HOLY_SITES': {
        'Q22687',  # Synagogue
        'Q16970',  # Church
        'Q32815',  # Mosque
        'Q44539',  # Temple
        'Q44613',  # Monastery
        'Q34627',  # Synagogue
        'Q697295',  # Cathedral
        'Q56242275',  # Pilgrimage site
    },
    'GALLERY': {
        'Q7840289',  # Art gallery
        'Q473972',  # Art museum
        'Q1007870',  # Art centre
    },
    'UNIVERSITY': {
        'Q3918',  # University
        'Q875538',  # Public university
        'Q2467461',  # Private university
        'Q15936437',  # Research university
        'Q38723',  # Higher education institution
        'Q3354859',  # Technical university
    },
    'RESEARCH_CENTER': {
        'Q31855',  # Research institute
        'Q7315155',  # Research center
        'Q2467461',  # Research institution
        'Q483242',  # Laboratory
        'Q1664720',  # Institute
    },
    'EDUCATION_PROVIDER': {
        'Q2385804',  # Educational institution
        'Q5341295',  # Music school
        'Q1664720',  # Institute
        'Q180958',  # Faculty
        'Q38723',  # Higher education institution
    },
    'OFFICIAL_INSTITUTION': {
        'Q7210356',  # Cultural institution
        'Q7840289',  # Cultural center
        'Q1030034',  # Cultural heritage institution
        'Q1664720',  # Institute
        'Q7210356',  # Government cultural organization (duplicate of first entry)
        'Q24398318',  # Theatre building
        'Q17431399',  # Cultural center
    },
    'PERSONAL_COLLECTION': {
        'Q7075',  # Library (personal libraries)
        'Q166118',  # Archive (personal archives)
        'Q33506',  # Museum (personal museums)
    },
    'MIXED': {
        'Q33506',  # Museum
        'Q7075',  # Library
        'Q166118',  # Archive
        'Q7210356',  # Cultural institution
        'Q7840289',  # Cultural center
        'Q1030034',  # Cultural complex
    }
}
# Country name mapping for Wikidata queries: ISO code -> country QID (P17),
# display name, and comma-separated label-language priority used both for the
# label service and the altLabel language filter.
COUNTRY_MAPPING = {
    'BR': {'qid': 'Q155', 'name': 'Brazil', 'lang': 'pt,en,es'},
    'MX': {'qid': 'Q96', 'name': 'Mexico', 'lang': 'es,en'},
    'CL': {'qid': 'Q298', 'name': 'Chile', 'lang': 'es,en'},
    'AR': {'qid': 'Q414', 'name': 'Argentina', 'lang': 'es,en'},
    'US': {'qid': 'Q30', 'name': 'United States', 'lang': 'en,es,pt'},
}
def get_valid_types_for_institution(inst_type: str) -> Set[str]:
    """Look up the Wikidata entity-type QIDs accepted for *inst_type*.

    Unknown institution types yield an empty set, so callers can use a
    simple falsy check to detect them.
    """
    try:
        return INSTITUTION_TYPE_MAPPING[inst_type]
    except KeyError:
        return set()
# (keyword, English replacement) pairs per source language, in the order the
# alternatives must be emitted. Portuguese (BR) and Spanish (MX/CL/AR) differ
# only in the museum/archive spellings.
_TRANSLATIONS_BY_LANG = {
    'pt': [
        ('Biblioteca', 'Library'),
        ('Museu', 'Museum'),
        ('Arquivo', 'Archive'),
        ('Teatro', 'Theatre'),
        ('Centro Cultural', 'Cultural Center'),
    ],
    'es': [
        ('Biblioteca', 'Library'),
        ('Museo', 'Museum'),
        ('Archivo', 'Archive'),
        ('Teatro', 'Theatre'),
        ('Centro Cultural', 'Cultural Center'),
    ],
}
# Countries that get generated alternatives; all others return no variants.
_COUNTRY_LANG = {'BR': 'pt', 'MX': 'es', 'CL': 'es', 'AR': 'es'}


def generate_alternative_names(name: str, inst_type: str, country: str) -> List[str]:
    """
    Generate alternative name variations for multilingual matching.

    Handles Portuguese (Brazil) and Spanish (Mexico, Chile, Argentina)
    translations into English:
    - Biblioteca -> Library
    - Museu/Museo -> Museum
    - Arquivo/Archivo -> Archive
    - Teatro -> Theatre
    - Centro Cultural -> Cultural Center

    Args:
        name: Original institution name.
        inst_type: Institution type (currently unused; kept for interface
            compatibility with existing callers).
        country: ISO country code; only BR/MX/CL/AR produce alternatives.

    Returns:
        One alternative per matched keyword, in keyword order; empty list
        for other countries or when nothing matches.
    """
    lang = _COUNTRY_LANG.get(country)
    if lang is None:
        return []
    name_lower = name.lower()
    alternatives = []
    for source, target in _TRANSLATIONS_BY_LANG[lang]:
        if source.lower() in name_lower:
            # Replace both Title Case and lower-case spellings on the
            # original name, since source data uses either.
            alternatives.append(
                name.replace(source, target).replace(source.lower(), target.lower())
            )
    return alternatives
def search_wikidata_with_validation(
    name: str,
    inst_type: str,
    country: str,
    city: Optional[str] = None,
    alternative_names: Optional[List[str]] = None,
    timeout: int = 60
) -> Optional[Dict[str, Any]]:
    """
    Search Wikidata for Latin American heritage institutions with validation.

    Runs one country-wide SPARQL query restricted server-side to the entity
    types valid for *inst_type*, then fuzzy-matches every returned candidate
    against the primary name, caller-supplied alternatives, and generated
    multilingual alternatives. A candidate is accepted only if its P31 type
    is in the valid set, it passes the geographic check (for location-bound
    types), and the best fuzzy score is at least 70%.

    Args:
        name: Institution name to search
        inst_type: Institution type (MUSEUM, LIBRARY, ARCHIVE, etc.)
        country: ISO country code (BR, MX, CL, AR, US)
        city: Optional city name for additional filtering
        alternative_names: List of alternative names to try
        timeout: Query timeout in seconds

    Returns:
        Dict with Wikidata data (qid, name, description, entity_type,
        match_score, matched_name, plus optional viaf/isil/website/
        founded_date/latitude/longitude) if a valid match is found,
        None otherwise. All failures (no match, timeout, network error)
        return None; errors are printed, not raised.
    """
    # Get valid Wikidata entity types for this institution type; bail out
    # early on unknown types rather than querying with an empty VALUES list.
    valid_types = get_valid_types_for_institution(inst_type)
    if not valid_types:
        print(f" ⚠️ Unknown institution type: {inst_type}")
        return None
    # Get country info (QID for the P17 filter, label languages)
    country_info = COUNTRY_MAPPING.get(country)
    if not country_info:
        print(f" ⚠️ Unknown country: {country}")
        return None
    # Build VALUES clause for entity types (server-side type filter)
    type_values = " ".join([f"wd:{qid}" for qid in valid_types])
    # Build SPARQL query - country-specific.
    # NOTE(review): ?itemAltLabel is selected (and language-filtered) but the
    # matching loop below never reads it — candidates are scored against
    # ?itemLabel only. Confirm whether altLabel matching was intended.
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
                    ?viaf ?isil ?website ?coords ?inception ?itemAltLabel
                    ?location ?locationLabel
    WHERE {{
      # Must be in {country_info['name']}
      ?item wdt:P17 wd:{country_info['qid']} .
      # Must have instance-of type matching institution type
      ?item wdt:P31 ?type .
      # Filter to relevant types (server-side)
      VALUES ?type {{ {type_values} }}
      # Location (P131: located in administrative territorial entity)
      OPTIONAL {{ ?item wdt:P131 ?location . }}
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      OPTIONAL {{ ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ({', '.join(f'"{lang}"' for lang in country_info['lang'].split(','))})) }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{country_info['lang']}" . }}
    }}
    LIMIT 200
    """
    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': query,
        'format': 'json'
    }
    try:
        time.sleep(1.5)  # Rate limiting: be polite to the public WDQS endpoint
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        results = response.json()
        bindings = results.get("results", {}).get("bindings", [])
        if not bindings:
            return None
        # Fuzzy match with entity type AND geographic validation.
        # Track the single best-scoring candidate across all name variants.
        best_match = None
        best_score = 0
        matched_name = name
        # Prepare all names to try (primary + existing alternatives + generated)
        names_to_try = [name]
        if alternative_names:
            names_to_try.extend(alternative_names)
        # Generate multilingual alternatives (Portuguese/Spanish -> English)
        generated_alternatives = generate_alternative_names(name, inst_type, country)
        names_to_try.extend(generated_alternatives)
        city_lower = city.lower() if city else None
        # Location-specific institutions require geographic matching, since
        # e.g. university names repeat across cities within a country.
        requires_city_match = inst_type in {'UNIVERSITY', 'RESEARCH_CENTER', 'EDUCATION_PROVIDER'}
        # Try each name variation against every candidate binding
        for name_variant in names_to_try:
            name_lower = name_variant.lower()
            for binding in bindings:
                # VALIDATE ENTITY TYPE FIRST: the P31 QID must be in the
                # accepted set (defensive re-check of the server-side filter).
                entity_type_uri = binding.get("type", {}).get("value", "")
                entity_type_qid = entity_type_uri.split("/")[-1] if entity_type_uri else None
                if entity_type_qid not in valid_types:
                    continue
                # GEOGRAPHIC VALIDATION for location-specific institutions:
                # candidate's P131 label must fuzzy-match the expected city.
                if city_lower and requires_city_match:
                    location_label = binding.get("locationLabel", {}).get("value", "").lower() if binding.get("locationLabel") else ""
                    if not location_label:
                        continue
                    location_match = fuzz.ratio(city_lower, location_label)
                    if location_match < 70:
                        continue
                # Fuzzy matching on validated entities: take the best of
                # full-ratio, partial-ratio and token-set scores so word
                # reordering or extra words do not sink a true match.
                item_label = binding.get("itemLabel", {}).get("value", "").lower()
                label_score = fuzz.ratio(name_lower, item_label)
                partial_score = fuzz.partial_ratio(name_lower, item_label)
                token_score = fuzz.token_set_ratio(name_lower, item_label)
                score = max(label_score, partial_score, token_score)
                if score > best_score:
                    best_score = score
                    best_match = binding
                    matched_name = name_variant
        # Require minimum 70% match, else treat as "not found"
        if best_score < 70:
            return None
        if not best_match:
            return None
        # Extract the QID from the entity URI (last path segment)
        item_uri = best_match.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None
        if not qid or not qid.startswith("Q"):
            return None
        result = {
            "qid": qid,
            "name": best_match.get("itemLabel", {}).get("value", ""),
            "description": best_match.get("itemDescription", {}).get("value", ""),
            "entity_type": best_match.get("typeLabel", {}).get("value", ""),
            "match_score": best_score,
            "matched_name": matched_name
        }
        # Add optional fields only when present in the winning binding
        viaf_data = best_match.get("viaf")
        if viaf_data and isinstance(viaf_data, dict):
            result["viaf"] = viaf_data.get("value", "")
        isil_data = best_match.get("isil")
        if isil_data and isinstance(isil_data, dict):
            result["isil"] = isil_data.get("value", "")
        website_data = best_match.get("website")
        if website_data and isinstance(website_data, dict):
            result["website"] = website_data.get("value", "")
        inception_data = best_match.get("inception")
        if inception_data and isinstance(inception_data, dict):
            # Keep only the date part of the xsd:dateTime literal
            result["founded_date"] = inception_data.get("value", "").split("T")[0]
        coords_data = best_match.get("coords")
        if coords_data and isinstance(coords_data, dict):
            # WKT "Point(lon lat)" literal -> separate float fields
            coords_str = coords_data.get("value", "")
            if coords_str and coords_str.startswith("Point("):
                lon, lat = coords_str[6:-1].split()
                result["latitude"] = float(lat)
                result["longitude"] = float(lon)
        return result
    except requests.exceptions.Timeout:
        print(f" ⏱️ Query timeout (>{timeout}s)")
        return None
    except requests.exceptions.RequestException as e:
        print(f" ❌ Network error: {e}")
        return None
    except Exception as e:
        # Broad catch keeps the batch run alive on malformed bindings etc.
        print(f" ❌ Error: {e}")
        return None
def add_wikidata_to_institution(institution: dict, wikidata_result: dict):
    """Merge a validated Wikidata match into an institution record in place.

    Appends Wikidata/VIAF/ISIL identifier entries (skipping any scheme the
    record already carries) and appends an enrichment note, including the
    match score and the name variant that matched, to provenance notes.
    """
    identifiers = institution.setdefault('identifiers', [])
    # Schemes present before this enrichment pass; used to avoid duplicates.
    known_schemes = {entry.get('identifier_scheme') for entry in identifiers}

    qid = wikidata_result['qid']
    viaf = wikidata_result.get('viaf')
    isil = wikidata_result.get('isil')
    # (scheme, value, url) triples; VIAF/ISIL are only added when present.
    candidate_ids = (
        ('Wikidata', qid, f"https://www.wikidata.org/wiki/{qid}"),
        ('VIAF', viaf, f"https://viaf.org/viaf/{viaf}"),
        ('ISIL', isil, f"https://isil.org/{isil}"),
    )
    for scheme, value, url in candidate_ids:
        if value and scheme not in known_schemes:
            identifiers.append({
                'identifier_scheme': scheme,
                'identifier_value': value,
                'identifier_url': url
            })

    # Record the enrichment in provenance notes, appending to any existing text.
    provenance = institution.setdefault('provenance', {})
    entity_type = wikidata_result.get('entity_type', 'unknown')
    matched_name = wikidata_result.get('matched_name', institution.get('name'))
    if matched_name != institution.get('name'):
        match_note = f" [matched: {matched_name}]"
    else:
        match_note = ""
    stamp = datetime.now(timezone.utc).strftime('%Y-%m-%d')
    enrich_note = f" Wikidata enriched {stamp} ({wikidata_result['qid']} [{entity_type}], match: {wikidata_result.get('match_score', 0):.0f}%{match_note}, validated)."
    provenance['notes'] = (provenance.get('notes', '') + enrich_note).strip()
def save_checkpoint(institutions: list, output_file: Path, stats: dict):
    """Persist the institution list plus run metadata to *output_file* as YAML.

    Called periodically during the enrichment loop so progress survives an
    interruption; each call rewrites the whole file.
    """
    coverage = stats['already_enriched'] + stats['enriched']
    print(f"\n💾 Saving checkpoint... (enriched: {stats['enriched']}, total coverage: {coverage}/{stats['total']})")

    # Metadata header describing how this file was produced, followed by the
    # full institution list.
    payload = {
        '_metadata': {
            'generated': datetime.now(timezone.utc).isoformat(),
            'source': 'Latin American GLAM institutions',
            'enhancements': [
                'Wikidata enrichment (alternative names + validation)',
                'Entity type validation',
                'Geographic validation',
                'Multilingual matching (Portuguese/Spanish/English)'
            ],
            'statistics': {
                'total_institutions': stats['total'],
                'wikidata_coverage': coverage,
                'newly_enriched': stats['enriched']
            }
        },
        'institutions': institutions
    }

    with open(output_file, 'w', encoding='utf-8') as handle:
        yaml.dump(payload, handle, default_flow_style=False, allow_unicode=True, sort_keys=False)
def main():
    """Run the Wikidata enrichment pipeline over the authoritative YAML file.

    Loads the Latin American institution records, searches Wikidata for each
    record lacking a Wikidata identifier (with entity-type, geographic and
    multilingual name validation), checkpoints every 10 records, and prints
    summary statistics. The input file is overwritten in place.
    """
    input_file = Path('data/instances/latin_american_institutions_AUTHORITATIVE.yaml')
    output_file = Path('data/instances/latin_american_institutions_AUTHORITATIVE.yaml')  # Overwrite
    print("Latin American Wikidata Enrichment (Alternative Names + Validation)")
    print("=" * 80)
    print("Target: 304 institutions (BR: 97, MX: 109, CL: 90, AR: 1, US: 7)")
    print("Current coverage: 56/304 (18.4%)")
    print()
    print("Features:")
    print(" ✅ Entity type validation (museums must be museums)")
    print(" ✅ Geographic validation (institutions in correct cities)")
    print(" ✅ Alternative name matching (Portuguese/Spanish ↔ English)")
    print(" ✅ Fuzzy matching (70% threshold)")
    print(" ✅ Checkpoint saving every 10 institutions")
    print("=" * 80)
    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # Handle both list and dict formats; (data or {}) guards against an
    # empty YAML file, which safe_load returns as None.
    institutions = data if isinstance(data, list) else (data or {}).get('institutions', [])
    print(f"Total institutions: {len(institutions)}")
    # Run statistics, updated in the loop and written into each checkpoint
    stats = {
        'total': len(institutions),
        'already_enriched': 0,
        'searched': 0,
        'found': 0,
        'enriched': 0,
        'failed': 0,
        'by_country': {
            'BR': {'searched': 0, 'found': 0, 'enriched': 0},
            'MX': {'searched': 0, 'found': 0, 'enriched': 0},
            'CL': {'searched': 0, 'found': 0, 'enriched': 0},
            'AR': {'searched': 0, 'found': 0, 'enriched': 0},
            'US': {'searched': 0, 'found': 0, 'enriched': 0},
        }
    }
    # Process each institution
    checkpoint_interval = 10
    for i, inst in enumerate(institutions, 1):
        name = inst.get('name', '')
        inst_type = inst.get('institution_type', 'MIXED')
        locations = inst.get('locations', [])
        country = locations[0].get('country', '') if locations else ''
        city = locations[0].get('city', '') if locations else ''
        # Skip records that already carry a Wikidata identifier
        # (loop variable renamed from `id` to avoid shadowing the builtin)
        identifiers = inst.get('identifiers', [])
        existing_schemes = {entry.get('identifier_scheme') for entry in identifiers}
        if 'Wikidata' in existing_schemes:
            stats['already_enriched'] += 1
            qid = next((entry['identifier_value'] for entry in identifiers if entry.get('identifier_scheme') == 'Wikidata'), 'unknown')
            print(f"[{i}/{len(institutions)}] ✓ {name} (already has {qid})")
            continue
        # Search Wikidata with validation
        print(f"[{i}/{len(institutions)}] Searching: {name} [{inst_type}] ({city}, {country})")
        stats['searched'] += 1
        if country in stats['by_country']:
            stats['by_country'][country]['searched'] += 1
        # Pass along any alternative names already recorded in the data
        alt_names = inst.get('alternative_names', [])
        result = search_wikidata_with_validation(name, inst_type, country, city, alternative_names=alt_names, timeout=60)
        if result:
            stats['found'] += 1
            if country in stats['by_country']:
                stats['by_country'][country]['found'] += 1
            match_score = result.get('match_score', 0)
            entity_type = result.get('entity_type', 'unknown')
            matched_name = result.get('matched_name', name)
            name_note = f" [matched: {matched_name}]" if matched_name != name else ""
            print(f" ✅ Found: {result['qid']} [{entity_type}] - {result.get('name', '')} (match: {match_score:.0f}%{name_note})")
            add_wikidata_to_institution(inst, result)
            stats['enriched'] += 1
            if country in stats['by_country']:
                stats['by_country'][country]['enriched'] += 1
            print(" ✅ Enriched with validated match")
        else:
            stats['failed'] += 1
            print(" ❌ Not found or type mismatch")
        # Periodic checkpoint; the unconditional final save below covers the
        # last partial batch, so no extra save at i == len(institutions).
        if i % checkpoint_interval == 0:
            save_checkpoint(institutions, output_file, stats)
    # Final save (always runs, including for an empty institution list)
    save_checkpoint(institutions, output_file, stats)
    # Print statistics
    print("\n" + "=" * 80)
    print("LATIN AMERICA WIKIDATA ENRICHMENT STATISTICS")
    print("=" * 80)
    total = stats['total']
    coverage = stats['already_enriched'] + stats['enriched']
    print(f"Total institutions: {total}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Searched: {stats['searched']}")
    print(f"Found (validated): {stats['found']}")
    print(f"Enriched (new): {stats['enriched']}")
    print(f"Failed: {stats['failed']}")
    # Guard against an empty input file to avoid ZeroDivisionError
    coverage_pct = 100 * coverage / total if total else 0.0
    print(f"\nFinal Wikidata coverage: {coverage}/{total} ({coverage_pct:.1f}%)")
    print("\nBy country:")
    for country, country_stats in sorted(stats['by_country'].items()):
        if country_stats['searched'] > 0:
            print(f" {country}: searched {country_stats['searched']}, found {country_stats['found']}, enriched {country_stats['enriched']}")
    if stats['enriched'] > 0:
        # total > 0 is guaranteed here, so the divisions are safe
        improvement = stats['enriched']
        old_coverage = stats['already_enriched']
        new_coverage = coverage
        print(f"\n✨ Added {improvement} new validated Wikidata identifiers!")
        print(f"✨ Coverage improved from {old_coverage}/{total} ({100*old_coverage/total:.1f}%) to {new_coverage}/{total} ({100*new_coverage/total:.1f}%)")
        print("✅ All matches validated against correct entity types and countries")
    print("\n✅ Wikidata enrichment complete!")
if __name__ == '__main__':
    # Allow importing this module for its helpers without running the
    # enrichment pipeline.
    main()