glam/scripts/enrich_chilean_batch2_universities.py

#!/usr/bin/env python3
"""
Chilean Heritage Institutions - Batch 2 Wikidata Enrichment (Universities Focus)
Improved strategy based on Batch 1 learnings:
- Focus on universities (excellent Wikidata coverage)
- Better name normalization (strip possessives, handle word order)
- Geographic filtering in SPARQL queries
- Higher success rate expected (universities have standardized names)

BATCH 2 TARGET INSTITUTIONS (5 major universities):
1. Universidad de Chile - Santiago
2. Universidad de Santiago de Chile (USACH) - Santiago
3. Universidad de Concepción - Concepción
4. Universidad Austral de Chile - Valdivia
5. Pontificia Universidad Católica de Chile - Santiago

SUCCESS CRITERIA:
- Batch 1: 2/90 with Wikidata (2.2%)
- Goal: 7/90 with Wikidata (7.8%)
- Expected success rate: 100% for universities

IMPROVEMENTS FROM BATCH 1:
1. Name normalization: strip possessives ("Universidad's" → "Universidad")
2. Geographic filtering: add a city/region constraint to the SPARQL query
3. Multiple name variants: try both full and abbreviated names
4. Better fuzzy matching: use token_set_ratio for word-order variations
"""
import re
import time
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
import yaml
from rapidfuzz import fuzz
# Wikidata SPARQL endpoint
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
# Batch 2: Major Chilean universities (high success probability)
BATCH_2_TARGETS = [
{
'name_pattern': 'Universidad de Chile',
'name_variants': ['Universidad de Chile', 'U. de Chile', 'UChile'],
'region': 'Santiago',
'city': 'Santiago',
'inst_type': 'EDUCATION_PROVIDER',
'wikidata_class': 'Q3918', # university
'notes': 'Oldest and most prestigious public university in Chile (founded 1842)'
},
{
'name_pattern': 'Universidad de Santiago de Chile',
'name_variants': ['Universidad de Santiago de Chile', 'USACH', 'U. de Santiago'],
'region': 'Santiago',
'city': 'Santiago',
'inst_type': 'EDUCATION_PROVIDER',
'wikidata_class': 'Q3918',
'notes': 'Major public university in Santiago (founded 1849 as Escuela de Artes y Oficios)'
},
{
'name_pattern': 'Universidad de Concepción',
'name_variants': ['Universidad de Concepción', 'UdeC', 'U. de Concepción'],
'region': 'Concepción',
'city': 'Concepción',
'inst_type': 'EDUCATION_PROVIDER',
'wikidata_class': 'Q3918',
'notes': 'Third oldest university in Chile (founded 1919)'
},
{
'name_pattern': 'Universidad Austral de Chile',
'name_variants': ['Universidad Austral de Chile', 'UACh', 'U. Austral'],
'region': 'Valdivia',
'city': 'Valdivia',
'inst_type': 'EDUCATION_PROVIDER',
'wikidata_class': 'Q3918',
'notes': 'Public university in southern Chile (founded 1954)'
},
{
'name_pattern': 'Pontificia Universidad Católica de Chile',
'name_variants': [
'Pontificia Universidad Católica de Chile',
'UC Chile',
'PUC',
'Universidad Católica de Chile'
],
'region': 'Santiago',
'city': 'Santiago',
'inst_type': 'EDUCATION_PROVIDER',
'wikidata_class': 'Q3918',
'notes': 'Leading private Catholic university (founded 1888)'
}
]
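
# Assumed shape of each institution record in the YAML file, inferred from the
# field accesses in the functions below (a sketch; the real schema may carry
# additional fields):
#
#   - name: Universidad de Chile
#     institution_type: EDUCATION_PROVIDER
#     locations:
#       - city: Santiago
#         region: Santiago
#     identifiers:
#       - identifier_scheme: Wikidata
#         identifier_value: Q...        # Q-number placeholder
#     provenance:
#       extraction_method: manual curation
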
def normalize_name(name: str) -> str:
"""Normalize institution name for better matching."""
    # Remove possessive markers (straight or curly apostrophe)
    name = re.sub(r"['’]s\b", "", name)
# Remove leading/trailing whitespace
name = name.strip()
# Normalize whitespace
name = re.sub(r'\s+', ' ', name)
return name
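
# Illustration (hypothetical input) of what normalize_name does:
#   normalize_name("Universidad de Chile's   Biblioteca Central")
#   -> "Universidad de Chile Biblioteca Central"
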
def matches_institution(inst: Dict[str, Any], target: Dict[str, Any]) -> bool:
"""Check if institution matches target criteria with improved name matching."""
# Normalize institution name
inst_name = normalize_name(inst.get('name', '')).lower()
# Check against all name variants
name_variants = target.get('name_variants', [target['name_pattern']])
matched_name = False
for variant in name_variants:
normalized_variant = normalize_name(variant).lower()
# Try multiple fuzzy matching strategies
scores = [
fuzz.ratio(inst_name, normalized_variant),
fuzz.partial_ratio(inst_name, normalized_variant),
fuzz.token_set_ratio(inst_name, normalized_variant)
]
max_score = max(scores)
if max_score >= 75: # Lower threshold to catch variations
matched_name = True
break
if not matched_name:
return False
# Check institution type
if inst.get('institution_type') != target['inst_type']:
return False
# Check location (region or city)
locations = inst.get('locations', [])
if not locations:
return False
location = locations[0]
region = location.get('region', '')
city = location.get('city', '')
    # Match loosely by region or city: a value may appear in either field
target_region = target.get('region', '')
target_city = target.get('city', '')
location_match = False
if target_region and (region == target_region or city == target_region):
location_match = True
if target_city and (city == target_city or region == target_city):
location_match = True
return location_match
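
# Minimal usage sketch (hypothetical record; field names mirror the accesses above):
#   inst = {'name': "Universidad de Chile's",
#           'institution_type': 'EDUCATION_PROVIDER',
#           'locations': [{'city': 'Santiago', 'region': 'Santiago'}]}
#   matches_institution(inst, BATCH_2_TARGETS[0])  # -> True
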
def has_wikidata(inst: Dict[str, Any]) -> bool:
"""Check if institution already has Wikidata identifier."""
return any(
id_obj.get('identifier_scheme') == 'Wikidata'
for id_obj in inst.get('identifiers', [])
)
def query_wikidata_with_location(
name_variants: List[str],
city: Optional[str],
inst_class: str
) -> List[Dict[str, Any]]:
"""Query Wikidata with geographic filtering for better precision."""
# Build filter for city if provided
city_filter = ""
if city:
# Map Chilean city names to Wikidata Q-numbers (add as needed)
city_mapping = {
'Santiago': 'Q2887',
'Concepción': 'Q5775',
'Valdivia': 'Q3883'
}
if city in city_mapping:
city_q = city_mapping[city]
city_filter = f"""
?item wdt:P131* wd:{city_q} . # Located in or subdivision of city
"""
query = f"""
SELECT ?item ?itemLabel ?itemDescription ?viaf ?isil WHERE {{
    ?item wdt:P31/wdt:P279* wd:{inst_class} .  # Instance (or subclass) of target class
    ?item wdt:P17 wd:Q298 .  # Country: Chile
    {city_filter}
    OPTIONAL {{ ?item wdt:P214 ?viaf }}  # VIAF identifier (P214)
    OPTIONAL {{ ?item wdt:P791 ?isil }}  # ISIL code (P791)
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
}}
LIMIT 100
"""
headers = {
'User-Agent': 'GLAM-Data-Extractor/0.2 (heritage-data-project; batch2-universities)',
'Accept': 'application/json'
}
try:
response = requests.get(
WIKIDATA_SPARQL,
params={'query': query, 'format': 'json'},
headers=headers,
timeout=30
)
response.raise_for_status()
results = response.json()
bindings = results.get('results', {}).get('bindings', [])
# Extract relevant fields
matches = []
for binding in bindings:
item_uri = binding.get('item', {}).get('value', '')
q_number = item_uri.split('/')[-1] if item_uri else None
if q_number:
matches.append({
'q_number': q_number,
'label': binding.get('itemLabel', {}).get('value', ''),
'description': binding.get('itemDescription', {}).get('value', ''),
'viaf': binding.get('viaf', {}).get('value', None),
'isil': binding.get('isil', {}).get('value', None)
})
return matches
except Exception as e:
print(f" ⚠️ Wikidata query error: {e}")
return []
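
# Each returned match is a plain dict; the values below are illustrative, not
# verified against live Wikidata:
#   {'q_number': 'Q...', 'label': 'Universidad de Chile',
#    'description': 'public university in Santiago, Chile',
#    'viaf': '...', 'isil': None}
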
def fuzzy_match_wikidata_improved(
inst_name: str,
name_variants: List[str],
wd_results: List[Dict[str, Any]]
) -> tuple[Optional[Dict[str, Any]], float]:
"""Improved fuzzy matching with multiple strategies."""
best_match = None
    best_score = 0.0  # Highest similarity seen so far (rapidfuzz scores are floats)
# Normalize institution name
inst_name_norm = normalize_name(inst_name).lower()
for result in wd_results:
wd_label = normalize_name(result['label']).lower()
# Try matching against institution name
scores = [
fuzz.ratio(inst_name_norm, wd_label),
fuzz.partial_ratio(inst_name_norm, wd_label),
fuzz.token_set_ratio(inst_name_norm, wd_label),
fuzz.token_sort_ratio(inst_name_norm, wd_label)
]
# Also try matching against target name variants
for variant in name_variants:
variant_norm = normalize_name(variant).lower()
scores.extend([
fuzz.ratio(variant_norm, wd_label),
fuzz.token_set_ratio(variant_norm, wd_label),
fuzz.token_sort_ratio(variant_norm, wd_label)
])
score = max(scores)
if score > best_score:
best_score = score
best_match = result
return best_match, best_score
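
# Why token_set_ratio/token_sort_ratio are included: they ignore word order,
# which plain ratio penalizes. For example:
#   fuzz.ratio("universidad austral de chile", "austral universidad de chile")             # < 100
#   fuzz.token_sort_ratio("universidad austral de chile", "austral universidad de chile")  # 100.0
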
def add_wikidata_identifier(
inst: Dict[str, Any],
q_number: str,
confidence: float,
notes: str
) -> Dict[str, Any]:
"""Add Wikidata identifier to institution with provenance tracking."""
wikidata_id = {
'identifier_scheme': 'Wikidata',
'identifier_value': q_number,
'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
}
if 'identifiers' not in inst:
inst['identifiers'] = []
inst['identifiers'].append(wikidata_id)
    # Update provenance
    if 'provenance' in inst:
        old_method = inst['provenance'].get('extraction_method', '')
        enrichment_note = (
            f" + Wikidata enrichment (Batch 2 universities, confidence={confidence:.2f})"
        )
        inst['provenance']['extraction_method'] = f"{old_method}{enrichment_note}"
        if notes:
            # Persist the target's curatorial note with the provenance record
            inst['provenance']['enrichment_notes'] = notes
return inst
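
# After enrichment, the institution's identifiers list gains an entry such as
# (Q-number illustrative):
#   {'identifier_scheme': 'Wikidata',
#    'identifier_value': 'Q...',
#    'identifier_url': 'https://www.wikidata.org/wiki/Q...'}
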
def main():
data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'chile' / 'chilean_institutions_batch1_enriched.yaml'
backup_file = data_file.with_suffix('.batch2_backup')
output_file = data_file.with_name('chilean_institutions_batch2_enriched.yaml')
print("=" * 80)
print("Chilean Heritage Institutions - Batch 2 Wikidata Enrichment")
print("Universities Focus - Improved Matching Strategy")
print("Session: November 9, 2025")
print("Target: 5 major universities")
print("=" * 80)
print()
# Load data
print(f"📂 Loading: {data_file}")
with open(data_file, 'r', encoding='utf-8') as f:
institutions = yaml.safe_load(f)
print(f" Total institutions: {len(institutions)}")
# Check existing Wikidata coverage
with_wikidata_before = sum(1 for inst in institutions if has_wikidata(inst))
print(f" Current Wikidata coverage: {with_wikidata_before}/{len(institutions)} ({with_wikidata_before/len(institutions)*100:.1f}%)")
print()
# Create backup
print(f"💾 Creating backup: {backup_file}")
with open(backup_file, 'w', encoding='utf-8') as f:
yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
print()
print("Batch 2 Enrichment Process:")
print("-" * 80)
# Process each target
enriched_count = 0
skipped_count = 0
not_found_count = 0
manual_review_count = 0
for i, target in enumerate(BATCH_2_TARGETS, 1):
print(f"\n[{i}/{len(BATCH_2_TARGETS)}] 🎓 Searching: {target['name_pattern']}")
print(f" Location: {target['city']}, {target['region']}")
print(f" Name variants: {', '.join(target['name_variants'][:3])}")
# Find matching institution in dataset
matched = None
for inst in institutions:
if matches_institution(inst, target):
matched = inst
break
if not matched:
print(f" ❌ NOT FOUND in dataset")
print(f" (Check if institution name matches any variant)")
not_found_count += 1
continue
print(f" ✓ Found: {matched.get('name')}")
# Check if already has Wikidata
if has_wikidata(matched):
existing_q = next(
(id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
if id_obj.get('identifier_scheme') == 'Wikidata'),
None
)
print(f" ⏭️ Already enriched with {existing_q}")
skipped_count += 1
continue
# Query Wikidata with location filtering
print(f" 🌐 Querying Wikidata (universities in {target['city']})...")
time.sleep(1.5) # Rate limiting
wd_results = query_wikidata_with_location(
target['name_variants'],
target.get('city'),
target['wikidata_class']
)
if not wd_results:
print(f" ⚠️ No Wikidata results found")
manual_review_count += 1
continue
print(f" 📊 Found {len(wd_results)} Wikidata candidates")
# Improved fuzzy matching
best_match, match_score = fuzzy_match_wikidata_improved(
matched['name'],
target['name_variants'],
wd_results
)
        if not best_match or match_score < 70:
            print(f" ⚠️ No good match found (similarity below 70)")
manual_review_count += 1
continue
print(f" 🎯 Best match: {best_match['label']} ({best_match['q_number']})")
print(f" Similarity: {match_score:.1f}%")
if best_match.get('description'):
print(f" Description: {best_match['description']}")
if best_match.get('viaf'):
print(f" VIAF: {best_match['viaf']}")
# Confidence-based decision
if match_score >= 85:
print(f" ✅ HIGH CONFIDENCE - Auto-accepting")
add_wikidata_identifier(
matched,
best_match['q_number'],
match_score / 100,
target['notes']
)
enriched_count += 1
elif match_score >= 75:
print(f" ⚠️ MEDIUM CONFIDENCE - Needs manual verification")
print(f" Verify at: https://www.wikidata.org/wiki/{best_match['q_number']}")
manual_review_count += 1
else:
print(f" ❌ LOW CONFIDENCE - Skipping")
manual_review_count += 1
print()
print("=" * 80)
print("Batch 2 Summary:")
print("-" * 80)
print(f"✅ Auto-enriched: {enriched_count}")
print(f"⚠️ Manual review: {manual_review_count}")
print(f"⏭️ Already enriched: {skipped_count}")
print(f"❌ Not found: {not_found_count}")
# Calculate updated coverage
with_wikidata_after = sum(1 for inst in institutions if has_wikidata(inst))
print()
print("Chilean Institution Coverage:")
print(f" Total: {len(institutions)}")
print(f" Before Batch 2: {with_wikidata_before} ({with_wikidata_before/len(institutions)*100:.1f}%)")
print(f" After Batch 2: {with_wikidata_after} ({with_wikidata_after/len(institutions)*100:.1f}%)")
print(f" Improvement: +{with_wikidata_after - with_wikidata_before} institutions")
# Save if any enrichments
if enriched_count > 0:
print()
print(f"💾 Saving enriched data to: {output_file}")
with open(output_file, 'w', encoding='utf-8') as f:
yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
print()
print("✅ Batch 2 enrichment complete!")
print()
print("NEXT STEPS:")
print("1. Review medium-confidence candidates")
print("2. Create Batch 3 targeting major museums:")
print(" - Museo Histórico y Antropológico (Valdivia)")
print(" - Museo Colchagua (Santa Cruz)")
print(" - Museo Gabriela Mistral (Vicuña)")
print(" - Museo Antropológico Padre Sebastián Englert (Easter Island)")
print(" - Casa Museo Isla Negra (Pablo Neruda)")
print("3. Continue until 20+ institutions enriched (22% coverage)")
else:
print()
print("⚠️ No automatic enrichments - all require manual review")
print()
print("DEBUGGING TIPS:")
print("1. Check if institution names in dataset match target name_variants")
print("2. Verify institution_type field matches target")
print("3. Check if location (city/region) matches target")
print("4. Review name normalization logic in matches_institution()")
if __name__ == '__main__':
main()