# glam/scripts/enrich_czech_wikidata_fuzzy.py
# (source listing metadata: extracted 2025-12-21 00:01:54 +01:00, 517 lines, 17 KiB, Python)
#!/usr/bin/env python3
"""
Enrich Czech custodian files with Wikidata Q-numbers using fuzzy name matching.
Uses Wikidata SPARQL endpoint to find matching institutions by name + location.
Writes enrichment data directly to individual custodian YAML files.
Process:
1. Query Wikidata for ALL Czech heritage institutions
2. Load CZ-*.yaml files without wikidata_enrichment
3. Fuzzy match by name + city location
4. Add Wikidata identifiers to matched files
5. Mark with enrichment_version: 2.1_generic
Usage:
python scripts/enrich_czech_wikidata_fuzzy.py [--limit N] [--dry-run] [--threshold N]
"""
import yaml
import requests
import argparse
import sys
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from datetime import datetime, timezone
from rapidfuzz import fuzz
# Wikidata SPARQL endpoint
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
# Czech Republic Wikidata ID
CZECHIA_QID = "Q213"
# Languages for Czech institutions (Czech primary, then English, German, Slovak)
CZECH_LANGUAGES = "cs,en,de,sk"
# Default similarity threshold
DEFAULT_THRESHOLD = 85.0
def query_wikidata_czech_institutions() -> List[Dict]:
    """
    Query Wikidata for ALL Czech heritage institutions.

    Issues one bulk SPARQL query (museums, libraries, archives, galleries
    located in Czechia) and flattens the JSON result bindings into plain
    dicts. Rows are deduplicated by QID: an institution with several P31
    types comes back once per type, and only the first row is kept.

    Returns:
        List of dicts with keys: qid, label, type, location, isil.
        'type' is always '' — kept only for record-schema compatibility,
        since the simplified query no longer selects it.
        Returns [] on any network or parse failure; callers treat an empty
        result as fatal.
    """
    # Simplified SPARQL query - minimal optionals to avoid timeout.
    # Czech libraries are mostly municipal/public, so focus on those.
    query = f"""
SELECT DISTINCT ?item ?itemLabel ?locationLabel ?isil
WHERE {{
  # Direct instance of heritage institution types
  VALUES ?type {{
    wd:Q33506     # museum
    wd:Q7075      # library
    wd:Q166118    # archive
    wd:Q1007870   # art gallery
    wd:Q28564     # public library
    wd:Q207694    # art museum
  }}
  ?item wdt:P31 ?type .
  ?item wdt:P17 wd:{CZECHIA_QID} .
  # Location is key for matching
  OPTIONAL {{ ?item wdt:P131 ?location }}
  # ISIL is valuable
  OPTIONAL {{ ?item wdt:P791 ?isil }}
  SERVICE wikibase:label {{
    bd:serviceParam wikibase:language "{CZECH_LANGUAGES}"
  }}
}}
LIMIT 10000
"""
    print("Querying Wikidata for Czech heritage institutions...")
    print(f" Endpoint: {WIKIDATA_SPARQL}")
    print(f" Languages: {CZECH_LANGUAGES}")
    headers = {
        # A descriptive User-Agent is required by Wikimedia API etiquette.
        'User-Agent': 'GLAM-Data-Extraction/0.2.1 (Czech heritage institution research)',
        'Accept': 'application/sparql-results+json'
    }
    try:
        response = requests.get(
            WIKIDATA_SPARQL,
            params={'query': query},
            headers=headers,
            timeout=180  # Generous timeout for large query
        )
        response.raise_for_status()
        data = response.json()
        # Parse results
        institutions = []
        seen_qids = set()  # Deduplicate by QID
        for binding in data['results']['bindings']:
            qid = binding['item']['value'].split('/')[-1]
            # Skip duplicates (same institution may have multiple types)
            if qid in seen_qids:
                continue
            seen_qids.add(qid)
            institutions.append({
                'qid': qid,
                # The label service normally falls back to the QID when no
                # label exists in any requested language, but guard against
                # a missing binding anyway instead of raising KeyError.
                'label': binding.get('itemLabel', {}).get('value', qid),
                'type': '',  # kept for schema compatibility (not selected)
                'location': binding.get('locationLabel', {}).get('value', ''),
                'isil': binding.get('isil', {}).get('value', ''),
            })
        print(f" Found {len(institutions)} unique institutions in Wikidata")
        return institutions
    except requests.exceptions.Timeout:
        print("ERROR: Wikidata query timed out. Try again later.")
        return []
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Failed to query Wikidata: {e}")
        return []
    except Exception as e:
        # Catch-all for malformed JSON / unexpected response shapes; this
        # function deliberately never raises — an empty list signals failure.
        print(f"ERROR: Unexpected error: {e}")
        return []
def fuzzy_match_institution(
    inst_name: str,
    inst_city: str,
    wikidata_results: List[Dict],
    threshold: float = DEFAULT_THRESHOLD
) -> Optional[Tuple[Dict, float]]:
    """
    Fuzzy match institution to Wikidata results.

    Uses a two-pass algorithm:
    1. First try to find matches with BOTH name and location match (high confidence)
    2. If no location match, fall back to name-only match with higher threshold

    Args:
        inst_name: Institution name from our dataset
        inst_city: City location (may be empty)
        wikidata_results: List of Wikidata query results
        threshold: Minimum similarity threshold (0-100)

    Returns:
        Tuple of (matched_wikidata_record, confidence_score) or None
    """
    best_match: Optional[Dict] = None
    best_score = 0.0
    best_has_location_match = False
    # Normalize our institution name / city once.
    inst_name_lower = inst_name.lower().strip()
    inst_city_lower = inst_city.lower().strip() if inst_city else ''
    # Major Czech cities that might cause false matches. Hoisted out of the
    # candidate loop below: the tuple is loop-invariant and was previously
    # rebuilt for every Wikidata record.
    czech_cities = ('praha', 'prague', 'brno', 'ostrava', 'plzeň', 'pilsen',
                    'liberec', 'olomouc', 'české budějovice', 'hradec králové',
                    'ústí nad labem', 'pardubice', 'zlín', 'havířov', 'kladno',
                    'opava', 'karviná', 'teplice', 'děčín', 'jihlava')
    for wd in wikidata_results:
        wd_label_lower = wd['label'].lower().strip()
        wd_location_lower = wd.get('location', '').lower()
        # Name similarity using token sort ratio (handles word reordering)
        name_score = fuzz.token_sort_ratio(inst_name_lower, wd_label_lower)
        # Check for location match
        location_match = False
        location_boost = 0
        if inst_city_lower and wd_location_lower:
            if inst_city_lower in wd_location_lower:
                # Exact city name match in the Wikidata location
                location_match = True
                location_boost = 10
            elif inst_city_lower in wd_label_lower:
                # City name appears IN the Wikidata label itself
                location_match = True
                location_boost = 8
            elif fuzz.partial_ratio(inst_city_lower, wd_location_lower) > 90:
                # Fuzzy location match
                location_match = True
                location_boost = 5
        # If we have a city but the Wikidata label mentions a DIFFERENT
        # major Czech city, apply a big penalty (likely a sibling
        # institution in another town).
        if inst_city_lower and not location_match:
            for city in czech_cities:
                if city in wd_label_lower and city != inst_city_lower:
                    name_score = max(0, name_score - 20)
                    break
        # Combined score, capped at 100
        total_score = min(name_score + location_boost, 100)
        # Prefer matches with location confirmation
        is_better = False
        if total_score >= threshold:
            if location_match and not best_has_location_match:
                # Location-confirmed match beats any name-only match
                is_better = True
            elif location_match == best_has_location_match and total_score > best_score:
                # Same location status: higher score wins
                is_better = True
        if is_better:
            best_score = total_score
            best_match = wd
            best_has_location_match = location_match
    # Second pass: without location confirmation, require a 95+ name score.
    if best_match and not best_has_location_match and best_score < 95:
        return None
    if best_match:
        return (best_match, best_score)
    return None
def load_unenriched_files(custodian_dir: Path, limit: Optional[int] = None) -> List[Tuple[Path, Dict]]:
    """
    Collect CZ-*.yaml custodian files that still lack Wikidata enrichment.

    A file is skipped when it already carries a 'wikidata_enrichment' block
    or a top-level identifier with scheme 'Wikidata'. Files that cannot be
    parsed are reported and skipped (best effort).

    Args:
        custodian_dir: Path to data/custodian directory
        limit: Optional limit on number of files to load

    Returns:
        List of (file_path, data_dict) tuples
    """
    candidates = sorted(custodian_dir.glob("CZ-*.yaml"))
    print(f"Scanning {len(candidates)} CZ-*.yaml files...")
    selected: List[Tuple[Path, Dict]] = []
    for path in candidates:
        if limit and len(selected) >= limit:
            break
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                record = yaml.safe_load(handle)
            # Already enriched?
            if record.get('wikidata_enrichment'):
                continue
            # Already has a top-level Wikidata identifier?
            # Note: Czech files may have Wikidata in original_entry.identifiers
            # which is fine to update.
            already_tagged = any(
                entry.get('identifier_scheme') == 'Wikidata'
                for entry in record.get('identifiers', [])
                if isinstance(entry, dict)
            )
            if already_tagged:
                continue
            selected.append((path, record))
        except Exception as exc:
            print(f" Warning: Could not load {path.name}: {exc}")
    print(f" Found {len(selected)} files needing Wikidata enrichment")
    return selected
def save_enriched_file(filepath: Path, data: Dict) -> bool:
    """
    Write an enriched custodian record back to its YAML file.

    Returns:
        True on success; False when the write failed (the error is printed,
        never raised, so one bad file does not abort the whole batch).
    """
    try:
        with open(filepath, 'w', encoding='utf-8') as handle:
            yaml.dump(
                data,
                handle,
                allow_unicode=True,        # keep Czech diacritics readable
                sort_keys=False,           # preserve original key order
                default_flow_style=False,
                width=120,
            )
    except Exception as exc:
        print(f" ERROR saving {filepath.name}: {exc}")
        return False
    return True
def _extract_institution_name(data: Dict) -> Optional[str]:
    """Return the best-available institution name, or None if absent.

    Prefers the curated custodian_name claim value, falling back to the
    raw original_entry name.
    """
    if data.get('custodian_name', {}).get('claim_value'):
        return data['custodian_name']['claim_value']
    if data.get('original_entry', {}).get('name'):
        return data['original_entry']['name']
    return None


def _extract_institution_city(data: Dict) -> str:
    """Return the best-available city name, or '' when none is recorded.

    Checks, in order: top-level location, the ghcid location resolution,
    then the first original_entry location.
    """
    if data.get('location', {}).get('city'):
        return data['location']['city']
    if data.get('ghcid', {}).get('location_resolution', {}).get('city_name'):
        return data['ghcid']['location_resolution']['city_name']
    locs = data.get('original_entry', {}).get('locations')
    if locs and isinstance(locs, list) and locs[0].get('city'):
        return locs[0]['city']
    return ''


def _has_identifier(data: Dict, scheme: str, value: Optional[str] = None) -> bool:
    """True if a top-level identifier with this scheme (and, when given,
    this exact value) already exists on the record."""
    for ident in data.get('identifiers', []):
        if not isinstance(ident, dict):
            continue
        if ident.get('identifier_scheme') != scheme:
            continue
        if value is None or ident.get('identifier_value') == value:
            return True
    return False


def _apply_enrichment(data: Dict, matched_wd: Dict, confidence: float, timestamp: str) -> None:
    """Mutate one custodian record in place with the matched Wikidata data.

    Adds the wikidata_enrichment block, an ISIL identifier (when Wikidata
    has one not already recorded), the Wikidata identifier itself, and a
    provenance note.
    """
    qid = matched_wd['qid']
    data['wikidata_enrichment'] = {
        'wikidata_id': qid,
        'wikidata_label': matched_wd['label'],
        'wikidata_url': f"https://www.wikidata.org/wiki/{qid}",
        'enrichment_date': timestamp,
        'enrichment_version': '2.1_generic',
        'enrichment_method': 'wikidata_fuzzy_match',
        'match_confidence': round(confidence, 1),
        'match_location': matched_wd.get('location', ''),
    }
    # Add ISIL from Wikidata, unless this exact ISIL is already present.
    if matched_wd.get('isil') and not _has_identifier(data, 'ISIL', matched_wd['isil']):
        data.setdefault('identifiers', []).append({
            'identifier_scheme': 'ISIL',
            'identifier_value': matched_wd['isil'],
            'identifier_source': 'wikidata'
        })
    # Add the Wikidata identifier (any existing one blocks the append).
    if not _has_identifier(data, 'Wikidata'):
        data.setdefault('identifiers', []).append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': qid,
            'identifier_url': f"https://www.wikidata.org/wiki/{qid}",
            'identifier_source': 'wikidata_fuzzy_match'
        })
    # Audit trail in provenance notes.
    data.setdefault('provenance', {}).setdefault('notes', []).append(
        f"Wikidata fuzzy match enrichment {timestamp}: "
        f"Matched to {qid} ({matched_wd['label']}) "
        f"with {confidence:.1f}% confidence"
    )


def enrich_with_wikidata(
    limit: Optional[int] = None,
    dry_run: bool = False,
    threshold: float = DEFAULT_THRESHOLD
):
    """Main enrichment workflow.

    Queries Wikidata once for all Czech heritage institutions, then fuzzy
    matches every unenriched CZ-* custodian file against that result set
    and writes matches back to disk (unless dry_run).

    Args:
        limit: Optional cap on the number of files processed.
        dry_run: Report matches without modifying any file.
        threshold: Minimum fuzzy-match similarity (0-100).
    """
    print("=" * 80)
    print("CZECH INSTITUTIONS - WIKIDATA FUZZY MATCHING ENRICHMENT")
    print("=" * 80)
    print()
    # Setup paths (script lives in scripts/, data lives beside it)
    custodian_dir = Path(__file__).parent.parent / "data" / "custodian"
    if not custodian_dir.exists():
        print(f"ERROR: Custodian directory not found: {custodian_dir}")
        sys.exit(1)
    # One bulk Wikidata query; the result list is reused for every file.
    wikidata_results = query_wikidata_czech_institutions()
    if not wikidata_results:
        print("No Wikidata results found. Exiting.")
        sys.exit(1)
    print()
    # Load unenriched files
    files_to_enrich = load_unenriched_files(custodian_dir, limit)
    if not files_to_enrich:
        print("No files need enrichment. Exiting.")
        return
    print()
    print(f"Fuzzy matching {len(files_to_enrich)} institutions...")
    print(f" Match threshold: {threshold}%")
    print(f" Dry run: {dry_run}")
    print()
    # Statistics
    matched = 0
    high_confidence = 0
    low_confidence = 0
    saved = 0
    errors = 0
    # One timestamp for the whole run so every record agrees.
    timestamp = datetime.now(timezone.utc).isoformat()
    for idx, (filepath, data) in enumerate(files_to_enrich, 1):
        # Progress indicator every 50 files and on the last file
        if idx % 50 == 0 or idx == len(files_to_enrich):
            print(f" [{idx}/{len(files_to_enrich)}] Matched: {matched}, Saved: {saved}")
        inst_name = _extract_institution_name(data)
        if not inst_name:
            continue
        inst_city = _extract_institution_city(data)
        # Fuzzy match against the cached Wikidata result set
        match_result = fuzzy_match_institution(
            inst_name,
            inst_city,
            wikidata_results,
            threshold=threshold
        )
        if not match_result:
            continue
        matched_wd, confidence = match_result
        matched += 1
        if confidence >= 95:
            high_confidence += 1
        else:
            low_confidence += 1
        if dry_run:
            print(f" [DRY RUN] Would match: {inst_name}")
            print(f" -> {matched_wd['qid']} ({matched_wd['label']}) [{confidence:.1f}%]")
            continue
        _apply_enrichment(data, matched_wd, confidence, timestamp)
        if save_enriched_file(filepath, data):
            saved += 1
        else:
            errors += 1
    # Final summary
    print()
    print("=" * 80)
    print("ENRICHMENT COMPLETE")
    print("=" * 80)
    print(f" Files scanned: {len(files_to_enrich)}")
    print(f" Matched: {matched} ({matched/len(files_to_enrich)*100:.1f}%)")
    print(f" High confidence (>=95%): {high_confidence}")
    print(f" Low confidence (<95%): {low_confidence}")
    if not dry_run:
        print(f" Saved: {saved}")
        print(f" Errors: {errors}")
    else:
        print(" [DRY RUN - no files modified]")
    print()
def main():
    """CLI entry point: parse command-line options and run the workflow."""
    parser = argparse.ArgumentParser(
        description="Enrich Czech custodian files with Wikidata via fuzzy matching"
    )
    parser.add_argument('--limit', '-l', type=int, default=None,
                        help='Limit number of files to process (for testing)')
    parser.add_argument('--dry-run', '-n', action='store_true',
                        help='Show what would be matched without saving')
    parser.add_argument('--threshold', '-t', type=float, default=DEFAULT_THRESHOLD,
                        help=f'Minimum similarity threshold (default: {DEFAULT_THRESHOLD})')
    args = parser.parse_args()
    enrich_with_wikidata(
        limit=args.limit,
        dry_run=args.dry_run,
        threshold=args.threshold,
    )


if __name__ == '__main__':
    main()