glam/enrich_japan_isil.py
2025-11-19 23:25:22 +01:00

451 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Japanese ISIL Registry Enrichment Script
Enriches 12,064 Japanese institutions with Wikidata and OSM data.
Uses batched queries to handle large dataset.
"""
import json
import time
import yaml
import requests
from datetime import datetime, timezone
from typing import List, Dict, Optional
from rapidfuzz import fuzz
from collections import Counter
def fetch_wikidata_japan() -> List[Dict]:
    """Query Wikidata for Japanese heritage institutions.

    Runs one SPARQL query against the public Wikidata endpoint for
    libraries, archives, museums and related institution types located
    in Japan (wd:Q17), collecting ISIL, VIAF, website, coordinates and
    administrative-area labels.

    Returns:
        List of dicts with keys: qid, label, alt_labels, isil, viaf,
        website, latitude, longitude, prefecture, city. Returns an empty
        list on any request/parse failure (best-effort, logged to stdout).
    """
    print("Querying Wikidata for Japanese institutions...")
    endpoint = "https://query.wikidata.org/sparql"
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemAltLabel ?isil ?viaf ?website ?coords
    ?prefLabel ?cityLabel WHERE {
    VALUES ?type {
    wd:Q7075 wd:Q166118 wd:Q1030034 wd:Q33506 wd:Q636400
    wd:Q212805 wd:Q213441 wd:Q1362225 wd:Q24398318
    }
    ?item wdt:P31/wdt:P279* ?type .
    ?item wdt:P17 wd:Q17 .
    OPTIONAL { ?item wdt:P791 ?isil }
    OPTIONAL { ?item wdt:P214 ?viaf }
    OPTIONAL { ?item wdt:P856 ?website }
    OPTIONAL { ?item wdt:P625 ?coords }
    OPTIONAL { ?item wdt:P131 ?pref . ?pref rdfs:label ?prefLabel . FILTER(LANG(?prefLabel) = "en") }
    OPTIONAL { ?item wdt:P131/wdt:P131 ?city . ?city rdfs:label ?cityLabel . FILTER(LANG(?cityLabel) = "en") }
    SERVICE wikibase:label {
    bd:serviceParam wikibase:language "ja,en".
    ?item rdfs:label ?itemLabel .
    ?item skos:altLabel ?itemAltLabel .
    }
    }
    """
    try:
        response = requests.get(
            endpoint,
            params={'query': query, 'format': 'json'},
            headers={'User-Agent': 'GLAM-Extractor/1.0'},
            timeout=120
        )
        response.raise_for_status()
        data = response.json()
        institutions = []
        for result in data['results']['bindings']:
            item_id = result['item']['value'].split('/')[-1]
            # WKT literal looks like "Point(lon lat)"; guard float() so one
            # malformed value cannot abort the whole result set.
            coords_str = result.get('coords', {}).get('value', '')
            lat, lon = None, None
            if coords_str and coords_str.startswith('Point('):
                parts = coords_str.replace('Point(', '').replace(')', '').split()
                if len(parts) == 2:
                    try:
                        lon, lat = float(parts[0]), float(parts[1])
                    except ValueError:
                        lon, lat = None, None
            # The label service concatenates aliases with ", " — strip each
            # piece so fuzzy matching never sees leading whitespace.
            alt_raw = result.get('itemAltLabel', {}).get('value', '') if result.get('itemAltLabel') else ''
            alt_labels = [a.strip() for a in alt_raw.split(',') if a.strip()]
            institutions.append({
                'qid': item_id,
                'label': result.get('itemLabel', {}).get('value'),
                'alt_labels': alt_labels,
                'isil': result.get('isil', {}).get('value'),
                'viaf': result.get('viaf', {}).get('value'),
                'website': result.get('website', {}).get('value'),
                'latitude': lat,
                'longitude': lon,
                'prefecture': result.get('prefLabel', {}).get('value'),
                'city': result.get('cityLabel', {}).get('value')
            })
        print(f"✓ Found {len(institutions)} institutions in Wikidata")
        return institutions
    except Exception as e:
        # Best-effort: the pipeline continues with OSM-only enrichment.
        print(f"✗ Wikidata query failed: {e}")
        return []
def fetch_osm_japan_batched() -> List[Dict]:
    """Fetch Japanese libraries/archives/museums from OSM (batched by region).

    Queries the Overpass API once per major region (bounding box) to keep
    each request under the server timeout, with a 2s pause between regions.

    Returns:
        List of dicts (name, name_ja/en, lat/lon, city, prefecture, website,
        wikidata tag, osm_id, region). Regions that fail are skipped.
    """
    print("Fetching OSM data for Japan (batched queries)...")
    overpass_url = "https://overpass-api.de/api/interpreter"
    # Major regions to batch queries (south,west,north,east)
    regions = [
        ("Hokkaido", "43.0,140.0,45.5,145.5"),
        ("Tohoku", "37.5,139.5,41.0,141.5"),
        ("Kanto", "34.5,138.5,36.5,140.5"),
        ("Chubu", "34.5,136.0,37.5,138.5"),
        ("Kansai", "33.5,134.5,35.5,136.0"),
        ("Chugoku", "33.5,131.0,35.5,134.5"),
        ("Shikoku", "32.5,132.5,34.5,134.5"),
        ("Kyushu", "30.0,128.5,34.0,132.0")
    ]
    all_libraries = []
    for region_name, bbox in regions:
        print(f"  Fetching {region_name} region...")
        # Split bbox
        south, west, north, east = map(float, bbox.split(','))
        # BUG FIX: the previous query ended with "out body; >; out skel qt;",
        # which never attaches a 'center' member to ways, so the way branch
        # below could never match and all building outlines were dropped.
        # "out center;" emits tags plus lat/lon for nodes and a computed
        # 'center' for ways.
        query = f"""
        [out:json][timeout:60];
        (
        node["amenity"="library"]({south},{west},{north},{east});
        way["amenity"="library"]({south},{west},{north},{east});
        node["amenity"="archive"]({south},{west},{north},{east});
        way["amenity"="archive"]({south},{west},{north},{east});
        node["tourism"="museum"]({south},{west},{north},{east});
        way["tourism"="museum"]({south},{west},{north},{east});
        );
        out center;
        """
        try:
            response = requests.post(overpass_url, data={'data': query}, timeout=90)
            response.raise_for_status()
            data = response.json()
            region_count = 0
            for element in data.get('elements', []):
                if element.get('type') == 'node' or (element.get('type') == 'way' and 'center' in element):
                    tags = element.get('tags', {})
                    name = tags.get('name') or tags.get('name:ja') or tags.get('name:en')
                    if name:
                        # Nodes carry lat/lon directly; ways carry a 'center'.
                        lat = element.get('lat') or element.get('center', {}).get('lat')
                        lon = element.get('lon') or element.get('center', {}).get('lon')
                        all_libraries.append({
                            'name': name,
                            'name_ja': tags.get('name:ja'),
                            'name_en': tags.get('name:en'),
                            'latitude': lat,
                            'longitude': lon,
                            'city': tags.get('addr:city'),
                            'prefecture': tags.get('addr:province') or tags.get('addr:state'),
                            'website': tags.get('website') or tags.get('contact:website'),
                            'wikidata': tags.get('wikidata'),
                            'osm_id': element.get('id'),
                            'region': region_name
                        })
                        region_count += 1
            print(f"    ✓ Found {region_count} locations")
            time.sleep(2)  # Rate limiting between regions
        except Exception as e:
            # Skip a failed region rather than aborting the whole fetch.
            print(f"    ✗ Failed to fetch {region_name}: {e}")
            continue
    print(f"✓ Total OSM locations: {len(all_libraries)}")
    return all_libraries
def fuzzy_match_institution(inst_name: str, candidates: List[Dict], name_fields: List[str], threshold: int = 75) -> Optional[tuple]:
    """Return the best fuzzy name match for *inst_name*, or None.

    Each candidate contributes every value found in *name_fields*
    (list-valued fields contribute each element). Scoring uses
    token_sort_ratio on lowercased, stripped strings; a result is
    returned only when the best score reaches *threshold*.

    Returns:
        (candidate_dict, score) tuple, or None if nothing clears threshold.
    """
    target = inst_name.lower().strip()
    top_candidate = None
    top_score = 0
    for entry in candidates:
        # Gather every name variant this candidate offers.
        variants = []
        for field in name_fields:
            raw = entry.get(field)
            if not raw:
                continue
            if isinstance(raw, list):
                variants.extend(raw)
            else:
                variants.append(raw)
        for variant in variants:
            if not variant:
                continue
            score = fuzz.token_sort_ratio(target, variant.lower().strip())
            if score > top_score:
                top_candidate, top_score = entry, score
    if top_score >= threshold:
        return (top_candidate, top_score)
    return None
def enrich_institutions(base_file: str, wikidata_data: List[Dict], osm_data: List[Dict]) -> tuple:
    """Enrich Japanese institutions with external data.

    For each base institution: try an exact ISIL match against Wikidata,
    fall back to fuzzy name matching, then consult OSM (fuzzy name match)
    only when coordinates or website are still missing. Base records are
    not mutated; proposed changes are collected as enrichment dicts.

    Args:
        base_file: Path to the base institutions YAML file.
        wikidata_data: Records from fetch_wikidata_japan().
        osm_data: Records from fetch_osm_japan_batched().

    Returns:
        (enrichments, stats): enrichments is a list of per-institution
        dicts (id, name, isil, matches, plus optional viaf/website/
        latitude/longitude); stats is the counter dict printed at the end.
    """
    print(f"\nLoading base institutions from {base_file}...")
    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"✓ Loaded {len(institutions)} institutions")
    print("\nEnriching institutions (this may take a while)...")
    enrichments = []
    stats = {
        'total': len(institutions),
        'wikidata_matched': 0,
        'viaf_added': 0,
        'coords_added': 0,
        'website_added': 0,
        'osm_matched': 0,
        'high_confidence': 0,
        'medium_confidence': 0,
        'isil_exact': 0
    }
    # Create ISIL lookup for fast exact matching (O(1) per institution).
    wikidata_by_isil = {wd['isil']: wd for wd in wikidata_data if wd.get('isil')}
    print(f"✓ Built ISIL lookup index ({len(wikidata_by_isil)} entries)")
    for idx, inst in enumerate(institutions, 1):
        inst_name = inst.get('name', '')
        inst_id = inst.get('id')
        enrichment = {
            'id': inst_id,
            'name': inst_name,
            'isil': next((i['identifier_value'] for i in inst.get('identifiers', []) if i.get('identifier_scheme') == 'ISIL'), None),
            'matches': []
        }
        # Try Wikidata match by ISIL first (exact, always high confidence).
        inst_isil = enrichment['isil']
        wikidata_match = None
        match_score = 0
        if inst_isil and inst_isil in wikidata_by_isil:
            wikidata_match = wikidata_by_isil[inst_isil]
            match_score = 100
            enrichment['matches'].append({
                'source': 'wikidata',
                'match_type': 'isil_exact',
                'score': 100,
                'qid': wikidata_match['qid']
            })
            stats['wikidata_matched'] += 1
            stats['isil_exact'] += 1
            stats['high_confidence'] += 1
        # Fuzzy match by name if no ISIL match.
        if not wikidata_match:
            result = fuzzy_match_institution(
                inst_name,
                wikidata_data,
                ['label', 'alt_labels'],
                threshold=80  # Higher threshold for Japan (English names in ISIL registry)
            )
            if result:
                wikidata_match, match_score = result
                enrichment['matches'].append({
                    'source': 'wikidata',
                    'match_type': 'name_fuzzy',
                    'score': match_score,
                    'qid': wikidata_match['qid']
                })
                stats['wikidata_matched'] += 1
                if match_score >= 85:
                    stats['high_confidence'] += 1
                else:
                    stats['medium_confidence'] += 1
        # Add Wikidata enrichments (only fields the base record lacks).
        if wikidata_match:
            if wikidata_match.get('viaf'):
                enrichment['viaf'] = wikidata_match['viaf']
                stats['viaf_added'] += 1
            if wikidata_match.get('website') and not inst.get('homepage'):
                enrichment['website'] = wikidata_match['website']
                stats['website_added'] += 1
            if wikidata_match.get('latitude') and wikidata_match.get('longitude'):
                # BUG FIX: `inst.get('locations', [{}])[0]` raised IndexError
                # when 'locations' existed but was an empty list.
                location = (inst.get('locations') or [{}])[0]
                if not location.get('latitude'):
                    enrichment['latitude'] = wikidata_match['latitude']
                    enrichment['longitude'] = wikidata_match['longitude']
                    stats['coords_added'] += 1
        # Try OSM match (only if coordinates or website are still missing).
        if not enrichment.get('latitude') or not enrichment.get('website'):
            osm_result = fuzzy_match_institution(
                inst_name,
                osm_data,
                ['name', 'name_ja', 'name_en'],
                threshold=80
            )
            if osm_result:
                osm_match, osm_score = osm_result
                enrichment['matches'].append({
                    'source': 'osm',
                    'match_type': 'name_fuzzy',
                    'score': osm_score,
                    'osm_id': osm_match['osm_id']
                })
                stats['osm_matched'] += 1
                # Add OSM data only where Wikidata didn't already supply it.
                if osm_match.get('latitude') and osm_match.get('longitude'):
                    if 'latitude' not in enrichment:
                        enrichment['latitude'] = osm_match['latitude']
                        enrichment['longitude'] = osm_match['longitude']
                        stats['coords_added'] += 1
                if osm_match.get('website') and 'website' not in enrichment:
                    enrichment['website'] = osm_match['website']
                    stats['website_added'] += 1
        if enrichment['matches']:
            enrichments.append(enrichment)
        if idx % 500 == 0:
            print(f"  Processed {idx}/{stats['total']} institutions ({idx/stats['total']*100:.1f}%)...")
    # Guard the summary percentages against an empty base file (total == 0
    # previously raised ZeroDivisionError).
    denom = stats['total'] or 1
    print(f"\n✓ Enrichment complete")
    print(f"\nStatistics:")
    print(f"  Total institutions: {stats['total']}")
    print(f"  Wikidata matches: {stats['wikidata_matched']} ({stats['wikidata_matched']/denom*100:.1f}%)")
    print(f"    - ISIL exact matches: {stats['isil_exact']}")
    print(f"    - High confidence (≥85%): {stats['high_confidence']}")
    print(f"    - Medium confidence (80-84%): {stats['medium_confidence']}")
    print(f"  OSM matches: {stats['osm_matched']} ({stats['osm_matched']/denom*100:.1f}%)")
    print(f"  VIAF IDs added: {stats['viaf_added']}")
    print(f"  Coordinates added: {stats['coords_added']}")
    print(f"  Websites added: {stats['website_added']}")
    print(f"  Total enriched: {len(enrichments)} ({len(enrichments)/denom*100:.1f}%)")
    return enrichments, stats
def apply_enrichments(base_file: str, enrichments: List[Dict], output_file: str):
    """Apply enrichments to base YAML and save.

    Re-loads the base file, merges each enrichment into its matching
    institution (by id) — adding Wikidata/VIAF/Website identifiers,
    coordinates and homepage only where missing — and writes the result
    to *output_file* with a provenance header.

    Args:
        base_file: Path to the base institutions YAML file.
        enrichments: Enrichment dicts from enrich_institutions().
        output_file: Destination path for the enriched YAML.
    """
    print(f"\nApplying enrichments to {base_file}...")
    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    enrichment_map = {e['id']: e for e in enrichments}
    enriched_count = 0
    for inst in institutions:
        inst_id = inst.get('id')
        if inst_id in enrichment_map:
            enrich_data = enrichment_map[inst_id]
            # Add Wikidata ID (first match that carries a qid).
            if any(m.get('qid') for m in enrich_data.get('matches', [])):
                qid = next(m['qid'] for m in enrich_data['matches'] if m.get('qid'))
                identifiers = inst.setdefault('identifiers', [])
                if not any(i.get('identifier_scheme') == 'Wikidata' for i in identifiers):
                    identifiers.append({
                        'identifier_scheme': 'Wikidata',
                        'identifier_value': qid,
                        'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
                    })
            # Add VIAF ID
            if 'viaf' in enrich_data:
                identifiers = inst.setdefault('identifiers', [])
                if not any(i.get('identifier_scheme') == 'VIAF' for i in identifiers):
                    identifiers.append({
                        'identifier_scheme': 'VIAF',
                        'identifier_value': enrich_data['viaf'],
                        'identifier_url': f'https://viaf.org/viaf/{enrich_data["viaf"]}'
                    })
            # Add coordinates
            if 'latitude' in enrich_data and 'longitude' in enrich_data:
                locations = inst.setdefault('locations', [{}])
                # BUG FIX: setdefault returns the EXISTING list when the key
                # is present, so an existing empty list made locations[0]
                # raise IndexError. Ensure at least one location dict.
                if not locations:
                    locations.append({})
                if not locations[0].get('latitude'):
                    locations[0]['latitude'] = enrich_data['latitude']
                    locations[0]['longitude'] = enrich_data['longitude']
            # Add website
            if 'website' in enrich_data and not inst.get('homepage'):
                inst['homepage'] = enrich_data['website']
                identifiers = inst.setdefault('identifiers', [])
                if not any(i.get('identifier_scheme') == 'Website' for i in identifiers):
                    identifiers.append({
                        'identifier_scheme': 'Website',
                        'identifier_value': enrich_data['website'],
                        'identifier_url': enrich_data['website']
                    })
            enriched_count += 1
    # Save enriched YAML with a provenance header comment block.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("---\n")
        f.write(f"# Japanese ISIL Registry - Enriched\n")
        f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"# Enriched: {enriched_count}/{len(institutions)} institutions\n")
        f.write(f"# Source: {base_file}\n\n")
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"✓ Saved enriched data to {output_file}")
    print(f"  Enriched {enriched_count}/{len(institutions)} institutions")
def main():
    """Run the full enrichment pipeline: fetch, match, apply, report."""
    banner = "=" * 70
    print(banner)
    print("Japanese ISIL Registry Enrichment")
    print(banner)
    base_file = "data/instances/japan_isil_all.yaml"
    output_file = "data/instances/japan_complete.yaml"

    def save_json(path, payload):
        # Persist intermediate artifacts so results can be inspected/reused.
        with open(path, 'w', encoding='utf-8') as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)

    # Step 1: Fetch Wikidata
    wikidata_data = fetch_wikidata_japan()
    save_json('data/isil/japan/japan_wikidata_institutions.json', wikidata_data)
    print("✓ Saved Wikidata data to data/isil/japan/japan_wikidata_institutions.json")

    # Step 2: Fetch OSM data (batched)
    time.sleep(3)  # Rate limiting
    osm_data = fetch_osm_japan_batched()
    save_json('data/isil/japan/japan_osm_libraries.json', osm_data)
    print("✓ Saved OSM data to data/isil/japan/japan_osm_libraries.json")

    # Step 3: Enrich
    enrichments, stats = enrich_institutions(base_file, wikidata_data, osm_data)
    save_json('data/isil/japan/japan_enrichments.json', {
        'enrichments': enrichments,
        'stats': stats,
        'generated': datetime.now(timezone.utc).isoformat()
    })
    print("✓ Saved enrichment data to data/isil/japan/japan_enrichments.json")

    # Step 4: Apply enrichments
    apply_enrichments(base_file, enrichments, output_file)
    print("\n" + banner)
    print("✓ Japanese ISIL enrichment complete!")
    print(banner)
# Script entry point: run the full enrichment pipeline when executed directly.
if __name__ == '__main__':
    main()