#!/usr/bin/env python3
"""
Japanese ISIL Registry Enrichment Script

Enriches 12,064 Japanese institutions with Wikidata and OSM data.
Uses batched queries to handle large dataset.
"""
import json
import time
import yaml
import requests
from datetime import datetime, timezone
from typing import List, Dict, Optional
from rapidfuzz import fuzz
from collections import Counter


def fetch_wikidata_japan() -> List[Dict]:
    """Query Wikidata for Japanese heritage institutions.

    Returns:
        A list of dicts with keys qid, label, alt_labels, isil, viaf,
        website, latitude, longitude, prefecture, city.
        Returns an empty list if the SPARQL request fails.
    """
    print("Querying Wikidata for Japanese institutions...")
    endpoint = "https://query.wikidata.org/sparql"
    # Types cover libraries, museums, archives and related GLAM classes;
    # P17 = country, Q17 = Japan.
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemAltLabel ?isil ?viaf ?website ?coords ?prefLabel ?cityLabel WHERE {
      VALUES ?type { wd:Q7075 wd:Q166118 wd:Q1030034 wd:Q33506 wd:Q636400 wd:Q212805 wd:Q213441 wd:Q1362225 wd:Q24398318 }
      ?item wdt:P31/wdt:P279* ?type .
      ?item wdt:P17 wd:Q17 .
      OPTIONAL { ?item wdt:P791 ?isil }
      OPTIONAL { ?item wdt:P214 ?viaf }
      OPTIONAL { ?item wdt:P856 ?website }
      OPTIONAL { ?item wdt:P625 ?coords }
      OPTIONAL { ?item wdt:P131 ?pref . ?pref rdfs:label ?prefLabel . FILTER(LANG(?prefLabel) = "en") }
      OPTIONAL { ?item wdt:P131/wdt:P131 ?city . ?city rdfs:label ?cityLabel . FILTER(LANG(?cityLabel) = "en") }
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "ja,en".
        ?item rdfs:label ?itemLabel .
        ?item skos:altLabel ?itemAltLabel .
      }
    }
    """
    try:
        response = requests.get(
            endpoint,
            params={'query': query, 'format': 'json'},
            headers={'User-Agent': 'GLAM-Extractor/1.0'},
            timeout=120
        )
        response.raise_for_status()
        data = response.json()

        institutions = []
        for result in data['results']['bindings']:
            item_id = result['item']['value'].split('/')[-1]

            # WKT literal is "Point(lon lat)" — longitude comes first.
            coords_str = result.get('coords', {}).get('value', '')
            lat, lon = None, None
            if coords_str and coords_str.startswith('Point('):
                parts = coords_str.replace('Point(', '').replace(')', '').split()
                if len(parts) == 2:
                    lon, lat = float(parts[0]), float(parts[1])

            # The label service joins alt labels with ", "; strip the padding
            # so fuzzy matching doesn't compare against " Name" variants.
            alt_raw = result.get('itemAltLabel', {}).get('value', '')
            alt_labels = [a.strip() for a in alt_raw.split(',') if a.strip()]

            institutions.append({
                'qid': item_id,
                'label': result.get('itemLabel', {}).get('value'),
                'alt_labels': alt_labels,
                'isil': result.get('isil', {}).get('value'),
                'viaf': result.get('viaf', {}).get('value'),
                'website': result.get('website', {}).get('value'),
                'latitude': lat,
                'longitude': lon,
                'prefecture': result.get('prefLabel', {}).get('value'),
                'city': result.get('cityLabel', {}).get('value')
            })

        print(f"✓ Found {len(institutions)} institutions in Wikidata")
        return institutions
    except Exception as e:
        print(f"✗ Wikidata query failed: {e}")
        return []


def fetch_osm_japan_batched() -> List[Dict]:
    """Fetch Japanese libraries/archives/museums from OSM (batched by region).

    Queries the Overpass API one bounding box at a time to keep each
    request small, sleeping between regions for rate limiting.

    Returns:
        A list of location dicts; regions that fail are skipped.
    """
    print("Fetching OSM data for Japan (batched queries)...")
    overpass_url = "https://overpass-api.de/api/interpreter"

    # Major regions to batch queries (bbox = "south,west,north,east").
    regions = [
        ("Hokkaido", "43.0,140.0,45.5,145.5"),
        ("Tohoku", "37.5,139.5,41.0,141.5"),
        ("Kanto", "34.5,138.5,36.5,140.5"),
        ("Chubu", "34.5,136.0,37.5,138.5"),
        ("Kansai", "33.5,134.5,35.5,136.0"),
        ("Chugoku", "33.5,131.0,35.5,134.5"),
        ("Shikoku", "32.5,132.5,34.5,134.5"),
        ("Kyushu", "30.0,128.5,34.0,132.0")
    ]

    all_libraries = []
    for region_name, bbox in regions:
        print(f" Fetching {region_name} region...")
        # Split bbox
        south, west, north, east = map(float, bbox.split(','))
        # NOTE: "out center;" attaches a computed center point to ways,
        # which the parser below requires ('center' in element). The
        # previous "out body; >; out skel qt;" form never produced a
        # center member, so all way results were silently dropped.
        query = f"""
        [out:json][timeout:60];
        (
          node["amenity"="library"]({south},{west},{north},{east});
          way["amenity"="library"]({south},{west},{north},{east});
          node["amenity"="archive"]({south},{west},{north},{east});
          way["amenity"="archive"]({south},{west},{north},{east});
          node["tourism"="museum"]({south},{west},{north},{east});
          way["tourism"="museum"]({south},{west},{north},{east});
        );
        out center;
        """
        try:
            response = requests.post(overpass_url, data={'data': query}, timeout=90)
            response.raise_for_status()
            data = response.json()

            region_start = len(all_libraries)
            for element in data.get('elements', []):
                # Nodes carry lat/lon directly; ways carry a 'center' dict.
                if element.get('type') == 'node' or (element.get('type') == 'way' and 'center' in element):
                    tags = element.get('tags', {})
                    name = tags.get('name') or tags.get('name:ja') or tags.get('name:en')
                    if name:
                        lat = element.get('lat') or element.get('center', {}).get('lat')
                        lon = element.get('lon') or element.get('center', {}).get('lon')
                        all_libraries.append({
                            'name': name,
                            'name_ja': tags.get('name:ja'),
                            'name_en': tags.get('name:en'),
                            'latitude': lat,
                            'longitude': lon,
                            'city': tags.get('addr:city'),
                            'prefecture': tags.get('addr:province') or tags.get('addr:state'),
                            'website': tags.get('website') or tags.get('contact:website'),
                            'wikidata': tags.get('wikidata'),
                            'osm_id': element.get('id'),
                            'region': region_name
                        })

            print(f" ✓ Found {len(all_libraries) - region_start} locations")
            time.sleep(2)  # Rate limiting between regions
        except Exception as e:
            print(f" ✗ Failed to fetch {region_name}: {e}")
            continue

    print(f"✓ Total OSM locations: {len(all_libraries)}")
    return all_libraries


def fuzzy_match_institution(inst_name: str, candidates: List[Dict],
                            name_fields: List[str],
                            threshold: int = 75) -> Optional[tuple]:
    """Fuzzy match institution name against candidates.

    Args:
        inst_name: Name to match.
        candidates: Candidate dicts to search.
        name_fields: Keys on each candidate holding a name (str or list of str).
        threshold: Minimum token_sort_ratio score (0-100) to accept.

    Returns:
        (best_candidate, score) if the best score meets the threshold,
        otherwise None.
    """
    best_match = None
    best_score = 0
    inst_name_clean = inst_name.lower().strip()

    for candidate in candidates:
        names_to_try = []
        for field in name_fields:
            value = candidate.get(field)
            if value:
                if isinstance(value, list):
                    names_to_try.extend(value)
                else:
                    names_to_try.append(value)

        for name in names_to_try:
            if not name:
                continue
            name_clean = name.lower().strip()
            # token_sort_ratio is word-order insensitive, which helps with
            # "X Library" vs "Library of X" style variants.
            score = fuzz.token_sort_ratio(inst_name_clean, name_clean)
            if score > best_score:
                best_score = score
                best_match = candidate

    if best_score >= threshold:
        return (best_match, best_score)
    return None


def enrich_institutions(base_file: str, wikidata_data: List[Dict],
                        osm_data: List[Dict]) -> tuple:
    """Enrich Japanese institutions with external data.

    Matching strategy per institution:
      1. Exact ISIL lookup against Wikidata (score 100).
      2. Fuzzy name match against Wikidata (threshold 80).
      3. Fuzzy name match against OSM, only for still-missing fields.

    Returns:
        (enrichments, stats) — only institutions with at least one match
        appear in enrichments.
    """
    print(f"\nLoading base institutions from {base_file}...")
    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"✓ Loaded {len(institutions)} institutions")

    print("\nEnriching institutions (this may take a while)...")
    enrichments = []
    stats = {
        'total': len(institutions),
        'wikidata_matched': 0,
        'viaf_added': 0,
        'coords_added': 0,
        'website_added': 0,
        'osm_matched': 0,
        'high_confidence': 0,
        'medium_confidence': 0,
        'isil_exact': 0
    }

    # Create ISIL lookup for fast exact matching
    wikidata_by_isil = {wd['isil']: wd for wd in wikidata_data if wd.get('isil')}
    print(f"✓ Built ISIL lookup index ({len(wikidata_by_isil)} entries)")

    for idx, inst in enumerate(institutions, 1):
        inst_name = inst.get('name', '')
        inst_id = inst.get('id')

        enrichment = {
            'id': inst_id,
            'name': inst_name,
            'isil': next((i['identifier_value'] for i in inst.get('identifiers', [])
                          if i.get('identifier_scheme') == 'ISIL'), None),
            'matches': []
        }

        # Try Wikidata match by ISIL first (exact)
        inst_isil = enrichment['isil']
        wikidata_match = None
        match_score = 0

        if inst_isil and inst_isil in wikidata_by_isil:
            wikidata_match = wikidata_by_isil[inst_isil]
            match_score = 100
            enrichment['matches'].append({
                'source': 'wikidata',
                'match_type': 'isil_exact',
                'score': 100,
                'qid': wikidata_match['qid']
            })
            stats['wikidata_matched'] += 1
            stats['isil_exact'] += 1
            stats['high_confidence'] += 1

        # Fuzzy match by name if no ISIL match
        if not wikidata_match:
            result = fuzzy_match_institution(
                inst_name, wikidata_data,
                ['label', 'alt_labels'],
                threshold=80  # Higher threshold for Japan (English names in ISIL registry)
            )
            if result:
                wikidata_match, match_score = result
                enrichment['matches'].append({
                    'source': 'wikidata',
                    'match_type': 'name_fuzzy',
                    'score': match_score,
                    'qid': wikidata_match['qid']
                })
                stats['wikidata_matched'] += 1
                if match_score >= 85:
                    stats['high_confidence'] += 1
                else:
                    stats['medium_confidence'] += 1

        # Add Wikidata enrichments
        if wikidata_match:
            if wikidata_match.get('viaf'):
                enrichment['viaf'] = wikidata_match['viaf']
                stats['viaf_added'] += 1
            if wikidata_match.get('website') and not inst.get('homepage'):
                enrichment['website'] = wikidata_match['website']
                stats['website_added'] += 1
            if wikidata_match.get('latitude') and wikidata_match.get('longitude'):
                # Guard against 'locations' existing as an empty list, which
                # would make [0] raise IndexError.
                location = (inst.get('locations') or [{}])[0]
                if not location.get('latitude'):
                    enrichment['latitude'] = wikidata_match['latitude']
                    enrichment['longitude'] = wikidata_match['longitude']
                    stats['coords_added'] += 1

        # Try OSM match (only if needed)
        if not enrichment.get('latitude') or not enrichment.get('website'):
            osm_result = fuzzy_match_institution(
                inst_name, osm_data,
                ['name', 'name_ja', 'name_en'],
                threshold=80
            )
            if osm_result:
                osm_match, osm_score = osm_result
                enrichment['matches'].append({
                    'source': 'osm',
                    'match_type': 'name_fuzzy',
                    'score': osm_score,
                    'osm_id': osm_match['osm_id']
                })
                stats['osm_matched'] += 1

                # Add OSM data if missing
                if osm_match.get('latitude') and osm_match.get('longitude'):
                    if 'latitude' not in enrichment:
                        enrichment['latitude'] = osm_match['latitude']
                        enrichment['longitude'] = osm_match['longitude']
                        stats['coords_added'] += 1
                if osm_match.get('website') and 'website' not in enrichment:
                    enrichment['website'] = osm_match['website']
                    stats['website_added'] += 1

        if enrichment['matches']:
            enrichments.append(enrichment)

        if idx % 500 == 0:
            print(f" Processed {idx}/{stats['total']} institutions ({idx/stats['total']*100:.1f}%)...")

    print(f"\n✓ Enrichment complete")
    print(f"\nStatistics:")
    print(f" Total institutions: {stats['total']}")
    print(f" Wikidata matches: {stats['wikidata_matched']} ({stats['wikidata_matched']/stats['total']*100:.1f}%)")
    print(f" - ISIL exact matches: {stats['isil_exact']}")
    print(f" - High confidence (≥85%): {stats['high_confidence']}")
    print(f" - Medium confidence (80-84%): {stats['medium_confidence']}")
    print(f" OSM matches: {stats['osm_matched']} ({stats['osm_matched']/stats['total']*100:.1f}%)")
    print(f" VIAF IDs added: {stats['viaf_added']}")
    print(f" Coordinates added: {stats['coords_added']}")
    print(f" Websites added: {stats['website_added']}")
    print(f" Total enriched: {len(enrichments)} ({len(enrichments)/stats['total']*100:.1f}%)")

    return enrichments, stats


def apply_enrichments(base_file: str, enrichments: List[Dict], output_file: str):
    """Apply enrichments to base YAML and save.

    Merges Wikidata/VIAF/Website identifiers, coordinates, and homepages
    into the base records, never overwriting values already present.
    """
    print(f"\nApplying enrichments to {base_file}...")
    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    enrichment_map = {e['id']: e for e in enrichments}
    enriched_count = 0

    for inst in institutions:
        inst_id = inst.get('id')
        if inst_id in enrichment_map:
            enrich_data = enrichment_map[inst_id]

            # Add Wikidata ID
            if any(m.get('qid') for m in enrich_data.get('matches', [])):
                qid = next(m['qid'] for m in enrich_data['matches'] if m.get('qid'))
                identifiers = inst.setdefault('identifiers', [])
                if not any(i.get('identifier_scheme') == 'Wikidata' for i in identifiers):
                    identifiers.append({
                        'identifier_scheme': 'Wikidata',
                        'identifier_value': qid,
                        'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
                    })

            # Add VIAF ID
            if 'viaf' in enrich_data:
                identifiers = inst.setdefault('identifiers', [])
                if not any(i.get('identifier_scheme') == 'VIAF' for i in identifiers):
                    identifiers.append({
                        'identifier_scheme': 'VIAF',
                        'identifier_value': enrich_data['viaf'],
                        'identifier_url': f'https://viaf.org/viaf/{enrich_data["viaf"]}'
                    })

            # Add coordinates
            if 'latitude' in enrich_data and 'longitude' in enrich_data:
                locations = inst.setdefault('locations', [])
                if not locations:
                    # Guard: an existing-but-empty list would make
                    # locations[0] raise IndexError.
                    locations.append({})
                if not locations[0].get('latitude'):
                    locations[0]['latitude'] = enrich_data['latitude']
                    locations[0]['longitude'] = enrich_data['longitude']

            # Add website
            if 'website' in enrich_data and not inst.get('homepage'):
                inst['homepage'] = enrich_data['website']
                identifiers = inst.setdefault('identifiers', [])
                if not any(i.get('identifier_scheme') == 'Website' for i in identifiers):
                    identifiers.append({
                        'identifier_scheme': 'Website',
                        'identifier_value': enrich_data['website'],
                        'identifier_url': enrich_data['website']
                    })

            enriched_count += 1

    # Save enriched YAML
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("---\n")
        f.write(f"# Japanese ISIL Registry - Enriched\n")
        f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"# Enriched: {enriched_count}/{len(institutions)} institutions\n")
        f.write(f"# Source: {base_file}\n\n")
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f"✓ Saved enriched data to {output_file}")
    print(f" Enriched {enriched_count}/{len(institutions)} institutions")


def main():
    """Run the full pipeline: fetch Wikidata, fetch OSM, enrich, apply."""
    print("=" * 70)
    print("Japanese ISIL Registry Enrichment")
    print("=" * 70)

    base_file = "data/instances/japan_isil_all.yaml"
    output_file = "data/instances/japan_complete.yaml"

    # Step 1: Fetch Wikidata
    wikidata_data = fetch_wikidata_japan()
    with open('data/isil/japan/japan_wikidata_institutions.json', 'w', encoding='utf-8') as f:
        json.dump(wikidata_data, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved Wikidata data to data/isil/japan/japan_wikidata_institutions.json")

    # Step 2: Fetch OSM data (batched)
    time.sleep(3)  # Rate limiting
    osm_data = fetch_osm_japan_batched()
    with open('data/isil/japan/japan_osm_libraries.json', 'w', encoding='utf-8') as f:
        json.dump(osm_data, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved OSM data to data/isil/japan/japan_osm_libraries.json")

    # Step 3: Enrich
    enrichments, stats = enrich_institutions(base_file, wikidata_data, osm_data)
    with open('data/isil/japan/japan_enrichments.json', 'w', encoding='utf-8') as f:
        json.dump({
            'enrichments': enrichments,
            'stats': stats,
            'generated': datetime.now(timezone.utc).isoformat()
        }, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved enrichment data to data/isil/japan/japan_enrichments.json")

    # Step 4: Apply enrichments
    apply_enrichments(base_file, enrichments, output_file)

    print("\n" + "=" * 70)
    print("✓ Japanese ISIL enrichment complete!")
    print("=" * 70)


if __name__ == '__main__':
    main()