glam/enrich_japan_isil.py
2025-11-19 23:25:22 +01:00

451 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Japanese ISIL Registry Enrichment Script
Enriches 12,064 Japanese institutions with Wikidata and OSM data.
Uses batched queries to handle large dataset.
"""
import json
import time
import yaml
import requests
from datetime import datetime, timezone
from typing import List, Dict, Optional
from rapidfuzz import fuzz
from collections import Counter
def fetch_wikidata_japan() -> List[Dict]:
    """Query Wikidata for Japanese heritage institutions.

    Runs one SPARQL query against the public Wikidata endpoint for
    libraries, archives, museums and related institution types located
    in Japan (wd:Q17), collecting ISIL, VIAF, website, coordinates and
    administrative-area labels.

    Returns:
        List of dicts with keys: qid, label, alt_labels, isil, viaf,
        website, latitude, longitude, prefecture, city. Returns an empty
        list on any request/parse failure (best-effort, logged to stdout).
    """
    print("Querying Wikidata for Japanese institutions...")
    endpoint = "https://query.wikidata.org/sparql"
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemAltLabel ?isil ?viaf ?website ?coords
    ?prefLabel ?cityLabel WHERE {
    VALUES ?type {
    wd:Q7075 wd:Q166118 wd:Q1030034 wd:Q33506 wd:Q636400
    wd:Q212805 wd:Q213441 wd:Q1362225 wd:Q24398318
    }
    ?item wdt:P31/wdt:P279* ?type .
    ?item wdt:P17 wd:Q17 .
    OPTIONAL { ?item wdt:P791 ?isil }
    OPTIONAL { ?item wdt:P214 ?viaf }
    OPTIONAL { ?item wdt:P856 ?website }
    OPTIONAL { ?item wdt:P625 ?coords }
    OPTIONAL { ?item wdt:P131 ?pref . ?pref rdfs:label ?prefLabel . FILTER(LANG(?prefLabel) = "en") }
    OPTIONAL { ?item wdt:P131/wdt:P131 ?city . ?city rdfs:label ?cityLabel . FILTER(LANG(?cityLabel) = "en") }
    SERVICE wikibase:label {
    bd:serviceParam wikibase:language "ja,en".
    ?item rdfs:label ?itemLabel .
    ?item skos:altLabel ?itemAltLabel .
    }
    }
    """
    try:
        response = requests.get(
            endpoint,
            params={'query': query, 'format': 'json'},
            headers={'User-Agent': 'GLAM-Extractor/1.0'},
            timeout=120
        )
        response.raise_for_status()
        data = response.json()
        institutions = []
        for result in data['results']['bindings']:
            item_id = result['item']['value'].split('/')[-1]
            # WKT literal looks like "Point(lon lat)"; guard float() so one
            # malformed value cannot abort the whole result set.
            coords_str = result.get('coords', {}).get('value', '')
            lat, lon = None, None
            if coords_str and coords_str.startswith('Point('):
                parts = coords_str.replace('Point(', '').replace(')', '').split()
                if len(parts) == 2:
                    try:
                        lon, lat = float(parts[0]), float(parts[1])
                    except ValueError:
                        lon, lat = None, None
            # The label service concatenates aliases with ", " — strip each
            # piece so fuzzy matching never sees leading whitespace.
            alt_raw = result.get('itemAltLabel', {}).get('value', '') if result.get('itemAltLabel') else ''
            alt_labels = [a.strip() for a in alt_raw.split(',') if a.strip()]
            institutions.append({
                'qid': item_id,
                'label': result.get('itemLabel', {}).get('value'),
                'alt_labels': alt_labels,
                'isil': result.get('isil', {}).get('value'),
                'viaf': result.get('viaf', {}).get('value'),
                'website': result.get('website', {}).get('value'),
                'latitude': lat,
                'longitude': lon,
                'prefecture': result.get('prefLabel', {}).get('value'),
                'city': result.get('cityLabel', {}).get('value')
            })
        print(f"✓ Found {len(institutions)} institutions in Wikidata")
        return institutions
    except Exception as e:
        # Best-effort: the pipeline continues with OSM-only enrichment.
        print(f"✗ Wikidata query failed: {e}")
        return []
def fetch_osm_japan_batched() -> List[Dict]:
    """Fetch Japanese libraries/archives/museums from OSM (batched by region).

    Queries the Overpass API once per major region (bounding box) to keep
    each request under the server timeout, with a 2s pause between regions.

    Returns:
        List of dicts (name, name_ja/en, lat/lon, city, prefecture, website,
        wikidata tag, osm_id, region). Regions that fail are skipped.
    """
    print("Fetching OSM data for Japan (batched queries)...")
    overpass_url = "https://overpass-api.de/api/interpreter"
    # Major regions to batch queries (south,west,north,east)
    regions = [
        ("Hokkaido", "43.0,140.0,45.5,145.5"),
        ("Tohoku", "37.5,139.5,41.0,141.5"),
        ("Kanto", "34.5,138.5,36.5,140.5"),
        ("Chubu", "34.5,136.0,37.5,138.5"),
        ("Kansai", "33.5,134.5,35.5,136.0"),
        ("Chugoku", "33.5,131.0,35.5,134.5"),
        ("Shikoku", "32.5,132.5,34.5,134.5"),
        ("Kyushu", "30.0,128.5,34.0,132.0")
    ]
    all_libraries = []
    for region_name, bbox in regions:
        print(f"  Fetching {region_name} region...")
        # Split bbox
        south, west, north, east = map(float, bbox.split(','))
        # BUG FIX: the previous query ended with "out body; >; out skel qt;",
        # which never attaches a 'center' member to ways, so the way branch
        # below could never match and all building outlines were dropped.
        # "out center;" emits tags plus lat/lon for nodes and a computed
        # 'center' for ways.
        query = f"""
        [out:json][timeout:60];
        (
        node["amenity"="library"]({south},{west},{north},{east});
        way["amenity"="library"]({south},{west},{north},{east});
        node["amenity"="archive"]({south},{west},{north},{east});
        way["amenity"="archive"]({south},{west},{north},{east});
        node["tourism"="museum"]({south},{west},{north},{east});
        way["tourism"="museum"]({south},{west},{north},{east});
        );
        out center;
        """
        try:
            response = requests.post(overpass_url, data={'data': query}, timeout=90)
            response.raise_for_status()
            data = response.json()
            region_count = 0
            for element in data.get('elements', []):
                if element.get('type') == 'node' or (element.get('type') == 'way' and 'center' in element):
                    tags = element.get('tags', {})
                    name = tags.get('name') or tags.get('name:ja') or tags.get('name:en')
                    if name:
                        # Nodes carry lat/lon directly; ways carry a 'center'.
                        lat = element.get('lat') or element.get('center', {}).get('lat')
                        lon = element.get('lon') or element.get('center', {}).get('lon')
                        all_libraries.append({
                            'name': name,
                            'name_ja': tags.get('name:ja'),
                            'name_en': tags.get('name:en'),
                            'latitude': lat,
                            'longitude': lon,
                            'city': tags.get('addr:city'),
                            'prefecture': tags.get('addr:province') or tags.get('addr:state'),
                            'website': tags.get('website') or tags.get('contact:website'),
                            'wikidata': tags.get('wikidata'),
                            'osm_id': element.get('id'),
                            'region': region_name
                        })
                        region_count += 1
            print(f"    ✓ Found {region_count} locations")
            time.sleep(2)  # Rate limiting between regions
        except Exception as e:
            # Skip a failed region rather than aborting the whole fetch.
            print(f"    ✗ Failed to fetch {region_name}: {e}")
            continue
    print(f"✓ Total OSM locations: {len(all_libraries)}")
    return all_libraries
def fuzzy_match_institution(inst_name: str, candidates: List[Dict], name_fields: List[str], threshold: int = 75) -> Optional[tuple]:
    """Return the best fuzzy name match for *inst_name*, or None.

    Each candidate contributes every value found in *name_fields*
    (list-valued fields contribute each element). Scoring uses
    token_sort_ratio on lowercased, stripped strings; a result is
    returned only when the best score reaches *threshold*.

    Returns:
        (candidate_dict, score) tuple, or None if nothing clears threshold.
    """
    target = inst_name.lower().strip()
    top_candidate = None
    top_score = 0
    for entry in candidates:
        # Gather every name variant this candidate offers.
        variants = []
        for field in name_fields:
            raw = entry.get(field)
            if not raw:
                continue
            if isinstance(raw, list):
                variants.extend(raw)
            else:
                variants.append(raw)
        for variant in variants:
            if not variant:
                continue
            score = fuzz.token_sort_ratio(target, variant.lower().strip())
            if score > top_score:
                top_candidate, top_score = entry, score
    if top_score >= threshold:
        return (top_candidate, top_score)
    return None
def enrich_institutions(base_file: str, wikidata_data: List[Dict], osm_data: List[Dict]) -> tuple:
    """Enrich Japanese institutions with external data.

    For each base institution: try an exact ISIL match against Wikidata,
    fall back to fuzzy name matching, then consult OSM (fuzzy name match)
    only when coordinates or website are still missing. Base records are
    not mutated; proposed changes are collected as enrichment dicts.

    Args:
        base_file: Path to the base institutions YAML file.
        wikidata_data: Records from fetch_wikidata_japan().
        osm_data: Records from fetch_osm_japan_batched().

    Returns:
        (enrichments, stats): enrichments is a list of per-institution
        dicts (id, name, isil, matches, plus optional viaf/website/
        latitude/longitude); stats is the counter dict printed at the end.
    """
    print(f"\nLoading base institutions from {base_file}...")
    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"✓ Loaded {len(institutions)} institutions")
    print("\nEnriching institutions (this may take a while)...")
    enrichments = []
    stats = {
        'total': len(institutions),
        'wikidata_matched': 0,
        'viaf_added': 0,
        'coords_added': 0,
        'website_added': 0,
        'osm_matched': 0,
        'high_confidence': 0,
        'medium_confidence': 0,
        'isil_exact': 0
    }
    # Create ISIL lookup for fast exact matching (O(1) per institution).
    wikidata_by_isil = {wd['isil']: wd for wd in wikidata_data if wd.get('isil')}
    print(f"✓ Built ISIL lookup index ({len(wikidata_by_isil)} entries)")
    for idx, inst in enumerate(institutions, 1):
        inst_name = inst.get('name', '')
        inst_id = inst.get('id')
        enrichment = {
            'id': inst_id,
            'name': inst_name,
            'isil': next((i['identifier_value'] for i in inst.get('identifiers', []) if i.get('identifier_scheme') == 'ISIL'), None),
            'matches': []
        }
        # Try Wikidata match by ISIL first (exact, always high confidence).
        inst_isil = enrichment['isil']
        wikidata_match = None
        match_score = 0
        if inst_isil and inst_isil in wikidata_by_isil:
            wikidata_match = wikidata_by_isil[inst_isil]
            match_score = 100
            enrichment['matches'].append({
                'source': 'wikidata',
                'match_type': 'isil_exact',
                'score': 100,
                'qid': wikidata_match['qid']
            })
            stats['wikidata_matched'] += 1
            stats['isil_exact'] += 1
            stats['high_confidence'] += 1
        # Fuzzy match by name if no ISIL match.
        if not wikidata_match:
            result = fuzzy_match_institution(
                inst_name,
                wikidata_data,
                ['label', 'alt_labels'],
                threshold=80  # Higher threshold for Japan (English names in ISIL registry)
            )
            if result:
                wikidata_match, match_score = result
                enrichment['matches'].append({
                    'source': 'wikidata',
                    'match_type': 'name_fuzzy',
                    'score': match_score,
                    'qid': wikidata_match['qid']
                })
                stats['wikidata_matched'] += 1
                if match_score >= 85:
                    stats['high_confidence'] += 1
                else:
                    stats['medium_confidence'] += 1
        # Add Wikidata enrichments (only fields the base record lacks).
        if wikidata_match:
            if wikidata_match.get('viaf'):
                enrichment['viaf'] = wikidata_match['viaf']
                stats['viaf_added'] += 1
            if wikidata_match.get('website') and not inst.get('homepage'):
                enrichment['website'] = wikidata_match['website']
                stats['website_added'] += 1
            if wikidata_match.get('latitude') and wikidata_match.get('longitude'):
                # BUG FIX: `inst.get('locations', [{}])[0]` raised IndexError
                # when 'locations' existed but was an empty list.
                location = (inst.get('locations') or [{}])[0]
                if not location.get('latitude'):
                    enrichment['latitude'] = wikidata_match['latitude']
                    enrichment['longitude'] = wikidata_match['longitude']
                    stats['coords_added'] += 1
        # Try OSM match (only if coordinates or website are still missing).
        if not enrichment.get('latitude') or not enrichment.get('website'):
            osm_result = fuzzy_match_institution(
                inst_name,
                osm_data,
                ['name', 'name_ja', 'name_en'],
                threshold=80
            )
            if osm_result:
                osm_match, osm_score = osm_result
                enrichment['matches'].append({
                    'source': 'osm',
                    'match_type': 'name_fuzzy',
                    'score': osm_score,
                    'osm_id': osm_match['osm_id']
                })
                stats['osm_matched'] += 1
                # Add OSM data only where Wikidata didn't already supply it.
                if osm_match.get('latitude') and osm_match.get('longitude'):
                    if 'latitude' not in enrichment:
                        enrichment['latitude'] = osm_match['latitude']
                        enrichment['longitude'] = osm_match['longitude']
                        stats['coords_added'] += 1
                if osm_match.get('website') and 'website' not in enrichment:
                    enrichment['website'] = osm_match['website']
                    stats['website_added'] += 1
        if enrichment['matches']:
            enrichments.append(enrichment)
        if idx % 500 == 0:
            print(f"  Processed {idx}/{stats['total']} institutions ({idx/stats['total']*100:.1f}%)...")
    # Guard the summary percentages against an empty base file (total == 0
    # previously raised ZeroDivisionError).
    denom = stats['total'] or 1
    print(f"\n✓ Enrichment complete")
    print(f"\nStatistics:")
    print(f"  Total institutions: {stats['total']}")
    print(f"  Wikidata matches: {stats['wikidata_matched']} ({stats['wikidata_matched']/denom*100:.1f}%)")
    print(f"    - ISIL exact matches: {stats['isil_exact']}")
    print(f"    - High confidence (≥85%): {stats['high_confidence']}")
    print(f"    - Medium confidence (80-84%): {stats['medium_confidence']}")
    print(f"  OSM matches: {stats['osm_matched']} ({stats['osm_matched']/denom*100:.1f}%)")
    print(f"  VIAF IDs added: {stats['viaf_added']}")
    print(f"  Coordinates added: {stats['coords_added']}")
    print(f"  Websites added: {stats['website_added']}")
    print(f"  Total enriched: {len(enrichments)} ({len(enrichments)/denom*100:.1f}%)")
    return enrichments, stats
def apply_enrichments(base_file: str, enrichments: List[Dict], output_file: str):
    """Apply enrichments to base YAML and save.

    Re-loads the base file, merges each enrichment into its matching
    institution (by id) — adding Wikidata/VIAF/Website identifiers,
    coordinates and homepage only where missing — and writes the result
    to *output_file* with a provenance header.

    Args:
        base_file: Path to the base institutions YAML file.
        enrichments: Enrichment dicts from enrich_institutions().
        output_file: Destination path for the enriched YAML.
    """
    print(f"\nApplying enrichments to {base_file}...")
    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    enrichment_map = {e['id']: e for e in enrichments}
    enriched_count = 0
    for inst in institutions:
        inst_id = inst.get('id')
        if inst_id in enrichment_map:
            enrich_data = enrichment_map[inst_id]
            # Add Wikidata ID (first match that carries a qid).
            if any(m.get('qid') for m in enrich_data.get('matches', [])):
                qid = next(m['qid'] for m in enrich_data['matches'] if m.get('qid'))
                identifiers = inst.setdefault('identifiers', [])
                if not any(i.get('identifier_scheme') == 'Wikidata' for i in identifiers):
                    identifiers.append({
                        'identifier_scheme': 'Wikidata',
                        'identifier_value': qid,
                        'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
                    })
            # Add VIAF ID
            if 'viaf' in enrich_data:
                identifiers = inst.setdefault('identifiers', [])
                if not any(i.get('identifier_scheme') == 'VIAF' for i in identifiers):
                    identifiers.append({
                        'identifier_scheme': 'VIAF',
                        'identifier_value': enrich_data['viaf'],
                        'identifier_url': f'https://viaf.org/viaf/{enrich_data["viaf"]}'
                    })
            # Add coordinates
            if 'latitude' in enrich_data and 'longitude' in enrich_data:
                locations = inst.setdefault('locations', [{}])
                # BUG FIX: setdefault returns the EXISTING list when the key
                # is present, so an existing empty list made locations[0]
                # raise IndexError. Ensure at least one location dict.
                if not locations:
                    locations.append({})
                if not locations[0].get('latitude'):
                    locations[0]['latitude'] = enrich_data['latitude']
                    locations[0]['longitude'] = enrich_data['longitude']
            # Add website
            if 'website' in enrich_data and not inst.get('homepage'):
                inst['homepage'] = enrich_data['website']
                identifiers = inst.setdefault('identifiers', [])
                if not any(i.get('identifier_scheme') == 'Website' for i in identifiers):
                    identifiers.append({
                        'identifier_scheme': 'Website',
                        'identifier_value': enrich_data['website'],
                        'identifier_url': enrich_data['website']
                    })
            enriched_count += 1
    # Save enriched YAML with a provenance header comment block.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("---\n")
        f.write(f"# Japanese ISIL Registry - Enriched\n")
        f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"# Enriched: {enriched_count}/{len(institutions)} institutions\n")
        f.write(f"# Source: {base_file}\n\n")
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"✓ Saved enriched data to {output_file}")
    print(f"  Enriched {enriched_count}/{len(institutions)} institutions")
def main():
    """Run the full enrichment pipeline: fetch, match, apply, report."""
    banner = "=" * 70
    print(banner)
    print("Japanese ISIL Registry Enrichment")
    print(banner)
    base_file = "data/instances/japan_isil_all.yaml"
    output_file = "data/instances/japan_complete.yaml"

    def save_json(path, payload):
        # Persist intermediate artifacts so results can be inspected/reused.
        with open(path, 'w', encoding='utf-8') as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)

    # Step 1: Fetch Wikidata
    wikidata_data = fetch_wikidata_japan()
    save_json('data/isil/japan/japan_wikidata_institutions.json', wikidata_data)
    print("✓ Saved Wikidata data to data/isil/japan/japan_wikidata_institutions.json")

    # Step 2: Fetch OSM data (batched)
    time.sleep(3)  # Rate limiting
    osm_data = fetch_osm_japan_batched()
    save_json('data/isil/japan/japan_osm_libraries.json', osm_data)
    print("✓ Saved OSM data to data/isil/japan/japan_osm_libraries.json")

    # Step 3: Enrich
    enrichments, stats = enrich_institutions(base_file, wikidata_data, osm_data)
    save_json('data/isil/japan/japan_enrichments.json', {
        'enrichments': enrichments,
        'stats': stats,
        'generated': datetime.now(timezone.utc).isoformat()
    })
    print("✓ Saved enrichment data to data/isil/japan/japan_enrichments.json")

    # Step 4: Apply enrichments
    apply_enrichments(base_file, enrichments, output_file)
    print("\n" + banner)
    print("✓ Japanese ISIL enrichment complete!")
    print(banner)
# Script entry point: run the full enrichment pipeline when executed directly.
if __name__ == '__main__':
    main()