#!/usr/bin/env python3
|
|
"""
|
|
Japanese ISIL Registry Enrichment Script
|
|
Enriches 12,064 Japanese institutions with Wikidata and OSM data.
|
|
Uses batched queries to handle large dataset.
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
import yaml
|
|
import requests
|
|
from datetime import datetime, timezone
|
|
from typing import List, Dict, Optional
|
|
from rapidfuzz import fuzz
|
|
from collections import Counter
|
|
|
|
def fetch_wikidata_japan() -> List[Dict]:
    """Query Wikidata's SPARQL endpoint for Japanese heritage institutions.

    Returns a list of dicts with keys: qid, label, alt_labels, isil, viaf,
    website, latitude, longitude, prefecture, city.  On any request or
    parse failure the error is printed and an empty list is returned
    (best-effort by design: the caller proceeds without Wikidata data).
    """
    print("Querying Wikidata for Japanese institutions...")

    endpoint = "https://query.wikidata.org/sparql"
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemAltLabel ?isil ?viaf ?website ?coords
                    ?prefLabel ?cityLabel WHERE {
      VALUES ?type {
        wd:Q7075 wd:Q166118 wd:Q1030034 wd:Q33506 wd:Q636400
        wd:Q212805 wd:Q213441 wd:Q1362225 wd:Q24398318
      }
      ?item wdt:P31/wdt:P279* ?type .
      ?item wdt:P17 wd:Q17 .

      OPTIONAL { ?item wdt:P791 ?isil }
      OPTIONAL { ?item wdt:P214 ?viaf }
      OPTIONAL { ?item wdt:P856 ?website }
      OPTIONAL { ?item wdt:P625 ?coords }
      OPTIONAL { ?item wdt:P131 ?pref . ?pref rdfs:label ?prefLabel . FILTER(LANG(?prefLabel) = "en") }
      OPTIONAL { ?item wdt:P131/wdt:P131 ?city . ?city rdfs:label ?cityLabel . FILTER(LANG(?cityLabel) = "en") }

      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "ja,en".
        ?item rdfs:label ?itemLabel .
        ?item skos:altLabel ?itemAltLabel .
      }
    }
    """

    try:
        response = requests.get(
            endpoint,
            params={'query': query, 'format': 'json'},
            headers={'User-Agent': 'GLAM-Extractor/1.0'},
            timeout=120
        )
        response.raise_for_status()
        data = response.json()

        institutions = []
        for result in data['results']['bindings']:
            # Entity URI ends in the QID, e.g. .../entity/Q123 -> Q123.
            item_id = result['item']['value'].split('/')[-1]

            # P625 comes back as a WKT literal "Point(<lon> <lat>)" —
            # note longitude is first.
            coords_str = result.get('coords', {}).get('value', '')
            lat, lon = None, None
            if coords_str and coords_str.startswith('Point('):
                parts = coords_str.replace('Point(', '').replace(')', '').split()
                if len(parts) == 2:
                    lon, lat = float(parts[0]), float(parts[1])

            # The label service joins alt labels with ", "; strip the
            # padding and drop empties (a bare split(',') left a leading
            # space on every entry after the first, hurting fuzzy matching).
            alt_raw = result.get('itemAltLabel', {}).get('value', '') if result.get('itemAltLabel') else ''
            alt_labels = [a.strip() for a in alt_raw.split(',') if a.strip()]

            institutions.append({
                'qid': item_id,
                'label': result.get('itemLabel', {}).get('value'),
                'alt_labels': alt_labels,
                'isil': result.get('isil', {}).get('value'),
                'viaf': result.get('viaf', {}).get('value'),
                'website': result.get('website', {}).get('value'),
                'latitude': lat,
                'longitude': lon,
                'prefecture': result.get('prefLabel', {}).get('value'),
                'city': result.get('cityLabel', {}).get('value')
            })

        print(f"✓ Found {len(institutions)} institutions in Wikidata")
        return institutions

    except Exception as e:
        # Broad on purpose: network, HTTP, JSON, and parse errors all
        # degrade to "no Wikidata enrichment" rather than aborting the run.
        print(f"✗ Wikidata query failed: {e}")
        return []
|
|
|
|
def fetch_osm_japan_batched() -> List[Dict]:
    """Fetch Japanese libraries/archives/museums from OSM (batched by region).

    Issues one Overpass query per major region (bounding box) to stay under
    the API's size/time limits, sleeping 2s between regions.  A failed
    region is logged and skipped.  Returns a list of location dicts.
    """
    print("Fetching OSM data for Japan (batched queries)...")

    overpass_url = "https://overpass-api.de/api/interpreter"

    # Major regions to batch queries (bbox as "south,west,north,east").
    regions = [
        ("Hokkaido", "43.0,140.0,45.5,145.5"),
        ("Tohoku", "37.5,139.5,41.0,141.5"),
        ("Kanto", "34.5,138.5,36.5,140.5"),
        ("Chubu", "34.5,136.0,37.5,138.5"),
        ("Kansai", "33.5,134.5,35.5,136.0"),
        ("Chugoku", "33.5,131.0,35.5,134.5"),
        ("Shikoku", "32.5,132.5,34.5,134.5"),
        ("Kyushu", "30.0,128.5,34.0,132.0")
    ]

    all_libraries = []

    for region_name, bbox in regions:
        print(f" Fetching {region_name} region...")

        # Split bbox
        south, west, north, east = map(float, bbox.split(','))

        # NOTE: 'out center;' makes Overpass attach a computed 'center'
        # point to each way.  The previous 'out body; >; out skel qt;'
        # never produced a 'center' member, so every way (most mapped
        # buildings) was silently dropped by the filter below.
        query = f"""
        [out:json][timeout:60];
        (
          node["amenity"="library"]({south},{west},{north},{east});
          way["amenity"="library"]({south},{west},{north},{east});
          node["amenity"="archive"]({south},{west},{north},{east});
          way["amenity"="archive"]({south},{west},{north},{east});
          node["tourism"="museum"]({south},{west},{north},{east});
          way["tourism"="museum"]({south},{west},{north},{east});
        );
        out center;
        """

        try:
            response = requests.post(overpass_url, data={'data': query}, timeout=90)
            response.raise_for_status()
            data = response.json()

            region_count = 0
            for element in data.get('elements', []):
                # Nodes carry lat/lon directly; ways only via 'center'.
                if element.get('type') == 'node' or (element.get('type') == 'way' and 'center' in element):
                    tags = element.get('tags', {})
                    name = tags.get('name') or tags.get('name:ja') or tags.get('name:en')
                    if name:
                        lat = element.get('lat') or element.get('center', {}).get('lat')
                        lon = element.get('lon') or element.get('center', {}).get('lon')

                        all_libraries.append({
                            'name': name,
                            'name_ja': tags.get('name:ja'),
                            'name_en': tags.get('name:en'),
                            'latitude': lat,
                            'longitude': lon,
                            'city': tags.get('addr:city'),
                            'prefecture': tags.get('addr:province') or tags.get('addr:state'),
                            'website': tags.get('website') or tags.get('contact:website'),
                            'wikidata': tags.get('wikidata'),
                            'osm_id': element.get('id'),
                            'region': region_name
                        })
                        region_count += 1

            # Count locally instead of re-scanning the accumulated list.
            print(f" ✓ Found {region_count} locations")
            time.sleep(2)  # Rate limiting between regions

        except Exception as e:
            # Best-effort: a failed region should not abort the other seven.
            print(f" ✗ Failed to fetch {region_name}: {e}")
            continue

    print(f"✓ Total OSM locations: {len(all_libraries)}")
    return all_libraries
|
|
|
|
def fuzzy_match_institution(inst_name: str, candidates: List[Dict], name_fields: List[str], threshold: int = 75) -> Optional[tuple]:
    """Find the best fuzzy name match for *inst_name* among *candidates*.

    Each candidate is scored on every value of *name_fields* (list-valued
    fields are flattened) with token-sort ratio on lowercased, stripped
    text.  Returns ``(candidate, score)`` when the best score reaches
    *threshold*, otherwise ``None``.
    """
    target = inst_name.lower().strip()
    top_candidate, top_score = None, 0

    for cand in candidates:
        # Gather every usable name string on this candidate.
        cand_names = []
        for field in name_fields:
            val = cand.get(field)
            if not val:
                continue
            cand_names.extend(val if isinstance(val, list) else [val])

        for raw in cand_names:
            if not raw:
                continue
            score = fuzz.token_sort_ratio(target, raw.lower().strip())
            if score > top_score:
                top_score, top_candidate = score, cand

    if top_score >= threshold:
        return (top_candidate, top_score)
    return None
|
|
|
|
def _pct(part: int, total: int) -> float:
    """Return part/total as a percentage; 0.0 when total is zero."""
    return part / total * 100 if total else 0.0


def enrich_institutions(base_file: str, wikidata_data: List[Dict], osm_data: List[Dict]) -> tuple:
    """Enrich Japanese institutions with Wikidata and OSM matches.

    Matching strategy per institution:
      1. Exact ISIL match against Wikidata (score 100, high confidence).
      2. Fuzzy name match against Wikidata (threshold 80).
      3. Fuzzy name match against OSM, attempted only when coordinates or
         a website are still missing after step 1/2.

    Returns ``(enrichments, stats)``: *enrichments* holds one dict per
    institution that got at least one match; *stats* is a counter dict.
    The summary uses _pct() so an empty base file no longer raises
    ZeroDivisionError.
    """
    print(f"\nLoading base institutions from {base_file}...")

    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"✓ Loaded {len(institutions)} institutions")
    print("\nEnriching institutions (this may take a while)...")

    enrichments = []
    stats = {
        'total': len(institutions),
        'wikidata_matched': 0,
        'viaf_added': 0,
        'coords_added': 0,
        'website_added': 0,
        'osm_matched': 0,
        'high_confidence': 0,
        'medium_confidence': 0,
        'isil_exact': 0
    }

    # Create ISIL lookup for fast exact matching
    wikidata_by_isil = {wd['isil']: wd for wd in wikidata_data if wd.get('isil')}
    print(f"✓ Built ISIL lookup index ({len(wikidata_by_isil)} entries)")

    for idx, inst in enumerate(institutions, 1):
        inst_name = inst.get('name', '')
        inst_id = inst.get('id')

        enrichment = {
            'id': inst_id,
            'name': inst_name,
            # First ISIL identifier on the record, if any.
            'isil': next((i['identifier_value'] for i in inst.get('identifiers', []) if i.get('identifier_scheme') == 'ISIL'), None),
            'matches': []
        }

        # Try Wikidata match by ISIL first (exact)
        inst_isil = enrichment['isil']
        wikidata_match = None
        match_score = 0

        if inst_isil and inst_isil in wikidata_by_isil:
            wikidata_match = wikidata_by_isil[inst_isil]
            match_score = 100
            enrichment['matches'].append({
                'source': 'wikidata',
                'match_type': 'isil_exact',
                'score': 100,
                'qid': wikidata_match['qid']
            })
            stats['wikidata_matched'] += 1
            stats['isil_exact'] += 1
            stats['high_confidence'] += 1

        # Fuzzy match by name if no ISIL match
        if not wikidata_match:
            result = fuzzy_match_institution(
                inst_name,
                wikidata_data,
                ['label', 'alt_labels'],
                threshold=80  # Higher threshold for Japan (English names in ISIL registry)
            )
            if result:
                wikidata_match, match_score = result
                enrichment['matches'].append({
                    'source': 'wikidata',
                    'match_type': 'name_fuzzy',
                    'score': match_score,
                    'qid': wikidata_match['qid']
                })
                stats['wikidata_matched'] += 1
                if match_score >= 85:
                    stats['high_confidence'] += 1
                else:
                    stats['medium_confidence'] += 1

        # Add Wikidata enrichments
        if wikidata_match:
            if wikidata_match.get('viaf'):
                enrichment['viaf'] = wikidata_match['viaf']
                stats['viaf_added'] += 1

            # Only fill a website if the base record has none.
            if wikidata_match.get('website') and not inst.get('homepage'):
                enrichment['website'] = wikidata_match['website']
                stats['website_added'] += 1

            if wikidata_match.get('latitude') and wikidata_match.get('longitude'):
                location = inst.get('locations', [{}])[0]
                if not location.get('latitude'):
                    enrichment['latitude'] = wikidata_match['latitude']
                    enrichment['longitude'] = wikidata_match['longitude']
                    stats['coords_added'] += 1

        # Try OSM match (only if something is still missing)
        if not enrichment.get('latitude') or not enrichment.get('website'):
            osm_result = fuzzy_match_institution(
                inst_name,
                osm_data,
                ['name', 'name_ja', 'name_en'],
                threshold=80
            )

            if osm_result:
                osm_match, osm_score = osm_result
                enrichment['matches'].append({
                    'source': 'osm',
                    'match_type': 'name_fuzzy',
                    'score': osm_score,
                    'osm_id': osm_match['osm_id']
                })
                stats['osm_matched'] += 1

                # Add OSM data if missing
                if osm_match.get('latitude') and osm_match.get('longitude'):
                    if 'latitude' not in enrichment:
                        enrichment['latitude'] = osm_match['latitude']
                        enrichment['longitude'] = osm_match['longitude']
                        stats['coords_added'] += 1

                if osm_match.get('website') and 'website' not in enrichment:
                    enrichment['website'] = osm_match['website']
                    stats['website_added'] += 1

        # Only institutions with at least one match are reported.
        if enrichment['matches']:
            enrichments.append(enrichment)

        if idx % 500 == 0:
            print(f" Processed {idx}/{stats['total']} institutions ({_pct(idx, stats['total']):.1f}%)...")

    print(f"\n✓ Enrichment complete")
    print(f"\nStatistics:")
    print(f" Total institutions: {stats['total']}")
    print(f" Wikidata matches: {stats['wikidata_matched']} ({_pct(stats['wikidata_matched'], stats['total']):.1f}%)")
    print(f" - ISIL exact matches: {stats['isil_exact']}")
    print(f" - High confidence (≥85%): {stats['high_confidence']}")
    print(f" - Medium confidence (80-84%): {stats['medium_confidence']}")
    print(f" OSM matches: {stats['osm_matched']} ({_pct(stats['osm_matched'], stats['total']):.1f}%)")
    print(f" VIAF IDs added: {stats['viaf_added']}")
    print(f" Coordinates added: {stats['coords_added']}")
    print(f" Websites added: {stats['website_added']}")
    print(f" Total enriched: {len(enrichments)} ({_pct(len(enrichments), stats['total']):.1f}%)")

    return enrichments, stats
|
|
|
|
def apply_enrichments(base_file: str, enrichments: List[Dict], output_file: str):
    """Apply enrichments to the base YAML and save the result.

    Merges Wikidata QIDs, VIAF IDs, coordinates, and websites into each
    matched institution record — never overwriting existing identifiers,
    coordinates, or homepages — then writes the enriched list to
    *output_file* with a generation header.
    """
    print(f"\nApplying enrichments to {base_file}...")

    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    enrichment_map = {e['id']: e for e in enrichments}

    enriched_count = 0
    for inst in institutions:
        inst_id = inst.get('id')
        if inst_id in enrichment_map:
            enrich_data = enrichment_map[inst_id]

            # Add Wikidata ID (first match that carries a QID)
            if any(m.get('qid') for m in enrich_data.get('matches', [])):
                qid = next(m['qid'] for m in enrich_data['matches'] if m.get('qid'))
                identifiers = inst.setdefault('identifiers', [])
                if not any(i.get('identifier_scheme') == 'Wikidata' for i in identifiers):
                    identifiers.append({
                        'identifier_scheme': 'Wikidata',
                        'identifier_value': qid,
                        'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
                    })

            # Add VIAF ID
            if 'viaf' in enrich_data:
                identifiers = inst.setdefault('identifiers', [])
                if not any(i.get('identifier_scheme') == 'VIAF' for i in identifiers):
                    identifiers.append({
                        'identifier_scheme': 'VIAF',
                        'identifier_value': enrich_data['viaf'],
                        'identifier_url': f'https://viaf.org/viaf/{enrich_data["viaf"]}'
                    })

            # Add coordinates
            if 'latitude' in enrich_data and 'longitude' in enrich_data:
                locations = inst.setdefault('locations', [{}])
                if not locations:
                    # setdefault returns an existing-but-empty list as-is;
                    # without this guard locations[0] raises IndexError.
                    locations.append({})
                if not locations[0].get('latitude'):
                    locations[0]['latitude'] = enrich_data['latitude']
                    locations[0]['longitude'] = enrich_data['longitude']

            # Add website (homepage plus a Website identifier)
            if 'website' in enrich_data and not inst.get('homepage'):
                inst['homepage'] = enrich_data['website']
                identifiers = inst.setdefault('identifiers', [])
                if not any(i.get('identifier_scheme') == 'Website' for i in identifiers):
                    identifiers.append({
                        'identifier_scheme': 'Website',
                        'identifier_value': enrich_data['website'],
                        'identifier_url': enrich_data['website']
                    })

            enriched_count += 1

    # Save enriched YAML
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("---\n")
        f.write(f"# Japanese ISIL Registry - Enriched\n")
        f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"# Enriched: {enriched_count}/{len(institutions)} institutions\n")
        f.write(f"# Source: {base_file}\n\n")
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f"✓ Saved enriched data to {output_file}")
    print(f" Enriched {enriched_count}/{len(institutions)} institutions")
|
|
|
|
def main():
    """Run the full pipeline: fetch Wikidata and OSM, enrich, apply, save."""
    banner = "=" * 70
    print(banner)
    print("Japanese ISIL Registry Enrichment")
    print(banner)

    base_file = "data/instances/japan_isil_all.yaml"
    output_file = "data/instances/japan_complete.yaml"

    # Step 1: Fetch Wikidata
    wikidata_data = fetch_wikidata_japan()
    wikidata_path = 'data/isil/japan/japan_wikidata_institutions.json'
    with open(wikidata_path, 'w', encoding='utf-8') as f:
        json.dump(wikidata_data, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved Wikidata data to {wikidata_path}")

    # Step 2: Fetch OSM data (batched)
    time.sleep(3)  # Rate limiting
    osm_data = fetch_osm_japan_batched()
    osm_path = 'data/isil/japan/japan_osm_libraries.json'
    with open(osm_path, 'w', encoding='utf-8') as f:
        json.dump(osm_data, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved OSM data to {osm_path}")

    # Step 3: Enrich
    enrichments, stats = enrich_institutions(base_file, wikidata_data, osm_data)
    enrich_path = 'data/isil/japan/japan_enrichments.json'
    payload = {
        'enrichments': enrichments,
        'stats': stats,
        'generated': datetime.now(timezone.utc).isoformat()
    }
    with open(enrich_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved enrichment data to {enrich_path}")

    # Step 4: Apply enrichments
    apply_enrichments(base_file, enrichments, output_file)

    print("\n" + banner)
    print("✓ Japanese ISIL enrichment complete!")
    print(banner)


if __name__ == '__main__':
    main()
|