#!/usr/bin/env python3
"""Japanese ISIL Registry Enrichment - Fast Track.

Uses Wikidata ISIL exact matches only (5000+ matches guaranteed).
"""
import json
from datetime import datetime, timezone

import yaml


def _pct(part: int, total: int) -> float:
    """Return *part* as a percentage of *total*, safe when total == 0."""
    return part / total * 100 if total else 0.0


def load_wikidata_isil_matches(filepath: str) -> dict:
    """Load Wikidata ISIL match results and index them by ISIL code.

    Parameters
    ----------
    filepath:
        Path to a JSON file of SPARQL result rows. Each row maps variable
        names ('isil', 'item', 'itemLabel', 'coords', 'viaf', 'website')
        to ``{'value': ...}`` dicts; rows without an ISIL are skipped.

    Returns
    -------
    dict
        ISIL code -> ``{'qid', 'label', 'viaf', 'website', 'latitude',
        'longitude'}``; fields absent from the row are ``None``.
    """
    print(f"Loading Wikidata ISIL matches from {filepath}...")
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    matches = {}
    for result in data:
        isil = result.get('isil', {}).get('value')
        if not isil:
            continue

        # The QID is the last path segment of the entity URI.
        item_id = result['item']['value'].split('/')[-1]

        # Coordinates arrive as a WKT literal: "Point(lon lat)" — note the
        # longitude-first order.
        lat, lon = None, None
        coords_str = result.get('coords', {}).get('value', '')
        if coords_str.startswith('Point('):
            parts = coords_str.replace('Point(', '').replace(')', '').split()
            if len(parts) == 2:
                try:
                    lon, lat = float(parts[0]), float(parts[1])
                except ValueError:
                    # Malformed numeric payload: leave coordinates unset
                    # instead of aborting the whole run.
                    lat, lon = None, None

        matches[isil] = {
            'qid': item_id,
            'label': result.get('itemLabel', {}).get('value'),
            'viaf': result.get('viaf', {}).get('value'),
            'website': result.get('website', {}).get('value'),
            'latitude': lat,
            'longitude': lon,
        }

    print(f"✓ Loaded {len(matches)} ISIL-to-Wikidata matches")
    return matches


def enrich_institutions_fast(base_file: str, wikidata_matches: dict):
    """Fast enrichment using ISIL exact matches only.

    Loads the base institution list (YAML) and, for every institution whose
    ISIL identifier has a Wikidata match, adds Wikidata/VIAF/Website
    identifiers, coordinates, and a homepage — each only where missing.

    Parameters
    ----------
    base_file:
        Path to the base YAML list of institution dicts.
    wikidata_matches:
        Mapping produced by :func:`load_wikidata_isil_matches`.

    Returns
    -------
    tuple
        ``(institutions, stats)`` — the (mutated) institution list and a
        dict of enrichment counters.
    """
    print(f"\nLoading base institutions from {base_file}...")
    with open(base_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; normalise to a list.
        institutions = yaml.safe_load(f) or []
    print(f"✓ Loaded {len(institutions)} institutions")

    print("\nEnriching with Wikidata ISIL matches...")
    enriched_count = 0
    stats = {
        'total': len(institutions),
        'wikidata_matched': 0,
        'viaf_added': 0,
        'coords_added': 0,
        'website_added': 0,
    }

    for inst in institutions:
        # Locate this institution's ISIL code among its identifiers.
        isil = next((i['identifier_value'] for i in inst.get('identifiers', [])
                     if i.get('identifier_scheme') == 'ISIL'), None)
        if not isil or isil not in wikidata_matches:
            continue

        match = wikidata_matches[isil]
        identifiers = inst.setdefault('identifiers', [])

        # Add Wikidata ID (skip if one is already recorded).
        if not any(i.get('identifier_scheme') == 'Wikidata' for i in identifiers):
            identifiers.append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': match['qid'],
                'identifier_url': f'https://www.wikidata.org/wiki/{match["qid"]}',
            })
            stats['wikidata_matched'] += 1

        # Add VIAF ID (skip if one is already recorded).
        if match.get('viaf'):
            if not any(i.get('identifier_scheme') == 'VIAF' for i in identifiers):
                identifiers.append({
                    'identifier_scheme': 'VIAF',
                    'identifier_value': match['viaf'],
                    'identifier_url': f'https://viaf.org/viaf/{match["viaf"]}',
                })
                stats['viaf_added'] += 1

        # Add coordinates. Compare against None explicitly: 0.0 is a valid
        # latitude/longitude but falsy, so a truthiness test would drop it.
        if match.get('latitude') is not None and match.get('longitude') is not None:
            locations = inst.setdefault('locations', [{}])
            if locations[0].get('latitude') is None:
                locations[0]['latitude'] = match['latitude']
                locations[0]['longitude'] = match['longitude']
                stats['coords_added'] += 1

        # Add website as homepage plus a Website identifier.
        if match.get('website') and not inst.get('homepage'):
            inst['homepage'] = match['website']
            if not any(i.get('identifier_scheme') == 'Website' for i in identifiers):
                identifiers.append({
                    'identifier_scheme': 'Website',
                    'identifier_value': match['website'],
                    'identifier_url': match['website'],
                })
            stats['website_added'] += 1

        enriched_count += 1

    print(f"\n✓ Enrichment complete")
    print(f"\nStatistics:")
    print(f"  Total institutions: {stats['total']}")
    print(f"  Wikidata matches (ISIL exact): {stats['wikidata_matched']} "
          f"({_pct(stats['wikidata_matched'], stats['total']):.1f}%)")
    print(f"  VIAF IDs added: {stats['viaf_added']}")
    print(f"  Coordinates added: {stats['coords_added']}")
    print(f"  Websites added: {stats['website_added']}")
    print(f"  Total enriched: {enriched_count} "
          f"({_pct(enriched_count, stats['total']):.1f}%)")

    return institutions, stats


def main():
    """Run the load → enrich → save pipeline on the hard-coded data paths."""
    print("=" * 70)
    print("Japanese ISIL Registry Enrichment - Fast Track")
    print("=" * 70)

    # Load Wikidata ISIL matches
    wikidata_file = "data/isil/japan/japan_wikidata_isil_only.json"
    wikidata_matches = load_wikidata_isil_matches(wikidata_file)

    # Enrich
    base_file = "data/instances/japan_isil_all.yaml"
    institutions, stats = enrich_institutions_fast(base_file, wikidata_matches)

    # Save enriched dataset with a provenance header comment block.
    output_file = "data/instances/japan_complete.yaml"
    print(f"\nSaving enriched dataset to {output_file}...")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("---\n")
        f.write(f"# Japanese ISIL Registry - Enriched\n")
        f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"# Enriched: {stats['wikidata_matched']}/{stats['total']} institutions\n")
        f.write(f"# Method: Wikidata ISIL exact matches\n\n")
        yaml.dump(institutions, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)
    print(f"✓ Saved enriched data ({len(institutions)} institutions)")

    # Save stats
    with open('data/isil/japan/japan_enrichment_stats.json', 'w',
              encoding='utf-8') as f:
        json.dump(stats, f, indent=2)
    print(f"✓ Saved statistics to data/isil/japan/japan_enrichment_stats.json")

    print("\n" + "=" * 70)
    print("✓ Japanese ISIL enrichment complete!")
    print("=" * 70)


if __name__ == '__main__':
    main()