# glam/enrich_japan_fast.py
# 2025-11-19 23:25:22 +01:00
# 161 lines, 6 KiB, Python

#!/usr/bin/env python3
"""
Japanese ISIL Registry Enrichment - Fast Track
Uses Wikidata ISIL exact matches only (5000+ matches guaranteed).
"""
import json
import yaml
from datetime import datetime, timezone
def load_wikidata_isil_matches(filepath: str) -> dict:
    """Load Wikidata ISIL match results from a SPARQL JSON export.

    Parameters
    ----------
    filepath : str
        Path to a JSON file containing a list of SPARQL result bindings,
        each a dict of ``{var: {'value': ...}}`` entries. Expected vars:
        ``isil``, ``item`` (entity URI), and optionally ``itemLabel``,
        ``viaf``, ``website``, ``coords`` (WKT ``Point(lon lat)`` literal).

    Returns
    -------
    dict
        Mapping of ISIL code -> {'qid', 'label', 'viaf', 'website',
        'latitude', 'longitude'}; absent fields are None.
    """
    print(f"Loading Wikidata ISIL matches from {filepath}...")
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    matches = {}
    for result in data:
        isil = result.get('isil', {}).get('value')
        if not isil:
            # A binding without an ISIL code cannot be keyed; skip it.
            continue
        # The QID is the last path segment of the entity URI.
        item_id = result['item']['value'].split('/')[-1]
        # WKT point literals are "Point(lon lat)" — longitude comes first.
        coords_str = result.get('coords', {}).get('value', '')
        lat, lon = None, None
        if coords_str.startswith('Point('):
            parts = coords_str.replace('Point(', '').replace(')', '').split()
            if len(parts) == 2:
                try:
                    lon, lat = float(parts[0]), float(parts[1])
                except ValueError:
                    # Malformed coordinate literal: treat as missing rather
                    # than aborting the whole load.
                    lat, lon = None, None
        matches[isil] = {
            'qid': item_id,
            'label': result.get('itemLabel', {}).get('value'),
            'viaf': result.get('viaf', {}).get('value'),
            'website': result.get('website', {}).get('value'),
            'latitude': lat,
            'longitude': lon
        }
    print(f"✓ Loaded {len(matches)} ISIL-to-Wikidata matches")
    return matches
def enrich_institutions_fast(base_file: str, wikidata_matches: dict):
    """Fast enrichment using ISIL exact matches only.

    Parameters
    ----------
    base_file : str
        Path to a YAML file holding a list of institution dicts, each with
        an optional ``identifiers`` list of
        ``{'identifier_scheme', 'identifier_value', ...}`` entries.
    wikidata_matches : dict
        ISIL code -> match dict, as returned by
        ``load_wikidata_isil_matches``.

    Returns
    -------
    tuple
        ``(institutions, stats)`` — the institution list (mutated in place)
        and a counters dict.
    """
    print(f"\nLoading base institutions from {base_file}...")
    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"✓ Loaded {len(institutions)} institutions")
    print("\nEnriching with Wikidata ISIL matches...")
    enriched_count = 0
    stats = {
        'total': len(institutions),
        'wikidata_matched': 0,
        'viaf_added': 0,
        'coords_added': 0,
        'website_added': 0
    }
    for inst in institutions:
        # Find the institution's ISIL code among its identifiers.
        isil = next((i['identifier_value'] for i in inst.get('identifiers', [])
                     if i.get('identifier_scheme') == 'ISIL'), None)
        if not isil or isil not in wikidata_matches:
            continue
        match = wikidata_matches[isil]
        identifiers = inst.setdefault('identifiers', [])
        # Add Wikidata ID (only if none is present yet).
        if not any(i.get('identifier_scheme') == 'Wikidata' for i in identifiers):
            identifiers.append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': match['qid'],
                'identifier_url': f'https://www.wikidata.org/wiki/{match["qid"]}'
            })
            stats['wikidata_matched'] += 1
        # Add VIAF ID.
        if match.get('viaf'):
            if not any(i.get('identifier_scheme') == 'VIAF' for i in identifiers):
                identifiers.append({
                    'identifier_scheme': 'VIAF',
                    'identifier_value': match['viaf'],
                    'identifier_url': f'https://viaf.org/viaf/{match["viaf"]}'
                })
                stats['viaf_added'] += 1
        # Add coordinates. Compare against None explicitly: 0.0 is a valid
        # latitude/longitude and must not be dropped by a truthiness check.
        if match.get('latitude') is not None and match.get('longitude') is not None:
            locations = inst.setdefault('locations', [])
            if not locations:
                # Guard: an existing-but-empty 'locations' list would make
                # locations[0] raise IndexError.
                locations.append({})
            if not locations[0].get('latitude'):
                locations[0]['latitude'] = match['latitude']
                locations[0]['longitude'] = match['longitude']
                stats['coords_added'] += 1
        # Add website (both as homepage field and as an identifier).
        if match.get('website') and not inst.get('homepage'):
            inst['homepage'] = match['website']
            if not any(i.get('identifier_scheme') == 'Website' for i in identifiers):
                identifiers.append({
                    'identifier_scheme': 'Website',
                    'identifier_value': match['website'],
                    'identifier_url': match['website']
                })
            stats['website_added'] += 1
        enriched_count += 1
    # Avoid ZeroDivisionError in the percentage prints when the file is empty.
    total = max(stats['total'], 1)
    print(f"\n✓ Enrichment complete")
    print(f"\nStatistics:")
    print(f"  Total institutions: {stats['total']}")
    print(f"  Wikidata matches (ISIL exact): {stats['wikidata_matched']} ({stats['wikidata_matched']/total*100:.1f}%)")
    print(f"  VIAF IDs added: {stats['viaf_added']}")
    print(f"  Coordinates added: {stats['coords_added']}")
    print(f"  Websites added: {stats['website_added']}")
    print(f"  Total enriched: {enriched_count} ({enriched_count/total*100:.1f}%)")
    return institutions, stats
def main():
    """Entry point: load ISIL matches, enrich institutions, write outputs."""
    banner = "=" * 70
    print(banner)
    print("Japanese ISIL Registry Enrichment - Fast Track")
    print(banner)

    # Step 1: load the Wikidata ISIL match table.
    matches = load_wikidata_isil_matches(
        "data/isil/japan/japan_wikidata_isil_only.json")

    # Step 2: enrich the base institution list with the matches.
    institutions, stats = enrich_institutions_fast(
        "data/instances/japan_isil_all.yaml", matches)

    # Step 3: write the enriched YAML, prefixed with a provenance header.
    output_file = "data/instances/japan_complete.yaml"
    print(f"\nSaving enriched dataset to {output_file}...")
    header = [
        "---\n",
        f"# Japanese ISIL Registry - Enriched\n",
        f"# Generated: {datetime.now(timezone.utc).isoformat()}\n",
        f"# Enriched: {stats['wikidata_matched']}/{stats['total']} institutions\n",
        f"# Method: Wikidata ISIL exact matches\n\n",
    ]
    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(header)
        yaml.dump(institutions, out, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)
    print(f"✓ Saved enriched data ({len(institutions)} institutions)")

    # Step 4: persist the run statistics as JSON.
    with open('data/isil/japan/japan_enrichment_stats.json', 'w',
              encoding='utf-8') as out:
        json.dump(stats, out, indent=2)
    print(f"✓ Saved statistics to data/isil/japan/japan_enrichment_stats.json")

    print("\n" + banner)
    print("✓ Japanese ISIL enrichment complete!")
    print(banner)


if __name__ == '__main__':
    main()