#!/usr/bin/env python3
"""
Japanese ISIL Registry Enrichment - Fast Track

Uses Wikidata ISIL exact matches only (5000+ matches guaranteed).
"""

import json
import yaml
from datetime import datetime, timezone
def load_wikidata_isil_matches(filepath: str) -> dict:
    """Load Wikidata ISIL match results.

    Parses a SPARQL-style JSON result list (each row a dict of
    ``{binding: {"value": ...}}``) and indexes it by ISIL code.

    Args:
        filepath: Path to the JSON file with the query results.

    Returns:
        Mapping of ISIL code -> record with keys ``qid``, ``label``,
        ``viaf``, ``website``, ``latitude``, ``longitude`` (the last five
        may be None when the corresponding binding is absent).
    """
    print(f"Loading Wikidata ISIL matches from {filepath}...")

    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    matches: dict = {}
    for result in data:
        isil = result.get('isil', {}).get('value')
        # Rows without an ISIL or an entity URI cannot be indexed; the
        # original crashed with KeyError when 'item' was missing.
        item_uri = result.get('item', {}).get('value')
        if not isil or not item_uri:
            continue

        # Entity URI ends in the QID, e.g. .../entity/Q123 -> Q123.
        qid = item_uri.rsplit('/', 1)[-1]

        lat = lon = None
        coords_str = result.get('coords', {}).get('value', '')
        # WKT literal "Point(<longitude> <latitude>)" — longitude first.
        if coords_str.startswith('Point(') and coords_str.endswith(')'):
            parts = coords_str[len('Point('):-1].split()
            if len(parts) == 2:
                try:
                    lon, lat = float(parts[0]), float(parts[1])
                except ValueError:
                    # Malformed numbers: leave coordinates absent rather
                    # than aborting the whole load.
                    lon = lat = None

        matches[isil] = {
            'qid': qid,
            'label': result.get('itemLabel', {}).get('value'),
            'viaf': result.get('viaf', {}).get('value'),
            'website': result.get('website', {}).get('value'),
            'latitude': lat,
            'longitude': lon
        }

    print(f"✓ Loaded {len(matches)} ISIL-to-Wikidata matches")
    return matches
|
|
|
|
def _append_identifier(identifiers: list, scheme: str, value: str, url: str) -> bool:
    """Append a {scheme, value, url} identifier unless one with the same
    scheme is already present.

    Returns:
        True if a new identifier was appended, False if the scheme existed.
    """
    if any(i.get('identifier_scheme') == scheme for i in identifiers):
        return False
    identifiers.append({
        'identifier_scheme': scheme,
        'identifier_value': value,
        'identifier_url': url
    })
    return True


def enrich_institutions_fast(base_file: str, wikidata_matches: dict):
    """Fast enrichment using ISIL exact matches only.

    Loads the base institution list (YAML) from *base_file* and, for every
    institution whose ISIL identifier appears in *wikidata_matches*, adds
    the Wikidata QID, VIAF ID, coordinates, and homepage when missing.

    Args:
        base_file: Path to the YAML file with institution records.
        wikidata_matches: ISIL code -> match record, as produced by
            load_wikidata_isil_matches().

    Returns:
        (institutions, stats): the institution list (enriched in place)
        and a dict of enrichment counters.
    """
    print(f"\nLoading base institutions from {base_file}...")

    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"✓ Loaded {len(institutions)} institutions")
    print("\nEnriching with Wikidata ISIL matches...")

    enriched_count = 0
    stats = {
        'total': len(institutions),
        'wikidata_matched': 0,
        'viaf_added': 0,
        'coords_added': 0,
        'website_added': 0
    }

    for inst in institutions:
        # Get ISIL code
        isil = next((i.get('identifier_value')
                     for i in inst.get('identifiers', [])
                     if i.get('identifier_scheme') == 'ISIL'), None)

        if not isil or isil not in wikidata_matches:
            continue

        match = wikidata_matches[isil]
        identifiers = inst.setdefault('identifiers', [])

        # Add Wikidata ID
        if _append_identifier(identifiers, 'Wikidata', match['qid'],
                              f'https://www.wikidata.org/wiki/{match["qid"]}'):
            stats['wikidata_matched'] += 1

        # Add VIAF ID
        if match.get('viaf'):
            if _append_identifier(identifiers, 'VIAF', match['viaf'],
                                  f'https://viaf.org/viaf/{match["viaf"]}'):
                stats['viaf_added'] += 1

        # Add coordinates. Compare against None explicitly: a latitude or
        # longitude of exactly 0.0 is valid and must not read as "missing".
        if match.get('latitude') is not None and match.get('longitude') is not None:
            locations = inst.setdefault('locations', [{}])
            if not locations:
                # setdefault returns an *existing* empty list unchanged;
                # without this guard locations[0] would raise IndexError.
                locations.append({})
            if not locations[0].get('latitude'):
                locations[0]['latitude'] = match['latitude']
                locations[0]['longitude'] = match['longitude']
                stats['coords_added'] += 1

        # Add website
        if match.get('website') and not inst.get('homepage'):
            inst['homepage'] = match['website']
            if _append_identifier(identifiers, 'Website', match['website'],
                                  match['website']):
                stats['website_added'] += 1

        enriched_count += 1

    # Guard the percentage denominators: an empty input file previously
    # crashed with ZeroDivisionError (counts are all 0 then anyway).
    total = stats['total'] or 1

    print(f"\n✓ Enrichment complete")
    print(f"\nStatistics:")
    print(f" Total institutions: {stats['total']}")
    print(f" Wikidata matches (ISIL exact): {stats['wikidata_matched']} ({stats['wikidata_matched']/total*100:.1f}%)")
    print(f" VIAF IDs added: {stats['viaf_added']}")
    print(f" Coordinates added: {stats['coords_added']}")
    print(f" Websites added: {stats['website_added']}")
    print(f" Total enriched: {enriched_count} ({enriched_count/total*100:.1f}%)")

    return institutions, stats
|
|
|
|
def main():
    """Run the fast-track enrichment pipeline and write outputs to disk."""
    banner = "=" * 70
    print(banner)
    print("Japanese ISIL Registry Enrichment - Fast Track")
    print(banner)

    # Load Wikidata ISIL matches
    wikidata_matches = load_wikidata_isil_matches(
        "data/isil/japan/japan_wikidata_isil_only.json")

    # Enrich the base registry
    institutions, stats = enrich_institutions_fast(
        "data/instances/japan_isil_all.yaml", wikidata_matches)

    # Save enriched dataset, prefixed with a small provenance header
    output_file = "data/instances/japan_complete.yaml"
    print(f"\nSaving enriched dataset to {output_file}...")
    with open(output_file, 'w', encoding='utf-8') as out:
        out.write("---\n")
        out.write("# Japanese ISIL Registry - Enriched\n")
        out.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
        out.write(f"# Enriched: {stats['wikidata_matched']}/{stats['total']} institutions\n")
        out.write("# Method: Wikidata ISIL exact matches\n\n")
        yaml.dump(institutions, out, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)

    print(f"✓ Saved enriched data ({len(institutions)} institutions)")

    # Save stats alongside the match data
    stats_file = 'data/isil/japan/japan_enrichment_stats.json'
    with open(stats_file, 'w', encoding='utf-8') as out:
        json.dump(stats, out, indent=2)
    print(f"✓ Saved statistics to {stats_file}")

    print("\n" + banner)
    print("✓ Japanese ISIL enrichment complete!")
    print(banner)
|
|
|
|
# Script entry point: run only when executed directly, not on import.
if __name__ == '__main__':
    main()
|