#!/usr/bin/env python3
"""Parse Japanese ISIL registries to LinkML format."""

import csv
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

try:
    import yaml
except ImportError:
    # PyYAML is only needed by main() to write the output files; keeping the
    # import optional lets parse_japanese_csv() be used without it.
    yaml = None


def _field(row: Dict[str, Optional[str]], column: str) -> str:
    """Return the stripped value of *column*, or '' if it is absent or None.

    csv.DictReader fills the missing trailing columns of a short row with
    None, so a plain ``row.get(column, '')`` is not enough to guarantee a str.
    """
    return (row.get(column) or '').strip()


def parse_japanese_csv(filepath: str, inst_type: str) -> List[Dict]:
    """Parse one Japanese ISIL CSV file into a list of institution records.

    Rows are skipped when the ISIL is empty or starts with 'DELETE', or when
    the English institution name is empty.

    Args:
        filepath: Path to a UTF-8 CSV file (with or without a BOM) that has a
            header row.
        inst_type: Value stored in each record's 'institution_type' field.

    Returns:
        One dict per accepted row, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    institutions = []

    print(f"Parsing {filepath}...")

    # One timestamp per file, so all records of a run agree with each other.
    extraction_date = datetime.now(timezone.utc).isoformat()

    # 'utf-8-sig' strips a leading BOM (otherwise the first header would be
    # '\ufeffISIL' and every row would be skipped); newline='' is what the
    # csv module requires so quoted embedded newlines are parsed correctly.
    with open(filepath, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            isil = _field(row, 'ISIL')
            if not isil or isil.startswith('DELETE'):
                continue

            name_en = _field(row, 'Institution name in English')
            if not name_en:
                continue

            # Create institution ID
            inst_id = f"https://w3id.org/heritage/custodian/jp/{isil.lower().replace('-', '')}"

            # Parse location
            prefecture = _field(row, 'Prefecture').title()
            city = _field(row, 'City/Ward/Town/Village').title()
            postal_code = _field(row, 'Postal code')
            street = _field(row, 'Address (Street Level)')

            # Build address from whichever parts are present
            address_parts = [street, city, prefecture, postal_code]
            street_address = ', '.join(p for p in address_parts if p)

            # Get additional fields
            phone = _field(row, 'Telephone number')
            website = _field(row, 'URL')

            # Only non-empty location fields are emitted.
            location = {
                'country': 'JP',
                'city': city,
                'region': prefecture,
                'postal_code': postal_code,
                'street_address': street_address,
            }
            location = {k: v for k, v in location.items() if v}

            # Create LinkML record (key order is preserved in the YAML output)
            institution = {
                'id': inst_id,
                'name': name_en,
                'institution_type': inst_type,
                'locations': [location],
                'identifiers': [
                    {
                        'identifier_scheme': 'ISIL',
                        'identifier_value': isil,
                        'identifier_url': f'https://isil.org/{isil}'
                    }
                ],
                'provenance': {
                    'data_source': 'CSV_REGISTRY',
                    'data_tier': 'TIER_1_AUTHORITATIVE',
                    'extraction_date': extraction_date,
                    'extraction_method': 'CSV parsing from Japanese ISIL registry',
                    'confidence_score': 0.98,
                    'source_url': 'https://www.ndl.go.jp/jp/aboutus/isil.html'
                }
            }

            # Add optional fields
            if website:
                institution['homepage'] = website
                institution['identifiers'].append({
                    'identifier_scheme': 'Website',
                    'identifier_value': website,
                    'identifier_url': website
                })

            if phone:
                institution['contact_info'] = {'phone': phone}

            institutions.append(institution)

    print(f"  ✓ Parsed {len(institutions)} institutions")
    return institutions


def _write_yaml(output_path: str, header_lines: List[str], records: List[Dict]) -> None:
    """Write *records* as a YAML document preceded by '# ' comment lines.

    The parent directory is created if it does not exist yet.
    """
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("---\n")
        for line in header_lines:
            f.write(f"# {line}\n")
        f.write("\n")
        yaml.dump(records, f, allow_unicode=True, default_flow_style=False, sort_keys=False)


def main():
    """Parse all Japanese ISIL CSV files and write combined and per-dataset YAML."""
    if yaml is None:
        # Fail before doing any work rather than after parsing everything.
        raise SystemExit("PyYAML is required to write the output files (pip install pyyaml)")

    print("=" * 70)
    print("Japanese ISIL Registry Parsing")
    print("=" * 70)

    datasets = [
        ('data/isil/JP/archives.csv', 'ARCHIVE', 'archives'),
        ('data/isil/JP/museums.csv', 'MUSEUM', 'museums'),
        ('data/isil/JP/libraries_public.csv', 'LIBRARY', 'libraries_public'),
        ('data/isil/JP/libraries_other.csv', 'LIBRARY', 'libraries_other')
    ]

    all_institutions = []
    # Keep each dataset's records separately: two datasets share the type
    # 'LIBRARY', so they cannot be told apart again by institution_type.
    by_label = {}

    for filepath, inst_type, label in datasets:
        institutions = parse_japanese_csv(filepath, inst_type)
        all_institutions.extend(institutions)
        by_label[label] = institutions

    stats = {label: len(records) for label, records in by_label.items()}

    print(f"\n✓ Total institutions parsed: {len(all_institutions)}")
    print("\nBreakdown:")
    for label, count in stats.items():
        print(f"  {label}: {count}")

    # Save combined file
    output_file = "data/instances/japan_isil_all.yaml"
    _write_yaml(
        output_file,
        [
            "Japanese ISIL Registry - All Institutions",
            f"Generated: {datetime.now(timezone.utc).isoformat()}",
            f"Total institutions: {len(all_institutions)}",
            f"Archives: {stats.get('archives', 0)}",
            f"Museums: {stats.get('museums', 0)}",
            f"Public Libraries: {stats.get('libraries_public', 0)}",
            f"Other Libraries: {stats.get('libraries_other', 0)}",
        ],
        all_institutions,
    )

    print(f"\n✓ Saved combined dataset to {output_file}")

    # Also save individual files (datasets with no records are skipped)
    for label, institutions in by_label.items():
        if not institutions:
            continue

        output = f"data/instances/japan_{label}.yaml"
        _write_yaml(
            output,
            [
                f"Japanese ISIL Registry - {label.replace('_', ' ').title()}",
                f"Generated: {datetime.now(timezone.utc).isoformat()}",
                f"Total institutions: {len(institutions)}",
            ],
            institutions,
        )

        print(f"  ✓ Saved {output}")

    print("\n" + "=" * 70)
    print("✓ Japanese ISIL parsing complete!")
    print("=" * 70)


if __name__ == '__main__':
    main()