glam/parse_japan_isil.py
2025-11-19 23:25:22 +01:00

157 lines
6 KiB
Python

#!/usr/bin/env python3
"""Parse Japanese ISIL registries to LinkML format."""
import csv
import yaml
from datetime import datetime, timezone
from typing import List, Dict
def parse_japanese_csv(filepath: str, inst_type: str) -> List[Dict]:
    """Parse one Japanese ISIL registry CSV file into LinkML-style records.

    A row is skipped when its ``ISIL`` cell is empty or starts with
    ``DELETE``, or when its ``Institution name in English`` cell is empty.

    Args:
        filepath: Path to a CSV file whose first row holds the column names.
        inst_type: Value stored in every record's ``institution_type`` key.

    Returns:
        One dict per kept row, in file order. Each dict carries ``id``,
        ``name``, ``institution_type``, ``locations``, ``identifiers`` and
        ``provenance``; ``homepage`` and ``contact_info`` are added only when
        the row has a URL / telephone number.
    """

    def cell(row, column):
        # DictReader fills cells missing from a short row with None, so a
        # plain ``row.get(column, '')`` can still return None.
        return (row.get(column) or '').strip()

    institutions = []
    print(f"Parsing {filepath}...")

    # One timestamp per file so every record from the same run agrees.
    extraction_date = datetime.now(timezone.utc).isoformat()

    # 'utf-8-sig' transparently drops a leading BOM (otherwise the first
    # header becomes '\ufeffISIL' and every row is skipped); newline='' is
    # what the csv module requires for correct quoted-newline handling.
    with open(filepath, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            isil = cell(row, 'ISIL')
            if not isil or isil.startswith('DELETE'):
                continue

            name_en = cell(row, 'Institution name in English')
            if not name_en:
                continue

            # Create institution ID
            inst_id = (
                "https://w3id.org/heritage/custodian/jp/"
                f"{isil.lower().replace('-', '')}"
            )

            # Parse location
            prefecture = cell(row, 'Prefecture').title()
            city = cell(row, 'City/Ward/Town/Village').title()
            postal_code = cell(row, 'Postal code')
            street = cell(row, 'Address (Street Level)')

            # Build a single-line address from the non-empty parts
            street_address = ', '.join(
                part for part in (street, city, prefecture, postal_code) if part
            )

            # Get additional fields
            phone = cell(row, 'Telephone number')
            website = cell(row, 'URL')

            # Only non-empty location fields are kept in the record.
            location_fields = {
                'country': 'JP',
                'city': city,
                'region': prefecture,
                'postal_code': postal_code,
                'street_address': street_address,
            }
            location = {k: v for k, v in location_fields.items() if v}

            # Create LinkML record
            institution = {
                'id': inst_id,
                'name': name_en,
                'institution_type': inst_type,
                'locations': [location],
                'identifiers': [
                    {
                        'identifier_scheme': 'ISIL',
                        'identifier_value': isil,
                        'identifier_url': f'https://isil.org/{isil}',
                    }
                ],
                'provenance': {
                    'data_source': 'CSV_REGISTRY',
                    'data_tier': 'TIER_1_AUTHORITATIVE',
                    'extraction_date': extraction_date,
                    'extraction_method': 'CSV parsing from Japanese ISIL registry',
                    'confidence_score': 0.98,
                    'source_url': 'https://www.ndl.go.jp/jp/aboutus/isil.html',
                },
            }

            # Add optional fields
            if website:
                institution['homepage'] = website
                institution['identifiers'].append({
                    'identifier_scheme': 'Website',
                    'identifier_value': website,
                    'identifier_url': website,
                })
            if phone:
                institution['contact_info'] = {'phone': phone}

            institutions.append(institution)

    print(f"  ✓ Parsed {len(institutions)} institutions")
    return institutions
def _write_yaml_with_header(path, header_lines, records):
    """Write *records* to *path* as YAML, preceded by ``#`` comment lines."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write("---\n")
        for line in header_lines:
            f.write(f"# {line}\n")
        f.write("\n")
        yaml.dump(records, f, allow_unicode=True, default_flow_style=False, sort_keys=False)


def main():
    """Parse the four Japanese ISIL CSV files and write YAML instance files.

    Writes one combined file with every institution plus one file per input
    dataset (skipped when that dataset produced no records), and prints a
    summary of the counts along the way.
    """
    import os  # local import: only needed to prepare the output directory

    print("=" * 70)
    print("Japanese ISIL Registry Parsing")
    print("=" * 70)

    # (input CSV path, institution_type value, label used in stats/filenames)
    datasets = [
        ('data/isil/JP/archives.csv', 'ARCHIVE', 'archives'),
        ('data/isil/JP/museums.csv', 'MUSEUM', 'museums'),
        ('data/isil/JP/libraries_public.csv', 'LIBRARY', 'libraries_public'),
        ('data/isil/JP/libraries_other.csv', 'LIBRARY', 'libraries_other'),
    ]

    all_institutions = []
    stats = {}
    # Keep each dataset's records separately: two datasets share the LIBRARY
    # type, so filtering the combined list by type cannot tell them apart.
    by_label = {}
    for filepath, inst_type, label in datasets:
        institutions = parse_japanese_csv(filepath, inst_type)
        by_label[label] = institutions
        all_institutions.extend(institutions)
        stats[label] = len(institutions)

    print(f"\n✓ Total institutions parsed: {len(all_institutions)}")
    print("\nBreakdown:")
    for label, count in stats.items():
        print(f"  {label}: {count}")

    # Save combined file
    output_file = "data/instances/japan_isil_all.yaml"
    # Create the output directory up front so a fresh checkout does not fail
    # only after all parsing work is done.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    _write_yaml_with_header(
        output_file,
        [
            "Japanese ISIL Registry - All Institutions",
            f"Generated: {datetime.now(timezone.utc).isoformat()}",
            f"Total institutions: {len(all_institutions)}",
            f"Archives: {stats.get('archives', 0)}",
            f"Museums: {stats.get('museums', 0)}",
            f"Public Libraries: {stats.get('libraries_public', 0)}",
            f"Other Libraries: {stats.get('libraries_other', 0)}",
        ],
        all_institutions,
    )
    print(f"\n✓ Saved combined dataset to {output_file}")

    # Also save individual files, one per input dataset
    for label, institutions in by_label.items():
        if not institutions:
            continue
        output = f"data/instances/japan_{label}.yaml"
        _write_yaml_with_header(
            output,
            [
                f"Japanese ISIL Registry - {label.replace('_', ' ').title()}",
                f"Generated: {datetime.now(timezone.utc).isoformat()}",
                f"Total institutions: {len(institutions)}",
            ],
            institutions,
        )
        print(f"  ✓ Saved {output}")

    print("\n" + "=" * 70)
    print("✓ Japanese ISIL parsing complete!")
    print("=" * 70)
# Script entry point: run the parser only when this file is executed
# directly, not when it is imported as a module.
if "__main__" == __name__:
    main()