#!/usr/bin/env python3
"""Parse Japanese ISIL registries to LinkML format."""

import csv
from datetime import datetime, timezone
from typing import Dict, List

import yaml
def parse_japanese_csv(filepath: str, inst_type: str) -> List[Dict]:
    """Parse a Japanese ISIL registry CSV file into LinkML institution records.

    Rows with a missing ISIL, an ISIL marked ``DELETE``, or no English
    institution name are skipped.

    Args:
        filepath: Path to the registry CSV (UTF-8, header row expected).
        inst_type: Institution type to assign to every row in this file
            (e.g. ``'ARCHIVE'``, ``'MUSEUM'``, ``'LIBRARY'``).

    Returns:
        A list of institution dicts in LinkML-compatible shape.
    """
    institutions: List[Dict] = []

    print(f"Parsing {filepath}...")

    # newline='' is required by the csv module so quoted fields that
    # contain embedded newlines are parsed correctly.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)

        for row in reader:
            isil = row.get('ISIL', '').strip()
            # Skip rows with no ISIL and deregistered ('DELETE...') entries.
            if not isil or isil.startswith('DELETE'):
                continue

            name_en = row.get('Institution name in English', '').strip()
            if not name_en:
                continue

            # Stable institution ID derived from the ISIL
            # (lowercased, dashes removed).
            inst_id = f"https://w3id.org/heritage/custodian/jp/{isil.lower().replace('-', '')}"

            # Location fields; .title() normalizes casing of registry values.
            prefecture = row.get('Prefecture', '').strip().title()
            city = row.get('City/Ward/Town/Village', '').strip().title()
            postal_code = row.get('Postal code', '').strip()
            street = row.get('Address (Street Level)', '').strip()

            # Single comma-separated address from the non-empty parts.
            address_parts = [street, city, prefecture, postal_code]
            street_address = ', '.join(p for p in address_parts if p)

            phone = row.get('Telephone number', '').strip()
            website = row.get('URL', '').strip()

            # Build the location with only the fields that are present,
            # instead of inserting None values and stripping them afterwards.
            location: Dict = {'country': 'JP'}
            if city:
                location['city'] = city
            if prefecture:
                location['region'] = prefecture
            if postal_code:
                location['postal_code'] = postal_code
            if street_address:
                location['street_address'] = street_address

            institution = {
                'id': inst_id,
                'name': name_en,
                'institution_type': inst_type,
                'locations': [location],
                'identifiers': [
                    {
                        'identifier_scheme': 'ISIL',
                        'identifier_value': isil,
                        # NOTE(review): assumes isil.org resolves ISILs at
                        # this path — confirm against the registry.
                        'identifier_url': f'https://isil.org/{isil}'
                    }
                ],
                'provenance': {
                    'data_source': 'CSV_REGISTRY',
                    'data_tier': 'TIER_1_AUTHORITATIVE',
                    'extraction_date': datetime.now(timezone.utc).isoformat(),
                    'extraction_method': 'CSV parsing from Japanese ISIL registry',
                    'confidence_score': 0.98,
                    'source_url': 'https://www.ndl.go.jp/jp/aboutus/isil.html'
                }
            }

            # Optional fields: homepage (also recorded as an extra
            # identifier) and phone contact.
            if website:
                institution['homepage'] = website
                institution['identifiers'].append({
                    'identifier_scheme': 'Website',
                    'identifier_value': website,
                    'identifier_url': website
                })

            if phone:
                institution['contact_info'] = {'phone': phone}

            institutions.append(institution)

    print(f"  ✓ Parsed {len(institutions)} institutions")
    return institutions
def main():
    """Parse all Japanese ISIL registry CSVs and write LinkML YAML files.

    Writes one combined YAML file plus one YAML file per source dataset
    under ``data/instances/``.
    """
    print("=" * 70)
    print("Japanese ISIL Registry Parsing")
    print("=" * 70)

    # (source filepath, institution type, output label)
    datasets = [
        ('data/isil/JP/archives.csv', 'ARCHIVE', 'archives'),
        ('data/isil/JP/museums.csv', 'MUSEUM', 'museums'),
        ('data/isil/JP/libraries_public.csv', 'LIBRARY', 'libraries_public'),
        ('data/isil/JP/libraries_other.csv', 'LIBRARY', 'libraries_other')
    ]

    all_institutions = []
    by_label = {}  # label -> institutions parsed from that specific file
    stats = {}

    for filepath, inst_type, label in datasets:
        institutions = parse_japanese_csv(filepath, inst_type)
        all_institutions.extend(institutions)
        by_label[label] = institutions
        stats[label] = len(institutions)

    print(f"\n✓ Total institutions parsed: {len(all_institutions)}")
    print("\nBreakdown:")
    for label, count in stats.items():
        print(f"  {label}: {count}")

    # Save combined file
    output_file = "data/instances/japan_isil_all.yaml"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("---\n")
        f.write("# Japanese ISIL Registry - All Institutions\n")
        f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"# Total institutions: {len(all_institutions)}\n")
        f.write(f"# Archives: {stats.get('archives', 0)}\n")
        f.write(f"# Museums: {stats.get('museums', 0)}\n")
        f.write(f"# Public Libraries: {stats.get('libraries_public', 0)}\n")
        f.write(f"# Other Libraries: {stats.get('libraries_other', 0)}\n\n")
        yaml.dump(all_institutions, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)

    print(f"\n✓ Saved combined dataset to {output_file}")

    # Also save individual files.
    # BUG FIX: the previous filter matched on institution_type only, so the
    # two LIBRARY datasets each received institutions from BOTH library
    # files; use the per-file lists captured during parsing instead.
    for filepath, inst_type, label in datasets:
        institutions = by_label.get(label, [])
        if not institutions:
            continue

        output = f"data/instances/japan_{label}.yaml"
        with open(output, 'w', encoding='utf-8') as f:
            f.write("---\n")
            f.write(f"# Japanese ISIL Registry - {label.replace('_', ' ').title()}\n")
            f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
            f.write(f"# Total institutions: {len(institutions)}\n\n")
            yaml.dump(institutions, f, allow_unicode=True,
                      default_flow_style=False, sort_keys=False)

        print(f"  ✓ Saved {output}")

    print("\n" + "=" * 70)
    print("✓ Japanese ISIL parsing complete!")
    print("=" * 70)


if __name__ == '__main__':
    main()