glam/scripts/export_japanese_isil_to_linkml.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

284 lines
10 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Export ALL Japanese ISIL registry institutions to LinkML format.

This script performs a COMPLETE export of all Japanese heritage institutions
from the National Diet Library ISIL registry (12,065+ institutions) into a
single consolidated LinkML-compliant YAML file.

Data Sources:
- Public libraries (libraries_public.csv)
- Academic/specialized libraries (libraries_other.csv)
- Museums (museums.csv)
- Archives (archives.csv)

Output: data/instances/japan/jp_institutions.yaml (COMPLETE dataset)
Data Quality: TIER_1_AUTHORITATIVE (official National Diet Library registry)
Coverage: All Japanese prefectures (47 prefectures)
"""
# Standard-library imports.
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, List
from collections import Counter
# Add project root to Python path
# NOTE(review): the script resolves two levels up from its own location and
# assumes the importable package lives under <project_root>/src — confirm
# this matches the repository layout if the script is relocated.
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root / "src"))
# Project imports become resolvable only after the sys.path tweak above.
from glam_extractor.parsers.japanese_isil import JapaneseISILParser
from glam_extractor.models import InstitutionType, HeritageCustodian
import yaml
def serialize_custodian(custodian: "HeritageCustodian") -> dict:
    """
    Serialize a HeritageCustodian to a YAML-safe plain dictionary.

    Round-trips through JSON so that nested Pydantic models, datetime
    objects, and HttpUrl values are all reduced to plain Python types
    that yaml.dump can emit without custom representers.

    Args:
        custodian: The institution record to serialize.

    Returns:
        A dict containing only JSON-native types; keys whose values are
        None are omitted (exclude_none=True).
    """
    import json

    # Pydantic v2 renamed .json() to .model_dump_json(); on v2 the old
    # name emits a DeprecationWarning.  Prefer the v2 API when present
    # and fall back to the v1 method otherwise.
    if hasattr(custodian, "model_dump_json"):
        json_str = custodian.model_dump_json(exclude_none=True)
    else:
        json_str = custodian.json(exclude_none=True)
    return json.loads(json_str)
def analyze_coverage(custodians: List["HeritageCustodian"]) -> Dict:
    """
    Generate comprehensive statistics about the dataset.

    Args:
        custodians: Parsed institution records (may be empty).

    Returns:
        A dict with the total count, per-type counts, the top-10
        prefectures, field-coverage counts/percentages, and the top-10
        ISIL code prefixes.  An empty input yields zero counts instead
        of raising ZeroDivisionError.
    """
    total = len(custodians)

    def _pct(count: int) -> str:
        # Guard against an empty dataset: total == 0 would otherwise
        # raise ZeroDivisionError in every percentage below.
        return f"{count / total * 100:.1f}%" if total else "0.0%"

    # Count by institution type
    types = Counter(c.institution_type for c in custodians)
    # Count by prefecture; the first location's region is taken as primary.
    prefectures = Counter(
        c.locations[0].region if c.locations and c.locations[0].region else "UNKNOWN"
        for c in custodians
    )
    # Coverage metrics
    with_ghcid = sum(1 for c in custodians if c.ghcid_numeric is not None)
    with_url = sum(
        1 for c in custodians
        if c.identifiers and any(i.identifier_scheme == "Website" for i in c.identifiers)
    )
    # Phone numbers live inside free-text descriptions ("Tel:" or the
    # Japanese "電話:" label), not in a structured field.
    with_phone = sum(
        1 for c in custodians
        if c.description and ("Tel:" in c.description or "電話:" in c.description)
    )
    with_postal = sum(
        1 for c in custodians
        if c.locations and c.locations[0].postal_code
    )
    with_street = sum(
        1 for c in custodians
        if c.locations and c.locations[0].street_address
    )
    # ISIL code patterns, e.g. "JP-1001234" -> prefix "10".
    # (c.identifiers or []) guards against None, matching the with_url check.
    isil_codes = [
        next((i.identifier_value for i in (c.identifiers or []) if i.identifier_scheme == "ISIL"), None)
        for c in custodians
    ]
    isil_prefixes = Counter(
        code.split('-')[1][:2] if code and '-' in code else "UNKNOWN"
        for code in isil_codes
    )
    return {
        "total_institutions": total,
        "by_type": dict(types),
        "by_prefecture": dict(prefectures.most_common(10)),  # Top 10
        "total_prefectures": len([p for p in prefectures if p != "UNKNOWN"]),
        "coverage": {
            "ghcid": {"count": with_ghcid, "percentage": _pct(with_ghcid)},
            "website": {"count": with_url, "percentage": _pct(with_url)},
            "phone": {"count": with_phone, "percentage": _pct(with_phone)},
            "postal_code": {"count": with_postal, "percentage": _pct(with_postal)},
            "street_address": {"count": with_street, "percentage": _pct(with_street)},
        },
        "isil_distribution": dict(isil_prefixes.most_common(10)),  # Top 10 ISIL prefixes
    }
def main():
    """
    Export all Japanese ISIL institutions to a consolidated LinkML YAML file.

    Workflow:
      1. Parse each registry CSV under data/isil/JP/ (libraries, museums,
         archives) into raw records.
      2. Convert records to HeritageCustodian instances.
      3. Aggregate dataset-wide statistics via analyze_coverage().
      4. Write the full dataset and a statistics summary to
         data/instances/japan/.

    Missing CSVs are skipped with a warning; a failure while processing one
    file is reported (with traceback) and the remaining files still run.
    """
    print("="*80)
    print("JAPANESE ISIL REGISTRY → LinkML EXPORT")
    print("="*80)
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()
    data_dir = project_root / "data" / "isil" / "JP"
    output_dir = project_root / "data" / "instances" / "japan"
    output_dir.mkdir(parents=True, exist_ok=True)
    # Output file for complete dataset
    output_path = output_dir / "jp_institutions.yaml"
    parser = JapaneseISILParser()
    # Source CSVs with the institution type each one maps to.
    csv_files = [
        ("libraries_public.csv", InstitutionType.LIBRARY, "Public libraries"),
        ("libraries_other.csv", InstitutionType.LIBRARY, "Academic/specialized libraries"),
        ("museums.csv", InstitutionType.MUSEUM, "Museums"),
        ("archives.csv", InstitutionType.ARCHIVE, "Archives"),
    ]
    all_custodians: List[HeritageCustodian] = []
    file_stats: Dict[str, Dict] = {}
    # Process each CSV file independently so one bad file cannot sink the run.
    for csv_filename, inst_type, description in csv_files:
        csv_path = data_dir / csv_filename
        if not csv_path.exists():
            print(f"⚠️ File not found: {csv_path}")
            print(f" Skipping {description}")
            continue
        # BUG FIX: these two separators were f"\n{''*80}" / f"{''*80}" —
        # repeating the EMPTY string 80 times, which prints nothing.
        # Use a visible dash rule instead.
        print(f"\n{'-'*80}")
        print(f"Processing: {csv_filename}")
        print(f"Type: {inst_type.value}")
        print(f"Description: {description}")
        print(f"{'-'*80}")
        try:
            # Parse CSV
            print(f" → Parsing CSV...")
            records = parser.parse_file(csv_path, inst_type)
            print(f" ✓ Parsed {len(records):,} raw records")
            # Convert to HeritageCustodian models
            print(f" → Converting to LinkML format...")
            custodians = [
                parser.to_heritage_custodian(record, inst_type, str(csv_path))
                for record in records
            ]
            print(f" ✓ Converted {len(custodians):,} HeritageCustodian instances")
            # Per-file quality metrics: GHCID/URL coverage and prefecture spread.
            with_ghcid = sum(1 for c in custodians if c.ghcid_numeric is not None)
            with_url = sum(
                1 for c in custodians
                if c.identifiers and any(i.identifier_scheme == "Website" for i in c.identifiers)
            )
            prefectures = len(set(
                c.locations[0].region
                for c in custodians
                if c.locations and c.locations[0].region
            ))
            stats = {
                "total_records": len(records),
                "with_ghcid": with_ghcid,
                "with_url": with_url,
                "prefectures_covered": prefectures,
                "ghcid_coverage": f"{with_ghcid/len(records)*100:.1f}%" if records else "0%",
                "url_coverage": f"{with_url/len(records)*100:.1f}%" if records else "0%",
            }
            file_stats[csv_filename] = stats
            print(f" → Statistics:")
            print(f" • GHCID coverage: {with_ghcid:,}/{len(records):,} ({stats['ghcid_coverage']})")
            print(f" • Website URLs: {with_url:,}/{len(records):,} ({stats['url_coverage']})")
            print(f" • Prefectures: {prefectures}")
            # Add to master list
            all_custodians.extend(custodians)
            print(f" ✓ Added to master dataset")
        except Exception as e:
            # Report but continue: the remaining CSVs should still be exported.
            print(f" ❌ Error processing {csv_filename}: {e}")
            import traceback
            traceback.print_exc()
            continue
    # Generate comprehensive statistics over the merged dataset.
    print(f"\n{'='*80}")
    print("ANALYZING COMPLETE DATASET")
    print(f"{'='*80}")
    stats = analyze_coverage(all_custodians)
    print(f"\n📊 Dataset Overview:")
    print(f" • Total institutions: {stats['total_institutions']:,}")
    print(f" • Prefectures covered: {stats['total_prefectures']}/47")
    print(f"\n📚 By Institution Type:")
    for inst_type, count in stats["by_type"].items():
        print(f"{inst_type}: {count:,}")
    print(f"\n🗺️ Top 10 Prefectures:")
    for prefecture, count in stats["by_prefecture"].items():
        print(f"{prefecture}: {count:,}")
    print(f"\n✅ Data Coverage:")
    for field, coverage in stats["coverage"].items():
        print(f"{field.replace('_', ' ').title()}: {coverage['count']:,} ({coverage['percentage']})")
    # Export to YAML
    print(f"\n{'='*80}")
    print("EXPORTING TO LINKML YAML")
    print(f"{'='*80}")
    print(f" → Serializing {len(all_custodians):,} institutions...")
    serialized = [serialize_custodian(c) for c in all_custodians]
    print(f" → Writing to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            serialized,
            f,
            allow_unicode=True,        # keep Japanese text readable, not escaped
            sort_keys=False,           # preserve model field order
            default_flow_style=False,  # block style for readability
            width=120,
        )
    file_size_mb = output_path.stat().st_size / (1024 * 1024)
    print(f" ✓ Export complete!")
    print(f" 📁 File: {output_path}")
    print(f" 📏 Size: {file_size_mb:.2f} MB")
    # Export statistics summary alongside the dataset.
    stats_path = output_dir / "jp_institutions_statistics.yaml"
    print(f"\n → Writing statistics to {stats_path}...")
    with open(stats_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            {
                "export_date": datetime.now().isoformat(),
                "source_files": file_stats,
                "dataset_statistics": stats,
            },
            f,
            allow_unicode=True,
            sort_keys=False,
            default_flow_style=False,
        )
    print(f" ✓ Statistics exported")
    # Final summary
    print(f"\n{'='*80}")
    print("✅ EXPORT COMPLETE")
    print(f"{'='*80}")
    print(f"\n📦 Exported Files:")
    print(f" • Main dataset: {output_path}")
    print(f" • Statistics: {stats_path}")
    print(f"\n📊 Summary:")
    print(f" • Total institutions: {len(all_custodians):,}")
    print(f" • Data tier: TIER_1_AUTHORITATIVE")
    print(f" • Source: National Diet Library ISIL Registry")
    print(f" • Geographic coverage: {stats['total_prefectures']}/47 Japanese prefectures")
    print(f"\n⏱️ Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()