- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
284 lines
10 KiB
Python
Executable file
284 lines
10 KiB
Python
Executable file
#!/usr/bin/env python3
"""
Export ALL Japanese ISIL registry institutions to LinkML format.

This script performs a COMPLETE export of all Japanese heritage institutions
from the National Diet Library ISIL registry (12,065+ institutions) into a
single consolidated LinkML-compliant YAML file.

Data Sources:
- Public libraries (libraries_public.csv)
- Academic/specialized libraries (libraries_other.csv)
- Museums (museums.csv)
- Archives (archives.csv)

Output: data/instances/japan/jp_institutions.yaml (COMPLETE dataset)

Data Quality: TIER_1_AUTHORITATIVE (official National Diet Library registry)
Coverage: All Japanese prefectures (47 prefectures)
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Dict, List
|
|
from collections import Counter
|
|
|
|
# Add project root to Python path
|
|
project_root = Path(__file__).resolve().parent.parent
|
|
sys.path.insert(0, str(project_root / "src"))
|
|
|
|
from glam_extractor.parsers.japanese_isil import JapaneseISILParser
|
|
from glam_extractor.models import InstitutionType, HeritageCustodian
|
|
import yaml
|
|
|
|
|
|
def serialize_custodian(custodian: "HeritageCustodian") -> dict:
    """
    Serialize HeritageCustodian to a YAML-safe dictionary.

    Round-trips the model through JSON so that nested Pydantic models,
    datetime objects, and HttpUrl objects all collapse to plain Python
    types (str/int/float/list/dict) that yaml.dump can emit cleanly.

    Args:
        custodian: The model instance to serialize.

    Returns:
        A plain dict with all None-valued fields omitted.
    """
    import json

    # Pydantic v2 renamed `.json()` to `.model_dump_json()`; prefer the new
    # name when present so the script works with either major version.
    dump_json = getattr(custodian, "model_dump_json", None) or custodian.json
    json_str = dump_json(exclude_none=True)
    data = json.loads(json_str)

    return data
|
|
|
|
|
|
def analyze_coverage(custodians: List["HeritageCustodian"]) -> Dict:
    """Generate comprehensive statistics about the dataset.

    Computes totals, per-type and per-prefecture counts, field-coverage
    percentages, and the top ISIL code prefixes.

    Args:
        custodians: The full list of converted institutions.

    Returns:
        A nested dict of statistics. Safe to call with an empty list
        (all percentages report "0.0%" instead of dividing by zero).
    """
    total = len(custodians)

    def pct(count: int) -> str:
        # Guard against ZeroDivisionError for an empty dataset.
        return f"{count / total * 100:.1f}%" if total else "0.0%"

    # Count by institution type
    types = Counter(c.institution_type for c in custodians)

    # Count by prefecture (first location's region, when present)
    prefectures = Counter(
        c.locations[0].region if c.locations and c.locations[0].region else "UNKNOWN"
        for c in custodians
    )

    # Coverage metrics
    with_ghcid = sum(1 for c in custodians if c.ghcid_numeric is not None)
    with_url = sum(
        1 for c in custodians
        if c.identifiers and any(i.identifier_scheme == "Website" for i in c.identifiers)
    )
    with_phone = sum(
        1 for c in custodians
        if c.description and ("Tel:" in c.description or "電話:" in c.description)
    )
    with_postal = sum(
        1 for c in custodians
        if c.locations and c.locations[0].postal_code
    )
    with_street = sum(
        1 for c in custodians
        if c.locations and c.locations[0].street_address
    )

    # ISIL code patterns; `or []` guards institutions with no identifiers,
    # matching the `c.identifiers and ...` guard used for the URL count.
    isil_codes = [
        next((i.identifier_value for i in (c.identifiers or []) if i.identifier_scheme == "ISIL"), None)
        for c in custodians
    ]
    isil_prefixes = Counter(
        code.split('-')[1][:2] if code and '-' in code else "UNKNOWN"
        for code in isil_codes
    )

    return {
        "total_institutions": total,
        "by_type": dict(types),
        "by_prefecture": dict(prefectures.most_common(10)),  # Top 10
        "total_prefectures": len([p for p in prefectures if p != "UNKNOWN"]),
        "coverage": {
            "ghcid": {"count": with_ghcid, "percentage": pct(with_ghcid)},
            "website": {"count": with_url, "percentage": pct(with_url)},
            "phone": {"count": with_phone, "percentage": pct(with_phone)},
            "postal_code": {"count": with_postal, "percentage": pct(with_postal)},
            "street_address": {"count": with_street, "percentage": pct(with_street)},
        },
        "isil_distribution": dict(isil_prefixes.most_common(10)),  # Top 10 ISIL prefixes
    }
|
|
|
|
|
|
def main():
    """Export all Japanese ISIL institutions to consolidated LinkML YAML.

    Pipeline:
      1. Parse each registry CSV (libraries, museums, archives) from
         ``data/isil/JP`` into raw records.
      2. Convert records to HeritageCustodian instances and accumulate
         per-file statistics.
      3. Analyze the merged dataset and write both the full YAML export
         and a companion statistics file to ``data/instances/japan``.

    Exits with status 1 if no institutions could be exported at all.
    """

    print("="*80)
    print("JAPANESE ISIL REGISTRY → LinkML EXPORT")
    print("="*80)
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    data_dir = project_root / "data" / "isil" / "JP"
    output_dir = project_root / "data" / "instances" / "japan"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Output file for complete dataset
    output_path = output_dir / "jp_institutions.yaml"

    parser = JapaneseISILParser()

    # Define CSV files and their institution types
    csv_files = [
        ("libraries_public.csv", InstitutionType.LIBRARY, "Public libraries"),
        ("libraries_other.csv", InstitutionType.LIBRARY, "Academic/specialized libraries"),
        ("museums.csv", InstitutionType.MUSEUM, "Museums"),
        ("archives.csv", InstitutionType.ARCHIVE, "Archives"),
    ]

    all_custodians: List[HeritageCustodian] = []
    file_stats: Dict[str, Dict] = {}

    # Process each CSV file independently; one bad file must not abort the run.
    for csv_filename, inst_type, description in csv_files:
        csv_path = data_dir / csv_filename

        if not csv_path.exists():
            print(f"⚠️ File not found: {csv_path}")
            print(f" Skipping {description}")
            continue

        print(f"\n{'─'*80}")
        print(f"Processing: {csv_filename}")
        print(f"Type: {inst_type.value}")
        print(f"Description: {description}")
        print(f"{'─'*80}")

        try:
            # Parse CSV
            print(f" → Parsing CSV...")
            records = parser.parse_file(csv_path, inst_type)
            print(f" ✓ Parsed {len(records):,} raw records")

            # Convert to HeritageCustodian models
            print(f" → Converting to LinkML format...")
            custodians = [
                parser.to_heritage_custodian(record, inst_type, str(csv_path))
                for record in records
            ]
            print(f" ✓ Converted {len(custodians):,} HeritageCustodian instances")

            # Calculate per-file statistics
            with_ghcid = sum(1 for c in custodians if c.ghcid_numeric is not None)
            with_url = sum(
                1 for c in custodians
                if c.identifiers and any(i.identifier_scheme == "Website" for i in c.identifiers)
            )
            prefectures = len(set(
                c.locations[0].region
                for c in custodians
                if c.locations and c.locations[0].region
            ))

            # NOTE: named `per_file` (not `stats`) so it does not shadow the
            # dataset-level `stats` computed after the loop.
            per_file = {
                "total_records": len(records),
                "with_ghcid": with_ghcid,
                "with_url": with_url,
                "prefectures_covered": prefectures,
                "ghcid_coverage": f"{with_ghcid/len(records)*100:.1f}%" if records else "0%",
                "url_coverage": f"{with_url/len(records)*100:.1f}%" if records else "0%",
            }
            file_stats[csv_filename] = per_file

            print(f" → Statistics:")
            print(f" • GHCID coverage: {with_ghcid:,}/{len(records):,} ({per_file['ghcid_coverage']})")
            print(f" • Website URLs: {with_url:,}/{len(records):,} ({per_file['url_coverage']})")
            print(f" • Prefectures: {prefectures}")

            # Add to master list
            all_custodians.extend(custodians)
            print(f" ✓ Added to master dataset")

        except Exception as e:
            # Best-effort: report the failure with a traceback and keep going.
            print(f" ❌ Error processing {csv_filename}: {e}")
            import traceback
            traceback.print_exc()
            continue

    # Guard: with no data, analyze_coverage would divide by zero and the
    # YAML export would be an empty (useless) file. Fail loudly instead.
    if not all_custodians:
        print("\n❌ No institutions were exported — check that the ISIL CSV files exist.")
        sys.exit(1)

    # Generate comprehensive statistics
    print(f"\n{'='*80}")
    print("ANALYZING COMPLETE DATASET")
    print(f"{'='*80}")

    stats = analyze_coverage(all_custodians)

    print(f"\n📊 Dataset Overview:")
    print(f" • Total institutions: {stats['total_institutions']:,}")
    print(f" • Prefectures covered: {stats['total_prefectures']}/47")
    print(f"\n📚 By Institution Type:")
    for inst_type, count in stats["by_type"].items():
        print(f" • {inst_type}: {count:,}")
    print(f"\n🗺️ Top 10 Prefectures:")
    for prefecture, count in stats["by_prefecture"].items():
        print(f" • {prefecture}: {count:,}")
    print(f"\n✅ Data Coverage:")
    for field, coverage in stats["coverage"].items():
        print(f" • {field.replace('_', ' ').title()}: {coverage['count']:,} ({coverage['percentage']})")

    # Export to YAML
    print(f"\n{'='*80}")
    print("EXPORTING TO LINKML YAML")
    print(f"{'='*80}")

    print(f" → Serializing {len(all_custodians):,} institutions...")
    serialized = [serialize_custodian(c) for c in all_custodians]

    print(f" → Writing to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            serialized,
            f,
            allow_unicode=True,       # keep Japanese text readable, not \u-escaped
            sort_keys=False,          # preserve model field order
            default_flow_style=False,
            width=120,
        )

    file_size_mb = output_path.stat().st_size / (1024 * 1024)
    print(f" ✓ Export complete!")
    print(f" 📁 File: {output_path}")
    print(f" 📏 Size: {file_size_mb:.2f} MB")

    # Export statistics summary
    stats_path = output_dir / "jp_institutions_statistics.yaml"
    print(f"\n → Writing statistics to {stats_path}...")
    with open(stats_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            {
                "export_date": datetime.now().isoformat(),
                "source_files": file_stats,
                "dataset_statistics": stats,
            },
            f,
            allow_unicode=True,
            sort_keys=False,
            default_flow_style=False,
        )
    print(f" ✓ Statistics exported")

    # Final summary
    print(f"\n{'='*80}")
    print("✅ EXPORT COMPLETE")
    print(f"{'='*80}")
    print(f"\n📦 Exported Files:")
    print(f" • Main dataset: {output_path}")
    print(f" • Statistics: {stats_path}")
    print(f"\n📊 Summary:")
    print(f" • Total institutions: {len(all_custodians):,}")
    print(f" • Data tier: TIER_1_AUTHORITATIVE")
    print(f" • Source: National Diet Library ISIL Registry")
    print(f" • Geographic coverage: {stats['total_prefectures']}/47 Japanese prefectures")
    print(f"\n⏱️ Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the full export only when executed directly.
    main()
|