glam/scripts/parse_japanese_isil.py
#!/usr/bin/env python3
"""
Parse Japanese ISIL registry CSV files and generate LinkML-compliant instances.

This script processes the Japanese ISIL registry CSVs from the National Diet Library:
- Public libraries
- Other libraries (academic, specialized)
- Museums
- Archives

Output: YAML files in data/instances/japan/
"""
import sys
from pathlib import Path

import yaml

# Add project root to Python path so the in-repo package is importable
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root / "src"))

from glam_extractor.parsers.japanese_isil import JapaneseISILParser
from glam_extractor.models import InstitutionType
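
# Usage (a sketch; assumes the NDL CSV exports are already in place under
# data/isil/JP/ relative to the project root):
#   python scripts/parse_japanese_isil.py
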
def main():
    """Parse all Japanese ISIL CSV files."""
    data_dir = project_root / "data" / "isil" / "JP"
    output_dir = project_root / "data" / "instances" / "japan"
    output_dir.mkdir(parents=True, exist_ok=True)

    parser = JapaneseISILParser()

    # Define CSV files and their institution types
    csv_files = [
        ("libraries_public.csv", InstitutionType.LIBRARY, "Japanese public libraries"),
        ("libraries_other.csv", InstitutionType.LIBRARY, "Japanese academic and specialized libraries"),
        ("museums.csv", InstitutionType.MUSEUM, "Japanese museums"),
        ("archives.csv", InstitutionType.ARCHIVE, "Japanese archives"),
    ]
    all_stats = {}

    for csv_filename, inst_type, description in csv_files:
        csv_path = data_dir / csv_filename
        if not csv_path.exists():
            print(f"⚠️ File not found: {csv_path}")
            continue

        print(f"\n{'='*70}")
        print(f"Processing: {csv_filename}")
        print(f"Type: {inst_type}")
        print(f"Description: {description}")
        print(f"{'='*70}\n")
        try:
            # Parse CSV rows into raw records
            records = parser.parse_file(csv_path, inst_type)
            print(f"✓ Parsed {len(records)} raw records")

            # Convert to HeritageCustodian models
            custodians = [
                parser.to_heritage_custodian(record, inst_type, str(csv_path))
                for record in records
            ]
            print(f"✓ Converted to {len(custodians)} HeritageCustodian instances")

            # Coverage statistics
            with_ghcid = sum(1 for c in custodians if c.ghcid_numeric is not None)
            with_url = sum(
                1 for c in custodians
                if any(i.identifier_scheme == "Website" for i in c.identifiers)
            )
            stats = {
                "total_records": len(records),
                "with_ghcid": with_ghcid,
                "with_url": with_url,
                "ghcid_coverage": f"{with_ghcid / len(records) * 100:.1f}%" if records else "0%",
                "url_coverage": f"{with_url / len(records) * 100:.1f}%" if records else "0%",
            }
            all_stats[csv_filename] = stats
            print(f"  - Records with GHCID: {with_ghcid}/{len(records)} ({stats['ghcid_coverage']})")
            print(f"  - Records with URL: {with_url}/{len(records)} ({stats['url_coverage']})")
            # Export to YAML (sample the first 10 records for testing)
            output_filename = csv_filename.replace(".csv", "_sample.yaml")
            output_path = output_dir / output_filename
            sample_custodians = custodians[:10]

            # Convert to dicts for YAML serialization
            sample_dicts = []
            for custodian in sample_custodians:
                # Use model_dump() for Pydantic v2; fall back to dict() for v1
                try:
                    data = custodian.model_dump(exclude_none=True)
                except AttributeError:
                    data = custodian.dict(exclude_none=True)

                # Convert datetime objects to ISO strings so the YAML carries
                # plain text rather than PyYAML timestamp scalars
                if data.get("provenance"):
                    prov = data["provenance"]
                    if prov.get("extraction_date"):
                        prov["extraction_date"] = prov["extraction_date"].isoformat()
                    if prov.get("verified_date"):
                        prov["verified_date"] = prov["verified_date"].isoformat()

                # Convert history dates
                if data.get("ghcid_history"):
                    for entry in data["ghcid_history"]:
                        if entry.get("valid_from"):
                            entry["valid_from"] = entry["valid_from"].isoformat()
                        if entry.get("valid_to"):
                            entry["valid_to"] = entry["valid_to"].isoformat()

                sample_dicts.append(data)

            with open(output_path, "w", encoding="utf-8") as f:
                yaml.dump(sample_dicts, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
            print(f"✓ Exported sample ({len(sample_dicts)} records) to: {output_path}")
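            # The exported YAML is a list of mappings; a record looks roughly
            # like this (field names from the model handling above, values
            # purely illustrative):
            #   - ghcid_numeric: 12345
            #     identifiers:
            #       - identifier_scheme: Website
            #         ...
            #     provenance:
            #       extraction_date: "2025-11-19T00:00:00"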
        except Exception as e:
            print(f"❌ Error processing {csv_filename}: {e}")
            import traceback
            traceback.print_exc()
            continue
    # Print summary
    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}\n")
    for filename, stats in all_stats.items():
        print(f"{filename}:")
        print(f"  Total records: {stats['total_records']}")
        print(f"  GHCID coverage: {stats['ghcid_coverage']}")
        print(f"  URL coverage: {stats['url_coverage']}")
        print()

    total_records = sum(s["total_records"] for s in all_stats.values())
    print(f"Grand total: {total_records} Japanese heritage institutions")
    print(f"\nSample YAML files written to: {output_dir}")


if __name__ == "__main__":
    main()
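
# A quick way to spot-check one exported sample (illustrative; the museums
# file name follows from the csv_files table above):
#   python -c "import yaml; print(yaml.safe_load(open('data/instances/japan/museums_sample.yaml'))[0])"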