- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
149 lines
5.7 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Parse Japanese ISIL registry CSV files and generate LinkML-compliant instances.
|
|
|
|
This script processes the Japanese ISIL registry CSVs from the National Diet Library:
|
|
- Public libraries
|
|
- Other libraries (academic, specialized)
|
|
- Museums
|
|
- Archives
|
|
|
|
Output: YAML files in data/instances/japan/
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Add project root to Python path
|
|
project_root = Path(__file__).resolve().parent.parent
|
|
sys.path.insert(0, str(project_root / "src"))
|
|
|
|
from glam_extractor.parsers.japanese_isil import JapaneseISILParser
|
|
from glam_extractor.models import InstitutionType
|
|
import yaml
|
|
|
|
|
|
def _stringify_dates(data):
    """Convert datetime objects in a serialized custodian dict to ISO-8601 strings, in place.

    Covers the provenance dates (``extraction_date``, ``verified_date``) and
    the ``valid_from``/``valid_to`` dates of every ``ghcid_history`` entry,
    so the YAML output contains plain strings rather than datetime objects.
    """
    prov = data.get("provenance")
    if prov:
        for key in ("extraction_date", "verified_date"):
            if prov.get(key):
                prov[key] = prov[key].isoformat()
    for entry in data.get("ghcid_history") or []:
        for key in ("valid_from", "valid_to"):
            if entry.get(key):
                entry[key] = entry[key].isoformat()


def _export_sample_yaml(custodians, output_path, sample_size=10):
    """Write the first *sample_size* custodians to *output_path* as YAML.

    Returns the number of records actually written (fewer than *sample_size*
    when the input list is shorter).
    """
    sample_dicts = []
    for custodian in custodians[:sample_size]:
        # model_dump() is Pydantic v2; fall back to dict() for v1.
        try:
            data = custodian.model_dump(exclude_none=True)
        except AttributeError:
            data = custodian.dict(exclude_none=True)
        _stringify_dates(data)
        sample_dicts.append(data)

    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(sample_dicts, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    return len(sample_dicts)


def main():
    """Parse all Japanese ISIL CSV files.

    For each registry CSV under data/isil/JP (public libraries, other
    libraries, museums, archives): parse the records, convert them to
    HeritageCustodian instances, report GHCID/URL coverage statistics, and
    export the first 10 converted records as YAML to data/instances/japan.
    Missing files are skipped with a warning; an error in one file is
    reported and does not stop processing of the remaining files.
    """
    data_dir = project_root / "data" / "isil" / "JP"
    output_dir = project_root / "data" / "instances" / "japan"
    output_dir.mkdir(parents=True, exist_ok=True)

    parser = JapaneseISILParser()

    # (CSV filename, institution type, human-readable description)
    csv_files = [
        ("libraries_public.csv", InstitutionType.LIBRARY, "Japanese public libraries"),
        ("libraries_other.csv", InstitutionType.LIBRARY, "Japanese academic and specialized libraries"),
        ("museums.csv", InstitutionType.MUSEUM, "Japanese museums"),
        ("archives.csv", InstitutionType.ARCHIVE, "Japanese archives"),
    ]

    all_stats = {}

    for csv_filename, inst_type, description in csv_files:
        csv_path = data_dir / csv_filename

        if not csv_path.exists():
            print(f"⚠️ File not found: {csv_path}")
            continue

        print(f"\n{'='*70}")
        print(f"Processing: {csv_filename}")
        print(f"Type: {inst_type}")
        print(f"Description: {description}")
        print(f"{'='*70}\n")

        try:
            # Parse CSV into raw records, then into HeritageCustodian models.
            records = parser.parse_file(csv_path, inst_type)
            print(f"✓ Parsed {len(records)} raw records")

            custodians = [
                parser.to_heritage_custodian(record, inst_type, str(csv_path))
                for record in records
            ]
            print(f"✓ Converted to {len(custodians)} HeritageCustodian instances")

            # Coverage statistics: records carrying a numeric GHCID and
            # records with at least one "Website" identifier.
            with_ghcid = sum(1 for c in custodians if c.ghcid_numeric is not None)
            with_url = sum(1 for c in custodians if any(i.identifier_scheme == "Website" for i in c.identifiers))

            stats = {
                "total_records": len(records),
                "with_ghcid": with_ghcid,
                "with_url": with_url,
                "ghcid_coverage": f"{with_ghcid/len(records)*100:.1f}%" if records else "0%",
                "url_coverage": f"{with_url/len(records)*100:.1f}%" if records else "0%",
            }
            all_stats[csv_filename] = stats

            print(f"  - Records with GHCID: {with_ghcid}/{len(records)} ({stats['ghcid_coverage']})")
            print(f"  - Records with URL: {with_url}/{len(records)} ({stats['url_coverage']})")

            # Export a sample (first 10 records) for testing.
            output_path = output_dir / csv_filename.replace(".csv", "_sample.yaml")
            written = _export_sample_yaml(custodians, output_path)
            # Report the actual count written (the original hard-coded "10",
            # which was wrong for files with fewer than 10 records).
            print(f"✓ Exported sample ({written} records) to: {output_path}")

        except Exception as e:
            # Best-effort batch job: report the failure and continue with
            # the next CSV file rather than aborting the whole run.
            print(f"❌ Error processing {csv_filename}: {e}")
            import traceback
            traceback.print_exc()
            continue

    # Print summary across all successfully processed files.
    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}\n")

    for filename, stats in all_stats.items():
        # Bug fix: the original printed the literal "(unknown):" here —
        # the {filename} placeholder had been lost from the f-string.
        print(f"{filename}:")
        print(f"  Total records: {stats['total_records']}")
        print(f"  GHCID coverage: {stats['ghcid_coverage']}")
        print(f"  URL coverage: {stats['url_coverage']}")
        print()

    total_records = sum(s["total_records"] for s in all_stats.values())
    print(f"Grand total: {total_records} Japanese heritage institutions")
    print(f"\nSample YAML files written to: {output_dir}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point; main() returns None, so the exit status is 0.
    raise SystemExit(main())
|