glam/scripts/validate_japan_dataset.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

302 lines
11 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Validate Japan ISIL Dataset
Creates a comprehensive validation report for the Japan heritage institutions dataset.
Checks schema compliance, data quality, and completeness.
"""
import sys
import yaml
from pathlib import Path
from collections import Counter, defaultdict
from datetime import datetime
from typing import Dict, List, Any
def load_yaml_file(file_path: Path) -> List[Dict[str, Any]]:
    """Load a YAML file and return its contents as a list of records.

    Args:
        file_path: Path to the YAML file to read.

    Returns:
        A list of record dicts. A single-document file yields a one-element
        list; an empty file yields an empty list.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        records = yaml.safe_load(f)
    # An empty file parses to None -- treat that as "no records" instead of
    # returning [None], which would crash downstream validation.
    if records is None:
        return []
    # Handle both list and single document
    if not isinstance(records, list):
        records = [records]
    return records
def validate_record(record: Dict[str, Any], index: int) -> Dict[str, Any]:
    """Validate a single record - basic schema checks without Pydantic."""
    outcome: Dict[str, Any] = {
        'index': index,
        'id': record.get('id', 'UNKNOWN'),
        'name': record.get('name', 'UNKNOWN'),
        'valid': False,
        'errors': [],
    }
    try:
        # A record is valid only when every required field is present and truthy.
        absent = [
            field
            for field in ('id', 'name', 'institution_type', 'provenance')
            if not record.get(field)
        ]
        if absent:
            outcome['errors'].append(f"Missing required fields: {', '.join(absent)}")
        else:
            outcome['valid'] = True
            outcome['institution_type'] = record.get('institution_type', 'UNKNOWN')
            # Website presence is signalled by an identifier using the 'Website' scheme.
            outcome['has_website'] = any(
                ident.get('identifier_scheme') == 'Website'
                for ident in record.get('identifiers', [])
            )
            # Address presence is judged from the first location entry only.
            locs = record.get('locations', [])
            outcome['has_address'] = bool(locs and locs[0].get('street_address'))
            outcome['ghcid'] = record.get('ghcid', '')
    except Exception as e:
        outcome['errors'].append(f"Unexpected error: {str(e)}")
    return outcome
def analyze_dataset(records: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Perform comprehensive analysis of the dataset."""
    print(f"Analyzing {len(records)} records...")
    results: List[Dict[str, Any]] = []
    type_counts: Counter = Counter()
    prefecture_counts: Counter = Counter()
    quality = {
        'has_website': 0,
        'has_address': 0,
        'has_postal_code': 0,
        'has_ghcid': 0,
    }
    error_tally = defaultdict(int)
    error_samples: List[Dict[str, Any]] = []
    for position, record in enumerate(records):
        # Periodic progress output for large datasets.
        if position % 1000 == 0:
            print(f" Processed {position}/{len(records)} records...")
        checked = validate_record(record, position)
        results.append(checked)
        if not checked['valid']:
            # Tally each error message and keep the first 10 offending records.
            for message in checked['errors']:
                error_tally[message] += 1
                if len(error_samples) < 10:
                    error_samples.append({
                        'record_id': checked['id'],
                        'name': checked['name'],
                        'error': message,
                    })
            continue
        type_counts[checked.get('institution_type', 'UNKNOWN')] += 1
        if checked.get('has_website'):
            quality['has_website'] += 1
        if checked.get('has_address'):
            quality['has_address'] += 1
        ghcid = checked.get('ghcid')
        if ghcid:
            quality['has_ghcid'] += 1
            # Prefecture code is the second dash-separated GHCID component.
            segments = ghcid.split('-')
            if len(segments) >= 3:
                prefecture_counts[segments[1]] += 1
        # Postal-code presence is checked on the raw record's first location.
        locations = record.get('locations', [])
        if locations and locations[0].get('postal_code'):
            quality['has_postal_code'] += 1
    valid_total = sum(1 for entry in results if entry['valid'])
    return {
        'total_records': len(records),
        'valid_records': valid_total,
        'invalid_records': len(results) - valid_total,
        'validation_rate': (valid_total / len(records) * 100) if records else 0,
        'institution_types': dict(type_counts.most_common()),
        'prefectures': dict(prefecture_counts.most_common()),
        'prefecture_count': len(prefecture_counts),
        'data_quality': quality,
        'errors_by_type': dict(error_tally),
        'sample_errors': error_samples,
        'validation_results': results,
    }
def generate_report(analysis: Dict[str, Any], output_path: Path):
    """Generate comprehensive validation report.

    Builds a human-readable plain-text report from the metrics produced by
    analyze_dataset(), writes it to *output_path* (UTF-8), and returns it.

    Args:
        analysis: Aggregated metrics dict (counts, quality flags, errors).
        output_path: File path the report text is written to.

    Returns:
        The full report as a single string.
    """
    report_lines = []
    report_lines.append("=" * 80)
    report_lines.append("JAPAN ISIL DATASET VALIDATION REPORT")
    report_lines.append("=" * 80)
    report_lines.append(f"Generated: {datetime.now().isoformat()}")
    report_lines.append("Dataset: data/instances/japan/jp_institutions.yaml")
    report_lines.append("")
    # Summary
    report_lines.append("VALIDATION SUMMARY")
    report_lines.append("-" * 80)
    report_lines.append(f"Total Records: {analysis['total_records']:,}")
    report_lines.append(f"Valid Records: {analysis['valid_records']:,} ({analysis['validation_rate']:.2f}%)")
    report_lines.append(f"Invalid Records: {analysis['invalid_records']:,}")
    report_lines.append("")
    total = analysis['total_records']
    # Guard all percentage math against an empty dataset (total == 0 would
    # otherwise raise ZeroDivisionError in the quality/recommendation sections).
    denom = total or 1
    # Institution Type Breakdown
    report_lines.append("INSTITUTION TYPE BREAKDOWN")
    report_lines.append("-" * 80)
    for inst_type, count in sorted(analysis['institution_types'].items(), key=lambda x: -x[1]):
        percentage = (count / denom * 100)
        report_lines.append(f" {inst_type:20s} {count:6,} ({percentage:5.2f}%)")
    report_lines.append("")
    # Geographic Coverage
    report_lines.append("GEOGRAPHIC COVERAGE")
    report_lines.append("-" * 80)
    report_lines.append(f"Total Prefectures: {analysis['prefecture_count']}")
    report_lines.append("")
    report_lines.append("Top 10 Prefectures by Institution Count:")
    for prefecture, count in list(analysis['prefectures'].items())[:10]:
        report_lines.append(f" {prefecture:10s} {count:6,}")
    report_lines.append("")
    # Data Quality Metrics
    report_lines.append("DATA QUALITY METRICS")
    report_lines.append("-" * 80)
    dq = analysis['data_quality']
    report_lines.append(f"GHCID Coverage: {dq['has_ghcid']:6,} / {total:6,} ({dq['has_ghcid']/denom*100:5.2f}%)")
    report_lines.append(f"Website URLs: {dq['has_website']:6,} / {total:6,} ({dq['has_website']/denom*100:5.2f}%)")
    report_lines.append(f"Street Addresses: {dq['has_address']:6,} / {total:6,} ({dq['has_address']/denom*100:5.2f}%)")
    report_lines.append(f"Postal Codes: {dq['has_postal_code']:6,} / {total:6,} ({dq['has_postal_code']/denom*100:5.2f}%)")
    report_lines.append("")
    # Validation Errors (top 10 error types plus up to 10 sample records)
    if analysis['errors_by_type']:
        report_lines.append("VALIDATION ERRORS")
        report_lines.append("-" * 80)
        for error_type, count in sorted(analysis['errors_by_type'].items(), key=lambda x: -x[1])[:10]:
            report_lines.append(f" [{count:4}] {error_type[:70]}")
        report_lines.append("")
        if analysis['sample_errors']:
            report_lines.append("Sample Error Records (first 10):")
            for i, sample in enumerate(analysis['sample_errors'], 1):
                report_lines.append(f" {i}. {sample['name']}")
                report_lines.append(f" ID: {sample['record_id']}")
                report_lines.append(f" Error: {sample['error'][:70]}")
            report_lines.append("")
    else:
        report_lines.append("✅ NO VALIDATION ERRORS - All records valid!")
        report_lines.append("")
    # Schema Compliance (pass/warn/fail thresholds at 100% and 95%)
    report_lines.append("SCHEMA COMPLIANCE")
    report_lines.append("-" * 80)
    if analysis['validation_rate'] == 100.0:
        report_lines.append("✅ PASS - All records conform to LinkML schema")
    elif analysis['validation_rate'] >= 95.0:
        report_lines.append("⚠️ WARN - Most records valid, minor issues detected")
    else:
        report_lines.append("❌ FAIL - Significant schema validation issues")
    report_lines.append("")
    # Recommendations (coverage thresholds: 90% websites, 95% addresses)
    report_lines.append("RECOMMENDATIONS")
    report_lines.append("-" * 80)
    if dq['has_website'] / denom < 0.90:
        report_lines.append(f"⚠️ Website coverage at {dq['has_website']/denom*100:.1f}% - consider enriching from web")
    else:
        report_lines.append(f"✅ Website coverage excellent ({dq['has_website']/denom*100:.1f}%)")
    if dq['has_address'] / denom < 0.95:
        report_lines.append(f"⚠️ Address coverage at {dq['has_address']/denom*100:.1f}% - geocoding may be incomplete")
    else:
        report_lines.append(f"✅ Address coverage excellent ({dq['has_address']/denom*100:.1f}%)")
    if analysis['prefecture_count'] < 47:
        report_lines.append(f" Only {analysis['prefecture_count']}/47 prefectures represented - may be incomplete")
    else:
        report_lines.append("✅ All 47 prefectures represented")
    report_lines.append("")
    report_lines.append("NEXT STEPS")
    report_lines.append("-" * 80)
    report_lines.append("1. Review sample error records if validation < 100%")
    report_lines.append("2. Consider geocoding addresses to add lat/lon coordinates")
    report_lines.append("3. Map prefecture codes to ISO 3166-2 (JP-01 through JP-47)")
    report_lines.append("4. Merge with global dataset (NL, EUR, Latin America)")
    report_lines.append("5. Export to GeoJSON for geographic visualization")
    report_lines.append("")
    report_lines.append("=" * 80)
    report_lines.append("END OF REPORT")
    report_lines.append("=" * 80)
    # Write report
    report_text = "\n".join(report_lines)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(report_text)
    return report_text
def main():
    """Main validation workflow."""
    # Resolve dataset and report locations relative to the repository root
    # (this script lives in a subdirectory, hence parent.parent).
    base_dir = Path(__file__).parent.parent
    dataset_path = base_dir / 'data' / 'instances' / 'japan' / 'jp_institutions.yaml'
    report_path = base_dir / 'data' / 'instances' / 'japan' / 'validation_report.txt'
    banner = "=" * 80
    print(banner)
    print("Japan ISIL Dataset Validation")
    print(banner)
    print(f"Input: {dataset_path}")
    print(f"Output: {report_path}")
    print()
    # Load dataset
    print("Loading dataset...")
    records = load_yaml_file(dataset_path)
    print(f"Loaded {len(records)} records")
    print()
    # Analyze
    analysis = analyze_dataset(records)
    # Generate report
    print("\nGenerating report...")
    report_text = generate_report(analysis, report_path)
    # Echo the full report to the console as well.
    print("\n" + report_text)
    print(f"\n✅ Validation report saved to: {report_path}")
    # Exit non-zero when fewer than 95% of records validated.
    sys.exit(0 if analysis['validation_rate'] >= 95.0 else 1)
if __name__ == '__main__':
main()