- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
302 lines · 11 KiB · Python
#!/usr/bin/env python3
"""
Validate Japan ISIL Dataset

Creates a comprehensive validation report for the Japan heritage institutions dataset.
Checks schema compliance, data quality, and completeness.
"""
|
||
|
||
import sys
|
||
import yaml
|
||
from pathlib import Path
|
||
from collections import Counter, defaultdict
|
||
from datetime import datetime
|
||
from typing import Dict, List, Any
|
||
|
||
|
||
def load_yaml_file(file_path: Path) -> List[Dict[str, Any]]:
    """Load a YAML file and return its contents as a list of records.

    Args:
        file_path: Path to the YAML file to read.

    Returns:
        A list of record dicts. A single-document file is wrapped in a
        one-element list; an empty file yields an empty list.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        records = yaml.safe_load(f)
    # safe_load returns None for an empty document -- treat that as
    # "no records" rather than wrapping None into [None].
    if records is None:
        return []
    # Handle both list and single document
    if not isinstance(records, list):
        records = [records]
    return records
|
||
|
||
|
||
def validate_record(record: Dict[str, Any], index: int) -> Dict[str, Any]:
    """Validate a single record - basic schema checks without Pydantic."""
    result: Dict[str, Any] = {
        'index': index,
        'id': record.get('id', 'UNKNOWN'),
        'name': record.get('name', 'UNKNOWN'),
        'valid': False,
        'errors': [],
    }

    try:
        # A record is valid when every required field is present and truthy
        # (a missing key, None, or an empty value all count as missing).
        required_fields = ['id', 'name', 'institution_type', 'provenance']
        missing = [field for field in required_fields if not record.get(field)]

        if missing:
            result['errors'].append(f"Missing required fields: {', '.join(missing)}")
        else:
            result['valid'] = True
            result['institution_type'] = record.get('institution_type', 'UNKNOWN')

            # Website presence: any identifier entry using the 'Website' scheme.
            result['has_website'] = any(
                ident.get('identifier_scheme') == 'Website'
                for ident in record.get('identifiers', [])
            )

            # Address presence: first location entry carries a street address.
            locations = record.get('locations', [])
            result['has_address'] = bool(locations and locations[0].get('street_address'))

            # GHCID may legitimately be absent (defaults to empty string).
            result['ghcid'] = record.get('ghcid', '')

    except Exception as e:
        # Defensive catch-all: malformed nested structures (e.g. non-dict
        # identifier entries) become a validation error instead of a crash.
        result['errors'].append(f"Unexpected error: {str(e)}")

    return result
|
||
|
||
|
||
def analyze_dataset(records: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Perform comprehensive analysis of the dataset."""

    print(f"Analyzing {len(records)} records...")

    total = len(records)
    validation_results = []
    institution_types = Counter()
    prefectures = Counter()
    data_quality = {
        'has_website': 0,
        'has_address': 0,
        'has_postal_code': 0,
        'has_ghcid': 0,
    }
    errors_by_type = defaultdict(int)
    sample_errors = []

    for idx, record in enumerate(records):
        # Progress heartbeat for large datasets.
        if idx % 1000 == 0:
            print(f" Processed {idx}/{total} records...")

        result = validate_record(record, idx)
        validation_results.append(result)

        if not result['valid']:
            # Tally error messages; keep at most 10 concrete examples.
            for error in result['errors']:
                errors_by_type[error] += 1
                if len(sample_errors) < 10:
                    sample_errors.append({
                        'record_id': result['id'],
                        'name': result['name'],
                        'error': error,
                    })
            continue

        institution_types[result.get('institution_type', 'UNKNOWN')] += 1

        if result.get('has_website'):
            data_quality['has_website'] += 1
        if result.get('has_address'):
            data_quality['has_address'] += 1

        ghcid = result.get('ghcid')
        if ghcid:
            data_quality['has_ghcid'] += 1
            # Second dash-separated field looks like the prefecture code --
            # TODO confirm against the GHCID specification.
            parts = ghcid.split('-')
            if len(parts) >= 3:
                prefectures[parts[1]] += 1

        # Postal code lives on the first location entry of the raw record.
        locations = record.get('locations', [])
        if locations and locations[0].get('postal_code'):
            data_quality['has_postal_code'] += 1

    valid_count = sum(1 for r in validation_results if r['valid'])

    return {
        'total_records': total,
        'valid_records': valid_count,
        'invalid_records': total - valid_count,
        'validation_rate': (valid_count / total * 100) if records else 0,
        'institution_types': dict(institution_types.most_common()),
        'prefectures': dict(prefectures.most_common()),
        'prefecture_count': len(prefectures),
        'data_quality': data_quality,
        'errors_by_type': dict(errors_by_type),
        'sample_errors': sample_errors,
        'validation_results': validation_results,
    }
|
||
|
||
|
||
def generate_report(analysis: Dict[str, Any], output_path: Path):
    """Generate comprehensive validation report.

    Renders the metrics produced by the analysis step as a plain-text report,
    writes it to *output_path* (UTF-8), and returns the report text.

    Args:
        analysis: Aggregated metrics: totals, type/prefecture counters,
            data-quality tallies, error breakdown and samples.
        output_path: Destination file for the report.

    Returns:
        The full report as a single newline-joined string.
    """
    total = analysis['total_records']
    dq = analysis['data_quality']

    def pct(count: int) -> float:
        # Percentage of all records; guards against an empty dataset so the
        # report never divides by zero.
        return (count / total * 100) if total else 0.0

    report_lines = []

    report_lines.append("=" * 80)
    report_lines.append("JAPAN ISIL DATASET VALIDATION REPORT")
    report_lines.append("=" * 80)
    report_lines.append(f"Generated: {datetime.now().isoformat()}")
    report_lines.append("Dataset: data/instances/japan/jp_institutions.yaml")
    report_lines.append("")

    # Summary
    report_lines.append("VALIDATION SUMMARY")
    report_lines.append("-" * 80)
    report_lines.append(f"Total Records: {analysis['total_records']:,}")
    report_lines.append(f"Valid Records: {analysis['valid_records']:,} ({analysis['validation_rate']:.2f}%)")
    report_lines.append(f"Invalid Records: {analysis['invalid_records']:,}")
    report_lines.append("")

    # Institution Type Breakdown
    report_lines.append("INSTITUTION TYPE BREAKDOWN")
    report_lines.append("-" * 80)
    for inst_type, count in sorted(analysis['institution_types'].items(), key=lambda x: -x[1]):
        report_lines.append(f" {inst_type:20s} {count:6,} ({pct(count):5.2f}%)")
    report_lines.append("")

    # Geographic Coverage
    report_lines.append("GEOGRAPHIC COVERAGE")
    report_lines.append("-" * 80)
    report_lines.append(f"Total Prefectures: {analysis['prefecture_count']}")
    report_lines.append("")
    report_lines.append("Top 10 Prefectures by Institution Count:")
    for prefecture, count in list(analysis['prefectures'].items())[:10]:
        report_lines.append(f" {prefecture:10s} {count:6,}")
    report_lines.append("")

    # Data Quality Metrics
    report_lines.append("DATA QUALITY METRICS")
    report_lines.append("-" * 80)
    report_lines.append(f"GHCID Coverage: {dq['has_ghcid']:6,} / {total:6,} ({pct(dq['has_ghcid']):5.2f}%)")
    report_lines.append(f"Website URLs: {dq['has_website']:6,} / {total:6,} ({pct(dq['has_website']):5.2f}%)")
    report_lines.append(f"Street Addresses: {dq['has_address']:6,} / {total:6,} ({pct(dq['has_address']):5.2f}%)")
    report_lines.append(f"Postal Codes: {dq['has_postal_code']:6,} / {total:6,} ({pct(dq['has_postal_code']):5.2f}%)")
    report_lines.append("")

    # Validation Errors (top-10 most frequent messages, truncated to 70 chars)
    if analysis['errors_by_type']:
        report_lines.append("VALIDATION ERRORS")
        report_lines.append("-" * 80)
        for error_type, count in sorted(analysis['errors_by_type'].items(), key=lambda x: -x[1])[:10]:
            report_lines.append(f" [{count:4}] {error_type[:70]}")
        report_lines.append("")

    if analysis['sample_errors']:
        report_lines.append("Sample Error Records (first 10):")
        for i, sample in enumerate(analysis['sample_errors'], 1):
            report_lines.append(f" {i}. {sample['name']}")
            report_lines.append(f" ID: {sample['record_id']}")
            report_lines.append(f" Error: {sample['error'][:70]}")
        report_lines.append("")
    else:
        report_lines.append("✅ NO VALIDATION ERRORS - All records valid!")
        report_lines.append("")

    # Schema Compliance verdict (thresholds mirror main()'s 95% exit gate)
    report_lines.append("SCHEMA COMPLIANCE")
    report_lines.append("-" * 80)
    if analysis['validation_rate'] == 100.0:
        report_lines.append("✅ PASS - All records conform to LinkML schema")
    elif analysis['validation_rate'] >= 95.0:
        report_lines.append("⚠️ WARN - Most records valid, minor issues detected")
    else:
        report_lines.append("❌ FAIL - Significant schema validation issues")
    report_lines.append("")

    # Recommendations
    report_lines.append("RECOMMENDATIONS")
    report_lines.append("-" * 80)

    if pct(dq['has_website']) < 90.0:
        report_lines.append(f"⚠️ Website coverage at {pct(dq['has_website']):.1f}% - consider enriching from web")
    else:
        report_lines.append(f"✅ Website coverage excellent ({pct(dq['has_website']):.1f}%)")

    if pct(dq['has_address']) < 95.0:
        report_lines.append(f"⚠️ Address coverage at {pct(dq['has_address']):.1f}% - geocoding may be incomplete")
    else:
        report_lines.append(f"✅ Address coverage excellent ({pct(dq['has_address']):.1f}%)")

    if analysis['prefecture_count'] < 47:
        report_lines.append(f"ℹ️ Only {analysis['prefecture_count']}/47 prefectures represented - may be incomplete")
    else:
        report_lines.append("✅ All 47 prefectures represented")

    report_lines.append("")
    report_lines.append("NEXT STEPS")
    report_lines.append("-" * 80)
    report_lines.append("1. Review sample error records if validation < 100%")
    report_lines.append("2. Consider geocoding addresses to add lat/lon coordinates")
    report_lines.append("3. Map prefecture codes to ISO 3166-2 (JP-01 through JP-47)")
    report_lines.append("4. Merge with global dataset (NL, EUR, Latin America)")
    report_lines.append("5. Export to GeoJSON for geographic visualization")
    report_lines.append("")

    report_lines.append("=" * 80)
    report_lines.append("END OF REPORT")
    report_lines.append("=" * 80)

    # Write report
    report_text = "\n".join(report_lines)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(report_text)

    return report_text
|
||
|
||
|
||
def main():
    """Main validation workflow.

    Loads the Japan institutions dataset, runs the analysis, writes the
    report, and exits non-zero when fewer than 95% of records are valid.
    """

    # Paths -- resolved relative to the repository root (one level above
    # this script's directory).
    base_dir = Path(__file__).parent.parent
    dataset_path = base_dir / 'data' / 'instances' / 'japan' / 'jp_institutions.yaml'
    report_path = base_dir / 'data' / 'instances' / 'japan' / 'validation_report.txt'

    print("=" * 80)
    print("Japan ISIL Dataset Validation")
    print("=" * 80)
    print(f"Input: {dataset_path}")
    print(f"Output: {report_path}")
    print()

    # Fail fast with a clear message instead of an open() traceback.
    if not dataset_path.exists():
        print(f"❌ Dataset not found: {dataset_path}")
        sys.exit(1)

    # Load dataset
    print("Loading dataset...")
    records = load_yaml_file(dataset_path)
    print(f"Loaded {len(records)} records")
    print()

    # Analyze
    analysis = analyze_dataset(records)

    # Generate report
    print("\nGenerating report...")
    report_text = generate_report(analysis, report_path)

    # Print to console
    print("\n" + report_text)

    print(f"\n✅ Validation report saved to: {report_path}")

    # Exit code reflects validation outcome: fail the process if < 95% valid.
    sys.exit(0 if analysis['validation_rate'] >= 95.0 else 1)
|
||
|
||
|
||
if __name__ == '__main__':  # script entry point
    main()
|