- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
156 lines
5.4 KiB
Python
156 lines
5.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Export Libya Heritage Institutions to RDF/Turtle
|
|
|
|
Generates RDF serializations of the 52 Libyan heritage institutions
|
|
with 75% Wikidata coverage (39/52 institutions).
|
|
|
|
Outputs:
|
|
- data/exports/libya_institutions.ttl (Turtle format)
|
|
- data/exports/libya_institutions.rdf (RDF/XML format)
|
|
- data/exports/libya_institutions.jsonld (JSON-LD format)
|
|
- data/exports/libya_statistics.yaml (coverage statistics)
|
|
"""
|
|
|
|
import sys
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
# Add src to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
|
|
|
|
from glam_extractor.models import HeritageCustodian
|
|
from glam_extractor.exporters.rdf_exporter import RDFExporter
|
|
|
|
|
|
def load_libya_dataset(filepath: Path) -> list[HeritageCustodian]:
    """Load Libya institutions from YAML and convert to Pydantic models.

    Loading is best-effort per record: an entry that cannot be turned into a
    ``HeritageCustodian`` is reported on stdout and skipped, so one bad entry
    does not abort the whole load.

    Args:
        filepath: Path to a YAML file whose top level is a list of
            institution mappings.

    Returns:
        The successfully constructed ``HeritageCustodian`` objects, in file
        order. Empty when the file contains no YAML document.

    Raises:
        ValueError: If the YAML top level is something other than a list.
    """
    print(f"Loading Libya dataset from: {filepath.name}")

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load() yields None for an empty file; treat that as "no records"
    # instead of failing on len(None) below.
    if data is None:
        data = []
    if not isinstance(data, list):
        raise ValueError(
            f"Expected a list of institutions in {filepath}, "
            f"got {type(data).__name__}"
        )

    print(f" Loaded {len(data)} institutions")

    # Convert to Pydantic models
    custodians = []
    for inst_data in data:
        try:
            custodian = HeritageCustodian(**inst_data)
        except Exception as e:
            # Deliberately broad: any construction failure means "skip this
            # record". A non-mapping entry has no .get(), so guard the name
            # lookup to keep the handler itself from raising.
            name = (
                inst_data.get('name', 'UNKNOWN')
                if isinstance(inst_data, dict)
                else 'UNKNOWN'
            )
            print(f" ⚠️ Skipping invalid record: {name}: {e}")
        else:
            custodians.append(custodian)

    print(f" ✅ Validated {len(custodians)} institutions")
    return custodians
|
|
|
|
|
|
def calculate_statistics(custodians: list[HeritageCustodian]) -> dict:
    """Calculate coverage statistics for Libya dataset.

    Args:
        custodians: Institutions to summarise. Each item is read for its
            ``identifiers`` (items with ``identifier_scheme``),
            ``institution_type`` and ``locations`` (items with ``city``).

    Returns:
        A dict with these keys:

        * ``total_institutions`` - number of custodians.
        * ``country`` - the fixed label ``'Libya (LY)'``.
        * ``generated`` - current UTC time as an ISO-8601 string.
        * ``wikidata_coverage`` - ``{'count', 'percentage'}`` for custodians
          having at least one identifier whose scheme is ``'Wikidata'``;
          the percentage is ``0.0`` when there are no custodians.
        * ``institution_types`` - count per institution type, most frequent
          first.
        * ``cities`` - count per city of each custodian's first location,
          most frequent first; custodians without locations are not counted.
    """
    total = len(custodians)
    stats = {
        'total_institutions': total,
        'country': 'Libya (LY)',
        'generated': datetime.now(timezone.utc).isoformat(),
    }

    # Wikidata coverage
    with_wikidata = sum(
        1 for custodian in custodians
        # `or ()` tolerates a missing/None identifier list, mirroring the
        # truthiness guard used for locations below.
        if any(
            identifier.identifier_scheme == 'Wikidata'
            for identifier in (custodian.identifiers or ())
        )
    )
    stats['wikidata_coverage'] = {
        'count': with_wikidata,
        # Guard the empty dataset: avoid ZeroDivisionError and report 0.0.
        'percentage': round(with_wikidata / total * 100, 1) if total else 0.0,
    }

    # Institution types
    type_counts = {}
    for custodian in custodians:
        inst_type = custodian.institution_type
        type_counts[inst_type] = type_counts.get(inst_type, 0) + 1

    # Plain dicts (not Counter) so the result serialises as ordinary YAML
    # mappings; sorted() is stable, so ties keep first-seen order.
    stats['institution_types'] = dict(
        sorted(type_counts.items(), key=lambda item: item[1], reverse=True)
    )

    # Geographic distribution (first listed location only)
    cities = {}
    for custodian in custodians:
        if custodian.locations:
            city = custodian.locations[0].city
            cities[city] = cities.get(city, 0) + 1

    stats['cities'] = dict(
        sorted(cities.items(), key=lambda item: item[1], reverse=True)
    )

    return stats
|
|
|
|
|
|
def main():
    """Main export workflow.

    Loads the Libya dataset, prints summary statistics, writes the
    institutions as Turtle, RDF/XML and JSON-LD, and saves the statistics as
    YAML. All output goes to ``data/exports`` under the project root.

    The project root is taken to be the parent of this script's directory,
    the same layout the module-level ``sys.path`` setup relies on.

    Exits with a non-zero status (via ``sys.exit``) when no valid institution
    could be loaded, since there would be nothing to export.
    """
    print("\n" + "="*80)
    print("Libya Heritage Institutions - RDF Export")
    print("="*80 + "\n")

    # Paths: resolve relative to this file rather than one developer's
    # absolute home-directory path, so the script runs from any checkout.
    base_dir = Path(__file__).resolve().parent.parent
    input_file = base_dir / 'data/instances/libya/libyan_institutions.yaml'
    output_dir = base_dir / 'data/exports'
    # parents=True: 'data/' itself may not exist yet in a fresh checkout.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load dataset
    custodians = load_libya_dataset(input_file)
    if not custodians:
        sys.exit(f"No valid institutions loaded from {input_file}; nothing to export.")

    # Calculate statistics
    print("\n📊 Calculating statistics...")
    stats = calculate_statistics(custodians)
    total = stats['total_institutions']
    coverage = stats['wikidata_coverage']

    print(f" Total institutions: {total}")
    print(f" Wikidata coverage: {coverage['count']}/{total} ({coverage['percentage']}%)")
    print(f" Institution types: {len(stats['institution_types'])}")
    print(f" Cities covered: {len(stats['cities'])}")

    # Export to RDF formats
    print("\n📤 Exporting to RDF formats...")

    # (display label, output filename, serialization format name)
    exports = (
        ('Turtle', 'libya_institutions.ttl', 'turtle'),
        ('RDF/XML', 'libya_institutions.rdf', 'xml'),
        ('JSON-LD', 'libya_institutions.jsonld', 'json-ld'),
    )
    for label, filename, rdf_format in exports:
        # New exporter instance per format for a clean graph.
        exporter = RDFExporter()
        out_file = output_dir / filename
        print(f" Exporting {label}: {out_file.name}")
        exporter.export_to_file(custodians, str(out_file), format=rdf_format)
        print(f" ✅ Saved ({out_file.stat().st_size:,} bytes)")

    # Save statistics
    stats_file = output_dir / 'libya_statistics.yaml'
    print(f"\n📊 Saving statistics: {stats_file.name}")
    with open(stats_file, 'w', encoding='utf-8') as f:
        yaml.dump(stats, f, allow_unicode=True, sort_keys=False)
    print(" ✅ Saved")

    print("\n" + "="*80)
    print("✅ EXPORT COMPLETE!")
    print("="*80)
    print(f"\n📁 Output files in: {output_dir}/")
    for label, filename, _ in exports:
        print(f" - {filename} ({label})")
    print(" - libya_statistics.yaml (Statistics)")
    print("\n🎯 Libya dataset ready for semantic web integration!")
    # Report the figures actually computed instead of hard-coded counts that
    # go stale as soon as the dataset changes.
    print(
        f" {total} institutions, {coverage['count']} with Wikidata "
        f"({coverage['percentage']}% coverage)"
    )
|
|
|
|
|
|
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|