glam/scripts/merge_egypt_steps.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

96 lines
3.5 KiB
Python

#!/usr/bin/env python3
"""
Egyptian GLAM Institution Extraction - Merge All Steps
Combines Steps 1-4 into a single comprehensive dataset.
"""
import sys
from pathlib import Path
from collections import Counter
import yaml
def main():
    """Merge the per-step Egyptian GLAM YAML files into one dataset.

    Reads the three step files from ``data/instances``, concatenates their
    institution records, prints summary statistics (type breakdown and
    metadata coverage), and writes the merged list — preceded by a commented
    provenance header — to ``data/instances/egypt_institutions.yaml``.

    Raises:
        FileNotFoundError: if any expected step file is missing.
        KeyError: if a record lacks 'institution_type' or 'name'.
    """
    print("="*60)
    print("Egyptian GLAM Extraction - Merging All Steps")
    print("="*60)
    data_dir = Path(__file__).parent.parent / "data" / "instances"

    # (filename, human-readable description) for each extraction step.
    step_files = [
        ("egypt_step1_2.yaml", "Steps 1+2: National libraries/archives + Museums"),
        ("egypt_step3.yaml", "Step 3: University Libraries"),
        ("egypt_step4.yaml", "Step 4: Galleries & Cultural Centers"),
    ]

    all_institutions = []
    for filename, description in step_files:
        filepath = data_dir / filename
        # BUG FIX: the original printed a literal "(unknown)" placeholder
        # instead of interpolating the step file's name.
        print(f"\nLoading {filename}...")
        print(f" ({description})")
        with open(filepath, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document; treat as no records
            # so len()/extend() below cannot crash.
            data = yaml.safe_load(f) or []
        count = len(data)
        all_institutions.extend(data)
        print(f" ✓ Loaded {count} institutions")

    total = len(all_institutions)
    print("\n" + "="*60)
    print(f"TOTAL INSTITUTIONS: {total}")
    print("="*60)

    # Breakdown by institution type.
    type_counts = Counter(inst['institution_type'] for inst in all_institutions)
    print("\nBreakdown by Institution Type:")
    for inst_type, count in sorted(type_counts.items()):
        print(f" - {inst_type:20s}: {count:2d}")

    # Metadata coverage; guard the denominator so an empty merge prints 0.0%
    # instead of raising ZeroDivisionError.
    with_identifiers = sum(1 for inst in all_institutions if inst.get('identifiers'))
    with_locations = sum(1 for inst in all_institutions if inst.get('locations'))
    denom = total or 1
    print("\nMetadata Coverage:")
    print(f" - With identifiers: {with_identifiers}/{total} ({100*with_identifiers/denom:.1f}%)")
    print(f" - With locations: {with_locations}/{total} ({100*with_locations/denom:.1f}%)")

    # Save the merged file with a commented provenance header, then the
    # YAML document itself after the '---' marker.
    output_path = data_dir / "egypt_institutions.yaml"
    print(f"\nSaving merged dataset to: {output_path}")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# Egyptian GLAM Institutions Dataset\n")
        f.write("# Extracted from conversation: 39e11630-a2af-407c-a365-d485eb8257b0\n")
        f.write(f"# Total institutions: {total}\n")
        f.write("# Data tier: TIER_4_INFERRED (from conversation NLP extraction)\n")
        f.write("#\n")
        f.write("# Coverage:\n")
        for inst_type, count in sorted(type_counts.items()):
            f.write(f"# - {inst_type}: {count}\n")
        f.write("#\n")
        f.write("---\n")
        yaml.dump(all_institutions, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)
    print(f"✓ Saved {total} institutions to egypt_institutions.yaml")

    # Short sample for eyeballing the result.
    print("\nSample Institutions (first 10):")
    for i, inst in enumerate(all_institutions[:10], 1):
        print(f" {i:2d}. {inst['name']:50s} ({inst['institution_type']})")
    if total > 10:
        print(f" ... and {total - 10} more")

    print("\n" + "="*60)
    print("Merge complete!")
    print("="*60)
    print(f"\nFinal dataset: data/instances/egypt_institutions.yaml")
    print(f"Total records: {total}")
# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()