- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
96 lines · 3.5 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Egyptian GLAM Institution Extraction - Merge All Steps
|
|
Combines Steps 1-4 into a single comprehensive dataset.
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from collections import Counter
|
|
import yaml
|
|
|
|
def main():
    """Merge all step files into the final Egyptian GLAM dataset.

    Loads the per-step YAML instance files from data/instances/,
    concatenates their records, prints summary statistics, and writes
    the combined list (with a commented provenance header) to
    data/instances/egypt_institutions.yaml.
    """
    banner = "=" * 60
    print(banner)
    print("Egyptian GLAM Extraction - Merging All Steps")
    print(banner)

    data_dir = Path(__file__).parent.parent / "data" / "instances"

    # (filename, human-readable description) for every extraction step.
    step_files = [
        ("egypt_step1_2.yaml", "Steps 1+2: National libraries/archives + Museums"),
        ("egypt_step3.yaml", "Step 3: University Libraries"),
        ("egypt_step4.yaml", "Step 4: Galleries & Cultural Centers"),
    ]

    all_institutions = _load_steps(data_dir, step_files)

    print("\n" + banner)
    print(f"TOTAL INSTITUTIONS: {len(all_institutions)}")
    print(banner)

    # Count by institution type (used both for console summary and the
    # coverage comment block in the output file).
    type_counts = Counter(inst['institution_type'] for inst in all_institutions)

    print("\nBreakdown by Institution Type:")
    for inst_type, count in sorted(type_counts.items()):
        print(f" - {inst_type:20s}: {count:2d}")

    _print_coverage(all_institutions)

    output_path = data_dir / "egypt_institutions.yaml"
    print(f"\nSaving merged dataset to: {output_path}")
    _write_merged(output_path, all_institutions, type_counts)
    print(f"✓ Saved {len(all_institutions)} institutions to egypt_institutions.yaml")

    # Short preview so the operator can eyeball the merged result.
    print("\nSample Institutions (first 10):")
    for i, inst in enumerate(all_institutions[:10], 1):
        print(f" {i:2d}. {inst['name']:50s} ({inst['institution_type']})")

    if len(all_institutions) > 10:
        print(f" ... and {len(all_institutions) - 10} more")

    print("\n" + banner)
    print("Merge complete!")
    print(banner)
    print(f"\nFinal dataset: data/instances/egypt_institutions.yaml")
    print(f"Total records: {len(all_institutions)}")


def _load_steps(data_dir, step_files):
    """Load every step YAML file and return the concatenated record list.

    Raises FileNotFoundError if a step file is missing (intentional:
    a silent partial merge would produce a misleading dataset).
    """
    all_institutions = []
    for filename, description in step_files:
        filepath = data_dir / filename
        # BUG FIX: the original printed the literal "(unknown)" here
        # instead of interpolating the file actually being loaded.
        print(f"\nLoading {filename}...")
        print(f" ({description})")

        with open(filepath, 'r', encoding='utf-8') as f:
            # An empty YAML document parses to None; treat it as no records.
            data = yaml.safe_load(f) or []

        all_institutions.extend(data)
        print(f" ✓ Loaded {len(data)} institutions")
    return all_institutions


def _print_coverage(all_institutions):
    """Print identifier/location coverage stats (safe for empty input)."""
    total = len(all_institutions)
    with_identifiers = sum(1 for inst in all_institutions if inst.get('identifiers'))
    with_locations = sum(1 for inst in all_institutions if inst.get('locations'))

    # Guard against ZeroDivisionError when no institutions were loaded.
    denom = total or 1
    print("\nMetadata Coverage:")
    print(f" - With identifiers: {with_identifiers}/{total} ({100*with_identifiers/denom:.1f}%)")
    print(f" - With locations: {with_locations}/{total} ({100*with_locations/denom:.1f}%)")


def _write_merged(output_path, all_institutions, type_counts):
    """Write the merged dataset with a commented provenance header."""
    with open(output_path, 'w', encoding='utf-8') as f:
        # Header comment: provenance + coverage, readable by humans and
        # ignored by YAML parsers.
        f.write("# Egyptian GLAM Institutions Dataset\n")
        f.write("# Extracted from conversation: 39e11630-a2af-407c-a365-d485eb8257b0\n")
        f.write(f"# Total institutions: {len(all_institutions)}\n")
        f.write("# Data tier: TIER_4_INFERRED (from conversation NLP extraction)\n")
        f.write("#\n")
        f.write("# Coverage:\n")
        for inst_type, count in sorted(type_counts.items()):
            f.write(f"# - {inst_type}: {count}\n")
        f.write("#\n")
        f.write("---\n")

        yaml.dump(all_institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)


if __name__ == "__main__":
    main()
|