glam/scripts/merge_gb_enriched.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

89 lines
3.3 KiB
Python

#!/usr/bin/env python3
"""
Merge GB enriched institutions back into unified global dataset.
"""
import yaml
from datetime import datetime, timezone
def merge_gb_enriched():
    """Merge manually enriched GB institutions back into the unified global dataset.

    Reads the unified dataset and the manually enriched GB file from fixed
    project-relative paths, replaces each matching institution (matched by its
    ``id`` URL) with the enriched record — preserving the original GHCID
    fields — then writes a freshly date-stamped YAML snapshot and prints a
    Wikidata-coverage verification report for GB institutions.

    Side effects: reads two YAML files, writes one YAML file, prints progress.
    Returns: None.
    """
    print("=" * 80)
    print("🔀 Merging GB enriched data into unified dataset")
    print("=" * 80)

    # Load unified dataset
    print("\n📂 Loading unified dataset...")
    with open('data/instances/all/globalglam-20251111.yaml', 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)
    print(f" ✅ Loaded {len(all_institutions)} institutions")

    # Load enriched GB data
    print("\n📂 Loading GB enriched data...")
    with open('data/instances/great_britain/gb_institutions_enriched_manual.yaml', 'r', encoding='utf-8') as f:
        gb_enriched = yaml.safe_load(f)
    print(f" ✅ Loaded {len(gb_enriched)} enriched GB institutions")

    # Create lookup by ID URL for quick matching
    gb_by_id = {inst['id']: inst for inst in gb_enriched}

    # Merge enriched data: replace each unified record that has an enriched
    # counterpart, carrying over the stable GHCID identifier fields so IDs
    # minted on the original record are never lost.
    print("\n🔄 Merging enriched data...")
    merged_count = 0
    ghcid_fields = ('ghcid', 'ghcid_uuid', 'ghcid_uuid_sha256', 'ghcid_numeric')
    for i, inst in enumerate(all_institutions):
        inst_id = inst.get('id')
        if inst_id in gb_by_id:
            # Preserve GHCID fields from the original record if they exist
            enriched_inst = gb_by_id[inst_id].copy()
            for field in ghcid_fields:
                if field in inst:
                    enriched_inst[field] = inst[field]
            # Replace with enriched version
            all_institutions[i] = enriched_inst
            merged_count += 1
            print(f" ✅ Merged: {inst['name']}")
    print(f"\n 📊 Total merged: {merged_count}")

    # Save unified dataset with a UTC date stamp in the filename
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d')
    output_path = f'data/instances/all/globalglam-{timestamp}.yaml'
    print(f"\n💾 Saving updated unified dataset to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print(" ✅ Saved")

    # Verify GB coverage: every GB-located institution should carry a
    # Wikidata identifier after enrichment.
    gb_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'GB' for loc in inst.get('locations', []))
    ]
    gb_with_wikidata = sum(
        1 for inst in gb_institutions
        if any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers', []))
    )
    print("\n" + "=" * 80)
    print("📊 VERIFICATION - GB Institutions in Unified Dataset")
    print("=" * 80)
    total_gb = len(gb_institutions)
    print(f"Total GB institutions: {total_gb}")
    print(f"With Wikidata identifiers: {gb_with_wikidata}")
    # Guard against ZeroDivisionError when the dataset has no GB institutions
    if total_gb:
        print(f"Coverage: {gb_with_wikidata/total_gb*100:.1f}%")
    else:
        print("Coverage: n/a (no GB institutions found)")
    if total_gb and gb_with_wikidata == total_gb:
        print("\n✅ SUCCESS: 100% Wikidata coverage verified in unified dataset!")
        print("🇬🇧 Phase 1 Great Britain: COMPLETE")
    print("\n")


if __name__ == '__main__':
    merge_gb_enriched()