- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
78 lines
2.8 KiB
Python
Executable file
78 lines
2.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Merge US enriched institutions back into unified global dataset.
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
|
|
def merge_us_enriched(
    unified_path='data/instances/all/globalglam-20251111.yaml',
    enriched_path='data/instances/united_states/us_institutions_enriched_manual.yaml',
    output_dir='data/instances/all',
):
    """Merge enriched US institution records back into the unified dataset.

    Loads the unified global dataset and the manually enriched US records,
    replaces every unified record whose ``ghcid`` matches an enriched one,
    writes the merged list to a UTC-date-stamped YAML file, and prints a
    verification summary of Wikidata identifier coverage for US institutions.

    Args:
        unified_path: Path to the unified global YAML dataset (a list of
            institution dicts, each expected to carry a ``ghcid`` key).
        enriched_path: Path to the enriched US YAML dataset (same shape).
        output_dir: Directory the merged, date-stamped file is written to.

    Note:
        If run on the same UTC date encoded in ``unified_path``, the output
        path equals the input path and the input file is overwritten.
    """
    print("=" * 80)
    print("🔀 Merging US enriched data into unified dataset")
    print("=" * 80)

    # Load unified dataset
    print("\n📂 Loading unified dataset...")
    with open(unified_path, 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)
    print(f" ✅ Loaded {len(all_institutions)} institutions")

    # Load enriched US data
    print("\n📂 Loading US enriched data...")
    with open(enriched_path, 'r', encoding='utf-8') as f:
        us_enriched = yaml.safe_load(f)
    print(f" ✅ Loaded {len(us_enriched)} enriched US institutions")

    # Create lookup by GHCID for quick matching
    us_by_ghcid = {inst['ghcid']: inst for inst in us_enriched}

    # Merge enriched data: replace each matching unified record wholesale
    # with its enriched counterpart (no field-level merge).
    print("\n🔄 Merging enriched data...")
    merged_count = 0

    for i, inst in enumerate(all_institutions):
        ghcid = inst.get('ghcid')
        if ghcid in us_by_ghcid:
            # Replace with enriched version
            all_institutions[i] = us_by_ghcid[ghcid]
            merged_count += 1
            # .get() guards against malformed records lacking a 'name' key
            print(f" ✅ Merged: {inst.get('name', ghcid)}")

    print(f"\n 📊 Total merged: {merged_count}")

    # Save unified dataset with a UTC date stamp so successive runs on
    # different days produce distinct files.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d')
    output_path = f'{output_dir}/globalglam-{timestamp}.yaml'

    print(f"\n💾 Saving updated unified dataset to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print(" ✅ Saved")

    # Verify US coverage: a record counts as US if any of its locations
    # carries country == 'US'.
    us_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'US' for loc in inst.get('locations', []))
    ]

    us_with_wikidata = sum(
        1 for inst in us_institutions
        if any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers', []))
    )

    # Guard against ZeroDivisionError when the dataset has no US records.
    coverage = (us_with_wikidata / len(us_institutions) * 100) if us_institutions else 0.0

    print("\n" + "=" * 80)
    print("📊 VERIFICATION - US Institutions in Unified Dataset")
    print("=" * 80)
    print(f"Total US institutions: {len(us_institutions)}")
    print(f"With Wikidata identifiers: {us_with_wikidata}")
    print(f"Coverage: {coverage:.1f}%")

    if us_with_wikidata == len(us_institutions):
        print("\n✅ SUCCESS: 100% Wikidata coverage verified in unified dataset!")
        print("🇺🇸 Phase 1 United States: COMPLETE")

    print("\n")
|
|
|
|
# Entry point: run the merge only when executed as a script,
# so importing this module stays side-effect free.
if __name__ == "__main__":
    merge_us_enriched()
|