glam/scripts/merge_georgia_enrichment.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

123 lines
4.6 KiB
Python

#!/usr/bin/env python3
"""
Merge Georgia Wikidata enrichment into unified global dataset.
"""
import shutil
from datetime import datetime, timezone
from pathlib import Path

import yaml
def merge_georgia_enrichment():
    """Merge Georgia enriched institutions into the unified dataset.

    Loads the enriched Georgia batch and the unified global dataset,
    copies Wikidata/VIAF identifiers and founding dates that are missing
    from the matching unified records, appends the enrichment history to
    each record's provenance, then rewrites the unified file (after
    saving a ``.yaml.backup`` copy) and prints a merge summary.

    Returns:
        int: Number of institutions that received new data.
    """
    # Load enriched Georgia data
    georgia_enriched_path = Path('data/instances/georgia/georgian_institutions_enriched_batch3_final.yaml')
    with open(georgia_enriched_path, 'r', encoding='utf-8') as f:
        georgia_enriched = yaml.safe_load(f)

    # Load unified dataset
    unified_path = Path('data/instances/all/globalglam-20251111.yaml')
    with open(unified_path, 'r', encoding='utf-8') as f:
        unified = yaml.safe_load(f)

    print(f"Loaded {len(georgia_enriched)} enriched Georgia institutions")
    print(f"Loaded {len(unified)} institutions from unified dataset")

    # Index enriched records by institution id for O(1) lookup;
    # records without an id cannot be matched and are skipped.
    georgia_enriched_lookup = {
        inst['id']: inst for inst in georgia_enriched if inst.get('id')
    }
    print(f"\nCreated lookup for {len(georgia_enriched_lookup)} Georgia institutions")

    # Merge enrichment data into unified dataset
    merged_count = 0
    updated_records = []
    for inst in unified:
        inst_id = inst.get('id')
        # Skip records that are not enriched Georgia institutions.
        if not inst_id or inst_id not in georgia_enriched_lookup:
            continue
        enriched = georgia_enriched_lookup[inst_id]

        # Normalize a missing or null identifiers field to an empty list.
        if inst.get('identifiers') is None:
            inst['identifiers'] = []
        existing_schemes = {
            ident.get('identifier_scheme') for ident in inst['identifiers']
        }

        # Add Wikidata and VIAF from the enriched version if not present.
        added_identifiers = []
        for ident in enriched.get('identifiers', []):
            scheme = ident.get('identifier_scheme')
            if scheme in ('Wikidata', 'VIAF') and scheme not in existing_schemes:
                inst['identifiers'].append(ident)
                added_identifiers.append(scheme)
                existing_schemes.add(scheme)

        # Also add founding_date if present in the enriched version only.
        if enriched.get('founding_date') and not inst.get('founding_date'):
            inst['founding_date'] = enriched['founding_date']
            added_identifiers.append('founding_date')

        if not added_identifiers:
            continue

        # Update provenance: `or {}` also covers an explicit null field.
        provenance = inst.get('provenance') or {}
        inst['provenance'] = provenance

        # Carry over the enrichment history from the enriched record.
        enrichment_history = enriched.get('provenance', {}).get('enrichment_history', [])
        if enrichment_history:
            provenance.setdefault('enrichment_history', []).extend(enrichment_history)

        # Add merge timestamp (timezone-aware, ISO 8601).
        provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
        provenance['wikidata_verified'] = True

        merged_count += 1
        updated_records.append({
            'name': inst.get('name'),
            'id': inst_id,
            'added': added_identifiers,
        })
        print(f"✓ Merged {inst.get('name')}: added {', '.join(added_identifiers)}")

    # Save updated unified dataset (only when something changed).
    if merged_count > 0:
        # Keep a backup of the original file before overwriting it.
        backup_path = unified_path.with_suffix('.yaml.backup')
        shutil.copy(unified_path, backup_path)
        print(f"\n✓ Created backup: {backup_path}")

        with open(unified_path, 'w', encoding='utf-8') as f:
            yaml.dump(unified, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        print(f"\n✓ Updated unified dataset: {unified_path}")
        print(f"✓ Merged {merged_count} Georgia institutions")

        # Print summary
        print("\n" + "="*60)
        print("MERGE SUMMARY")
        print("="*60)
        for record in updated_records:
            print(f" {record['name']} ({record['id']})")
            print(f" Added: {', '.join(record['added'])}")
    else:
        print("\n⚠ No institutions merged (already up to date)")

    return merged_count
# Script entry point: run the merge when executed directly.
if __name__ == '__main__':
    merge_georgia_enrichment()