glam/scripts/merge_enriched_to_global.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

314 lines
10 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Merge enriched datasets into unified global heritage institutions database.
Merges enriched datasets (Task 6+ - November 2025):
1. Tunisia Enhanced (68 institutions, 76.5% Wikidata)
2. Georgia Enriched (14 institutions, 78.6% Wikidata)
3. Belgium Manual Enriched (7 institutions, 100% Wikidata)
Strategy:
- Handle both file formats (plain list vs. _metadata wrapper)
- Deduplicate by ID/GHCID
- Preserve enrichment metadata (replace if new is more enriched)
- Create timestamped backup before merge
- Accept country parameter to selectively merge datasets
"""
import yaml
import sys
from pathlib import Path
from datetime import datetime, timezone
from collections import defaultdict
from typing import Dict, List, Any
# File paths
BASE_DIR = Path("/Users/kempersc/apps/glam")
UNIFIED_DB = BASE_DIR / "data/instances/all/globalglam-20251111.yaml"
# Mapping of country codes to enriched dataset files
ENRICHED_FILES = {
'tunisia': BASE_DIR / "data/instances/tunisia/tunisian_institutions_enhanced.yaml",
'georgia': BASE_DIR / "data/instances/georgia/georgian_institutions_enriched_batch3_final.yaml",
'belgium': BASE_DIR / "data/instances/belgium/be_institutions_enriched_manual.yaml",
}
# Default: merge all available enriched files
DEFAULT_SOURCE_FILES = list(ENRICHED_FILES.values())
def load_yaml_file(filepath: Path) -> List[Dict[str, Any]]:
    """Read a YAML dataset and return its institutions list.

    Supports both on-disk layouts: a mapping with an ``institutions`` key
    (optionally alongside ``_metadata``), or a bare list of records.

    Raises:
        ValueError: if the parsed document matches neither layout.
    """
    print(f"📖 Loading: {filepath.name}")
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    if isinstance(data, list):
        # Layout: plain list of institution records.
        print(f"  Format: plain list, {len(data)} institutions")
        return data
    if isinstance(data, dict) and 'institutions' in data:
        # Layout: {_metadata: {...}, institutions: [...]}
        records = data['institutions']
        print(f"  Format: metadata wrapper, {len(records)} institutions")
        return records
    raise ValueError(f"Unexpected format in {filepath}")
def get_institution_key(inst: Dict[str, Any]) -> str:
    """Return a stable deduplication key for an institution record.

    Preference order: explicit ``id``, then ``ghcid`` (prefixed), and as a
    last resort a composite of first-location country plus name.
    """
    inst_id = inst.get('id')
    if inst_id:
        return inst_id
    ghcid = inst.get('ghcid')
    if ghcid:
        return f"ghcid:{ghcid}"
    # Fallback: synthesize a key from country + name.
    locations = inst.get('locations')
    country = locations[0].get('country', 'unknown') if locations else 'unknown'
    name = inst.get('name', 'unknown')
    return f"name:{country}:{name}"
def has_wikidata(inst: Dict[str, Any]) -> bool:
    """Return True if any identifier on the record uses the Wikidata scheme."""
    return any(
        ident.get('identifier_scheme') == 'Wikidata'
        for ident in inst.get('identifiers', [])
    )
def count_wikidata(institutions: List[Dict[str, Any]]) -> int:
    """Count how many records in *institutions* carry a Wikidata identifier."""
    total = 0
    for record in institutions:
        if has_wikidata(record):
            total += 1
    return total
def is_more_enriched(new: Dict[str, Any], existing: Dict[str, Any]) -> bool:
    """Decide whether *new* should replace *existing* during a merge.

    Tie-break order: Wikidata presence, then presence of an
    ``enrichment_history`` entry under ``provenance``, then raw identifier
    count. Ties always favour the existing record.
    """
    # Wikidata presence is the strongest enrichment signal.
    wd_new = has_wikidata(new)
    wd_old = has_wikidata(existing)
    if wd_new != wd_old:
        return wd_new
    # Next signal: an explicit enrichment history in provenance.
    hist_new = 'enrichment_history' in new.get('provenance', {})
    hist_old = 'enrichment_history' in existing.get('provenance', {})
    if hist_new != hist_old:
        return hist_new
    # Final signal: strictly more identifiers wins; otherwise keep existing.
    return len(new.get('identifiers', [])) > len(existing.get('identifiers', []))
def merge_institutions(
    unified: List[Dict[str, Any]],
    new_institutions: List[Dict[str, Any]],
    source_name: str
) -> tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Fold *new_institutions* into *unified* in place.

    Duplicates (records sharing a dedup key) are replaced only when the
    incoming record is judged more enriched; otherwise they are skipped.

    Returns:
        The (mutated) unified list together with per-source merge statistics.
    """
    # Key -> list position lookup so duplicate checks are O(1).
    position = {get_institution_key(rec): idx for idx, rec in enumerate(unified)}
    stats = {
        'source': source_name,
        'total_new': len(new_institutions),
        'added': 0,
        'duplicates_skipped': 0,
        'duplicates_replaced': 0,
    }
    for candidate in new_institutions:
        key = get_institution_key(candidate)
        idx = position.get(key)
        if idx is None:
            # Unseen key: append and register its position.
            unified.append(candidate)
            position[key] = len(unified) - 1
            stats['added'] += 1
        elif is_more_enriched(candidate, unified[idx]):
            # Incoming record wins the enrichment comparison; swap it in.
            unified[idx] = candidate
            stats['duplicates_replaced'] += 1
            print(f" ✏️ Replaced: {candidate.get('name', 'unknown')} (more enriched)")
        else:
            # Existing record is at least as enriched; keep it.
            stats['duplicates_skipped'] += 1
    return unified, stats
def create_backup(filepath: Path) -> Path:
    """Copy *filepath* next to itself with a UTC-timestamp suffix.

    Returns:
        Path of the newly created backup file.
    """
    import shutil
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    target = filepath.with_name(f"{filepath.stem}_backup_{stamp}{filepath.suffix}")
    print(f"💾 Creating backup: {target.name}")
    # copy2 preserves file metadata (mtime, permissions) on the backup.
    shutil.copy2(filepath, target)
    return target
def _pct(part: int, whole: int) -> float:
    """Return part/whole as a percentage, tolerating an empty denominator."""
    return (part / whole * 100) if whole else 0.0


def main():
    """Merge the selected enriched datasets into the unified global database.

    An optional argv[1] names a single country (a key of ENRICHED_FILES) to
    merge; with no argument every enriched dataset is merged. A timestamped
    backup of the unified database is taken before it is overwritten.

    Returns:
        0 on success, 1 for an unknown country argument.
    """
    print("=" * 80)
    print("MERGE ENRICHED DATASETS INTO UNIFIED GLOBAL DATABASE")
    print("=" * 80)
    print()
    # Parse command line arguments
    if len(sys.argv) > 1:
        country = sys.argv[1].lower()
        if country in ENRICHED_FILES:
            source_files = [ENRICHED_FILES[country]]
            print(f"🌍 Mode: Merging {country.upper()} only\n")
        else:
            print(f"❌ Unknown country: {country}")
            print(f"Available countries: {', '.join(ENRICHED_FILES.keys())}")
            return 1
    else:
        source_files = DEFAULT_SOURCE_FILES
        print("🌍 Mode: Merging all enriched datasets\n")
    # Step 1: Load unified database
    print("📚 STEP 1: Load unified global database")
    print("-" * 80)
    unified_institutions = load_yaml_file(UNIFIED_DB)
    initial_count = len(unified_institutions)
    initial_wikidata = count_wikidata(unified_institutions)
    # _pct guards against an empty database (avoids ZeroDivisionError).
    print(f"✅ Loaded {initial_count:,} institutions ({initial_wikidata:,} with Wikidata, {_pct(initial_wikidata, initial_count):.1f}%)")
    print()
    # Step 2: Create backup
    print("💾 STEP 2: Create backup")
    print("-" * 80)
    backup_path = create_backup(UNIFIED_DB)
    print(f"✅ Backup created: {backup_path}")
    print()
    # Step 3: Load and merge source files
    print("📥 STEP 3: Load and merge source files")
    print("-" * 80)
    all_stats = []
    for source_file in source_files:
        print(f"\n🔄 Processing: {source_file.name}")
        try:
            new_institutions = load_yaml_file(source_file)
            new_wikidata = count_wikidata(new_institutions)
            # _pct guards against an empty source file.
            print(f"  Loaded: {len(new_institutions)} institutions ({new_wikidata} with Wikidata, {_pct(new_wikidata, len(new_institutions)):.1f}%)")
            unified_institutions, stats = merge_institutions(
                unified_institutions,
                new_institutions,
                source_file.name
            )
            all_stats.append(stats)
            print(f" ✅ Added: {stats['added']}")
            print(f" ⏭️ Skipped duplicates: {stats['duplicates_skipped']}")
            print(f" ✏️ Replaced (more enriched): {stats['duplicates_replaced']}")
        except Exception as e:
            # Best-effort merge: report the failure and continue with the
            # remaining source files rather than aborting the whole run.
            print(f" ❌ ERROR: {e}")
            import traceback
            traceback.print_exc()
            continue
    print()
    print("-" * 80)
    # Step 4: Statistics
    print()
    print("📊 STEP 4: Statistics")
    print("-" * 80)
    final_count = len(unified_institutions)
    final_wikidata = count_wikidata(unified_institutions)
    total_added = sum(s['added'] for s in all_stats)
    total_replaced = sum(s['duplicates_replaced'] for s in all_stats)
    total_skipped = sum(s['duplicates_skipped'] for s in all_stats)
    print(f"Initial count: {initial_count:,} institutions")
    print(f"Final count: {final_count:,} institutions")
    print(f"Net change: +{final_count - initial_count:,} institutions")
    print()
    print(f"Added (new): {total_added:,}")
    print(f"Replaced (enriched): {total_replaced:,}")
    print(f"Skipped (duplicates): {total_skipped:,}")
    print()
    print("Wikidata coverage:")
    print(f" Before: {initial_wikidata:,} / {initial_count:,} ({_pct(initial_wikidata, initial_count):.1f}%)")
    print(f" After: {final_wikidata:,} / {final_count:,} ({_pct(final_wikidata, final_count):.1f}%)")
    print(f" Change: +{final_wikidata - initial_wikidata:,} Wikidata IDs")
    print()
    # Step 5: Save merged database
    print("💾 STEP 5: Save merged database")
    print("-" * 80)
    print(f"Writing to: {UNIFIED_DB}")
    with open(UNIFIED_DB, 'w', encoding='utf-8') as f:
        yaml.dump(unified_institutions, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print(f"✅ Saved {final_count:,} institutions")
    print()
    # Final summary
    print("=" * 80)
    print("✅ MERGE COMPLETE")
    print("=" * 80)
    print()
    print(f"Unified database: {UNIFIED_DB}")
    print(f"Backup location: {backup_path}")
    # FIX: the before/after counts were concatenated with no separator;
    # use the same arrow style as the coverage line below.
    print(f"Total institutions: {initial_count:,} → {final_count:,} (+{final_count - initial_count:,})")
    print(f"Wikidata coverage: {_pct(initial_wikidata, initial_count):.1f}% → {_pct(final_wikidata, final_count):.1f}%")
    print()
    return 0
# Script entry point: exit with main()'s status code.
if __name__ == "__main__":
    raise SystemExit(main())