- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
314 lines
10 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Merge enriched datasets into unified global heritage institutions database.
|
|
|
|
Merges enriched datasets (Task 6+ - November 2025):
|
|
1. Tunisia Enhanced (68 institutions, 76.5% Wikidata)
|
|
2. Georgia Enriched (14 institutions, 78.6% Wikidata)
|
|
3. Belgium Manual Enriched (7 institutions, 100% Wikidata)
|
|
|
|
Strategy:
|
|
- Handle both file formats (plain list vs. _metadata wrapper)
|
|
- Deduplicate by ID/GHCID
|
|
- Preserve enrichment metadata (replace if new is more enriched)
|
|
- Create timestamped backup before merge
|
|
- Accept country parameter to selectively merge datasets
|
|
"""
|
|
|
|
import yaml
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from collections import defaultdict
|
|
from typing import Dict, List, Any
|
|
|
|
# File paths
|
|
BASE_DIR = Path("/Users/kempersc/apps/glam")
|
|
UNIFIED_DB = BASE_DIR / "data/instances/all/globalglam-20251111.yaml"
|
|
|
|
# Mapping of country codes to enriched dataset files
|
|
ENRICHED_FILES = {
|
|
'tunisia': BASE_DIR / "data/instances/tunisia/tunisian_institutions_enhanced.yaml",
|
|
'georgia': BASE_DIR / "data/instances/georgia/georgian_institutions_enriched_batch3_final.yaml",
|
|
'belgium': BASE_DIR / "data/instances/belgium/be_institutions_enriched_manual.yaml",
|
|
}
|
|
|
|
# Default: merge all available enriched files
|
|
DEFAULT_SOURCE_FILES = list(ENRICHED_FILES.values())
|
|
|
|
|
|
def load_yaml_file(filepath: Path) -> List[Dict[str, Any]]:
|
|
"""Load YAML file and extract institutions list."""
|
|
print(f"📖 Loading: {filepath.name}")
|
|
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
|
|
# Handle both formats
|
|
if isinstance(data, dict) and 'institutions' in data:
|
|
# Format: {_metadata: {...}, institutions: [...]}
|
|
institutions = data['institutions']
|
|
print(f" Format: metadata wrapper, {len(institutions)} institutions")
|
|
elif isinstance(data, list):
|
|
# Format: [...]
|
|
institutions = data
|
|
print(f" Format: plain list, {len(institutions)} institutions")
|
|
else:
|
|
raise ValueError(f"Unexpected format in {filepath}")
|
|
|
|
return institutions
|
|
|
|
|
|
def get_institution_key(inst: Dict[str, Any]) -> str:
|
|
"""Get unique key for institution (for deduplication)."""
|
|
# Primary: id field
|
|
if 'id' in inst and inst['id']:
|
|
return inst['id']
|
|
|
|
# Secondary: GHCID
|
|
if 'ghcid' in inst and inst['ghcid']:
|
|
return f"ghcid:{inst['ghcid']}"
|
|
|
|
# Fallback: name + country
|
|
name = inst.get('name', 'unknown')
|
|
country = 'unknown'
|
|
if 'locations' in inst and inst['locations']:
|
|
country = inst['locations'][0].get('country', 'unknown')
|
|
|
|
return f"name:{country}:{name}"
|
|
|
|
|
|
def has_wikidata(inst: Dict[str, Any]) -> bool:
|
|
"""Check if institution has Wikidata identifier."""
|
|
if 'identifiers' not in inst:
|
|
return False
|
|
|
|
for ident in inst['identifiers']:
|
|
if ident.get('identifier_scheme') == 'Wikidata':
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def count_wikidata(institutions: List[Dict[str, Any]]) -> int:
|
|
"""Count institutions with Wikidata identifiers."""
|
|
return sum(1 for inst in institutions if has_wikidata(inst))
|
|
|
|
|
|
def is_more_enriched(new: Dict[str, Any], existing: Dict[str, Any]) -> bool:
|
|
"""Determine if new record is more enriched than existing."""
|
|
|
|
# Check Wikidata presence
|
|
new_has_wd = has_wikidata(new)
|
|
existing_has_wd = has_wikidata(existing)
|
|
|
|
if new_has_wd and not existing_has_wd:
|
|
return True
|
|
if existing_has_wd and not new_has_wd:
|
|
return False
|
|
|
|
# Check enrichment history presence
|
|
new_has_history = 'enrichment_history' in new.get('provenance', {})
|
|
existing_has_history = 'enrichment_history' in existing.get('provenance', {})
|
|
|
|
if new_has_history and not existing_has_history:
|
|
return True
|
|
if existing_has_history and not new_has_history:
|
|
return False
|
|
|
|
# Check number of identifiers
|
|
new_id_count = len(new.get('identifiers', []))
|
|
existing_id_count = len(existing.get('identifiers', []))
|
|
|
|
if new_id_count > existing_id_count:
|
|
return True
|
|
|
|
# Default: keep existing
|
|
return False
|
|
|
|
|
|
def merge_institutions(
|
|
unified: List[Dict[str, Any]],
|
|
new_institutions: List[Dict[str, Any]],
|
|
source_name: str
|
|
) -> tuple[List[Dict[str, Any]], Dict[str, Any]]:
|
|
"""
|
|
Merge new institutions into unified list.
|
|
|
|
Returns: (merged_list, stats_dict)
|
|
"""
|
|
|
|
# Build index of existing institutions
|
|
unified_index = {get_institution_key(inst): i for i, inst in enumerate(unified)}
|
|
|
|
stats = {
|
|
'source': source_name,
|
|
'total_new': len(new_institutions),
|
|
'added': 0,
|
|
'duplicates_skipped': 0,
|
|
'duplicates_replaced': 0,
|
|
}
|
|
|
|
for new_inst in new_institutions:
|
|
key = get_institution_key(new_inst)
|
|
|
|
if key in unified_index:
|
|
# Duplicate found - check which is more enriched
|
|
existing_idx = unified_index[key]
|
|
existing_inst = unified[existing_idx]
|
|
|
|
if is_more_enriched(new_inst, existing_inst):
|
|
# Replace with more enriched version
|
|
unified[existing_idx] = new_inst
|
|
stats['duplicates_replaced'] += 1
|
|
print(f" ✏️ Replaced: {new_inst.get('name', 'unknown')} (more enriched)")
|
|
else:
|
|
# Keep existing
|
|
stats['duplicates_skipped'] += 1
|
|
else:
|
|
# New institution - add it
|
|
unified.append(new_inst)
|
|
unified_index[key] = len(unified) - 1
|
|
stats['added'] += 1
|
|
|
|
return unified, stats
|
|
|
|
|
|
def create_backup(filepath: Path) -> Path:
|
|
"""Create timestamped backup of file."""
|
|
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
|
|
backup_path = filepath.parent / f"{filepath.stem}_backup_{timestamp}{filepath.suffix}"
|
|
|
|
print(f"💾 Creating backup: {backup_path.name}")
|
|
|
|
import shutil
|
|
shutil.copy2(filepath, backup_path)
|
|
|
|
return backup_path
|
|
|
|
|
|
def main():
|
|
print("=" * 80)
|
|
print("MERGE ENRICHED DATASETS INTO UNIFIED GLOBAL DATABASE")
|
|
print("=" * 80)
|
|
print()
|
|
|
|
# Parse command line arguments
|
|
if len(sys.argv) > 1:
|
|
country = sys.argv[1].lower()
|
|
if country in ENRICHED_FILES:
|
|
source_files = [ENRICHED_FILES[country]]
|
|
print(f"🌍 Mode: Merging {country.upper()} only\n")
|
|
else:
|
|
print(f"❌ Unknown country: {country}")
|
|
print(f"Available countries: {', '.join(ENRICHED_FILES.keys())}")
|
|
return 1
|
|
else:
|
|
source_files = DEFAULT_SOURCE_FILES
|
|
print(f"🌍 Mode: Merging all enriched datasets\n")
|
|
|
|
# Step 1: Load unified database
|
|
print("📚 STEP 1: Load unified global database")
|
|
print("-" * 80)
|
|
unified_institutions = load_yaml_file(UNIFIED_DB)
|
|
initial_count = len(unified_institutions)
|
|
initial_wikidata = count_wikidata(unified_institutions)
|
|
print(f"✅ Loaded {initial_count:,} institutions ({initial_wikidata:,} with Wikidata, {initial_wikidata/initial_count*100:.1f}%)")
|
|
print()
|
|
|
|
# Step 2: Create backup
|
|
print("💾 STEP 2: Create backup")
|
|
print("-" * 80)
|
|
backup_path = create_backup(UNIFIED_DB)
|
|
print(f"✅ Backup created: {backup_path}")
|
|
print()
|
|
|
|
# Step 3: Load and merge source files
|
|
print("📥 STEP 3: Load and merge source files")
|
|
print("-" * 80)
|
|
|
|
all_stats = []
|
|
|
|
for source_file in source_files:
|
|
print(f"\n🔄 Processing: {source_file.name}")
|
|
|
|
try:
|
|
new_institutions = load_yaml_file(source_file)
|
|
new_wikidata = count_wikidata(new_institutions)
|
|
print(f" Loaded: {len(new_institutions)} institutions ({new_wikidata} with Wikidata, {new_wikidata/len(new_institutions)*100:.1f}%)")
|
|
|
|
unified_institutions, stats = merge_institutions(
|
|
unified_institutions,
|
|
new_institutions,
|
|
source_file.name
|
|
)
|
|
|
|
all_stats.append(stats)
|
|
|
|
print(f" ✅ Added: {stats['added']}")
|
|
print(f" ⏭️ Skipped duplicates: {stats['duplicates_skipped']}")
|
|
print(f" ✏️ Replaced (more enriched): {stats['duplicates_replaced']}")
|
|
|
|
except Exception as e:
|
|
print(f" ❌ ERROR: {e}")
|
|
import traceback
|
|
traceback.print_exc()
|
|
continue
|
|
|
|
print()
|
|
print("-" * 80)
|
|
|
|
# Step 4: Statistics
|
|
print()
|
|
print("📊 STEP 4: Statistics")
|
|
print("-" * 80)
|
|
|
|
final_count = len(unified_institutions)
|
|
final_wikidata = count_wikidata(unified_institutions)
|
|
|
|
total_added = sum(s['added'] for s in all_stats)
|
|
total_replaced = sum(s['duplicates_replaced'] for s in all_stats)
|
|
total_skipped = sum(s['duplicates_skipped'] for s in all_stats)
|
|
|
|
print(f"Initial count: {initial_count:,} institutions")
|
|
print(f"Final count: {final_count:,} institutions")
|
|
print(f"Net change: +{final_count - initial_count:,} institutions")
|
|
print()
|
|
print(f"Added (new): {total_added:,}")
|
|
print(f"Replaced (enriched): {total_replaced:,}")
|
|
print(f"Skipped (duplicates): {total_skipped:,}")
|
|
print()
|
|
print(f"Wikidata coverage:")
|
|
print(f" Before: {initial_wikidata:,} / {initial_count:,} ({initial_wikidata/initial_count*100:.1f}%)")
|
|
print(f" After: {final_wikidata:,} / {final_count:,} ({final_wikidata/final_count*100:.1f}%)")
|
|
print(f" Change: +{final_wikidata - initial_wikidata:,} Wikidata IDs")
|
|
print()
|
|
|
|
# Step 5: Save merged database
|
|
print("💾 STEP 5: Save merged database")
|
|
print("-" * 80)
|
|
|
|
print(f"Writing to: {UNIFIED_DB}")
|
|
|
|
with open(UNIFIED_DB, 'w', encoding='utf-8') as f:
|
|
yaml.dump(unified_institutions, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
|
|
|
print(f"✅ Saved {final_count:,} institutions")
|
|
print()
|
|
|
|
# Final summary
|
|
print("=" * 80)
|
|
print("✅ MERGE COMPLETE")
|
|
print("=" * 80)
|
|
print()
|
|
print(f"Unified database: {UNIFIED_DB}")
|
|
print(f"Backup location: {backup_path}")
|
|
print(f"Total institutions: {initial_count:,} → {final_count:,} (+{final_count - initial_count:,})")
|
|
print(f"Wikidata coverage: {initial_wikidata/initial_count*100:.1f}% → {final_wikidata/final_count*100:.1f}%")
|
|
print()
|
|
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|