glam/scripts/merge_batch16.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

152 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Merge Batch 16 enrichments into main GlobalGLAM dataset.
This script:
1. Backs up the current dataset
2. Loads Batch 16 enriched institutions
3. Updates 5 existing institution records with enrichment
4. Appends 1 new institution (Museu Casa de Rui Barbosa)
5. Saves merged dataset
"""
import yaml
import shutil
from datetime import datetime
from pathlib import Path
# Paths
# All paths resolve relative to the repository root (the parent of this
# script's directory).
BASE_DIR = Path(__file__).parent.parent
# Input: current canonical GlobalGLAM dataset.
MAIN_DATASET = BASE_DIR / "data" / "instances" / "all" / "globalglam-20251111.yaml"
# Input: Batch 16 enriched Brazilian institutions to merge in.
BATCH16_FILE = BASE_DIR / "data" / "instances" / "brazil" / "batch16_enriched.yaml"
# Output: merged dataset, written alongside the main dataset.
OUTPUT_FILE = BASE_DIR / "data" / "instances" / "all" / "globalglam-20251111-batch16.yaml"
# Timestamped pre-merge backup of the main dataset (created at import time,
# since the timestamp is baked into the module-level constant).
BACKUP_FILE = BASE_DIR / "data" / "instances" / "all" / f"globalglam-20251111-pre-batch16-{datetime.now().strftime('%Y%m%d-%H%M%S')}.yaml"
def load_yaml(filepath):
    """Read *filepath* as UTF-8 and return its parsed YAML content."""
    with open(filepath, encoding='utf-8') as stream:
        parsed = yaml.safe_load(stream)
    return parsed
def save_yaml(data, filepath):
    """Serialize *data* to *filepath* as UTF-8 YAML, preserving key order."""
    dump_options = dict(default_flow_style=False, allow_unicode=True, sort_keys=False)
    with open(filepath, 'w', encoding='utf-8') as stream:
        yaml.dump(data, stream, **dump_options)
def find_institution_by_old_id(institutions, old_id_value):
    """Return the index of the institution carrying the given OLD_ID, or None.

    Each institution's ``identifiers`` list is scanned for an entry whose
    ``identifier_scheme`` is ``'OLD_ID'`` and whose ``identifier_value``
    equals *old_id_value*; the first matching institution's index wins.
    """
    for position, record in enumerate(institutions):
        has_match = any(
            entry.get('identifier_scheme') == 'OLD_ID'
            and entry.get('identifier_value') == old_id_value
            for entry in record.get('identifiers', [])
        )
        if has_match:
            return position
    return None
def _is_brazilian(inst):
    """True if the institution has at least one location with country 'BR'."""
    return any(loc.get('country') == 'BR' for loc in inst.get('locations', []))


def _has_wikidata(inst):
    """True if the institution carries a Wikidata identifier."""
    return any(i.get('identifier_scheme') == 'Wikidata'
               for i in inst.get('identifiers', []))


def _wikidata_id(inst):
    """Return the institution's Wikidata identifier value, or 'N/A' if absent."""
    return next((i.get('identifier_value') for i in inst.get('identifiers', [])
                 if i.get('identifier_scheme') == 'Wikidata'), 'N/A')


def _pct(part, whole):
    """Format part/whole as a percentage string; 'n/a' guards division by zero."""
    return f"{part / whole * 100:.1f}%" if whole else "n/a"


def main():
    """Merge Batch 16 enrichments into the main dataset and report coverage.

    Steps: back up the main dataset, load both YAML files, replace the
    records matched by OLD_ID, append unmatched (new) records, and write
    the merged dataset to OUTPUT_FILE.  Progress is printed throughout.
    """
    print("=" * 80)
    print("BRAZIL BATCH 16 MERGE")
    print("=" * 80)
    print()
    # Backup current dataset before any mutation so a bad merge is recoverable.
    print(f"1. Creating backup: {BACKUP_FILE.name}")
    shutil.copy2(MAIN_DATASET, BACKUP_FILE)
    print(f" ✓ Backup created")
    print()
    # Load datasets
    print("2. Loading datasets...")
    main_data = load_yaml(MAIN_DATASET)
    batch16_data = load_yaml(BATCH16_FILE)
    print(f" ✓ Main dataset: {len(main_data)} institutions")
    print(f" ✓ Batch 16: {len(batch16_data)} institutions")
    print()
    # Count Brazilian institutions before merge
    brazil_before = sum(1 for inst in main_data if _is_brazilian(inst))
    brazil_wikidata_before = sum(1 for inst in main_data
                                 if _is_brazilian(inst) and _has_wikidata(inst))
    print(f"3. Brazilian institutions before merge:")
    print(f" Total: {brazil_before}")
    # _pct guards against an empty dataset (no ZeroDivisionError).
    print(f" With Wikidata: {brazil_wikidata_before} ({_pct(brazil_wikidata_before, brazil_before)})")
    print()
    # Process Batch 16 institutions
    print("4. Processing Batch 16 institutions:")
    updated_count = 0
    new_count = 0
    for inst in batch16_data:
        inst_name = inst.get('name')
        inst_id = inst.get('id')
        # An OLD_ID identifier marks a record that already exists in the main
        # dataset; records without one are brand-new institutions.
        old_ids = [i.get('identifier_value') for i in inst.get('identifiers', [])
                   if i.get('identifier_scheme') == 'OLD_ID']
        if old_ids:
            # Existing institution - replace its record wholesale.
            old_id = old_ids[0]
            idx = find_institution_by_old_id(main_data, old_id)
            if idx is not None:
                main_data[idx] = inst
                updated_count += 1
                print(f" ✓ Updated: {inst_name} ({_wikidata_id(inst)})")
            else:
                print(f" ⚠ WARNING: Could not find institution with OLD_ID: {old_id}")
        else:
            # New institution - append it.
            main_data.append(inst)
            new_count += 1
            print(f" ✓ Added NEW: {inst_name} ({_wikidata_id(inst)})")
    print()
    print(f" Summary: {updated_count} updated, {new_count} new")
    print()
    # Count Brazilian institutions after merge
    brazil_after = sum(1 for inst in main_data if _is_brazilian(inst))
    brazil_wikidata_after = sum(1 for inst in main_data
                                if _is_brazilian(inst) and _has_wikidata(inst))
    print(f"5. Brazilian institutions after merge:")
    print(f" Total: {brazil_after} (was {brazil_before}, +{brazil_after - brazil_before})")
    print(f" With Wikidata: {brazil_wikidata_after} (was {brazil_wikidata_before}, +{brazil_wikidata_after - brazil_wikidata_before})")
    print(f" Coverage: {_pct(brazil_wikidata_after, brazil_after)} (was {_pct(brazil_wikidata_before, brazil_before)})")
    print()
    # Save merged dataset
    print(f"6. Saving merged dataset: {OUTPUT_FILE.name}")
    save_yaml(main_data, OUTPUT_FILE)
    print(f" ✓ Saved {len(main_data)} institutions")
    print()
    print("=" * 80)
    print("MERGE COMPLETE")
    print("=" * 80)
    print()
    print(f"Backup: {BACKUP_FILE}")
    print(f"Output: {OUTPUT_FILE}")
    print()
    print("Next steps:")
    print("1. Validate merged dataset")
    print("2. Generate Batch 16 report")
    print("3. Decide if Batch 17 is needed (70% goal = 88/126)")
# Script entry point: run the merge only when executed directly, not on import.
if __name__ == '__main__':
    main()