- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
152 lines
5.8 KiB
Python
#!/usr/bin/env python3
"""
Merge Batch 16 enrichments into main GlobalGLAM dataset.

This script:
1. Backs up the current dataset
2. Loads Batch 16 enriched institutions
3. Updates 5 existing institution records with enrichment
4. Appends 1 new institution (Museu Casa de Rui Barbosa)
5. Saves merged dataset
"""
|
|
|
|
import yaml
|
|
import shutil
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Paths
# Repository root: this script is assumed to live one directory below it
# (e.g. scripts/), hence parent.parent.
BASE_DIR = Path(__file__).parent.parent
# Current canonical dataset that receives the Batch 16 enrichments.
MAIN_DATASET = BASE_DIR / "data" / "instances" / "all" / "globalglam-20251111.yaml"
# Batch 16 enriched institutions to merge in.
BATCH16_FILE = BASE_DIR / "data" / "instances" / "brazil" / "batch16_enriched.yaml"
# Destination for the merged dataset; the main dataset file itself is not modified.
OUTPUT_FILE = BASE_DIR / "data" / "instances" / "all" / "globalglam-20251111-batch16.yaml"
# Timestamped backup of the pre-merge dataset.
# NOTE: datetime.now() runs at import time, so the timestamp reflects when the
# module was loaded, not when main() executes.
BACKUP_FILE = BASE_DIR / "data" / "instances" / "all" / f"globalglam-20251111-pre-batch16-{datetime.now().strftime('%Y%m%d-%H%M%S')}.yaml"
|
|
|
|
|
|
def load_yaml(filepath):
    """Parse a YAML file and return the resulting Python object."""
    with open(filepath, encoding='utf-8') as handle:
        return yaml.safe_load(handle)
|
|
|
|
|
|
def save_yaml(data, filepath):
    """Serialize *data* to a YAML file, keeping key order and Unicode intact."""
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(
            data,
            handle,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
|
|
|
|
|
|
def find_institution_by_old_id(institutions, old_id_value):
    """Return the index of the institution carrying the given OLD_ID.

    Scans each institution's ``identifiers`` list for an entry whose
    ``identifier_scheme`` is ``'OLD_ID'`` and whose ``identifier_value``
    equals *old_id_value*. Returns the index of the first match, or
    ``None`` when no institution matches.
    """
    for position, record in enumerate(institutions):
        has_match = any(
            entry.get('identifier_scheme') == 'OLD_ID'
            and entry.get('identifier_value') == old_id_value
            for entry in record.get('identifiers', [])
        )
        if has_match:
            return position
    return None
|
|
|
|
|
|
def _brazil_stats(institutions):
    """Count Brazilian institutions and how many of them carry a Wikidata ID.

    An institution counts as Brazilian when any of its ``locations`` has
    ``country == 'BR'``. Returns a ``(total, with_wikidata)`` tuple.
    """
    total = 0
    with_wikidata = 0
    for inst in institutions:
        if not any(loc.get('country') == 'BR' for loc in inst.get('locations', [])):
            continue
        total += 1
        if any(i.get('identifier_scheme') == 'Wikidata'
               for i in inst.get('identifiers', [])):
            with_wikidata += 1
    return total, with_wikidata


def _wikidata_id(inst):
    """Return the institution's Wikidata identifier value, or 'N/A' if absent."""
    return next((i.get('identifier_value') for i in inst.get('identifiers', [])
                 if i.get('identifier_scheme') == 'Wikidata'), 'N/A')


def _pct(part, whole):
    """Percentage of *part* in *whole*; 0.0 when *whole* is zero (avoids ZeroDivisionError)."""
    return part / whole * 100 if whole else 0.0


def main():
    """Merge Batch 16 enrichments into the main GlobalGLAM dataset.

    Backs up the main dataset, replaces existing records matched via their
    OLD_ID identifier, appends records without an OLD_ID as new institutions,
    reports Brazilian/Wikidata coverage before and after, and writes the
    merged result to OUTPUT_FILE (the original dataset file is untouched).
    """
    print("=" * 80)
    print("BRAZIL BATCH 16 MERGE")
    print("=" * 80)
    print()

    # 1. Back up the current dataset before anything else touches it.
    print(f"1. Creating backup: {BACKUP_FILE.name}")
    shutil.copy2(MAIN_DATASET, BACKUP_FILE)
    print(" ✓ Backup created")
    print()

    # 2. Load both datasets.
    print("2. Loading datasets...")
    main_data = load_yaml(MAIN_DATASET)
    batch16_data = load_yaml(BATCH16_FILE)
    print(f" ✓ Main dataset: {len(main_data)} institutions")
    print(f" ✓ Batch 16: {len(batch16_data)} institutions")
    print()

    # 3. Snapshot Brazilian coverage before the merge.
    brazil_before, brazil_wikidata_before = _brazil_stats(main_data)
    print("3. Brazilian institutions before merge:")
    print(f" Total: {brazil_before}")
    print(f" With Wikidata: {brazil_wikidata_before} ({_pct(brazil_wikidata_before, brazil_before):.1f}%)")
    print()

    # 4. Merge Batch 16: a record with an OLD_ID replaces the matching
    #    existing entry; anything else is appended as a new institution.
    print("4. Processing Batch 16 institutions:")
    updated_count = 0
    new_count = 0

    for inst in batch16_data:
        inst_name = inst.get('name')
        old_ids = [i.get('identifier_value') for i in inst.get('identifiers', [])
                   if i.get('identifier_scheme') == 'OLD_ID']

        if old_ids:
            # Existing institution — replace its record in place.
            old_id = old_ids[0]
            idx = find_institution_by_old_id(main_data, old_id)
            if idx is not None:
                main_data[idx] = inst
                updated_count += 1
                print(f" ✓ Updated: {inst_name} ({_wikidata_id(inst)})")
            else:
                # OLD_ID present but no match in the main dataset: skip rather
                # than silently append a duplicate.
                print(f" ⚠ WARNING: Could not find institution with OLD_ID: {old_id}")
        else:
            # New institution — append to the dataset.
            main_data.append(inst)
            new_count += 1
            print(f" ✓ Added NEW: {inst_name} ({_wikidata_id(inst)})")

    print()
    print(f" Summary: {updated_count} updated, {new_count} new")
    print()

    # 5. Report post-merge Brazilian coverage.
    brazil_after, brazil_wikidata_after = _brazil_stats(main_data)
    print("5. Brazilian institutions after merge:")
    print(f" Total: {brazil_after} (was {brazil_before}, +{brazil_after - brazil_before})")
    print(f" With Wikidata: {brazil_wikidata_after} (was {brazil_wikidata_before}, +{brazil_wikidata_after - brazil_wikidata_before})")
    print(f" Coverage: {_pct(brazil_wikidata_after, brazil_after):.1f}% (was {_pct(brazil_wikidata_before, brazil_before):.1f}%)")
    print()

    # 6. Write the merged dataset to a new file.
    print(f"6. Saving merged dataset: {OUTPUT_FILE.name}")
    save_yaml(main_data, OUTPUT_FILE)
    print(f" ✓ Saved {len(main_data)} institutions")
    print()

    print("=" * 80)
    print("MERGE COMPLETE")
    print("=" * 80)
    print()
    print(f"Backup: {BACKUP_FILE}")
    print(f"Output: {OUTPUT_FILE}")
    print()
    print("Next steps:")
    print("1. Validate merged dataset")
    print("2. Generate Batch 16 report")
    print("3. Decide if Batch 17 is needed (70% goal = 88/126)")
|
|
|
|
|
|
# Run the merge only when executed as a script (not when imported).
if __name__ == '__main__':
    main()
|