#!/usr/bin/env python3
"""
Merge Brazil Batch 13 enriched institutions back into unified global dataset.

This script merges 9 verified Wikidata Q-numbers from Batch 13.
"""
import yaml
from datetime import datetime, timezone


def _build_enrichment_map(successful_matches):
    """Return {institution_id: {qid, label, description, confidence}}.

    Matches without an 'institution_id' are skipped — they cannot be joined
    back to the unified dataset.
    """
    enrichment_map = {}
    for match in successful_matches:
        inst_id = match.get('institution_id')
        if inst_id:
            enrichment_map[inst_id] = {
                'qid': match['wikidata_qid'],
                'label': match['wikidata_label'],
                'description': match.get('wikidata_description', ''),
                'confidence': match['confidence'],
            }
    return enrichment_map


def _merge_into(all_institutions, enrichment_map):
    """Apply enrichments to `all_institutions` in place.

    For each institution whose id is in `enrichment_map` and that does not
    already carry a Wikidata identifier, append the identifier and record an
    enrichment_history entry in its provenance.

    Returns:
        (merged_count, updated_count) — newly enriched vs. already-enriched.
    """
    merged_count = 0
    updated_count = 0
    for inst in all_institutions:
        inst_id = inst.get('id')
        if inst_id not in enrichment_map:
            continue
        enrichment = enrichment_map[inst_id]
        identifiers = inst.get('identifiers', [])
        has_wd_original = any(
            idf.get('identifier_scheme') == 'Wikidata' for idf in identifiers
        )
        if has_wd_original:
            # Already enriched — report the existing Q-number and skip.
            updated_count += 1
            existing_q = next(
                (idf.get('identifier_value')
                 for idf in identifiers
                 if idf.get('identifier_scheme') == 'Wikidata'),
                'N/A',
            )
            print(f"   ā­ļø  Already has Wikidata: {inst['name']} ({existing_q})")
            continue

        inst.setdefault('identifiers', []).append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': enrichment['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{enrichment['qid']}",
        })

        # Single timestamp so enrichment_date and last_updated agree exactly.
        now = datetime.now(timezone.utc).isoformat()
        provenance = inst.setdefault('provenance', {})
        provenance.setdefault('enrichment_history', []).append({
            'enrichment_date': now,
            'enrichment_type': 'WIKIDATA_IDENTIFIER',
            'enrichment_method': 'AUTHENTICATED_SEARCH_BATCH13',
            'match_score': enrichment['confidence'],
            'verified': True,
            'enrichment_source': 'https://www.wikidata.org',
            'enrichment_notes': f"Batch 13: {enrichment['label']}",
        })
        provenance['last_updated'] = now
        merged_count += 1
        print(f"   āœ… Merged: {inst['name']} -> {enrichment['qid']}")
    return merged_count, updated_count


def merge_brazil_batch13(
    unified_path='data/instances/all/globalglam-20251111.yaml',
    batch_path='data/instances/brazil/batch13_enriched.yaml',
    output_dir='data/instances/all',
    baseline=67,
):
    """Merge Brazil Batch 13 Wikidata enrichments into the unified dataset.

    Args:
        unified_path: Unified-dataset YAML to enrich (default: the original
            hard-coded path, so existing callers are unaffected).
        batch_path: Batch-13 enrichment YAML with a 'successful_matches' list.
        output_dir: Directory where the date-stamped output YAML is written.
        baseline: Previous Brazil-with-Wikidata count used in the progress
            message (default matches the original hard-coded 67).

    Returns:
        Summary dict with 'merged', 'skipped', 'not_found', 'output_path'.
    """
    print("=" * 80)
    print("šŸ”€ Merging Brazil Batch 13 enriched data into unified dataset")
    print("=" * 80)

    # Load the unified dataset.
    print("\nšŸ“‚ Loading unified dataset...")
    with open(unified_path, 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)
    print(f"   āœ… Loaded {len(all_institutions)} institutions")

    # Load the enriched Brazil Batch 13 data.
    print("\nšŸ“‚ Loading Brazil Batch 13 enriched data...")
    with open(batch_path, 'r', encoding='utf-8') as f:
        br_batch13 = yaml.safe_load(f)
    successful_matches = br_batch13['successful_matches']
    print(f"   āœ… Loaded {len(successful_matches)} enriched institutions")

    enrichment_map = _build_enrichment_map(successful_matches)
    print(f"   šŸ“‹ Enrichment map created for {len(enrichment_map)} institutions")

    print("\nšŸ”„ Merging enriched data...")
    merged_count, updated_count = _merge_into(all_institutions, enrichment_map)

    # Report enriched ids that never appeared in the unified dataset.
    found_ids = {inst['id'] for inst in all_institutions}
    missing_ids = [i for i in enrichment_map if i not in found_ids]
    for inst_id in missing_ids:
        print(f"   āš ļø  Not found in dataset: {inst_id}")
    not_found_count = len(missing_ids)

    print(f"\n   šŸ“Š New Wikidata enrichments merged: {merged_count}")
    print(f"   šŸ“Š Already enriched (skipped): {updated_count}")
    print(f"   šŸ“Š Not found in main dataset: {not_found_count}")

    # Save the unified dataset under a fresh date stamp (UTC).
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d')
    output_path = f'{output_dir}/globalglam-{timestamp}.yaml'
    print(f"\nšŸ’¾ Saving updated unified dataset to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)
    print("   āœ… Saved")

    # Verify Brazil coverage in the merged dataset.
    br_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))
    ]
    br_with_wikidata = sum(
        1 for inst in br_institutions
        if any(i.get('identifier_scheme') == 'Wikidata'
               for i in inst.get('identifiers', []))
    )

    print("\n" + "=" * 80)
    print("šŸ“Š VERIFICATION - Brazil Institutions in Unified Dataset")
    print("=" * 80)
    print(f"Total Brazil institutions: {len(br_institutions)}")
    print(f"With Wikidata identifiers: {br_with_wikidata}")
    if br_institutions:  # guard: avoid ZeroDivisionError on an empty subset
        print(f"Coverage: {br_with_wikidata/len(br_institutions)*100:.1f}%")
    print(f"Progress: {br_with_wikidata}/{len(br_institutions)}")

    if merged_count > 0:
        print(f"\nāœ… SUCCESS: {merged_count} new Wikidata enrichments merged!")
        print(f"šŸ‡§šŸ‡· Brazil coverage improved: {baseline} → {br_with_wikidata} institutions")
        if br_institutions:  # same zero-division guard as above
            print(f"   Coverage gain: "
                  f"{(br_with_wikidata - baseline)/len(br_institutions)*100:.1f}%")
    else:
        print("\nāš ļø  No new enrichments to merge (already up to date)")

    print("\n")
    return {
        'merged': merged_count,
        'skipped': updated_count,
        'not_found': not_found_count,
        'output_path': output_path,
    }


if __name__ == '__main__':
    merge_brazil_batch13()