#!/usr/bin/env python3
"""
Merge Brazil Batch 13 enriched institutions back into unified global dataset.

This script merges 9 verified Wikidata Q-numbers from Batch 13.
"""

import yaml
from datetime import datetime, timezone


def _has_wikidata(inst):
    """Return True if *inst* already carries a Wikidata identifier."""
    return any(
        idf.get('identifier_scheme') == 'Wikidata'
        for idf in inst.get('identifiers', [])
    )


def _apply_enrichment(inst, enrichment):
    """Attach a Wikidata identifier and a provenance entry to *inst* in place.

    *enrichment* is a dict with keys 'qid', 'label', 'description',
    'confidence' (built from a Batch 13 match record).
    """
    # Use one timestamp for both the history entry and last_updated so a
    # single merge is recorded at a single instant.
    now = datetime.now(timezone.utc).isoformat()

    inst.setdefault('identifiers', []).append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': enrichment['qid'],
        'identifier_url': f"https://www.wikidata.org/wiki/{enrichment['qid']}"
    })

    provenance = inst.setdefault('provenance', {})
    provenance.setdefault('enrichment_history', []).append({
        'enrichment_date': now,
        'enrichment_type': 'WIKIDATA_IDENTIFIER',
        'enrichment_method': 'AUTHENTICATED_SEARCH_BATCH13',
        'match_score': enrichment['confidence'],
        'verified': True,
        'enrichment_source': 'https://www.wikidata.org',
        'enrichment_notes': f"Batch 13: {enrichment['label']}"
    })
    provenance['last_updated'] = now


def merge_brazil_batch13():
    """Merge Brazil Batch 13 Wikidata enrichments into the unified dataset.

    Reads the unified global dataset and the Batch 13 enrichment file,
    adds a Wikidata identifier (plus a provenance history entry) to each
    matched institution that lacks one, writes the updated dataset to a
    UTC-date-stamped YAML file, and prints a Brazil coverage report.
    """
    print("=" * 80)
    print("🔀 Merging Brazil Batch 13 enriched data into unified dataset")
    print("=" * 80)

    # Load unified dataset
    print("\n📂 Loading unified dataset...")
    with open('data/instances/all/globalglam-20251111.yaml', 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)
    print(f" ✅ Loaded {len(all_institutions)} institutions")

    # Load enriched Brazil batch 13 data
    print("\n📂 Loading Brazil Batch 13 enriched data...")
    with open('data/instances/brazil/batch13_enriched.yaml', 'r', encoding='utf-8') as f:
        br_batch13 = yaml.safe_load(f)

    successful_matches = br_batch13['successful_matches']
    print(f" ✅ Loaded {len(successful_matches)} enriched institutions")

    # Map institution id -> enrichment payload; records without an
    # institution_id cannot be matched and are skipped.
    enrichment_map = {}
    for match in successful_matches:
        inst_id = match.get('institution_id')
        if inst_id:
            enrichment_map[inst_id] = {
                'qid': match['wikidata_qid'],
                'label': match['wikidata_label'],
                'description': match.get('wikidata_description', ''),
                'confidence': match['confidence']
            }

    print(f" 📋 Enrichment map created for {len(enrichment_map)} institutions")

    # Merge enriched data
    print("\n🔄 Merging enriched data...")
    merged_count = 0
    updated_count = 0
    not_found_count = 0

    # NOTE: previously used enumerate() with an unused index; iterate directly.
    for inst in all_institutions:
        inst_id = inst.get('id')
        if inst_id not in enrichment_map:
            continue
        enrichment = enrichment_map[inst_id]

        if not _has_wikidata(inst):
            _apply_enrichment(inst, enrichment)
            merged_count += 1
            # .get() fallback keeps the merge alive for records missing 'name'.
            print(f" ✅ Merged: {inst.get('name', inst_id)} -> {enrichment['qid']}")
        else:
            updated_count += 1
            # Report the pre-existing Q-number so the skip is auditable.
            existing_q = next(
                (idf.get('identifier_value') for idf in inst.get('identifiers', [])
                 if idf.get('identifier_scheme') == 'Wikidata'),
                'N/A'
            )
            print(f" ⏭️ Already has Wikidata: {inst.get('name', inst_id)} ({existing_q})")

    # Report enrichment IDs that never matched an institution in the dataset.
    # .get() here matches the merge loop above and avoids KeyError on
    # institutions without an 'id' key (the original used inst['id']).
    found_ids = {inst.get('id') for inst in all_institutions}
    for inst_id in enrichment_map:
        if inst_id not in found_ids:
            not_found_count += 1
            print(f" ⚠️ Not found in dataset: {inst_id}")

    print(f"\n 📊 New Wikidata enrichments merged: {merged_count}")
    print(f" 📊 Already enriched (skipped): {updated_count}")
    print(f" 📊 Not found in main dataset: {not_found_count}")

    # Save unified dataset with a UTC date stamp in the filename
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d')
    output_path = f'data/instances/all/globalglam-{timestamp}.yaml'

    print(f"\n💾 Saving updated unified dataset to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(all_institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print(" ✅ Saved")

    # Verify Brazil coverage
    br_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'BR' for loc in inst.get('locations', []))
    ]
    br_with_wikidata = sum(1 for inst in br_institutions if _has_wikidata(inst))
    total_br = len(br_institutions)
    # Guard against ZeroDivisionError if the dataset has no Brazil entries.
    coverage_pct = (br_with_wikidata / total_br * 100) if total_br else 0.0

    print("\n" + "=" * 80)
    print("📊 VERIFICATION - Brazil Institutions in Unified Dataset")
    print("=" * 80)
    print(f"Total Brazil institutions: {total_br}")
    print(f"With Wikidata identifiers: {br_with_wikidata}")
    print(f"Coverage: {coverage_pct:.1f}%")
    print(f"Progress: {br_with_wikidata}/{total_br}")

    if merged_count > 0:
        print(f"\n✅ SUCCESS: {merged_count} new Wikidata enrichments merged!")
        # 67 is the hard-coded Brazil Wikidata count before this batch —
        # presumably from the previous batch's report; TODO confirm baseline.
        print(f"🇧🇷 Brazil coverage improved: 67 → {br_with_wikidata} institutions")
        gain_pct = ((br_with_wikidata - 67) / total_br * 100) if total_br else 0.0
        print(f" Coverage gain: {gain_pct:.1f}%")
    else:
        print("\n⚠️ No new enrichments to merge (already up to date)")

    print("\n")
# Script entry point: run the merge when executed directly.
if __name__ == '__main__':
    merge_brazil_batch13()