# glam/merge_batch13_corrected.py
# 2025-11-19 23:25:22 +01:00
# 181 lines, 6.1 KiB, Python
#!/usr/bin/env python3
"""
Merge Batch 13 Wikidata Enrichments - Corrected Version
Adds verified Q-numbers to 3 Brazilian institutions.
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
# Verified enrichments (IDs corrected).
# Maps a dataset institution id (string) to its verified Wikidata match:
#   qid         - the Q-number to attach as an identifier
#   name        - the institution's name, used for progress/error reporting
#   label/description - the matched entity's English label and description,
#                 recorded in the provenance history for auditing.
ENRICHMENTS = {
    "3008281717687280329": {
        "name": "UNIR",
        "qid": "Q7894377",
        "label": "Federal University of Rondônia",
        "description": "Brazilian public university",
    },
    "709508309148680086": {
        "name": "Secult Tocantins",
        "qid": "Q108397863",
        "label": "Secretary of Culture of the State of Tocantins",
        "description": "state secretariat responsible for cultural related affairs in the state of Tocantins, Brazil",
    },
    "2519599505258789521": {
        "name": "Instituto Histórico e Geográfico de Alagoas",
        "qid": "Q10302531",
        "label": "Instituto Histórico e Geográfico de Alagoas",
        "description": "research institute and museum in Maceió, Brazil",
    },
}
def main():
    """Merge Batch 13 Wikidata enrichments into the GLAM dataset.

    Loads the YAML instance file, attaches a verified Wikidata Q-number
    (with a provenance record) to each institution listed in ENRICHMENTS
    that does not already have one, backs up and rewrites the file in
    place, then prints a Brazil-wide Wikidata coverage summary.
    """
    dataset_path = Path("data/instances/all/globalglam-20251111.yaml")
    backup_path = Path("data/instances/all/globalglam-20251111.yaml.bak.batch13")

    print("=" * 80)
    print("BRAZIL BATCH 13 WIKIDATA ENRICHMENT MERGE")
    print("=" * 80)

    # Load dataset. The file may be a multi-document YAML stream or a
    # single document holding one list; normalize to a flat list.
    print(f"\nLoading dataset: {dataset_path}")
    with open(dataset_path, 'r', encoding='utf-8') as f:
        institutions = list(yaml.safe_load_all(f))
    if len(institutions) == 1 and isinstance(institutions[0], list):
        institutions = institutions[0]
    print(f"Loaded {len(institutions)} institutions")

    # Track changes
    enriched_count = 0
    skipped_count = 0
    matched_ids = set()  # ENRICHMENTS keys actually found in the dataset
    errors = []

    print("\n" + "=" * 80)
    print("PROCESSING ENRICHMENTS")
    print("=" * 80)

    for inst in institutions:
        if not isinstance(inst, dict):
            continue
        inst_id = str(inst.get('id', ''))
        if inst_id not in ENRICHMENTS:
            continue
        matched_ids.add(inst_id)
        enrichment = ENRICHMENTS[inst_id]
        if _has_wikidata(inst):
            print(f"\n⚠ SKIP: {inst.get('name')} (ID: {inst_id})")
            print(" Already has Wikidata Q-number")
            skipped_count += 1
            continue
        _apply_enrichment(inst, enrichment)
        print(f"\n✓ ENRICHED: {inst.get('name')}")
        print(f" ID: {inst_id}")
        print(f" Q-number: {enrichment['qid']}")
        print(f" Label: {enrichment['label']}")
        enriched_count += 1

    # BUG FIX: `errors` was declared and reported but never populated.
    # Record every enrichment ID that matched no institution so that a
    # typo in ENRICHMENTS cannot silently produce a no-op merge.
    for missing_id in sorted(set(ENRICHMENTS) - matched_ids):
        errors.append(
            f"ID {missing_id} ({ENRICHMENTS[missing_id]['name']}) not found in dataset"
        )

    # Summary
    print("\n" + "=" * 80)
    print("MERGE SUMMARY")
    print("=" * 80)
    print(f"Institutions enriched: {enriched_count}")
    print(f"Institutions skipped: {skipped_count}")
    print(f"Errors: {len(errors)}")
    if errors:
        print("\nErrors:")
        for error in errors:
            print(f" - {error}")

    if enriched_count > 0:
        # Back up the original file before rewriting it in place.
        print(f"\nCreating backup: {backup_path}")
        import shutil  # local import kept: only needed on the write path
        shutil.copy2(dataset_path, backup_path)

        print(f"Writing updated dataset: {dataset_path}")
        with open(dataset_path, 'w', encoding='utf-8') as f:
            yaml.dump_all(
                [institutions],
                f,
                allow_unicode=True,
                default_flow_style=False,
                sort_keys=False,
                width=1000
            )
        print("\n✓ Merge completed successfully!")

        # Recompute Brazil-wide Wikidata coverage after the merge.
        brazil_institutions, brazil_with_wikidata = _brazil_coverage(institutions)
        total = len(brazil_institutions)
        # BUG FIX: guard against ZeroDivisionError when the dataset
        # contains no Brazilian institutions at all.
        coverage = (len(brazil_with_wikidata) / total * 100) if total else 0.0
        print("\n" + "=" * 80)
        print("WIKIDATA COVERAGE (BRAZIL)")
        print("=" * 80)
        print(f"Total Brazilian institutions: {total}")
        print(f"With Wikidata Q-numbers: {len(brazil_with_wikidata)}")
        print(f"Coverage: {coverage:.1f}%")
        # NOTE: the baseline figures below are hard-coded from the
        # batch-12 report; update them if this script is reused.
        print("Previous coverage: 57.0% (69/121)")
        print(f"Improvement: +{len(brazil_with_wikidata) - 69} institutions (+{coverage - 57.0:.1f}%)")
    else:
        print("\nNo changes made - no merge performed.")


def _has_wikidata(inst):
    """Return True if *inst* already carries a Wikidata identifier.

    BUG FIX: a missing or None 'identifiers' field is treated as empty;
    the original iterated inst.get('identifiers', []) directly and would
    have raised TypeError on an explicit null in the YAML.
    """
    return any(
        i.get('identifier_scheme') == 'Wikidata'
        for i in (inst.get('identifiers') or [])
        if isinstance(i, dict)
    )


def _apply_enrichment(inst, enrichment):
    """Append a Wikidata identifier and a provenance history entry to *inst*."""
    qid = enrichment['qid']
    if inst.get('identifiers') is None:  # covers both missing key and explicit null
        inst['identifiers'] = []
    inst['identifiers'].append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': qid,
        'identifier_url': f"https://www.wikidata.org/wiki/{qid}",
    })
    provenance = inst.setdefault('provenance', {})
    history = provenance.setdefault('enrichment_history', [])
    history.append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Wikidata authenticated entity search (Batch 13)',
        'enrichment_source': 'batch13_enriched.yaml',
        'fields_enriched': ['identifiers.Wikidata'],
        'wikidata_label': enrichment['label'],
        'wikidata_description': enrichment['description'],
    })


def _brazil_coverage(institutions):
    """Return (brazilian, brazilian_with_wikidata) institution lists.

    An institution counts as Brazilian when any of its location dicts
    has country == 'BR'; a missing/None 'locations' field is treated as
    empty.
    """
    brazilian = [
        inst for inst in institutions
        if isinstance(inst, dict) and any(
            loc.get('country') == 'BR'
            for loc in (inst.get('locations') or [])
            if isinstance(loc, dict)
        )
    ]
    return brazilian, [inst for inst in brazilian if _has_wikidata(inst)]
# Script entry point: run the merge only when executed directly, not on import.
if __name__ == "__main__":
    main()