#!/usr/bin/env python3
|
|
"""
|
|
Merge Batch 13 Wikidata Enrichments - Corrected Version
|
|
Adds verified Q-numbers to 3 Brazilian institutions.
|
|
"""
|
|
|
|
import shutil
from datetime import datetime, timezone
from pathlib import Path

import yaml
|
|
|
|
# Verified enrichments (IDs corrected): institution id -> Wikidata facts.
# Built from (id, name, qid, label, description) tuples for readability.
ENRICHMENTS = {
    inst_id: {
        "name": name,
        "qid": qid,
        "label": label,
        "description": description,
    }
    for inst_id, name, qid, label, description in (
        (
            "3008281717687280329",
            "UNIR",
            "Q7894377",
            "Federal University of Rondônia",
            "Brazilian public university",
        ),
        (
            "709508309148680086",
            "Secult Tocantins",
            "Q108397863",
            "Secretary of Culture of the State of Tocantins",
            "state secretariat responsible for cultural related affairs in the state of Tocantins, Brazil",
        ),
        (
            "2519599505258789521",
            "Instituto Histórico e Geográfico de Alagoas",
            "Q10302531",
            "Instituto Histórico e Geográfico de Alagoas",
            "research institute and museum in Maceió, Brazil",
        ),
    )
}
|
|
|
|
def _load_institutions(dataset_path):
    """Load the YAML dataset and normalize it to a flat list of institution dicts.

    The file may be either a multi-document YAML stream or a single document
    containing one top-level list; both shapes are flattened here.
    """
    print(f"\nLoading dataset: {dataset_path}")
    with open(dataset_path, 'r', encoding='utf-8') as f:
        institutions = list(yaml.safe_load_all(f))
    if len(institutions) == 1 and isinstance(institutions[0], list):
        institutions = institutions[0]
    print(f"Loaded {len(institutions)} institutions")
    return institutions


def _has_wikidata(inst):
    """Return True if *inst* already carries a Wikidata identifier.

    A missing or None ``identifiers`` entry is treated as empty: the dataset
    uses None for institutions with no identifiers yet (the merge code below
    explicitly normalizes that case), so iterating it directly would raise
    TypeError.
    """
    identifiers = inst.get('identifiers') or []
    return any(
        entry.get('identifier_scheme') == 'Wikidata'
        for entry in identifiers
        if isinstance(entry, dict)
    )


def _apply_enrichment(inst, enrichment):
    """Attach a Wikidata identifier and a provenance record to *inst* in place.

    ``enrichment`` is one value from ENRICHMENTS (keys: name, qid, label,
    description).
    """
    qid = enrichment['qid']
    # Normalize a missing/None identifiers list before appending.
    if inst.get('identifiers') is None:
        inst['identifiers'] = []
    inst['identifiers'].append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': qid,
        'identifier_url': f"https://www.wikidata.org/wiki/{qid}",
    })

    # Record how/when the Q-number was added so the change is auditable.
    if inst.get('provenance') is None:
        inst['provenance'] = {}
    provenance = inst['provenance']
    if provenance.get('enrichment_history') is None:
        provenance['enrichment_history'] = []
    provenance['enrichment_history'].append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': 'Wikidata authenticated entity search (Batch 13)',
        'enrichment_source': 'batch13_enriched.yaml',
        'fields_enriched': ['identifiers.Wikidata'],
        'wikidata_label': enrichment['label'],
        'wikidata_description': enrichment['description'],
    })


def _report_coverage(institutions):
    """Print Wikidata coverage statistics for Brazilian institutions.

    Guards against an empty Brazilian subset so the percentage computation
    cannot divide by zero.
    """
    brazil_institutions = [
        inst for inst in institutions
        if isinstance(inst, dict) and any(
            loc.get('country') == 'BR'
            for loc in (inst.get('locations') or [])
            if isinstance(loc, dict)
        )
    ]
    brazil_with_wikidata = [
        inst for inst in brazil_institutions if _has_wikidata(inst)
    ]

    print("\n" + "=" * 80)
    print("WIKIDATA COVERAGE (BRAZIL)")
    print("=" * 80)
    print(f"Total Brazilian institutions: {len(brazil_institutions)}")
    print(f"With Wikidata Q-numbers: {len(brazil_with_wikidata)}")
    if brazil_institutions:
        coverage = len(brazil_with_wikidata) / len(brazil_institutions) * 100
        print(f"Coverage: {coverage:.1f}%")
        # Baseline figures hard-coded from the state before this batch.
        print("Previous coverage: 57.0% (69/121)")
        print(f"Improvement: +{len(brazil_with_wikidata) - 69} institutions (+{coverage - 57.0:.1f}%)")
    else:
        print("Coverage: n/a (no Brazilian institutions found)")


def main():
    """Merge the batch-13 Wikidata enrichments into the master dataset.

    For each institution listed in ENRICHMENTS that does not already have a
    Wikidata identifier, adds the verified Q-number plus a provenance entry,
    then backs up and rewrites the dataset and reports Brazil-wide Wikidata
    coverage. No file is written when nothing changed.
    """
    dataset_path = Path("data/instances/all/globalglam-20251111.yaml")
    backup_path = Path("data/instances/all/globalglam-20251111.yaml.bak.batch13")

    print("=" * 80)
    print("BRAZIL BATCH 13 WIKIDATA ENRICHMENT MERGE")
    print("=" * 80)

    institutions = _load_institutions(dataset_path)

    enriched_count = 0
    skipped_count = 0
    errors = []  # reserved for per-record failures; currently always empty

    print("\n" + "=" * 80)
    print("PROCESSING ENRICHMENTS")
    print("=" * 80)

    for inst in institutions:
        if not isinstance(inst, dict):
            continue

        inst_id = str(inst.get('id', ''))
        enrichment = ENRICHMENTS.get(inst_id)
        if enrichment is None:
            continue

        # Never overwrite an existing Q-number: skip and report instead.
        if _has_wikidata(inst):
            print(f"\n⚠ SKIP: {inst.get('name')} (ID: {inst_id})")
            print("  Already has Wikidata Q-number")
            skipped_count += 1
            continue

        _apply_enrichment(inst, enrichment)
        print(f"\n✓ ENRICHED: {inst.get('name')}")
        print(f"  ID: {inst_id}")
        print(f"  Q-number: {enrichment['qid']}")
        print(f"  Label: {enrichment['label']}")
        enriched_count += 1

    # Summary
    print("\n" + "=" * 80)
    print("MERGE SUMMARY")
    print("=" * 80)
    print(f"Institutions enriched: {enriched_count}")
    print(f"Institutions skipped: {skipped_count}")
    print(f"Errors: {len(errors)}")
    if errors:
        print("\nErrors:")
        for error in errors:
            print(f"  - {error}")

    if enriched_count == 0:
        print("\nNo changes made - no merge performed.")
        return

    # Back up the original before rewriting it in place.
    print(f"\nCreating backup: {backup_path}")
    shutil.copy2(dataset_path, backup_path)

    print(f"Writing updated dataset: {dataset_path}")
    with open(dataset_path, 'w', encoding='utf-8') as f:
        yaml.dump_all(
            [institutions],
            f,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=1000,
        )

    print("\n✓ Merge completed successfully!")
    _report_coverage(institutions)
|
|
|
|
# Entry point: run the merge only when executed directly, not on import.
if __name__ == "__main__":
    main()
|