#!/usr/bin/env python3
"""
Manual enrichment for Italian institutions with Wikidata identifiers.

Research findings:
1. Giovio Musaeum - Already has Wikidata (Q3868171)
2. European University Institute - Archives → Q1378099 (parent institution)
3. European University Institute - Library → Q1378099 (parent institution)

Note: EUI sub-units (Archives/Library) do not have separate Wikidata
entities, so both use the parent institution Q1378099.
"""
import os
from datetime import datetime, timezone

import yaml

# Manually researched identifiers, keyed by institution id. Each entry has a
# required 'wikidata' QID plus optional 'viaf', 'coordinates', 'website', and
# an explanatory 'note' that is appended to the institution description.
ENRICHMENTS = {
    'EUR-EUI0001': {  # European University Institute - Archives
        'wikidata': 'Q1378099',
        'viaf': '133087619',
        'coordinates': {'latitude': 43.7524, 'longitude': 11.2947},
        'website': 'https://www.eui.eu/Research/HistoricalArchivesOfEU',
        'note': 'Using parent institution Q1378099 (EUI) as no separate Wikidata entity exists for Archives',
    },
    'EUR-EUI0002': {  # European University Institute - Library
        'wikidata': 'Q1378099',
        'viaf': '133087619',
        'coordinates': {'latitude': 43.7524, 'longitude': 11.2947},
        'website': 'https://www.eui.eu/Research/Library',
        'note': 'Using parent institution Q1378099 (EUI) as no separate Wikidata entity exists for Library',
    },
}


def _ensure_identifier(inst, scheme, value, url):
    """Append an identifier of *scheme* to *inst* unless one already exists.

    Returns True when a new identifier was added, False if the scheme was
    already present (existing values are never overwritten).
    """
    if any(i.get('identifier_scheme') == scheme for i in inst['identifiers']):
        return False
    inst['identifiers'].append({
        'identifier_scheme': scheme,
        'identifier_value': value,
        'identifier_url': url,
    })
    return True


def _apply_enrichment(inst, data):
    """Merge one ENRICHMENTS entry into an institution record, in place."""
    inst.setdefault('identifiers', [])

    if _ensure_identifier(
        inst, 'Wikidata', data['wikidata'],
        f"https://www.wikidata.org/wiki/{data['wikidata']}",
    ):
        print(f"   āœ… Added Wikidata {data['wikidata']}: {inst['name']}")

    if data.get('viaf') and _ensure_identifier(
        inst, 'VIAF', data['viaf'],
        f"https://viaf.org/viaf/{data['viaf']}",
    ):
        print(f"      + VIAF {data['viaf']}")

    if data.get('website') and _ensure_identifier(
        inst, 'Website', data['website'], data['website'],
    ):
        print("      + Website")

    # Coordinates go on the first location only, and only if it has none yet.
    if inst.get('locations') and data.get('coordinates'):
        loc = inst['locations'][0]
        if 'latitude' not in loc:
            loc.update(data['coordinates'])
            print("      + Coordinates")

    # Record how and when this record was enriched.
    inst.setdefault('provenance', {})
    inst['provenance']['last_enrichment_date'] = (
        datetime.now(timezone.utc).isoformat()
    )
    inst['provenance']['enrichment_method'] = 'Manual Wikidata research'

    # Preserve the research note. Previously the note was silently dropped
    # when the record had no description field at all.
    note = data.get('note')
    if note:
        if 'description' in inst:
            inst['description'] = (
                f"{inst['description']} [Wikidata note: {note}]"
            )
        else:
            inst['description'] = f"[Wikidata note: {note}]"


def enrich_italian_institutions(
    input_path='data/instances/all/globalglam-20251111.yaml',
    output_path='data/instances/italy/it_institutions_enriched_manual.yaml',
):
    """Apply manual Wikidata/VIAF enrichments to Italian institutions.

    Loads the unified dataset from *input_path*, filters to institutions
    with an IT location, merges the ENRICHMENTS entries into matching
    records, reports Wikidata coverage, and writes the enriched Italian
    subset to *output_path*.

    Parameters default to the historical hard-coded paths, so existing
    callers (including the __main__ guard) are unaffected.
    """
    print("=" * 80)
    print("šŸ‡®šŸ‡¹ Italian Institutions Manual Enrichment")
    print("=" * 80)

    # Load unified dataset
    print("\nšŸ“‚ Loading unified dataset...")
    with open(input_path, 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)
    print(f"   āœ… Loaded {len(all_institutions)} institutions")

    # Extract Italian institutions
    italian = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'IT'
               for loc in inst.get('locations', []))
    ]
    print(f"\nšŸ” Found {len(italian)} Italian institutions")

    print("\nšŸ”„ Applying enrichments...")
    enriched_count = 0
    for inst in italian:
        data = ENRICHMENTS.get(inst.get('id'))
        if data is not None:
            _apply_enrichment(inst, data)
            enriched_count += 1

    print(f"\n   šŸ“Š Total enriched: {enriched_count}")

    # Calculate coverage
    italian_with_wd = sum(
        1 for inst in italian
        if any(i.get('identifier_scheme') == 'Wikidata'
               for i in inst.get('identifiers', []))
    )

    print("\n" + "=" * 80)
    print("šŸ“Š Italian Institutions - Wikidata Coverage")
    print("=" * 80)
    print(f"Total Italian institutions: {len(italian)}")
    print(f"With Wikidata identifiers: {italian_with_wd}")
    # Guard the percentage: an empty Italian subset previously raised
    # ZeroDivisionError here.
    if italian:
        print(f"Coverage: {italian_with_wd / len(italian) * 100:.1f}%")
        if italian_with_wd == len(italian):
            print("\nāœ… SUCCESS: 100% Wikidata coverage achieved!")

    # Save enriched Italian institutions
    print(f"\nšŸ’¾ Saving enriched Italian institutions to {output_path}...")
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(italian, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)
    print("   āœ… Saved")

    print("\nšŸ”„ Next step: Run scripts/merge_it_enriched.py to merge back into unified dataset")
    print()


if __name__ == '__main__':
    enrich_italian_institutions()