#!/usr/bin/env python3
"""
Great Britain Heritage Institutions Enrichment - Manual Matches (V2)
===================================================================
Optimized version that only loads/saves GB institutions
"""
import os
from datetime import datetime, timezone

# _enrich_institution is exported alongside the entry point so it can be
# unit-tested without touching the filesystem.
__all__ = ['apply_manual_matches', '_enrich_institution']

# Input: unified global dataset. Output: GB-only enriched subset.
INPUT_PATH = 'data/instances/all/globalglam-20251111.yaml'
OUTPUT_DIR = 'data/instances/great_britain'
OUTPUT_PATH = f'{OUTPUT_DIR}/gb_institutions_enriched_manual.yaml'

# Manually researched Wikidata matches, keyed by institution name as it
# appears in the dataset. Each entry carries the parent organization's
# Q-number, a human-readable relation, optional coordinates/VIAF, and a
# provenance note explaining the research behind the match.
MANUAL_MATCHES = {
    'BILNAS Archive': {
        'q_number': 'Q1333399',
        'label': 'University of Leicester',
        'relation': 'Hosted by',
        'coordinates': (52.621389, -1.125),
        'viaf': None,
        'notes': 'BILNAS Archive is housed at University of Leicester since 2012',
    },
    'British Institute for Libyan and Northern African Studies Digital Archive': {
        'q_number': 'Q1333399',
        'label': 'University of Leicester',
        'relation': 'Hosted by',
        'coordinates': (52.621389, -1.125),
        'viaf': None,
        'notes': 'Digital archive hosted by University of Leicester School of Archaeology',
    },
    'Heritage Gazetteer of Libya': {
        'q_number': 'Q1333399',  # University of Leicester (via BILNAS)
        'label': 'University of Leicester',
        'relation': 'Managed by BILNAS at',
        'coordinates': None,  # No specific location
        'viaf': None,
        'notes': 'Heritage Gazetteer managed by Society for Libyan Studies, archived at University of Leicester',
    },
    'Endangered Archaeology in the Middle East and North Africa Database': {
        'q_number': 'Q55098884',
        'label': 'School of Archaeology, University of Oxford',
        'relation': 'Managed by',
        'coordinates': (51.7548, -1.2544),  # Oxford approximate
        'viaf': '156266402',
        'notes': 'EAMENA project led by University of Oxford School of Archaeology in partnership with Leicester and Durham',
    },
}


def _load_gb_institutions():
    """Load the unified dataset and return only the GB institutions.

    Returns:
        list[dict]: institutions with at least one location whose
        ``country`` is ``'GB'``. Empty list if the file is empty.
    """
    # PyYAML is imported lazily so importing this module does not require
    # it; it is only needed when actually reading/writing the dataset.
    import yaml

    print("šŸ“‚ Loading unified global dataset...")
    with open(INPUT_PATH, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; normalize to [].
        all_institutions = yaml.safe_load(f) or []

    gb_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'GB' for loc in inst.get('locations', []))
    ]
    print(f"   āœ… Found {len(gb_institutions)} GB institutions\n")
    return gb_institutions


def _enrich_institution(inst, match):
    """Apply one manual match to *inst* in place.

    Adds a Wikidata identifier (and VIAF where researched), fills in
    missing GB coordinates, prefixes the description with the
    relationship to the parent organization, and records the enrichment
    in provenance. Identifier additions are idempotent: an existing
    Wikidata/VIAF identifier is never duplicated.

    Args:
        inst: mutable institution record (dict) from the dataset.
        match: one value from ``MANUAL_MATCHES``.
    """
    identifiers = inst.setdefault('identifiers', [])

    # Only add Wikidata if the record doesn't already carry one.
    if not any(i.get('identifier_scheme') == 'Wikidata' for i in identifiers):
        identifiers.append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': match['q_number'],
            'identifier_url': f"https://www.wikidata.org/wiki/{match['q_number']}",
        })

    # VIAF is optional; same no-duplicates rule as Wikidata.
    if match['viaf']:
        if not any(i.get('identifier_scheme') == 'VIAF' for i in identifiers):
            identifiers.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': match['viaf'],
                'identifier_url': f"https://viaf.org/viaf/{match['viaf']}",
            })
            print(f"      šŸ“‡ Added VIAF: {match['viaf']}")

    # Fill in coordinates only for GB locations that don't have any yet,
    # so hand-curated coordinates in the dataset are never overwritten.
    if match['coordinates']:
        for location in inst.get('locations', []):
            if location.get('country') == 'GB' and 'latitude' not in location:
                location['latitude'] = match['coordinates'][0]
                location['longitude'] = match['coordinates'][1]
                print(f"      šŸ“ Coordinates: {match['coordinates'][0]}, {match['coordinates'][1]}")

    # Prefix the description with the relationship to the parent org.
    if 'description' in inst:
        inst['description'] = f"{match['relation']} {match['label']}. {inst['description']}"
    else:
        inst['description'] = f"{match['relation']} {match['label']}."

    # Record the enrichment in provenance, appending to any existing
    # extraction_method rather than replacing it.
    provenance = inst.setdefault('provenance', {})
    enrichment_note = (
        f"Manual Wikidata enrichment: Linked to parent organization "
        f"{match['label']} ({match['q_number']}). {match['notes']}"
    )
    if 'extraction_method' in provenance:
        provenance['extraction_method'] = f"{provenance['extraction_method']} + {enrichment_note}"
    else:
        provenance['extraction_method'] = enrichment_note
    provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
    provenance['wikidata_verified'] = True


def _save_gb_institutions(gb_institutions):
    """Write the enriched GB subset to ``OUTPUT_PATH`` (YAML)."""
    import yaml  # deferred; only needed for file I/O

    print(f"šŸ’¾ Saving manual enrichment results to {OUTPUT_PATH}...")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        yaml.dump(gb_institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)
    print("   āœ… Saved\n")


def _print_summary(gb_institutions):
    """Print Wikidata coverage statistics for the GB subset."""
    total = len(gb_institutions)
    total_enriched = sum(
        1 for inst in gb_institutions
        if any(i.get('identifier_scheme') == 'Wikidata'
               for i in inst.get('identifiers', []))
    )
    # Guard against an empty GB subset (was a ZeroDivisionError).
    coverage = (total_enriched / total * 100) if total else 0.0

    print("=" * 80)
    print("šŸ“Š FINAL GREAT BRITAIN ENRICHMENT RESULTS")
    print("=" * 80)
    print(f"Total institutions: {total}")
    print(f"Wikidata enriched: {total_enriched} ({coverage:.1f}%)")
    print(f"Still need enrichment: {total - total_enriched}")

    if total and total_enriched >= total * 0.5:
        print("\nāœ… SUCCESS: Achieved 50%+ Wikidata coverage goal!")
        if total_enriched == total:
            print("   šŸŽÆ PERFECT: 100% coverage achieved!")

    print("\nPhase 1 Great Britain: COMPLETE āœ…")
    print("\nNext steps:")
    print("1. Merge GB enriched data back into unified dataset")
    print("2. Apply same methodology to Belgium (BE) - 7 institutions")
    print("3. Continue to United States (US) - 7 institutions")
    print("4. Complete Luxembourg (LU) - 1 institution")
    print("\n")


def apply_manual_matches():
    """Apply manually researched Wikidata matches to GB institutions.

    Loads the unified global dataset, enriches the GB subset using
    ``MANUAL_MATCHES``, saves the GB-only result, and prints a summary.

    Returns:
        int: number of institutions enriched in this run (previously
        computed but never surfaced; callers ignoring the return value
        are unaffected).
    """
    print("=" * 80)
    print("šŸ‡¬šŸ‡§ Great Britain Heritage Institutions Enrichment - Manual Matches")
    print("=" * 80)
    print("\nStrategy: Direct Wikidata assignment based on research\n")

    gb_institutions = _load_gb_institutions()

    print("āœļø Applying manual Wikidata matches...\n")
    enriched_count = 0
    for inst in gb_institutions:
        # .get avoids a KeyError on records that lack a 'name' field.
        match = MANUAL_MATCHES.get(inst.get('name'))
        if match is None:
            continue
        print(f"   āœ… Applying manual match: {inst['name']}")
        print(f"      → {match['label']} ({match['q_number']})")
        _enrich_institution(inst, match)
        enriched_count += 1
        print()

    _save_gb_institutions(gb_institutions)
    _print_summary(gb_institutions)
    return enriched_count


if __name__ == '__main__':
    apply_manual_matches()