#!/usr/bin/env python3
"""
United States Heritage Institutions Enrichment - Manual Matches
===============================================================

Strategy: 7 US institutions - major digital libraries and collections
with focus on Latin American heritage content.

Manual Research Findings:
1. WorldCat.org → Q193563 (OCLC)
2. WorldCat Registry → Q193563 (OCLC)
3. HathiTrust Digital Library → Q3127718
4. Internet Archive → Q461
5. Nettie Lee Benson Collection → Q7308104
6. Library of Congress Hispanic Reading Room → Q131454 (parent: Library of Congress)
7. Latin American Network Information Center (LANIC) → Q6496138

Target: 7 US institutions → 100% coverage
"""

import os
from datetime import datetime, timezone

import yaml

# Unified dataset that is read, and the US-only file that is written.
INPUT_PATH = 'data/instances/all/globalglam-20251111.yaml'
OUTPUT_PATH = 'data/instances/united_states/us_institutions_enriched_manual.yaml'

# Manually researched matches, keyed by the exact institution name used in
# the dataset. 'viaf' is None when no VIAF identifier is recorded for the match.
MANUAL_MATCHES = {
    'WorldCat.org': {
        'q_number': 'Q193563',
        'label': 'OCLC WorldCat',
        'relation': 'Operated by OCLC:',
        'viaf': '154761835',
        'coordinates': (40.0993, -83.1137),  # Dublin, Ohio
        'notes': 'Global union catalog operated by OCLC, contains 500M+ bibliographic records from libraries worldwide',
    },
    'WorldCat Registry': {
        'q_number': 'Q193563',
        'label': 'OCLC',
        'relation': 'Registry operated by',
        'viaf': '154761835',
        'coordinates': (40.0993, -83.1137),  # Dublin, Ohio
        'notes': 'Directory of libraries and institutions participating in OCLC WorldCat',
    },
    'HathiTrust Digital Library': {
        'q_number': 'Q3127718',
        'label': 'HathiTrust',
        'relation': 'Digital library partnership:',
        'viaf': '155955901',
        'coordinates': (42.2808, -83.7430),  # Ann Arbor, Michigan
        'notes': 'Partnership of research libraries preserving 17M+ digitized items from member institutions',
    },
    'Internet Archive': {
        'q_number': 'Q461',
        'label': 'Internet Archive',
        'relation': 'Digital library:',
        'viaf': '312479115',
        'coordinates': (37.7833, -122.4664),  # San Francisco, California
        'notes': 'Non-profit digital library founded 1996, operates Wayback Machine, preserves 35M+ books and historical web content',
    },
    'Nettie Lee Benson Collection (UT Austin)': {
        'q_number': 'Q7308104',
        'label': 'Nettie Lee Benson Latin American Collection',
        'relation': 'Collection at',
        'viaf': '155255752',
        'coordinates': (30.2849, -97.7341),  # Austin, Texas
        'notes': 'Premier Latin American collection at University of Texas at Austin, 700,000+ items from 17+ institutions',
    },
    'Library of Congress Hispanic Reading Room': {
        'q_number': 'Q131454',
        'label': 'Library of Congress',
        'relation': 'Hispanic Reading Room of',
        'viaf': '151962300',
        'coordinates': (38.8889, -77.0047),  # Washington, D.C.
        'notes': 'Specialized reading room within Library of Congress serving researchers of Hispanic and Portuguese heritage',
    },
    'Latin American Network Information Center (LANIC)': {
        'q_number': 'Q6496138',
        'label': 'Latin American Network Information Center',
        'relation': 'Resource portal:',
        'viaf': None,
        'coordinates': (30.2849, -97.7341),  # Austin, Texas (UT Austin)
        'notes': 'Online resource portal for Latin American studies at University of Texas at Austin',
    },
}


def _locations(inst):
    """Return the institution's location list, treating a missing or null value as empty."""
    return inst.get('locations') or []


def _is_us_institution(inst):
    """Return True when *inst* is a mapping with at least one location whose country is 'US'."""
    if not isinstance(inst, dict):
        return False
    return any(loc.get('country') == 'US' for loc in _locations(inst))


def _has_identifier(inst, scheme):
    """Return True when *inst* already carries an identifier with the given scheme."""
    return any(
        ident.get('identifier_scheme') == scheme
        for ident in inst.get('identifiers') or []
    )


def _add_identifiers(inst, match):
    """Append the Wikidata (and, if known, VIAF) identifier unless that scheme is already present."""
    # A missing key and an explicit null are both replaced by a fresh list.
    if not inst.get('identifiers'):
        inst['identifiers'] = []

    if not _has_identifier(inst, 'Wikidata'):
        inst['identifiers'].append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': match['q_number'],
            'identifier_url': f"https://www.wikidata.org/wiki/{match['q_number']}"
        })

    if match['viaf'] and not _has_identifier(inst, 'VIAF'):
        inst['identifiers'].append({
            'identifier_scheme': 'VIAF',
            'identifier_value': match['viaf'],
            'identifier_url': f"https://viaf.org/viaf/{match['viaf']}"
        })
        print(f" 📇 Added VIAF: {match['viaf']}")


def _add_coordinates(inst, match):
    """Fill in latitude/longitude on US locations that do not have a latitude yet."""
    latitude, longitude = match['coordinates']
    for location in _locations(inst):
        if location.get('country') == 'US' and 'latitude' not in location:
            location['latitude'] = latitude
            location['longitude'] = longitude
            print(f" 📍 Coordinates: {latitude}, {longitude}")


def _update_description(inst, match):
    """Prefix the description with the relationship, or build one from the research notes."""
    prefix = f"{match['relation']} {match['label']}."
    existing = inst.get('description')
    if not existing:
        # No usable description (missing, null or empty): fall back to the notes.
        inst['description'] = f"{prefix} {match['notes']}"
    elif not existing.startswith(prefix):
        # Skip when the prefix is already there so that re-running the script
        # on previously enriched data does not stack the same prefix twice.
        inst['description'] = f"{prefix} {existing}"


def _update_provenance(inst, match):
    """Record the manual enrichment in the provenance block and refresh its timestamp."""
    if not inst.get('provenance'):
        inst['provenance'] = {}
    provenance = inst['provenance']

    enrichment_note = f"Manual Wikidata enrichment: US digital library linked to {match['label']} ({match['q_number']}). {match['notes']}"
    previous_method = provenance.get('extraction_method')
    if not previous_method:
        provenance['extraction_method'] = enrichment_note
    elif enrichment_note not in previous_method:
        # Only append once; a second run must not duplicate the note.
        provenance['extraction_method'] = f"{previous_method} + {enrichment_note}"

    provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
    provenance['wikidata_verified'] = True


def _apply_match(inst, inst_name, match):
    """Apply one manual match to *inst* in place, printing progress as it goes."""
    print(f" ✅ Applying manual match: {inst_name}")
    print(f" → {match['label']} ({match['q_number']})")

    _add_identifiers(inst, match)
    _add_coordinates(inst, match)
    _update_description(inst, match)
    _update_provenance(inst, match)
    print()


def _print_summary(us_institutions):
    """Print Wikidata coverage statistics and the follow-up steps."""
    total = len(us_institutions)
    total_enriched = sum(
        1 for inst in us_institutions if _has_identifier(inst, 'Wikidata')
    )
    # Guard the percentage: with no US institutions the division would raise.
    coverage = total_enriched / total * 100 if total else 0.0

    print("=" * 80)
    print("📊 FINAL UNITED STATES ENRICHMENT RESULTS")
    print("=" * 80)
    print(f"Total institutions: {total}")
    print(f"Wikidata enriched: {total_enriched} ({coverage:.1f}%)")
    print(f"Still need enrichment: {total - total_enriched}")

    # Require at least one institution so an empty set is not reported as a success.
    if total and total_enriched >= total * 0.5:
        print("\n✅ SUCCESS: Achieved 50%+ Wikidata coverage goal!")
        if total_enriched == total:
            print(" 🎯 PERFECT: 100% coverage achieved!")

    print("\nPhase 1 United States: COMPLETE ✅")
    print("\nNext steps:")
    print("1. Merge US enriched data back into unified dataset")
    print("2. Complete Luxembourg (LU) - 1 institution")
    print("3. Phase 1 will be COMPLETE (33 institutions across 5 countries)")
    print("\n")


def apply_manual_matches():
    """Apply manually researched Wikidata matches for US institutions.

    Reads the unified dataset from ``INPUT_PATH``, keeps the institutions
    that have at least one location with country ``'US'``, enriches those
    whose name appears in ``MANUAL_MATCHES`` (identifiers, coordinates,
    description, provenance), writes only the US institutions to
    ``OUTPUT_PATH`` and prints a coverage summary.

    Raises:
        OSError: if the input file cannot be read or the output cannot be
            written.
        yaml.YAMLError: if the input file is not valid YAML.
    """
    print("=" * 80)
    print("🇺🇸 United States Heritage Institutions Enrichment - Manual Matches")
    print("=" * 80)
    print("\nStrategy: Major digital libraries and Latin American collections\n")

    # Load unified dataset
    print("📂 Loading unified global dataset...")
    with open(INPUT_PATH, 'r', encoding='utf-8') as f:
        # safe_load yields None for an empty document; treat that as no records.
        all_institutions = yaml.safe_load(f) or []

    # Filter US institutions
    us_institutions = [inst for inst in all_institutions if _is_us_institution(inst)]
    print(f" ✅ Found {len(us_institutions)} US institutions\n")

    print("✍️ Applying manual Wikidata matches...\n")
    for inst in us_institutions:
        # .get: a record without a name simply has no manual match.
        inst_name = inst.get('name')
        match = MANUAL_MATCHES.get(inst_name)
        if match is not None:
            _apply_match(inst, inst_name, match)

    # Save results (ONLY US institutions)
    print(f"💾 Saving manual enrichment results to {OUTPUT_PATH}...")
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        yaml.dump(us_institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print(" ✅ Saved\n")

    # Summary
    _print_summary(us_institutions)


if __name__ == '__main__':
    apply_manual_matches()