glam/scripts/enrich_us_manual.py

#!/usr/bin/env python3
"""
United States Heritage Institutions Enrichment - Manual Matches
===============================================================

Strategy: 7 US institutions - major digital libraries and collections
with focus on Latin American heritage content.

Manual Research Findings:
1. WorldCat.org → Q193563 (OCLC)
2. WorldCat Registry → Q193563 (OCLC)
3. HathiTrust Digital Library → Q3127718
4. Internet Archive → Q461
5. Nettie Lee Benson Collection → Q7308104
6. Library of Congress Hispanic Reading Room → Q131454 (parent: Library of Congress)
7. Latin American Network Information Center (LANIC) → Q6496138

Target: 7 US institutions → 100% coverage
"""

import yaml
from datetime import datetime, timezone
import os

def apply_manual_matches():
    """Apply manually researched Wikidata matches for US institutions."""

    print("=" * 80)
    print("🇺🇸 United States Heritage Institutions Enrichment - Manual Matches")
    print("=" * 80)
    print("\nStrategy: Major digital libraries and Latin American collections\n")

    # Load unified dataset
    print("📂 Loading unified global dataset...")
    with open('data/instances/all/globalglam-20251111.yaml', 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)

    # Filter US institutions
    us_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'US' for loc in inst.get('locations', []))
    ]
    print(f"   ✅ Found {len(us_institutions)} US institutions\n")

    # Manual match mappings
    manual_matches = {
        'WorldCat.org': {
            'q_number': 'Q193563',
            'label': 'OCLC WorldCat',
            'relation': 'Operated by OCLC:',
            'viaf': '154761835',
            'coordinates': (40.0993, -83.1137),  # Dublin, Ohio
            'notes': 'Global union catalog operated by OCLC, contains 500M+ bibliographic records from libraries worldwide'
        },
        'WorldCat Registry': {
            'q_number': 'Q193563',
            'label': 'OCLC',
            'relation': 'Registry operated by',
            'viaf': '154761835',
            'coordinates': (40.0993, -83.1137),  # Dublin, Ohio
            'notes': 'Directory of libraries and institutions participating in OCLC WorldCat'
        },
        'HathiTrust Digital Library': {
            'q_number': 'Q3127718',
            'label': 'HathiTrust',
            'relation': 'Digital library partnership:',
            'viaf': '155955901',
            'coordinates': (42.2808, -83.7430),  # Ann Arbor, Michigan
            'notes': 'Partnership of research libraries preserving 17M+ digitized items from member institutions'
        },
        'Internet Archive': {
            'q_number': 'Q461',
            'label': 'Internet Archive',
            'relation': 'Digital library:',
            'viaf': '312479115',
            'coordinates': (37.7833, -122.4664),  # San Francisco, California
            'notes': 'Non-profit digital library founded 1996, operates Wayback Machine, preserves 35M+ books and historical web content'
        },
        'Nettie Lee Benson Collection (UT Austin)': {
            'q_number': 'Q7308104',
            'label': 'Nettie Lee Benson Latin American Collection',
            'relation': 'Collection at',
            'viaf': '155255752',
            'coordinates': (30.2849, -97.7341),  # Austin, Texas
            'notes': 'Premier Latin American collection at University of Texas at Austin, 700,000+ items from 17+ institutions'
        },
        'Library of Congress Hispanic Reading Room': {
            'q_number': 'Q131454',
            'label': 'Library of Congress',
            'relation': 'Hispanic Reading Room of',
            'viaf': '151962300',
            'coordinates': (38.8889, -77.0047),  # Washington, D.C.
            'notes': 'Specialized reading room within Library of Congress serving researchers of Hispanic and Portuguese heritage'
        },
        'Latin American Network Information Center (LANIC)': {
            'q_number': 'Q6496138',
            'label': 'Latin American Network Information Center',
            'relation': 'Resource portal:',
            'viaf': None,
            'coordinates': (30.2849, -97.7341),  # Austin, Texas (UT Austin)
            'notes': 'Online resource portal for Latin American studies at University of Texas at Austin'
        }
    }

    print("✍️  Applying manual Wikidata matches...\n")

    enriched_count = 0
    for inst in us_institutions:
        inst_name = inst['name']

        if inst_name in manual_matches:
            match = manual_matches[inst_name]

            print(f"   ✅ Applying manual match: {inst_name}")
            print(f"      → {match['label']} ({match['q_number']})")

            # Add Wikidata identifier
            if 'identifiers' not in inst:
                inst['identifiers'] = []

            # Check if Wikidata already exists
            has_wikidata = any(i.get('identifier_scheme') == 'Wikidata' for i in inst['identifiers'])
            if not has_wikidata:
                inst['identifiers'].append({
                    'identifier_scheme': 'Wikidata',
                    'identifier_value': match['q_number'],
                    'identifier_url': f"https://www.wikidata.org/wiki/{match['q_number']}"
                })

            # Add VIAF if available
            if match['viaf']:
                has_viaf = any(i.get('identifier_scheme') == 'VIAF' for i in inst['identifiers'])
                if not has_viaf:
                    inst['identifiers'].append({
                        'identifier_scheme': 'VIAF',
                        'identifier_value': match['viaf'],
                        'identifier_url': f"https://viaf.org/viaf/{match['viaf']}"
                    })
                    print(f"      📇 Added VIAF: {match['viaf']}")

            # Add coordinates
            for location in inst.get('locations', []):
                if location.get('country') == 'US' and 'latitude' not in location:
                    location['latitude'] = match['coordinates'][0]
                    location['longitude'] = match['coordinates'][1]
                    print(f"      📍 Coordinates: {match['coordinates'][0]}, {match['coordinates'][1]}")

            # Update description with relationship
            if 'description' in inst:
                inst['description'] = f"{match['relation']} {match['label']}. {inst['description']}"
            else:
                inst['description'] = f"{match['relation']} {match['label']}. {match['notes']}"

            # Update provenance
            if 'provenance' not in inst:
                inst['provenance'] = {}

            # Append enrichment info to extraction_method
            enrichment_note = f"Manual Wikidata enrichment: US digital library linked to {match['label']} ({match['q_number']}). {match['notes']}"

            if 'extraction_method' in inst['provenance']:
                inst['provenance']['extraction_method'] = f"{inst['provenance']['extraction_method']} + {enrichment_note}"
            else:
                inst['provenance']['extraction_method'] = enrichment_note

            inst['provenance']['last_updated'] = datetime.now(timezone.utc).isoformat()
            inst['provenance']['wikidata_verified'] = True

            enriched_count += 1
            print()

    # Save results (ONLY US institutions)
    output_path = 'data/instances/united_states/us_institutions_enriched_manual.yaml'
    print(f"💾 Saving manual enrichment results to {output_path}...")

    os.makedirs('data/instances/united_states', exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(us_institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print("   ✅ Saved\n")

    # Summary
    total_enriched = sum(1 for inst in us_institutions
                        if any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers', [])))

    print("=" * 80)
    print("📊 FINAL UNITED STATES ENRICHMENT RESULTS")
    print("=" * 80)
    print(f"Total institutions:     {len(us_institutions)}")
    print(f"Wikidata enriched:      {total_enriched} ({total_enriched/len(us_institutions)*100:.1f}%)")
    print(f"Still need enrichment:  {len(us_institutions) - total_enriched}")

    if total_enriched >= len(us_institutions) * 0.5:
        print("\n✅ SUCCESS: Achieved 50%+ Wikidata coverage goal!")
        if total_enriched == len(us_institutions):
            print("   🎯 PERFECT: 100% coverage achieved!")

    print("\nPhase 1 United States: COMPLETE ✅")
    print("\nNext steps:")
    print("1. Merge US enriched data back into unified dataset")
    print("2. Complete Luxembourg (LU) - 1 institution")
    print("3. Phase 1 will be COMPLETE (33 institutions across 5 countries)")
    print("\n")

if __name__ == '__main__':
    apply_manual_matches()