#!/usr/bin/env python3
"""
Unify All GLAM Datasets - Comprehensive Global Integration

This script unifies all heritage institution datasets from individual countries
into a single comprehensive global dataset at data/instances/all/

Features:
- Merges all country-specific YAML files
- Deduplicates by ID and coordinates
- Tracks data provenance by country
- Generates comprehensive statistics
- Identifies records needing enrichment (missing Q-numbers, coordinates, etc.)

Country Sources:
- Brazil: brazilian_institutions_batch6_enriched.yaml (115 institutions)
- Chile: chilean_institutions_batch19_enriched.yaml (90 institutions, 78.9% Wikidata)
- Mexico: mexican_institutions_geocoded.yaml (117 institutions)
- Japan: jp_institutions_resolved.yaml (12,065 institutions)
- Libya: libyan_institutions.yaml (54 institutions)
- Tunisia: tunisian_institutions.yaml (42 institutions)
- Algeria: algerian_institutions.yaml (20 institutions)
- Vietnam: vietnamese_glam_institutions.yaml (21 institutions)
- Georgia: georgia_glam_institutions.yaml (14 institutions)
- Global: global_heritage_institutions_merged.yaml (13,396 institutions)

Output:
- data/instances/all/globalglam-20251111.yaml
- data/instances/all/UNIFICATION_REPORT.md
- data/instances/all/ENRICHMENT_CANDIDATES.yaml (records needing enrichment)
"""

import os
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List

# NOTE: PyYAML (`yaml`) is imported lazily inside the functions that need it,
# so the pure helper functions in this module stay importable without it.


def load_yaml_safe(filepath: Path) -> List[Dict]:
    """Load a YAML file, returning a list of institution dicts.

    Returns [] (and prints a diagnostic) when the file is empty, is not a
    top-level list, or fails to parse — callers never have to handle errors.
    """
    import yaml  # lazy: see module-level NOTE

    print(f"Loading: {filepath.name}")
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        if not data:
            print(f" ⚠️ Empty file: {filepath.name}")
            return []
        if isinstance(data, list):
            print(f" ✅ Loaded {len(data)} institutions")
            return data
        print(f" ⚠️ Unexpected format (not a list): {type(data)}")
        return []
    except Exception as e:
        # Broad by design: a single unreadable source file must not abort
        # the whole unification run.
        print(f" ❌ Error loading {filepath.name}: {e}")
        return []


def get_country_code(inst: Dict) -> str:
    """Extract the country code from an institution record.

    Prefers an explicit ``country`` on a location; falls back to the
    two-letter path segment of an ID like ``.../cl/...``; else 'UNKNOWN'.
    """
    for loc in inst.get('locations') or []:
        if loc.get('country'):
            return loc['country']
    inst_id = inst.get('id')
    if inst_id:
        parts = inst_id.split('/')
        # Second-to-last segment is the country in our ID convention.
        if len(parts) >= 2 and len(parts[-2]) == 2:
            return parts[-2].upper()
    return 'UNKNOWN'


def has_wikidata(inst: Dict) -> bool:
    """Return True if the institution carries a Wikidata identifier."""
    identifiers = inst.get('identifiers') or []
    # `entry` (not `id`) to avoid shadowing the builtin.
    return any(entry.get('identifier_scheme') == 'Wikidata' for entry in identifiers)


def has_coordinates(inst: Dict) -> bool:
    """Return True if any location has both latitude and longitude set."""
    locations = inst.get('locations') or []
    return any(
        loc.get('latitude') is not None and loc.get('longitude') is not None
        for loc in locations
    )


def needs_enrichment(inst: Dict) -> Dict[str, bool]:
    """Identify which enrichment steps an institution still needs.

    Returns a dict of flags: wikidata, coordinates, website, description
    (True means the field is missing/insufficient).
    """
    identifiers = inst.get('identifiers') or []
    description = inst.get('description') or ''
    return {
        'wikidata': not has_wikidata(inst),
        'coordinates': not has_coordinates(inst),
        'website': not any(
            entry.get('identifier_scheme') == 'Website' for entry in identifiers
        ),
        # Descriptions shorter than 50 chars are considered insufficient.
        'description': len(description) < 50,
    }


def _pct(part: int, whole: int) -> float:
    """Percentage of *part* in *whole*; 0.0 when whole is 0 (no ZeroDivisionError)."""
    return part / whole * 100 if whole else 0.0


def main():
    """Main unification workflow."""
    import yaml  # lazy: see module-level NOTE

    base_dir = Path('/Users/kempersc/apps/glam/data/instances')
    output_dir = base_dir / 'all'
    output_dir.mkdir(exist_ok=True)

    print("\n" + "=" * 80)
    print("GLAM Dataset Unification - Global Integration")
    print("=" * 80 + "\n")

    # Define data sources (most recent files for each country)
    sources = {
        'chile': base_dir / 'chile' / 'chilean_institutions_batch19_enriched.yaml',
        'brazil': base_dir / 'brazil' / 'brazilian_institutions_batch6_enriched.yaml',
        'mexico': base_dir / 'mexico' / 'mexican_institutions_geocoded.yaml',
        'japan': base_dir / 'japan' / 'jp_institutions_resolved.yaml',
        'libya': base_dir / 'libya' / 'libyan_institutions.yaml',
        'tunisia': base_dir / 'tunisia' / 'tunisian_institutions.yaml',
        'algeria': base_dir / 'algeria' / 'algerian_institutions.yaml',
        'vietnam': base_dir / 'vietnamese_glam_institutions.yaml',
        'georgia': base_dir / 'georgia_glam_institutions.yaml',
        'historical': base_dir / 'historical_institutions_validation.yaml',
        'global': base_dir / 'global' / 'global_heritage_institutions_merged.yaml',
    }

    # Load all datasets
    all_institutions = []
    source_stats = {}
    for source_name, filepath in sources.items():
        if not filepath.exists():
            print(f"⚠️ Skipping {source_name}: file not found")
            continue
        institutions = load_yaml_safe(filepath)

        # Add source tracking to provenance
        for inst in institutions:
            if 'provenance' not in inst:
                inst['provenance'] = {}
            inst['provenance']['unification_source'] = source_name
            inst['provenance']['unification_date'] = datetime.now(timezone.utc).isoformat()

        all_institutions.extend(institutions)

        source_stats[source_name] = {
            'total': len(institutions),
            'with_wikidata': sum(1 for i in institutions if has_wikidata(i)),
            'with_coordinates': sum(1 for i in institutions if has_coordinates(i)),
        }

    print(f"\n📊 Total institutions loaded: {len(all_institutions)}")

    # Deduplicate by ID — single pass over a dict keyed by ID (the previous
    # approach rebuilt the whole unique list on every replacement, O(n^2)).
    print("\n🔍 Deduplicating by ID...")
    unique_by_id: Dict[str, Dict] = {}   # ID -> best record seen so far
    unique_institutions: List[Dict] = [] # records without an ID (can't dedupe)
    duplicates = []

    for inst in all_institutions:
        inst_id = inst.get('id')
        if not inst_id:
            unique_institutions.append(inst)
            continue
        source = inst['provenance'].get('unification_source', 'unknown')
        existing = unique_by_id.get(inst_id)
        if existing is None:
            unique_by_id[inst_id] = inst
        else:
            existing_source = existing['provenance'].get('unification_source', 'unknown')
            duplicates.append({'id': inst_id, 'sources': [existing_source, source]})
            # Keep the more enriched record (prioritize those with Wikidata).
            if has_wikidata(inst) and not has_wikidata(existing):
                unique_by_id[inst_id] = inst

    unique_institutions.extend(unique_by_id.values())

    print(f" ✅ Unique institutions: {len(unique_institutions)}")
    print(f" ⚠️ Duplicates removed: {len(duplicates)}")

    # Calculate enrichment statistics — compute needs ONCE per institution
    # (previously recomputed for every stat and again per candidate).
    print("\n📈 Calculating enrichment statistics...")
    inst_needs = [(inst, needs_enrichment(inst)) for inst in unique_institutions]
    enrichment_stats = {
        'total': len(unique_institutions),
        'with_wikidata': sum(1 for i in unique_institutions if has_wikidata(i)),
        'with_coordinates': sum(1 for i in unique_institutions if has_coordinates(i)),
        'needs_wikidata': sum(1 for _, n in inst_needs if n['wikidata']),
        'needs_coordinates': sum(1 for _, n in inst_needs if n['coordinates']),
        'needs_website': sum(1 for _, n in inst_needs if n['website']),
        'needs_description': sum(1 for _, n in inst_needs if n['description']),
    }

    # Group by country
    by_country = defaultdict(list)
    for inst in unique_institutions:
        by_country[get_country_code(inst)].append(inst)

    print(f"\n🌍 Countries covered: {len(by_country)}")
    for country, insts in sorted(by_country.items(), key=lambda x: len(x[1]), reverse=True):
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        wikidata_pct = _pct(wikidata_count, len(insts))
        print(f" {country}: {len(insts)} institutions ({wikidata_count}/{len(insts)} = {wikidata_pct:.1f}% Wikidata)")

    # Identify enrichment candidates
    print("\n🎯 Identifying enrichment candidates...")
    enrichment_candidates = []
    for inst, needs in inst_needs:
        if any(needs.values()):
            enrichment_candidates.append({
                'id': inst.get('id'),
                'name': inst.get('name'),
                'country': get_country_code(inst),
                'institution_type': inst.get('institution_type'),
                'needs': needs,
                'priority_score': sum(needs.values()),  # Higher = more needs
            })

    # Sort by priority
    enrichment_candidates.sort(key=lambda x: x['priority_score'], reverse=True)
    print(f" 🔍 Found {len(enrichment_candidates)} institutions needing enrichment")
    print(f" - Need Wikidata: {enrichment_stats['needs_wikidata']}")
    print(f" - Need coordinates: {enrichment_stats['needs_coordinates']}")
    print(f" - Need website: {enrichment_stats['needs_website']}")
    print(f" - Need description: {enrichment_stats['needs_description']}")

    # Save unified dataset
    output_file = output_dir / 'globalglam-20251111.yaml'
    print(f"\n💾 Saving unified dataset to: {output_file.name}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(unique_institutions, f, allow_unicode=True, sort_keys=False, width=120)
    print(f" ✅ Saved {len(unique_institutions)} institutions")

    # Save enrichment candidates
    candidates_file = output_dir / 'ENRICHMENT_CANDIDATES.yaml'
    print(f"\n💾 Saving enrichment candidates to: {candidates_file.name}")
    with open(candidates_file, 'w', encoding='utf-8') as f:
        yaml.dump(enrichment_candidates, f, allow_unicode=True, sort_keys=False)
    print(f" ✅ Saved {len(enrichment_candidates)} candidates")

    # Generate unification report
    report_file = output_dir / 'UNIFICATION_REPORT.md'
    print(f"\n📄 Generating unification report: {report_file.name}")

    total = enrichment_stats['total']
    report = f"""# GLAM Dataset Unification Report

**Generated**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}

## Executive Summary

- **Total Institutions**: {len(unique_institutions):,}
- **Countries Covered**: {len(by_country)}
- **Wikidata Coverage**: {enrichment_stats['with_wikidata']:,}/{total:,} ({_pct(enrichment_stats['with_wikidata'], total):.1f}%)
- **Geocoding Coverage**: {enrichment_stats['with_coordinates']:,}/{total:,} ({_pct(enrichment_stats['with_coordinates'], total):.1f}%)
- **Duplicates Removed**: {len(duplicates)}

## Data Sources

"""
    for source_name, stats in sorted(source_stats.items()):
        wikidata_pct = _pct(stats['with_wikidata'], stats['total'])
        geocode_pct = _pct(stats['with_coordinates'], stats['total'])
        report += f"""### {source_name.title()}

- Total: {stats['total']:,} institutions
- Wikidata: {stats['with_wikidata']:,} ({wikidata_pct:.1f}%)
- Geocoded: {stats['with_coordinates']:,} ({geocode_pct:.1f}%)

"""

    report += """## Coverage by Country

| Country | Total | Wikidata | Wikidata % | Geocoded | Geocoded % |
|---------|-------|----------|------------|----------|------------|
"""
    for country, insts in sorted(by_country.items(), key=lambda x: len(x[1]), reverse=True):
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        geocode_count = sum(1 for i in insts if has_coordinates(i))
        report += (
            f"| {country} | {len(insts):,} | {wikidata_count:,} | "
            f"{_pct(wikidata_count, len(insts)):.1f}% | {geocode_count:,} | "
            f"{_pct(geocode_count, len(insts)):.1f}% |\n"
        )

    report += f"""
## Enrichment Needs

Total institutions requiring enrichment: **{len(enrichment_candidates):,}** ({_pct(len(enrichment_candidates), len(unique_institutions)):.1f}% of dataset)

### By Enrichment Type

- **Need Wikidata**: {enrichment_stats['needs_wikidata']:,} ({_pct(enrichment_stats['needs_wikidata'], total):.1f}%)
- **Need Coordinates**: {enrichment_stats['needs_coordinates']:,} ({_pct(enrichment_stats['needs_coordinates'], total):.1f}%)
- **Need Website**: {enrichment_stats['needs_website']:,} ({_pct(enrichment_stats['needs_website'], total):.1f}%)
- **Need Description**: {enrichment_stats['needs_description']:,} ({_pct(enrichment_stats['needs_description'], total):.1f}%)

### Priority Distribution (by number of missing fields)

"""
    priority_dist = defaultdict(int)
    for candidate in enrichment_candidates:
        priority_dist[candidate['priority_score']] += 1
    for priority in sorted(priority_dist.keys(), reverse=True):
        count = priority_dist[priority]
        report += f"- **Priority {priority}** ({priority} missing fields): {count:,} institutions\n"

    report += """
## Top 50 Enrichment Candidates (Highest Priority)

| Name | Country | Type | Missing Fields |
|------|---------|------|----------------|
"""
    for candidate in enrichment_candidates[:50]:
        missing = ', '.join(k for k, v in candidate['needs'].items() if v)
        # Guard: `name` may be None in source data — don't crash on len().
        name = candidate['name'] or ''
        name_short = name[:60] + '...' if len(name) > 60 else name
        report += f"| {name_short} | {candidate['country']} | {candidate['institution_type']} | {missing} |\n"

    report += f"""
## Deduplication Details

### Duplicates Found

Total duplicate IDs: {len(duplicates)}

"""
    if duplicates:
        report += "| ID | Sources |\n|----|---------|\n"
        for dup in duplicates[:20]:  # Show first 20
            sources_str = ', '.join(dup['sources'])
            id_short = dup['id'][-50:] if len(dup['id']) > 50 else dup['id']
            report += f"| ...{id_short} | {sources_str} |\n"
        if len(duplicates) > 20:
            report += f"\n*...and {len(duplicates) - 20} more duplicates*\n"

    report += """
## Next Steps

### Immediate Actions

1. **Review Enrichment Candidates**: Check `ENRICHMENT_CANDIDATES.yaml` for institutions needing data
2. **Prioritize Countries**: Focus on countries with low Wikidata coverage:
"""
    # Find countries with lowest Wikidata coverage
    country_coverage = []
    for country, insts in by_country.items():
        if country == 'UNKNOWN':
            continue
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        country_coverage.append((country, _pct(wikidata_count, len(insts)), len(insts)))
    country_coverage.sort(key=lambda x: x[1])  # Sort by coverage ascending
    for country, pct, count in country_coverage[:10]:
        report += f" - {country}: {pct:.1f}% coverage ({count} institutions)\n"

    report += f"""
3. **Batch Enrichment Workflow**:
 - Run Wikidata enrichment for high-priority candidates
 - Run geocoding for missing coordinates
 - Crawl institutional websites for missing data

### Tools Available

- **Wikidata Enrichment**: `scripts/enrich_global_batch.py`
- **Geocoding**: `scripts/geocode_institutions.py`
- **Website Crawling**: `scripts/crawl_institution_websites.py` (to be created)

## Files Generated

1. **globalglam-20251111.yaml** - Complete unified dataset ({len(unique_institutions):,} institutions)
2. **ENRICHMENT_CANDIDATES.yaml** - Institutions needing enrichment ({len(enrichment_candidates):,} candidates)
3. **UNIFICATION_REPORT.md** - This report

---

**Generated by**: `scripts/unify_all_datasets.py`
**Dataset Version**: 1.0
**Schema Version**: LinkML v0.2.1
"""

    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f" ✅ Report saved")

    # Update DATASET_STATISTICS.yaml
    stats_file = output_dir / 'DATASET_STATISTICS.yaml'
    print(f"\n📊 Updating statistics file: {stats_file.name}")
    stats_data = {
        'generated': datetime.now(timezone.utc).isoformat(),
        'project': 'GLAM Data Extraction',
        'schema_version': 'v0.2.1',
        'unified_dataset': {
            'total_institutions': len(unique_institutions),
            'countries_covered': len(by_country),
            'wikidata_coverage': {
                'count': enrichment_stats['with_wikidata'],
                'percentage': round(_pct(enrichment_stats['with_wikidata'], total), 2),
            },
            'geocoding_coverage': {
                'count': enrichment_stats['with_coordinates'],
                'percentage': round(_pct(enrichment_stats['with_coordinates'], total), 2),
            },
            'enrichment_needs': {
                'total_candidates': len(enrichment_candidates),
                'needs_wikidata': enrichment_stats['needs_wikidata'],
                'needs_coordinates': enrichment_stats['needs_coordinates'],
                'needs_website': enrichment_stats['needs_website'],
                'needs_description': enrichment_stats['needs_description'],
            },
        },
        'by_country': {},
    }
    for country, insts in sorted(by_country.items()):
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        geocode_count = sum(1 for i in insts if has_coordinates(i))
        stats_data['by_country'][country] = {
            'total': len(insts),
            'wikidata_coverage': {
                'count': wikidata_count,
                'percentage': round(_pct(wikidata_count, len(insts)), 2),
            },
            'geocoding_coverage': {
                'count': geocode_count,
                'percentage': round(_pct(geocode_count, len(insts)), 2),
            },
        }

    with open(stats_file, 'w', encoding='utf-8') as f:
        yaml.dump(stats_data, f, allow_unicode=True, sort_keys=False)
    print(f" ✅ Statistics updated")

    print("\n" + "=" * 80)
    print("✅ UNIFICATION COMPLETE!")
    print("=" * 80)
    print(f"\n📁 Output files in: {output_dir}/")
    print(f" - globalglam-20251111.yaml ({len(unique_institutions):,} institutions)")
    print(f" - ENRICHMENT_CANDIDATES.yaml ({len(enrichment_candidates):,} candidates)")
    print(f" - UNIFICATION_REPORT.md")
    print(f" - DATASET_STATISTICS.yaml")
    print(f"\n🎯 Ready for global enrichment workflow!")
    print(f" Next: Run enrichment on {enrichment_stats['needs_wikidata']:,} institutions without Wikidata")


if __name__ == '__main__':
    main()