#!/usr/bin/env python3
"""
Build Unified GLAM Heritage Custodian Database

Merges all country-specific LinkML datasets into a unified database with:
- Deduplication by GHCID
- Data quality tracking
- Version control
- Multiple export formats (JSON, SQLite; Parquet planned)
"""

import json
import sqlite3
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any, Tuple
from collections import defaultdict
import hashlib

# Country dataset paths
COUNTRY_DATASETS = {
    'finland': '/Users/kempersc/apps/glam/data/finland_isil/finland_isil_linkml_final_20251120.json',
    'denmark': '/Users/kempersc/apps/glam/data/instances/denmark_complete_enriched.json',
    'netherlands': '/Users/kempersc/apps/glam/data/instances/netherlands_complete.yaml',
    'belgium': '/Users/kempersc/apps/glam/data/instances/belgium_isil.yaml',
    'belarus': '/Users/kempersc/apps/glam/data/instances/belarus_complete.yaml',
    'canada': '/Users/kempersc/apps/glam/data/instances/canada/canadian_heritage_custodians_geocoded.json',
    'chile': '/Users/kempersc/apps/glam/data/instances/chile/chilean_institutions_batch20_enriched.yaml',
    'egypt': '/Users/kempersc/apps/glam/data/instances/egypt_institutions_ghcid.yaml',
    # Japan dataset is 18MB - handle separately
}

OUTPUT_DIR = Path('/Users/kempersc/apps/glam/data/unified')
# NOTE: OUTPUT_DIR is created lazily in build_unified_database() rather than at
# import time, so importing this module has no filesystem side effects.


def _as_record_list(data: Any) -> List[Dict[str, Any]]:
    """Normalize parsed dataset content to a list of institution records.

    A bare list is returned as-is, a {'institutions': [...]} wrapper is
    unwrapped, and any other single object becomes a one-element list.
    """
    if isinstance(data, list):
        return data
    if isinstance(data, dict) and 'institutions' in data:
        return data['institutions']
    return [data]


def load_json_dataset(path: str) -> List[Dict[str, Any]]:
    """Load a JSON format dataset and normalize it to a list of records."""
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return _as_record_list(data)


def load_yaml_dataset(path: str) -> List[Dict[str, Any]]:
    """Load a YAML format dataset and normalize it to a list of records."""
    # Local import: PyYAML is only required when YAML sources are present.
    import yaml
    with open(path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    return _as_record_list(data)


def extract_key_metadata(record: Dict[str, Any]) -> Dict[str, Any]:
    """Extract key metadata from an institution record.

    Defensive against records where 'locations', 'provenance' or
    'identifiers' exist but are explicitly null (dict.get's default does not
    apply when the key is present with a None value).
    """
    locations = record.get('locations') or []
    first_location = locations[0] if locations else {}
    provenance = record.get('provenance') or {}
    identifiers = record.get('identifiers') or []
    schemes = {i.get('identifier_scheme') for i in identifiers}
    return {
        'id': record.get('id'),
        'ghcid': record.get('ghcid'),
        'ghcid_uuid': record.get('ghcid_uuid'),
        'ghcid_numeric': record.get('ghcid_numeric'),
        'name': record.get('name'),
        'institution_type': record.get('institution_type'),
        'country': first_location.get('country') if isinstance(first_location, dict) else None,
        'city': first_location.get('city') if isinstance(first_location, dict) else None,
        'data_source': provenance.get('data_source'),
        'data_tier': provenance.get('data_tier'),
        'extraction_date': provenance.get('extraction_date'),
        'has_wikidata': 'Wikidata' in schemes,
        'has_website': 'Website' in schemes,
        # Keep the full source record for lossless round-tripping.
        'raw_record': json.dumps(record, ensure_ascii=False),
    }


def _update_stats(stats: Dict[str, Any], metadata: Dict[str, Any]) -> None:
    """Fold one institution's metadata into a per-country stats bucket."""
    stats['total'] += 1
    if metadata.get('ghcid'):
        stats['with_ghcid'] += 1
    if metadata.get('has_wikidata'):
        stats['with_wikidata'] += 1
    if metadata.get('has_website'):
        stats['with_website'] += 1
    # `or 'UNKNOWN'` also maps an explicit None type to UNKNOWN; a plain
    # .get(key, 'UNKNOWN') default never fires because the key always exists.
    stats['by_type'][metadata.get('institution_type') or 'UNKNOWN'] += 1


def _load_all_countries() -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Load every configured dataset.

    Returns (flat list of institution metadata, per-country stats dict).
    Missing or broken datasets are reported and skipped (best-effort build).
    """
    all_institutions: List[Dict[str, Any]] = []
    country_stats: Dict[str, Any] = defaultdict(lambda: {
        'total': 0,
        'with_ghcid': 0,
        'with_wikidata': 0,
        'with_website': 0,
        'by_type': defaultdict(int),
    })

    for country, path in COUNTRY_DATASETS.items():
        if not Path(path).exists():
            print(f"āš ļø {country.upper()}: Dataset not found at {path}")
            continue
        print(f"\nšŸ“‚ Loading {country.upper()}...")
        try:
            if path.endswith('.json'):
                records = load_json_dataset(path)
            elif path.endswith(('.yaml', '.yml')):  # accept both YAML extensions
                records = load_yaml_dataset(path)
            else:
                print(f" āš ļø Unknown format: {path}")
                continue
            print(f" āœ… Loaded {len(records)} institutions")
            for record in records:
                metadata = extract_key_metadata(record)
                metadata['source_country'] = country
                all_institutions.append(metadata)
                _update_stats(country_stats[country], metadata)
        except Exception as e:
            # Deliberate best-effort: one bad dataset must not abort the build.
            print(f" āŒ Error loading {country}: {e}")
            continue
    return all_institutions, country_stats


def _deduplicate_by_ghcid(
    all_institutions: List[Dict[str, Any]]
) -> Tuple[Dict[str, Dict[str, Any]], List[Tuple[str, Any, Any]]]:
    """Return (ghcid -> first-seen record, list of (ghcid, dup name, kept name))."""
    ghcid_map: Dict[str, Dict[str, Any]] = {}
    duplicates: List[Tuple[str, Any, Any]] = []
    for inst in all_institutions:
        ghcid = inst.get('ghcid')
        if not ghcid:
            continue  # records without a GHCID cannot be deduplicated
        if ghcid in ghcid_map:
            duplicates.append((ghcid, inst['name'], ghcid_map[ghcid]['name']))
        else:
            ghcid_map[ghcid] = inst
    return ghcid_map, duplicates


def _export_json(all_institutions: List[Dict[str, Any]],
                 country_stats: Dict[str, Any],
                 ghcid_map: Dict[str, Any],
                 duplicates: List[Any]) -> None:
    """Write the unified dataset plus build metadata as a single JSON file."""
    json_output = OUTPUT_DIR / 'glam_unified_database.json'
    with open(json_output, 'w', encoding='utf-8') as f:
        json.dump({
            'metadata': {
                'export_date': datetime.now(timezone.utc).isoformat(),
                'total_institutions': len(all_institutions),
                'unique_ghcids': len(ghcid_map),
                'duplicates': len(duplicates),
                'countries': list(COUNTRY_DATASETS.keys()),
            },
            'country_stats': dict(country_stats),
            'institutions': all_institutions,
        }, f, indent=2, ensure_ascii=False)
    print(f"āœ… JSON export: {json_output}")


# Column order shared by the CREATE TABLE and INSERT statements below.
_INSTITUTION_COLUMNS = (
    'id', 'ghcid', 'ghcid_uuid', 'ghcid_numeric', 'name', 'institution_type',
    'country', 'city', 'source_country', 'data_source', 'data_tier',
    'extraction_date', 'has_wikidata', 'has_website', 'raw_record',
)


def _export_sqlite(all_institutions: List[Dict[str, Any]]) -> None:
    """Write the unified dataset to a SQLite database."""
    sqlite_output = OUTPUT_DIR / 'glam_unified_database.db'
    conn = sqlite3.connect(sqlite_output)
    try:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS institutions (
                id TEXT PRIMARY KEY,
                ghcid TEXT,
                ghcid_uuid TEXT,
                ghcid_numeric INTEGER,
                name TEXT NOT NULL,
                institution_type TEXT,
                country TEXT,
                city TEXT,
                source_country TEXT,
                data_source TEXT,
                data_tier TEXT,
                extraction_date TEXT,
                has_wikidata BOOLEAN,
                has_website BOOLEAN,
                raw_record TEXT
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS metadata (
                key TEXT PRIMARY KEY,
                value TEXT
            )
        ''')
        # Name the columns explicitly so the INSERT does not silently break if
        # the table schema is ever reordered; executemany batches the rows.
        placeholders = ', '.join('?' for _ in _INSTITUTION_COLUMNS)
        cursor.executemany(
            f'INSERT OR REPLACE INTO institutions '
            f'({", ".join(_INSTITUTION_COLUMNS)}) VALUES ({placeholders})',
            [tuple(inst.get(col) for col in _INSTITUTION_COLUMNS)
             for inst in all_institutions],
        )
        cursor.execute('INSERT OR REPLACE INTO metadata VALUES (?, ?)',
                       ('export_date', datetime.now(timezone.utc).isoformat()))
        cursor.execute('INSERT OR REPLACE INTO metadata VALUES (?, ?)',
                       ('total_institutions', str(len(all_institutions))))
        conn.commit()
    finally:
        conn.close()
    print(f"āœ… SQLite export: {sqlite_output}")


def _print_report(country_stats: Dict[str, Any], duplicates: List[Any]) -> None:
    """Print per-country statistics and a sample of duplicate GHCIDs."""
    print("\n" + "=" * 70)
    print("šŸ“ˆ Country Statistics\n")
    for country, stats in sorted(country_stats.items()):
        # total >= 1 whenever a stats bucket exists, but guard anyway.
        total = stats['total'] or 1
        print(f"{country.upper()}:")
        print(f" Total: {stats['total']}")
        print(f" GHCID: {stats['with_ghcid']} ({stats['with_ghcid']/total*100:.1f}%)")
        print(f" Wikidata: {stats['with_wikidata']} ({stats['with_wikidata']/total*100:.1f}%)")
        print(f" Website: {stats['with_website']} ({stats['with_website']/total*100:.1f}%)")
        print(f" Types: {dict(stats['by_type'])}")
        print()
    if duplicates:
        print("\nāš ļø Duplicate GHCIDs Detected:")
        for ghcid, name1, name2 in duplicates[:10]:
            print(f" {ghcid}: '{name1}' vs '{name2}'")
        if len(duplicates) > 10:
            print(f" ... and {len(duplicates) - 10} more")


def build_unified_database():
    """Build unified database from all country datasets.

    Loads every dataset in COUNTRY_DATASETS, computes per-country statistics,
    deduplicates by GHCID, and exports JSON and SQLite files to OUTPUT_DIR.
    """
    print("šŸŒ Building Unified GLAM Heritage Custodian Database")
    print("=" * 70)

    # Create the output directory on demand (not at import time).
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    all_institutions, country_stats = _load_all_countries()

    print("\n" + "=" * 70)
    print(f"šŸ“Š Total institutions loaded: {len(all_institutions)}")

    ghcid_map, duplicates = _deduplicate_by_ghcid(all_institutions)
    print(f"šŸ” Unique GHCIDs: {len(ghcid_map)}")
    print(f"āš ļø Duplicates detected: {len(duplicates)}")

    _export_json(all_institutions, country_stats, ghcid_map, duplicates)
    _export_sqlite(all_institutions)
    _print_report(country_stats, duplicates)

    print("\nāœ… Unified database build complete!")
    print(f"šŸ“‚ Output directory: {OUTPUT_DIR}")


if __name__ == '__main__':
    build_unified_database()