glam/scripts/build_unified_database.py
2025-11-21 22:12:33 +01:00

274 lines
9.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Build Unified GLAM Heritage Custodian Database
Merges all country-specific LinkML datasets into a unified database with:
- Deduplication by GHCID
- Data quality tracking
- Version control
- Multiple export formats (JSON, SQLite, Parquet)
"""
import json
import sqlite3
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any
from collections import defaultdict
import hashlib
# Country dataset paths (absolute paths on the curation workstation).
COUNTRY_DATASETS = {
    'finland': '/Users/kempersc/apps/glam/data/finland_isil/finland_isil_linkml_final_20251120.json',
    'denmark': '/Users/kempersc/apps/glam/data/instances/denmark_complete_enriched.json',
    'netherlands': '/Users/kempersc/apps/glam/data/instances/netherlands_complete.yaml',
    'belgium': '/Users/kempersc/apps/glam/data/instances/belgium_isil.yaml',
    'belarus': '/Users/kempersc/apps/glam/data/instances/belarus_complete.yaml',
    'canada': '/Users/kempersc/apps/glam/data/instances/canada/canadian_heritage_custodians_geocoded.json',
    'chile': '/Users/kempersc/apps/glam/data/instances/chile/chilean_institutions_batch20_enriched.yaml',
    'egypt': '/Users/kempersc/apps/glam/data/instances/egypt_institutions_ghcid.yaml',
    # Japan dataset is 18MB - handle separately
}

# Where all unified exports (JSON, SQLite) are written.
OUTPUT_DIR = Path('/Users/kempersc/apps/glam/data/unified')
# BUG FIX: parents=True — with exist_ok alone, a missing parent directory
# (fresh checkout, different machine) raised FileNotFoundError at import time.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def load_json_dataset(path: str) -> List[Dict[str, Any]]:
    """Read a JSON dataset file and normalise it to a list of records.

    Accepts three top-level shapes: a bare list of records, a wrapper
    object with an 'institutions' key, or a single record object.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    # A top-level list is already the record collection.
    if isinstance(payload, list):
        return payload
    # A wrapper object may nest the records under 'institutions'.
    if isinstance(payload, dict) and 'institutions' in payload:
        return payload['institutions']
    # Anything else is treated as one standalone record.
    return [payload]
def load_yaml_dataset(path: str) -> List[Dict[str, Any]]:
    """Read a YAML dataset file and normalise it to a list of records.

    Mirrors load_json_dataset: accepts a bare list, a wrapper object with
    an 'institutions' key, or a single record object.
    """
    import yaml  # local import: PyYAML only needed when YAML sources are used
    with open(path, 'r', encoding='utf-8') as handle:
        payload = yaml.safe_load(handle)
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and 'institutions' in payload:
        return payload['institutions']
    return [payload]
def extract_key_metadata(record: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten a full institution record into the unified-database row shape.

    Missing fields come back as None (or False for the has_* flags); the
    complete original record is preserved verbatim as a JSON string under
    'raw_record'.
    """
    # Hoist the nested containers once instead of re-fetching per field.
    locations = record.get('locations')
    primary_location = locations[0] if locations else {}
    provenance = record.get('provenance', {})
    identifiers = record.get('identifiers', [])
    return {
        'id': record.get('id'),
        'ghcid': record.get('ghcid'),
        'ghcid_uuid': record.get('ghcid_uuid'),
        'ghcid_numeric': record.get('ghcid_numeric'),
        'name': record.get('name'),
        'institution_type': record.get('institution_type'),
        'country': primary_location.get('country'),
        'city': primary_location.get('city'),
        'data_source': provenance.get('data_source'),
        'data_tier': provenance.get('data_tier'),
        'extraction_date': provenance.get('extraction_date'),
        'has_wikidata': any(
            ident.get('identifier_scheme') == 'Wikidata'
            for ident in identifiers
        ),
        'has_website': any(
            ident.get('identifier_scheme') == 'Website'
            for ident in identifiers
        ),
        'raw_record': json.dumps(record, ensure_ascii=False),
    }
def _update_stats(stats: Dict[str, Any], metadata: Dict[str, Any]) -> None:
    """Fold one institution's flattened metadata into its country's stats."""
    stats['total'] += 1
    if metadata.get('ghcid'):
        stats['with_ghcid'] += 1
    if metadata.get('has_wikidata'):
        stats['with_wikidata'] += 1
    if metadata.get('has_website'):
        stats['with_website'] += 1
    # BUG FIX: extract_key_metadata always sets 'institution_type' (possibly
    # to None), so dict.get's default never fired and None leaked into the
    # by_type keys. `or` maps both a missing key and None to 'UNKNOWN'.
    inst_type = metadata.get('institution_type') or 'UNKNOWN'
    stats['by_type'][inst_type] += 1


def _load_all_datasets(country_stats: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Load every configured country dataset, printing progress.

    Returns the flattened metadata rows for all institutions; per-country
    statistics are accumulated into `country_stats` as a side effect.
    A dataset that is missing or fails to parse is reported and skipped.
    """
    all_institutions: List[Dict[str, Any]] = []
    for country, path in COUNTRY_DATASETS.items():
        if not Path(path).exists():
            print(f"⚠️ {country.upper()}: Dataset not found at {path}")
            continue
        print(f"\n📂 Loading {country.upper()}...")
        try:
            if path.endswith('.json'):
                records = load_json_dataset(path)
            elif path.endswith('.yaml'):
                records = load_yaml_dataset(path)
            else:
                print(f" ⚠️ Unknown format: {path}")
                continue
            print(f" ✅ Loaded {len(records)} institutions")
            for record in records:
                metadata = extract_key_metadata(record)
                metadata['source_country'] = country
                all_institutions.append(metadata)
                # country_stats is a defaultdict, so the entry is created on
                # first access; countries with zero records never appear.
                _update_stats(country_stats[country], metadata)
        except Exception as e:
            # Best-effort: one bad dataset must not abort the whole build.
            print(f" ❌ Error loading {country}: {e}")
            continue
    return all_institutions


def _deduplicate_by_ghcid(
    institutions: List[Dict[str, Any]]
) -> (Dict[str, Dict[str, Any]], List[tuple]):
    """Map each GHCID to the first record seen; collect later collisions.

    Returns (ghcid_map, duplicates) where each duplicates entry is
    (ghcid, duplicate record's name, kept record's name). Records without
    a GHCID are skipped entirely.
    """
    ghcid_map: Dict[str, Dict[str, Any]] = {}
    duplicates: List[tuple] = []
    for inst in institutions:
        ghcid = inst.get('ghcid')
        if not ghcid:
            continue
        if ghcid in ghcid_map:
            duplicates.append((ghcid, inst['name'], ghcid_map[ghcid]['name']))
        else:
            ghcid_map[ghcid] = inst
    return ghcid_map, duplicates


def _export_json(
    all_institutions: List[Dict[str, Any]],
    country_stats: Dict[str, Dict[str, Any]],
    ghcid_map: Dict[str, Dict[str, Any]],
    duplicates: List[tuple],
) -> None:
    """Write the unified dataset plus summary metadata as a single JSON file."""
    json_output = OUTPUT_DIR / 'glam_unified_database.json'
    with open(json_output, 'w', encoding='utf-8') as f:
        json.dump({
            'metadata': {
                'export_date': datetime.now(timezone.utc).isoformat(),
                'total_institutions': len(all_institutions),
                'unique_ghcids': len(ghcid_map),
                'duplicates': len(duplicates),
                'countries': list(COUNTRY_DATASETS.keys())
            },
            'country_stats': dict(country_stats),
            'institutions': all_institutions
        }, f, indent=2, ensure_ascii=False)
    print(f"✅ JSON export: {json_output}")


def _export_sqlite(all_institutions: List[Dict[str, Any]]) -> None:
    """Write all institutions into a SQLite database, one row per record.

    Uses INSERT OR REPLACE keyed on the record id, so re-running the build
    overwrites rows in place.
    """
    sqlite_output = OUTPUT_DIR / 'glam_unified_database.db'
    conn = sqlite3.connect(sqlite_output)
    try:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS institutions (
                id TEXT PRIMARY KEY,
                ghcid TEXT,
                ghcid_uuid TEXT,
                ghcid_numeric INTEGER,
                name TEXT NOT NULL,
                institution_type TEXT,
                country TEXT,
                city TEXT,
                source_country TEXT,
                data_source TEXT,
                data_tier TEXT,
                extraction_date TEXT,
                has_wikidata BOOLEAN,
                has_website BOOLEAN,
                raw_record TEXT
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS metadata (
                key TEXT PRIMARY KEY,
                value TEXT
            )
        ''')
        for inst in all_institutions:
            cursor.execute('''
                INSERT OR REPLACE INTO institutions VALUES (
                    ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?
                )
            ''', (
                inst.get('id'),
                inst.get('ghcid'),
                inst.get('ghcid_uuid'),
                inst.get('ghcid_numeric'),
                inst.get('name'),
                inst.get('institution_type'),
                inst.get('country'),
                inst.get('city'),
                inst.get('source_country'),
                inst.get('data_source'),
                inst.get('data_tier'),
                inst.get('extraction_date'),
                inst.get('has_wikidata'),
                inst.get('has_website'),
                inst.get('raw_record')
            ))
        cursor.execute('INSERT OR REPLACE INTO metadata VALUES (?, ?)',
                       ('export_date', datetime.now(timezone.utc).isoformat()))
        cursor.execute('INSERT OR REPLACE INTO metadata VALUES (?, ?)',
                       ('total_institutions', str(len(all_institutions))))
        conn.commit()
    finally:
        # BUG FIX: the connection previously leaked when an insert raised
        # (e.g. the NOT NULL constraint on name); always close it.
        conn.close()
    print(f"✅ SQLite export: {sqlite_output}")


def _print_country_stats(country_stats: Dict[str, Dict[str, Any]]) -> None:
    """Print per-country coverage percentages and type breakdown."""
    print("\n" + "=" * 70)
    print("📈 Country Statistics\n")
    for country, stats in sorted(country_stats.items()):
        # stats entries are only created per loaded record, so total >= 1
        # and the percentage divisions below are safe.
        print(f"{country.upper()}:")
        print(f" Total: {stats['total']}")
        print(f" GHCID: {stats['with_ghcid']} ({stats['with_ghcid']/stats['total']*100:.1f}%)")
        print(f" Wikidata: {stats['with_wikidata']} ({stats['with_wikidata']/stats['total']*100:.1f}%)")
        print(f" Website: {stats['with_website']} ({stats['with_website']/stats['total']*100:.1f}%)")
        print(f" Types: {dict(stats['by_type'])}")
        print()


def _print_duplicates(duplicates: List[tuple]) -> None:
    """Print up to ten duplicate-GHCID collisions, plus an overflow count."""
    if not duplicates:
        return
    print("\n⚠️ Duplicate GHCIDs Detected:")
    for ghcid, name1, name2 in duplicates[:10]:
        print(f" {ghcid}: '{name1}' vs '{name2}'")
    if len(duplicates) > 10:
        print(f" ... and {len(duplicates) - 10} more")


def build_unified_database():
    """Build the unified database from all country datasets.

    Pipeline: load + flatten every configured dataset, deduplicate by
    GHCID, export to JSON and SQLite in OUTPUT_DIR, then print summary
    statistics. Interface unchanged; the work is delegated to the
    single-purpose helpers above.
    """
    print("🌍 Building Unified GLAM Heritage Custodian Database")
    print("=" * 70)
    country_stats: Dict[str, Dict[str, Any]] = defaultdict(lambda: {
        'total': 0,
        'with_ghcid': 0,
        'with_wikidata': 0,
        'with_website': 0,
        'by_type': defaultdict(int)
    })
    all_institutions = _load_all_datasets(country_stats)
    print("\n" + "=" * 70)
    print(f"📊 Total institutions loaded: {len(all_institutions)}")
    ghcid_map, duplicates = _deduplicate_by_ghcid(all_institutions)
    print(f"🔍 Unique GHCIDs: {len(ghcid_map)}")
    print(f"⚠️ Duplicates detected: {len(duplicates)}")
    _export_json(all_institutions, country_stats, ghcid_map, duplicates)
    _export_sqlite(all_institutions)
    _print_country_stats(country_stats)
    _print_duplicates(duplicates)
    print("\n✅ Unified database build complete!")
    print(f"📂 Output directory: {OUTPUT_DIR}")


if __name__ == '__main__':
    build_unified_database()