#!/usr/bin/env python3
|
|
"""
|
|
Build Unified GLAM Heritage Custodian Database
|
|
|
|
Merges all country-specific LinkML datasets into a unified database with:
|
|
- Deduplication by GHCID
|
|
- Data quality tracking
|
|
- Version control
|
|
- Multiple export formats (JSON, SQLite, Parquet)
|
|
"""
|
|
|
|
import json
|
|
import sqlite3
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import List, Dict, Any
|
|
from collections import defaultdict
|
|
import hashlib
|
|
|
|
# Country dataset paths (one LinkML export per country).
# NOTE(review): these are absolute, machine-specific paths — consider making
# them configurable (env var or CLI flag) before sharing this script.
COUNTRY_DATASETS = {
    'finland': '/Users/kempersc/apps/glam/data/finland_isil/finland_isil_linkml_final_20251120.json',
    'denmark': '/Users/kempersc/apps/glam/data/instances/denmark_complete_enriched.json',
    'netherlands': '/Users/kempersc/apps/glam/data/instances/netherlands_complete.yaml',
    'belgium': '/Users/kempersc/apps/glam/data/instances/belgium_isil.yaml',
    'belarus': '/Users/kempersc/apps/glam/data/instances/belarus_complete.yaml',
    'canada': '/Users/kempersc/apps/glam/data/instances/canada/canadian_heritage_custodians_geocoded.json',
    'chile': '/Users/kempersc/apps/glam/data/instances/chile/chilean_institutions_batch20_enriched.yaml',
    'egypt': '/Users/kempersc/apps/glam/data/instances/egypt_institutions_ghcid.yaml',
    # Japan dataset is 18MB - handle separately
}

# Destination directory for all unified exports (JSON + SQLite).
OUTPUT_DIR = Path('/Users/kempersc/apps/glam/data/unified')
# parents=True: also create missing intermediate directories instead of
# raising FileNotFoundError on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
def load_json_dataset(path: str) -> List[Dict[str, Any]]:
    """Read a JSON dataset and normalize it to a list of institution records.

    Accepts three layouts: a bare list of records, a wrapper object holding
    an 'institutions' key, or a single record object (returned wrapped in a
    one-element list).
    """
    with open(path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and 'institutions' in payload:
        return payload['institutions']
    # Anything else is treated as a single record.
    return [payload]
|
|
|
|
def load_yaml_dataset(path: str) -> List[Dict[str, Any]]:
    """Read a YAML dataset and normalize it to a list of institution records.

    Mirrors load_json_dataset: accepts a bare list, a wrapper mapping with an
    'institutions' key, or a single record mapping (wrapped in a list).
    """
    import yaml  # local import: PyYAML is only needed for YAML sources

    with open(path, 'r', encoding='utf-8') as handle:
        payload = yaml.safe_load(handle)

    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and 'institutions' in payload:
        return payload['institutions']
    # Anything else is treated as a single record.
    return [payload]
|
|
|
|
def extract_key_metadata(record: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten an institution record into the unified-database row schema.

    Pulls identity fields (GHCID variants), the first location, provenance
    fields, and presence flags for Wikidata/website identifiers. The complete
    source record is preserved verbatim as a JSON string under 'raw_record'.

    FIX: uses `or {}` / `or []` fallbacks so records where 'locations',
    'provenance', or 'identifiers' exist but are explicitly null (common in
    YAML exports) no longer raise AttributeError/TypeError.
    """
    locations = record.get('locations') or []
    primary_location = locations[0] if locations else {}
    provenance = record.get('provenance') or {}
    identifiers = record.get('identifiers') or []
    # One pass over identifiers; each presence flag is then a membership test.
    schemes = {i.get('identifier_scheme') for i in identifiers}

    return {
        'id': record.get('id'),
        'ghcid': record.get('ghcid'),
        'ghcid_uuid': record.get('ghcid_uuid'),
        'ghcid_numeric': record.get('ghcid_numeric'),
        'name': record.get('name'),
        'institution_type': record.get('institution_type'),
        'country': primary_location.get('country'),
        'city': primary_location.get('city'),
        'data_source': provenance.get('data_source'),
        'data_tier': provenance.get('data_tier'),
        'extraction_date': provenance.get('extraction_date'),
        'has_wikidata': 'Wikidata' in schemes,
        'has_website': 'Website' in schemes,
        'raw_record': json.dumps(record, ensure_ascii=False),
    }
|
|
|
|
def _collect_institutions():
    """Load every configured country dataset.

    Returns (all_institutions, country_stats): a list of flattened metadata
    rows (see extract_key_metadata, each tagged with 'source_country') and a
    mapping of country -> quality counters. Missing or unparseable datasets
    are reported and skipped so one bad source cannot abort the whole build.
    """
    all_institutions: List[Dict[str, Any]] = []
    country_stats = defaultdict(lambda: {
        'total': 0,
        'with_ghcid': 0,
        'with_wikidata': 0,
        'with_website': 0,
        'by_type': defaultdict(int)
    })

    for country, path in COUNTRY_DATASETS.items():
        if not Path(path).exists():
            print(f"⚠️ {country.upper()}: Dataset not found at {path}")
            continue

        print(f"\n📂 Loading {country.upper()}...")

        try:
            if path.endswith('.json'):
                records = load_json_dataset(path)
            elif path.endswith('.yaml'):
                records = load_yaml_dataset(path)
            else:
                print(f" ⚠️ Unknown format: {path}")
                continue

            print(f" ✅ Loaded {len(records)} institutions")

            for record in records:
                metadata = extract_key_metadata(record)
                metadata['source_country'] = country
                all_institutions.append(metadata)
                _update_stats(country_stats[country], metadata)
        except Exception as e:
            # Deliberately broad: any parse/IO failure is reported and the
            # remaining countries still get processed.
            print(f" ❌ Error loading {country}: {e}")
            continue

    return all_institutions, country_stats


def _update_stats(stats: Dict[str, Any], metadata: Dict[str, Any]) -> None:
    """Fold one institution's metadata into its country's quality counters."""
    stats['total'] += 1
    if metadata.get('ghcid'):
        stats['with_ghcid'] += 1
    if metadata.get('has_wikidata'):
        stats['with_wikidata'] += 1
    if metadata.get('has_website'):
        stats['with_website'] += 1
    # FIX: the 'institution_type' key is always present (possibly None), so a
    # .get() default of 'UNKNOWN' never applied and None leaked into by_type.
    stats['by_type'][metadata.get('institution_type') or 'UNKNOWN'] += 1


def _find_duplicates(all_institutions):
    """Index institutions by GHCID; return (ghcid_map, duplicates).

    First record seen for a GHCID wins; later collisions are recorded as
    (ghcid, new_name, existing_name) tuples for reporting. Records without a
    GHCID are skipped entirely.
    """
    ghcid_map: Dict[str, Dict[str, Any]] = {}
    duplicates = []
    for inst in all_institutions:
        ghcid = inst.get('ghcid')
        if not ghcid:
            continue
        if ghcid in ghcid_map:
            duplicates.append((ghcid, inst['name'], ghcid_map[ghcid]['name']))
        else:
            ghcid_map[ghcid] = inst
    return ghcid_map, duplicates


def _export_json(all_institutions, country_stats, ghcid_map, duplicates) -> Path:
    """Write the unified JSON export into OUTPUT_DIR; return its path."""
    json_output = OUTPUT_DIR / 'glam_unified_database.json'
    with open(json_output, 'w', encoding='utf-8') as f:
        json.dump({
            'metadata': {
                'export_date': datetime.now(timezone.utc).isoformat(),
                'total_institutions': len(all_institutions),
                'unique_ghcids': len(ghcid_map),
                'duplicates': len(duplicates),
                'countries': list(COUNTRY_DATASETS.keys())
            },
            'country_stats': dict(country_stats),
            'institutions': all_institutions
        }, f, indent=2, ensure_ascii=False)
    return json_output


def _export_sqlite(all_institutions) -> Path:
    """Write the unified SQLite export into OUTPUT_DIR; return its path.

    FIX: the connection is closed in a `finally` block so a failed insert no
    longer leaks the handle, the insert names its columns instead of relying
    on positional order, and rows go in via a single executemany().
    """
    sqlite_output = OUTPUT_DIR / 'glam_unified_database.db'
    # Column order matches the CREATE TABLE statement and the keys produced
    # by extract_key_metadata (plus 'source_country').
    columns = ('id', 'ghcid', 'ghcid_uuid', 'ghcid_numeric', 'name',
               'institution_type', 'country', 'city', 'source_country',
               'data_source', 'data_tier', 'extraction_date',
               'has_wikidata', 'has_website', 'raw_record')
    conn = sqlite3.connect(sqlite_output)
    try:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS institutions (
                id TEXT PRIMARY KEY,
                ghcid TEXT,
                ghcid_uuid TEXT,
                ghcid_numeric INTEGER,
                name TEXT NOT NULL,
                institution_type TEXT,
                country TEXT,
                city TEXT,
                source_country TEXT,
                data_source TEXT,
                data_tier TEXT,
                extraction_date TEXT,
                has_wikidata BOOLEAN,
                has_website BOOLEAN,
                raw_record TEXT
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS metadata (
                key TEXT PRIMARY KEY,
                value TEXT
            )
        ''')
        placeholders = ', '.join('?' * len(columns))
        cursor.executemany(
            f'INSERT OR REPLACE INTO institutions ({", ".join(columns)}) '
            f'VALUES ({placeholders})',
            (tuple(inst.get(col) for col in columns)
             for inst in all_institutions)
        )
        cursor.executemany(
            'INSERT OR REPLACE INTO metadata VALUES (?, ?)',
            [
                ('export_date', datetime.now(timezone.utc).isoformat()),
                ('total_institutions', str(len(all_institutions))),
            ]
        )
        conn.commit()
    finally:
        conn.close()
    return sqlite_output


def _print_country_stats(country_stats) -> None:
    """Print per-country coverage percentages and type breakdowns."""
    print("\n" + "=" * 70)
    print("📈 Country Statistics\n")
    for country, stats in sorted(country_stats.items()):
        total = stats['total']  # always >= 1: entries only exist once counted
        print(f"{country.upper()}:")
        print(f" Total: {total}")
        print(f" GHCID: {stats['with_ghcid']} ({stats['with_ghcid']/total*100:.1f}%)")
        print(f" Wikidata: {stats['with_wikidata']} ({stats['with_wikidata']/total*100:.1f}%)")
        print(f" Website: {stats['with_website']} ({stats['with_website']/total*100:.1f}%)")
        print(f" Types: {dict(stats['by_type'])}")
        print()


def _print_duplicates(duplicates) -> None:
    """Print up to 10 duplicate-GHCID collisions plus an overflow count."""
    if not duplicates:
        return
    print("\n⚠️ Duplicate GHCIDs Detected:")
    for ghcid, name1, name2 in duplicates[:10]:
        print(f" {ghcid}: '{name1}' vs '{name2}'")
    if len(duplicates) > 10:
        print(f" ... and {len(duplicates) - 10} more")


def build_unified_database():
    """Build the unified database from all country datasets.

    Loads each configured dataset, flattens every record, deduplicates by
    GHCID, writes JSON and SQLite exports into OUTPUT_DIR, and prints a
    per-country quality summary.
    """
    print("🌍 Building Unified GLAM Heritage Custodian Database")
    print("=" * 70)

    all_institutions, country_stats = _collect_institutions()

    print("\n" + "=" * 70)
    print(f"📊 Total institutions loaded: {len(all_institutions)}")

    ghcid_map, duplicates = _find_duplicates(all_institutions)
    print(f"🔍 Unique GHCIDs: {len(ghcid_map)}")
    print(f"⚠️ Duplicates detected: {len(duplicates)}")

    json_output = _export_json(all_institutions, country_stats, ghcid_map, duplicates)
    print(f"✅ JSON export: {json_output}")

    sqlite_output = _export_sqlite(all_institutions)
    print(f"✅ SQLite export: {sqlite_output}")

    _print_country_stats(country_stats)
    _print_duplicates(duplicates)

    print("\n✅ Unified database build complete!")
    print(f"📂 Output directory: {OUTPUT_DIR}")
|
|
|
|
# Allow direct execution as a CLI script; importing the module stays side-effect
# free apart from the OUTPUT_DIR creation above.
if __name__ == '__main__':
    build_unified_database()
|