#!/usr/bin/env python3
"""
Global Dataset Merge Script

Merges all regional ISIL datasets into a unified global heritage custodian database:
- Japan ISIL institutions (12,065 records)
- Netherlands ISIL institutions (369 records)
- EU institutions (10 records)
- Latin America institutions (304 records)

Output: Comprehensive global dataset with ~12,748 institutions

Author: GLAM Data Extraction Project
Date: 2025-11-07
"""

import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any
from collections import Counter

# Default project root; `main()` accepts an override so the script can run elsewhere.
DEFAULT_BASE_PATH = Path('/Users/kempersc/apps/glam')

# Ranking used to pick a winner among duplicates; unknown tiers rank as 0.
_TIER_PRIORITY = {
    'TIER_1_AUTHORITATIVE': 4,
    'TIER_2_VERIFIED': 3,
    'TIER_3_CROWD_SOURCED': 2,
    'TIER_4_INFERRED': 1,
}


def _percent(count: int, total: int) -> float:
    """Return *count* as a percentage of *total*, or 0.0 when *total* is zero."""
    return count / total * 100 if total else 0.0


def _to_plain(value: Any) -> Any:
    """Recursively convert Counters/dict subclasses and tuples to plain dicts and lists.

    The default PyYAML dumper serialises ``Counter`` with a Python-specific
    tag that ``yaml.safe_load`` refuses to read, so statistics are flattened
    to builtin containers before being dumped.
    """
    if isinstance(value, Counter):
        return {key: _to_plain(item) for key, item in value.most_common()}
    if isinstance(value, dict):
        return {key: _to_plain(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_plain(item) for item in value]
    return value


def load_yaml_dataset(file_path: Path) -> List[Dict[str, Any]]:
    """Load a YAML dataset from *file_path*.

    Returns:
        The list of institution records. A file holding a single mapping is
        wrapped in a one-element list.

    Raises:
        ValueError: If the document is neither a list nor a mapping
            (this includes an empty file, which parses to ``None``).
    """
    print(f"Loading {file_path.name}...")
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Handle both list and dict formats
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        # If it's a single institution, wrap in list
        return [data]
    raise ValueError(f"Unexpected data format in {file_path}")


def analyze_dataset(institutions: List[Dict[str, Any]], name: str) -> Dict[str, Any]:
    """Print a summary of *institutions* and return the collected statistics.

    Args:
        institutions: Institution records (dicts) to analyse.
        name: Label used in the printed header and stored under ``'name'``.

    Returns:
        A dict with ``total_count``, four ``Counter`` objects (``countries``,
        ``institution_types``, ``data_sources``, ``data_tiers``) and the
        ``has_*`` coverage counts. An empty input yields zero counts and
        0.0% in the printout instead of raising ``ZeroDivisionError``.
    """
    print(f"\n{'='*60}")
    print(f"Analyzing {name}")
    print(f"{'='*60}")

    stats = {
        'name': name,
        'total_count': len(institutions),
        'countries': Counter(),
        'institution_types': Counter(),
        'data_sources': Counter(),
        'data_tiers': Counter(),
        'has_ghcid': 0,
        'has_coordinates': 0,
        'has_website': 0,
        'has_identifiers': 0,
    }

    for inst in institutions:
        # Country distribution (only the first location is considered)
        if 'locations' in inst and inst['locations']:
            first_location = inst['locations'][0]
            country = first_location.get('country', 'UNKNOWN')
            stats['countries'][country] += 1

            # Check for coordinates
            if first_location.get('latitude') or first_location.get('longitude'):
                stats['has_coordinates'] += 1

        # Institution type
        inst_type = inst.get('institution_type', 'UNKNOWN')
        stats['institution_types'][inst_type] += 1

        # Provenance (a null provenance block is treated like an empty one)
        if 'provenance' in inst:
            prov = inst['provenance'] or {}
            stats['data_sources'][prov.get('data_source', 'UNKNOWN')] += 1
            stats['data_tiers'][prov.get('data_tier', 'UNKNOWN')] += 1

        # GHCID
        if inst.get('ghcid'):
            stats['has_ghcid'] += 1

        # Identifiers
        if inst.get('identifiers'):
            stats['has_identifiers'] += 1

            # Check for website (count each institution at most once)
            for identifier in inst['identifiers']:
                if identifier.get('identifier_scheme') == 'Website':
                    stats['has_website'] += 1
                    break

    total = stats['total_count']

    # Print summary
    print(f"Total Records: {total:,}")
    print(f"\nCountries ({len(stats['countries'])}):")
    for country, count in stats['countries'].most_common():
        print(f" {country}: {count:,} ({_percent(count, total):.1f}%)")

    print(f"\nInstitution Types:")
    for inst_type, count in stats['institution_types'].most_common():
        print(f" {inst_type}: {count:,} ({_percent(count, total):.1f}%)")

    print(f"\nData Quality:")
    print(f" GHCID Coverage: {stats['has_ghcid']:,} ({_percent(stats['has_ghcid'], total):.1f}%)")
    print(f" Has Coordinates: {stats['has_coordinates']:,} ({_percent(stats['has_coordinates'], total):.1f}%)")
    print(f" Has Website: {stats['has_website']:,} ({_percent(stats['has_website'], total):.1f}%)")
    print(f" Has Identifiers: {stats['has_identifiers']:,} ({_percent(stats['has_identifiers'], total):.1f}%)")

    return stats


def _completeness_score(inst: Dict[str, Any]) -> int:
    """Calculate how complete an institution record is (higher is more complete)."""
    locations = inst.get('locations') or []
    score = 0
    score += 1 if inst.get('name') else 0
    score += 1 if inst.get('description') else 0
    score += len(inst.get('identifiers') or [])
    score += len(locations)
    score += len(inst.get('digital_platforms') or [])
    score += 1 if inst.get('ghcid') else 0
    if locations:
        loc = locations[0]
        score += 1 if loc.get('street_address') else 0
        score += 1 if loc.get('postal_code') else 0
        score += 1 if loc.get('latitude') else 0
    return score


def _is_better_record(new_inst: Dict[str, Any], existing_inst: Dict[str, Any]) -> bool:
    """Determine if *new_inst* should replace *existing_inst*.

    Compares, in order: data tier, completeness score, extraction date.
    Ties on all three keep the existing record.
    """
    new_prov = new_inst.get('provenance') or {}
    existing_prov = existing_inst.get('provenance') or {}

    # Compare data tiers
    new_tier = _TIER_PRIORITY.get(new_prov.get('data_tier', ''), 0)
    existing_tier = _TIER_PRIORITY.get(existing_prov.get('data_tier', ''), 0)
    if new_tier != existing_tier:
        return new_tier > existing_tier

    # Compare completeness
    new_score = _completeness_score(new_inst)
    existing_score = _completeness_score(existing_inst)
    if new_score != existing_score:
        return new_score > existing_score

    # Compare extraction dates (more recent is better)
    new_date = new_prov.get('extraction_date', '')
    existing_date = existing_prov.get('extraction_date', '')
    try:
        return new_date > existing_date
    except TypeError:
        # YAML may parse one date as date/datetime while the other is a
        # string or missing; fall back to comparing their text forms.
        return str(new_date or '') > str(existing_date or '')


def _extract_isil(inst: Dict[str, Any]) -> Any:
    """Return the value of the first ISIL identifier of *inst*, or None."""
    for identifier in inst.get('identifiers') or []:
        if identifier.get('identifier_scheme') == 'ISIL':
            return identifier.get('identifier_value')
    return None


def deduplicate_institutions(institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Deduplicate institutions by GHCID (primary) or ISIL code (fallback).

    The ISIL code is only used as the key for records that have no GHCID.
    Records with neither key are always kept.

    For duplicates, prefer records with:
    1. Higher data tier (TIER_1 > TIER_2 > TIER_3 > TIER_4)
    2. More complete data (more fields filled)
    3. Most recent extraction date

    Returns:
        The surviving records in first-seen order; a record that replaces an
        earlier duplicate is moved to the end of the list.
    """
    # Insertion-ordered map from dedup key to the record currently kept.
    kept: Dict[Any, Dict[str, Any]] = {}
    duplicates = []

    for index, inst in enumerate(institutions):
        ghcid = inst.get('ghcid')
        if ghcid:
            scheme, value = 'ghcid', ghcid
        else:
            isil_code = _extract_isil(inst)
            if not isil_code:
                # No usable key: the record can never collide with another.
                kept[('unkeyed', index)] = inst
                continue
            scheme, value = 'isil', isil_code

        key = (scheme, value)
        if key not in kept:
            kept[key] = inst
            continue

        existing = kept[key]
        duplicates.append({
            scheme: value,
            'name1': existing.get('name'),
            'name2': inst.get('name'),
        })

        # Keep better record; delete + re-insert moves the winner to the end.
        # Only the record under this exact key is replaced, so a GHCID-keyed
        # record that merely shares the ISIL code is left untouched.
        if _is_better_record(inst, existing):
            del kept[key]
            kept[key] = inst

    if duplicates:
        print(f"\n⚠️ Found {len(duplicates)} duplicates (resolved by keeping best record):")
        for dup in duplicates[:10]:  # Show first 10
            if 'ghcid' in dup:
                print(f" GHCID {dup['ghcid']}: '{dup['name1']}' vs '{dup['name2']}'")
            else:
                print(f" ISIL {dup['isil']}: '{dup['name1']}' vs '{dup['name2']}'")
        if len(duplicates) > 10:
            print(f" ... and {len(duplicates) - 10} more")

    return list(kept.values())


def merge_datasets(datasets: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Merge all datasets with deduplication.

    Args:
        datasets: Mapping of dataset label to its institution records.

    Returns:
        The concatenation of all datasets (in mapping order) after
        ``deduplicate_institutions`` has been applied.
    """
    print(f"\n{'='*60}")
    print("Merging Datasets")
    print(f"{'='*60}")

    # Combine all institutions
    all_institutions = []
    for name, institutions in datasets.items():
        print(f"Adding {len(institutions):,} records from {name}")
        all_institutions.extend(institutions)

    print(f"\nTotal before deduplication: {len(all_institutions):,}")

    # Deduplicate
    merged = deduplicate_institutions(all_institutions)

    print(f"Total after deduplication: {len(merged):,}")
    print(f"Removed: {len(all_institutions) - len(merged):,} duplicates")

    return merged


def main(base_path: Path = DEFAULT_BASE_PATH) -> None:
    """Load, analyse and merge the regional datasets, then write the outputs.

    Args:
        base_path: Project root containing the ``data/`` tree. Defaults to
            ``DEFAULT_BASE_PATH``.

    Writes three files to ``<base_path>/data/instances/global``: the merged
    YAML dataset, a YAML statistics file and a markdown summary report.
    Source files that do not exist are skipped with a warning.
    """
    base_path = Path(base_path)

    # Define source datasets
    datasets = {
        # Using collision-resolved dataset
        'Japan ISIL': base_path / 'data/instances/japan/jp_institutions_resolved.yaml',
        'Netherlands ISIL': base_path / 'data/dutch_institutions_with_ghcids.yaml',
        'EU Institutions': base_path / 'data/instances/eu_institutions.yaml',
        'Latin America': base_path / 'data/instances/latin_american_institutions_AUTHORITATIVE.yaml',
    }

    # Load all datasets
    loaded_datasets = {}
    for name, path in datasets.items():
        if path.exists():
            loaded_datasets[name] = load_yaml_dataset(path)
        else:
            print(f"⚠️ Warning: {name} not found at {path}")

    # Analyze each dataset
    stats = {}
    for name, institutions in loaded_datasets.items():
        stats[name] = analyze_dataset(institutions, name)

    # Merge datasets
    merged_institutions = merge_datasets(loaded_datasets)

    # Analyze merged dataset
    merged_stats = analyze_dataset(merged_institutions, "GLOBAL MERGED DATASET")

    # Generate output files
    output_dir = base_path / 'data/instances/global'
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1. Main YAML file
    output_yaml = output_dir / 'global_heritage_institutions.yaml'
    print(f"\n{'='*60}")
    print(f"Writing merged dataset to {output_yaml.name}")
    print(f"{'='*60}")

    with open(output_yaml, 'w', encoding='utf-8') as f:
        yaml.dump(merged_institutions, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False, width=120)

    print(f"✅ Wrote {len(merged_institutions):,} institutions to {output_yaml}")

    # 2. Statistics report
    stats_file = output_dir / 'merge_statistics.yaml'
    merge_metadata = {
        'merge_date': datetime.now(timezone.utc).isoformat(),
        'total_institutions': len(merged_institutions),
        'source_datasets': {
            name: {
                'count': len(institutions),
                'file': str(datasets[name].relative_to(base_path)),
            }
            for name, institutions in loaded_datasets.items()
        },
        'regional_statistics': stats,
        'merged_statistics': merged_stats,
    }

    with open(stats_file, 'w', encoding='utf-8') as f:
        # Flatten Counters first so the file stays readable by yaml.safe_load.
        yaml.dump(_to_plain(merge_metadata), f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)

    print(f"✅ Wrote statistics to {stats_file}")

    # 3. Summary report (markdown)
    report_file = output_dir / 'merge_report.md'
    total = merged_stats['total_count']
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("# Global Heritage Institutions Dataset - Merge Report\n\n")
        f.write(f"**Merge Date**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}\n\n")
        f.write(f"**Total Institutions**: {len(merged_institutions):,}\n\n")

        f.write("## Source Datasets\n\n")
        for name, institutions in loaded_datasets.items():
            f.write(f"- **{name}**: {len(institutions):,} records\n")

        f.write("\n## Country Distribution\n\n")
        for country, count in merged_stats['countries'].most_common():
            f.write(f"- **{country}**: {count:,} ({_percent(count, total):.1f}%)\n")

        f.write("\n## Institution Types\n\n")
        for inst_type, count in merged_stats['institution_types'].most_common():
            f.write(f"- **{inst_type}**: {count:,} ({_percent(count, total):.1f}%)\n")

        f.write("\n## Data Quality Metrics\n\n")
        f.write(f"- **GHCID Coverage**: {merged_stats['has_ghcid']:,} ({_percent(merged_stats['has_ghcid'], total):.1f}%)\n")
        f.write(f"- **Geocoded (has coordinates)**: {merged_stats['has_coordinates']:,} ({_percent(merged_stats['has_coordinates'], total):.1f}%)\n")
        f.write(f"- **Has Website**: {merged_stats['has_website']:,} ({_percent(merged_stats['has_website'], total):.1f}%)\n")
        f.write(f"- **Has Identifiers**: {merged_stats['has_identifiers']:,} ({_percent(merged_stats['has_identifiers'], total):.1f}%)\n")

        f.write("\n## Next Steps\n\n")
        f.write("1. **Geocoding**: Add coordinates to remaining institutions\n")
        f.write("2. **Enrichment**: Add Wikidata/VIAF identifiers\n")
        f.write("3. **Validation**: Schema compliance check\n")
        f.write("4. **Export**: Generate JSON-LD, GeoJSON, CSV formats\n")

    print(f"✅ Wrote report to {report_file}")

    print(f"\n{'='*60}")
    print("Global Dataset Merge Complete! 🎉")
    print(f"{'='*60}")
    print(f"Output directory: {output_dir}")
    print(f"Total institutions: {len(merged_institutions):,}")


if __name__ == '__main__':
    main()