#!/usr/bin/env python3
"""
Create Unified German Heritage Institution Dataset

This script combines:
1. ISIL registry data (16,979 institutions - libraries, archives, museums)
2. Archivportal-D data (~10,000-20,000 archives)
3. Merge analysis (matched pairs + new discoveries)

Output: Comprehensive German heritage dataset (~25,000-27,000 institutions)

Deduplication strategy:
- Use matched pairs from merge script
- Prefer ISIL data for matched institutions (Tier 1)
- Enrich with Archivportal-D metadata where available
- Add new discoveries from Archivportal-D

Author: OpenCode + MCP Tools
Date: 2025-11-19
"""

import json
from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime, timezone

# Configuration
# NOTE(review): hard-coded absolute path — works only on the author's machine.
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
OUTPUT_DIR = DATA_DIR


def load_isil_data() -> List[Dict]:
    """Load ISIL registry data from the fixed snapshot file.

    Returns:
        List of ISIL institution record dicts ([] if the 'institutions'
        key is absent from the file).
    """
    isil_file = DATA_DIR / "german_isil_complete_20251119_134939.json"
    print(f"Loading ISIL data: {isil_file}")

    with open(isil_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions = data.get('institutions', [])
    print(f"  Loaded {len(institutions)} ISIL records\n")
    return institutions


def load_merge_results() -> Dict:
    """Load merge analysis results (most recent matched/new-discovery files).

    Returns:
        Dict with keys 'matched' (list of match dicts) and
        'new_discoveries' (list of Archivportal-D records).

    Raises:
        FileNotFoundError: if either merge output file is missing.
    """
    matched_files = list(DATA_DIR.glob("merged_matched_*.json"))
    new_files = list(DATA_DIR.glob("merged_new_discoveries_*.json"))

    if not matched_files or not new_files:
        raise FileNotFoundError(
            f"Merge results not found in {DATA_DIR}. "
            "Run merge_archivportal_isil.py first."
        )

    # Use most recent files (by modification time)
    matched_file = max(matched_files, key=lambda p: p.stat().st_mtime)
    new_file = max(new_files, key=lambda p: p.stat().st_mtime)

    print("Loading merge results:")
    print(f"  Matched: {matched_file}")
    print(f"  New: {new_file}")

    with open(matched_file, 'r', encoding='utf-8') as f:
        matched_data = json.load(f)
    with open(new_file, 'r', encoding='utf-8') as f:
        new_data = json.load(f)

    print(f"  Matched pairs: {len(matched_data.get('matches', []))}")
    print(f"  New discoveries: {len(new_data.get('archives', []))}\n")

    return {
        'matched': matched_data.get('matches', []),
        'new_discoveries': new_data.get('archives', [])
    }


def enrich_isil_with_archivportal(isil_record: Dict, ap_record: Dict) -> Dict:
    """
    Enrich ISIL record with Archivportal-D metadata.

    ISIL data takes precedence (Tier 1), but we add missing fields
    from Archivportal-D where available. The input records are not
    mutated; a shallow copy is returned.
    """
    enriched = isil_record.copy()

    # Add Archivportal-D ID
    if ap_record.get('id'):
        enriched['archivportal_id'] = ap_record['id']
        enriched['archivportal_url'] = ap_record.get('profile_url')

    # Add archive type if missing
    if not enriched.get('institution_subtype') and ap_record.get('archive_type'):
        enriched['institution_subtype'] = ap_record['archive_type']

    # Add federal state if missing
    if not enriched.get('federal_state') and ap_record.get('federal_state'):
        enriched['federal_state'] = ap_record['federal_state']

    # Add coordinates only if the ISIL record has none (ISIL wins otherwise)
    if ap_record.get('latitude') and ap_record.get('longitude'):
        if not enriched.get('latitude'):
            enriched['latitude'] = ap_record['latitude']
            enriched['longitude'] = ap_record['longitude']
            enriched['coordinate_source'] = 'archivportal-d'

    # Add thumbnail
    if ap_record.get('thumbnail'):
        enriched['thumbnail'] = ap_record['thumbnail']

    # Mark as enriched
    enriched['data_sources'] = ['ISIL', 'Archivportal-D']
    enriched['enriched_from_archivportal'] = True

    return enriched


def convert_archivportal_to_unified(ap_record: Dict) -> Dict:
    """
    Convert Archivportal-D record to unified format.

    These are new discoveries not in ISIL registry.
    """
    return {
        'id': ap_record.get('id'),
        'institution_name': ap_record.get('name'),
        'city': ap_record.get('location'),
        'federal_state': ap_record.get('federal_state'),
        'institution_type': 'ARCHIVE',
        'institution_subtype': ap_record.get('archive_type'),
        'isil_code': ap_record.get('isil'),  # May be None
        'latitude': ap_record.get('latitude'),
        'longitude': ap_record.get('longitude'),
        'thumbnail': ap_record.get('thumbnail'),
        'archivportal_id': ap_record.get('id'),
        'archivportal_url': ap_record.get('profile_url'),
        'data_sources': ['Archivportal-D'],
        'data_tier': 'TIER_2_VERIFIED',  # From official portal
        'needs_isil_assignment': not ap_record.get('isil')
    }


def create_unified_dataset(
    isil_records: List[Dict],
    merge_results: Dict
) -> List[Dict]:
    """
    Create unified dataset combining all sources.

    Order of assembly:
    1. Matched institutions (ISIL base, Archivportal-D enrichment)
    2. ISIL-only records (no Archivportal-D match)
    3. Archivportal-D-only new discoveries
    """
    print("Creating unified dataset...")

    unified = []

    # 1. Add matched institutions (ISIL + Archivportal enrichment)
    matched_isil_codes = set()
    for match in merge_results['matched']:
        isil_record = match['isil']
        ap_record = match['archivportal']
        enriched = enrich_isil_with_archivportal(isil_record, ap_record)
        unified.append(enriched)
        matched_isil_codes.add(isil_record.get('isil_code'))

    print(f"  Added {len(unified)} enriched records (matched)")

    # 2. Add ISIL-only records (not in Archivportal)
    isil_only_count = 0
    for isil_record in isil_records:
        if isil_record.get('isil_code') not in matched_isil_codes:
            record = isil_record.copy()
            record['data_sources'] = ['ISIL']
            unified.append(record)
            isil_only_count += 1

    print(f"  Added {isil_only_count} ISIL-only records")

    # 3. Add new discoveries from Archivportal
    new_count = 0
    for ap_record in merge_results['new_discoveries']:
        unified_record = convert_archivportal_to_unified(ap_record)
        unified.append(unified_record)
        new_count += 1

    print(f"  Added {new_count} new discoveries (Archivportal-only)")
    print(f"\nTotal unified records: {len(unified)}\n")

    return unified


def generate_statistics(unified: List[Dict]) -> Dict:
    """Generate comprehensive statistics and print a human-readable report.

    Safe on an empty dataset (percentages are reported against a
    denominator of 1 to avoid ZeroDivisionError).
    """
    stats = {
        'total': len(unified),
        'by_type': {},
        'by_state': {},
        'by_source': {},
        'data_completeness': {
            'with_isil': 0,
            'with_coordinates': 0,
            'with_website': 0,
            'with_email': 0,
            'enriched_from_archivportal': 0,
            'needs_isil_assignment': 0
        }
    }

    for record in unified:
        # By type
        inst_type = record.get('institution_type', 'Unknown')
        stats['by_type'][inst_type] = stats['by_type'].get(inst_type, 0) + 1

        # By state
        state = record.get('federal_state', 'Unknown')
        stats['by_state'][state] = stats['by_state'].get(state, 0) + 1

        # By source
        sources = ','.join(record.get('data_sources', ['Unknown']))
        stats['by_source'][sources] = stats['by_source'].get(sources, 0) + 1

        # Completeness
        if record.get('isil_code'):
            stats['data_completeness']['with_isil'] += 1
        if record.get('latitude'):
            stats['data_completeness']['with_coordinates'] += 1
        if record.get('website'):
            stats['data_completeness']['with_website'] += 1
        if record.get('email'):
            stats['data_completeness']['with_email'] += 1
        if record.get('enriched_from_archivportal'):
            stats['data_completeness']['enriched_from_archivportal'] += 1
        if record.get('needs_isil_assignment'):
            stats['data_completeness']['needs_isil_assignment'] += 1

    # Guard against empty dataset: percentages over zero records would
    # raise ZeroDivisionError.
    denom = stats['total'] or 1

    # Print report
    print(f"\n{'='*70}")
    print("UNIFIED DATASET STATISTICS")
    print(f"{'='*70}")
    print(f"\nTotal institutions: {stats['total']}")

    print("\nBy institution type:")
    for inst_type, count in sorted(stats['by_type'].items(),
                                   key=lambda x: x[1], reverse=True):
        pct = count / denom * 100
        print(f"  - {inst_type}: {count} ({pct:.1f}%)")

    print("\nBy data source:")
    for source, count in sorted(stats['by_source'].items(),
                                key=lambda x: x[1], reverse=True):
        pct = count / denom * 100
        print(f"  - {source}: {count} ({pct:.1f}%)")

    print("\nData completeness:")
    for metric, count in stats['data_completeness'].items():
        pct = count / denom * 100
        print(f"  - {metric.replace('_', ' ').title()}: {count} ({pct:.1f}%)")

    print("\nTop 10 federal states:")
    for state, count in sorted(stats['by_state'].items(),
                               key=lambda x: x[1], reverse=True)[:10]:
        pct = count / denom * 100
        print(f"  - {state}: {count} ({pct:.1f}%)")

    print(f"{'='*70}\n")

    return stats


def save_unified_dataset(unified: List[Dict], stats: Dict):
    """Save unified dataset as timestamped JSON (with metadata) and JSONL.

    Returns:
        Path to the JSON output file.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = OUTPUT_DIR / f"german_unified_{timestamp}.json"

    output = {
        'metadata': {
            'description': 'Unified German heritage institution dataset',
            'sources': [
                'ISIL Registry (16,979 institutions)',
                'Archivportal-D via DDB API (~10,000-20,000 archives)'
            ],
            # UTC timestamp in '...Z' form; datetime.utcnow() is deprecated,
            # so derive it from an aware datetime instead.
            'creation_date': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
            'total_institutions': len(unified),
            'coverage': 'Germany - all heritage sectors',
            'data_tiers': 'TIER_1 (ISIL), TIER_2 (Archivportal-D)',
            'license': 'CC0 1.0 Universal (Public Domain)'
        },
        'statistics': stats,
        'institutions': unified
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"✓ Saved unified dataset: {output_file}")
    print(f"  File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB\n")

    # Also save as JSONL for easier processing
    jsonl_file = OUTPUT_DIR / f"german_unified_{timestamp}.jsonl"
    with open(jsonl_file, 'w', encoding='utf-8') as f:
        for record in unified:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

    print(f"✓ Saved JSONL format: {jsonl_file}")
    print(f"  File size: {jsonl_file.stat().st_size / 1024 / 1024:.2f} MB\n")

    return output_file


def main():
    """Main execution."""
    print(f"\n{'#'*70}")
    print("# Create Unified German Heritage Dataset")
    print(f"{'#'*70}\n")

    # Load data
    isil_records = load_isil_data()
    merge_results = load_merge_results()

    # Create unified dataset
    unified = create_unified_dataset(isil_records, merge_results)

    # Generate statistics
    stats = generate_statistics(unified)

    # Save
    output_file = save_unified_dataset(unified, stats)

    print("✓ Unified dataset creation complete!\n")
    print("Summary:")
    print(f"  - Total institutions: {len(unified)}")
    print(f"  - ISIL records: {sum(1 for r in unified if 'ISIL' in r.get('data_sources', []))}")
    print(f"  - Archivportal enriched: {sum(1 for r in unified if r.get('enriched_from_archivportal'))}")
    print(f"  - New discoveries: {sum(1 for r in unified if r.get('data_sources') == ['Archivportal-D'])}")
    print(f"  - With ISIL codes: {sum(1 for r in unified if r.get('isil_code'))}")
    print(f"  - Needing ISIL: {sum(1 for r in unified if r.get('needs_isil_assignment'))}")
    print("\nNext steps:")
    print(f"  1. Review unified dataset: {output_file}")
    print("  2. Convert to LinkML format")
    print("  3. Generate GHCIDs")
    print("  4. Export to RDF/CSV/Parquet\n")


if __name__ == "__main__":
    main()