#!/usr/bin/env python3
"""
Merge all Bavaria heritage institution sources into unified dataset.

Combines:
- Bavarian State Archives (8 institutions)
- Bavarian University Libraries (6 institutions)
- Bavaria Museums from isil.museum registry (~1,231 institutions)

Total: ~1,245 institutions
"""

import json
from collections import Counter
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any, Optional

# Directory that holds the per-source input files and receives the merged output.
DATA_DIR = Path('data/isil/germany')

# Completeness keys, in the order they appear in the returned statistics.
_COMPLETENESS_FIELDS = (
    'name', 'institution_type', 'city', 'street_address', 'postal_code',
    'phone', 'email', 'website', 'isil_code', 'description', 'wikidata',
    'viaf',
)

# Fields read directly from the institution record.
_CORE_FIELDS = ('name', 'institution_type', 'description')

# Fields read from the institution's first location entry.
_LOCATION_FIELDS = ('city', 'street_address', 'postal_code', 'email', 'phone')

# Lower-cased identifier scheme -> completeness key it contributes to.
_SCHEME_FIELDS = {
    'website': 'website',
    'isil': 'isil_code',
    'wikidata': 'wikidata',
    'viaf': 'viaf',
}

# Report layout: (section heading, ((label, completeness key), ...)).
_REPORT_SECTIONS = (
    ('Core Fields', (
        ('Name', 'name'),
        ('Institution Type', 'institution_type'),
        ('Description', 'description'),
    )),
    ('Location Fields', (
        ('City', 'city'),
        ('Street Address', 'street_address'),
        ('Postal Code', 'postal_code'),
    )),
    ('Contact Fields', (
        ('Phone', 'phone'),
        ('Email', 'email'),
        ('Website', 'website'),
    )),
    ('Identifiers', (
        ('ISIL Code', 'isil_code'),
        ('Wikidata ID', 'wikidata'),
        ('VIAF ID', 'viaf'),
    )),
)


def _primary_location(institution: Dict[str, Any]) -> Dict[str, Any]:
    """Return the institution's first location dict, or {} if it has none."""
    locations = institution.get('locations')
    if isinstance(locations, list) and locations and isinstance(locations[0], dict):
        return locations[0]
    return {}


def _identifier_schemes(institution: Dict[str, Any]) -> set:
    """Return the set of lower-cased identifier schemes present on a record."""
    return {
        str(ident.get('identifier_scheme') or '').lower()
        for ident in institution.get('identifiers') or []
        if isinstance(ident, dict)
    }


def load_json(filepath: Path) -> List[Dict[str, Any]]:
    """Load JSON file and return list of institutions.

    Args:
        filepath: Path of a UTF-8 encoded JSON file whose top-level value is
            a list of institution records.

    Returns:
        The decoded list.

    Raises:
        ValueError: If the top-level JSON value is not a list. Without this
            check a JSON object would be merged as a sequence of its keys.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError(
            f"Expected a JSON list of institutions in {filepath}, "
            f"got {type(data).__name__}"
        )
    return data


def merge_bayern_sources(data_dir: Optional[Path] = None) -> List[Dict[str, Any]]:
    """Merge all Bavaria sources into unified dataset.

    For each source (archives, libraries, museums) the lexicographically last
    matching file in the data directory is loaded; the three lists are
    concatenated, sorted by city and then name, and every record that carries
    a ``provenance`` dict is stamped with merge metadata.

    Args:
        data_dir: Directory containing the ``bayern_*_*.json`` source files.
            Defaults to ``data/isil/germany``.

    Returns:
        The merged, sorted list of institution records.

    Raises:
        FileNotFoundError: If any of the three sources has no matching file.
    """
    data_dir = DATA_DIR if data_dir is None else Path(data_dir)

    # Find latest files for each source
    archives_files = sorted(data_dir.glob('bayern_archives_*.json'))
    libraries_files = sorted(data_dir.glob('bayern_libraries_*.json'))
    museum_files = sorted(data_dir.glob('bayern_museums_*.json'))

    if not archives_files:
        raise FileNotFoundError("No archives file found")
    if not libraries_files:
        raise FileNotFoundError("No libraries file found")
    if not museum_files:
        raise FileNotFoundError("No museums file found")

    # Load latest files
    archives = load_json(archives_files[-1])
    libraries = load_json(libraries_files[-1])
    museums = load_json(museum_files[-1])

    print(f"\nLoading sources:")
    print(f" Archives: {len(archives):4d} institutions")
    print(f" Libraries: {len(libraries):4d} institutions")
    print(f" Museums: {len(museums):4d} institutions")
    print(f" {'─' * 50}")
    print(f" Total: {len(archives) + len(libraries) + len(museums):4d} institutions")

    # Merge all institutions
    merged = [*archives, *libraries, *museums]

    # Sort by city, then by name. Missing/empty locations and None values
    # sort as the empty string instead of raising.
    merged.sort(key=lambda inst: (
        str(_primary_location(inst).get('city') or ''),
        str(inst.get('name') or ''),
    ))

    # Add merge metadata to provenance. The timestamp is taken once so every
    # record produced by this run carries the same merge_date.
    merge_date = datetime.now(timezone.utc).isoformat()
    for institution in merged:
        provenance = institution.get('provenance')
        if isinstance(provenance, dict):
            provenance['merged_dataset'] = 'bayern_complete'
            provenance['merge_date'] = merge_date

    return merged


def calculate_completeness(institutions: List[Dict[str, Any]]) -> Dict[str, float]:
    """Calculate metadata completeness statistics.

    Args:
        institutions: Institution records to analyse.

    Returns:
        Mapping from field name to the percentage (0-100) of institutions
        that have a truthy value for it. Location and contact fields are read
        from the first location entry; website, ISIL, Wikidata and VIAF are
        detected from the identifier schemes. An empty input yields ``{}``.
    """
    total = len(institutions)
    if total == 0:
        return {}

    stats = dict.fromkeys(_COMPLETENESS_FIELDS, 0)

    for inst in institutions:
        for field in _CORE_FIELDS:
            if inst.get(field):
                stats[field] += 1

        # Location and contact fields (both stored on the first location)
        location = _primary_location(inst)
        for field in _LOCATION_FIELDS:
            if location.get(field):
                stats[field] += 1

        # Identifiers: count each scheme at most once per institution so a
        # record with several identifiers of one scheme cannot push the
        # percentage above 100.
        schemes = _identifier_schemes(inst)
        for scheme, field in _SCHEME_FIELDS.items():
            if scheme in schemes:
                stats[field] += 1

    # Calculate percentages
    return {field: count / total * 100 for field, count in stats.items()}


def print_statistics(institutions: List[Dict[str, Any]]) -> None:
    """Print merge statistics.

    Prints the total count, a breakdown by institution type and by city, and
    the per-field completeness percentages with their average. An empty
    dataset is reported with 0.0% everywhere rather than raising.

    Args:
        institutions: The merged institution records.
    """
    print("\n" + "=" * 80)
    print("Bavaria Complete Dataset - Merge Statistics")
    print("=" * 80)

    # Count by type (missing or None types are grouped under UNKNOWN)
    type_counts = Counter(
        str(inst.get('institution_type') or 'UNKNOWN') for inst in institutions
    )

    print(f"\nTotal Institutions: {len(institutions)}")
    print("\nBy Type:")
    for inst_type, count in sorted(type_counts.items()):
        print(f" {inst_type}: {count}")

    # Count by city; institutions without any location are not counted
    city_counts = Counter(
        _primary_location(inst).get('city') or 'Unknown'
        for inst in institutions
        if inst.get('locations')
    )

    print("\nBy City:")
    # most_common() orders by descending count, ties in first-seen order
    for city, count in city_counts.most_common():
        print(f" {city}: {count}")

    # Completeness statistics
    completeness = calculate_completeness(institutions)

    print("\n" + "=" * 80)
    print("Metadata Completeness")
    print("=" * 80)

    for heading, rows in _REPORT_SECTIONS:
        print(f"\n{heading}:")
        for label, field in rows:
            print(f" {label} : {completeness.get(field, 0.0):.1f}%")

    # Calculate average completeness (0.0 when there is nothing to average)
    avg = sum(completeness.values()) / len(completeness) if completeness else 0.0
    print(f"\nAverage Completeness: {avg:.1f}%")
    print("\n" + "=" * 80)


def main():
    """Main merge workflow.

    Merges the sources, prints statistics, and writes the merged dataset to
    a timestamped ``bayern_complete_*.json`` file in the data directory.
    """
    print("=" * 80)
    print("Bavaria Complete Dataset Merger")
    print("=" * 80)

    # Merge sources
    print("\nMerging sources...")
    merged = merge_bayern_sources()

    # Print statistics
    print_statistics(merged)

    # Export merged dataset
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    output_file = DATA_DIR / f'bayern_complete_{timestamp}.json'

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)

    print(f"\n✓ Exported to: {output_file}")
    print(f" File size: {output_file.stat().st_size:,} bytes")
    print(f" Institutions: {len(merged)}")
    print("\n" + "=" * 80)
    print("Merge complete!")
    print("=" * 80)


if __name__ == '__main__':
    main()