#!/usr/bin/env python3
"""
Merge Sachsen-Anhalt Datasets

Combines archives + museums for comprehensive regional coverage.
"""

import json
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any


def load_json(file_path: Path) -> List[Dict[str, Any]]:
    """Load a JSON file and normalize its contents to a list of records.

    Supported layouts:
      * wrapped dict with an ``archives`` key -> returns ``data['archives']``
      * plain list                            -> returned as-is
      * single institution dict               -> wrapped in a one-element list

    Any other top-level type yields an empty list.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if isinstance(data, dict) and 'archives' in data:
        return data['archives']
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        return [data]
    return []


def _normalize_archive(arch: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a raw scraped archive record into the LinkML institution shape."""
    normalized = {
        'name': arch.get('name', 'Unknown'),
        'institution_type': 'ARCHIVE',
        'description': arch.get('description', ''),
        'locations': [{
            'city': arch.get('city', ''),
            'region': arch.get('region', 'Sachsen-Anhalt'),
            'country': arch.get('country', 'DE')
        }],
        'identifiers': [],
        'provenance': {
            'data_source': 'WEBSITE_SCRAPING',
            'data_tier': 'TIER_2_VERIFIED',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Landesarchiv Sachsen-Anhalt website',
            'confidence_score': 0.95,
            'source_url': arch.get('url', '')
        }
    }
    if arch.get('url'):
        normalized['identifiers'].append({
            'identifier_scheme': 'Website',
            'identifier_value': arch['url'],
            'identifier_url': arch['url']
        })
    return normalized


def merge_datasets() -> List[Dict[str, Any]]:
    """Merge all Sachsen-Anhalt datasets (archives + museums).

    Archives are normalized to the LinkML institution format; museums are
    assumed to already be in that format and are loaded from the newest
    matching file.
    """
    data_dir = Path('data/isil/germany')

    # Load archives (raw scrape; normalize each record)
    archives_file = data_dir / 'sachsen_anhalt_archives_20251120_131330.json'
    archives_raw = load_json(archives_file) if archives_file.exists() else []
    archives = [_normalize_archive(arch) for arch in archives_raw]
    print(f"Loaded {len(archives)} archives")

    # Load museums: pick the newest file by reverse-sorted timestamped name
    museum_files = sorted(data_dir.glob('sachsen_anhalt_museums_*.json'), reverse=True)
    museums = load_json(museum_files[0]) if museum_files else []
    print(f"Loaded {len(museums)} museums")

    all_institutions = archives + museums
    print(f"\nTotal: {len(all_institutions)} institutions")
    return all_institutions


def generate_statistics(institutions: List[Dict[str, Any]]) -> None:
    """Print dataset statistics: type counts, completeness, geographic coverage.

    Safe on an empty list (prints a notice instead of dividing by zero) and
    on records missing 'institution_type' or 'locations' keys.
    """
    print("\n" + "=" * 80)
    print("DATASET STATISTICS")
    print("=" * 80)

    total = len(institutions)
    if total == 0:
        # Guard: the percentage lines below divide by `total`
        print("\nNo institutions to report.")
        return

    # Institution types (museum records may lack the key; count as UNKNOWN)
    type_counts: Dict[str, int] = {}
    for inst in institutions:
        inst_type = inst.get('institution_type', 'UNKNOWN')
        type_counts[inst_type] = type_counts.get(inst_type, 0) + 1
    print("\nInstitution Types:")
    for inst_type, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f"  {inst_type}: {count}")

    # Data completeness — use .get chains so a missing/empty 'locations'
    # list does not raise
    def _loc(inst: Dict[str, Any]) -> Dict[str, Any]:
        locations = inst.get('locations') or [{}]
        return locations[0]

    has_city = sum(1 for i in institutions if _loc(i).get('city'))
    has_address = sum(1 for i in institutions if _loc(i).get('street_address'))
    has_description = sum(1 for i in institutions if i.get('description'))
    has_website = sum(1 for i in institutions if i.get('identifiers'))

    print("\nData Completeness:")
    print(f"  Name: {total}/{total} (100%)")
    print(f"  City: {has_city}/{total} ({has_city/total*100:.1f}%)")
    print(f"  Street Address: {has_address}/{total} ({has_address/total*100:.1f}%)")
    print(f"  Description: {has_description}/{total} ({has_description/total*100:.1f}%)")
    print(f"  Website: {has_website}/{total} ({has_website/total*100:.1f}%)")

    # Cities with coverage
    city_counts: Dict[str, int] = {}
    for inst in institutions:
        city = _loc(inst).get('city', 'Unknown')
        if city:
            city_counts[city] = city_counts.get(city, 0) + 1
    print("\nGeographic Coverage:")
    print(f"  {len(city_counts)} cities/towns")
    print("\n  Top 10 Cities:")
    for city, count in sorted(city_counts.items(), key=lambda x: -x[1])[:10]:
        print(f"    {city}: {count}")


def main() -> None:
    """Main execution: merge, report statistics, and save the combined set."""
    print("=" * 80)
    print("Merge Sachsen-Anhalt Datasets")
    print("=" * 80)
    print()

    merged_institutions = merge_datasets()
    generate_statistics(merged_institutions)

    # Save merged dataset with a local-time timestamp in the filename
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_dir = Path('data/isil/germany')
    output_dir.mkdir(parents=True, exist_ok=True)  # ensure target dir exists
    output_path = output_dir / f'sachsen_anhalt_merged_{timestamp}.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged_institutions, f, ensure_ascii=False, indent=2)

    file_size_kb = output_path.stat().st_size / 1024
    print()
    print("=" * 80)
    print(f"✅ Saved to: {output_path}")
    print(f"   File size: {file_size_kb:.1f} KB")
    print(f"   Total institutions: {len(merged_institutions)}")
    print("=" * 80)


if __name__ == '__main__':
    main()