#!/usr/bin/env python3
"""
Merge Sachsen-Anhalt Enriched Museums + Archives
Creates complete Sachsen-Anhalt dataset with full metadata
"""

import json
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
from collections import Counter

# Directory that is searched for the input files and receives the merged file.
DEFAULT_DATA_DIR = Path('data/isil/germany')


def convert_archive_to_linkml(archive: Dict[str, Any]) -> Dict[str, Any]:
    """Convert archive record from flat structure to LinkML format.

    Args:
        archive: Flat archive record. The keys read are ``name``,
            ``institution_type``, ``address_text``, ``city``, ``region``,
            ``country``, ``provenance``, ``url``, ``email`` and ``phone``;
            all of them are optional.

    Returns:
        A new dict with ``name``, ``institution_type``, ``description``,
        a single-entry ``locations`` list, an ``identifiers`` list (one entry
        per non-empty url/email/phone, in that order) and ``provenance``.
    """
    linkml_record: Dict[str, Any] = {
        "name": archive.get("name", ""),
        "institution_type": archive.get("institution_type", "ARCHIVE"),
        # NOTE(review): the description is filled from ``address_text``;
        # presumably the flat records carry no separate description - confirm.
        "description": archive.get("address_text", ""),
        "locations": [
            {
                "city": archive.get("city", ""),
                "region": archive.get("region", "Sachsen-Anhalt"),
                "country": archive.get("country", "DE"),
            }
        ],
        "identifiers": [],
        "provenance": archive.get("provenance", {}),
    }

    # Add URL as identifier
    if archive.get("url"):
        linkml_record["identifiers"].append({
            "identifier_scheme": "Website",
            "identifier_value": archive["url"],
            "identifier_url": archive["url"],
        })

    # Add email if present
    if archive.get("email"):
        linkml_record["identifiers"].append({
            "identifier_scheme": "Email",
            "identifier_value": archive["email"],
            "identifier_url": f"mailto:{archive['email']}",
        })

    # Add phone if present
    if archive.get("phone"):
        linkml_record["identifiers"].append({
            "identifier_scheme": "Phone",
            "identifier_value": archive["phone"],
            "identifier_url": f"tel:{archive['phone']}",
        })

    return linkml_record


def _load_json(path: Path) -> Any:
    """Read and parse a UTF-8 encoded JSON file."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _percentage(count: int, total: int) -> float:
    """Return ``count`` as a percentage of ``total`` (0.0 for an empty total)."""
    # An empty dataset must not abort the report with ZeroDivisionError.
    return count / total * 100 if total else 0.0


def _count_with_field(institutions: List[Dict[str, Any]], field: str) -> int:
    """Count institutions whose top-level ``field`` is truthy."""
    return sum(1 for inst in institutions if inst.get(field))


def _count_with_location_field(institutions: List[Dict[str, Any]], field: str) -> int:
    """Count institutions having at least one location with a truthy ``field``."""
    return sum(
        1
        for inst in institutions
        if inst.get('locations')
        and any(loc.get(field) for loc in inst['locations'])
    )


def _count_with_identifier(institutions: List[Dict[str, Any]], scheme: str) -> int:
    """Count institutions having at least one identifier of the given scheme."""
    # ``.get`` tolerates identifier entries that lack ``identifier_scheme``.
    return sum(
        1
        for inst in institutions
        if inst.get('identifiers')
        and any(i.get('identifier_scheme') == scheme for i in inst['identifiers'])
    )


def main(data_dir: Path = DEFAULT_DATA_DIR) -> None:
    """Main execution.

    Loads the newest (by reverse-sorted file name) enriched museum file and
    archive file from ``data_dir``, converts the archive records to LinkML
    format, prints completeness / type / city statistics and writes the merged
    list to ``sachsen_anhalt_complete_<timestamp>.json`` in the same directory.

    Args:
        data_dir: Directory holding the input files; also receives the output.
            Defaults to ``data/isil/germany``.

    Returns:
        None. Returns early (after printing a message) when an input file is
        missing or the museum file does not contain a JSON list.
    """
    data_dir = Path(data_dir)

    print("=" * 80)
    print("Merge Sachsen-Anhalt Complete Dataset")
    print("=" * 80)
    print()

    # Load enriched museums
    museum_files = sorted(data_dir.glob('sachsen_anhalt_museums_enriched_*.json'), reverse=True)
    if not museum_files:
        print("❌ No enriched museum files found")
        return

    museum_file = museum_files[0]
    print(f"Loading museums: {museum_file.name}")
    museums = _load_json(museum_file)
    if not isinstance(museums, list):
        # Merging below relies on list concatenation; fail with a clear message.
        print(f"❌ Expected a JSON list of museums in {museum_file.name}")
        return

    # Load archives
    archive_files = sorted(data_dir.glob('sachsen_anhalt_archives_*.json'), reverse=True)
    if not archive_files:
        print("❌ No archive files found")
        return

    archive_file = archive_files[0]
    print(f"Loading archives: {archive_file.name}")
    archive_data = _load_json(archive_file)

    # Convert archives to LinkML format; the file is either a bare list or a
    # dict wrapping the list under the 'archives' key.
    if isinstance(archive_data, dict) and 'archives' in archive_data:
        raw_archives = archive_data['archives']
    elif isinstance(archive_data, list):
        raw_archives = archive_data
    else:
        print(f"⚠️ Unrecognised archive file structure in {archive_file.name}; no archives loaded")
        raw_archives = []
    archives = [convert_archive_to_linkml(archive) for archive in raw_archives]

    print()
    print("Loaded:")
    print(f" Museums: {len(museums)}")
    print(f" Archives: {len(archives)}")
    print()

    # Merge
    merged = museums + archives
    total = len(merged)

    print(f"Total institutions: {total}")
    print()

    # Calculate completeness
    completeness = [
        ("Name", _count_with_field(merged, 'name')),
        ("Type", _count_with_field(merged, 'institution_type')),
        ("Description", _count_with_field(merged, 'description')),
        ("City", _count_with_location_field(merged, 'city')),
        ("Street Address", _count_with_location_field(merged, 'street_address')),
        ("Postal Code", _count_with_location_field(merged, 'postal_code')),
        ("Website", _count_with_identifier(merged, 'Website')),
        ("Phone", _count_with_identifier(merged, 'Phone')),
        ("Email", _count_with_identifier(merged, 'Email')),
    ]

    print("Data Completeness:")
    for label, count in completeness:
        print(f" {label}: {count:3d}/{total} ({_percentage(count, total):5.1f}%)")
    print()

    # Institution types; a missing or null type is reported as UNKNOWN so the
    # string format spec below never receives None.
    type_counts = Counter(
        str(inst.get('institution_type') or 'UNKNOWN') for inst in merged
    )

    print("Institution Types:")
    for itype, count in type_counts.most_common():
        print(f" {itype:20s}: {count:3d}")
    print()

    # Geographic coverage
    city_counts = Counter()
    for inst in merged:
        for loc in inst.get('locations') or []:
            city = loc.get('city', '')
            if city:
                city_counts[city] += 1

    print(f"Geographic Coverage: {len(city_counts)} cities")
    print()
    print("Top 20 Cities:")
    for city, count in city_counts.most_common(20):
        print(f" {city:35s}: {count:2d}")
    print()

    # Save complete dataset
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = data_dir / f'sachsen_anhalt_complete_{timestamp}.json'

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)

    file_size_kb = output_path.stat().st_size / 1024

    print(f"✅ Saved to: {output_path}")
    print(f" File size: {file_size_kb:.1f} KB")
    print(f" Total institutions: {total}")
    print()
    print("=" * 80)
    print("Sachsen-Anhalt Complete Dataset Ready!")
    print("=" * 80)


if __name__ == '__main__':
    main()