glam/scripts/merge_sachsen_anhalt_complete.py
2025-11-21 22:12:33 +01:00

178 lines
6.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Merge Sachsen-Anhalt Enriched Museums + Archives
Creates complete Sachsen-Anhalt dataset with full metadata
"""
import json
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
def convert_archive_to_linkml(archive: Dict[str, Any]) -> Dict[str, Any]:
    """Convert a flat archive record into the LinkML institution format.

    Maps the flat scraper fields (name, city, address_text, url, email,
    phone, ...) onto the nested LinkML shape used for museums, so both
    sources can be merged into one dataset.

    Args:
        archive: Flat archive record; all fields are optional.

    Returns:
        A LinkML-shaped dict with name, institution_type, description,
        a single location entry, an identifiers list, and provenance.
    """
    record: Dict[str, Any] = {
        "name": archive.get("name", ""),
        "institution_type": archive.get("institution_type", "ARCHIVE"),
        # The free-text address doubles as the description for archives.
        "description": archive.get("address_text", ""),
        "locations": [
            {
                "city": archive.get("city", ""),
                "region": archive.get("region", "Sachsen-Anhalt"),
                "country": archive.get("country", "DE"),
            }
        ],
        "identifiers": [],
        "provenance": archive.get("provenance", {}),
    }
    # Contact details become identifier entries, in a fixed order
    # (Website, Email, Phone). The URL is used verbatim as the
    # identifier URL; email and phone get a mailto:/tel: URI prefix.
    contact_fields = (
        ("url", "Website", ""),
        ("email", "Email", "mailto:"),
        ("phone", "Phone", "tel:"),
    )
    for field, scheme, uri_prefix in contact_fields:
        value = archive.get(field)
        if value:
            record["identifiers"].append({
                "identifier_scheme": scheme,
                "identifier_value": value,
                "identifier_url": f"{uri_prefix}{value}",
            })
    return record
def _load_latest(data_dir: Path, pattern: str, loading_label: str,
                 missing_msg: str) -> Optional[Any]:
    """Load the newest JSON file in *data_dir* matching *pattern*.

    "Newest" relies on the timestamp suffix in the filenames sorting
    lexicographically (reverse sort, take the first).

    Returns the parsed JSON payload, or None (after printing
    *missing_msg*) when no file matches.
    """
    candidates = sorted(data_dir.glob(pattern), reverse=True)
    if not candidates:
        print(missing_msg)
        return None
    path = candidates[0]
    print(f"Loading {loading_label}: {path.name}")
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _normalize_archives(archive_data: Any) -> List[Dict[str, Any]]:
    """Convert the raw archive payload into LinkML records.

    Accepts either a dict wrapper with an 'archives' list or a bare
    list of records; anything else yields an empty list.
    """
    if isinstance(archive_data, dict) and 'archives' in archive_data:
        raw = archive_data['archives']
    elif isinstance(archive_data, list):
        raw = archive_data
    else:
        raw = []
    return [convert_archive_to_linkml(a) for a in raw]


def _print_completeness(merged: List[Dict[str, Any]], total: int) -> None:
    """Print per-field completeness counts and percentages.

    *total* must be > 0 (the caller guards against an empty merge).
    """
    def count(pred) -> int:
        return sum(1 for inst in merged if pred(inst))

    def loc_has(field: str) -> int:
        # A record counts if any of its locations has a truthy value
        # for *field*.
        return count(lambda inst: bool(inst.get('locations'))
                     and any(loc.get(field) for loc in inst['locations']))

    def ident_has(scheme: str) -> int:
        # .get() instead of ['identifier_scheme'] so a malformed
        # identifier entry cannot raise KeyError.
        return count(lambda inst: bool(inst.get('identifiers'))
                     and any(i.get('identifier_scheme') == scheme
                             for i in inst['identifiers']))

    rows = [
        (" Name:", count(lambda inst: bool(inst.get('name')))),
        (" Type:", count(lambda inst: bool(inst.get('institution_type')))),
        (" Description:", count(lambda inst: bool(inst.get('description')))),
        (" City:", loc_has('city')),
        (" Street Address:", loc_has('street_address')),
        (" Postal Code:", loc_has('postal_code')),
        (" Website:", ident_has('Website')),
        (" Phone:", ident_has('Phone')),
        (" Email:", ident_has('Email')),
    ]
    print("Data Completeness:")
    for label, n in rows:
        print(f"{label} {n:3d}/{total} ({n/total*100:5.1f}%)")


def _print_breakdowns(merged: List[Dict[str, Any]]) -> None:
    """Print the institution-type distribution and city coverage."""
    type_counts = Counter(
        inst.get('institution_type', 'UNKNOWN') for inst in merged
    )
    print("Institution Types:")
    for itype, count in type_counts.most_common():
        print(f" {itype:20s}: {count:3d}")
    print()
    city_counts = Counter(
        loc.get('city', '')
        for inst in merged
        for loc in (inst.get('locations') or [])
        if loc.get('city', '')
    )
    print(f"Geographic Coverage: {len(city_counts)} cities")
    print()
    print("Top 20 Cities:")
    for city, count in city_counts.most_common(20):
        print(f" {city:35s}: {count:2d}")
    print()


def _save(merged: List[Dict[str, Any]], data_dir: Path) -> None:
    """Write the merged dataset to a timestamped JSON file and report it."""
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = data_dir / f'sachsen_anhalt_complete_{timestamp}.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    file_size_kb = output_path.stat().st_size / 1024
    print(f"✅ Saved to: {output_path}")
    print(f" File size: {file_size_kb:.1f} KB")
    print(f" Total institutions: {len(merged)}")


def main():
    """Merge the newest enriched-museum and archive JSON files for
    Sachsen-Anhalt into one LinkML dataset, print coverage statistics,
    and save the result as a timestamped file.
    """
    print("=" * 80)
    print("Merge Sachsen-Anhalt Complete Dataset")
    print("=" * 80)
    print()
    data_dir = Path('data/isil/germany')
    museums = _load_latest(
        data_dir, 'sachsen_anhalt_museums_enriched_*.json',
        'museums', '❌ No enriched museum files found')
    if museums is None:
        return
    archive_data = _load_latest(
        data_dir, 'sachsen_anhalt_archives_*.json',
        'archives', '❌ No archive files found')
    if archive_data is None:
        return
    archives = _normalize_archives(archive_data)
    print()
    print("Loaded:")
    print(f" Museums: {len(museums)}")
    print(f" Archives: {len(archives)}")
    print()
    merged = museums + archives
    total = len(merged)
    print(f"Total institutions: {total}")
    print()
    # Bug fix: the percentage formatting divided by zero when both
    # inputs parsed to empty lists; bail out gracefully instead.
    if total == 0:
        print("❌ Nothing to merge; aborting")
        return
    _print_completeness(merged, total)
    print()
    _print_breakdowns(merged)
    _save(merged, data_dir)
    print()
    print("=" * 80)
    print("Sachsen-Anhalt Complete Dataset Ready!")
    print("=" * 80)
# Script entry point.
if __name__ == "__main__":
    main()