glam/scripts/merge_sachsen_anhalt_datasets.py
2025-11-21 22:12:33 +01:00

156 lines
5.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Merge Sachsen-Anhalt Datasets
Combines archives + museums for comprehensive regional coverage
"""
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any
def load_json(file_path: Path) -> List[Dict[str, Any]]:
    """Read a JSON file and normalize its payload to a list of records.

    Accepts three layouts: a wrapper dict carrying an 'archives' list,
    a bare list of records, or a single record dict (wrapped into a
    one-element list). Anything else yields an empty list.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    # A list is already in the shape we want.
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict):
        # Wrapped format (metadata + 'archives') takes precedence;
        # otherwise treat the dict as a single institution record.
        return payload['archives'] if 'archives' in payload else [payload]
    return []
def merge_datasets(data_dir: Path = Path('data/isil/germany')) -> List[Dict[str, Any]]:
    """Merge all Sachsen-Anhalt datasets (archives + museums).

    Args:
        data_dir: Directory containing the ISIL Germany JSON exports.
            Defaults to the repository layout used by the other scripts;
            parameterized so the merge can run against alternate or test
            directories.

    Returns:
        Combined list of institution records: archives normalized to the
        LinkML-style dict layout, museums passed through as loaded.
    """
    # Load archives (4 institutions)
    archives_file = data_dir / 'sachsen_anhalt_archives_20251120_131330.json'
    archives_raw = load_json(archives_file) if archives_file.exists() else []
    # One timestamp for the whole run so every normalized record carries
    # identical provenance (and we avoid a clock call per record).
    extraction_date = datetime.now(timezone.utc).isoformat()
    # Normalize archives to LinkML format
    archives = []
    for arch in archives_raw:
        normalized = {
            'name': arch.get('name', 'Unknown'),
            'institution_type': 'ARCHIVE',
            'description': arch.get('description', ''),
            'locations': [{
                'city': arch.get('city', ''),
                'region': arch.get('region', 'Sachsen-Anhalt'),
                'country': arch.get('country', 'DE')
            }],
            'identifiers': [],
            'provenance': {
                'data_source': 'WEBSITE_SCRAPING',
                'data_tier': 'TIER_2_VERIFIED',
                'extraction_date': extraction_date,
                'extraction_method': 'Landesarchiv Sachsen-Anhalt website',
                'confidence_score': 0.95,
                'source_url': arch.get('url', '')
            }
        }
        # The website URL doubles as the only identifier we can extract here.
        if arch.get('url'):
            normalized['identifiers'].append({
                'identifier_scheme': 'Website',
                'identifier_value': arch['url'],
                'identifier_url': arch['url']
            })
        archives.append(normalized)
    print(f"Loaded {len(archives)} archives")
    # Load museums (162 institutions); reverse sort picks the newest export.
    museum_files = sorted(data_dir.glob('sachsen_anhalt_museums_*.json'), reverse=True)
    museums = load_json(museum_files[0]) if museum_files else []
    print(f"Loaded {len(museums)} museums")
    # Combine all institutions
    all_institutions = archives + museums
    print(f"\nTotal: {len(all_institutions)} institutions")
    return all_institutions
def generate_statistics(institutions: List[Dict[str, Any]]) -> None:
    """Print dataset statistics: type breakdown, completeness, geography.

    Args:
        institutions: LinkML-style institution dicts as produced by
            merge_datasets(). Records missing 'institution_type' or a
            non-empty 'locations' list are tolerated.
    """
    print("\n" + "=" * 80)
    print("DATASET STATISTICS")
    print("=" * 80)
    total = len(institutions)
    if total == 0:
        # Guard: every percentage below divides by len(institutions);
        # without this an empty dataset raised ZeroDivisionError.
        print("\nNo institutions to report.")
        return

    def _first_location(inst: Dict[str, Any]) -> Dict[str, Any]:
        # Some records may lack a 'locations' list (or carry an empty one);
        # the original `inst['locations'][0]` crashed on those.
        locs = inst.get('locations')
        return locs[0] if locs else {}

    # Institution types
    type_counts: Dict[str, int] = {}
    for inst in institutions:
        inst_type = inst.get('institution_type', 'UNKNOWN')
        type_counts[inst_type] = type_counts.get(inst_type, 0) + 1
    print("\nInstitution Types:")
    for inst_type, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {inst_type}: {count}")
    # Data completeness
    has_city = sum(1 for i in institutions if _first_location(i).get('city'))
    has_address = sum(1 for i in institutions if _first_location(i).get('street_address'))
    has_description = sum(1 for i in institutions if i.get('description'))
    has_website = sum(1 for i in institutions if i.get('identifiers'))
    print("\nData Completeness:")
    print(f" Name: {total}/{total} (100%)")
    print(f" City: {has_city}/{total} ({has_city/total*100:.1f}%)")
    print(f" Street Address: {has_address}/{total} ({has_address/total*100:.1f}%)")
    print(f" Description: {has_description}/{total} ({has_description/total*100:.1f}%)")
    print(f" Website: {has_website}/{total} ({has_website/total*100:.1f}%)")
    # Cities with coverage
    city_counts: Dict[str, int] = {}
    for inst in institutions:
        city = _first_location(inst).get('city', 'Unknown')
        if city:
            city_counts[city] = city_counts.get(city, 0) + 1
    print(f"\nGeographic Coverage:")
    print(f" {len(city_counts)} cities/towns")
    print("\n Top 10 Cities:")
    for city, count in sorted(city_counts.items(), key=lambda x: -x[1])[:10]:
        print(f" {city}: {count}")
def main():
    """Entry point: merge the datasets, print statistics, save the result."""
    banner = "=" * 80
    print(banner)
    print("Merge Sachsen-Anhalt Datasets")
    print(banner)
    print()
    # Build the combined institution list.
    institutions = merge_datasets()
    # Report on what we merged.
    generate_statistics(institutions)
    # Persist the merged dataset under a timestamped filename.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    out_file = Path('data/isil/germany') / f'sachsen_anhalt_merged_{stamp}.json'
    with open(out_file, 'w', encoding='utf-8') as handle:
        json.dump(institutions, handle, ensure_ascii=False, indent=2)
    size_kb = out_file.stat().st_size / 1024
    print()
    print(banner)
    print(f"✅ Saved to: {out_file}")
    print(f" File size: {size_kb:.1f} KB")
    print(f" Total institutions: {len(institutions)}")
    print(banner)


if __name__ == '__main__':
    main()