#!/usr/bin/env python3
"""
Merge Sachsen-Anhalt Datasets

Combines archives + museums for comprehensive regional coverage
"""

import json
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any
def load_json(file_path: Path) -> List[Dict[str, Any]]:
    """Read *file_path* and return its records as a list of dicts.

    Accepts three layouts: a wrapper dict holding the records under an
    ``archives`` key, a bare list of institutions, or a single
    institution dict (returned as a one-element list). Any other
    payload yields an empty list.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # Wrapped format: metadata object with the records under 'archives'.
    if isinstance(payload, dict) and 'archives' in payload:
        return payload['archives']
    # Plain list of institution records.
    if isinstance(payload, list):
        return payload
    # A lone institution record — normalize to a list.
    if isinstance(payload, dict):
        return [payload]
    return []
def merge_datasets(data_dir: Path = Path('data/isil/germany')) -> List[Dict[str, Any]]:
    """Merge all Sachsen-Anhalt datasets.

    Loads the newest archives export and the newest museums export from
    *data_dir*, normalizes the archive records to the LinkML institution
    format, and returns the combined list (archives first).

    Args:
        data_dir: Directory containing the exported JSON files.

    Returns:
        Normalized archive records followed by museum records.
    """
    # Pick the newest archives export by filename timestamp, mirroring the
    # museum lookup below (previously a single hard-coded filename, which
    # silently loaded nothing once a newer export appeared).
    archive_files = sorted(data_dir.glob('sachsen_anhalt_archives_*.json'), reverse=True)
    archives_raw = load_json(archive_files[0]) if archive_files else []

    # One extraction timestamp for the whole run (hoisted out of the loop
    # so all records from this merge share the same provenance time).
    extraction_date = datetime.now(timezone.utc).isoformat()

    # Normalize archives to LinkML format
    archives = []
    for arch in archives_raw:
        normalized = {
            'name': arch.get('name', 'Unknown'),
            'institution_type': 'ARCHIVE',
            'description': arch.get('description', ''),
            'locations': [{
                'city': arch.get('city', ''),
                'region': arch.get('region', 'Sachsen-Anhalt'),
                'country': arch.get('country', 'DE')
            }],
            'identifiers': [],
            'provenance': {
                'data_source': 'WEBSITE_SCRAPING',
                'data_tier': 'TIER_2_VERIFIED',
                'extraction_date': extraction_date,
                'extraction_method': 'Landesarchiv Sachsen-Anhalt website',
                'confidence_score': 0.95,
                'source_url': arch.get('url', '')
            }
        }

        # Record the website as an identifier when the source provides one.
        if arch.get('url'):
            normalized['identifiers'].append({
                'identifier_scheme': 'Website',
                'identifier_value': arch['url'],
                'identifier_url': arch['url']
            })

        archives.append(normalized)

    print(f"Loaded {len(archives)} archives")

    # Load museums — newest export wins, same policy as archives above.
    museum_files = sorted(data_dir.glob('sachsen_anhalt_museums_*.json'), reverse=True)
    museums = load_json(museum_files[0]) if museum_files else []
    print(f"Loaded {len(museums)} museums")

    # Combine all institutions
    all_institutions = archives + museums

    print(f"\nTotal: {len(all_institutions)} institutions")

    return all_institutions
def generate_statistics(institutions: List[Dict[str, Any]]) -> None:
    """Print summary statistics for the merged dataset.

    Reports institution-type counts, per-field completeness percentages,
    and geographic coverage by city.

    Args:
        institutions: Normalized institution records; each record is
            expected to have an 'institution_type' and a non-empty
            'locations' list.
    """
    print("\n" + "=" * 80)
    print("DATASET STATISTICS")
    print("=" * 80)

    total = len(institutions)
    if total == 0:
        # Guard: the completeness percentages below divide by the total,
        # so an empty dataset would raise ZeroDivisionError.
        print("\nNo institutions to analyze.")
        return

    # Institution types
    type_counts = {}
    for inst in institutions:
        inst_type = inst['institution_type']
        type_counts[inst_type] = type_counts.get(inst_type, 0) + 1

    print("\nInstitution Types:")
    for inst_type, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f"  {inst_type}: {count}")

    # Data completeness (count of records with a truthy value per field)
    has_city = sum(1 for i in institutions if i['locations'][0].get('city'))
    has_address = sum(1 for i in institutions if i['locations'][0].get('street_address'))
    has_description = sum(1 for i in institutions if i.get('description'))
    has_website = sum(1 for i in institutions if i.get('identifiers'))

    print("\nData Completeness:")
    print(f"  Name: {total}/{total} (100%)")
    print(f"  City: {has_city}/{total} ({has_city/total*100:.1f}%)")
    print(f"  Street Address: {has_address}/{total} ({has_address/total*100:.1f}%)")
    print(f"  Description: {has_description}/{total} ({has_description/total*100:.1f}%)")
    print(f"  Website: {has_website}/{total} ({has_website/total*100:.1f}%)")

    # Cities with coverage
    city_counts = {}
    for inst in institutions:
        city = inst['locations'][0].get('city', 'Unknown')
        if city:
            city_counts[city] = city_counts.get(city, 0) + 1

    print(f"\nGeographic Coverage:")
    print(f"  {len(city_counts)} cities/towns")

    print("\n  Top 10 Cities:")
    for city, count in sorted(city_counts.items(), key=lambda x: -x[1])[:10]:
        print(f"    {city}: {count}")
def main() -> None:
    """Merge the datasets, report statistics, and save the result.

    Writes the merged list to
    data/isil/germany/sachsen_anhalt_merged_<timestamp>.json.
    """
    print("=" * 80)
    print("Merge Sachsen-Anhalt Datasets")
    print("=" * 80)
    print()

    # Merge datasets
    merged_institutions = merge_datasets()

    # Generate statistics
    generate_statistics(merged_institutions)

    # Save merged dataset (local-time timestamp keeps filenames sortable)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = Path('data/isil/germany') / f'sachsen_anhalt_merged_{timestamp}.json'

    # Ensure the target directory exists; without this the open() below
    # raises FileNotFoundError on a fresh checkout.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged_institutions, f, ensure_ascii=False, indent=2)

    file_size_kb = output_path.stat().st_size / 1024

    print()
    print("=" * 80)
    print(f"✅ Saved to: {output_path}")
    print(f"   File size: {file_size_kb:.1f} KB")
    print(f"   Total institutions: {len(merged_institutions)}")
    print("=" * 80)


if __name__ == '__main__':
    main()
|