# glam/scripts/merge_bayern_complete.py
# Snapshot: 2025-11-21 22:12:33 +01:00 — 235 lines, 7.6 KiB, Python
#!/usr/bin/env python3
"""
Merge all Bavaria heritage institution sources into unified dataset.
Combines:
- Bavarian State Archives (8 institutions)
- Bavarian University Libraries (6 institutions)
- Bavaria Museums from isil.museum registry (~1,231 institutions)
Total: ~1,245 institutions
"""
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any
def load_json(filepath: Path) -> List[Dict[str, Any]]:
    """Read *filepath* as UTF-8 JSON and return the parsed institution list."""
    with filepath.open(encoding='utf-8') as handle:
        return json.load(handle)
def merge_bayern_sources() -> List[Dict[str, Any]]:
    """Merge the latest Bavaria archives, libraries and museums exports.

    Finds the newest timestamped file of each source in ``data/isil/germany``,
    concatenates the three lists, sorts by (city, name) and stamps merge
    provenance onto each record that tracks provenance.

    Returns:
        The merged, sorted list of institution dicts.

    Raises:
        FileNotFoundError: If any of the three source files is missing.
    """
    data_dir = Path('data/isil/germany')

    # Timestamped filenames sort lexicographically, so the last glob hit
    # is the most recent export of each source.
    archives_files = sorted(data_dir.glob('bayern_archives_*.json'))
    libraries_files = sorted(data_dir.glob('bayern_libraries_*.json'))
    museum_files = sorted(data_dir.glob('bayern_museums_*.json'))
    if not archives_files:
        raise FileNotFoundError("No archives file found")
    if not libraries_files:
        raise FileNotFoundError("No libraries file found")
    if not museum_files:
        raise FileNotFoundError("No museums file found")

    archives = load_json(archives_files[-1])
    libraries = load_json(libraries_files[-1])
    museums = load_json(museum_files[-1])

    print("\nLoading sources:")
    print(f"  Archives:  {len(archives):4d} institutions")
    print(f"  Libraries: {len(libraries):4d} institutions")
    print(f"  Museums:   {len(museums):4d} institutions")
    # BUG FIX: the separator was "'' * 50" (an empty string, so nothing was
    # printed); draw a real 50-character rule instead.
    print(f"  {'-' * 50}")
    print(f"  Total:     {len(archives) + len(libraries) + len(museums):4d} institutions")

    merged: List[Dict[str, Any]] = []
    merged.extend(archives)
    merged.extend(libraries)
    merged.extend(museums)

    # Sort by city, then name; records without a location sort first ('').
    merged.sort(key=lambda inst: (
        inst.get('locations', [{}])[0].get('city', ''),
        inst.get('name', ''),
    ))

    # Compute the timestamp once so every record shares the same merge_date
    # (previously it was re-evaluated per record).
    merge_date = datetime.now(timezone.utc).isoformat()
    for institution in merged:
        if 'provenance' in institution:
            institution['provenance']['merged_dataset'] = 'bayern_complete'
            institution['provenance']['merge_date'] = merge_date
    return merged
def calculate_completeness(institutions: List[Dict[str, Any]]) -> Dict[str, float]:
    """Return per-field completeness percentages for *institutions*.

    Produces a mapping of field name -> percentage (0-100) of institutions
    carrying a non-empty value for that field. An empty input yields an
    empty dict.
    """
    total = len(institutions)
    if not institutions:
        return {}

    # Insertion order matches the reporting order used elsewhere.
    field_names = (
        'name', 'institution_type', 'city', 'street_address', 'postal_code',
        'phone', 'email', 'website', 'isil_code', 'description',
        'wikidata', 'viaf',
    )
    counts = dict.fromkeys(field_names, 0)

    for record in institutions:
        # Top-level fields.
        for top_field in ('name', 'institution_type', 'description'):
            if record.get(top_field):
                counts[top_field] += 1

        # Address and contact data both live on the first location entry.
        locations = record.get('locations')
        if locations:
            primary = locations[0]
            for loc_field in ('city', 'street_address', 'postal_code',
                              'email', 'phone'):
                if primary.get(loc_field):
                    counts[loc_field] += 1

        identifiers = record.get('identifiers')
        if identifiers:
            # Website: counted at most once per institution.
            if any(i.get('identifier_scheme', '').lower() == 'website'
                   for i in identifiers):
                counts['website'] += 1
            # ISIL / Wikidata / VIAF: counted per matching identifier entry
            # (mirrors the original tally, which has no early break).
            for ident in identifiers:
                scheme = ident.get('identifier_scheme', '').lower()
                if scheme == 'isil':
                    counts['isil_code'] += 1
                elif scheme == 'wikidata':
                    counts['wikidata'] += 1
                elif scheme == 'viaf':
                    counts['viaf'] += 1

    return {field: count / total * 100 for field, count in counts.items()}
def print_statistics(institutions: List[Dict[str, Any]]) -> None:
    """Print merge statistics: totals, type/city breakdowns, completeness.

    Args:
        institutions: Merged institution records (may be empty).
    """
    print("\n" + "=" * 80)
    # BUG FIX: the header said "Saxony" in this Bavaria (Bayern) script.
    print("Bavaria Complete Dataset - Merge Statistics")
    print("=" * 80)

    # Tally by institution type.
    type_counts: Dict[str, int] = {}
    for inst in institutions:
        inst_type = inst.get('institution_type', 'UNKNOWN')
        type_counts[inst_type] = type_counts.get(inst_type, 0) + 1
    print(f"\nTotal Institutions: {len(institutions)}")
    print("\nBy Type:")
    for inst_type, count in sorted(type_counts.items()):
        print(f"  {inst_type}: {count}")

    # Tally by city (first location only), most frequent first.
    city_counts: Dict[str, int] = {}
    for inst in institutions:
        if inst.get('locations'):
            city = inst['locations'][0].get('city', 'Unknown')
            city_counts[city] = city_counts.get(city, 0) + 1
    print("\nBy City:")
    for city, count in sorted(city_counts.items(), key=lambda x: -x[1]):
        print(f"  {city}: {count}")

    completeness = calculate_completeness(institutions)
    # BUG FIX: calculate_completeness returns {} for empty input, which
    # previously raised KeyError on the field lookups below — guard it.
    if not completeness:
        print("\nNo institutions: completeness statistics skipped.")
        return

    print("\n" + "=" * 80)
    print("Metadata Completeness")
    print("=" * 80)
    print("\nCore Fields:")
    print(f"  Name             : {completeness['name']:.1f}%")
    print(f"  Institution Type : {completeness['institution_type']:.1f}%")
    print(f"  Description      : {completeness['description']:.1f}%")
    print("\nLocation Fields:")
    print(f"  City             : {completeness['city']:.1f}%")
    print(f"  Street Address   : {completeness['street_address']:.1f}%")
    print(f"  Postal Code      : {completeness['postal_code']:.1f}%")
    print("\nContact Fields:")
    print(f"  Phone            : {completeness['phone']:.1f}%")
    print(f"  Email            : {completeness['email']:.1f}%")
    print(f"  Website          : {completeness['website']:.1f}%")
    print("\nIdentifiers:")
    print(f"  ISIL Code        : {completeness['isil_code']:.1f}%")
    print(f"  Wikidata ID      : {completeness['wikidata']:.1f}%")
    print(f"  VIAF ID          : {completeness['viaf']:.1f}%")

    # Unweighted mean over all tracked fields.
    avg = sum(completeness.values()) / len(completeness)
    print(f"\nAverage Completeness: {avg:.1f}%")
    print("\n" + "=" * 80)
def main() -> None:
    """Run the full merge workflow and write a timestamped output file."""
    print("=" * 80)
    # BUG FIX: the banner said "Saxony" in this Bavaria (Bayern) script.
    print("Bavaria Complete Dataset Merger")
    print("=" * 80)

    print("\nMerging sources...")
    merged = merge_bayern_sources()

    print_statistics(merged)

    # Timestamped filename keeps earlier merges around instead of overwriting.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    output_file = Path(f'data/isil/germany/bayern_complete_{timestamp}.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)

    print(f"\n✓ Exported to: {output_file}")
    print(f"  File size: {output_file.stat().st_size:,} bytes")
    print(f"  Institutions: {len(merged)}")
    print("\n" + "=" * 80)
    print("Merge complete!")
    print("=" * 80)


if __name__ == '__main__':
    main()