#!/usr/bin/env python3
"""
Merge all Bavaria heritage institution sources into unified dataset.

Combines:
- Bavarian State Archives (8 institutions)
- Bavarian University Libraries (6 institutions)
- Bavaria Museums from isil.museum registry (~1,231 institutions)

Total: ~1,245 institutions
"""

import json
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any


def load_json(filepath: Path) -> List[Dict[str, Any]]:
    """Parse *filepath* as UTF-8 JSON and return the institution list."""
    return json.loads(filepath.read_text(encoding='utf-8'))
def merge_bayern_sources() -> List[Dict[str, Any]]:
    """Merge all Bavaria sources into a unified, sorted dataset.

    Picks the latest timestamped export of each source category in
    data/isil/germany (filenames embed a timestamp, so lexicographic
    sort order is chronological), concatenates them, sorts by city then
    name, and stamps merge provenance onto every record that already
    carries a 'provenance' dict.

    Returns:
        List of institution dicts sorted by (first-location city, name).

    Raises:
        FileNotFoundError: if any of the three source exports is missing.
    """

    data_dir = Path('data/isil/germany')

    # Find latest files for each source; sorted() puts the newest
    # timestamped filename last.
    archives_files = sorted(data_dir.glob('bayern_archives_*.json'))
    libraries_files = sorted(data_dir.glob('bayern_libraries_*.json'))
    museum_files = sorted(data_dir.glob('bayern_museums_*.json'))

    if not archives_files:
        raise FileNotFoundError("No archives file found")
    if not libraries_files:
        raise FileNotFoundError("No libraries file found")
    if not museum_files:
        raise FileNotFoundError("No museums file found")

    # Load latest files
    archives = load_json(archives_files[-1])
    libraries = load_json(libraries_files[-1])
    museums = load_json(museum_files[-1])

    # FIX: plain string instead of an f-string with no placeholders.
    print("\nLoading sources:")
    print(f" Archives: {len(archives):4d} institutions")
    print(f" Libraries: {len(libraries):4d} institutions")
    print(f" Museums: {len(museums):4d} institutions")
    print(f" {'─' * 50}")
    print(f" Total: {len(archives) + len(libraries) + len(museums):4d} institutions")

    # Merge all institutions
    merged = archives + libraries + museums

    # Sort by first location's city, then by name. FIX: `or [{}]` also
    # covers an explicitly empty 'locations' list, which previously
    # raised IndexError (the default only applied when the key was absent).
    merged.sort(key=lambda x: (
        (x.get('locations') or [{}])[0].get('city', ''),
        x.get('name', '')
    ))

    # Add merge metadata to provenance. FIX: compute the timestamp once
    # so every record of the same merge run carries an identical
    # merge_date (previously datetime.now() was called per record).
    merge_date = datetime.now(timezone.utc).isoformat()
    for institution in merged:
        if 'provenance' in institution:
            institution['provenance']['merged_dataset'] = 'bayern_complete'
            institution['provenance']['merge_date'] = merge_date

    return merged
def calculate_completeness(institutions: List[Dict[str, Any]]) -> Dict[str, float]:
    """Calculate metadata completeness statistics.

    Address and contact fields are read from the FIRST entry of each
    institution's 'locations' list only; website/ISIL/Wikidata/VIAF are
    read from the 'identifiers' list (matched on 'identifier_scheme',
    case-insensitively).

    Returns:
        Mapping of field name -> percentage (0-100) of institutions
        that populate that field; {} for an empty input list.
    """

    total = len(institutions)
    if total == 0:
        return {}

    fields = (
        'name', 'institution_type', 'city', 'street_address',
        'postal_code', 'phone', 'email', 'website', 'isil_code',
        'description', 'wikidata', 'viaf',
    )
    stats = dict.fromkeys(fields, 0)

    for inst in institutions:
        # Top-level fields.
        for field in ('name', 'institution_type', 'description'):
            if inst.get(field):
                stats[field] += 1

        # Location and contact fields (stored on the first location).
        # FIX: the original checked inst.get('locations') twice in two
        # separate blocks; merged into one pass.
        if inst.get('locations'):
            loc = inst['locations'][0]
            for field in ('city', 'street_address', 'postal_code', 'email', 'phone'):
                if loc.get(field):
                    stats[field] += 1

        # Identifier-backed fields in a single pass (the original
        # iterated 'identifiers' twice). A website counts at most once
        # per institution; ISIL/Wikidata/VIAF count each occurrence —
        # exactly the original behavior.
        has_website = False
        for ident in inst.get('identifiers') or []:
            scheme = ident.get('identifier_scheme', '').lower()
            if scheme == 'website':
                if not has_website:
                    stats['website'] += 1
                    has_website = True
            elif scheme == 'isil':
                stats['isil_code'] += 1
            elif scheme == 'wikidata':
                stats['wikidata'] += 1
            elif scheme == 'viaf':
                stats['viaf'] += 1

    # Convert raw counts to percentages of the total institution count.
    return {k: (v / total * 100) for k, v in stats.items()}
def print_statistics(institutions: List[Dict[str, Any]]) -> None:
    """Print merge statistics for the Bavaria dataset.

    Reports counts by institution type and by city (first location
    entry only), then per-field metadata completeness percentages.
    """

    print("\n" + "=" * 80)
    # BUG FIX: heading said "Saxony" — this script merges Bavaria data
    # (see module docstring and the bayern_* source files).
    print("Bavaria Complete Dataset - Merge Statistics")
    print("=" * 80)

    # Count by type
    type_counts = {}
    for inst in institutions:
        inst_type = inst.get('institution_type', 'UNKNOWN')
        type_counts[inst_type] = type_counts.get(inst_type, 0) + 1

    print(f"\nTotal Institutions: {len(institutions)}")
    print("\nBy Type:")
    for inst_type, count in sorted(type_counts.items()):
        print(f" {inst_type}: {count}")

    # Count by city (first location entry only)
    city_counts = {}
    for inst in institutions:
        if inst.get('locations'):
            city = inst['locations'][0].get('city', 'Unknown')
            city_counts[city] = city_counts.get(city, 0) + 1

    print("\nBy City:")
    # Most populous cities first.
    for city, count in sorted(city_counts.items(), key=lambda x: -x[1]):
        print(f" {city}: {count}")

    # Completeness statistics
    completeness = calculate_completeness(institutions)

    print("\n" + "=" * 80)
    print("Metadata Completeness")
    print("=" * 80)

    # FIX: calculate_completeness returns {} for an empty dataset; the
    # original then raised KeyError (and ZeroDivisionError at the
    # average). Bail out gracefully instead.
    if not completeness:
        print("\nNo institutions loaded - completeness unavailable")
        print("\n" + "=" * 80)
        return

    print("\nCore Fields:")
    print(f" Name : {completeness['name']:.1f}%")
    print(f" Institution Type : {completeness['institution_type']:.1f}%")
    print(f" Description : {completeness['description']:.1f}%")

    print("\nLocation Fields:")
    print(f" City : {completeness['city']:.1f}%")
    print(f" Street Address : {completeness['street_address']:.1f}%")
    print(f" Postal Code : {completeness['postal_code']:.1f}%")

    print("\nContact Fields:")
    print(f" Phone : {completeness['phone']:.1f}%")
    print(f" Email : {completeness['email']:.1f}%")
    print(f" Website : {completeness['website']:.1f}%")

    print("\nIdentifiers:")
    print(f" ISIL Code : {completeness['isil_code']:.1f}%")
    print(f" Wikidata ID : {completeness['wikidata']:.1f}%")
    print(f" VIAF ID : {completeness['viaf']:.1f}%")

    # Unweighted mean across all tracked fields.
    avg = sum(completeness.values()) / len(completeness)
    print(f"\nAverage Completeness: {avg:.1f}%")

    print("\n" + "=" * 80)
def main():
    """Main merge workflow: merge sources, report stats, export JSON."""

    print("=" * 80)
    # BUG FIX: banner said "Saxony" — this script merges Bavaria data.
    print("Bavaria Complete Dataset Merger")
    print("=" * 80)

    # Merge sources
    print("\nMerging sources...")
    merged = merge_bayern_sources()

    # Print statistics
    print_statistics(merged)

    # Export merged dataset with a UTC timestamp in the filename.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    output_file = Path(f'data/isil/germany/bayern_complete_{timestamp}.json')
    # Robustness: ensure the target directory exists before writing.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)

    print(f"\n✓ Exported to: {output_file}")
    print(f" File size: {output_file.stat().st_size:,} bytes")
    print(f" Institutions: {len(merged)}")

    print("\n" + "=" * 80)
    print("Merge complete!")
    print("=" * 80)
# Script entry point: run the merge workflow when executed directly.
if __name__ == '__main__':
    main()