#!/usr/bin/env python3
"""
Create Unified German Heritage Institution Dataset

This script combines:
1. ISIL registry data (16,979 institutions - libraries, archives, museums)
2. Archivportal-D data (~10,000-20,000 archives)
3. Merge analysis (matched pairs + new discoveries)

Output: Comprehensive German heritage dataset (~25,000-27,000 institutions)

Deduplication strategy:
- Use matched pairs from merge script
- Prefer ISIL data for matched institutions (Tier 1)
- Enrich with Archivportal-D metadata where available
- Add new discoveries from Archivportal-D

Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
|
|
|
|
import json
from datetime import datetime
from datetime import timezone
from pathlib import Path
from typing import List, Dict, Optional
|
|
|
# Configuration
|
|
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
|
|
OUTPUT_DIR = DATA_DIR
|
|
|
|
|
|
def load_isil_data() -> List[Dict]:
    """Read the ISIL registry export and return its institution records."""
    isil_file = DATA_DIR / "german_isil_complete_20251119_134939.json"

    print(f"Loading ISIL data: {isil_file}")
    with open(isil_file, 'r', encoding='utf-8') as fh:
        payload = json.load(fh)

    # The export wraps the records in an 'institutions' key.
    records = payload.get('institutions', [])
    print(f" Loaded {len(records)} ISIL records\n")
    return records
|
|
|
|
|
|
def load_merge_results() -> Dict:
    """
    Locate and load the most recent merge-analysis output files.

    Raises:
        FileNotFoundError: if either merge output file is missing.
    """
    matched_candidates = list(DATA_DIR.glob("merged_matched_*.json"))
    discovery_candidates = list(DATA_DIR.glob("merged_new_discoveries_*.json"))

    if not matched_candidates or not discovery_candidates:
        raise FileNotFoundError(
            f"Merge results not found in {DATA_DIR}. "
            "Run merge_archivportal_isil.py first."
        )

    # Prefer the most recently modified file of each kind.
    def newest(paths):
        return max(paths, key=lambda p: p.stat().st_mtime)

    matched_file = newest(matched_candidates)
    new_file = newest(discovery_candidates)

    print("Loading merge results:")
    print(f" Matched: {matched_file}")
    print(f" New: {new_file}")

    with open(matched_file, 'r', encoding='utf-8') as fh:
        matched_payload = json.load(fh)
    with open(new_file, 'r', encoding='utf-8') as fh:
        discoveries_payload = json.load(fh)

    matches = matched_payload.get('matches', [])
    discoveries = discoveries_payload.get('archives', [])
    print(f" Matched pairs: {len(matches)}")
    print(f" New discoveries: {len(discoveries)}\n")

    return {
        'matched': matches,
        'new_discoveries': discoveries
    }
|
|
|
|
|
|
def enrich_isil_with_archivportal(isil_record: Dict, ap_record: Dict) -> Dict:
    """
    Merge Archivportal-D metadata into a copy of an ISIL record.

    ISIL fields take precedence (Tier 1); Archivportal-D only fills
    gaps (subtype, state, coordinates, thumbnail) and adds its own IDs.
    The input records are not mutated.
    """
    result = dict(isil_record)

    # Cross-reference to the Archivportal-D profile, when one exists.
    ap_id = ap_record.get('id')
    if ap_id:
        result['archivportal_id'] = ap_id
        result['archivportal_url'] = ap_record.get('profile_url')

    # Fill subtype / federal state only where the ISIL record lacks them.
    for target_key, source_key in (('institution_subtype', 'archive_type'),
                                   ('federal_state', 'federal_state')):
        if not result.get(target_key) and ap_record.get(source_key):
            result[target_key] = ap_record[source_key]

    # Adopt coordinates only when Archivportal has both values and the
    # ISIL record has none.
    has_ap_coords = ap_record.get('latitude') and ap_record.get('longitude')
    if has_ap_coords and not result.get('latitude'):
        result['latitude'] = ap_record['latitude']
        result['longitude'] = ap_record['longitude']
        result['coordinate_source'] = 'archivportal-d'

    if ap_record.get('thumbnail'):
        result['thumbnail'] = ap_record['thumbnail']

    # Flag the record as a two-source merge.
    result['data_sources'] = ['ISIL', 'Archivportal-D']
    result['enriched_from_archivportal'] = True

    return result
|
|
|
|
|
|
def convert_archivportal_to_unified(ap_record: Dict) -> Dict:
    """
    Map a raw Archivportal-D record onto the unified schema.

    Used for archives discovered in Archivportal-D that have no
    counterpart in the ISIL registry.
    """
    get = ap_record.get
    unified = {
        'id': get('id'),
        'institution_name': get('name'),
        'city': get('location'),
        'federal_state': get('federal_state'),
        'institution_type': 'ARCHIVE',
        'institution_subtype': get('archive_type'),
        'isil_code': get('isil'),  # May be None
        'latitude': get('latitude'),
        'longitude': get('longitude'),
        'thumbnail': get('thumbnail'),
        'archivportal_id': get('id'),
        'archivportal_url': get('profile_url'),
        'data_sources': ['Archivportal-D'],
        'data_tier': 'TIER_2_VERIFIED',  # From official portal
        # A missing ISIL means this institution still needs a code assigned.
        'needs_isil_assignment': not get('isil'),
    }
    return unified
|
|
|
|
|
|
def create_unified_dataset(
    isil_records: List[Dict],
    merge_results: Dict
) -> List[Dict]:
    """
    Create the unified dataset combining all sources.

    Combines three groups, in order:
      1. matched ISIL records enriched with Archivportal-D metadata,
      2. ISIL records with no Archivportal-D match,
      3. Archivportal-D discoveries absent from the ISIL registry.

    Args:
        isil_records: all records from the ISIL registry export.
        merge_results: dict with 'matched' pairs and 'new_discoveries'
            lists, as produced by load_merge_results().

    Returns:
        A single deduplicated list of unified institution records.
    """
    print("Creating unified dataset...")

    unified = []

    # 1. Add matched institutions (ISIL + Archivportal enrichment)
    matched_isil_codes = set()
    for match in merge_results['matched']:
        isil_record = match['isil']
        ap_record = match['archivportal']

        unified.append(enrich_isil_with_archivportal(isil_record, ap_record))

        # BUGFIX: only remember real codes. Previously a matched record
        # without an isil_code put None into this set, which made step 2
        # silently drop every code-less ISIL record.
        code = isil_record.get('isil_code')
        if code:
            matched_isil_codes.add(code)

    print(f" Added {len(unified)} enriched records (matched)")

    # 2. Add ISIL-only records (not in Archivportal). Records without a
    # code cannot have been matched, so they are always kept.
    isil_only_count = 0
    for isil_record in isil_records:
        code = isil_record.get('isil_code')
        if not code or code not in matched_isil_codes:
            record = isil_record.copy()
            record['data_sources'] = ['ISIL']
            unified.append(record)
            isil_only_count += 1

    print(f" Added {isil_only_count} ISIL-only records")

    # 3. Add new discoveries from Archivportal
    new_count = 0
    for ap_record in merge_results['new_discoveries']:
        unified.append(convert_archivportal_to_unified(ap_record))
        new_count += 1

    print(f" Added {new_count} new discoveries (Archivportal-only)")

    print(f"\nTotal unified records: {len(unified)}\n")

    return unified
|
|
|
|
|
|
def generate_statistics(unified: List[Dict]) -> Dict:
    """
    Generate comprehensive statistics and print a formatted report.

    Args:
        unified: the combined institution records.

    Returns:
        Dict with totals, per-type/state/source counts, and data
        completeness counters.
    """
    stats = {
        'total': len(unified),
        'by_type': {},
        'by_state': {},
        'by_source': {},
        'data_completeness': {
            'with_isil': 0,
            'with_coordinates': 0,
            'with_website': 0,
            'with_email': 0,
            'enriched_from_archivportal': 0,
            'needs_isil_assignment': 0
        }
    }

    for record in unified:
        # By type
        inst_type = record.get('institution_type', 'Unknown')
        stats['by_type'][inst_type] = stats['by_type'].get(inst_type, 0) + 1

        # By state
        state = record.get('federal_state', 'Unknown')
        stats['by_state'][state] = stats['by_state'].get(state, 0) + 1

        # By source (comma-joined so a source combination is one key)
        sources = ','.join(record.get('data_sources', ['Unknown']))
        stats['by_source'][sources] = stats['by_source'].get(sources, 0) + 1

        # Completeness counters
        if record.get('isil_code'):
            stats['data_completeness']['with_isil'] += 1
        if record.get('latitude'):
            stats['data_completeness']['with_coordinates'] += 1
        if record.get('website'):
            stats['data_completeness']['with_website'] += 1
        if record.get('email'):
            stats['data_completeness']['with_email'] += 1
        if record.get('enriched_from_archivportal'):
            stats['data_completeness']['enriched_from_archivportal'] += 1
        if record.get('needs_isil_assignment'):
            stats['data_completeness']['needs_isil_assignment'] += 1

    # BUGFIX: guard against an empty dataset. The completeness loop below
    # always runs (6 fixed keys), so dividing by stats['total'] raised
    # ZeroDivisionError when unified was empty; report 0% instead.
    denom = stats['total'] or 1

    # Print report
    print(f"\n{'='*70}")
    print("UNIFIED DATASET STATISTICS")
    print(f"{'='*70}")
    print(f"\nTotal institutions: {stats['total']}")

    print(f"\nBy institution type:")
    for inst_type, count in sorted(stats['by_type'].items(), key=lambda x: x[1], reverse=True):
        pct = count / denom * 100
        print(f" - {inst_type}: {count} ({pct:.1f}%)")

    print(f"\nBy data source:")
    for source, count in sorted(stats['by_source'].items(), key=lambda x: x[1], reverse=True):
        pct = count / denom * 100
        print(f" - {source}: {count} ({pct:.1f}%)")

    print(f"\nData completeness:")
    for metric, count in stats['data_completeness'].items():
        pct = count / denom * 100
        print(f" - {metric.replace('_', ' ').title()}: {count} ({pct:.1f}%)")

    print(f"\nTop 10 federal states:")
    for state, count in sorted(stats['by_state'].items(), key=lambda x: x[1], reverse=True)[:10]:
        pct = count / denom * 100
        print(f" - {state}: {count} ({pct:.1f}%)")

    print(f"{'='*70}\n")

    return stats
|
|
|
|
|
|
def save_unified_dataset(unified: List[Dict], stats: Dict):
    """
    Write the unified dataset to disk.

    Produces two timestamped files in OUTPUT_DIR:
      - german_unified_<ts>.json  (metadata + statistics + records)
      - german_unified_<ts>.jsonl (one record per line, for streaming)

    Args:
        unified: the combined institution records.
        stats: statistics dict from generate_statistics().

    Returns:
        Path to the JSON output file.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = OUTPUT_DIR / f"german_unified_{timestamp}.json"

    output = {
        'metadata': {
            'description': 'Unified German heritage institution dataset',
            'sources': [
                'ISIL Registry (16,979 institutions)',
                'Archivportal-D via DDB API (~10,000-20,000 archives)'
            ],
            # BUGFIX: datetime.utcnow() is naive and deprecated since
            # Python 3.12; use an aware UTC timestamp and keep the
            # 'Z' suffix instead of '+00:00'.
            'creation_date': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
            'total_institutions': len(unified),
            'coverage': 'Germany - all heritage sectors',
            'data_tiers': 'TIER_1 (ISIL), TIER_2 (Archivportal-D)',
            'license': 'CC0 1.0 Universal (Public Domain)'
        },
        'statistics': stats,
        'institutions': unified
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"✓ Saved unified dataset: {output_file}")
    print(f" File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB\n")

    # Also save as JSONL for easier processing
    jsonl_file = OUTPUT_DIR / f"german_unified_{timestamp}.jsonl"
    with open(jsonl_file, 'w', encoding='utf-8') as f:
        for record in unified:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

    print(f"✓ Saved JSONL format: {jsonl_file}")
    print(f" File size: {jsonl_file.stat().st_size / 1024 / 1024:.2f} MB\n")

    return output_file
|
|
|
|
|
|
def main():
    """Run the full pipeline: load inputs, merge, report, and save."""
    hashes = '#' * 70
    print(f"\n{hashes}")
    print("# Create Unified German Heritage Dataset")
    print(f"{hashes}\n")

    # Load both input datasets.
    isil_records = load_isil_data()
    merge_results = load_merge_results()

    # Combine everything into one deduplicated list.
    unified = create_unified_dataset(isil_records, merge_results)

    # Report statistics, then persist both output formats.
    stats = generate_statistics(unified)
    output_file = save_unified_dataset(unified, stats)

    # Counters for the closing summary.
    isil_sourced = sum(1 for r in unified if 'ISIL' in r.get('data_sources', []))
    enriched_count = sum(1 for r in unified if r.get('enriched_from_archivportal'))
    discovery_count = sum(1 for r in unified if r.get('data_sources') == ['Archivportal-D'])
    coded_count = sum(1 for r in unified if r.get('isil_code'))
    uncoded_count = sum(1 for r in unified if r.get('needs_isil_assignment'))

    print("✓ Unified dataset creation complete!\n")
    print("Summary:")
    print(f" - Total institutions: {len(unified)}")
    print(f" - ISIL records: {isil_sourced}")
    print(f" - Archivportal enriched: {enriched_count}")
    print(f" - New discoveries: {discovery_count}")
    print(f" - With ISIL codes: {coded_count}")
    print(f" - Needing ISIL: {uncoded_count}")
    print("\nNext steps:")
    print(f" 1. Review unified dataset: {output_file}")
    print(" 2. Convert to LinkML format")
    print(" 3. Generate GHCIDs")
    print(" 4. Export to RDF/CSV/Parquet\n")
|
|
|
|
|
|
# Entry point when executed as a script (no side effects on import).
if __name__ == "__main__":
    main()
|