glam/scripts/scrapers/create_german_unified_dataset.py
2025-11-19 23:25:22 +01:00

343 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Create Unified German Heritage Institution Dataset
This script combines:
1. ISIL registry data (16,979 institutions - libraries, archives, museums)
2. Archivportal-D data (~10,000-20,000 archives)
3. Merge analysis (matched pairs + new discoveries)
Output: Comprehensive German heritage dataset (~25,000-27,000 institutions)
Deduplication strategy:
- Use matched pairs from merge script
- Prefer ISIL data for matched institutions (Tier 1)
- Enrich with Archivportal-D metadata where available
- Add new discoveries from Archivportal-D
Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
import json
from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime
# Configuration
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
OUTPUT_DIR = DATA_DIR
def load_isil_data() -> List[Dict]:
    """
    Load the most recent ISIL registry export from DATA_DIR.

    Previously a single timestamped filename was hard-coded, which broke
    as soon as a newer export was produced.  We now glob for all exports
    and pick the newest by mtime, mirroring load_merge_results().

    Returns:
        List of institution dicts from the export's 'institutions' key
        (empty list if the key is missing).

    Raises:
        FileNotFoundError: if no ISIL export is present in DATA_DIR.
    """
    candidates = list(DATA_DIR.glob("german_isil_complete_*.json"))
    if not candidates:
        raise FileNotFoundError(
            f"No german_isil_complete_*.json export found in {DATA_DIR}."
        )
    # Newest export wins, consistent with how merge results are selected.
    isil_file = max(candidates, key=lambda p: p.stat().st_mtime)
    print(f"Loading ISIL data: {isil_file}")
    with open(isil_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    institutions = data.get('institutions', [])
    print(f" Loaded {len(institutions)} ISIL records\n")
    return institutions
def load_merge_results() -> Dict:
    """
    Load the newest merge-analysis outputs produced by
    merge_archivportal_isil.py.

    Returns:
        Dict with 'matched' (list of ISIL/Archivportal pairs) and
        'new_discoveries' (Archivportal-only records).

    Raises:
        FileNotFoundError: if either result file is missing from DATA_DIR.
    """
    matched_candidates = sorted(DATA_DIR.glob("merged_matched_*.json"),
                                key=lambda p: p.stat().st_mtime)
    new_candidates = sorted(DATA_DIR.glob("merged_new_discoveries_*.json"),
                            key=lambda p: p.stat().st_mtime)
    if not matched_candidates or not new_candidates:
        raise FileNotFoundError(
            f"Merge results not found in {DATA_DIR}. "
            "Run merge_archivportal_isil.py first."
        )

    # The most recently modified file of each kind is the current run.
    matched_file = matched_candidates[-1]
    new_file = new_candidates[-1]
    print(f"Loading merge results:")
    print(f" Matched: {matched_file}")
    print(f" New: {new_file}")

    def _read_json(path):
        # Small local helper: both files are plain UTF-8 JSON documents.
        with open(path, 'r', encoding='utf-8') as fh:
            return json.load(fh)

    matched_data = _read_json(matched_file)
    new_data = _read_json(new_file)
    print(f" Matched pairs: {len(matched_data.get('matches', []))}")
    print(f" New discoveries: {len(new_data.get('archives', []))}\n")
    return {
        'matched': matched_data.get('matches', []),
        'new_discoveries': new_data.get('archives', []),
    }
def enrich_isil_with_archivportal(isil_record: Dict, ap_record: Dict) -> Dict:
    """
    Merge Archivportal-D metadata into a copy of an ISIL record.

    The ISIL registry remains the authoritative (Tier 1) source; values
    from Archivportal-D only fill in fields the ISIL record is missing.
    The input dicts are not mutated.
    """
    merged = dict(isil_record)

    # Cross-reference the Archivportal-D entry when it carries an ID.
    ap_id = ap_record.get('id')
    if ap_id:
        merged['archivportal_id'] = ap_id
        merged['archivportal_url'] = ap_record.get('profile_url')

    # Fill gaps without overwriting existing ISIL values.
    for target_field, source_field in (('institution_subtype', 'archive_type'),
                                       ('federal_state', 'federal_state')):
        if not merged.get(target_field) and ap_record.get(source_field):
            merged[target_field] = ap_record[source_field]

    # Coordinates: adopt Archivportal-D's pair only when ISIL has none.
    lat = ap_record.get('latitude')
    lon = ap_record.get('longitude')
    if lat and lon and not merged.get('latitude'):
        merged['latitude'] = lat
        merged['longitude'] = lon
        merged['coordinate_source'] = 'archivportal-d'

    # A thumbnail from the portal always supersedes whatever ISIL had.
    if ap_record.get('thumbnail'):
        merged['thumbnail'] = ap_record['thumbnail']

    # Mark the provenance of the combined record.
    merged['data_sources'] = ['ISIL', 'Archivportal-D']
    merged['enriched_from_archivportal'] = True
    return merged
def convert_archivportal_to_unified(ap_record: Dict) -> Dict:
    """
    Map a raw Archivportal-D record onto the unified dataset schema.

    Used for archives discovered via Archivportal-D that have no
    counterpart in the ISIL registry.
    """
    get = ap_record.get
    unified = {
        'id': get('id'),
        'institution_name': get('name'),
        'city': get('location'),
        'federal_state': get('federal_state'),
        'institution_type': 'ARCHIVE',
        'institution_subtype': get('archive_type'),
        'isil_code': get('isil'),  # may legitimately be None
        'latitude': get('latitude'),
        'longitude': get('longitude'),
        'thumbnail': get('thumbnail'),
        'archivportal_id': get('id'),
        'archivportal_url': get('profile_url'),
        'data_sources': ['Archivportal-D'],
        # Records come from the official portal, hence Tier 2.
        'data_tier': 'TIER_2_VERIFIED',
        # Flag records still waiting for an ISIL identifier.
        'needs_isil_assignment': not get('isil'),
    }
    return unified
def create_unified_dataset(
    isil_records: List[Dict],
    merge_results: Dict
) -> List[Dict]:
    """
    Combine matched, ISIL-only and Archivportal-only records into one list.

    Args:
        isil_records: full ISIL registry export.
        merge_results: dict with 'matched' pairs and 'new_discoveries',
            as produced by load_merge_results().

    Returns:
        Unified list of institution dicts, each tagged with 'data_sources'.
    """
    print("Creating unified dataset...")
    unified = []
    # 1. Matched institutions: ISIL record enriched with Archivportal data.
    matched_isil_codes = set()
    for match in merge_results['matched']:
        isil_record = match['isil']
        ap_record = match['archivportal']
        unified.append(enrich_isil_with_archivportal(isil_record, ap_record))
        matched_isil_codes.add(isil_record.get('isil_code'))
    # BUGFIX: a matched record without an isil_code put None into the set,
    # which then wrongly excluded every code-less ISIL record in step 2.
    matched_isil_codes.discard(None)
    print(f" Added {len(unified)} enriched records (matched)")
    # 2. ISIL-only records (no Archivportal counterpart).
    isil_only_count = 0
    for isil_record in isil_records:
        if isil_record.get('isil_code') not in matched_isil_codes:
            record = isil_record.copy()
            record['data_sources'] = ['ISIL']
            unified.append(record)
            isil_only_count += 1
    print(f" Added {isil_only_count} ISIL-only records")
    # 3. Archivportal-only discoveries, converted to the unified schema.
    new_count = 0
    for ap_record in merge_results['new_discoveries']:
        unified.append(convert_archivportal_to_unified(ap_record))
        new_count += 1
    print(f" Added {new_count} new discoveries (Archivportal-only)")
    print(f"\nTotal unified records: {len(unified)}\n")
    return unified
def generate_statistics(unified: List[Dict]) -> Dict:
    """
    Compute aggregate statistics for the unified dataset and print a report.

    Args:
        unified: list of institution dicts in the unified schema.

    Returns:
        Dict with 'total', per-type/state/source counts, and
        'data_completeness' counters.
    """
    stats = {
        'total': len(unified),
        'by_type': {},
        'by_state': {},
        'by_source': {},
        'data_completeness': {
            'with_isil': 0,
            'with_coordinates': 0,
            'with_website': 0,
            'with_email': 0,
            'enriched_from_archivportal': 0,
            'needs_isil_assignment': 0
        }
    }
    completeness = stats['data_completeness']
    for record in unified:
        # By type
        inst_type = record.get('institution_type', 'Unknown')
        stats['by_type'][inst_type] = stats['by_type'].get(inst_type, 0) + 1
        # By state
        state = record.get('federal_state', 'Unknown')
        stats['by_state'][state] = stats['by_state'].get(state, 0) + 1
        # By source (combined label, e.g. "ISIL,Archivportal-D")
        sources = ','.join(record.get('data_sources', ['Unknown']))
        stats['by_source'][sources] = stats['by_source'].get(sources, 0) + 1
        # Completeness counters
        if record.get('isil_code'):
            completeness['with_isil'] += 1
        if record.get('latitude'):
            completeness['with_coordinates'] += 1
        if record.get('website'):
            completeness['with_website'] += 1
        if record.get('email'):
            completeness['with_email'] += 1
        if record.get('enriched_from_archivportal'):
            completeness['enriched_from_archivportal'] += 1
        if record.get('needs_isil_assignment'):
            completeness['needs_isil_assignment'] += 1
    # BUGFIX: the completeness loop below always runs its six entries, so
    # an empty dataset used to raise ZeroDivisionError.  Guard the divisor.
    denom = stats['total'] or 1
    # Print report
    print(f"\n{'='*70}")
    print("UNIFIED DATASET STATISTICS")
    print(f"{'='*70}")
    print(f"\nTotal institutions: {stats['total']}")
    print(f"\nBy institution type:")
    for inst_type, count in sorted(stats['by_type'].items(), key=lambda x: x[1], reverse=True):
        pct = count / denom * 100
        print(f" - {inst_type}: {count} ({pct:.1f}%)")
    print(f"\nBy data source:")
    for source, count in sorted(stats['by_source'].items(), key=lambda x: x[1], reverse=True):
        pct = count / denom * 100
        print(f" - {source}: {count} ({pct:.1f}%)")
    print(f"\nData completeness:")
    for metric, count in completeness.items():
        pct = count / denom * 100
        print(f" - {metric.replace('_', ' ').title()}: {count} ({pct:.1f}%)")
    print(f"\nTop 10 federal states:")
    for state, count in sorted(stats['by_state'].items(), key=lambda x: x[1], reverse=True)[:10]:
        pct = count / denom * 100
        print(f" - {state}: {count} ({pct:.1f}%)")
    print(f"{'='*70}\n")
    return stats
def save_unified_dataset(unified: List[Dict], stats: Dict):
    """
    Write the unified dataset to timestamped JSON and JSONL files.

    Args:
        unified: list of institution dicts in the unified schema.
        stats: statistics dict from generate_statistics(), embedded
            alongside the data for provenance.

    Returns:
        Path to the JSON output file (the JSONL mirror sits next to it).
    """
    # Local import: only this function needs an aware UTC timestamp.
    from datetime import timezone
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = OUTPUT_DIR / f"german_unified_{timestamp}.json"
    output = {
        'metadata': {
            'description': 'Unified German heritage institution dataset',
            'sources': [
                'ISIL Registry (16,979 institutions)',
                'Archivportal-D via DDB API (~10,000-20,000 archives)'
            ],
            # BUGFIX: datetime.utcnow() is deprecated (naive UTC); use an
            # aware timestamp and keep the trailing-'Z' convention.
            'creation_date': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
            'total_institutions': len(unified),
            'coverage': 'Germany - all heritage sectors',
            'data_tiers': 'TIER_1 (ISIL), TIER_2 (Archivportal-D)',
            'license': 'CC0 1.0 Universal (Public Domain)'
        },
        'statistics': stats,
        'institutions': unified
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved unified dataset: {output_file}")
    print(f" File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB\n")
    # Also save as JSONL for easier line-oriented / streaming processing.
    jsonl_file = OUTPUT_DIR / f"german_unified_{timestamp}.jsonl"
    with open(jsonl_file, 'w', encoding='utf-8') as f:
        for record in unified:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
    print(f"✓ Saved JSONL format: {jsonl_file}")
    print(f" File size: {jsonl_file.stat().st_size / 1024 / 1024:.2f} MB\n")
    return output_file
def main():
    """Run the full pipeline: load, merge, analyze, persist, summarize."""
    banner = '#' * 70
    print(f"\n{banner}")
    print(f"# Create Unified German Heritage Dataset")
    print(f"{banner}\n")

    # Load both input sources, then combine them.
    isil_records = load_isil_data()
    merge_results = load_merge_results()
    unified = create_unified_dataset(isil_records, merge_results)

    # Report statistics and write the output files.
    stats = generate_statistics(unified)
    output_file = save_unified_dataset(unified, stats)

    # Final console summary of the composition of the dataset.
    isil_count = sum(1 for r in unified if 'ISIL' in r.get('data_sources', []))
    enriched_count = sum(1 for r in unified if r.get('enriched_from_archivportal'))
    discovery_count = sum(1 for r in unified if r.get('data_sources') == ['Archivportal-D'])
    coded_count = sum(1 for r in unified if r.get('isil_code'))
    pending_count = sum(1 for r in unified if r.get('needs_isil_assignment'))
    print("✓ Unified dataset creation complete!\n")
    print("Summary:")
    print(f" - Total institutions: {len(unified)}")
    print(f" - ISIL records: {isil_count}")
    print(f" - Archivportal enriched: {enriched_count}")
    print(f" - New discoveries: {discovery_count}")
    print(f" - With ISIL codes: {coded_count}")
    print(f" - Needing ISIL: {pending_count}")
    print(f"\nNext steps:")
    print(f" 1. Review unified dataset: {output_file}")
    print(f" 2. Convert to LinkML format")
    print(f" 3. Generate GHCIDs")
    print(f" 4. Export to RDF/CSV/Parquet\n")
if __name__ == "__main__":
main()