#!/usr/bin/env python3
"""
German Heritage Institution Data Cross-Reference
Merges DDB institutions with ISIL registry data

This script cross-references two German datasets:
1. ISIL Registry (16,979 institutions) - DNB/Staatsbibliothek zu Berlin
2. DDB Institutions (4,937 institutions) - Deutsche Digitale Bibliothek

Matching strategy:
- Primary: ISIL code (if available in DDB data)
- Secondary: Fuzzy name matching
- Tertiary: Location matching (city + postal code), used to validate fuzzy name matches

Outputs:
- Unified German dataset
- Statistics report
- Match quality analysis

Author: OpenCode + MCP Tools
Date: 2025-11-19
"""

import json
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Tuple
from collections import defaultdict

from rapidfuzz import fuzz

# Configuration
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
ISIL_FILE = DATA_DIR / "german_isil_complete_20251119_134939.json"
DDB_FILE = DATA_DIR / "ddb_institutions_all_sectors_20251119_191121.json"
OUTPUT_FILE = DATA_DIR / f"german_institutions_unified_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.json"
STATS_FILE = DATA_DIR / f"german_unification_stats_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.json"

# Fuzzy matching thresholds
NAME_MATCH_THRESHOLD = 85
LOCATION_MATCH_THRESHOLD = 80


def load_isil_data() -> List[Dict]:
    """Load ISIL registry data."""
    print(f"📄 Loading ISIL registry data from {ISIL_FILE.name}...")

    with open(ISIL_FILE, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions = data['records']
    print(f"✅ Loaded {len(institutions)} institutions from ISIL registry")
    print(f"   Metadata: {data['metadata']['source']}")

    return institutions


def load_ddb_data() -> List[Dict]:
    """Load DDB institutions data."""
    print(f"📄 Loading DDB institutions from {DDB_FILE.name}...")

    with open(DDB_FILE, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions = data['institutions']
    print(f"✅ Loaded {len(institutions)} institutions from DDB")
    print(f"   Metadata: {data['metadata']['source']}")

    return institutions


def normalize_name(name: str) -> str:
    """Normalize institution name for matching."""
    if not name:
        return ""

    # Convert to lowercase
    name = name.lower()

    # Remove common prefixes/suffixes
    replacements = [
        ('stadtarchiv ', ''),
        ('landesarchiv ', ''),
        ('staatsarchiv ', ''),
        ('universitätsbibliothek ', ''),
        ('museum ', ''),
        (' e.v.', ''),
        (' e. v.', ''),
    ]
    for old, new in replacements:
        name = name.replace(old, new)

    return name.strip()


def match_by_location(ddb_inst: Dict, isil_inst: Dict) -> int:
    """Calculate location match score."""
    ddb_loc = ddb_inst.get('locationDisplayName', '').lower()
    isil_addr = isil_inst.get('address', {})

    if not ddb_loc:
        return 0

    # Extract city and postal code from ISIL
    city = isil_addr.get('city', '').lower()
    postal = isil_addr.get('postal_code', '').lower()

    score = 0
    if city and city in ddb_loc:
        score += 50
    if postal and postal in ddb_loc:
        score += 50

    return score
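
# Worked example of the name-matching primitives above (illustrative values;
# the institution names are hypothetical, not taken from the datasets):
#
#   normalize_name("Stadtarchiv München e.V.")           -> "münchen"
#   normalize_name("Universitätsbibliothek Heidelberg")   -> "heidelberg"
#   fuzz.ratio("münchen", "münchen")                      -> 100.0
#
# Stripping common prefixes/suffixes before fuzz.ratio can keep spelling
# variants of the same institution above NAME_MATCH_THRESHOLD.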


def cross_reference_institutions(isil_data: List[Dict], ddb_data: List[Dict]) -> Tuple[List[Dict], Dict]:
    """
    Cross-reference DDB and ISIL institutions.

    Returns:
    - Unified institution list
    - Match statistics
    """
    print(f"\n🔍 Cross-referencing {len(ddb_data)} DDB institutions with {len(isil_data)} ISIL records...")

    # Index ISIL data by ISIL code for fast lookup
    isil_by_code: Dict[str, Dict] = {}
    for inst in isil_data:
        isil_code = inst.get('isil')
        if isil_code:
            isil_by_code[isil_code] = inst
    print(f" - Indexed {len(isil_by_code)} ISIL codes for matching")

    # Track matches
    matched_ddb = []
    unmatched_ddb = []
    match_stats = {
        'isil_code_matches': 0,
        'name_matches': 0,
        'location_matches': 0,
        'no_match': 0
    }

    for ddb_inst in ddb_data:
        ddb_name = ddb_inst.get('name', '')

        # Try ISIL code match first (if DDB has ISIL field)
        ddb_isil = ddb_inst.get('isil')  # Some DDB records may have ISIL
        if ddb_isil and ddb_isil in isil_by_code:
            # Perfect match via ISIL code
            isil_inst = isil_by_code[ddb_isil]
            merged = merge_institutions(ddb_inst, isil_inst, 'ISIL_CODE')
            matched_ddb.append(merged)
            match_stats['isil_code_matches'] += 1
            continue

        # Try fuzzy name matching
        best_match = None
        best_score = 0
        match_type = None

        for isil_inst in isil_data:
            isil_name = isil_inst.get('name', '')

            # Fuzzy name matching
            name_score = fuzz.ratio(
                normalize_name(ddb_name),
                normalize_name(isil_name)
            )

            if name_score >= NAME_MATCH_THRESHOLD and name_score > best_score:
                # Check location as secondary validation
                loc_score = match_by_location(ddb_inst, isil_inst)
                combined_score = (name_score * 0.7) + (loc_score * 0.3)

                if combined_score > best_score:
                    best_score = combined_score
                    best_match = isil_inst
                    match_type = 'NAME_FUZZY' if loc_score > 0 else 'NAME_ONLY'

        if best_match and best_score >= NAME_MATCH_THRESHOLD and match_type:
            merged = merge_institutions(ddb_inst, best_match, match_type)
            merged['_match_score'] = round(best_score, 2)
            matched_ddb.append(merged)
            match_stats['name_matches'] += 1
        else:
            # No match found - keep DDB-only record
            ddb_inst['_data_sources'] = ['DDB']
            ddb_inst['_match_type'] = 'DDB_ONLY'
            unmatched_ddb.append(ddb_inst)
            match_stats['no_match'] += 1

    # Add ISIL-only records (not matched with DDB)
    matched_isil_codes = set()
    for inst in matched_ddb:
        isil = inst.get('isil')
        if isil:
            matched_isil_codes.add(isil)

    isil_only = []
    for inst in isil_data:
        isil = inst.get('isil')
        if isil not in matched_isil_codes:
            inst['_data_sources'] = ['ISIL']
            inst['_match_type'] = 'ISIL_ONLY'
            isil_only.append(inst)

    # Combine all institutions
    unified = matched_ddb + unmatched_ddb + isil_only

    match_stats['total_unified'] = len(unified)
    match_stats['matched_records'] = len(matched_ddb)
    match_stats['ddb_only'] = len(unmatched_ddb)
    match_stats['isil_only'] = len(isil_only)

    print(f"\n✅ Cross-reference complete!")
    print(f" - ISIL code matches: {match_stats['isil_code_matches']}")
    print(f" - Name fuzzy matches: {match_stats['name_matches']}")
    print(f" - Total matched: {match_stats['matched_records']}")
    print(f" - DDB only: {match_stats['ddb_only']}")
    print(f" - ISIL only: {match_stats['isil_only']}")
    print(f" - Total unified records: {match_stats['total_unified']}")

    return unified, match_stats
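
# Worked example of the combined score used above (numbers are illustrative):
#
#   name_score = 90, loc_score = 0   -> combined = 90*0.7 + 0*0.3   = 63.0  (< 85, rejected)
#   name_score = 90, loc_score = 100 -> combined = 90*0.7 + 100*0.3 = 93.0  (>= 85, accepted)
#
# Because the final check compares the combined score against
# NAME_MATCH_THRESHOLD (85), a candidate with loc_score = 0 can reach at most
# 70.0 (100 * 0.7), so, as written, candidates tagged 'NAME_ONLY' never pass
# the final check; fuzzy matches effectively also require city and/or
# postal-code agreement from match_by_location().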


def merge_institutions(ddb_inst: Dict, isil_inst: Dict, match_type: str) -> Dict:
    """
    Merge DDB and ISIL institution records.

    Priority:
    - ISIL for authoritative metadata (ISIL code, address, contact)
    - DDB for sector classification, geocoding, item counts
    """
    merged = {
        '_data_sources': ['DDB', 'ISIL'],
        '_match_type': match_type
    }

    # ISIL fields (authoritative)
    merged['isil'] = isil_inst.get('isil')
    merged['name'] = isil_inst.get('name')  # ISIL name is authoritative
    # Copy the list so appending the DDB name does not mutate the ISIL record
    merged['alternative_names'] = list(isil_inst.get('alternative_names', []))

    # Add DDB name as alternative if different
    ddb_name = ddb_inst.get('name')
    if ddb_name and ddb_name != merged['name']:
        if ddb_name not in merged['alternative_names']:
            merged['alternative_names'].append(ddb_name)

    # Address (ISIL preferred, DDB as fallback for geocoding);
    # shallow copy so the geocoding fallback below does not mutate the ISIL record
    merged['address'] = dict(isil_inst.get('address', {}))

    # If ISIL lacks geocoding but DDB has it, add DDB coordinates
    if not merged['address'].get('latitude') and ddb_inst.get('latitude'):
        merged['address']['latitude'] = ddb_inst.get('latitude')
        merged['address']['longitude'] = ddb_inst.get('longitude')
        merged['address']['_geocoding_source'] = 'DDB'

    # Contact (ISIL only)
    merged['contact'] = isil_inst.get('contact', {})

    # URLs (ISIL)
    merged['urls'] = isil_inst.get('urls', [])

    # DDB-specific fields
    merged['ddb_id'] = ddb_inst.get('id')
    merged['sector'] = ddb_inst.get('sector_name')
    merged['sector_code'] = ddb_inst.get('sector_code')
    merged['has_digital_items'] = ddb_inst.get('hasItems', False)
    merged['digital_item_count'] = ddb_inst.get('numberOfItems', 0)
    merged['ddb_location_display'] = ddb_inst.get('locationDisplayName')

    # ISIL-specific fields
    merged['institution_type'] = isil_inst.get('institution_type')
    merged['parent_org'] = isil_inst.get('parent_org')
    merged['interloan_region'] = isil_inst.get('interloan_region')
    merged['notes'] = isil_inst.get('notes')

    return merged


def generate_statistics(unified: List[Dict], match_stats: Dict) -> Dict:
    """Generate comprehensive unification statistics."""
    stats = {
        'generation_date': datetime.now(timezone.utc).isoformat(),
        'source_files': {
            'isil': str(ISIL_FILE.name),
            'ddb': str(DDB_FILE.name)
        },
        'match_statistics': match_stats,
        'coverage': {
            'total_institutions': len(unified),
            'with_isil_code': 0,
            'with_ddb_id': 0,
            'with_geocoding': 0,
            'with_contact_info': 0,
            'with_website': 0,
            'with_digital_items': 0,
            'matched_both_sources': 0,
            'isil_only': 0,
            'ddb_only': 0
        },
        'by_sector': defaultdict(int),
        'by_region': defaultdict(int),
        'top_cities': defaultdict(int)
    }

    for inst in unified:
        sources = inst.get('_data_sources', [])

        # Count by source
        if len(sources) == 2:
            stats['coverage']['matched_both_sources'] += 1
        elif 'ISIL' in sources:
            stats['coverage']['isil_only'] += 1
        elif 'DDB' in sources:
            stats['coverage']['ddb_only'] += 1

        # Count features
        if inst.get('isil'):
            stats['coverage']['with_isil_code'] += 1
        if inst.get('ddb_id'):
            stats['coverage']['with_ddb_id'] += 1

        address = inst.get('address', {})
        if address.get('latitude'):
            stats['coverage']['with_geocoding'] += 1

        contact = inst.get('contact', {})
        if contact.get('email') or contact.get('phone'):
            stats['coverage']['with_contact_info'] += 1

        if inst.get('urls'):
            stats['coverage']['with_website'] += 1
        if inst.get('has_digital_items'):
            stats['coverage']['with_digital_items'] += 1

        # Count by sector
        sector = inst.get('sector', 'unknown')
        stats['by_sector'][sector] += 1

        # Count by region
        region = address.get('region', 'unknown')
        stats['by_region'][region] += 1

        # Count by city
        city = address.get('city', 'unknown')
        stats['top_cities'][city] += 1

    # Convert defaultdicts and sort
    stats['by_sector'] = dict(sorted(stats['by_sector'].items(), key=lambda x: x[1], reverse=True))
    stats['by_region'] = dict(sorted(stats['by_region'].items(), key=lambda x: x[1], reverse=True))
    stats['top_cities'] = dict(sorted(stats['top_cities'].items(), key=lambda x: x[1], reverse=True)[:20])

    return stats
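
# Rough shape of the stats dict returned above (keys come from the code;
# values shown are placeholders):
#
#   {
#     "generation_date": "...",
#     "source_files": {"isil": "...", "ddb": "..."},
#     "match_statistics": {"isil_code_matches": 0, "name_matches": 0, ...},
#     "coverage": {"total_institutions": 0, "with_isil_code": 0, ...},
#     "by_sector": {...},    # sorted by count, descending
#     "by_region": {...},    # sorted by count, descending
#     "top_cities": {...}    # 20 most frequent cities
#   }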


def main():
    """Main unification workflow."""
    print("🇩🇪 German Heritage Institution Data Unification")
    print("=" * 60)

    # Load data
    isil_data = load_isil_data()
    ddb_data = load_ddb_data()

    # Cross-reference
    unified, match_stats = cross_reference_institutions(isil_data, ddb_data)

    # Generate statistics
    print(f"\n📈 Generating statistics...")
    stats = generate_statistics(unified, match_stats)

    # Export unified data
    print(f"\n💾 Exporting unified dataset...")
    output_data = {
        'metadata': {
            'source': 'Unified German Heritage Institutions (ISIL + DDB)',
            'generation_date': stats['generation_date'],
            'source_files': stats['source_files'],
            'total_institutions': len(unified),
            'license': 'CC0 1.0 Universal (Public Domain)'
        },
        'institutions': unified
    }

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"✅ Saved to: {OUTPUT_FILE}")
    print(f"   Size: {OUTPUT_FILE.stat().st_size / 1024 / 1024:.1f} MB")

    # Export statistics
    with open(STATS_FILE, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    print(f"✅ Statistics saved to: {STATS_FILE}")

    # Print summary
    total = stats['coverage']['total_institutions']
    print("\n" + "=" * 60)
    print("📊 UNIFICATION SUMMARY")
    print("=" * 60)
    print(f"Total unified institutions: {total}")
    print()
    print("Match quality:")
    print(f" - Both ISIL + DDB: {stats['coverage']['matched_both_sources']} ({stats['coverage']['matched_both_sources']/total*100:.1f}%)")
    print(f" - ISIL only: {stats['coverage']['isil_only']} ({stats['coverage']['isil_only']/total*100:.1f}%)")
    print(f" - DDB only: {stats['coverage']['ddb_only']} ({stats['coverage']['ddb_only']/total*100:.1f}%)")
    print()
    print("Coverage:")
    print(f" - With ISIL codes: {stats['coverage']['with_isil_code']} ({stats['coverage']['with_isil_code']/total*100:.1f}%)")
    print(f" - With geocoding: {stats['coverage']['with_geocoding']} ({stats['coverage']['with_geocoding']/total*100:.1f}%)")
    print(f" - With contact info: {stats['coverage']['with_contact_info']} ({stats['coverage']['with_contact_info']/total*100:.1f}%)")
    print(f" - With digital items: {stats['coverage']['with_digital_items']} ({stats['coverage']['with_digital_items']/total*100:.1f}%)")
    print()
    print("Top 5 sectors:")
    for i, (sector, count) in enumerate(list(stats['by_sector'].items())[:5], 1):
        print(f" {i}. {sector}: {count}")
    print()
    print("Top 5 regions:")
    for i, (region, count) in enumerate(list(stats['by_region'].items())[:5], 1):
        print(f" {i}. {region}: {count}")
    print()
    print("Top 5 cities:")
    for i, (city, count) in enumerate(list(stats['top_cities'].items())[:5], 1):
        print(f" {i}. {city}: {count}")
    print("=" * 60)


if __name__ == "__main__":
    main()
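
# Usage note: run the script directly with no CLI arguments; the input and
# output paths are the constants configured at the top of this file, e.g.
#
#   python3 <path-to-this-script>
#
# The only third-party dependency is rapidfuzz (pip install rapidfuzz).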