#!/usr/bin/env python3
"""
Enrich Existing Thüringen Records with v4.0 Metadata

Updates the 140 existing Thüringen institutions in the German dataset
with rich metadata from the v4.0 harvest (95.6% completeness).

Strategy:
- Match by fuzzy name matching (>90% similarity)
- ENRICH existing records instead of skipping
- Add contact, administrative, collections, description fields
- Preserve existing ISIL codes and identifiers

Input:
- data/isil/germany/german_institutions_unified_v4_20251120_113920.json (20,944)
- data/isil/germany/thueringen_archives_100percent_20251120_095757.json (149)

Output:
- data/isil/germany/german_institutions_unified_v4_enriched_{timestamp}.json
- Enrichment statistics report

Expected Enrichment:
- 140 existing Thüringen records updated with v4.0 metadata
- Fields added: contact, administrative, collections, description
- 9 new records already added in previous merge

Author: OpenCode AI Agent
Date: 2025-11-20
"""

import copy
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

try:
    from rapidfuzz import fuzz
except ImportError:
    # Stdlib fallback so the module stays importable without rapidfuzz.
    # difflib's ratio() is 0..1; rapidfuzz's fuzz.ratio() is 0..100, so we
    # scale to keep FUZZY_MATCH_THRESHOLD semantics identical.
    from difflib import SequenceMatcher

    class fuzz:  # type: ignore[no-redef]
        """Minimal stand-in exposing rapidfuzz's ``fuzz.ratio`` (0-100 scale)."""

        @staticmethod
        def ratio(a: str, b: str) -> float:
            return SequenceMatcher(None, a, b).ratio() * 100.0


# Constants
FUZZY_MATCH_THRESHOLD = 90.0  # 90% similarity for matching


def load_json(filepath: Path) -> dict:
    """Load JSON file."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_json(data: dict, filepath: Path):
    """Save JSON file (UTF-8, human-readable indentation, umlauts kept literal)."""
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def find_matching_harvest_record(
    institution: dict,
    harvest_archives: List[dict],
    threshold: float = FUZZY_MATCH_THRESHOLD
) -> Optional[dict]:
    """
    Find matching archive in Thüringen harvest by fuzzy name matching.

    A candidate must clear ``threshold`` on name similarity AND either share
    the institution's city or score >= 95 (high-confidence name-only match).
    Among qualifying candidates the highest-scoring one wins.

    Args:
        institution: Institution from German dataset
        harvest_archives: List of Thüringen harvest archives
        threshold: Minimum similarity score (0-100)

    Returns:
        Matching harvest record or None
    """
    name = institution['name']
    city = None
    if institution.get('locations'):
        city = institution['locations'][0].get('city')

    best_match = None
    best_score = 0

    for harvest_arch in harvest_archives:
        # Fuzzy match on name (case-insensitive)
        name_similarity = fuzz.ratio(name.lower(), harvest_arch['name'].lower())

        # Bonus if city matches
        city_match = False
        if city and harvest_arch.get('city'):
            if city.lower() == harvest_arch['city'].lower():
                city_match = True

        # Match threshold
        if name_similarity >= threshold:
            if city_match or name_similarity >= 95:  # High confidence
                if name_similarity > best_score:
                    best_match = harvest_arch
                    best_score = name_similarity

    return best_match if best_score >= threshold else None


def enrich_institution_with_harvest_data(institution: dict, harvest_record: dict) -> dict:
    """
    Enrich existing institution record with v4.0 harvest metadata.

    Adds/updates:
    - Physical address (street, postal code)
    - Contact (email, phone, fax, website)
    - Administrative (director, opening hours)
    - Collections (size, temporal coverage)
    - Description (archive history)

    The input ``institution`` is NOT mutated: a deep copy is enriched and
    returned (a shallow copy would leak nested-dict mutations — e.g. the
    street address written into ``locations[0]`` — back into the caller's
    record).
    """
    enriched = copy.deepcopy(institution)

    # Update location with physical address; existing values are never
    # overwritten, only filled in when missing.
    physical_addr = harvest_record.get('physical_address') or harvest_record.get('postal_address')
    if physical_addr and enriched.get('locations'):
        location = enriched['locations'][0]
        if physical_addr.get('street') and not location.get('street_address'):
            location['street_address'] = physical_addr['street']
        if physical_addr.get('postal_code') and not location.get('postal_code'):
            location['postal_code'] = physical_addr['postal_code']

    # Add contact info (only keys present in the harvest record)
    contact = {}
    if harvest_record.get('email'):
        contact['email'] = harvest_record['email']
    if harvest_record.get('phone'):
        contact['phone'] = harvest_record['phone']
    if harvest_record.get('fax'):
        contact['fax'] = harvest_record['fax']
    if harvest_record.get('website'):
        contact['website'] = harvest_record['website']
    if contact:
        enriched['contact'] = contact

    # Add administrative info
    administrative = {}
    if harvest_record.get('director'):
        administrative['director'] = harvest_record['director']
    if harvest_record.get('opening_hours'):
        administrative['opening_hours'] = harvest_record['opening_hours']
    if administrative:
        enriched['administrative'] = administrative

    # Add collections metadata
    collections = []
    if harvest_record.get('collection_size') or harvest_record.get('temporal_coverage'):
        collection = {}
        if harvest_record.get('collection_size'):
            collection['collection_size'] = harvest_record['collection_size']
        if harvest_record.get('temporal_coverage'):
            collection['temporal_coverage'] = harvest_record['temporal_coverage']
        collections.append(collection)
    if collections:
        enriched['collections'] = collections

    # Add archive history as description (never overwrite an existing one);
    # cap at 2000 chars so one verbose history cannot bloat the dataset.
    if harvest_record.get('archive_history') and not enriched.get('description'):
        history = harvest_record['archive_history']
        enriched['description'] = (history[:2000] + '...') if len(history) > 2000 else history

    # Update provenance to reflect enrichment
    if enriched.get('provenance'):
        enriched['provenance']['enrichment_date'] = datetime.now(timezone.utc).isoformat()
        enriched['provenance']['enrichment_source'] = 'Thüringen archives v4.0 (95.6% completeness)'

    return enriched


def enrich_existing_thueringen_records():
    """Main enrichment logic: load both datasets, enrich Thüringen matches, save."""
    print("🔬 Enriching Existing Thüringen Records with v4.0 Metadata")
    print("=" * 70)
    print()

    # Load datasets
    print("📁 Loading datasets...")
    german_unified_path = Path("data/isil/germany/german_institutions_unified_v4_20251120_113920.json")
    thueringen_harvest_path = Path("data/isil/germany/thueringen_archives_100percent_20251120_095757.json")

    german_data = load_json(german_unified_path)
    thueringen_data = load_json(thueringen_harvest_path)

    print(f" German unified v4: {len(german_data['institutions'])} institutions")
    print(f" Thüringen harvest v4.0: {len(thueringen_data['archives'])} archives")
    print()

    # Statistics accumulated over the full pass
    stats = {
        "total_checked": 0,
        "matches_found": 0,
        "enriched_count": 0,
        "fields_added": {
            "contact": 0,
            "administrative": 0,
            "collections": 0,
            "description": 0
        }
    }

    # Process institutions
    print("🔍 Matching and enriching records...")
    print()

    enriched_institutions = []
    for institution in german_data['institutions']:
        stats["total_checked"] += 1

        # Check if institution might be from Thüringen (by region name,
        # accepting both the umlaut and ASCII spellings)
        is_thueringen = False
        if institution.get('locations'):
            region = institution['locations'][0].get('region', '')
            if 'thüringen' in region.lower() or 'thuringen' in region.lower():
                is_thueringen = True

        # Also check source portals
        if institution.get('source_portals'):
            if 'archive-in-thueringen.de' in institution['source_portals']:
                is_thueringen = True

        # If not Thüringen, keep as-is
        if not is_thueringen:
            enriched_institutions.append(institution)
            continue

        # Find matching harvest record
        harvest_match = find_matching_harvest_record(institution, thueringen_data['archives'])

        if harvest_match:
            stats["matches_found"] += 1

            # Snapshot which sections exist before enrichment so we only
            # count fields that were genuinely added
            has_contact_before = bool(institution.get('contact'))
            has_admin_before = bool(institution.get('administrative'))
            has_colls_before = bool(institution.get('collections'))
            has_desc_before = bool(institution.get('description'))

            # Enrich record
            enriched = enrich_institution_with_harvest_data(institution, harvest_match)

            # Track what was added
            if not has_contact_before and enriched.get('contact'):
                stats["fields_added"]["contact"] += 1
            if not has_admin_before and enriched.get('administrative'):
                stats["fields_added"]["administrative"] += 1
            if not has_colls_before and enriched.get('collections'):
                stats["fields_added"]["collections"] += 1
            if not has_desc_before and enriched.get('description'):
                stats["fields_added"]["description"] += 1

            enriched_institutions.append(enriched)
            stats["enriched_count"] += 1
            print(f" ✅ ENRICHED: {institution['name']}")
        else:
            enriched_institutions.append(institution)

    print()
    print("=" * 70)
    print("📊 Enrichment Statistics")
    print("=" * 70)
    print(f" Total institutions checked: {stats['total_checked']}")
    print(f" Thüringen matches found: {stats['matches_found']}")
    print(f" Records enriched: {stats['enriched_count']}")
    print()
    print(" Fields added:")
    print(f" Contact metadata: {stats['fields_added']['contact']} records")
    print(f" Administrative metadata: {stats['fields_added']['administrative']} records")
    print(f" Collections metadata: {stats['fields_added']['collections']} records")
    print(f" Descriptions: {stats['fields_added']['description']} records")
    print()

    # Update metadata
    german_data['institutions'] = enriched_institutions
    german_data['metadata'] = {
        "version": "v4.0-enriched",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "total_institutions": len(enriched_institutions),
        "sources": [
            "ISIL Registry",
            "DDB SPARQL",
            "NRW Archives Portal",
            "Thüringen Archives Portal v4.0 (95.6% metadata completeness)"
        ],
        "enrichment_statistics": stats,
        "thueringen_v4_features": {
            "physical_addresses": "100%",
            "directors": "96%",
            "opening_hours": "99.3%",
            "archive_histories": "84.6%",
            "overall_completeness": "95.6%"
        }
    }

    # Save enriched dataset (timestamped so reruns never clobber output)
    output_path = Path(f"data/isil/germany/german_institutions_unified_v4_enriched_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json")
    save_json(german_data, output_path)

    print(f"💾 Saved: {output_path}")
    print(f" File size: {output_path.stat().st_size / 1024 / 1024:.1f} MB")
    print()
    print("✅ Enrichment complete!")


if __name__ == '__main__':
    enrich_existing_thueringen_records()