glam/scripts/scrapers/enrich_existing_thueringen_records.py
2025-11-21 22:12:33 +01:00

295 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Enrich Existing Thüringen Records with v4.0 Metadata
Updates the 140 existing Thüringen institutions in the German dataset with rich
metadata from the v4.0 harvest (95.6% completeness).
Strategy:
- Match by fuzzy name matching (>90% similarity)
- ENRICH existing records instead of skipping
- Add contact, administrative, collections, description fields
- Preserve existing ISIL codes and identifiers
Input:
- data/isil/germany/german_institutions_unified_v4_20251120_113920.json (20,944)
- data/isil/germany/thueringen_archives_100percent_20251120_095757.json (149)
Output:
- data/isil/germany/german_institutions_unified_v4_enriched_{timestamp}.json
- Enrichment statistics report
Expected Enrichment:
- 140 existing Thüringen records updated with v4.0 metadata
- Fields added: contact, administrative, collections, description
- 9 new records already added in previous merge
Author: OpenCode AI Agent
Date: 2025-11-20
"""
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from rapidfuzz import fuzz
# Constants
FUZZY_MATCH_THRESHOLD = 90.0 # 90% similarity for matching
def load_json(filepath: Path) -> dict:
    """Read *filepath* as UTF-8 text and return the parsed JSON document."""
    return json.loads(filepath.read_text(encoding='utf-8'))
def save_json(data: dict, filepath: Path):
    """Write *data* to *filepath* as pretty-printed (indent=2) UTF-8 JSON.

    Non-ASCII characters (e.g. umlauts) are written literally rather than
    escaped.
    """
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    with open(filepath, 'w', encoding='utf-8') as fh:
        fh.write(serialized)
def find_matching_harvest_record(
    institution: dict,
    harvest_archives: List[dict],
    threshold: float = FUZZY_MATCH_THRESHOLD
) -> Optional[dict]:
    """
    Locate the Thüringen harvest archive that best matches an institution.

    Candidates are scored by fuzzy name similarity; a candidate is only
    eligible when its score clears *threshold* AND is corroborated either
    by a matching city or by a very high name similarity (>= 95). This
    guards against false positives between similarly named archives in
    different towns.

    Args:
        institution: Institution record from the German dataset.
        harvest_archives: List of Thüringen harvest archive records.
        threshold: Minimum similarity score (0-100) to accept a match.

    Returns:
        The best-scoring eligible harvest record, or None.
    """
    target_name = institution['name'].lower()
    locations = institution.get('locations')
    target_city = locations[0].get('city') if locations else None

    winner = None
    winner_score = 0
    for candidate in harvest_archives:
        score = fuzz.ratio(target_name, candidate['name'].lower())
        if score < threshold:
            continue
        same_city = bool(
            target_city
            and candidate.get('city')
            and target_city.lower() == candidate['city'].lower()
        )
        # Accept only with corroboration: matching city, or near-identical name.
        if (same_city or score >= 95) and score > winner_score:
            winner = candidate
            winner_score = score
    return winner if winner_score >= threshold else None
def enrich_institution_with_harvest_data(institution: dict, harvest_record: dict) -> dict:
    """
    Enrich an existing institution record with v4.0 harvest metadata.

    The input *institution* is left unmodified: the original code used a
    shallow ``dict.copy()`` and then mutated the nested location and
    provenance dicts in place, which also mutated the caller's record
    through aliasing. Nested structures are now copied before mutation.

    Adds/updates:
    - Physical address (street, postal code) — only fills missing fields
    - Contact (email, phone, fax, website)
    - Administrative (director, opening hours)
    - Collections (size, temporal coverage)
    - Description (archive history, truncated to 2000 chars) — only if absent

    Args:
        institution: Existing record from the German unified dataset.
        harvest_record: Matching Thüringen v4.0 harvest record.

    Returns:
        A new enriched institution dict; the input is not mutated.
    """
    enriched = institution.copy()

    # Update location with physical address. Copy the list and the first
    # location dict so the caller's record is not mutated through aliasing.
    physical_addr = harvest_record.get('physical_address') or harvest_record.get('postal_address')
    if physical_addr and enriched.get('locations'):
        locations = list(enriched['locations'])
        location = dict(locations[0])
        if physical_addr.get('street') and not location.get('street_address'):
            location['street_address'] = physical_addr['street']
        if physical_addr.get('postal_code') and not location.get('postal_code'):
            location['postal_code'] = physical_addr['postal_code']
        locations[0] = location
        enriched['locations'] = locations

    # Add contact info (only keys present and truthy in the harvest record).
    contact = {}
    for field in ('email', 'phone', 'fax', 'website'):
        if harvest_record.get(field):
            contact[field] = harvest_record[field]
    if contact:
        enriched['contact'] = contact

    # Add administrative info.
    administrative = {}
    for field in ('director', 'opening_hours'):
        if harvest_record.get(field):
            administrative[field] = harvest_record[field]
    if administrative:
        enriched['administrative'] = administrative

    # Add collections metadata as a single-entry list.
    if harvest_record.get('collection_size') or harvest_record.get('temporal_coverage'):
        collection = {}
        if harvest_record.get('collection_size'):
            collection['collection_size'] = harvest_record['collection_size']
        if harvest_record.get('temporal_coverage'):
            collection['temporal_coverage'] = harvest_record['temporal_coverage']
        enriched['collections'] = [collection]

    # Add archive history as description, but never overwrite an existing one.
    if harvest_record.get('archive_history') and not enriched.get('description'):
        history = harvest_record['archive_history']
        # Truncate very long histories to keep record size bounded.
        enriched['description'] = history[:2000] + '...' if len(history) > 2000 else history

    # Update provenance to reflect enrichment — copy before mutating so the
    # caller's provenance dict stays untouched.
    if enriched.get('provenance'):
        provenance = dict(enriched['provenance'])
        provenance['enrichment_date'] = datetime.now(timezone.utc).isoformat()
        provenance['enrichment_source'] = 'Thüringen archives v4.0 (95.6% completeness)'
        enriched['provenance'] = provenance
    return enriched
def enrich_existing_thueringen_records():
    """Main enrichment logic.

    Loads the unified German dataset and the Thüringen v4.0 harvest,
    enriches every matching Thüringen institution with harvest metadata,
    prints a statistics report, and writes a new timestamped dataset file.
    Non-Thüringen records pass through unchanged.
    """
    print("🔬 Enriching Existing Thüringen Records with v4.0 Metadata")
    print("=" * 70)
    print()
    # Load datasets (paths are hard-coded to specific harvest snapshots).
    print("📁 Loading datasets...")
    german_unified_path = Path("data/isil/germany/german_institutions_unified_v4_20251120_113920.json")
    thueringen_harvest_path = Path("data/isil/germany/thueringen_archives_100percent_20251120_095757.json")
    german_data = load_json(german_unified_path)
    thueringen_data = load_json(thueringen_harvest_path)
    print(f" German unified v4: {len(german_data['institutions'])} institutions")
    print(f" Thüringen harvest v4.0: {len(thueringen_data['archives'])} archives")
    print()
    # Counters for the final report.
    stats = {
        "total_checked": 0,
        "matches_found": 0,
        "enriched_count": 0,
        "fields_added": {
            "contact": 0,
            "administrative": 0,
            "collections": 0,
            "description": 0
        }
    }
    # Process institutions
    print("🔍 Matching and enriching records...")
    print()
    enriched_institutions = []
    for institution in german_data['institutions']:
        stats["total_checked"] += 1
        # Cheap pre-filter: only run fuzzy matching on records that look
        # like Thüringen institutions (region name or source portal), which
        # keeps the per-institution matching loop small.
        is_thueringen = False
        if institution.get('locations'):
            region = institution['locations'][0].get('region', '')
            if 'thüringen' in region.lower() or 'thuringen' in region.lower():
                is_thueringen = True
        # Also check source portals
        if institution.get('source_portals'):
            if 'archive-in-thueringen.de' in institution['source_portals']:
                is_thueringen = True
        # If not Thüringen, keep as-is
        if not is_thueringen:
            enriched_institutions.append(institution)
            continue
        # Find matching harvest record (fuzzy name match, city-corroborated).
        harvest_match = find_matching_harvest_record(institution, thueringen_data['archives'])
        if harvest_match:
            stats["matches_found"] += 1
            # Snapshot which sections existed before enrichment so only
            # newly added sections are counted below.
            has_contact_before = bool(institution.get('contact'))
            has_admin_before = bool(institution.get('administrative'))
            has_colls_before = bool(institution.get('collections'))
            has_desc_before = bool(institution.get('description'))
            # Enrich record
            enriched = enrich_institution_with_harvest_data(institution, harvest_match)
            # Track what was added
            if not has_contact_before and enriched.get('contact'):
                stats["fields_added"]["contact"] += 1
            if not has_admin_before and enriched.get('administrative'):
                stats["fields_added"]["administrative"] += 1
            if not has_colls_before and enriched.get('collections'):
                stats["fields_added"]["collections"] += 1
            if not has_desc_before and enriched.get('description'):
                stats["fields_added"]["description"] += 1
            enriched_institutions.append(enriched)
            stats["enriched_count"] += 1
            print(f" ✅ ENRICHED: {institution['name']}")
        else:
            # Thüringen record with no harvest match: keep unchanged.
            enriched_institutions.append(institution)
    print()
    print("=" * 70)
    print("📊 Enrichment Statistics")
    print("=" * 70)
    print(f" Total institutions checked: {stats['total_checked']}")
    print(f" Thüringen matches found: {stats['matches_found']}")
    print(f" Records enriched: {stats['enriched_count']}")
    print()
    print(" Fields added:")
    print(f" Contact metadata: {stats['fields_added']['contact']} records")
    print(f" Administrative metadata: {stats['fields_added']['administrative']} records")
    print(f" Collections metadata: {stats['fields_added']['collections']} records")
    print(f" Descriptions: {stats['fields_added']['description']} records")
    print()
    # Replace institutions and rewrite dataset-level metadata.
    german_data['institutions'] = enriched_institutions
    german_data['metadata'] = {
        "version": "v4.0-enriched",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "total_institutions": len(enriched_institutions),
        "sources": ["ISIL Registry", "DDB SPARQL", "NRW Archives Portal", "Thüringen Archives Portal v4.0 (95.6% metadata completeness)"],
        "enrichment_statistics": stats,
        # NOTE(review): completeness figures appear to mirror the harvest
        # report for this snapshot — confirm against the harvest metadata.
        "thueringen_v4_features": {
            "physical_addresses": "100%",
            "directors": "96%",
            "opening_hours": "99.3%",
            "archive_histories": "84.6%",
            "overall_completeness": "95.6%"
        }
    }
    # Save enriched dataset (timestamped filename, so prior runs are kept).
    output_path = Path(f"data/isil/germany/german_institutions_unified_v4_enriched_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json")
    save_json(german_data, output_path)
    print(f"💾 Saved: {output_path}")
    print(f" File size: {output_path.stat().st_size / 1024 / 1024:.1f} MB")
    print()
    print("✅ Enrichment complete!")
# Script entry point: run the full enrichment pass.
if __name__ == '__main__':
    enrich_existing_thueringen_records()