#!/usr/bin/env python3
"""
Enrich Existing Thüringen Records with v4.0 Metadata

Updates the 140 existing Thüringen institutions in the German dataset with rich
metadata from the v4.0 harvest (95.6% completeness).

Strategy:
- Match by fuzzy name matching (>90% similarity)
- ENRICH existing records instead of skipping
- Add contact, administrative, collections, description fields
- Preserve existing ISIL codes and identifiers

Input:
- data/isil/germany/german_institutions_unified_v4_20251120_113920.json (20,944)
- data/isil/germany/thueringen_archives_100percent_20251120_095757.json (149)

Output:
- data/isil/germany/german_institutions_unified_v4_enriched_{timestamp}.json
- Enrichment statistics report

Expected Enrichment:
- 140 existing Thüringen records updated with v4.0 metadata
- Fields added: contact, administrative, collections, description
- 9 new records already added in previous merge

Author: OpenCode AI Agent
Date: 2025-11-20
"""
|
|
|
|
import copy
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

from rapidfuzz import fuzz
|
|
|
|
# Constants
FUZZY_MATCH_THRESHOLD = 90.0  # Minimum name-similarity score (0-100) required to accept a match
|
|
|
|
def load_json(filepath: Path) -> dict:
    """Read *filepath* as UTF-8 text and return the parsed JSON document."""
    with filepath.open('r', encoding='utf-8') as handle:
        return json.load(handle)
|
|
|
|
def save_json(data: dict, filepath: Path):
    """Write *data* to *filepath* as pretty-printed UTF-8 JSON (non-ASCII kept literal)."""
    with filepath.open('w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=2, ensure_ascii=False)
|
|
|
|
def find_matching_harvest_record(
    institution: dict,
    harvest_archives: List[dict],
    threshold: float = FUZZY_MATCH_THRESHOLD
) -> Optional[dict]:
    """
    Locate the best-matching Thüringen harvest archive for *institution*.

    A candidate qualifies when its fuzzy name similarity reaches *threshold*
    AND either its city equals the institution's first-location city or the
    name similarity alone is at least 95 (high confidence). Among qualifying
    candidates, the highest-scoring one is returned.

    Args:
        institution: Institution from German dataset
        harvest_archives: List of Thüringen harvest archives
        threshold: Minimum similarity score (0-100)

    Returns:
        Matching harvest record or None
    """
    target_name = institution['name'].lower()
    locations = institution.get('locations')
    target_city = locations[0].get('city') if locations else None

    winner = None
    winner_score = 0

    for candidate in harvest_archives:
        score = fuzz.ratio(target_name, candidate['name'].lower())

        # City agreement upgrades confidence for borderline name matches.
        same_city = bool(
            target_city
            and candidate.get('city')
            and target_city.lower() == candidate['city'].lower()
        )

        qualifies = score >= threshold and (same_city or score >= 95)
        if qualifies and score > winner_score:
            winner = candidate
            winner_score = score

    return winner if winner_score >= threshold else None
|
|
|
|
def enrich_institution_with_harvest_data(institution: dict, harvest_record: dict) -> dict:
    """
    Return a copy of *institution* enriched with v4.0 harvest metadata.

    Adds/updates (never overwriting values already present):
    - Physical address (street, postal code) on the first location
    - Contact (email, phone, fax, website)
    - Administrative (director, opening hours)
    - Collections (size, temporal coverage) — appended to any existing list
    - Description (archive history, truncated to 2000 chars)

    The input *institution* is left unmodified; the caller's dataset changes
    only through the returned record.
    """
    # BUGFIX: a shallow .copy() shared the nested 'locations'/'provenance'
    # dicts with the original, so enrichment silently mutated the caller's
    # data. Deep-copy to keep the input pristine.
    enriched = copy.deepcopy(institution)

    # Update first location with the harvest's physical (or postal) address,
    # filling only fields that are still empty.
    physical_addr = harvest_record.get('physical_address') or harvest_record.get('postal_address')
    if physical_addr and enriched.get('locations'):
        location = enriched['locations'][0]
        if physical_addr.get('street') and not location.get('street_address'):
            location['street_address'] = physical_addr['street']
        if physical_addr.get('postal_code') and not location.get('postal_code'):
            location['postal_code'] = physical_addr['postal_code']

    # Contact info: harvest values fill gaps; pre-existing entries win
    # (consistent with the address logic above and the module's
    # "preserve existing" strategy).
    contact = {
        key: harvest_record[key]
        for key in ('email', 'phone', 'fax', 'website')
        if harvest_record.get(key)
    }
    if contact:
        enriched['contact'] = {**contact, **(enriched.get('contact') or {})}

    # Administrative info, merged the same way.
    administrative = {
        key: harvest_record[key]
        for key in ('director', 'opening_hours')
        if harvest_record.get(key)
    }
    if administrative:
        enriched['administrative'] = {**administrative, **(enriched.get('administrative') or {})}

    # Collections metadata: append a new entry rather than replacing any
    # collections the record already carries.
    collection = {
        key: harvest_record[key]
        for key in ('collection_size', 'temporal_coverage')
        if harvest_record.get(key)
    }
    if collection:
        enriched['collections'] = list(enriched.get('collections') or []) + [collection]

    # Use the archive history as description when none exists yet; cap at
    # 2000 chars to keep records compact.
    if harvest_record.get('archive_history') and not enriched.get('description'):
        history = harvest_record['archive_history']
        enriched['description'] = history[:2000] + '...' if len(history) > 2000 else history

    # Record when/where this enrichment happened (only if the record already
    # tracks provenance).
    if enriched.get('provenance'):
        enriched['provenance']['enrichment_date'] = datetime.now(timezone.utc).isoformat()
        enriched['provenance']['enrichment_source'] = 'Thüringen archives v4.0 (95.6% completeness)'

    return enriched
|
|
|
|
def enrich_existing_thueringen_records():
    """Run the end-to-end enrichment: load datasets, match, enrich, report, save."""
    banner = "=" * 70
    print("🔬 Enriching Existing Thüringen Records with v4.0 Metadata")
    print(banner)
    print()

    # --- Load both datasets ---
    print("📁 Loading datasets...")
    unified_path = Path("data/isil/germany/german_institutions_unified_v4_20251120_113920.json")
    harvest_path = Path("data/isil/germany/thueringen_archives_100percent_20251120_095757.json")

    german_data = load_json(unified_path)
    harvest_data = load_json(harvest_path)

    print(f" German unified v4: {len(german_data['institutions'])} institutions")
    print(f" Thüringen harvest v4.0: {len(harvest_data['archives'])} archives")
    print()

    # --- Statistics accumulators ---
    tracked_fields = ("contact", "administrative", "collections", "description")
    stats = {
        "total_checked": 0,
        "matches_found": 0,
        "enriched_count": 0,
        "fields_added": {field: 0 for field in tracked_fields},
    }

    def looks_like_thueringen(record: dict) -> bool:
        """Heuristic: region name or source portal points to Thüringen."""
        locations = record.get('locations')
        if locations:
            region = locations[0].get('region', '').lower()
            if 'thüringen' in region or 'thuringen' in region:
                return True
        portals = record.get('source_portals')
        return bool(portals) and 'archive-in-thueringen.de' in portals

    # --- Match and enrich ---
    print("🔍 Matching and enriching records...")
    print()

    result_institutions = []

    for inst in german_data['institutions']:
        stats["total_checked"] += 1

        # Non-Thüringen records pass through untouched.
        if not looks_like_thueringen(inst):
            result_institutions.append(inst)
            continue

        match = find_matching_harvest_record(inst, harvest_data['archives'])
        if match is None:
            result_institutions.append(inst)
            continue

        stats["matches_found"] += 1

        # Remember which tracked sections were empty before enrichment.
        missing_before = [f for f in tracked_fields if not inst.get(f)]

        enriched = enrich_institution_with_harvest_data(inst, match)

        for field in missing_before:
            if enriched.get(field):
                stats["fields_added"][field] += 1

        result_institutions.append(enriched)
        stats["enriched_count"] += 1
        print(f" ✅ ENRICHED: {inst['name']}")

    # --- Report ---
    print()
    print(banner)
    print("📊 Enrichment Statistics")
    print(banner)
    print(f" Total institutions checked: {stats['total_checked']}")
    print(f" Thüringen matches found: {stats['matches_found']}")
    print(f" Records enriched: {stats['enriched_count']}")
    print()
    print(" Fields added:")
    print(f" Contact metadata: {stats['fields_added']['contact']} records")
    print(f" Administrative metadata: {stats['fields_added']['administrative']} records")
    print(f" Collections metadata: {stats['fields_added']['collections']} records")
    print(f" Descriptions: {stats['fields_added']['description']} records")
    print()

    # --- Update dataset metadata ---
    german_data['institutions'] = result_institutions
    german_data['metadata'] = {
        "version": "v4.0-enriched",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "total_institutions": len(result_institutions),
        "sources": ["ISIL Registry", "DDB SPARQL", "NRW Archives Portal", "Thüringen Archives Portal v4.0 (95.6% metadata completeness)"],
        "enrichment_statistics": stats,
        "thueringen_v4_features": {
            "physical_addresses": "100%",
            "directors": "96%",
            "opening_hours": "99.3%",
            "archive_histories": "84.6%",
            "overall_completeness": "95.6%",
        },
    }

    # --- Persist enriched dataset ---
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = Path(f"data/isil/germany/german_institutions_unified_v4_enriched_{timestamp}.json")
    save_json(german_data, output_path)

    print(f"💾 Saved: {output_path}")
    print(f" File size: {output_path.stat().st_size / 1024 / 1024:.1f} MB")
    print()
    print("✅ Enrichment complete!")
|
|
|
|
# Script entry point: run the full enrichment pipeline when executed directly.
if __name__ == '__main__':
    enrich_existing_thueringen_records()
|