#!/usr/bin/env python3
"""
Merge Thüringen Archives v4.0 (100% Metadata) with German Unified Dataset

Integrates the 149 Thüringen archives from archive-in-thueringen.de with the existing
German unified dataset (20,935 institutions from ISIL + DDB + NRW).

Features:
- Deduplication by name fuzzy matching (>90% similarity)
- Rich metadata extraction (physical addresses, directors, opening hours, archive histories)
- Uses pre-geocoded coordinates from harvest
- Preserves existing data quality (ISIL codes, coordinates)
- Adds Thüringen-specific detailed metadata

Input:
- data/isil/germany/german_institutions_unified_v3_20251120_091059.json (20,935)
- data/isil/germany/thueringen_archives_100percent_20251120_095757.json (149)

Output:
- data/isil/germany/german_institutions_unified_v4_{timestamp}.json
- Merge statistics report

Metadata Coverage v4.0:
- Physical addresses: 100% (vs 0% in v2.0)
- Directors: 96% (vs 0% in v2.0)
- Opening hours: 99.3% (vs 0% in v2.0)
- Archive histories: 84.6% (vs 0% in v2.0)
- Overall completeness: 95.6% (vs 60% in v2.0)

Author: OpenCode AI Agent
Date: 2025-11-20
"""
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional  # NOTE(review): Dict appears unused in this module

# Third-party: rapidfuzz provides fast Levenshtein-based string similarity.
from rapidfuzz import fuzz

# Constants
# Minimum name-similarity score (0-100 scale) for two records to be
# considered duplicates during the merge.
FUZZY_MATCH_THRESHOLD = 90.0  # 90% similarity for deduplication
def load_json(filepath: Path) -> dict:
    """Read *filepath* as UTF-8 text and return the parsed JSON payload."""
    return json.loads(filepath.read_text(encoding='utf-8'))
def save_json(data: dict, filepath: Path):
    """Serialize *data* to *filepath* as pretty-printed UTF-8 JSON (non-ASCII kept literal)."""
    filepath.write_text(
        json.dumps(data, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )
# Archive-type terms stripped before fuzzy comparison. Order matters:
# the compound forms must be removed before the bare 'archiv' substring.
_ARCHIVE_TERMS = ('stadtarchiv', 'kreisarchiv', 'staatsarchiv', 'archiv')


def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    Lowercases the name, removes common archive-type terms
    ('stadtarchiv', 'kreisarchiv', 'staatsarchiv', 'archiv'), and
    collapses the remaining whitespace. The whitespace collapse fixes a
    defect in the previous version: removing a term from the middle of a
    name left doubled internal spaces (e.g. 'Stadtarchiv und Kreisarchiv X'
    became 'und  x'), which skewed downstream similarity scores.

    Args:
        name: Raw institution name.

    Returns:
        Lowercased name with archive terms removed and single spaces.
    """
    normalized = name.lower()
    for term in _ARCHIVE_TERMS:
        normalized = normalized.replace(term, '')
    # Collapse runs of whitespace created by mid-string removals.
    return ' '.join(normalized.split())
def find_duplicate(
    institution: dict,
    existing_institutions: List[dict],
    threshold: float = FUZZY_MATCH_THRESHOLD
) -> Optional[dict]:
    """
    Find a duplicate of *institution* in the existing dataset via fuzzy
    name matching.

    A candidate matches when its name similarity reaches *threshold* AND
    either the cities agree or the similarity is very high (>= 95) —
    the city check guards against same-named archives in different towns.

    Args:
        institution: Institution to check.
        existing_institutions: List of existing institutions.
        threshold: Minimum similarity score (0-100).

    Returns:
        The first matching institution, or None when nothing matches.
    """
    candidate_name = institution['name'].lower()
    candidate_city = institution.get('city', '').lower()

    for record in existing_institutions:
        similarity = fuzz.ratio(candidate_name, record['name'].lower())
        if similarity < threshold:
            continue

        # High-confidence name match needs no city confirmation.
        if similarity >= 95:
            return record

        # Otherwise require the cities to agree (first location wins).
        locations = record.get('locations')
        if candidate_city and locations:
            record_city = locations[0].get('city')
            if record_city and record_city.lower() == candidate_city:
                return record

    return None
def _build_location(archive: dict) -> dict:
    """Build the single locations entry: city/region/country plus optional street address and coordinates."""
    location = {
        "city": archive.get('city'),
        "region": archive.get('region', 'Thüringen'),
        "country": archive.get('country', 'DE')
    }

    # Street-level address details (NEW in v4.0); physical address preferred
    # over postal when both are present.
    address = archive.get('physical_address') or archive.get('postal_address')
    if address:
        if address.get('street'):
            location['street_address'] = address['street']
        if address.get('postal_code'):
            location['postal_code'] = address['postal_code']

    # Pre-geocoded coordinates. `is not None` (rather than truthiness) so a
    # legitimate 0.0 coordinate would not be silently dropped.
    if archive.get('latitude') is not None and archive.get('longitude') is not None:
        location['latitude'] = archive['latitude']
        location['longitude'] = archive['longitude']

    return location


def _build_identifiers(archive: dict) -> List[dict]:
    """Build identifier entries: institution website first, then the portal detail page."""
    identifiers = []
    for scheme, key in (("Website", 'website'), ("Portal", 'url')):
        value = archive.get(key)
        if value:
            identifiers.append({
                "identifier_scheme": scheme,
                "identifier_value": value,
                "identifier_url": value
            })
    return identifiers


def _copy_present(archive: dict, keys) -> dict:
    """Return a dict of *keys* whose values are truthy in *archive* (order preserved)."""
    return {key: archive[key] for key in keys if archive.get(key)}


def convert_thueringen_to_german_format(thueringen_archive: dict) -> dict:
    """
    Convert a Thüringen v4.0 harvest record to the German unified dataset format.

    The harvest record is a flat dict (id, name, institution_type, city,
    latitude/longitude, email/phone/fax/website, physical_address/
    postal_address, opening_hours, director, collection_size,
    temporal_coverage, archive_history, source_portal, provenance).
    The unified record nests those into `locations`, `identifiers`,
    `contact`, `administrative`, `collections`, `description`,
    `source_portals` and `provenance`, with optional sections emitted
    only when the harvest supplied data.

    Changes vs the previous version:
    - removed the dead `name_slug` computation (it was never used; the
      W3ID is derived from the harvest id),
    - coordinates are checked with `is not None` instead of truthiness,
    - a missing `provenance` key no longer raises KeyError.

    Args:
        thueringen_archive: One archive dict from the Thüringen harvest.

    Returns:
        A dict in the German unified dataset format.
    """
    # Stable W3ID URI derived directly from the harvest record id.
    w3id = f"https://w3id.org/heritage/custodian/de/thueringen-{thueringen_archive['id']}"

    record = {
        "id": w3id,
        "name": thueringen_archive['name'],
        "institution_type": thueringen_archive['institution_type'],
        "locations": [_build_location(thueringen_archive)],
        "identifiers": _build_identifiers(thueringen_archive),
        "source_portals": [thueringen_archive.get('source_portal', 'archive-in-thueringen.de')],
        "provenance": {
            "data_source": "WEB_SCRAPING",
            "data_tier": "TIER_2_VERIFIED",
            "extraction_date": thueringen_archive.get('provenance', {}).get('extraction_date'),
            "extraction_method": "Thüringen archives portal v4.0 (100% metadata goal) + fuzzy deduplication",
            "source_url": thueringen_archive.get('url', ''),
            "confidence_score": 0.95
        }
    }

    # Optional sections are added only when the harvest supplied data.
    contact = _copy_present(thueringen_archive, ('email', 'phone', 'fax', 'website'))
    if contact:
        record['contact'] = contact

    administrative = _copy_present(thueringen_archive, ('director', 'opening_hours'))
    if administrative:
        record['administrative'] = administrative

    collection = _copy_present(thueringen_archive, ('collection_size', 'temporal_coverage'))
    if collection:
        record['collections'] = [collection]

    # Archive history (84.6% coverage) becomes the description,
    # truncated to the first 2000 characters.
    history = thueringen_archive.get('archive_history')
    if history:
        record['description'] = history[:2000] + '...' if len(history) > 2000 else history

    return record
def merge_thueringen_archives():
    """
    Merge the Thüringen v4.0 harvest into the German unified dataset.

    Loads the v3 unified dataset and the Thüringen harvest, skips
    archives that fuzzy-match an existing institution, converts the rest
    to the unified format, writes a timestamped v4 output file, and
    prints merge statistics.

    Fixes vs the previous version:
    - progress/statistics labels said "v2"/"v2 → v3" although this merge
      takes the v3 dataset to v4 (matching the output filename and the
      metadata block); labels now say v3/v4,
    - percentage reporting no longer raises ZeroDivisionError when the
      harvest is empty (previously only one of three divisions was guarded).
    """
    print("🔀 Merging Thüringen Archives into German Unified Dataset")
    print("=" * 70)
    print()

    # Load datasets (paths are fixed snapshots of the two inputs).
    print("📁 Loading datasets...")
    german_unified_path = Path("data/isil/germany/german_institutions_unified_v3_20251120_091059.json")
    thueringen_harvest_path = Path("data/isil/germany/thueringen_archives_100percent_20251120_095757.json")

    german_data = load_json(german_unified_path)
    thueringen_data = load_json(thueringen_harvest_path)

    print(f" German unified v3: {len(german_data['institutions'])} institutions")
    print(f" Thüringen harvest: {len(thueringen_data['archives'])} archives")
    print()

    # Statistics accumulated during the merge pass.
    total = len(thueringen_data['archives'])
    stats = {
        "duplicates_found": 0,
        "new_additions": 0,
        "geocoded_count": 0,
        "thueringen_total": total
    }

    # Process Thüringen archives: skip fuzzy duplicates, convert the rest.
    print("🔍 Deduplicating and merging...")

    for thueringen_archive in thueringen_data['archives']:
        duplicate = find_duplicate(thueringen_archive, german_data['institutions'])

        if duplicate:
            stats["duplicates_found"] += 1
            print(f" ⏭️ SKIP (duplicate): {thueringen_archive['name']} → {duplicate['name']}")
        else:
            german_record = convert_thueringen_to_german_format(thueringen_archive)
            german_data['institutions'].append(german_record)
            stats["new_additions"] += 1

            if german_record['locations'][0].get('latitude'):
                stats["geocoded_count"] += 1

            print(f" ✅ ADD: {german_record['name']} ({german_record['locations'][0].get('city', 'no city')})")

    def pct(part: int, whole: int) -> float:
        """Percentage that tolerates an empty denominator (returns 0.0)."""
        return part / whole * 100 if whole else 0.0

    print()
    print("=" * 70)
    print("📊 Merge Statistics")
    print("=" * 70)
    print(f" Thüringen archives processed: {stats['thueringen_total']}")
    print(f" Duplicates (skipped): {stats['duplicates_found']} ({pct(stats['duplicates_found'], total):.1f}%)")
    print(f" New additions: {stats['new_additions']} ({pct(stats['new_additions'], total):.1f}%)")
    print(f" With coordinates: {stats['geocoded_count']}/{stats['new_additions']} ({pct(stats['geocoded_count'], stats['new_additions']):.1f}%)")
    print()
    print(f" German dataset v3 (before): {len(german_data['institutions']) - stats['new_additions']}")
    print(f" German dataset v4 (after): {len(german_data['institutions'])}")
    print(f" Growth: +{stats['new_additions']} institutions")
    print()

    # Update the dataset-level metadata to describe the v4 merge.
    german_data['metadata'] = {
        "version": "v4.0",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "total_institutions": len(german_data['institutions']),
        "sources": ["ISIL Registry", "DDB SPARQL", "NRW Archives Portal", "Thüringen Archives Portal v4.0 (95.6% metadata completeness)"],
        "merge_statistics": stats,
        "thueringen_v4_features": {
            "physical_addresses": "100%",
            "directors": "96%",
            "opening_hours": "99.3%",
            "archive_histories": "84.6%",
            "overall_completeness": "95.6%"
        }
    }

    # Save merged dataset under a timestamped v4 filename.
    output_path = Path(f"data/isil/germany/german_institutions_unified_v4_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json")
    save_json(german_data, output_path)

    print(f"💾 Saved: {output_path}")
    print(f" File size: {output_path.stat().st_size / 1024 / 1024:.1f} MB")
    print()
    print("✅ Merge complete!")
# Script entry point: run the merge only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    merge_thueringen_archives()