#!/usr/bin/env python3
|
|
"""
|
|
Merge Archivportal-D and ISIL Data
|
|
|
|
This script cross-references archives from Archivportal-D with the ISIL registry
|
|
to identify overlaps and new discoveries.
|
|
|
|
Matching strategy:
|
|
1. Primary: Match by ISIL code (exact match)
|
|
2. Secondary: Match by name + city (fuzzy match > 0.85)
|
|
|
|
Outputs:
|
|
- Overlap report (archives in both sources)
|
|
- New discoveries (archives only in Archivportal-D)
|
|
- ISIL-only archives (not in Archivportal-D)
|
|
|
|
Author: OpenCode + MCP Tools
|
|
Date: 2025-11-19
|
|
"""
|
|
|
|
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Tuple

from rapidfuzz import fuzz
|
|
|
|
# Configuration
|
|
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
|
|
OUTPUT_DIR = DATA_DIR
|
|
FUZZY_THRESHOLD = 85 # Minimum similarity score for name matching
|
|
|
|
|
|
def load_isil_data() -> List[Dict]:
    """Load the most recent ISIL registry export from DATA_DIR.

    Returns:
        List of institution dicts from the export's 'institutions' key.

    Raises:
        FileNotFoundError: If no ISIL export file is present in DATA_DIR.
    """
    # Glob for the newest export instead of a hard-coded timestamped
    # filename, which broke whenever the harvester regenerated the file.
    # This mirrors load_archivportal_data() for consistency.
    isil_files = list(DATA_DIR.glob("german_isil_complete_*.json"))

    if not isil_files:
        raise FileNotFoundError(
            f"No ISIL data found in {DATA_DIR}. "
            "Run the ISIL harvester first."
        )

    # Use most recent file
    isil_file = max(isil_files, key=lambda p: p.stat().st_mtime)

    print(f"Loading ISIL data from: {isil_file}")
    with open(isil_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions = data.get('institutions', [])
    print(f" Loaded {len(institutions)} ISIL records\n")

    return institutions
|
|
|
|
|
|
def load_archivportal_data() -> List[Dict]:
    """Load Archivportal-D data (most recent file)."""
    candidates = sorted(
        DATA_DIR.glob("archivportal_d_api_*.json"),
        key=lambda path: path.stat().st_mtime,
    )

    if not candidates:
        raise FileNotFoundError(
            f"No Archivportal-D data found in {DATA_DIR}. "
            "Run harvest_archivportal_d_api.py first."
        )

    # Newest export (by modification time) wins.
    latest_file = candidates[-1]

    print(f"Loading Archivportal-D data from: {latest_file}")
    with open(latest_file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    archives = payload.get('archives', [])
    print(f" Loaded {len(archives)} archive records\n")

    return archives
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy comparison.

    Lowercases, removes common German archive-type words and filler
    particles, strips a small set of punctuation characters, and trims
    surrounding whitespace. Returns "" for empty/None input.
    """
    if not name:
        return ""

    result = name.lower()

    # NOTE: str.replace removes the word anywhere in the string (followed
    # by a space), not only as a true prefix — kept as-is to preserve the
    # existing matching behavior.
    for word in ('stadtarchiv', 'kreisarchiv', 'landesarchiv',
                 'staatsarchiv', 'archiv', 'der', 'des', 'von'):
        result = result.replace(word + " ", "")

    # Drop punctuation in a single translate pass.
    result = result.translate(str.maketrans('', '', ",.;:-"))

    return result.strip()
|
|
|
|
|
|
def match_by_isil(
    archivportal_records: List[Dict],
    isil_records: List[Dict]
) -> Tuple[List[Dict], List[Dict], List[Dict]]:
    """
    Match archives by ISIL code.

    Returns:
        (matched_pairs, archivportal_only, isil_only)
    """
    print("Matching by ISIL code...")

    # Index ISIL records by code for O(1) lookup (records without a code
    # are skipped; duplicate codes keep the last record, as before).
    isil_by_code = {
        record['isil_code']: record
        for record in isil_records
        if record.get('isil_code')
    }

    matched_pairs: List[Dict] = []
    archivportal_only: List[Dict] = []

    for ap_record in archivportal_records:
        code = ap_record.get('isil')
        partner = isil_by_code.get(code) if code else None
        if partner is not None:
            matched_pairs.append({
                'archivportal': ap_record,
                'isil': partner,
                'match_type': 'isil_exact',
                'confidence': 1.0
            })
        else:
            archivportal_only.append(ap_record)

    # Everything in the ISIL registry whose code was never matched above.
    matched_codes = {pair['isil']['isil_code'] for pair in matched_pairs}
    isil_only = [
        record for record in isil_records
        if record.get('isil_code') not in matched_codes
    ]

    print(f" Matched by ISIL: {len(matched_pairs)}")
    print(f" Archivportal-only: {len(archivportal_only)}")
    print(f" ISIL-only: {len(isil_only)}\n")

    return matched_pairs, archivportal_only, isil_only
|
|
|
|
|
|
def match_by_name_location(
    archivportal_records: List[Dict],
    isil_records: List[Dict]
) -> Tuple[List[Dict], List[Dict]]:
    """
    Match archives by name + city (fuzzy matching).

    A candidate pair must share the same (lowercased, stripped) city;
    among same-city candidates the highest fuzz.ratio score at or above
    FUZZY_THRESHOLD wins.

    Returns:
        (additional_matches, unmatched)
    """
    print("Matching by name + location (fuzzy)...")

    # Pre-index ISIL records by city and normalize each ISIL name exactly
    # once. The original re-normalized every ISIL name for every
    # Archivportal record (O(A*I) normalizations); this keeps the same
    # match results (per-city order is preserved) at O(A+I) setup cost.
    isil_by_city: Dict[str, List[Tuple[str, Dict]]] = {}
    for isil_record in isil_records:
        isil_city = isil_record.get('city', '').lower().strip()
        isil_name = normalize_name(isil_record.get('institution_name', ''))
        isil_by_city.setdefault(isil_city, []).append((isil_name, isil_record))

    additional_matches = []
    unmatched = []

    for ap_record in archivportal_records:
        ap_name = normalize_name(ap_record.get('name', ''))
        ap_city = ap_record.get('location', '').lower().strip()

        best_match = None
        best_score = 0

        # Only same-city records are candidates (same rule as before).
        for isil_name, isil_record in isil_by_city.get(ap_city, []):
            score = fuzz.ratio(ap_name, isil_name)
            if score > best_score and score >= FUZZY_THRESHOLD:
                best_score = score
                best_match = isil_record

        if best_match:
            additional_matches.append({
                'archivportal': ap_record,
                'isil': best_match,
                'match_type': 'name_location_fuzzy',
                'confidence': best_score / 100.0
            })
        else:
            unmatched.append(ap_record)

    print(f" Additional matches: {len(additional_matches)}")
    print(f" Unmatched: {len(unmatched)}\n")

    return additional_matches, unmatched
|
|
|
|
|
|
def generate_overlap_report(
    matched_pairs: List[Dict],
    archivportal_only: List[Dict],
    isil_only: List[Dict]
) -> Dict:
    """Generate overlap statistics report.

    Prints a human-readable summary of the merge and returns a dict with
    the totals, match counts and overlap percentage.
    """
    total_archivportal = len(matched_pairs) + len(archivportal_only)
    total_isil = len(matched_pairs) + len(isil_only)

    def _pct(part: int, whole: int) -> float:
        # Guard against empty datasets: the original guarded the returned
        # overlap_percentage but the printed percentages raised
        # ZeroDivisionError when either total was 0.
        return part / whole * 100 if whole > 0 else 0.0

    print(f"\n{'='*70}")
    print("OVERLAP REPORT")
    print(f"{'='*70}")
    print(f"\nTotal Archivportal-D archives: {total_archivportal}")
    print(f"Total ISIL archives (all types): {total_isil}")
    print(f"\nMatched (in both sources): {len(matched_pairs)}")
    print(f" - ISIL exact match: {sum(1 for m in matched_pairs if m['match_type'] == 'isil_exact')}")
    print(f" - Name+location fuzzy: {sum(1 for m in matched_pairs if m['match_type'] == 'name_location_fuzzy')}")
    print(f"\nArchivportal-only (new discoveries): {len(archivportal_only)}")
    print(f" Percentage of Archivportal data: {_pct(len(archivportal_only), total_archivportal):.1f}%")
    print(f"\nISIL-only (not in Archivportal): {len(isil_only)}")
    print(f" Percentage of ISIL data: {_pct(len(isil_only), total_isil):.1f}%")

    # Count ISIL-only institutions per institution type.
    isil_only_by_type: Dict[str, int] = {}
    for record in isil_only:
        inst_type = record.get('institution_type', 'Unknown')
        isil_only_by_type[inst_type] = isil_only_by_type.get(inst_type, 0) + 1

    print(f"\nISIL-only by institution type:")
    for inst_type, count in sorted(isil_only_by_type.items(), key=lambda x: x[1], reverse=True):
        print(f" - {inst_type}: {count}")

    print(f"{'='*70}\n")

    return {
        'total_archivportal': total_archivportal,
        'total_isil': total_isil,
        'matched': len(matched_pairs),
        'archivportal_only': len(archivportal_only),
        'isil_only': len(isil_only),
        'overlap_percentage': _pct(len(matched_pairs), total_archivportal)
    }
|
|
|
|
|
|
def save_results(
    matched_pairs: List[Dict],
    archivportal_only: List[Dict],
    isil_only: List[Dict],
    stats: Dict
):
    """Save merge results to JSON files.

    Writes four timestamped files into OUTPUT_DIR: matched pairs, new
    discoveries (Archivportal-only), ISIL-only institutions, and the
    statistics dict.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    # Timezone-aware replacement for the deprecated datetime.utcnow();
    # the '+00:00' → 'Z' rewrite keeps the format previous exports used.
    merge_date = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')

    def _dump(path: Path, payload: Dict, label: str) -> None:
        # Shared writer for all output files (same JSON settings throughout).
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
        print(f"✓ Saved {label}: {path}")

    # Save matched pairs
    _dump(
        OUTPUT_DIR / f"merged_matched_{timestamp}.json",
        {
            'metadata': {
                'description': 'Archives matched between Archivportal-D and ISIL',
                'total': len(matched_pairs),
                'merge_date': merge_date
            },
            'matches': matched_pairs
        },
        'matched pairs',
    )

    # Save new discoveries
    _dump(
        OUTPUT_DIR / f"merged_new_discoveries_{timestamp}.json",
        {
            'metadata': {
                'description': 'Archives in Archivportal-D but not in ISIL',
                'total': len(archivportal_only),
                'merge_date': merge_date
            },
            'archives': archivportal_only
        },
        'new discoveries',
    )

    # Save ISIL-only
    _dump(
        OUTPUT_DIR / f"merged_isil_only_{timestamp}.json",
        {
            'metadata': {
                'description': 'Institutions in ISIL but not in Archivportal-D',
                'total': len(isil_only),
                'merge_date': merge_date
            },
            'institutions': isil_only
        },
        'ISIL-only',
    )

    # Save statistics (no metadata wrapper, matching previous exports)
    _dump(OUTPUT_DIR / f"merged_stats_{timestamp}.json", stats, 'statistics')
    print()
|
|
|
|
|
|
def main():
    """Main execution: load, match, report, save."""
    print(f"\n{'#'*70}")
    print(f"# Merge Archivportal-D and ISIL Data")
    print(f"{'#'*70}\n")

    # Load both datasets
    isil_records = load_isil_data()
    archivportal_records = load_archivportal_data()

    # Pass 1: exact ISIL-code matching
    matched_pairs, archivportal_only, isil_only = match_by_isil(
        archivportal_records, isil_records
    )

    # Pass 2: fuzzy name+city matching on the pass-1 leftovers
    additional_matches, unmatched = match_by_name_location(
        archivportal_only, isil_only
    )

    # Merge both passes; whatever pass 2 couldn't place stays
    # Archivportal-only.
    all_matched = matched_pairs + additional_matches
    archivportal_only = unmatched

    # Drop ISIL records consumed by the fuzzy pass from the ISIL-only set.
    consumed_codes = {m['isil'].get('isil_code') for m in all_matched}
    isil_only = [
        record for record in isil_only
        if record.get('isil_code') not in consumed_codes
    ]

    # Report and persist
    stats = generate_overlap_report(all_matched, archivportal_only, isil_only)
    save_results(all_matched, archivportal_only, isil_only, stats)

    print("✓ Merge complete!\n")
    print("Next steps:")
    print(" 1. Review matched pairs for accuracy")
    print(" 2. Investigate high-confidence fuzzy matches")
    print(" 3. Run unified dataset builder script\n")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|