glam/scripts/scrapers/merge_archivportal_isil.py
2025-11-19 23:25:22 +01:00

338 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Merge Archivportal-D and ISIL Data
This script cross-references archives from Archivportal-D with the ISIL registry
to identify overlaps and new discoveries.
Matching strategy:
1. Primary: Match by ISIL code (exact match)
2. Secondary: Match by name + city (fuzzy match > 0.85)
Outputs:
- Overlap report (archives in both sources)
- New discoveries (archives only in Archivportal-D)
- ISIL-only archives (not in Archivportal-D)
Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Tuple

from rapidfuzz import fuzz
# Configuration
# NOTE(review): hard-coded absolute path to one developer's machine —
# consider an env var or CLI argument so others can run this. TODO confirm.
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/germany")
# Merged outputs are written next to the input data.
OUTPUT_DIR = DATA_DIR
FUZZY_THRESHOLD = 85 # Minimum similarity score for name matching
def load_isil_data() -> List[Dict]:
    """Load the most recent ISIL registry export.

    Fix vs. original: the filename was hard-coded to one dated export
    (``german_isil_complete_20251119_134939.json``), which silently
    breaks after every new harvest. This now mirrors
    load_archivportal_data() and picks the newest matching file.

    Returns:
        List of institution record dicts ([] if the file lacks an
        'institutions' key).

    Raises:
        FileNotFoundError: if no ISIL export exists in DATA_DIR.
    """
    isil_files = list(DATA_DIR.glob("german_isil_complete_*.json"))
    if not isil_files:
        raise FileNotFoundError(
            f"No ISIL data found in {DATA_DIR}. "
            "Run the ISIL harvest script first."
        )
    # Most recently modified export wins.
    isil_file = max(isil_files, key=lambda p: p.stat().st_mtime)
    print(f"Loading ISIL data from: {isil_file}")
    with open(isil_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    institutions = data.get('institutions', [])
    print(f" Loaded {len(institutions)} ISIL records\n")
    return institutions
def load_archivportal_data() -> List[Dict]:
    """Load the newest Archivportal-D harvest file from DATA_DIR.

    Returns:
        List of archive record dicts from the most recently modified
        ``archivportal_d_api_*.json`` file ([] if the file lacks an
        'archives' key).

    Raises:
        FileNotFoundError: if no harvest file exists yet.
    """
    candidates = list(DATA_DIR.glob("archivportal_d_api_*.json"))
    if not candidates:
        raise FileNotFoundError(
            f"No Archivportal-D data found in {DATA_DIR}. "
            "Run harvest_archivportal_d_api.py first."
        )
    # Newest file by modification time is the current harvest.
    latest_file = max(candidates, key=lambda path: path.stat().st_mtime)
    print(f"Loading Archivportal-D data from: {latest_file}")
    with open(latest_file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    archives = payload.get('archives', [])
    print(f" Loaded {len(archives)} archive records\n")
    return archives
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy comparison.

    Lowercases, strips punctuation, collapses whitespace, and drops
    generic archive-type words ('stadtarchiv', 'archiv', ...) plus
    German function words ('der', 'des', 'von').

    Bug fix vs. original: stopword removal now operates on whole
    words. The old ``replace('der ', '')`` approach also mangled
    words that merely *contain* a stopword (e.g. 'bruder haus'
    became 'bruhaus').

    Args:
        name: Raw institution name; may be None or empty.

    Returns:
        Normalized space-joined name, or '' for falsy input.
    """
    if not name:
        return ""
    stopwords = {'stadtarchiv', 'kreisarchiv', 'landesarchiv',
                 'staatsarchiv', 'archiv', 'der', 'des', 'von'}
    # Single C-level pass to delete punctuation (was 5 chained replaces).
    cleaned = name.lower().translate(str.maketrans('', '', ',.;:-'))
    # Whole-word stopword removal; split() also collapses whitespace.
    kept = [word for word in cleaned.split() if word not in stopwords]
    return " ".join(kept)
def match_by_isil(
    archivportal_records: List[Dict],
    isil_records: List[Dict]
) -> Tuple[List[Dict], List[Dict], List[Dict]]:
    """Pair archives with ISIL registry entries by exact ISIL code.

    Returns:
        (matched_pairs, archivportal_only, isil_only) — each match
        dict carries both source records, 'match_type' == 'isil_exact'
        and 'confidence' == 1.0.
    """
    print("Matching by ISIL code...")
    # Index the registry by ISIL code for O(1) lookups; records
    # without a code are skipped (they can never match here).
    isil_by_code = {
        rec['isil_code']: rec
        for rec in isil_records
        if rec.get('isil_code')
    }
    matched_pairs: List[Dict] = []
    archivportal_only: List[Dict] = []
    for ap_record in archivportal_records:
        code = ap_record.get('isil')
        if code and code in isil_by_code:
            matched_pairs.append({
                'archivportal': ap_record,
                'isil': isil_by_code[code],
                'match_type': 'isil_exact',
                'confidence': 1.0
            })
        else:
            archivportal_only.append(ap_record)
    # Registry entries never claimed by an Archivportal record.
    claimed_codes = {pair['isil']['isil_code'] for pair in matched_pairs}
    isil_only = [
        rec for rec in isil_records
        if rec.get('isil_code') not in claimed_codes
    ]
    print(f" Matched by ISIL: {len(matched_pairs)}")
    print(f" Archivportal-only: {len(archivportal_only)}")
    print(f" ISIL-only: {len(isil_only)}\n")
    return matched_pairs, archivportal_only, isil_only
def match_by_name_location(
    archivportal_records: List[Dict],
    isil_records: List[Dict]
) -> Tuple[List[Dict], List[Dict]]:
    """Fuzzy-match archives to ISIL records by normalized name within a city.

    A candidate pair must share an exact (case-insensitive, stripped)
    city string; among those, the best fuzz.ratio score that reaches
    FUZZY_THRESHOLD wins.

    Fixes vs. original:
    - ``rec.get('city', '')`` returns None when the key exists with a
      None value, crashing ``.lower()`` — guarded with ``or ''``.
    - ISIL records are grouped by city once (O(n + m)) instead of
      re-scanning the whole registry per archive (O(n * m)).

    Returns:
        (additional_matches, unmatched) — match dicts carry
        'match_type' == 'name_location_fuzzy' and a 0..1 confidence.
    """
    print("Matching by name + location (fuzzy)...")
    # One-pass grouping of the registry by normalized city string.
    isil_by_city: Dict[str, List[Dict]] = {}
    for rec in isil_records:
        city = (rec.get('city') or '').lower().strip()
        isil_by_city.setdefault(city, []).append(rec)
    additional_matches: List[Dict] = []
    unmatched: List[Dict] = []
    for ap_record in archivportal_records:
        ap_name = normalize_name(ap_record.get('name') or '')
        ap_city = (ap_record.get('location') or '').lower().strip()
        best_match = None
        best_score = 0.0
        # Only records in the same city are candidates.
        for isil_record in isil_by_city.get(ap_city, []):
            isil_name = normalize_name(isil_record.get('institution_name') or '')
            score = fuzz.ratio(ap_name, isil_name)
            if score > best_score and score >= FUZZY_THRESHOLD:
                best_score = score
                best_match = isil_record
        if best_match:
            additional_matches.append({
                'archivportal': ap_record,
                'isil': best_match,
                'match_type': 'name_location_fuzzy',
                'confidence': best_score / 100.0
            })
        else:
            unmatched.append(ap_record)
    print(f" Additional matches: {len(additional_matches)}")
    print(f" Unmatched: {len(unmatched)}\n")
    return additional_matches, unmatched
def generate_overlap_report(
    matched_pairs: List[Dict],
    archivportal_only: List[Dict],
    isil_only: List[Dict]
) -> Dict:
    """Print overlap statistics and return them as a summary dict.

    Fix vs. original: every percentage is now guarded against an
    empty denominator. The original guarded only the returned
    'overlap_percentage' and raised ZeroDivisionError in the printed
    percentages when either source was empty.

    Returns:
        Dict with totals, match count, per-source exclusives and the
        overlap percentage (0.0 when there is no Archivportal data).
    """
    total_archivportal = len(matched_pairs) + len(archivportal_only)
    total_isil = len(matched_pairs) + len(isil_only)

    def pct(part: int, whole: int) -> float:
        """Percentage of *part* in *whole*; 0.0 for an empty whole."""
        return part / whole * 100 if whole else 0.0

    print(f"\n{'='*70}")
    print("OVERLAP REPORT")
    print(f"{'='*70}")
    print(f"\nTotal Archivportal-D archives: {total_archivportal}")
    print(f"Total ISIL archives (all types): {total_isil}")
    print(f"\nMatched (in both sources): {len(matched_pairs)}")
    print(f" - ISIL exact match: {sum(1 for m in matched_pairs if m['match_type'] == 'isil_exact')}")
    print(f" - Name+location fuzzy: {sum(1 for m in matched_pairs if m['match_type'] == 'name_location_fuzzy')}")
    print(f"\nArchivportal-only (new discoveries): {len(archivportal_only)}")
    print(f" Percentage of Archivportal data: {pct(len(archivportal_only), total_archivportal):.1f}%")
    print(f"\nISIL-only (not in Archivportal): {len(isil_only)}")
    print(f" Percentage of ISIL data: {pct(len(isil_only), total_isil):.1f}%")
    # Tally ISIL-only records by institution type for the breakdown.
    isil_only_by_type: Dict[str, int] = {}
    for record in isil_only:
        inst_type = record.get('institution_type', 'Unknown')
        isil_only_by_type[inst_type] = isil_only_by_type.get(inst_type, 0) + 1
    print(f"\nISIL-only by institution type:")
    for inst_type, count in sorted(isil_only_by_type.items(), key=lambda x: x[1], reverse=True):
        print(f" - {inst_type}: {count}")
    print(f"{'='*70}\n")
    return {
        'total_archivportal': total_archivportal,
        'total_isil': total_isil,
        'matched': len(matched_pairs),
        'archivportal_only': len(archivportal_only),
        'isil_only': len(isil_only),
        'overlap_percentage': pct(len(matched_pairs), total_archivportal)
    }
def save_results(
    matched_pairs: List[Dict],
    archivportal_only: List[Dict],
    isil_only: List[Dict],
    stats: Dict
):
    """Persist merge results to four timestamped JSON files in OUTPUT_DIR.

    Writes: matched pairs, new discoveries (Archivportal-only),
    ISIL-only institutions, and the statistics summary.

    Fixes vs. original:
    - ``datetime.utcnow()`` is deprecated since Python 3.12; replaced
      with an aware UTC timestamp rendered to the same ``...Z`` string.
    - The three near-identical dump sections are deduplicated into a
      local helper so metadata stays consistent.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    # Aware UTC time; swap '+00:00' for the conventional trailing 'Z'
    # so the output format is identical to the old utcnow() + 'Z'.
    merge_date = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')

    def dump(path: Path, description: str, records_key: str, records: List[Dict]) -> None:
        """Write one result file with the shared metadata envelope."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump({
                'metadata': {
                    'description': description,
                    'total': len(records),
                    'merge_date': merge_date
                },
                records_key: records
            }, f, ensure_ascii=False, indent=2)

    matched_file = OUTPUT_DIR / f"merged_matched_{timestamp}.json"
    dump(matched_file, 'Archives matched between Archivportal-D and ISIL',
         'matches', matched_pairs)
    print(f"✓ Saved matched pairs: {matched_file}")

    new_file = OUTPUT_DIR / f"merged_new_discoveries_{timestamp}.json"
    dump(new_file, 'Archives in Archivportal-D but not in ISIL',
         'archives', archivportal_only)
    print(f"✓ Saved new discoveries: {new_file}")

    isil_only_file = OUTPUT_DIR / f"merged_isil_only_{timestamp}.json"
    dump(isil_only_file, 'Institutions in ISIL but not in Archivportal-D',
         'institutions', isil_only)
    print(f"✓ Saved ISIL-only: {isil_only_file}")

    stats_file = OUTPUT_DIR / f"merged_stats_{timestamp}.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved statistics: {stats_file}\n")
def main():
    """Run the full merge pipeline: load, match (two passes), report, save."""
    banner = '#' * 70
    print(f"\n{banner}")
    print(f"# Merge Archivportal-D and ISIL Data")
    print(f"{banner}\n")

    isil_records = load_isil_data()
    archivportal_records = load_archivportal_data()

    # Pass 1: exact ISIL-code matching.
    matched_pairs, archivportal_only, isil_only = match_by_isil(
        archivportal_records, isil_records
    )

    # Pass 2: fuzzy name + city matching over the leftovers of pass 1.
    additional_matches, unmatched = match_by_name_location(
        archivportal_only, isil_only
    )

    all_matched = matched_pairs + additional_matches
    archivportal_only = unmatched

    # Drop every ISIL record claimed by either matching pass.
    claimed_codes = {m['isil'].get('isil_code') for m in all_matched}
    isil_only = [
        rec for rec in isil_only
        if rec.get('isil_code') not in claimed_codes
    ]

    stats = generate_overlap_report(all_matched, archivportal_only, isil_only)
    save_results(all_matched, archivportal_only, isil_only, stats)

    print("✓ Merge complete!\n")
    print("Next steps:")
    print(" 1. Review matched pairs for accuracy")
    print(" 2. Investigate high-confidence fuzzy matches")
    print(" 3. Run unified dataset builder script\n")
if __name__ == "__main__":
main()