# glam/scripts/match_zcbs_nde.py

#!/usr/bin/env python3
"""Match ZCBS institutions with NDE entries and create cross-reference."""

import yaml
import os
import re
from pathlib import Path
from datetime import datetime, timezone
from difflib import SequenceMatcher

# Paths
# NOTE(review): these are absolute paths under one user's home directory, so
# the script only runs unmodified on a machine with this exact layout.
# Input: YAML file holding the extracted ZCBS institutions.
ZCBS_FILE = Path("/Users/kempersc/apps/glam/data/nde/enriched/sources/zcbs/zcbs_institutions_extracted.yaml")
# Input: directory that is scanned for NDE entry files (*.yaml).
NDE_ENTRIES_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")
# Output: cross-reference YAML produced by main().
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/nde/enriched/sources/zcbs/zcbs_nde_crossref.yaml")

# Generic organisation-type words that do not help tell institutions apart.
_GENERIC_WORDS_RE = re.compile(
    r'\b(stichting|vereniging|historische?|heemkunde(kring)?|museum|oudheidkundige?|genootschap)\b'
)
# Every punctuation character that is replaced by a single space. Besides the
# ASCII set this covers typographic quotes/apostrophes (U+2018, U+2019,
# U+201C, U+201D) so that "’t Huis" and "'t Huis" normalise identically.
_PUNCTUATION_TABLE = str.maketrans(
    {ch: ' ' for ch in "'\u2018\u2019\u201c\u201d`\",.:;!?()[]{}/-"}
)
_WHITESPACE_RE = re.compile(r'\s+')


def normalize_name(name):
    """Normalize an institution name for fuzzy matching.

    Steps: lowercase, drop generic organisation words (stichting, vereniging,
    museum, ...), replace punctuation with spaces, collapse whitespace.

    Args:
        name: The raw name; may be ``None`` or empty.

    Returns:
        The normalized name, or ``""`` when ``name`` is falsy. The result can
        also be ``""`` for a name made up only of generic words.
    """
    if not name:
        return ""
    lowered = name.lower()
    # Generic words are removed before punctuation so the \b anchors still see
    # the original word boundaries (e.g. "stichting-museum").
    stripped = _GENERIC_WORDS_RE.sub('', lowered)
    spaced = stripped.translate(_PUNCTUATION_TABLE)
    return _WHITESPACE_RE.sub(' ', spaced).strip()

def similarity(a, b):
    """Return the difflib similarity ratio of ``a`` and ``b``.

    The value is a float from 0.0 (nothing in common) to 1.0 (identical).
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def load_zcbs_institutions():
    """Load the ZCBS institution records from ``ZCBS_FILE``.

    Returns:
        The list stored under the top-level ``zcbs_institutions`` key, or an
        empty list when the document is empty, is not a mapping, or the key
        is missing or null.
    """
    # Explicit encoding: without it the platform locale decides how the file
    # is decoded (assumes the data files are UTF-8).
    with open(ZCBS_FILE, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # safe_load returns None for an empty document.
    if not isinstance(data, dict):
        return []
    # `or []` also covers a key that is present but null.
    return data.get('zcbs_institutions') or []

def get_nde_name(data):
    """Extract the institution name from a parsed NDE entry.

    Lookup order: ``original_entry.organisatie``, then top-level ``naam``,
    then top-level ``name``.

    Args:
        data: The parsed NDE entry (a mapping).

    Returns:
        The first non-empty value found, or ``""`` when none is present.
    """
    original = data.get('original_entry')
    # original_entry can be absent, null, or not a mapping at all; only a
    # mapping can carry the 'organisatie' field.
    if isinstance(original, dict):
        org = original.get('organisatie')
        if org:
            return org
    # The trailing `or ''` keeps the result a string even when a key exists
    # but holds null.
    return data.get('naam') or data.get('name') or ''

def load_nde_zcbs_entries():
    """Load every NDE entry file that mentions the ZCBS system.

    Scans ``NDE_ENTRIES_DIR`` for ``*.yaml`` files, keeps those whose text
    contains ``systeem: ZCBS`` and parses them.

    Returns:
        A list of parsed entries (dicts), each extended with three helper
        keys: ``_file`` (file name), ``_path`` (full path as a string) and
        ``_name`` (result of ``get_nde_name``). Files are visited in sorted
        order.
    """
    entries = []
    # sorted(): glob yields files in filesystem order, and match_entries
    # breaks score ties by list position, so an unsorted list would make the
    # output differ between machines.
    for yaml_file in sorted(NDE_ENTRIES_DIR.glob("*.yaml")):
        # Explicit encoding so decoding does not depend on the platform locale
        # (assumes the entry files are UTF-8).
        content = yaml_file.read_text(encoding='utf-8')
        # Cheap text pre-filter: skip parsing files that never mention ZCBS.
        if 'systeem: ZCBS' not in content:
            continue
        data = yaml.safe_load(content)
        # The helper keys below can only be attached to a mapping.
        if not isinstance(data, dict):
            print(f"  Skipping {yaml_file.name}: top-level YAML value is not a mapping")
            continue
        data['_file'] = yaml_file.name
        data['_path'] = str(yaml_file)
        data['_name'] = get_nde_name(data)
        entries.append(data)
    return entries

def match_entries(zcbs_list, nde_list):
    """Match ZCBS institutions with NDE entries by fuzzy name comparison.

    Matching is greedy: ZCBS institutions are processed in list order and each
    one claims the still-unclaimed NDE entry with the highest score, provided
    that score is at least 0.5. The score is the similarity ratio of the
    normalized names plus 0.1 for every shared word longer than three
    characters, capped at 1.0. Ties go to the NDE entry that comes first in
    ``nde_list``.

    Args:
        zcbs_list: ZCBS institution dicts; ``zcbs_id`` is required, ``name``,
            ``location``, ``country`` and ``platform_urls`` are optional.
        nde_list: NDE entry dicts carrying the ``_file`` and ``_name`` helper
            keys.

    Returns:
        A tuple ``(matches, unmatched_zcbs, unmatched_nde)`` of lists of dicts.
        Each unmatched ZCBS record carries its best below-threshold candidate
        under ``best_candidate`` (``None`` when nothing scored above zero).
    """
    matches = []
    unmatched_zcbs = []
    matched_nde_files = set()

    # Normalise each NDE name once instead of once per ZCBS institution.
    nde_candidates = []
    for nde in nde_list:
        nde_norm = normalize_name(nde['_name'])
        nde_candidates.append((nde, nde_norm, set(nde_norm.split())))

    for zcbs in zcbs_list:
        zcbs_name = zcbs.get('name', '')
        zcbs_norm = normalize_name(zcbs_name)
        zcbs_words = set(zcbs_norm.split())
        best_match = None
        best_score = 0

        for nde, nde_norm, nde_words in nde_candidates:
            if nde['_file'] in matched_nde_files:
                continue
            # Two empty strings compare as identical (ratio 1.0). A name that
            # is missing or consists only of generic words therefore must not
            # be scored, or it would produce a bogus perfect match.
            if not zcbs_norm or not nde_norm:
                continue

            score = similarity(zcbs_norm, nde_norm)

            # Boost the score for shared significant words (typically place
            # names); words of three characters or fewer are ignored.
            location_boost = 0
            for word in zcbs_words & nde_words:
                if len(word) > 3:
                    location_boost += 0.1

            total_score = min(score + location_boost, 1.0)

            # Strict '>' keeps the first candidate on ties.
            if total_score > best_score:
                best_score = total_score
                best_match = nde

        if best_match and best_score >= 0.5:
            matches.append({
                'zcbs_id': zcbs['zcbs_id'],
                'zcbs_name': zcbs_name,
                'nde_file': best_match['_file'],
                'nde_name': best_match['_name'],
                'match_score': round(best_score, 3),
                'platform_urls': zcbs.get('platform_urls', {}),
                'already_enriched': 'zcbs_enrichment' in best_match
            })
            # Claim the NDE entry so no later institution can match it.
            matched_nde_files.add(best_match['_file'])
        else:
            best_candidate = None
            if best_match:
                best_candidate = {
                    'nde_file': best_match['_file'],
                    'nde_name': best_match['_name'],
                    'score': round(best_score, 3)
                }
            unmatched_zcbs.append({
                'zcbs_id': zcbs['zcbs_id'],
                'name': zcbs_name,
                'location': zcbs.get('location'),
                'country': zcbs.get('country', 'NL'),
                'platform_urls': zcbs.get('platform_urls', {}),
                'best_candidate': best_candidate
            })

    # Every NDE entry that no institution claimed.
    unmatched_nde = []
    for nde in nde_list:
        if nde['_file'] in matched_nde_files:
            continue
        loc = ''
        if 'original_entry' in nde and nde['original_entry']:
            loc = nde['original_entry'].get('plaatsnaam_bezoekadres', '')
        unmatched_nde.append({
            'nde_file': nde['_file'],
            'nde_name': nde['_name'],
            'plaatsnaam': loc
        })

    return matches, unmatched_zcbs, unmatched_nde

def main():
    """Run the full cross-reference pipeline.

    Loads the ZCBS institutions and the ZCBS-related NDE entries, matches
    them, writes the result to ``OUTPUT_FILE`` as YAML and prints a summary
    to stdout.
    """
    print("Loading ZCBS institutions...")
    zcbs_list = load_zcbs_institutions()
    print(f"  Found {len(zcbs_list)} ZCBS institutions")

    print("Loading NDE ZCBS entries...")
    nde_list = load_nde_zcbs_entries()
    print(f"  Found {len(nde_list)} NDE entries with ZCBS")

    print("Matching entries...")
    matches, unmatched_zcbs, unmatched_nde = match_entries(zcbs_list, nde_list)

    # Separate already enriched from needs enrichment
    already_enriched = [m for m in matches if m['already_enriched']]
    needs_enrichment = [m for m in matches if not m['already_enriched']]

    # Confidence buckets: high >= 0.8, medium 0.6 to < 0.8, low < 0.6.
    scores = [m['match_score'] for m in matches]
    summary = {
        'high_confidence_matches': sum(1 for s in scores if s >= 0.8),
        'medium_confidence_matches': sum(1 for s in scores if 0.6 <= s < 0.8),
        'low_confidence_matches': sum(1 for s in scores if s < 0.6)
    }

    # Create output structure (key order is kept because the dump below uses
    # sort_keys=False).
    output = {
        'metadata': {
            'generated_at': datetime.now(timezone.utc).isoformat(),
            'zcbs_count': len(zcbs_list),
            'nde_zcbs_count': len(nde_list),
            'matched_count': len(matches),
            'already_enriched_count': len(already_enriched),
            'needs_enrichment_count': len(needs_enrichment),
            'unmatched_zcbs_count': len(unmatched_zcbs),
            'unmatched_nde_count': len(unmatched_nde)
        },
        'summary': summary,
        'already_enriched': already_enriched,
        'needs_enrichment': sorted(needs_enrichment, key=lambda x: x['match_score'], reverse=True),
        'unmatched_zcbs': sorted(unmatched_zcbs, key=lambda x: x['zcbs_id']),
        'unmatched_nde': sorted(unmatched_nde, key=lambda x: x['nde_file'])
    }

    # Write output
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    # allow_unicode=True emits non-ASCII characters verbatim, so the file must
    # be opened as UTF-8; with the locale default the write can raise
    # UnicodeEncodeError.
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(output, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"\nResults written to: {OUTPUT_FILE}")
    print("\nSummary:")
    print(f"  Total matches: {len(matches)}")
    print(f"    - Already enriched: {len(already_enriched)}")
    print(f"    - Needs enrichment: {len(needs_enrichment)}")
    print(f"  Unmatched ZCBS: {len(unmatched_zcbs)} (potential new NDE entries)")
    print(f"  Unmatched NDE: {len(unmatched_nde)} (may need manual matching)")
    print("\nConfidence breakdown:")
    print(f"    - High (>=0.8): {summary['high_confidence_matches']}")
    print(f"    - Medium (0.6-0.8): {summary['medium_confidence_matches']}")
    print(f"    - Low (<0.6): {summary['low_confidence_matches']}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()