#!/usr/bin/env python3
"""Match ZCBS institutions with NDE entries and create cross-reference."""

import os
import re
from datetime import datetime, timezone
from difflib import SequenceMatcher
from pathlib import Path

# NOTE: PyYAML is imported inside the functions that read or write YAML so the
# pure matching helpers below stay importable when PyYAML is not installed.

# Paths
ZCBS_FILE = Path("/Users/kempersc/apps/glam/data/nde/enriched/sources/zcbs/zcbs_institutions_extracted.yaml")
NDE_ENTRIES_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/nde/enriched/sources/zcbs/zcbs_nde_crossref.yaml")

# Matching parameters
MATCH_THRESHOLD = 0.5        # minimum total score for an automatic match
SHARED_WORD_BOOST = 0.1      # bonus per shared word that is long enough
MIN_BOOST_WORD_LENGTH = 4    # shorter shared words earn no bonus

# Generic organisation words that carry no identifying information.
_GENERIC_WORDS_RE = re.compile(
    r'\b(stichting|vereniging|historische?|heemkunde(kring)?|museum|oudheidkundige?|genootschap)\b'
)
# Punctuation replaced by a space; includes the typographic quotes so that
# "\u2019t" and "'t" normalize to the same text.
_PUNCTUATION_RE = re.compile(r"['\u2018\u2019`\",.:;!?()\[\]{}/-]")
_WHITESPACE_RE = re.compile(r'\s+')


def normalize_name(name) -> str:
    """Normalize a name for matching.

    Lowercases the name, drops generic organisation words (stichting,
    vereniging, museum, ...), replaces punctuation with spaces and collapses
    runs of whitespace.

    Returns an empty string for a falsy ``name`` or for a name that consists
    only of generic words and punctuation.
    """
    if not name:
        return ""
    # str() guards against non-string values (e.g. a name parsed as a number).
    text = str(name).lower()
    text = _GENERIC_WORDS_RE.sub('', text)
    text = _PUNCTUATION_RE.sub(' ', text)
    return _WHITESPACE_RE.sub(' ', text).strip()


def similarity(a, b) -> float:
    """Return the ``SequenceMatcher`` ratio (0.0-1.0) of two strings.

    Note that two empty strings compare as 1.0; callers that must not treat
    "no name" as a match have to filter empty inputs themselves.
    """
    return SequenceMatcher(None, a, b).ratio()


def load_zcbs_institutions() -> list:
    """Load the list under ``zcbs_institutions`` from ``ZCBS_FILE``.

    Returns an empty list when the file is empty, is not a mapping, or the
    key is missing or null.
    """
    import yaml

    with open(ZCBS_FILE, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    if not isinstance(data, dict):
        return []
    return data.get('zcbs_institutions') or []


def get_nde_name(data) -> str:
    """Extract the name from an NDE entry.

    Looks at ``original_entry.organisatie`` first, then ``naam``, then
    ``name``. Returns an empty string when none of them holds a value.
    """
    original = data.get('original_entry')
    if isinstance(original, dict):
        org = original.get('organisatie')
        if org:
            return org
    return data.get('naam') or data.get('name') or ''


def load_nde_zcbs_entries() -> list:
    """Load every NDE entry whose file text contains ``systeem: ZCBS``.

    Each returned mapping is annotated with ``_file`` (file name), ``_path``
    (full path) and ``_name`` (result of ``get_nde_name``). Files are read in
    sorted order so that the first-come matching in ``match_entries`` does
    not depend on filesystem ordering.
    """
    import yaml

    entries = []
    for yaml_file in sorted(NDE_ENTRIES_DIR.glob("*.yaml")):
        content = yaml_file.read_text(encoding='utf-8')
        # Cheap text filter before paying for a YAML parse.
        if 'systeem: ZCBS' not in content:
            continue
        data = yaml.safe_load(content)
        if not isinstance(data, dict):
            continue
        data['_file'] = yaml_file.name
        data['_path'] = str(yaml_file)
        data['_name'] = get_nde_name(data)
        entries.append(data)
    return entries


def _score_candidate(zcbs_norm, zcbs_words, nde_norm, nde_words) -> float:
    """Return string similarity plus the shared-word bonus, capped at 1.0."""
    shared_long_words = sum(
        1 for word in zcbs_words & nde_words
        if len(word) >= MIN_BOOST_WORD_LENGTH
    )
    boost = SHARED_WORD_BOOST * shared_long_words
    return min(similarity(zcbs_norm, nde_norm) + boost, 1.0)


def _find_best_candidate(zcbs_norm, candidates, taken_files):
    """Return ``(best_nde, best_score)`` among the still-unmatched candidates.

    Returns ``(None, 0)`` when ``zcbs_norm`` is empty or nothing scores above
    zero. On equal scores the earliest candidate wins.
    """
    if not zcbs_norm:
        # A name without identifying words must not match anything.
        return None, 0
    zcbs_words = set(zcbs_norm.split())
    best_match, best_score = None, 0
    for nde, nde_norm, nde_words in candidates:
        if nde['_file'] in taken_files:
            continue
        score = _score_candidate(zcbs_norm, zcbs_words, nde_norm, nde_words)
        if score > best_score:
            best_match, best_score = nde, score
    return best_match, best_score


def match_entries(zcbs_list, nde_list, *, threshold=MATCH_THRESHOLD):
    """Match ZCBS institutions with NDE entries.

    Matching is greedy: institutions are processed in list order and each one
    claims its best-scoring NDE entry that has not been claimed yet. Names
    that normalize to an empty string are never matched.

    Args:
        zcbs_list: mappings with at least ``zcbs_id``; ``name``, ``location``,
            ``country`` and ``platform_urls`` are read when present.
        nde_list: mappings with ``_file`` and ``_name`` as produced by
            ``load_nde_zcbs_entries``.
        threshold: minimum score (inclusive) for a pair to count as a match.

    Returns:
        A tuple ``(matches, unmatched_zcbs, unmatched_nde)`` of lists of
        dicts. Unmatched ZCBS records carry their best below-threshold
        candidate under ``best_candidate`` (``None`` if there was none).

    Raises:
        KeyError: if a ZCBS record lacks ``zcbs_id`` or an NDE record lacks
            ``_file`` / ``_name``.
    """
    matches = []
    unmatched_zcbs = []
    matched_nde_files = set()

    # Normalize every NDE name once instead of once per ZCBS institution.
    candidates = []
    for nde in nde_list:
        nde_norm = normalize_name(nde['_name'])
        if nde_norm:
            candidates.append((nde, nde_norm, set(nde_norm.split())))

    for zcbs in zcbs_list:
        zcbs_name = zcbs.get('name', '')
        best_match, best_score = _find_best_candidate(
            normalize_name(zcbs_name), candidates, matched_nde_files
        )

        if best_match is not None and best_score >= threshold:
            matches.append({
                'zcbs_id': zcbs['zcbs_id'],
                'zcbs_name': zcbs_name,
                'nde_file': best_match['_file'],
                'nde_name': best_match['_name'],
                'match_score': round(best_score, 3),
                'platform_urls': zcbs.get('platform_urls', {}),
                'already_enriched': 'zcbs_enrichment' in best_match,
            })
            matched_nde_files.add(best_match['_file'])
            continue

        best_candidate = None
        if best_match is not None:
            best_candidate = {
                'nde_file': best_match['_file'],
                'nde_name': best_match['_name'],
                'score': round(best_score, 3),
            }
        unmatched_zcbs.append({
            'zcbs_id': zcbs['zcbs_id'],
            'name': zcbs_name,
            'location': zcbs.get('location'),
            'country': zcbs.get('country', 'NL'),
            'platform_urls': zcbs.get('platform_urls', {}),
            'best_candidate': best_candidate,
        })

    # Find unmatched NDE entries
    unmatched_nde = []
    for nde in nde_list:
        if nde['_file'] in matched_nde_files:
            continue
        original = nde.get('original_entry')
        loc = ''
        if isinstance(original, dict):
            loc = original.get('plaatsnaam_bezoekadres', '')
        unmatched_nde.append({
            'nde_file': nde['_file'],
            'nde_name': nde['_name'],
            'plaatsnaam': loc,
        })

    return matches, unmatched_zcbs, unmatched_nde


def main():
    """Load both data sets, match them and write the cross-reference YAML.

    Writes the result to ``OUTPUT_FILE`` (creating its parent directory if
    needed) and prints a summary to stdout.
    """
    import yaml

    print("Loading ZCBS institutions...")
    zcbs_list = load_zcbs_institutions()
    print(f" Found {len(zcbs_list)} ZCBS institutions")

    print("Loading NDE ZCBS entries...")
    nde_list = load_nde_zcbs_entries()
    print(f" Found {len(nde_list)} NDE entries with ZCBS")

    print("Matching entries...")
    matches, unmatched_zcbs, unmatched_nde = match_entries(zcbs_list, nde_list)

    # Separate already enriched from needs enrichment
    already_enriched = [m for m in matches if m['already_enriched']]
    needs_enrichment = [m for m in matches if not m['already_enriched']]

    scores = [m['match_score'] for m in matches]

    # Create output structure
    output = {
        'metadata': {
            'generated_at': datetime.now(timezone.utc).isoformat(),
            'zcbs_count': len(zcbs_list),
            'nde_zcbs_count': len(nde_list),
            'matched_count': len(matches),
            'already_enriched_count': len(already_enriched),
            'needs_enrichment_count': len(needs_enrichment),
            'unmatched_zcbs_count': len(unmatched_zcbs),
            'unmatched_nde_count': len(unmatched_nde),
        },
        'summary': {
            'high_confidence_matches': sum(1 for s in scores if s >= 0.8),
            'medium_confidence_matches': sum(1 for s in scores if 0.6 <= s < 0.8),
            'low_confidence_matches': sum(1 for s in scores if s < 0.6),
        },
        'already_enriched': already_enriched,
        'needs_enrichment': sorted(needs_enrichment, key=lambda x: x['match_score'], reverse=True),
        'unmatched_zcbs': sorted(unmatched_zcbs, key=lambda x: x['zcbs_id']),
        'unmatched_nde': sorted(unmatched_nde, key=lambda x: x['nde_file']),
    }

    # Write output; explicit UTF-8 because allow_unicode=True emits raw
    # non-ASCII characters.
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(output, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"\nResults written to: {OUTPUT_FILE}")
    print("\nSummary:")
    print(f" Total matches: {len(matches)}")
    print(f" - Already enriched: {len(already_enriched)}")
    print(f" - Needs enrichment: {len(needs_enrichment)}")
    print(f" Unmatched ZCBS: {len(unmatched_zcbs)} (potential new NDE entries)")
    print(f" Unmatched NDE: {len(unmatched_nde)} (may need manual matching)")
    print("\nConfidence breakdown:")
    print(f" - High (>=0.8): {output['summary']['high_confidence_matches']}")
    print(f" - Medium (0.6-0.8): {output['summary']['medium_confidence_matches']}")
    print(f" - Low (<0.6): {output['summary']['low_confidence_matches']}")


if __name__ == "__main__":
    main()