glam/scripts/match_zcbs_nde.py
2025-11-30 23:30:29 +01:00

203 lines
7.7 KiB
Python

#!/usr/bin/env python3
"""Match ZCBS institutions with NDE entries and create cross-reference."""
import yaml
import os
import re
from pathlib import Path
from datetime import datetime, timezone
from difflib import SequenceMatcher
# Paths
# NOTE(review): these are absolute paths under one user's home directory, so
# the script only runs unmodified on a machine with this layout - confirm
# whether they should be derived from the repository root instead.
# YAML file read by load_zcbs_institutions().
ZCBS_FILE = Path("/Users/kempersc/apps/glam/data/nde/enriched/sources/zcbs/zcbs_institutions_extracted.yaml")
# Directory whose *.yaml files are scanned by load_nde_zcbs_entries().
NDE_ENTRIES_DIR = Path("/Users/kempersc/apps/glam/data/nde/enriched/entries")
# Cross-reference YAML written by main(); its parent directory is created if missing.
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/nde/enriched/sources/zcbs/zcbs_nde_crossref.yaml")
def normalize_name(name):
    """Return a simplified form of *name* for fuzzy comparison.

    The name is lower-cased, generic organisation words (stichting,
    vereniging, museum, ...) are removed, punctuation is replaced by spaces
    and runs of whitespace are collapsed to a single space. A falsy *name*
    (``None`` or ``""``) yields the empty string.
    """
    if not name:
        return ""

    # Strip generic organisation words so only the distinguishing part remains.
    without_generic_words = re.sub(
        r'\b(stichting|vereniging|historische?|heemkunde(kring)?|museum|oudheidkundige?|genootschap)\b',
        '',
        name.lower(),
    )
    # Punctuation becomes a space so adjacent words are not glued together.
    without_punctuation = re.sub(r"[''`\",.:;!?()[\]{}/-]", ' ', without_generic_words)
    return re.sub(r'\s+', ' ', without_punctuation).strip()
def similarity(a, b):
    """Return the ``difflib.SequenceMatcher`` ratio between *a* and *b*."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()
def load_zcbs_institutions():
    """Load the ZCBS institutions from the YAML file at ``ZCBS_FILE``.

    Returns:
        The value stored under the top-level ``zcbs_institutions`` key, or an
        empty list when the document is empty or the key is missing or null.

    Raises:
        OSError: if ``ZCBS_FILE`` cannot be opened.
        yaml.YAMLError: if the file is not valid YAML.
    """
    # Decode explicitly as UTF-8; the platform default encoding is
    # locale-dependent and can mangle non-ASCII characters in names.
    with open(ZCBS_FILE, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # safe_load() returns None for an empty document.
    if not data:
        return []
    # Treat an explicit null value the same as a missing key.
    return data.get('zcbs_institutions') or []
def get_nde_name(data):
    """Return the institution name stored in an NDE entry.

    Lookup order: ``original_entry.organisatie``, then the top-level
    ``naam`` key, then the top-level ``name`` key.
    """
    original_entry = data['original_entry'] if 'original_entry' in data else None
    organisation = original_entry.get('organisatie') if original_entry else None
    if organisation:
        return organisation

    # No usable organisation name: fall back to the top-level name fields.
    dutch_name = data.get('naam', '')
    return dutch_name if dutch_name else data.get('name', '')
def load_nde_zcbs_entries():
    """Load all NDE entries that use the ZCBS system.

    Every ``*.yaml`` file directly inside ``NDE_ENTRIES_DIR`` whose text
    contains ``systeem: ZCBS`` is parsed. Each parsed mapping is annotated
    with three helper keys:

    * ``_file`` - the file name,
    * ``_path`` - the full path as a string,
    * ``_name`` - the name returned by ``get_nde_name``.

    Returns:
        A list of the annotated entry dicts, ordered by file path.
    """
    entries = []
    # Sort the files: glob order is filesystem-dependent, and match_entries()
    # assigns NDE entries greedily, so an unstable order would make the
    # generated cross-reference differ between runs.
    for yaml_file in sorted(NDE_ENTRIES_DIR.glob("*.yaml")):
        # Decode explicitly as UTF-8 instead of the locale-dependent default.
        content = yaml_file.read_text(encoding='utf-8')
        # Cheap substring test avoids parsing entries for other systems.
        if 'systeem: ZCBS' not in content:
            continue
        data = yaml.safe_load(content)
        # The helper keys can only be attached to a mapping; skip any
        # document that parses to something else (list, scalar, None).
        if not isinstance(data, dict):
            continue
        data['_file'] = yaml_file.name
        data['_path'] = str(yaml_file)
        data['_name'] = get_nde_name(data)
        entries.append(data)
    return entries
def match_entries(zcbs_list, nde_list):
    """Match ZCBS institutions with NDE entries by fuzzy name comparison.

    ZCBS institutions are processed in list order. Each one is compared with
    every NDE entry that has not been claimed by an earlier institution; the
    score is the ``similarity`` of the normalised names plus 0.1 for every
    shared word longer than three characters, capped at 1.0. The best
    candidate is accepted when its score is at least 0.5.

    Names that normalise to the empty string (for example a name consisting
    only of generic words such as "Museum") are never matched, because two
    empty strings would otherwise compare as a perfect 1.0 match.

    Args:
        zcbs_list: dicts with a ``zcbs_id`` key and optional ``name``,
            ``location``, ``country`` and ``platform_urls`` keys.
        nde_list: dicts with ``_file`` and ``_name`` keys and an optional
            ``original_entry`` mapping.

    Returns:
        A tuple ``(matches, unmatched_zcbs, unmatched_nde)`` of lists of
        dicts describing accepted matches, ZCBS institutions without an
        accepted match (with their best rejected candidate, if any) and NDE
        entries that no institution claimed.
    """
    matches = []
    unmatched_zcbs = []
    matched_nde_files = set()

    # Normalise every NDE name once instead of once per ZCBS institution.
    nde_candidates = []
    for nde in nde_list:
        nde_norm = normalize_name(nde['_name'])
        nde_candidates.append((nde, nde_norm, set(nde_norm.split())))

    for zcbs in zcbs_list:
        zcbs_name = zcbs.get('name', '')
        zcbs_norm = normalize_name(zcbs_name)
        zcbs_words = set(zcbs_norm.split())
        best_match = None
        best_score = 0

        # An empty normalised name carries no information to match on.
        candidates = nde_candidates if zcbs_norm else ()
        for nde, nde_norm, nde_words in candidates:
            if nde['_file'] in matched_nde_files:
                continue
            # similarity('', '') would be 1.0, i.e. a spurious perfect match.
            if not nde_norm:
                continue

            score = similarity(zcbs_norm, nde_norm)

            # Boost the score for shared words; short words are ignored.
            location_boost = 0
            for word in zcbs_words & nde_words:
                if len(word) > 3:
                    location_boost += 0.1

            total_score = min(score + location_boost, 1.0)
            # Strict comparison: on a tie the earliest candidate wins.
            if total_score > best_score:
                best_score = total_score
                best_match = nde

        if best_match is not None and best_score >= 0.5:
            matches.append({
                'zcbs_id': zcbs['zcbs_id'],
                'zcbs_name': zcbs_name,
                'nde_file': best_match['_file'],
                'nde_name': best_match['_name'],
                'match_score': round(best_score, 3),
                'platform_urls': zcbs.get('platform_urls', {}),
                'already_enriched': 'zcbs_enrichment' in best_match
            })
            # Claim the NDE entry so later institutions cannot reuse it.
            matched_nde_files.add(best_match['_file'])
        else:
            best_candidate = None
            if best_match is not None:
                # Keep the rejected candidate to help manual review.
                best_candidate = {
                    'nde_file': best_match['_file'],
                    'nde_name': best_match['_name'],
                    'score': round(best_score, 3)
                }
            unmatched_zcbs.append({
                'zcbs_id': zcbs['zcbs_id'],
                'name': zcbs_name,
                'location': zcbs.get('location'),
                'country': zcbs.get('country', 'NL'),
                'platform_urls': zcbs.get('platform_urls', {}),
                'best_candidate': best_candidate
            })

    # Collect the NDE entries that no ZCBS institution claimed.
    unmatched_nde = []
    for nde in nde_list:
        if nde['_file'] in matched_nde_files:
            continue
        loc = ''
        if 'original_entry' in nde and nde['original_entry']:
            loc = nde['original_entry'].get('plaatsnaam_bezoekadres', '')
        unmatched_nde.append({
            'nde_file': nde['_file'],
            'nde_name': nde['_name'],
            'plaatsnaam': loc
        })

    return matches, unmatched_zcbs, unmatched_nde
def main():
    """Build the ZCBS/NDE cross-reference and write it to ``OUTPUT_FILE``.

    Loads both data sets, matches them with ``match_entries``, writes the
    matches, the unmatched records and summary counts as YAML, and prints a
    short report to stdout.
    """
    print("Loading ZCBS institutions...")
    zcbs_list = load_zcbs_institutions()
    print(f" Found {len(zcbs_list)} ZCBS institutions")

    print("Loading NDE ZCBS entries...")
    nde_list = load_nde_zcbs_entries()
    print(f" Found {len(nde_list)} NDE entries with ZCBS")

    print("Matching entries...")
    matches, unmatched_zcbs, unmatched_nde = match_entries(zcbs_list, nde_list)

    # Separate already enriched from needs enrichment
    already_enriched = [m for m in matches if m['already_enriched']]
    needs_enrichment = [m for m in matches if not m['already_enriched']]

    # Confidence buckets: high >= 0.8, medium 0.6 up to 0.8, low below 0.6.
    high_confidence = sum(1 for m in matches if m['match_score'] >= 0.8)
    medium_confidence = sum(1 for m in matches if 0.6 <= m['match_score'] < 0.8)
    low_confidence = sum(1 for m in matches if m['match_score'] < 0.6)

    # Create output structure
    output = {
        'metadata': {
            'generated_at': datetime.now(timezone.utc).isoformat(),
            'zcbs_count': len(zcbs_list),
            'nde_zcbs_count': len(nde_list),
            'matched_count': len(matches),
            'already_enriched_count': len(already_enriched),
            'needs_enrichment_count': len(needs_enrichment),
            'unmatched_zcbs_count': len(unmatched_zcbs),
            'unmatched_nde_count': len(unmatched_nde)
        },
        'summary': {
            'high_confidence_matches': high_confidence,
            'medium_confidence_matches': medium_confidence,
            'low_confidence_matches': low_confidence
        },
        'already_enriched': already_enriched,
        'needs_enrichment': sorted(needs_enrichment, key=lambda x: x['match_score'], reverse=True),
        'unmatched_zcbs': sorted(unmatched_zcbs, key=lambda x: x['zcbs_id']),
        'unmatched_nde': sorted(unmatched_nde, key=lambda x: x['nde_file'])
    }

    # Write output
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    # allow_unicode=True emits non-ASCII characters unescaped, so the file
    # must be opened as UTF-8 rather than with the locale-dependent default.
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(output, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"\nResults written to: {OUTPUT_FILE}")
    print("\nSummary:")
    print(f" Total matches: {len(matches)}")
    print(f" - Already enriched: {len(already_enriched)}")
    print(f" - Needs enrichment: {len(needs_enrichment)}")
    print(f" Unmatched ZCBS: {len(unmatched_zcbs)} (potential new NDE entries)")
    print(f" Unmatched NDE: {len(unmatched_nde)} (may need manual matching)")
    print("\nConfidence breakdown:")
    print(f" - High (>=0.8): {output['summary']['high_confidence_matches']}")
    print(f" - Medium (0.6-0.8): {output['summary']['medium_confidence_matches']}")
    print(f" - Low (<0.6): {output['summary']['low_confidence_matches']}")
# Run the cross-reference only when executed as a script, not on import.
if __name__ == "__main__":
    main()