203 lines
7.7 KiB
Python
203 lines
7.7 KiB
Python
#!/usr/bin/env python3
|
|
"""Match ZCBS institutions with NDE entries and create cross-reference."""
|
|
|
|
import yaml
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from difflib import SequenceMatcher
|
|
|
|
# Paths
# Every input and output lives under one "enriched" data directory. The default
# is the original hard-coded location; set GLAM_NDE_ENRICHED_DIR to point the
# script at a checkout that lives somewhere else.
_ENRICHED_DIR = Path(
    os.environ.get("GLAM_NDE_ENRICHED_DIR")
    or "/Users/kempersc/apps/glam/data/nde/enriched"
)
ZCBS_FILE = _ENRICHED_DIR / "sources" / "zcbs" / "zcbs_institutions_extracted.yaml"
NDE_ENTRIES_DIR = _ENRICHED_DIR / "entries"
OUTPUT_FILE = _ENRICHED_DIR / "sources" / "zcbs" / "zcbs_nde_crossref.yaml"
|
|
|
|
# Generic organisation-type words that are stripped before comparing names.
_GENERIC_WORDS_RE = re.compile(
    r'\b(stichting|vereniging|historische?|heemkunde(kring)?|museum|oudheidkundige?|genootschap)\b'
)
# Punctuation that is replaced by a space. Covers straight quotes as well as
# the typographic quotes (U+2018/U+2019/U+201C/U+201D).
_PUNCTUATION_RE = re.compile(r"['\u2018\u2019\u201c\u201d`\",.:;!?()\[\]{}/-]")
_WHITESPACE_RE = re.compile(r'\s+')


def normalize_name(name):
    """Normalize an institution name for fuzzy matching.

    The name is lowercased, generic organisation words (``stichting``,
    ``vereniging``, ``museum``, ...) are removed, punctuation is replaced by
    spaces, and runs of whitespace are collapsed.

    Args:
        name: The raw name. Falsy values (``None``, ``""``) are accepted, and
            non-string values are converted with ``str()``.

    Returns:
        The normalized name, or ``""`` when ``name`` is falsy.
    """
    if not name:
        return ""
    # str() keeps a non-string value (e.g. a bare number parsed from YAML)
    # from raising AttributeError on .lower().
    n = str(name).lower()
    # Generic words go first, while punctuation still provides word boundaries.
    n = _GENERIC_WORDS_RE.sub('', n)
    n = _PUNCTUATION_RE.sub(' ', n)
    return _WHITESPACE_RE.sub(' ', n).strip()
|
|
|
|
def similarity(a, b):
    """Return the similarity of two strings as a float between 0.0 and 1.0.

    The value is ``difflib.SequenceMatcher(None, a, b).ratio()``, except that
    an empty string never resembles anything: ``ratio()`` reports 1.0 for two
    empty sequences, which would make two nameless records look like a
    perfect match.

    Args:
        a: First string.
        b: Second string.

    Returns:
        0.0 when either string is empty, otherwise the ``SequenceMatcher``
        ratio.
    """
    if not a or not b:
        return 0.0
    return SequenceMatcher(None, a, b).ratio()
|
|
|
|
def load_zcbs_institutions():
    """Load the ZCBS institution records from ``ZCBS_FILE``.

    Returns:
        The value stored under the top-level ``zcbs_institutions`` key, or an
        empty list when the document is empty, is not a mapping, or the key
        is missing or null.
    """
    # Explicit UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    with open(ZCBS_FILE, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # safe_load returns None for an empty document.
    if not isinstance(data, dict):
        return []
    return data.get('zcbs_institutions') or []
|
|
|
|
def get_nde_name(data):
    """Extract the institution name from a parsed NDE entry.

    The lookup order is ``original_entry.organisatie``, then the top-level
    ``naam`` key, then the top-level ``name`` key. The first non-empty value
    wins.

    Args:
        data: The parsed NDE entry (a mapping).

    Returns:
        The name as a string, or ``""`` when none of the fields holds a
        value or ``data`` is not a mapping.
    """
    if not isinstance(data, dict):
        return ''

    # original_entry may be missing, null, or not a mapping at all; only a
    # mapping can carry an 'organisatie' field.
    original = data.get('original_entry')
    if isinstance(original, dict):
        org = original.get('organisatie')
        if org:
            return str(org)

    # Fall back to the top-level name fields.
    for key in ('naam', 'name'):
        value = data.get(key)
        if value:
            return str(value)
    return ''
|
|
|
|
def load_nde_zcbs_entries():
    """Load every NDE entry in ``NDE_ENTRIES_DIR`` that mentions the ZCBS system.

    A ``*.yaml`` file is selected when its raw text contains the substring
    ``systeem: ZCBS``. Each selected entry is parsed and annotated with three
    helper keys: ``_file`` (file name), ``_path`` (full path as a string) and
    ``_name`` (the result of ``get_nde_name``).

    Returns:
        A list of the annotated entry dicts, ordered by file path.
    """
    entries = []
    # Sorted so the result, and the order-dependent matching built on top of
    # it, is reproducible; glob() itself yields files in arbitrary order.
    for yaml_file in sorted(NDE_ENTRIES_DIR.glob("*.yaml")):
        with open(yaml_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Cheap text check first so unrelated files are never parsed.
        if 'systeem: ZCBS' not in content:
            continue

        data = yaml.safe_load(content)
        if not isinstance(data, dict):
            # The helper keys below can only be stored on a mapping.
            print(f"  Skipping {yaml_file.name}: top-level YAML is not a mapping")
            continue

        data['_file'] = yaml_file.name
        data['_path'] = str(yaml_file)
        data['_name'] = get_nde_name(data)
        entries.append(data)
    return entries
|
|
|
|
def match_entries(zcbs_list, nde_list):
    """Match ZCBS institutions with NDE entries by fuzzy name comparison.

    For each ZCBS institution, in list order, the not-yet-claimed NDE entry
    with the highest score is selected. The score is the ``similarity`` of
    the two normalized names plus 0.1 for every shared word longer than
    three characters, capped at 1.0. A pair counts as a match when the score
    is at least 0.5; a matched NDE entry is not offered to later
    institutions.

    Args:
        zcbs_list: ZCBS institution dicts. ``zcbs_id`` is required; ``name``,
            ``location``, ``country`` and ``platform_urls`` are read when
            present.
        nde_list: NDE entry dicts carrying the ``_file`` and ``_name`` keys.

    Returns:
        A tuple ``(matches, unmatched_zcbs, unmatched_nde)`` of lists of
        dicts. An ``unmatched_zcbs`` item carries the best rejected
        candidate, if any, under ``best_candidate``.
    """
    # NOTE(review): the assignment is greedy and therefore order-dependent;
    # an early institution can claim an NDE entry that a later institution
    # would match with a higher score.

    # Normalize every NDE name once instead of once per (ZCBS, NDE) pair.
    # Entries whose name normalizes to nothing cannot be matched by name and
    # are left out of the candidates (they end up in unmatched_nde).
    nde_candidates = []
    for nde in nde_list:
        nde_norm = normalize_name(nde['_name'])
        if nde_norm:
            nde_candidates.append((nde, nde_norm, set(nde_norm.split())))

    matches = []
    unmatched_zcbs = []
    matched_nde_files = set()

    for zcbs in zcbs_list:
        zcbs_name = zcbs.get('name', '')
        zcbs_norm = normalize_name(zcbs_name)
        zcbs_words = set(zcbs_norm.split())

        best_match = None
        best_score = 0

        # An institution without a usable name has nothing to compare.
        if zcbs_norm:
            for nde, nde_norm, nde_words in nde_candidates:
                if nde['_file'] in matched_nde_files:
                    continue

                score = similarity(zcbs_norm, nde_norm)

                # Boost the score for every shared word; short words are
                # ignored.
                location_boost = 0
                for word in zcbs_words & nde_words:
                    if len(word) > 3:
                        location_boost += 0.1

                total_score = min(score + location_boost, 1.0)
                if total_score > best_score:
                    best_score = total_score
                    best_match = nde

        if best_match and best_score >= 0.5:
            matches.append({
                'zcbs_id': zcbs['zcbs_id'],
                'zcbs_name': zcbs_name,
                'nde_file': best_match['_file'],
                'nde_name': best_match['_name'],
                'match_score': round(best_score, 3),
                'platform_urls': zcbs.get('platform_urls', {}),
                'already_enriched': 'zcbs_enrichment' in best_match,
            })
            matched_nde_files.add(best_match['_file'])
        else:
            best_candidate = None
            if best_match:
                best_candidate = {
                    'nde_file': best_match['_file'],
                    'nde_name': best_match['_name'],
                    'score': round(best_score, 3),
                }
            unmatched_zcbs.append({
                'zcbs_id': zcbs['zcbs_id'],
                'name': zcbs_name,
                'location': zcbs.get('location'),
                'country': zcbs.get('country', 'NL'),
                'platform_urls': zcbs.get('platform_urls', {}),
                'best_candidate': best_candidate,
            })

    # Everything no institution claimed is reported as unmatched.
    unmatched_nde = []
    for nde in nde_list:
        if nde['_file'] in matched_nde_files:
            continue
        loc = ''
        original = nde.get('original_entry')
        if isinstance(original, dict):
            loc = original.get('plaatsnaam_bezoekadres', '')
        unmatched_nde.append({
            'nde_file': nde['_file'],
            'nde_name': nde['_name'],
            'plaatsnaam': loc,
        })

    return matches, unmatched_zcbs, unmatched_nde
|
|
|
|
def main():
    """Build the ZCBS/NDE cross-reference and write it to ``OUTPUT_FILE``.

    Loads both data sets, matches them with ``match_entries``, splits the
    matches into already-enriched and still-to-enrich, writes the result as
    YAML (creating the output directory when needed) and prints a summary.
    """
    print("Loading ZCBS institutions...")
    zcbs_list = load_zcbs_institutions()
    print(f" Found {len(zcbs_list)} ZCBS institutions")

    print("Loading NDE ZCBS entries...")
    nde_list = load_nde_zcbs_entries()
    print(f" Found {len(nde_list)} NDE entries with ZCBS")

    print("Matching entries...")
    matches, unmatched_zcbs, unmatched_nde = match_entries(zcbs_list, nde_list)

    # Separate already enriched from needs enrichment
    already_enriched = [m for m in matches if m['already_enriched']]
    needs_enrichment = [m for m in matches if not m['already_enriched']]

    # Confidence buckets: high >= 0.8, medium 0.6 to 0.8, low < 0.6.
    scores = [m['match_score'] for m in matches]
    high_count = sum(1 for s in scores if s >= 0.8)
    medium_count = sum(1 for s in scores if 0.6 <= s < 0.8)
    low_count = sum(1 for s in scores if s < 0.6)

    # Create output structure
    output = {
        'metadata': {
            'generated_at': datetime.now(timezone.utc).isoformat(),
            'zcbs_count': len(zcbs_list),
            'nde_zcbs_count': len(nde_list),
            'matched_count': len(matches),
            'already_enriched_count': len(already_enriched),
            'needs_enrichment_count': len(needs_enrichment),
            'unmatched_zcbs_count': len(unmatched_zcbs),
            'unmatched_nde_count': len(unmatched_nde),
        },
        'summary': {
            'high_confidence_matches': high_count,
            'medium_confidence_matches': medium_count,
            'low_confidence_matches': low_count,
        },
        'already_enriched': already_enriched,
        'needs_enrichment': sorted(needs_enrichment, key=lambda x: x['match_score'], reverse=True),
        'unmatched_zcbs': sorted(unmatched_zcbs, key=lambda x: x['zcbs_id']),
        'unmatched_nde': sorted(unmatched_nde, key=lambda x: x['nde_file']),
    }

    # Write output. allow_unicode=True emits non-ASCII characters verbatim,
    # so the file must be opened as UTF-8 instead of relying on the platform
    # default encoding.
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(output, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"\nResults written to: {OUTPUT_FILE}")
    print("\nSummary:")
    print(f" Total matches: {len(matches)}")
    print(f" - Already enriched: {len(already_enriched)}")
    print(f" - Needs enrichment: {len(needs_enrichment)}")
    print(f" Unmatched ZCBS: {len(unmatched_zcbs)} (potential new NDE entries)")
    print(f" Unmatched NDE: {len(unmatched_nde)} (may need manual matching)")
    print("\nConfidence breakdown:")
    print(f" - High (>=0.8): {high_count}")
    print(f" - Medium (0.6-0.8): {medium_count}")
    print(f" - Low (<0.6): {low_count}")
|
|
|
|
# Run the cross-reference only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|