707 lines
26 KiB
Python
707 lines
26 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Derive CustodianName by finding consensus across all enrichment sources.
|
|
|
|
APPROACH: Find the name that appears most consistently across sources.
|
|
Instead of a fixed priority, we compare all available names and pick
|
|
the one with the highest agreement (fuzzy matching).
|
|
|
|
Sources checked:
|
|
- wikidata_enrichment.wikidata_label_nl / wikidata_label_en
|
|
- google_maps_enrichment.name
|
|
- isil_enrichment.name / nan_isil_enrichment.nan_name
|
|
- original_entry.organisatie
|
|
- museum_register (if present)
|
|
- youtube_enrichment (if present)
|
|
- web_claims org_name (og:site_name, schema.org, h1, title)
|
|
|
|
The consensus approach automatically handles:
|
|
- Wrong Google Maps POIs (parking lots won't match other sources)
|
|
- Garbage web claims (exhibition titles won't match Wikidata)
|
|
- Outdated CSV names (if most sources agree on new name)
|
|
|
|
Usage:
|
|
python scripts/derive_custodian_name_v2.py [--limit N] [--entry ENTRY_NUM] [--dry-run] [--force]
|
|
"""
|
|
|
|
import argparse
import os
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import yaml
|
|
|
|
try:
|
|
from rapidfuzz import fuzz
|
|
RAPIDFUZZ_AVAILABLE = True
|
|
except ImportError:
|
|
RAPIDFUZZ_AVAILABLE = False
|
|
|
|
|
|
# Directories
|
|
ENTRIES_DIR = Path('/Users/kempersc/apps/glam/data/nde/enriched/entries')
|
|
|
|
|
|
# Source weights for tie-breaking (not primary selection)
|
|
SOURCE_WEIGHTS = {
|
|
'wikidata': 1.0,
|
|
'google_maps': 0.9,
|
|
'isil': 0.85,
|
|
'original_entry': 0.8,
|
|
'museum_register': 0.75,
|
|
'youtube': 0.7,
|
|
'web_og_site_name': 0.6,
|
|
'web_schema_org': 0.55,
|
|
'web_h1_tag': 0.4,
|
|
'web_title_tag': 0.35,
|
|
}
|
|
|
|
|
|
# Patterns that indicate invalid/garbage names
|
|
INVALID_NAME_PATTERNS = [
|
|
# Navigation/UI elements
|
|
r'^(home|welkom|welcome|menu|nav|header|footer|sidebar)$',
|
|
r'^(contact|over ons|about|info|informatie)$',
|
|
r'^(nieuws|news|agenda|calendar|events?|activiteiten)$',
|
|
r'^(zoeken?|search|filter|sort|browse|bladeren)$',
|
|
r'^zoeken in', r'^doorzoek\s', r'^bekijk\s', r'^ontdek\s',
|
|
|
|
# Cookie/privacy/legal
|
|
r'cookie', r'privacy', r'gdpr', r'consent', r'waarom gebruiken wij',
|
|
|
|
# Generic page elements
|
|
r'^(default|untitled|index|main|pagina|page)\s*\d*$',
|
|
r'^(foto|image|picture|afbeelding)\s*\d+$',
|
|
r'^(oproep|call|melding|bericht|scroll)$',
|
|
r'^(openingstijden|tickets|reserveer|plan je bezoek)$',
|
|
r'^(main menu|hoofdmenu)$',
|
|
|
|
# Exhibition/event titles
|
|
r'tentoonstelling', r'expositie', r'exhibition', r'verlengd',
|
|
r'^nu te zien', r'^te zien:',
|
|
|
|
# Taglines/slogans
|
|
r'^op het kruispunt van', r'^het verhaal van\s', r'^de geschiedenis van\s',
|
|
r'^beleef je\s', r'^ontdek ook\s', r'^welkom bij\s',
|
|
r'^over het museum$', r'^over de\s', r'^over ons$',
|
|
r'binnen handbereik$', r'met een glimlach$',
|
|
|
|
# Newsletter/marketing
|
|
r'nieuwsbrief', r'newsletter', r'^schrijf je in', r'^sign up',
|
|
|
|
# Wrong websites
|
|
r'webdesign', r'libraries\.org', r'NLmapNew\.com', r'fotobeeldbank',
|
|
|
|
# Wrong POIs from Google Maps
|
|
r'^parkeerplaats$', r'^parking$', r'^bushalte$', r'^tramhalte$',
|
|
|
|
# Generic/ambiguous
|
|
r'^homepage\s', r'^homepagina\s', r'^chat$', r'^help$',
|
|
r'onder constructie', r"web server's default page",
|
|
]
|
|
|
|
|
|
# Patterns to extract actual institution name from greeting/wrapper text
|
|
# These patterns capture the institution name from common website title formats
|
|
# NOTE: Order matters! More specific patterns MUST come before more general ones.
|
|
NAME_EXTRACTION_PATTERNS = [
|
|
# "Welkom op de website van [het] [NAME]" - most specific
|
|
(r'^welkom\s+op\s+de\s+(?:website|site|pagina)\s+van\s+(?:het\s+|de\s+)?(.+?)(?:\s*[-–—|]\s*.*)?$', 1),
|
|
# "Welkom bij [het] [NAME]" - captures NAME
|
|
(r'^welkom\s+bij\s+(?:het\s+|de\s+)?(.+?)(?:\s*[-–—|]\s*.*)?$', 1),
|
|
# "Welkom in [het] [NAME]" - captures NAME
|
|
(r'^welkom\s+in\s+(?:het\s+|de\s+)?(.+?)(?:\s*[-–—|]\s*.*)?$', 1),
|
|
# "Welkom [het] [NAME]" - just "Welkom" followed by name (fallback)
|
|
(r'^welkom\s+(?:het\s+|de\s+)?(.+?)(?:\s*[-–—|]\s*.*)?$', 1),
|
|
# "[NAME] - Welkom" or "[NAME] | Home" etc.
|
|
(r'^(.+?)\s*[-–—|]\s*(?:welkom|home|homepage|start).*$', 1),
|
|
# "[NAME] | Official Website" etc.
|
|
(r'^(.+?)\s*[-–—|]\s*(?:official\s+)?(?:website|site).*$', 1),
|
|
]
|
|
|
|
|
|
# Dutch legal form prefixes that should be stripped for emic names
|
|
# These are formal legal designations, NOT part of the public-facing name
|
|
#
|
|
# NOTE: "Vereniging" is NOT in this list! It describes organizational purpose
|
|
# (a voluntary association of members), not just legal registration.
|
|
# "Historische Vereniging Nijeveen" is fundamentally different from
|
|
# "Stichting Rijksmuseum" - the former's identity IS being a vereniging.
|
|
# See AGENTS.md Rule 8 for full rationale.
|
|
DUTCH_LEGAL_PREFIXES = [
|
|
r'^stichting\s+', # Foundation (legal entity type)
|
|
r'^coöperatie\s+', # Cooperative
|
|
r'^coöperatieve\s+',
|
|
r'^naamloze\s+vennootschap\s+', # Public company (NV)
|
|
r'^besloten\s+vennootschap\s+', # Private company (BV)
|
|
r'^commanditaire\s+vennootschap\s+', # Limited partnership
|
|
r'^vennootschap\s+onder\s+firma\s+', # General partnership
|
|
r'^maatschap\s+', # Partnership
|
|
r'^eenmanszaak\s+', # Sole proprietorship
|
|
]
|
|
|
|
# Suffixes that indicate legal form
|
|
DUTCH_LEGAL_SUFFIXES = [
|
|
r'\s+b\.?v\.?\s*$', # B.V.
|
|
r'\s+n\.?v\.?\s*$', # N.V.
|
|
r'\s+v\.?o\.?f\.?\s*$', # V.O.F.
|
|
r'\s+c\.?v\.?\s*$', # C.V.
|
|
]
|
|
|
|
|
|
def normalize_name(name: str) -> str:
|
|
"""Normalize name for comparison."""
|
|
if not name:
|
|
return ""
|
|
return ' '.join(name.lower().split())
|
|
|
|
|
|
def extract_name_from_greeting(name: str) -> str:
|
|
"""
|
|
Extract the actual institution name from greeting/wrapper text.
|
|
|
|
Examples:
|
|
"Welkom op de website van het Zeister Historisch Genootschap (ZHG)"
|
|
-> "Zeister Historisch Genootschap (ZHG)"
|
|
"Welkom bij Oudheidkamer Texel"
|
|
-> "Oudheidkamer Texel"
|
|
"Rijksmuseum | Home"
|
|
-> "Rijksmuseum"
|
|
"""
|
|
if not name:
|
|
return ""
|
|
|
|
name = name.strip()
|
|
name_lower = name.lower()
|
|
|
|
# Try each extraction pattern
|
|
for pattern, group_idx in NAME_EXTRACTION_PATTERNS:
|
|
match = re.match(pattern, name_lower, re.IGNORECASE)
|
|
if match:
|
|
extracted = match.group(group_idx).strip()
|
|
# Preserve original case by finding the extracted part in original
|
|
start_pos = name_lower.find(extracted.lower())
|
|
if start_pos >= 0:
|
|
extracted = name[start_pos:start_pos + len(extracted)]
|
|
return extracted.strip(' -–—|:.')
|
|
|
|
return name
|
|
|
|
|
|
def extract_emic_name(name: str) -> str:
|
|
"""
|
|
Extract the emic (public-facing) name, stripping legal form prefixes/suffixes.
|
|
|
|
Per CustodianName.yaml:
|
|
- CustodianName = How custodian presents itself (emic, operational)
|
|
- LegalName = Formal registered name (in CustodianLegalStatus)
|
|
- Example: "Rijksmuseum" (emic) vs "Stichting Rijksmuseum" (legal)
|
|
|
|
Examples:
|
|
"Stichting Het Geld- En Bankmuseum" -> "Geldmuseum" (if Geldmuseum is the emic name)
|
|
"Stichting Rijksmuseum" -> "Rijksmuseum"
|
|
"Vereniging Oud-Utrecht" -> "Oud-Utrecht"
|
|
"Museum Boijmans Van Beuningen B.V." -> "Museum Boijmans Van Beuningen"
|
|
"""
|
|
if not name:
|
|
return ""
|
|
|
|
result = name.strip()
|
|
|
|
# Strip legal prefixes (case-insensitive)
|
|
for pattern in DUTCH_LEGAL_PREFIXES:
|
|
result = re.sub(pattern, '', result, flags=re.IGNORECASE).strip()
|
|
|
|
# Strip legal suffixes
|
|
for pattern in DUTCH_LEGAL_SUFFIXES:
|
|
result = re.sub(pattern, '', result, flags=re.IGNORECASE).strip()
|
|
|
|
# Clean up any double spaces or leading/trailing punctuation
|
|
result = ' '.join(result.split())
|
|
result = result.strip(' -–—|:.')
|
|
|
|
return result if result else name
|
|
|
|
|
|
def get_legal_name(name: str) -> Optional[str]:
|
|
"""
|
|
Check if the name contains a legal form indicator.
|
|
If so, return the full legal name; otherwise return None.
|
|
|
|
This is used to populate CustodianLegalStatus.legal_name when available.
|
|
|
|
NOTE: Also checks inside greeting text (e.g., "Welkom op de website van Vereniging X")
|
|
"""
|
|
if not name:
|
|
return None
|
|
|
|
# First extract from greeting if present
|
|
extracted = extract_name_from_greeting(name)
|
|
|
|
# Check both original and extracted
|
|
for check_name in [extracted, name]:
|
|
if not check_name:
|
|
continue
|
|
check_lower = check_name.lower()
|
|
|
|
# Check for legal prefixes
|
|
for pattern in DUTCH_LEGAL_PREFIXES:
|
|
if re.match(pattern, check_lower, re.IGNORECASE):
|
|
return check_name.strip()
|
|
|
|
# Check for legal suffixes
|
|
for pattern in DUTCH_LEGAL_SUFFIXES:
|
|
if re.search(pattern, check_lower, re.IGNORECASE):
|
|
return check_name.strip()
|
|
|
|
return None
|
|
|
|
|
|
def fuzzy_match_score(name1: str, name2: str) -> float:
|
|
"""Calculate fuzzy match score between two names (0-1)."""
|
|
if not name1 or not name2:
|
|
return 0.0
|
|
|
|
n1 = normalize_name(name1)
|
|
n2 = normalize_name(name2)
|
|
|
|
if n1 == n2:
|
|
return 1.0
|
|
|
|
if RAPIDFUZZ_AVAILABLE:
|
|
token_score = fuzz.token_set_ratio(n1, n2) / 100.0
|
|
partial_score = fuzz.partial_ratio(n1, n2) / 100.0
|
|
return max(token_score * 0.8 + partial_score * 0.2, token_score)
|
|
else:
|
|
if n1 in n2 or n2 in n1:
|
|
return min(len(n1), len(n2)) / max(len(n1), len(n2))
|
|
return 0.0
|
|
|
|
|
|
def is_obviously_invalid(name: str) -> bool:
|
|
"""Check if a name is obviously invalid."""
|
|
if not name or len(name.strip()) < 3:
|
|
return True
|
|
|
|
name_lower = name.lower().strip()
|
|
|
|
for pattern in INVALID_NAME_PATTERNS:
|
|
if re.search(pattern, name_lower, re.IGNORECASE):
|
|
return True
|
|
|
|
# Mostly numbers
|
|
if sum(1 for c in name if c.isdigit()) > len(name) * 0.5:
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def clean_name(name: str, extract_from_greeting: bool = True, to_emic: bool = False) -> str:
|
|
"""
|
|
Clean organization name.
|
|
|
|
Args:
|
|
name: Raw name string
|
|
extract_from_greeting: If True, extract name from "Welkom..." patterns
|
|
to_emic: If True, strip legal form prefixes to get emic name
|
|
"""
|
|
if not name:
|
|
return ""
|
|
name = ' '.join(name.split())
|
|
name = name.strip(' -–—|:.')
|
|
|
|
# Extract actual name from greeting text if present
|
|
if extract_from_greeting:
|
|
name = extract_name_from_greeting(name)
|
|
|
|
# Convert to emic name if requested
|
|
if to_emic:
|
|
name = extract_emic_name(name)
|
|
|
|
return name
|
|
|
|
|
|
def extract_all_names(entry_data: Dict) -> Tuple[List[Tuple[str, str, float]], Optional[str]]:
|
|
"""
|
|
Extract all candidate names from all enrichment sources.
|
|
|
|
Returns:
|
|
- List of (emic_name, source, weight) tuples for consensus matching
|
|
- Optional legal_name if a legal form was detected in any source
|
|
"""
|
|
candidates = []
|
|
legal_name = None
|
|
|
|
# Wikidata (usually has emic name, not legal name)
|
|
wikidata = entry_data.get('wikidata_enrichment', {})
|
|
for field in ['wikidata_label_nl', 'wikidata_label_en']:
|
|
if wikidata.get(field):
|
|
raw_name = wikidata[field]
|
|
# Extract greeting if present, convert to emic
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, 'wikidata', SOURCE_WEIGHTS['wikidata']))
|
|
# Check if raw name was a legal name
|
|
if not legal_name:
|
|
legal_name = get_legal_name(raw_name)
|
|
break # Only use one wikidata name
|
|
|
|
# Google Maps (usually has emic name)
|
|
google = entry_data.get('google_maps_enrichment', {})
|
|
if google.get('name'):
|
|
raw_name = google['name']
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, 'google_maps', SOURCE_WEIGHTS['google_maps']))
|
|
if not legal_name:
|
|
legal_name = get_legal_name(raw_name)
|
|
|
|
# ISIL registry (may have legal or emic name)
|
|
isil = entry_data.get('isil_enrichment', {})
|
|
if isil.get('name'):
|
|
raw_name = isil['name']
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, 'isil', SOURCE_WEIGHTS['isil']))
|
|
if not legal_name:
|
|
legal_name = get_legal_name(raw_name)
|
|
|
|
# NAN ISIL enrichment (authoritative source for legal names)
|
|
nan_isil = entry_data.get('nan_isil_enrichment', {})
|
|
if nan_isil.get('nan_name'):
|
|
raw_name = nan_isil['nan_name']
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, 'nan_isil', SOURCE_WEIGHTS.get('nan_isil', 0.85)))
|
|
if not legal_name:
|
|
legal_name = get_legal_name(raw_name)
|
|
|
|
# Original CSV entry (often has legal name with "Stichting" etc.)
|
|
original = entry_data.get('original_entry', {})
|
|
if original.get('organisatie'):
|
|
raw_name = original['organisatie']
|
|
# CSV often has legal names - extract emic version
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, 'original_entry', SOURCE_WEIGHTS['original_entry']))
|
|
# Original entry is good source for legal name
|
|
if not legal_name:
|
|
legal_name = get_legal_name(raw_name)
|
|
|
|
# Museum register (if present)
|
|
museum_reg = entry_data.get('museum_register_enrichment', {})
|
|
if museum_reg.get('name'):
|
|
raw_name = museum_reg['name']
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, 'museum_register', SOURCE_WEIGHTS['museum_register']))
|
|
if not legal_name:
|
|
legal_name = get_legal_name(raw_name)
|
|
|
|
# YouTube (if present)
|
|
youtube = entry_data.get('youtube_enrichment', {})
|
|
if youtube.get('channel_name'):
|
|
raw_name = youtube['channel_name']
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, 'youtube', SOURCE_WEIGHTS['youtube']))
|
|
|
|
# Web claims (title tags often have greetings, need extraction)
|
|
web_claims = entry_data.get('web_claims', {}).get('claims', [])
|
|
for claim in web_claims:
|
|
if claim.get('claim_type') == 'org_name':
|
|
raw_name = claim.get('claim_value', '')
|
|
# Web claims especially need greeting extraction
|
|
name = clean_name(raw_name, extract_from_greeting=True, to_emic=True)
|
|
method = claim.get('extraction_method', '')
|
|
source_key = f'web_{method}'
|
|
weight = SOURCE_WEIGHTS.get(source_key, 0.3)
|
|
|
|
if not is_obviously_invalid(name):
|
|
candidates.append((name, source_key, weight))
|
|
|
|
return candidates, legal_name
|
|
|
|
|
|
def find_consensus_name(candidates: List[Tuple[str, str, float]]) -> Tuple[Optional[str], str, float, Dict]:
|
|
"""
|
|
Find the name with highest consensus across sources.
|
|
|
|
For each candidate, calculate how well it matches all other candidates.
|
|
The name with highest total agreement wins.
|
|
|
|
Returns (best_name, best_source, confidence, match_details)
|
|
"""
|
|
if not candidates:
|
|
return None, 'none', 0.0, {}
|
|
|
|
if len(candidates) == 1:
|
|
name, source, weight = candidates[0]
|
|
return name, source, weight, {'single_source': True}
|
|
|
|
# Calculate agreement scores for each candidate
|
|
agreement_scores = []
|
|
|
|
for i, (name1, source1, weight1) in enumerate(candidates):
|
|
total_agreement = 0.0
|
|
matches = []
|
|
|
|
for j, (name2, source2, weight2) in enumerate(candidates):
|
|
if i == j:
|
|
continue
|
|
|
|
score = fuzzy_match_score(name1, name2)
|
|
# Weight the agreement by the source weight of the matching name
|
|
weighted_score = score * weight2
|
|
total_agreement += weighted_score
|
|
|
|
if score >= 0.6:
|
|
matches.append({
|
|
'source': source2,
|
|
'name': name2,
|
|
'score': score,
|
|
})
|
|
|
|
# Normalize by number of other sources
|
|
avg_agreement = total_agreement / (len(candidates) - 1) if len(candidates) > 1 else 0
|
|
|
|
# Boost by source weight
|
|
final_score = avg_agreement * 0.7 + weight1 * 0.3
|
|
|
|
agreement_scores.append({
|
|
'name': name1,
|
|
'source': source1,
|
|
'weight': weight1,
|
|
'avg_agreement': avg_agreement,
|
|
'final_score': final_score,
|
|
'matches': matches,
|
|
'match_count': len(matches),
|
|
})
|
|
|
|
# Sort by final score (highest first)
|
|
agreement_scores.sort(key=lambda x: (x['final_score'], x['match_count'], x['weight']), reverse=True)
|
|
|
|
best = agreement_scores[0]
|
|
|
|
# Calculate confidence based on agreement
|
|
confidence = best['final_score']
|
|
if best['match_count'] >= 2:
|
|
confidence = min(1.0, confidence + 0.1) # Boost for multiple matches
|
|
|
|
return best['name'], best['source'], confidence, {
|
|
'match_count': best['match_count'],
|
|
'matches': best['matches'],
|
|
'avg_agreement': best['avg_agreement'],
|
|
'all_candidates': [(c['name'], c['source'], c['final_score']) for c in agreement_scores],
|
|
}
|
|
|
|
|
|
def process_entry(filepath: Path, dry_run: bool = False) -> Dict[str, Any]:
|
|
"""
|
|
Process a single entry file to derive CustodianName by consensus.
|
|
"""
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
|
|
if not data:
|
|
return {'status': 'error', 'message': 'Empty file', 'filepath': str(filepath)}
|
|
|
|
result = {
|
|
'filepath': str(filepath),
|
|
'filename': filepath.name,
|
|
'entry_index': data.get('entry_index', ''),
|
|
'status': 'ok',
|
|
'name': None,
|
|
'source': None,
|
|
'confidence': 0.0,
|
|
'match_count': 0,
|
|
'previous_name': None,
|
|
'previous_source': None,
|
|
}
|
|
|
|
# Get current custodian_name if exists
|
|
current = data.get('custodian_name', {})
|
|
if current.get('claim_value'):
|
|
result['previous_name'] = current.get('claim_value')
|
|
result['previous_source'] = current.get('source') or current.get('extraction_method', 'unknown')
|
|
|
|
# Extract all candidate names from all sources
|
|
candidates, legal_name = extract_all_names(data)
|
|
|
|
if not candidates:
|
|
result['status'] = 'no_source'
|
|
result['message'] = 'No valid names found in any source'
|
|
return result
|
|
|
|
# Store legal name in result for later use
|
|
result['legal_name'] = legal_name
|
|
|
|
# Find consensus name
|
|
best_name, best_source, confidence, details = find_consensus_name(candidates)
|
|
|
|
if not best_name:
|
|
result['status'] = 'no_consensus'
|
|
result['message'] = 'Could not find consensus among candidates'
|
|
return result
|
|
|
|
result['name'] = best_name
|
|
result['source'] = best_source
|
|
result['confidence'] = confidence
|
|
result['match_count'] = details.get('match_count', 0)
|
|
result['candidates'] = len(candidates)
|
|
|
|
# Build custodian_name record
|
|
custodian_name = {
|
|
'claim_type': 'custodian_name',
|
|
'claim_value': best_name,
|
|
'source': best_source,
|
|
'confidence': round(confidence, 3),
|
|
'consensus_method': True,
|
|
'sources_checked': len(candidates),
|
|
'sources_matched': details.get('match_count', 0) + 1, # +1 for self
|
|
'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
|
|
}
|
|
|
|
# Add match details
|
|
if details.get('matches'):
|
|
custodian_name['matching_sources'] = [
|
|
{'source': m['source'], 'name': m['name'], 'score': round(m['score'], 2)}
|
|
for m in details['matches']
|
|
]
|
|
|
|
# Track if changed
|
|
if result['previous_name'] and result['previous_name'] != best_name:
|
|
custodian_name['previous_value'] = result['previous_name']
|
|
custodian_name['previous_source'] = result['previous_source']
|
|
result['status'] = 'updated'
|
|
elif not result['previous_name']:
|
|
result['status'] = 'new'
|
|
|
|
# Write if not dry run
|
|
if not dry_run:
|
|
data['custodian_name'] = custodian_name
|
|
|
|
# Store legal_name separately if detected (for CustodianLegalStatus)
|
|
if legal_name:
|
|
data['custodian_legal_name'] = {
|
|
'claim_type': 'legal_name',
|
|
'claim_value': legal_name,
|
|
'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
|
|
'note': 'Legal form detected in source name (e.g., Stichting, B.V., N.V.)'
|
|
}
|
|
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
|
|
|
return result
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description='Derive CustodianName by consensus across sources')
|
|
parser.add_argument('--limit', type=int, default=None, help='Limit number of entries')
|
|
parser.add_argument('--entry', type=str, default=None, help='Process specific entry number')
|
|
parser.add_argument('--dry-run', action='store_true', help='Show what would be done without writing')
|
|
parser.add_argument('--force', action='store_true', help='Re-derive even if custodian_name exists')
|
|
parser.add_argument('--show-all', action='store_true', help='Show all entries, not just changes')
|
|
parser.add_argument('--verbose', action='store_true', help='Show candidate details')
|
|
args = parser.parse_args()
|
|
|
|
# Find entry files
|
|
if args.entry:
|
|
files = list(ENTRIES_DIR.glob(f'{args.entry}*.yaml'))
|
|
else:
|
|
files = sorted([f for f in ENTRIES_DIR.glob('*.yaml') if f.is_file() and not f.name.startswith('.')])
|
|
|
|
if args.limit:
|
|
files = files[:args.limit]
|
|
|
|
print(f"Processing {len(files)} entries...")
|
|
print(f"Mode: {'DRY RUN' if args.dry_run else 'LIVE'}")
|
|
print(f"Method: Consensus across all enrichment sources")
|
|
print()
|
|
|
|
# Track statistics
|
|
stats = defaultdict(int)
|
|
low_confidence = []
|
|
|
|
for filepath in files:
|
|
if filepath.is_dir():
|
|
continue
|
|
|
|
# Skip if already has custodian_name (unless --force)
|
|
if not args.force:
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
if data and data.get('custodian_name', {}).get('claim_value'):
|
|
stats['unchanged'] += 1
|
|
if args.show_all:
|
|
name = data['custodian_name']['claim_value']
|
|
source = data['custodian_name'].get('source', 'unknown')
|
|
print(f" = {filepath.name}: '{name}' [{source}]")
|
|
continue
|
|
|
|
result = process_entry(filepath, dry_run=args.dry_run)
|
|
|
|
# Update stats
|
|
if result['status'] == 'error':
|
|
stats['error'] += 1
|
|
print(f" ! {filepath.name}: ERROR - {result.get('message', 'Unknown')}")
|
|
elif result['status'] in ('no_source', 'no_consensus'):
|
|
stats['no_source'] += 1
|
|
print(f" - {filepath.name}: {result.get('message', 'No source')}")
|
|
else:
|
|
stats[result['source']] += 1
|
|
stats['total_derived'] += 1
|
|
|
|
# Track low confidence for review
|
|
if result['confidence'] < 0.5:
|
|
low_confidence.append(result)
|
|
|
|
if result['status'] == 'updated':
|
|
stats['updated'] += 1
|
|
match_info = f"[{result['match_count']+1}/{result['candidates']} sources]"
|
|
print(f" ~ {filepath.name}: '{result['previous_name']}' -> '{result['name']}' [{result['source']}] {match_info}")
|
|
elif result['status'] == 'new':
|
|
stats['new'] += 1
|
|
match_info = f"[{result['match_count']+1}/{result['candidates']} sources]"
|
|
print(f" + {filepath.name}: '{result['name']}' [{result['source']}] {match_info}")
|
|
elif args.show_all:
|
|
print(f" = {filepath.name}: '{result['name']}' [{result['source']}]")
|
|
|
|
# Summary
|
|
print()
|
|
print("=" * 70)
|
|
print(f"{'DRY RUN - ' if args.dry_run else ''}Summary:")
|
|
print()
|
|
print("Sources used:")
|
|
for source in ['wikidata', 'google_maps', 'isil', 'original_entry', 'museum_register',
|
|
'youtube', 'web_og_site_name', 'web_schema_org', 'web_h1_tag', 'web_title_tag']:
|
|
if stats[source] > 0:
|
|
print(f" {source:20s}: {stats[source]}")
|
|
print()
|
|
print(f" New names derived: {stats['new']}")
|
|
print(f" Names updated: {stats['updated']}")
|
|
print(f" Unchanged (skipped): {stats['unchanged']}")
|
|
print(f" No valid source: {stats['no_source']}")
|
|
print(f" Errors: {stats['error']}")
|
|
print()
|
|
print(f" TOTAL DERIVED: {stats['total_derived']}")
|
|
|
|
if low_confidence:
|
|
print()
|
|
print(f" Low confidence ({len(low_confidence)} entries) - may need review:")
|
|
for r in low_confidence[:10]:
|
|
print(f" {r['filename']}: '{r['name']}' (confidence: {r['confidence']:.2f})")
|
|
if len(low_confidence) > 10:
|
|
print(f" ... and {len(low_confidence) - 10} more")
|
|
|
|
print("=" * 70)
|
|
|
|
return 0
|
|
|
|
|
|
if __name__ == '__main__':
|
|
sys.exit(main())
|