fix: improve birth year extraction to avoid date suffix false positives

- Skip YYYYMMDD and YYMMDD date patterns at end of email
- Skip digit sequences longer than 4 characters
- Require non-digit before 4-digit years at end
- Add knid.nl/kabelnoord.nl to consumer domains (Friesland ISP)
- Add 11 missing regional archive domains to HERITAGE_DOMAIN_MAP
- Update recalculation script to re-extract email semantics

Results:
- 3,151 false birth years removed
- 'Likely wrong person' reduced from 533 to 325 (-39%)
- 2,944 candidates' scores boosted
This commit is contained in:
kempersc 2026-01-13 22:37:10 +01:00
parent 74ca873585
commit 9a395f3dbe
2 changed files with 138 additions and 7 deletions

View file

@ -7,6 +7,9 @@ This script applies improved confidence scoring that incorporates:
2. Institution match boosting
3. Wrong-person detection for birth year mismatches
It also RE-EXTRACTS email semantics using the latest parsing logic,
ensuring any fixes to birth year extraction are applied.
Usage:
python scripts/recalculate_confidence.py \
--input data/entity_resolution/entity_resolution_candidates.json \
@ -26,9 +29,59 @@ sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
from glam_extractor.entity_resolution import (
recalculate_candidate_confidence,
extract_birth_decade_from_ppid,
parse_email_semantics,
)
def re_extract_email_semantics(candidate: dict) -> int:
    """
    Re-extract email semantics for one candidate using current parsing logic.

    Parses ``candidate['wcms_email']`` with ``parse_email_semantics`` and
    overwrites the candidate's ``email_*`` fields in place wherever the fresh
    result disagrees with the stored values.

    Fields are refreshed in three groups:

    * birth year (+ confidence, position)
    * institutional flag (+ institution name, type)
    * consumer flag

    A group is rewritten when its primary value changed. It is also rewritten
    when the primary value is unchanged but set (a birth year is present /
    the domain is institutional) and one of its companion fields differs, so
    that a parser fix which keeps the same year but assigns a different
    confidence or position does not leave stale companion values behind.

    Args:
        candidate: Candidate record; mutated in place.

    Returns:
        Number of field groups updated (0-3). Returns 0 when the candidate
        has no email or the parser returns a falsy result.
    """
    wcms_email = candidate.get('wcms_email')
    if not wcms_email:
        return 0

    # Parse email with current logic
    semantics = parse_email_semantics(wcms_email)
    if not semantics:
        return 0

    def is_stale(fields: dict) -> bool:
        """Return True if any stored value differs from the fresh one."""
        return any(candidate.get(key) != value for key, value in fields.items())

    birth_year = semantics.probable_birth_year
    is_institutional = semantics.is_institutional_domain

    # Each entry: (primary key, whether companion fields are meaningful,
    # fresh values for the whole group). Companions are only compared when
    # the primary value is set; otherwise a missing key vs. a default such
    # as 0.0 would be reported as a spurious change.
    groups = (
        (
            'email_probable_birth_year',
            birth_year is not None,
            {
                'email_probable_birth_year': birth_year,
                'email_birth_year_confidence': semantics.birth_year_confidence,
                'email_birth_year_position': semantics.birth_year_position,
            },
        ),
        (
            'email_is_institutional',
            bool(is_institutional),
            {
                'email_is_institutional': is_institutional,
                'email_institution_name': semantics.institution_name,
                'email_institution_type': semantics.institution_type,
            },
        ),
        (
            'email_is_consumer',
            False,
            {
                'email_is_consumer': semantics.is_consumer_domain,
            },
        ),
    )

    changes = 0
    for primary_key, check_companions, fields in groups:
        primary_changed = candidate.get(primary_key) != fields[primary_key]
        if primary_changed or (check_companions and is_stale(fields)):
            candidate.update(fields)
            changes += 1

    return changes
def main():
parser = argparse.ArgumentParser(
description='Recalculate confidence scores for entity resolution candidates'
@ -79,6 +132,9 @@ def main():
'penalized': 0,
'likely_wrong_person': 0,
'reviews_preserved': 0,
'email_reextracted': 0,
'birth_years_removed': 0,
'birth_years_changed': 0,
}
# Score distribution before
@ -90,11 +146,22 @@ def main():
# Recalculate each candidate
for i, candidate in enumerate(candidates):
original_score = candidate.get('confidence_score', 0)
old_birth_year = candidate.get('email_probable_birth_year')
# Preserve review status
if candidate.get('reviewed'):
stats['reviews_preserved'] += 1
# Re-extract email semantics with updated parsing logic
email_changes = re_extract_email_semantics(candidate)
if email_changes:
stats['email_reextracted'] += 1
new_birth_year = candidate.get('email_probable_birth_year')
if old_birth_year and not new_birth_year:
stats['birth_years_removed'] += 1
elif old_birth_year != new_birth_year:
stats['birth_years_changed'] += 1
# Apply new scoring
recalculate_candidate_confidence(candidate)
@ -136,6 +203,9 @@ def main():
print("RECALCULATION STATISTICS")
print("=" * 60)
print(f"Total candidates: {stats['total']:,}")
print(f"Email re-extracted: {stats['email_reextracted']:,}")
print(f" - Birth years removed: {stats['birth_years_removed']:,}")
print(f" - Birth years changed: {stats['birth_years_changed']:,}")
print(f"Scores adjusted: {stats['adjusted']:,}")
print(f" - Boosted: {stats['boosted']:,}")
print(f" - Penalized: {stats['penalized']:,}")

View file

@ -218,7 +218,19 @@ HERITAGE_DOMAIN_MAP: Dict[str, Tuple[str, str, Optional[str]]] = {
# Veterans & Military Heritage
'veteranen.nl': ('Veteraneninstituut', 'museum', None),
'knid.nl': ('KNID (unknown)', 'government', None), # 64 hits, needs research
# Additional Regional Archives (discovered via domain analysis)
'archiefalkmaar.nl': ('Regionaal Archief Alkmaar', 'archive', None),
'historischcentrumoverijssel.nl': ('Historisch Centrum Overijssel', 'archive', None),
'westbrabantsarchief.nl': ('West-Brabants Archief', 'archive', None),
'haagshistorischmuseum.nl': ('Haags Historisch Museum', 'museum', None),
'zeeuwsarchief.nl': ('Zeeuws Archief', 'archive', None),
'erfgoedhuis-zh.nl': ('Erfgoedhuis Zuid-Holland', 'archive', None),
'noord-hollandsarchief.nl': ('Noord-Hollands Archief', 'archive', 'NL-NH-HAA-A-NHA'),
'geldersarchief.nl': ('Gelders Archief', 'archive', None),
'erfgoedbrabant.nl': ('Erfgoed Brabant', 'archive', None),
'waterlandsarchief.nl': ('Waterlands Archief', 'archive', None),
'erfgoedservice.nl': ('Erfgoed Service', 'archive', None),
# Belgian Heritage (for completeness)
'arch.be': ('Rijksarchief België', 'archive', None),
@ -240,6 +252,7 @@ CONSUMER_DOMAINS: Set[str] = {
'dds.nl', 'freedom.nl', 'xmsnet.nl', 'inter.nl.net', 'euronet.nl',
'onsbrabantnet.nl', 'concepts.nl', 'worldonline.nl', 'delta.nl', 'lijbrandt.nl',
't-mobilethuis.nl', 'compaqnet.nl', 'filternet.nl', 'onsmail.nl', 'box.nl',
'knid.nl', 'kabelnoord.nl', # Kabelnoord ISP (Northern Netherlands/Friesland)
# Belgian
'telenet.be', 'skynet.be', 'proximus.be',
# German
@ -276,6 +289,7 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
'michiel.huizing.1970' -> (1970, 0.85, 'end')
'j.devries65' -> (1965, 0.6, 'end') # 2-digit year
'bob791120061' -> None (ambiguous)
'test20180702' -> None (date suffix, not birth year)
"""
if not local_part:
return None, 0.0, None
@ -284,6 +298,37 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
# People with 2010+ birth years are unlikely to have email accounts yet
year_pattern = r'(?:19[3-9][0-9]|200[0-9]|201[0-5])'
# IMPORTANT: Check for date suffixes (YYYYMMDD or YYMMDD patterns)
# These are NOT birth years! E.g., test20180702, user20210830
# Date pattern: 4-digit year + 2-digit month (01-12) + 2-digit day (01-31)
date_suffix_pattern = r'(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$'
if re.search(date_suffix_pattern, local_part):
# This looks like a date suffix (e.g., 20180702), skip birth year extraction
return None, 0.0, None
# Also check for 6-digit date patterns YYMMDD at end
date_suffix_6digit = r'\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$'
if re.search(date_suffix_6digit, local_part):
# Verify it's likely a date, not a name + year
# E.g., test210830 looks like a date (2021-08-30)
last_6 = local_part[-6:]
if last_6.isdigit():
# Check if middle 2 digits are valid month (01-12)
# and last 2 digits are valid day (01-31)
month = int(last_6[2:4])
day = int(last_6[4:6])
if 1 <= month <= 12 and 1 <= day <= 31:
return None, 0.0, None
# Check for long digit sequences at the end (likely not birth years)
# E.g., josbakker532 might be 532, not 1932
trailing_digits = re.search(r'(\d+)$', local_part)
if trailing_digits:
digit_seq = trailing_digits.group(1)
# If more than 4 digits, it's probably not a clean birth year
if len(digit_seq) > 4:
return None, 0.0, None
# Check for year at start
start_match = re.match(rf'^({year_pattern})', local_part)
if start_match:
@ -291,18 +336,33 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
# Validate: person would be 10-95 years old
age = CURRENT_YEAR - year
if 10 <= age <= 95:
return year, 0.9, 'start'
# Make sure there's something after the year (not just "1965")
# and it's not followed by more digits (which might indicate a date)
after_year = local_part[4:]
if after_year and not after_year[0].isdigit():
return year, 0.9, 'start'
# Check for year at end
end_match = re.search(rf'({year_pattern})$', local_part)
# Check for year at end - but not if preceded by other digits
# Valid: michiel.huizing.1970, j.devries1965
# Invalid: test20180702 (date), josbakker532 (random number)
end_match = re.search(rf'[^0-9]({year_pattern})$', local_part)
if end_match:
year = int(end_match.group(1))
age = CURRENT_YEAR - year
if 10 <= age <= 95:
return year, 0.85, 'end'
# Also check if year is at very start of local part ending
end_match_start = re.match(rf'^({year_pattern})$', local_part)
if end_match_start:
year = int(end_match_start.group(1))
age = CURRENT_YEAR - year
if 10 <= age <= 95:
return year, 0.7, 'end'
# Check for year embedded (lower confidence)
embedded_matches = re.findall(year_pattern, local_part)
# Only if there's exactly one year and it's clearly separated
embedded_matches = re.findall(rf'(?<=[^0-9])({year_pattern})(?=[^0-9])', local_part)
if len(embedded_matches) == 1: # Only one year found
year = int(embedded_matches[0])
age = CURRENT_YEAR - year
@ -310,10 +370,11 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
return year, 0.5, 'embedded'
# Try 2-digit years at the end (less reliable)
two_digit_match = re.search(r'(\d{2})$', local_part)
# But only if preceded by a non-digit and exactly 2 digits
two_digit_match = re.search(r'[^0-9](\d{2})$', local_part)
if two_digit_match:
two_digit = int(two_digit_match.group(1))
# Assume 19XX for 30-99, 20XX for 00-25
# Assume 19XX for 30-99, 20XX for 00-15
if 30 <= two_digit <= 99:
year = 1900 + two_digit
age = CURRENT_YEAR - year