From 9a395f3dbe11c03b083244cad075117d2fa1e1a1 Mon Sep 17 00:00:00 2001 From: kempersc Date: Tue, 13 Jan 2026 22:37:10 +0100 Subject: [PATCH] fix: improve birth year extraction to avoid date suffix false positives - Skip YYYYMMDD and YYMMDD date patterns at end of email - Skip digit sequences longer than 4 characters - Require non-digit before 4-digit years at end - Add knid.nl/kabelnoord.nl to consumer domains (Friesland ISP) - Add 11 missing regional archive domains to HERITAGE_DOMAIN_MAP - Update recalculation script to re-extract email semantics Results: - 3,151 false birth years removed - 'Likely wrong person' reduced from 533 to 325 (-39%) - 2,944 candidates' scores boosted --- scripts/recalculate_confidence.py | 70 +++++++++++++++++ .../entity_resolution/email_semantics.py | 75 +++++++++++++++++-- 2 files changed, 138 insertions(+), 7 deletions(-) diff --git a/scripts/recalculate_confidence.py b/scripts/recalculate_confidence.py index 22f9cbf5dc..dbef4464a0 100644 --- a/scripts/recalculate_confidence.py +++ b/scripts/recalculate_confidence.py @@ -7,6 +7,9 @@ This script applies improved confidence scoring that incorporates: 2. Institution match boosting 3. Wrong-person detection for birth year mismatches +It also RE-EXTRACTS email semantics using the latest parsing logic, +ensuring any fixes to birth year extraction are applied. + Usage: python scripts/recalculate_confidence.py \ --input data/entity_resolution/entity_resolution_candidates.json \ @@ -26,9 +29,59 @@ sys.path.insert(0, str(Path(__file__).parent.parent / 'src')) from glam_extractor.entity_resolution import ( recalculate_candidate_confidence, extract_birth_decade_from_ppid, + parse_email_semantics, ) +def re_extract_email_semantics(candidate: dict) -> int: + """ + Re-extract email semantics using current parsing logic. + + Returns number of changes made. + """ + changes = 0 + wcms_email = candidate.get('wcms_email') + + if not wcms_email: + return 0 + + # Parse email with current logic + semantics = parse_email_semantics(wcms_email) + + if not semantics: + return 0 + + # Update birth year if changed + old_birth_year = candidate.get('email_probable_birth_year') + new_birth_year = semantics.probable_birth_year + + if old_birth_year != new_birth_year: + candidate['email_probable_birth_year'] = new_birth_year + candidate['email_birth_year_confidence'] = semantics.birth_year_confidence + candidate['email_birth_year_position'] = semantics.birth_year_position + changes += 1 + + # Update institutional status if changed + old_institutional = candidate.get('email_is_institutional') + new_institutional = semantics.is_institutional_domain + + if old_institutional != new_institutional: + candidate['email_is_institutional'] = new_institutional + candidate['email_institution_name'] = semantics.institution_name + candidate['email_institution_type'] = semantics.institution_type + changes += 1 + + # Update consumer status if changed + old_consumer = candidate.get('email_is_consumer') + new_consumer = semantics.is_consumer_domain + + if old_consumer != new_consumer: + candidate['email_is_consumer'] = new_consumer + changes += 1 + + return changes + + def main(): parser = argparse.ArgumentParser( description='Recalculate confidence scores for entity resolution candidates' @@ -79,6 +132,9 @@ def main(): 'penalized': 0, 'likely_wrong_person': 0, 'reviews_preserved': 0, + 'email_reextracted': 0, + 'birth_years_removed': 0, + 'birth_years_changed': 0, } # Score distribution before @@ -90,11 +146,22 @@ def main(): # Recalculate each candidate for i, candidate in enumerate(candidates): original_score = candidate.get('confidence_score', 0) + old_birth_year = candidate.get('email_probable_birth_year') # Preserve review status if candidate.get('reviewed'): stats['reviews_preserved'] += 1 + # Re-extract email semantics with updated parsing logic + email_changes = re_extract_email_semantics(candidate) + if email_changes: + stats['email_reextracted'] += 1 + new_birth_year = candidate.get('email_probable_birth_year') + if old_birth_year and not new_birth_year: + stats['birth_years_removed'] += 1 + elif old_birth_year != new_birth_year: + stats['birth_years_changed'] += 1 + # Apply new scoring recalculate_candidate_confidence(candidate) @@ -136,6 +203,9 @@ def main(): print("RECALCULATION STATISTICS") print("=" * 60) print(f"Total candidates: {stats['total']:,}") + print(f"Email re-extracted: {stats['email_reextracted']:,}") + print(f" - Birth years removed: {stats['birth_years_removed']:,}") + print(f" - Birth years changed: {stats['birth_years_changed']:,}") print(f"Scores adjusted: {stats['adjusted']:,}") print(f" - Boosted: {stats['boosted']:,}") print(f" - Penalized: {stats['penalized']:,}") diff --git a/src/glam_extractor/entity_resolution/email_semantics.py b/src/glam_extractor/entity_resolution/email_semantics.py index 08125f4297..ebf0170cc0 100644 --- a/src/glam_extractor/entity_resolution/email_semantics.py +++ b/src/glam_extractor/entity_resolution/email_semantics.py @@ -218,7 +218,19 @@ HERITAGE_DOMAIN_MAP: Dict[str, Tuple[str, str, Optional[str]]] = { # Veterans & Military Heritage 'veteranen.nl': ('Veteraneninstituut', 'museum', None), - 'knid.nl': ('KNID (unknown)', 'government', None), # 64 hits, needs research + + # Additional Regional Archives (discovered via domain analysis) + 'archiefalkmaar.nl': ('Regionaal Archief Alkmaar', 'archive', None), + 'historischcentrumoverijssel.nl': ('Historisch Centrum Overijssel', 'archive', None), + 'westbrabantsarchief.nl': ('West-Brabants Archief', 'archive', None), + 'haagshistorischmuseum.nl': ('Haags Historisch Museum', 'museum', None), + 'zeeuwsarchief.nl': ('Zeeuws Archief', 'archive', None), + 'erfgoedhuis-zh.nl': ('Erfgoedhuis Zuid-Holland', 'archive', None), + 'noord-hollandsarchief.nl': ('Noord-Hollands Archief', 'archive', 'NL-NH-HAA-A-NHA'), + 'geldersarchief.nl': ('Gelders Archief', 'archive', None), + 'erfgoedbrabant.nl': ('Erfgoed Brabant', 'archive', None), + 'waterlandsarchief.nl': ('Waterlands Archief', 'archive', None), + 'erfgoedservice.nl': ('Erfgoed Service', 'archive', None), # Belgian Heritage (for completeness) 'arch.be': ('Rijksarchief Belgiƫ', 'archive', None), @@ -240,6 +252,7 @@ CONSUMER_DOMAINS: Set[str] = { 'dds.nl', 'freedom.nl', 'xmsnet.nl', 'inter.nl.net', 'euronet.nl', 'onsbrabantnet.nl', 'concepts.nl', 'worldonline.nl', 'delta.nl', 'lijbrandt.nl', 't-mobilethuis.nl', 'compaqnet.nl', 'filternet.nl', 'onsmail.nl', 'box.nl', + 'knid.nl', 'kabelnoord.nl', # Kabelnoord ISP (Northern Netherlands/Friesland) # Belgian 'telenet.be', 'skynet.be', 'proximus.be', # German @@ -276,6 +289,7 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[ 'michiel.huizing.1970' -> (1970, 0.85, 'end') 'j.devries65' -> (1965, 0.6, 'end') # 2-digit year 'bob791120061' -> None (ambiguous) + 'test20180702' -> None (date suffix, not birth year) """ if not local_part: return None, 0.0, None @@ -284,6 +298,37 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[ # People with 2010+ birth years are unlikely to have email accounts yet year_pattern = r'(?:19[3-9][0-9]|200[0-9]|201[0-5])' + # IMPORTANT: Check for date suffixes (YYYYMMDD or YYMMDD patterns) + # These are NOT birth years! E.g., test20180702, user20210830 + # Date pattern: 4-digit year + 2-digit month (01-12) + 2-digit day (01-31) + date_suffix_pattern = r'(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$' + if re.search(date_suffix_pattern, local_part): + # This looks like a date suffix (e.g., 20180702), skip birth year extraction + return None, 0.0, None + + # Also check for 6-digit date patterns YYMMDD at end + date_suffix_6digit = r'\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$' + if re.search(date_suffix_6digit, local_part): + # Verify it's likely a date, not a name + year + # E.g., test210830 looks like a date (2021-08-30) + last_6 = local_part[-6:] + if last_6.isdigit(): + # Check if middle 2 digits are valid month (01-12) + # and last 2 digits are valid day (01-31) + month = int(last_6[2:4]) + day = int(last_6[4:6]) + if 1 <= month <= 12 and 1 <= day <= 31: + return None, 0.0, None + + # Check for long digit sequences at the end (likely not birth years) + # E.g., josbakker532 might be 532, not 1932 + trailing_digits = re.search(r'(\d+)$', local_part) + if trailing_digits: + digit_seq = trailing_digits.group(1) + # If more than 4 digits, it's probably not a clean birth year + if len(digit_seq) > 4: + return None, 0.0, None + # Check for year at start start_match = re.match(rf'^({year_pattern})', local_part) if start_match: @@ -291,18 +336,33 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[ # Validate: person would be 10-95 years old age = CURRENT_YEAR - year if 10 <= age <= 95: - return year, 0.9, 'start' + # Make sure there's something after the year (not just "1965") + # and it's not followed by more digits (which might indicate a date) + after_year = local_part[4:] + if after_year and not after_year[0].isdigit(): + return year, 0.9, 'start' - # Check for year at end - end_match = re.search(rf'({year_pattern})$', local_part) + # Check for year at end - but not if preceded by other digits + # Valid: michiel.huizing.1970, j.devries1965 + # Invalid: test20180702 (date), josbakker532 (random number) + end_match = re.search(rf'[^0-9]({year_pattern})$', local_part) if end_match: year = int(end_match.group(1)) age = CURRENT_YEAR - year if 10 <= age <= 95: return year, 0.85, 'end' + # Also check if year is at very start of local part ending + end_match_start = re.match(rf'^({year_pattern})$', local_part) + if end_match_start: + year = int(end_match_start.group(1)) + age = CURRENT_YEAR - year + if 10 <= age <= 95: + return year, 0.7, 'end' + # Check for year embedded (lower confidence) - embedded_matches = re.findall(year_pattern, local_part) + # Only if there's exactly one year and it's clearly separated + embedded_matches = re.findall(rf'(?<=[^0-9])({year_pattern})(?=[^0-9])', local_part) if len(embedded_matches) == 1: # Only one year found year = int(embedded_matches[0]) age = CURRENT_YEAR - year @@ -310,10 +370,11 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[ return year, 0.5, 'embedded' # Try 2-digit years at the end (less reliable) - two_digit_match = re.search(r'(\d{2})$', local_part) + # But only if preceded by a non-digit and exactly 2 digits + two_digit_match = re.search(r'[^0-9](\d{2})$', local_part) if two_digit_match: two_digit = int(two_digit_match.group(1)) - # Assume 19XX for 30-99, 20XX for 00-25 + # Assume 19XX for 30-99, 20XX for 00-15 if 30 <= two_digit <= 99: year = 1900 + two_digit age = CURRENT_YEAR - year