From 9a395f3dbe11c03b083244cad075117d2fa1e1a1 Mon Sep 17 00:00:00 2001
From: kempersc <sckemper@mailfence.com>
Date: Tue, 13 Jan 2026 22:37:10 +0100
Subject: [PATCH] fix: improve birth year extraction to avoid date suffix false
 positives

- Skip YYYYMMDD and YYMMDD date patterns at end of email
- Skip trailing digit sequences longer than 4 digits
- Require non-digit before 4-digit years at end
- Add knid.nl/kabelnoord.nl to consumer domains (Friesland ISP)
- Add 11 missing regional archive/heritage domains to HERITAGE_DOMAIN_MAP
- Update recalculation script to re-extract email semantics

Results:
- 3,151 false birth years removed
- 'Likely wrong person' reduced from 533 to 325 (-39%)
- 2,944 candidates' scores boosted
---
 scripts/recalculate_confidence.py             | 70 +++++++++++++++++
 .../entity_resolution/email_semantics.py      | 75 +++++++++++++++++--
 2 files changed, 138 insertions(+), 7 deletions(-)

diff --git a/scripts/recalculate_confidence.py b/scripts/recalculate_confidence.py
index 22f9cbf5dc..dbef4464a0 100644
--- a/scripts/recalculate_confidence.py
+++ b/scripts/recalculate_confidence.py
@@ -7,6 +7,9 @@ This script applies improved confidence scoring that incorporates:
 2. Institution match boosting
 3. Wrong-person detection for birth year mismatches
 
+It also RE-EXTRACTS email semantics using the latest parsing logic,
+ensuring any fixes to birth year extraction are applied.
+
 Usage:
     python scripts/recalculate_confidence.py \
         --input data/entity_resolution/entity_resolution_candidates.json \
@@ -26,9 +29,59 @@ sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
 from glam_extractor.entity_resolution import (
     recalculate_candidate_confidence,
     extract_birth_decade_from_ppid,
+    parse_email_semantics,
 )
 
 
+def re_extract_email_semantics(candidate: dict) -> int:
+    """
+    Re-extract email semantics for ``candidate`` using current parsing logic.
+
+    The candidate dict is updated in place. Related fields are refreshed as a
+    group whenever ANY field in the group differs from the fresh parse, so a
+    changed confidence/position is not left stale when the birth year itself
+    is unchanged (and likewise for institution name/type).
+
+    Returns the number of field groups updated (0-3); 0 when the candidate
+    has no ``wcms_email`` or the parser returns a falsy result.
+    """
+    wcms_email = candidate.get('wcms_email')
+    if not wcms_email:
+        return 0
+
+    # Parse email with current logic
+    semantics = parse_email_semantics(wcms_email)
+    if not semantics:
+        return 0
+
+    # Candidate keys grouped so each group is always written consistently:
+    # birth year details, institutional details, and the consumer flag.
+    field_groups = (
+        {
+            'email_probable_birth_year': semantics.probable_birth_year,
+            'email_birth_year_confidence': semantics.birth_year_confidence,
+            'email_birth_year_position': semantics.birth_year_position,
+        },
+        {
+            'email_is_institutional': semantics.is_institutional_domain,
+            'email_institution_name': semantics.institution_name,
+            'email_institution_type': semantics.institution_type,
+        },
+        {
+            'email_is_consumer': semantics.is_consumer_domain,
+        },
+    )
+
+    changes = 0
+    for group in field_groups:
+        # Compare every field in the group, not only the leading one.
+        if any(candidate.get(key) != value for key, value in group.items()):
+            candidate.update(group)
+            changes += 1
+
+    return changes
+
+
 def main():
     parser = argparse.ArgumentParser(
         description='Recalculate confidence scores for entity resolution candidates'
@@ -79,6 +132,9 @@ def main():
         'penalized': 0,
         'likely_wrong_person': 0,
         'reviews_preserved': 0,
+        'email_reextracted': 0,
+        'birth_years_removed': 0,
+        'birth_years_changed': 0,
     }
     
     # Score distribution before
@@ -90,11 +146,22 @@ def main():
     # Recalculate each candidate
     for i, candidate in enumerate(candidates):
         original_score = candidate.get('confidence_score', 0)
+        old_birth_year = candidate.get('email_probable_birth_year')
         
         # Preserve review status
         if candidate.get('reviewed'):
             stats['reviews_preserved'] += 1
         
+        # Re-extract email semantics with updated parsing logic
+        email_changes = re_extract_email_semantics(candidate)
+        if email_changes:
+            stats['email_reextracted'] += 1
+            new_birth_year = candidate.get('email_probable_birth_year')
+            if old_birth_year and not new_birth_year:
+                stats['birth_years_removed'] += 1
+            elif old_birth_year != new_birth_year:
+                stats['birth_years_changed'] += 1
+        
         # Apply new scoring
         recalculate_candidate_confidence(candidate)
         
@@ -136,6 +203,9 @@ def main():
     print("RECALCULATION STATISTICS")
     print("=" * 60)
     print(f"Total candidates:        {stats['total']:,}")
+    print(f"Email re-extracted:      {stats['email_reextracted']:,}")
+    print(f"  - Birth years removed: {stats['birth_years_removed']:,}")
+    print(f"  - Birth years changed: {stats['birth_years_changed']:,}")
     print(f"Scores adjusted:         {stats['adjusted']:,}")
     print(f"  - Boosted:             {stats['boosted']:,}")
     print(f"  - Penalized:           {stats['penalized']:,}")
diff --git a/src/glam_extractor/entity_resolution/email_semantics.py b/src/glam_extractor/entity_resolution/email_semantics.py
index 08125f4297..ebf0170cc0 100644
--- a/src/glam_extractor/entity_resolution/email_semantics.py
+++ b/src/glam_extractor/entity_resolution/email_semantics.py
@@ -218,7 +218,19 @@ HERITAGE_DOMAIN_MAP: Dict[str, Tuple[str, str, Optional[str]]] = {
     
     # Veterans & Military Heritage
     'veteranen.nl': ('Veteraneninstituut', 'museum', None),
-    'knid.nl': ('KNID (unknown)', 'government', None),  # 64 hits, needs research
+    
+    # Additional Regional Archives (discovered via domain analysis)
+    'archiefalkmaar.nl': ('Regionaal Archief Alkmaar', 'archive', None),
+    'historischcentrumoverijssel.nl': ('Historisch Centrum Overijssel', 'archive', None),
+    'westbrabantsarchief.nl': ('West-Brabants Archief', 'archive', None),
+    'haagshistorischmuseum.nl': ('Haags Historisch Museum', 'museum', None),
+    'zeeuwsarchief.nl': ('Zeeuws Archief', 'archive', None),
+    'erfgoedhuis-zh.nl': ('Erfgoedhuis Zuid-Holland', 'archive', None),
+    'noord-hollandsarchief.nl': ('Noord-Hollands Archief', 'archive', 'NL-NH-HAA-A-NHA'),
+    'geldersarchief.nl': ('Gelders Archief', 'archive', None),
+    'erfgoedbrabant.nl': ('Erfgoed Brabant', 'archive', None),
+    'waterlandsarchief.nl': ('Waterlands Archief', 'archive', None),
+    'erfgoedservice.nl': ('Erfgoed Service', 'archive', None),
     
     # Belgian Heritage (for completeness)
     'arch.be': ('Rijksarchief België', 'archive', None),
@@ -240,6 +252,7 @@ CONSUMER_DOMAINS: Set[str] = {
     'dds.nl', 'freedom.nl', 'xmsnet.nl', 'inter.nl.net', 'euronet.nl',
     'onsbrabantnet.nl', 'concepts.nl', 'worldonline.nl', 'delta.nl', 'lijbrandt.nl',
     't-mobilethuis.nl', 'compaqnet.nl', 'filternet.nl', 'onsmail.nl', 'box.nl',
+    'knid.nl', 'kabelnoord.nl',  # Kabelnoord ISP (Northern Netherlands/Friesland)
     # Belgian
     'telenet.be', 'skynet.be', 'proximus.be',
     # German
@@ -276,6 +289,7 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
         'michiel.huizing.1970' -> (1970, 0.85, 'end')
         'j.devries65' -> (1965, 0.6, 'end')  # 2-digit year
         'bob791120061' -> None (ambiguous)
+        'test20180702' -> None (date suffix, not birth year)
     """
     if not local_part:
         return None, 0.0, None
@@ -284,6 +298,37 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
     # People with 2010+ birth years are unlikely to have email accounts yet
     year_pattern = r'(?:19[3-9][0-9]|200[0-9]|201[0-5])'
     
+    # IMPORTANT: Check for date suffixes (YYYYMMDD or YYMMDD patterns)
+    # These are NOT birth years! E.g., test20180702, user20210830
+    # Date pattern: 4-digit year + 2-digit month (01-12) + 2-digit day (01-31)
+    date_suffix_pattern = r'(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$'
+    if re.search(date_suffix_pattern, local_part):
+        # This looks like a date suffix (e.g., 20180702), skip birth year extraction
+        return None, 0.0, None
+    
+    # Also check for 6-digit date patterns YYMMDD at end
+    date_suffix_6digit = r'\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$'
+    if re.search(date_suffix_6digit, local_part):
+        # Verify it's likely a date, not a name + year
+        # E.g., test210830 looks like a date (2021-08-30)
+        last_6 = local_part[-6:]
+        if last_6.isdigit():
+            # Check if middle 2 digits are valid month (01-12)
+            # and last 2 digits are valid day (01-31)
+            month = int(last_6[2:4])
+            day = int(last_6[4:6])
+            if 1 <= month <= 12 and 1 <= day <= 31:
+                return None, 0.0, None
+    
+    # Check for long digit sequences at the end (likely not birth years)
+    # E.g., bob791120061 ends in a 9-digit run, which cannot be a clean birth year
+    trailing_digits = re.search(r'(\d+)$', local_part)
+    if trailing_digits:
+        digit_seq = trailing_digits.group(1)
+        # If more than 4 digits, it's probably not a clean birth year
+        if len(digit_seq) > 4:
+            return None, 0.0, None
+    
     # Check for year at start
     start_match = re.match(rf'^({year_pattern})', local_part)
     if start_match:
@@ -291,18 +336,33 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
         # Validate: person would be 10-95 years old
         age = CURRENT_YEAR - year
         if 10 <= age <= 95:
-            return year, 0.9, 'start'
+            # Make sure there's something after the year (not just "1965")
+            # and it's not followed by more digits (which might indicate a date)
+            after_year = local_part[4:]
+            if after_year and not after_year[0].isdigit():
+                return year, 0.9, 'start'
     
-    # Check for year at end
-    end_match = re.search(rf'({year_pattern})$', local_part)
+    # Check for year at end - but not if preceded by other digits
+    # Valid: michiel.huizing.1970, j.devries1965
+    # Invalid: test20180702 (date), josbakker532 (random number)
+    end_match = re.search(rf'[^0-9]({year_pattern})$', local_part)
     if end_match:
         year = int(end_match.group(1))
         age = CURRENT_YEAR - year
         if 10 <= age <= 95:
             return year, 0.85, 'end'
     
+    # Also handle a local part that is nothing but a 4-digit year (e.g. "1965")
+    end_match_start = re.match(rf'^({year_pattern})$', local_part)
+    if end_match_start:
+        year = int(end_match_start.group(1))
+        age = CURRENT_YEAR - year
+        if 10 <= age <= 95:
+            return year, 0.7, 'end'
+    
     # Check for year embedded (lower confidence)
-    embedded_matches = re.findall(year_pattern, local_part)
+    # Only if there's exactly one year and it's clearly separated
+    embedded_matches = re.findall(rf'(?<=[^0-9])({year_pattern})(?=[^0-9])', local_part)
     if len(embedded_matches) == 1:  # Only one year found
         year = int(embedded_matches[0])
         age = CURRENT_YEAR - year
@@ -310,10 +370,11 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
             return year, 0.5, 'embedded'
     
     # Try 2-digit years at the end (less reliable)
-    two_digit_match = re.search(r'(\d{2})$', local_part)
+    # But only if preceded by a non-digit and exactly 2 digits
+    two_digit_match = re.search(r'[^0-9](\d{2})$', local_part)
     if two_digit_match:
         two_digit = int(two_digit_match.group(1))
-        # Assume 19XX for 30-99, 20XX for 00-25
+        # Assume 19XX for 30-99, 20XX for 00-15
         if 30 <= two_digit <= 99:
             year = 1900 + two_digit
             age = CURRENT_YEAR - year