fix: improve birth year extraction to avoid date suffix false positives

- Skip YYYYMMDD and YYMMDD date patterns at end of email
- Skip trailing digit sequences longer than 4 digits
- Require non-digit before 4-digit years at end
- Add knid.nl/kabelnoord.nl to consumer domains (Friesland ISP)
- Add 11 missing regional archive domains to HERITAGE_DOMAIN_MAP
- Update recalculation script to re-extract email semantics

Results:
- 3,151 false birth years removed
- 'Likely wrong person' reduced from 533 to 325 (-39%)
- 2,944 candidates' scores boosted
2026-01-13 22:37:10 +01:00 · 2026-01-13 22:37:10 +01:00 · 9a395f3dbe
commit 9a395f3dbe
parent 74ca873585
2 changed files with 138 additions and 7 deletions
--- a/scripts/recalculate_confidence.py
+++ b/scripts/recalculate_confidence.py
@ -7,6 +7,9 @@ This script applies improved confidence scoring that incorporates:
 2. Institution match boosting
 3. Wrong-person detection for birth year mismatches

+It also RE-EXTRACTS email semantics using the latest parsing logic,
+ensuring any fixes to birth year extraction are applied.
+
 Usage:
    python scripts/recalculate_confidence.py \
        --input data/entity_resolution/entity_resolution_candidates.json \
@ -26,9 +29,59 @@ sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
 from glam_extractor.entity_resolution import (
    recalculate_candidate_confidence,
    extract_birth_decade_from_ppid,
+    parse_email_semantics,
 )


+def re_extract_email_semantics(candidate: dict) -> int:
+    """
+    Re-extract email semantics using current parsing logic.
+    
+    Returns number of changes made.
+    """
+    changes = 0
+    wcms_email = candidate.get('wcms_email')
+    
+    if not wcms_email:
+        return 0
+    
+    # Parse email with current logic
+    semantics = parse_email_semantics(wcms_email)
+    
+    if not semantics:
+        return 0
+    
+    # Update birth year if changed
+    old_birth_year = candidate.get('email_probable_birth_year')
+    new_birth_year = semantics.probable_birth_year
+    
+    if old_birth_year != new_birth_year:
+        candidate['email_probable_birth_year'] = new_birth_year
+        candidate['email_birth_year_confidence'] = semantics.birth_year_confidence
+        candidate['email_birth_year_position'] = semantics.birth_year_position
+        changes += 1
+    
+    # Update institutional status if changed
+    old_institutional = candidate.get('email_is_institutional')
+    new_institutional = semantics.is_institutional_domain
+    
+    if old_institutional != new_institutional:
+        candidate['email_is_institutional'] = new_institutional
+        candidate['email_institution_name'] = semantics.institution_name
+        candidate['email_institution_type'] = semantics.institution_type
+        changes += 1
+    
+    # Update consumer status if changed
+    old_consumer = candidate.get('email_is_consumer')
+    new_consumer = semantics.is_consumer_domain
+    
+    if old_consumer != new_consumer:
+        candidate['email_is_consumer'] = new_consumer
+        changes += 1
+    
+    return changes
+
+
 def main():
    parser = argparse.ArgumentParser(
        description='Recalculate confidence scores for entity resolution candidates'
@ -79,6 +132,9 @@ def main():
        'penalized': 0,
        'likely_wrong_person': 0,
        'reviews_preserved': 0,
+        'email_reextracted': 0,
+        'birth_years_removed': 0,
+        'birth_years_changed': 0,
    }
    
    # Score distribution before
@ -90,11 +146,22 @@ def main():
    # Recalculate each candidate
    for i, candidate in enumerate(candidates):
        original_score = candidate.get('confidence_score', 0)
+        old_birth_year = candidate.get('email_probable_birth_year')
        
        # Preserve review status
        if candidate.get('reviewed'):
            stats['reviews_preserved'] += 1
        
+        # Re-extract email semantics with updated parsing logic
+        email_changes = re_extract_email_semantics(candidate)
+        if email_changes:
+            stats['email_reextracted'] += 1
+            new_birth_year = candidate.get('email_probable_birth_year')
+            if old_birth_year and not new_birth_year:
+                stats['birth_years_removed'] += 1
+            elif old_birth_year != new_birth_year:
+                stats['birth_years_changed'] += 1
+        
        # Apply new scoring
        recalculate_candidate_confidence(candidate)
        
@ -136,6 +203,9 @@ def main():
    print("RECALCULATION STATISTICS")
    print("=" * 60)
    print(f"Total candidates:        {stats['total']:,}")
+    print(f"Email re-extracted:      {stats['email_reextracted']:,}")
+    print(f"  - Birth years removed: {stats['birth_years_removed']:,}")
+    print(f"  - Birth years changed: {stats['birth_years_changed']:,}")
    print(f"Scores adjusted:         {stats['adjusted']:,}")
    print(f"  - Boosted:             {stats['boosted']:,}")
    print(f"  - Penalized:           {stats['penalized']:,}")
--- a/src/glam_extractor/entity_resolution/email_semantics.py
+++ b/src/glam_extractor/entity_resolution/email_semantics.py
@ -218,7 +218,19 @@ HERITAGE_DOMAIN_MAP: Dict[str, Tuple[str, str, Optional[str]]] = {
    
    # Veterans & Military Heritage
    'veteranen.nl': ('Veteraneninstituut', 'museum', None),
-    'knid.nl': ('KNID (unknown)', 'government', None),  # 64 hits, needs research
+    
+    # Additional Regional Archives (discovered via domain analysis)
+    'archiefalkmaar.nl': ('Regionaal Archief Alkmaar', 'archive', None),
+    'historischcentrumoverijssel.nl': ('Historisch Centrum Overijssel', 'archive', None),
+    'westbrabantsarchief.nl': ('West-Brabants Archief', 'archive', None),
+    'haagshistorischmuseum.nl': ('Haags Historisch Museum', 'museum', None),
+    'zeeuwsarchief.nl': ('Zeeuws Archief', 'archive', None),
+    'erfgoedhuis-zh.nl': ('Erfgoedhuis Zuid-Holland', 'archive', None),
+    'noord-hollandsarchief.nl': ('Noord-Hollands Archief', 'archive', 'NL-NH-HAA-A-NHA'),
+    'geldersarchief.nl': ('Gelders Archief', 'archive', None),
+    'erfgoedbrabant.nl': ('Erfgoed Brabant', 'archive', None),
+    'waterlandsarchief.nl': ('Waterlands Archief', 'archive', None),
+    'erfgoedservice.nl': ('Erfgoed Service', 'archive', None),
    
    # Belgian Heritage (for completeness)
    'arch.be': ('Rijksarchief België', 'archive', None),
@ -240,6 +252,7 @@ CONSUMER_DOMAINS: Set[str] = {
    'dds.nl', 'freedom.nl', 'xmsnet.nl', 'inter.nl.net', 'euronet.nl',
    'onsbrabantnet.nl', 'concepts.nl', 'worldonline.nl', 'delta.nl', 'lijbrandt.nl',
    't-mobilethuis.nl', 'compaqnet.nl', 'filternet.nl', 'onsmail.nl', 'box.nl',
+    'knid.nl', 'kabelnoord.nl',  # Kabelnoord ISP (Northern Netherlands/Friesland)
    # Belgian
    'telenet.be', 'skynet.be', 'proximus.be',
    # German
@ -276,6 +289,7 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
        'michiel.huizing.1970' -> (1970, 0.85, 'end')
        'j.devries65' -> (1965, 0.6, 'end')  # 2-digit year
        'bob791120061' -> None (ambiguous)
+        'test20180702' -> None (date suffix, not birth year)
    """
    if not local_part:
        return None, 0.0, None
@ -284,6 +298,37 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
    # People with 2010+ birth years are unlikely to have email accounts yet
    year_pattern = r'(?:19[3-9][0-9]|200[0-9]|201[0-5])'
    
+    # IMPORTANT: Check for date suffixes (YYYYMMDD or YYMMDD patterns)
+    # These are NOT birth years! E.g., test20180702, user20210830
+    # Date pattern: 4-digit year + 2-digit month (01-12) + 2-digit day (01-31)
+    date_suffix_pattern = r'(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$'
+    if re.search(date_suffix_pattern, local_part):
+        # This looks like a date suffix (e.g., 20180702), skip birth year extraction
+        return None, 0.0, None
+    
+    # Also check for 6-digit date patterns YYMMDD at end
+    date_suffix_6digit = r'\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$'
+    if re.search(date_suffix_6digit, local_part):
+        # Verify it's likely a date, not a name + year
+        # E.g., test210830 looks like a date (2021-08-30)
+        last_6 = local_part[-6:]
+        if last_6.isdigit():
+            # Check if middle 2 digits are valid month (01-12)
+            # and last 2 digits are valid day (01-31)
+            month = int(last_6[2:4])
+            day = int(last_6[4:6])
+            if 1 <= month <= 12 and 1 <= day <= 31:
+                return None, 0.0, None
+    
+    # Check for long digit sequences at the end (likely not birth years)
+    # E.g., bob791120061 ends in 9 digits; no single year can be picked out
+    trailing_digits = re.search(r'(\d+)$', local_part)
+    if trailing_digits:
+        digit_seq = trailing_digits.group(1)
+        # If more than 4 digits, it's probably not a clean birth year
+        if len(digit_seq) > 4:
+            return None, 0.0, None
+    
    # Check for year at start
    start_match = re.match(rf'^({year_pattern})', local_part)
    if start_match:
@ -291,18 +336,33 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
        # Validate: person would be 10-95 years old
        age = CURRENT_YEAR - year
        if 10 <= age <= 95:
-            return year, 0.9, 'start'
+            # Make sure there's something after the year (not just "1965")
+            # and it's not followed by more digits (which might indicate a date)
+            after_year = local_part[4:]
+            if after_year and not after_year[0].isdigit():
+                return year, 0.9, 'start'
    
-    # Check for year at end
-    end_match = re.search(rf'({year_pattern})$', local_part)
+    # Check for year at end - but not if preceded by other digits
+    # Valid: michiel.huizing.1970, j.devries1965
+    # Invalid: test20180702 (date), josbakker532 (random number)
+    end_match = re.search(rf'[^0-9]({year_pattern})$', local_part)
    if end_match:
        year = int(end_match.group(1))
        age = CURRENT_YEAR - year
        if 10 <= age <= 95:
            return year, 0.85, 'end'
    
+    # Also handle a local part that consists solely of a 4-digit year (e.g. '1965')
+    end_match_start = re.match(rf'^({year_pattern})$', local_part)
+    if end_match_start:
+        year = int(end_match_start.group(1))
+        age = CURRENT_YEAR - year
+        if 10 <= age <= 95:
+            return year, 0.7, 'end'
+    
    # Check for year embedded (lower confidence)
-    embedded_matches = re.findall(year_pattern, local_part)
+    # Only if there's exactly one year and it's clearly separated
+    embedded_matches = re.findall(rf'(?<=[^0-9])({year_pattern})(?=[^0-9])', local_part)
    if len(embedded_matches) == 1:  # Only one year found
        year = int(embedded_matches[0])
        age = CURRENT_YEAR - year
@ -310,10 +370,11 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
            return year, 0.5, 'embedded'
    
    # Try 2-digit years at the end (less reliable)
-    two_digit_match = re.search(r'(\d{2})$', local_part)
+    # But only if preceded by a non-digit and exactly 2 digits
+    two_digit_match = re.search(r'[^0-9](\d{2})$', local_part)
    if two_digit_match:
        two_digit = int(two_digit_match.group(1))
-        # Assume 19XX for 30-99, 20XX for 00-25
+        # Assume 19XX for 30-99, 20XX for 00-15
        if 30 <= two_digit <= 99:
            year = 1900 + two_digit
            age = CURRENT_YEAR - year