fix: improve birth year extraction to avoid date suffix false positives
- Skip YYYYMMDD and YYMMDD date patterns at the end of the email local part
- Skip trailing digit sequences longer than 4 characters
- Require a non-digit before a 4-digit year at the end
- Add knid.nl/kabelnoord.nl to consumer domains (Friesland ISP)
- Add 11 missing regional archive domains to HERITAGE_DOMAIN_MAP
- Update recalculation script to re-extract email semantics

Results:
- 3,151 false birth years removed
- 'Likely wrong person' reduced from 533 to 325 (-39%)
- 2,944 candidates' scores boosted
This commit is contained in:
parent
74ca873585
commit
9a395f3dbe
2 changed files with 138 additions and 7 deletions
|
|
@ -7,6 +7,9 @@ This script applies improved confidence scoring that incorporates:
|
|||
2. Institution match boosting
|
||||
3. Wrong-person detection for birth year mismatches
|
||||
|
||||
It also RE-EXTRACTS email semantics using the latest parsing logic,
|
||||
ensuring any fixes to birth year extraction are applied.
|
||||
|
||||
Usage:
|
||||
python scripts/recalculate_confidence.py \
|
||||
--input data/entity_resolution/entity_resolution_candidates.json \
|
||||
|
|
@ -26,9 +29,59 @@ sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
|
|||
from glam_extractor.entity_resolution import (
|
||||
recalculate_candidate_confidence,
|
||||
extract_birth_decade_from_ppid,
|
||||
parse_email_semantics,
|
||||
)
|
||||
|
||||
|
||||
def re_extract_email_semantics(candidate: dict) -> int:
    """
    Re-extract email semantics using current parsing logic.

    Parses ``candidate['wcms_email']`` with ``parse_email_semantics`` and
    writes the derived ``email_*`` fields back into ``candidate`` in place.
    Fields are refreshed in three groups (birth year, institutional domain,
    consumer domain); a group is rewritten as a whole so its fields never
    disagree with each other.

    Args:
        candidate: Candidate record; mutated in place.

    Returns:
        Number of field groups that were updated (0-3). Returns 0 when the
        candidate has no email or the parser returns a falsy result.
    """
    wcms_email = candidate.get('wcms_email')
    if not wcms_email:
        return 0

    # Parse email with current logic
    semantics = parse_email_semantics(wcms_email)
    if not semantics:
        return 0

    # (primary key, all fields of the group as freshly parsed)
    field_groups = (
        (
            'email_probable_birth_year',
            {
                'email_probable_birth_year': semantics.probable_birth_year,
                'email_birth_year_confidence': semantics.birth_year_confidence,
                'email_birth_year_position': semantics.birth_year_position,
            },
        ),
        (
            'email_is_institutional',
            {
                'email_is_institutional': semantics.is_institutional_domain,
                'email_institution_name': semantics.institution_name,
                'email_institution_type': semantics.institution_type,
            },
        ),
        (
            'email_is_consumer',
            {
                'email_is_consumer': semantics.is_consumer_domain,
            },
        ),
    )

    changes = 0
    for primary_key, fresh in field_groups:
        # A missing primary key counts as None, so a newly detected value
        # is recorded as a change.
        primary_changed = candidate.get(primary_key) != fresh[primary_key]

        # The parser can keep the primary value while changing a dependent
        # one (e.g. same birth year, different confidence/position), so
        # stored dependents are compared too. A dependent key that is absent
        # from the candidate is not treated as a change, so records that
        # never carried it are not counted as re-extracted.
        dependents_stale = any(
            key in candidate and candidate[key] != value
            for key, value in fresh.items()
        )

        if primary_changed or dependents_stale:
            candidate.update(fresh)
            changes += 1

    return changes
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Recalculate confidence scores for entity resolution candidates'
|
||||
|
|
@ -79,6 +132,9 @@ def main():
|
|||
'penalized': 0,
|
||||
'likely_wrong_person': 0,
|
||||
'reviews_preserved': 0,
|
||||
'email_reextracted': 0,
|
||||
'birth_years_removed': 0,
|
||||
'birth_years_changed': 0,
|
||||
}
|
||||
|
||||
# Score distribution before
|
||||
|
|
@ -90,11 +146,22 @@ def main():
|
|||
# Recalculate each candidate
|
||||
for i, candidate in enumerate(candidates):
|
||||
original_score = candidate.get('confidence_score', 0)
|
||||
old_birth_year = candidate.get('email_probable_birth_year')
|
||||
|
||||
# Preserve review status
|
||||
if candidate.get('reviewed'):
|
||||
stats['reviews_preserved'] += 1
|
||||
|
||||
# Re-extract email semantics with updated parsing logic
|
||||
email_changes = re_extract_email_semantics(candidate)
|
||||
if email_changes:
|
||||
stats['email_reextracted'] += 1
|
||||
new_birth_year = candidate.get('email_probable_birth_year')
|
||||
if old_birth_year and not new_birth_year:
|
||||
stats['birth_years_removed'] += 1
|
||||
elif old_birth_year != new_birth_year:
|
||||
stats['birth_years_changed'] += 1
|
||||
|
||||
# Apply new scoring
|
||||
recalculate_candidate_confidence(candidate)
|
||||
|
||||
|
|
@ -136,6 +203,9 @@ def main():
|
|||
print("RECALCULATION STATISTICS")
|
||||
print("=" * 60)
|
||||
print(f"Total candidates: {stats['total']:,}")
|
||||
print(f"Email re-extracted: {stats['email_reextracted']:,}")
|
||||
print(f" - Birth years removed: {stats['birth_years_removed']:,}")
|
||||
print(f" - Birth years changed: {stats['birth_years_changed']:,}")
|
||||
print(f"Scores adjusted: {stats['adjusted']:,}")
|
||||
print(f" - Boosted: {stats['boosted']:,}")
|
||||
print(f" - Penalized: {stats['penalized']:,}")
|
||||
|
|
|
|||
|
|
@ -218,7 +218,19 @@ HERITAGE_DOMAIN_MAP: Dict[str, Tuple[str, str, Optional[str]]] = {
|
|||
|
||||
# Veterans & Military Heritage
|
||||
'veteranen.nl': ('Veteraneninstituut', 'museum', None),
|
||||
'knid.nl': ('KNID (unknown)', 'government', None), # 64 hits, needs research
|
||||
|
||||
# Additional Regional Archives (discovered via domain analysis)
|
||||
'archiefalkmaar.nl': ('Regionaal Archief Alkmaar', 'archive', None),
|
||||
'historischcentrumoverijssel.nl': ('Historisch Centrum Overijssel', 'archive', None),
|
||||
'westbrabantsarchief.nl': ('West-Brabants Archief', 'archive', None),
|
||||
'haagshistorischmuseum.nl': ('Haags Historisch Museum', 'museum', None),
|
||||
'zeeuwsarchief.nl': ('Zeeuws Archief', 'archive', None),
|
||||
'erfgoedhuis-zh.nl': ('Erfgoedhuis Zuid-Holland', 'archive', None),
|
||||
'noord-hollandsarchief.nl': ('Noord-Hollands Archief', 'archive', 'NL-NH-HAA-A-NHA'),
|
||||
'geldersarchief.nl': ('Gelders Archief', 'archive', None),
|
||||
'erfgoedbrabant.nl': ('Erfgoed Brabant', 'archive', None),
|
||||
'waterlandsarchief.nl': ('Waterlands Archief', 'archive', None),
|
||||
'erfgoedservice.nl': ('Erfgoed Service', 'archive', None),
|
||||
|
||||
# Belgian Heritage (for completeness)
|
||||
'arch.be': ('Rijksarchief België', 'archive', None),
|
||||
|
|
@ -240,6 +252,7 @@ CONSUMER_DOMAINS: Set[str] = {
|
|||
'dds.nl', 'freedom.nl', 'xmsnet.nl', 'inter.nl.net', 'euronet.nl',
|
||||
'onsbrabantnet.nl', 'concepts.nl', 'worldonline.nl', 'delta.nl', 'lijbrandt.nl',
|
||||
't-mobilethuis.nl', 'compaqnet.nl', 'filternet.nl', 'onsmail.nl', 'box.nl',
|
||||
'knid.nl', 'kabelnoord.nl', # Kabelnoord ISP (Northern Netherlands/Friesland)
|
||||
# Belgian
|
||||
'telenet.be', 'skynet.be', 'proximus.be',
|
||||
# German
|
||||
|
|
@ -276,6 +289,7 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
|
|||
'michiel.huizing.1970' -> (1970, 0.85, 'end')
|
||||
'j.devries65' -> (1965, 0.6, 'end') # 2-digit year
|
||||
'bob791120061' -> None (ambiguous)
|
||||
'test20180702' -> None (date suffix, not birth year)
|
||||
"""
|
||||
if not local_part:
|
||||
return None, 0.0, None
|
||||
|
|
@ -284,6 +298,37 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
|
|||
# People with 2010+ birth years are unlikely to have email accounts yet
|
||||
year_pattern = r'(?:19[3-9][0-9]|200[0-9]|201[0-5])'
|
||||
|
||||
# IMPORTANT: Check for date suffixes (YYYYMMDD or YYMMDD patterns)
|
||||
# These are NOT birth years! E.g., test20180702, user20210830
|
||||
# Date pattern: 4-digit year + 2-digit month (01-12) + 2-digit day (01-31)
|
||||
date_suffix_pattern = r'(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$'
|
||||
if re.search(date_suffix_pattern, local_part):
|
||||
# This looks like a date suffix (e.g., 20180702), skip birth year extraction
|
||||
return None, 0.0, None
|
||||
|
||||
# Also check for 6-digit date patterns YYMMDD at end
|
||||
date_suffix_6digit = r'\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$'
|
||||
if re.search(date_suffix_6digit, local_part):
|
||||
# Verify it's likely a date, not a name + year
|
||||
# E.g., test210830 looks like a date (2021-08-30)
|
||||
last_6 = local_part[-6:]
|
||||
if last_6.isdigit():
|
||||
# Check if middle 2 digits are valid month (01-12)
|
||||
# and last 2 digits are valid day (01-31)
|
||||
month = int(last_6[2:4])
|
||||
day = int(last_6[4:6])
|
||||
if 1 <= month <= 12 and 1 <= day <= 31:
|
||||
return None, 0.0, None
|
||||
|
||||
# Check for long digit sequences at the end (likely not birth years)
|
||||
# E.g., bob791120061 ends in 9 digits, so no single year can be isolated
|
||||
trailing_digits = re.search(r'(\d+)$', local_part)
|
||||
if trailing_digits:
|
||||
digit_seq = trailing_digits.group(1)
|
||||
# If more than 4 digits, it's probably not a clean birth year
|
||||
if len(digit_seq) > 4:
|
||||
return None, 0.0, None
|
||||
|
||||
# Check for year at start
|
||||
start_match = re.match(rf'^({year_pattern})', local_part)
|
||||
if start_match:
|
||||
|
|
@ -291,18 +336,33 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
|
|||
# Validate: person would be 10-95 years old
|
||||
age = CURRENT_YEAR - year
|
||||
if 10 <= age <= 95:
|
||||
return year, 0.9, 'start'
|
||||
# Make sure there's something after the year (not just "1965")
|
||||
# and it's not followed by more digits (which might indicate a date)
|
||||
after_year = local_part[4:]
|
||||
if after_year and not after_year[0].isdigit():
|
||||
return year, 0.9, 'start'
|
||||
|
||||
# Check for year at end
|
||||
end_match = re.search(rf'({year_pattern})$', local_part)
|
||||
# Check for year at end - but not if preceded by other digits
|
||||
# Valid: michiel.huizing.1970, j.devries1965
|
||||
# Invalid: test20180702 (date), josbakker532 (random number)
|
||||
end_match = re.search(rf'[^0-9]({year_pattern})$', local_part)
|
||||
if end_match:
|
||||
year = int(end_match.group(1))
|
||||
age = CURRENT_YEAR - year
|
||||
if 10 <= age <= 95:
|
||||
return year, 0.85, 'end'
|
||||
|
||||
# Also handle a local part that consists solely of a 4-digit year (lower confidence)
|
||||
end_match_start = re.match(rf'^({year_pattern})$', local_part)
|
||||
if end_match_start:
|
||||
year = int(end_match_start.group(1))
|
||||
age = CURRENT_YEAR - year
|
||||
if 10 <= age <= 95:
|
||||
return year, 0.7, 'end'
|
||||
|
||||
# Check for year embedded (lower confidence)
|
||||
embedded_matches = re.findall(year_pattern, local_part)
|
||||
# Only if there's exactly one year and it's clearly separated
|
||||
embedded_matches = re.findall(rf'(?<=[^0-9])({year_pattern})(?=[^0-9])', local_part)
|
||||
if len(embedded_matches) == 1: # Only one year found
|
||||
year = int(embedded_matches[0])
|
||||
age = CURRENT_YEAR - year
|
||||
|
|
@ -310,10 +370,11 @@ def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[
|
|||
return year, 0.5, 'embedded'
|
||||
|
||||
# Try 2-digit years at the end (less reliable)
|
||||
two_digit_match = re.search(r'(\d{2})$', local_part)
|
||||
# But only if preceded by a non-digit and exactly 2 digits
|
||||
two_digit_match = re.search(r'[^0-9](\d{2})$', local_part)
|
||||
if two_digit_match:
|
||||
two_digit = int(two_digit_match.group(1))
|
||||
# Assume 19XX for 30-99, 20XX for 00-25
|
||||
# Assume 19XX for 30-99, 20XX for 00-15
|
||||
if 30 <= two_digit <= 99:
|
||||
year = 1900 + two_digit
|
||||
age = CURRENT_YEAR - year
|
||||
|
|
|
|||
Loading…
Reference in a new issue