glam/src/glam_extractor/entity_resolution/email_semantics.py
2026-01-14 09:05:54 +01:00

872 lines
36 KiB
Python

#!/usr/bin/env python3
"""
Email semantic analysis for entity resolution.
Extracts identity signals from email addresses:
- Birth year from numeric patterns (e.g., 1948mausti@ziggo.nl -> 1948)
- Institutional affiliation from domain (e.g., @rijksmuseum.nl -> Rijksmuseum)
- Name components from local part (e.g., michiel.huizing@... -> Michiel Huizing)
Usage:
from glam_extractor.entity_resolution.email_semantics import (
parse_email_semantics,
EmailSemantics,
HERITAGE_DOMAIN_MAP
)
result = parse_email_semantics("j.devries1965@rijksmuseum.nl")
print(result.probable_birth_year) # 1965
print(result.institution_name) # Rijksmuseum
print(result.extracted_names) # ['j', 'de', 'vries']
"""
import re
import unicodedata
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional, List, Tuple, Dict, Set
# Current year, captured once when the module is imported; used to check that
# the age implied by a candidate birth year is plausible.
CURRENT_YEAR = datetime.now().year
@dataclass
class EmailSemantics:
    """Identity signals derived from one email address.

    Only ``email``, ``local_part`` and ``domain`` are required; every signal
    field defaults to "nothing detected" (``None``, ``0.0``, ``False`` or an
    empty list).
    """

    # The address itself and its two halves (text before / after the '@').
    email: str
    local_part: str
    domain: str

    # --- Birth year signal ---------------------------------------------
    probable_birth_year: Optional[int] = None
    birth_year_confidence: float = 0.0
    # Where the year was found: 'start', 'end' or 'embedded'.
    birth_year_position: Optional[str] = None

    # --- Affiliation signal --------------------------------------------
    institution_domain: Optional[str] = None
    institution_name: Optional[str] = None
    # 'museum', 'archive', 'library', 'university', 'government' or 'research'.
    institution_type: Optional[str] = None
    # Custodian identifier, filled in when the domain can be linked to one.
    institution_ghcid: Optional[str] = None

    # --- Name components -----------------------------------------------
    extracted_names: List[str] = field(default_factory=list)
    extracted_first_name: Optional[str] = None
    extracted_last_name: Optional[str] = None
    extracted_middle_names: List[str] = field(default_factory=list)
    # e.g. 'firstname.lastname', 'f.lastname', 'firstnamelastname'.
    name_pattern: Optional[str] = None
    # True when a Dutch surname prefix (van, de, van der, ...) was detected.
    has_dutch_prefix: bool = False

    # --- Match quality indicators --------------------------------------
    is_consumer_domain: bool = False
    is_institutional_domain: bool = False
    has_name_in_email: bool = False
# ============================================================================
# HERITAGE INSTITUTION DOMAIN MAPPING
# ============================================================================
# Map domains to known heritage institutions
# Format: domain -> (name, type, ghcid_if_known)
# Lookup table: lower-case email domain -> (institution name, institution type,
# GHCID or None when no custodian identifier is known).
HERITAGE_DOMAIN_MAP: Dict[str, Tuple[str, str, Optional[str]]] = {
    # National institutions (archives, libraries, museums, agencies)
    'nationaalarchief.nl': ('Nationaal Archief', 'archive', 'NL-ZH-DHA-A-NA'),
    'kb.nl': ('Koninklijke Bibliotheek', 'library', 'NL-ZH-DHA-L-KB'),
    'rijksmuseum.nl': ('Rijksmuseum', 'museum', 'NL-NH-AMS-M-RA'),
    'vangoghmuseum.nl': ('Van Gogh Museum', 'museum', 'NL-NH-AMS-M-VGM'),
    'rkd.nl': ('RKD - Nederlands Instituut voor Kunstgeschiedenis', 'research', 'NL-ZH-DHA-R-RKD'),
    'cultureelerfgoed.nl': ('Rijksdienst voor het Cultureel Erfgoed', 'government', 'NL-UT-AME-O-RCE'),
    'erfgoedinspectie.nl': ('Inspectie Overheidsinformatie en Erfgoed', 'government', None),
    # KNAW Institutes
    'niod.knaw.nl': ('NIOD Instituut voor Oorlogs-, Holocaust- en Genocidestudies', 'research', 'NL-NH-AMS-R-NIOD'),
    'huygens.knaw.nl': ('Huygens Instituut', 'research', 'NL-NH-AMS-R-HI'),
    'meertens.knaw.nl': ('Meertens Instituut', 'research', 'NL-NH-AMS-R-MI'),
    'dans.knaw.nl': ('Data Archiving and Networked Services', 'archive', 'NL-ZH-DHA-A-DANS'),
    'knaw.nl': ('Koninklijke Nederlandse Akademie van Wetenschappen', 'research', None),
    # Provincial/Regional Archives
    'regionaalarchieftilburg.nl': ('Regionaal Archief Tilburg', 'archive', None),
    'bhic.nl': ('Brabants Historisch Informatie Centrum', 'archive', None),
    'hetutrechtsarchief.nl': ('Het Utrechts Archief', 'archive', 'NL-UT-UTR-A-HUA'),
    # gahetna.nl ("ga het na") is the former web domain of the Nationaal
    # Archief, not of Gelders Archief (see geldersarchief.nl below).
    'gahetna.nl': ('Nationaal Archief', 'archive', 'NL-ZH-DHA-A-NA'),
    'regionaalarchiefdordrecht.nl': ('Regionaal Archief Dordrecht', 'archive', None),
    'archiefenschede.nl': ('Historisch Centrum Overijssel', 'archive', None),
    'westfriesarchief.nl': ('Westfries Archief', 'archive', None),
    'nbha.nl': ('Noord-Brabants Historisch Archief', 'archive', None),
    'regionaalarchiefalkmaar.nl': ('Regionaal Archief Alkmaar', 'archive', None),
    'drentsarchief.nl': ('Drents Archief', 'archive', None),
    'groningerarchieven.nl': ('Groninger Archieven', 'archive', None),
    'tresoar.nl': ('Tresoar', 'archive', None),
    'zeelandarchief.nl': ('Zeeuws Archief', 'archive', None),
    # City Archives and local/regional government
    'stadsarchief.rotterdam.nl': ('Stadsarchief Rotterdam', 'archive', None),
    'stadsarchief.amsterdam.nl': ('Stadsarchief Amsterdam', 'archive', 'NL-NH-AMS-A-SAA'),
    'erfgoedcentrumzutphen.nl': ('Erfgoedcentrum Zutphen', 'archive', None),
    'erfgoedleiden.nl': ('Erfgoed Leiden en Omstreken', 'archive', None),
    'noord-holland.nl': ('Provincie Noord-Holland', 'government', None),
    'amsterdam.nl': ('Gemeente Amsterdam', 'government', None),
    'rotterdam.nl': ('Gemeente Rotterdam', 'government', None),
    'denhaag.nl': ('Gemeente Den Haag', 'government', None),
    'hoorn.nl': ('Gemeente Hoorn', 'government', None),
    'hhnk.nl': ('Hoogheemraadschap Hollands Noorderkwartier', 'government', None),
    # Museums
    'openluchtmuseum.nl': ('Nederlands Openluchtmuseum', 'museum', None),
    'mauritshuis.nl': ('Mauritshuis', 'museum', 'NL-ZH-DHA-M-MH'),
    'eye.nl': ('Eye Filmmuseum', 'museum', 'NL-NH-AMS-M-EYE'),
    'hermitage.nl': ('Hermitage Amsterdam', 'museum', None),
    'stedelijk.nl': ('Stedelijk Museum Amsterdam', 'museum', 'NL-NH-AMS-M-SMA'),
    'tropenmuseum.nl': ('Tropenmuseum', 'museum', None),
    'naturalis.nl': ('Naturalis Biodiversity Center', 'museum', 'NL-ZH-LEI-M-NBC'),
    'museumboijmans.nl': ('Museum Boijmans Van Beuningen', 'museum', None),
    'kunstmuseum.nl': ('Kunstmuseum Den Haag', 'museum', None),
    'annefrank.nl': ('Anne Frank Huis', 'museum', 'NL-NH-AMS-M-AFH'),
    'amsterdammuseum.nl': ('Amsterdam Museum', 'museum', 'NL-NH-AMS-M-AM'),
    'joods-historisch-museum.nl': ('Joods Historisch Museum', 'museum', None),
    'jhm.nl': ('Joods Historisch Museum', 'museum', None),
    'verzetsmuseum.org': ('Verzetsmuseum', 'museum', None),
    'scheepvaartmuseum.nl': ('Het Scheepvaartmuseum', 'museum', None),
    'hetnoordbrabantsmuseum.nl': ('Het Noordbrabants Museum', 'museum', None),
    'gemeentemuseum.nl': ('Kunstmuseum Den Haag', 'museum', None),
    'centraalmuseum.nl': ('Centraal Museum', 'museum', 'NL-UT-UTR-M-CM'),
    'fryskmuseum.nl': ('Fries Museum', 'museum', None),
    'groningermuseum.nl': ('Groninger Museum', 'museum', None),
    'museumhetvalkhof.nl': ('Museum Het Valkhof', 'museum', None),
    'museum.nl': ('Rijksmuseum van Oudheden', 'museum', None),  # generic, but often RMO
    'rfrankenhuis.nl': ('Museum De Fundatie', 'museum', None),
    'maritiemmuseum.nl': ('Maritiem Museum Rotterdam', 'museum', None),
    'paleishetloo.nl': ('Paleis Het Loo', 'museum', None),
    'slotloevestein.nl': ('Slot Loevestein', 'museum', None),
    # Universities (Humanities/Heritage departments)
    'uu.nl': ('Universiteit Utrecht', 'university', None),
    'students.uu.nl': ('Universiteit Utrecht (student)', 'university', None),
    'uva.nl': ('Universiteit van Amsterdam', 'university', None),
    'student.uva.nl': ('Universiteit van Amsterdam (student)', 'university', None),
    'vu.nl': ('Vrije Universiteit Amsterdam', 'university', None),
    'student.vu.nl': ('Vrije Universiteit Amsterdam (student)', 'university', None),
    'rug.nl': ('Rijksuniversiteit Groningen', 'university', None),
    'student.rug.nl': ('Rijksuniversiteit Groningen (student)', 'university', None),
    'leidenuniv.nl': ('Universiteit Leiden', 'university', None),
    'hum.leidenuniv.nl': ('Universiteit Leiden - Humanities', 'university', None),
    'umail.leidenuniv.nl': ('Universiteit Leiden', 'university', None),
    'cdh.leidenuniv.nl': ('Universiteit Leiden - Centre for Digital Humanities', 'university', None),
    'ru.nl': ('Radboud Universiteit', 'university', None),
    'let.ru.nl': ('Radboud Universiteit - Letteren', 'university', None),
    'jur.ru.nl': ('Radboud Universiteit - Rechtsgeleerdheid', 'university', None),
    'student.ru.nl': ('Radboud Universiteit (student)', 'university', None),
    'tudelft.nl': ('TU Delft', 'university', None),
    'student.tudelft.nl': ('TU Delft (student)', 'university', None),
    'tue.nl': ('TU Eindhoven', 'university', None),
    'utwente.nl': ('Universiteit Twente', 'university', None),
    'maastrichtuniversity.nl': ('Maastricht University', 'university', None),
    'tilburguniversity.edu': ('Tilburg University', 'university', None),
    'eur.nl': ('Erasmus Universiteit Rotterdam', 'university', None),
    'eshcc.eur.nl': ('Erasmus Universiteit - ESHCC', 'university', None),
    'wur.nl': ('Wageningen University & Research', 'university', None),
    'ou.nl': ('Open Universiteit', 'university', None),
    # Hogescholen (Universities of Applied Sciences)
    'hva.nl': ('Hogeschool van Amsterdam', 'university', None),
    'student.hu.nl': ('Hogeschool Utrecht (student)', 'university', None),
    'student.fontys.nl': ('Fontys Hogescholen (student)', 'university', None),
    # Government
    'minbuza.nl': ('Ministerie van Buitenlandse Zaken', 'government', None),
    'mindef.nl': ('Ministerie van Defensie', 'government', None),
    'minaz.nl': ('Ministerie van Algemene Zaken', 'government', None),
    'minez.nl': ('Ministerie van Economische Zaken', 'government', None),
    'minienw.nl': ('Ministerie van I&W', 'government', None),
    'rws.nl': ('Rijkswaterstaat', 'government', None),
    'minvws.nl': ('Ministerie van VWS', 'government', None),
    'minjenv.nl': ('Ministerie van J&V', 'government', None),
    'minszw.nl': ('Ministerie van SZW', 'government', None),
    'minbzk.nl': ('Ministerie van BZK', 'government', None),
    'minlnv.nl': ('Ministerie van LNV', 'government', None),
    'minocw.nl': ('Ministerie van OCW', 'government', None),
    'minfin.nl': ('Ministerie van Financien', 'government', None),
    'belastingdienst.nl': ('Belastingdienst', 'government', None),
    'rijksoverheid.nl': ('Rijksoverheid', 'government', None),
    'politie.nl': ('Politie Nederland', 'government', None),
    'kadaster.nl': ('Kadaster', 'government', None),
    'rvo.nl': ('Rijksdienst voor Ondernemend Nederland', 'government', None),
    'rivm.nl': ('Rijksinstituut voor Volksgezondheid en Milieu', 'government', None),
    'staatsbosbeheer.nl': ('Staatsbosbeheer', 'government', None),
    'vng.nl': ('Vereniging van Nederlandse Gemeenten', 'government', None),
    # Libraries (Public/Academic)
    'oba.nl': ('Openbare Bibliotheek Amsterdam', 'library', None),
    'bibliotheekrotterdam.nl': ('Bibliotheek Rotterdam', 'library', None),
    'ubvu.vu.nl': ('VU Bibliotheek', 'library', None),
    'library.uu.nl': ('Universiteitsbibliotheek Utrecht', 'library', None),
    # Research Organizations
    'nwo.nl': ('Nederlandse Organisatie voor Wetenschappelijk Onderzoek', 'research', None),
    'fwo.be': ('Fonds Wetenschappelijk Onderzoek', 'research', None),
    'knir.it': ('Koninklijk Nederlands Instituut Rome', 'research', None),
    'cbg.nl': ('Centrum voor Familiegeschiedenis (CBG)', 'research', None),
    'kitlv.nl': ('Koninklijk Instituut voor Taal-, Land- en Volkenkunde', 'research', None),
    'kit.nl': ('KIT Royal Tropical Institute', 'research', None),
    # Archaeology/Heritage Research
    'baac.nl': ('BAAC Archeologie', 'research', None),
    'raap.nl': ('RAAP Archeologisch Adviesbureau', 'research', None),
    # Veterans & Military Heritage
    'veteranen.nl': ('Veteraneninstituut', 'museum', None),
    # Additional Regional Archives (discovered via domain analysis)
    'archiefalkmaar.nl': ('Regionaal Archief Alkmaar', 'archive', None),
    'historischcentrumoverijssel.nl': ('Historisch Centrum Overijssel', 'archive', None),
    'westbrabantsarchief.nl': ('West-Brabants Archief', 'archive', None),
    'haagshistorischmuseum.nl': ('Haags Historisch Museum', 'museum', None),
    'zeeuwsarchief.nl': ('Zeeuws Archief', 'archive', None),
    'erfgoedhuis-zh.nl': ('Erfgoedhuis Zuid-Holland', 'archive', None),
    'noord-hollandsarchief.nl': ('Noord-Hollands Archief', 'archive', 'NL-NH-HAA-A-NHA'),
    'geldersarchief.nl': ('Gelders Archief', 'archive', None),
    'erfgoedbrabant.nl': ('Erfgoed Brabant', 'archive', None),
    'waterlandsarchief.nl': ('Waterlands Archief', 'archive', None),
    'erfgoedservice.nl': ('Erfgoed Service', 'archive', None),
    # Belgian Heritage (for completeness)
    'arch.be': ('Rijksarchief België', 'archive', None),
    'kbr.be': ('Koninklijke Bibliotheek België', 'library', None),
    'kikirpa.be': ('Koninklijk Instituut voor het Kunstpatrimonium', 'research', None),
}
# Consumer email domains (to filter out)
CONSUMER_DOMAINS: Set[str] = {
    # International webmail providers
    'aol.com', 'aol.nl',
    'gmail.com', 'gmail.nl',
    'hotmail.com', 'hotmail.nl',
    'icloud.com', 'icloud.nl',
    'live.com', 'live.nl',
    'mac.com', 'me.com', 'msn.com',
    'outlook.com', 'outlook.nl',
    'yahoo.co.uk', 'yahoo.com', 'yahoo.nl',
    # Dutch ISPs (alphabetical)
    'box.nl', 'caiway.nl', 'casema.nl', 'chello.nl', 'compaqnet.nl',
    'concepts.nl', 'dds.nl', 'delta.nl', 'euronet.nl', 'filternet.nl',
    'freedom.nl', 'freeler.nl', 'hccnet.nl', 'hetnet.nl', 'home.nl',
    'inter.nl.net', 'kabelfoon.nl',
    'kabelnoord.nl',  # Kabelnoord ISP (Northern Netherlands/Friesland)
    'kliksafe.nl', 'knid.nl', 'kpnmail.nl', 'kpnplanet.nl', 'lijbrandt.nl',
    'online.nl', 'onsbrabantnet.nl', 'onsmail.nl', 'planet.nl', 'quicknet.nl',
    'solcon.nl', 't-mobilethuis.nl', 'tele2.nl', 'telfort.nl',
    'telfortglasvezel.nl', 'tiscali.nl', 'upcmail.nl', 'versatel.nl',
    'worldonline.nl', 'wxs.nl', 'xmsnet.nl', 'xs4all.nl', 'zeelandnet.nl',
    'ziggo.nl', 'zonnet.nl',
    # Belgian providers
    'proximus.be', 'skynet.be', 'telenet.be',
    # German providers
    'gmx.de', 't-online.de', 'web.de',
    # Generic / disposable mail services
    'email.com', 'mail.com', 'mailinator.com', 'pm.me', 'protonmail.com',
}
# Dutch name prefixes (tussenvoegsels)
DUTCH_PREFIXES: Set[str] = {
    # Single-word prefixes
    'de', 'den', 'der', 'het', 'in', 'op', 'te', 'ten', 'ter', 'van',
    # Multi-word prefixes written without spaces
    'vande', 'vanden', 'vander', 'vanhet', 'vanten', 'vanter',
    # Abbreviation of van de / van der
    'vd',
    # As in 't Hoen
    "'t",
}
# ============================================================================
# BIRTH YEAR EXTRACTION
# ============================================================================
def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[str]]:
    """
    Extract a probable birth year from the local part of an email address.

    A plus-addressing tag (``user+tag``) is discarded before analysis, the
    same way ``extract_name_components`` does: the tag is not part of the
    mailbox name, so digits in it say nothing about the owner.

    Args:
        local_part: Text before the '@' of an email address.

    Returns:
        Tuple of (year, confidence, position)
        - year: 4-digit year if found, None otherwise
        - confidence: 0.0-1.0 confidence score
        - position: 'start', 'end', 'embedded', or None

    A year is only reported when the implied age (CURRENT_YEAR - year) lies
    between 10 and 95 (10 and 30 for two-digit years read as 20XX).

    Examples (results with a year assume the implied age is in range):
        '1948mausti' -> (1948, 0.9, 'start')
        'michiel.huizing.1970' -> (1970, 0.85, 'end')
        'j.devries65' -> (1965, 0.6, 'end')  # 2-digit year
        'bob791120061' -> (None, 0.0, None)  # long digit run, ambiguous
        'test20180702' -> (None, 0.0, None)  # date suffix, not a birth year
        'jan+2012' -> (None, 0.0, None)  # digits belong to the +tag
    """
    no_year = (None, 0.0, None)
    if not local_part:
        return no_year

    # Ignore a plus-addressing tag: 'jan+2012' must not yield 2012.
    local_part = local_part.split('+', 1)[0]
    if not local_part:
        return no_year

    # Four-digit candidates: 1930-2015.  The age check below narrows this
    # further relative to the current year.
    year_pattern = r'(?:19[3-9][0-9]|200[0-9]|201[0-5])'

    def plausible(year: int, max_age: int = 95) -> bool:
        """Return True when the age implied by *year* is between 10 and max_age."""
        return 10 <= CURRENT_YEAR - year <= max_age

    # Date suffixes are not birth years: YYYYMMDD (test20180702) ...
    if re.search(r'(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$', local_part):
        return no_year
    # ... or YYMMDD (test210830).  The regex already guarantees a valid
    # month (01-12) and day (01-31), so no further validation is needed.
    if re.search(r'\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$', local_part):
        return no_year

    # A trailing digit run longer than four digits is not a clean year.
    trailing_digits = re.search(r'(\d+)$', local_part)
    if trailing_digits and len(trailing_digits.group(1)) > 4:
        return no_year

    # Year at the start, followed by at least one non-digit character
    # (a bare year, or a year followed by more digits, does not count here).
    start_match = re.match(rf'^({year_pattern})', local_part)
    if start_match:
        year = int(start_match.group(1))
        after_year = local_part[4:]
        if plausible(year) and after_year and not after_year[0].isdigit():
            return year, 0.9, 'start'

    # Year at the end, directly preceded by a non-digit
    # (michiel.huizing.1970, j.devries1965).
    end_match = re.search(rf'[^0-9]({year_pattern})$', local_part)
    if end_match:
        year = int(end_match.group(1))
        if plausible(year):
            return year, 0.85, 'end'

    # The local part is nothing but a year.
    bare_year = re.match(rf'^({year_pattern})$', local_part)
    if bare_year:
        year = int(bare_year.group(1))
        if plausible(year):
            return year, 0.7, 'end'

    # Exactly one year embedded between non-digits (lower confidence).
    embedded_matches = re.findall(rf'(?<=[^0-9])({year_pattern})(?=[^0-9])', local_part)
    if len(embedded_matches) == 1:
        year = int(embedded_matches[0])
        if plausible(year):
            return year, 0.5, 'embedded'

    # Two-digit year at the end, preceded by a non-digit (less reliable).
    two_digit_match = re.search(r'[^0-9](\d{2})$', local_part)
    if two_digit_match:
        two_digit = int(two_digit_match.group(1))
        if 30 <= two_digit <= 99:
            # 30-99 is read as 19XX.
            year = 1900 + two_digit
            if plausible(year):
                return year, 0.6, 'end'
        elif two_digit <= 15:
            # 00-15 is read as 20XX; only accepted for ages 10-30.
            year = 2000 + two_digit
            if plausible(year, max_age=30):
                return year, 0.5, 'end'

    return no_year
# ============================================================================
# NAME COMPONENT EXTRACTION
# ============================================================================
def extract_name_components(local_part: str) -> Tuple[List[str], Optional[str], bool]:
    """
    Extract name components from the local part of an email address.

    A plus-addressing tag and leading/trailing digit runs are removed first.
    The remainder is split on the first separator kind present ('.', then
    '_', then '-'); without a separator, an attempt is made to split on an
    embedded Dutch surname prefix.  Prefixes glued to the front of a
    component ('vanderberg') are split off.  Components are lower-cased.

    Heuristic limits:
    - The first of several components is taken to be a given name or initial
      and is never prefix-split ('dennis.bakker' stays ['dennis', 'bakker']).
    - In separator-less addresses the letter directly before a prefix is
      always read as a middle initial ('sabrinadewit' gives
      ['sabrin', 'a', 'de', 'wit']); the two readings cannot be told apart
      without a name dictionary.
    - Surnames that merely begin with prefix letters are still split
      ('dekker' gives 'de', 'kker').

    Args:
        local_part: Text before the '@' of an email address.

    Returns:
        Tuple of (names, pattern_type, has_dutch_prefix)
        - names: List of lower-case name components
        - pattern_type: Description of the pattern detected, or None
        - has_dutch_prefix: True if a Dutch prefix (van, de, ...) was found

    Examples:
        'michiel.huizing' -> (['michiel', 'huizing'], 'firstname.lastname', False)
        'j.devries' -> (['j', 'de', 'vries'], 'initial.lastname', True)
        'p.vanderberg' -> (['p', 'van', 'der', 'berg'], 'initial.lastname', True)
        'p.van.der.berg' -> (['p', 'van', 'der', 'berg'], 'initial.lastname', True)
        'josselinpdewit' -> (['josselin', 'p', 'de', 'wit'], 'firstname.prefix.lastname', True)
    """
    if not local_part:
        return [], None, False

    # Remove Gmail-style + suffixes (e.g., user+tag@gmail.com)
    clean = re.sub(r'\+.*$', '', local_part)
    # Remove trailing and leading digit runs (likely a birth year)
    clean = re.sub(r'\d+$', '', clean)
    clean = re.sub(r'^\d+', '', clean)
    if not clean:
        return [], None, False

    has_dutch_prefix = False
    names: List[str] = []

    # Split on the first separator kind that occurs.
    if '.' in clean:
        parts = [p for p in clean.split('.') if p]
        pattern_type = 'dotted'
    elif '_' in clean:
        parts = [p for p in clean.split('_') if p]
        pattern_type = 'underscored'
    elif '-' in clean:
        parts = [p for p in clean.split('-') if p]
        pattern_type = 'hyphenated'
    else:
        # No separator, e.g. "josselinpdewit" or "sabrinavisser".
        parts = [clean]
        pattern_type = 'concatenated'
        # (regex, builder) pairs; the first match wins.  Longer prefixes come
        # before the shorter ones they contain, otherwise '...vanden...' would
        # be consumed by the plain 'de' pattern.  Variants without a middle
        # initial are listed only for 'ter'/'ten': for the other prefixes
        # they could never match before the middle-initial variant does.
        concat_prefix_patterns = [
            (r'^([a-z]+?)([a-z])vander([a-z]+)$',
             lambda m: [m.group(1), m.group(2), 'van', 'der', m.group(3)]),
            (r'^([a-z]+?)([a-z])vanden([a-z]+)$',
             lambda m: [m.group(1), m.group(2), 'van', 'den', m.group(3)]),
            (r'^([a-z]+?)([a-z])vande([a-z]+)$',
             lambda m: [m.group(1), m.group(2), 'van', 'de', m.group(3)]),
            # josselinpdewit -> josselin, p, de, wit
            (r'^([a-z]+?)([a-z])de([a-z]+)$',
             lambda m: [m.group(1), m.group(2), 'de', m.group(3)]),
            (r'^([a-z]+?)([a-z])van([a-z]+)$',
             lambda m: [m.group(1), m.group(2), 'van', m.group(3)]),
            (r'^([a-z]{3,})ter([a-z]+)$', lambda m: [m.group(1), 'ter', m.group(2)]),
            (r'^([a-z]{3,})ten([a-z]+)$', lambda m: [m.group(1), 'ten', m.group(2)]),
        ]
        lowered_clean = clean.lower()
        for regex, builder in concat_prefix_patterns:
            match = re.match(regex, lowered_clean)
            if match:
                parts = builder(match)
                has_dutch_prefix = True
                break

    # Prefixes glued to the front of a component, e.g. 'vanderberg'.
    # Each entry: (regex with the remainder in group 2, prefix tokens to emit).
    prefix_patterns = [
        (r'^(vander)(.+)$', ['van', 'der']),
        (r'^(vanden)(.+)$', ['van', 'den']),
        (r'^(vande)(.+)$', ['van', 'de']),
        (r'^(vant)(.+)$', ['van', "'t"]),
        (r'^(van)([^d].*)$', ['van']),  # van + not starting with d
        (r'^(de)([^n].*)$', ['de']),  # de + not starting with n
        (r'^(den)(.+)$', ['den']),
        (r'^(ter)(.+)$', ['ter']),
        (r'^(ten)(.+)$', ['ten']),
    ]
    # Tokens that are complete prefixes on their own (van, der, den, de, ...).
    prefix_tokens = {token for _, tokens in prefix_patterns for token in tokens}

    for index, part in enumerate(parts):
        lowered = part.lower()
        if lowered in prefix_tokens:
            # Already a bare prefix: keep it whole ('der' must not become
            # 'de' + 'r') and record that a prefix is present.
            names.append(lowered)
            has_dutch_prefix = True
            continue
        if index == 0 and len(parts) > 1:
            # Leading component of a multi-part name: given name or initial,
            # so 'dennis' / 'vanessa' / 'derek' are left intact.
            names.append(lowered)
            continue
        for regex, prefixes in prefix_patterns:
            match = re.match(regex, lowered)
            if match:
                names.extend(prefixes)
                names.append(match.group(2))
                has_dutch_prefix = True
                break
        else:
            names.append(lowered)

    # Refine the pattern label once at least two components are known.
    if len(names) >= 2:
        dotted = '.' in local_part
        if len(names[0]) == 1:
            pattern_type = 'initial.lastname' if dotted else 'initiallastname'
        elif has_dutch_prefix:
            pattern_type = 'firstname.prefix.lastname'
        else:
            pattern_type = 'firstname.lastname' if dotted else 'firstnamelastname'

    return names, pattern_type, has_dutch_prefix
def identify_first_last_name(names: List[str], has_dutch_prefix: bool) -> Tuple[Optional[str], Optional[str], List[str]]:
    """
    Split extracted name components into first, last and middle names.

    The first component is the first name unless it is a single character
    (an initial), in which case the first name is None.  With
    ``has_dutch_prefix`` set, the last name starts at the first component
    after position 0 that is listed in DUTCH_PREFIXES and runs to the end,
    joined with spaces; otherwise (or when no such component exists) the
    last name is the final component.  Whatever lies between is returned as
    middle names.

    Returns:
        Tuple of (first_name, last_name, middle_names).  A single component
        is returned as the first name with no last name.
    """
    if not names:
        return None, None, []

    leading = names[0]
    if len(names) == 1:
        # Only one component: report it as the first name.
        return leading, None, []

    # By default the surname is just the final component ...
    surname_start = len(names) - 1
    if has_dutch_prefix:
        # ... unless a prefix occurs earlier, in which case it opens the surname.
        surname_start = next(
            (
                position
                for position, token in enumerate(names[1:], 1)
                if token.lower() in DUTCH_PREFIXES
            ),
            surname_start,
        )

    given_name = leading if len(leading) > 1 else None
    surname = ' '.join(names[surname_start:])
    return given_name, surname, names[1:surname_start]
# ============================================================================
# DOMAIN ANALYSIS
# ============================================================================
def analyze_domain(domain: str) -> Tuple[Optional[str], Optional[str], Optional[str], bool, bool]:
    """
    Analyze an email domain for institutional affiliation.

    Lookup order:
    1. Exact membership in CONSUMER_DOMAINS -> consumer, nothing else set.
    2. HERITAGE_DOMAIN_MAP, trying the domain itself and then each parent
       domain from most to least specific ('a.b.knaw.nl' -> 'b.knaw.nl' ->
       'knaw.nl' -> 'nl').
    3. Keyword heuristics for unknown domains; these yield an institution
       type but no name or GHCID.

    Args:
        domain: Domain part of an email address; it is lower-cased and
            stripped before use.

    Returns:
        Tuple of (institution_name, institution_type, ghcid,
        is_institutional, is_consumer).  An empty domain is reported as
        (None, None, None, False, True).
    """
    if not domain:
        return None, None, None, False, True

    domain = domain.lower().strip()

    # Check consumer domains
    if domain in CONSUMER_DOMAINS:
        return None, None, None, False, True

    # Known heritage domains: the domain itself first, then its parents.
    labels = domain.split('.')
    for start in range(len(labels)):
        known = HERITAGE_DOMAIN_MAP.get('.'.join(labels[start:]))
        if known is not None:
            name, inst_type, ghcid = known
            return name, inst_type, ghcid, True, False

    # Heuristics for unknown domains (first matching rule wins).
    inst_type: Optional[str] = None
    if domain.endswith('.overheid.nl') or re.fullmatch(r'min[a-z]{2,5}\.nl', domain):
        # Dutch ministries use 'min' + short abbreviation under .nl
        # (minocw.nl, minbuza.nl).  A bare startswith('min') test would also
        # flag e.g. minecraft.net or mindspring.com as government.
        inst_type = 'government'
    elif any(marker in domain for marker in ('universit', 'univ.', '.edu', 'student.', 'students.')):
        inst_type = 'university'
    elif 'museum' in domain:
        inst_type = 'museum'
    elif 'archi' in domain or 'erfgoed' in domain:
        inst_type = 'archive'
    elif 'biblio' in domain or 'library' in domain:
        inst_type = 'library'
    # Any other domain (including unrecognised .nl domains) is left
    # unclassified: not consumer, not institutional.

    return None, inst_type, None, inst_type is not None, False
# ============================================================================
# MAIN PARSER
# ============================================================================
def parse_email_semantics(email: str) -> Optional[EmailSemantics]:
    """
    Parse an email address into identity signals.

    The address is lower-cased and stripped, then split at the first '@'.
    The local part feeds the birth-year and name extraction, the domain
    feeds the affiliation analysis.  The ``email`` field of the result keeps
    the address exactly as it was passed in.

    Args:
        email: Email address to analyze

    Returns:
        EmailSemantics object with extracted information, or None when the
        address is empty, has no '@', or has an empty local part or domain.

    Example:
        >>> result = parse_email_semantics("j.devries1965@rijksmuseum.nl")
        >>> result.probable_birth_year
        1965
        >>> result.institution_name
        'Rijksmuseum'
        >>> result.extracted_names
        ['j', 'de', 'vries']
    """
    if not email or '@' not in email:
        return None

    # partition() always yields three pieces, so no exception handling is needed.
    local_part, _, domain = email.lower().strip().partition('@')
    if not (local_part and domain):
        return None

    year, year_confidence, year_position = extract_birth_year(local_part)

    components, pattern, dutch_prefix = extract_name_components(local_part)
    first, last, middle = identify_first_last_name(components, dutch_prefix)

    org_name, org_type, org_ghcid, institutional, consumer = analyze_domain(domain)

    return EmailSemantics(
        email=email,
        local_part=local_part,
        domain=domain,
        # Birth year
        probable_birth_year=year,
        birth_year_confidence=year_confidence,
        birth_year_position=year_position,
        # Institution (the domain is only recorded for institutional hits)
        institution_domain=domain if institutional else None,
        institution_name=org_name,
        institution_type=org_type,
        institution_ghcid=org_ghcid,
        # Names
        extracted_names=components,
        extracted_first_name=first,
        extracted_last_name=last,
        extracted_middle_names=middle,
        name_pattern=pattern,
        has_dutch_prefix=dutch_prefix,
        # Flags
        is_consumer_domain=consumer,
        is_institutional_domain=institutional,
        has_name_in_email=len(components) >= 2,
    )
# ============================================================================
# MATCHING UTILITIES
# ============================================================================
def email_matches_name(email_semantics: EmailSemantics, full_name: str) -> Tuple[bool, float, List[str]]:
    """
    Check whether the name components of an email match a given full name.

    The full name is lower-cased, stripped of diacritics and split on
    whitespace; punctuation stuck to a token ("Vries, Jan", "J.") is
    trimmed.  An email component matches when it equals a name token, or
    when it is a single letter and some name token starts with it (an
    initial).  Initials are always considered, so 'j.devries' scores higher
    against "Jan de Vries" than against "Piet de Vries".

    Args:
        email_semantics: Parsed email; only ``extracted_names`` is read.
        full_name: Name to compare against.  A dict is accepted too (its
            'full_name' or 'name' entry is used); any other non-string
            value is converted with ``str()``.

    Returns:
        Tuple of (matches, confidence, matched_components).  Confidence is
        the number of matched email components divided by the larger of the
        two component counts.  The order of matched_components is
        unspecified.
    """
    if not email_semantics or not full_name:
        return False, 0.0, []

    # Handle dict or other types for full_name
    if isinstance(full_name, dict):
        full_name = full_name.get('full_name', full_name.get('name', str(full_name)))
    if not isinstance(full_name, str):
        full_name = str(full_name)

    # Normalize: lower-case, decompose and drop combining marks (é -> e).
    decomposed = unicodedata.normalize('NFKD', full_name.lower())
    name_clean = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Trim punctuation attached to tokens but keep inner hyphens/apostrophes.
    trimmed = (token.strip('.,;:()[]"') for token in name_clean.split())
    name_parts = {token for token in trimmed if token}

    email_names = set(email_semantics.extracted_names)
    if not email_names:
        return False, 0.0, []

    # Exact component matches ...
    matched = email_names & name_parts
    # ... plus initials that agree with the start of any name token.
    for email_name in email_names - matched:
        if len(email_name) == 1 and any(part.startswith(email_name) for part in name_parts):
            matched.add(email_name)

    if not matched:
        return False, 0.0, []

    total_parts = max(len(email_names), len(name_parts))
    return True, len(matched) / total_parts, list(matched)
def email_domain_matches_employer(email_semantics: EmailSemantics, employer_domains: Set[str]) -> bool:
    """
    Tell whether the email's domain belongs to one of the employer domains.

    A match is an identical domain, an email domain that is a subdomain of
    an employer domain, or an employer domain that is a subdomain of the
    email domain.  Returns False when either argument is empty or None.
    """
    if not (email_semantics and employer_domains):
        return False

    mail_domain = email_semantics.domain
    if mail_domain in employer_domains:
        return True

    # Subdomain relation in either direction, always on a label boundary.
    mail_as_parent = '.' + mail_domain
    return any(
        mail_domain.endswith('.' + employer) or employer.endswith(mail_as_parent)
        for employer in employer_domains
    )
# ============================================================================
# BATCH ANALYSIS
# ============================================================================
def analyze_email_batch(emails: List[str]) -> Dict[str, EmailSemantics]:
    """
    Run parse_email_semantics over a list of addresses.

    Empty entries and addresses that cannot be parsed are left out.

    Returns:
        Dict mapping each parsed address (as given) -> EmailSemantics
    """
    # Parse lazily, skipping falsy entries before they reach the parser.
    parsed_pairs = (
        (address, parse_email_semantics(address))
        for address in emails
        if address
    )
    return {address: semantics for address, semantics in parsed_pairs if semantics}
def get_domain_statistics(email_semantics_list: List[EmailSemantics]) -> Dict[str, int]:
    """
    Count institutional emails per institution.

    Entries flagged as institutional are tallied under their institution
    name, or under their raw domain when no name is set.  At most the 50
    most frequent keys are returned, most frequent first.
    """
    from collections import Counter

    tally = Counter(
        entry.institution_name or entry.domain
        for entry in email_semantics_list
        if entry.is_institutional_domain
    )
    return dict(tally.most_common(50))
def get_birth_year_statistics(email_semantics_list: List[EmailSemantics]) -> Dict[str, int]:
    """
    Count extracted birth years per decade.

    Only entries with a birth year and a confidence of at least 0.5 are
    counted.  Keys look like '1960s'; the result is ordered from the most
    to the least frequent decade.
    """
    from collections import Counter

    per_decade = Counter(
        f"{(entry.probable_birth_year // 10) * 10}s"
        for entry in email_semantics_list
        if entry.probable_birth_year and entry.birth_year_confidence >= 0.5
    )
    return dict(per_decade.most_common())
# ============================================================================
# CLI INTERFACE
# ============================================================================
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]

    if not cli_args:
        # No argument given: run the built-in sample addresses.
        sample_addresses = [
            "1948mausti@ziggo.nl",
            "michiel.huizing.1970@gmail.com",
            "j.devries1965@rijksmuseum.nl",
            "p.vanderberg@nationaalarchief.nl",
            "josselinpdewit@hetnet.nl",
            "sabrinavisser1992@hotmail.com",
            "test@hum.leidenuniv.nl",
            "arnold.oppelaar+test21082019@gmail.com",
            "h.vandenheuvel@rkd.nl",
        ]
        print("Email Semantic Analysis Test Results")
        print("=" * 70)
        for address in sample_addresses:
            parsed = parse_email_semantics(address)
            if not parsed:
                continue
            summary = [
                f"\n{address}",
                f" Birth year: {parsed.probable_birth_year} (conf: {parsed.birth_year_confidence:.1f})",
                f" Institution: {parsed.institution_name or '-'}",
                f" Names: {parsed.extracted_names}",
                f" Pattern: {parsed.name_pattern}",
            ]
            print("\n".join(summary))
    else:
        # Analyze the single address passed on the command line.
        address = cli_args[0]
        parsed = parse_email_semantics(address)
        if not parsed:
            print(f"Could not parse: {address}")
        else:
            report = [
                f"Email: {parsed.email}",
                f" Birth year: {parsed.probable_birth_year} (confidence: {parsed.birth_year_confidence:.2f})",
                f" Institution: {parsed.institution_name or parsed.domain}",
                f" Institution type: {parsed.institution_type}",
                f" Names: {parsed.extracted_names}",
                f" First name: {parsed.extracted_first_name}",
                f" Last name: {parsed.extracted_last_name}",
                f" Pattern: {parsed.name_pattern}",
                f" Dutch prefix: {parsed.has_dutch_prefix}",
                f" Is institutional: {parsed.is_institutional_domain}",
            ]
            print("\n".join(report))