872 lines
36 KiB
Python
872 lines
36 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Email semantic analysis for entity resolution.
|
|
|
|
Extracts identity signals from email addresses:
|
|
- Birth year from numeric patterns (e.g., 1948mausti@ziggo.nl -> 1948)
|
|
- Institutional affiliation from domain (e.g., @rijksmuseum.nl -> Rijksmuseum)
|
|
- Name components from local part (e.g., michiel.huizing@... -> Michiel Huizing)
|
|
|
|
Usage:
|
|
from glam_extractor.entity_resolution.email_semantics import (
|
|
parse_email_semantics,
|
|
EmailSemantics,
|
|
HERITAGE_DOMAIN_MAP
|
|
)
|
|
|
|
result = parse_email_semantics("j.devries1965@rijksmuseum.nl")
|
|
print(result.probable_birth_year) # 1965
|
|
print(result.institution_name) # Rijksmuseum
|
|
print(result.extracted_names) # ['j', 'de', 'vries']
|
|
"""
|
|
|
|
import re
|
|
import unicodedata
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from typing import Optional, List, Tuple, Dict, Set
|
|
|
|
|
|
# Calendar year captured once at import time; it is the reference point for
# the age-plausibility checks applied to candidate birth years.
CURRENT_YEAR = datetime.today().year
|
|
|
|
|
|
@dataclass
class EmailSemantics:
    """Identity signals derived from a single email address.

    The first three fields are required; every other field defaults to
    "no signal" (``None``, ``0.0``, ``False`` or an empty list).
    """

    # --- The address itself -------------------------------------------------
    email: str
    local_part: str
    domain: str

    # --- Birth-year signal ----------------------------------------------------
    probable_birth_year: Optional[int] = None
    birth_year_confidence: float = 0.0
    # Where the year was found in the local part: 'start', 'end' or 'embedded'.
    birth_year_position: Optional[str] = None

    # --- Affiliation signal -----------------------------------------------------
    institution_domain: Optional[str] = None
    institution_name: Optional[str] = None
    # One of 'museum', 'archive', 'library', 'university', 'government',
    # 'research'.
    institution_type: Optional[str] = None
    # Custodian identifier, when the domain can be linked to one.
    institution_ghcid: Optional[str] = None

    # --- Name components --------------------------------------------------------
    extracted_names: List[str] = field(default_factory=list)
    extracted_first_name: Optional[str] = None
    extracted_last_name: Optional[str] = None
    extracted_middle_names: List[str] = field(default_factory=list)
    # Shape of the local part, e.g. 'firstname.lastname', 'initial.lastname',
    # 'firstname.prefix.lastname', 'firstnamelastname', 'initiallastname',
    # or a bare separator style ('dotted', 'underscored', 'hyphenated',
    # 'concatenated').
    name_pattern: Optional[str] = None
    # True when a Dutch surname prefix (van, de, van der, ...) was detected.
    has_dutch_prefix: bool = False

    # --- Match-quality flags ----------------------------------------------------
    is_consumer_domain: bool = False
    is_institutional_domain: bool = False
    has_name_in_email: bool = False
|
|
|
|
|
|
# ============================================================================
|
|
# HERITAGE INSTITUTION DOMAIN MAPPING
|
|
# ============================================================================
|
|
|
|
# Map email domains to known heritage institutions.
# Format: domain -> (name, type, ghcid_if_known)
# `type` is one of 'museum', 'archive', 'library', 'university',
# 'government', 'research'; the GHCID is None when no identifier is known.
HERITAGE_DOMAIN_MAP: Dict[str, Tuple[str, str, Optional[str]]] = {
    # National Archives & Libraries
    'nationaalarchief.nl': ('Nationaal Archief', 'archive', 'NL-ZH-DHA-A-NA'),
    # gahetna.nl ("ga het na") is the former domain of the Nationaal Archief,
    # not of the Gelders Archief (which uses geldersarchief.nl, listed below).
    'gahetna.nl': ('Nationaal Archief', 'archive', 'NL-ZH-DHA-A-NA'),
    'kb.nl': ('Koninklijke Bibliotheek', 'library', 'NL-ZH-DHA-L-KB'),
    'rijksmuseum.nl': ('Rijksmuseum', 'museum', 'NL-NH-AMS-M-RA'),
    'vangoghmuseum.nl': ('Van Gogh Museum', 'museum', 'NL-NH-AMS-M-VGM'),
    'rkd.nl': ('RKD - Nederlands Instituut voor Kunstgeschiedenis', 'research', 'NL-ZH-DHA-R-RKD'),
    'cultureelerfgoed.nl': ('Rijksdienst voor het Cultureel Erfgoed', 'government', 'NL-UT-AME-O-RCE'),
    'erfgoedinspectie.nl': ('Inspectie Overheidsinformatie en Erfgoed', 'government', None),

    # KNAW Institutes
    'niod.knaw.nl': ('NIOD Instituut voor Oorlogs-, Holocaust- en Genocidestudies', 'research', 'NL-NH-AMS-R-NIOD'),
    'huygens.knaw.nl': ('Huygens Instituut', 'research', 'NL-NH-AMS-R-HI'),
    'meertens.knaw.nl': ('Meertens Instituut', 'research', 'NL-NH-AMS-R-MI'),
    'dans.knaw.nl': ('Data Archiving and Networked Services', 'archive', 'NL-ZH-DHA-A-DANS'),
    'knaw.nl': ('Koninklijke Nederlandse Akademie van Wetenschappen', 'research', None),

    # Provincial/Regional Archives
    'regionaalarchieftilburg.nl': ('Regionaal Archief Tilburg', 'archive', None),
    'bhic.nl': ('Brabants Historisch Informatie Centrum', 'archive', None),
    'hetutrechtsarchief.nl': ('Het Utrechts Archief', 'archive', 'NL-UT-UTR-A-HUA'),
    'regionaalarchiefdordrecht.nl': ('Regionaal Archief Dordrecht', 'archive', None),
    'archiefenschede.nl': ('Historisch Centrum Overijssel', 'archive', None),
    'westfriesarchief.nl': ('Westfries Archief', 'archive', None),
    'nbha.nl': ('Noord-Brabants Historisch Archief', 'archive', None),
    'regionaalarchiefalkmaar.nl': ('Regionaal Archief Alkmaar', 'archive', None),
    'drentsarchief.nl': ('Drents Archief', 'archive', None),
    'groningerarchieven.nl': ('Groninger Archieven', 'archive', None),
    'tresoar.nl': ('Tresoar', 'archive', None),
    'zeelandarchief.nl': ('Zeeuws Archief', 'archive', None),

    # City Archives (and the municipal/provincial bodies they sit under)
    'stadsarchief.rotterdam.nl': ('Stadsarchief Rotterdam', 'archive', None),
    'stadsarchief.amsterdam.nl': ('Stadsarchief Amsterdam', 'archive', 'NL-NH-AMS-A-SAA'),
    'erfgoedcentrumzutphen.nl': ('Erfgoedcentrum Zutphen', 'archive', None),
    'erfgoedleiden.nl': ('Erfgoed Leiden en Omstreken', 'archive', None),
    'noord-holland.nl': ('Provincie Noord-Holland', 'government', None),
    'amsterdam.nl': ('Gemeente Amsterdam', 'government', None),
    'rotterdam.nl': ('Gemeente Rotterdam', 'government', None),
    'denhaag.nl': ('Gemeente Den Haag', 'government', None),
    'hoorn.nl': ('Gemeente Hoorn', 'government', None),
    'hhnk.nl': ('Hoogheemraadschap Hollands Noorderkwartier', 'government', None),

    # Museums
    'openluchtmuseum.nl': ('Nederlands Openluchtmuseum', 'museum', None),
    'mauritshuis.nl': ('Mauritshuis', 'museum', 'NL-ZH-DHA-M-MH'),
    'eye.nl': ('Eye Filmmuseum', 'museum', 'NL-NH-AMS-M-EYE'),
    'hermitage.nl': ('Hermitage Amsterdam', 'museum', None),
    'stedelijk.nl': ('Stedelijk Museum Amsterdam', 'museum', 'NL-NH-AMS-M-SMA'),
    'tropenmuseum.nl': ('Tropenmuseum', 'museum', None),
    'naturalis.nl': ('Naturalis Biodiversity Center', 'museum', 'NL-ZH-LEI-M-NBC'),
    'museumboijmans.nl': ('Museum Boijmans Van Beuningen', 'museum', None),
    'kunstmuseum.nl': ('Kunstmuseum Den Haag', 'museum', None),
    'annefrank.nl': ('Anne Frank Huis', 'museum', 'NL-NH-AMS-M-AFH'),
    'amsterdammuseum.nl': ('Amsterdam Museum', 'museum', 'NL-NH-AMS-M-AM'),
    'joods-historisch-museum.nl': ('Joods Historisch Museum', 'museum', None),
    'jhm.nl': ('Joods Historisch Museum', 'museum', None),
    'verzetsmuseum.org': ('Verzetsmuseum', 'museum', None),
    'scheepvaartmuseum.nl': ('Het Scheepvaartmuseum', 'museum', None),
    'hetnoordbrabantsmuseum.nl': ('Het Noordbrabants Museum', 'museum', None),
    'gemeentemuseum.nl': ('Kunstmuseum Den Haag', 'museum', None),
    'centraalmuseum.nl': ('Centraal Museum', 'museum', 'NL-UT-UTR-M-CM'),
    'fryskmuseum.nl': ('Fries Museum', 'museum', None),
    'groningermuseum.nl': ('Groninger Museum', 'museum', None),
    'museumhetvalkhof.nl': ('Museum Het Valkhof', 'museum', None),
    # NOTE(review): generic domain; the original author observed it is often
    # RMO, but this attribution should be confirmed against the data.
    'museum.nl': ('Rijksmuseum van Oudheden', 'museum', None),  # generic, but often RMO
    'rfrankenhuis.nl': ('Museum De Fundatie', 'museum', None),
    'maritiemmuseum.nl': ('Maritiem Museum Rotterdam', 'museum', None),
    'paleishetloo.nl': ('Paleis Het Loo', 'museum', None),
    'slotloevestein.nl': ('Slot Loevestein', 'museum', None),

    # Universities (Humanities/Heritage departments)
    'uu.nl': ('Universiteit Utrecht', 'university', None),
    'students.uu.nl': ('Universiteit Utrecht (student)', 'university', None),
    'uva.nl': ('Universiteit van Amsterdam', 'university', None),
    'student.uva.nl': ('Universiteit van Amsterdam (student)', 'university', None),
    'vu.nl': ('Vrije Universiteit Amsterdam', 'university', None),
    'student.vu.nl': ('Vrije Universiteit Amsterdam (student)', 'university', None),
    'rug.nl': ('Rijksuniversiteit Groningen', 'university', None),
    'student.rug.nl': ('Rijksuniversiteit Groningen (student)', 'university', None),
    'leidenuniv.nl': ('Universiteit Leiden', 'university', None),
    'hum.leidenuniv.nl': ('Universiteit Leiden - Humanities', 'university', None),
    'umail.leidenuniv.nl': ('Universiteit Leiden', 'university', None),
    'cdh.leidenuniv.nl': ('Universiteit Leiden - Centre for Digital Humanities', 'university', None),
    'ru.nl': ('Radboud Universiteit', 'university', None),
    'let.ru.nl': ('Radboud Universiteit - Letteren', 'university', None),
    'jur.ru.nl': ('Radboud Universiteit - Rechtsgeleerdheid', 'university', None),
    'student.ru.nl': ('Radboud Universiteit (student)', 'university', None),
    'tudelft.nl': ('TU Delft', 'university', None),
    'student.tudelft.nl': ('TU Delft (student)', 'university', None),
    'tue.nl': ('TU Eindhoven', 'university', None),
    'utwente.nl': ('Universiteit Twente', 'university', None),
    'maastrichtuniversity.nl': ('Maastricht University', 'university', None),
    'tilburguniversity.edu': ('Tilburg University', 'university', None),
    'eur.nl': ('Erasmus Universiteit Rotterdam', 'university', None),
    'eshcc.eur.nl': ('Erasmus Universiteit - ESHCC', 'university', None),
    'wur.nl': ('Wageningen University & Research', 'university', None),
    'ou.nl': ('Open Universiteit', 'university', None),

    # Hogescholen (Universities of Applied Sciences)
    'hva.nl': ('Hogeschool van Amsterdam', 'university', None),
    'student.hu.nl': ('Hogeschool Utrecht (student)', 'university', None),
    'student.fontys.nl': ('Fontys Hogescholen (student)', 'university', None),

    # Government
    'minbuza.nl': ('Ministerie van Buitenlandse Zaken', 'government', None),
    'mindef.nl': ('Ministerie van Defensie', 'government', None),
    'minaz.nl': ('Ministerie van Algemene Zaken', 'government', None),
    'minez.nl': ('Ministerie van Economische Zaken', 'government', None),
    'minienw.nl': ('Ministerie van I&W', 'government', None),
    'rws.nl': ('Rijkswaterstaat', 'government', None),
    'minvws.nl': ('Ministerie van VWS', 'government', None),
    'minjenv.nl': ('Ministerie van J&V', 'government', None),
    'minszw.nl': ('Ministerie van SZW', 'government', None),
    'minbzk.nl': ('Ministerie van BZK', 'government', None),
    'minlnv.nl': ('Ministerie van LNV', 'government', None),
    'minocw.nl': ('Ministerie van OCW', 'government', None),
    'minfin.nl': ('Ministerie van Financien', 'government', None),
    'belastingdienst.nl': ('Belastingdienst', 'government', None),
    'rijksoverheid.nl': ('Rijksoverheid', 'government', None),
    'politie.nl': ('Politie Nederland', 'government', None),
    'kadaster.nl': ('Kadaster', 'government', None),
    'rvo.nl': ('Rijksdienst voor Ondernemend Nederland', 'government', None),
    'rivm.nl': ('Rijksinstituut voor Volksgezondheid en Milieu', 'government', None),
    'staatsbosbeheer.nl': ('Staatsbosbeheer', 'government', None),
    'vng.nl': ('Vereniging van Nederlandse Gemeenten', 'government', None),

    # Libraries (Public/Academic)
    'oba.nl': ('Openbare Bibliotheek Amsterdam', 'library', None),
    'bibliotheekrotterdam.nl': ('Bibliotheek Rotterdam', 'library', None),
    'ubvu.vu.nl': ('VU Bibliotheek', 'library', None),
    'library.uu.nl': ('Universiteitsbibliotheek Utrecht', 'library', None),

    # Research Organizations
    'nwo.nl': ('Nederlandse Organisatie voor Wetenschappelijk Onderzoek', 'research', None),
    'fwo.be': ('Fonds Wetenschappelijk Onderzoek', 'research', None),
    'knir.it': ('Koninklijk Nederlands Instituut Rome', 'research', None),
    'cbg.nl': ('Centrum voor Familiegeschiedenis (CBG)', 'research', None),
    'kitlv.nl': ('Koninklijk Instituut voor Taal-, Land- en Volkenkunde', 'research', None),
    'kit.nl': ('KIT Royal Tropical Institute', 'research', None),

    # Archaeology/Heritage Research
    'baac.nl': ('BAAC Archeologie', 'research', None),
    'raap.nl': ('RAAP Archeologisch Adviesbureau', 'research', None),

    # Veterans & Military Heritage
    'veteranen.nl': ('Veteraneninstituut', 'museum', None),

    # Additional Regional Archives (discovered via domain analysis)
    'archiefalkmaar.nl': ('Regionaal Archief Alkmaar', 'archive', None),
    'historischcentrumoverijssel.nl': ('Historisch Centrum Overijssel', 'archive', None),
    'westbrabantsarchief.nl': ('West-Brabants Archief', 'archive', None),
    'haagshistorischmuseum.nl': ('Haags Historisch Museum', 'museum', None),
    'zeeuwsarchief.nl': ('Zeeuws Archief', 'archive', None),
    'erfgoedhuis-zh.nl': ('Erfgoedhuis Zuid-Holland', 'archive', None),
    'noord-hollandsarchief.nl': ('Noord-Hollands Archief', 'archive', 'NL-NH-HAA-A-NHA'),
    'geldersarchief.nl': ('Gelders Archief', 'archive', None),
    'erfgoedbrabant.nl': ('Erfgoed Brabant', 'archive', None),
    'waterlandsarchief.nl': ('Waterlands Archief', 'archive', None),
    'erfgoedservice.nl': ('Erfgoed Service', 'archive', None),

    # Belgian Heritage (for completeness)
    'arch.be': ('Rijksarchief België', 'archive', None),
    'kbr.be': ('Koninklijke Bibliotheek België', 'library', None),
    'kikirpa.be': ('Koninklijk Instituut voor het Kunstpatrimonium', 'research', None),
}
|
|
|
|
# Mailbox providers used by private individuals. An address on one of these
# domains carries no institutional affiliation signal.
CONSUMER_DOMAINS: Set[str] = {
    *(  # International webmail
        'gmail.com', 'gmail.nl',
        'hotmail.com', 'hotmail.nl',
        'outlook.com', 'outlook.nl',
        'live.nl', 'live.com',
        'msn.com',
        'yahoo.com', 'yahoo.nl', 'yahoo.co.uk',
        'icloud.com', 'icloud.nl', 'me.com', 'mac.com',
        'aol.nl', 'aol.com',
    ),
    *(  # Dutch ISPs
        'ziggo.nl', 'kpnmail.nl', 'kpnplanet.nl', 'planet.nl', 'hetnet.nl',
        'xs4all.nl', 'casema.nl', 'home.nl', 'upcmail.nl', 'chello.nl',
        'quicknet.nl', 'zonnet.nl', 'tele2.nl', 'solcon.nl', 'zeelandnet.nl',
        'wxs.nl', 'telfort.nl', 'telfortglasvezel.nl', 'online.nl',
        'hccnet.nl', 'kabelfoon.nl',
        'caiway.nl', 'tiscali.nl', 'versatel.nl', 'freeler.nl', 'kliksafe.nl',
        'dds.nl', 'freedom.nl', 'xmsnet.nl', 'inter.nl.net', 'euronet.nl',
        'onsbrabantnet.nl', 'concepts.nl', 'worldonline.nl', 'delta.nl',
        'lijbrandt.nl',
        't-mobilethuis.nl', 'compaqnet.nl', 'filternet.nl', 'onsmail.nl',
        'box.nl',
        'knid.nl', 'kabelnoord.nl',  # Kabelnoord ISP (Northern Netherlands/Friesland)
    ),
    *(  # Belgian
        'telenet.be', 'skynet.be', 'proximus.be',
    ),
    *(  # German
        'gmx.de', 'web.de', 't-online.de',
    ),
    *(  # Generic
        'mail.com', 'email.com', 'protonmail.com', 'pm.me', 'mailinator.com',
    ),
}
|
|
|
|
# Dutch surname prefixes ("tussenvoegsels"), both as separate words and in
# the run-together spellings that show up in email addresses.
DUTCH_PREFIXES: Set[str] = set((
    # Single-word prefixes
    'van', 'de', 'den', 'der', 'het', 'ter', 'ten', 'te', 'op', 'in',
    # Run-together compounds
    'vande', 'vander', 'vanden', 'vanhet', 'vanten', 'vanter',
    # Abbreviation of van de / van der
    'vd',
    # As in 't Hoen
    "'t",
))
|
|
|
|
|
|
# ============================================================================
|
|
# BIRTH YEAR EXTRACTION
|
|
# ============================================================================
|
|
|
|
# Four-digit years accepted as birth-year candidates: 1930-2015.
_BIRTH_YEAR = r'(?:19[3-9][0-9]|200[0-9]|201[0-5])'
_YEAR_AT_START = re.compile(rf'^({_BIRTH_YEAR})')
_YEAR_AT_END = re.compile(rf'[^0-9]({_BIRTH_YEAR})$')
_YEAR_ONLY = re.compile(rf'^({_BIRTH_YEAR})$')
_YEAR_EMBEDDED = re.compile(rf'(?<=[^0-9])({_BIRTH_YEAR})(?=[^0-9])')
# Trailing YYYYMMDD / YYMMDD stamps (e.g. test20180702, test210830).
_DATE_SUFFIX_8 = re.compile(r'(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$')
_DATE_SUFFIX_6 = re.compile(r'\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$')
_TRAILING_DIGITS = re.compile(r'\d+$')
_TWO_DIGIT_SUFFIX = re.compile(r'[^0-9](\d{2})$')


def _is_plausible_birth_year(year: int) -> bool:
    """Return True when *year* implies an age of 10-95 relative to CURRENT_YEAR."""
    return 10 <= CURRENT_YEAR - year <= 95


def extract_birth_year(local_part: str) -> Tuple[Optional[int], float, Optional[str]]:
    """
    Extract a probable birth year from an email local part.

    A ``+tag`` sub-address is ignored, matching what
    ``extract_name_components`` does: the tag is chosen per mailing and says
    nothing about the mailbox owner.

    Args:
        local_part: The part of the address before the ``@``.

    Returns:
        Tuple of (year, confidence, position)
        - year: 4-digit year if found, None otherwise
        - confidence: 0.0-1.0 confidence score
        - position: 'start', 'end', 'embedded', or None

    Examples:
        '1948mausti' -> (1948, 0.9, 'start')
        'michiel.huizing.1970' -> (1970, 0.85, 'end')
        'j.devries65' -> (1965, 0.6, 'end')  # 2-digit year
        'bob791120061' -> (None, 0.0, None)  # long digit run, ambiguous
        'test20180702' -> (None, 0.0, None)  # date suffix, not a birth year
    """
    no_signal = (None, 0.0, None)
    if not local_part:
        return no_signal

    # Drop the sub-address so that e.g. "jan+news2010" yields no year.
    local_part = local_part.split('+', 1)[0]
    if not local_part:
        return no_signal

    # A trailing date stamp is not a birth year. Both patterns already
    # validate month (01-12) and day (01-31), so no second check is needed.
    if _DATE_SUFFIX_8.search(local_part) or _DATE_SUFFIX_6.search(local_part):
        return no_signal

    # More than four trailing digits is not a clean year (e.g. bob791120061).
    digit_tail = _TRAILING_DIGITS.search(local_part)
    if digit_tail and len(digit_tail.group(0)) > 4:
        return no_signal

    # Year at the start, followed by something that is not another digit.
    at_start = _YEAR_AT_START.match(local_part)
    if at_start:
        year = int(at_start.group(1))
        remainder = local_part[4:]
        if _is_plausible_birth_year(year) and remainder and not remainder[0].isdigit():
            return year, 0.9, 'start'

    # Year at the end, directly preceded by a non-digit
    # (michiel.huizing.1970, j.devries1965).
    at_end = _YEAR_AT_END.search(local_part)
    if at_end:
        year = int(at_end.group(1))
        if _is_plausible_birth_year(year):
            return year, 0.85, 'end'

    # The local part is nothing but a year.
    year_only = _YEAR_ONLY.match(local_part)
    if year_only:
        year = int(year_only.group(1))
        if _is_plausible_birth_year(year):
            return year, 0.7, 'end'

    # Exactly one year embedded between non-digits (lower confidence).
    embedded = _YEAR_EMBEDDED.findall(local_part)
    if len(embedded) == 1:
        year = int(embedded[0])
        if _is_plausible_birth_year(year):
            return year, 0.5, 'embedded'

    # Two-digit year at the end (least reliable): 30-99 -> 19xx, 00-15 -> 20xx.
    # Both centuries use the same plausibility window; a fixed upper age for
    # 20xx years would silently start rejecting them as time passes.
    short_year = _TWO_DIGIT_SUFFIX.search(local_part)
    if short_year:
        two_digit = int(short_year.group(1))
        if 30 <= two_digit <= 99:
            year, confidence = 1900 + two_digit, 0.6
        elif two_digit <= 15:
            year, confidence = 2000 + two_digit, 0.5
        else:
            year, confidence = None, 0.0
        if year is not None and _is_plausible_birth_year(year):
            return year, confidence, 'end'

    return no_signal
|
|
|
|
|
|
# ============================================================================
|
|
# NAME COMPONENT EXTRACTION
|
|
# ============================================================================
|
|
|
|
def _concat_prefix_pattern(prefix: str, with_initial: bool) -> "re.Pattern":
    """Build the regex for ``<first name>[<initial>]<prefix><surname>``."""
    head = r'([a-z]+?)([a-z])' if with_initial else r'([a-z]{3,})'
    return re.compile(rf'^{head}{prefix}([a-z]+)$')


# Rules for local parts without any separator, e.g. "josselinpdewit".
# Each rule is (pattern, expanded prefix tokens). Compound prefixes come
# first: every string containing "vander" also contains "van" and "de", so
# the short rules would otherwise always win and tear the prefix apart.
# Where a "with initial" rule exists for a prefix, the letter directly before
# the prefix is read as a middle initial.
# NOTE(review): that also turns "jandewit" into ja + n + de + wit; telling the
# two readings apart would need a first-name lexicon.
_CONCAT_PREFIX_RULES = tuple(
    (_concat_prefix_pattern(prefix, with_initial), tokens)
    for prefix, tokens, with_initial in (
        ('vander', ('van', 'der'), True),
        ('vanden', ('van', 'den'), True),
        ('vande', ('van', 'de'), False),
        ('van', ('van',), True),
        ('de', ('de',), True),
        ('ter', ('ter',), False),
        ('ten', ('ten',), False),
    )
)

# Rules for a single name part that starts with a run-together prefix,
# e.g. "vanderberg" -> van, der, berg. Order matters: longest prefix first.
_PART_PREFIX_RULES = tuple(
    (re.compile(pattern), tokens)
    for pattern, tokens in (
        (r'^vander(.+)$', ('van', 'der')),
        (r'^vanden(.+)$', ('van', 'den')),
        (r'^vande(.+)$', ('van', 'de')),
        (r'^vant(.+)$', ('van', "'t")),
        # The compound forms are tried above, so "van" may be followed by any
        # letter, including "d" (vandijk, vandam).
        (r'^van(.+)$', ('van',)),
        (r'^de([^n].*)$', ('de',)),  # "den..." is handled by the next rule
        (r'^den(.+)$', ('den',)),
        (r'^ter(.+)$', ('ter',)),
        (r'^ten(.+)$', ('ten',)),
    )
)

_NAME_SEPARATORS = (('.', 'dotted'), ('_', 'underscored'), ('-', 'hyphenated'))


def extract_name_components(local_part: str) -> Tuple[List[str], Optional[str], bool]:
    """
    Extract name components from an email local part.

    A ``+tag`` sub-address and any leading or trailing digits are removed
    first. The remainder is split on the first separator kind present
    ('.', '_' or '-'); without a separator, the concatenated-name rules are
    tried. All returned components are lowercase.

    Args:
        local_part: The part of the address before the ``@``.

    Returns:
        Tuple of (names, pattern_type, has_dutch_prefix)
        - names: List of name components
        - pattern_type: Description of the pattern detected
        - has_dutch_prefix: True if a Dutch prefix was detected (van, de, etc.)

    Examples:
        'michiel.huizing' -> (['michiel', 'huizing'], 'firstname.lastname', False)
        'j.devries' -> (['j', 'de', 'vries'], 'initial.lastname', True)
        'p.vanderberg' -> (['p', 'van', 'der', 'berg'], 'initial.lastname', True)
        'jan.de.vries' -> (['jan', 'de', 'vries'], 'firstname.prefix.lastname', True)
        'josselinpdewit' -> (['josselin', 'p', 'de', 'wit'], 'firstname.prefix.lastname', True)
    """
    if not local_part:
        return [], None, False

    # Remove Gmail-style + suffixes, then digits at either end (likely a
    # birth year or counter rather than part of the name).
    clean = re.sub(r'\+.*$', '', local_part)
    clean = re.sub(r'\d+$', '', clean)
    clean = re.sub(r'^\d+', '', clean)
    if not clean:
        return [], None, False
    clean = clean.lower()

    has_dutch_prefix = False

    for separator, label in _NAME_SEPARATORS:
        if separator in clean:
            parts = [piece for piece in clean.split(separator) if piece]
            pattern_type = label
            break
    else:
        # No separator: try to cut the string at an embedded Dutch prefix.
        parts = [clean]
        pattern_type = 'concatenated'
        for pattern, tokens in _CONCAT_PREFIX_RULES:
            found = pattern.match(clean)
            if found:
                *leading, surname = found.groups()
                parts = [*leading, *tokens, surname]
                has_dutch_prefix = True
                break

    names: List[str] = []
    last_index = len(parts) - 1
    # When a later multi-letter part exists, the first part is taken to be a
    # given name and is not split; otherwise "dennis.bakker" would become
    # den + nis + bakker and "vanessa.visser" van + essa + visser.
    first_is_given_name = any(len(piece) > 1 for piece in parts[1:])

    for index, part in enumerate(parts):
        # A prefix written as its own part ("jan.de.vries") must be followed
        # by something to count as a prefix.
        if index < last_index and part in DUTCH_PREFIXES:
            names.append(part)
            has_dutch_prefix = True
            continue

        if index == 0 and first_is_given_name:
            names.append(part)
            continue

        for pattern, tokens in _PART_PREFIX_RULES:
            found = pattern.match(part)
            if found:
                names.extend(tokens)
                names.append(found.group(1))
                has_dutch_prefix = True
                break
        else:
            names.append(part)

    # With two or more components, replace the separator label by a more
    # specific description.
    if len(names) >= 2:
        dotted = '.' in local_part
        if len(names[0]) == 1:
            pattern_type = 'initial.lastname' if dotted else 'initiallastname'
        elif has_dutch_prefix:
            pattern_type = 'firstname.prefix.lastname'
        else:
            pattern_type = 'firstname.lastname' if dotted else 'firstnamelastname'

    return names, pattern_type, has_dutch_prefix
|
|
|
|
|
|
def identify_first_last_name(names: List[str], has_dutch_prefix: bool) -> Tuple[Optional[str], Optional[str], List[str]]:
    """
    Identify first name, last name, and middle names from extracted components.

    The last name starts at the first Dutch prefix token found after the
    first component and runs to the end, so ['jan', 'van', 'der', 'berg']
    gives the last name 'van der berg'. Without a prefix token the final
    component is the last name and everything in between is a middle name.

    Args:
        names: Name components in the order they appear in the address.
        has_dutch_prefix: Caller's hint that a prefix was detected. Prefix
            tokens are looked up in ``names`` regardless of this flag, so a
            list that contains 'de' or 'van' is split correctly even when
            the flag was not set.

    Returns:
        Tuple of (first_name, last_name, middle_names)
        - first_name is None when the first component is a single-letter
          initial, or when the components start with a surname prefix.
        - last_name is None when there is at most one component.
    """
    if not names:
        return None, None, []

    if len(names) == 1:
        # A lone component could be either a first or a last name.
        return names[0], None, []

    # Components that begin with a prefix ("de", "vries") carry only a
    # surname; reporting the prefix as a first name would be wrong.
    if names[0].lower() in DUTCH_PREFIXES:
        return None, ' '.join(names), []

    first_name = names[0] if len(names[0]) > 1 else None

    # Default split: the final component alone is the last name.
    surname_start = len(names) - 1
    for position in range(1, len(names)):
        if names[position].lower() in DUTCH_PREFIXES:
            surname_start = position
            break

    last_name = ' '.join(names[surname_start:])
    middle_names = names[1:surname_start]
    return first_name, last_name, middle_names
|
|
|
|
|
|
# ============================================================================
|
|
# DOMAIN ANALYSIS
|
|
# ============================================================================
|
|
|
|
# Second-level label of Dutch ministry domains: "min" plus a short
# abbreviation (minezk, minjus, minvenj, ...).
_MINISTRY_LABEL = re.compile(r'min[a-z]{2,4}')
_UNIVERSITY_HINTS = ('universit', 'univ.', '.edu', 'student.', 'students.')
# "archief", "archieven", "archive(s)", "archiv" - but not "architect".
_ARCHIVE_HINTS = ('archief', 'archiev', 'archiv', 'erfgoed')
_LIBRARY_HINTS = ('biblio', 'library')


def analyze_domain(domain: str) -> Tuple[Optional[str], Optional[str], Optional[str], bool, bool]:
    """
    Analyze an email domain for institutional affiliation.

    Lookup order: consumer mailbox providers, then the heritage domain map
    (the domain itself, then each parent domain from most to least specific),
    then keyword heuristics for domains that are not in the map.

    Args:
        domain: The part of the address after the ``@``; case and
            surrounding whitespace are ignored.

    Returns:
        Tuple of (institution_name, institution_type, ghcid, is_institutional, is_consumer)
        - An empty domain is reported as (None, None, None, False, True).
        - Heuristic matches supply only the type; name and ghcid stay None.
    """
    if not domain:
        return None, None, None, False, True

    domain = domain.lower().strip()

    if domain in CONSUMER_DOMAINS:
        return None, None, None, False, True

    # Exact domain first (start == 0), then parent domains, so that
    # "hum.leidenuniv.nl" wins over "leidenuniv.nl".
    labels = domain.split('.')
    for start in range(len(labels)):
        known = HERITAGE_DOMAIN_MAP.get('.'.join(labels[start:]))
        if known is not None:
            name, inst_type, ghcid = known
            return name, inst_type, ghcid, True, False

    # Heuristics for unknown domains; the first matching category wins.
    # Ministries are recognised by their registrable .nl label rather than by
    # a bare "min" prefix, which also matched minecraft.net and the like.
    looks_like_ministry = (
        len(labels) >= 2
        and labels[-1] == 'nl'
        and _MINISTRY_LABEL.fullmatch(labels[-2]) is not None
    )

    if domain.endswith('.overheid.nl') or looks_like_ministry:
        guessed_type = 'government'
    elif any(hint in domain for hint in _UNIVERSITY_HINTS):
        guessed_type = 'university'
    elif 'museum' in domain:
        guessed_type = 'museum'
    elif any(hint in domain for hint in _ARCHIVE_HINTS):
        guessed_type = 'archive'
    elif any(hint in domain for hint in _LIBRARY_HINTS):
        guessed_type = 'library'
    else:
        # Anything else, including other .nl domains, is left unclassified:
        # there is not enough information to call it institutional.
        guessed_type = None

    return None, guessed_type, None, guessed_type is not None, False
|
|
|
|
|
|
# ============================================================================
|
|
# MAIN PARSER
|
|
# ============================================================================
|
|
|
|
def parse_email_semantics(email: str) -> Optional[EmailSemantics]:
    """
    Parse an email address for identity signals.

    The address is stripped and lowercased before analysis; the ``email``
    field of the result keeps the value exactly as it was passed in.

    Args:
        email: Email address to analyze

    Returns:
        EmailSemantics object with extracted information, or None if the
        input is not a string, has no '@', or has an empty local part or
        domain.

    Example:
        >>> result = parse_email_semantics("j.devries1965@rijksmuseum.nl")
        >>> result.probable_birth_year
        1965
        >>> result.institution_name
        'Rijksmuseum'
        >>> result.extracted_names
        ['j', 'de', 'vries']
    """
    if not isinstance(email, str) or '@' not in email:
        return None

    # Split on the LAST '@': a domain can never contain one, so anything
    # before it belongs to the local part.
    local_part, _, domain = email.strip().lower().rpartition('@')
    if not local_part or not domain:
        return None

    birth_year, birth_confidence, birth_position = extract_birth_year(local_part)

    names, name_pattern, has_dutch_prefix = extract_name_components(local_part)
    first_name, last_name, middle_names = identify_first_last_name(names, has_dutch_prefix)

    inst_name, inst_type, ghcid, is_institutional, is_consumer = analyze_domain(domain)

    return EmailSemantics(
        email=email,
        local_part=local_part,
        domain=domain,

        # Birth year
        probable_birth_year=birth_year,
        birth_year_confidence=birth_confidence,
        birth_year_position=birth_position,

        # Institution
        institution_domain=domain if is_institutional else None,
        institution_name=inst_name,
        institution_type=inst_type,
        institution_ghcid=ghcid,

        # Names
        extracted_names=names,
        extracted_first_name=first_name,
        extracted_last_name=last_name,
        extracted_middle_names=middle_names,
        name_pattern=name_pattern,
        has_dutch_prefix=has_dutch_prefix,

        # Flags
        is_consumer_domain=is_consumer,
        is_institutional_domain=is_institutional,
        # A single component is too little to call it a name.
        has_name_in_email=len(names) >= 2,
    )
|
|
|
|
|
|
# ============================================================================
|
|
# MATCHING UTILITIES
|
|
# ============================================================================
|
|
|
|
# Punctuation trimmed from the edges of name tokens ("Vries," -> "vries",
# "J." -> "j"). The apostrophe is kept so that "'t" survives.
_NAME_EDGE_PUNCTUATION = '.,;:()[]"'


def email_matches_name(email_semantics: "EmailSemantics", full_name: str) -> Tuple[bool, float, List[str]]:
    """
    Check if email name components match a given full name.

    The full name is lowercased, stripped of diacritics and split on
    whitespace; punctuation at token edges is ignored. A component matches
    when it equals a name token, or when it is a single-letter initial and
    some not-yet-matched name token starts with that letter. Initials are
    credited even when other components matched exactly, so that
    "j.devries" scores higher against "Jan de Vries" than against
    "Piet de Vries".

    Args:
        email_semantics: Parsed address; only ``extracted_names`` is read.
        full_name: Name to compare against. A dict is accepted too, in which
            case its 'full_name' (or else 'name') entry is used; any other
            non-string value is converted with ``str``.

    Returns:
        Tuple of (matches, confidence, matched_components)
        - confidence: matched components divided by the larger of the two
          component counts.
        - matched_components: the matching email components, sorted.
    """
    # NOTE(review): prefix tokens such as "de" or "van" count as ordinary
    # matches, so two unrelated people who share only a prefix still match
    # with low confidence. Confirm whether callers apply a threshold.
    no_match = (False, 0.0, [])
    if not email_semantics or not full_name:
        return no_match

    # Handle dict or other types for full_name
    if isinstance(full_name, dict):
        full_name = full_name.get('full_name', full_name.get('name', str(full_name)))
    if not isinstance(full_name, str):
        full_name = str(full_name)

    # Normalize: lowercase and drop combining marks (é -> e).
    decomposed = unicodedata.normalize('NFKD', full_name.lower())
    unaccented = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    name_parts = {token.strip(_NAME_EDGE_PUNCTUATION) for token in unaccented.split()}
    name_parts.discard('')

    email_names = set(email_semantics.extracted_names)
    if not email_names:
        return no_match

    matched = email_names & name_parts

    # Initials may only claim name tokens that were not matched in full.
    unclaimed = name_parts - matched
    for candidate in email_names - matched:
        if len(candidate) == 1 and any(token.startswith(candidate) for token in unclaimed):
            matched.add(candidate)

    if not matched:
        return no_match

    total_parts = max(len(email_names), len(name_parts))
    return True, len(matched) / total_parts, sorted(matched)
|
|
|
|
|
|
def email_domain_matches_employer(email_semantics: "EmailSemantics", employer_domains: Set[str]) -> bool:
    """
    Check if the email domain matches any known employer domain.

    A match is an identical domain, or one of the two being a subdomain of
    the other (so "hum.leidenuniv.nl" and "leidenuniv.nl" match in either
    direction). The comparison ignores case and surrounding whitespace.

    Args:
        email_semantics: Parsed address; only ``domain`` is read.
        employer_domains: Domains associated with the employer.

    Returns:
        True on a match; False otherwise, including when either argument is
        empty or missing.
    """
    if not email_semantics or not employer_domains:
        return False

    domain = (email_semantics.domain or '').strip().lower()
    if not domain:
        return False

    for raw_domain in employer_domains:
        # Employer domains come from outside the parser and may not be
        # normalised the way the email domain is.
        employer = raw_domain.strip().lower() if isinstance(raw_domain, str) else ''
        if not employer:
            continue
        if (
            domain == employer
            or domain.endswith('.' + employer)
            or employer.endswith('.' + domain)
        ):
            return True

    return False
|
|
|
|
|
|
# ============================================================================
|
|
# BATCH ANALYSIS
|
|
# ============================================================================
|
|
|
|
def analyze_email_batch(emails: List[str]) -> Dict[str, EmailSemantics]:
    """
    Run the semantic parser over a collection of addresses.

    Empty entries and addresses the parser rejects are left out. When the
    same address occurs more than once, the result of the last occurrence
    is kept.

    Returns:
        Dict mapping email -> EmailSemantics
    """
    analyzed = {}
    for address in emails:
        if not address:
            continue
        parsed = parse_email_semantics(address)
        if not parsed:
            continue
        analyzed[address] = parsed
    return analyzed
|
|
|
|
|
|
def get_domain_statistics(email_semantics_list: List["EmailSemantics"], top_n: Optional[int] = 50) -> Dict[str, int]:
    """
    Count addresses per institution.

    Only entries flagged as institutional are counted. They are keyed by
    institution name when one is known and by raw domain otherwise.

    Args:
        email_semantics_list: Parsed addresses to tally.
        top_n: Maximum number of institutions to return, most frequent
            first. Pass None to return all of them.

    Returns:
        Dict mapping institution name (or domain) -> number of addresses,
        in descending order of count.
    """
    from collections import Counter

    counts = Counter(
        es.institution_name or es.domain
        for es in email_semantics_list
        if es.is_institutional_domain
    )
    return dict(counts.most_common(top_n))
|
|
|
|
|
|
def get_birth_year_statistics(email_semantics_list: List["EmailSemantics"], min_confidence: float = 0.5) -> Dict[str, int]:
    """
    Count extracted birth years per decade.

    Args:
        email_semantics_list: Parsed addresses to tally.
        min_confidence: Birth years with a confidence below this value are
            ignored.

    Returns:
        Dict mapping decade label (e.g. '1960s') -> number of addresses,
        in descending order of count.
    """
    from collections import Counter

    decades = Counter(
        f"{(es.probable_birth_year // 10) * 10}s"
        for es in email_semantics_list
        if es.probable_birth_year and es.birth_year_confidence >= min_confidence
    )
    return dict(decades.most_common())
|
|
|
|
|
|
# ============================================================================
|
|
# CLI INTERFACE
|
|
# ============================================================================
|
|
|
|
if __name__ == '__main__':
    import sys

    def _report_single(email):
        """Print the full analysis of one address given on the command line."""
        result = parse_email_semantics(email)
        if not result:
            print(f"Could not parse: {email}")
            return
        print(f"Email: {result.email}")
        print(f" Birth year: {result.probable_birth_year} (confidence: {result.birth_year_confidence:.2f})")
        print(f" Institution: {result.institution_name or result.domain}")
        print(f" Institution type: {result.institution_type}")
        print(f" Names: {result.extracted_names}")
        print(f" First name: {result.extracted_first_name}")
        print(f" Last name: {result.extracted_last_name}")
        print(f" Pattern: {result.name_pattern}")
        print(f" Dutch prefix: {result.has_dutch_prefix}")
        print(f" Is institutional: {result.is_institutional_domain}")

    def _report_samples():
        """Print a short analysis for each built-in sample address."""
        test_emails = [
            "1948mausti@ziggo.nl",
            "michiel.huizing.1970@gmail.com",
            "j.devries1965@rijksmuseum.nl",
            "p.vanderberg@nationaalarchief.nl",
            "josselinpdewit@hetnet.nl",
            "sabrinavisser1992@hotmail.com",
            "test@hum.leidenuniv.nl",
            "arnold.oppelaar+test21082019@gmail.com",
            "h.vandenheuvel@rkd.nl",
        ]

        print("Email Semantic Analysis Test Results")
        print("=" * 70)

        for email in test_emails:
            result = parse_email_semantics(email)
            if not result:
                # Unparseable samples are skipped without a message.
                continue
            print(f"\n{email}")
            print(f" Birth year: {result.probable_birth_year} (conf: {result.birth_year_confidence:.1f})")
            print(f" Institution: {result.institution_name or '-'}")
            print(f" Names: {result.extracted_names}")
            print(f" Pattern: {result.name_pattern}")

    # With an argument: analyze that address. Without: run the samples.
    if len(sys.argv) > 1:
        _report_single(sys.argv[1])
    else:
        _report_samples()
|