glam/scripts/enrich_ppids.py
kempersc dd0ee2cf11 feat(scripts): expand university location mappings and add web enrichment
- enrich_ppids.py: Add 40+ Dutch universities and hogescholen to location mapping
- enrich_ppids_web.py: New script for web-based PPID enrichment
- resolve_pending_known_orgs.py: Updates for pending org resolution
2026-01-09 21:10:14 +01:00

1399 lines
58 KiB
Python

#!/usr/bin/env python3
"""
PPID Enrichment Script (Rule 45 Compliant)
Enriches PPID files with EXPLICIT inferred data:
1. inferred_birth_decade - From earliest career observations
2. inferred_birth_settlement - From earliest school/university location
3. inferred_current_settlement - From current work location
All inferred data includes full provenance chains per Rule 45:
- Each inference step is documented
- Source observations are linked
- Confidence levels are assigned
- Inferred values NEVER silently replace canonical fields
Reference:
- .opencode/rules/inferred-data-explicit-provenance-rule.md (Rule 45)
- .opencode/rules/ppid-birth-date-enrichment-rule.md (Rule 44)
"""
import json
import os
import re
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple, List, Dict, Any
# GeoNames admin1_code to ISO 3166-2 mapping for Netherlands.
# Keys are GeoNames' zero-padded numeric province codes for NL; values are the
# ISO 3166-2:NL province suffixes (the full code is "NL-" + value).
# NOTE(review): codes 08 and 12-14 are absent; geocode_location() maps unknown
# codes to "XX" — confirm these gaps match the GeoNames dump actually in use.
NL_ADMIN1_TO_ISO = {
    "01": "DR",  # Drenthe
    "02": "FR",  # Friesland
    "03": "GE",  # Gelderland
    "04": "GR",  # Groningen
    "05": "LI",  # Limburg
    "06": "NB",  # Noord-Brabant
    "07": "NH",  # Noord-Holland
    "09": "UT",  # Utrecht
    "10": "ZE",  # Zeeland
    "11": "ZH",  # Zuid-Holland
    "15": "OV",  # Overijssel
    "16": "FL",  # Flevoland
}
# Common country-specific admin1 mappings.
# Per-country translation tables keyed by ISO 3166-1 alpha-2 country code.
# geocode_location() consults this first; for countries not listed it falls
# back to the first two characters of the raw GeoNames admin1 code.
COUNTRY_ADMIN1_MAPPINGS = {
    "NL": NL_ADMIN1_TO_ISO,
}
# Known university location mappings: institution name -> (city, country code).
#
# ORDER MATTERS: the matcher (infer_birth_settlement) scans these keys in
# insertion order with a case-insensitive substring test and stops at the
# FIRST hit. Generic prefixes and short, collision-prone acronyms therefore
# live in the trailing "fallback" section so they cannot shadow the specific
# entries (previously "Hogeschool" shadowed every "Hogeschool X" entry,
# "ROC" shadowed all specific ROC entries, "CHE" matched inside
# "Technische"/"München", "UT" inside "Utrecht", "UCL" inside "UCLA", etc.).
DUTCH_UNI_LOCATIONS = {
    "Universiteit Utrecht": ("Utrecht", "NL"),
    "Utrecht University": ("Utrecht", "NL"),
    "UU": ("Utrecht", "NL"),
    "Universiteit van Amsterdam": ("Amsterdam", "NL"),
    "University of Amsterdam": ("Amsterdam", "NL"),
    "UvA": ("Amsterdam", "NL"),
    "VU Amsterdam": ("Amsterdam", "NL"),
    "Vrije Universiteit": ("Amsterdam", "NL"),
    "Leiden University": ("Leiden", "NL"),
    "Universiteit Leiden": ("Leiden", "NL"),
    "TU Delft": ("Delft", "NL"),
    "Technische Universiteit Delft": ("Delft", "NL"),
    "TU Eindhoven": ("Eindhoven", "NL"),
    "Technische Universiteit Eindhoven": ("Eindhoven", "NL"),
    "Radboud": ("Nijmegen", "NL"),
    "Radboud Universiteit": ("Nijmegen", "NL"),
    "Rijksuniversiteit Groningen": ("Groningen", "NL"),
    "University of Groningen": ("Groningen", "NL"),
    "RUG": ("Groningen", "NL"),
    "Maastricht University": ("Maastricht", "NL"),
    "Universiteit Maastricht": ("Maastricht", "NL"),
    "Erasmus": ("Rotterdam", "NL"),
    "Erasmus Universiteit": ("Rotterdam", "NL"),
    "Erasmus University Rotterdam": ("Rotterdam", "NL"),
    "Tilburg University": ("Tilburg", "NL"),
    "Universiteit Tilburg": ("Tilburg", "NL"),
    "Wageningen": ("Wageningen", "NL"),
    "Wageningen University": ("Wageningen", "NL"),
    # Additional Dutch institutions
    "Hogeschool van Arnhem en Nijmegen": ("Nijmegen", "NL"),
    "Hogeschool Utrecht": ("Utrecht", "NL"),
    "Hogeschool van Amsterdam": ("Amsterdam", "NL"),
    "HvA": ("Amsterdam", "NL"),
    "Hogeschool Rotterdam": ("Rotterdam", "NL"),
    "Hogeschool Inholland": ("Amsterdam", "NL"),
    "Fontys": ("Eindhoven", "NL"),
    "Fontys Hogescholen": ("Eindhoven", "NL"),
    "Saxion": ("Enschede", "NL"),
    "Saxion Hogeschool": ("Enschede", "NL"),
    "Stenden": ("Leeuwarden", "NL"),
    "NHL Stenden": ("Leeuwarden", "NL"),
    "Hanzehogeschool": ("Groningen", "NL"),
    "Hanze": ("Groningen", "NL"),
    "Christelijke Hogeschool Ede": ("Ede", "NL"),
    "Avans": ("Breda", "NL"),
    "Avans Hogeschool": ("Breda", "NL"),
    "Windesheim": ("Zwolle", "NL"),
    "Hogeschool Windesheim": ("Zwolle", "NL"),
    "Zuyd Hogeschool": ("Maastricht", "NL"),
    "Archiefschool": ("Amsterdam", "NL"),
    "Archiefschool Amsterdam": ("Amsterdam", "NL"),
    "Reinwardt Academie": ("Amsterdam", "NL"),
    "KABK": ("Den Haag", "NL"),
    "Koninklijke Academie van Beeldende Kunsten": ("Den Haag", "NL"),
    "Gerrit Rietveld Academie": ("Amsterdam", "NL"),
    "Design Academy Eindhoven": ("Eindhoven", "NL"),
    "Art & Design College Utrecht": ("Utrecht", "NL"),
    "ArtEZ": ("Arnhem", "NL"),
    "IOPS": ("Amsterdam", "NL"),
    "Interuniversity Graduate School of Psychometrics": ("Amsterdam", "NL"),
    "Sioo": ("Utrecht", "NL"),
    # Additional Dutch universities (expanded mapping)
    "Eindhoven University of Technology": ("Eindhoven", "NL"),
    "Delft University of Technology": ("Delft", "NL"),
    "University of Twente": ("Enschede", "NL"),
    "Universiteit Twente": ("Enschede", "NL"),
    "Open Universiteit": ("Heerlen", "NL"),
    "Open University Netherlands": ("Heerlen", "NL"),
    "Nyenrode": ("Breukelen", "NL"),
    "Nyenrode Business Universiteit": ("Breukelen", "NL"),
    "Theologische Universiteit": ("Kampen", "NL"),
    "Protestant Theological University": ("Amsterdam", "NL"),
    # Additional Hogescholen
    "De Haagse Hogeschool": ("Den Haag", "NL"),
    "The Hague University": ("Den Haag", "NL"),
    "The Hague University of Applied Sciences": ("Den Haag", "NL"),
    "Amsterdamse Hogeschool voor de Kunsten": ("Amsterdam", "NL"),
    "AHK": ("Amsterdam", "NL"),
    "Conservatorium van Amsterdam": ("Amsterdam", "NL"),
    "Hanzehogeschool Groningen": ("Groningen", "NL"),
    "Hogeschool Leiden": ("Leiden", "NL"),
    "Hogeschool Zeeland": ("Vlissingen", "NL"),
    "HZ University of Applied Sciences": ("Vlissingen", "NL"),
    "Hogeschool voor de Kunsten Utrecht": ("Utrecht", "NL"),
    "HKU": ("Utrecht", "NL"),
    "Willem de Kooning Academie": ("Rotterdam", "NL"),
    "Codarts Rotterdam": ("Rotterdam", "NL"),
    "Codarts": ("Rotterdam", "NL"),
    "Design Academy": ("Eindhoven", "NL"),
    "NHTV": ("Breda", "NL"),
    "NHTV Breda University of Applied Sciences": ("Breda", "NL"),
    "Breda University of Applied Sciences": ("Breda", "NL"),
    "NHL Hogeschool": ("Leeuwarden", "NL"),
    "Van Hall Larenstein": ("Velp", "NL"),
    "NCOI": ("Hilversum", "NL"),
    "NCOI Opleidingen": ("Hilversum", "NL"),
    "LOI": ("Leiderdorp", "NL"),
    "LOI Hogeschool": ("Leiderdorp", "NL"),
    "NTI": ("Leiden", "NL"),
    "Hogeschool Arnhem": ("Arnhem", "NL"),
    "Hogeschool Nijmegen": ("Nijmegen", "NL"),
    # Specific ROC locations
    "ROC Leeuwenborgh": ("Maastricht", "NL"),
    "ROC Leiden": ("Leiden", "NL"),
    "ROC Midden Nederland": ("Utrecht", "NL"),
    "ROC MN": ("Utrecht", "NL"),
    "ROC van Amsterdam": ("Amsterdam", "NL"),
    "ROC Amsterdam": ("Amsterdam", "NL"),
    "ROC Flevoland": ("Almere", "NL"),
    "ROC Tilburg": ("Tilburg", "NL"),
    "ROC van Twente": ("Enschede", "NL"),
    "ROC Twente": ("Enschede", "NL"),
    "ROC Nijmegen": ("Nijmegen", "NL"),
    "ROC Mondriaan": ("Den Haag", "NL"),
    "ROC Nova College": ("Haarlem", "NL"),
    "ROC Albeda": ("Rotterdam", "NL"),
    "Albeda College": ("Rotterdam", "NL"),
    "Zadkine": ("Rotterdam", "NL"),
    "Graafschap College": ("Doetinchem", "NL"),
    "Friesland College": ("Leeuwarden", "NL"),
    "Noorderpoort": ("Groningen", "NL"),
    "Alfa-college": ("Groningen", "NL"),
    "Deltion College": ("Zwolle", "NL"),
    "Cibap": ("Zwolle", "NL"),
    "Summa College": ("Eindhoven", "NL"),
    "SintLucas": ("Eindhoven", "NL"),
    "Koning Willem I College": ("Den Bosch", "NL"),
    "Curio": ("Breda", "NL"),
    "Da Vinci College": ("Dordrecht", "NL"),
    # Additional Radboud variations
    "Radboud University Nijmegen": ("Nijmegen", "NL"),
    "Radboud University": ("Nijmegen", "NL"),
    # Additional VU variations
    "Vrije Universiteit Amsterdam": ("Amsterdam", "NL"),
    "VU University Amsterdam": ("Amsterdam", "NL"),
    # Wageningen variations
    "Wageningen University & Research": ("Wageningen", "NL"),
    "WUR": ("Wageningen", "NL"),
    # Belgian institutions
    "KU Leuven": ("Leuven", "BE"),
    "University of Leuven": ("Leuven", "BE"),
    "Katholieke Universiteit Leuven": ("Leuven", "BE"),
    "Vrije Universiteit Brussel": ("Brussel", "BE"),
    "VUB": ("Brussel", "BE"),
    "Universiteit Gent": ("Gent", "BE"),
    "Ghent University": ("Gent", "BE"),
    "UGent": ("Gent", "BE"),
    "Universiteit Antwerpen": ("Antwerpen", "BE"),
    "University of Antwerp": ("Antwerpen", "BE"),
    # German institutions
    "Universität Bremen": ("Bremen", "DE"),
    "University of Bremen": ("Bremen", "DE"),
    "Westfälische Wilhelms-Universität Münster": ("Münster", "DE"),
    "WWU Münster": ("Münster", "DE"),
    "Humboldt-Universität": ("Berlin", "DE"),
    "Freie Universität Berlin": ("Berlin", "DE"),
    "FU Berlin": ("Berlin", "DE"),
    "Universität zu Köln": ("Köln", "DE"),
    "University of Cologne": ("Köln", "DE"),
    "Ruprecht-Karls-Universität Heidelberg": ("Heidelberg", "DE"),
    "Heidelberg University": ("Heidelberg", "DE"),
    "Ludwig-Maximilians-Universität München": ("München", "DE"),
    "LMU München": ("München", "DE"),
    "Technische Universität München": ("München", "DE"),
    "TU München": ("München", "DE"),
    # UK institutions
    "University of Oxford": ("Oxford", "GB"),
    "Oxford University": ("Oxford", "GB"),
    "University of Cambridge": ("Cambridge", "GB"),
    "Cambridge University": ("Cambridge", "GB"),
    "University of York": ("York", "GB"),
    "University College London": ("London", "GB"),
    "London School of Economics": ("London", "GB"),
    "LSE": ("London", "GB"),
    "King's College London": ("London", "GB"),
    "Imperial College": ("London", "GB"),
    "University of Edinburgh": ("Edinburgh", "GB"),
    "University of Manchester": ("Manchester", "GB"),
    # Australian institutions
    "The Australian National University": ("Canberra", "AU"),
    "Australian National University": ("Canberra", "AU"),
    "ANU": ("Canberra", "AU"),
    "University of Canberra": ("Canberra", "AU"),
    "University of Melbourne": ("Melbourne", "AU"),
    "University of Sydney": ("Sydney", "AU"),
    "Macquarie University": ("Sydney", "AU"),
    "Charles Sturt University": ("Bathurst", "AU"),
    "UNSW": ("Sydney", "AU"),
    "University of New South Wales": ("Sydney", "AU"),
    "University of Queensland": ("Brisbane", "AU"),
    "Monash University": ("Melbourne", "AU"),
    # South African institutions
    "University of Cape Town": ("Cape Town", "ZA"),
    "UCT": ("Cape Town", "ZA"),
    "University of Pretoria": ("Pretoria", "ZA"),
    "University of Witwatersrand": ("Johannesburg", "ZA"),
    "Stellenbosch University": ("Stellenbosch", "ZA"),
    # Italian institutions
    "Politecnico di Milano": ("Milano", "IT"),
    "Università degli Studi di Milano": ("Milano", "IT"),
    "Università di Bologna": ("Bologna", "IT"),
    "University of Bologna": ("Bologna", "IT"),
    # US institutions
    "Oberlin College": ("Oberlin", "US"),
    "Harvard University": ("Cambridge", "US"),
    "Harvard": ("Cambridge", "US"),
    "Yale University": ("New Haven", "US"),
    "Princeton University": ("Princeton", "US"),
    "MIT": ("Cambridge", "US"),
    "Massachusetts Institute of Technology": ("Cambridge", "US"),
    "Stanford University": ("Stanford", "US"),
    "Columbia University": ("New York", "US"),
    "University of California": ("Berkeley", "US"),
    "UCLA": ("Los Angeles", "US"),
    "University of Chicago": ("Chicago", "US"),
    "NYU": ("New York", "US"),
    "New York University": ("New York", "US"),
    # Indonesian institutions
    "Universitas Gadjah Mada": ("Yogyakarta", "ID"),
    "UGM": ("Yogyakarta", "ID"),
    "Universitas Indonesia": ("Jakarta", "ID"),
    "UI": ("Jakarta", "ID"),
    # Turkish institutions
    "Middle East Technical University": ("Ankara", "TR"),
    "METU": ("Ankara", "TR"),
    "Boğaziçi University": ("Istanbul", "TR"),
    # Additional Dutch variations found in data
    "Rotterdam School of Management": ("Rotterdam", "NL"),
    "RSM": ("Rotterdam", "NL"),
    "TIAS School for Business and Society": ("Tilburg", "NL"),
    "TIAS": ("Tilburg", "NL"),
    "GO opleidingen": ("Utrecht", "NL"),
    "Amsterdam University of Applied Sciences": ("Amsterdam", "NL"),
    "University College Utrecht": ("Utrecht", "NL"),
    "UCU": ("Utrecht", "NL"),
    "University of Utrecht": ("Utrecht", "NL"),
    "NSOB": ("Den Haag", "NL"),
    "Nederlandse School voor Openbaar Bestuur": ("Den Haag", "NL"),
    "Grotius Academie": ("Nijmegen", "NL"),
    "de Baak": ("Noordwijk", "NL"),
    "Grafisch Lyceum Rotterdam": ("Rotterdam", "NL"),
    "Schoevers": ("Utrecht", "NL"),
    "Schoevers College": ("Utrecht", "NL"),
    # --- Generic / ambiguous fallbacks: MUST stay last (see note above) ---
    "HAN": ("Nijmegen", "NL"),  # Hogeschool van Arnhem en Nijmegen
    "HU": ("Utrecht", "NL"),  # Hogeschool Utrecht
    "CHE": ("Ede", "NL"),  # Christelijke Hogeschool Ede
    "UT": ("Enschede", "NL"),  # Universiteit Twente
    "UCL": ("London", "GB"),  # University College London
    "Hogeschool": ("", "NL"),  # Generic, location from name
    "ROC": ("", "NL"),  # Regional Training Centers - various locations (fallback)
}
def get_settlement_code(city_name: str) -> str:
    """Generate a short settlement code (up to 3 letters) from a city name.

    Rules:
    - Single word: first three letters, uppercased ("Utrecht" -> "UTR").
    - Leading Dutch article ("de"/"het"/"den"/"'s"): article initial plus the
      first two letters of the next word ("Den Haag" -> "DHA").
    - Otherwise: initials of the first (up to three) words — note this yields
      only two letters for two-word names ("New York" -> "NY").

    Args:
        city_name: City name; empty or whitespace-only input is tolerated.

    Returns:
        Uppercased code, or "" for empty input (the original implementation
        raised IndexError on empty/whitespace-only names).
    """
    words = city_name.split() if city_name else []
    if not words:
        return ""  # guard: avoid IndexError on empty/blank input
    dutch_articles = {"de", "het", "den", "'s"}
    if len(words) == 1:
        return city_name[:3].upper()
    elif words[0].lower() in dutch_articles:
        return (words[0][0] + words[1][:2]).upper()
    else:
        return "".join(w[0] for w in words[:3]).upper()
def geocode_location(location_str: str, db_path: str) -> Optional[dict]:
    """
    Geocode a location string to CC-RR-PPP format using GeoNames.

    Args:
        location_str: Free-form location text, e.g. "Utrecht, Netherlands".
            The country is detected from known name/abbreviation patterns;
            the city candidate is the text before the first comma.
        db_path: Path to a SQLite database containing a GeoNames-derived
            `cities` table.

    Returns:
        Dict with country/region/settlement codes, the formatted
        "CC-RR-PPP" string, and raw GeoNames row data; None when the
        country or city cannot be resolved (or on any database error,
        which is logged and swallowed — best-effort enrichment).
    """
    if not location_str:
        return None
    location_str = location_str.strip()
    # Extract country from common patterns
    country_code = None
    country_patterns = {
        "NL": ["(NL)", "Netherlands", "Nederland"],
        "BE": ["(BE)", "Belgium", "België", "Belgique"],
        "DE": ["(DE)", "Germany", "Deutschland"],
        "GB": ["(GB)", "United Kingdom", "UK", "England", "Scotland", "Wales"],
        "AU": ["(AU)", "Australia"],
        "ZA": ["(ZA)", "South Africa"],
        "IT": ["(IT)", "Italy", "Italia"],
        "US": ["(US)", "United States", "USA", "U.S."],
        "ID": ["(ID)", "Indonesia"],
        "TR": ["(TR)", "Turkey", "Türkiye"],
        "FR": ["(FR)", "France"],
        "ES": ["(ES)", "Spain", "España"],
        "AT": ["(AT)", "Austria", "Österreich"],
        "CH": ["(CH)", "Switzerland", "Schweiz", "Suisse"],
        "CA": ["(CA)", "Canada"],
        "NZ": ["(NZ)", "New Zealand"],
        "JP": ["(JP)", "Japan"],
        "CN": ["(CN)", "China"],
        "IN": ["(IN)", "India"],
        "BR": ["(BR)", "Brazil", "Brasil"],
        "SE": ["(SE)", "Sweden", "Sverige"],
        "NO": ["(NO)", "Norway", "Norge"],
        "DK": ["(DK)", "Denmark", "Danmark"],
        "FI": ["(FI)", "Finland", "Suomi"],
        "PL": ["(PL)", "Poland", "Polska"],
        "CZ": ["(CZ)", "Czech Republic", "Czechia", "Česko"],
    }
    for code, patterns in country_patterns.items():
        if any(p in location_str for p in patterns):
            country_code = code
            break
    # Clean location for city lookup: take the part before the first comma
    # and strip trailing "Area"/"Region"/"(CC)" qualifiers.
    city_candidate = location_str.split(",")[0].strip()
    city_candidate = re.sub(r"\s*(Area|Region|\([A-Z]{2}\)).*", "", city_candidate).strip()
    if not city_candidate or not country_code:
        return None
    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            # Prefix match on name/ascii_name; prefer the most populous hit.
            cursor.execute("""
                SELECT name, ascii_name, admin1_code, admin1_name, country_code,
                       latitude, longitude, geonames_id, population, feature_code
                FROM cities
                WHERE (name LIKE ? OR ascii_name LIKE ?)
                  AND country_code = ?
                  AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
                ORDER BY population DESC
                LIMIT 1
            """, (f"{city_candidate}%", f"{city_candidate}%", country_code))
            row = cursor.fetchone()
        finally:
            # BUGFIX: close in finally — previously the connection leaked
            # whenever execute()/fetchone() raised.
            conn.close()
        if not row:
            return None
        name, ascii_name, admin1_code, admin1_name, cc, lat, lon, geonames_id, pop, feature_code = row
        # Map admin1_code to ISO 3166-2; fall back to the raw code, else "XX".
        region_code = "XX"
        if cc in COUNTRY_ADMIN1_MAPPINGS and admin1_code:
            region_code = COUNTRY_ADMIN1_MAPPINGS[cc].get(admin1_code, "XX")
        elif admin1_code:
            region_code = admin1_code[:2].upper()
        settlement_code = get_settlement_code(ascii_name)
        return {
            "country_code": cc,
            "region_code": region_code,
            "settlement_code": settlement_code,
            "settlement_name": name,
            "formatted": f"{cc}-{region_code}-{settlement_code}",
            "geonames_data": {
                "geonames_id": geonames_id,
                "geonames_name": name,
                "admin1_code": admin1_code,
                "admin1_name": admin1_name,
                "feature_code": feature_code,
                "latitude": lat,
                "longitude": lon,
            },
            "original_query": location_str,
        }
    except Exception as e:
        # Best-effort enrichment: log and treat any DB error as "not found".
        print(f" GeoNames error: {e}")
        return None
def parse_date_range(date_range: str) -> Tuple[Optional[int], Optional[int]]:
    """Extract (start_year, end_year) from a free-form date-range string.

    Scans for four-digit years in 1900-2099. The first match is the start
    year; the last match is the end year only when at least two years are
    present (so "2019 - Present" yields (2019, None)). Empty or year-free
    input yields (None, None).
    """
    if not date_range:
        return None, None
    matches = re.findall(r'\b(19\d{2}|20\d{2})\b', date_range)
    if not matches:
        return None, None
    first = int(matches[0])
    last = int(matches[-1]) if len(matches) > 1 else None
    return first, last
def get_any_date_field(record: dict) -> str:
    """
    Extract date string from a record with various field name conventions.

    Handles the following field variations found in LinkedIn profile data:
    - date_range: "2019 - Present" (most common, 2,486 entries)
    - period: "2015 - 2019" (15 entries)
    - years/year: "2010" (single year)
    - start_date/end_date: separate fields like "Sep 2019" / "Present" (149 entries)
    - dates: "2018 - 2020" (12 entries)

    Returns a combined date string suitable for parse_date_range(), or ""
    when no date field is present.
    """
    # Combined date fields take priority, checked in frequency order.
    for name in ("date_range", "period", "years", "year", "dates"):
        value = record.get(name)
        if value:
            return str(value)
    # Fall back to separate start_date/end_date fields.
    start = record.get("start_date") or ""
    end = record.get("end_date") or ""
    if not (start or end):
        return ""
    # Join and trim the dangling separator when one side is missing.
    return f"{start} - {end}".strip(" -")
def parse_total_experience_field(total_exp: str) -> Optional[int]:
    """
    Parse a total-experience field value and return the year count.

    Handles formats like:
    - "24 years and 8 months"
    - "37 years"
    - "5 years 3 months"
    - "1 year"

    Returns the integer number of years, or None when no "<N> year(s)"
    phrase is present (months alone are ignored).
    """
    if not total_exp:
        return None
    # Lowercase first so "Years"/"YEAR" variants match too.
    found = re.search(r'(\d+)\s*years?', total_exp.lower())
    return int(found.group(1)) if found else None
def build_inference_chain(steps: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Number each inference step (1-based), preserving order and content."""
    numbered = []
    for index, step in enumerate(steps, start=1):
        entry = {"step": index}
        entry.update(step)
        numbered.append(entry)
    return numbered
def is_near_decade_boundary(year: int, threshold: int = 3) -> bool:
    """
    Check if a year is within `threshold` years of a decade boundary.

    Examples:
        1968, threshold=3 -> True (within 3 of 1970)
        1972, threshold=3 -> True (within 3 of 1970)
        1975, threshold=3 -> False (5 years from both boundaries)
    """
    offset = year % 10
    # Near the start of the decade, or near the start of the next one.
    if offset <= threshold:
        return True
    return offset >= 10 - threshold
def get_decade_notation(year: int) -> str:
    """Convert a year to EDTF decade notation (e.g. 1968 -> '196X')."""
    # year // 10 drops the final digit, which the 'X' placeholder replaces.
    return f"{year // 10}X"
def get_adjacent_decades(year: int) -> Tuple[str, str]:
    """
    Get the two adjacent decades (EDTF notation, earlier first) spanned by
    a year near a decade boundary.

    Examples:
        1968 -> ("196X", "197X")
        1972 -> ("196X", "197X")
        2001 -> ("199X", "200X")
    """
    year_in_decade = year % 10
    if year_in_decade >= 7:
        # Late in decade (7-9): uncertainty spills into the NEXT decade.
        return (get_decade_notation(year), get_decade_notation(year + 10))
    # Otherwise (0-6): spills into the PREVIOUS decade. Callers are expected
    # to pass boundary-adjacent years (see is_near_decade_boundary), so the
    # mid-decade values 4-6 should not occur in practice.
    return (get_decade_notation(year - 10), get_decade_notation(year))
def parse_total_experience(about_text: str) -> Optional[int]:
    """
    Parse a "Total Experience: X years" pattern from an about/summary field.

    Matching is case-insensitive and tolerates a trailing "and Y months".
    Returns the integer number of years, or None when the phrase is absent.
    """
    if not about_text:
        return None
    found = re.search(r'Total Experience:\s*(\d+)\s*years?', about_text, re.IGNORECASE)
    return int(found.group(1)) if found else None
def infer_birth_decade(profile_data: dict) -> Optional[dict]:
    """
    Infer birth decade from earliest career observations.

    Returns an explicit inferred_birth_decade dict with a full provenance
    chain (Rule 45), or None when the profile contains no usable date signal.

    Supports list-valued results for decade boundary cases (Rule 45 extension):
    - If the estimated birth-year range spans a decade boundary, both
      adjacent decades are returned as EDTF set notation: [196X,197X]

    Inference methods (in priority order):
    1. Education start year (most reliable - entry age 18-24)
    2. Experience start year (first job - entry age ~23)
    3. Total Experience pattern (fallback - "Total Experience: X years")
    """
    earliest_year = None
    inference_steps = []
    age_offset = 18
    age_variance = 3  # ±3 years typical variance in entry age
    education_record = None
    experience_record = None
    total_experience_years = None

    # Check education first (most reliable)
    education = profile_data.get("education") or []
    for edu in education:
        if edu is None:
            continue
        # Handle multiple date field names: "date_range", "period", "years", "year".
        # str() guards against numeric values (e.g. {"year": 2010}), which would
        # otherwise crash re.findall() inside parse_date_range().
        date_range = str(
            edu.get("date_range") or edu.get("period") or edu.get("years") or edu.get("year") or ""
        )
        degree_lower = (edu.get("degree") or "").lower()  # tolerate missing/None degree
        # Handle both "institution" and "school" field names
        institution = edu.get("institution") or edu.get("school") or ""
        start_year, _ = parse_date_range(date_range)
        if not start_year:
            continue
        # Match bachelor's/master's/doctoral level degrees
        if any(term in degree_lower for term in [
            # English degrees
            "bachelor", "bsc", "ba", "master", "msc", "ma", "phd", "doctor", "postgraduate",
            # Dutch degrees
            "doctoraal", "drs", "drs.", "mr", "mr.", "ing", "ing.", "ir", "ir.",
            "hbo", "mbo", "pabo", "meao", "heao",
            # German degrees
            "magister", "diplom", "staatsexamen", "referendariat",
            # Italian degrees
            "laurea",
            # Generic
            "degree", "graduate", "undergraduate", "post doc", "postdoc",
        ]):
            if earliest_year is None or start_year < earliest_year:
                earliest_year = start_year
                # Determine age offset based on degree level
                if any(term in degree_lower for term in ["master", "msc", "ma", "drs", "drs.", "mr", "mr.", "ir", "ir.", "laurea magistrale", "magister"]):
                    age_offset = 22  # Master's typically starts at 22
                elif any(term in degree_lower for term in ["phd", "doctor", "post doc", "postdoc", "postgraduate"]):
                    age_offset = 24  # PhD typically starts at 24
                else:
                    age_offset = 18  # Bachelor's/undergraduate
                education_record = {
                    "institution": institution,
                    "degree": edu.get("degree", ""),
                    "date_range": date_range,
                }
        # NOTE: "hbo"/"mbo"/"pabo"/"meao"/"heao" also appear in the list above,
        # so in practice only "vocational"/"associate" can reach this branch.
        elif any(term in degree_lower for term in ["hbo", "mbo", "vocational", "associate", "pabo", "meao", "heao"]):
            if earliest_year is None or start_year < earliest_year:
                earliest_year = start_year
                age_offset = 16
                education_record = {
                    "institution": institution,
                    "degree": edu.get("degree", ""),
                    "date_range": date_range,
                }
        # Also accept education without clear degree type (use conservative estimate)
        elif earliest_year is None:
            earliest_year = start_year
            age_offset = 18  # Assume typical university entry age
            education_record = {
                "institution": institution,
                "degree": edu.get("degree", "") or "(no degree specified)",
                "date_range": date_range,
            }

    # If no education, check earliest job
    if earliest_year is None:
        experience = profile_data.get("experience") or []
        for exp in experience:
            if exp is None:
                continue
            # Handle multiple date field names (including start_date/end_date)
            date_range = get_any_date_field(exp)
            start_year, _ = parse_date_range(date_range)
            if start_year:
                if earliest_year is None or start_year < earliest_year:
                    earliest_year = start_year
                    age_offset = 23
                    age_variance = 5  # Higher variance for first job
                    experience_record = {
                        "company": exp.get("company", ""),
                        "title": exp.get("title", ""),
                        "date_range": date_range,
                    }

    # If no education or experience dates, try "Total Experience" pattern in about field
    if earliest_year is None:
        about = profile_data.get("about") or profile_data.get("summary") or ""
        total_experience_years = parse_total_experience(about)
        if total_experience_years and total_experience_years > 0:
            # Estimate: current year - total_years = first job year
            # Then: first job year - 23 = birth year (assuming first job at 23)
            current_year = datetime.now().year
            estimated_first_job_year = current_year - total_experience_years
            earliest_year = estimated_first_job_year
            age_offset = 23  # Assume first job at 23
            age_variance = 7  # Very high variance for this method
            inference_steps.append({
                "observation": "Total Experience pattern found in about field",
                "source_field": "profile_data.about",
                "source_value": f"Total Experience: {total_experience_years} years",
            })
            inference_steps.append({
                "calculation": f"{current_year} - {total_experience_years} = {estimated_first_job_year}",
                "result": f"Estimated first job year: {estimated_first_job_year}",
                "assumption": "Total experience represents continuous career from first job",
            })

    # If still no date, try standalone total_experience field in profile_data
    if earliest_year is None:
        total_exp_field = profile_data.get("total_experience")
        if total_exp_field:
            total_experience_years = parse_total_experience_field(total_exp_field)
            if total_experience_years and total_experience_years > 0:
                current_year = datetime.now().year
                estimated_first_job_year = current_year - total_experience_years
                earliest_year = estimated_first_job_year
                age_offset = 23  # Assume first job at 23
                age_variance = 7  # Very high variance for this method
                inference_steps.append({
                    "observation": "total_experience field found in profile_data",
                    "source_field": "profile_data.total_experience",
                    "source_value": total_exp_field,
                })
                inference_steps.append({
                    "calculation": f"{current_year} - {total_experience_years} = {estimated_first_job_year}",
                    "result": f"Estimated first job year: {estimated_first_job_year}",
                    "assumption": "Total experience represents continuous career from first job",
                })

    if earliest_year is None:
        return None

    # Build inference chain (only add steps if not already added from Total Experience path)
    if education_record:
        inference_steps.append({
            "observation": "Education record found",
            "source_field": "profile_data.education",
            "source_value": education_record,
        })
        inference_steps.append({
            "extraction": "Start year extracted from date_range",
            "extracted_value": earliest_year,
        })
        inference_steps.append({
            # BUGFIX: was "{age_offset}{age_variance} years)" — the "(±" marker
            # had been lost, producing e.g. "approximately 183 years)".
            "assumption": f"Education entry age is approximately {age_offset} (±{age_variance} years)",
            "rationale": "Standard entry age for this education level in Netherlands/Europe",
            "confidence_impact": f"Assumption introduces uncertainty; actual age may vary ±{age_variance} years",
        })
    elif experience_record:
        inference_steps.append({
            "observation": "First job record found (no education data)",
            "source_field": "profile_data.experience",
            "source_value": experience_record,
        })
        inference_steps.append({
            "extraction": "Start year extracted from date_range",
            "extracted_value": earliest_year,
        })
        inference_steps.append({
            # BUGFIX: same lost "(±" marker as above.
            "assumption": f"First job age is approximately {age_offset} (±{age_variance} years)",
            "rationale": "Assumes first job after typical university completion",
            "confidence_impact": f"Higher uncertainty; first job age varies ±{age_variance} years",
        })
    elif total_experience_years:
        # Observation/calculation steps already added in the Total Experience detection block
        inference_steps.append({
            # BUGFIX: same lost "(±" marker as above.
            "assumption": f"First job age is approximately {age_offset} (±{age_variance} years)",
            "rationale": "Assumes first job after typical university completion; Total Experience method has highest uncertainty",
            "confidence_impact": f"Very high uncertainty; first job age varies ±{age_variance} years, plus Total Experience aggregation may be inaccurate",
        })

    estimated_birth_year = earliest_year - age_offset
    min_birth_year = earliest_year - age_offset - age_variance
    max_birth_year = earliest_year - age_offset + age_variance
    inference_steps.append({
        "calculation": f"{earliest_year} - {age_offset} = {estimated_birth_year}",
        "result": f"Estimated birth year: {estimated_birth_year}",
        "range": f"{min_birth_year}-{max_birth_year} (accounting for ±{age_variance} year variance)",
    })

    # Check if birth year range spans a decade boundary
    min_decade = (min_birth_year // 10) * 10
    max_decade = (max_birth_year // 10) * 10
    spans_decade_boundary = min_decade != max_decade

    if spans_decade_boundary:
        # Get decades directly from min/max range (not estimated year)
        decade1 = get_decade_notation(min_birth_year)
        decade2 = get_decade_notation(max_birth_year)
        # Primary is the decade containing the estimated birth year
        estimated_decade = get_decade_notation(estimated_birth_year)
        if estimated_decade == decade1:
            primary_value = decade1
            primary_rationale = f"{estimated_birth_year} is in {decade1}, but range extends into {decade2}"
        else:
            primary_value = decade2
            primary_rationale = f"{estimated_birth_year} is in {decade2}, but range extends into {decade1}"
        inference_steps.append({
            "generalization": "Birth year range spans decade boundary",
            "input_range": [min_birth_year, max_birth_year],
            "output": [decade1, decade2],
            "edtf": f"[{decade1},{decade2}]",
            "rationale": "Cannot determine which decade with certainty; using EDTF 'one of' set notation",
        })
        # Determine method name based on source
        if education_record:
            method_name = "earliest_education_heuristic"
        elif experience_record:
            method_name = "earliest_experience_heuristic"
        else:
            method_name = "total_experience_heuristic"
        return {
            "values": [decade1, decade2],
            "edtf": f"[{decade1},{decade2}]",
            "edtf_meaning": f"one of: {decade1[:-1]}0s or {decade2[:-1]}0s",
            "precision": "decade_set",
            "primary_value": primary_value,
            "primary_rationale": primary_rationale,
            "confidence": "very_low",  # Lower confidence due to boundary uncertainty
            "inference_provenance": {
                "method": method_name,
                "inference_chain": build_inference_chain(inference_steps),
                "assumptions": [
                    f"Entry age for education/first job: {age_offset} years (±{age_variance})",
                    "Career records are complete in LinkedIn profile",
                ],
                "boundary_note": f"Birth year estimate {estimated_birth_year} spans decades {decade1}/{decade2}",
                "inferred_at": datetime.now(timezone.utc).isoformat(),
                "inferred_by": "enrich_ppids.py",
            }
        }
    else:
        # Single decade - standard case
        edtf_decade = get_decade_notation(estimated_birth_year)
        inference_steps.append({
            "generalization": "Convert to EDTF decade notation",
            "input": estimated_birth_year,
            "output": edtf_decade,
            "rationale": "Decade precision appropriate for heuristic-based estimate",
        })
        # Determine method name and confidence based on source
        if education_record:
            method_name = "earliest_education_heuristic"
            confidence = "low"
        elif experience_record:
            method_name = "earliest_experience_heuristic"
            confidence = "low"
        else:
            method_name = "total_experience_heuristic"
            confidence = "very_low"  # Lowest confidence for Total Experience method
        return {
            "value": edtf_decade,
            "edtf": edtf_decade,
            "precision": "decade",
            "confidence": confidence,
            "inference_provenance": {
                "method": method_name,
                "inference_chain": build_inference_chain(inference_steps),
                "assumptions": [
                    f"Entry age for education/first job: {age_offset} years (±{age_variance})",
                    "Career records are complete in LinkedIn profile",
                ],
                "inferred_at": datetime.now(timezone.utc).isoformat(),
                "inferred_by": "enrich_ppids.py",
            }
        }
def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]:
    """
    Infer birth settlement from earliest school/university location.

    Strategy, in priority order:
      1. Earliest education record whose institution matches a known
         university in DUTCH_UNI_LOCATIONS and geocodes successfully.
      2. Fallback: earliest experience record with a geocodable location
         (weaker proxy, hence lower confidence).

    Returns an explicit inferred_birth_settlement dict with a full
    provenance chain (Rule 45), or None if nothing could be resolved.
    """
    # ISO country code -> country name as expected by the geocoder.
    # Unknown codes fall back to "Netherlands" (the dominant case in
    # this dataset). Hoisted out of the loop: it is loop-invariant.
    country_names = {
        "NL": "Netherlands",
        "BE": "Belgium",
        "DE": "Germany",
        "GB": "United Kingdom",
        "AU": "Australia",
        "ZA": "South Africa",
        "IT": "Italy",
        "US": "United States",
        "ID": "Indonesia",
        "TR": "Turkey",
    }

    # ----- Attempt 1: earliest education institution -----
    education = profile_data.get("education") or []
    edu_with_years = []
    for edu in education:
        if edu is None:
            continue
        # Profiles use several date field names: "date_range", "period",
        # "years", "year".
        date_range = edu.get("date_range") or edu.get("period") or edu.get("years") or edu.get("year") or ""
        start_year, _ = parse_date_range(date_range)
        if start_year:
            edu_with_years.append((start_year, edu))
    edu_with_years.sort(key=lambda x: x[0])

    for start_year, edu in edu_with_years:
        # Handle both "institution" and "school" field names
        institution = edu.get("institution") or edu.get("school") or ""
        # Look up institution location from the known-university mapping
        location = None
        for uni_name, (city, country) in DUTCH_UNI_LOCATIONS.items():
            if uni_name.lower() in institution.lower():
                country_name = country_names.get(country, "Netherlands")
                location = f"{city}, {country_name}" if city else None
                break
        if not location:
            continue
        # Build provenance steps for THIS candidate only, so a failed
        # geocoding attempt does not pollute the chain of a later success
        # (previously steps from failed candidates leaked into the final
        # provenance, including into the experience fallback below).
        edu_date_range = edu.get("date_range") or edu.get("period") or edu.get("years") or edu.get("year") or ""
        candidate_steps = [
            {
                "observation": "Earliest education institution identified",
                "source_field": "profile_data.education",
                "source_value": {
                    "institution": institution,
                    "date_range": edu_date_range,
                    "degree": edu.get("degree") or "",
                },
            },
            {
                "lookup": "Institution location mapping",
                "mapping_source": "DUTCH_UNI_LOCATIONS dictionary",
                "mapping_key": institution,
                "mapping_result": location,
            },
        ]
        geo = geocode_location(location, db_path)
        if not geo:
            continue
        candidate_steps.append({
            "geocoding": "GeoNames resolution",
            "query": location,
            "result": geo["geonames_data"],
        })
        candidate_steps.append({
            "formatting": "CC-RR-PPP generation",
            "components": {
                "country_code": geo["country_code"],
                "region_code": geo["region_code"],
                "settlement_code": geo["settlement_code"],
            },
            "result": geo["formatted"],
        })
        return {
            "value": geo["settlement_name"],
            "formatted": geo["formatted"],
            "country_code": geo["country_code"],
            "region_code": geo["region_code"],
            "settlement_code": geo["settlement_code"],
            "confidence": "low",
            "inference_provenance": {
                "method": "earliest_education_location",
                "inference_chain": build_inference_chain(candidate_steps),
                "assumptions": [
                    "Student attended school near birth/family residence",
                    "Institution location is representative of early life location",
                ],
                "assumption_note": "University location used as proxy for birth settlement; student may have relocated for education",
                "geonames_data": geo["geonames_data"],
                "inferred_at": datetime.now(timezone.utc).isoformat(),
                "inferred_by": "enrich_ppids.py",
            }
        }

    # ----- Attempt 2 (fallback): earliest job location -----
    experience = profile_data.get("experience") or []
    exp_with_years = []
    for exp in experience:
        if exp is None:
            continue
        # Handle multiple date field names (including start_date/end_date)
        date_range = get_any_date_field(exp)
        start_year, _ = parse_date_range(date_range)
        if start_year and exp.get("location"):
            exp_with_years.append((start_year, exp))
    exp_with_years.sort(key=lambda x: x[0])

    for start_year, exp in exp_with_years:
        location = exp.get("location", "")
        if not location:
            continue
        # Per-candidate provenance, same rationale as above.
        exp_date_range = get_any_date_field(exp)
        candidate_steps = [
            {
                "observation": "Earliest job with location found (no education location available)",
                "source_field": "profile_data.experience",
                "source_value": {
                    "company": exp.get("company", ""),
                    "title": exp.get("title", ""),
                    "date_range": exp_date_range,
                    "location": location,
                },
            },
        ]
        geo = geocode_location(location, db_path)
        if not geo:
            continue
        candidate_steps.append({
            "geocoding": "GeoNames resolution",
            "query": location,
            "result": geo["geonames_data"],
        })
        candidate_steps.append({
            "formatting": "CC-RR-PPP generation",
            "result": geo["formatted"],
        })
        return {
            "value": geo["settlement_name"],
            "formatted": geo["formatted"],
            "country_code": geo["country_code"],
            "region_code": geo["region_code"],
            "settlement_code": geo["settlement_code"],
            "confidence": "very_low",
            "inference_provenance": {
                "method": "earliest_job_location",
                "inference_chain": build_inference_chain(candidate_steps),
                "assumptions": [
                    "First job location represents early life region",
                ],
                "assumption_note": "Job location is weak proxy for birth location; person likely relocated for work",
                "geonames_data": geo["geonames_data"],
                "inferred_at": datetime.now(timezone.utc).isoformat(),
                "inferred_by": "enrich_ppids.py",
            }
        }
    return None
def infer_current_settlement(profile_data: dict, db_path: str) -> Optional[dict]:
    """
    Infer current settlement from profile location or current job.

    Strategy, in priority order:
      1. The profile-level "location" field (most reliable signal).
      2. The location of any experience entry marked as current
         ("Present" in its date range, or current == True).

    Returns an explicit inferred_current_settlement dict with a full
    provenance chain (Rule 45), or None if nothing could be resolved.
    """
    # ----- Attempt 1: profile location field -----
    profile_location = profile_data.get("location")
    if profile_location:
        # Provenance steps are scoped to this candidate so a failed
        # geocode does not contaminate the current-job chain below.
        candidate_steps = [
            {
                "observation": "Profile location field found",
                "source_field": "profile_data.location",
                "source_value": profile_location,
            },
        ]
        geo = geocode_location(profile_location, db_path)
        if geo:
            candidate_steps.append({
                "geocoding": "GeoNames resolution",
                "query": profile_location,
                "result": geo["geonames_data"],
            })
            candidate_steps.append({
                "formatting": "CC-RR-PPP generation",
                "result": geo["formatted"],
            })
            return {
                "value": geo["settlement_name"],
                "formatted": geo["formatted"],
                "country_code": geo["country_code"],
                "region_code": geo["region_code"],
                "settlement_code": geo["settlement_code"],
                "confidence": "medium",
                "inference_provenance": {
                    "method": "profile_location",
                    "inference_chain": build_inference_chain(candidate_steps),
                    "assumptions": [
                        "Profile location is up-to-date",
                        "Profile location represents current residence",
                    ],
                    "geonames_data": geo["geonames_data"],
                    "inferred_at": datetime.now(timezone.utc).isoformat(),
                    "inferred_by": "enrich_ppids.py",
                }
            }

    # ----- Attempt 2: location of a current job -----
    experience = profile_data.get("experience") or []
    for exp in experience:
        if exp is None:
            continue
        # Handle multiple date field names (including start_date/end_date)
        date_range = get_any_date_field(exp)
        # Also check "current" field which some profiles have
        is_current = "Present" in date_range or exp.get("current") is True
        if not is_current:
            continue
        location = exp.get("location")
        if not location:
            continue
        candidate_steps = [
            {
                "observation": "Current job with location found",
                "source_field": "profile_data.experience",
                "source_value": {
                    "company": exp.get("company", ""),
                    "title": exp.get("title", ""),
                    "location": location,
                },
            },
        ]
        geo = geocode_location(location, db_path)
        if not geo:
            continue
        candidate_steps.append({
            "geocoding": "GeoNames resolution",
            "query": location,
            "result": geo["geonames_data"],
        })
        candidate_steps.append({
            "formatting": "CC-RR-PPP generation",
            "result": geo["formatted"],
        })
        return {
            "value": geo["settlement_name"],
            "formatted": geo["formatted"],
            "country_code": geo["country_code"],
            "region_code": geo["region_code"],
            "settlement_code": geo["settlement_code"],
            "confidence": "medium",
            "inference_provenance": {
                "method": "current_job_location",
                "inference_chain": build_inference_chain(candidate_steps),
                "assumptions": [
                    "Current job location represents residence area",
                    "Person works near where they live",
                ],
                "geonames_data": geo["geonames_data"],
                "inferred_at": datetime.now(timezone.utc).isoformat(),
                "inferred_by": "enrich_ppids.py",
            }
        }
    return None
def regenerate_ppid(components: dict) -> str:
    """Rebuild the canonical PPID string from its component parts.

    Layout: TYPE_FIRSTLOC_FIRSTDATE_LASTLOC_LASTDATE_name-tokens,
    where name tokens are hyphen-joined.
    """
    name_part = "-".join(components["name_tokens"])
    parts = [
        components["type"],
        components["first_location"],
        components["first_date"],
        components["last_location"],
        components["last_date"],
        name_part,
    ]
    return "_".join(parts)
def enrich_ppid_file(filepath: Path, db_path: str, dry_run: bool = False, force: bool = False) -> dict:
    """
    Enrich a single PPID file with explicit inferred data (Rule 45 compliant).

    Args:
        filepath: Path to PPID JSON file
        db_path: Path to GeoNames SQLite database
        dry_run: Don't write changes
        force: Re-enrich already-enriched files (clears existing inferred_* fields)

    Returns:
        Stats dict flagging which inferences were made for this file.
    """
    stats = {
        "birth_decade_inferred": False,
        "birth_decade_is_list": False,  # Track decade boundary cases
        "birth_decade_method": None,  # Track which method was used
        "birth_settlement_inferred": False,
        "current_settlement_inferred": False,
        "ppid_changed": False,
    }
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    profile_data = data.get("profile_data", {})
    if not profile_data:
        return stats

    # Tracks whether the in-memory document diverged from disk, so we only
    # rewrite (and stamp provenance on) files that actually changed.
    mutated = False

    # If force mode, clear existing inferred fields to re-enrich
    if force:
        for field in ["inferred_birth_decade", "inferred_birth_settlement", "inferred_current_settlement"]:
            if field in data:
                del data[field]
                mutated = True
        # Reset components previously filled from inferred data. Presence
        # of the *_source bookkeeping key marks them. (The old check
        # tested `"_source" in str(value)`, which matched the substring
        # against the VALUE -- e.g. "inferred_birth_decade" -- and so
        # never fired; this made force-mode resets dead code.)
        comps = data.get("ppid_components") or {}
        if "first_date_source" in comps:
            comps["first_date"] = "XXXX"
            comps.pop("first_date_source", None)
            comps.pop("first_date_alternatives", None)
            mutated = True
        if "first_location_source" in comps:
            comps["first_location"] = "XX-XX-XXX"
            comps.pop("first_location_source", None)
            mutated = True
        if "last_location_source" in comps:
            comps["last_location"] = "XX-XX-XXX"
            comps.pop("last_location_source", None)
            mutated = True

    original_ppid = data.get("ppid", "")
    components = data.get("ppid_components", {}).copy()
    changed = False

    # ===== INFER BIRTH DECADE =====
    # Only if we don't already have an inferred value AND birth_date is unknown
    if (data.get("birth_date", {}).get("edtf") == "XXXX" and
            "inferred_birth_decade" not in data):
        birth_info = infer_birth_decade(profile_data)
        if birth_info:
            # Store as EXPLICIT inferred field (Rule 45)
            data["inferred_birth_decade"] = birth_info
            # Handle list-valued (decade boundary) vs single value
            if "values" in birth_info:
                # List-valued: use primary_value for PPID
                components["first_date"] = birth_info["primary_value"]
                components["first_date_source"] = "inferred_birth_decade.primary_value"
                components["first_date_alternatives"] = [v for v in birth_info["values"] if v != birth_info["primary_value"]]
                stats["birth_decade_is_list"] = True
            else:
                # Single value
                components["first_date"] = birth_info["edtf"]
                components["first_date_source"] = "inferred_birth_decade"
            # Track which method was used
            stats["birth_decade_method"] = birth_info.get("inference_provenance", {}).get("method", "unknown")
            # Add note to canonical field pointing to inferred alternative
            data["birth_date"]["note"] = "See inferred_birth_decade for heuristic estimate"
            stats["birth_decade_inferred"] = True
            changed = True

    # ===== INFER BIRTH SETTLEMENT =====
    if (components.get("first_location") == "XX-XX-XXX" and
            "inferred_birth_settlement" not in data):
        birth_loc = infer_birth_settlement(profile_data, db_path)
        if birth_loc:
            data["inferred_birth_settlement"] = birth_loc
            components["first_location"] = birth_loc["formatted"]
            components["first_location_source"] = "inferred_birth_settlement"
            stats["birth_settlement_inferred"] = True
            changed = True

    # ===== INFER CURRENT SETTLEMENT =====
    if (components.get("last_location") == "XX-XX-XXX" and
            "inferred_current_settlement" not in data):
        current_loc = infer_current_settlement(profile_data, db_path)
        if current_loc:
            data["inferred_current_settlement"] = current_loc
            components["last_location"] = current_loc["formatted"]
            components["last_location_source"] = "inferred_current_settlement"
            stats["current_settlement_inferred"] = True
            changed = True

    # ===== REGENERATE PPID IF COMPONENTS CHANGED =====
    if changed:
        new_ppid = regenerate_ppid(components)
        if new_ppid != original_ppid:
            data["ppid"] = new_ppid
            data["ppid_components"] = components
            stats["ppid_changed"] = True
            # Track PPID history
            if "ppid_history" not in data:
                data["ppid_history"] = []
            data["ppid_history"].append({
                "previous_ppid": original_ppid,
                "new_ppid": new_ppid,
                "changed_at": datetime.now(timezone.utc).isoformat(),
                "reason": "observation_based_inference",
                "inferred_fields": [
                    k for k in ["inferred_birth_decade", "inferred_birth_settlement", "inferred_current_settlement"]
                    if k in data
                ],
            })
        else:
            data["ppid_components"] = components

    if changed or mutated:
        # Stamp provenance only on documents we actually modified.
        # setdefault guards against files with no "provenance" key
        # (previously a KeyError).
        data.setdefault("provenance", {})
        data["provenance"]["modified_at"] = datetime.now(timezone.utc).isoformat()
        data["provenance"]["modified_by"] = "enrich_ppids.py"
        if not dry_run:
            # Write back to file
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            # Rename file if PPID changed (data["ppid"] holds the new PPID
            # at this point; don't rely on a conditionally-bound local).
            if stats["ppid_changed"]:
                new_filepath = filepath.parent / f"{data['ppid']}.json"
                if new_filepath != filepath and not new_filepath.exists():
                    filepath.rename(new_filepath)
    return stats
def main():
    """CLI entry point: enrich PPID files with explicit inferred data."""
    import argparse
    parser = argparse.ArgumentParser(description="Enrich PPID files with explicit inferred data (Rule 45)")
    parser.add_argument("--dry-run", action="store_true", help="Don't write changes")
    parser.add_argument("--limit", type=int, help="Process only N files")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--force", "-f", action="store_true", help="Re-enrich already-enriched files")
    # Paths are now overridable; defaults preserve the previous hard-coded
    # locations so existing invocations keep working.
    parser.add_argument("--person-dir", default="/Users/kempersc/apps/glam/data/person",
                        help="Directory containing ID_*.json PPID files")
    parser.add_argument("--db-path", default="/Users/kempersc/apps/glam/data/reference/geonames.db",
                        help="Path to GeoNames SQLite database")
    args = parser.parse_args()

    person_dir = Path(args.person_dir)
    db_path = args.db_path

    # Get all PPID files
    ppid_files = list(person_dir.glob("ID_*.json"))
    if args.limit:
        ppid_files = ppid_files[:args.limit]
    print(f"Processing {len(ppid_files)} PPID files (Rule 45 compliant)...")
    if args.dry_run:
        print("DRY RUN - no changes will be written")
    if args.force:
        print("FORCE MODE - re-enriching all files")

    # Aggregate statistics across all files
    total_stats = {
        "processed": 0,
        "birth_decade_inferred": 0,
        "birth_decade_list_valued": 0,  # Decade boundary cases
        "birth_decade_by_method": {
            "earliest_education_heuristic": 0,
            "earliest_experience_heuristic": 0,
            "total_experience_heuristic": 0,
        },
        "birth_settlement_inferred": 0,
        "current_settlement_inferred": 0,
        "ppid_changed": 0,
        "errors": 0,
    }
    for i, filepath in enumerate(ppid_files):
        try:
            stats = enrich_ppid_file(filepath, db_path, dry_run=args.dry_run, force=args.force)
            total_stats["processed"] += 1
            if stats["birth_decade_inferred"]:
                total_stats["birth_decade_inferred"] += 1
                # Track method used
                method = stats.get("birth_decade_method")
                if method and method in total_stats["birth_decade_by_method"]:
                    total_stats["birth_decade_by_method"][method] += 1
                if stats.get("birth_decade_is_list"):
                    total_stats["birth_decade_list_valued"] += 1
            if stats["birth_settlement_inferred"]:
                total_stats["birth_settlement_inferred"] += 1
            if stats["current_settlement_inferred"]:
                total_stats["current_settlement_inferred"] += 1
            if stats["ppid_changed"]:
                total_stats["ppid_changed"] += 1
            if args.verbose and any(v for k, v in stats.items() if k != "birth_decade_method"):
                print(f"  {filepath.name}: {stats}")
            if (i + 1) % 500 == 0:
                print(f"  Processed {i + 1}/{len(ppid_files)}...")
        except Exception as e:
            total_stats["errors"] += 1
            if args.verbose:
                print(f"  ERROR {filepath.name}: {e}")

    # Print summary
    print("\n" + "=" * 60)
    print("ENRICHMENT SUMMARY (Rule 45 Compliant)")
    print("=" * 60)
    print(f"Processed: {total_stats['processed']}")
    print(f"Birth decades inferred: {total_stats['birth_decade_inferred']}")
    print(f"  - List-valued (boundary): {total_stats['birth_decade_list_valued']}")
    print(f"  - By method:")
    for method, count in total_stats["birth_decade_by_method"].items():
        print(f"      {method}: {count}")
    print(f"Birth settlements inferred: {total_stats['birth_settlement_inferred']}")
    print(f"Current settlements inferred: {total_stats['current_settlement_inferred']}")
    print(f"PPIDs updated: {total_stats['ppid_changed']}")
    print(f"Errors: {total_stats['errors']}")

    # Coverage percentages
    if total_stats["processed"] > 0:
        print("\nCoverage:")
        print(f"  Birth decade: {total_stats['birth_decade_inferred'] / total_stats['processed'] * 100:.1f}%")
        if total_stats["birth_decade_inferred"] > 0:
            print(f"    - Boundary cases: {total_stats['birth_decade_list_valued'] / total_stats['birth_decade_inferred'] * 100:.1f}%")
        print(f"  Birth settlement: {total_stats['birth_settlement_inferred'] / total_stats['processed'] * 100:.1f}%")
        print(f"  Current settlement: {total_stats['current_settlement_inferred'] / total_stats['processed'] * 100:.1f}%")
    print("\nNote: All inferred data stored in explicit inferred_* fields with provenance chains.")
    print("Note: Decade boundary cases use EDTF set notation [196X,197X] with primary_value for PPID.")
    print("Note: Total Experience method has highest uncertainty (very_low confidence).")
# Script entry point: run the enrichment CLI when executed directly.
if __name__ == "__main__":
    main()