#!/usr/bin/env python3
"""
PPID Enrichment Script (Rule 45 Compliant)

Enriches PPID files with EXPLICIT inferred data:
1. inferred_birth_decade - From earliest career observations
2. inferred_birth_settlement - From earliest school/university location
3. inferred_current_settlement - From current work location

All inferred data includes full provenance chains per Rule 45:
- Each inference step is documented
- Source observations are linked
- Confidence levels are assigned
- Inferred values NEVER silently replace canonical fields

Reference:
- .opencode/rules/inferred-data-explicit-provenance-rule.md (Rule 45)
- .opencode/rules/ppid-birth-date-enrichment-rule.md (Rule 44)
"""
import json
import os
import re
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple, List, Dict, Any

# GeoNames admin1_code to ISO 3166-2 mapping for Netherlands
NL_ADMIN1_TO_ISO = {
    "01": "DR",  # Drenthe
    "02": "FR",  # Friesland
    "03": "GE",  # Gelderland
    "04": "GR",  # Groningen
    "05": "LI",  # Limburg
    "06": "NB",  # Noord-Brabant
    "07": "NH",  # Noord-Holland
    "09": "UT",  # Utrecht
    "10": "ZE",  # Zeeland
    "11": "ZH",  # Zuid-Holland
    "15": "OV",  # Overijssel
    "16": "FL",  # Flevoland
}

# Common country-specific admin1 mappings
COUNTRY_ADMIN1_MAPPINGS = {
    "NL": NL_ADMIN1_TO_ISO,
}

# Known university location mappings.
#
# NOTE: consumers match these keys as case-insensitive SUBSTRINGS of the
# institution name and stop at the first hit, iterating in insertion order.
# Specific (longer) keys therefore must appear BEFORE short/generic ones.
# BUG FIX: the generic empty-city fallbacks "Hogeschool" and "ROC" used to
# sit before their specific variants, shadowing them; they now live at the
# very end of the dict.
DUTCH_UNI_LOCATIONS = {
    "Universiteit Utrecht": ("Utrecht", "NL"),
    "Utrecht University": ("Utrecht", "NL"),
    "UU": ("Utrecht", "NL"),
    "Universiteit van Amsterdam": ("Amsterdam", "NL"),
    "University of Amsterdam": ("Amsterdam", "NL"),
    "UvA": ("Amsterdam", "NL"),
    "VU Amsterdam": ("Amsterdam", "NL"),
    "Vrije Universiteit": ("Amsterdam", "NL"),
    "Leiden University": ("Leiden", "NL"),
    "Universiteit Leiden": ("Leiden", "NL"),
    "TU Delft": ("Delft", "NL"),
    "Technische Universiteit Delft": ("Delft", "NL"),
    "TU Eindhoven": ("Eindhoven", "NL"),
    "Technische Universiteit Eindhoven": ("Eindhoven", "NL"),
    "Radboud": ("Nijmegen", "NL"),
    "Radboud Universiteit": ("Nijmegen", "NL"),
    "Rijksuniversiteit Groningen": ("Groningen", "NL"),
    "University of Groningen": ("Groningen", "NL"),
    "RUG": ("Groningen", "NL"),
    "Maastricht University": ("Maastricht", "NL"),
    "Universiteit Maastricht": ("Maastricht", "NL"),
    "Erasmus": ("Rotterdam", "NL"),
    "Erasmus Universiteit": ("Rotterdam", "NL"),
    "Erasmus University Rotterdam": ("Rotterdam", "NL"),
    "Tilburg University": ("Tilburg", "NL"),
    "Universiteit Tilburg": ("Tilburg", "NL"),
    "Wageningen": ("Wageningen", "NL"),
    "Wageningen University": ("Wageningen", "NL"),
    # Additional Dutch institutions
    "Hogeschool van Arnhem en Nijmegen": ("Nijmegen", "NL"),
    "HAN": ("Nijmegen", "NL"),
    "Hogeschool Utrecht": ("Utrecht", "NL"),
    "HU": ("Utrecht", "NL"),
    "Hogeschool van Amsterdam": ("Amsterdam", "NL"),
    "HvA": ("Amsterdam", "NL"),
    "Hogeschool Rotterdam": ("Rotterdam", "NL"),
    "Hogeschool Inholland": ("Amsterdam", "NL"),
    "Fontys": ("Eindhoven", "NL"),
    "Fontys Hogescholen": ("Eindhoven", "NL"),
    "Saxion": ("Enschede", "NL"),
    "Saxion Hogeschool": ("Enschede", "NL"),
    "Stenden": ("Leeuwarden", "NL"),
    "NHL Stenden": ("Leeuwarden", "NL"),
    "Hanzehogeschool": ("Groningen", "NL"),
    "Hanze": ("Groningen", "NL"),
    "Christelijke Hogeschool Ede": ("Ede", "NL"),
    "CHE": ("Ede", "NL"),
    "Avans": ("Breda", "NL"),
    "Avans Hogeschool": ("Breda", "NL"),
    "Windesheim": ("Zwolle", "NL"),
    "Hogeschool Windesheim": ("Zwolle", "NL"),
    "Zuyd Hogeschool": ("Maastricht", "NL"),
    "Archiefschool": ("Amsterdam", "NL"),
    "Archiefschool Amsterdam": ("Amsterdam", "NL"),
    "Reinwardt Academie": ("Amsterdam", "NL"),
    "KABK": ("Den Haag", "NL"),
    "Koninklijke Academie van Beeldende Kunsten": ("Den Haag", "NL"),
    "Gerrit Rietveld Academie": ("Amsterdam", "NL"),
    "Design Academy Eindhoven": ("Eindhoven", "NL"),
    "Art & Design College Utrecht": ("Utrecht", "NL"),
    "ArtEZ": ("Arnhem", "NL"),
    "IOPS": ("Amsterdam", "NL"),
    "Interuniversity Graduate School of Psychometrics": ("Amsterdam", "NL"),
    "Sioo": ("Utrecht", "NL"),
    # Additional Dutch universities (expanded mapping)
    "Eindhoven University of Technology": ("Eindhoven", "NL"),
    "Delft University of Technology": ("Delft", "NL"),
    "University of Twente": ("Enschede", "NL"),
    "Universiteit Twente": ("Enschede", "NL"),
    "UT": ("Enschede", "NL"),
    "Open Universiteit": ("Heerlen", "NL"),
    "Open University Netherlands": ("Heerlen", "NL"),
    "Nyenrode": ("Breukelen", "NL"),
    "Nyenrode Business Universiteit": ("Breukelen", "NL"),
    "Theologische Universiteit": ("Kampen", "NL"),
    "Protestant Theological University": ("Amsterdam", "NL"),
    # Additional Hogescholen
    "De Haagse Hogeschool": ("Den Haag", "NL"),
    "The Hague University": ("Den Haag", "NL"),
    "The Hague University of Applied Sciences": ("Den Haag", "NL"),
    "Amsterdamse Hogeschool voor de Kunsten": ("Amsterdam", "NL"),
    "AHK": ("Amsterdam", "NL"),
    "Conservatorium van Amsterdam": ("Amsterdam", "NL"),
    "Hanzehogeschool Groningen": ("Groningen", "NL"),
    "Hogeschool Leiden": ("Leiden", "NL"),
    "Hogeschool Zeeland": ("Vlissingen", "NL"),
    "HZ University of Applied Sciences": ("Vlissingen", "NL"),
    "Hogeschool voor de Kunsten Utrecht": ("Utrecht", "NL"),
    "HKU": ("Utrecht", "NL"),
    "Willem de Kooning Academie": ("Rotterdam", "NL"),
    "Codarts Rotterdam": ("Rotterdam", "NL"),
    "Codarts": ("Rotterdam", "NL"),
    "Design Academy": ("Eindhoven", "NL"),
    "NHTV": ("Breda", "NL"),
    "NHTV Breda University of Applied Sciences": ("Breda", "NL"),
    "Breda University of Applied Sciences": ("Breda", "NL"),
    "NHL Hogeschool": ("Leeuwarden", "NL"),
    "Van Hall Larenstein": ("Velp", "NL"),
    "NCOI": ("Hilversum", "NL"),
    "NCOI Opleidingen": ("Hilversum", "NL"),
    "LOI": ("Leiderdorp", "NL"),
    "LOI Hogeschool": ("Leiderdorp", "NL"),
    "NTI": ("Leiden", "NL"),
    "Hogeschool Arnhem": ("Arnhem", "NL"),
    "Hogeschool Nijmegen": ("Nijmegen", "NL"),
    # Specific ROC locations
    "ROC Leeuwenborgh": ("Maastricht", "NL"),
    "ROC Leiden": ("Leiden", "NL"),
    "ROC Midden Nederland": ("Utrecht", "NL"),
    "ROC MN": ("Utrecht", "NL"),
    "ROC van Amsterdam": ("Amsterdam", "NL"),
    "ROC Amsterdam": ("Amsterdam", "NL"),
    "ROC Flevoland": ("Almere", "NL"),
    "ROC Tilburg": ("Tilburg", "NL"),
    "ROC van Twente": ("Enschede", "NL"),
    "ROC Twente": ("Enschede", "NL"),
    "ROC Nijmegen": ("Nijmegen", "NL"),
    "ROC Mondriaan": ("Den Haag", "NL"),
    "ROC Nova College": ("Haarlem", "NL"),
    "ROC Albeda": ("Rotterdam", "NL"),
    "Albeda College": ("Rotterdam", "NL"),
    "Zadkine": ("Rotterdam", "NL"),
    "Graafschap College": ("Doetinchem", "NL"),
    "Friesland College": ("Leeuwarden", "NL"),
    "Noorderpoort": ("Groningen", "NL"),
    "Alfa-college": ("Groningen", "NL"),
    "Deltion College": ("Zwolle", "NL"),
    "Cibap": ("Zwolle", "NL"),
    "Summa College": ("Eindhoven", "NL"),
    "SintLucas": ("Eindhoven", "NL"),
    "Koning Willem I College": ("Den Bosch", "NL"),
    "Curio": ("Breda", "NL"),
    "Da Vinci College": ("Dordrecht", "NL"),
    # Additional Radboud variations
    "Radboud University Nijmegen": ("Nijmegen", "NL"),
    "Radboud University": ("Nijmegen", "NL"),
    # Additional VU variations
    "Vrije Universiteit Amsterdam": ("Amsterdam", "NL"),
    "VU University Amsterdam": ("Amsterdam", "NL"),
    # Wageningen variations
    "Wageningen University & Research": ("Wageningen", "NL"),
    "WUR": ("Wageningen", "NL"),
    # Belgian institutions
    "KU Leuven": ("Leuven", "BE"),
    "University of Leuven": ("Leuven", "BE"),
    "Katholieke Universiteit Leuven": ("Leuven", "BE"),
    "Vrije Universiteit Brussel": ("Brussel", "BE"),
    "VUB": ("Brussel", "BE"),
    "Universiteit Gent": ("Gent", "BE"),
    "Ghent University": ("Gent", "BE"),
    "UGent": ("Gent", "BE"),
    "Universiteit Antwerpen": ("Antwerpen", "BE"),
    "University of Antwerp": ("Antwerpen", "BE"),
    # German institutions
    "Universität Bremen": ("Bremen", "DE"),
    "University of Bremen": ("Bremen", "DE"),
    "Westfälische Wilhelms-Universität Münster": ("Münster", "DE"),
    "WWU Münster": ("Münster", "DE"),
    "Humboldt-Universität": ("Berlin", "DE"),
    "Freie Universität Berlin": ("Berlin", "DE"),
    "FU Berlin": ("Berlin", "DE"),
    "Universität zu Köln": ("Köln", "DE"),
    "University of Cologne": ("Köln", "DE"),
    "Ruprecht-Karls-Universität Heidelberg": ("Heidelberg", "DE"),
    "Heidelberg University": ("Heidelberg", "DE"),
    "Ludwig-Maximilians-Universität München": ("München", "DE"),
    "LMU München": ("München", "DE"),
    "Technische Universität München": ("München", "DE"),
    "TU München": ("München", "DE"),
    # UK institutions
    "University of Oxford": ("Oxford", "GB"),
    "Oxford University": ("Oxford", "GB"),
    "University of Cambridge": ("Cambridge", "GB"),
    "Cambridge University": ("Cambridge", "GB"),
    "University of York": ("York", "GB"),
    "University College London": ("London", "GB"),
    "UCL": ("London", "GB"),
    "London School of Economics": ("London", "GB"),
    "LSE": ("London", "GB"),
    "King's College London": ("London", "GB"),
    "Imperial College": ("London", "GB"),
    "University of Edinburgh": ("Edinburgh", "GB"),
    "University of Manchester": ("Manchester", "GB"),
    # Australian institutions
    "The Australian National University": ("Canberra", "AU"),
    "Australian National University": ("Canberra", "AU"),
    "ANU": ("Canberra", "AU"),
    "University of Canberra": ("Canberra", "AU"),
    "University of Melbourne": ("Melbourne", "AU"),
    "University of Sydney": ("Sydney", "AU"),
    "Macquarie University": ("Sydney", "AU"),
    "Charles Sturt University": ("Bathurst", "AU"),
    "UNSW": ("Sydney", "AU"),
    "University of New South Wales": ("Sydney", "AU"),
    "University of Queensland": ("Brisbane", "AU"),
    "Monash University": ("Melbourne", "AU"),
    # South African institutions
    "University of Cape Town": ("Cape Town", "ZA"),
    "UCT": ("Cape Town", "ZA"),
    "University of Pretoria": ("Pretoria", "ZA"),
    "University of Witwatersrand": ("Johannesburg", "ZA"),
    "Stellenbosch University": ("Stellenbosch", "ZA"),
    # Italian institutions
    "Politecnico di Milano": ("Milano", "IT"),
    "Università degli Studi di Milano": ("Milano", "IT"),
    "Università di Bologna": ("Bologna", "IT"),
    "University of Bologna": ("Bologna", "IT"),
    # US institutions
    "Oberlin College": ("Oberlin", "US"),
    "Harvard University": ("Cambridge", "US"),
    "Harvard": ("Cambridge", "US"),
    "Yale University": ("New Haven", "US"),
    "Princeton University": ("Princeton", "US"),
    "MIT": ("Cambridge", "US"),
    "Massachusetts Institute of Technology": ("Cambridge", "US"),
    "Stanford University": ("Stanford", "US"),
    "Columbia University": ("New York", "US"),
    "University of California": ("Berkeley", "US"),
    "UCLA": ("Los Angeles", "US"),
    "University of Chicago": ("Chicago", "US"),
    "NYU": ("New York", "US"),
    "New York University": ("New York", "US"),
    # Indonesian institutions
    "Universitas Gadjah Mada": ("Yogyakarta", "ID"),
    "UGM": ("Yogyakarta", "ID"),
    "Universitas Indonesia": ("Jakarta", "ID"),
    "UI": ("Jakarta", "ID"),
    # Turkish institutions
    "Middle East Technical University": ("Ankara", "TR"),
    "METU": ("Ankara", "TR"),
    "Boğaziçi University": ("Istanbul", "TR"),
    # Additional Dutch variations found in data
    "Rotterdam School of Management": ("Rotterdam", "NL"),
    "RSM": ("Rotterdam", "NL"),
    "TIAS School for Business and Society": ("Tilburg", "NL"),
    "TIAS": ("Tilburg", "NL"),
    "GO opleidingen": ("Utrecht", "NL"),
    "Amsterdam University of Applied Sciences": ("Amsterdam", "NL"),
    "University College Utrecht": ("Utrecht", "NL"),
    "UCU": ("Utrecht", "NL"),
    "University of Utrecht": ("Utrecht", "NL"),
    "NSOB": ("Den Haag", "NL"),
    "Nederlandse School voor Openbaar Bestuur": ("Den Haag", "NL"),
    "Grotius Academie": ("Nijmegen", "NL"),
    "de Baak": ("Noordwijk", "NL"),
    "Grafisch Lyceum Rotterdam": ("Rotterdam", "NL"),
    "Schoevers": ("Utrecht", "NL"),
    "Schoevers College": ("Utrecht", "NL"),
    # Generic fallbacks -- MUST remain last (see note above the dict):
    # an empty city means "location must come from elsewhere".
    "Hogeschool": ("", "NL"),  # Generic, location from name
    "ROC": ("", "NL"),  # Regional Training Centers - various locations (fallback)
}


def get_settlement_code(city_name: str) -> str:
    """Generate 3-letter settlement code from city name.

    Single word: first three letters. Dutch article prefix ("de", "het",
    "den", "'s"): article initial + first two letters of the next word.
    Otherwise: initials of the first three words. Always uppercased.
    """
    words = city_name.split()
    dutch_articles = {"de", "het", "den", "'s"}
    if len(words) == 1:
        return city_name[:3].upper()
    elif words[0].lower() in dutch_articles:
        return (words[0][0] + words[1][:2]).upper()
    else:
        return "".join(w[0] for w in words[:3]).upper()
def geocode_location(location_str: str, db_path: str) -> Optional[dict]:
    """
    Geocode a location string to CC-RR-PPP format using GeoNames.

    Args:
        location_str: Free-text location, e.g. "Utrecht, Netherlands".
        db_path: Path to a SQLite database containing a GeoNames-derived
            `cities` table.

    Returns:
        Dict with country/region/settlement codes, the "CC-RR-PPP" string
        and the raw GeoNames row, or None when the string is empty, no
        country could be detected, no city row matched, or the query failed.
    """
    if not location_str:
        return None
    location_str = location_str.strip()

    # Extract country from common patterns
    country_code = None
    country_patterns = {
        "NL": ["(NL)", "Netherlands", "Nederland"],
        "BE": ["(BE)", "Belgium", "België", "Belgique"],
        "DE": ["(DE)", "Germany", "Deutschland"],
        "GB": ["(GB)", "United Kingdom", "UK", "England", "Scotland", "Wales"],
        "AU": ["(AU)", "Australia"],
        "ZA": ["(ZA)", "South Africa"],
        "IT": ["(IT)", "Italy", "Italia"],
        "US": ["(US)", "United States", "USA", "U.S."],
        "ID": ["(ID)", "Indonesia"],
        "TR": ["(TR)", "Turkey", "Türkiye"],
        "FR": ["(FR)", "France"],
        "ES": ["(ES)", "Spain", "España"],
        "AT": ["(AT)", "Austria", "Österreich"],
        "CH": ["(CH)", "Switzerland", "Schweiz", "Suisse"],
        "CA": ["(CA)", "Canada"],
        "NZ": ["(NZ)", "New Zealand"],
        "JP": ["(JP)", "Japan"],
        "CN": ["(CN)", "China"],
        "IN": ["(IN)", "India"],
        "BR": ["(BR)", "Brazil", "Brasil"],
        "SE": ["(SE)", "Sweden", "Sverige"],
        "NO": ["(NO)", "Norway", "Norge"],
        "DK": ["(DK)", "Denmark", "Danmark"],
        "FI": ["(FI)", "Finland", "Suomi"],
        "PL": ["(PL)", "Poland", "Polska"],
        "CZ": ["(CZ)", "Czech Republic", "Czechia", "Česko"],
    }
    for code, patterns in country_patterns.items():
        if any(p in location_str for p in patterns):
            country_code = code
            break

    # Clean location for city lookup: take text before the first comma and
    # strip trailing "Area"/"Region"/"(CC)" qualifiers.
    city_candidate = location_str.split(",")[0].strip()
    city_candidate = re.sub(r"\s*(Area|Region|\([A-Z]{2}\)).*", "", city_candidate).strip()

    if not city_candidate or not country_code:
        return None

    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            # Prefix match on name/ascii_name; biggest populated place wins.
            cursor.execute("""
                SELECT name, ascii_name, admin1_code, admin1_name, country_code,
                       latitude, longitude, geonames_id, population, feature_code
                FROM cities
                WHERE (name LIKE ? OR ascii_name LIKE ?)
                AND country_code = ?
                AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
                ORDER BY population DESC
                LIMIT 1
            """, (f"{city_candidate}%", f"{city_candidate}%", country_code))
            row = cursor.fetchone()
        finally:
            # BUG FIX: close the connection even when the query raises;
            # previously it was only closed on the success path (leak).
            conn.close()

        if not row:
            return None

        name, ascii_name, admin1_code, admin1_name, cc, lat, lon, geonames_id, pop, feature_code = row

        # Map admin1_code to ISO 3166-2 (fall back to the raw code, or "XX")
        region_code = "XX"
        if cc in COUNTRY_ADMIN1_MAPPINGS and admin1_code:
            region_code = COUNTRY_ADMIN1_MAPPINGS[cc].get(admin1_code, "XX")
        elif admin1_code:
            region_code = admin1_code[:2].upper()

        settlement_code = get_settlement_code(ascii_name)

        return {
            "country_code": cc,
            "region_code": region_code,
            "settlement_code": settlement_code,
            "settlement_name": name,
            "formatted": f"{cc}-{region_code}-{settlement_code}",
            "geonames_data": {
                "geonames_id": geonames_id,
                "geonames_name": name,
                "admin1_code": admin1_code,
                "admin1_name": admin1_name,
                "feature_code": feature_code,
                "latitude": lat,
                "longitude": lon,
            },
            "original_query": location_str,
        }
    except Exception as e:
        # Best-effort lookup: report and degrade to "no result".
        print(f" GeoNames error: {e}")
        return None


def parse_date_range(date_range: str) -> Tuple[Optional[int], Optional[int]]:
    """Parse date range string to extract start and end years.

    Finds 4-digit years (1900-2099); the first becomes start_year, the last
    becomes end_year (None when only one year is present, e.g. "2019 - Present").
    """
    if not date_range:
        return None, None
    years = re.findall(r'\b(19\d{2}|20\d{2})\b', date_range)
    if not years:
        return None, None
    start_year = int(years[0])
    end_year = int(years[-1]) if len(years) > 1 else None
    return start_year, end_year
""" # Try combined date fields first for field in ["date_range", "period", "years", "year", "dates"]: if record.get(field): return str(record[field]) # Handle separate start_date/end_date fields start = record.get("start_date", "") or "" end = record.get("end_date", "") or "" if start or end: return f"{start} - {end}".strip(" -") return "" def parse_total_experience_field(total_exp: str) -> Optional[int]: """ Parse total experience field value to extract years. Handles formats like: - "24 years and 8 months" - "37 years" - "5 years 3 months" - "1 year" Returns number of years or None if not parseable. """ if not total_exp: return None # Pattern: find digits followed by "year" or "years" match = re.search(r'(\d+)\s*years?', total_exp.lower()) if match: return int(match.group(1)) return None def build_inference_chain(steps: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Build a numbered inference chain.""" return [{"step": i + 1, **step} for i, step in enumerate(steps)] def is_near_decade_boundary(year: int, threshold: int = 3) -> bool: """ Check if a year is within `threshold` years of a decade boundary. Examples: 1968, threshold=3 → True (within 3 of 1970) 1972, threshold=3 → True (within 3 of 1970) 1975, threshold=3 → False (5 years from both boundaries) """ year_in_decade = year % 10 return year_in_decade >= (10 - threshold) or year_in_decade <= threshold def get_decade_notation(year: int) -> str: """Convert year to EDTF decade notation (e.g., 1968 → 196X).""" decade = (year // 10) * 10 return f"{decade // 10}X" def get_adjacent_decades(year: int) -> Tuple[str, str]: """ Get two adjacent decades for a year near a boundary. 
Examples: 1968 → ("196X", "197X") 1972 → ("196X", "197X") 2001 → ("199X", "200X") """ decade = (year // 10) * 10 year_in_decade = year % 10 if year_in_decade >= 7: # Late in decade (7, 8, 9) → spans to next return (get_decade_notation(year), get_decade_notation(year + 10)) else: # Early in decade (0, 1, 2, 3) → spans to previous return (get_decade_notation(year - 10), get_decade_notation(year)) def parse_total_experience(about_text: str) -> Optional[int]: """ Parse "Total Experience: X years" pattern from about/summary field. Returns number of years or None if not found. """ if not about_text: return None # Pattern: "Total Experience: X years and Y months" or "Total Experience: X year" m = re.search(r'Total Experience:\s*(\d+)\s*years?', about_text, re.IGNORECASE) if m: return int(m.group(1)) return None def infer_birth_decade(profile_data: dict) -> Optional[dict]: """ Infer birth decade from earliest career observations. Returns explicit inferred_birth_decade with full provenance chain. Supports list-valued results for decade boundary cases (Rule 45 extension): - If estimated birth year is within 3 years of decade boundary, returns both adjacent decades as EDTF set notation: [196X,197X] Inference methods (in priority order): 1. Education start year (most reliable - entry age 18-24) 2. Experience start year (first job - entry age ~23) 3. 
Total Experience pattern (fallback - "Total Experience: X years") """ earliest_year = None inference_steps = [] age_offset = 18 age_variance = 3 # ±3 years typical variance in entry age education_record = None experience_record = None total_experience_years = None # Check education first (most reliable) education = profile_data.get("education") or [] for edu in education: if edu is None: continue # Handle multiple date field names: "date_range", "period", "years", "year" date_range = edu.get("date_range") or edu.get("period") or edu.get("years") or edu.get("year") or "" degree = (edu.get("degree") or "").lower() # Handle None # Handle both "institution" and "school" field names institution = edu.get("institution") or edu.get("school") or "" start_year, _ = parse_date_range(date_range) if start_year: # Match bachelor's/master's/doctoral level degrees degree_lower = degree.lower() if any(term in degree_lower for term in [ # English degrees "bachelor", "bsc", "ba", "master", "msc", "ma", "phd", "doctor", "postgraduate", # Dutch degrees "doctoraal", "drs", "drs.", "mr", "mr.", "ing", "ing.", "ir", "ir.", "hbo", "mbo", "pabo", "meao", "heao", # German degrees "magister", "diplom", "staatsexamen", "referendariat", # Italian degrees "laurea", # Generic "degree", "graduate", "undergraduate", "post doc", "postdoc", ]): if earliest_year is None or start_year < earliest_year: earliest_year = start_year # Determine age offset based on degree level if any(term in degree_lower for term in ["master", "msc", "ma", "drs", "drs.", "mr", "mr.", "ir", "ir.", "laurea magistrale", "magister"]): age_offset = 22 # Master's typically starts at 22 elif any(term in degree_lower for term in ["phd", "doctor", "post doc", "postdoc", "postgraduate"]): age_offset = 24 # PhD typically starts at 24 else: age_offset = 18 # Bachelor's/undergraduate education_record = { "institution": institution, "degree": edu.get("degree", ""), "date_range": date_range, } elif any(term in degree_lower for term in 
["hbo", "mbo", "vocational", "associate", "pabo", "meao", "heao"]): if earliest_year is None or start_year < earliest_year: earliest_year = start_year age_offset = 16 education_record = { "institution": institution, "degree": edu.get("degree", ""), "date_range": date_range, } # Also accept education without clear degree type (use conservative estimate) elif earliest_year is None: earliest_year = start_year age_offset = 18 # Assume typical university entry age education_record = { "institution": institution, "degree": edu.get("degree", "") or "(no degree specified)", "date_range": date_range, } # If no education, check earliest job if earliest_year is None: experience = profile_data.get("experience") or [] for exp in experience: if exp is None: continue # Handle multiple date field names (including start_date/end_date) date_range = get_any_date_field(exp) start_year, _ = parse_date_range(date_range) if start_year: if earliest_year is None or start_year < earliest_year: earliest_year = start_year age_offset = 23 age_variance = 5 # Higher variance for first job experience_record = { "company": exp.get("company", ""), "title": exp.get("title", ""), "date_range": date_range, } # If no education or experience dates, try "Total Experience" pattern in about field if earliest_year is None: about = profile_data.get("about") or profile_data.get("summary") or "" total_experience_years = parse_total_experience(about) if total_experience_years and total_experience_years > 0: # Estimate: current year - total_years = first job year # Then: first job year - 23 = birth year (assuming first job at 23) current_year = datetime.now().year estimated_first_job_year = current_year - total_experience_years earliest_year = estimated_first_job_year age_offset = 23 # Assume first job at 23 age_variance = 7 # Very high variance for this method inference_steps.append({ "observation": "Total Experience pattern found in about field", "source_field": "profile_data.about", "source_value": f"Total 
Experience: {total_experience_years} years", }) inference_steps.append({ "calculation": f"{current_year} - {total_experience_years} = {estimated_first_job_year}", "result": f"Estimated first job year: {estimated_first_job_year}", "assumption": "Total experience represents continuous career from first job", }) # If still no date, try standalone total_experience field in profile_data if earliest_year is None: total_exp_field = profile_data.get("total_experience") if total_exp_field: total_experience_years = parse_total_experience_field(total_exp_field) if total_experience_years and total_experience_years > 0: current_year = datetime.now().year estimated_first_job_year = current_year - total_experience_years earliest_year = estimated_first_job_year age_offset = 23 # Assume first job at 23 age_variance = 7 # Very high variance for this method inference_steps.append({ "observation": "total_experience field found in profile_data", "source_field": "profile_data.total_experience", "source_value": total_exp_field, }) inference_steps.append({ "calculation": f"{current_year} - {total_experience_years} = {estimated_first_job_year}", "result": f"Estimated first job year: {estimated_first_job_year}", "assumption": "Total experience represents continuous career from first job", }) if earliest_year is None: return None # Build inference chain (only add steps if not already added from Total Experience path) if education_record: inference_steps.append({ "observation": "Education record found", "source_field": "profile_data.education", "source_value": education_record, }) inference_steps.append({ "extraction": "Start year extracted from date_range", "extracted_value": earliest_year, }) inference_steps.append({ "assumption": f"Education entry age is approximately {age_offset} (±{age_variance} years)", "rationale": "Standard entry age for this education level in Netherlands/Europe", "confidence_impact": f"Assumption introduces uncertainty; actual age may vary ±{age_variance} years", }) 
elif experience_record: inference_steps.append({ "observation": "First job record found (no education data)", "source_field": "profile_data.experience", "source_value": experience_record, }) inference_steps.append({ "extraction": "Start year extracted from date_range", "extracted_value": earliest_year, }) inference_steps.append({ "assumption": f"First job age is approximately {age_offset} (±{age_variance} years)", "rationale": "Assumes first job after typical university completion", "confidence_impact": f"Higher uncertainty; first job age varies ±{age_variance} years", }) elif total_experience_years: # Steps already added in the Total Experience detection block inference_steps.append({ "assumption": f"First job age is approximately {age_offset} (±{age_variance} years)", "rationale": "Assumes first job after typical university completion; Total Experience method has highest uncertainty", "confidence_impact": f"Very high uncertainty; first job age varies ±{age_variance} years, plus Total Experience aggregation may be inaccurate", }) estimated_birth_year = earliest_year - age_offset min_birth_year = earliest_year - age_offset - age_variance max_birth_year = earliest_year - age_offset + age_variance inference_steps.append({ "calculation": f"{earliest_year} - {age_offset} = {estimated_birth_year}", "result": f"Estimated birth year: {estimated_birth_year}", "range": f"{min_birth_year}-{max_birth_year} (accounting for ±{age_variance} year variance)", }) # Check if birth year range spans a decade boundary min_decade = (min_birth_year // 10) * 10 max_decade = (max_birth_year // 10) * 10 spans_decade_boundary = min_decade != max_decade if spans_decade_boundary: # Get decades directly from min/max range (not estimated year) decade1 = get_decade_notation(min_birth_year) decade2 = get_decade_notation(max_birth_year) # Primary is the decade containing the estimated birth year estimated_decade = get_decade_notation(estimated_birth_year) if estimated_decade == decade1: 
primary_value = decade1 primary_rationale = f"{estimated_birth_year} is in {decade1}, but range extends into {decade2}" else: primary_value = decade2 primary_rationale = f"{estimated_birth_year} is in {decade2}, but range extends into {decade1}" inference_steps.append({ "generalization": "Birth year range spans decade boundary", "input_range": [min_birth_year, max_birth_year], "output": [decade1, decade2], "edtf": f"[{decade1},{decade2}]", "rationale": "Cannot determine which decade with certainty; using EDTF 'one of' set notation", }) # Determine method name based on source if education_record: method_name = "earliest_education_heuristic" elif experience_record: method_name = "earliest_experience_heuristic" else: method_name = "total_experience_heuristic" return { "values": [decade1, decade2], "edtf": f"[{decade1},{decade2}]", "edtf_meaning": f"one of: {decade1[:-1]}0s or {decade2[:-1]}0s", "precision": "decade_set", "primary_value": primary_value, "primary_rationale": primary_rationale, "confidence": "very_low", # Lower confidence due to boundary uncertainty "inference_provenance": { "method": method_name, "inference_chain": build_inference_chain(inference_steps), "assumptions": [ f"Entry age for education/first job: {age_offset} years (±{age_variance})", "Career records are complete in LinkedIn profile", ], "boundary_note": f"Birth year estimate {estimated_birth_year} spans decades {decade1}/{decade2}", "inferred_at": datetime.now(timezone.utc).isoformat(), "inferred_by": "enrich_ppids.py", } } else: # Single decade - standard case edtf_decade = get_decade_notation(estimated_birth_year) inference_steps.append({ "generalization": "Convert to EDTF decade notation", "input": estimated_birth_year, "output": edtf_decade, "rationale": "Decade precision appropriate for heuristic-based estimate", }) # Determine method name and confidence based on source if education_record: method_name = "earliest_education_heuristic" confidence = "low" elif experience_record: 
method_name = "earliest_experience_heuristic" confidence = "low" else: method_name = "total_experience_heuristic" confidence = "very_low" # Lowest confidence for Total Experience method return { "value": edtf_decade, "edtf": edtf_decade, "precision": "decade", "confidence": confidence, "inference_provenance": { "method": method_name, "inference_chain": build_inference_chain(inference_steps), "assumptions": [ f"Entry age for education/first job: {age_offset} years (±{age_variance})", "Career records are complete in LinkedIn profile", ], "inferred_at": datetime.now(timezone.utc).isoformat(), "inferred_by": "enrich_ppids.py", } } def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]: """ Infer birth settlement from earliest school/university location. Returns explicit inferred_birth_settlement with full provenance chain. """ inference_steps = [] # Check education first education = profile_data.get("education") or [] edu_with_years = [] for edu in education: if edu is None: continue # Handle multiple date field names: "date_range", "period", "years", "year" date_range = edu.get("date_range") or edu.get("period") or edu.get("years") or edu.get("year") or "" start_year, _ = parse_date_range(date_range) if start_year: edu_with_years.append((start_year, edu)) edu_with_years.sort(key=lambda x: x[0]) for start_year, edu in edu_with_years: # Handle both "institution" and "school" field names institution = edu.get("institution") or edu.get("school") or "" # Look up institution location location = None location_source = None for uni_name, (city, country) in DUTCH_UNI_LOCATIONS.items(): if uni_name.lower() in institution.lower(): # Map country code to country name for geocoding country_names = { "NL": "Netherlands", "BE": "Belgium", "DE": "Germany", "GB": "United Kingdom", "AU": "Australia", "ZA": "South Africa", "IT": "Italy", "US": "United States", "ID": "Indonesia", "TR": "Turkey", } country_name = country_names.get(country, "Netherlands") location = 
f"{city}, {country_name}" if city else None location_source = f"Known institution mapping: {uni_name}" break if not location: continue # Get date_range for provenance (handle multiple field names) edu_date_range = edu.get("date_range") or edu.get("period") or edu.get("years") or edu.get("year") or "" inference_steps.append({ "observation": "Earliest education institution identified", "source_field": f"profile_data.education", "source_value": { "institution": institution, "date_range": edu_date_range, "degree": edu.get("degree") or "", }, }) inference_steps.append({ "lookup": "Institution location mapping", "mapping_source": "DUTCH_UNI_LOCATIONS dictionary", "mapping_key": institution, "mapping_result": location, }) geo = geocode_location(location, db_path) if geo: inference_steps.append({ "geocoding": "GeoNames resolution", "query": location, "result": geo["geonames_data"], }) inference_steps.append({ "formatting": "CC-RR-PPP generation", "components": { "country_code": geo["country_code"], "region_code": geo["region_code"], "settlement_code": geo["settlement_code"], }, "result": geo["formatted"], }) return { "value": geo["settlement_name"], "formatted": geo["formatted"], "country_code": geo["country_code"], "region_code": geo["region_code"], "settlement_code": geo["settlement_code"], "confidence": "low", "inference_provenance": { "method": "earliest_education_location", "inference_chain": build_inference_chain(inference_steps), "assumptions": [ "Student attended school near birth/family residence", "Institution location is representative of early life location", ], "assumption_note": "University location used as proxy for birth settlement; student may have relocated for education", "geonames_data": geo["geonames_data"], "inferred_at": datetime.now(timezone.utc).isoformat(), "inferred_by": "enrich_ppids.py", } } # Fallback: earliest job location experience = profile_data.get("experience") or [] exp_with_years = [] for exp in experience: if exp is None: continue # 
Handle multiple date field names (including start_date/end_date) date_range = get_any_date_field(exp) start_year, _ = parse_date_range(date_range) if start_year and exp.get("location"): exp_with_years.append((start_year, exp)) exp_with_years.sort(key=lambda x: x[0]) for start_year, exp in exp_with_years: location = exp.get("location", "") if not location: continue # Get date_range for provenance (handle multiple field names) exp_date_range = get_any_date_field(exp) inference_steps.append({ "observation": "Earliest job with location found (no education location available)", "source_field": "profile_data.experience", "source_value": { "company": exp.get("company", ""), "title": exp.get("title", ""), "date_range": exp_date_range, "location": location, }, }) geo = geocode_location(location, db_path) if geo: inference_steps.append({ "geocoding": "GeoNames resolution", "query": location, "result": geo["geonames_data"], }) inference_steps.append({ "formatting": "CC-RR-PPP generation", "result": geo["formatted"], }) return { "value": geo["settlement_name"], "formatted": geo["formatted"], "country_code": geo["country_code"], "region_code": geo["region_code"], "settlement_code": geo["settlement_code"], "confidence": "very_low", "inference_provenance": { "method": "earliest_job_location", "inference_chain": build_inference_chain(inference_steps), "assumptions": [ "First job location represents early life region", ], "assumption_note": "Job location is weak proxy for birth location; person likely relocated for work", "geonames_data": geo["geonames_data"], "inferred_at": datetime.now(timezone.utc).isoformat(), "inferred_by": "enrich_ppids.py", } } return None def infer_current_settlement(profile_data: dict, db_path: str) -> Optional[dict]: """ Infer current settlement from profile location or current job. Returns explicit inferred_current_settlement with full provenance chain. 
""" inference_steps = [] # Try profile location first (most reliable) profile_location = profile_data.get("location") if profile_location: inference_steps.append({ "observation": "Profile location field found", "source_field": "profile_data.location", "source_value": profile_location, }) geo = geocode_location(profile_location, db_path) if geo: inference_steps.append({ "geocoding": "GeoNames resolution", "query": profile_location, "result": geo["geonames_data"], }) inference_steps.append({ "formatting": "CC-RR-PPP generation", "result": geo["formatted"], }) return { "value": geo["settlement_name"], "formatted": geo["formatted"], "country_code": geo["country_code"], "region_code": geo["region_code"], "settlement_code": geo["settlement_code"], "confidence": "medium", "inference_provenance": { "method": "profile_location", "inference_chain": build_inference_chain(inference_steps), "assumptions": [ "Profile location is up-to-date", "Profile location represents current residence", ], "geonames_data": geo["geonames_data"], "inferred_at": datetime.now(timezone.utc).isoformat(), "inferred_by": "enrich_ppids.py", } } # Try current job location experience = profile_data.get("experience") or [] for exp in experience: if exp is None: continue # Handle multiple date field names (including start_date/end_date) date_range = get_any_date_field(exp) # Also check "current" field which some profiles have is_current = "Present" in date_range or exp.get("current") is True if is_current: location = exp.get("location") if location: inference_steps.append({ "observation": "Current job with location found", "source_field": "profile_data.experience", "source_value": { "company": exp.get("company", ""), "title": exp.get("title", ""), "location": location, }, }) geo = geocode_location(location, db_path) if geo: inference_steps.append({ "geocoding": "GeoNames resolution", "query": location, "result": geo["geonames_data"], }) inference_steps.append({ "formatting": "CC-RR-PPP generation", 
"result": geo["formatted"], }) return { "value": geo["settlement_name"], "formatted": geo["formatted"], "country_code": geo["country_code"], "region_code": geo["region_code"], "settlement_code": geo["settlement_code"], "confidence": "medium", "inference_provenance": { "method": "current_job_location", "inference_chain": build_inference_chain(inference_steps), "assumptions": [ "Current job location represents residence area", "Person works near where they live", ], "geonames_data": geo["geonames_data"], "inferred_at": datetime.now(timezone.utc).isoformat(), "inferred_by": "enrich_ppids.py", } } return None def regenerate_ppid(components: dict) -> str: """Regenerate PPID string from components.""" return ( f"{components['type']}_" f"{components['first_location']}_{components['first_date']}_" f"{components['last_location']}_{components['last_date']}_" f"{'-'.join(components['name_tokens'])}" ) def enrich_ppid_file(filepath: Path, db_path: str, dry_run: bool = False, force: bool = False) -> dict: """ Enrich a single PPID file with explicit inferred data (Rule 45 compliant). 
Args: filepath: Path to PPID JSON file db_path: Path to GeoNames SQLite database dry_run: Don't write changes force: Re-enrich already-enriched files (clears existing inferred_* fields) """ stats = { "birth_decade_inferred": False, "birth_decade_is_list": False, # Track decade boundary cases "birth_decade_method": None, # Track which method was used "birth_settlement_inferred": False, "current_settlement_inferred": False, "ppid_changed": False, } with open(filepath, "r", encoding="utf-8") as f: data = json.load(f) profile_data = data.get("profile_data", {}) if not profile_data: return stats # If force mode, clear existing inferred fields to re-enrich if force: for field in ["inferred_birth_decade", "inferred_birth_settlement", "inferred_current_settlement"]: if field in data: del data[field] # Reset components that may have been set from inferred data if "_source" in str(data.get("ppid_components", {}).get("first_date_source", "")): data["ppid_components"]["first_date"] = "XXXX" data["ppid_components"].pop("first_date_source", None) data["ppid_components"].pop("first_date_alternatives", None) if "_source" in str(data.get("ppid_components", {}).get("first_location_source", "")): data["ppid_components"]["first_location"] = "XX-XX-XXX" data["ppid_components"].pop("first_location_source", None) if "_source" in str(data.get("ppid_components", {}).get("last_location_source", "")): data["ppid_components"]["last_location"] = "XX-XX-XXX" data["ppid_components"].pop("last_location_source", None) original_ppid = data.get("ppid", "") components = data.get("ppid_components", {}).copy() changed = False # ===== INFER BIRTH DECADE ===== # Only if we don't already have an inferred value AND birth_date is unknown if (data.get("birth_date", {}).get("edtf") == "XXXX" and "inferred_birth_decade" not in data): birth_info = infer_birth_decade(profile_data) if birth_info: # Store as EXPLICIT inferred field (Rule 45) data["inferred_birth_decade"] = birth_info # Handle list-valued (decade 
boundary) vs single value if "values" in birth_info: # List-valued: use primary_value for PPID components["first_date"] = birth_info["primary_value"] components["first_date_source"] = "inferred_birth_decade.primary_value" components["first_date_alternatives"] = [v for v in birth_info["values"] if v != birth_info["primary_value"]] stats["birth_decade_is_list"] = True else: # Single value components["first_date"] = birth_info["edtf"] components["first_date_source"] = "inferred_birth_decade" # Track which method was used stats["birth_decade_method"] = birth_info.get("inference_provenance", {}).get("method", "unknown") # Add note to canonical field pointing to inferred alternative data["birth_date"]["note"] = "See inferred_birth_decade for heuristic estimate" stats["birth_decade_inferred"] = True changed = True # ===== INFER BIRTH SETTLEMENT ===== if (components.get("first_location") == "XX-XX-XXX" and "inferred_birth_settlement" not in data): birth_loc = infer_birth_settlement(profile_data, db_path) if birth_loc: data["inferred_birth_settlement"] = birth_loc components["first_location"] = birth_loc["formatted"] components["first_location_source"] = "inferred_birth_settlement" stats["birth_settlement_inferred"] = True changed = True # ===== INFER CURRENT SETTLEMENT ===== if (components.get("last_location") == "XX-XX-XXX" and "inferred_current_settlement" not in data): current_loc = infer_current_settlement(profile_data, db_path) if current_loc: data["inferred_current_settlement"] = current_loc components["last_location"] = current_loc["formatted"] components["last_location_source"] = "inferred_current_settlement" stats["current_settlement_inferred"] = True changed = True # ===== REGENERATE PPID IF COMPONENTS CHANGED ===== if changed: new_ppid = regenerate_ppid(components) if new_ppid != original_ppid: data["ppid"] = new_ppid data["ppid_components"] = components stats["ppid_changed"] = True # Track PPID history if "ppid_history" not in data: data["ppid_history"] = [] 
data["ppid_history"].append({ "previous_ppid": original_ppid, "new_ppid": new_ppid, "changed_at": datetime.now(timezone.utc).isoformat(), "reason": "observation_based_inference", "inferred_fields": [ k for k in ["inferred_birth_decade", "inferred_birth_settlement", "inferred_current_settlement"] if k in data ], }) else: data["ppid_components"] = components # Update provenance data["provenance"]["modified_at"] = datetime.now(timezone.utc).isoformat() data["provenance"]["modified_by"] = "enrich_ppids.py" if not dry_run: # Write back to file with open(filepath, "w", encoding="utf-8") as f: json.dump(data, f, indent=2, ensure_ascii=False) # Rename file if PPID changed if stats["ppid_changed"]: new_filename = f"{new_ppid}.json" new_filepath = filepath.parent / new_filename if new_filepath != filepath and not new_filepath.exists(): filepath.rename(new_filepath) return stats def main(): import argparse parser = argparse.ArgumentParser(description="Enrich PPID files with explicit inferred data (Rule 45)") parser.add_argument("--dry-run", action="store_true", help="Don't write changes") parser.add_argument("--limit", type=int, help="Process only N files") parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output") parser.add_argument("--force", "-f", action="store_true", help="Re-enrich already-enriched files") args = parser.parse_args() # Paths person_dir = Path("/Users/kempersc/apps/glam/data/person") db_path = "/Users/kempersc/apps/glam/data/reference/geonames.db" # Get all PPID files ppid_files = list(person_dir.glob("ID_*.json")) if args.limit: ppid_files = ppid_files[:args.limit] print(f"Processing {len(ppid_files)} PPID files (Rule 45 compliant)...") if args.dry_run: print("DRY RUN - no changes will be written") if args.force: print("FORCE MODE - re-enriching all files") # Statistics total_stats = { "processed": 0, "birth_decade_inferred": 0, "birth_decade_list_valued": 0, # Decade boundary cases "birth_decade_by_method": { 
"earliest_education_heuristic": 0, "earliest_experience_heuristic": 0, "total_experience_heuristic": 0, }, "birth_settlement_inferred": 0, "current_settlement_inferred": 0, "ppid_changed": 0, "errors": 0, } for i, filepath in enumerate(ppid_files): try: stats = enrich_ppid_file(filepath, db_path, dry_run=args.dry_run, force=args.force) total_stats["processed"] += 1 if stats["birth_decade_inferred"]: total_stats["birth_decade_inferred"] += 1 # Track method used method = stats.get("birth_decade_method") if method and method in total_stats["birth_decade_by_method"]: total_stats["birth_decade_by_method"][method] += 1 if stats.get("birth_decade_is_list"): total_stats["birth_decade_list_valued"] += 1 if stats["birth_settlement_inferred"]: total_stats["birth_settlement_inferred"] += 1 if stats["current_settlement_inferred"]: total_stats["current_settlement_inferred"] += 1 if stats["ppid_changed"]: total_stats["ppid_changed"] += 1 if args.verbose and any(v for k, v in stats.items() if k != "birth_decade_method"): print(f" {filepath.name}: {stats}") if (i + 1) % 500 == 0: print(f" Processed {i + 1}/{len(ppid_files)}...") except Exception as e: total_stats["errors"] += 1 if args.verbose: print(f" ERROR {filepath.name}: {e}") # Print summary print("\n" + "=" * 60) print("ENRICHMENT SUMMARY (Rule 45 Compliant)") print("=" * 60) print(f"Processed: {total_stats['processed']}") print(f"Birth decades inferred: {total_stats['birth_decade_inferred']}") print(f" - List-valued (boundary): {total_stats['birth_decade_list_valued']}") print(f" - By method:") for method, count in total_stats["birth_decade_by_method"].items(): print(f" {method}: {count}") print(f"Birth settlements inferred: {total_stats['birth_settlement_inferred']}") print(f"Current settlements inferred: {total_stats['current_settlement_inferred']}") print(f"PPIDs updated: {total_stats['ppid_changed']}") print(f"Errors: {total_stats['errors']}") # Coverage percentages if total_stats["processed"] > 0: print("\nCoverage:") 
print(f" Birth decade: {total_stats['birth_decade_inferred'] / total_stats['processed'] * 100:.1f}%") if total_stats["birth_decade_inferred"] > 0: print(f" - Boundary cases: {total_stats['birth_decade_list_valued'] / total_stats['birth_decade_inferred'] * 100:.1f}%") print(f" Birth settlement: {total_stats['birth_settlement_inferred'] / total_stats['processed'] * 100:.1f}%") print(f" Current settlement: {total_stats['current_settlement_inferred'] / total_stats['processed'] * 100:.1f}%") print("\nNote: All inferred data stored in explicit inferred_* fields with provenance chains.") print("Note: Decade boundary cases use EDTF set notation [196X,197X] with primary_value for PPID.") print("Note: Total Experience method has highest uncertainty (very_low confidence).") if __name__ == "__main__": main()