- enrich_ppids.py: Add 40+ Dutch universities and hogescholen to location mapping - enrich_ppids_web.py: New script for web-based PPID enrichment - resolve_pending_known_orgs.py: Updates for pending org resolution
1399 lines
58 KiB
Python
1399 lines
58 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
PPID Enrichment Script (Rule 45 Compliant)
|
|
|
|
Enriches PPID files with EXPLICIT inferred data:
|
|
1. inferred_birth_decade - From earliest career observations
|
|
2. inferred_birth_settlement - From earliest school/university location
|
|
3. inferred_current_settlement - From current work location
|
|
|
|
All inferred data includes full provenance chains per Rule 45:
|
|
- Each inference step is documented
|
|
- Source observations are linked
|
|
- Confidence levels are assigned
|
|
- Inferred values NEVER silently replace canonical fields
|
|
|
|
Reference:
|
|
- .opencode/rules/inferred-data-explicit-provenance-rule.md (Rule 45)
|
|
- .opencode/rules/ppid-birth-date-enrichment-rule.md (Rule 44)
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional, Tuple, List, Dict, Any
|
|
|
|
# GeoNames admin1_code to ISO 3166-2 mapping for Netherlands.
# GeoNames assigns its own zero-padded numeric admin1 codes to NL provinces;
# this table converts them to the two-letter ISO 3166-2:NL province codes
# used in the "CC-RR-PPP" location format (see geocode_location).
# NOTE(review): codes "08" and "12"-"14" are intentionally absent —
# presumably they have no current-province counterpart in the GeoNames data
# set; verify against the GeoNames NL admin1 listing.
NL_ADMIN1_TO_ISO = {
    "01": "DR",  # Drenthe
    "02": "FR",  # Friesland
    "03": "GE",  # Gelderland
    "04": "GR",  # Groningen
    "05": "LI",  # Limburg
    "06": "NB",  # Noord-Brabant
    "07": "NH",  # Noord-Holland
    "09": "UT",  # Utrecht
    "10": "ZE",  # Zeeland
    "11": "ZH",  # Zuid-Holland
    "15": "OV",  # Overijssel
    "16": "FL",  # Flevoland
}

# Common country-specific admin1 mappings.
# Keyed by ISO 3166-1 alpha-2 country code. Countries without an entry here
# fall back to truncating the raw GeoNames admin1 code to two characters
# (see geocode_location).
COUNTRY_ADMIN1_MAPPINGS = {
    "NL": NL_ADMIN1_TO_ISO,
}
|
|
|
|
# Known university location mappings: institution name -> (city, ISO country).
# Used to place education entries when the profile carries no explicit
# location. Keys are matched case-insensitively as SUBSTRINGS of the
# institution name (see infer_birth_settlement), with the first matching key
# in dict order winning.
# NOTE(review): short acronym keys such as "UT", "UI", "HU", "CHE", "BA" or
# "MIT" can therefore also match inside unrelated longer names (e.g. "UT"
# inside "Utrecht") — dict insertion order decides which mapping wins;
# verify acronym behavior against real data.
# An empty city string (e.g. "Hogeschool", "ROC") means the country is known
# but the city cannot be derived from the generic name alone.
DUTCH_UNI_LOCATIONS = {
    "Universiteit Utrecht": ("Utrecht", "NL"),
    "Utrecht University": ("Utrecht", "NL"),
    "UU": ("Utrecht", "NL"),
    "Universiteit van Amsterdam": ("Amsterdam", "NL"),
    "University of Amsterdam": ("Amsterdam", "NL"),
    "UvA": ("Amsterdam", "NL"),
    "VU Amsterdam": ("Amsterdam", "NL"),
    "Vrije Universiteit": ("Amsterdam", "NL"),
    "Leiden University": ("Leiden", "NL"),
    "Universiteit Leiden": ("Leiden", "NL"),
    "TU Delft": ("Delft", "NL"),
    "Technische Universiteit Delft": ("Delft", "NL"),
    "TU Eindhoven": ("Eindhoven", "NL"),
    "Technische Universiteit Eindhoven": ("Eindhoven", "NL"),
    "Radboud": ("Nijmegen", "NL"),
    "Radboud Universiteit": ("Nijmegen", "NL"),
    "Rijksuniversiteit Groningen": ("Groningen", "NL"),
    "University of Groningen": ("Groningen", "NL"),
    "RUG": ("Groningen", "NL"),
    "Maastricht University": ("Maastricht", "NL"),
    "Universiteit Maastricht": ("Maastricht", "NL"),
    "Erasmus": ("Rotterdam", "NL"),
    "Erasmus Universiteit": ("Rotterdam", "NL"),
    "Erasmus University Rotterdam": ("Rotterdam", "NL"),
    "Tilburg University": ("Tilburg", "NL"),
    "Universiteit Tilburg": ("Tilburg", "NL"),
    "Wageningen": ("Wageningen", "NL"),
    "Wageningen University": ("Wageningen", "NL"),
    "Hogeschool": ("", "NL"),  # Generic, location from name
    # Additional Dutch institutions
    "Hogeschool van Arnhem en Nijmegen": ("Nijmegen", "NL"),
    "HAN": ("Nijmegen", "NL"),
    "Hogeschool Utrecht": ("Utrecht", "NL"),
    "HU": ("Utrecht", "NL"),
    "Hogeschool van Amsterdam": ("Amsterdam", "NL"),
    "HvA": ("Amsterdam", "NL"),
    "Hogeschool Rotterdam": ("Rotterdam", "NL"),
    "Hogeschool Inholland": ("Amsterdam", "NL"),
    "Fontys": ("Eindhoven", "NL"),
    "Fontys Hogescholen": ("Eindhoven", "NL"),
    "Saxion": ("Enschede", "NL"),
    "Saxion Hogeschool": ("Enschede", "NL"),
    "Stenden": ("Leeuwarden", "NL"),
    "NHL Stenden": ("Leeuwarden", "NL"),
    "Hanzehogeschool": ("Groningen", "NL"),
    "Hanze": ("Groningen", "NL"),
    "Christelijke Hogeschool Ede": ("Ede", "NL"),
    "CHE": ("Ede", "NL"),
    "Avans": ("Breda", "NL"),
    "Avans Hogeschool": ("Breda", "NL"),
    "Windesheim": ("Zwolle", "NL"),
    "Hogeschool Windesheim": ("Zwolle", "NL"),
    "Zuyd Hogeschool": ("Maastricht", "NL"),
    "Archiefschool": ("Amsterdam", "NL"),
    "Archiefschool Amsterdam": ("Amsterdam", "NL"),
    "Reinwardt Academie": ("Amsterdam", "NL"),
    "KABK": ("Den Haag", "NL"),
    "Koninklijke Academie van Beeldende Kunsten": ("Den Haag", "NL"),
    "Gerrit Rietveld Academie": ("Amsterdam", "NL"),
    "Design Academy Eindhoven": ("Eindhoven", "NL"),
    "Art & Design College Utrecht": ("Utrecht", "NL"),
    "ArtEZ": ("Arnhem", "NL"),
    "IOPS": ("Amsterdam", "NL"),
    "Interuniversity Graduate School of Psychometrics": ("Amsterdam", "NL"),
    "Sioo": ("Utrecht", "NL"),
    # Additional Dutch universities (expanded mapping)
    "Eindhoven University of Technology": ("Eindhoven", "NL"),
    "Delft University of Technology": ("Delft", "NL"),
    "University of Twente": ("Enschede", "NL"),
    "Universiteit Twente": ("Enschede", "NL"),
    "UT": ("Enschede", "NL"),
    "Open Universiteit": ("Heerlen", "NL"),
    "Open University Netherlands": ("Heerlen", "NL"),
    "Nyenrode": ("Breukelen", "NL"),
    "Nyenrode Business Universiteit": ("Breukelen", "NL"),
    "Theologische Universiteit": ("Kampen", "NL"),
    "Protestant Theological University": ("Amsterdam", "NL"),
    # Additional Hogescholen
    "De Haagse Hogeschool": ("Den Haag", "NL"),
    "The Hague University": ("Den Haag", "NL"),
    "The Hague University of Applied Sciences": ("Den Haag", "NL"),
    "Amsterdamse Hogeschool voor de Kunsten": ("Amsterdam", "NL"),
    "AHK": ("Amsterdam", "NL"),
    "Conservatorium van Amsterdam": ("Amsterdam", "NL"),
    "Hanzehogeschool Groningen": ("Groningen", "NL"),
    "Hogeschool Leiden": ("Leiden", "NL"),
    "Hogeschool Zeeland": ("Vlissingen", "NL"),
    "HZ University of Applied Sciences": ("Vlissingen", "NL"),
    "Hogeschool voor de Kunsten Utrecht": ("Utrecht", "NL"),
    "HKU": ("Utrecht", "NL"),
    "Willem de Kooning Academie": ("Rotterdam", "NL"),
    "Codarts Rotterdam": ("Rotterdam", "NL"),
    "Codarts": ("Rotterdam", "NL"),
    "Design Academy": ("Eindhoven", "NL"),
    "NHTV": ("Breda", "NL"),
    "NHTV Breda University of Applied Sciences": ("Breda", "NL"),
    "Breda University of Applied Sciences": ("Breda", "NL"),
    "NHL Hogeschool": ("Leeuwarden", "NL"),
    "Van Hall Larenstein": ("Velp", "NL"),
    "NCOI": ("Hilversum", "NL"),
    "NCOI Opleidingen": ("Hilversum", "NL"),
    "LOI": ("Leiderdorp", "NL"),
    "LOI Hogeschool": ("Leiderdorp", "NL"),
    "NTI": ("Leiden", "NL"),
    "Hogeschool Arnhem": ("Arnhem", "NL"),
    "Hogeschool Nijmegen": ("Nijmegen", "NL"),
    "ROC": ("", "NL"),  # Regional Training Centers - various locations (fallback)
    # Specific ROC locations
    "ROC Leeuwenborgh": ("Maastricht", "NL"),
    "ROC Leiden": ("Leiden", "NL"),
    "ROC Midden Nederland": ("Utrecht", "NL"),
    "ROC MN": ("Utrecht", "NL"),
    "ROC van Amsterdam": ("Amsterdam", "NL"),
    "ROC Amsterdam": ("Amsterdam", "NL"),
    "ROC Flevoland": ("Almere", "NL"),
    "ROC Tilburg": ("Tilburg", "NL"),
    "ROC van Twente": ("Enschede", "NL"),
    "ROC Twente": ("Enschede", "NL"),
    "ROC Nijmegen": ("Nijmegen", "NL"),
    "ROC Mondriaan": ("Den Haag", "NL"),
    "ROC Nova College": ("Haarlem", "NL"),
    "ROC Albeda": ("Rotterdam", "NL"),
    "Albeda College": ("Rotterdam", "NL"),
    "Zadkine": ("Rotterdam", "NL"),
    "Graafschap College": ("Doetinchem", "NL"),
    "Friesland College": ("Leeuwarden", "NL"),
    "Noorderpoort": ("Groningen", "NL"),
    "Alfa-college": ("Groningen", "NL"),
    "Deltion College": ("Zwolle", "NL"),
    "Cibap": ("Zwolle", "NL"),
    "Summa College": ("Eindhoven", "NL"),
    "SintLucas": ("Eindhoven", "NL"),
    "Koning Willem I College": ("Den Bosch", "NL"),
    "Curio": ("Breda", "NL"),
    "Da Vinci College": ("Dordrecht", "NL"),
    # Additional Radboud variations
    "Radboud University Nijmegen": ("Nijmegen", "NL"),
    "Radboud University": ("Nijmegen", "NL"),
    # Additional VU variations
    "Vrije Universiteit Amsterdam": ("Amsterdam", "NL"),
    "VU University Amsterdam": ("Amsterdam", "NL"),
    # Wageningen variations
    "Wageningen University & Research": ("Wageningen", "NL"),
    "WUR": ("Wageningen", "NL"),
    # Belgian institutions
    "KU Leuven": ("Leuven", "BE"),
    "University of Leuven": ("Leuven", "BE"),
    "Katholieke Universiteit Leuven": ("Leuven", "BE"),
    "Vrije Universiteit Brussel": ("Brussel", "BE"),
    "VUB": ("Brussel", "BE"),
    "Universiteit Gent": ("Gent", "BE"),
    "Ghent University": ("Gent", "BE"),
    "UGent": ("Gent", "BE"),
    "Universiteit Antwerpen": ("Antwerpen", "BE"),
    "University of Antwerp": ("Antwerpen", "BE"),
    # German institutions
    "Universität Bremen": ("Bremen", "DE"),
    "University of Bremen": ("Bremen", "DE"),
    "Westfälische Wilhelms-Universität Münster": ("Münster", "DE"),
    "WWU Münster": ("Münster", "DE"),
    "Humboldt-Universität": ("Berlin", "DE"),
    "Freie Universität Berlin": ("Berlin", "DE"),
    "FU Berlin": ("Berlin", "DE"),
    "Universität zu Köln": ("Köln", "DE"),
    "University of Cologne": ("Köln", "DE"),
    "Ruprecht-Karls-Universität Heidelberg": ("Heidelberg", "DE"),
    "Heidelberg University": ("Heidelberg", "DE"),
    "Ludwig-Maximilians-Universität München": ("München", "DE"),
    "LMU München": ("München", "DE"),
    "Technische Universität München": ("München", "DE"),
    "TU München": ("München", "DE"),
    # UK institutions
    "University of Oxford": ("Oxford", "GB"),
    "Oxford University": ("Oxford", "GB"),
    "University of Cambridge": ("Cambridge", "GB"),
    "Cambridge University": ("Cambridge", "GB"),
    "University of York": ("York", "GB"),
    "University College London": ("London", "GB"),
    "UCL": ("London", "GB"),
    "London School of Economics": ("London", "GB"),
    "LSE": ("London", "GB"),
    "King's College London": ("London", "GB"),
    "Imperial College": ("London", "GB"),
    "University of Edinburgh": ("Edinburgh", "GB"),
    "University of Manchester": ("Manchester", "GB"),
    # Australian institutions
    "The Australian National University": ("Canberra", "AU"),
    "Australian National University": ("Canberra", "AU"),
    "ANU": ("Canberra", "AU"),
    "University of Canberra": ("Canberra", "AU"),
    "University of Melbourne": ("Melbourne", "AU"),
    "University of Sydney": ("Sydney", "AU"),
    "Macquarie University": ("Sydney", "AU"),
    "Charles Sturt University": ("Bathurst", "AU"),
    "UNSW": ("Sydney", "AU"),
    "University of New South Wales": ("Sydney", "AU"),
    "University of Queensland": ("Brisbane", "AU"),
    "Monash University": ("Melbourne", "AU"),
    # South African institutions
    "University of Cape Town": ("Cape Town", "ZA"),
    "UCT": ("Cape Town", "ZA"),
    "University of Pretoria": ("Pretoria", "ZA"),
    "University of Witwatersrand": ("Johannesburg", "ZA"),
    "Stellenbosch University": ("Stellenbosch", "ZA"),
    # Italian institutions
    "Politecnico di Milano": ("Milano", "IT"),
    "Università degli Studi di Milano": ("Milano", "IT"),
    "Università di Bologna": ("Bologna", "IT"),
    "University of Bologna": ("Bologna", "IT"),
    # US institutions
    "Oberlin College": ("Oberlin", "US"),
    "Harvard University": ("Cambridge", "US"),
    "Harvard": ("Cambridge", "US"),
    "Yale University": ("New Haven", "US"),
    "Princeton University": ("Princeton", "US"),
    "MIT": ("Cambridge", "US"),
    "Massachusetts Institute of Technology": ("Cambridge", "US"),
    "Stanford University": ("Stanford", "US"),
    "Columbia University": ("New York", "US"),
    "University of California": ("Berkeley", "US"),
    "UCLA": ("Los Angeles", "US"),
    "University of Chicago": ("Chicago", "US"),
    "NYU": ("New York", "US"),
    "New York University": ("New York", "US"),
    # Indonesian institutions
    "Universitas Gadjah Mada": ("Yogyakarta", "ID"),
    "UGM": ("Yogyakarta", "ID"),
    "Universitas Indonesia": ("Jakarta", "ID"),
    "UI": ("Jakarta", "ID"),
    # Turkish institutions
    "Middle East Technical University": ("Ankara", "TR"),
    "METU": ("Ankara", "TR"),
    "Boğaziçi University": ("Istanbul", "TR"),
    # Additional Dutch variations found in data
    "Rotterdam School of Management": ("Rotterdam", "NL"),
    "RSM": ("Rotterdam", "NL"),
    "TIAS School for Business and Society": ("Tilburg", "NL"),
    "TIAS": ("Tilburg", "NL"),
    "GO opleidingen": ("Utrecht", "NL"),
    "Amsterdam University of Applied Sciences": ("Amsterdam", "NL"),
    "University College Utrecht": ("Utrecht", "NL"),
    "UCU": ("Utrecht", "NL"),
    "University of Utrecht": ("Utrecht", "NL"),
    "NSOB": ("Den Haag", "NL"),
    "Nederlandse School voor Openbaar Bestuur": ("Den Haag", "NL"),
    "Grotius Academie": ("Nijmegen", "NL"),
    "de Baak": ("Noordwijk", "NL"),
    "Grafisch Lyceum Rotterdam": ("Rotterdam", "NL"),
    "Schoevers": ("Utrecht", "NL"),
    "Schoevers College": ("Utrecht", "NL"),
}
|
|
|
|
|
|
def get_settlement_code(city_name: str) -> str:
    """Generate a short (typically 3-letter) uppercase settlement code.

    Rules:
    - single-word name: first three letters ("Utrecht" -> "UTR")
    - leading Dutch article ("de"/"het"/"den"/"'s"): article initial plus
      first two letters of the next word ("Den Haag" -> "DHA")
    - otherwise: initials of the first three words ("New York" -> "NY")

    Returns "" for an empty or whitespace-only name (the original raised
    IndexError in that case).
    """
    words = city_name.split()
    # BUGFIX: guard against empty/whitespace-only input, which previously
    # crashed with IndexError on words[0].
    if not words:
        return ""

    dutch_articles = {"de", "het", "den", "'s"}

    if len(words) == 1:
        # BUGFIX: slice the split token, not the raw string, so surrounding
        # whitespace cannot leak into the code ("  Ede " -> "EDE", not " ED").
        return words[0][:3].upper()
    if words[0].lower() in dutch_articles:
        return (words[0][0] + words[1][:2]).upper()
    return "".join(w[0] for w in words[:3]).upper()
|
|
|
|
|
|
def geocode_location(location_str: str, db_path: str) -> Optional[dict]:
    """
    Geocode a location string to CC-RR-PPP format using GeoNames.

    Steps:
    1. Detect the country from well-known substrings ("(NL)", "Netherlands", ...).
    2. Take the text before the first comma as the city candidate, stripping
       "Area"/"Region" suffixes and "(XX)" country tags.
    3. Resolve the candidate against the `cities` table of the SQLite GeoNames
       extract at *db_path* (prefix match, populated places only, most
       populous match wins).

    Returns a dict with country/region/settlement codes, the formatted
    "CC-RR-PPP" string and raw GeoNames fields, or None when the input is
    empty, no country was detected, no city matched, or a database error
    occurred (errors are printed, not raised).
    """
    if not location_str:
        return None

    location_str = location_str.strip()

    # Extract country from common patterns
    country_code = None
    country_patterns = {
        "NL": ["(NL)", "Netherlands", "Nederland"],
        "BE": ["(BE)", "Belgium", "België", "Belgique"],
        "DE": ["(DE)", "Germany", "Deutschland"],
        "GB": ["(GB)", "United Kingdom", "UK", "England", "Scotland", "Wales"],
        "AU": ["(AU)", "Australia"],
        "ZA": ["(ZA)", "South Africa"],
        "IT": ["(IT)", "Italy", "Italia"],
        "US": ["(US)", "United States", "USA", "U.S."],
        "ID": ["(ID)", "Indonesia"],
        "TR": ["(TR)", "Turkey", "Türkiye"],
        "FR": ["(FR)", "France"],
        "ES": ["(ES)", "Spain", "España"],
        "AT": ["(AT)", "Austria", "Österreich"],
        "CH": ["(CH)", "Switzerland", "Schweiz", "Suisse"],
        "CA": ["(CA)", "Canada"],
        "NZ": ["(NZ)", "New Zealand"],
        "JP": ["(JP)", "Japan"],
        "CN": ["(CN)", "China"],
        "IN": ["(IN)", "India"],
        "BR": ["(BR)", "Brazil", "Brasil"],
        "SE": ["(SE)", "Sweden", "Sverige"],
        "NO": ["(NO)", "Norway", "Norge"],
        "DK": ["(DK)", "Denmark", "Danmark"],
        "FI": ["(FI)", "Finland", "Suomi"],
        "PL": ["(PL)", "Poland", "Polska"],
        "CZ": ["(CZ)", "Czech Republic", "Czechia", "Česko"],
    }

    for code, patterns in country_patterns.items():
        if any(p in location_str for p in patterns):
            country_code = code
            break

    # Clean location for city lookup
    city_candidate = location_str.split(",")[0].strip()
    city_candidate = re.sub(r"\s*(Area|Region|\([A-Z]{2}\)).*", "", city_candidate).strip()

    if not city_candidate or not country_code:
        return None

    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Prefix match on both the native and ASCII name; prefer the most
        # populous populated place (PPL* feature codes only).
        cursor.execute("""
            SELECT name, ascii_name, admin1_code, admin1_name, country_code,
                   latitude, longitude, geonames_id, population, feature_code
            FROM cities
            WHERE (name LIKE ? OR ascii_name LIKE ?)
              AND country_code = ?
              AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
            ORDER BY population DESC
            LIMIT 1
        """, (f"{city_candidate}%", f"{city_candidate}%", country_code))

        row = cursor.fetchone()

        if not row:
            return None

        name, ascii_name, admin1_code, admin1_name, cc, lat, lon, geonames_id, pop, feature_code = row

        # Map admin1_code to ISO 3166-2; countries without an explicit table
        # fall back to truncating the raw GeoNames admin1 code.
        region_code = "XX"
        if cc in COUNTRY_ADMIN1_MAPPINGS and admin1_code:
            region_code = COUNTRY_ADMIN1_MAPPINGS[cc].get(admin1_code, "XX")
        elif admin1_code:
            region_code = admin1_code[:2].upper()

        settlement_code = get_settlement_code(ascii_name)

        return {
            "country_code": cc,
            "region_code": region_code,
            "settlement_code": settlement_code,
            "settlement_name": name,
            "formatted": f"{cc}-{region_code}-{settlement_code}",
            "geonames_data": {
                "geonames_id": geonames_id,
                "geonames_name": name,
                "admin1_code": admin1_code,
                "admin1_name": admin1_name,
                "feature_code": feature_code,
                "latitude": lat,
                "longitude": lon,
            },
            "original_query": location_str,
        }
    except Exception as e:
        print(f" GeoNames error: {e}")
        return None
    finally:
        # BUGFIX: the original closed the connection only on the success
        # path, leaking it whenever the query raised. Always close here.
        if conn is not None:
            conn.close()
|
|
|
|
|
|
def parse_date_range(date_range: str) -> Tuple[Optional[int], Optional[int]]:
    """Extract (start_year, end_year) from a free-form date-range string.

    Recognizes four-digit years in 1900-2099. Returns (None, None) when no
    year is present; end_year is None when only a single year is found.
    """
    if not date_range:
        return None, None

    matched_years = re.findall(r'\b(19\d{2}|20\d{2})\b', date_range)
    if not matched_years:
        return None, None

    first_year = int(matched_years[0])
    last_year = int(matched_years[-1]) if len(matched_years) > 1 else None
    return first_year, last_year
|
|
|
|
|
|
def get_any_date_field(record: dict) -> str:
    """
    Return the best-available date string from a record.

    LinkedIn profile records spell their date field several ways; the
    combined fields are checked in priority order:
    - date_range: "2019 - Present" (most common, 2,486 entries)
    - period: "2015 - 2019" (15 entries)
    - years/year: "2010" (single year)
    - dates: "2018 - 2020" (12 entries)
    Failing those, separate start_date/end_date fields (149 entries) are
    joined as "start - end".

    Returns "" when no date information exists; the result is suitable for
    parse_date_range().
    """
    for field_name in ("date_range", "period", "years", "year", "dates"):
        value = record.get(field_name)
        if value:
            return str(value)

    start_part = record.get("start_date") or ""
    end_part = record.get("end_date") or ""
    if start_part or end_part:
        # Strip dangling separators when one side is missing.
        return f"{start_part} - {end_part}".strip(" -")

    return ""
|
|
|
|
|
|
def parse_total_experience_field(total_exp: str) -> Optional[int]:
    """
    Extract the whole-year count from a "total experience" string.

    Accepts values such as "24 years and 8 months", "37 years",
    "5 years 3 months" or "1 year"; the months part is ignored.

    Returns the number of years, or None for empty input or when no
    "<N> year(s)" token is present.
    """
    if not total_exp:
        return None

    year_match = re.search(r'(\d+)\s*years?', total_exp.lower())
    return int(year_match.group(1)) if year_match else None
|
|
|
|
|
|
def build_inference_chain(steps: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Number each inference step by adding a 1-based "step" key."""
    numbered = []
    for index, step in enumerate(steps, start=1):
        numbered.append({"step": index, **step})
    return numbered
|
|
|
|
|
|
def is_near_decade_boundary(year: int, threshold: int = 3) -> bool:
    """
    Return True when *year* is within *threshold* years of a decade boundary.

    Examples (threshold=3):
        1968 -> True  (within 3 of 1970)
        1972 -> True  (within 3 of 1970)
        1975 -> False (5 years from both boundaries)
    """
    offset = year % 10
    # Near the lower boundary (...0/1/2/3) or the upper one (...7/8/9).
    return offset <= threshold or offset >= 10 - threshold
|
|
|
|
|
|
def get_decade_notation(year: int) -> str:
    """Convert year to EDTF decade notation (e.g. 1968 -> "196X")."""
    # floor(year / 10) is the decade prefix; equivalent to (year // 10 * 10) // 10.
    return f"{year // 10}X"
|
|
|
|
|
|
def get_adjacent_decades(year: int) -> Tuple[str, str]:
    """
    Get two adjacent EDTF decades for a year near a decade boundary.

    For years late in a decade (ending 7-9) the span is (current, next);
    for all other years (ending 0-6) it is (previous, current). The caller
    is expected to only pass years already flagged by
    is_near_decade_boundary; mid-decade years simply fall in the second
    branch.

    Examples:
        1968 -> ("196X", "197X")
        1972 -> ("196X", "197X")
        2001 -> ("199X", "200X")
    """
    # f"{y // 10}X" is the EDTF decade of y, same formula as
    # get_decade_notation. (The original also computed an unused `decade`
    # local and its comment wrongly claimed the else branch covered only
    # years ending 0-3; it covers 0-6.)
    if year % 10 >= 7:
        # Late in decade (7, 8, 9): span into the next decade.
        return (f"{year // 10}X", f"{(year + 10) // 10}X")
    # Ending 0-6: span back to the previous decade.
    return (f"{(year - 10) // 10}X", f"{year // 10}X")
|
|
|
|
|
|
def parse_total_experience(about_text: str) -> Optional[int]:
    """
    Find a "Total Experience: X years" statement in an about/summary text.

    Matching is case-insensitive; any trailing "and Y months" part is
    ignored. Returns the year count, or None when the pattern is absent.
    """
    if not about_text:
        return None

    found = re.search(r'Total Experience:\s*(\d+)\s*years?', about_text, re.IGNORECASE)
    return int(found.group(1)) if found else None
|
|
|
|
|
|
def infer_birth_decade(profile_data: dict) -> Optional[dict]:
    """
    Infer birth decade from earliest career observations.
    Returns explicit inferred_birth_decade with full provenance chain.

    Supports list-valued results for decade boundary cases (Rule 45 extension):
    - If estimated birth year is within 3 years of decade boundary, returns
      both adjacent decades as EDTF set notation: [196X,197X]

    Inference methods (in priority order):
    1. Education start year (most reliable - entry age 18-24)
    2. Experience start year (first job - entry age ~23)
    3. Total Experience pattern (fallback - "Total Experience: X years")

    Returns:
        A dict with either "value" (single decade) or "values"/"primary_value"
        (decade set, when the estimate's +/-variance range crosses a decade
        boundary), plus "edtf", "precision", "confidence" and a full
        "inference_provenance" block; or None when no usable date signal
        exists in the profile.
    """
    earliest_year = None            # lowest start year found so far
    inference_steps = []            # provenance steps, numbered at the end
    age_offset = 18                 # assumed age at earliest_year
    age_variance = 3  # ±3 years typical variance in entry age
    education_record = None         # source education entry (for provenance)
    experience_record = None        # source experience entry (for provenance)
    total_experience_years = None   # set only by the Total Experience fallbacks

    # Check education first (most reliable)
    education = profile_data.get("education") or []
    for edu in education:
        if edu is None:
            continue
        # Handle multiple date field names: "date_range", "period", "years", "year"
        date_range = edu.get("date_range") or edu.get("period") or edu.get("years") or edu.get("year") or ""
        degree = (edu.get("degree") or "").lower()  # Handle None
        # Handle both "institution" and "school" field names
        institution = edu.get("institution") or edu.get("school") or ""

        start_year, _ = parse_date_range(date_range)

        if start_year:
            # Match bachelor's/master's/doctoral level degrees.
            # NOTE(review): `degree` is already lower-cased above, so this
            # .lower() is redundant (harmless).
            degree_lower = degree.lower()
            # NOTE(review): substring matching means short terms such as
            # "ba"/"ma" also hit inside longer words (e.g. "mba") — confirm
            # this is acceptable for the data being processed.
            if any(term in degree_lower for term in [
                # English degrees
                "bachelor", "bsc", "ba", "master", "msc", "ma", "phd", "doctor", "postgraduate",
                # Dutch degrees
                "doctoraal", "drs", "drs.", "mr", "mr.", "ing", "ing.", "ir", "ir.",
                "hbo", "mbo", "pabo", "meao", "heao",
                # German degrees
                "magister", "diplom", "staatsexamen", "referendariat",
                # Italian degrees
                "laurea",
                # Generic
                "degree", "graduate", "undergraduate", "post doc", "postdoc",
            ]):
                if earliest_year is None or start_year < earliest_year:
                    earliest_year = start_year
                    # Determine age offset based on degree level
                    if any(term in degree_lower for term in ["master", "msc", "ma", "drs", "drs.", "mr", "mr.", "ir", "ir.", "laurea magistrale", "magister"]):
                        age_offset = 22  # Master's typically starts at 22
                    elif any(term in degree_lower for term in ["phd", "doctor", "post doc", "postdoc", "postgraduate"]):
                        age_offset = 24  # PhD typically starts at 24
                    else:
                        age_offset = 18  # Bachelor's/undergraduate
                    education_record = {
                        "institution": institution,
                        "degree": edu.get("degree", ""),
                        "date_range": date_range,
                    }
            # NOTE(review): "hbo"/"mbo"/"pabo"/"meao"/"heao" already appear
            # in the first term list above, so for those degrees this branch
            # is unreachable; only "vocational"/"associate" can reach it.
            elif any(term in degree_lower for term in ["hbo", "mbo", "vocational", "associate", "pabo", "meao", "heao"]):
                if earliest_year is None or start_year < earliest_year:
                    earliest_year = start_year
                    age_offset = 16
                    education_record = {
                        "institution": institution,
                        "degree": edu.get("degree", ""),
                        "date_range": date_range,
                    }
            # Also accept education without clear degree type (use conservative estimate)
            elif earliest_year is None:
                earliest_year = start_year
                age_offset = 18  # Assume typical university entry age
                education_record = {
                    "institution": institution,
                    "degree": edu.get("degree", "") or "(no degree specified)",
                    "date_range": date_range,
                }

    # If no education, check earliest job
    if earliest_year is None:
        experience = profile_data.get("experience") or []
        for exp in experience:
            if exp is None:
                continue
            # Handle multiple date field names (including start_date/end_date)
            date_range = get_any_date_field(exp)
            start_year, _ = parse_date_range(date_range)

            if start_year:
                if earliest_year is None or start_year < earliest_year:
                    earliest_year = start_year
                    age_offset = 23
                    age_variance = 5  # Higher variance for first job
                    experience_record = {
                        "company": exp.get("company", ""),
                        "title": exp.get("title", ""),
                        "date_range": date_range,
                    }

    # If no education or experience dates, try "Total Experience" pattern in about field
    if earliest_year is None:
        about = profile_data.get("about") or profile_data.get("summary") or ""
        total_experience_years = parse_total_experience(about)

        if total_experience_years and total_experience_years > 0:
            # Estimate: current year - total_years = first job year
            # Then: first job year - 23 = birth year (assuming first job at 23)
            current_year = datetime.now().year
            estimated_first_job_year = current_year - total_experience_years
            earliest_year = estimated_first_job_year
            age_offset = 23  # Assume first job at 23
            age_variance = 7  # Very high variance for this method

            # Provenance steps are appended here (not in the shared block
            # below) because this path has its own observation/calculation.
            inference_steps.append({
                "observation": "Total Experience pattern found in about field",
                "source_field": "profile_data.about",
                "source_value": f"Total Experience: {total_experience_years} years",
            })
            inference_steps.append({
                "calculation": f"{current_year} - {total_experience_years} = {estimated_first_job_year}",
                "result": f"Estimated first job year: {estimated_first_job_year}",
                "assumption": "Total experience represents continuous career from first job",
            })

    # If still no date, try standalone total_experience field in profile_data
    if earliest_year is None:
        total_exp_field = profile_data.get("total_experience")
        if total_exp_field:
            total_experience_years = parse_total_experience_field(total_exp_field)

            if total_experience_years and total_experience_years > 0:
                current_year = datetime.now().year
                estimated_first_job_year = current_year - total_experience_years
                earliest_year = estimated_first_job_year
                age_offset = 23  # Assume first job at 23
                age_variance = 7  # Very high variance for this method

                inference_steps.append({
                    "observation": "total_experience field found in profile_data",
                    "source_field": "profile_data.total_experience",
                    "source_value": total_exp_field,
                })
                inference_steps.append({
                    "calculation": f"{current_year} - {total_experience_years} = {estimated_first_job_year}",
                    "result": f"Estimated first job year: {estimated_first_job_year}",
                    "assumption": "Total experience represents continuous career from first job",
                })

    # No usable signal at all: explicit None, caller skips enrichment.
    if earliest_year is None:
        return None

    # Build inference chain (only add steps if not already added from Total Experience path)
    if education_record:
        inference_steps.append({
            "observation": "Education record found",
            "source_field": "profile_data.education",
            "source_value": education_record,
        })
        inference_steps.append({
            "extraction": "Start year extracted from date_range",
            "extracted_value": earliest_year,
        })
        inference_steps.append({
            "assumption": f"Education entry age is approximately {age_offset} (±{age_variance} years)",
            "rationale": "Standard entry age for this education level in Netherlands/Europe",
            "confidence_impact": f"Assumption introduces uncertainty; actual age may vary ±{age_variance} years",
        })
    elif experience_record:
        inference_steps.append({
            "observation": "First job record found (no education data)",
            "source_field": "profile_data.experience",
            "source_value": experience_record,
        })
        inference_steps.append({
            "extraction": "Start year extracted from date_range",
            "extracted_value": earliest_year,
        })
        inference_steps.append({
            "assumption": f"First job age is approximately {age_offset} (±{age_variance} years)",
            "rationale": "Assumes first job after typical university completion",
            "confidence_impact": f"Higher uncertainty; first job age varies ±{age_variance} years",
        })
    elif total_experience_years:
        # Steps already added in the Total Experience detection block
        inference_steps.append({
            "assumption": f"First job age is approximately {age_offset} (±{age_variance} years)",
            "rationale": "Assumes first job after typical university completion; Total Experience method has highest uncertainty",
            "confidence_impact": f"Very high uncertainty; first job age varies ±{age_variance} years, plus Total Experience aggregation may be inaccurate",
        })

    estimated_birth_year = earliest_year - age_offset
    min_birth_year = earliest_year - age_offset - age_variance
    max_birth_year = earliest_year - age_offset + age_variance

    inference_steps.append({
        "calculation": f"{earliest_year} - {age_offset} = {estimated_birth_year}",
        "result": f"Estimated birth year: {estimated_birth_year}",
        "range": f"{min_birth_year}-{max_birth_year} (accounting for ±{age_variance} year variance)",
    })

    # Check if birth year range spans a decade boundary
    min_decade = (min_birth_year // 10) * 10
    max_decade = (max_birth_year // 10) * 10
    spans_decade_boundary = min_decade != max_decade

    if spans_decade_boundary:
        # Get decades directly from min/max range (not estimated year)
        # NOTE(review): with age_variance up to 7 the 14-year range can span
        # three decades, yet only the min and max decades are reported —
        # confirm dropping a middle decade is acceptable.
        decade1 = get_decade_notation(min_birth_year)
        decade2 = get_decade_notation(max_birth_year)

        # Primary is the decade containing the estimated birth year
        estimated_decade = get_decade_notation(estimated_birth_year)
        if estimated_decade == decade1:
            primary_value = decade1
            primary_rationale = f"{estimated_birth_year} is in {decade1}, but range extends into {decade2}"
        else:
            primary_value = decade2
            primary_rationale = f"{estimated_birth_year} is in {decade2}, but range extends into {decade1}"

        inference_steps.append({
            "generalization": "Birth year range spans decade boundary",
            "input_range": [min_birth_year, max_birth_year],
            "output": [decade1, decade2],
            "edtf": f"[{decade1},{decade2}]",
            "rationale": "Cannot determine which decade with certainty; using EDTF 'one of' set notation",
        })

        # Determine method name based on source
        if education_record:
            method_name = "earliest_education_heuristic"
        elif experience_record:
            method_name = "earliest_experience_heuristic"
        else:
            method_name = "total_experience_heuristic"

        # Boundary case: list-valued result ("values"), no single "value" key.
        return {
            "values": [decade1, decade2],
            "edtf": f"[{decade1},{decade2}]",
            "edtf_meaning": f"one of: {decade1[:-1]}0s or {decade2[:-1]}0s",
            "precision": "decade_set",
            "primary_value": primary_value,
            "primary_rationale": primary_rationale,
            "confidence": "very_low",  # Lower confidence due to boundary uncertainty
            "inference_provenance": {
                "method": method_name,
                "inference_chain": build_inference_chain(inference_steps),
                "assumptions": [
                    f"Entry age for education/first job: {age_offset} years (±{age_variance})",
                    "Career records are complete in LinkedIn profile",
                ],
                "boundary_note": f"Birth year estimate {estimated_birth_year} spans decades {decade1}/{decade2}",
                "inferred_at": datetime.now(timezone.utc).isoformat(),
                "inferred_by": "enrich_ppids.py",
            }
        }
    else:
        # Single decade - standard case
        edtf_decade = get_decade_notation(estimated_birth_year)

        inference_steps.append({
            "generalization": "Convert to EDTF decade notation",
            "input": estimated_birth_year,
            "output": edtf_decade,
            "rationale": "Decade precision appropriate for heuristic-based estimate",
        })

        # Determine method name and confidence based on source
        if education_record:
            method_name = "earliest_education_heuristic"
            confidence = "low"
        elif experience_record:
            method_name = "earliest_experience_heuristic"
            confidence = "low"
        else:
            method_name = "total_experience_heuristic"
            confidence = "very_low"  # Lowest confidence for Total Experience method

        return {
            "value": edtf_decade,
            "edtf": edtf_decade,
            "precision": "decade",
            "confidence": confidence,
            "inference_provenance": {
                "method": method_name,
                "inference_chain": build_inference_chain(inference_steps),
                "assumptions": [
                    f"Entry age for education/first job: {age_offset} years (±{age_variance})",
                    "Career records are complete in LinkedIn profile",
                ],
                "inferred_at": datetime.now(timezone.utc).isoformat(),
                "inferred_by": "enrich_ppids.py",
            }
        }
|
|
|
|
|
|
def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]:
    """
    Infer birth settlement from the earliest school/university location.

    Primary signal is the earliest education entry whose institution matches
    the known-institution mapping; fallback is the earliest job that has a
    location. Returns an explicit inferred_birth_settlement dict with a full
    provenance chain (Rule 45), or None when nothing can be inferred.

    Args:
        profile_data: Profile dict with optional "education" and "experience" lists.
        db_path: Path to the GeoNames SQLite database used by geocode_location().
    """
    # ISO country code -> country name used to build geocodable query strings.
    # FIX: hoisted out of the institution loop (was rebuilt on every iteration).
    country_names = {
        "NL": "Netherlands",
        "BE": "Belgium",
        "DE": "Germany",
        "GB": "United Kingdom",
        "AU": "Australia",
        "ZA": "South Africa",
        "IT": "Italy",
        "US": "United States",
        "ID": "Indonesia",
        "TR": "Turkey",
    }

    # ----- Primary signal: earliest education institution -----
    education = profile_data.get("education") or []

    edu_with_years = []
    for edu in education:
        if edu is None:
            continue
        # Handle multiple date field names: "date_range", "period", "years", "year"
        date_range = edu.get("date_range") or edu.get("period") or edu.get("years") or edu.get("year") or ""
        start_year, _ = parse_date_range(date_range)
        if start_year:
            edu_with_years.append((start_year, edu))

    edu_with_years.sort(key=lambda x: x[0])

    for start_year, edu in edu_with_years:
        # Handle both "institution" and "school" field names
        institution = edu.get("institution") or edu.get("school") or ""

        # Look up institution location against the known-institution mapping
        location = None
        for uni_name, (city, country) in DUTCH_UNI_LOCATIONS.items():
            if uni_name.lower() in institution.lower():
                country_name = country_names.get(country, "Netherlands")
                location = f"{city}, {country_name}" if city else None
                break

        if not location:
            continue

        geo = geocode_location(location, db_path)
        if not geo:
            # FIX: try the next candidate without polluting the provenance
            # chain — steps are now built only for the successful candidate.
            continue

        # Get date_range for provenance (handle multiple field names)
        edu_date_range = edu.get("date_range") or edu.get("period") or edu.get("years") or edu.get("year") or ""

        inference_steps = [
            {
                "observation": "Earliest education institution identified",
                "source_field": "profile_data.education",
                "source_value": {
                    "institution": institution,
                    "date_range": edu_date_range,
                    "degree": edu.get("degree") or "",
                },
            },
            {
                "lookup": "Institution location mapping",
                "mapping_source": "DUTCH_UNI_LOCATIONS dictionary",
                "mapping_key": institution,
                "mapping_result": location,
            },
            {
                "geocoding": "GeoNames resolution",
                "query": location,
                "result": geo["geonames_data"],
            },
            {
                "formatting": "CC-RR-PPP generation",
                "components": {
                    "country_code": geo["country_code"],
                    "region_code": geo["region_code"],
                    "settlement_code": geo["settlement_code"],
                },
                "result": geo["formatted"],
            },
        ]

        return {
            "value": geo["settlement_name"],
            "formatted": geo["formatted"],
            "country_code": geo["country_code"],
            "region_code": geo["region_code"],
            "settlement_code": geo["settlement_code"],
            "confidence": "low",
            "inference_provenance": {
                "method": "earliest_education_location",
                "inference_chain": build_inference_chain(inference_steps),
                "assumptions": [
                    "Student attended school near birth/family residence",
                    "Institution location is representative of early life location",
                ],
                "assumption_note": "University location used as proxy for birth settlement; student may have relocated for education",
                "geonames_data": geo["geonames_data"],
                "inferred_at": datetime.now(timezone.utc).isoformat(),
                "inferred_by": "enrich_ppids.py",
            },
        }

    # ----- Fallback signal: earliest job location -----
    experience = profile_data.get("experience") or []

    exp_with_years = []
    for exp in experience:
        if exp is None:
            continue
        # Handle multiple date field names (including start_date/end_date)
        date_range = get_any_date_field(exp)
        start_year, _ = parse_date_range(date_range)
        if start_year and exp.get("location"):
            exp_with_years.append((start_year, exp))

    exp_with_years.sort(key=lambda x: x[0])

    for start_year, exp in exp_with_years:
        location = exp.get("location", "")
        if not location:
            continue

        geo = geocode_location(location, db_path)
        if not geo:
            # Same fix as above: keep the chain clean across failed attempts.
            continue

        # Get date_range for provenance (handle multiple field names)
        exp_date_range = get_any_date_field(exp)

        inference_steps = [
            {
                "observation": "Earliest job with location found (no education location available)",
                "source_field": "profile_data.experience",
                "source_value": {
                    "company": exp.get("company", ""),
                    "title": exp.get("title", ""),
                    "date_range": exp_date_range,
                    "location": location,
                },
            },
            {
                "geocoding": "GeoNames resolution",
                "query": location,
                "result": geo["geonames_data"],
            },
            {
                "formatting": "CC-RR-PPP generation",
                "result": geo["formatted"],
            },
        ]

        return {
            "value": geo["settlement_name"],
            "formatted": geo["formatted"],
            "country_code": geo["country_code"],
            "region_code": geo["region_code"],
            "settlement_code": geo["settlement_code"],
            "confidence": "very_low",
            "inference_provenance": {
                "method": "earliest_job_location",
                "inference_chain": build_inference_chain(inference_steps),
                "assumptions": [
                    "First job location represents early life region",
                ],
                "assumption_note": "Job location is weak proxy for birth location; person likely relocated for work",
                "geonames_data": geo["geonames_data"],
                "inferred_at": datetime.now(timezone.utc).isoformat(),
                "inferred_by": "enrich_ppids.py",
            },
        }

    return None
|
|
|
|
|
|
def infer_current_settlement(profile_data: dict, db_path: str) -> Optional[dict]:
    """
    Infer current settlement from profile location or current job.

    Returns explicit inferred_current_settlement with full provenance chain,
    or None when neither a profile location nor a geocodable current job
    location is available.
    """
    steps = []

    def finish(geo: dict, query: str, method: str, assumptions: list) -> dict:
        # Shared tail for both signals: record the geocoding and formatting
        # steps, then assemble the Rule 45 payload.
        steps.append({
            "geocoding": "GeoNames resolution",
            "query": query,
            "result": geo["geonames_data"],
        })
        steps.append({
            "formatting": "CC-RR-PPP generation",
            "result": geo["formatted"],
        })
        return {
            "value": geo["settlement_name"],
            "formatted": geo["formatted"],
            "country_code": geo["country_code"],
            "region_code": geo["region_code"],
            "settlement_code": geo["settlement_code"],
            "confidence": "medium",
            "inference_provenance": {
                "method": method,
                "inference_chain": build_inference_chain(steps),
                "assumptions": assumptions,
                "geonames_data": geo["geonames_data"],
                "inferred_at": datetime.now(timezone.utc).isoformat(),
                "inferred_by": "enrich_ppids.py",
            },
        }

    # Signal 1: the profile-level location field (most reliable)
    profile_location = profile_data.get("location")
    if profile_location:
        steps.append({
            "observation": "Profile location field found",
            "source_field": "profile_data.location",
            "source_value": profile_location,
        })
        geo = geocode_location(profile_location, db_path)
        if geo:
            return finish(geo, profile_location, "profile_location", [
                "Profile location is up-to-date",
                "Profile location represents current residence",
            ])

    # Signal 2: the location of a job still marked as current
    for exp in profile_data.get("experience") or []:
        if exp is None:
            continue
        # Handle multiple date field names (including start_date/end_date);
        # some profiles also carry an explicit "current" boolean.
        date_range = get_any_date_field(exp)
        if "Present" not in date_range and exp.get("current") is not True:
            continue
        job_location = exp.get("location")
        if not job_location:
            continue

        steps.append({
            "observation": "Current job with location found",
            "source_field": "profile_data.experience",
            "source_value": {
                "company": exp.get("company", ""),
                "title": exp.get("title", ""),
                "location": job_location,
            },
        })
        geo = geocode_location(job_location, db_path)
        if geo:
            return finish(geo, job_location, "current_job_location", [
                "Current job location represents residence area",
                "Person works near where they live",
            ])

    return None
|
|
|
|
|
|
def regenerate_ppid(components: dict) -> str:
    """Regenerate the PPID string from its component dict.

    Layout: TYPE_FIRSTLOC_FIRSTDATE_LASTLOC_LASTDATE_name-tokens
    """
    name_part = "-".join(components["name_tokens"])
    parts = [
        components["type"],
        components["first_location"],
        components["first_date"],
        components["last_location"],
        components["last_date"],
        name_part,
    ]
    return "_".join(parts)
|
|
|
|
|
|
def enrich_ppid_file(filepath: Path, db_path: str, dry_run: bool = False, force: bool = False) -> dict:
    """
    Enrich a single PPID file with explicit inferred data (Rule 45 compliant).

    Args:
        filepath: Path to PPID JSON file
        db_path: Path to GeoNames SQLite database
        dry_run: Don't write changes
        force: Re-enrich already-enriched files (clears existing inferred_* fields)

    Returns:
        Per-file stats dict describing which inferences were made and
        whether the PPID itself changed.
    """
    stats = {
        "birth_decade_inferred": False,
        "birth_decade_is_list": False,  # Track decade boundary cases
        "birth_decade_method": None,  # Track which method was used
        "birth_settlement_inferred": False,
        "current_settlement_inferred": False,
        "ppid_changed": False,
    }

    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    profile_data = data.get("profile_data", {})
    if not profile_data:
        return stats

    # If force mode, clear existing inferred fields to re-enrich
    if force:
        for field in ["inferred_birth_decade", "inferred_birth_settlement", "inferred_current_settlement"]:
            data.pop(field, None)
        # Reset components that were previously set from inferred data.
        # BUG FIX: the old check was `"_source" in str(...get("first_date_source", ""))`,
        # i.e. it looked for the substring "_source" inside the stored VALUE
        # (e.g. "inferred_birth_decade"), which never contains it — so the
        # reset never fired. The real signal is that the *_source key exists.
        comps = data.get("ppid_components", {})
        if comps.get("first_date_source"):
            comps["first_date"] = "XXXX"
            comps.pop("first_date_source", None)
            comps.pop("first_date_alternatives", None)
        if comps.get("first_location_source"):
            comps["first_location"] = "XX-XX-XXX"
            comps.pop("first_location_source", None)
        if comps.get("last_location_source"):
            comps["last_location"] = "XX-XX-XXX"
            comps.pop("last_location_source", None)

    original_ppid = data.get("ppid", "")
    components = data.get("ppid_components", {}).copy()
    changed = False

    # ===== INFER BIRTH DECADE =====
    # Only if we don't already have an inferred value AND birth_date is unknown
    if (data.get("birth_date", {}).get("edtf") == "XXXX" and
            "inferred_birth_decade" not in data):

        birth_info = infer_birth_decade(profile_data)
        if birth_info:
            # Store as EXPLICIT inferred field (Rule 45)
            data["inferred_birth_decade"] = birth_info

            # Handle list-valued (decade boundary) vs single value
            if "values" in birth_info:
                # List-valued: use primary_value for PPID, keep alternatives
                components["first_date"] = birth_info["primary_value"]
                components["first_date_source"] = "inferred_birth_decade.primary_value"
                components["first_date_alternatives"] = [
                    v for v in birth_info["values"] if v != birth_info["primary_value"]
                ]
                stats["birth_decade_is_list"] = True
            else:
                # Single value
                components["first_date"] = birth_info["edtf"]
                components["first_date_source"] = "inferred_birth_decade"

            # Track which method was used
            stats["birth_decade_method"] = birth_info.get("inference_provenance", {}).get("method", "unknown")

            # Add note to canonical field pointing to inferred alternative
            # (Rule 45: inferred values never silently replace canonical data)
            data["birth_date"]["note"] = "See inferred_birth_decade for heuristic estimate"

            stats["birth_decade_inferred"] = True
            changed = True

    # ===== INFER BIRTH SETTLEMENT =====
    if (components.get("first_location") == "XX-XX-XXX" and
            "inferred_birth_settlement" not in data):

        birth_loc = infer_birth_settlement(profile_data, db_path)
        if birth_loc:
            data["inferred_birth_settlement"] = birth_loc
            components["first_location"] = birth_loc["formatted"]
            components["first_location_source"] = "inferred_birth_settlement"
            stats["birth_settlement_inferred"] = True
            changed = True

    # ===== INFER CURRENT SETTLEMENT =====
    if (components.get("last_location") == "XX-XX-XXX" and
            "inferred_current_settlement" not in data):

        current_loc = infer_current_settlement(profile_data, db_path)
        if current_loc:
            data["inferred_current_settlement"] = current_loc
            components["last_location"] = current_loc["formatted"]
            components["last_location_source"] = "inferred_current_settlement"
            stats["current_settlement_inferred"] = True
            changed = True

    # ===== REGENERATE PPID IF COMPONENTS CHANGED =====
    if changed:
        new_ppid = regenerate_ppid(components)
        if new_ppid != original_ppid:
            data["ppid"] = new_ppid
            data["ppid_components"] = components
            stats["ppid_changed"] = True

            # Track PPID history so renames remain traceable
            data.setdefault("ppid_history", []).append({
                "previous_ppid": original_ppid,
                "new_ppid": new_ppid,
                "changed_at": datetime.now(timezone.utc).isoformat(),
                "reason": "observation_based_inference",
                "inferred_fields": [
                    k for k in ["inferred_birth_decade", "inferred_birth_settlement", "inferred_current_settlement"]
                    if k in data
                ],
            })
        else:
            data["ppid_components"] = components

        # Update provenance.
        # FIX: create the container when absent instead of raising KeyError.
        provenance = data.setdefault("provenance", {})
        provenance["modified_at"] = datetime.now(timezone.utc).isoformat()
        provenance["modified_by"] = "enrich_ppids.py"

        if not dry_run:
            # Write back to file
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)

            # Rename file if PPID changed
            if stats["ppid_changed"]:
                new_filepath = filepath.parent / f"{new_ppid}.json"
                if new_filepath != filepath and not new_filepath.exists():
                    filepath.rename(new_filepath)

    return stats
|
|
|
|
|
|
def main():
    """CLI entry point: enrich all PPID files in a directory and print a summary."""
    import argparse

    parser = argparse.ArgumentParser(description="Enrich PPID files with explicit inferred data (Rule 45)")
    parser.add_argument("--dry-run", action="store_true", help="Don't write changes")
    parser.add_argument("--limit", type=int, help="Process only N files")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--force", "-f", action="store_true", help="Re-enrich already-enriched files")
    # Generalization: paths were hard-coded to one machine; they are now
    # configurable, with defaults preserving the previous behavior.
    parser.add_argument(
        "--person-dir",
        default="/Users/kempersc/apps/glam/data/person",
        help="Directory containing ID_*.json PPID files",
    )
    parser.add_argument(
        "--db-path",
        default="/Users/kempersc/apps/glam/data/reference/geonames.db",
        help="Path to the GeoNames SQLite database",
    )
    args = parser.parse_args()

    person_dir = Path(args.person_dir)
    db_path = args.db_path

    # Get all PPID files
    ppid_files = list(person_dir.glob("ID_*.json"))

    if args.limit:
        ppid_files = ppid_files[:args.limit]

    print(f"Processing {len(ppid_files)} PPID files (Rule 45 compliant)...")
    if args.dry_run:
        print("DRY RUN - no changes will be written")
    if args.force:
        print("FORCE MODE - re-enriching all files")

    # Statistics
    total_stats = {
        "processed": 0,
        "birth_decade_inferred": 0,
        "birth_decade_list_valued": 0,  # Decade boundary cases
        "birth_decade_by_method": {
            "earliest_education_heuristic": 0,
            "earliest_experience_heuristic": 0,
            "total_experience_heuristic": 0,
        },
        "birth_settlement_inferred": 0,
        "current_settlement_inferred": 0,
        "ppid_changed": 0,
        "errors": 0,
    }

    for i, filepath in enumerate(ppid_files):
        try:
            stats = enrich_ppid_file(filepath, db_path, dry_run=args.dry_run, force=args.force)
            total_stats["processed"] += 1
            if stats["birth_decade_inferred"]:
                total_stats["birth_decade_inferred"] += 1
                # Track method used
                method = stats.get("birth_decade_method")
                if method and method in total_stats["birth_decade_by_method"]:
                    total_stats["birth_decade_by_method"][method] += 1
                if stats.get("birth_decade_is_list"):
                    total_stats["birth_decade_list_valued"] += 1
            if stats["birth_settlement_inferred"]:
                total_stats["birth_settlement_inferred"] += 1
            if stats["current_settlement_inferred"]:
                total_stats["current_settlement_inferred"] += 1
            if stats["ppid_changed"]:
                total_stats["ppid_changed"] += 1

            if args.verbose and any(v for k, v in stats.items() if k != "birth_decade_method"):
                print(f"  {filepath.name}: {stats}")

            if (i + 1) % 500 == 0:
                print(f"  Processed {i + 1}/{len(ppid_files)}...")

        except Exception as e:
            total_stats["errors"] += 1
            if args.verbose:
                print(f"  ERROR {filepath.name}: {e}")

    # Print summary
    print("\n" + "=" * 60)
    print("ENRICHMENT SUMMARY (Rule 45 Compliant)")
    print("=" * 60)
    print(f"Processed: {total_stats['processed']}")
    print(f"Birth decades inferred: {total_stats['birth_decade_inferred']}")
    print(f"  - List-valued (boundary): {total_stats['birth_decade_list_valued']}")
    print(f"  - By method:")
    for method, count in total_stats["birth_decade_by_method"].items():
        print(f"      {method}: {count}")
    print(f"Birth settlements inferred: {total_stats['birth_settlement_inferred']}")
    print(f"Current settlements inferred: {total_stats['current_settlement_inferred']}")
    print(f"PPIDs updated: {total_stats['ppid_changed']}")
    print(f"Errors: {total_stats['errors']}")

    # Coverage percentages
    if total_stats["processed"] > 0:
        print("\nCoverage:")
        print(f"  Birth decade: {total_stats['birth_decade_inferred'] / total_stats['processed'] * 100:.1f}%")
        if total_stats["birth_decade_inferred"] > 0:
            print(f"    - Boundary cases: {total_stats['birth_decade_list_valued'] / total_stats['birth_decade_inferred'] * 100:.1f}%")
        print(f"  Birth settlement: {total_stats['birth_settlement_inferred'] / total_stats['processed'] * 100:.1f}%")
        print(f"  Current settlement: {total_stats['current_settlement_inferred'] / total_stats['processed'] * 100:.1f}%")

    print("\nNote: All inferred data stored in explicit inferred_* fields with provenance chains.")
    print("Note: Decade boundary cases use EDTF set notation [196X,197X] with primary_value for PPID.")
    print("Note: Total Experience method has highest uncertainty (very_low confidence).")
|
|
|
|
|
# Script entry point: run the enrichment pass when executed directly.
if __name__ == "__main__":
    main()
|