# glam/scripts/create_custodians_from_linkedin.py
# Last modified: 2025-12-17 10:11:56 +01:00
# 759 lines, 26 KiB, Python
#!/usr/bin/env python3
"""
Create new NL-*.yaml custodian files from unmatched LinkedIn profiles.
This script:
1. Loads Dutch candidates from data/custodian/linkedin/_unmatched_analysis.json
2. Resolves city/province using GeoNames database
3. Generates GHCID identifiers following project rules
4. Creates skeleton custodian files with linkedin_enrichment
Usage:
python scripts/create_custodians_from_linkedin.py --dry-run --limit 10
python scripts/create_custodians_from_linkedin.py --limit 50
python scripts/create_custodians_from_linkedin.py # Process all 452
Key Rules Applied:
- Rule 8: Filter legal forms (Stichting, B.V., etc.) from abbreviations
- GeoNames is authoritative for settlement resolution
- admin1_code mapping to ISO 3166-2 province codes
"""
import argparse
import hashlib
import json
import re
import sqlite3
import sys
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
import yaml
# Project paths (repo root is one level above scripts/)
PROJECT_ROOT = Path(__file__).parent.parent
LINKEDIN_DIR = PROJECT_ROOT / "data" / "custodian" / "linkedin"
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db"
UNMATCHED_FILE = LINKEDIN_DIR / "_unmatched_analysis.json"
# GHCID namespace UUID for v5 generation.
# NOTE(review): this value is RFC 4122's DNS namespace (uuid.NAMESPACE_DNS),
# not the URL namespace (NAMESPACE_URL is 6ba7b811-...). Value left unchanged
# because all previously generated UUIDs must stay stable.
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")
# Dutch province mapping: GeoNames admin1_code -> ISO 3166-2 code.
# Codes 08 and 12-14 are absent — presumably unused/retired NL admin1 codes
# in GeoNames; TODO confirm against the GeoNames admin1 table.
ADMIN1_TO_ISO = {
    "01": "DR",  # Drenthe
    "02": "FR",  # Friesland (note: also used for other provinces in some GeoNames data)
    "03": "GE",  # Gelderland
    "04": "GR",  # Groningen
    "05": "LI",  # Limburg
    "06": "NB",  # Noord-Brabant
    "07": "NH",  # Noord-Holland
    "09": "UT",  # Utrecht
    "10": "ZE",  # Zeeland
    "11": "ZH",  # Zuid-Holland
    "15": "OV",  # Overijssel
    "16": "FL",  # Flevoland
}
# Province name to ISO code (for when LinkedIn gives a province name as city).
# Keys must stay lowercase: lookups are done against city.lower().
PROVINCE_NAME_TO_ISO = {
    "drenthe": "DR",
    "friesland": "FR",
    "fryslân": "FR",  # Frisian-language name of the province
    "gelderland": "GE",
    "groningen": "GR",
    "limburg": "LI",
    "noord-brabant": "NB",
    "brabant": "NB",  # common informal short form
    "noord-holland": "NH",
    "utrecht": "UT",
    "zeeland": "ZE",
    "zuid-holland": "ZH",
    "overijssel": "OV",
    "flevoland": "FL",
}
# Dutch legal form words to skip in abbreviation (Rule 8).
# Both dotted and undotted spellings are listed; the dotted forms can only
# match tokens whose punctuation has not yet been stripped.
LEGAL_FORM_WORDS = {
    # Dutch
    "stichting", "coöperatie", "cooperatie", "maatschap",
    "bv", "b.v.", "nv", "n.v.", "vof", "v.o.f.", "cv", "c.v.",
    # English
    "foundation", "trust", "inc", "incorporated", "ltd", "limited",
    "llc", "corp", "corporation",
}
# Dutch (plus a few English) prepositions/articles to skip in abbreviations.
# NOTE: "of" appears twice (Dutch "of" = or, English "of"); the set literal
# silently dedupes the repeat.
SKIP_WORDS = {
    "de", "het", "een", "van", "voor", "in", "op", "te", "den", "der",
    "des", "'s", "aan", "bij", "met", "naar", "om", "tot", "uit",
    "over", "onder", "door", "en", "of", "the", "a", "an", "of", "and",
}
# Institution type inference patterns (regexes applied to lowercased text).
# Patterns are matched against NAME (high priority) and INDUSTRY (lower
# priority) by infer_institution_type().
TYPE_PATTERNS = {
    "M": [  # Museum
        r"\bmuseum\b", r"\bmusea\b", r"\bkunsthal\b", r"\bkunsthuis\b", r"\bgalerie\b",
        r"\btentoonstelling\b", r"\bexpositie\b", r"\bcollectie\b",
    ],
    "A": [  # Archive
        r"\barchief\b", r"\barchieven\b", r"\barchive\b", r"\bdocumentatie\b",
        r"\berfgoedcentrum\b", r"historisch\s+centrum",
    ],
    "L": [  # Library
        r"\bbibliotheek\b", r"\bbibliotheken\b", r"\blibrary\b", r"\bmediatheek\b",
    ],
    "S": [  # Society/Kring
        r"\bvereniging\b", r"\bgenootschap\b", r"\bkring\b", r"\bbond\b", r"stichting.*erfgoed",
        r"\bheemkunde\b", r"\boudheidkunde\b", r"historische.*vereniging",
    ],
    "R": [  # Research
        r"\bonderzoek\b", r"\bresearch\b", r"\binstituut\b", r"\bkenniscentrum\b",
    ],
    "E": [  # Education
        r"\buniversiteit\b", r"\bhogeschool\b", r"\bacademie\b", r"\bschool\b",
    ],
    "B": [  # Botanical/Zoo
        r"\bdierentuin\b", r"\bzoo\b", r"\bbotanische\b", r"\barboretum\b", r"\bhortus\b",
    ],
}
def normalize_text(text: str) -> str:
"""Normalize unicode text, remove diacritics."""
normalized = unicodedata.normalize("NFD", text)
ascii_text = "".join(c for c in normalized if unicodedata.category(c) != "Mn")
return ascii_text.lower()
def generate_city_code(city_name: str) -> str:
"""Generate 3-letter city code from city name.
Rules:
- Single word: first 3 letters → Amsterdam = AMS
- Dutch article (de, het, den, 's): article initial + 2 from main → Den Haag = DHA
- Multi-word: initials (up to 3) → Nieuw Amsterdam = NAM
"""
if not city_name:
return "XXX"
# Normalize
clean = normalize_text(city_name)
words = clean.split()
if not words:
return "XXX"
# Single word
if len(words) == 1:
return words[0][:3].upper()
# Check for Dutch articles at start
dutch_articles = {"de", "het", "den", "'s", "s"}
if words[0] in dutch_articles:
# Article initial + 2 from next word
if len(words) > 1:
article_initial = words[0][0] if words[0] != "'s" else "S"
return (article_initial + words[1][:2]).upper()
# Multi-word: take initials
initials = "".join(w[0] for w in words if w not in dutch_articles)
return initials[:3].upper()
def extract_abbreviation_from_name(name: str) -> str:
    """Extract an abbreviation from an institution name.

    Rules (per AGENTS.md):
    - Use first letter of each significant word
    - Skip prepositions, articles, conjunctions (SKIP_WORDS)
    - Skip legal form words such as Stichting, B.V. (Rule 8)
    - Remove diacritics, uppercase, max 10 chars

    Fix over the original: dotted legal forms ("b.v.", "n.v.", ...) are now
    filtered *before* punctuation is stripped. Previously the global
    punctuation strip turned "b.v." into the tokens "b" and "v", which never
    matched LEGAL_FORM_WORDS and leaked into the abbreviation.
    """
    if not name:
        return "UNK"
    # Normalize (lowercased, diacritics removed).
    clean = normalize_text(name)
    # Drop legal-form tokens while their punctuation is intact so dotted
    # entries in LEGAL_FORM_WORDS can actually match.
    tokens = [t for t in clean.split() if t not in LEGAL_FORM_WORDS]
    # Now strip remaining punctuation; hyphens etc. become word breaks.
    words = re.sub(r"[^\w\s]", " ", " ".join(tokens)).split()
    # Keep significant words: not a skip word, not an (undotted) legal form,
    # not a pure number. normalize_text already lowercased everything.
    significant_words = [
        w for w in words
        if w not in SKIP_WORDS and w not in LEGAL_FORM_WORDS and not w.isdigit()
    ]
    if not significant_words:
        # Fallback: first 3 characters of the raw name.
        return name[:3].upper()
    # First letter of each significant word, capped at 10 characters.
    return "".join(w[0] for w in significant_words)[:10].upper()
def infer_institution_type(name: str, industry: str) -> list[str]:
    """Infer institution type codes from name and industry.

    Name-based pattern hits take absolute priority: when the name matches any
    TYPE_PATTERNS entry, the industry text is ignored entirely. Otherwise the
    industry is matched against the same patterns plus a few keyword
    heuristics. Returns a sorted list of type codes, or ["U"] (unknown).
    """
    name_text = (name or "").lower()
    industry_text = (industry or "").lower()

    def _matching_types(text: str) -> set[str]:
        # Every type whose pattern list has at least one hit in *text*.
        return {
            code
            for code, patterns in TYPE_PATTERNS.items()
            if any(re.search(pattern, text) for pattern in patterns)
        }

    # High priority: the institution name itself.
    from_name = _matching_types(name_text)
    if from_name:
        return sorted(from_name)

    # Lower priority: the industry field.
    from_industry = _matching_types(industry_text)
    # Plain keyword heuristics on the industry string.
    if "museum" in industry_text or "historical site" in industry_text:
        from_industry.add("M")
    if "librar" in industry_text:
        from_industry.add("L")
    if "archiv" in industry_text:
        from_industry.add("A")
    if from_industry:
        return sorted(from_industry)

    # Nothing matched: default to Unknown.
    return ["U"]
def lookup_city_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None:
"""Look up city in GeoNames database.
Returns dict with geonames_id, name, admin1_code, admin1_name, etc.
or None if not found.
"""
if not city_name:
return None
# First try exact match
cursor = conn.execute(
"""
SELECT geonames_id, name, ascii_name, admin1_code, admin1_name,
latitude, longitude, feature_code, population
FROM cities
WHERE country_code = 'NL'
AND (name = ? OR ascii_name = ?)
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
ORDER BY population DESC
LIMIT 1
""",
(city_name, city_name),
)
row = cursor.fetchone()
if row:
return {
"geonames_id": row[0],
"name": row[1],
"ascii_name": row[2],
"admin1_code": row[3],
"admin1_name": row[4],
"latitude": row[5],
"longitude": row[6],
"feature_code": row[7],
"population": row[8],
}
# Try case-insensitive match
cursor = conn.execute(
"""
SELECT geonames_id, name, ascii_name, admin1_code, admin1_name,
latitude, longitude, feature_code, population
FROM cities
WHERE country_code = 'NL'
AND (LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?))
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
ORDER BY population DESC
LIMIT 1
""",
(city_name, city_name),
)
row = cursor.fetchone()
if row:
return {
"geonames_id": row[0],
"name": row[1],
"ascii_name": row[2],
"admin1_code": row[3],
"admin1_name": row[4],
"latitude": row[5],
"longitude": row[6],
"feature_code": row[7],
"population": row[8],
}
return None
def infer_city_from_name(institution_name: str, conn: sqlite3.Connection) -> dict | None:
    """Try to infer a city from an institution name (e.g. 'Museum Spakenburg' -> Spakenburg).

    Strips common institution-type words and skip words from the name, then
    tries each remaining token against GeoNames in order, returning the
    first hit or None.

    Fixes over the original: removes an unused local, avoids repeated
    .lower() calls, and skips tokens that are pure punctuation instead of
    issuing empty-string lookups.
    """
    # Words that describe the institution rather than its place.
    type_words = {"museum", "archief", "bibliotheek", "galerie", "kunsthal", "stichting"}
    potential_cities = []
    for token in institution_name.split():
        token_clean = re.sub(r"[^\w]", "", token)
        lowered = token_clean.lower()
        # Skip empty residue and non-place words.
        if not token_clean or lowered in type_words or lowered in SKIP_WORDS:
            continue
        potential_cities.append(token_clean)
    # First GeoNames hit wins.
    for city_candidate in potential_cities:
        result = lookup_city_geonames(city_candidate, conn)
        if result:
            return result
    return None
def generate_ghcid_uuids(ghcid_string: str) -> dict:
    """Derive the three GHCID identifiers from the canonical GHCID string.

    Returns a dict with:
    - ghcid_uuid:        UUID v5 (SHA-1, GHCID_NAMESPACE) — primary
    - ghcid_uuid_sha256: custom UUID v8 built from the SHA-256 digest — secondary
    - ghcid_numeric:     64-bit int from the first 8 digest bytes (big-endian)
    """
    digest = hashlib.sha256(ghcid_string.encode()).digest()
    # Build the v8 UUID from the first 16 digest bytes, then stamp the
    # version and variant fields per RFC 4122.
    raw = bytearray(digest[:16])
    raw[6] = (raw[6] & 0x0F) | 0x80  # version nibble -> 8 (custom)
    raw[8] = (raw[8] & 0x3F) | 0x80  # variant bits -> RFC 4122
    return {
        "ghcid_uuid": str(uuid.uuid5(GHCID_NAMESPACE, ghcid_string)),
        "ghcid_uuid_sha256": str(uuid.UUID(bytes=bytes(raw))),
        "ghcid_numeric": int.from_bytes(digest[:8], byteorder="big"),
    }
def load_yaml(filepath: Path) -> dict:
    """Read *filepath* as YAML; an empty document yields {}."""
    with open(filepath, "r", encoding="utf-8") as handle:
        document = yaml.safe_load(handle)
    return document or {}
def save_yaml(filepath: Path, data: dict) -> None:
    """Write *data* to *filepath* as readable YAML (insertion order kept)."""
    dump_options = {
        "default_flow_style": False,
        "allow_unicode": True,
        "sort_keys": False,
        "width": 120,
    }
    with open(filepath, "w", encoding="utf-8") as handle:
        yaml.dump(data, handle, **dump_options)
def check_ghcid_collision(ghcid: str) -> bool:
    """Return True when a custodian file for *ghcid* already exists."""
    candidate_path = CUSTODIAN_DIR / f"{ghcid}.yaml"
    return candidate_path.exists()
def resolve_location(candidate: dict, conn: sqlite3.Connection) -> dict:
    """Resolve location for a candidate.

    Resolution order:
    1. If the LinkedIn "city" field is actually a province name, record that
       province and try to infer the real city from the institution name.
       (No early return on failure: names like "Utrecht" or "Groningen" are
       both a province and a city, so the direct lookup below may still hit.)
    2. Direct GeoNames lookup on the city field.
    3. Last resort: infer the city from the institution name.

    Returns dict with:
    - province_code: ISO 3166-2 code (e.g., "NH"), "XX" when unknown
    - city_code: 3-letter city code (e.g., "AMS"), "XXX" when unknown
    - city_name: Full city name or None
    - geonames_info: GeoNames lookup result or None
    - resolution_method: How the location was resolved

    Fixes over the original: a literal None in the "city" field no longer
    crashes (.get's default is not used when the key holds None), and an
    unmapped admin1 code no longer clobbers an already-resolved province
    with "XX".
    """
    city = (candidate.get("city") or "").strip()
    name = candidate.get("name", "")
    result = {
        "province_code": "XX",
        "city_code": "XXX",
        "city_name": None,
        "geonames_info": None,
        "resolution_method": "UNRESOLVED",
    }

    def _apply_geonames(geo: dict, method: str) -> None:
        # Fill result from a GeoNames hit; an unmapped admin1 code keeps the
        # current province value (province-derived or the "XX" default).
        admin1 = geo.get("admin1_code", "")
        result["province_code"] = ADMIN1_TO_ISO.get(admin1, result["province_code"])
        result["city_name"] = geo["name"]
        result["city_code"] = generate_city_code(geo["name"])
        result["geonames_info"] = geo
        result["resolution_method"] = method

    # Step 1: the "city" field may actually hold a province name.
    city_lower = city.lower()
    if city_lower in PROVINCE_NAME_TO_ISO:
        result["province_code"] = PROVINCE_NAME_TO_ISO[city_lower]
        result["resolution_method"] = "PROVINCE_FROM_CITY_FIELD"
        geonames = infer_city_from_name(name, conn)
        if geonames:
            # The inferred city's actual province beats LinkedIn's.
            _apply_geonames(geonames, "CITY_INFERRED_FROM_NAME")
            return result

    # Step 2: direct GeoNames lookup on the city field.
    if city:
        geonames = lookup_city_geonames(city, conn)
        if geonames:
            _apply_geonames(geonames, "GEONAMES_LOOKUP")
            return result

    # Step 3: infer the city from the institution name.
    geonames = infer_city_from_name(name, conn)
    if geonames:
        _apply_geonames(geonames, "CITY_INFERRED_FROM_NAME")
    return result
def create_custodian_from_linkedin(
    candidate: dict,
    linkedin_data: dict,
    location_info: dict,
    institution_types: list[str],
) -> tuple[str, dict]:
    """Create a custodian YAML structure from LinkedIn data.

    Args:
        candidate: Entry from the unmatched-analysis list (name, slug, city, ...).
        linkedin_data: Full profile loaded from data/custodian/linkedin/<slug>.yaml.
        location_info: Output of resolve_location() for this candidate.
        institution_types: Inferred type codes; the first one becomes the
            primary type used in the GHCID string.

    Returns tuple of (ghcid, data_dict).
    """
    name = candidate.get("name", "Unknown")
    slug = candidate.get("slug", "")
    # Generate GHCID components
    province = location_info["province_code"]
    city = location_info["city_code"]
    primary_type = institution_types[0] if institution_types else "U"
    abbrev = extract_abbreviation_from_name(name)
    # Build GHCID string: NL-<province>-<city>-<type>-<abbreviation>
    ghcid_string = f"NL-{province}-{city}-{primary_type}-{abbrev}"
    # Handle collisions by adding name suffix
    if check_ghcid_collision(ghcid_string):
        # Add snake_case name suffix derived from the full institution name.
        # NOTE(review): the suffixed GHCID is not re-checked for collisions;
        # main() skips writing when the resulting file already exists, so a
        # second collision is skipped rather than overwritten.
        name_suffix = normalize_text(name).replace(" ", "_")
        name_suffix = re.sub(r"[^a-z0-9_]", "", name_suffix)
        name_suffix = re.sub(r"_+", "_", name_suffix).strip("_")
        ghcid_string = f"{ghcid_string}-{name_suffix}"
    # Generate UUIDs
    uuids = generate_ghcid_uuids(ghcid_string)
    # One timestamp reused everywhere so all provenance fields agree.
    timestamp = datetime.now(timezone.utc).isoformat()
    # Build custodian data structure
    data = {
        "custodian_name": {
            "emic_name": name,
            "emic_name_source": "linkedin",
        },
        "institution_type": institution_types,
        "linkedin_enrichment": {
            "linkedin_url": linkedin_data.get("linkedin_url"),
            "linkedin_slug": slug,
            "industry": linkedin_data.get("industry"),
            "website": linkedin_data.get("website"),
            "follower_count": linkedin_data.get("follower_count"),
            "staff_count": linkedin_data.get("staff_count"),
            "heritage_staff_count": linkedin_data.get("heritage_staff_count"),
            "heritage_staff": linkedin_data.get("heritage_staff", []),
            "enrichment_timestamp": timestamp,
            "provenance": {
                "source": "linkedin_company_scrape",
                "original_file": f"data/custodian/linkedin/{slug}.yaml",
                "schema_version": linkedin_data.get("provenance", {}).get("schema_version", "1.0.0"),
            },
        },
        "location": {
            # Prefer the GeoNames-resolved name; fall back to LinkedIn's raw city.
            "city": location_info.get("city_name") or candidate.get("city"),
            "region": location_info["province_code"],
            "country": "NL",
        },
        "ghcid": {
            "ghcid_current": ghcid_string,
            "ghcid_original": ghcid_string,
            "ghcid_uuid": uuids["ghcid_uuid"],
            "ghcid_uuid_sha256": uuids["ghcid_uuid_sha256"],
            "ghcid_numeric": uuids["ghcid_numeric"],
            "record_id": str(uuid.uuid4()),  # UUID v4 for database record ID
            "generation_timestamp": timestamp,
            # History starts with a single open-ended entry (valid_to=None).
            "ghcid_history": [
                {
                    "ghcid": ghcid_string,
                    "ghcid_numeric": uuids["ghcid_numeric"],
                    "valid_from": timestamp,
                    "valid_to": None,
                    "reason": "Initial GHCID assignment from LinkedIn batch import",
                }
            ],
            "location_resolution": {
                "method": location_info["resolution_method"],
                "city_code": location_info["city_code"],
                "region_code": location_info["province_code"],
                "country_code": "NL",
            },
        },
        "provenance": {
            "schema_version": "1.0.0",
            "generated_at": timestamp,
            "sources": {
                "linkedin": [
                    {
                        "source_type": "linkedin_company_profile",
                        "data_tier": "TIER_4_INFERRED",
                        "source_file": f"data/custodian/linkedin/{slug}.yaml",
                        "extraction_timestamp": timestamp,
                        "claims_extracted": [
                            "name",
                            "industry",
                            "location",
                            "website",
                            "staff_count",
                            "heritage_staff",
                        ],
                    }
                ],
            },
            "data_tier_summary": {
                "TIER_4_INFERRED": ["linkedin_company_profile"],
            },
            "notes": [
                "Created from unmatched LinkedIn company profile",
                f"Location resolution method: {location_info['resolution_method']}",
            ],
        },
    }
    # Add GeoNames info if available
    if location_info.get("geonames_info"):
        geo = location_info["geonames_info"]
        data["ghcid"]["location_resolution"]["geonames_id"] = geo.get("geonames_id")
        data["ghcid"]["location_resolution"]["geonames_name"] = geo.get("name")
        data["ghcid"]["location_resolution"]["feature_code"] = geo.get("feature_code")
        data["ghcid"]["location_resolution"]["admin1_code"] = geo.get("admin1_code")
        # Truthiness check drops coordinates at exactly 0.0 — harmless here,
        # since 0,0 is far outside the Netherlands.
        if geo.get("latitude") and geo.get("longitude"):
            data["location"]["coordinates"] = {
                "latitude": geo["latitude"],
                "longitude": geo["longitude"],
                "source": "geonames",
            }
    return ghcid_string, data
def main():
    """CLI entry point: create custodian files for unmatched LinkedIn profiles.

    Returns 0 on success; exits with status 1 when the GeoNames database is
    missing. (Previously only the dry-run path returned 0 explicitly; the
    normal path fell off the end and returned None.)
    """
    parser = argparse.ArgumentParser(
        description="Create NL-*.yaml custodian files from unmatched LinkedIn profiles"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be created without writing files",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of candidates to process",
    )
    parser.add_argument(
        "--offset",
        type=int,
        default=0,
        help="Start from this index in the candidate list",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Show detailed output for each candidate",
    )
    args = parser.parse_args()

    # Load unmatched analysis produced by the matching pipeline.
    print(f"Loading unmatched analysis from {UNMATCHED_FILE}...")
    with open(UNMATCHED_FILE, "r") as f:
        analysis = json.load(f)
    candidates = analysis.get("dutch_list", [])
    print(f" Found {len(candidates)} Dutch candidates")

    # Apply offset and limit.
    if args.offset:
        candidates = candidates[args.offset:]
        print(f" Starting from index {args.offset}")
    # `is not None` so an explicit `--limit 0` is honored instead of being
    # silently ignored (0 is falsy).
    if args.limit is not None:
        candidates = candidates[: args.limit]
        print(f" Processing {len(candidates)} candidates (limit={args.limit})")

    # Connect to the GeoNames database (authoritative for settlements).
    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)
    conn = sqlite3.connect(GEONAMES_DB)
    print("Connected to GeoNames database")

    # Statistics accumulated over the run.
    stats = {
        "processed": 0,
        "created": 0,
        "skipped_no_linkedin": 0,
        "skipped_collision": 0,
        "location_resolved": 0,
        "location_unresolved": 0,
        "resolution_methods": {},
    }
    created_files = []
    try:
        for candidate in candidates:
            slug = candidate.get("slug", "")
            name = candidate.get("name", "Unknown")
            stats["processed"] += 1

            # Load full LinkedIn data for the slug.
            linkedin_file = LINKEDIN_DIR / f"{slug}.yaml"
            if not linkedin_file.exists():
                if args.verbose:
                    print(f" SKIP: No LinkedIn file for {slug}")
                stats["skipped_no_linkedin"] += 1
                continue
            linkedin_data = load_yaml(linkedin_file)

            # Resolve location and track how it was resolved.
            location_info = resolve_location(candidate, conn)
            method = location_info["resolution_method"]
            stats["resolution_methods"][method] = stats["resolution_methods"].get(method, 0) + 1
            if method != "UNRESOLVED":
                stats["location_resolved"] += 1
            else:
                stats["location_unresolved"] += 1

            # Name-based inference beats LinkedIn's pre-assigned types, which
            # are often wrong (e.g. "Libraries" industry -> L type for museums).
            industry = candidate.get("industry", "") or linkedin_data.get("industry", "")
            institution_types = infer_institution_type(name, industry)
            # Only fall back to LinkedIn's types when inference returned Unknown.
            if institution_types == ["U"] and linkedin_data.get("institution_type"):
                institution_types = linkedin_data["institution_type"]

            # Create custodian data.
            ghcid, data = create_custodian_from_linkedin(
                candidate, linkedin_data, location_info, institution_types
            )

            # Collision already handled in the create function; double-check
            # the actual output path before writing.
            output_file = CUSTODIAN_DIR / f"{ghcid}.yaml"
            if output_file.exists():
                if args.verbose:
                    print(f" COLLISION: {ghcid} already exists")
                stats["skipped_collision"] += 1
                continue

            if args.dry_run:
                print(f" [DRY-RUN] Would create: {output_file.name}")
                print(f" Name: {name}")
                print(f" Type: {institution_types}")
                print(f" Location: {location_info['city_name']} ({location_info['province_code']})")
                print(f" Resolution: {method}")
                if args.verbose:
                    print(f" GHCID: {ghcid}")
                    print(f" UUID: {data['ghcid']['ghcid_uuid']}")
            else:
                save_yaml(output_file, data)
                print(f" Created: {output_file.name} ({name})")
                stats["created"] += 1
                created_files.append({"ghcid": ghcid, "name": name, "file": str(output_file.name)})
    finally:
        # Close the DB connection even when a candidate raises mid-loop
        # (the original leaked the connection on any exception).
        conn.close()

    # Print summary.
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"Processed: {stats['processed']}")
    print(f"Created: {stats['created']}")
    print(f"Skipped (no file): {stats['skipped_no_linkedin']}")
    print(f"Skipped (collision): {stats['skipped_collision']}")
    print(f"Location resolved: {stats['location_resolved']}")
    print(f"Location unresolved: {stats['location_unresolved']}")
    print("\nResolution methods:")
    for method, count in sorted(stats["resolution_methods"].items()):
        print(f" {method}: {count}")
    if args.dry_run:
        print("\n[DRY-RUN] No files were created.")
    return 0
# Script entry point — propagate main()'s return value as the exit code.
if __name__ == "__main__":
    sys.exit(main())