#!/usr/bin/env python3
"""Create new NL-*.yaml custodian files from unmatched LinkedIn profiles.

This script:
1. Loads Dutch candidates from data/custodian/linkedin/_unmatched_analysis.json
2. Resolves city/province using GeoNames database
3. Generates GHCID identifiers following project rules
4. Creates skeleton custodian files with linkedin_enrichment

Usage:
    python scripts/create_custodians_from_linkedin.py --dry-run --limit 10
    python scripts/create_custodians_from_linkedin.py --limit 50
    python scripts/create_custodians_from_linkedin.py   # Process all 452

Key Rules Applied:
- Rule 8: Filter legal forms (Stichting, B.V., etc.) from abbreviations
- GeoNames is authoritative for settlement resolution
- admin1_code mapping to ISO 3166-2 province codes
"""

import argparse
import hashlib
import json
import re
import sqlite3
import sys
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Project paths
PROJECT_ROOT = Path(__file__).parent.parent
LINKEDIN_DIR = PROJECT_ROOT / "data" / "custodian" / "linkedin"
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
GEONAMES_DB = PROJECT_ROOT / "data" / "reference" / "geonames.db"
UNMATCHED_FILE = LINKEDIN_DIR / "_unmatched_analysis.json"

# GHCID namespace UUID for v5 generation.
# NOTE: this is the RFC 4122 DNS namespace (uuid.NAMESPACE_DNS), not the URL
# namespace — the literal is kept so previously generated GHCIDs stay stable.
GHCID_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")

# Dutch province mapping: GeoNames admin1_code → ISO 3166-2 code
ADMIN1_TO_ISO = {
    "01": "DR",  # Drenthe
    "02": "FR",  # Friesland (note: also used for other provinces in some GeoNames data)
    "03": "GE",  # Gelderland
    "04": "GR",  # Groningen
    "05": "LI",  # Limburg
    "06": "NB",  # Noord-Brabant
    "07": "NH",  # Noord-Holland
    "09": "UT",  # Utrecht
    "10": "ZE",  # Zeeland
    "11": "ZH",  # Zuid-Holland
    "15": "OV",  # Overijssel
    "16": "FL",  # Flevoland
}

# Province name to ISO code (for when LinkedIn gives province name as city)
PROVINCE_NAME_TO_ISO = {
    "drenthe": "DR",
    "friesland": "FR",
    "fryslân": "FR",
    "gelderland": "GE",
    "groningen": "GR",
    "limburg": "LI",
    "noord-brabant": "NB",
    "brabant": "NB",
    "noord-holland": "NH",
    "utrecht": "UT",
    "zeeland": "ZE",
    "zuid-holland": "ZH",
    "overijssel": "OV",
    "flevoland": "FL",
}

# Dutch legal form words to skip in abbreviation (Rule 8)
LEGAL_FORM_WORDS = {
    # Dutch
    "stichting", "coöperatie", "cooperatie", "maatschap",
    "bv", "b.v.", "nv", "n.v.", "vof", "v.o.f.", "cv", "c.v.",
    # English
    "foundation", "trust", "inc", "incorporated", "ltd", "limited",
    "llc", "corp", "corporation",
}

# Dutch prepositions/articles to skip in abbreviation
SKIP_WORDS = {
    "de", "het", "een", "van", "voor", "in", "op", "te", "den", "der",
    "des", "'s", "aan", "bij", "met", "naar", "om", "tot", "uit", "over",
    "onder", "door", "en", "of", "the", "a", "an", "and",
}

# Institution type inference patterns
# Patterns matched against NAME (high priority) and INDUSTRY (lower priority)
TYPE_PATTERNS = {
    "M": [  # Museum
        r"\bmuseum\b", r"\bmusea\b", r"\bkunsthal\b", r"\bkunsthuis\b",
        r"\bgalerie\b", r"\btentoonstelling\b", r"\bexpositie\b", r"\bcollectie\b",
    ],
    "A": [  # Archive
        r"\barchief\b", r"\barchieven\b", r"\barchive\b", r"\bdocumentatie\b",
        r"\berfgoedcentrum\b", r"historisch\s+centrum",
    ],
    "L": [  # Library
        r"\bbibliotheek\b", r"\bbibliotheken\b", r"\blibrary\b", r"\bmediatheek\b",
    ],
    "S": [  # Society/Kring
        r"\bvereniging\b", r"\bgenootschap\b", r"\bkring\b", r"\bbond\b",
        r"stichting.*erfgoed", r"\bheemkunde\b", r"\boudheidkunde\b",
        r"historische.*vereniging",
    ],
    "R": [  # Research
        r"\bonderzoek\b", r"\bresearch\b", r"\binstituut\b", r"\bkenniscentrum\b",
    ],
    "E": [  # Education
        r"\buniversiteit\b", r"\bhogeschool\b", r"\bacademie\b", r"\bschool\b",
    ],
    "B": [  # Botanical/Zoo
        r"\bdierentuin\b", r"\bzoo\b", r"\bbotanische\b", r"\barboretum\b",
        r"\bhortus\b",
    ],
}


def normalize_text(text: str) -> str:
    """Normalize unicode text to lowercase ASCII, removing diacritics.

    Uses NFD decomposition and strips combining marks (category 'Mn'),
    so e.g. 'Fryslân' → 'fryslan'.
    """
    normalized = unicodedata.normalize("NFD", text)
    ascii_text = "".join(c for c in normalized if unicodedata.category(c) != "Mn")
    return ascii_text.lower()


def generate_city_code(city_name: str) -> str:
    """Generate 3-letter city code from city name.

    Rules:
    - Single word: first 3 letters → Amsterdam = AMS
    - Dutch article (de, het, den, 's): article initial + 2 from main → Den Haag = DHA
    - Multi-word: initials (up to 3) → Nieuw Amsterdam = NAM

    Returns "XXX" when no usable name is available.
    """
    if not city_name:
        return "XXX"

    # Normalize
    clean = normalize_text(city_name)
    words = clean.split()
    if not words:
        return "XXX"

    # Single word
    if len(words) == 1:
        return words[0][:3].upper()

    # Check for Dutch articles at start
    dutch_articles = {"de", "het", "den", "'s", "s"}
    if words[0] in dutch_articles:
        # Article initial + 2 from next word ("'s" contributes 'S', not the apostrophe)
        if len(words) > 1:
            article_initial = words[0][0] if words[0] != "'s" else "S"
            return (article_initial + words[1][:2]).upper()

    # Multi-word: take initials of non-article words
    initials = "".join(w[0] for w in words if w not in dutch_articles)
    return initials[:3].upper()


def extract_abbreviation_from_name(name: str) -> str:
    """Extract abbreviation from institution name.

    Rules (per AGENTS.md):
    - Use first letter of each significant word
    - Skip prepositions, articles, conjunctions
    - Skip legal form words (Stichting, B.V., etc.)
    - Remove diacritics, uppercase, max 10 chars

    Returns "UNK" for an empty name.
    """
    if not name:
        return "UNK"

    # Normalize, then strip punctuation so "B.V." etc. split cleanly
    clean = normalize_text(name)
    clean = re.sub(r"[^\w\s]", " ", clean)
    words = clean.split()

    # Filter out skip words, legal forms, and bare numbers
    significant_words = []
    for word in words:
        word_lower = word.lower()
        if word_lower in SKIP_WORDS:
            continue
        if word_lower in LEGAL_FORM_WORDS:
            continue
        if word.isdigit():
            continue
        significant_words.append(word)

    if not significant_words:
        # Fallback: use first 3 letters of original
        return name[:3].upper()

    # Take first letter of each significant word
    abbrev = "".join(w[0] for w in significant_words)
    return abbrev[:10].upper()


def infer_institution_type(name: str, industry: str) -> list[str]:
    """Infer institution type from name and industry.

    Priority: Name patterns > Industry patterns > Industry keywords
    If name clearly indicates museum, archive, etc., industry is ignored.

    Returns list of type codes (e.g., ["M"], ["A", "L"]); ["U"] when unknown.
    """
    name_types = set()
    industry_types = set()

    name_lower = name.lower() if name else ""
    industry_lower = industry.lower() if industry else ""

    # First pass: check name-based patterns (high priority)
    for type_code, patterns in TYPE_PATTERNS.items():
        for pattern in patterns:
            if re.search(pattern, name_lower):
                name_types.add(type_code)
                break

    # If name clearly identifies type, return just that (skip industry)
    if name_types:
        return sorted(name_types)

    # Second pass: check industry-based patterns
    for type_code, patterns in TYPE_PATTERNS.items():
        for pattern in patterns:
            if re.search(pattern, industry_lower):
                industry_types.add(type_code)
                break

    # Industry keyword inference (lower priority)
    if "museum" in industry_lower or "historical site" in industry_lower:
        industry_types.add("M")
    if "librar" in industry_lower:
        industry_types.add("L")
    if "archiv" in industry_lower:
        industry_types.add("A")

    if industry_types:
        return sorted(industry_types)

    # Default to Unknown if no inference possible
    return ["U"]


def _city_row_to_dict(row: tuple) -> dict:
    """Convert a GeoNames cities query row (fixed column order) to a dict."""
    return {
        "geonames_id": row[0],
        "name": row[1],
        "ascii_name": row[2],
        "admin1_code": row[3],
        "admin1_name": row[4],
        "latitude": row[5],
        "longitude": row[6],
        "feature_code": row[7],
        "population": row[8],
    }


def lookup_city_geonames(city_name: str, conn: sqlite3.Connection) -> dict | None:
    """Look up city in GeoNames database.

    Tries an exact (case-sensitive) match first, then a case-insensitive
    match, always preferring the most populous settlement.

    Returns dict with geonames_id, name, admin1_code, admin1_name, etc.
    or None if not found.
    """
    if not city_name:
        return None

    select = """
        SELECT geonames_id, name, ascii_name, admin1_code, admin1_name,
               latitude, longitude, feature_code, population
        FROM cities
        WHERE country_code = 'NL'
          AND ({match})
          AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4',
                               'PPLC', 'PPLS', 'PPLG')
        ORDER BY population DESC
        LIMIT 1
    """

    # First try exact match, then case-insensitive
    for match_clause in (
        "name = ? OR ascii_name = ?",
        "LOWER(name) = LOWER(?) OR LOWER(ascii_name) = LOWER(?)",
    ):
        cursor = conn.execute(select.format(match=match_clause), (city_name, city_name))
        row = cursor.fetchone()
        if row:
            return _city_row_to_dict(row)

    return None


def infer_city_from_name(institution_name: str, conn: sqlite3.Connection) -> dict | None:
    """Try to infer city from institution name (e.g., 'Museum Spakenburg' → Spakenburg)."""
    # Extract potential city names from institution name by removing
    # common institution type words and stop words
    type_words = {"museum", "archief", "bibliotheek", "galerie", "kunsthal", "stichting"}

    words = institution_name.split()
    potential_cities = []
    for word in words:
        word_clean = re.sub(r"[^\w]", "", word)
        if word_clean.lower() not in type_words and word_clean.lower() not in SKIP_WORDS:
            potential_cities.append(word_clean)

    # Try each candidate word against GeoNames; first hit wins
    for city_candidate in potential_cities:
        result = lookup_city_geonames(city_candidate, conn)
        if result:
            return result

    return None


def generate_ghcid_uuids(ghcid_string: str) -> dict:
    """Generate UUID v5 and SHA-256 based UUIDs from GHCID string.

    Returns dict with:
    - ghcid_uuid: RFC 4122 UUID v5 (SHA-1, primary)
    - ghcid_uuid_sha256: UUID v8-style (custom, SHA-256 based, secondary)
    - ghcid_numeric: 64-bit int from the first 8 bytes of the SHA-256 digest
    """
    # UUID v5 (SHA-1) - Primary
    ghcid_uuid = uuid.uuid5(GHCID_NAMESPACE, ghcid_string)

    # UUID v8 (SHA-256 based) - Secondary; built by hand since the stdlib
    # has no SHA-256 UUID constructor
    sha256_hash = hashlib.sha256(ghcid_string.encode()).digest()
    uuid_bytes = bytearray(sha256_hash[:16])
    # Set version nibble to 8 (custom)
    uuid_bytes[6] = (uuid_bytes[6] & 0x0F) | 0x80
    # Set variant bits to RFC 4122
    uuid_bytes[8] = (uuid_bytes[8] & 0x3F) | 0x80
    ghcid_uuid_sha256 = uuid.UUID(bytes=bytes(uuid_bytes))

    # 64-bit numeric ID
    ghcid_numeric = int.from_bytes(sha256_hash[:8], byteorder="big")

    return {
        "ghcid_uuid": str(ghcid_uuid),
        "ghcid_uuid_sha256": str(ghcid_uuid_sha256),
        "ghcid_numeric": ghcid_numeric,
    }


def load_yaml(filepath: Path) -> dict:
    """Load a YAML file, returning {} for an empty document."""
    with open(filepath, "r", encoding="utf-8") as f:
        return yaml.safe_load(f) or {}


def save_yaml(filepath: Path, data: dict) -> None:
    """Save data to a YAML file with nice formatting."""
    with open(filepath, "w", encoding="utf-8") as f:
        yaml.dump(
            data,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120,
        )


def check_ghcid_collision(ghcid: str) -> bool:
    """Check if a GHCID already exists in the custodian directory."""
    filename = f"{ghcid}.yaml"
    return (CUSTODIAN_DIR / filename).exists()


def resolve_location(candidate: dict, conn: sqlite3.Connection) -> dict:
    """Resolve location for a candidate.

    Returns dict with:
    - province_code: ISO 3166-2 code (e.g., "NH"), "XX" when unknown
    - city_code: 3-letter city code (e.g., "AMS"), "XXX" when unknown
    - city_name: Full city name
    - geonames_info: GeoNames lookup result
    - resolution_method: How the location was resolved
    """
    # Guard against a present-but-None "city" value from the source JSON
    city = (candidate.get("city") or "").strip()
    name = candidate.get("name", "")

    result = {
        "province_code": "XX",
        "city_code": "XXX",
        "city_name": None,
        "geonames_info": None,
        "resolution_method": "UNRESOLVED",
    }

    # Check if "city" is actually a province name
    city_lower = city.lower() if city else ""
    if city_lower in PROVINCE_NAME_TO_ISO:
        result["province_code"] = PROVINCE_NAME_TO_ISO[city_lower]
        result["resolution_method"] = "PROVINCE_FROM_CITY_FIELD"
        # Try to infer city from institution name
        geonames = infer_city_from_name(name, conn)
        if geonames:
            # Use the inferred city's actual province (more accurate than LinkedIn's);
            # keep the province-from-field code if the admin1 code is unmapped
            admin1 = geonames.get("admin1_code", "")
            if admin1 in ADMIN1_TO_ISO:
                result["province_code"] = ADMIN1_TO_ISO[admin1]
            result["city_name"] = geonames["name"]
            result["city_code"] = generate_city_code(geonames["name"])
            result["geonames_info"] = geonames
            result["resolution_method"] = "CITY_INFERRED_FROM_NAME"
            return result
        # NOTE: falls through — a province name that is also a city name
        # (e.g. "Utrecht", "Groningen") may still resolve via the lookup below

    # Try GeoNames lookup for city
    if city:
        geonames = lookup_city_geonames(city, conn)
        if geonames:
            admin1 = geonames.get("admin1_code", "")
            result["province_code"] = ADMIN1_TO_ISO.get(admin1, "XX")
            result["city_name"] = geonames["name"]
            result["city_code"] = generate_city_code(geonames["name"])
            result["geonames_info"] = geonames
            result["resolution_method"] = "GEONAMES_LOOKUP"
            return result

    # Try to infer city from institution name
    geonames = infer_city_from_name(name, conn)
    if geonames:
        admin1 = geonames.get("admin1_code", "")
        result["province_code"] = ADMIN1_TO_ISO.get(admin1, "XX")
        result["city_name"] = geonames["name"]
        result["city_code"] = generate_city_code(geonames["name"])
        result["geonames_info"] = geonames
        result["resolution_method"] = "CITY_INFERRED_FROM_NAME"
        return result

    return result


def create_custodian_from_linkedin(
    candidate: dict,
    linkedin_data: dict,
    location_info: dict,
    institution_types: list[str],
) -> tuple[str, dict]:
    """Create a custodian YAML structure from LinkedIn data.

    Builds the GHCID (NL-<province>-<city>-<type>-<abbrev>, with a snake_case
    name suffix on collision), generates UUIDs, and assembles the full
    skeleton record including provenance.

    Returns tuple of (ghcid, data_dict).
    """
    name = candidate.get("name", "Unknown")
    slug = candidate.get("slug", "")

    # Generate GHCID components
    province = location_info["province_code"]
    city = location_info["city_code"]
    primary_type = institution_types[0] if institution_types else "U"
    abbrev = extract_abbreviation_from_name(name)

    # Build GHCID string
    ghcid_string = f"NL-{province}-{city}-{primary_type}-{abbrev}"

    # Handle collisions by adding name suffix
    if check_ghcid_collision(ghcid_string):
        # Add snake_case name suffix
        name_suffix = normalize_text(name).replace(" ", "_")
        name_suffix = re.sub(r"[^a-z0-9_]", "", name_suffix)
        name_suffix = re.sub(r"_+", "_", name_suffix).strip("_")
        ghcid_string = f"{ghcid_string}-{name_suffix}"

    # Generate UUIDs
    uuids = generate_ghcid_uuids(ghcid_string)
    timestamp = datetime.now(timezone.utc).isoformat()

    # Build custodian data structure
    data = {
        "custodian_name": {
            "emic_name": name,
            "emic_name_source": "linkedin",
        },
        "institution_type": institution_types,
        "linkedin_enrichment": {
            "linkedin_url": linkedin_data.get("linkedin_url"),
            "linkedin_slug": slug,
            "industry": linkedin_data.get("industry"),
            "website": linkedin_data.get("website"),
            "follower_count": linkedin_data.get("follower_count"),
            "staff_count": linkedin_data.get("staff_count"),
            "heritage_staff_count": linkedin_data.get("heritage_staff_count"),
            "heritage_staff": linkedin_data.get("heritage_staff", []),
            "enrichment_timestamp": timestamp,
            "provenance": {
                "source": "linkedin_company_scrape",
                "original_file": f"data/custodian/linkedin/{slug}.yaml",
                "schema_version": linkedin_data.get("provenance", {}).get("schema_version", "1.0.0"),
            },
        },
        "location": {
            "city": location_info.get("city_name") or candidate.get("city"),
            "region": location_info["province_code"],
            "country": "NL",
        },
        "ghcid": {
            "ghcid_current": ghcid_string,
            "ghcid_original": ghcid_string,
            "ghcid_uuid": uuids["ghcid_uuid"],
            "ghcid_uuid_sha256": uuids["ghcid_uuid_sha256"],
            "ghcid_numeric": uuids["ghcid_numeric"],
            "record_id": str(uuid.uuid4()),  # UUID v4 for database record ID
            "generation_timestamp": timestamp,
            "ghcid_history": [
                {
                    "ghcid": ghcid_string,
                    "ghcid_numeric": uuids["ghcid_numeric"],
                    "valid_from": timestamp,
                    "valid_to": None,
                    "reason": "Initial GHCID assignment from LinkedIn batch import",
                }
            ],
            "location_resolution": {
                "method": location_info["resolution_method"],
                "city_code": location_info["city_code"],
                "region_code": location_info["province_code"],
                "country_code": "NL",
            },
        },
        "provenance": {
            "schema_version": "1.0.0",
            "generated_at": timestamp,
            "sources": {
                "linkedin": [
                    {
                        "source_type": "linkedin_company_profile",
                        "data_tier": "TIER_4_INFERRED",
                        "source_file": f"data/custodian/linkedin/{slug}.yaml",
                        "extraction_timestamp": timestamp,
                        "claims_extracted": [
                            "name",
                            "industry",
                            "location",
                            "website",
                            "staff_count",
                            "heritage_staff",
                        ],
                    }
                ],
            },
            "data_tier_summary": {
                "TIER_4_INFERRED": ["linkedin_company_profile"],
            },
            "notes": [
                "Created from unmatched LinkedIn company profile",
                f"Location resolution method: {location_info['resolution_method']}",
            ],
        },
    }

    # Add GeoNames info if available
    if location_info.get("geonames_info"):
        geo = location_info["geonames_info"]
        data["ghcid"]["location_resolution"]["geonames_id"] = geo.get("geonames_id")
        data["ghcid"]["location_resolution"]["geonames_name"] = geo.get("name")
        data["ghcid"]["location_resolution"]["feature_code"] = geo.get("feature_code")
        data["ghcid"]["location_resolution"]["admin1_code"] = geo.get("admin1_code")
        if geo.get("latitude") and geo.get("longitude"):
            data["location"]["coordinates"] = {
                "latitude": geo["latitude"],
                "longitude": geo["longitude"],
                "source": "geonames",
            }

    return ghcid_string, data


def main() -> int:
    """CLI entry point. Returns a process exit code (0 on success)."""
    parser = argparse.ArgumentParser(
        description="Create NL-*.yaml custodian files from unmatched LinkedIn profiles"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be created without writing files",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of candidates to process",
    )
    parser.add_argument(
        "--offset",
        type=int,
        default=0,
        help="Start from this index in the candidate list",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Show detailed output for each candidate",
    )
    args = parser.parse_args()

    # Load unmatched analysis
    print(f"Loading unmatched analysis from {UNMATCHED_FILE}...")
    with open(UNMATCHED_FILE, "r", encoding="utf-8") as f:
        analysis = json.load(f)

    candidates = analysis.get("dutch_list", [])
    print(f"  Found {len(candidates)} Dutch candidates")

    # Apply offset and limit
    if args.offset:
        candidates = candidates[args.offset:]
        print(f"  Starting from index {args.offset}")
    # "is not None" so an explicit --limit 0 means "process none", not "all"
    if args.limit is not None:
        candidates = candidates[: args.limit]
        print(f"  Processing {len(candidates)} candidates (limit={args.limit})")

    # Connect to GeoNames database
    if not GEONAMES_DB.exists():
        print(f"ERROR: GeoNames database not found: {GEONAMES_DB}")
        sys.exit(1)

    conn = sqlite3.connect(GEONAMES_DB)
    print("Connected to GeoNames database")

    # Statistics
    stats = {
        "processed": 0,
        "created": 0,
        "skipped_no_linkedin": 0,
        "skipped_collision": 0,
        "location_resolved": 0,
        "location_unresolved": 0,
        "resolution_methods": {},
    }
    created_files = []

    try:
        for candidate in candidates:
            slug = candidate.get("slug", "")
            name = candidate.get("name", "Unknown")
            stats["processed"] += 1

            # Load full LinkedIn data
            linkedin_file = LINKEDIN_DIR / f"{slug}.yaml"
            if not linkedin_file.exists():
                if args.verbose:
                    print(f"  SKIP: No LinkedIn file for {slug}")
                stats["skipped_no_linkedin"] += 1
                continue

            linkedin_data = load_yaml(linkedin_file)

            # Resolve location
            location_info = resolve_location(candidate, conn)

            # Track resolution method
            method = location_info["resolution_method"]
            stats["resolution_methods"][method] = stats["resolution_methods"].get(method, 0) + 1
            if method != "UNRESOLVED":
                stats["location_resolved"] += 1
            else:
                stats["location_unresolved"] += 1

            # Infer institution type from name (primary) and industry (fallback)
            # Name-based inference is more reliable than LinkedIn's pre-assigned types
            industry = candidate.get("industry", "") or linkedin_data.get("industry", "")
            institution_types = infer_institution_type(name, industry)

            # Only use LinkedIn's pre-assigned types if our inference returned Unknown
            # LinkedIn types are often wrong (e.g., "Libraries" industry → L type for museums)
            if institution_types == ["U"] and linkedin_data.get("institution_type"):
                institution_types = linkedin_data["institution_type"]

            # Create custodian data
            ghcid, data = create_custodian_from_linkedin(
                candidate, linkedin_data, location_info, institution_types
            )

            # Check collision (already handled in create function, but double-check)
            output_file = CUSTODIAN_DIR / f"{ghcid}.yaml"
            if output_file.exists():
                if args.verbose:
                    print(f"  COLLISION: {ghcid} already exists")
                stats["skipped_collision"] += 1
                continue

            if args.dry_run:
                print(f"  [DRY-RUN] Would create: {output_file.name}")
                print(f"    Name: {name}")
                print(f"    Type: {institution_types}")
                print(f"    Location: {location_info['city_name']} ({location_info['province_code']})")
                print(f"    Resolution: {method}")
                if args.verbose:
                    print(f"    GHCID: {ghcid}")
                    print(f"    UUID: {data['ghcid']['ghcid_uuid']}")
            else:
                save_yaml(output_file, data)
                print(f"  Created: {output_file.name} ({name})")
                stats["created"] += 1
                created_files.append({"ghcid": ghcid, "name": name, "file": str(output_file.name)})
    finally:
        # Close the GeoNames connection even if processing raised
        conn.close()

    # Print summary
    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"Processed: {stats['processed']}")
    print(f"Created: {stats['created']}")
    print(f"Skipped (no file): {stats['skipped_no_linkedin']}")
    print(f"Skipped (collision): {stats['skipped_collision']}")
    print(f"Location resolved: {stats['location_resolved']}")
    print(f"Location unresolved: {stats['location_unresolved']}")
    print("\nResolution methods:")
    for method, count in sorted(stats["resolution_methods"].items()):
        print(f"  {method}: {count}")

    if args.dry_run:
        print("\n[DRY-RUN] No files were created.")

    # Always return an explicit exit code (the original returned None on the
    # non-dry-run path, which sys.exit() happened to treat as success)
    return 0


if __name__ == "__main__":
    sys.exit(main())