# glam/scripts/enrich_xxx_from_linkedin_html.py
# (listing metadata: 2025-12-17 10:11:56 +01:00 — 486 lines, 15 KiB, Python)
#!/usr/bin/env python3
"""
Enrich NL-XX-XXX custodian files with location data from LinkedIn HTML files.
This script:
1. Parses LinkedIn HTML files to extract company names and headquarters locations
2. Matches custodian files by emic_name
3. Updates custodian files with correct country, region (province), and city codes
4. Regenerates GHCID based on new location data
"""
import os
import re
import json
import yaml
from pathlib import Path
from typing import Optional, Dict, Any, Tuple
from collections import defaultdict
import unicodedata
# Dutch province code mapping.
# Keys are lower-cased province names — Dutch, Frisian ('fryslân') and English
# spellings, with and without hyphens — mapped to two-letter province codes
# (these match the ISO 3166-2:NL subdivision codes).
DUTCH_PROVINCE_CODES = {
    'drenthe': 'DR',
    'flevoland': 'FL',
    'friesland': 'FR',
    'fryslân': 'FR',
    'gelderland': 'GE',
    'groningen': 'GR',
    'limburg': 'LI',
    'noord-brabant': 'NB',
    'noord brabant': 'NB',
    'north brabant': 'NB',
    'noord-holland': 'NH',
    'noord holland': 'NH',
    'north holland': 'NH',
    'overijssel': 'OV',
    'utrecht': 'UT',
    'zeeland': 'ZE',
    'zuid-holland': 'ZH',
    'zuid holland': 'ZH',
    'south holland': 'ZH',
}
# City -> province mapping for common Dutch cities.
# Fallback used by get_province_code() when the LinkedIn 'geographicArea'
# field is missing or not recognized as a province name. Keys are
# lower-cased city names, including common spelling variants
# ("den haag" / "the hague" / "'s-gravenhage").
DUTCH_CITY_TO_PROVINCE = {
    'amsterdam': 'NH',
    'rotterdam': 'ZH',
    'den haag': 'ZH',
    'the hague': 'ZH',
    "'s-gravenhage": 'ZH',
    's-gravenhage': 'ZH',
    'utrecht': 'UT',
    'eindhoven': 'NB',
    'tilburg': 'NB',
    'groningen': 'GR',
    'almere': 'FL',
    'breda': 'NB',
    'nijmegen': 'GE',
    'arnhem': 'GE',
    'haarlem': 'NH',
    'enschede': 'OV',
    'maastricht': 'LI',
    'leiden': 'ZH',
    'dordrecht': 'ZH',
    'apeldoorn': 'GE',
    'zwolle': 'OV',
    'amersfoort': 'UT',
    'delft': 'ZH',
    'alkmaar': 'NH',
    'zaandam': 'NH',
    'leeuwarden': 'FR',
    'hilversum': 'NH',
    'deventer': 'OV',
    'middelburg': 'ZE',
    'assen': 'DR',
    'wageningen': 'GE',
    'lelystad': 'FL',
    'venlo': 'LI',
    'heerlen': 'LI',
    'sittard': 'LI',
    'oss': 'NB',
    "'s-hertogenbosch": 'NB',
    's-hertogenbosch': 'NB',
    'den bosch': 'NB',
    'gouda': 'ZH',
    'schiedam': 'ZH',
    'zoetermeer': 'ZH',
    'alphen aan den rijn': 'ZH',
    'emmen': 'DR',
    'kampen': 'OV',
    'harderwijk': 'GE',
    'hoorn': 'NH',
    'purmerend': 'NH',
    'vlaardingen': 'ZH',
    'beverwijk': 'NH',
    'hoofddorp': 'NH',
    'amstelveen': 'NH',
    'diemen': 'NH',
    'nieuwegein': 'UT',
    'zeist': 'UT',
    'veenendaal': 'UT',
    'helmond': 'NB',
    'roosendaal': 'NB',
    'bergen op zoom': 'NB',
    'waalwijk': 'NB',
    'vlissingen': 'ZE',
    'goes': 'ZE',
    'terneuzen': 'ZE',
    'roermond': 'LI',
    'weert': 'LI',
    'kerkrade': 'LI',
    'geleen': 'LI',
    'doetinchem': 'GE',
    'tiel': 'GE',
    'ede': 'GE',
    'barneveld': 'GE',
    'winterswijk': 'GE',
    'almelo': 'OV',
    'hengelo': 'OV',
    'oldenzaal': 'OV',
    'steenwijk': 'OV',
    'meppel': 'DR',
    'hoogeveen': 'DR',
    'coevorden': 'DR',
    'drachten': 'FR',
    'sneek': 'FR',
    'heerenveen': 'FR',
    'harlingen': 'FR',
    'franeker': 'FR',
}
def normalize_name(name: str) -> str:
    """Normalize *name* for fuzzy matching.

    Lower-cases the string, folds diacritics to their ASCII base characters,
    replaces punctuation with spaces, and collapses runs of whitespace.
    Returns "" for empty/None input.
    """
    if not name:
        return ""
    # NFD decomposition splits accented characters into base char + combining
    # mark; dropping category 'Mn' (nonspacing mark) strips the diacritics.
    decomposed = unicodedata.normalize('NFD', name.lower())
    stripped = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    # Turn punctuation into spaces, then squeeze whitespace to single spaces.
    cleaned = re.sub(r'[^\w\s]', ' ', stripped)
    return re.sub(r'\s+', ' ', cleaned).strip()
def generate_city_code(city: str) -> str:
    """Generate a 3-letter city code from a city name.

    Rules:
      * empty/missing name    -> "XXX" placeholder
      * single word           -> first three letters, upper-cased
      * leading Dutch article -> article initial + two letters of the next word
                                 (e.g. "Den Haag" -> "DHA")
      * multi-word            -> initials of the non-article words, max three
                                 (e.g. "Bergen op Zoom" -> "BOZ")

    Falls back to the first three letters of the full name if the initials
    rule somehow yields nothing, so a non-empty input never returns "".
    (The original version had an unreachable trailing return and a redundant
    `len(words) >= 2` check inside the article branch; both removed.)
    """
    if not city:
        return "XXX"
    city = city.strip()
    words = city.split()
    # Articles that commonly lead Dutch place names ('s-Gravenhage, Den Haag).
    dutch_articles = {'de', 'het', 'den', "'s", 's'}
    if len(words) == 1:
        # Single word: first 3 letters.
        return city[:3].upper()
    if words[0].lower() in dutch_articles:
        # Article + word: article initial + 2 letters from the main word.
        # len(words) >= 2 is guaranteed here by the single-word check above.
        return (words[0][0] + words[1][:2]).upper()
    # Multi-word: initials of the significant (non-article) words.
    initials = ''.join(w[0] for w in words if w.lower() not in dutch_articles)
    if initials:
        return initials[:3].upper()
    # Defensive fallback: never return an empty code for a non-empty name.
    return city[:3].upper()
def extract_linkedin_locations(html_dir: Path) -> Dict[str, Dict[str, Any]]:
    """
    Extract company names and headquarters locations from LinkedIn HTML files.

    The company name is parsed from the saved page's *filename*; the
    headquarters address is scraped from the JSON blob LinkedIn embeds in
    the page source. Files matching neither filename pattern, or whose
    address fragment cannot be repaired into valid JSON, are skipped.

    Returns a dict mapping normalized company name -> location info with
    keys: original_name, country, city, region, source_file. If two files
    normalize to the same name, the later one overwrites the earlier.
    """
    locations = {}
    html_files = list(html_dir.glob("*.html"))
    print(f"Found {len(html_files)} HTML files to process")
    for html_file in html_files:
        try:
            # errors='ignore': saved pages may contain stray/invalid bytes.
            content = html_file.read_text(encoding='utf-8', errors='ignore')
            # Extract company name from filename.
            # Expected format: "(N) Company Name_ People _ LinkedIn.html"
            filename = html_file.stem
            match = re.match(r'\(\d+\)\s*(.+?)_\s*People\s*_\s*LinkedIn', filename)
            if match:
                company_name = match.group(1).strip()
            else:
                # Retry without the "(N) " tab-counter prefix.
                match = re.match(r'(.+?)_\s*People\s*_\s*LinkedIn', filename)
                if match:
                    company_name = match.group(1).strip()
                else:
                    # Filename follows neither pattern: skip this file.
                    continue
            # Extract headquarter location from JSON embedded in the HTML.
            # Pattern: "headquarter":{"streetAddressOptOut":...,"address":{"country":"NL",...,"city":"Amsterdam",...}}
            hq_pattern = r'"headquarter":\s*\{[^}]*"address":\s*\{([^}]+)\}'
            hq_matches = re.findall(hq_pattern, content)
            if hq_matches:
                # Get the first headquarter (usually the main one).
                address_json = '{' + hq_matches[0] + '}'
                try:
                    # Repair the captured fragment into parseable JSON:
                    # drop LinkedIn-internal "$..." keys (array-valued, then
                    # string-valued), then fix the commas left behind.
                    # NOTE: the order of these substitutions is significant.
                    address_json = re.sub(r'"\$[^"]*":\s*\[[^\]]*\]', '', address_json)
                    address_json = re.sub(r'"\$[^"]*":\s*"[^"]*"', '', address_json)
                    address_json = re.sub(r',\s*,', ',', address_json)
                    address_json = re.sub(r',\s*}', '}', address_json)
                    address_data = json.loads(address_json)
                    country = address_data.get('country', '')
                    city = address_data.get('city', '')
                    # 'geographicArea' holds the province/state name.
                    region = address_data.get('geographicArea', '')
                    if country or city:
                        # Key by normalized name so later lookups tolerate
                        # case/diacritic/punctuation differences.
                        normalized = normalize_name(company_name)
                        locations[normalized] = {
                            'original_name': company_name,
                            'country': country,
                            'city': city,
                            'region': region,
                            'source_file': str(html_file.name),
                        }
                except json.JSONDecodeError:
                    # Fragment still not valid JSON after cleanup: skip it.
                    pass
        except Exception as e:
            # Best-effort batch scrape: report and continue with next file.
            print(f"Error processing {html_file.name}: {e}")
            continue
    return locations
def get_province_code(city: str, region: str, country: str) -> str:
    """Resolve a Dutch province code, or 'XX' when it cannot be determined.

    The region (province) name takes precedence; the city lookup table is
    the fallback. Any country other than 'NL' immediately yields 'XX'.
    """
    if country != 'NL':
        return 'XX'
    # Province name first (most authoritative).
    province = DUTCH_PROVINCE_CODES.get((region or '').strip().lower())
    if province:
        return province
    # Fall back to the city -> province table; 'XX' when unknown.
    return DUTCH_CITY_TO_PROVINCE.get((city or '').strip().lower(), 'XX')
def generate_abbreviation(name: str) -> str:
    """Generate an initials-based abbreviation (max 10 chars) from an institution name.

    Articles, prepositions, generic institution words ("stichting", "museum",
    ...) and pure numbers are skipped; diacritics are folded to ASCII.
    Falls back to the first three characters of the name (upper-cased) when
    no usable initials remain; returns "XXX" for empty input.

    Bug fixed vs. the original: the initial was taken from the *raw* word
    while the skip-word test used the punctuation-stripped word, so a word
    like "(Royal" lost its 'R' (the '(' failed the isalpha test). The
    stripped form is now used for both. A duplicate 'of' in the skip set
    was also removed.
    """
    if not name:
        return "XXX"
    # Dutch + English stop words, plus generic GLAM-institution words.
    skip_words = {
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des', "'s",
        'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder', 'door', 'en',
        'a', 'an', 'the', 'of', 'at', 'on', 'to', 'for', 'with', 'from', 'by', 'as', 'and', 'or',
        'stichting', 'vereniging', 'foundation', 'museum', 'archief', 'bibliotheek',
    }
    initials = []
    for word in name.split():
        # Strip surrounding punctuation once and reuse for both the
        # skip-word test and the initial letter.
        core = word.strip('.,;:!?()')
        if not core or core.lower() in skip_words or core.isdigit():
            continue
        # Fold diacritics on the first letter to plain ASCII.
        first = unicodedata.normalize('NFD', core[0])
        first = ''.join(c for c in first if unicodedata.category(c) != 'Mn')
        if first.isalpha():
            initials.append(first.upper())
    if not initials:
        # All words were stop words / numbers: fall back to a prefix.
        return name[:3].upper()
    return ''.join(initials[:10])  # Max 10 chars
def update_custodian_file(
    file_path: Path,
    country: str,
    region_code: str,
    city: str,
    city_code: str,
    source_file: str
) -> Tuple[str, str]:
    """Rewrite one custodian YAML file with resolved location data.

    Rebuilds the GHCID as COUNTRY-REGION-CITY-TYPE-ABBREVIATION, carrying
    over any name suffix (parts beyond the fifth) present in the old GHCID,
    updates the 'location' section, and records how the location was
    resolved under ghcid.location_resolution.

    Returns (old_ghcid, new_ghcid).
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        data = yaml.safe_load(fh)

    old_ghcid = data.get('ghcid', {}).get('ghcid_current', '')

    # Institution type: first listed entry, defaulting to 'M'.
    inst_types = data.get('institution_type', ['M'])
    inst_type = inst_types[0] if inst_types else 'M'

    # Abbreviation derives from the institution's own (emic) name.
    abbreviation = generate_abbreviation(
        data.get('custodian_name', {}).get('emic_name', ''))

    new_ghcid = f"{country}-{region_code}-{city_code}-{inst_type}-{abbreviation}"

    # Carry over any trailing name suffix from the previous GHCID.
    suffix_parts = old_ghcid.split('-')[5:]
    if suffix_parts:
        new_ghcid = f"{new_ghcid}-{'-'.join(suffix_parts)}"

    # Update location section.
    location = data.setdefault('location', {})
    location['city'] = city
    location['region'] = region_code
    location['country'] = country

    # Update GHCID and record provenance of the resolution.
    ghcid_block = data.setdefault('ghcid', {})
    ghcid_block['ghcid_current'] = new_ghcid
    ghcid_block['location_resolution'] = {
        'method': 'LINKEDIN_HTML_EXTRACTION',
        'source_file': source_file,
        'city_code': city_code,
        'region_code': region_code,
        'country_code': country,
    }

    # Write back, preserving key order and non-ASCII characters.
    with open(file_path, 'w', encoding='utf-8') as fh:
        yaml.dump(data, fh, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return old_ghcid, new_ghcid
def _report_results(total, matched, updates, not_matched, non_nl_files):
    """Print a summary of the enrichment run to stdout (truncated lists)."""
    print(f"\n{'='*60}")
    print("RESULTS")
    print(f"{'='*60}")
    print(f"Total XXX files: {total}")
    print(f"Matched and updated: {matched}")
    print(f"Not matched: {len(not_matched)}")
    print(f"Non-NL institutions found: {len(non_nl_files)}")
    if updates:
        print(f"\n--- UPDATES ({len(updates)}) ---")
        for u in updates[:20]:
            print(f"  {u['emic_name'][:40]:<40} | {u['city']:<15} | {u['region']} | {u['new_ghcid']}")
        if len(updates) > 20:
            print(f"  ... and {len(updates) - 20} more")
    if non_nl_files:
        print(f"\n--- NON-NL INSTITUTIONS ({len(non_nl_files)}) ---")
        for nf in non_nl_files[:10]:
            print(f"  {nf['emic_name'][:40]:<40} | {nf['country']} | {nf['city']}")
        if len(non_nl_files) > 10:
            print(f"  ... and {len(non_nl_files) - 10} more")
    if not_matched:
        print(f"\n--- NOT MATCHED ({len(not_matched)}) ---")
        for nm in not_matched[:20]:
            print(f"  {nm[0]:<50} | {nm[1][:40]}")
        if len(not_matched) > 20:
            print(f"  ... and {len(not_matched) - 20} more")


def main(
    custodian_dir: Optional[Path] = None,
    linkedin_dir: Optional[Path] = None,
):
    """Enrich NL-XX-XXX custodian files with LinkedIn-derived locations.

    Workflow:
      1. Parse LinkedIn HTML exports for company headquarters locations.
      2. Match custodian YAML files (NL-XX-XXX-*.yaml) by normalized emic_name,
         first exactly, then by substring in either direction.
      3. Rewrite matched NL files with country/region/city and a regenerated
         GHCID; collect non-NL matches separately for later handling.

    Args:
        custodian_dir: Directory containing the custodian YAML files.
            Defaults to the original hard-coded project data directory
            (kept for backward compatibility).
        linkedin_dir: Directory containing the saved LinkedIn HTML pages.
            Defaults to <custodian_dir>/person/affiliated/manual.

    Returns:
        Tuple (updates, not_matched, non_nl_files) for caller inspection.
    """
    if custodian_dir is None:
        custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian')
    if linkedin_dir is None:
        linkedin_dir = custodian_dir / 'person' / 'affiliated' / 'manual'

    # Step 1: Extract locations from LinkedIn HTML files.
    print("Extracting locations from LinkedIn HTML files...")
    linkedin_locations = extract_linkedin_locations(linkedin_dir)
    print(f"Extracted locations for {len(linkedin_locations)} companies")

    # Step 2: Find custodian files still carrying the XXX placeholder GHCID.
    xxx_files = list(custodian_dir.glob('NL-XX-XXX-*.yaml'))
    print(f"\nFound {len(xxx_files)} NL-XX-XXX files to process")

    # Step 3: Match and update.
    matched = 0
    not_matched = []   # (filename, reason-or-emic-name) pairs
    updates = []       # records of successfully rewritten files
    non_nl_files = []  # matched but headquartered outside NL
    for file_path in xxx_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            emic_name = data.get('custodian_name', {}).get('emic_name', '')
            if not emic_name:
                not_matched.append((file_path.name, "No emic_name"))
                continue
            normalized = normalize_name(emic_name)
            # Exact match on the normalized name first...
            location = linkedin_locations.get(normalized)
            if not location:
                # ...then fall back to substring matching in either direction.
                for key, loc in linkedin_locations.items():
                    if normalized in key or key in normalized:
                        location = loc
                        break
            if location:
                country = location['country']
                city = location['city']
                region = location.get('region', '')
                source_file = location['source_file']
                if country != 'NL':
                    # Non-Dutch institution - mark for later handling.
                    non_nl_files.append({
                        'file': file_path.name,
                        'emic_name': emic_name,
                        'country': country,
                        'city': city,
                    })
                    continue
                # Resolve province and city codes for the GHCID.
                region_code = get_province_code(city, region, country)
                city_code = generate_city_code(city)
                if region_code == 'XX' and city_code == 'XXX':
                    # Neither province nor city could be resolved: skip.
                    not_matched.append((file_path.name, f"No province/city for {city}"))
                    continue
                # Update the file on disk.
                old_ghcid, new_ghcid = update_custodian_file(
                    file_path, country, region_code, city, city_code, source_file
                )
                updates.append({
                    'file': file_path.name,
                    'emic_name': emic_name,
                    'old_ghcid': old_ghcid,
                    'new_ghcid': new_ghcid,
                    'city': city,
                    'region': region_code,
                })
                matched += 1
            else:
                not_matched.append((file_path.name, emic_name))
        except Exception as e:
            # Best-effort batch: report and keep going with the next file.
            print(f"Error processing {file_path.name}: {e}")

    _report_results(len(xxx_files), matched, updates, not_matched, non_nl_files)
    return updates, not_matched, non_nl_files
# Run the enrichment pipeline when executed as a script.
if __name__ == '__main__':
    main()