#!/usr/bin/env python3
"""
Enrich NL-XX-XXX custodian files with location data from LinkedIn HTML files.

This script:
1. Parses LinkedIn HTML files to extract company names and headquarters locations
2. Matches custodian files by emic_name
3. Updates custodian files with correct country, region (province), and city codes
4. Regenerates GHCID based on new location data
"""

import os
import re
import json
import yaml
from pathlib import Path
from typing import Optional, Dict, Any, Tuple
from collections import defaultdict
import unicodedata

# ISO-style Dutch province codes keyed by (lowercased) province name.
# Includes Dutch, Frisian, and English spellings so LinkedIn's
# `geographicArea` field matches regardless of locale.
DUTCH_PROVINCE_CODES = {
    'drenthe': 'DR',
    'flevoland': 'FL',
    'friesland': 'FR',
    'fryslân': 'FR',
    'gelderland': 'GE',
    'groningen': 'GR',
    'limburg': 'LI',
    'noord-brabant': 'NB',
    'noord brabant': 'NB',
    'north brabant': 'NB',
    'noord-holland': 'NH',
    'noord holland': 'NH',
    'north holland': 'NH',
    'overijssel': 'OV',
    'utrecht': 'UT',
    'zeeland': 'ZE',
    'zuid-holland': 'ZH',
    'zuid holland': 'ZH',
    'south holland': 'ZH',
}

# Fallback: province code keyed by (lowercased) city name, used when
# LinkedIn supplies a city but no usable `geographicArea`.
DUTCH_CITY_TO_PROVINCE = {
    'amsterdam': 'NH',
    'rotterdam': 'ZH',
    'den haag': 'ZH',
    'the hague': 'ZH',
    "'s-gravenhage": 'ZH',
    's-gravenhage': 'ZH',
    'utrecht': 'UT',
    'eindhoven': 'NB',
    'tilburg': 'NB',
    'groningen': 'GR',
    'almere': 'FL',
    'breda': 'NB',
    'nijmegen': 'GE',
    'arnhem': 'GE',
    'haarlem': 'NH',
    'enschede': 'OV',
    'maastricht': 'LI',
    'leiden': 'ZH',
    'dordrecht': 'ZH',
    'apeldoorn': 'GE',
    'zwolle': 'OV',
    'amersfoort': 'UT',
    'delft': 'ZH',
    'alkmaar': 'NH',
    'zaandam': 'NH',
    'leeuwarden': 'FR',
    'hilversum': 'NH',
    'deventer': 'OV',
    'middelburg': 'ZE',
    'assen': 'DR',
    'wageningen': 'GE',
    'lelystad': 'FL',
    'venlo': 'LI',
    'heerlen': 'LI',
    'sittard': 'LI',
    'oss': 'NB',
    "'s-hertogenbosch": 'NB',
    's-hertogenbosch': 'NB',
    'den bosch': 'NB',
    'gouda': 'ZH',
    'schiedam': 'ZH',
    'zoetermeer': 'ZH',
    'alphen aan den rijn': 'ZH',
    'emmen': 'DR',
    'kampen': 'OV',
    'harderwijk': 'GE',
    'hoorn': 'NH',
    'purmerend': 'NH',
    'vlaardingen': 'ZH',
    'beverwijk': 'NH',
    'hoofddorp': 'NH',
    'amstelveen': 'NH',
    'diemen': 'NH',
    'nieuwegein': 'UT',
    'zeist': 'UT',
    'veenendaal': 'UT',
    'helmond': 'NB',
    'roosendaal': 'NB',
    'bergen op zoom': 'NB',
    'waalwijk': 'NB',
    'vlissingen': 'ZE',
    'goes': 'ZE',
    'terneuzen': 'ZE',
    'roermond': 'LI',
    'weert': 'LI',
    'kerkrade': 'LI',
    'geleen': 'LI',
    'doetinchem': 'GE',
    'tiel': 'GE',
    'ede': 'GE',
    'barneveld': 'GE',
    'winterswijk': 'GE',
    'almelo': 'OV',
    'hengelo': 'OV',
    'oldenzaal': 'OV',
    'steenwijk': 'OV',
    'meppel': 'DR',
    'hoogeveen': 'DR',
    'coevorden': 'DR',
    'drachten': 'FR',
    'sneek': 'FR',
    'heerenveen': 'FR',
    'harlingen': 'FR',
    'franeker': 'FR',
}


def normalize_name(name: str) -> str:
    """Normalize a name for fuzzy matching.

    Lowercases, strips diacritics (via NFD decomposition), replaces
    punctuation with spaces, and collapses runs of whitespace.
    Returns "" for falsy input.
    """
    if not name:
        return ""
    # NFD decomposition separates base characters from combining marks,
    # which are then dropped (category 'Mn' = nonspacing mark).
    normalized = unicodedata.normalize('NFD', name.lower())
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    # Replace punctuation with spaces, then squeeze whitespace.
    ascii_name = re.sub(r'[^\w\s]', ' ', ascii_name)
    ascii_name = re.sub(r'\s+', ' ', ascii_name).strip()
    return ascii_name


def generate_city_code(city: str) -> str:
    """Generate a 3-letter city code from a city name.

    Rules:
    - single word: first 3 letters, uppercased
    - leading Dutch article ("de", "het", "den", "'s", "s"):
      article initial + first 2 letters of the main word
    - multi-word otherwise: initials of non-article words, max 3
    Returns "XXX" for empty or whitespace-only input.

    Fixes vs. previous version: the unreachable trailing return and the
    always-true inner length check were removed, and whitespace-only
    input no longer raises IndexError.
    """
    if not city:
        return "XXX"
    city = city.strip()
    if not city:
        # Guard: " ".split() would yield [] and words[0] would crash.
        return "XXX"

    words = city.split()
    dutch_articles = {'de', 'het', 'den', "'s", 's'}

    if len(words) == 1:
        return city[:3].upper()
    if words[0].lower() in dutch_articles:
        # len(words) > 1 is guaranteed here, so words[1] is safe.
        return (words[0][0] + words[1][:2]).upper()
    initials = ''.join(w[0] for w in words if w.lower() not in dutch_articles)
    return initials[:3].upper()


def extract_linkedin_locations(html_dir: Path) -> Dict[str, Dict[str, Any]]:
    """Extract company names and headquarters locations from LinkedIn HTML files.

    The company name comes from the filename (pattern
    "(N) Company Name_ People _ LinkedIn.html", with the "(N) " prefix
    optional); the headquarters address comes from JSON embedded in the
    page HTML.

    Returns a dict mapping normalized company name -> location info
    ({'original_name', 'country', 'city', 'region', 'source_file'}).
    """
    locations: Dict[str, Dict[str, Any]] = {}
    html_files = list(html_dir.glob("*.html"))
    print(f"Found {len(html_files)} HTML files to process")

    for html_file in html_files:
        try:
            content = html_file.read_text(encoding='utf-8', errors='ignore')

            # Extract company name from filename.
            # Format: "(N) Company Name_ People _ LinkedIn.html"
            filename = html_file.stem
            match = re.match(r'\(\d+\)\s*(.+?)_\s*People\s*_\s*LinkedIn', filename)
            if match:
                company_name = match.group(1).strip()
            else:
                # Try without the number prefix
                match = re.match(r'(.+?)_\s*People\s*_\s*LinkedIn', filename)
                if match:
                    company_name = match.group(1).strip()
                else:
                    continue

            # Extract headquarter location from JSON embedded in HTML.
            # Pattern: "headquarter":{"streetAddressOptOut":...,
            #   "address":{"country":"NL",...,"city":"Amsterdam",...}}
            hq_pattern = r'"headquarter":\s*\{[^}]*"address":\s*\{([^}]+)\}'
            hq_matches = re.findall(hq_pattern, content)

            if hq_matches:
                # Get the first headquarter (usually the main one)
                address_json = '{' + hq_matches[0] + '}'
                try:
                    # Clean up JSON: LinkedIn embeds "$recipeTypes"-style
                    # metadata keys that would break json.loads; strip
                    # them and repair any dangling commas left behind.
                    address_json = re.sub(r'"\$[^"]*":\s*\[[^\]]*\]', '', address_json)
                    address_json = re.sub(r'"\$[^"]*":\s*"[^"]*"', '', address_json)
                    address_json = re.sub(r',\s*,', ',', address_json)
                    address_json = re.sub(r',\s*}', '}', address_json)

                    address_data = json.loads(address_json)
                    country = address_data.get('country', '')
                    city = address_data.get('city', '')
                    region = address_data.get('geographicArea', '')

                    if country or city:
                        normalized = normalize_name(company_name)
                        locations[normalized] = {
                            'original_name': company_name,
                            'country': country,
                            'city': city,
                            'region': region,
                            'source_file': str(html_file.name),
                        }
                except json.JSONDecodeError:
                    # Best-effort extraction: skip addresses whose JSON
                    # could not be repaired.
                    pass
        except Exception as e:
            print(f"Error processing {html_file.name}: {e}")
            continue

    return locations


def get_province_code(city: str, region: str, country: str) -> str:
    """Return the Dutch province code for a city/region pair.

    Tries the region (province) name first, then falls back to the
    city lookup table. Returns 'XX' for non-NL countries or when no
    mapping is found.
    """
    if country != 'NL':
        return 'XX'

    # Prefer the explicit region name when LinkedIn supplies one.
    if region:
        region_lower = region.lower().strip()
        if region_lower in DUTCH_PROVINCE_CODES:
            return DUTCH_PROVINCE_CODES[region_lower]

    # Fall back to the city -> province table.
    if city:
        city_lower = city.lower().strip()
        if city_lower in DUTCH_CITY_TO_PROVINCE:
            return DUTCH_CITY_TO_PROVINCE[city_lower]

    return 'XX'


def generate_abbreviation(name: str) -> str:
    """Generate an abbreviation (max 10 chars) from an institution name.

    Takes the first letter of each significant word, skipping Dutch/English
    articles, prepositions, generic institution words, and pure numbers.
    Diacritics are stripped from initials. Falls back to the first 3
    characters of the whole name when no initials survive.
    """
    if not name:
        return "XXX"

    # Words that carry no distinguishing information for an abbreviation.
    skip_words = {
        'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der',
        'des', "'s", 'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit',
        'over', 'onder', 'door', 'en',
        'a', 'an', 'the', 'of', 'at', 'on', 'to', 'for', 'with', 'from',
        'by', 'as', 'and', 'or',
        'stichting', 'vereniging', 'foundation', 'museum', 'archief',
        'bibliotheek',
    }

    initials = []
    for word in words_iter(name):
        word_lower = word.lower().strip('.,;:!?()')
        if word_lower in skip_words:
            continue
        if word.isdigit():
            continue
        if word:
            # Strip diacritics from the initial so the GHCID stays ASCII.
            first = unicodedata.normalize('NFD', word[0])
            first = ''.join(c for c in first if unicodedata.category(c) != 'Mn')
            if first.isalpha():
                initials.append(first.upper())

    if not initials:
        # Fallback: first 3 characters of the full name.
        return name[:3].upper()

    return ''.join(initials[:10])  # Max 10 chars


def words_iter(name: str):
    """Yield whitespace-separated words of *name* (small helper for clarity)."""
    return name.split()


def update_custodian_file(
    file_path: Path,
    country: str,
    region_code: str,
    city: str,
    city_code: str,
    source_file: str
) -> Tuple[str, str]:
    """Update a custodian YAML file in place with new location data.

    Rewrites the `location` mapping, regenerates `ghcid_current` as
    COUNTRY-REGION-CITY-TYPE-ABBREV (preserving any name suffix beyond
    the fifth hyphen-separated part of the old GHCID), and records how
    the location was resolved under `ghcid.location_resolution`.

    Returns (old_ghcid, new_ghcid).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # `or {}` / `or ['M']`: YAML keys present but null would otherwise
    # make the chained .get() calls raise AttributeError.
    old_ghcid = (data.get('ghcid') or {}).get('ghcid_current', '')

    inst_types = data.get('institution_type') or ['M']
    inst_type = inst_types[0] if inst_types else 'M'

    emic_name = (data.get('custodian_name') or {}).get('emic_name', '')
    abbreviation = generate_abbreviation(emic_name)

    new_ghcid = f"{country}-{region_code}-{city_code}-{inst_type}-{abbreviation}"

    # Preserve any name suffix carried by the old GHCID (parts beyond the
    # 5-part base COUNTRY-REGION-CITY-TYPE-ABBREV).
    old_parts = old_ghcid.split('-')
    if len(old_parts) > 5:
        name_suffix = '-'.join(old_parts[5:])
        new_ghcid = f"{new_ghcid}-{name_suffix}"

    # Update location
    if 'location' not in data:
        data['location'] = {}
    data['location']['city'] = city
    data['location']['region'] = region_code
    data['location']['country'] = country

    # Update GHCID and record provenance of the location resolution.
    if 'ghcid' not in data:
        data['ghcid'] = {}
    data['ghcid']['ghcid_current'] = new_ghcid
    data['ghcid']['location_resolution'] = {
        'method': 'LINKEDIN_HTML_EXTRACTION',
        'source_file': source_file,
        'city_code': city_code,
        'region_code': region_code,
        'country_code': country,
    }

    with open(file_path, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                  sort_keys=False)

    return old_ghcid, new_ghcid


def main():
    """Match NL-XX-XXX custodian files against LinkedIn HQ data and update them."""
    # NOTE(review): hard-coded local paths — parameterize or read from
    # config if this script needs to run elsewhere.
    custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian')
    linkedin_dir = Path('/Users/kempersc/apps/glam/data/custodian/person/affiliated/manual')

    # Step 1: Extract locations from LinkedIn HTML files
    print("Extracting locations from LinkedIn HTML files...")
    linkedin_locations = extract_linkedin_locations(linkedin_dir)
    print(f"Extracted locations for {len(linkedin_locations)} companies")

    # Step 2: Find XXX custodian files
    xxx_files = list(custodian_dir.glob('NL-XX-XXX-*.yaml'))
    print(f"\nFound {len(xxx_files)} NL-XX-XXX files to process")

    # Step 3: Match and update
    matched = 0
    not_matched = []
    updates = []
    non_nl_files = []

    for file_path in xxx_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            emic_name = (data.get('custodian_name') or {}).get('emic_name', '')
            if not emic_name:
                not_matched.append((file_path.name, "No emic_name"))
                continue

            normalized = normalize_name(emic_name)

            # Exact match first, then substring match in either direction.
            location = linkedin_locations.get(normalized)
            if not location:
                for key, loc in linkedin_locations.items():
                    if normalized in key or key in normalized:
                        location = loc
                        break

            if location:
                country = location['country']
                city = location['city']
                region = location.get('region', '')
                source_file = location['source_file']

                if country != 'NL':
                    # Non-Dutch institution - mark for later
                    non_nl_files.append({
                        'file': file_path.name,
                        'emic_name': emic_name,
                        'country': country,
                        'city': city,
                    })
                    continue

                region_code = get_province_code(city, region, country)
                city_code = generate_city_code(city)

                if region_code == 'XX' and city_code == 'XXX':
                    not_matched.append((file_path.name, f"No province/city for {city}"))
                    continue

                old_ghcid, new_ghcid = update_custodian_file(
                    file_path, country, region_code, city, city_code, source_file
                )

                updates.append({
                    'file': file_path.name,
                    'emic_name': emic_name,
                    'old_ghcid': old_ghcid,
                    'new_ghcid': new_ghcid,
                    'city': city,
                    'region': region_code,
                })
                matched += 1
            else:
                not_matched.append((file_path.name, emic_name))
        except Exception as e:
            print(f"Error processing {file_path.name}: {e}")

    # Report results
    print(f"\n{'='*60}")
    print("RESULTS")
    print(f"{'='*60}")
    print(f"Total XXX files: {len(xxx_files)}")
    print(f"Matched and updated: {matched}")
    print(f"Not matched: {len(not_matched)}")
    print(f"Non-NL institutions found: {len(non_nl_files)}")

    if updates:
        print(f"\n--- UPDATES ({len(updates)}) ---")
        for u in updates[:20]:
            print(f"  {u['emic_name'][:40]:<40} | {u['city']:<15} | {u['region']} | {u['new_ghcid']}")
        if len(updates) > 20:
            print(f"  ... and {len(updates) - 20} more")

    if non_nl_files:
        print(f"\n--- NON-NL INSTITUTIONS ({len(non_nl_files)}) ---")
        for nf in non_nl_files[:10]:
            print(f"  {nf['emic_name'][:40]:<40} | {nf['country']} | {nf['city']}")
        if len(non_nl_files) > 10:
            print(f"  ... and {len(non_nl_files) - 10} more")

    if not_matched:
        print(f"\n--- NOT MATCHED ({len(not_matched)}) ---")
        for nm in not_matched[:20]:
            print(f"  {nm[0]:<50} | {nm[1][:40]}")
        if len(not_matched) > 20:
            print(f"  ... and {len(not_matched) - 20} more")

    return updates, not_matched, non_nl_files


if __name__ == '__main__':
    main()