#!/usr/bin/env python3
"""
Extract company locations from LinkedIn About pages.

These HTML files contain the actual headquarters/primary location of each
company, which can be used to resolve PENDING files to proper GHCIDs.

Usage:
    python scripts/extract_linkedin_locations.py --output data/linkedin_locations.json
    python scripts/extract_linkedin_locations.py --test   # Test with 10 files
"""

import re
import json
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, List
from collections import Counter

# Source directory for LinkedIn HTML files
SOURCE_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")

# Dutch province name (English or Dutch spelling) to two-letter province code
PROVINCE_TO_CODE = {
    'noord-holland': 'NH', 'north holland': 'NH',
    'zuid-holland': 'ZH', 'south holland': 'ZH',
    'utrecht': 'UT',
    'gelderland': 'GE',
    'noord-brabant': 'NB', 'north brabant': 'NB',
    'limburg': 'LI',
    'overijssel': 'OV',
    'friesland': 'FR',
    'drenthe': 'DR',
    'groningen': 'GR',
    'zeeland': 'ZE',
    'flevoland': 'FL',
}

# City to province mapping (for cases where the province is not in the HTML)
CITY_TO_PROVINCE = {
    'amsterdam': 'NH', 'haarlem': 'NH', 'alkmaar': 'NH', 'hilversum': 'NH',
    'zaandam': 'NH', 'hoorn': 'NH', 'enkhuizen': 'NH',
    'rotterdam': 'ZH', 'den haag': 'ZH', 'the hague': 'ZH', 'leiden': 'ZH',
    'delft': 'ZH', 'dordrecht': 'ZH', 'gouda': 'ZH', 'schiedam': 'ZH',
    'utrecht': 'UT', 'amersfoort': 'UT', 'zeist': 'UT',
    'arnhem': 'GE', 'nijmegen': 'GE', 'apeldoorn': 'GE', 'ede': 'GE',
    'wageningen': 'GE',
    'eindhoven': 'NB', 'tilburg': 'NB', 'breda': 'NB',
    "'s-hertogenbosch": 'NB', 'den bosch': 'NB',
    'maastricht': 'LI', 'venlo': 'LI', 'heerlen': 'LI', 'roermond': 'LI',
    'zwolle': 'OV', 'deventer': 'OV', 'enschede': 'OV',
    'leeuwarden': 'FR',
    'assen': 'DR',
    'groningen': 'GR',
    'middelburg': 'ZE',
    'almere': 'FL', 'lelystad': 'FL',
}

# City to 3-letter city code
CITY_TO_CODE = {
    'amsterdam': 'AMS', 'rotterdam': 'ROT', 'den haag': 'DHA',
    'the hague': 'DHA', "'s-gravenhage": 'DHA', 'utrecht': 'UTR',
    'eindhoven': 'EIN', 'tilburg': 'TIL', 'groningen': 'GRO',
    'almere': 'ALM', 'breda': 'BRE', 'nijmegen': 'NIJ', 'apeldoorn': 'APE',
    'haarlem': 'HAA', 'arnhem': 'ARN', 'enschede': 'ENS',
    'amersfoort': 'AME', 'zaanstad': 'ZAA', 'zaandam': 'ZAA',
    "'s-hertogenbosch": 'DBO', 'den bosch': 'DBO', 'zwolle': 'ZWO',
    'leiden': 'LEI', 'maastricht': 'MAA', 'dordrecht': 'DOR',
    'deventer': 'DEV', 'delft': 'DEL', 'alkmaar': 'ALK',
    'leeuwarden': 'LEE', 'hilversum': 'HIL', 'assen': 'ASS',
    'middelburg': 'MID', 'hoorn': 'HOO', 'enkhuizen': 'ENK',
    'wageningen': 'WAG', 'gouda': 'GOU', 'venlo': 'VEN', 'heerlen': 'HEE',
    'roermond': 'ROE', 'zeist': 'ZEI', 'ede': 'EDE',
}


def normalize_name(name: str) -> str:
    """Normalize an organization name for matching.

    Unicode-normalizes (NFKD), lowercases, trims, and strips a trailing
    " | LinkedIn ..." page-title suffix.
    """
    normalized = unicodedata.normalize('NFKD', name)
    normalized = normalized.lower().strip()
    # Drop the LinkedIn page-title suffix, e.g. "acme | linkedin"
    normalized = re.sub(r'\s*\|\s*linkedin.*$', '', normalized)
    return normalized


def extract_org_name_from_filename(filename: str) -> str:
    """Extract the organization name from a LinkedIn HTML filename.

    Handles both "(NN) Org Name_ About _ LinkedIn.html" and the same
    pattern without the "(NN) " prefix. Falls back to the raw filename
    when neither pattern matches.
    """
    # Pattern: (XX) Organization Name_ About/People _ LinkedIn.html
    match = re.match(r'\(\d+\)\s*(.+?)_\s*(About|People)\s*_\s*LinkedIn', filename)
    if match:
        return match.group(1).strip()
    # Simpler pattern without the browser's duplicate-download number prefix
    match = re.match(r'(.+?)_\s*(About|People)\s*_\s*LinkedIn', filename)
    if match:
        return match.group(1).strip()
    return filename


def _strip_html(fragment: str) -> str:
    """Remove HTML tags and collapse whitespace in an address fragment."""
    text = re.sub(r'<[^>]+>', '', fragment.strip())
    return re.sub(r'\s+', ' ', text).strip()


def extract_company_location(html_content: str) -> Optional[Dict]:
    """Extract the primary company location from a LinkedIn About page.

    Returns the dict produced by parse_address_string, or None when no
    address block is found.
    """
    # Pattern 1: org-locations-module with an explicit "Primary" label —
    # this is the company headquarters.
    match = re.search(
        r'org-locations-module.*?Primary.*?<p[^>]*>(.*?)</p>',
        html_content, re.DOTALL | re.IGNORECASE)
    if match:
        return parse_address_string(_strip_html(match.group(1)))

    # Pattern 2: a "Locations (N)" section without a Primary label.
    match2 = re.search(
        r'Locations\s*\(\d+\).*?<p[^>]*class="[^"]*break-words[^"]*"[^>]*>(.*?)</p>',
        html_content, re.DOTALL | re.IGNORECASE)
    if match2:
        return parse_address_string(_strip_html(match2.group(1)))

    return None


def parse_address_string(address_text: str) -> Optional[Dict]:
    """Parse a LinkedIn address like 'Street, City, Postal Code, Country'.

    Returns a dict with 'raw' and 'country' plus, when identified, 'city',
    'postal_code' and 'street'. Returns None when no city can be found.
    """
    parts = [p.strip() for p in address_text.split(',')]
    if len(parts) < 2:
        return None

    # Country is always the last component (a 2-letter code)
    result = {'raw': address_text, 'country': parts[-1].upper()}

    # BUGFIX: the original used two different digit tests — "contains a
    # digit" to pick the postal code but "starts with a digit" to pick text
    # parts — so a street with a house number ("Keizersgracht 123") was
    # misclassified as the postal code.  Use one consistent heuristic:
    # only a part that *starts* with a digit (e.g. "1015 CJ") is a postal
    # code; everything else is street/city text.
    body = parts[:-1]
    postal_candidates = [p for p in body if re.match(r'\d', p)]
    text_parts = [p for p in body if not re.match(r'\d', p)]

    city = None
    street = None
    if len(text_parts) == 1:
        # Only one text component: it is the city
        city = text_parts[0]
    elif len(text_parts) >= 2:
        # "Street, City, ..." — first is street, second is city
        street, city = text_parts[0], text_parts[1]

    if city:
        result['city'] = city
    if postal_candidates:
        result['postal_code'] = postal_candidates[0]
    if street:
        result['street'] = street
    return result if city else None


def extract_all_locations(source_dir: Path, limit: int = 0) -> Dict[str, Dict]:
    """Extract locations from all About pages under source_dir.

    When limit > 0, only the first `limit` files are processed (test mode).
    Returns {org_name: {'location': dict, 'source_file': filename}}.
    """
    results: Dict[str, Dict] = {}
    about_files = list(source_dir.glob("*About*LinkedIn.html"))
    if limit:
        about_files = about_files[:limit]

    print(f"Processing {len(about_files)} About pages...")
    success = 0
    failed = 0

    for filepath in about_files:
        org_name = extract_org_name_from_filename(filepath.name)
        try:
            content = filepath.read_text(encoding='utf-8')
            location = extract_company_location(content)
            if not location:
                failed += 1
                continue

            # Normalized city name used for all lookups below
            city_lower = location.get('city', '').lower().strip()

            # Province code: prefer an explicit geographicArea field.
            # NOTE(review): parse_address_string never sets this key, so
            # in practice the city lookup resolves the province.
            province_code = None
            if 'geographicArea' in location:
                province_code = PROVINCE_TO_CODE.get(location['geographicArea'].lower())
            if not province_code:
                province_code = CITY_TO_PROVINCE.get(city_lower)

            # City code: known table first, else derive a short code —
            # first 3 letters for one word, else word initials.
            city_code = CITY_TO_CODE.get(city_lower)
            if not city_code:
                words = city_lower.split()
                if len(words) == 1:
                    city_code = city_lower[:3].upper()
                else:
                    city_code = ''.join(w[0] for w in words[:3]).upper()

            location['province_code'] = province_code
            location['city_code'] = city_code
            results[org_name] = {
                'location': location,
                'source_file': filepath.name,
            }
            success += 1
        except Exception:
            # Best-effort batch: unreadable or unparsable files are only
            # counted, not reported individually.
            failed += 1

    print(f"Successfully extracted: {success}")
    print(f"Failed: {failed}")
    return results


def main():
    """CLI entry point: extract locations, print stats, save JSON output."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--output', type=Path,
                        default=Path('data/linkedin_locations.json'))
    parser.add_argument('--test', action='store_true', help='Test with 10 files')
    parser.add_argument('--source', type=Path, default=SOURCE_DIR)
    args = parser.parse_args()

    limit = 10 if args.test else 0
    results = extract_all_locations(args.source, limit=limit)

    # Print some stats
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)

    country_counts = Counter()
    city_counts = Counter()
    for org, data in results.items():
        loc = data['location']
        country_counts[loc.get('country', 'Unknown')] += 1
        city_counts[loc.get('city', 'Unknown')] += 1

    print("\nTop countries:")
    for country, count in country_counts.most_common(10):
        print(f"  {country}: {count}")

    print("\nTop cities:")
    for city, count in city_counts.most_common(15):
        print(f"  {city}: {count}")

    # Save results (skipped in test mode)
    if not args.test:
        args.output.parent.mkdir(parents=True, exist_ok=True)
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump({
                'extracted_at': datetime.now(timezone.utc).isoformat(),
                'total_organizations': len(results),
                'organizations': results,
            }, f, indent=2, ensure_ascii=False)
        print(f"\nSaved to: {args.output}")
    else:
        print("\n[TEST MODE] Not saving results")
        print("\nSample extractions:")
        for org, data in list(results.items())[:5]:
            print(f"  {org}:")
            print(f"    {data['location']}")


if __name__ == '__main__':
    main()