# glam/scripts/extract_linkedin_locations.py
# Snapshot metadata: 2026-01-09 20:35:19 +01:00 — 364 lines, 11 KiB, Python

#!/usr/bin/env python3
"""
Extract company locations from LinkedIn About pages.
These HTML files contain the actual headquarters/primary location of each company,
which can be used to resolve PENDING files to proper GHCIDs.
Usage:
python scripts/extract_linkedin_locations.py --output data/linkedin_locations.json
python scripts/extract_linkedin_locations.py --test # Test with 10 files
"""
import re
import json
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, List
from collections import Counter
# Source directory for the manually saved LinkedIn HTML files.
SOURCE_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
# Dutch province name (lowercase, Dutch or English spelling) to 2-letter code.
PROVINCE_TO_CODE = {
    'noord-holland': 'NH',
    'north holland': 'NH',
    'zuid-holland': 'ZH',
    'south holland': 'ZH',
    'utrecht': 'UT',
    'gelderland': 'GE',
    'noord-brabant': 'NB',
    'north brabant': 'NB',
    'limburg': 'LI',
    'overijssel': 'OV',
    'friesland': 'FR',
    'drenthe': 'DR',
    'groningen': 'GR',
    'zeeland': 'ZE',
    'flevoland': 'FL',
}
# Lowercase city name to province code (fallback when the HTML gives no province).
CITY_TO_PROVINCE = {
    'amsterdam': 'NH',
    'haarlem': 'NH',
    'alkmaar': 'NH',
    'hilversum': 'NH',
    'zaandam': 'NH',
    'hoorn': 'NH',
    'enkhuizen': 'NH',
    'rotterdam': 'ZH',
    'den haag': 'ZH',
    'the hague': 'ZH',
    'leiden': 'ZH',
    'delft': 'ZH',
    'dordrecht': 'ZH',
    'gouda': 'ZH',
    'schiedam': 'ZH',
    'utrecht': 'UT',
    'amersfoort': 'UT',
    'zeist': 'UT',
    'arnhem': 'GE',
    'nijmegen': 'GE',
    'apeldoorn': 'GE',
    'ede': 'GE',
    'wageningen': 'GE',
    'eindhoven': 'NB',
    'tilburg': 'NB',
    'breda': 'NB',
    "'s-hertogenbosch": 'NB',
    'den bosch': 'NB',
    'maastricht': 'LI',
    'venlo': 'LI',
    'heerlen': 'LI',
    'roermond': 'LI',
    'zwolle': 'OV',
    'deventer': 'OV',
    'enschede': 'OV',
    'leeuwarden': 'FR',
    'assen': 'DR',
    'groningen': 'GR',
    'middelburg': 'ZE',
    'almere': 'FL',
    'lelystad': 'FL',
}
# Lowercase city name to 3-letter city code; aliases ('den haag' / 'the hague' /
# "'s-gravenhage") map to the same code. Unknown cities get a generated code
# in extract_all_locations.
CITY_TO_CODE = {
    'amsterdam': 'AMS',
    'rotterdam': 'ROT',
    'den haag': 'DHA',
    'the hague': 'DHA',
    "'s-gravenhage": 'DHA',
    'utrecht': 'UTR',
    'eindhoven': 'EIN',
    'tilburg': 'TIL',
    'groningen': 'GRO',
    'almere': 'ALM',
    'breda': 'BRE',
    'nijmegen': 'NIJ',
    'apeldoorn': 'APE',
    'haarlem': 'HAA',
    'arnhem': 'ARN',
    'enschede': 'ENS',
    'amersfoort': 'AME',
    'zaanstad': 'ZAA',
    'zaandam': 'ZAA',
    "'s-hertogenbosch": 'DBO',
    'den bosch': 'DBO',
    'zwolle': 'ZWO',
    'leiden': 'LEI',
    'maastricht': 'MAA',
    'dordrecht': 'DOR',
    'deventer': 'DEV',
    'delft': 'DEL',
    'alkmaar': 'ALK',
    'leeuwarden': 'LEE',
    'hilversum': 'HIL',
    'assen': 'ASS',
    'middelburg': 'MID',
    'hoorn': 'HOO',
    'enkhuizen': 'ENK',
    'wageningen': 'WAG',
    'gouda': 'GOU',
    'venlo': 'VEN',
    'heerlen': 'HEE',
    'roermond': 'ROE',
    'zeist': 'ZEI',
    'ede': 'EDE',
}
def normalize_name(name: str) -> str:
    """Normalize an organization name for matching.

    Applies Unicode NFKD decomposition, lower-cases and trims the name,
    and strips any trailing ' | LinkedIn ...' suffix.
    """
    folded = unicodedata.normalize('NFKD', name).lower().strip()
    return re.sub(r'\s*\|\s*linkedin.*$', '', folded)
def extract_org_name_from_filename(filename: str) -> str:
    """Extract the organization name from a saved LinkedIn HTML filename.

    Tries the browser-numbered form first — '(XX) Org Name_ About _ LinkedIn.html' —
    then the same pattern without the '(XX) ' prefix. Falls back to returning
    the filename unchanged when neither pattern matches.
    """
    candidate_patterns = (
        r'\(\d+\)\s*(.+?)_\s*(About|People)\s*_\s*LinkedIn',
        r'(.+?)_\s*(About|People)\s*_\s*LinkedIn',
    )
    for pattern in candidate_patterns:
        matched = re.match(pattern, filename)
        if matched:
            return matched.group(1).strip()
    return filename
def extract_company_location(html_content: str) -> Optional[Dict]:
    """Extract the primary company location from a LinkedIn About page.

    Tries two markers in order: the org-locations module's 'Primary' address,
    then the first address under a 'Locations (N)' heading. The matched <p>
    content is stripped of tags, whitespace-collapsed, and handed to
    parse_address_string. Returns None when neither marker is found.
    """
    address_patterns = (
        # Headquarters: org-locations module with an explicit 'Primary' label.
        r'org-locations-module.*?Primary.*?<p[^>]*>(.*?)</p>',
        # Fallback: first address inside a 'Locations (N)' section.
        r'Locations\s*\(\d+\).*?<p[^>]*class="[^"]*break-words[^"]*"[^>]*>(.*?)</p>',
    )
    for pattern in address_patterns:
        found = re.search(pattern, html_content, re.DOTALL | re.IGNORECASE)
        if not found:
            continue
        text = re.sub(r'<[^>]+>', '', found.group(1).strip())
        text = re.sub(r'\s+', ' ', text).strip()
        return parse_address_string(text)
    return None
def parse_address_string(address_text: str) -> Optional[Dict]:
    """Parse a LinkedIn address string like 'Street, City, Postal Code, Country'.

    The comma-separated parts are classified as follows:
      * last part        -> 'country' (upper-cased; usually a 2-letter code)
      * parts that start with a digit -> postal code (first one wins;
        Dutch codes look like '1015 AA')
      * remaining parts  -> street then city when there are two or more,
        city alone when there is exactly one

    Returns a dict with 'raw', 'country' and, when identified, 'city',
    'postal_code' and 'street'. Returns None when no city can be found
    or the string has fewer than two comma-separated parts.

    Note: earlier revisions classified any part *containing* a digit as a
    postal code, which mislabeled street addresses like 'Keizersgracht 1';
    the leading-digit test fixes that and matches the street/city split.
    """
    parts = [p.strip() for p in address_text.split(',')]
    if len(parts) < 2:
        return None
    result = {'raw': address_text}
    # Country is always the last component.
    result['country'] = parts[-1].upper()
    # Split the remaining parts into one postal code and the textual parts.
    # Postal codes start with a digit; street names may contain digits
    # ('Keizersgracht 1') but do not start with one.
    postal_code = None
    text_parts = []
    for part in parts[:-1]:
        if re.match(r'\d', part):
            if postal_code is None:
                postal_code = part
        else:
            text_parts.append(part)
    city = None
    street = None
    if len(text_parts) == 1:
        # A single text part is the city.
        city = text_parts[0]
    elif len(text_parts) >= 2:
        # With two or more, LinkedIn lists street before city.
        street = text_parts[0]
        city = text_parts[1]
    if city:
        result['city'] = city
    if postal_code:
        result['postal_code'] = postal_code
    if street:
        result['street'] = street
    # A result without a city is useless downstream (city drives all lookups).
    return result if city else None
def extract_all_locations(source_dir: Path, limit: int = 0) -> Dict[str, Dict]:
    """Extract a location for every '*About*LinkedIn.html' page in source_dir.

    Args:
        source_dir: Directory holding the saved LinkedIn About pages.
        limit: If non-zero, only the first `limit` globbed files are processed
            (used by --test mode).

    Returns:
        Mapping of organization name to
        {'location': <parsed location dict>, 'source_file': <filename>}.
        The location dict is augmented with 'province_code' and 'city_code'.
    """
    results: Dict[str, Dict] = {}
    about_files = list(source_dir.glob("*About*LinkedIn.html"))
    if limit:
        about_files = about_files[:limit]
    print(f"Processing {len(about_files)} About pages...")
    success = 0
    failed = 0
    for filepath in about_files:
        org_name = extract_org_name_from_filename(filepath.name)
        try:
            # Only the read + parse can reasonably raise; keep the try narrow.
            content = filepath.read_text(encoding='utf-8')
            location = extract_company_location(content)
        except Exception:
            # Best-effort batch job: count and skip unreadable/undecodable files.
            failed += 1
            continue
        if not location:
            failed += 1
            continue
        # Normalized city name drives all table lookups below.
        city_lower = location.get('city', '').lower().strip()
        # Province: prefer an explicit geographic area when present, else map
        # from the city. NOTE(review): parse_address_string never sets
        # 'geographicArea', so the first branch is currently dead; kept for
        # forward compatibility with richer extraction.
        province_code = None
        if 'geographicArea' in location:
            province_code = PROVINCE_TO_CODE.get(location['geographicArea'].lower())
        if not province_code:
            province_code = CITY_TO_PROVINCE.get(city_lower)
        # City code: table lookup first, otherwise generate one — the first
        # three letters for single-word cities, initials for multi-word ones.
        city_code = CITY_TO_CODE.get(city_lower)
        if not city_code:
            words = city_lower.split()
            if len(words) == 1:
                city_code = city_lower[:3].upper()
            else:
                city_code = ''.join(w[0] for w in words[:3]).upper()
        location['province_code'] = province_code
        location['city_code'] = city_code
        results[org_name] = {
            'location': location,
            'source_file': filepath.name,
        }
        success += 1
    print(f"Successfully extracted: {success}")
    print(f"Failed: {failed}")
    return results
def main():
    """CLI entry point: extract locations, print stats, optionally save JSON."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--output', type=Path, default=Path('data/linkedin_locations.json'))
    parser.add_argument('--test', action='store_true', help='Test with 10 files')
    parser.add_argument('--source', type=Path, default=SOURCE_DIR)
    args = parser.parse_args()

    results = extract_all_locations(args.source, limit=10 if args.test else 0)

    # Summary statistics over the extracted locations.
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)
    country_counts = Counter(
        data['location'].get('country', 'Unknown') for data in results.values()
    )
    city_counts = Counter(
        data['location'].get('city', 'Unknown') for data in results.values()
    )
    print("\nTop countries:")
    for country, count in country_counts.most_common(10):
        print(f" {country}: {count}")
    print("\nTop cities:")
    for city, count in city_counts.most_common(15):
        print(f" {city}: {count}")

    if args.test:
        # Test mode: show a sample instead of writing output.
        print("\n[TEST MODE] Not saving results")
        print("\nSample extractions:")
        for org, data in list(results.items())[:5]:
            print(f" {org}:")
            print(f" {data['location']}")
        return

    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, 'w', encoding='utf-8') as f:
        json.dump({
            'extracted_at': datetime.now(timezone.utc).isoformat(),
            'total_organizations': len(results),
            'organizations': results
        }, f, indent=2, ensure_ascii=False)
    print(f"\nSaved to: {args.output}")
# Run the CLI only when executed as a script, not on import.
if __name__ == '__main__':
    main()