#!/usr/bin/env python3
"""
Extract company locations from LinkedIn About pages.

These HTML files contain the actual headquarters/primary location of each company,
which can be used to resolve PENDING files to proper GHCIDs.

Usage:
python scripts/extract_linkedin_locations.py --output data/linkedin_locations.json
python scripts/extract_linkedin_locations.py --test # Test with 10 files
"""

import re
import json
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, List
from collections import Counter
# Source directory for LinkedIn HTML files
SOURCE_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")

# Dutch province aliases (Dutch and English spellings) grouped per 2-letter code.
_PROVINCE_ALIASES = (
    (('noord-holland', 'north holland'), 'NH'),
    (('zuid-holland', 'south holland'), 'ZH'),
    (('utrecht',), 'UT'),
    (('gelderland',), 'GE'),
    (('noord-brabant', 'north brabant'), 'NB'),
    (('limburg',), 'LI'),
    (('overijssel',), 'OV'),
    (('friesland',), 'FR'),
    (('drenthe',), 'DR'),
    (('groningen',), 'GR'),
    (('zeeland',), 'ZE'),
    (('flevoland',), 'FL'),
)

# Dutch province name to code mapping (flattened lookup table).
PROVINCE_TO_CODE = {name: code for names, code in _PROVINCE_ALIASES for name in names}
# Cities grouped by province code; used when the province is not in the HTML.
_CITIES_BY_PROVINCE = {
    'NH': ('amsterdam', 'haarlem', 'alkmaar', 'hilversum', 'zaandam', 'hoorn', 'enkhuizen'),
    'ZH': ('rotterdam', 'den haag', 'the hague', 'leiden', 'delft', 'dordrecht', 'gouda', 'schiedam'),
    'UT': ('utrecht', 'amersfoort', 'zeist'),
    'GE': ('arnhem', 'nijmegen', 'apeldoorn', 'ede', 'wageningen'),
    'NB': ('eindhoven', 'tilburg', 'breda', "'s-hertogenbosch", 'den bosch'),
    'LI': ('maastricht', 'venlo', 'heerlen', 'roermond'),
    'OV': ('zwolle', 'deventer', 'enschede'),
    'FR': ('leeuwarden',),
    'DR': ('assen',),
    'GR': ('groningen',),
    'ZE': ('middelburg',),
    'FL': ('almere', 'lelystad'),
}

# City to province mapping (flattened lookup table, lowercase city -> code).
CITY_TO_PROVINCE = {city: code for code, cities in _CITIES_BY_PROVINCE.items() for city in cities}
# Canonical 3-letter city code, keyed by every accepted spelling/alias.
_CITY_ALIASES = (
    (('amsterdam',), 'AMS'),
    (('rotterdam',), 'ROT'),
    (('den haag', 'the hague', "'s-gravenhage"), 'DHA'),
    (('utrecht',), 'UTR'),
    (('eindhoven',), 'EIN'),
    (('tilburg',), 'TIL'),
    (('groningen',), 'GRO'),
    (('almere',), 'ALM'),
    (('breda',), 'BRE'),
    (('nijmegen',), 'NIJ'),
    (('apeldoorn',), 'APE'),
    (('haarlem',), 'HAA'),
    (('arnhem',), 'ARN'),
    (('enschede',), 'ENS'),
    (('amersfoort',), 'AME'),
    (('zaanstad', 'zaandam'), 'ZAA'),
    (("'s-hertogenbosch", 'den bosch'), 'DBO'),
    (('zwolle',), 'ZWO'),
    (('leiden',), 'LEI'),
    (('maastricht',), 'MAA'),
    (('dordrecht',), 'DOR'),
    (('deventer',), 'DEV'),
    (('delft',), 'DEL'),
    (('alkmaar',), 'ALK'),
    (('leeuwarden',), 'LEE'),
    (('hilversum',), 'HIL'),
    (('assen',), 'ASS'),
    (('middelburg',), 'MID'),
    (('hoorn',), 'HOO'),
    (('enkhuizen',), 'ENK'),
    (('wageningen',), 'WAG'),
    (('gouda',), 'GOU'),
    (('venlo',), 'VEN'),
    (('heerlen',), 'HEE'),
    (('roermond',), 'ROE'),
    (('zeist',), 'ZEI'),
    (('ede',), 'EDE'),
)

# City to 3-letter code mapping (flattened lookup table).
CITY_TO_CODE = {name: code for names, code in _CITY_ALIASES for name in names}
def normalize_name(name: str) -> str:
    """Return a lowercased, unicode-normalized form of *name* for matching.

    Applies NFKD normalization, lowercases and trims surrounding whitespace,
    then drops a trailing " | LinkedIn..." page-title suffix if present.
    """
    # NFKD splits composed characters into base + combining marks so that
    # visually identical names normalize to the same string.
    cleaned = unicodedata.normalize('NFKD', name).lower().strip()
    # Strip the "| LinkedIn ..." tail that LinkedIn page titles carry.
    return re.sub(r'\s*\|\s*linkedin.*$', '', cleaned)
def extract_org_name_from_filename(filename: str) -> str:
    """Extract the organization name from a LinkedIn HTML filename.

    Recognized shapes (tried in order):
      "(XX) Organization Name_ About/People _ LinkedIn.html"
      "Organization Name_ About/People _ LinkedIn.html"
    Falls back to returning the filename unchanged when neither matches.
    """
    candidate_patterns = (
        # With the "(NN) " tab-count prefix browsers add to saved titles.
        r'\(\d+\)\s*(.+?)_\s*(About|People)\s*_\s*LinkedIn',
        # Without the numeric prefix.
        r'(.+?)_\s*(About|People)\s*_\s*LinkedIn',
    )
    for pattern in candidate_patterns:
        match = re.match(pattern, filename)
        if match:
            return match.group(1).strip()
    return filename
def extract_company_location(html_content: str) -> Optional[Dict]:
    """Extract the primary company location from LinkedIn About page HTML.

    Tries the "Primary"-labelled address inside the org-locations module
    first (the actual headquarters), then falls back to the first address
    under a "Locations (N)" heading. Returns a parsed address dict, or
    None when no address paragraph is found.
    """
    address_patterns = (
        # Pattern 1: org-locations-module with a Primary label (the HQ).
        r'org-locations-module.*?Primary.*?<p[^>]*>(.*?)</p>',
        # Pattern 2: "Locations (N)" section without a Primary label.
        r'Locations\s*\(\d+\).*?<p[^>]*class="[^"]*break-words[^"]*"[^>]*>(.*?)</p>',
    )

    for pattern in address_patterns:
        match = re.search(pattern, html_content, re.DOTALL | re.IGNORECASE)
        if not match:
            continue
        # Strip tags and collapse whitespace into a single-line address.
        address_text = re.sub(r'<[^>]+>', '', match.group(1).strip())
        address_text = re.sub(r'\s+', ' ', address_text).strip()
        return parse_address_string(address_text)

    return None
def parse_address_string(address_text: str) -> Optional[Dict]:
    """Parse a LinkedIn address string like 'Street, City, Postal Code, Country'.

    Returns a dict with 'raw' and 'country', plus 'city', 'postal_code' and
    'street' when they could be identified, or None when no city was found.

    Fix vs. previous revision: the old first-pass loop's city/street
    bookkeeping was almost entirely dead code (its city assignments were
    always overwritten by the second pass); this version computes the same
    output directly. The original heuristics — including their quirks —
    are preserved so existing callers see identical results.
    """
    parts = [p.strip() for p in address_text.split(',')]

    # Need at least "<something>, <country>".
    if len(parts) < 2:
        return None

    result: Dict = {'raw': address_text}

    # Country is always the last component (2-letter code on LinkedIn).
    result['country'] = parts[-1].upper()

    middle = parts[:-1]

    # Postal code heuristic: first component containing ANY digit.
    # NOTE(review): this also captures street lines like "Main St 1";
    # kept as-is to preserve existing output.
    postal_code = next((p for p in middle if re.search(r'\d', p)), None)

    # Street/city come from the components that do not START with a digit
    # (deliberately a different test than the postal-code one above).
    text_parts = [p for p in middle if not re.search(r'^\d', p)]

    street: Optional[str] = None
    city: Optional[str] = None
    if len(text_parts) == 1:
        city = text_parts[0]
        # Historical quirk preserved: a digit-free leading component is also
        # reported as the street, even when it equals the city.
        if not re.search(r'\d', middle[0]):
            street = middle[0]
    elif len(text_parts) >= 2:
        street = text_parts[0]
        city = text_parts[1]

    # No identifiable city means the address is unusable.
    if city is None:
        return None

    result['city'] = city
    if postal_code:
        result['postal_code'] = postal_code
    if street:
        result['street'] = street

    return result
def extract_all_locations(source_dir: Path, limit: int = 0) -> Dict[str, Dict]:
    """Extract company locations from every About page under *source_dir*.

    Args:
        source_dir: Directory containing saved LinkedIn About HTML files.
        limit: If non-zero, process at most this many files (test mode).

    Returns:
        Mapping of organization name -> {'location': ..., 'source_file': ...}.
    """
    about_files = list(source_dir.glob("*About*LinkedIn.html"))
    if limit:
        about_files = about_files[:limit]

    print(f"Processing {len(about_files)} About pages...")

    results: Dict[str, Dict] = {}
    success = 0
    failed = 0

    for html_path in about_files:
        org_name = extract_org_name_from_filename(html_path.name)
        try:
            content = html_path.read_text(encoding='utf-8')
            location = extract_company_location(content)
            if not location:
                failed += 1
                continue

            # Normalize city name for the lookup tables.
            city_lower = location.get('city', '').lower().strip()

            # Province: prefer the page's own region field, else infer from city.
            # NOTE(review): 'geographicArea' is not produced by
            # extract_company_location here — presumably set by other
            # producers of location dicts; verify against callers.
            province_code = None
            if 'geographicArea' in location:
                province_code = PROVINCE_TO_CODE.get(location['geographicArea'].lower())
            if not province_code:
                province_code = CITY_TO_PROVINCE.get(city_lower)

            # City code: known mapping, else derive 3 letters from the name.
            city_code = CITY_TO_CODE.get(city_lower)
            if not city_code:
                words = city_lower.split()
                if len(words) == 1:
                    city_code = city_lower[:3].upper()
                else:
                    city_code = ''.join(w[0] for w in words[:3]).upper()

            location['province_code'] = province_code
            location['city_code'] = city_code

            results[org_name] = {
                'location': location,
                'source_file': html_path.name,
            }
            success += 1
        except Exception:
            # Best-effort scrape: one malformed file must not abort the run.
            failed += 1
            continue

    print(f"Successfully extracted: {success}")
    print(f"Failed: {failed}")

    return results
def main():
    """CLI entry point: extract locations, print stats, optionally save JSON."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--output', type=Path, default=Path('data/linkedin_locations.json'))
    parser.add_argument('--test', action='store_true', help='Test with 10 files')
    parser.add_argument('--source', type=Path, default=SOURCE_DIR)
    args = parser.parse_args()

    results = extract_all_locations(args.source, limit=10 if args.test else 0)

    # Summary statistics.
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)

    country_counts = Counter(
        data['location'].get('country', 'Unknown') for data in results.values()
    )
    city_counts = Counter(
        data['location'].get('city', 'Unknown') for data in results.values()
    )

    print("\nTop countries:")
    for country, count in country_counts.most_common(10):
        print(f" {country}: {count}")

    print("\nTop cities:")
    for city, count in city_counts.most_common(15):
        print(f" {city}: {count}")

    if args.test:
        print("\n[TEST MODE] Not saving results")
        print("\nSample extractions:")
        for org, data in list(results.items())[:5]:
            print(f" {org}:")
            print(f" {data['location']}")
    else:
        # Persist results with a UTC timestamp for provenance.
        args.output.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            'extracted_at': datetime.now(timezone.utc).isoformat(),
            'total_organizations': len(results),
            'organizations': results,
        }
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        print(f"\nSaved to: {args.output}")


if __name__ == '__main__':
    main()