# glam/scripts/extract_linkedin_locations.py
# Snapshot metadata: 2026-01-09 20:35:19 +01:00 — 364 lines, 11 KiB, Python

#!/usr/bin/env python3
"""
Extract company locations from LinkedIn About pages.
These HTML files contain the actual headquarters/primary location of each company,
which can be used to resolve PENDING files to proper GHCIDs.
Usage:
python scripts/extract_linkedin_locations.py --output data/linkedin_locations.json
python scripts/extract_linkedin_locations.py --test # Test with 10 files
"""
import re
import json
import unicodedata
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Optional, List
from collections import Counter
# Source directory for the manually saved LinkedIn HTML files.
SOURCE_DIR = Path("/Volumes/KINGSTON/data/glam/data/custodian/person/affiliated/manual")
# Dutch province name (lowercase, Dutch or English spelling) to 2-letter code.
PROVINCE_TO_CODE = {
    'noord-holland': 'NH',
    'north holland': 'NH',
    'zuid-holland': 'ZH',
    'south holland': 'ZH',
    'utrecht': 'UT',
    'gelderland': 'GE',
    'noord-brabant': 'NB',
    'north brabant': 'NB',
    'limburg': 'LI',
    'overijssel': 'OV',
    'friesland': 'FR',
    'drenthe': 'DR',
    'groningen': 'GR',
    'zeeland': 'ZE',
    'flevoland': 'FL',
}
# Lowercase city name to province code (fallback when the HTML gives no province).
CITY_TO_PROVINCE = {
    'amsterdam': 'NH',
    'haarlem': 'NH',
    'alkmaar': 'NH',
    'hilversum': 'NH',
    'zaandam': 'NH',
    'hoorn': 'NH',
    'enkhuizen': 'NH',
    'rotterdam': 'ZH',
    'den haag': 'ZH',
    'the hague': 'ZH',
    'leiden': 'ZH',
    'delft': 'ZH',
    'dordrecht': 'ZH',
    'gouda': 'ZH',
    'schiedam': 'ZH',
    'utrecht': 'UT',
    'amersfoort': 'UT',
    'zeist': 'UT',
    'arnhem': 'GE',
    'nijmegen': 'GE',
    'apeldoorn': 'GE',
    'ede': 'GE',
    'wageningen': 'GE',
    'eindhoven': 'NB',
    'tilburg': 'NB',
    'breda': 'NB',
    "'s-hertogenbosch": 'NB',
    'den bosch': 'NB',
    'maastricht': 'LI',
    'venlo': 'LI',
    'heerlen': 'LI',
    'roermond': 'LI',
    'zwolle': 'OV',
    'deventer': 'OV',
    'enschede': 'OV',
    'leeuwarden': 'FR',
    'assen': 'DR',
    'groningen': 'GR',
    'middelburg': 'ZE',
    'almere': 'FL',
    'lelystad': 'FL',
}
# Lowercase city name to 3-letter city code; aliases ('den haag' / 'the hague' /
# "'s-gravenhage") map to the same code. Unknown cities get a generated code
# in extract_all_locations.
CITY_TO_CODE = {
    'amsterdam': 'AMS',
    'rotterdam': 'ROT',
    'den haag': 'DHA',
    'the hague': 'DHA',
    "'s-gravenhage": 'DHA',
    'utrecht': 'UTR',
    'eindhoven': 'EIN',
    'tilburg': 'TIL',
    'groningen': 'GRO',
    'almere': 'ALM',
    'breda': 'BRE',
    'nijmegen': 'NIJ',
    'apeldoorn': 'APE',
    'haarlem': 'HAA',
    'arnhem': 'ARN',
    'enschede': 'ENS',
    'amersfoort': 'AME',
    'zaanstad': 'ZAA',
    'zaandam': 'ZAA',
    "'s-hertogenbosch": 'DBO',
    'den bosch': 'DBO',
    'zwolle': 'ZWO',
    'leiden': 'LEI',
    'maastricht': 'MAA',
    'dordrecht': 'DOR',
    'deventer': 'DEV',
    'delft': 'DEL',
    'alkmaar': 'ALK',
    'leeuwarden': 'LEE',
    'hilversum': 'HIL',
    'assen': 'ASS',
    'middelburg': 'MID',
    'hoorn': 'HOO',
    'enkhuizen': 'ENK',
    'wageningen': 'WAG',
    'gouda': 'GOU',
    'venlo': 'VEN',
    'heerlen': 'HEE',
    'roermond': 'ROE',
    'zeist': 'ZEI',
    'ede': 'EDE',
}
def normalize_name(name: str) -> str:
    """Normalize an organization name for matching.

    Applies Unicode NFKD decomposition, lower-cases and trims the name,
    and strips any trailing ' | LinkedIn ...' suffix.
    """
    folded = unicodedata.normalize('NFKD', name).lower().strip()
    return re.sub(r'\s*\|\s*linkedin.*$', '', folded)
def extract_org_name_from_filename(filename: str) -> str:
    """Extract the organization name from a saved LinkedIn HTML filename.

    Tries the browser-numbered form first — '(XX) Org Name_ About _ LinkedIn.html' —
    then the same pattern without the '(XX) ' prefix. Falls back to returning
    the filename unchanged when neither pattern matches.
    """
    candidate_patterns = (
        r'\(\d+\)\s*(.+?)_\s*(About|People)\s*_\s*LinkedIn',
        r'(.+?)_\s*(About|People)\s*_\s*LinkedIn',
    )
    for pattern in candidate_patterns:
        matched = re.match(pattern, filename)
        if matched:
            return matched.group(1).strip()
    return filename
def extract_company_location(html_content: str) -> Optional[Dict]:
    """Extract the primary company location from a LinkedIn About page.

    Tries two markers in order: the org-locations module's 'Primary' address,
    then the first address under a 'Locations (N)' heading. The matched <p>
    content is stripped of tags, whitespace-collapsed, and handed to
    parse_address_string. Returns None when neither marker is found.
    """
    address_patterns = (
        # Headquarters: org-locations module with an explicit 'Primary' label.
        r'org-locations-module.*?Primary.*?<p[^>]*>(.*?)</p>',
        # Fallback: first address inside a 'Locations (N)' section.
        r'Locations\s*\(\d+\).*?<p[^>]*class="[^"]*break-words[^"]*"[^>]*>(.*?)</p>',
    )
    for pattern in address_patterns:
        found = re.search(pattern, html_content, re.DOTALL | re.IGNORECASE)
        if not found:
            continue
        text = re.sub(r'<[^>]+>', '', found.group(1).strip())
        text = re.sub(r'\s+', ' ', text).strip()
        return parse_address_string(text)
    return None
def parse_address_string(address_text: str) -> Optional[Dict]:
    """Parse a LinkedIn address string like 'Street, City, Postal Code, Country'.

    The comma-separated parts are classified as follows:
      * last part        -> 'country' (upper-cased; usually a 2-letter code)
      * parts that start with a digit -> postal code (first one wins;
        Dutch codes look like '1015 AA')
      * remaining parts  -> street then city when there are two or more,
        city alone when there is exactly one

    Returns a dict with 'raw', 'country' and, when identified, 'city',
    'postal_code' and 'street'. Returns None when no city can be found
    or the string has fewer than two comma-separated parts.

    Note: earlier revisions classified any part *containing* a digit as a
    postal code, which mislabeled street addresses like 'Keizersgracht 1';
    the leading-digit test fixes that and matches the street/city split.
    """
    parts = [p.strip() for p in address_text.split(',')]
    if len(parts) < 2:
        return None
    result = {'raw': address_text}
    # Country is always the last component.
    result['country'] = parts[-1].upper()
    # Split the remaining parts into one postal code and the textual parts.
    # Postal codes start with a digit; street names may contain digits
    # ('Keizersgracht 1') but do not start with one.
    postal_code = None
    text_parts = []
    for part in parts[:-1]:
        if re.match(r'\d', part):
            if postal_code is None:
                postal_code = part
        else:
            text_parts.append(part)
    city = None
    street = None
    if len(text_parts) == 1:
        # A single text part is the city.
        city = text_parts[0]
    elif len(text_parts) >= 2:
        # With two or more, LinkedIn lists street before city.
        street = text_parts[0]
        city = text_parts[1]
    if city:
        result['city'] = city
    if postal_code:
        result['postal_code'] = postal_code
    if street:
        result['street'] = street
    # A result without a city is useless downstream (city drives all lookups).
    return result if city else None
def extract_all_locations(source_dir: Path, limit: int = 0) -> Dict[str, Dict]:
    """Extract a location for every '*About*LinkedIn.html' page in source_dir.

    Args:
        source_dir: Directory holding the saved LinkedIn About pages.
        limit: If non-zero, only the first `limit` globbed files are processed
            (used by --test mode).

    Returns:
        Mapping of organization name to
        {'location': <parsed location dict>, 'source_file': <filename>}.
        The location dict is augmented with 'province_code' and 'city_code'.
    """
    results: Dict[str, Dict] = {}
    about_files = list(source_dir.glob("*About*LinkedIn.html"))
    if limit:
        about_files = about_files[:limit]
    print(f"Processing {len(about_files)} About pages...")
    success = 0
    failed = 0
    for filepath in about_files:
        org_name = extract_org_name_from_filename(filepath.name)
        try:
            # Only the read + parse can reasonably raise; keep the try narrow.
            content = filepath.read_text(encoding='utf-8')
            location = extract_company_location(content)
        except Exception:
            # Best-effort batch job: count and skip unreadable/undecodable files.
            failed += 1
            continue
        if not location:
            failed += 1
            continue
        # Normalized city name drives all table lookups below.
        city_lower = location.get('city', '').lower().strip()
        # Province: prefer an explicit geographic area when present, else map
        # from the city. NOTE(review): parse_address_string never sets
        # 'geographicArea', so the first branch is currently dead; kept for
        # forward compatibility with richer extraction.
        province_code = None
        if 'geographicArea' in location:
            province_code = PROVINCE_TO_CODE.get(location['geographicArea'].lower())
        if not province_code:
            province_code = CITY_TO_PROVINCE.get(city_lower)
        # City code: table lookup first, otherwise generate one — the first
        # three letters for single-word cities, initials for multi-word ones.
        city_code = CITY_TO_CODE.get(city_lower)
        if not city_code:
            words = city_lower.split()
            if len(words) == 1:
                city_code = city_lower[:3].upper()
            else:
                city_code = ''.join(w[0] for w in words[:3]).upper()
        location['province_code'] = province_code
        location['city_code'] = city_code
        results[org_name] = {
            'location': location,
            'source_file': filepath.name,
        }
        success += 1
    print(f"Successfully extracted: {success}")
    print(f"Failed: {failed}")
    return results
def main():
    """CLI entry point: extract locations, print stats, optionally save JSON."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--output', type=Path, default=Path('data/linkedin_locations.json'))
    parser.add_argument('--test', action='store_true', help='Test with 10 files')
    parser.add_argument('--source', type=Path, default=SOURCE_DIR)
    args = parser.parse_args()

    results = extract_all_locations(args.source, limit=10 if args.test else 0)

    # Summary statistics over the extracted locations.
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)
    country_counts = Counter(
        data['location'].get('country', 'Unknown') for data in results.values()
    )
    city_counts = Counter(
        data['location'].get('city', 'Unknown') for data in results.values()
    )
    print("\nTop countries:")
    for country, count in country_counts.most_common(10):
        print(f" {country}: {count}")
    print("\nTop cities:")
    for city, count in city_counts.most_common(15):
        print(f" {city}: {count}")

    if args.test:
        # Test mode: show a sample instead of writing output.
        print("\n[TEST MODE] Not saving results")
        print("\nSample extractions:")
        for org, data in list(results.items())[:5]:
            print(f" {org}:")
            print(f" {data['location']}")
        return

    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, 'w', encoding='utf-8') as f:
        json.dump({
            'extracted_at': datetime.now(timezone.utc).isoformat(),
            'total_organizations': len(results),
            'organizations': results
        }, f, indent=2, ensure_ascii=False)
    print(f"\nSaved to: {args.output}")
# Run the CLI only when executed as a script, not on import.
if __name__ == '__main__':
    main()