#!/usr/bin/env python3
|
|
"""
|
|
Fetch surname data from Forebears.io for multiple countries.
|
|
|
|
This script fetches the top surnames from Forebears.io and creates
|
|
JSON files for use in the entity resolution system.
|
|
|
|
Usage:
|
|
python scripts/fetch_surname_data.py --country indonesia
|
|
python scripts/fetch_surname_data.py --all
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
from datetime import date
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Configuration for each country.
# Maps a CLI country key to its scrape settings:
#   url         - Forebears.io page listing the country's most common surnames
#   code        - ISO 3166-1 alpha-2 code recorded in the output metadata
#                 (NOTE(review): "england" uses "GB", the UK-wide code)
#   output_file - JSON file name written under DATA_DIR
#   population  - approximate national population, recorded as metadata only
COUNTRY_CONFIG = {
    "indonesia": {
        "url": "https://forebears.io/indonesia/surnames",
        "code": "ID",
        "output_file": "indonesian_surnames.json",
        "population": 277000000,
    },
    "germany": {
        "url": "https://forebears.io/germany/surnames",
        "code": "DE",
        "output_file": "german_surnames.json",
        "population": 84000000,
    },
    "england": {
        "url": "https://forebears.io/england/surnames",
        "code": "GB",
        "output_file": "uk_surnames.json",
        "population": 56000000,
    },
    "egypt": {
        "url": "https://forebears.io/egypt/surnames",
        "code": "EG",
        "output_file": "egyptian_surnames.json",
        "population": 104000000,
    },
    "saudi-arabia": {
        "url": "https://forebears.io/saudi-arabia/surnames",
        "code": "SA",
        "output_file": "saudi_surnames.json",
        "population": 36000000,
    },
}

# Output directory for the generated JSON files, resolved relative to this
# script's location (scripts/ -> repo root -> package data dir).
DATA_DIR = Path(__file__).parent.parent / "src/glam_extractor/entity_resolution/data"
def fetch_surnames(country: str) -> dict:
    """Fetch surname data from Forebears.io for a given country.

    Args:
        country: Key into COUNTRY_CONFIG (e.g. "indonesia").

    Returns:
        A dict with a "_metadata" section (source URL, retrieval date,
        country code, population, total unique surnames when found on the
        page) and a "surnames" mapping of lowercase surname -> incidence.

    Raises:
        ValueError: If *country* is not present in COUNTRY_CONFIG.
        requests.HTTPError: If the page request fails (via raise_for_status).
    """
    config = COUNTRY_CONFIG.get(country)
    if not config:
        raise ValueError(f"Unknown country: {country}")

    url = config["url"]
    print(f"Fetching surnames from {url}...")

    # A browser-like User-Agent avoids trivial bot blocking.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
    }

    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    surnames = {}
    total_unique = 0

    # Look for the table with surname data.
    table = soup.find("table", class_="table")
    if table:
        rows = table.find_all("tr")
        for row in rows[1:]:  # Skip header row
            cols = row.find_all("td")
            if len(cols) < 2:
                continue

            # The surname may be wrapped in a link or be plain cell text.
            name_cell = cols[0]
            name_link = name_cell.find("a")
            if name_link:
                surname = name_link.get_text(strip=True).lower()
            else:
                surname = name_cell.get_text(strip=True).lower()

            # Incidence (number of people): strip every non-digit character
            # (commas, regular/thin/non-breaking spaces) before parsing.
            # The original int(text.replace(",", "").replace(" ", "")) raised
            # ValueError on placeholder cells or exotic separators, aborting
            # the whole country; now such rows are skipped instead.
            digits = re.sub(r"\D", "", cols[1].get_text(strip=True))
            if surname and digits:
                surnames[surname] = int(digits)

    # Try to find the total number of unique surnames from the page text.
    page_text = soup.get_text()
    unique_match = re.search(r"([\d,]+)\s+unique surnames", page_text)
    if unique_match:
        total_unique = int(unique_match.group(1).replace(",", ""))

    print(f" Found {len(surnames)} surnames")
    if total_unique:
        print(f" Total unique surnames in country: {total_unique:,}")

    return {
        "_metadata": {
            "source": f"Forebears.io - {url}",
            "retrieved_date": date.today().isoformat(),
            "description": f"Top surnames from {country.title()} with incidence",
            "total_unique_surnames": total_unique,
            "country_code": config["code"],
            "population": config["population"],
        },
        "surnames": surnames,
    }
def save_surnames(data: dict, output_file: str) -> None:
    """Serialize surname data to a pretty-printed UTF-8 JSON file.

    Args:
        data: Payload (metadata section plus surnames mapping) to write.
        output_file: Bare file name, created under DATA_DIR.
    """
    destination = DATA_DIR / output_file
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    destination.write_text(serialized, encoding="utf-8")
    print(f" Saved to {destination}")
def main():
    """CLI entry point: parse arguments and fetch/save surname data.

    Modes (mutually exclusive in practice; --all wins over --country):
      --list     print the configured countries and exit
      --all      fetch every configured country
      --country  fetch a single country
    With no mode given, prints help and exits.
    """
    parser = argparse.ArgumentParser(description="Fetch surname data from Forebears.io")
    parser.add_argument(
        "--country",
        choices=list(COUNTRY_CONFIG.keys()),
        help="Country to fetch surnames for",
    )
    parser.add_argument(
        "--all", action="store_true", help="Fetch surnames for all configured countries"
    )
    parser.add_argument(
        "--list", action="store_true", help="List available countries"
    )

    args = parser.parse_args()

    if args.list:
        print("Available countries:")
        for country, config in COUNTRY_CONFIG.items():
            print(f" {country} ({config['code']}): {config['url']}")
        return

    if args.all:
        countries = list(COUNTRY_CONFIG.keys())
    elif args.country:
        countries = [args.country]
    else:
        parser.print_help()
        return

    last_index = len(countries) - 1
    for index, country in enumerate(countries):
        try:
            data = fetch_surnames(country)
            if data["surnames"]:
                config = COUNTRY_CONFIG[country]
                save_surnames(data, config["output_file"])
            else:
                print(f" WARNING: No surnames found for {country}")
        except Exception as e:
            # Best-effort: one country failing must not abort the rest.
            print(f" ERROR fetching {country}: {e}")

        # Be polite - wait between requests. The delay now runs even when a
        # fetch fails (the original `continue` inside the try skipped it,
        # firing the next request immediately after an error).
        if index != last_index:
            print(" Waiting 2 seconds before next request...")
            time.sleep(2)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()