#!/usr/bin/env python3
"""
Fetch surname data from Forebears.io for multiple countries.

This script fetches the top surnames from Forebears.io and creates JSON files
for use in the entity resolution system.

Usage:
    python scripts/fetch_surname_data.py --country indonesia
    python scripts/fetch_surname_data.py --all
"""

import argparse
import json
import re
import sys
import time
from datetime import date
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Configuration for each country
COUNTRY_CONFIG = {
    "indonesia": {
        "url": "https://forebears.io/indonesia/surnames",
        "code": "ID",
        "output_file": "indonesian_surnames.json",
        "population": 277000000,
    },
    "germany": {
        "url": "https://forebears.io/germany/surnames",
        "code": "DE",
        "output_file": "german_surnames.json",
        "population": 84000000,
    },
    "england": {
        "url": "https://forebears.io/england/surnames",
        "code": "GB",
        "output_file": "uk_surnames.json",
        "population": 56000000,
    },
    "egypt": {
        "url": "https://forebears.io/egypt/surnames",
        "code": "EG",
        "output_file": "egyptian_surnames.json",
        "population": 104000000,
    },
    "saudi-arabia": {
        "url": "https://forebears.io/saudi-arabia/surnames",
        "code": "SA",
        "output_file": "saudi_surnames.json",
        "population": 36000000,
    },
}

# Output directory, resolved relative to this script's parent directory.
DATA_DIR = Path(__file__).parent.parent / "src/glam_extractor/entity_resolution/data"


def _extract_surnames(soup: BeautifulSoup) -> dict:
    """Return a ``{surname: incidence}`` mapping parsed from the surname table.

    The first ``<table class="table">`` in *soup* is used; its first row is
    treated as a header and skipped. Surnames are lower-cased. Rows with fewer
    than two cells, an empty surname, or an incidence that is not an integer
    are skipped so that one malformed row does not discard the whole page.
    An empty dict is returned when no matching table exists.
    """
    surnames = {}

    table = soup.find("table", class_="table")
    if not table:
        return surnames

    for row in table.find_all("tr")[1:]:  # Skip header
        cols = row.find_all("td")
        if len(cols) < 2:
            continue

        # Prefer the link text when the name cell wraps the surname in <a>.
        name_cell = cols[0]
        name_source = name_cell.find("a") or name_cell
        surname = name_source.get_text(strip=True).lower()
        if not surname:
            continue

        # Incidence is the number of people; drop thousands separators
        # (commas and any whitespace, including non-breaking spaces).
        incidence_text = cols[1].get_text(strip=True)
        digits = "".join(incidence_text.split()).replace(",", "")
        try:
            incidence = int(digits)
        except ValueError:
            print(
                f"  WARNING: Skipping {surname!r}: "
                f"unparsable incidence {incidence_text!r}"
            )
            continue

        # A surname that appears twice keeps the value of its last row.
        surnames[surname] = incidence

    return surnames


def fetch_surnames(country: str) -> dict:
    """Fetch surname data from Forebears.io for a given country.

    Args:
        country: A key of ``COUNTRY_CONFIG`` (e.g. ``"indonesia"``).

    Returns:
        A dict with two keys: ``"_metadata"`` (source URL, retrieval date,
        description, total unique surname count or 0 if not found on the
        page, country code and population) and ``"surnames"`` (a mapping of
        lower-cased surname to incidence).

    Raises:
        ValueError: If *country* is not configured.
        requests.RequestException: If the HTTP request fails, times out, or
            returns an error status.
    """
    config = COUNTRY_CONFIG.get(country)
    if not config:
        raise ValueError(f"Unknown country: {country}")

    url = config["url"]
    print(f"Fetching surnames from {url}...")

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
    }

    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Find the surname table
    surnames = _extract_surnames(soup)

    # Try to find total unique surnames from page text
    total_unique = 0
    unique_match = re.search(r"([\d,]+)\s+unique surnames", soup.get_text())
    if unique_match:
        # The pattern also matches a run of bare commas; treat that as
        # "not found" instead of failing the whole fetch.
        try:
            total_unique = int(unique_match.group(1).replace(",", ""))
        except ValueError:
            total_unique = 0

    print(f"  Found {len(surnames)} surnames")
    if total_unique:
        print(f"  Total unique surnames in country: {total_unique:,}")

    return {
        "_metadata": {
            "source": f"Forebears.io - {url}",
            "retrieved_date": date.today().isoformat(),
            "description": f"Top surnames from {country.title()} with incidence",
            "total_unique_surnames": total_unique,
            "country_code": config["code"],
            "population": config["population"],
        },
        "surnames": surnames,
    }


def save_surnames(data: dict, output_file: str) -> None:
    """Save surname data to a JSON file inside ``DATA_DIR``.

    Args:
        data: JSON-serialisable surname data, as returned by
            ``fetch_surnames``.
        output_file: File name (relative to ``DATA_DIR``) to write.

    The target directory is created if it does not exist yet. The file is
    written as UTF-8 with non-ASCII characters preserved.
    """
    output_path = DATA_DIR / output_file
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"  Saved to {output_path}")


def main():
    """Parse command-line arguments and fetch/save the requested countries.

    ``--list`` prints the configured countries and exits; ``--all`` takes
    precedence over ``--country``; with no selection the help text is shown.
    A failure for one country is reported and does not stop the others.
    """
    parser = argparse.ArgumentParser(description="Fetch surname data from Forebears.io")
    parser.add_argument(
        "--country",
        choices=list(COUNTRY_CONFIG.keys()),
        help="Country to fetch surnames for",
    )
    parser.add_argument(
        "--all", action="store_true", help="Fetch surnames for all configured countries"
    )
    parser.add_argument(
        "--list", action="store_true", help="List available countries"
    )

    args = parser.parse_args()

    if args.list:
        print("Available countries:")
        for country, config in COUNTRY_CONFIG.items():
            print(f"  {country} ({config['code']}): {config['url']}")
        return

    if args.all:
        countries = list(COUNTRY_CONFIG)
    elif args.country:
        countries = [args.country]
    else:
        parser.print_help()
        return

    last_index = len(countries) - 1
    for index, country in enumerate(countries):
        # Top-level boundary: report any failure and move on to the next
        # country rather than aborting the whole run.
        try:
            data = fetch_surnames(country)

            if data["surnames"]:
                save_surnames(data, COUNTRY_CONFIG[country]["output_file"])
            else:
                print(f"  WARNING: No surnames found for {country}")
        except Exception as e:
            print(f"  ERROR fetching {country}: {e}")

        # Be polite - wait between requests. This sits outside the try block
        # so the delay is also honoured after a failed request.
        if index != last_index:
            print("  Waiting 2 seconds before next request...")
            time.sleep(2)


if __name__ == "__main__":
    main()