glam/scripts/fetch_surname_data.py
kempersc fd792fce2c
Some checks failed
Deploy Frontend / build-and-deploy (push) Has been cancelled
Refactor code structure for improved readability and maintainability
2026-01-11 15:27:14 +01:00

187 lines
5.6 KiB
Python

#!/usr/bin/env python3
"""
Fetch surname data from Forebears.io for multiple countries.
This script fetches the top surnames from Forebears.io and creates
JSON files for use in the entity resolution system.
Usage:
python scripts/fetch_surname_data.py --country indonesia
python scripts/fetch_surname_data.py --all
"""
import argparse
import json
import re
import sys
import time
from datetime import date
from pathlib import Path
import requests
from bs4 import BeautifulSoup
# Per-country scrape settings, keyed by the name accepted by --country.
#   url         - Forebears.io page that lists the country's surnames
#   code        - two-letter country code copied into the output metadata
#   output_file - name of the JSON file written under DATA_DIR
#   population  - population figure copied into the output metadata
# NOTE(review): the "england" entry pairs an England URL with the "GB" code
# and a "uk_" file name - confirm which scope is intended.
COUNTRY_CONFIG = {
    "indonesia": {
        "url": "https://forebears.io/indonesia/surnames",
        "code": "ID",
        "output_file": "indonesian_surnames.json",
        "population": 277_000_000,
    },
    "germany": {
        "url": "https://forebears.io/germany/surnames",
        "code": "DE",
        "output_file": "german_surnames.json",
        "population": 84_000_000,
    },
    "england": {
        "url": "https://forebears.io/england/surnames",
        "code": "GB",
        "output_file": "uk_surnames.json",
        "population": 56_000_000,
    },
    "egypt": {
        "url": "https://forebears.io/egypt/surnames",
        "code": "EG",
        "output_file": "egyptian_surnames.json",
        "population": 104_000_000,
    },
    "saudi-arabia": {
        "url": "https://forebears.io/saudi-arabia/surnames",
        "code": "SA",
        "output_file": "saudi_surnames.json",
        "population": 36_000_000,
    },
}

# Directory the generated JSON files are written to: two levels above this
# script, then down into the package's entity-resolution data folder.
DATA_DIR = (
    Path(__file__).parent.parent
    / "src"
    / "glam_extractor"
    / "entity_resolution"
    / "data"
)
def _parse_incidence(text: str):
    """Convert an incidence cell such as ``"1,234"`` to an int.

    Commas and any whitespace (including non-breaking spaces) are treated as
    thousands separators and removed. Returns ``None`` when what remains is
    not a valid integer, so the caller can skip the row instead of failing.
    """
    cleaned = re.sub(r"[,\s]", "", text)
    try:
        return int(cleaned)
    except ValueError:
        return None


def _extract_surnames(table):
    """Read a ``{surname: incidence}`` mapping from the rows of *table*.

    Returns a ``(surnames, skipped)`` pair, where *skipped* counts the rows
    that had a surname but an incidence cell that could not be parsed.
    """
    surnames = {}
    skipped = 0
    for row in table.find_all("tr")[1:]:  # first row is the header
        cols = row.find_all("td")
        if len(cols) < 2:
            continue

        # Prefer the link text; fall back to the whole cell when no link exists.
        name_cell = cols[0]
        name_link = name_cell.find("a")
        name_source = name_link if name_link is not None else name_cell
        surname = name_source.get_text(strip=True).lower()
        if not surname:
            continue

        incidence = _parse_incidence(cols[1].get_text(strip=True))
        if incidence is None:
            # One malformed cell must not discard the rest of the table.
            skipped += 1
            continue
        surnames[surname] = incidence
    return surnames, skipped


def fetch_surnames(country: str) -> dict:
    """Fetch surname data from Forebears.io for a given country.

    Downloads the page configured for *country* in ``COUNTRY_CONFIG``, reads
    the first ``<table class="table">`` as surname/incidence rows, and looks
    for an "N unique surnames" figure anywhere in the page text.

    Args:
        country: A key of ``COUNTRY_CONFIG``.

    Returns:
        A dict with a ``"_metadata"`` section (source, retrieval date,
        description, total unique surnames, country code, population) and a
        ``"surnames"`` mapping of lower-cased surname to incidence. The
        mapping is empty when no table is found; the total is 0 when the
        page does not state it.

    Raises:
        ValueError: If *country* is not configured.
        requests.HTTPError: If the server answers with an error status.
            Other ``requests`` exceptions (timeouts, connection errors)
            propagate unchanged.
    """
    config = COUNTRY_CONFIG.get(country)
    if not config:
        raise ValueError(f"Unknown country: {country}")

    url = config["url"]
    print(f"Fetching surnames from {url}...")
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Look for the table with surname data
    table = soup.find("table", class_="table")
    if table is not None:
        surnames, skipped = _extract_surnames(table)
    else:
        surnames, skipped = {}, 0

    # Try to find total unique surnames from page text
    total_unique = 0
    unique_match = re.search(r"([\d,]+)\s+unique surnames", soup.get_text())
    if unique_match:
        total_unique = int(unique_match.group(1).replace(",", ""))

    print(f" Found {len(surnames)} surnames")
    if skipped:
        print(f" Skipped {skipped} rows with unparseable incidence")
    if total_unique:
        print(f" Total unique surnames in country: {total_unique:,}")

    return {
        "_metadata": {
            "source": f"Forebears.io - {url}",
            "retrieved_date": date.today().isoformat(),
            "description": f"Top surnames from {country.title()} with incidence",
            "total_unique_surnames": total_unique,
            "country_code": config["code"],
            "population": config["population"],
        },
        "surnames": surnames,
    }
def save_surnames(data: dict, output_file: str) -> None:
    """Save surname data to a JSON file inside ``DATA_DIR``.

    The file is written as UTF-8 with two-space indentation and without
    ASCII-escaping, so non-Latin surnames stay readable in the output.

    Args:
        data: JSON-serialisable payload to write.
        output_file: File name relative to ``DATA_DIR``.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    output_path = DATA_DIR / output_file
    # Create the data directory on first use so a fresh checkout does not
    # fail with FileNotFoundError.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f" Saved to {output_path}")
def main(argv=None) -> int:
    """Command-line entry point: parse arguments and fetch the requested data.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 when nothing failed (including ``--list`` and
        help-only runs), 1 when fetching or saving raised for at least one
        country.
    """
    parser = argparse.ArgumentParser(description="Fetch surname data from Forebears.io")
    parser.add_argument(
        "--country",
        choices=list(COUNTRY_CONFIG.keys()),
        help="Country to fetch surnames for",
    )
    parser.add_argument(
        "--all", action="store_true", help="Fetch surnames for all configured countries"
    )
    parser.add_argument(
        "--list", action="store_true", help="List available countries"
    )
    args = parser.parse_args(argv)

    if args.list:
        print("Available countries:")
        for country, config in COUNTRY_CONFIG.items():
            print(f" {country} ({config['code']}): {config['url']}")
        return 0

    if args.all:
        countries = list(COUNTRY_CONFIG.keys())
    elif args.country:
        countries = [args.country]
    else:
        parser.print_help()
        return 0

    failures = 0
    for index, country in enumerate(countries):
        if index:
            # Be polite - pause before every request after the first, even
            # when the previous country failed.
            print(" Waiting 2 seconds before next request...")
            time.sleep(2)
        try:
            data = fetch_surnames(country)
            if data["surnames"]:
                save_surnames(data, COUNTRY_CONFIG[country]["output_file"])
            else:
                print(f" WARNING: No surnames found for {country}")
        except Exception as e:
            # Top-level boundary: report the problem, remember it for the
            # exit status, and carry on with the remaining countries.
            failures += 1
            print(f" ERROR fetching {country}: {e}")

    return 1 if failures else 0


if __name__ == "__main__":
    # Propagate failures to the shell so callers can detect them.
    sys.exit(main())