#!/usr/bin/env python3
|
|
"""
|
|
Fetch surname data from Forebears.io for multiple countries.
|
|
|
|
This script fetches the top surnames from Forebears.io and creates
|
|
JSON files for use in the entity resolution system.
|
|
|
|
Usage:
|
|
python scripts/fetch_surname_data.py --country indonesia
|
|
python scripts/fetch_surname_data.py --all
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
from datetime import date
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Configuration for each country.
# Maps a CLI country key to its scrape settings:
#   url         - Forebears.io page listing the country's most common surnames
#   code        - ISO 3166-1 alpha-2 code recorded in the output metadata
#                 (NOTE(review): "england" uses "GB", the UK-wide code)
#   output_file - JSON file name written under DATA_DIR
#   population  - approximate national population, recorded as metadata only
COUNTRY_CONFIG = {
    "indonesia": {
        "url": "https://forebears.io/indonesia/surnames",
        "code": "ID",
        "output_file": "indonesian_surnames.json",
        "population": 277000000,
    },
    "germany": {
        "url": "https://forebears.io/germany/surnames",
        "code": "DE",
        "output_file": "german_surnames.json",
        "population": 84000000,
    },
    "england": {
        "url": "https://forebears.io/england/surnames",
        "code": "GB",
        "output_file": "uk_surnames.json",
        "population": 56000000,
    },
    "egypt": {
        "url": "https://forebears.io/egypt/surnames",
        "code": "EG",
        "output_file": "egyptian_surnames.json",
        "population": 104000000,
    },
    "saudi-arabia": {
        "url": "https://forebears.io/saudi-arabia/surnames",
        "code": "SA",
        "output_file": "saudi_surnames.json",
        "population": 36000000,
    },
}

# Output directory for the generated JSON files, resolved relative to this
# script's location (scripts/ -> repo root -> package data dir).
DATA_DIR = Path(__file__).parent.parent / "src/glam_extractor/entity_resolution/data"
def fetch_surnames(country: str) -> dict:
    """Fetch surname data from Forebears.io for a given country.

    Args:
        country: Key into COUNTRY_CONFIG (e.g. "indonesia").

    Returns:
        A dict with a "_metadata" section (source URL, retrieval date,
        country code, population, total unique surnames when found on the
        page) and a "surnames" mapping of lowercase surname -> incidence.

    Raises:
        ValueError: If *country* is not present in COUNTRY_CONFIG.
        requests.HTTPError: If the page request fails (via raise_for_status).
    """
    config = COUNTRY_CONFIG.get(country)
    if not config:
        raise ValueError(f"Unknown country: {country}")

    url = config["url"]
    print(f"Fetching surnames from {url}...")

    # A browser-like User-Agent avoids trivial bot blocking.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
    }

    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    surnames = {}
    total_unique = 0

    # Look for the table with surname data.
    table = soup.find("table", class_="table")
    if table:
        rows = table.find_all("tr")
        for row in rows[1:]:  # Skip header row
            cols = row.find_all("td")
            if len(cols) < 2:
                continue

            # The surname may be wrapped in a link or be plain cell text.
            name_cell = cols[0]
            name_link = name_cell.find("a")
            if name_link:
                surname = name_link.get_text(strip=True).lower()
            else:
                surname = name_cell.get_text(strip=True).lower()

            # Incidence (number of people): strip every non-digit character
            # (commas, regular/thin/non-breaking spaces) before parsing.
            # The original int(text.replace(",", "").replace(" ", "")) raised
            # ValueError on placeholder cells or exotic separators, aborting
            # the whole country; now such rows are skipped instead.
            digits = re.sub(r"\D", "", cols[1].get_text(strip=True))
            if surname and digits:
                surnames[surname] = int(digits)

    # Try to find the total number of unique surnames from the page text.
    page_text = soup.get_text()
    unique_match = re.search(r"([\d,]+)\s+unique surnames", page_text)
    if unique_match:
        total_unique = int(unique_match.group(1).replace(",", ""))

    print(f" Found {len(surnames)} surnames")
    if total_unique:
        print(f" Total unique surnames in country: {total_unique:,}")

    return {
        "_metadata": {
            "source": f"Forebears.io - {url}",
            "retrieved_date": date.today().isoformat(),
            "description": f"Top surnames from {country.title()} with incidence",
            "total_unique_surnames": total_unique,
            "country_code": config["code"],
            "population": config["population"],
        },
        "surnames": surnames,
    }
def save_surnames(data: dict, output_file: str) -> None:
    """Serialize surname data to a pretty-printed UTF-8 JSON file.

    Args:
        data: Payload (metadata section plus surnames mapping) to write.
        output_file: Bare file name, created under DATA_DIR.
    """
    destination = DATA_DIR / output_file
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    destination.write_text(serialized, encoding="utf-8")
    print(f" Saved to {destination}")
def main():
    """CLI entry point: parse arguments and fetch/save surname data.

    Modes (mutually exclusive in practice; --all wins over --country):
      --list     print the configured countries and exit
      --all      fetch every configured country
      --country  fetch a single country
    With no mode given, prints help and exits.
    """
    parser = argparse.ArgumentParser(description="Fetch surname data from Forebears.io")
    parser.add_argument(
        "--country",
        choices=list(COUNTRY_CONFIG.keys()),
        help="Country to fetch surnames for",
    )
    parser.add_argument(
        "--all", action="store_true", help="Fetch surnames for all configured countries"
    )
    parser.add_argument(
        "--list", action="store_true", help="List available countries"
    )

    args = parser.parse_args()

    if args.list:
        print("Available countries:")
        for country, config in COUNTRY_CONFIG.items():
            print(f" {country} ({config['code']}): {config['url']}")
        return

    if args.all:
        countries = list(COUNTRY_CONFIG.keys())
    elif args.country:
        countries = [args.country]
    else:
        parser.print_help()
        return

    last_index = len(countries) - 1
    for index, country in enumerate(countries):
        try:
            data = fetch_surnames(country)
            if data["surnames"]:
                config = COUNTRY_CONFIG[country]
                save_surnames(data, config["output_file"])
            else:
                print(f" WARNING: No surnames found for {country}")
        except Exception as e:
            # Best-effort: one country failing must not abort the rest.
            print(f" ERROR fetching {country}: {e}")

        # Be polite - wait between requests. The delay now runs even when a
        # fetch fails (the original `continue` inside the try skipped it,
        # firing the next request immediately after an error).
        if index != last_index:
            print(" Waiting 2 seconds before next request...")
            time.sleep(2)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()