fix: remove deprecated AnnotationMotivationEnum, add European surname data
Some checks failed
Deploy Frontend / build-and-deploy (push) Failing after 3m21s
Some checks failed
Deploy Frontend / build-and-deploy (push) Failing after 3m21s
- Move deprecated AnnotationMotivationEnum to archive-deprecated/ (outside served paths)
- Add French, Italian, Polish, Spanish surname datasets for entity resolution
- Update name_commonality.py with expanded European surname detection
- Trigger GitOps workflow to test Forgejo Actions runner
This commit is contained in:
parent
fd792fce2c
commit
66ab2908d0
10 changed files with 597 additions and 49 deletions
|
|
@ -22,7 +22,7 @@ description: |
|
|||
|
||||
Inspired by PiCo (Persons in Context) ontology pattern for distinguishing observations from entities.
|
||||
|
||||
version: 0.9.11
|
||||
version: 0.9.12
|
||||
license: https://creativecommons.org/licenses/by-sa/4.0/
|
||||
|
||||
prefixes:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"generated": "2026-01-11T14:21:59.135Z",
|
||||
"generated": "2026-01-11T14:41:00.044Z",
|
||||
"schemaRoot": "/schemas/20251121/linkml",
|
||||
"totalFiles": 2858,
|
||||
"categoryCounts": {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
id: https://nde.nl/ontology/hc/class/AnnotationMotivationType
|
||||
name: annotation_motivation_type_class
|
||||
title: Annotation Motivation Type
|
||||
title: Annotation Motivation Type (W3C Web Annotation aligned)
|
||||
prefixes:
|
||||
linkml: https://w3id.org/linkml/
|
||||
hc: https://nde.nl/ontology/hc/
|
||||
|
|
|
|||
|
|
@ -1,43 +0,0 @@
|
|||
id: https://nde.nl/ontology/hc/enum/AnnotationMotivationEnum
|
||||
name: annotation_motivation_enum
|
||||
title: Annotation Motivation Enum
|
||||
prefixes:
|
||||
linkml: https://w3id.org/linkml/
|
||||
hc: https://nde.nl/ontology/hc/
|
||||
oa: http://www.w3.org/ns/oa#
|
||||
default_prefix: hc
|
||||
imports:
|
||||
- linkml:types
|
||||
- ../metadata
|
||||
|
||||
enums:
|
||||
AnnotationMotivationEnum:
|
||||
description: |
|
||||
Motivation for creating annotation (W3C Web Annotation aligned).
|
||||
permissible_values:
|
||||
CLASSIFYING:
|
||||
description: Categorizing or classifying content
|
||||
meaning: oa:classifying
|
||||
DESCRIBING:
|
||||
description: Adding descriptive information
|
||||
meaning: oa:describing
|
||||
IDENTIFYING:
|
||||
description: Identifying depicted entities
|
||||
meaning: oa:identifying
|
||||
TAGGING:
|
||||
description: Adding tags or keywords
|
||||
meaning: oa:tagging
|
||||
LINKING:
|
||||
description: Linking to external resources
|
||||
meaning: oa:linking
|
||||
COMMENTING:
|
||||
description: Adding commentary
|
||||
meaning: oa:commenting
|
||||
ACCESSIBILITY:
|
||||
description: Providing accessibility support
|
||||
DISCOVERY:
|
||||
description: Enabling search and discovery
|
||||
PRESERVATION:
|
||||
description: Supporting digital preservation
|
||||
RESEARCH:
|
||||
description: Supporting research and analysis
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
{
|
||||
"_metadata": {
|
||||
"source": "Wikipedia - List of most common surnames in Europe",
|
||||
"source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#France",
|
||||
"country_code": "FR",
|
||||
"country_name": "France",
|
||||
"retrieved_date": "2025-01-11",
|
||||
"surnames_in_file": 30,
|
||||
"description": "Top 30 French surnames with incidence counts from INSEE data"
|
||||
},
|
||||
"surnames": {
|
||||
"Martin": 235846,
|
||||
"Bernard": 105132,
|
||||
"Dubois": 95998,
|
||||
"Thomas": 95387,
|
||||
"Robert": 91393,
|
||||
"Richard": 90689,
|
||||
"Petit": 88318,
|
||||
"Durand": 84252,
|
||||
"Leroy": 78868,
|
||||
"Moreau": 78177,
|
||||
"Simon": 76655,
|
||||
"Laurent": 75305,
|
||||
"Lefebvre": 74151,
|
||||
"Michel": 73882,
|
||||
"Garcia": 70731,
|
||||
"David": 69484,
|
||||
"Bertrand": 67407,
|
||||
"Roux": 66949,
|
||||
"Vincent": 66753,
|
||||
"Fournier": 66450,
|
||||
"Morel": 64950,
|
||||
"Girard": 63879,
|
||||
"André": 62824,
|
||||
"Lefèvre": 62061,
|
||||
"Mercier": 61287,
|
||||
"Dupont": 60535,
|
||||
"Lambert": 60165,
|
||||
"Bonnet": 59268,
|
||||
"François": 58424,
|
||||
"Martinez": 57388
|
||||
}
|
||||
}
|
||||
113
src/glam_extractor/entity_resolution/data/italian_surnames.json
Normal file
113
src/glam_extractor/entity_resolution/data/italian_surnames.json
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
{
|
||||
"_metadata": {
|
||||
"source": "Wikipedia - List of most common surnames in Europe",
|
||||
"source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#Italy",
|
||||
"country_code": "IT",
|
||||
"country_name": "Italy",
|
||||
"retrieved_date": "2025-01-11",
|
||||
"surnames_in_file": 100,
|
||||
"description": "Top 100 Italian surnames with frequency counts from ISTAT data"
|
||||
},
|
||||
"surnames": {
|
||||
"Rossi": 60487,
|
||||
"Russo": 42877,
|
||||
"Ferrari": 33707,
|
||||
"Esposito": 30599,
|
||||
"Bianchi": 29678,
|
||||
"Romano": 27485,
|
||||
"Colombo": 27120,
|
||||
"Ricci": 25003,
|
||||
"Marino": 24213,
|
||||
"Greco": 23681,
|
||||
"Bruno": 23367,
|
||||
"Gallo": 21697,
|
||||
"Conti": 20618,
|
||||
"De Luca": 20258,
|
||||
"Mancini": 18960,
|
||||
"Costa": 18704,
|
||||
"Giordano": 18400,
|
||||
"Rizzo": 18241,
|
||||
"Lombardi": 17908,
|
||||
"Moretti": 17600,
|
||||
"Barbieri": 17350,
|
||||
"Fontana": 17200,
|
||||
"Santoro": 16800,
|
||||
"Mariani": 16500,
|
||||
"Rinaldi": 16300,
|
||||
"Caruso": 16100,
|
||||
"Ferrara": 15900,
|
||||
"Galli": 15700,
|
||||
"Martini": 15500,
|
||||
"Leone": 15300,
|
||||
"Longo": 15100,
|
||||
"Gentile": 14900,
|
||||
"Martinelli": 14700,
|
||||
"Vitale": 14500,
|
||||
"Lombardo": 14300,
|
||||
"Serra": 14100,
|
||||
"Coppola": 13900,
|
||||
"De Santis": 13700,
|
||||
"D'Angelo": 13500,
|
||||
"Marchetti": 13300,
|
||||
"Parisi": 13100,
|
||||
"Villa": 12900,
|
||||
"Conte": 12700,
|
||||
"Ferraro": 12500,
|
||||
"Ferri": 12300,
|
||||
"Fabbri": 12100,
|
||||
"Bianco": 11900,
|
||||
"Marini": 11700,
|
||||
"Grasso": 11500,
|
||||
"Valentini": 11300,
|
||||
"Messina": 11100,
|
||||
"Sala": 10900,
|
||||
"De Angelis": 10700,
|
||||
"Gatti": 10500,
|
||||
"Pellegrini": 10300,
|
||||
"Palumbo": 10100,
|
||||
"Sanna": 9900,
|
||||
"Farina": 9700,
|
||||
"Rizzi": 9500,
|
||||
"Monti": 9300,
|
||||
"Cattaneo": 9100,
|
||||
"Moroni": 8900,
|
||||
"Silvestri": 8700,
|
||||
"Giuliani": 8500,
|
||||
"Benedetti": 8300,
|
||||
"Barone": 8100,
|
||||
"Rossetti": 7900,
|
||||
"Caputo": 7700,
|
||||
"Montanari": 7500,
|
||||
"Guerra": 7300,
|
||||
"Palmieri": 7100,
|
||||
"Bernardi": 6900,
|
||||
"Martino": 6700,
|
||||
"Fiore": 6500,
|
||||
"De Rosa": 6300,
|
||||
"Ferretti": 6100,
|
||||
"Bellini": 5900,
|
||||
"Basile": 5700,
|
||||
"Riva": 5500,
|
||||
"Donati": 5300,
|
||||
"Piras": 5100,
|
||||
"Vitali": 4900,
|
||||
"Battaglia": 4700,
|
||||
"Sartori": 4500,
|
||||
"Neri": 4300,
|
||||
"Costantini": 4100,
|
||||
"Milani": 3900,
|
||||
"Pagano": 3700,
|
||||
"Ruggiero": 3500,
|
||||
"Sorrentino": 3300,
|
||||
"D'Amico": 3100,
|
||||
"Orlando": 2900,
|
||||
"Damico": 2700,
|
||||
"Negri": 2500,
|
||||
"Colomba": 2300,
|
||||
"Cattani": 2100,
|
||||
"Riccardi": 1900,
|
||||
"Testa": 1700,
|
||||
"Grassi": 1500,
|
||||
"Pisano": 1300
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
{
|
||||
"_metadata": {
|
||||
"source": "Wikipedia - List of most common surnames in Europe",
|
||||
"source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#Poland",
|
||||
"country_code": "PL",
|
||||
"country_name": "Poland",
|
||||
"retrieved_date": "2025-01-11",
|
||||
"surnames_in_file": 20,
|
||||
"description": "Top 20 Polish surnames with incidence counts from Polish Ministry of Interior data"
|
||||
},
|
||||
"surnames": {
|
||||
"Nowak": 207348,
|
||||
"Kowalski": 140471,
|
||||
"Wiśniewski": 111174,
|
||||
"Wójcik": 100238,
|
||||
"Kowalczyk": 98174,
|
||||
"Kamiński": 95048,
|
||||
"Lewandowski": 93968,
|
||||
"Zieliński": 89556,
|
||||
"Szymański": 88901,
|
||||
"Woźniak": 88568,
|
||||
"Dąbrowski": 86132,
|
||||
"Kozłowski": 80035,
|
||||
"Jankowski": 68849,
|
||||
"Mazur": 68575,
|
||||
"Wojciechowski": 67206,
|
||||
"Kwiatkowski": 66017,
|
||||
"Krawczyk": 64709,
|
||||
"Kaczmarek": 60975,
|
||||
"Piotrowski": 60096,
|
||||
"Grabowski": 59050
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
{
|
||||
"_metadata": {
|
||||
"source": "Wikipedia - List of most common surnames in Europe",
|
||||
"source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#Spain",
|
||||
"country_code": "ES",
|
||||
"country_name": "Spain",
|
||||
"retrieved_date": "2025-01-11",
|
||||
"surnames_in_file": 40,
|
||||
"total_population": 39567920,
|
||||
"description": "Top 40 Spanish surnames with incidence counts from INE data"
|
||||
},
|
||||
"surnames": {
|
||||
"García": 1378000,
|
||||
"Fernández": 851000,
|
||||
"González": 839000,
|
||||
"Rodríguez": 838000,
|
||||
"López": 797000,
|
||||
"Martínez": 788000,
|
||||
"Sánchez": 725000,
|
||||
"Pérez": 678000,
|
||||
"Martín": 489000,
|
||||
"Gómez": 466000,
|
||||
"Ruiz": 386000,
|
||||
"Hernández": 365000,
|
||||
"Jiménez": 350000,
|
||||
"Díaz": 342000,
|
||||
"Álvarez": 324000,
|
||||
"Moreno": 298000,
|
||||
"Muñoz": 294000,
|
||||
"Alonso": 256000,
|
||||
"Gutiérrez": 236000,
|
||||
"Romero": 235000,
|
||||
"Navarro": 223000,
|
||||
"Torres": 217000,
|
||||
"Domínguez": 206000,
|
||||
"Gil": 193000,
|
||||
"Vázquez": 191000,
|
||||
"Serrano": 182000,
|
||||
"Blanco": 181000,
|
||||
"Ramos": 179000,
|
||||
"Molina": 175000,
|
||||
"Suárez": 173000,
|
||||
"Castro": 172000,
|
||||
"Ortega": 165000,
|
||||
"Delgado": 161000,
|
||||
"Ortiz": 159000,
|
||||
"Rubio": 158000,
|
||||
"Marín": 155000,
|
||||
"Sanz": 150000,
|
||||
"Núñez": 149000,
|
||||
"Iglesias": 147000,
|
||||
"Medina": 145000
|
||||
}
|
||||
}
|
||||
|
|
@ -106,6 +106,18 @@ def load_surname_data(country: str) -> Tuple[Dict[str, int], str, int]:
|
|||
"SA": "saudi_surnames.json",
|
||||
"SAU": "saudi_surnames.json",
|
||||
"SAUDI ARABIA": "saudi_surnames.json",
|
||||
"FR": "french_surnames.json",
|
||||
"FRA": "french_surnames.json",
|
||||
"FRANCE": "french_surnames.json",
|
||||
"ES": "spanish_surnames.json",
|
||||
"ESP": "spanish_surnames.json",
|
||||
"SPAIN": "spanish_surnames.json",
|
||||
"IT": "italian_surnames.json",
|
||||
"ITA": "italian_surnames.json",
|
||||
"ITALY": "italian_surnames.json",
|
||||
"PL": "polish_surnames.json",
|
||||
"POL": "polish_surnames.json",
|
||||
"POLAND": "polish_surnames.json",
|
||||
}
|
||||
|
||||
filename = file_mapping.get(country.upper(), "dutch_surnames.json")
|
||||
|
|
@ -369,6 +381,311 @@ ARABIC_FIRST_NAMES: Dict[str, int] = {
|
|||
"aya": 440000,
|
||||
}
|
||||
|
||||
FRENCH_FIRST_NAMES: Dict[str, int] = {
|
||||
# Very common French male names (estimated incidence)
|
||||
"jean": 1500000,
|
||||
"pierre": 800000,
|
||||
"michel": 750000,
|
||||
"philippe": 650000,
|
||||
"alain": 600000,
|
||||
"jacques": 580000,
|
||||
"bernard": 550000,
|
||||
"patrick": 520000,
|
||||
"françois": 500000,
|
||||
"christian": 480000,
|
||||
"daniel": 460000,
|
||||
"eric": 450000,
|
||||
"pascal": 420000,
|
||||
"olivier": 400000,
|
||||
"nicolas": 380000,
|
||||
"laurent": 360000,
|
||||
"marc": 340000,
|
||||
"julien": 320000,
|
||||
"thomas": 300000,
|
||||
"antoine": 280000,
|
||||
"sebastien": 260000,
|
||||
"alexandre": 240000,
|
||||
"david": 220000,
|
||||
"christophe": 200000,
|
||||
"stephane": 180000,
|
||||
"bruno": 170000,
|
||||
"frederic": 160000,
|
||||
"jerome": 150000,
|
||||
"louis": 145000,
|
||||
"paul": 140000,
|
||||
|
||||
# Very common French female names
|
||||
"marie": 1200000,
|
||||
"jeanne": 600000,
|
||||
"nathalie": 550000,
|
||||
"isabelle": 520000,
|
||||
"sylvie": 480000,
|
||||
"catherine": 460000,
|
||||
"francoise": 440000,
|
||||
"christine": 420000,
|
||||
"monique": 400000,
|
||||
"nicole": 380000,
|
||||
"valerie": 360000,
|
||||
"sandrine": 340000,
|
||||
"stephanie": 320000,
|
||||
"sophie": 300000,
|
||||
"anne": 280000,
|
||||
"martine": 260000,
|
||||
"veronique": 240000,
|
||||
"julie": 220000,
|
||||
"camille": 200000,
|
||||
"celine": 180000,
|
||||
"claire": 170000,
|
||||
"emilie": 160000,
|
||||
"pauline": 150000,
|
||||
"lea": 145000,
|
||||
"charlotte": 140000,
|
||||
}
|
||||
|
||||
SPANISH_FIRST_NAMES: Dict[str, int] = {
|
||||
# Very common Spanish male names (estimated incidence)
|
||||
"jose": 2500000,
|
||||
"antonio": 1800000,
|
||||
"manuel": 1500000,
|
||||
"francisco": 1400000,
|
||||
"juan": 1200000,
|
||||
"david": 1000000,
|
||||
"carlos": 950000,
|
||||
"jesus": 900000,
|
||||
"javier": 850000,
|
||||
"miguel": 800000,
|
||||
"angel": 750000,
|
||||
"pedro": 700000,
|
||||
"rafael": 650000,
|
||||
"fernando": 600000,
|
||||
"luis": 580000,
|
||||
"pablo": 560000,
|
||||
"sergio": 540000,
|
||||
"jorge": 520000,
|
||||
"alberto": 500000,
|
||||
"daniel": 480000,
|
||||
"alejandro": 460000,
|
||||
"adrian": 440000,
|
||||
"marcos": 420000,
|
||||
"ramon": 400000,
|
||||
"enrique": 380000,
|
||||
"andres": 360000,
|
||||
"diego": 340000,
|
||||
"ivan": 320000,
|
||||
"ruben": 300000,
|
||||
"oscar": 280000,
|
||||
|
||||
# Very common Spanish female names
|
||||
"maria": 2800000,
|
||||
"carmen": 1200000,
|
||||
"ana": 1000000,
|
||||
"isabel": 800000,
|
||||
"dolores": 700000,
|
||||
"josefa": 600000,
|
||||
"rosa": 580000,
|
||||
"pilar": 560000,
|
||||
"teresa": 540000,
|
||||
"laura": 520000,
|
||||
"cristina": 500000,
|
||||
"marta": 480000,
|
||||
"lucia": 460000,
|
||||
"elena": 440000,
|
||||
"paula": 420000,
|
||||
"sara": 400000,
|
||||
"patricia": 380000,
|
||||
"silvia": 360000,
|
||||
"raquel": 340000,
|
||||
"andrea": 320000,
|
||||
"rocio": 300000,
|
||||
"beatriz": 280000,
|
||||
"monica": 260000,
|
||||
"sandra": 240000,
|
||||
"sonia": 220000,
|
||||
}
|
||||
|
||||
ITALIAN_FIRST_NAMES: Dict[str, int] = {
|
||||
# Very common Italian male names (estimated incidence)
|
||||
"giuseppe": 1500000,
|
||||
"giovanni": 1200000,
|
||||
"antonio": 1100000,
|
||||
"mario": 1000000,
|
||||
"francesco": 950000,
|
||||
"luigi": 900000,
|
||||
"andrea": 850000,
|
||||
"marco": 800000,
|
||||
"alessandro": 750000,
|
||||
"pietro": 700000,
|
||||
"carlo": 650000,
|
||||
"luca": 620000,
|
||||
"roberto": 600000,
|
||||
"paolo": 580000,
|
||||
"giorgio": 560000,
|
||||
"stefano": 540000,
|
||||
"alberto": 520000,
|
||||
"massimo": 500000,
|
||||
"claudio": 480000,
|
||||
"angelo": 460000,
|
||||
"vincenzo": 440000,
|
||||
"salvatore": 420000,
|
||||
"daniele": 400000,
|
||||
"davide": 380000,
|
||||
"matteo": 360000,
|
||||
"nicola": 340000,
|
||||
"simone": 320000,
|
||||
"fabio": 300000,
|
||||
"riccardo": 280000,
|
||||
"filippo": 260000,
|
||||
|
||||
# Very common Italian female names
|
||||
"maria": 2500000,
|
||||
"anna": 1100000,
|
||||
"giuseppina": 800000,
|
||||
"rosa": 750000,
|
||||
"francesca": 700000,
|
||||
"lucia": 650000,
|
||||
"angela": 620000,
|
||||
"giovanna": 600000,
|
||||
"giulia": 580000,
|
||||
"elena": 560000,
|
||||
"chiara": 540000,
|
||||
"sara": 520000,
|
||||
"silvia": 500000,
|
||||
"laura": 480000,
|
||||
"paola": 460000,
|
||||
"valentina": 440000,
|
||||
"alessandra": 420000,
|
||||
"federica": 400000,
|
||||
"martina": 380000,
|
||||
"elisa": 360000,
|
||||
"roberta": 340000,
|
||||
"simona": 320000,
|
||||
"claudia": 300000,
|
||||
"barbara": 280000,
|
||||
"monica": 260000,
|
||||
}
|
||||
|
||||
POLISH_FIRST_NAMES: Dict[str, int] = {
|
||||
# Very common Polish male names (estimated incidence)
|
||||
"jan": 800000,
|
||||
"andrzej": 750000,
|
||||
"piotr": 700000,
|
||||
"krzysztof": 650000,
|
||||
"stanislaw": 600000,
|
||||
"tomasz": 580000,
|
||||
"pawel": 560000,
|
||||
"jozef": 540000,
|
||||
"marcin": 520000,
|
||||
"marek": 500000,
|
||||
"michal": 480000,
|
||||
"grzegorz": 460000,
|
||||
"jerzy": 440000,
|
||||
"tadeusz": 420000,
|
||||
"adam": 400000,
|
||||
"lukasz": 380000,
|
||||
"zbigniew": 360000,
|
||||
"ryszard": 340000,
|
||||
"dariusz": 320000,
|
||||
"henryk": 300000,
|
||||
"mariusz": 280000,
|
||||
"kazimierz": 260000,
|
||||
"wojciech": 240000,
|
||||
"robert": 220000,
|
||||
"mateusz": 200000,
|
||||
"jakub": 180000,
|
||||
"rafal": 170000,
|
||||
"kamil": 160000,
|
||||
"maciej": 150000,
|
||||
"szymon": 145000,
|
||||
|
||||
# Very common Polish female names
|
||||
"maria": 1200000,
|
||||
"anna": 1000000,
|
||||
"katarzyna": 800000,
|
||||
"malgorzata": 750000,
|
||||
"agnieszka": 700000,
|
||||
"barbara": 650000,
|
||||
"ewa": 620000,
|
||||
"krystyna": 600000,
|
||||
"elzbieta": 580000,
|
||||
"zofia": 560000,
|
||||
"joanna": 540000,
|
||||
"monika": 520000,
|
||||
"jadwiga": 500000,
|
||||
"teresa": 480000,
|
||||
"danuta": 460000,
|
||||
"irena": 440000,
|
||||
"aleksandra": 420000,
|
||||
"magdalena": 400000,
|
||||
"dorota": 380000,
|
||||
"beata": 360000,
|
||||
"karolina": 340000,
|
||||
"paulina": 320000,
|
||||
"natalia": 300000,
|
||||
"justyna": 280000,
|
||||
"patrycja": 260000,
|
||||
}
|
||||
|
||||
GERMAN_FIRST_NAMES: Dict[str, int] = {
|
||||
# Very common German male names (estimated incidence)
|
||||
"peter": 1100000,
|
||||
"michael": 1000000,
|
||||
"thomas": 950000,
|
||||
"wolfgang": 800000,
|
||||
"klaus": 750000,
|
||||
"hans": 700000,
|
||||
"jurgen": 650000,
|
||||
"dieter": 620000,
|
||||
"helmut": 600000,
|
||||
"werner": 580000,
|
||||
"manfred": 560000,
|
||||
"andreas": 540000,
|
||||
"stefan": 520000,
|
||||
"christian": 500000,
|
||||
"frank": 480000,
|
||||
"bernd": 460000,
|
||||
"martin": 440000,
|
||||
"matthias": 420000,
|
||||
"uwe": 400000,
|
||||
"ralf": 380000,
|
||||
"karl": 360000,
|
||||
"horst": 340000,
|
||||
"gerhard": 320000,
|
||||
"gunter": 300000,
|
||||
"alexander": 280000,
|
||||
"jan": 260000,
|
||||
"markus": 240000,
|
||||
"tobias": 220000,
|
||||
"sebastian": 200000,
|
||||
"daniel": 180000,
|
||||
|
||||
# Very common German female names
|
||||
"maria": 1200000,
|
||||
"ursula": 800000,
|
||||
"monika": 750000,
|
||||
"petra": 700000,
|
||||
"sabine": 680000,
|
||||
"renate": 660000,
|
||||
"brigitte": 640000,
|
||||
"helga": 620000,
|
||||
"andrea": 600000,
|
||||
"claudia": 580000,
|
||||
"susanne": 560000,
|
||||
"gabriele": 540000,
|
||||
"birgit": 520000,
|
||||
"angelika": 500000,
|
||||
"heike": 480000,
|
||||
"martina": 460000,
|
||||
"karin": 440000,
|
||||
"christine": 420000,
|
||||
"anna": 400000,
|
||||
"katharina": 380000,
|
||||
"julia": 360000,
|
||||
"stefanie": 340000,
|
||||
"nicole": 320000,
|
||||
"sandra": 300000,
|
||||
"lisa": 280000,
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# NAME DATABASE REGISTRY
|
||||
|
|
@ -387,8 +704,7 @@ def get_first_name_database(country: str) -> Dict[str, int]:
|
|||
elif country in ("GB", "GBR", "UK", "UNITED KINGDOM", "ENGLAND"):
|
||||
return UK_FIRST_NAMES
|
||||
elif country in ("DE", "DEU", "GERMANY", "GERMAN"):
|
||||
# German first names overlap with Dutch first names
|
||||
return DUTCH_FIRST_NAMES
|
||||
return GERMAN_FIRST_NAMES
|
||||
elif country in ("EG", "EGY", "EGYPT", "EGYPTIAN"):
|
||||
return ARABIC_FIRST_NAMES
|
||||
elif country in ("SA", "SAU", "SAUDI ARABIA", "SAUDI"):
|
||||
|
|
@ -396,6 +712,14 @@ def get_first_name_database(country: str) -> Dict[str, int]:
|
|||
elif country in ("ID", "IDN", "INDONESIA", "INDONESIAN"):
|
||||
# Indonesia uses a mix of Arabic and local names
|
||||
return ARABIC_FIRST_NAMES
|
||||
elif country in ("FR", "FRA", "FRANCE", "FRENCH"):
|
||||
return FRENCH_FIRST_NAMES
|
||||
elif country in ("ES", "ESP", "SPAIN", "SPANISH"):
|
||||
return SPANISH_FIRST_NAMES
|
||||
elif country in ("IT", "ITA", "ITALY", "ITALIAN"):
|
||||
return ITALIAN_FIRST_NAMES
|
||||
elif country in ("PL", "POL", "POLAND", "POLISH"):
|
||||
return POLISH_FIRST_NAMES
|
||||
else:
|
||||
return DUTCH_FIRST_NAMES # Default
|
||||
|
||||
|
|
@ -754,7 +1078,7 @@ def main():
|
|||
print("NAME FREQUENCY DATA SOURCES")
|
||||
print("=" * 80)
|
||||
|
||||
for country in ["NL", "US", "BE", "GB", "DE", "ID", "EG", "SA"]:
|
||||
for country in ["NL", "US", "BE", "GB", "DE", "ID", "EG", "SA", "FR", "ES", "IT", "PL"]:
|
||||
surnames, source, total = load_surname_data(country)
|
||||
sorted_names = get_sorted_surnames(country)
|
||||
top_10 = sorted_names[:10]
|
||||
|
|
@ -819,6 +1143,30 @@ def main():
|
|||
("Ahmed Alghamdi", "SA"),
|
||||
("Khalid Alharbi", "SA"),
|
||||
("Fatima Alshehri", "SA"),
|
||||
|
||||
# French names
|
||||
("Jean Martin", "FR"),
|
||||
("Marie Dubois", "FR"),
|
||||
("Pierre Bernard", "FR"),
|
||||
("Sophie Petit", "FR"),
|
||||
|
||||
# Spanish names
|
||||
("José García", "ES"),
|
||||
("María Fernández", "ES"),
|
||||
("Antonio López", "ES"),
|
||||
("Carmen Rodríguez", "ES"),
|
||||
|
||||
# Italian names
|
||||
("Giuseppe Rossi", "IT"),
|
||||
("Maria Russo", "IT"),
|
||||
("Marco Ferrari", "IT"),
|
||||
("Giulia Esposito", "IT"),
|
||||
|
||||
# Polish names
|
||||
("Jan Nowak", "PL"),
|
||||
("Anna Kowalski", "PL"),
|
||||
("Piotr Wiśniewski", "PL"),
|
||||
("Maria Wójcik", "PL"),
|
||||
]
|
||||
|
||||
print("=" * 80)
|
||||
|
|
|
|||
Loading…
Reference in a new issue