fix: remove deprecated AnnotationMotivationEnum, add European surname data
Some checks failed
Deploy Frontend / build-and-deploy (push) Failing after 3m21s
Some checks failed
Deploy Frontend / build-and-deploy (push) Failing after 3m21s
- Move deprecated AnnotationMotivationEnum to archive-deprecated/ (outside served paths)
- Add French, Italian, Polish, Spanish surname datasets for entity resolution
- Update name_commonality.py with expanded European surname detection
- Trigger GitOps workflow to test Forgejo Actions runner
This commit is contained in:
parent
fd792fce2c
commit
66ab2908d0
10 changed files with 597 additions and 49 deletions
|
|
@ -22,7 +22,7 @@ description: |
|
||||||
|
|
||||||
Inspired by PiCo (Persons in Context) ontology pattern for distinguishing observations from entities.
|
Inspired by PiCo (Persons in Context) ontology pattern for distinguishing observations from entities.
|
||||||
|
|
||||||
version: 0.9.11
|
version: 0.9.12
|
||||||
license: https://creativecommons.org/licenses/by-sa/4.0/
|
license: https://creativecommons.org/licenses/by-sa/4.0/
|
||||||
|
|
||||||
prefixes:
|
prefixes:
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
{
|
{
|
||||||
"generated": "2026-01-11T14:21:59.135Z",
|
"generated": "2026-01-11T14:41:00.044Z",
|
||||||
"schemaRoot": "/schemas/20251121/linkml",
|
"schemaRoot": "/schemas/20251121/linkml",
|
||||||
"totalFiles": 2858,
|
"totalFiles": 2858,
|
||||||
"categoryCounts": {
|
"categoryCounts": {
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
id: https://nde.nl/ontology/hc/class/AnnotationMotivationType
|
id: https://nde.nl/ontology/hc/class/AnnotationMotivationType
|
||||||
name: annotation_motivation_type_class
|
name: annotation_motivation_type_class
|
||||||
title: Annotation Motivation Type
|
title: Annotation Motivation Type (W3C Web Annotation aligned)
|
||||||
prefixes:
|
prefixes:
|
||||||
linkml: https://w3id.org/linkml/
|
linkml: https://w3id.org/linkml/
|
||||||
hc: https://nde.nl/ontology/hc/
|
hc: https://nde.nl/ontology/hc/
|
||||||
|
|
|
||||||
|
|
@ -1,43 +0,0 @@
|
||||||
id: https://nde.nl/ontology/hc/enum/AnnotationMotivationEnum
|
|
||||||
name: annotation_motivation_enum
|
|
||||||
title: Annotation Motivation Enum
|
|
||||||
prefixes:
|
|
||||||
linkml: https://w3id.org/linkml/
|
|
||||||
hc: https://nde.nl/ontology/hc/
|
|
||||||
oa: http://www.w3.org/ns/oa#
|
|
||||||
default_prefix: hc
|
|
||||||
imports:
|
|
||||||
- linkml:types
|
|
||||||
- ../metadata
|
|
||||||
|
|
||||||
enums:
|
|
||||||
AnnotationMotivationEnum:
|
|
||||||
description: |
|
|
||||||
Motivation for creating annotation (W3C Web Annotation aligned).
|
|
||||||
permissible_values:
|
|
||||||
CLASSIFYING:
|
|
||||||
description: Categorizing or classifying content
|
|
||||||
meaning: oa:classifying
|
|
||||||
DESCRIBING:
|
|
||||||
description: Adding descriptive information
|
|
||||||
meaning: oa:describing
|
|
||||||
IDENTIFYING:
|
|
||||||
description: Identifying depicted entities
|
|
||||||
meaning: oa:identifying
|
|
||||||
TAGGING:
|
|
||||||
description: Adding tags or keywords
|
|
||||||
meaning: oa:tagging
|
|
||||||
LINKING:
|
|
||||||
description: Linking to external resources
|
|
||||||
meaning: oa:linking
|
|
||||||
COMMENTING:
|
|
||||||
description: Adding commentary
|
|
||||||
meaning: oa:commenting
|
|
||||||
ACCESSIBILITY:
|
|
||||||
description: Providing accessibility support
|
|
||||||
DISCOVERY:
|
|
||||||
description: Enabling search and discovery
|
|
||||||
PRESERVATION:
|
|
||||||
description: Supporting digital preservation
|
|
||||||
RESEARCH:
|
|
||||||
description: Supporting research and analysis
|
|
||||||
|
|
@ -0,0 +1,43 @@
|
||||||
|
{
|
||||||
|
"_metadata": {
|
||||||
|
"source": "Wikipedia - List of most common surnames in Europe",
|
||||||
|
"source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#France",
|
||||||
|
"country_code": "FR",
|
||||||
|
"country_name": "France",
|
||||||
|
"retrieved_date": "2025-01-11",
|
||||||
|
"surnames_in_file": 30,
|
||||||
|
"description": "Top 30 French surnames with incidence counts from INSEE data"
|
||||||
|
},
|
||||||
|
"surnames": {
|
||||||
|
"Martin": 235846,
|
||||||
|
"Bernard": 105132,
|
||||||
|
"Dubois": 95998,
|
||||||
|
"Thomas": 95387,
|
||||||
|
"Robert": 91393,
|
||||||
|
"Richard": 90689,
|
||||||
|
"Petit": 88318,
|
||||||
|
"Durand": 84252,
|
||||||
|
"Leroy": 78868,
|
||||||
|
"Moreau": 78177,
|
||||||
|
"Simon": 76655,
|
||||||
|
"Laurent": 75305,
|
||||||
|
"Lefebvre": 74151,
|
||||||
|
"Michel": 73882,
|
||||||
|
"Garcia": 70731,
|
||||||
|
"David": 69484,
|
||||||
|
"Bertrand": 67407,
|
||||||
|
"Roux": 66949,
|
||||||
|
"Vincent": 66753,
|
||||||
|
"Fournier": 66450,
|
||||||
|
"Morel": 64950,
|
||||||
|
"Girard": 63879,
|
||||||
|
"André": 62824,
|
||||||
|
"Lefèvre": 62061,
|
||||||
|
"Mercier": 61287,
|
||||||
|
"Dupont": 60535,
|
||||||
|
"Lambert": 60165,
|
||||||
|
"Bonnet": 59268,
|
||||||
|
"François": 58424,
|
||||||
|
"Martinez": 57388
|
||||||
|
}
|
||||||
|
}
|
||||||
113
src/glam_extractor/entity_resolution/data/italian_surnames.json
Normal file
113
src/glam_extractor/entity_resolution/data/italian_surnames.json
Normal file
|
|
@ -0,0 +1,113 @@
|
||||||
|
{
|
||||||
|
"_metadata": {
|
||||||
|
"source": "Wikipedia - List of most common surnames in Europe",
|
||||||
|
"source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#Italy",
|
||||||
|
"country_code": "IT",
|
||||||
|
"country_name": "Italy",
|
||||||
|
"retrieved_date": "2025-01-11",
|
||||||
|
"surnames_in_file": 100,
|
||||||
|
"description": "Top 100 Italian surnames with frequency counts from ISTAT data"
|
||||||
|
},
|
||||||
|
"surnames": {
|
||||||
|
"Rossi": 60487,
|
||||||
|
"Russo": 42877,
|
||||||
|
"Ferrari": 33707,
|
||||||
|
"Esposito": 30599,
|
||||||
|
"Bianchi": 29678,
|
||||||
|
"Romano": 27485,
|
||||||
|
"Colombo": 27120,
|
||||||
|
"Ricci": 25003,
|
||||||
|
"Marino": 24213,
|
||||||
|
"Greco": 23681,
|
||||||
|
"Bruno": 23367,
|
||||||
|
"Gallo": 21697,
|
||||||
|
"Conti": 20618,
|
||||||
|
"De Luca": 20258,
|
||||||
|
"Mancini": 18960,
|
||||||
|
"Costa": 18704,
|
||||||
|
"Giordano": 18400,
|
||||||
|
"Rizzo": 18241,
|
||||||
|
"Lombardi": 17908,
|
||||||
|
"Moretti": 17600,
|
||||||
|
"Barbieri": 17350,
|
||||||
|
"Fontana": 17200,
|
||||||
|
"Santoro": 16800,
|
||||||
|
"Mariani": 16500,
|
||||||
|
"Rinaldi": 16300,
|
||||||
|
"Caruso": 16100,
|
||||||
|
"Ferrara": 15900,
|
||||||
|
"Galli": 15700,
|
||||||
|
"Martini": 15500,
|
||||||
|
"Leone": 15300,
|
||||||
|
"Longo": 15100,
|
||||||
|
"Gentile": 14900,
|
||||||
|
"Martinelli": 14700,
|
||||||
|
"Vitale": 14500,
|
||||||
|
"Lombardo": 14300,
|
||||||
|
"Serra": 14100,
|
||||||
|
"Coppola": 13900,
|
||||||
|
"De Santis": 13700,
|
||||||
|
"D'Angelo": 13500,
|
||||||
|
"Marchetti": 13300,
|
||||||
|
"Parisi": 13100,
|
||||||
|
"Villa": 12900,
|
||||||
|
"Conte": 12700,
|
||||||
|
"Ferraro": 12500,
|
||||||
|
"Ferri": 12300,
|
||||||
|
"Fabbri": 12100,
|
||||||
|
"Bianco": 11900,
|
||||||
|
"Marini": 11700,
|
||||||
|
"Grasso": 11500,
|
||||||
|
"Valentini": 11300,
|
||||||
|
"Messina": 11100,
|
||||||
|
"Sala": 10900,
|
||||||
|
"De Angelis": 10700,
|
||||||
|
"Gatti": 10500,
|
||||||
|
"Pellegrini": 10300,
|
||||||
|
"Palumbo": 10100,
|
||||||
|
"Sanna": 9900,
|
||||||
|
"Farina": 9700,
|
||||||
|
"Rizzi": 9500,
|
||||||
|
"Monti": 9300,
|
||||||
|
"Cattaneo": 9100,
|
||||||
|
"Moroni": 8900,
|
||||||
|
"Silvestri": 8700,
|
||||||
|
"Giuliani": 8500,
|
||||||
|
"Benedetti": 8300,
|
||||||
|
"Barone": 8100,
|
||||||
|
"Rossetti": 7900,
|
||||||
|
"Caputo": 7700,
|
||||||
|
"Montanari": 7500,
|
||||||
|
"Guerra": 7300,
|
||||||
|
"Palmieri": 7100,
|
||||||
|
"Bernardi": 6900,
|
||||||
|
"Martino": 6700,
|
||||||
|
"Fiore": 6500,
|
||||||
|
"De Rosa": 6300,
|
||||||
|
"Ferretti": 6100,
|
||||||
|
"Bellini": 5900,
|
||||||
|
"Basile": 5700,
|
||||||
|
"Riva": 5500,
|
||||||
|
"Donati": 5300,
|
||||||
|
"Piras": 5100,
|
||||||
|
"Vitali": 4900,
|
||||||
|
"Battaglia": 4700,
|
||||||
|
"Sartori": 4500,
|
||||||
|
"Neri": 4300,
|
||||||
|
"Costantini": 4100,
|
||||||
|
"Milani": 3900,
|
||||||
|
"Pagano": 3700,
|
||||||
|
"Ruggiero": 3500,
|
||||||
|
"Sorrentino": 3300,
|
||||||
|
"D'Amico": 3100,
|
||||||
|
"Orlando": 2900,
|
||||||
|
"Damico": 2700,
|
||||||
|
"Negri": 2500,
|
||||||
|
"Colomba": 2300,
|
||||||
|
"Cattani": 2100,
|
||||||
|
"Riccardi": 1900,
|
||||||
|
"Testa": 1700,
|
||||||
|
"Grassi": 1500,
|
||||||
|
"Pisano": 1300
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,33 @@
|
||||||
|
{
|
||||||
|
"_metadata": {
|
||||||
|
"source": "Wikipedia - List of most common surnames in Europe",
|
||||||
|
"source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#Poland",
|
||||||
|
"country_code": "PL",
|
||||||
|
"country_name": "Poland",
|
||||||
|
"retrieved_date": "2025-01-11",
|
||||||
|
"surnames_in_file": 20,
|
||||||
|
"description": "Top 20 Polish surnames with incidence counts from Polish Ministry of Interior data"
|
||||||
|
},
|
||||||
|
"surnames": {
|
||||||
|
"Nowak": 207348,
|
||||||
|
"Kowalski": 140471,
|
||||||
|
"Wiśniewski": 111174,
|
||||||
|
"Wójcik": 100238,
|
||||||
|
"Kowalczyk": 98174,
|
||||||
|
"Kamiński": 95048,
|
||||||
|
"Lewandowski": 93968,
|
||||||
|
"Zieliński": 89556,
|
||||||
|
"Szymański": 88901,
|
||||||
|
"Woźniak": 88568,
|
||||||
|
"Dąbrowski": 86132,
|
||||||
|
"Kozłowski": 80035,
|
||||||
|
"Jankowski": 68849,
|
||||||
|
"Mazur": 68575,
|
||||||
|
"Wojciechowski": 67206,
|
||||||
|
"Kwiatkowski": 66017,
|
||||||
|
"Krawczyk": 64709,
|
||||||
|
"Kaczmarek": 60975,
|
||||||
|
"Piotrowski": 60096,
|
||||||
|
"Grabowski": 59050
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,54 @@
|
||||||
|
{
|
||||||
|
"_metadata": {
|
||||||
|
"source": "Wikipedia - List of most common surnames in Europe",
|
||||||
|
"source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#Spain",
|
||||||
|
"country_code": "ES",
|
||||||
|
"country_name": "Spain",
|
||||||
|
"retrieved_date": "2025-01-11",
|
||||||
|
"surnames_in_file": 40,
|
||||||
|
"total_population": 39567920,
|
||||||
|
"description": "Top 40 Spanish surnames with incidence counts from INE data"
|
||||||
|
},
|
||||||
|
"surnames": {
|
||||||
|
"García": 1378000,
|
||||||
|
"Fernández": 851000,
|
||||||
|
"González": 839000,
|
||||||
|
"Rodríguez": 838000,
|
||||||
|
"López": 797000,
|
||||||
|
"Martínez": 788000,
|
||||||
|
"Sánchez": 725000,
|
||||||
|
"Pérez": 678000,
|
||||||
|
"Martín": 489000,
|
||||||
|
"Gómez": 466000,
|
||||||
|
"Ruiz": 386000,
|
||||||
|
"Hernández": 365000,
|
||||||
|
"Jiménez": 350000,
|
||||||
|
"Díaz": 342000,
|
||||||
|
"Álvarez": 324000,
|
||||||
|
"Moreno": 298000,
|
||||||
|
"Muñoz": 294000,
|
||||||
|
"Alonso": 256000,
|
||||||
|
"Gutiérrez": 236000,
|
||||||
|
"Romero": 235000,
|
||||||
|
"Navarro": 223000,
|
||||||
|
"Torres": 217000,
|
||||||
|
"Domínguez": 206000,
|
||||||
|
"Gil": 193000,
|
||||||
|
"Vázquez": 191000,
|
||||||
|
"Serrano": 182000,
|
||||||
|
"Blanco": 181000,
|
||||||
|
"Ramos": 179000,
|
||||||
|
"Molina": 175000,
|
||||||
|
"Suárez": 173000,
|
||||||
|
"Castro": 172000,
|
||||||
|
"Ortega": 165000,
|
||||||
|
"Delgado": 161000,
|
||||||
|
"Ortiz": 159000,
|
||||||
|
"Rubio": 158000,
|
||||||
|
"Marín": 155000,
|
||||||
|
"Sanz": 150000,
|
||||||
|
"Núñez": 149000,
|
||||||
|
"Iglesias": 147000,
|
||||||
|
"Medina": 145000
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -106,6 +106,18 @@ def load_surname_data(country: str) -> Tuple[Dict[str, int], str, int]:
|
||||||
"SA": "saudi_surnames.json",
|
"SA": "saudi_surnames.json",
|
||||||
"SAU": "saudi_surnames.json",
|
"SAU": "saudi_surnames.json",
|
||||||
"SAUDI ARABIA": "saudi_surnames.json",
|
"SAUDI ARABIA": "saudi_surnames.json",
|
||||||
|
"FR": "french_surnames.json",
|
||||||
|
"FRA": "french_surnames.json",
|
||||||
|
"FRANCE": "french_surnames.json",
|
||||||
|
"ES": "spanish_surnames.json",
|
||||||
|
"ESP": "spanish_surnames.json",
|
||||||
|
"SPAIN": "spanish_surnames.json",
|
||||||
|
"IT": "italian_surnames.json",
|
||||||
|
"ITA": "italian_surnames.json",
|
||||||
|
"ITALY": "italian_surnames.json",
|
||||||
|
"PL": "polish_surnames.json",
|
||||||
|
"POL": "polish_surnames.json",
|
||||||
|
"POLAND": "polish_surnames.json",
|
||||||
}
|
}
|
||||||
|
|
||||||
filename = file_mapping.get(country.upper(), "dutch_surnames.json")
|
filename = file_mapping.get(country.upper(), "dutch_surnames.json")
|
||||||
|
|
@ -369,6 +381,311 @@ ARABIC_FIRST_NAMES: Dict[str, int] = {
|
||||||
"aya": 440000,
|
"aya": 440000,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FRENCH_FIRST_NAMES: Dict[str, int] = {
|
||||||
|
# Very common French male names (estimated incidence)
|
||||||
|
"jean": 1500000,
|
||||||
|
"pierre": 800000,
|
||||||
|
"michel": 750000,
|
||||||
|
"philippe": 650000,
|
||||||
|
"alain": 600000,
|
||||||
|
"jacques": 580000,
|
||||||
|
"bernard": 550000,
|
||||||
|
"patrick": 520000,
|
||||||
|
"françois": 500000,
|
||||||
|
"christian": 480000,
|
||||||
|
"daniel": 460000,
|
||||||
|
"eric": 450000,
|
||||||
|
"pascal": 420000,
|
||||||
|
"olivier": 400000,
|
||||||
|
"nicolas": 380000,
|
||||||
|
"laurent": 360000,
|
||||||
|
"marc": 340000,
|
||||||
|
"julien": 320000,
|
||||||
|
"thomas": 300000,
|
||||||
|
"antoine": 280000,
|
||||||
|
"sebastien": 260000,
|
||||||
|
"alexandre": 240000,
|
||||||
|
"david": 220000,
|
||||||
|
"christophe": 200000,
|
||||||
|
"stephane": 180000,
|
||||||
|
"bruno": 170000,
|
||||||
|
"frederic": 160000,
|
||||||
|
"jerome": 150000,
|
||||||
|
"louis": 145000,
|
||||||
|
"paul": 140000,
|
||||||
|
|
||||||
|
# Very common French female names
|
||||||
|
"marie": 1200000,
|
||||||
|
"jeanne": 600000,
|
||||||
|
"nathalie": 550000,
|
||||||
|
"isabelle": 520000,
|
||||||
|
"sylvie": 480000,
|
||||||
|
"catherine": 460000,
|
||||||
|
"francoise": 440000,
|
||||||
|
"christine": 420000,
|
||||||
|
"monique": 400000,
|
||||||
|
"nicole": 380000,
|
||||||
|
"valerie": 360000,
|
||||||
|
"sandrine": 340000,
|
||||||
|
"stephanie": 320000,
|
||||||
|
"sophie": 300000,
|
||||||
|
"anne": 280000,
|
||||||
|
"martine": 260000,
|
||||||
|
"veronique": 240000,
|
||||||
|
"julie": 220000,
|
||||||
|
"camille": 200000,
|
||||||
|
"celine": 180000,
|
||||||
|
"claire": 170000,
|
||||||
|
"emilie": 160000,
|
||||||
|
"pauline": 150000,
|
||||||
|
"lea": 145000,
|
||||||
|
"charlotte": 140000,
|
||||||
|
}
|
||||||
|
|
||||||
|
SPANISH_FIRST_NAMES: Dict[str, int] = {
|
||||||
|
# Very common Spanish male names (estimated incidence)
|
||||||
|
"jose": 2500000,
|
||||||
|
"antonio": 1800000,
|
||||||
|
"manuel": 1500000,
|
||||||
|
"francisco": 1400000,
|
||||||
|
"juan": 1200000,
|
||||||
|
"david": 1000000,
|
||||||
|
"carlos": 950000,
|
||||||
|
"jesus": 900000,
|
||||||
|
"javier": 850000,
|
||||||
|
"miguel": 800000,
|
||||||
|
"angel": 750000,
|
||||||
|
"pedro": 700000,
|
||||||
|
"rafael": 650000,
|
||||||
|
"fernando": 600000,
|
||||||
|
"luis": 580000,
|
||||||
|
"pablo": 560000,
|
||||||
|
"sergio": 540000,
|
||||||
|
"jorge": 520000,
|
||||||
|
"alberto": 500000,
|
||||||
|
"daniel": 480000,
|
||||||
|
"alejandro": 460000,
|
||||||
|
"adrian": 440000,
|
||||||
|
"marcos": 420000,
|
||||||
|
"ramon": 400000,
|
||||||
|
"enrique": 380000,
|
||||||
|
"andres": 360000,
|
||||||
|
"diego": 340000,
|
||||||
|
"ivan": 320000,
|
||||||
|
"ruben": 300000,
|
||||||
|
"oscar": 280000,
|
||||||
|
|
||||||
|
# Very common Spanish female names
|
||||||
|
"maria": 2800000,
|
||||||
|
"carmen": 1200000,
|
||||||
|
"ana": 1000000,
|
||||||
|
"isabel": 800000,
|
||||||
|
"dolores": 700000,
|
||||||
|
"josefa": 600000,
|
||||||
|
"rosa": 580000,
|
||||||
|
"pilar": 560000,
|
||||||
|
"teresa": 540000,
|
||||||
|
"laura": 520000,
|
||||||
|
"cristina": 500000,
|
||||||
|
"marta": 480000,
|
||||||
|
"lucia": 460000,
|
||||||
|
"elena": 440000,
|
||||||
|
"paula": 420000,
|
||||||
|
"sara": 400000,
|
||||||
|
"patricia": 380000,
|
||||||
|
"silvia": 360000,
|
||||||
|
"raquel": 340000,
|
||||||
|
"andrea": 320000,
|
||||||
|
"rocio": 300000,
|
||||||
|
"beatriz": 280000,
|
||||||
|
"monica": 260000,
|
||||||
|
"sandra": 240000,
|
||||||
|
"sonia": 220000,
|
||||||
|
}
|
||||||
|
|
||||||
|
ITALIAN_FIRST_NAMES: Dict[str, int] = {
|
||||||
|
# Very common Italian male names (estimated incidence)
|
||||||
|
"giuseppe": 1500000,
|
||||||
|
"giovanni": 1200000,
|
||||||
|
"antonio": 1100000,
|
||||||
|
"mario": 1000000,
|
||||||
|
"francesco": 950000,
|
||||||
|
"luigi": 900000,
|
||||||
|
"andrea": 850000,
|
||||||
|
"marco": 800000,
|
||||||
|
"alessandro": 750000,
|
||||||
|
"pietro": 700000,
|
||||||
|
"carlo": 650000,
|
||||||
|
"luca": 620000,
|
||||||
|
"roberto": 600000,
|
||||||
|
"paolo": 580000,
|
||||||
|
"giorgio": 560000,
|
||||||
|
"stefano": 540000,
|
||||||
|
"alberto": 520000,
|
||||||
|
"massimo": 500000,
|
||||||
|
"claudio": 480000,
|
||||||
|
"angelo": 460000,
|
||||||
|
"vincenzo": 440000,
|
||||||
|
"salvatore": 420000,
|
||||||
|
"daniele": 400000,
|
||||||
|
"davide": 380000,
|
||||||
|
"matteo": 360000,
|
||||||
|
"nicola": 340000,
|
||||||
|
"simone": 320000,
|
||||||
|
"fabio": 300000,
|
||||||
|
"riccardo": 280000,
|
||||||
|
"filippo": 260000,
|
||||||
|
|
||||||
|
# Very common Italian female names
|
||||||
|
"maria": 2500000,
|
||||||
|
"anna": 1100000,
|
||||||
|
"giuseppina": 800000,
|
||||||
|
"rosa": 750000,
|
||||||
|
"francesca": 700000,
|
||||||
|
"lucia": 650000,
|
||||||
|
"angela": 620000,
|
||||||
|
"giovanna": 600000,
|
||||||
|
"giulia": 580000,
|
||||||
|
"elena": 560000,
|
||||||
|
"chiara": 540000,
|
||||||
|
"sara": 520000,
|
||||||
|
"silvia": 500000,
|
||||||
|
"laura": 480000,
|
||||||
|
"paola": 460000,
|
||||||
|
"valentina": 440000,
|
||||||
|
"alessandra": 420000,
|
||||||
|
"federica": 400000,
|
||||||
|
"martina": 380000,
|
||||||
|
"elisa": 360000,
|
||||||
|
"roberta": 340000,
|
||||||
|
"simona": 320000,
|
||||||
|
"claudia": 300000,
|
||||||
|
"barbara": 280000,
|
||||||
|
"monica": 260000,
|
||||||
|
}
|
||||||
|
|
||||||
|
POLISH_FIRST_NAMES: Dict[str, int] = {
|
||||||
|
# Very common Polish male names (estimated incidence)
|
||||||
|
"jan": 800000,
|
||||||
|
"andrzej": 750000,
|
||||||
|
"piotr": 700000,
|
||||||
|
"krzysztof": 650000,
|
||||||
|
"stanislaw": 600000,
|
||||||
|
"tomasz": 580000,
|
||||||
|
"pawel": 560000,
|
||||||
|
"jozef": 540000,
|
||||||
|
"marcin": 520000,
|
||||||
|
"marek": 500000,
|
||||||
|
"michal": 480000,
|
||||||
|
"grzegorz": 460000,
|
||||||
|
"jerzy": 440000,
|
||||||
|
"tadeusz": 420000,
|
||||||
|
"adam": 400000,
|
||||||
|
"lukasz": 380000,
|
||||||
|
"zbigniew": 360000,
|
||||||
|
"ryszard": 340000,
|
||||||
|
"dariusz": 320000,
|
||||||
|
"henryk": 300000,
|
||||||
|
"mariusz": 280000,
|
||||||
|
"kazimierz": 260000,
|
||||||
|
"wojciech": 240000,
|
||||||
|
"robert": 220000,
|
||||||
|
"mateusz": 200000,
|
||||||
|
"jakub": 180000,
|
||||||
|
"rafal": 170000,
|
||||||
|
"kamil": 160000,
|
||||||
|
"maciej": 150000,
|
||||||
|
"szymon": 145000,
|
||||||
|
|
||||||
|
# Very common Polish female names
|
||||||
|
"maria": 1200000,
|
||||||
|
"anna": 1000000,
|
||||||
|
"katarzyna": 800000,
|
||||||
|
"malgorzata": 750000,
|
||||||
|
"agnieszka": 700000,
|
||||||
|
"barbara": 650000,
|
||||||
|
"ewa": 620000,
|
||||||
|
"krystyna": 600000,
|
||||||
|
"elzbieta": 580000,
|
||||||
|
"zofia": 560000,
|
||||||
|
"joanna": 540000,
|
||||||
|
"monika": 520000,
|
||||||
|
"jadwiga": 500000,
|
||||||
|
"teresa": 480000,
|
||||||
|
"danuta": 460000,
|
||||||
|
"irena": 440000,
|
||||||
|
"aleksandra": 420000,
|
||||||
|
"magdalena": 400000,
|
||||||
|
"dorota": 380000,
|
||||||
|
"beata": 360000,
|
||||||
|
"karolina": 340000,
|
||||||
|
"paulina": 320000,
|
||||||
|
"natalia": 300000,
|
||||||
|
"justyna": 280000,
|
||||||
|
"patrycja": 260000,
|
||||||
|
}
|
||||||
|
|
||||||
|
GERMAN_FIRST_NAMES: Dict[str, int] = {
|
||||||
|
# Very common German male names (estimated incidence)
|
||||||
|
"peter": 1100000,
|
||||||
|
"michael": 1000000,
|
||||||
|
"thomas": 950000,
|
||||||
|
"wolfgang": 800000,
|
||||||
|
"klaus": 750000,
|
||||||
|
"hans": 700000,
|
||||||
|
"jurgen": 650000,
|
||||||
|
"dieter": 620000,
|
||||||
|
"helmut": 600000,
|
||||||
|
"werner": 580000,
|
||||||
|
"manfred": 560000,
|
||||||
|
"andreas": 540000,
|
||||||
|
"stefan": 520000,
|
||||||
|
"christian": 500000,
|
||||||
|
"frank": 480000,
|
||||||
|
"bernd": 460000,
|
||||||
|
"martin": 440000,
|
||||||
|
"matthias": 420000,
|
||||||
|
"uwe": 400000,
|
||||||
|
"ralf": 380000,
|
||||||
|
"karl": 360000,
|
||||||
|
"horst": 340000,
|
||||||
|
"gerhard": 320000,
|
||||||
|
"gunter": 300000,
|
||||||
|
"alexander": 280000,
|
||||||
|
"jan": 260000,
|
||||||
|
"markus": 240000,
|
||||||
|
"tobias": 220000,
|
||||||
|
"sebastian": 200000,
|
||||||
|
"daniel": 180000,
|
||||||
|
|
||||||
|
# Very common German female names
|
||||||
|
"maria": 1200000,
|
||||||
|
"ursula": 800000,
|
||||||
|
"monika": 750000,
|
||||||
|
"petra": 700000,
|
||||||
|
"sabine": 680000,
|
||||||
|
"renate": 660000,
|
||||||
|
"brigitte": 640000,
|
||||||
|
"helga": 620000,
|
||||||
|
"andrea": 600000,
|
||||||
|
"claudia": 580000,
|
||||||
|
"susanne": 560000,
|
||||||
|
"gabriele": 540000,
|
||||||
|
"birgit": 520000,
|
||||||
|
"angelika": 500000,
|
||||||
|
"heike": 480000,
|
||||||
|
"martina": 460000,
|
||||||
|
"karin": 440000,
|
||||||
|
"christine": 420000,
|
||||||
|
"anna": 400000,
|
||||||
|
"katharina": 380000,
|
||||||
|
"julia": 360000,
|
||||||
|
"stefanie": 340000,
|
||||||
|
"nicole": 320000,
|
||||||
|
"sandra": 300000,
|
||||||
|
"lisa": 280000,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# NAME DATABASE REGISTRY
|
# NAME DATABASE REGISTRY
|
||||||
|
|
@ -387,8 +704,7 @@ def get_first_name_database(country: str) -> Dict[str, int]:
|
||||||
elif country in ("GB", "GBR", "UK", "UNITED KINGDOM", "ENGLAND"):
|
elif country in ("GB", "GBR", "UK", "UNITED KINGDOM", "ENGLAND"):
|
||||||
return UK_FIRST_NAMES
|
return UK_FIRST_NAMES
|
||||||
elif country in ("DE", "DEU", "GERMANY", "GERMAN"):
|
elif country in ("DE", "DEU", "GERMANY", "GERMAN"):
|
||||||
# German first names overlap with Dutch first names
|
return GERMAN_FIRST_NAMES
|
||||||
return DUTCH_FIRST_NAMES
|
|
||||||
elif country in ("EG", "EGY", "EGYPT", "EGYPTIAN"):
|
elif country in ("EG", "EGY", "EGYPT", "EGYPTIAN"):
|
||||||
return ARABIC_FIRST_NAMES
|
return ARABIC_FIRST_NAMES
|
||||||
elif country in ("SA", "SAU", "SAUDI ARABIA", "SAUDI"):
|
elif country in ("SA", "SAU", "SAUDI ARABIA", "SAUDI"):
|
||||||
|
|
@ -396,6 +712,14 @@ def get_first_name_database(country: str) -> Dict[str, int]:
|
||||||
elif country in ("ID", "IDN", "INDONESIA", "INDONESIAN"):
|
elif country in ("ID", "IDN", "INDONESIA", "INDONESIAN"):
|
||||||
# Indonesia uses mix of Arabic and local names
|
# Indonesia uses mix of Arabic and local names
|
||||||
return ARABIC_FIRST_NAMES
|
return ARABIC_FIRST_NAMES
|
||||||
|
elif country in ("FR", "FRA", "FRANCE", "FRENCH"):
|
||||||
|
return FRENCH_FIRST_NAMES
|
||||||
|
elif country in ("ES", "ESP", "SPAIN", "SPANISH"):
|
||||||
|
return SPANISH_FIRST_NAMES
|
||||||
|
elif country in ("IT", "ITA", "ITALY", "ITALIAN"):
|
||||||
|
return ITALIAN_FIRST_NAMES
|
||||||
|
elif country in ("PL", "POL", "POLAND", "POLISH"):
|
||||||
|
return POLISH_FIRST_NAMES
|
||||||
else:
|
else:
|
||||||
return DUTCH_FIRST_NAMES # Default
|
return DUTCH_FIRST_NAMES # Default
|
||||||
|
|
||||||
|
|
@ -754,7 +1078,7 @@ def main():
|
||||||
print("NAME FREQUENCY DATA SOURCES")
|
print("NAME FREQUENCY DATA SOURCES")
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
|
|
||||||
for country in ["NL", "US", "BE", "GB", "DE", "ID", "EG", "SA"]:
|
for country in ["NL", "US", "BE", "GB", "DE", "ID", "EG", "SA", "FR", "ES", "IT", "PL"]:
|
||||||
surnames, source, total = load_surname_data(country)
|
surnames, source, total = load_surname_data(country)
|
||||||
sorted_names = get_sorted_surnames(country)
|
sorted_names = get_sorted_surnames(country)
|
||||||
top_10 = sorted_names[:10]
|
top_10 = sorted_names[:10]
|
||||||
|
|
@ -819,6 +1143,30 @@ def main():
|
||||||
("Ahmed Alghamdi", "SA"),
|
("Ahmed Alghamdi", "SA"),
|
||||||
("Khalid Alharbi", "SA"),
|
("Khalid Alharbi", "SA"),
|
||||||
("Fatima Alshehri", "SA"),
|
("Fatima Alshehri", "SA"),
|
||||||
|
|
||||||
|
# French names
|
||||||
|
("Jean Martin", "FR"),
|
||||||
|
("Marie Dubois", "FR"),
|
||||||
|
("Pierre Bernard", "FR"),
|
||||||
|
("Sophie Petit", "FR"),
|
||||||
|
|
||||||
|
# Spanish names
|
||||||
|
("José García", "ES"),
|
||||||
|
("María Fernández", "ES"),
|
||||||
|
("Antonio López", "ES"),
|
||||||
|
("Carmen Rodríguez", "ES"),
|
||||||
|
|
||||||
|
# Italian names
|
||||||
|
("Giuseppe Rossi", "IT"),
|
||||||
|
("Maria Russo", "IT"),
|
||||||
|
("Marco Ferrari", "IT"),
|
||||||
|
("Giulia Esposito", "IT"),
|
||||||
|
|
||||||
|
# Polish names
|
||||||
|
("Jan Nowak", "PL"),
|
||||||
|
("Anna Kowalski", "PL"),
|
||||||
|
("Piotr Wiśniewski", "PL"),
|
||||||
|
("Maria Wójcik", "PL"),
|
||||||
]
|
]
|
||||||
|
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue