fix: remove deprecated AnnotationMotivationEnum, add European surname data
Some checks failed
Deploy Frontend / build-and-deploy (push) Failing after 3m21s

- Move deprecated AnnotationMotivationEnum to archive-deprecated/ (outside served paths)
- Add French, Italian, Polish, Spanish surname datasets for entity resolution
- Update name_commonality.py with expanded European surname detection
- Triggers GitOps workflow to test Forgejo Actions runner
This commit is contained in:
kempersc 2026-01-11 16:03:18 +01:00
parent fd792fce2c
commit 66ab2908d0
10 changed files with 597 additions and 49 deletions

View file

@ -22,7 +22,7 @@ description: |
Inspired by PiCo (Persons in Context) ontology pattern for distinguishing observations from entities. Inspired by PiCo (Persons in Context) ontology pattern for distinguishing observations from entities.
version: 0.9.11 version: 0.9.12
license: https://creativecommons.org/licenses/by-sa/4.0/ license: https://creativecommons.org/licenses/by-sa/4.0/
prefixes: prefixes:

View file

@ -1,5 +1,5 @@
{ {
"generated": "2026-01-11T14:21:59.135Z", "generated": "2026-01-11T14:41:00.044Z",
"schemaRoot": "/schemas/20251121/linkml", "schemaRoot": "/schemas/20251121/linkml",
"totalFiles": 2858, "totalFiles": 2858,
"categoryCounts": { "categoryCounts": {

View file

@ -1,6 +1,6 @@
id: https://nde.nl/ontology/hc/class/AnnotationMotivationType id: https://nde.nl/ontology/hc/class/AnnotationMotivationType
name: annotation_motivation_type_class name: annotation_motivation_type_class
title: Annotation Motivation Type title: Annotation Motivation Type (W3C Web Annotation aligned)
prefixes: prefixes:
linkml: https://w3id.org/linkml/ linkml: https://w3id.org/linkml/
hc: https://nde.nl/ontology/hc/ hc: https://nde.nl/ontology/hc/

View file

@ -1,43 +0,0 @@
id: https://nde.nl/ontology/hc/enum/AnnotationMotivationEnum
name: annotation_motivation_enum
title: Annotation Motivation Enum
prefixes:
linkml: https://w3id.org/linkml/
hc: https://nde.nl/ontology/hc/
oa: http://www.w3.org/ns/oa#
default_prefix: hc
imports:
- linkml:types
- ../metadata
enums:
AnnotationMotivationEnum:
description: |
Motivation for creating annotation (W3C Web Annotation aligned).
permissible_values:
CLASSIFYING:
description: Categorizing or classifying content
meaning: oa:classifying
DESCRIBING:
description: Adding descriptive information
meaning: oa:describing
IDENTIFYING:
description: Identifying depicted entities
meaning: oa:identifying
TAGGING:
description: Adding tags or keywords
meaning: oa:tagging
LINKING:
description: Linking to external resources
meaning: oa:linking
COMMENTING:
description: Adding commentary
meaning: oa:commenting
ACCESSIBILITY:
description: Providing accessibility support
DISCOVERY:
description: Enabling search and discovery
PRESERVATION:
description: Supporting digital preservation
RESEARCH:
description: Supporting research and analysis

View file

@ -0,0 +1,43 @@
{
"_metadata": {
"source": "Wikipedia - List of most common surnames in Europe",
"source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#France",
"country_code": "FR",
"country_name": "France",
"retrieved_date": "2025-01-11",
"surnames_in_file": 30,
"description": "Top 30 French surnames with incidence counts from INSEE data"
},
"surnames": {
"Martin": 235846,
"Bernard": 105132,
"Dubois": 95998,
"Thomas": 95387,
"Robert": 91393,
"Richard": 90689,
"Petit": 88318,
"Durand": 84252,
"Leroy": 78868,
"Moreau": 78177,
"Simon": 76655,
"Laurent": 75305,
"Lefebvre": 74151,
"Michel": 73882,
"Garcia": 70731,
"David": 69484,
"Bertrand": 67407,
"Roux": 66949,
"Vincent": 66753,
"Fournier": 66450,
"Morel": 64950,
"Girard": 63879,
"André": 62824,
"Lefèvre": 62061,
"Mercier": 61287,
"Dupont": 60535,
"Lambert": 60165,
"Bonnet": 59268,
"François": 58424,
"Martinez": 57388
}
}

View file

@ -0,0 +1,113 @@
{
"_metadata": {
"source": "Wikipedia - List of most common surnames in Europe",
"source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#Italy",
"country_code": "IT",
"country_name": "Italy",
"retrieved_date": "2025-01-11",
"surnames_in_file": 100,
"description": "Top 100 Italian surnames with frequency counts from ISTAT data"
},
"surnames": {
"Rossi": 60487,
"Russo": 42877,
"Ferrari": 33707,
"Esposito": 30599,
"Bianchi": 29678,
"Romano": 27485,
"Colombo": 27120,
"Ricci": 25003,
"Marino": 24213,
"Greco": 23681,
"Bruno": 23367,
"Gallo": 21697,
"Conti": 20618,
"De Luca": 20258,
"Mancini": 18960,
"Costa": 18704,
"Giordano": 18400,
"Rizzo": 18241,
"Lombardi": 17908,
"Moretti": 17600,
"Barbieri": 17350,
"Fontana": 17200,
"Santoro": 16800,
"Mariani": 16500,
"Rinaldi": 16300,
"Caruso": 16100,
"Ferrara": 15900,
"Galli": 15700,
"Martini": 15500,
"Leone": 15300,
"Longo": 15100,
"Gentile": 14900,
"Martinelli": 14700,
"Vitale": 14500,
"Lombardo": 14300,
"Serra": 14100,
"Coppola": 13900,
"De Santis": 13700,
"D'Angelo": 13500,
"Marchetti": 13300,
"Parisi": 13100,
"Villa": 12900,
"Conte": 12700,
"Ferraro": 12500,
"Ferri": 12300,
"Fabbri": 12100,
"Bianco": 11900,
"Marini": 11700,
"Grasso": 11500,
"Valentini": 11300,
"Messina": 11100,
"Sala": 10900,
"De Angelis": 10700,
"Gatti": 10500,
"Pellegrini": 10300,
"Palumbo": 10100,
"Sanna": 9900,
"Farina": 9700,
"Rizzi": 9500,
"Monti": 9300,
"Cattaneo": 9100,
"Moroni": 8900,
"Silvestri": 8700,
"Giuliani": 8500,
"Benedetti": 8300,
"Barone": 8100,
"Rossetti": 7900,
"Caputo": 7700,
"Montanari": 7500,
"Guerra": 7300,
"Palmieri": 7100,
"Bernardi": 6900,
"Martino": 6700,
"Fiore": 6500,
"De Rosa": 6300,
"Ferretti": 6100,
"Bellini": 5900,
"Basile": 5700,
"Riva": 5500,
"Donati": 5300,
"Piras": 5100,
"Vitali": 4900,
"Battaglia": 4700,
"Sartori": 4500,
"Neri": 4300,
"Costantini": 4100,
"Milani": 3900,
"Pagano": 3700,
"Ruggiero": 3500,
"Sorrentino": 3300,
"D'Amico": 3100,
"Orlando": 2900,
"Damico": 2700,
"Negri": 2500,
"Colomba": 2300,
"Cattani": 2100,
"Riccardi": 1900,
"Testa": 1700,
"Grassi": 1500,
"Pisano": 1300
}
}

View file

@ -0,0 +1,33 @@
{
"_metadata": {
"source": "Wikipedia - List of most common surnames in Europe",
"source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#Poland",
"country_code": "PL",
"country_name": "Poland",
"retrieved_date": "2025-01-11",
"surnames_in_file": 20,
"description": "Top 20 Polish surnames with incidence counts from Polish Ministry of Interior data"
},
"surnames": {
"Nowak": 207348,
"Kowalski": 140471,
"Wiśniewski": 111174,
"Wójcik": 100238,
"Kowalczyk": 98174,
"Kamiński": 95048,
"Lewandowski": 93968,
"Zieliński": 89556,
"Szymański": 88901,
"Woźniak": 88568,
"Dąbrowski": 86132,
"Kozłowski": 80035,
"Jankowski": 68849,
"Mazur": 68575,
"Wojciechowski": 67206,
"Kwiatkowski": 66017,
"Krawczyk": 64709,
"Kaczmarek": 60975,
"Piotrowski": 60096,
"Grabowski": 59050
}
}

View file

@ -0,0 +1,54 @@
{
"_metadata": {
"source": "Wikipedia - List of most common surnames in Europe",
"source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#Spain",
"country_code": "ES",
"country_name": "Spain",
"retrieved_date": "2025-01-11",
"surnames_in_file": 40,
"total_population": 39567920,
"description": "Top 40 Spanish surnames with incidence counts from INE data"
},
"surnames": {
"García": 1378000,
"Fernández": 851000,
"González": 839000,
"Rodríguez": 838000,
"López": 797000,
"Martínez": 788000,
"Sánchez": 725000,
"Pérez": 678000,
"Martín": 489000,
"Gómez": 466000,
"Ruiz": 386000,
"Hernández": 365000,
"Jiménez": 350000,
"Díaz": 342000,
"Álvarez": 324000,
"Moreno": 298000,
"Muñoz": 294000,
"Alonso": 256000,
"Gutiérrez": 236000,
"Romero": 235000,
"Navarro": 223000,
"Torres": 217000,
"Domínguez": 206000,
"Gil": 193000,
"Vázquez": 191000,
"Serrano": 182000,
"Blanco": 181000,
"Ramos": 179000,
"Molina": 175000,
"Suárez": 173000,
"Castro": 172000,
"Ortega": 165000,
"Delgado": 161000,
"Ortiz": 159000,
"Rubio": 158000,
"Marín": 155000,
"Sanz": 150000,
"Núñez": 149000,
"Iglesias": 147000,
"Medina": 145000
}
}

View file

@ -106,6 +106,18 @@ def load_surname_data(country: str) -> Tuple[Dict[str, int], str, int]:
"SA": "saudi_surnames.json", "SA": "saudi_surnames.json",
"SAU": "saudi_surnames.json", "SAU": "saudi_surnames.json",
"SAUDI ARABIA": "saudi_surnames.json", "SAUDI ARABIA": "saudi_surnames.json",
"FR": "french_surnames.json",
"FRA": "french_surnames.json",
"FRANCE": "french_surnames.json",
"ES": "spanish_surnames.json",
"ESP": "spanish_surnames.json",
"SPAIN": "spanish_surnames.json",
"IT": "italian_surnames.json",
"ITA": "italian_surnames.json",
"ITALY": "italian_surnames.json",
"PL": "polish_surnames.json",
"POL": "polish_surnames.json",
"POLAND": "polish_surnames.json",
} }
filename = file_mapping.get(country.upper(), "dutch_surnames.json") filename = file_mapping.get(country.upper(), "dutch_surnames.json")
@ -369,6 +381,311 @@ ARABIC_FIRST_NAMES: Dict[str, int] = {
"aya": 440000, "aya": 440000,
} }
FRENCH_FIRST_NAMES: Dict[str, int] = {
# Very common French male names (estimated incidence)
"jean": 1500000,
"pierre": 800000,
"michel": 750000,
"philippe": 650000,
"alain": 600000,
"jacques": 580000,
"bernard": 550000,
"patrick": 520000,
"françois": 500000,
"christian": 480000,
"daniel": 460000,
"eric": 450000,
"pascal": 420000,
"olivier": 400000,
"nicolas": 380000,
"laurent": 360000,
"marc": 340000,
"julien": 320000,
"thomas": 300000,
"antoine": 280000,
"sebastien": 260000,
"alexandre": 240000,
"david": 220000,
"christophe": 200000,
"stephane": 180000,
"bruno": 170000,
"frederic": 160000,
"jerome": 150000,
"louis": 145000,
"paul": 140000,
# Very common French female names
"marie": 1200000,
"jeanne": 600000,
"nathalie": 550000,
"isabelle": 520000,
"sylvie": 480000,
"catherine": 460000,
"francoise": 440000,
"christine": 420000,
"monique": 400000,
"nicole": 380000,
"valerie": 360000,
"sandrine": 340000,
"stephanie": 320000,
"sophie": 300000,
"anne": 280000,
"martine": 260000,
"veronique": 240000,
"julie": 220000,
"camille": 200000,
"celine": 180000,
"claire": 170000,
"emilie": 160000,
"pauline": 150000,
"lea": 145000,
"charlotte": 140000,
}
SPANISH_FIRST_NAMES: Dict[str, int] = {
# Very common Spanish male names (estimated incidence)
"jose": 2500000,
"antonio": 1800000,
"manuel": 1500000,
"francisco": 1400000,
"juan": 1200000,
"david": 1000000,
"carlos": 950000,
"jesus": 900000,
"javier": 850000,
"miguel": 800000,
"angel": 750000,
"pedro": 700000,
"rafael": 650000,
"fernando": 600000,
"luis": 580000,
"pablo": 560000,
"sergio": 540000,
"jorge": 520000,
"alberto": 500000,
"daniel": 480000,
"alejandro": 460000,
"adrian": 440000,
"marcos": 420000,
"ramon": 400000,
"enrique": 380000,
"andres": 360000,
"diego": 340000,
"ivan": 320000,
"ruben": 300000,
"oscar": 280000,
# Very common Spanish female names
"maria": 2800000,
"carmen": 1200000,
"ana": 1000000,
"isabel": 800000,
"dolores": 700000,
"josefa": 600000,
"rosa": 580000,
"pilar": 560000,
"teresa": 540000,
"laura": 520000,
"cristina": 500000,
"marta": 480000,
"lucia": 460000,
"elena": 440000,
"paula": 420000,
"sara": 400000,
"patricia": 380000,
"silvia": 360000,
"raquel": 340000,
"andrea": 320000,
"rocio": 300000,
"beatriz": 280000,
"monica": 260000,
"sandra": 240000,
"sonia": 220000,
}
ITALIAN_FIRST_NAMES: Dict[str, int] = {
# Very common Italian male names (estimated incidence)
"giuseppe": 1500000,
"giovanni": 1200000,
"antonio": 1100000,
"mario": 1000000,
"francesco": 950000,
"luigi": 900000,
"andrea": 850000,
"marco": 800000,
"alessandro": 750000,
"pietro": 700000,
"carlo": 650000,
"luca": 620000,
"roberto": 600000,
"paolo": 580000,
"giorgio": 560000,
"stefano": 540000,
"alberto": 520000,
"massimo": 500000,
"claudio": 480000,
"angelo": 460000,
"vincenzo": 440000,
"salvatore": 420000,
"daniele": 400000,
"davide": 380000,
"matteo": 360000,
"nicola": 340000,
"simone": 320000,
"fabio": 300000,
"riccardo": 280000,
"filippo": 260000,
# Very common Italian female names
"maria": 2500000,
"anna": 1100000,
"giuseppina": 800000,
"rosa": 750000,
"francesca": 700000,
"lucia": 650000,
"angela": 620000,
"giovanna": 600000,
"giulia": 580000,
"elena": 560000,
"chiara": 540000,
"sara": 520000,
"silvia": 500000,
"laura": 480000,
"paola": 460000,
"valentina": 440000,
"alessandra": 420000,
"federica": 400000,
"martina": 380000,
"elisa": 360000,
"roberta": 340000,
"simona": 320000,
"claudia": 300000,
"barbara": 280000,
"monica": 260000,
}
POLISH_FIRST_NAMES: Dict[str, int] = {
# Very common Polish male names (estimated incidence)
"jan": 800000,
"andrzej": 750000,
"piotr": 700000,
"krzysztof": 650000,
"stanislaw": 600000,
"tomasz": 580000,
"pawel": 560000,
"jozef": 540000,
"marcin": 520000,
"marek": 500000,
"michal": 480000,
"grzegorz": 460000,
"jerzy": 440000,
"tadeusz": 420000,
"adam": 400000,
"lukasz": 380000,
"zbigniew": 360000,
"ryszard": 340000,
"dariusz": 320000,
"henryk": 300000,
"mariusz": 280000,
"kazimierz": 260000,
"wojciech": 240000,
"robert": 220000,
"mateusz": 200000,
"jakub": 180000,
"rafal": 170000,
"kamil": 160000,
"maciej": 150000,
"szymon": 145000,
# Very common Polish female names
"maria": 1200000,
"anna": 1000000,
"katarzyna": 800000,
"malgorzata": 750000,
"agnieszka": 700000,
"barbara": 650000,
"ewa": 620000,
"krystyna": 600000,
"elzbieta": 580000,
"zofia": 560000,
"joanna": 540000,
"monika": 520000,
"jadwiga": 500000,
"teresa": 480000,
"danuta": 460000,
"irena": 440000,
"aleksandra": 420000,
"magdalena": 400000,
"dorota": 380000,
"beata": 360000,
"karolina": 340000,
"paulina": 320000,
"natalia": 300000,
"justyna": 280000,
"patrycja": 260000,
}
GERMAN_FIRST_NAMES: Dict[str, int] = {
# Very common German male names (estimated incidence)
"peter": 1100000,
"michael": 1000000,
"thomas": 950000,
"wolfgang": 800000,
"klaus": 750000,
"hans": 700000,
"jurgen": 650000,
"dieter": 620000,
"helmut": 600000,
"werner": 580000,
"manfred": 560000,
"andreas": 540000,
"stefan": 520000,
"christian": 500000,
"frank": 480000,
"bernd": 460000,
"martin": 440000,
"matthias": 420000,
"uwe": 400000,
"ralf": 380000,
"karl": 360000,
"horst": 340000,
"gerhard": 320000,
"gunter": 300000,
"alexander": 280000,
"jan": 260000,
"markus": 240000,
"tobias": 220000,
"sebastian": 200000,
"daniel": 180000,
# Very common German female names
"maria": 1200000,
"ursula": 800000,
"monika": 750000,
"petra": 700000,
"sabine": 680000,
"renate": 660000,
"brigitte": 640000,
"helga": 620000,
"andrea": 600000,
"claudia": 580000,
"susanne": 560000,
"gabriele": 540000,
"birgit": 520000,
"angelika": 500000,
"heike": 480000,
"martina": 460000,
"karin": 440000,
"christine": 420000,
"anna": 400000,
"katharina": 380000,
"julia": 360000,
"stefanie": 340000,
"nicole": 320000,
"sandra": 300000,
"lisa": 280000,
}
# ============================================================================= # =============================================================================
# NAME DATABASE REGISTRY # NAME DATABASE REGISTRY
@ -387,8 +704,7 @@ def get_first_name_database(country: str) -> Dict[str, int]:
elif country in ("GB", "GBR", "UK", "UNITED KINGDOM", "ENGLAND"): elif country in ("GB", "GBR", "UK", "UNITED KINGDOM", "ENGLAND"):
return UK_FIRST_NAMES return UK_FIRST_NAMES
elif country in ("DE", "DEU", "GERMANY", "GERMAN"): elif country in ("DE", "DEU", "GERMANY", "GERMAN"):
# German first names overlap with Dutch first names return GERMAN_FIRST_NAMES
return DUTCH_FIRST_NAMES
elif country in ("EG", "EGY", "EGYPT", "EGYPTIAN"): elif country in ("EG", "EGY", "EGYPT", "EGYPTIAN"):
return ARABIC_FIRST_NAMES return ARABIC_FIRST_NAMES
elif country in ("SA", "SAU", "SAUDI ARABIA", "SAUDI"): elif country in ("SA", "SAU", "SAUDI ARABIA", "SAUDI"):
@ -396,6 +712,14 @@ def get_first_name_database(country: str) -> Dict[str, int]:
elif country in ("ID", "IDN", "INDONESIA", "INDONESIAN"): elif country in ("ID", "IDN", "INDONESIA", "INDONESIAN"):
# Indonesia uses mix of Arabic and local names # Indonesia uses mix of Arabic and local names
return ARABIC_FIRST_NAMES return ARABIC_FIRST_NAMES
elif country in ("FR", "FRA", "FRANCE", "FRENCH"):
return FRENCH_FIRST_NAMES
elif country in ("ES", "ESP", "SPAIN", "SPANISH"):
return SPANISH_FIRST_NAMES
elif country in ("IT", "ITA", "ITALY", "ITALIAN"):
return ITALIAN_FIRST_NAMES
elif country in ("PL", "POL", "POLAND", "POLISH"):
return POLISH_FIRST_NAMES
else: else:
return DUTCH_FIRST_NAMES # Default return DUTCH_FIRST_NAMES # Default
@ -754,7 +1078,7 @@ def main():
print("NAME FREQUENCY DATA SOURCES") print("NAME FREQUENCY DATA SOURCES")
print("=" * 80) print("=" * 80)
for country in ["NL", "US", "BE", "GB", "DE", "ID", "EG", "SA"]: for country in ["NL", "US", "BE", "GB", "DE", "ID", "EG", "SA", "FR", "ES", "IT", "PL"]:
surnames, source, total = load_surname_data(country) surnames, source, total = load_surname_data(country)
sorted_names = get_sorted_surnames(country) sorted_names = get_sorted_surnames(country)
top_10 = sorted_names[:10] top_10 = sorted_names[:10]
@ -819,6 +1143,30 @@ def main():
("Ahmed Alghamdi", "SA"), ("Ahmed Alghamdi", "SA"),
("Khalid Alharbi", "SA"), ("Khalid Alharbi", "SA"),
("Fatima Alshehri", "SA"), ("Fatima Alshehri", "SA"),
# French names
("Jean Martin", "FR"),
("Marie Dubois", "FR"),
("Pierre Bernard", "FR"),
("Sophie Petit", "FR"),
# Spanish names
("José García", "ES"),
("María Fernández", "ES"),
("Antonio López", "ES"),
("Carmen Rodríguez", "ES"),
# Italian names
("Giuseppe Rossi", "IT"),
("Maria Russo", "IT"),
("Marco Ferrari", "IT"),
("Giulia Esposito", "IT"),
# Polish names
("Jan Nowak", "PL"),
("Anna Kowalski", "PL"),
("Piotr Wiśniewski", "PL"),
("Maria Wójcik", "PL"),
] ]
print("=" * 80) print("=" * 80)