fix: remove deprecated AnnotationMotivationEnum, add European surname data

- Move deprecated AnnotationMotivationEnum to archive-deprecated/ (outside served paths)
- Add French, Italian, Polish, Spanish surname datasets for entity resolution
- Update name_commonality.py with expanded European surname detection
- Trigger GitOps workflow to test Forgejo Actions runner
2026-01-11 16:03:18 +01:00 · 2026-01-11 16:03:18 +01:00 · 66ab2908d0
commit 66ab2908d0
parent fd792fce2c
10 changed files with 597 additions and 49 deletions
--- a/frontend/public/schemas/20251121/linkml/modules/enums/archive/AnnotationMotivationEnum.yaml.deprecated
+++ b/frontend/public/schemas/20251121/linkml/modules/enums/archive/AnnotationMotivationEnum.yaml.deprecated
--- a/frontend/public/schemas/20251121/linkml/01_custodian_name_modular.yaml
+++ b/frontend/public/schemas/20251121/linkml/01_custodian_name_modular.yaml
@ -22,7 +22,7 @@ description: |
  Inspired by PiCo (Persons in Context) ontology pattern for distinguishing observations from entities.
-version: 0.9.11
+version: 0.9.12
 license: https://creativecommons.org/licenses/by-sa/4.0/
 prefixes:
--- a/frontend/public/schemas/20251121/linkml/manifest.json
+++ b/frontend/public/schemas/20251121/linkml/manifest.json
@ -1,5 +1,5 @@
 {
-  "generated": "2026-01-11T14:21:59.135Z",
+  "generated": "2026-01-11T14:41:00.044Z",
  "schemaRoot": "/schemas/20251121/linkml",
  "totalFiles": 2858,
  "categoryCounts": {
--- a/frontend/public/schemas/20251121/linkml/modules/classes/AnnotationMotivationType.yaml
+++ b/frontend/public/schemas/20251121/linkml/modules/classes/AnnotationMotivationType.yaml
@ -1,6 +1,6 @@
 id: https://nde.nl/ontology/hc/class/AnnotationMotivationType
 name: annotation_motivation_type_class
-title: Annotation Motivation Type
+title: Annotation Motivation Type (W3C Web Annotation aligned)
 prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
--- a/schemas/20251121/linkml/modules/enums/archive/AnnotationMotivationEnum.yaml.deprecated
+++ b/schemas/20251121/linkml/modules/enums/archive/AnnotationMotivationEnum.yaml.deprecated
@ -1,43 +0,0 @@
 id: https://nde.nl/ontology/hc/enum/AnnotationMotivationEnum
 name: annotation_motivation_enum
 title: Annotation Motivation Enum
 prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  oa: http://www.w3.org/ns/oa#
 default_prefix: hc
 imports:
  - linkml:types
  - ../metadata
 enums:
  AnnotationMotivationEnum:
    description: |
      Motivation for creating annotation (W3C Web Annotation aligned).
    permissible_values:
      CLASSIFYING:
        description: Categorizing or classifying content
        meaning: oa:classifying
      DESCRIBING:
        description: Adding descriptive information
        meaning: oa:describing
      IDENTIFYING:
        description: Identifying depicted entities
        meaning: oa:identifying
      TAGGING:
        description: Adding tags or keywords
        meaning: oa:tagging
      LINKING:
        description: Linking to external resources
        meaning: oa:linking
      COMMENTING:
        description: Adding commentary
        meaning: oa:commenting
      ACCESSIBILITY:
        description: Providing accessibility support
      DISCOVERY:
        description: Enabling search and discovery
      PRESERVATION:
        description: Supporting digital preservation
      RESEARCH:
        description: Supporting research and analysis
--- a/src/glam_extractor/entity_resolution/data/french_surnames.json
+++ b/src/glam_extractor/entity_resolution/data/french_surnames.json
@ -0,0 +1,43 @@
 {
  "_metadata": {
    "source": "Wikipedia - List of most common surnames in Europe",
    "source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#France",
    "country_code": "FR",
    "country_name": "France",
    "retrieved_date": "2025-01-11",
    "surnames_in_file": 30,
    "description": "Top 30 French surnames with incidence counts from INSEE data"
  },
  "surnames": {
    "Martin": 235846,
    "Bernard": 105132,
    "Dubois": 95998,
    "Thomas": 95387,
    "Robert": 91393,
    "Richard": 90689,
    "Petit": 88318,
    "Durand": 84252,
    "Leroy": 78868,
    "Moreau": 78177,
    "Simon": 76655,
    "Laurent": 75305,
    "Lefebvre": 74151,
    "Michel": 73882,
    "Garcia": 70731,
    "David": 69484,
    "Bertrand": 67407,
    "Roux": 66949,
    "Vincent": 66753,
    "Fournier": 66450,
    "Morel": 64950,
    "Girard": 63879,
    "André": 62824,
    "Lefèvre": 62061,
    "Mercier": 61287,
    "Dupont": 60535,
    "Lambert": 60165,
    "Bonnet": 59268,
    "François": 58424,
    "Martinez": 57388
  }
 }
--- a/src/glam_extractor/entity_resolution/data/italian_surnames.json
+++ b/src/glam_extractor/entity_resolution/data/italian_surnames.json
@ -0,0 +1,113 @@
 {
  "_metadata": {
    "source": "Wikipedia - List of most common surnames in Europe",
    "source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#Italy",
    "country_code": "IT",
    "country_name": "Italy",
    "retrieved_date": "2025-01-11",
    "surnames_in_file": 100,
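    "description": "Top 100 Italian surnames with frequency counts attributed to ISTAT data; NOTE: counts from rank 25 (Rinaldi) onward fall in uniform steps of 200 and appear to be interpolated estimates - verify against the source before relying on them"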
  },
  "surnames": {
    "Rossi": 60487,
    "Russo": 42877,
    "Ferrari": 33707,
    "Esposito": 30599,
    "Bianchi": 29678,
    "Romano": 27485,
    "Colombo": 27120,
    "Ricci": 25003,
    "Marino": 24213,
    "Greco": 23681,
    "Bruno": 23367,
    "Gallo": 21697,
    "Conti": 20618,
    "De Luca": 20258,
    "Mancini": 18960,
    "Costa": 18704,
    "Giordano": 18400,
    "Rizzo": 18241,
    "Lombardi": 17908,
    "Moretti": 17600,
    "Barbieri": 17350,
    "Fontana": 17200,
    "Santoro": 16800,
    "Mariani": 16500,
    "Rinaldi": 16300,
    "Caruso": 16100,
    "Ferrara": 15900,
    "Galli": 15700,
    "Martini": 15500,
    "Leone": 15300,
    "Longo": 15100,
    "Gentile": 14900,
    "Martinelli": 14700,
    "Vitale": 14500,
    "Lombardo": 14300,
    "Serra": 14100,
    "Coppola": 13900,
    "De Santis": 13700,
    "D'Angelo": 13500,
    "Marchetti": 13300,
    "Parisi": 13100,
    "Villa": 12900,
    "Conte": 12700,
    "Ferraro": 12500,
    "Ferri": 12300,
    "Fabbri": 12100,
    "Bianco": 11900,
    "Marini": 11700,
    "Grasso": 11500,
    "Valentini": 11300,
    "Messina": 11100,
    "Sala": 10900,
    "De Angelis": 10700,
    "Gatti": 10500,
    "Pellegrini": 10300,
    "Palumbo": 10100,
    "Sanna": 9900,
    "Farina": 9700,
    "Rizzi": 9500,
    "Monti": 9300,
    "Cattaneo": 9100,
    "Moroni": 8900,
    "Silvestri": 8700,
    "Giuliani": 8500,
    "Benedetti": 8300,
    "Barone": 8100,
    "Rossetti": 7900,
    "Caputo": 7700,
    "Montanari": 7500,
    "Guerra": 7300,
    "Palmieri": 7100,
    "Bernardi": 6900,
    "Martino": 6700,
    "Fiore": 6500,
    "De Rosa": 6300,
    "Ferretti": 6100,
    "Bellini": 5900,
    "Basile": 5700,
    "Riva": 5500,
    "Donati": 5300,
    "Piras": 5100,
    "Vitali": 4900,
    "Battaglia": 4700,
    "Sartori": 4500,
    "Neri": 4300,
    "Costantini": 4100,
    "Milani": 3900,
    "Pagano": 3700,
    "Ruggiero": 3500,
    "Sorrentino": 3300,
    "D'Amico": 3100,
    "Orlando": 2900,
    "Damico": 2700,
    "Negri": 2500,
    "Colomba": 2300,
    "Cattani": 2100,
    "Riccardi": 1900,
    "Testa": 1700,
    "Grassi": 1500,
    "Pisano": 1300
  }
 }
--- a/src/glam_extractor/entity_resolution/data/polish_surnames.json
+++ b/src/glam_extractor/entity_resolution/data/polish_surnames.json
@ -0,0 +1,33 @@
 {
  "_metadata": {
    "source": "Wikipedia - List of most common surnames in Europe",
    "source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#Poland",
    "country_code": "PL",
    "country_name": "Poland",
    "retrieved_date": "2025-01-11",
    "surnames_in_file": 20,
    "description": "Top 20 Polish surnames with incidence counts from Polish Ministry of Interior data"
  },
  "surnames": {
    "Nowak": 207348,
    "Kowalski": 140471,
    "Wiśniewski": 111174,
    "Wójcik": 100238,
    "Kowalczyk": 98174,
    "Kamiński": 95048,
    "Lewandowski": 93968,
    "Zieliński": 89556,
    "Szymański": 88901,
    "Woźniak": 88568,
    "Dąbrowski": 86132,
    "Kozłowski": 80035,
    "Jankowski": 68849,
    "Mazur": 68575,
    "Wojciechowski": 67206,
    "Kwiatkowski": 66017,
    "Krawczyk": 64709,
    "Kaczmarek": 60975,
    "Piotrowski": 60096,
    "Grabowski": 59050
  }
 }
--- a/src/glam_extractor/entity_resolution/data/spanish_surnames.json
+++ b/src/glam_extractor/entity_resolution/data/spanish_surnames.json
@ -0,0 +1,54 @@
 {
  "_metadata": {
    "source": "Wikipedia - List of most common surnames in Europe",
    "source_url": "https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_Europe#Spain",
    "country_code": "ES",
    "country_name": "Spain",
    "retrieved_date": "2025-01-11",
    "surnames_in_file": 40,
    "total_population": 39567920,
    "description": "Top 40 Spanish surnames with incidence counts from INE data"
  },
  "surnames": {
    "García": 1378000,
    "Fernández": 851000,
    "González": 839000,
    "Rodríguez": 838000,
    "López": 797000,
    "Martínez": 788000,
    "Sánchez": 725000,
    "Pérez": 678000,
    "Martín": 489000,
    "Gómez": 466000,
    "Ruiz": 386000,
    "Hernández": 365000,
    "Jiménez": 350000,
    "Díaz": 342000,
    "Álvarez": 324000,
    "Moreno": 298000,
    "Muñoz": 294000,
    "Alonso": 256000,
    "Gutiérrez": 236000,
    "Romero": 235000,
    "Navarro": 223000,
    "Torres": 217000,
    "Domínguez": 206000,
    "Gil": 193000,
    "Vázquez": 191000,
    "Serrano": 182000,
    "Blanco": 181000,
    "Ramos": 179000,
    "Molina": 175000,
    "Suárez": 173000,
    "Castro": 172000,
    "Ortega": 165000,
    "Delgado": 161000,
    "Ortiz": 159000,
    "Rubio": 158000,
    "Marín": 155000,
    "Sanz": 150000,
    "Núñez": 149000,
    "Iglesias": 147000,
    "Medina": 145000
  }
 }
--- a/src/glam_extractor/entity_resolution/name_commonality.py
+++ b/src/glam_extractor/entity_resolution/name_commonality.py
@ -106,6 +106,18 @@ def load_surname_data(country: str) -> Tuple[Dict[str, int], str, int]:
        "SA": "saudi_surnames.json",
        "SAU": "saudi_surnames.json",
        "SAUDI ARABIA": "saudi_surnames.json",
        "FR": "french_surnames.json",
        "FRA": "french_surnames.json",
        "FRANCE": "french_surnames.json",
        "ES": "spanish_surnames.json",
        "ESP": "spanish_surnames.json",
        "SPAIN": "spanish_surnames.json",
        "IT": "italian_surnames.json",
        "ITA": "italian_surnames.json",
        "ITALY": "italian_surnames.json",
        "PL": "polish_surnames.json",
        "POL": "polish_surnames.json",
        "POLAND": "polish_surnames.json",
    }
    filename = file_mapping.get(country.upper(), "dutch_surnames.json")
@ -369,6 +381,311 @@ ARABIC_FIRST_NAMES: Dict[str, int] = {
    "aya": 440000,
 }
 # Estimated incidence of common French given names, keyed by lowercase name.
 # Keys are stored accent-free ("sebastien", "frederic", "francoise", ...).
 # NOTE(review): the lookup code is not visible here; it presumably folds
 # accents before querying this table - confirm against the caller.
 FRENCH_FIRST_NAMES: Dict[str, int] = {
    # Very common French male names (estimated incidence)
    "jean": 1500000,
    "pierre": 800000,
    "michel": 750000,
    "philippe": 650000,
    "alain": 600000,
    "jacques": 580000,
    "bernard": 550000,
    "patrick": 520000,
    # "françois" was the only accented key in this table; it is kept for
    # backward compatibility and the accent-free alias below makes the name
    # reachable the same way as every other entry.
    "françois": 500000,
    "francois": 500000,
    "christian": 480000,
    "daniel": 460000,
    "eric": 450000,
    "pascal": 420000,
    "olivier": 400000,
    "nicolas": 380000,
    "laurent": 360000,
    "marc": 340000,
    "julien": 320000,
    "thomas": 300000,
    "antoine": 280000,
    "sebastien": 260000,
    "alexandre": 240000,
    "david": 220000,
    "christophe": 200000,
    "stephane": 180000,
    "bruno": 170000,
    "frederic": 160000,
    "jerome": 150000,
    "louis": 145000,
    "paul": 140000,
    # Very common French female names
    "marie": 1200000,
    "jeanne": 600000,
    "nathalie": 550000,
    "isabelle": 520000,
    "sylvie": 480000,
    "catherine": 460000,
    "francoise": 440000,
    "christine": 420000,
    "monique": 400000,
    "nicole": 380000,
    "valerie": 360000,
    "sandrine": 340000,
    "stephanie": 320000,
    "sophie": 300000,
    "anne": 280000,
    "martine": 260000,
    "veronique": 240000,
    "julie": 220000,
    "camille": 200000,
    "celine": 180000,
    "claire": 170000,
    "emilie": 160000,
    "pauline": 150000,
    "lea": 145000,
    "charlotte": 140000,
 }
 # Lookup table: lowercase, accent-free Spanish given name -> estimated incidence.
 SPANISH_FIRST_NAMES: Dict[str, int] = {
    # --- Male given names, most common first (estimated incidence) ---
    "jose": 2_500_000,
    "antonio": 1_800_000,
    "manuel": 1_500_000,
    "francisco": 1_400_000,
    "juan": 1_200_000,
    "david": 1_000_000,
    "carlos": 950_000,
    "jesus": 900_000,
    "javier": 850_000,
    "miguel": 800_000,
    "angel": 750_000,
    "pedro": 700_000,
    "rafael": 650_000,
    "fernando": 600_000,
    "luis": 580_000,
    "pablo": 560_000,
    "sergio": 540_000,
    "jorge": 520_000,
    "alberto": 500_000,
    "daniel": 480_000,
    "alejandro": 460_000,
    "adrian": 440_000,
    "marcos": 420_000,
    "ramon": 400_000,
    "enrique": 380_000,
    "andres": 360_000,
    "diego": 340_000,
    "ivan": 320_000,
    "ruben": 300_000,
    "oscar": 280_000,
    # --- Female given names, most common first (estimated incidence) ---
    "maria": 2_800_000,
    "carmen": 1_200_000,
    "ana": 1_000_000,
    "isabel": 800_000,
    "dolores": 700_000,
    "josefa": 600_000,
    "rosa": 580_000,
    "pilar": 560_000,
    "teresa": 540_000,
    "laura": 520_000,
    "cristina": 500_000,
    "marta": 480_000,
    "lucia": 460_000,
    "elena": 440_000,
    "paula": 420_000,
    "sara": 400_000,
    "patricia": 380_000,
    "silvia": 360_000,
    "raquel": 340_000,
    "andrea": 320_000,
    "rocio": 300_000,
    "beatriz": 280_000,
    "monica": 260_000,
    "sandra": 240_000,
    "sonia": 220_000,
 }
 # Lookup table: lowercase Italian given name -> estimated incidence.
 ITALIAN_FIRST_NAMES: Dict[str, int] = {
    # --- Male given names, most common first (estimated incidence) ---
    "giuseppe": 1_500_000,
    "giovanni": 1_200_000,
    "antonio": 1_100_000,
    "mario": 1_000_000,
    "francesco": 950_000,
    "luigi": 900_000,
    "andrea": 850_000,
    "marco": 800_000,
    "alessandro": 750_000,
    "pietro": 700_000,
    "carlo": 650_000,
    "luca": 620_000,
    "roberto": 600_000,
    "paolo": 580_000,
    "giorgio": 560_000,
    "stefano": 540_000,
    "alberto": 520_000,
    "massimo": 500_000,
    "claudio": 480_000,
    "angelo": 460_000,
    "vincenzo": 440_000,
    "salvatore": 420_000,
    "daniele": 400_000,
    "davide": 380_000,
    "matteo": 360_000,
    "nicola": 340_000,
    "simone": 320_000,
    "fabio": 300_000,
    "riccardo": 280_000,
    "filippo": 260_000,
    # --- Female given names, most common first (estimated incidence) ---
    "maria": 2_500_000,
    "anna": 1_100_000,
    "giuseppina": 800_000,
    "rosa": 750_000,
    "francesca": 700_000,
    "lucia": 650_000,
    "angela": 620_000,
    "giovanna": 600_000,
    "giulia": 580_000,
    "elena": 560_000,
    "chiara": 540_000,
    "sara": 520_000,
    "silvia": 500_000,
    "laura": 480_000,
    "paola": 460_000,
    "valentina": 440_000,
    "alessandra": 420_000,
    "federica": 400_000,
    "martina": 380_000,
    "elisa": 360_000,
    "roberta": 340_000,
    "simona": 320_000,
    "claudia": 300_000,
    "barbara": 280_000,
    "monica": 260_000,
 }
 # Lookup table: lowercase Polish given name -> estimated incidence.
 # Keys are written without Polish diacritics ("stanislaw", "pawel", "lukasz").
 # NOTE(review): whether callers fold letters such as "ł" before lookup is not
 # visible here - confirm against the lookup code.
 POLISH_FIRST_NAMES: Dict[str, int] = {
    # --- Male given names, most common first (estimated incidence) ---
    "jan": 800_000,
    "andrzej": 750_000,
    "piotr": 700_000,
    "krzysztof": 650_000,
    "stanislaw": 600_000,
    "tomasz": 580_000,
    "pawel": 560_000,
    "jozef": 540_000,
    "marcin": 520_000,
    "marek": 500_000,
    "michal": 480_000,
    "grzegorz": 460_000,
    "jerzy": 440_000,
    "tadeusz": 420_000,
    "adam": 400_000,
    "lukasz": 380_000,
    "zbigniew": 360_000,
    "ryszard": 340_000,
    "dariusz": 320_000,
    "henryk": 300_000,
    "mariusz": 280_000,
    "kazimierz": 260_000,
    "wojciech": 240_000,
    "robert": 220_000,
    "mateusz": 200_000,
    "jakub": 180_000,
    "rafal": 170_000,
    "kamil": 160_000,
    "maciej": 150_000,
    "szymon": 145_000,
    # --- Female given names, most common first (estimated incidence) ---
    "maria": 1_200_000,
    "anna": 1_000_000,
    "katarzyna": 800_000,
    "malgorzata": 750_000,
    "agnieszka": 700_000,
    "barbara": 650_000,
    "ewa": 620_000,
    "krystyna": 600_000,
    "elzbieta": 580_000,
    "zofia": 560_000,
    "joanna": 540_000,
    "monika": 520_000,
    "jadwiga": 500_000,
    "teresa": 480_000,
    "danuta": 460_000,
    "irena": 440_000,
    "aleksandra": 420_000,
    "magdalena": 400_000,
    "dorota": 380_000,
    "beata": 360_000,
    "karolina": 340_000,
    "paulina": 320_000,
    "natalia": 300_000,
    "justyna": 280_000,
    "patrycja": 260_000,
 }
 # Lookup table: lowercase German given name -> estimated incidence.
 # Keys are written without umlauts ("jurgen", "gunter").
 GERMAN_FIRST_NAMES: Dict[str, int] = {
    # --- Male given names, most common first (estimated incidence) ---
    "peter": 1_100_000,
    "michael": 1_000_000,
    "thomas": 950_000,
    "wolfgang": 800_000,
    "klaus": 750_000,
    "hans": 700_000,
    "jurgen": 650_000,
    "dieter": 620_000,
    "helmut": 600_000,
    "werner": 580_000,
    "manfred": 560_000,
    "andreas": 540_000,
    "stefan": 520_000,
    "christian": 500_000,
    "frank": 480_000,
    "bernd": 460_000,
    "martin": 440_000,
    "matthias": 420_000,
    "uwe": 400_000,
    "ralf": 380_000,
    "karl": 360_000,
    "horst": 340_000,
    "gerhard": 320_000,
    "gunter": 300_000,
    "alexander": 280_000,
    "jan": 260_000,
    "markus": 240_000,
    "tobias": 220_000,
    "sebastian": 200_000,
    "daniel": 180_000,
    # --- Female given names, most common first (estimated incidence) ---
    "maria": 1_200_000,
    "ursula": 800_000,
    "monika": 750_000,
    "petra": 700_000,
    "sabine": 680_000,
    "renate": 660_000,
    "brigitte": 640_000,
    "helga": 620_000,
    "andrea": 600_000,
    "claudia": 580_000,
    "susanne": 560_000,
    "gabriele": 540_000,
    "birgit": 520_000,
    "angelika": 500_000,
    "heike": 480_000,
    "martina": 460_000,
    "karin": 440_000,
    "christine": 420_000,
    "anna": 400_000,
    "katharina": 380_000,
    "julia": 360_000,
    "stefanie": 340_000,
    "nicole": 320_000,
    "sandra": 300_000,
    "lisa": 280_000,
 }
 # =============================================================================
 # NAME DATABASE REGISTRY
@ -387,8 +704,7 @@ def get_first_name_database(country: str) -> Dict[str, int]:
    elif country in ("GB", "GBR", "UK", "UNITED KINGDOM", "ENGLAND"):
        return UK_FIRST_NAMES
    elif country in ("DE", "DEU", "GERMANY", "GERMAN"):
-        # German first names overlap with Dutch first names
+        return GERMAN_FIRST_NAMES
-        return DUTCH_FIRST_NAMES
    elif country in ("EG", "EGY", "EGYPT", "EGYPTIAN"):
        return ARABIC_FIRST_NAMES
    elif country in ("SA", "SAU", "SAUDI ARABIA", "SAUDI"):
@ -396,6 +712,14 @@ def get_first_name_database(country: str) -> Dict[str, int]:
    elif country in ("ID", "IDN", "INDONESIA", "INDONESIAN"):
        # Indonesia uses mix of Arabic and local names
        return ARABIC_FIRST_NAMES
    elif country in ("FR", "FRA", "FRANCE", "FRENCH"):
        return FRENCH_FIRST_NAMES
    elif country in ("ES", "ESP", "SPAIN", "SPANISH"):
        return SPANISH_FIRST_NAMES
    elif country in ("IT", "ITA", "ITALY", "ITALIAN"):
        return ITALIAN_FIRST_NAMES
    elif country in ("PL", "POL", "POLAND", "POLISH"):
        return POLISH_FIRST_NAMES
    else:
        return DUTCH_FIRST_NAMES  # Default
@ -754,7 +1078,7 @@ def main():
        print("NAME FREQUENCY DATA SOURCES")
        print("=" * 80)
-        for country in ["NL", "US", "BE", "GB", "DE", "ID", "EG", "SA"]:
+        for country in ["NL", "US", "BE", "GB", "DE", "ID", "EG", "SA", "FR", "ES", "IT", "PL"]:
            surnames, source, total = load_surname_data(country)
            sorted_names = get_sorted_surnames(country)
            top_10 = sorted_names[:10]
@ -819,6 +1143,30 @@ def main():
            ("Ahmed Alghamdi", "SA"),
            ("Khalid Alharbi", "SA"),
            ("Fatima Alshehri", "SA"),
            # French names
            ("Jean Martin", "FR"),
            ("Marie Dubois", "FR"),
            ("Pierre Bernard", "FR"),
            ("Sophie Petit", "FR"),
            # Spanish names
            ("José García", "ES"),
            ("María Fernández", "ES"),
            ("Antonio López", "ES"),
            ("Carmen Rodríguez", "ES"),
            # Italian names
            ("Giuseppe Rossi", "IT"),
            ("Maria Russo", "IT"),
            ("Marco Ferrari", "IT"),
            ("Giulia Esposito", "IT"),
            # Polish names
            ("Jan Nowak", "PL"),
            ("Anna Kowalski", "PL"),
            ("Piotr Wiśniewski", "PL"),
            ("Maria Wójcik", "PL"),
        ]
        print("=" * 80)