glam/schemas/20251121/linkml/modules/classes/LinkedInProfile.yaml
kempersc 767fb8ca80 feat(schema): Add LinkedIn profile and person modeling schema
Person Identity Classes:
- PersonName: Full name modeling with components (given_name, surname_prefix,
  base_surname, patronym, initials) following Dutch naming conventions
- PersonConnection: Professional network connections with heritage relevance scoring
- ConnectionNetwork: Network-level analysis and statistics

LinkedIn Profile Schema:
- LinkedInProfile: Complete professional profile structure
- WorkExperience: Employment history with heritage institution detection
- EducationCredential: Academic background and qualifications
- LanguageProficiency: Language skills with ISO 639-1 codes

Supporting Classes:
- ExtractionMetadata: Provenance tracking for extracted profile data
- HeritageRelevance: GLAMORCUBESFIXPHDNT type scoring and classification

Slots (17 person-related slots):
- Name components: given_name, base_surname, surname_prefix, patronym, initials
- Identity: age, birth_date, birth_place, death_place, gender_identity, pronouns
- Professional: occupation, religion
- References: literal_name, name_specification, has_person_name, extraction_metadata

Enums:
- HeritageTypeEnum: GLAMORCUBESFIXPHDNT type codes for heritage relevance
2025-12-16 20:04:59 +01:00

612 lines
18 KiB
YAML

# LinkedIn Profile Class
# Complete LinkedIn profile extraction for person entity files
#
# Schema header: identity of this LinkML module, the CURIE prefixes it uses,
# and the modules it imports.
id: https://nde.nl/ontology/hc/class/LinkedInProfile
name: linkedin_profile_class
title: LinkedIn Profile Class
version: '1.0.0'

# CURIE prefix -> namespace expansions available to class_uri, slot_uri and
# mapping values in this module.
prefixes:
  linkml: https://w3id.org/linkml/
  hc: https://nde.nl/ontology/hc/
  schema: http://schema.org/
  foaf: http://xmlns.com/foaf/0.1/
  prov: http://www.w3.org/ns/prov#
  dct: http://purl.org/dc/terms/

# linkml:types is the LinkML built-in type library; the relative entries are
# sibling schema modules (their names match classes used as ranges below).
imports:
  - linkml:types
  - ../metadata
  - ./ExtractionMetadata
  - ./WorkExperience
  - ./EducationCredential
  - ./HeritageRelevance
  - ./LanguageProficiency

# Range applied to any slot that does not declare one explicitly.
default_range: string
# Class definitions for person LinkedIn profile files and their optional
# WhatsApp enrichment. Each class lists the slots it uses and refines the
# global slot definitions through slot_usage.
classes:
  # Root class (tree_root) for one person entity JSON file.
  LinkedInProfile:
    class_uri: schema:ProfilePage
    description: |
      Complete LinkedIn profile extraction for a person.
      Models the content of person entity JSON files stored at
      `data/custodian/person/entity/*.json`. This is the root class
      for LinkedIn profile data extracted via Exa API or HTML parsing.

      **Relationship to PersonObservation**:
      - PersonObservation.linkedin_profile_path references the file containing
        this LinkedInProfile data
      - PersonObservation.linkedin_profile_url links to the source URL
      - This class models the CONTENT of that file

      **Relationship to SocialMediaProfile**:
      - SocialMediaProfile (in same schema) is for CUSTODIAN social media accounts
        (e.g., Rijksmuseum's Instagram, Nationaal Archief's Twitter)
      - LinkedInProfile is for PERSON LinkedIn profiles (staff members)
      - These are complementary, not overlapping classes

      **Data Flow**:
      ```
      LinkedIn URL → Exa API → JSON file → LinkedInProfile (this class)
      PersonObservation.linkedin_profile_path references this file
      ```

      **Example JSON Structure**:
      ```json
      {
        "extraction_metadata": {
          "source_file": "...",
          "staff_id": "org_staff_0001_name",
          "extraction_date": "2025-12-12T22:00:00Z",
          "extraction_method": "exa_crawling_exa",
          "extraction_agent": "claude-opus-4.5",
          "linkedin_url": "https://www.linkedin.com/in/...",
          "cost_usd": 0.001
        },
        "profile_data": {
          "name": "Sander Hulleman",
          "headline": "Stafadviseur PO",
          "location": "Arnhem, Gelderland, Netherlands",
          "connections": "246 connections • 248 followers",
          "about": "Third year student...",
          "experience": [...],
          "education": [...],
          "skills": ["education"],
          "languages": ["English - Native or bilingual"],
          "profile_image_url": "https://media.licdn.com/..."
        },
        "heritage_relevance": {
          "is_heritage_relevant": true,
          "heritage_types": ["E"],
          "rationale": "Education sector professional..."
        },
        "source_organization": "the-dutch-inspectorate-of-education",
        "whatsapp_enrichment": {...}
      }
      ```
    tree_root: true
    exact_mappings:
      - schema:ProfilePage
    close_mappings:
      - foaf:PersonalProfileDocument
      - schema:Person
    slots:
      - extraction_metadata
      - profile_data
      - heritage_relevance
      - source_organization
      - whatsapp_enrichment
    slot_usage:
      extraction_metadata:
        description: |
          Provenance metadata for the extraction activity.
          Records how, when, and by what agent this profile was extracted.
          See ExtractionMetadata class for field definitions.
        range: ExtractionMetadata
        required: true
        inlined: true
      profile_data:
        description: |
          Core profile data extracted from LinkedIn.
          Contains personal info, career history, education, skills, languages.
          See LinkedInProfileData class for field definitions.
        range: LinkedInProfileData
        required: true
        inlined: true
      heritage_relevance:
        description: |
          Classification of this person's relevance to heritage sectors.
          See HeritageRelevance class for scoring guidelines.
        range: HeritageRelevance
        inlined: true
      source_organization:
        description: |
          Slug identifier of the organization from which this profile was discovered.
          Matches the custodian slug used in staff list parsing.
          Format: lowercase with hyphens (e.g., "rijksmuseum", "nationaal-archief")
        slot_uri: prov:wasInfluencedBy
        range: string
        # Lowercase letters, digits and hyphens only.
        pattern: '^[a-z0-9-]+$'
        examples:
          - value: 'the-dutch-inspectorate-of-education'
            description: 'Organization where person was discovered as staff'
          - value: 'rijksmuseum'
            description: 'Heritage institution employer'
      whatsapp_enrichment:
        description: |
          Optional WhatsApp business likelihood enrichment.
          Added by enrichment scripts to assess digital communication capabilities.
        range: WhatsAppEnrichment
        inlined: true
    comments:
      - 'This is the root class for person entity JSON files'
      - 'PersonObservation.linkedin_profile_path references files containing this data'
      - 'See AGENTS.md Rule 20 for person entity file requirements'
      - 'See AGENTS.md Rule 27 for person-custodian data architecture'
    see_also:
      - 'https://schema.org/ProfilePage'
      - 'https://nde.nl/ontology/hc/class/PersonObservation'
      - 'https://nde.nl/ontology/hc/class/SocialMediaProfile'

  # Payload of LinkedInProfile.profile_data: the person-level fields.
  LinkedInProfileData:
    class_uri: schema:Person
    description: |
      Core profile data extracted from a LinkedIn profile.
      Contains the person's professional information including:
      - Basic info (name, headline, location, connections)
      - About/summary text
      - Career history (experience array)
      - Education history (education array)
      - Skills and languages
      - Profile image URL

      **Note on Data Representation**:
      - Raw strings are preserved for provenance (e.g., connections text)
      - Nested objects use defined classes (WorkExperience, EducationCredential)
      - Skills are simple strings (not structured objects)
      - Languages may be raw strings or LanguageProficiency objects
    exact_mappings:
      - schema:Person
    close_mappings:
      - foaf:Person
    slots:
      - profile_name
      - profile_linkedin_url
      - headline
      - profile_location
      - connections_text
      - about_text
      - experience
      - education
      - skills
      - languages_raw
      - languages
      - profile_image_url
    slot_usage:
      profile_name:
        description: |
          Full name of the person as displayed on LinkedIn.
        slot_uri: schema:name
        range: string
        required: true
        examples:
          - value: 'Sander Hulleman'
          - value: 'Jan van der Berg'
      profile_linkedin_url:
        description: |
          LinkedIn profile URL for this person.
          Duplicated from extraction_metadata for convenience.
        slot_uri: schema:url
        range: uri
        # NOTE(review): only lowercase ASCII slugs on www.linkedin.com match;
        # percent-encoded slugs would be rejected - confirm this is intended.
        pattern: '^https://www\.linkedin\.com/in/[a-z0-9-]+/?$'
        examples:
          - value: 'https://www.linkedin.com/in/sander-hulleman-5017b9105'
      headline:
        description: |
          Professional headline/tagline from LinkedIn.
          Typically includes current job title and/or professional identity.
        slot_uri: schema:jobTitle
        range: string
        examples:
          - value: 'Stafadviseur PO'
            description: 'Dutch job title'
          - value: 'Senior Curator | Rijksmuseum'
            description: 'Title with organization'
          - value: 'Digital Archivist | Heritage Data Specialist'
            description: 'Multiple roles'
      profile_location:
        description: |
          Location as displayed on LinkedIn profile.
          Format varies: "City, Region, Country" or "City, Country"
        slot_uri: schema:homeLocation
        range: string
        examples:
          - value: 'Arnhem, Gelderland, Netherlands'
          - value: 'Amsterdam, Netherlands'
      connections_text:
        description: |
          Raw connections/followers text from LinkedIn.
          Format: "X connections • Y followers"
          Preserved as-is for provenance.
        # NOTE(review): same slot_uri as about_text - confirm this is intended.
        slot_uri: schema:description
        range: string
        examples:
          - value: '246 connections • 248 followers'
          - value: '500+ connections'
      about_text:
        description: |
          About/summary section text from LinkedIn profile.
          May be absent if person hasn't written a summary.
        slot_uri: schema:description
        range: string
        examples:
          - value: 'Third year student at Stenden University...'
      experience:
        description: |
          Work experience entries from LinkedIn.
          Array of WorkExperience objects with job title, company, dates, location.
        range: WorkExperience
        multivalued: true
        inlined_as_list: true
      education:
        description: |
          Education entries from LinkedIn.
          Array of EducationCredential objects with school, degree, years.
        range: EducationCredential
        multivalued: true
        inlined_as_list: true
      skills:
        description: |
          Skills listed on LinkedIn profile.
          Simple string array (not structured objects).
        slot_uri: schema:knowsAbout
        range: string
        multivalued: true
        # An example's `value` is a single string in the LinkML metamodel, so
        # each entry shows one element of the list rather than a whole list.
        examples:
          - value: 'education'
          - value: 'teaching'
          - value: 'curriculum development'
      languages_raw:
        description: |
          Raw language strings as extracted from LinkedIn.
          Format: "Language - Proficiency level"
          Use this when storing unprocessed data.
        range: string
        multivalued: true
        # One example per list element (an example `value` is a single string).
        examples:
          - value: 'English - Native or bilingual'
          - value: 'Dutch - Native or bilingual'
      languages:
        description: |
          Parsed language proficiency entries.
          Array of LanguageProficiency objects with language name, code, level.
          Use this when storing processed/structured data.
        range: LanguageProficiency
        multivalued: true
        inlined_as_list: true
      profile_image_url:
        description: |
          URL to the LinkedIn profile photo.
          Should be the actual CDN URL (media.licdn.com), not overlay page.
          See AGENTS.md Rule 16 for photo URL requirements.
        slot_uri: schema:image
        range: uri
        # Accepts only https URLs on the media.licdn.com host.
        pattern: '^https://media\.licdn\.com/.*$'
        examples:
          - value: 'https://media.licdn.com/dms/image/v2/C4E03AQHoGyR6G0kphA/profile-displayphoto-shrink_200_200/...'
    comments:
      - 'Inlined within LinkedInProfile as profile_data'
      - 'experience and education use inlined_as_list for JSON array representation'
      - 'languages_raw preserves original strings; languages has parsed objects'
      - 'profile_image_url must be CDN URL per AGENTS.md Rule 16'

  # Optional enrichment attached to LinkedInProfile.whatsapp_enrichment.
  WhatsAppEnrichment:
    class_uri: hc:WhatsAppEnrichment
    description: |
      WhatsApp business likelihood enrichment data.
      Added by enrichment scripts to assess whether a person is likely
      to use WhatsApp for professional/business communication.

      **Assessment Factors**:
      - Digital technology indicators in profile
      - Role type (customer-facing, technical, etc.)
      - Industry/sector norms
      - Geographic region (WhatsApp prevalence varies)
    slots:
      - digital_professional
      - whatsapp_business_likelihood
      - enrichment_metadata_whatsapp
    slot_usage:
      digital_professional:
        description: |
          Assessment of digital/technology proficiency.
        range: DigitalProfessionalAssessment
        inlined: true
      whatsapp_business_likelihood:
        description: |
          Likelihood score for WhatsApp business usage.
        range: WhatsAppLikelihood
        inlined: true
      enrichment_metadata_whatsapp:
        description: |
          Metadata about the enrichment process.
        range: WhatsAppEnrichmentMetadata
        inlined: true

  # Range of WhatsAppEnrichment.digital_professional.
  DigitalProfessionalAssessment:
    class_uri: hc:DigitalProfessionalAssessment
    description: |
      Assessment of a person's digital/technology proficiency.
    slots:
      - likely_whatsapp_proficient
      - digital_indicators
      - digital_confidence
    slot_usage:
      likely_whatsapp_proficient:
        description: 'Whether person is likely proficient with WhatsApp'
        range: boolean
      digital_indicators:
        description: 'Indicators of digital proficiency from profile'
        range: string
        multivalued: true
      digital_confidence:
        # Free-text string; the listed levels are not enforced by the schema.
        description: 'Confidence level: low, medium, high'
        range: string

  # Range of WhatsAppEnrichment.whatsapp_business_likelihood.
  WhatsAppLikelihood:
    class_uri: hc:WhatsAppLikelihood
    description: |
      Likelihood score for WhatsApp business usage.
    slots:
      - likelihood_score
      - max_likelihood_score
      - likelihood_level
      - likelihood_confidence
      - likelihood_factors
      - assessment_date
    slot_usage:
      likelihood_score:
        description: 'Numeric score (0-100)'
        range: integer
        minimum_value: 0
        maximum_value: 100
      max_likelihood_score:
        description: 'Maximum possible score (typically 100)'
        range: integer
      likelihood_level:
        # Free-text string; the listed levels are not enforced by the schema.
        description: 'Categorical level: low, medium, high'
        range: string
      likelihood_confidence:
        description: 'Confidence in the assessment (0.0-1.0)'
        range: float
        minimum_value: 0.0
        maximum_value: 1.0
      likelihood_factors:
        description: 'Factors contributing to the score'
        range: string
        multivalued: true
      assessment_date:
        description: 'When the assessment was performed (ISO 8601)'
        range: datetime

  # Range of WhatsAppEnrichment.enrichment_metadata_whatsapp.
  WhatsAppEnrichmentMetadata:
    class_uri: hc:WhatsAppEnrichmentMetadata
    description: |
      Metadata about the WhatsApp enrichment process.
    slots:
      - enriched_date
      - enrichment_method_whatsapp
      - data_source_whatsapp
      - no_fabrication
      - all_data_real
    slot_usage:
      enriched_date:
        description: 'When enrichment was performed (ISO 8601)'
        range: datetime
      enrichment_method_whatsapp:
        description: 'Method used for enrichment'
        range: string
        examples:
          - value: 'linkedin_profile_analysis'
      data_source_whatsapp:
        description: 'Source of data for enrichment'
        range: string
        examples:
          - value: 'public_linkedin_profile'
      no_fabrication:
        description: 'Confirms no data was fabricated'
        range: boolean
      all_data_real:
        description: 'Confirms all data is from real sources'
        range: boolean
# Top-level slot definitions
# Global declarations (description, range, multivalued) for every slot named
# in a class `slots:` list in this module. Grouped below by the class that
# lists the slot.
slots:
  # -- LinkedInProfile --
  extraction_metadata:
    description: 'Provenance metadata for the extraction activity'
    range: ExtractionMetadata
  profile_data:
    description: 'Core profile data from LinkedIn'
    range: LinkedInProfileData
  heritage_relevance:
    description: 'Heritage sector classification'
    range: HeritageRelevance
  source_organization:
    description: 'Organization slug where person was discovered'
    range: string
  whatsapp_enrichment:
    description: 'WhatsApp business likelihood enrichment'
    range: WhatsAppEnrichment

  # -- LinkedInProfileData --
  profile_name:
    description: 'Full name of the person'
    range: string
  profile_linkedin_url:
    description: 'LinkedIn profile URL'
    range: uri
  headline:
    description: 'Professional headline/tagline'
    range: string
  profile_location:
    description: 'Location as displayed on profile'
    range: string
  connections_text:
    description: 'Raw connections/followers text'
    range: string
  about_text:
    description: 'About/summary section text'
    range: string
  experience:
    description: 'Work experience entries'
    range: WorkExperience
    multivalued: true
  education:
    description: 'Education entries'
    range: EducationCredential
    multivalued: true
  skills:
    description: 'Skills listed on profile'
    range: string
    multivalued: true
  languages_raw:
    description: 'Raw language strings'
    range: string
    multivalued: true
  languages:
    description: 'Parsed language proficiency entries'
    range: LanguageProficiency
    multivalued: true
  profile_image_url:
    description: 'Profile photo URL'
    range: uri

  # -- WhatsAppEnrichment --
  digital_professional:
    description: 'Digital proficiency assessment'
    range: DigitalProfessionalAssessment
  whatsapp_business_likelihood:
    description: 'WhatsApp business usage likelihood'
    range: WhatsAppLikelihood
  enrichment_metadata_whatsapp:
    description: 'WhatsApp enrichment metadata'
    range: WhatsAppEnrichmentMetadata

  # -- DigitalProfessionalAssessment --
  likely_whatsapp_proficient:
    description: 'Whether person is likely WhatsApp proficient'
    range: boolean
  digital_indicators:
    description: 'Indicators of digital proficiency'
    range: string
    multivalued: true
  digital_confidence:
    description: 'Digital proficiency confidence level'
    range: string

  # -- WhatsAppLikelihood --
  likelihood_score:
    description: 'Numeric likelihood score'
    range: integer
  max_likelihood_score:
    description: 'Maximum possible score'
    range: integer
  likelihood_level:
    description: 'Categorical likelihood level'
    range: string
  likelihood_confidence:
    description: 'Confidence in the assessment'
    range: float
  likelihood_factors:
    description: 'Factors contributing to score'
    range: string
    multivalued: true
  assessment_date:
    description: 'When assessment was performed'
    range: datetime

  # -- WhatsAppEnrichmentMetadata --
  enriched_date:
    description: 'When enrichment was performed'
    range: datetime
  enrichment_method_whatsapp:
    description: 'Method used for enrichment'
    range: string
  data_source_whatsapp:
    description: 'Data source for enrichment'
    range: string
  no_fabrication:
    description: 'Confirms no data was fabricated'
    range: boolean
  all_data_real:
    description: 'Confirms all data is real'
    range: boolean