311 lines
9.7 KiB
YAML
311 lines
9.7 KiB
YAML
id: https://w3id.org/heritage/isil/nl/nan/mapping
|
||
name: dutch-national-archive-isil-csv-mapping
|
||
title: National Archive ISIL CSV to LinkML Mapping
|
||
description: >-
|
||
Field-by-field mapping documentation for converting Dutch National Archive
|
||
ISIL registry CSV (ISIL-codes_2025-11-06.csv) to LinkML HeritageCustodian schema.
|
||
|
||
version: 1.0.0
|
||
created: 2025-11-17
|
||
conversion_script: /scripts/convert_isil_csv_to_yaml.py
|
||
|
||
source:
|
||
file: data/isil/nl/nan/ISIL-codes_2025-11-06.csv
|
||
encoding: latin-1
|
||
format: malformed_csv
|
||
total_records: 371
|
||
unique_cities: 201
|
||
date_range: 2008-10-10 to 2025-09-18
|
||
authority: Nationaal Archief (National Archive of the Netherlands)
|
||
url: https://www.nationaalarchief.nl/isil
|
||
|
||
target:
|
||
file: data/isil/nl/nan/ISIL-codes_2025-11-06.yaml
|
||
schema: schemas/heritage_custodian.yaml
|
||
format: LinkML-compliant YAML
|
||
|
||
csv_structure:
|
||
encoding_issue: >-
|
||
File uses latin-1 encoding. All fields stored in single CSV cell with
|
||
'","' delimiter. Header row includes sequence number as first field.
|
||
parsing_strategy: >-
|
||
Split on '","' pattern, strip quotes and semicolons, extract actual
|
||
field values from indices 1-6 (skipping sequence number at index 0).
|
||
header_row: ['', 'Volgnr', 'Plaats', 'Instelling', 'ISIL code', 'Toegekend op', 'Opmerking']
|
||
data_rows: 371
|
||
|
||
field_mappings:
|
||
|
||
# CSV Column 1: Sequential number
|
||
- csv_field: Volgnr
|
||
csv_index: 1
|
||
yaml_field: csv_row_number
|
||
data_type: integer
|
||
required: true
|
||
examples:
|
||
- csv_value: "1"
|
||
yaml_value: 1
|
||
- csv_value: "371"
|
||
yaml_value: 371
|
||
notes: >-
|
||
Sequential row number (1-371). Preserved for CSV traceability.
|
||
|
||
# CSV Column 2: City/location
|
||
- csv_field: Plaats
|
||
csv_index: 2
|
||
yaml_field: csv_plaats
|
||
data_type: string
|
||
required: true
|
||
examples:
|
||
- csv_value: Amsterdam
|
||
yaml_value: Amsterdam
|
||
- csv_value: Den Haag
|
||
yaml_value: Den Haag
|
||
mappings:
|
||
- target_field: locations[0].city
|
||
transformation: direct_copy
|
||
- target_field: locations[0].country
|
||
transformation: constant
|
||
value: NL
|
||
notes: >-
|
||
City name. Mapped to Location.city. Country code "NL" added automatically.
|
||
201 unique cities in dataset.
|
||
|
||
# CSV Column 3: Institution name
|
||
- csv_field: Instelling
|
||
csv_index: 3
|
||
yaml_field: csv_instelling
|
||
data_type: string
|
||
required: true
|
||
examples:
|
||
- csv_value: Rijksmuseum
|
||
yaml_value: Rijksmuseum
|
||
- csv_value: Stadsarchief Amsterdam
|
||
yaml_value: Stadsarchief Amsterdam
|
||
- csv_value: KB, nationale bibliotheek
|
||
yaml_value: KB, nationale bibliotheek
|
||
mappings:
|
||
- target_field: name
|
||
transformation: direct_copy
|
||
notes: >-
|
||
Institution name. Maps directly to HeritageCustodian.name.
|
||
No normalization or cleaning applied (preserves original spelling).
|
||
|
||
# CSV Column 4: ISIL code
|
||
- csv_field: ISIL code
|
||
csv_index: 4
|
||
yaml_field: csv_isil_code
|
||
data_type: string
|
||
required: true
|
||
pattern: "^NL-[A-Za-z0-9]+"
|
||
examples:
|
||
- csv_value: NL-AsdRM
|
||
yaml_value: NL-AsdRM
|
||
- csv_value: NL-HaNa
|
||
yaml_value: NL-HaNa
|
||
- csv_value: NL-LlsBatavialand
|
||
yaml_value: NL-LlsBatavialand
|
||
mappings:
|
||
- target_field: identifiers[0].identifier_scheme
|
||
transformation: constant
|
||
value: ISIL
|
||
- target_field: identifiers[0].identifier_value
|
||
transformation: direct_copy
|
||
- target_field: identifiers[0].identifier_url
|
||
transformation: url_construction
|
||
template: https://isil.org/{csv_isil_code}
|
||
notes: >-
|
||
ISIL code with semantic encoding (city abbreviation + institution abbreviation).
|
||
Variable length: 7-17 characters. Always starts with "NL-".
|
||
Mapped to Identifier object with scheme, value, and URL.
|
||
|
||
# CSV Column 5: Assignment date
|
||
- csv_field: Toegekend op
|
||
csv_index: 5
|
||
yaml_field: csv_toegekend_op
|
||
data_type: date
|
||
required: false
|
||
examples:
|
||
- csv_value: "2013-03-07"
|
||
yaml_value: "2013-03-07"
|
||
- csv_value: "2025-09-18"
|
||
yaml_value: "2025-09-18"
|
||
- csv_value: ""
|
||
yaml_value: null
|
||
mappings:
|
||
- target_field: identifiers[0].assigned_date
|
||
transformation: date_parsing
|
||
format: YYYY-MM-DD
|
||
notes: >-
|
||
Date when ISIL code was assigned. Format: YYYY-MM-DD.
|
||
Range: 2008-10-10 to 2025-09-18.
|
||
Empty values converted to null (not all records have assignment dates).
|
||
|
||
# CSV Column 6: Remarks
|
||
- csv_field: Opmerking
|
||
csv_index: 6
|
||
yaml_field: csv_opmerking
|
||
data_type: string
|
||
required: false
|
||
examples:
|
||
- csv_value: "n.b. in 2020 ontstaan uit een fusie tussen het RHCL en Rijckheyt"
|
||
yaml_value: "n.b. in 2020 ontstaan uit een fusie tussen het RHCL en Rijckheyt"
|
||
- csv_value: "n.b. Per 2021-10-08 in onbruik a.g.v. naamswijziging / fusie"
|
||
yaml_value: "n.b. Per 2021-10-08 in onbruik a.g.v. naamswijziging / fusie"
|
||
- csv_value: ""
|
||
yaml_value: ""
|
||
mappings:
|
||
- target_field: description
|
||
transformation: conditional_formatting
|
||
condition: csv_opmerking is not empty
|
||
template: "Opmerking: {csv_opmerking}"
|
||
notes: >-
|
||
Organizational history notes (mergers, name changes, closures).
|
||
Present in 18/371 records (4.9%).
|
||
Contains valuable change event information (FOUNDING, MERGER, CLOSURE, NAME_CHANGE).
|
||
When present, formatted as "Opmerking: {value}" in description field.
|
||
|
||
provenance_mapping:
|
||
data_source:
|
||
value: ISIL_REGISTRY
|
||
enum: DataSource.ISIL_REGISTRY
|
||
|
||
data_tier:
|
||
value: TIER_1_AUTHORITATIVE
|
||
enum: DataTier.TIER_1_AUTHORITATIVE
|
||
rationale: Official registry maintained by National Archive
|
||
|
||
extraction_date:
|
||
source: system_timestamp
|
||
format: ISO 8601 with timezone
|
||
example: "2025-11-17T12:30:45.123456+00:00"
|
||
|
||
extraction_method:
|
||
value: "CSV to YAML conversion (National Archive ISIL codes)"
|
||
|
||
source_url:
|
||
value: https://www.nationaalarchief.nl/isil
|
||
|
||
confidence_score:
|
||
value: 1.0
|
||
rationale: Authoritative source, no inference or estimation
|
||
|
||
transformation_rules:
|
||
|
||
encoding_handling:
|
||
description: CSV uses latin-1 encoding, requires explicit encoding parameter
|
||
implementation: >-
|
||
with open(csv_path, 'r', encoding='latin-1') as f:
|
||
|
||
field_splitting:
|
||
description: All fields stored in single cell with '","' delimiter
|
||
implementation: >-
|
||
parts = row.split('","')
|
||
fields = [p.strip('";') for p in parts]
|
||
|
||
header_extraction:
|
||
description: Skip sequence number field at index 0
|
||
implementation: >-
|
||
headers = fields[1:] # Actual headers: Volgnr, Plaats, Instelling, etc.
|
||
|
||
date_parsing:
|
||
description: Convert date strings to date objects, handle empty values
|
||
implementation: >-
|
||
date_str = fields[5].strip()
|
||
assigned_date = datetime.fromisoformat(date_str) if date_str else None
|
||
|
||
isil_url_generation:
|
||
description: Construct ISIL URL from code value
|
||
implementation: >-
|
||
identifier_url = f"https://isil.org/{isil_code}"
|
||
|
||
description_generation:
|
||
description: Create description from remarks when present
|
||
implementation: >-
|
||
description = f"Opmerking: {remark}" if remark else None
|
||
|
||
data_quality:
|
||
field_preservation: 100%
|
||
total_fields: 2226 # 371 records × 6 fields
|
||
preserved_fields: 2226
|
||
validation_errors: 0
|
||
|
||
missing_values:
|
||
csv_toegekend_op: >-
|
||
Some records lack assignment dates (converted to null)
|
||
csv_opmerking: >-
|
||
353 records have empty remarks (95.1%)
|
||
|
||
data_completeness:
|
||
csv_row_number: 100% # 371/371
|
||
csv_plaats: 100% # 371/371
|
||
csv_instelling: 100% # 371/371
|
||
csv_isil_code: 100% # 371/371
|
||
csv_toegekend_op: ~95% # Most records have dates
|
||
csv_opmerking: 4.9% # 18/371
|
||
|
||
organizational_change_events:
|
||
description: >-
|
||
18 records contain organizational history in csv_opmerking field.
|
||
These can be extracted as ChangeEvent objects in future processing.
|
||
|
||
event_types_detected:
|
||
MERGER: >-
|
||
"fusie tussen", "samenvoeging"
|
||
Examples: RHCL-Rijckheyt merger (2020), Zaanstreek-Waterland merger (2014)
|
||
|
||
NAME_CHANGE: >-
|
||
"naamswijziging", "hernoemd", "nieuwe naam"
|
||
|
||
CLOSURE: >-
|
||
"in onbruik", "gesloten", "opgeheven"
|
||
|
||
RELOCATION: >-
|
||
"verhuisd naar", "overgebracht naar"
|
||
|
||
future_processing: >-
|
||
Recommend NLP extraction or manual tagging to populate
|
||
HeritageCustodian.change_history field with structured ChangeEvent objects.
|
||
|
||
statistics:
|
||
total_records: 371
|
||
unique_cities: 201
|
||
unique_isil_codes: 371 # All unique (no duplicates)
|
||
|
||
top_cities:
|
||
- city: Den Haag
|
||
count: 38
|
||
percentage: 10.2%
|
||
- city: Amsterdam
|
||
count: 29
|
||
percentage: 7.8%
|
||
- city: Deventer
|
||
count: 11
|
||
percentage: 3.0%
|
||
- city: Groningen
|
||
count: 10
|
||
percentage: 2.7%
|
||
|
||
isil_code_length:
|
||
min: 7
|
||
max: 17
|
||
mean: 10.3
|
||
note: Variable length due to semantic encoding (city+institution abbreviations)
|
||
|
||
assignment_date_distribution:
|
||
earliest: 2008-10-10
|
||
latest: 2025-09-18
|
||
peak_year: 2013 # Most codes assigned during initial registration campaign
|
||
|
||
validation:
|
||
schema_validation: linkml-validate (schemas/heritage_custodian.yaml)
|
||
field_count_check: All 371 records have 6 fields preserved
|
||
isil_pattern_check: All codes match ^NL-[A-Za-z0-9]+
|
||
date_format_check: All non-empty dates parse as ISO 8601 (YYYY-MM-DD)
|
||
no_duplicates: All ISIL codes are unique
|
||
|
||
related_documentation:
|
||
conversion_report: /docs/ISIL_CSV_TO_YAML_CONVERSION_REPORT.md
|
||
schema_definition: /data/isil/nl/nan/linkml/schema.yaml
|
||
source_csv: /data/isil/nl/nan/ISIL-codes_2025-11-06.csv
|
||
output_yaml: /data/isil/nl/nan/ISIL-codes_2025-11-06.yaml
|