feat: Add legal form filtering rule for CustodianName

- Introduced LEGAL-FORM-FILTER rule to standardize CustodianName by removing legal form designations.
- Documented rationale, examples, and implementation guidelines for the filtering process.

docs: Create README for value standardization rules

- Established a comprehensive README outlining various value standardization rules applicable to Heritage Custodian classes.
- Categorized rules into Name Standardization, Geographic Standardization, Web Observation, and Schema Evolution.

feat: Implement transliteration standards for non-Latin scripts

- Added TRANSLIT-ISO rule to ensure GHCID abbreviations are generated from emic names using ISO standards for transliteration.
- Included detailed guidelines for various scripts and languages, along with implementation examples.

feat: Define XPath provenance rules for web observations

- Created XPATH-PROVENANCE rule mandating XPath pointers for claims extracted from web sources.
- Established a workflow for archiving websites and verifying claims against archived HTML.

chore: Update records lifecycle diagram

- Generated a new Mermaid diagram illustrating the records lifecycle for heritage custodians.
- Included phases for active records, inactive archives, and processed heritage collections, along with key relationships and classifications.
This commit is contained in:
kempersc 2025-12-09 16:58:41 +01:00
parent 7b42d720d5
commit 3a6ead8fde
64 changed files with 18017 additions and 466 deletions

View file

@ -14921,6 +14921,7 @@ hypernym:
rico:
- label: recordSetTypes
- label: Q112796578
class: True
hypernym:
- archive
type:
@ -15176,6 +15177,7 @@ hypernym:
rico:
- label: recordSetTypes
- label: Q3621648
class: True
hypernym:
- archive
type:
@ -15281,6 +15283,7 @@ hypernym:
type:
- A
- label: Q9854379
class: True
country:
- Portugal
hypernym:
@ -15714,6 +15717,7 @@ hypernym:
rico:
- label: recordSetTypes
- label: Q11906844
class: True
hypernym:
- archive
type:
@ -15845,6 +15849,7 @@ hypernym:
type:
- D
- label: Q5177943
class: True
hypernym:
- archive
type:

View file

@ -19,6 +19,7 @@
"@types/dagre": "^0.7.53",
"@types/js-yaml": "^4.0.9",
"@types/lodash": "^4.17.20",
"@types/three": "^0.181.0",
"@uiw/react-codemirror": "^4.25.3",
"axios": "^1.13.2",
"chevrotain-allstar": "^0.3.1",
@ -34,7 +35,7 @@
"lodash": "^4.17.21",
"lucide-react": "^0.554.0",
"maplibre-gl": "^5.14.0",
"mermaid": "^11.12.1",
"mermaid": "^11.12.2",
"n3": "^1.26.0",
"react": "^19.2.0",
"react-dom": "^19.2.0",
@ -43,6 +44,8 @@
"rehype-raw": "^7.0.0",
"rehype-sanitize": "^6.0.0",
"remark-gfm": "^4.0.1",
"three": "^0.181.2",
"umap-js": "^1.4.0",
"zustand": "^5.0.8"
},
"devDependencies": {
@ -605,9 +608,9 @@
}
},
"node_modules/@codemirror/view": {
"version": "6.38.8",
"resolved": "https://registry.npmjs.org/@codemirror/view/-/view-6.38.8.tgz",
"integrity": "sha512-XcE9fcnkHCbWkjeKyi0lllwXmBLtyYb5dt89dJyx23I9+LSh5vZDIuk7OLG4VM1lgrXZQcY6cxyZyk5WVPRv/A==",
"version": "6.39.1",
"resolved": "https://registry.npmjs.org/@codemirror/view/-/view-6.39.1.tgz",
"integrity": "sha512-yxpbDf9JwUgLVuAzOS1r0upM+f482FCYkcc+ZbJ34SGBppKL26giehibMEX+nAzLonlrJYiFi9zrftGDrO4mrQ==",
"license": "MIT",
"dependencies": {
"@codemirror/state": "^6.5.0",
@ -712,9 +715,9 @@
}
},
"node_modules/@csstools/css-syntax-patches-for-csstree": {
"version": "1.0.20",
"resolved": "https://registry.npmjs.org/@csstools/css-syntax-patches-for-csstree/-/css-syntax-patches-for-csstree-1.0.20.tgz",
"integrity": "sha512-8BHsjXfSciZxjmHQOuVdW2b8WLUPts9a+mfL13/PzEviufUEW2xnvQuOlKs9dRBHgRqJ53SF/DUoK9+MZk72oQ==",
"version": "1.0.14",
"resolved": "https://registry.npmjs.org/@csstools/css-syntax-patches-for-csstree/-/css-syntax-patches-for-csstree-1.0.14.tgz",
"integrity": "sha512-zSlIxa20WvMojjpCSy8WrNpcZ61RqfTfX3XTaOeVlGJrt/8HF3YbzgFZa01yTbT4GWQLwfTcC3EB8i3XnB647Q==",
"dev": true,
"funding": [
{
@ -729,6 +732,9 @@
"license": "MIT-0",
"engines": {
"node": ">=18"
},
"peerDependencies": {
"postcss": "^8.4"
}
},
"node_modules/@csstools/css-tokenizer": {
@ -751,6 +757,12 @@
"node": ">=18"
}
},
"node_modules/@dimforge/rapier3d-compat": {
"version": "0.12.0",
"resolved": "https://registry.npmjs.org/@dimforge/rapier3d-compat/-/rapier3d-compat-0.12.0.tgz",
"integrity": "sha512-uekIGetywIgopfD97oDL5PfeezkFpNhwlzlaEYNOA0N6ghdsOvh/HYjSMek5Q2O1PYvRSDFcqFVJl4r4ZBwOow==",
"license": "Apache-2.0"
},
"node_modules/@duckdb/duckdb-wasm": {
"version": "1.30.0",
"resolved": "https://registry.npmjs.org/@duckdb/duckdb-wasm/-/duckdb-wasm-1.30.0.tgz",
@ -1692,9 +1704,9 @@
}
},
"node_modules/@lezer/lr": {
"version": "1.4.4",
"resolved": "https://registry.npmjs.org/@lezer/lr/-/lr-1.4.4.tgz",
"integrity": "sha512-LHL17Mq0OcFXm1pGQssuGTQFPPdxARjKM8f7GA5+sGtHi0K3R84YaSbmche0+RKWHnCsx9asEe5OWOI4FHfe4A==",
"version": "1.4.5",
"resolved": "https://registry.npmjs.org/@lezer/lr/-/lr-1.4.5.tgz",
"integrity": "sha512-/YTRKP5yPPSo1xImYQk7AZZMAgap0kegzqCSYHjAL9x1AZ0ZQW+IpcEzMKagCsbTsLnVeWkxYrCNeXG8xEPrjg==",
"license": "MIT",
"dependencies": {
"@lezer/common": "^1.0.0"
@ -2098,9 +2110,9 @@
}
},
"node_modules/@rolldown/pluginutils": {
"version": "1.0.0-beta.47",
"resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.47.tgz",
"integrity": "sha512-8QagwMH3kNCuzD8EWL8R2YPW5e4OrHNSAHRFDdmFqEwEaD/KcNKjVoumo+gP2vW5eKB2UPbM6vTYiGZX0ixLnw==",
"version": "1.0.0-beta.53",
"resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.53.tgz",
"integrity": "sha512-vENRlFU4YbrwVqNDZ7fLvy+JR1CRkyr01jhSiDpE1u6py3OMzQfztQU2jxykW3ALNxO4kSlqIDeYyD0Y9RcQeQ==",
"dev": true,
"license": "MIT"
},
@ -2544,6 +2556,12 @@
"@testing-library/dom": ">=7.21.4"
}
},
"node_modules/@tweenjs/tween.js": {
"version": "23.1.3",
"resolved": "https://registry.npmjs.org/@tweenjs/tween.js/-/tween.js-23.1.3.tgz",
"integrity": "sha512-vJmvvwFxYuGnF2axRtPYocag6Clbb5YS7kLL+SO/TeVFzHqDIWrNKYtcsPMibjDx9O+bu+psAy9NKfWklassUA==",
"license": "MIT"
},
"node_modules/@types/aria-query": {
"version": "5.0.4",
"resolved": "https://registry.npmjs.org/@types/aria-query/-/aria-query-5.0.4.tgz",
@ -2979,9 +2997,9 @@
"license": "MIT"
},
"node_modules/@types/node": {
"version": "24.10.1",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.1.tgz",
"integrity": "sha512-GNWcUTRBgIRJD5zj+Tq0fKOJ5XZajIiBroOF0yvj2bSU1WvNdYS/dn9UxwsujGW4JX06dnHyjV2y9rRaybH0iQ==",
"version": "24.10.2",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.2.tgz",
"integrity": "sha512-WOhQTZ4G8xZ1tjJTvKOpyEVSGgOTvJAfDK3FNFgELyaTpzhdgHVHeqW8V+UJvzF5BT+/B54T/1S2K6gd9c7bbA==",
"dev": true,
"license": "MIT",
"dependencies": {
@ -3028,6 +3046,12 @@
"@types/react": "*"
}
},
"node_modules/@types/stats.js": {
"version": "0.17.4",
"resolved": "https://registry.npmjs.org/@types/stats.js/-/stats.js-0.17.4.tgz",
"integrity": "sha512-jIBvWWShCvlBqBNIZt0KAshWpvSjhkwkEu4ZUcASoAvhmrgAUI2t1dXrjSL4xXVLB4FznPrIsX3nKXFl/Dt4vA==",
"license": "MIT"
},
"node_modules/@types/supercluster": {
"version": "7.1.3",
"resolved": "https://registry.npmjs.org/@types/supercluster/-/supercluster-7.1.3.tgz",
@ -3037,6 +3061,21 @@
"@types/geojson": "*"
}
},
"node_modules/@types/three": {
"version": "0.181.0",
"resolved": "https://registry.npmjs.org/@types/three/-/three-0.181.0.tgz",
"integrity": "sha512-MLF1ks8yRM2k71D7RprFpDb9DOX0p22DbdPqT/uAkc6AtQXjxWCVDjCy23G9t1o8HcQPk7woD2NIyiaWcWPYmA==",
"license": "MIT",
"dependencies": {
"@dimforge/rapier3d-compat": "~0.12.0",
"@tweenjs/tween.js": "~23.1.3",
"@types/stats.js": "*",
"@types/webxr": "*",
"@webgpu/types": "*",
"fflate": "~0.8.2",
"meshoptimizer": "~0.22.0"
}
},
"node_modules/@types/trusted-types": {
"version": "2.0.7",
"resolved": "https://registry.npmjs.org/@types/trusted-types/-/trusted-types-2.0.7.tgz",
@ -3050,19 +3089,24 @@
"integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==",
"license": "MIT"
},
"node_modules/@types/webxr": {
"version": "0.5.24",
"resolved": "https://registry.npmjs.org/@types/webxr/-/webxr-0.5.24.tgz",
"integrity": "sha512-h8fgEd/DpoS9CBrjEQXR+dIDraopAEfu4wYVNY2tEPwk60stPWhvZMf4Foo5FakuQ7HFZoa8WceaWFervK2Ovg==",
"license": "MIT"
},
"node_modules/@typescript-eslint/eslint-plugin": {
"version": "8.48.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-8.48.1.tgz",
"integrity": "sha512-X63hI1bxl5ohelzr0LY5coufyl0LJNthld+abwxpCoo6Gq+hSqhKwci7MUWkXo67mzgUK6YFByhmaHmUcuBJmA==",
"version": "8.49.0",
"resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-8.49.0.tgz",
"integrity": "sha512-JXij0vzIaTtCwu6SxTh8qBc66kmf1xs7pI4UOiMDFVct6q86G0Zs7KRcEoJgY3Cav3x5Tq0MF5jwgpgLqgKG3A==",
"dev": true,
"license": "MIT",
"dependencies": {
"@eslint-community/regexpp": "^4.10.0",
"@typescript-eslint/scope-manager": "8.48.1",
"@typescript-eslint/type-utils": "8.48.1",
"@typescript-eslint/utils": "8.48.1",
"@typescript-eslint/visitor-keys": "8.48.1",
"graphemer": "^1.4.0",
"@typescript-eslint/scope-manager": "8.49.0",
"@typescript-eslint/type-utils": "8.49.0",
"@typescript-eslint/utils": "8.49.0",
"@typescript-eslint/visitor-keys": "8.49.0",
"ignore": "^7.0.0",
"natural-compare": "^1.4.0",
"ts-api-utils": "^2.1.0"
@ -3075,22 +3119,22 @@
"url": "https://opencollective.com/typescript-eslint"
},
"peerDependencies": {
"@typescript-eslint/parser": "^8.48.1",
"@typescript-eslint/parser": "^8.49.0",
"eslint": "^8.57.0 || ^9.0.0",
"typescript": ">=4.8.4 <6.0.0"
}
},
"node_modules/@typescript-eslint/parser": {
"version": "8.48.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-8.48.1.tgz",
"integrity": "sha512-PC0PDZfJg8sP7cmKe6L3QIL8GZwU5aRvUFedqSIpw3B+QjRSUZeeITC2M5XKeMXEzL6wccN196iy3JLwKNvDVA==",
"version": "8.49.0",
"resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-8.49.0.tgz",
"integrity": "sha512-N9lBGA9o9aqb1hVMc9hzySbhKibHmB+N3IpoShyV6HyQYRGIhlrO5rQgttypi+yEeKsKI4idxC8Jw6gXKD4THA==",
"dev": true,
"license": "MIT",
"dependencies": {
"@typescript-eslint/scope-manager": "8.48.1",
"@typescript-eslint/types": "8.48.1",
"@typescript-eslint/typescript-estree": "8.48.1",
"@typescript-eslint/visitor-keys": "8.48.1",
"@typescript-eslint/scope-manager": "8.49.0",
"@typescript-eslint/types": "8.49.0",
"@typescript-eslint/typescript-estree": "8.49.0",
"@typescript-eslint/visitor-keys": "8.49.0",
"debug": "^4.3.4"
},
"engines": {
@ -3106,14 +3150,14 @@
}
},
"node_modules/@typescript-eslint/project-service": {
"version": "8.48.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/project-service/-/project-service-8.48.1.tgz",
"integrity": "sha512-HQWSicah4s9z2/HifRPQ6b6R7G+SBx64JlFQpgSSHWPKdvCZX57XCbszg/bapbRsOEv42q5tayTYcEFpACcX1w==",
"version": "8.49.0",
"resolved": "https://registry.npmjs.org/@typescript-eslint/project-service/-/project-service-8.49.0.tgz",
"integrity": "sha512-/wJN0/DKkmRUMXjZUXYZpD1NEQzQAAn9QWfGwo+Ai8gnzqH7tvqS7oNVdTjKqOcPyVIdZdyCMoqN66Ia789e7g==",
"dev": true,
"license": "MIT",
"dependencies": {
"@typescript-eslint/tsconfig-utils": "^8.48.1",
"@typescript-eslint/types": "^8.48.1",
"@typescript-eslint/tsconfig-utils": "^8.49.0",
"@typescript-eslint/types": "^8.49.0",
"debug": "^4.3.4"
},
"engines": {
@ -3128,14 +3172,14 @@
}
},
"node_modules/@typescript-eslint/scope-manager": {
"version": "8.48.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-8.48.1.tgz",
"integrity": "sha512-rj4vWQsytQbLxC5Bf4XwZ0/CKd362DkWMUkviT7DCS057SK64D5lH74sSGzhI6PDD2HCEq02xAP9cX68dYyg1w==",
"version": "8.49.0",
"resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-8.49.0.tgz",
"integrity": "sha512-npgS3zi+/30KSOkXNs0LQXtsg9ekZ8OISAOLGWA/ZOEn0ZH74Ginfl7foziV8DT+D98WfQ5Kopwqb/PZOaIJGg==",
"dev": true,
"license": "MIT",
"dependencies": {
"@typescript-eslint/types": "8.48.1",
"@typescript-eslint/visitor-keys": "8.48.1"
"@typescript-eslint/types": "8.49.0",
"@typescript-eslint/visitor-keys": "8.49.0"
},
"engines": {
"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
@ -3146,9 +3190,9 @@
}
},
"node_modules/@typescript-eslint/tsconfig-utils": {
"version": "8.48.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/tsconfig-utils/-/tsconfig-utils-8.48.1.tgz",
"integrity": "sha512-k0Jhs4CpEffIBm6wPaCXBAD7jxBtrHjrSgtfCjUvPp9AZ78lXKdTR8fxyZO5y4vWNlOvYXRtngSZNSn+H53Jkw==",
"version": "8.49.0",
"resolved": "https://registry.npmjs.org/@typescript-eslint/tsconfig-utils/-/tsconfig-utils-8.49.0.tgz",
"integrity": "sha512-8prixNi1/6nawsRYxet4YOhnbW+W9FK/bQPxsGB1D3ZrDzbJ5FXw5XmzxZv82X3B+ZccuSxo/X8q9nQ+mFecWA==",
"dev": true,
"license": "MIT",
"engines": {
@ -3163,15 +3207,15 @@
}
},
"node_modules/@typescript-eslint/type-utils": {
"version": "8.48.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/type-utils/-/type-utils-8.48.1.tgz",
"integrity": "sha512-1jEop81a3LrJQLTf/1VfPQdhIY4PlGDBc/i67EVWObrtvcziysbLN3oReexHOM6N3jyXgCrkBsZpqwH0hiDOQg==",
"version": "8.49.0",
"resolved": "https://registry.npmjs.org/@typescript-eslint/type-utils/-/type-utils-8.49.0.tgz",
"integrity": "sha512-KTExJfQ+svY8I10P4HdxKzWsvtVnsuCifU5MvXrRwoP2KOlNZ9ADNEWWsQTJgMxLzS5VLQKDjkCT/YzgsnqmZg==",
"dev": true,
"license": "MIT",
"dependencies": {
"@typescript-eslint/types": "8.48.1",
"@typescript-eslint/typescript-estree": "8.48.1",
"@typescript-eslint/utils": "8.48.1",
"@typescript-eslint/types": "8.49.0",
"@typescript-eslint/typescript-estree": "8.49.0",
"@typescript-eslint/utils": "8.49.0",
"debug": "^4.3.4",
"ts-api-utils": "^2.1.0"
},
@ -3188,9 +3232,9 @@
}
},
"node_modules/@typescript-eslint/types": {
"version": "8.48.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-8.48.1.tgz",
"integrity": "sha512-+fZ3LZNeiELGmimrujsDCT4CRIbq5oXdHe7chLiW8qzqyPMnn1puNstCrMNVAqwcl2FdIxkuJ4tOs/RFDBVc/Q==",
"version": "8.49.0",
"resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-8.49.0.tgz",
"integrity": "sha512-e9k/fneezorUo6WShlQpMxXh8/8wfyc+biu6tnAqA81oWrEic0k21RHzP9uqqpyBBeBKu4T+Bsjy9/b8u7obXQ==",
"dev": true,
"license": "MIT",
"engines": {
@ -3202,16 +3246,16 @@
}
},
"node_modules/@typescript-eslint/typescript-estree": {
"version": "8.48.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-8.48.1.tgz",
"integrity": "sha512-/9wQ4PqaefTK6POVTjJaYS0bynCgzh6ClJHGSBj06XEHjkfylzB+A3qvyaXnErEZSaxhIo4YdyBgq6j4RysxDg==",
"version": "8.49.0",
"resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-8.49.0.tgz",
"integrity": "sha512-jrLdRuAbPfPIdYNppHJ/D0wN+wwNfJ32YTAm10eJVsFmrVpXQnDWBn8niCSMlWjvml8jsce5E/O+86IQtTbJWA==",
"dev": true,
"license": "MIT",
"dependencies": {
"@typescript-eslint/project-service": "8.48.1",
"@typescript-eslint/tsconfig-utils": "8.48.1",
"@typescript-eslint/types": "8.48.1",
"@typescript-eslint/visitor-keys": "8.48.1",
"@typescript-eslint/project-service": "8.49.0",
"@typescript-eslint/tsconfig-utils": "8.49.0",
"@typescript-eslint/types": "8.49.0",
"@typescript-eslint/visitor-keys": "8.49.0",
"debug": "^4.3.4",
"minimatch": "^9.0.4",
"semver": "^7.6.0",
@ -3230,16 +3274,16 @@
}
},
"node_modules/@typescript-eslint/utils": {
"version": "8.48.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/utils/-/utils-8.48.1.tgz",
"integrity": "sha512-fAnhLrDjiVfey5wwFRwrweyRlCmdz5ZxXz2G/4cLn0YDLjTapmN4gcCsTBR1N2rWnZSDeWpYtgLDsJt+FpmcwA==",
"version": "8.49.0",
"resolved": "https://registry.npmjs.org/@typescript-eslint/utils/-/utils-8.49.0.tgz",
"integrity": "sha512-N3W7rJw7Rw+z1tRsHZbK395TWSYvufBXumYtEGzypgMUthlg0/hmCImeA8hgO2d2G4pd7ftpxxul2J8OdtdaFA==",
"dev": true,
"license": "MIT",
"dependencies": {
"@eslint-community/eslint-utils": "^4.7.0",
"@typescript-eslint/scope-manager": "8.48.1",
"@typescript-eslint/types": "8.48.1",
"@typescript-eslint/typescript-estree": "8.48.1"
"@typescript-eslint/scope-manager": "8.49.0",
"@typescript-eslint/types": "8.49.0",
"@typescript-eslint/typescript-estree": "8.49.0"
},
"engines": {
"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
@ -3254,13 +3298,13 @@
}
},
"node_modules/@typescript-eslint/visitor-keys": {
"version": "8.48.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-8.48.1.tgz",
"integrity": "sha512-BmxxndzEWhE4TIEEMBs8lP3MBWN3jFPs/p6gPm/wkv02o41hI6cq9AuSmGAaTTHPtA1FTi2jBre4A9rm5ZmX+Q==",
"version": "8.49.0",
"resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-8.49.0.tgz",
"integrity": "sha512-LlKaciDe3GmZFphXIc79THF/YYBugZ7FS1pO581E/edlVVNbZKDy93evqmrfQ9/Y4uN0vVhX4iuchq26mK/iiA==",
"dev": true,
"license": "MIT",
"dependencies": {
"@typescript-eslint/types": "8.48.1",
"@typescript-eslint/types": "8.49.0",
"eslint-visitor-keys": "^4.2.1"
},
"engines": {
@ -3344,16 +3388,16 @@
"license": "ISC"
},
"node_modules/@vitejs/plugin-react": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-5.1.1.tgz",
"integrity": "sha512-WQfkSw0QbQ5aJ2CHYw23ZGkqnRwqKHD/KYsMeTkZzPT4Jcf0DcBxBtwMJxnu6E7oxw5+JC6ZAiePgh28uJ1HBA==",
"version": "5.1.2",
"resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-5.1.2.tgz",
"integrity": "sha512-EcA07pHJouywpzsoTUqNh5NwGayl2PPVEJKUSinGGSxFGYn+shYbqMGBg6FXDqgXum9Ou/ecb+411ssw8HImJQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"@babel/core": "^7.28.5",
"@babel/plugin-transform-react-jsx-self": "^7.27.1",
"@babel/plugin-transform-react-jsx-source": "^7.27.1",
"@rolldown/pluginutils": "1.0.0-beta.47",
"@rolldown/pluginutils": "1.0.0-beta.53",
"@types/babel__core": "^7.20.5",
"react-refresh": "^0.18.0"
},
@ -3497,6 +3541,12 @@
"url": "https://opencollective.com/vitest"
}
},
"node_modules/@webgpu/types": {
"version": "0.1.67",
"resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.67.tgz",
"integrity": "sha512-uk53+2ECGUkWoDFez/hymwpRfdgdIn6y1ref70fEecGMe5607f4sozNFgBk0oxlr7j2CRGWBEc3IBYMmFdGGTQ==",
"license": "BSD-3-Clause"
},
"node_modules/abort-controller": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz",
@ -3605,9 +3655,9 @@
}
},
"node_modules/apache-arrow/node_modules/@types/node": {
"version": "20.19.25",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.25.tgz",
"integrity": "sha512-ZsJzA5thDQMSQO788d7IocwwQbI8B5OPzmqNvpf3NY/+MHDAS759Wo0gd2WQeXYt5AAAQjzcrTVC6SKCuYgoCQ==",
"version": "20.19.26",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.26.tgz",
"integrity": "sha512-0l6cjgF0XnihUpndDhk+nyD3exio3iKaYROSgvh/qSevPXax3L8p5DBRFjbvalnwatGgHEQn2R88y2fA3g4irg==",
"license": "MIT",
"dependencies": {
"undici-types": "~6.21.0"
@ -3724,9 +3774,9 @@
"license": "MIT"
},
"node_modules/baseline-browser-mapping": {
"version": "2.9.4",
"resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.4.tgz",
"integrity": "sha512-ZCQ9GEWl73BVm8bu5Fts8nt7MHdbt5vY9bP6WGnUh+r3l8M7CgfyTlwsgCbMC66BNxPr6Xoce3j66Ms5YUQTNA==",
"version": "2.9.5",
"resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.5.tgz",
"integrity": "sha512-D5vIoztZOq1XM54LUdttJVc96ggEsIfju2JBvht06pSzpckp3C7HReun67Bghzrtdsq9XdMGbSSB3v3GhMNmAA==",
"dev": true,
"license": "Apache-2.0",
"bin": {
@ -3834,9 +3884,9 @@
}
},
"node_modules/caniuse-lite": {
"version": "1.0.30001759",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001759.tgz",
"integrity": "sha512-Pzfx9fOKoKvevQf8oCXoyNRQ5QyxJj+3O0Rqx2V5oxT61KGx8+n6hV/IUyJeifUci2clnmmKVpvtiqRzgiWjSw==",
"version": "1.0.30001760",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001760.tgz",
"integrity": "sha512-7AAMPcueWELt1p3mi13HR/LHH0TJLT11cnwDJEs3xA4+CK/PLKeO9Kl1oru24htkyUKtkGCvAx4ohB0Ttry8Dw==",
"dev": true,
"funding": [
{
@ -4201,14 +4251,14 @@
"license": "MIT"
},
"node_modules/cssstyle": {
"version": "5.3.3",
"resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-5.3.3.tgz",
"integrity": "sha512-OytmFH+13/QXONJcC75QNdMtKpceNk3u8ThBjyyYjkEcy/ekBwR1mMAuNvi3gdBPW3N5TlCzQ0WZw8H0lN/bDw==",
"version": "5.3.4",
"resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-5.3.4.tgz",
"integrity": "sha512-KyOS/kJMEq5O9GdPnaf82noigg5X5DYn0kZPJTaAsCUaBizp6Xa1y9D4Qoqf/JazEXWuruErHgVXwjN5391ZJw==",
"dev": true,
"license": "MIT",
"dependencies": {
"@asamuzakjp/css-color": "^4.0.3",
"@csstools/css-syntax-patches-for-csstree": "^1.0.14",
"@asamuzakjp/css-color": "^4.1.0",
"@csstools/css-syntax-patches-for-csstree": "1.0.14",
"css-tree": "^3.1.0"
},
"engines": {
@ -4863,9 +4913,9 @@
}
},
"node_modules/dompurify": {
"version": "3.3.0",
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.3.0.tgz",
"integrity": "sha512-r+f6MYR1gGN1eJv0TVQbhA7if/U7P87cdPl3HN5rikqaBSBxLiCb/b9O+2eG0cxz0ghyU+mU1QkbsOwERMYlWQ==",
"version": "3.3.1",
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.3.1.tgz",
"integrity": "sha512-qkdCKzLNtrgPFP1Vo+98FRzJnBRGe4ffyCea9IwHB1fyxPOeNTHpLKYGd4Uk9xvNoH0ZoOjwZxNptyMwqrId1Q==",
"license": "(MPL-2.0 OR Apache-2.0)",
"optionalDependencies": {
"@types/trusted-types": "^2.0.7"
@ -4892,9 +4942,9 @@
"license": "ISC"
},
"node_modules/electron-to-chromium": {
"version": "1.5.266",
"resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.266.tgz",
"integrity": "sha512-kgWEglXvkEfMH7rxP5OSZZwnaDWT7J9EoZCujhnpLbfi0bbNtRkgdX2E3gt0Uer11c61qCYktB3hwkAS325sJg==",
"version": "1.5.267",
"resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.267.tgz",
"integrity": "sha512-0Drusm6MVRXSOJpGbaSVgcQsuB4hEkMpHXaVstcPmhu5LIedxs1xNK/nIxmQIU/RPC0+1/o0AVZfBTkTNJOdUw==",
"dev": true,
"license": "ISC"
},
@ -5371,9 +5421,9 @@
}
},
"node_modules/expect-type": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/expect-type/-/expect-type-1.2.2.tgz",
"integrity": "sha512-JhFGDVJ7tmDJItKhYgJCGLOWjuK9vPxiXoUFLwLDc99NlmklilbiQJwoctZtt13+xMw91MCk/REan6MWHqDjyA==",
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/expect-type/-/expect-type-1.3.0.tgz",
"integrity": "sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA==",
"dev": true,
"license": "Apache-2.0",
"engines": {
@ -5435,7 +5485,6 @@
"version": "0.8.2",
"resolved": "https://registry.npmjs.org/fflate/-/fflate-0.8.2.tgz",
"integrity": "sha512-cPJU47OaAoCbg0pBvzsgpTPhmhqI5eJjh/JIu8tPj5q+T7iLvW/JAYUqmE7KOB4R1ZyEhzBaIQpQpardBF5z8A==",
"dev": true,
"license": "MIT"
},
"node_modules/file-entry-cache": {
@ -5682,13 +5731,6 @@
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/graphemer": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/graphemer/-/graphemer-1.4.0.tgz",
"integrity": "sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==",
"dev": true,
"license": "MIT"
},
"node_modules/graphlib": {
"version": "2.1.8",
"resolved": "https://registry.npmjs.org/graphlib/-/graphlib-2.1.8.tgz",
@ -6123,6 +6165,12 @@
"url": "https://github.com/sponsors/wooorm"
}
},
"node_modules/is-any-array": {
"version": "0.1.1",
"resolved": "https://registry.npmjs.org/is-any-array/-/is-any-array-0.1.1.tgz",
"integrity": "sha512-qTiELO+kpTKqPgxPYbshMERlzaFu29JDnpB8s3bjg+JkxBpw29/qqSaOdKv2pCdaG92rLGeG/zG2GauX58hfoA==",
"license": "MIT"
},
"node_modules/is-arrayish": {
"version": "0.2.1",
"resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.2.1.tgz",
@ -6232,15 +6280,15 @@
}
},
"node_modules/jsdom": {
"version": "27.2.0",
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-27.2.0.tgz",
"integrity": "sha512-454TI39PeRDW1LgpyLPyURtB4Zx1tklSr6+OFOipsxGUH1WMTvk6C65JQdrj455+DP2uJ1+veBEHTGFKWVLFoA==",
"version": "27.3.0",
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-27.3.0.tgz",
"integrity": "sha512-GtldT42B8+jefDUC4yUKAvsaOrH7PDHmZxZXNgF2xMmymjUbRYJvpAybZAKEmXDGTM0mCsz8duOa4vTm5AY2Kg==",
"dev": true,
"license": "MIT",
"dependencies": {
"@acemir/cssom": "^0.9.23",
"@asamuzakjp/dom-selector": "^6.7.4",
"cssstyle": "^5.3.3",
"@acemir/cssom": "^0.9.28",
"@asamuzakjp/dom-selector": "^6.7.6",
"cssstyle": "^5.3.4",
"data-urls": "^6.0.0",
"decimal.js": "^10.6.0",
"html-encoding-sniffer": "^4.0.0",
@ -6338,9 +6386,9 @@
}
},
"node_modules/katex": {
"version": "0.16.26",
"resolved": "https://registry.npmjs.org/katex/-/katex-0.16.26.tgz",
"integrity": "sha512-LvYwQDwfcFB3rCkxwzqVFxhIB21x1JivrWAs3HT9NsmtgvQrcVCZ6xihnNwXwiQ8UhqRtDJRmwrRz5EgzQ2DuA==",
"version": "0.16.27",
"resolved": "https://registry.npmjs.org/katex/-/katex-0.16.27.tgz",
"integrity": "sha512-aeQoDkuRWSqQN6nSvVCEFvfXdqo1OQiCmmW1kc9xSdjutPv7BGO7pqY9sQRJpMOGrEdfDgF2TfRXe5eUAD2Waw==",
"funding": [
"https://opencollective.com/katex",
"https://github.com/sponsors/katex"
@ -6920,6 +6968,12 @@
"integrity": "sha512-yQ3rwFWRfwNUY7H5vpU0wfdkNSnvnJinhF9830Swlaxl03zsOjCfmX0ugac+3LtK0lYSgwL/KXc8oYL3mG4YFQ==",
"license": "MIT"
},
"node_modules/meshoptimizer": {
"version": "0.22.0",
"resolved": "https://registry.npmjs.org/meshoptimizer/-/meshoptimizer-0.22.0.tgz",
"integrity": "sha512-IebiK79sqIy+E4EgOr+CAw+Ke8hAspXKzBd0JdgEmPHiAwmvEj2S4h1rfvo+o/BnfEYd/jAOg5IeeIjzlzSnDg==",
"license": "MIT"
},
"node_modules/micromark": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/micromark/-/micromark-4.0.2.tgz",
@ -7539,6 +7593,79 @@
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/ml-array-max": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/ml-array-max/-/ml-array-max-1.2.4.tgz",
"integrity": "sha512-BlEeg80jI0tW6WaPyGxf5Sa4sqvcyY6lbSn5Vcv44lp1I2GR6AWojfUvLnGTNsIXrZ8uqWmo8VcG1WpkI2ONMQ==",
"license": "MIT",
"dependencies": {
"is-any-array": "^2.0.0"
}
},
"node_modules/ml-array-max/node_modules/is-any-array": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/is-any-array/-/is-any-array-2.0.1.tgz",
"integrity": "sha512-UtilS7hLRu++wb/WBAw9bNuP1Eg04Ivn1vERJck8zJthEvXCBEBpGR/33u/xLKWEQf95803oalHrVDptcAvFdQ==",
"license": "MIT"
},
"node_modules/ml-array-min": {
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/ml-array-min/-/ml-array-min-1.2.3.tgz",
"integrity": "sha512-VcZ5f3VZ1iihtrGvgfh/q0XlMobG6GQ8FsNyQXD3T+IlstDv85g8kfV0xUG1QPRO/t21aukaJowDzMTc7j5V6Q==",
"license": "MIT",
"dependencies": {
"is-any-array": "^2.0.0"
}
},
"node_modules/ml-array-min/node_modules/is-any-array": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/is-any-array/-/is-any-array-2.0.1.tgz",
"integrity": "sha512-UtilS7hLRu++wb/WBAw9bNuP1Eg04Ivn1vERJck8zJthEvXCBEBpGR/33u/xLKWEQf95803oalHrVDptcAvFdQ==",
"license": "MIT"
},
"node_modules/ml-array-rescale": {
"version": "1.3.7",
"resolved": "https://registry.npmjs.org/ml-array-rescale/-/ml-array-rescale-1.3.7.tgz",
"integrity": "sha512-48NGChTouvEo9KBctDfHC3udWnQKNKEWN0ziELvY3KG25GR5cA8K8wNVzracsqSW1QEkAXjTNx+ycgAv06/1mQ==",
"license": "MIT",
"dependencies": {
"is-any-array": "^2.0.0",
"ml-array-max": "^1.2.4",
"ml-array-min": "^1.2.3"
}
},
"node_modules/ml-array-rescale/node_modules/is-any-array": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/is-any-array/-/is-any-array-2.0.1.tgz",
"integrity": "sha512-UtilS7hLRu++wb/WBAw9bNuP1Eg04Ivn1vERJck8zJthEvXCBEBpGR/33u/xLKWEQf95803oalHrVDptcAvFdQ==",
"license": "MIT"
},
"node_modules/ml-levenberg-marquardt": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/ml-levenberg-marquardt/-/ml-levenberg-marquardt-2.1.1.tgz",
"integrity": "sha512-2+HwUqew4qFFFYujYlQtmFUrxCB4iJAPqnUYro3P831wj70eJZcANwcRaIMGUVaH9NDKzfYuA4N5u67KExmaRA==",
"license": "MIT",
"dependencies": {
"is-any-array": "^0.1.0",
"ml-matrix": "^6.4.1"
}
},
"node_modules/ml-matrix": {
"version": "6.12.1",
"resolved": "https://registry.npmjs.org/ml-matrix/-/ml-matrix-6.12.1.tgz",
"integrity": "sha512-TJ+8eOFdp+INvzR4zAuwBQJznDUfktMtOB6g/hUcGh3rcyjxbz4Te57Pgri8Q9bhSQ7Zys4IYOGhFdnlgeB6Lw==",
"license": "MIT",
"dependencies": {
"is-any-array": "^2.0.1",
"ml-array-rescale": "^1.3.7"
}
},
"node_modules/ml-matrix/node_modules/is-any-array": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/is-any-array/-/is-any-array-2.0.1.tgz",
"integrity": "sha512-UtilS7hLRu++wb/WBAw9bNuP1Eg04Ivn1vERJck8zJthEvXCBEBpGR/33u/xLKWEQf95803oalHrVDptcAvFdQ==",
"license": "MIT"
},
"node_modules/mlly": {
"version": "1.8.0",
"resolved": "https://registry.npmjs.org/mlly/-/mlly-1.8.0.tgz",
@ -8742,6 +8869,12 @@
"node": ">=12.17"
}
},
"node_modules/three": {
"version": "0.181.2",
"resolved": "https://registry.npmjs.org/three/-/three-0.181.2.tgz",
"integrity": "sha512-k/CjiZ80bYss6Qs7/ex1TBlPD11whT9oKfT8oTGiHa34W4JRd1NiH/Tr1DbHWQ2/vMUypxksLnF2CfmlmM5XFQ==",
"license": "MIT"
},
"node_modules/tinybench": {
"version": "2.9.0",
"resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz",
@ -8923,16 +9056,16 @@
}
},
"node_modules/typescript-eslint": {
"version": "8.48.1",
"resolved": "https://registry.npmjs.org/typescript-eslint/-/typescript-eslint-8.48.1.tgz",
"integrity": "sha512-FbOKN1fqNoXp1hIl5KYpObVrp0mCn+CLgn479nmu2IsRMrx2vyv74MmsBLVlhg8qVwNFGbXSp8fh1zp8pEoC2A==",
"version": "8.49.0",
"resolved": "https://registry.npmjs.org/typescript-eslint/-/typescript-eslint-8.49.0.tgz",
"integrity": "sha512-zRSVH1WXD0uXczCXw+nsdjGPUdx4dfrs5VQoHnUWmv1U3oNlAKv4FUNdLDhVUg+gYn+a5hUESqch//Rv5wVhrg==",
"dev": true,
"license": "MIT",
"dependencies": {
"@typescript-eslint/eslint-plugin": "8.48.1",
"@typescript-eslint/parser": "8.48.1",
"@typescript-eslint/typescript-estree": "8.48.1",
"@typescript-eslint/utils": "8.48.1"
"@typescript-eslint/eslint-plugin": "8.49.0",
"@typescript-eslint/parser": "8.49.0",
"@typescript-eslint/typescript-estree": "8.49.0",
"@typescript-eslint/utils": "8.49.0"
},
"engines": {
"node": "^18.18.0 || ^20.9.0 || >=21.1.0"
@ -8961,6 +9094,15 @@
"integrity": "sha512-9a4/uxlTWJ4+a5i0ooc1rU7C7YOw3wT+UGqdeNNHWnOF9qcMBgLRS+4IYUqbczewFx4mLEig6gawh7X6mFlEkA==",
"license": "MIT"
},
"node_modules/umap-js": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/umap-js/-/umap-js-1.4.0.tgz",
"integrity": "sha512-xxpviF9wUO6Nxrx+C58SoDgea+h2PnVaRPKDelWv0HotmY6BeWeh0kAPJoumfqUkzUvowGsYfMbnsWI0b9do+A==",
"license": "MIT",
"dependencies": {
"ml-levenberg-marquardt": "^2.0.0"
}
},
"node_modules/undici-types": {
"version": "7.16.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
@ -9152,9 +9294,9 @@
}
},
"node_modules/vite": {
"version": "7.2.6",
"resolved": "https://registry.npmjs.org/vite/-/vite-7.2.6.tgz",
"integrity": "sha512-tI2l/nFHC5rLh7+5+o7QjKjSR04ivXDF4jcgV0f/bTQ+OJiITy5S6gaynVsEM+7RqzufMnVbIon6Sr5x1SDYaQ==",
"version": "7.2.7",
"resolved": "https://registry.npmjs.org/vite/-/vite-7.2.7.tgz",
"integrity": "sha512-ITcnkFeR3+fI8P1wMgItjGrR10170d8auB4EpMLPqmx6uxElH3a/hHGQabSHKdqd4FXWO1nFIp9rRn7JQ34ACQ==",
"dev": true,
"license": "MIT",
"dependencies": {

View file

@ -27,6 +27,7 @@
"@types/dagre": "^0.7.53",
"@types/js-yaml": "^4.0.9",
"@types/lodash": "^4.17.20",
"@types/three": "^0.181.0",
"@uiw/react-codemirror": "^4.25.3",
"axios": "^1.13.2",
"chevrotain-allstar": "^0.3.1",
@ -42,7 +43,7 @@
"lodash": "^4.17.21",
"lucide-react": "^0.554.0",
"maplibre-gl": "^5.14.0",
"mermaid": "^11.12.1",
"mermaid": "^11.12.2",
"n3": "^1.26.0",
"react": "^19.2.0",
"react-dom": "^19.2.0",
@ -51,6 +52,8 @@
"rehype-raw": "^7.0.0",
"rehype-sanitize": "^6.0.0",
"remark-gfm": "^4.0.1",
"three": "^0.181.2",
"umap-js": "^1.4.0",
"zustand": "^5.0.8"
},
"devDependencies": {

View file

@ -185,7 +185,7 @@ imports:
- modules/enums/ReconstructionActivityTypeEnum
- modules/enums/SourceDocumentTypeEnum
# StaffRoleTypeEnum REMOVED - replaced by StaffRole class hierarchy
# See: .opencode/ENUM_TO_CLASS_PRINCIPLE.md for rationale
# See: rules/ENUM_TO_CLASS_PRINCIPLE.md for rationale
- modules/enums/CallForApplicationStatusEnum
- modules/enums/FundingRequirementTypeEnum
@ -242,7 +242,7 @@ imports:
- modules/classes/PersonObservation
# Staff role class hierarchy (replaces StaffRoleTypeEnum - Single Source of Truth)
# See: .opencode/ENUM_TO_CLASS_PRINCIPLE.md
# See: rules/ENUM_TO_CLASS_PRINCIPLE.md
- modules/classes/StaffRole
- modules/classes/StaffRoles

View file

@ -1,5 +1,5 @@
{
"generated": "2025-12-09T10:49:54.625Z",
"generated": "2025-12-09T15:58:27.582Z",
"version": "1.0.0",
"categories": [
{

View file

@ -4,11 +4,16 @@ title: Company Archives Type
prefixes:
linkml: https://w3id.org/linkml/
schema: http://schema.org/
org: http://www.w3.org/ns/org#
rico: https://www.ica.org/standards/RiC/ontology#
imports:
- linkml:types
- ./ArchiveOrganizationType
- ./CollectionType
- ./Department
- ./OrganizationBranch
classes:
CompanyArchives:
@ -32,6 +37,37 @@ classes:
- Technical drawings and blueprints
- Corporate publications
**Organizational Context**:
Company archives are typically organized as:
1. **Departments within corporations** (`org:OrganizationalUnit`):
- Archive department under Records Management division
- Historical archives team within Communications/PR
- Technical archives under Engineering department
2. **Branches at corporate facilities** (`org:OrganizationalUnit`):
- Central archive at headquarters
- Regional archive at manufacturing sites
- Research archive at R&D centers
3. **Standalone heritage organizations** (rare):
- Independent foundation managing corporate heritage
- Heritage society for defunct companies
**Relationship to Parent Organization**:
| Pattern | Property | Example |
|---------|----------|---------|
| Archive as department | `schema:department` / `org:hasUnit` | Philips Archive is department of Philips N.V. |
| Archive as branch | `org:hasSite` / `org:unitOf` | Shell Archive at The Hague HQ |
| Archive with parent org | `schema:parentOrganization` | Unilever Historical Archives → Unilever PLC |
**W3C ORG / Schema.org Alignment**:
- `schema:parentOrganization` - Links archive to the corporation it belongs to
- `schema:department` - Corporation links to its archive department
- `org:unitOf` - Archive is organizational unit of corporation
- `org:hasUnit` - Corporation has archive as organizational unit
**Business Value**:
Company archives support:
- Legal and regulatory compliance
@ -45,6 +81,8 @@ classes:
- BankArchive (Q52718263) - Financial institution archives
- EconomicArchive (Q27032167) - Economic history focus
- InstitutionalArchive (Q124762372) - Institutional records
- Department - Formal departmental structure within organization
- OrganizationBranch - Physical branch locations of archive
**Professional Body**:
Company archivists often belong to:
@ -58,7 +96,8 @@ classes:
**Ontological Alignment**:
- **SKOS**: skos:Concept with skos:broader Q166118 (archive)
- **Schema.org**: schema:ArchiveOrganization
- **Schema.org**: schema:ArchiveOrganization, schema:parentOrganization
- **W3C ORG**: org:OrganizationalUnit, org:unitOf, org:hasUnit
- **RiC-O**: rico:CorporateBody (as agent)
**Multilingual Labels**:
@ -66,6 +105,11 @@ classes:
- es: archivo empresarial
- fr: archives d'entreprise
slots:
- parent_corporation
- archive_department_of
- archive_branches
slot_usage:
primary_type:
description: |
@ -90,23 +134,123 @@ classes:
description: |
Typically includes: governance records, financial records,
product documentation, marketing materials, personnel files.
parent_corporation:
slot_uri: schema:parentOrganization
description: |
The parent corporation that owns/operates this company archive.
**Schema.org Alignment**:
`schema:parentOrganization` - "The larger organization that this
organization is a subOrganization of, if any."
**Use Cases**:
- Philips Company Archives → Philips N.V.
- Shell Historical Archive → Shell PLC
- Siemens Corporate Archives → Siemens AG
Can reference:
- External URI for the parent corporation
- Custodian instance if parent is also modeled as heritage custodian
range: uriorcurie
examples:
- value: "https://www.wikidata.org/entity/Q163292"
description: "Philips N.V. as parent of Philips Archives"
- value: "https://nde.nl/ontology/hc/nl-corporation/shell-plc"
description: "Shell PLC as parent organization"
archive_department_of:
slot_uri: org:unitOf
description: |
Links this archive to the Department within which it operates.
**W3C ORG Alignment**:
`org:unitOf` - "Indicates an Organization of which this Unit is a part."
Many company archives are organized as:
- Sub-unit of Records Management department
- Part of Corporate Communications
- Under Legal/Compliance division
Links to Department class for formal departmental context.
range: Department
examples:
- value:
department_name: "Records Management Division"
refers_to_custodian: "https://nde.nl/ontology/hc/nl-corporation/philips"
description: "Archive is unit of Records Management"
archive_branches:
slot_uri: org:hasSubOrganization
description: |
Physical branch locations of this company archive.
**W3C ORG Alignment**:
`org:hasSubOrganization` - "Represents hierarchical containment of
Organizations or Organizational Units."
Large corporations may have multiple archive locations:
- Central archive at headquarters
- Regional archives at major facilities
- Research archives at R&D centers
- Product archives at manufacturing sites
Links to OrganizationBranch class for physical locations.
range: OrganizationBranch
multivalued: true
inlined_as_list: true
examples:
- value:
- branch_name: "Philips Archives - Eindhoven"
branch_type: REGIONAL_OFFICE
- branch_name: "Philips Research Archives - High Tech Campus"
branch_type: RESEARCH_CENTER
description: "Multiple archive branches"
exact_mappings:
- skos:Concept
close_mappings:
- schema:ArchiveOrganization
- rico:CorporateBody
- org:OrganizationalUnit
related_mappings:
- schema:parentOrganization
- org:unitOf
- org:hasSubOrganization
comments:
- "Corporate archives preserving business heritage"
- "Important for legal compliance and corporate identity"
- "Part of dual-class pattern: custodian type + rico:RecordSetType"
- "May have restricted access for commercial sensitivity"
- "Typically organized as Department within larger corporation (org:unitOf)"
- "May have multiple branch locations (org:hasSubOrganization)"
- "Links to parent corporation via schema:parentOrganization"
see_also:
- BankArchive
- EconomicArchive
- InstitutionalArchive
- Department
- OrganizationBranch
examples:
- value:
type_id: "https://nde.nl/ontology/hc/type/archive/company/philips"
primary_type: "ARCHIVE"
wikidata_entity: "Q10605195"
type_label:
- "Philips Company Archives@en"
- "Philips Bedrijfsarchief@nl"
parent_corporation: "https://www.wikidata.org/entity/Q163292"
archive_department_of:
department_name: "Corporate Communications & Heritage"
archive_branches:
- branch_name: "Philips Archives - Eindhoven HQ"
branch_type: REGIONAL_OFFICE
- branch_name: "Philips Research Archives"
branch_type: RESEARCH_CENTER
description: "Philips company archives with organizational context"
# rico:RecordSetType for collection classification
CompanyArchivesRecordSetType:
@ -123,3 +267,56 @@ classes:
annotations:
wikidata: Q10605195
linked_custodian_type: CompanyArchives
# Slot definitions for organizational relationships
slots:
parent_corporation:
slot_uri: schema:parentOrganization
description: |
The parent corporation that owns/operates this company archive.
Schema.org: parentOrganization - "The larger organization that this
organization is a subOrganization of, if any."
Inverse of schema:subOrganization.
range: uriorcurie
exact_mappings:
- schema:parentOrganization
comments:
- "Links company archive to owning corporation"
- "Use Wikidata Q-number or organizational URI"
archive_department_of:
slot_uri: org:unitOf
description: |
Links this archive to the Department within which it operates.
W3C ORG: unitOf - "Indicates an Organization of which this Unit is a part."
Company archives are often organized as sub-units of:
- Records Management department
- Corporate Communications
- Legal/Compliance division
range: Department
exact_mappings:
- org:unitOf
comments:
- "Links archive to formal department structure"
- "Inverse of org:hasUnit"
archive_branches:
slot_uri: org:hasSubOrganization
description: |
Physical branch locations of this company archive.
W3C ORG: hasSubOrganization - "Represents hierarchical containment of
Organizations or Organizational Units."
Links to OrganizationBranch instances for each physical location.
range: OrganizationBranch
multivalued: true
exact_mappings:
- org:hasSubOrganization
comments:
- "Multiple archive branch locations"
- "Each branch at different corporate facility"

View file

@ -2,6 +2,9 @@ id: https://nde.nl/ontology/hc/class/Conservatoria
name: Conservatoria
title: Conservatória Type (Lusophone)
prefixes:
linkml: https://w3id.org/linkml/
imports:
- linkml:types
- ./ArchiveOrganizationType
@ -16,7 +19,8 @@ classes:
**Wikidata**: Q9854379
**Geographic Restriction**: Portugal, Brazil, and other Lusophone countries
**Geographic Restriction**: Lusophone countries (PT, BR, AO, MZ, CV, GW, ST, TL)
This constraint is enforced via LinkML `rules` with `postconditions`.
**CUSTODIAN-ONLY**: This type does NOT have a corresponding rico:RecordSetType
class. Conservatórias are administrative offices with registration functions,
@ -59,6 +63,7 @@ classes:
**Multilingual Labels**:
- pt: Conservatória
- pt-BR: Cartório de Registro
slot_usage:
primary_type:
@ -70,10 +75,49 @@ classes:
wikidata_entity:
description: |
Should be Q9854379 for Conservatórias.
MUST be Q9854379 for Conservatórias.
Lusophone civil/property registration offices.
pattern: "^Q[0-9]+$"
equals_string: "Q9854379"
applicable_countries:
description: |
**Geographic Restriction**: Lusophone countries only.
Conservatórias exist in Portuguese-speaking countries:
- PT (Portugal) - Conservatórias do Registo
- BR (Brazil) - Cartórios de Registro
- AO (Angola) - Conservatórias
- MZ (Mozambique) - Conservatórias
- CV (Cape Verde) - Conservatórias
- GW (Guinea-Bissau) - Conservatórias
- ST (São Tomé and Príncipe) - Conservatórias
- TL (Timor-Leste) - Conservatórias (Portuguese legal heritage)
The `rules` section below enforces this constraint during validation.
multivalued: true
required: true
minimum_cardinality: 1
# LinkML rules for geographic constraint validation
rules:
- description: >-
Conservatoria MUST have applicable_countries containing at least one
Lusophone country (PT, BR, AO, MZ, CV, GW, ST, TL).
This is a mandatory geographic restriction for Portuguese-speaking
civil registry and notarial archive offices.
postconditions:
slot_conditions:
applicable_countries:
any_of:
- equals_string: "PT"
- equals_string: "BR"
- equals_string: "AO"
- equals_string: "MZ"
- equals_string: "CV"
- equals_string: "GW"
- equals_string: "ST"
- equals_string: "TL"
exact_mappings:
- skos:Concept
@ -82,8 +126,10 @@ classes:
- rico:CorporateBody
comments:
- "Conservatória (pt)"
- "Cartório de Registro (pt-BR)"
- "CUSTODIAN-ONLY type: No corresponding rico:RecordSetType class"
- "Geographic restriction: Lusophone countries (Portugal, Brazil, etc.)"
- "Geographic restriction enforced via LinkML rules: Lusophone countries only"
- "Government registration office, not traditional archive"
- "Essential for genealogical and legal research"

View file

@ -2,21 +2,27 @@ id: https://nde.nl/ontology/hc/class/CountyRecordOffice
name: CountyRecordOffice
title: County Record Office Type
prefixes:
linkml: https://w3id.org/linkml/
org: http://www.w3.org/ns/org#
imports:
- linkml:types
- ./ArchiveOrganizationType
- ./OrganizationBranch
classes:
CountyRecordOffice:
is_a: ArchiveOrganizationType
class_uri: skos:Concept
description: |
Local authority repository in the United Kingdom and similar jurisdictions,
preserving historical records of the county and its communities.
Local authority repository in the United Kingdom, preserving historical
records of the county and its communities.
**Wikidata**: Q5177943
**Geographic Context**: Primarily United Kingdom
**Geographic Restriction**: United Kingdom (GB) only.
This constraint is enforced via LinkML `rules` with `postconditions`.
**CUSTODIAN-ONLY**: This type does NOT have a corresponding rico:RecordSetType
class. County Record Offices are institutional types, not collection
@ -40,16 +46,25 @@ classes:
- Often designated as place of deposit for public records
- Increasingly rebranded as "Archives and Local Studies"
In Scotland:
- Similar functions performed by local authority archives
- National Records of Scotland at national level
In Northern Ireland:
- Public Record Office of Northern Ireland (PRONI)
- Local council archives
**Related Types**:
- LocalGovernmentArchive (Q118281267) - Local authority records
- MunicipalArchive (Q604177) - City/town archives
- LocalHistoryArchive (Q12324798) - Local history focus
**Notable Examples**:
- The National Archives (Kew) - National level
- London Metropolitan Archives
- Oxfordshire History Centre
- Lancashire Archives
- West Yorkshire Archive Service
- Surrey History Centre
**Ontological Alignment**:
- **SKOS**: skos:Concept with skos:broader Q166118 (archive)
@ -57,6 +72,8 @@ classes:
- **RiC-O**: rico:CorporateBody (as agent)
**Multilingual Labels**:
- en: County Record Office
- en-GB: County Record Office
- it: archivio pubblico territoriale
slot_usage:
@ -67,7 +84,7 @@ classes:
wikidata_entity:
description: |
Should be Q5177943 for county record offices.
MUST be Q5177943 for county record offices.
UK local authority archive type.
pattern: "^Q[0-9]+$"
equals_string: "Q5177943"
@ -76,6 +93,66 @@ classes:
description: |
Typically 'county' or 'local' for this archive type.
Corresponds to UK county administrative level.
is_branch_of_authority:
description: |
**Organizational Relationship**: County Record Offices may be branches
of larger local authority structures.
**Common Parent Organizations**:
- County Councils (e.g., Oxfordshire County Council)
- Unitary Authorities (e.g., Bristol City Council)
- Combined Authorities (e.g., Greater Manchester)
- Joint Archive Services (e.g., East Sussex / Brighton & Hove)
**Legal Context**:
County Record Offices are typically:
- Designated "place of deposit" under Public Records Act 1958
- Part of local authority heritage/cultural services
- May share governance with local studies libraries
**Use org:unitOf pattern** from OrganizationBranch to link to parent
authority when modeled as formal organizational unit.
**Examples**:
- Oxfordshire History Centre → part of Oxfordshire County Council
- London Metropolitan Archives → part of City of London Corporation
- West Yorkshire Archive Service → joint service of five councils
range: uriorcurie
multivalued: false
required: false
examples:
- value: "https://nde.nl/ontology/hc/uk/oxfordshire-county-council"
description: "Parent local authority"
applicable_countries:
description: |
**Geographic Restriction**: United Kingdom (GB) only.
County Record Offices are a UK-specific institution type within
the local authority structure of England, Wales, Scotland, and
Northern Ireland.
Note: Uses ISO 3166-1 alpha-2 code "GB" for United Kingdom
(not "UK" which is not a valid ISO code).
The `rules` section below enforces this constraint during validation.
ifabsent: "string(GB)"
required: true
minimum_cardinality: 1
maximum_cardinality: 1
# LinkML rules for geographic constraint validation
rules:
- description: >-
CountyRecordOffice MUST have applicable_countries containing "GB"
(United Kingdom). This is a mandatory geographic restriction for
UK county record offices and local authority archives.
postconditions:
slot_conditions:
applicable_countries:
any_of:
- equals_string: "GB"
exact_mappings:
- skos:Concept
@ -84,7 +161,9 @@ classes:
- rico:CorporateBody
comments:
- "County Record Office (en-GB)"
- "CUSTODIAN-ONLY type: No corresponding rico:RecordSetType class"
- "Geographic restriction enforced via LinkML rules: United Kingdom (GB) only"
- "UK local authority archive institution type"
- "Often designated place of deposit for public records"
- "Key resource for local and family history research"
@ -93,3 +172,12 @@ classes:
- LocalGovernmentArchive
- MunicipalArchive
- LocalHistoryArchive
- OrganizationBranch
slots:
is_branch_of_authority:
slot_uri: org:unitOf
description: |
Parent local authority or governing body for this County Record Office.
Uses W3C Org ontology org:unitOf relationship.
range: uriorcurie

View file

@ -22,6 +22,7 @@ imports:
- linkml:types
- ./ArchiveOrganizationType
- ./CustodianAdministration
- ./CustodianArchive
classes:
CurrentArchive:
@ -63,6 +64,24 @@ classes:
- HistoricalArchive (Q3621673) - non-current permanent records
- RecordsCenter - semi-current storage facility
**RELATIONSHIP TO CustodianArchive**:
CurrentArchive (this class) is a TYPE classification (skos:Concept) for
archives managing records in the active/current phase of the lifecycle.
CustodianArchive is an INSTANCE class (rico:RecordSet) representing the
actual operational archives of a heritage custodian awaiting processing.
**Semantic Relationship**:
- CurrentArchive is a HYPERNYM (broader type) for the concept of active records
- CustodianArchive records MAY be typed as CurrentArchive when in active use
- When CustodianArchive.processing_status = "UNPROCESSED", records may still
be in the current/active phase conceptually
**SKOS Alignment**:
- skos:broader: CurrentArchive → DepositArchive (lifecycle progression)
- skos:narrower: CurrentArchive ← specific current archive types
**ONTOLOGICAL ALIGNMENT**:
- **SKOS**: skos:Concept (type classification)
- **RiC-O**: rico:RecordSet for active record groups
@ -74,6 +93,7 @@ classes:
- retention_schedule
- creating_organization
- transfer_policy
- has_narrower_instance
slot_usage:
wikidata_entity:
@ -101,6 +121,25 @@ classes:
Policy for transferring records to intermediate or permanent archives.
Describes triggers, timelines, and procedures for transfer.
range: string
has_narrower_instance:
slot_uri: skos:narrowerTransitive
description: |
Links this archive TYPE to specific CustodianArchive INSTANCES
that are classified under this lifecycle phase.
**SKOS**: skos:narrowerTransitive for type-instance relationship.
**Usage**:
When a CustodianArchive contains records in the "current/active" phase,
it can be linked from CurrentArchive via this property.
**Example**:
- CurrentArchive (type) → has_narrower_instance →
CustodianArchive "Director's Active Files 2020-2024" (instance)
range: CustodianArchive
multivalued: true
required: false
exact_mappings:
- wikidata:Q3621648
@ -145,3 +184,11 @@ slots:
transfer_policy:
description: Policy for transferring to permanent archive
range: string
has_narrower_instance:
slot_uri: skos:narrowerTransitive
description: |
Links archive TYPE to specific CustodianArchive INSTANCES.
SKOS narrowerTransitive for type-to-instance relationship.
range: CustodianArchive
multivalued: true

View file

@ -20,6 +20,7 @@ imports:
- ../slots/access_restrictions
- ../slots/storage_location
- ./ReconstructedEntity
- ./CurrentArchive
prefixes:
linkml: https://w3id.org/linkml/
@ -31,6 +32,8 @@ prefixes:
time: http://www.w3.org/2006/time#
org: http://www.w3.org/ns/org#
premis: http://www.loc.gov/premis/rdf/v3/
skos: http://www.w3.org/2004/02/skos/core#
wikidata: http://www.wikidata.org/entity/
classes:
CustodianArchive:
@ -122,6 +125,18 @@ classes:
- **Storage**: Physical location of unprocessed archives
- **OrganizationalStructure**: Unit responsible for processing
**RELATIONSHIP TO LIFECYCLE TYPE CLASSES**:
CustodianArchive (this class) is an INSTANCE class representing actual
operational archives. It can be TYPED using lifecycle phase classifications:
- **CurrentArchive** (Q3621648): Active records in daily use
- skos:broaderTransitive links CustodianArchive → CurrentArchive type
- **DepositArchive** (Q244904): Intermediate/semi-current records
- **HistoricalArchive** (Q3621673): Permanent archival records
Use `lifecycle_phase_type` slot to classify by lifecycle position.
exact_mappings:
- rico:RecordSet
@ -162,6 +177,7 @@ classes:
- was_generated_by
- valid_from
- valid_to
- lifecycle_phase_type
slot_usage:
id:
@ -591,6 +607,33 @@ classes:
required: false
description: |
End of validity period (typically = transfer_to_collection_date).
lifecycle_phase_type:
slot_uri: skos:broaderTransitive
range: uriorcurie
required: false
description: |
Links this CustodianArchive INSTANCE to its lifecycle phase TYPE.
**SKOS**: skos:broaderTransitive for instance-to-type relationship.
**Archive Lifecycle Types (Wikidata)**:
- Q3621648 (CurrentArchive) - Active records phase
- Q244904 (DepositArchive) - Intermediate/semi-current phase
- Q3621673 (HistoricalArchive) - Archival/permanent phase
**Usage**:
Classify this operational archive by its position in the records lifecycle.
Most CustodianArchive records are in the intermediate phase (awaiting processing).
**Example**:
- CustodianArchive "Ministry Records 2010-2020" → lifecycle_phase_type →
DepositArchive (Q244904) - semi-current, awaiting processing
examples:
- value: "wikidata:Q244904"
description: "Deposit archive / semi-current records"
- value: "wikidata:Q3621648"
description: "Current archive / active records"
comments:
- "Represents operational archives BEFORE integration into CustodianCollection"
@ -719,3 +762,12 @@ slots:
arrangement_notes:
description: Notes from arrangement process
range: string
lifecycle_phase_type:
slot_uri: skos:broaderTransitive
description: |
Links CustodianArchive INSTANCE to lifecycle phase TYPE.
SKOS broaderTransitive for instance-to-type relationship.
Values: CurrentArchive (Q3621648), DepositArchive (Q244904),
HistoricalArchive (Q3621673).
range: uriorcurie

View file

@ -61,7 +61,7 @@ classes:
- Portuguese: Fundação, Associação, Ltda., S.A.
- Italian: Fondazione, Associazione, S.p.A., S.r.l.
See: .opencode/LEGAL_FORM_FILTERING_RULE.md for comprehensive global list
See: rules/LEGAL_FORM_FILTERING_RULE.md for comprehensive global list
===========================================================================
MANDATORY RULE: Special Characters MUST Be Excluded from Abbreviations
@ -112,7 +112,7 @@ classes:
- "Heritage@Digital" → "HD" (not "H@D")
- "Archives (Historical)" → "AH" (not "A(H)")
See: .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md for complete documentation
See: rules/ABBREVIATION_SPECIAL_CHAR_RULE.md for complete documentation
===========================================================================
MANDATORY RULE: Diacritics MUST Be Normalized to ASCII in Abbreviations
@ -152,7 +152,7 @@ classes:
ascii_text = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
```
See: .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md for complete documentation
See: rules/ABBREVIATION_SPECIAL_CHAR_RULE.md for complete documentation
Can be generated by:
1. ReconstructionActivity (formal entity resolution) - was_generated_by link

View file

@ -35,6 +35,11 @@ imports:
- ./OrganizationalStructure
- ./Collection
- ./PersonObservation
# Import global slots
- ../slots/staff_members
- ../slots/contact_point
- ../slots/located_at
- ../slots/refers_to_custodian
classes:
Department:
@ -456,6 +461,20 @@ slots:
description: Person heading the department
range: PersonObservation
# NOTE: staff_members imported from global slot ../slots/staff_members.yaml
manages_collections:
slot_uri: rico:isManagerOf
description: Collections managed by this department
range: Collection
multivalued: true
# NOTE: located_at imported from global slot ../slots/located_at.yaml
# NOTE: contact_point imported from global slot ../slots/contact_point.yaml
# NOTE: refers_to_custodian imported from global slot ../slots/refers_to_custodian.yaml
established_date:
description: Date department was established
range: date

View file

@ -470,7 +470,7 @@ classes:
- "Follows 4-stage GLAM-NER pipeline: recognition → layout → resolution → linking"
see_also:
- ".opencode/WEB_OBSERVATION_PROVENANCE_RULES.md"
- "rules/WEB_OBSERVATION_PROVENANCE_RULES.md"
- "scripts/fetch_website_playwright.py"
- "scripts/add_xpath_provenance.py"
- "docs/convention/schema/20251202/entity_annotation_rules_v1.6.0_unified.yaml"

View file

@ -0,0 +1,303 @@
# Abbreviation Character Filtering Rules
**Rule ID**: ABBREV-CHAR-FILTER
**Status**: MANDATORY
**Applies To**: GHCID abbreviation component generation
**Created**: 2025-12-07
**Updated**: 2025-12-08 (added diacritics rule)
---
## Summary
**When generating abbreviations for GHCID, ONLY ASCII uppercase letters (A-Z) are permitted. Both special characters AND diacritics MUST be removed/normalized.**
This is a **MANDATORY** rule. Abbreviations containing special characters or diacritics are INVALID and must be regenerated.
### Two Mandatory Sub-Rules:
1. **ABBREV-DIACRITICS**: Normalize all diacritics to ASCII equivalents
2. **ABBREV-SPECIAL-CHAR**: Remove all special characters and symbols
---
## Rule 1: Diacritics MUST Be Normalized to ASCII (ABBREV-DIACRITICS)
**Diacritics (accented characters) MUST be normalized to their ASCII base letter equivalents.**
### Example (Real Case)
```
❌ WRONG: CZ-VY-TEL-L-VHSPAOČRZS (contains Č)
✅ CORRECT: CZ-VY-TEL-L-VHSPAOCRZS (ASCII only)
```
### Diacritics Normalization Table
| Diacritic | ASCII | Example |
|-----------|-------|---------|
| Á, À, Â, Ã, Ä, Å, Ā | A | "Ålborg" → A |
| Č, Ć, Ç | C | "Český" → C |
| Ď | D | "Ďáblice" → D |
| É, È, Ê, Ë, Ě, Ē | E | "Éire" → E |
| Í, Ì, Î, Ï, Ī | I | "Ísland" → I |
| Ñ, Ń, Ň | N | "España" → N |
| Ó, Ò, Ô, Õ, Ö, Ø, Ō | O | "Österreich" → O |
| Ř | R | "Říčany" → R |
| Š, Ś, Ş | S | "Šumperk" → S |
| Ť | T | "Ťažký" → T |
| Ú, Ù, Û, Ü, Ů, Ū | U | "Ústí" → U |
| Ý, Ÿ | Y | "Ýmir" → Y |
| Ž, Ź, Ż | Z | "Žilina" → Z |
| Ł | L | "Łódź" → L |
| Æ | AE | "Ærø" → AE |
| Œ | OE | "Œuvre" → OE |
| ß | SS | "Straße" → SS |
### Implementation
```python
import unicodedata
def normalize_diacritics(text: str) -> str:
    """
    Normalize diacritics to ASCII equivalents.

    Examples:
        "Č" → "C"
        "Ř" → "R"
        "Ö" → "O"
        "ñ" → "n"
        "Ł" → "L"
        "ß" → "ss"

    Note: Ł, Ø, Æ, Œ and ß are standalone code points, NOT base-letter +
    combining-mark sequences, so NFD decomposition alone leaves them
    unchanged. They are translated explicitly first, per the
    normalization table above.
    """
    # Step 1: map special letters that NFD cannot decompose
    special = str.maketrans({
        'Ł': 'L', 'ł': 'l',
        'Ø': 'O', 'ø': 'o',
        'Æ': 'AE', 'æ': 'ae',
        'Œ': 'OE', 'œ': 'oe',
        'ß': 'ss',
    })
    text = text.translate(special)
    # Step 2: NFD decomposition separates base characters from combining marks
    normalized = unicodedata.normalize('NFD', text)
    # Step 3: remove combining marks (category 'Mn' = Mark, Nonspacing)
    ascii_text = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    return ascii_text
# Example
normalize_diacritics("VHSPAOČRZS") # Returns "VHSPAOCRZS"
```
### Languages Commonly Affected
| Language | Common Diacritics | Example Institution |
|----------|-------------------|---------------------|
| **Czech** | Č, Ř, Š, Ž, Ě, Ů | Český rozhlas → CR (not ČR) |
| **Polish** | Ł, Ń, Ó, Ś, Ź, Ż, Ą, Ę | Biblioteka Łódzka → BL |
| **German** | Ä, Ö, Ü, ß | Österreichische Nationalbibliothek → ON |
| **French** | É, È, Ê, Ç, Ô | Bibliothèque nationale → BN |
| **Spanish** | Ñ, Á, É, Í, Ó, Ú | Museo Nacional → MN |
| **Portuguese** | Ã, Õ, Ç, Á, É | Biblioteca Nacional → BN |
| **Nordic** | Å, Ä, Ö, Ø, Æ | Nationalmuseet → N |
| **Turkish** | Ç, Ğ, İ, Ö, Ş, Ü | İstanbul Üniversitesi → IU |
| **Hungarian** | Á, É, Í, Ó, Ö, Ő, Ú, Ü, Ű | Országos Levéltár → OL |
| **Romanian** | Ă, Â, Î, Ș, Ț | Biblioteca Națională → BN |
---
## Rule 2: Special Characters MUST Be Removed (ABBREV-SPECIAL-CHAR)
---
## Rationale
### 1. URL/URI Safety
Special characters require percent-encoding in URIs. For example:
- `&` becomes `%26`
- `+` becomes `%2B`
This makes identifiers harder to share, copy, and verify.
### 2. Filename Safety
Many special characters are invalid in filenames across operating systems:
- Windows: `\ / : * ? " < > |`
- macOS/Linux: `/` and null bytes
Files like `SX-XX-PHI-O-DR&IMSM.yaml` may cause issues on some systems.
### 3. Parsing Consistency
Special characters can conflict with delimiters in data pipelines:
- `&` is used in query strings
- `:` is used in YAML, JSON
- `/` is a path separator
- `|` is a common CSV delimiter alternative
### 4. Cross-System Compatibility
Identifiers should work across all systems:
- Databases (SQL, TypeDB, Neo4j)
- RDF/SPARQL endpoints
- REST APIs
- Command-line tools
- Spreadsheets
### 5. Human Readability
Clean identifiers are easier to:
- Communicate verbally
- Type correctly
- Proofread
- Remember
---
## Characters to Remove
The following characters MUST be completely removed (not replaced) when generating abbreviations:
| Character | Name | Example Issue |
|-----------|------|---------------|
| `&` | Ampersand | "R&A" in URLs, HTML entities |
| `/` | Slash | Path separator confusion |
| `\` | Backslash | Escape sequence issues |
| `+` | Plus | URL encoding (`+` = space) |
| `@` | At sign | Email/handle confusion |
| `#` | Hash/Pound | Fragment identifier in URLs |
| `%` | Percent | URL encoding prefix |
| `$` | Dollar | Variable prefix in shells |
| `*` | Asterisk | Glob/wildcard character |
| `(` `)` | Parentheses | Grouping in regex, code |
| `[` `]` | Square brackets | Array notation |
| `{` `}` | Curly braces | Object notation |
| `\|` | Pipe | Command chaining, OR operator |
| `:` | Colon | YAML key-value, namespace separator |
| `;` | Semicolon | Statement terminator |
| `"` `'` `` ` `` | Quotes | String delimiters |
| `,` | Comma | List separator |
| `.` | Period | File extension, namespace |
| `-` | Hyphen | Already used as GHCID component separator |
| `_` | Underscore | Reserved for name suffix in collisions |
| `=` | Equals | Assignment operator |
| `?` | Question mark | Query string indicator |
| `!` | Exclamation | Negation, shell history |
| `~` | Tilde | Home directory, bitwise NOT |
| `^` | Caret | Regex anchor, power operator |
| `<` `>` | Angle brackets | HTML tags, redirects |
---
## Implementation
### Algorithm
When extracting abbreviation from institution name:
```python
import re
import unicodedata
def extract_abbreviation_from_name(name: str, skip_words: set) -> str:
    """
    Extract abbreviation from institution name.

    Args:
        name: Full institution name (emic)
        skip_words: Set of prepositions/articles to skip (lowercase)

    Returns:
        Uppercase abbreviation with only A-Z characters (max 10 chars)
    """
    # Step 1a: transliterate letters that NFD cannot decompose (Ł, Ø, Æ, Œ, ß).
    # Without this, e.g. "Łódzka" loses its leading letter entirely, because
    # Ł survives NFD and is then stripped as a non-ASCII character.
    name = name.translate(str.maketrans({
        'Ł': 'L', 'ł': 'l',
        'Ø': 'O', 'ø': 'o',
        'Æ': 'AE', 'æ': 'ae',
        'Œ': 'OE', 'œ': 'oe',
        'ß': 'ss',
    }))
    # Step 1b: normalize unicode (remove combining diacritic marks)
    normalized = unicodedata.normalize('NFD', name)
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    # Step 2: replace special characters with spaces (to split words)
    # This handles cases like "Records&Information" -> "Records Information"
    clean_name = re.sub(r'[^a-zA-Z\s]', ' ', ascii_name)
    # Step 3: split into words
    words = clean_name.split()
    # Step 4: filter out skip words (prepositions, articles)
    significant_words = [w for w in words if w.lower() not in skip_words]
    # Step 5: take first letter of each significant word
    abbreviation = ''.join(w[0].upper() for w in significant_words if w)
    # Step 6: limit to 10 characters
    return abbreviation[:10]
```
### Handling Special Cases
**Case 1: "Records & Information Management"**
1. Input: `"Records & Information Management"`
2. After special char removal: `"Records Information Management"`
3. After split: `["Records", "Information", "Management"]`
4. Abbreviation: `RIM`
**Case 2: "Art/Design Museum"**
1. Input: `"Art/Design Museum"`
2. After special char removal: `"Art Design Museum"`
3. After split: `["Art", "Design", "Museum"]`
4. Abbreviation: `ADM`
**Case 3: "Culture+"**
1. Input: `"Culture+"`
2. After special char removal: `"Culture"`
3. After split: `["Culture"]`
4. Abbreviation: `C`
---
## Examples
| Institution Name | Correct | Incorrect |
|------------------|---------|-----------|
| Department of Records & Information Management | DRIM | DR&IM |
| Art + Culture Center | ACC | A+CC |
| Museum/Gallery Amsterdam | MGA | M/GA |
| Heritage@Digital | HD | H@D |
| Archives (Historical) | AH | A(H) |
| Research & Development Institute | RDI | R&DI |
| Sint Maarten Records & Information | SMRI | SMR&I |
---
## Validation
### Check for Invalid Abbreviations
```bash
# Find GHCID files with special characters in abbreviation
find data/custodian -name "*.yaml" | xargs grep -l '[&+@#%$*|:;?!=~^<>]' | head -20
# Specifically check for & in filenames
find data/custodian -name "*&*.yaml"
```
### Programmatic Validation
```python
import re
def validate_abbreviation(abbrev: str) -> bool:
    """
    Validate that an abbreviation contains only uppercase letters A-Z.

    Uses re.fullmatch so the ENTIRE string must consist of A-Z.
    (re.match with a trailing '$' would wrongly accept a trailing
    newline, e.g. "ABC\n", since '$' also matches just before a
    final newline.)

    Returns True if valid, False if the string is empty or contains
    any other character (special characters, digits, lowercase,
    whitespace).
    """
    return re.fullmatch(r'[A-Z]+', abbrev) is not None
# Examples
validate_abbreviation("DRIMSM") # True - valid
validate_abbreviation("DR&IMSM") # False - contains &
validate_abbreviation("A+CC") # False - contains +
```
---
## Related Documentation
- `AGENTS.md` - Section "INSTITUTION ABBREVIATION: EMIC NAME FIRST-LETTER PROTOCOL"
- `schemas/20251121/linkml/modules/classes/CustodianName.yaml` - Schema description
- `rules/LEGAL_FORM_FILTER.md` - Related filtering rule for legal forms
- `docs/PERSISTENT_IDENTIFIERS.md` - GHCID specification
---
## Changelog
| Date | Change |
|------|--------|
| 2025-12-07 | Initial rule created after discovery of `&` in GHCID |
| 2025-12-08 | Added diacritics normalization rule |

View file

@ -0,0 +1,237 @@
# Enum-to-Class Principle: Single Source of Truth
**Rule ID**: ENUM-TO-CLASS
**Status**: ACTIVE
**Applies To**: Schema evolution decisions
**Version**: 1.0
**Last Updated**: 2025-12-06
---
## Core Principle
**Enums are TEMPORARY scaffolding. Once an enum is promoted to a class hierarchy, the enum MUST be deleted to maintain a Single Source of Truth.**
---
## Rationale
### The Problem: Dual Representation
When both an enum AND a class hierarchy exist for the same concept:
- **Data sync issues**: Enum values and class names can drift apart
- **Maintenance burden**: Changes must be made in two places
- **Developer confusion**: Which one should I use?
- **Validation conflicts**: Enum constraints vs class ranges may diverge
### The Solution: Single Source of Truth
- **Enums**: Use for simple, fixed value constraints (e.g., `DataTierEnum: TIER_1, TIER_2, TIER_3, TIER_4`)
- **Classes**: Use when the concept needs properties, relationships, or rich documentation
- **NEVER BOTH**: Once promoted to classes, DELETE the enum
---
## When to Promote Enum to Classes
**Promote when the concept needs**:
| Need | Enum Can Do? | Class Required? |
|------|-------------|-----------------|
| Fixed value constraint | Yes | Yes |
| Properties (e.g., `role_category`, `typical_domains`) | No | Yes |
| Rich description per value | Limited | Yes |
| Relationships to other entities | No | Yes |
| Inheritance hierarchy | No | Yes |
| Independent identity (URI) | Limited | Yes |
| Ontology class mapping (`class_uri`) | Via `meaning` | Native |
**Rule of thumb**: If you're adding detailed documentation to each enum value, or want to attach properties, it's time to promote to classes.
---
## Promotion Workflow
### Step 1: Create Class Hierarchy
```yaml
# modules/classes/StaffRole.yaml (base class)
StaffRole:
abstract: true
description: Base class for staff role categories
slots:
- role_id
- role_name
- role_category
- typical_domains
# modules/classes/StaffRoles.yaml (subclasses)
Curator:
is_a: StaffRole
description: Museum curator specializing in collection research...
Conservator:
is_a: StaffRole
description: Conservator specializing in preservation...
```
### Step 2: Update Slot Ranges
```yaml
# BEFORE (enum)
staff_role:
range: StaffRoleTypeEnum
# AFTER (class)
staff_role:
range: StaffRole
```
### Step 3: Update Modular Schema Imports
```yaml
# REMOVE enum import
# - modules/enums/StaffRoleTypeEnum # DELETED
# ADD class imports
- modules/classes/StaffRole
- modules/classes/StaffRoles
```
### Step 4: Archive the Enum
```bash
mkdir -p schemas/.../archive/enums
mv modules/enums/OldEnum.yaml archive/enums/OldEnum.yaml.archived_$(date +%Y%m%d)
```
### Step 5: Document the Change
- Update `archive/enums/README.md` with migration entry
- Add comment in modular schema explaining removal
- Update any documentation referencing the old enum
---
## Example: StaffRoleTypeEnum to StaffRole
**Before** (2025-12-05):
```yaml
# StaffRoleTypeEnum.yaml
StaffRoleTypeEnum:
permissible_values:
CURATOR:
description: Museum curator
CONSERVATOR:
description: Conservator
# ... 51 values with limited documentation
```
**After** (2025-12-06):
```yaml
# StaffRole.yaml (abstract base)
StaffRole:
abstract: true
slots:
- role_id
- role_name
- role_category
- typical_domains
- typical_responsibilities
- requires_qualification
# StaffRoles.yaml (51 subclasses)
Curator:
is_a: StaffRole
class_uri: schema:curator
description: |
Museum curator specializing in collection research...
**IMPORTANT - FORMAL TITLE vs DE FACTO WORK**:
This is the OFFICIAL job appellation/title. Actual work may differ.
slot_usage:
role_category:
equals_string: CURATORIAL
typical_domains:
equals_expression: "[Museums, Galleries]"
```
**Why the promotion?**
1. Need to distinguish FORMAL TITLE from DE FACTO WORK
2. Each role has `role_category`, `common_variants`, `typical_domains`, `typical_responsibilities`
3. Roles benefit from inheritance (`Curator is_a StaffRole`)
4. Richer documentation per role
---
## Enums That Should REMAIN Enums
Some enums are appropriate as permanent fixtures:
| Enum | Why Keep as Enum |
|------|------------------|
| `DataTierEnum` | Simple 4-value tier (TIER_1 through TIER_4), no properties needed |
| `DataSourceEnum` | Fixed source types, simple strings |
| `CountryCodeEnum` | ISO 3166-1 standard, no custom properties |
| `LanguageCodeEnum` | ISO 639 standard, no custom properties |
**Characteristics of "permanent" enums**:
- Based on external standards (ISO, etc.)
- Simple values with no need for properties
- Unlikely to require rich per-value documentation
- Used purely for validation/constraint
---
## Anti-Patterns
### WRONG: Keep Both Enum and Classes
```yaml
# modules/enums/StaffRoleTypeEnum.yaml # Still exists!
# modules/classes/StaffRole.yaml # Also exists!
# Which one is authoritative? CONFUSION!
```
### WRONG: Create Classes but Keep Enum "for backwards compatibility"
```yaml
# "Let's keep the enum for old code"
# Result: Two sources of truth, guaranteed drift
```
### CORRECT: Delete Enum After Creating Classes
```yaml
# modules/enums/StaffRoleTypeEnum.yaml # ARCHIVED
# modules/classes/StaffRole.yaml # Single source of truth
# modules/classes/StaffRoles.yaml # All 51 role subclasses
```
---
## Verification Checklist
After promoting an enum to classes:
- [ ] Old enum file moved to `archive/enums/`
- [ ] Modular schema import removed for enum
- [ ] Modular schema import added for new class(es)
- [ ] All slot ranges updated from enum to class
- [ ] No grep results for old enum name in active schema files
- [ ] `archive/enums/README.md` updated with migration entry
- [ ] Comment added in modular schema explaining removal
```bash
# Verify enum is fully removed (should return only archive hits)
grep -r "StaffRoleTypeEnum" schemas/20251121/linkml/
```
---
## See Also
- `docs/ENUM_CLASS_SINGLE_SOURCE.md` - Extended documentation
- `schemas/20251121/linkml/archive/enums/README.md` - Archive directory
- LinkML documentation on enums: https://linkml.io/linkml/schemas/enums.html
- LinkML documentation on classes: https://linkml.io/linkml/schemas/models.html

View file

@ -0,0 +1,436 @@
# GeoNames Settlement Standardization Rules
**Rule ID**: GEONAMES-SETTLEMENT
**Status**: MANDATORY
**Applies To**: GHCID settlement component generation
**Version**: 1.1.0
**Effective Date**: 2025-12-01
**Last Updated**: 2025-12-01
---
## Purpose
This document defines the rules for standardizing settlement names in GHCID (Global Heritage Custodian Identifier) generation using the GeoNames geographical database.
## Core Principle
**ALL settlement names in GHCID must be derived from GeoNames standardized names, not from source data.**
The GeoNames database serves as the **single source of truth** for:
- Settlement names (cities, towns, villages)
- Settlement abbreviations/codes
- Administrative region codes (admin1)
- Geographic coordinates validation
## Why GeoNames Standardization?
1. **Consistency**: Same settlement = same GHCID component, regardless of source data variations
2. **Disambiguation**: Handles duplicate city names across regions
3. **Internationalization**: Provides ASCII-safe names for identifiers
4. **Authority**: GeoNames is a well-maintained, CC-licensed geographic database
5. **Persistence**: Settlement names don't change frequently, ensuring GHCID stability
---
## CRITICAL: Feature Code Filtering
**NEVER use neighborhoods or districts (PPLX) for GHCID generation. ONLY use proper settlements (cities, towns, villages).**
GeoNames classifies populated places with feature codes. When reverse geocoding coordinates to find a settlement, you MUST filter by feature code.
### ALLOWED Feature Codes
| Code | Description | Example |
|------|-------------|---------|
| **PPL** | Populated place (city/town/village) | Apeldoorn, Hamont, Lelystad |
| **PPLA** | Seat of first-order admin division | Provincial capitals |
| **PPLA2** | Seat of second-order admin division | Municipal seats |
| **PPLA3** | Seat of third-order admin division | District seats |
| **PPLA4** | Seat of fourth-order admin division | Sub-district seats |
| **PPLC** | Capital of a political entity | Amsterdam, Brussels |
| **PPLS** | Populated places (multiple) | Settlement clusters |
| **PPLG** | Seat of government | The Hague |
### EXCLUDED Feature Codes
| Code | Description | Why Excluded |
|------|-------------|--------------|
| **PPLX** | Section of populated place | Neighborhoods, districts, quarters (e.g., "Binnenstad", "Amsterdam Binnenstad") |
### Implementation
```python
VALID_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
query = """
SELECT name, feature_code, geonames_id, ...
FROM cities
WHERE country_code = ?
AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
ORDER BY distance_sq
LIMIT 1
"""
cursor.execute(query, (country_code, *VALID_FEATURE_CODES))
```
### Verification
Always check `feature_code` in location_resolution metadata:
```yaml
location_resolution:
geonames_name: Apeldoorn
feature_code: PPL # ← MUST be PPL, PPLA*, PPLC, PPLS, or PPLG
```
**If you see `feature_code: PPLX`**, the GHCID is WRONG and must be regenerated.
---
## CRITICAL: Country Code Detection
**Determine country code from entry data BEFORE calling GeoNames reverse geocoding.**
GeoNames queries are country-specific. Using the wrong country code will return incorrect results.
### Country Code Resolution Priority
1. `zcbs_enrichment.country` - Most explicit source
2. `location.country` - Direct location field
3. `locations[].country` - Array location field
4. `original_entry.country` - CSV source field
5. `google_maps_enrichment.address` - Parse from address string
6. `wikidata_enrichment.located_in.label` - Infer from Wikidata
7. Default: `"NL"` (Netherlands) - Only if no other source
### Example
```python
# Determine country code FIRST
country_code = "NL" # Default
if entry.get('zcbs_enrichment', {}).get('country'):
country_code = entry['zcbs_enrichment']['country']
elif entry.get('google_maps_enrichment', {}).get('address', ''):
address = entry['google_maps_enrichment']['address']
if ', Belgium' in address:
country_code = "BE"
elif ', Germany' in address:
country_code = "DE"
# THEN call reverse geocoding
result = reverse_geocode_to_city(latitude, longitude, country_code)
```
---
## Settlement Resolution Process
### Step 1: Coordinate-Based Resolution (Preferred)
When coordinates are available, use reverse geocoding to find the nearest GeoNames settlement:
```python
def resolve_settlement_from_coordinates(latitude: float, longitude: float, country_code: str = "NL") -> dict:
"""
Find the GeoNames settlement nearest to given coordinates.
Returns:
{
'settlement_name': 'Lelystad', # GeoNames standardized name
'settlement_code': 'LEL', # 3-letter abbreviation
'admin1_code': '16', # GeoNames admin1 code
'region_code': 'FL', # ISO 3166-2 region code
'geonames_id': 2751792, # GeoNames ID for provenance
'distance_km': 0.5 # Distance from coords to settlement center
}
"""
```
### Step 2: Name-Based Resolution (Fallback)
When only a settlement name is available (no coordinates), look up in GeoNames:
```python
def resolve_settlement_from_name(name: str, country_code: str = "NL") -> dict:
"""
Find the GeoNames settlement matching the given name.
Uses fuzzy matching and disambiguation when multiple matches exist.
"""
```
### Step 3: Manual Resolution (Last Resort)
If GeoNames lookup fails, flag the entry for manual review with:
- `settlement_source: MANUAL`
- `settlement_needs_review: true`
---
## GHCID Settlement Component Rules
### Format
The settlement component in GHCID uses a **3-letter uppercase code**:
```
NL-{REGION}-{SETTLEMENT}-{TYPE}-{ABBREV}
^^^^^^^^^^^
3-letter code from GeoNames
```
### Code Generation Rules
1. **Single-word settlements**: First 3 letters uppercase
- `Amsterdam` → `AMS`
- `Rotterdam` → `ROT`
- `Lelystad` → `LEL`
2. **Settlements with Dutch articles** (`de`, `het`, `den`, `'s`):
- First letter of article + first 2 letters of main word
- `Den Haag` → `DHA`
- `'s-Hertogenbosch` → `SHE`
- `De Bilt` → `DBI`
3. **Multi-word settlements** (no article):
- First letter of first word + first 2 letters of second word
- `Nieuw Amsterdam` → `NAM`
- `Oud Beijerland` → `OBE`
4. **GeoNames Disambiguation Database**:
- For known problematic settlements, use pre-defined codes from disambiguation table
- Example: Both `Zwolle` (OV) and `Zwolle` (LI) exist - use `ZWO` with region for uniqueness
### Measurement Point for Historical Custodians
**Rule**: For heritage custodians that no longer exist or have historical coordinates, the **modern-day settlement** (as of 2025-12-01) is used.
Rationale:
- GHCIDs should be stable over time
- Historical place names may have changed
- Modern settlements are easier to verify and look up
- GeoNames reflects current geographic reality
Example:
- A museum that operated 1900-1950 in what was then "Nieuw Land" (before Flevoland province existed)
- Modern coordinates fall within Lelystad municipality
- GHCID uses `LEL` (Lelystad) as settlement code, not historical name
---
## GeoNames Database Integration
### Database Location
```
/data/reference/geonames.db
```
### Required Tables
```sql
-- Cities/settlements table
CREATE TABLE cities (
geonames_id INTEGER PRIMARY KEY,
name TEXT, -- Local name (may have diacritics)
ascii_name TEXT, -- ASCII-safe name for identifiers
country_code TEXT, -- ISO 3166-1 alpha-2
admin1_code TEXT, -- First-level administrative division
admin1_name TEXT, -- Region/province name
latitude REAL,
longitude REAL,
population INTEGER,
feature_code TEXT -- PPL, PPLA, PPLC, etc.
);
-- Disambiguation table for problematic settlements
CREATE TABLE settlement_codes (
geonames_id INTEGER PRIMARY KEY,
country_code TEXT,
settlement_code TEXT, -- 3-letter code
is_primary BOOLEAN, -- Primary code for this settlement
notes TEXT
);
```
### Admin1 Code Mapping (Netherlands)
**IMPORTANT**: GeoNames admin1 codes differ from historical numbering. Use this mapping:
| GeoNames admin1 | Province | ISO 3166-2 |
|-----------------|----------|------------|
| 01 | Drenthe | NL-DR |
| 02 | Friesland | NL-FR |
| 03 | Gelderland | NL-GE |
| 04 | Groningen | NL-GR |
| 05 | Limburg | NL-LI |
| 06 | Noord-Brabant | NL-NB |
| 07 | Noord-Holland | NL-NH |
| 09 | Utrecht | NL-UT |
| 10 | Zeeland | NL-ZE |
| 11 | Zuid-Holland | NL-ZH |
| 15 | Overijssel | NL-OV |
| 16 | Flevoland | NL-FL |
**Note**: Code 08 is not used in Netherlands (was assigned to former region).
---
## Validation Requirements
### Before GHCID Generation
Every entry MUST have:
- [ ] Settlement name resolved via GeoNames
- [ ] `geonames_id` recorded in entry metadata
- [ ] Settlement code (3-letter) generated consistently
- [ ] Admin1/region code mapped correctly
### Provenance Tracking
Record GeoNames resolution in entry metadata:
```yaml
location_resolution:
method: REVERSE_GEOCODE # or NAME_LOOKUP or MANUAL
geonames_id: 2751792
geonames_name: Lelystad
settlement_code: LEL
admin1_code: "16"
region_code: FL
resolution_date: "2025-12-01T00:00:00Z"
source_coordinates:
latitude: 52.52111
longitude: 5.43722
distance_to_settlement_km: 0.5
```
---
## CRITICAL: XXX Placeholders Are TEMPORARY - Research Required
**XXX placeholders for region/settlement codes are NEVER acceptable as a final state.**
When an entry has `XX` (unknown region) or `XXX` (unknown settlement), the agent MUST conduct research to resolve the location.
### Resolution Strategy by Institution Type
| Institution Type | Location Resolution Method |
|------------------|---------------------------|
| **Destroyed institution** | Use last known physical location before destruction |
| **Historical (closed)** | Use last operating location |
| **Refugee/diaspora org** | Use current headquarters OR original founding location |
| **Digital-only platform** | Use parent/founding organization's headquarters |
| **Decentralized initiative** | Use founding location or primary organizer location |
| **Unknown city, known country** | Research via Wikidata, Google Maps, official website |
### Research Sources (Priority Order)
1. **Wikidata** - P131 (located in), P159 (headquarters location), P625 (coordinates)
2. **Google Maps** - Search institution name
3. **Official Website** - Contact page, about page
4. **Web Archive** - archive.org for destroyed/closed institutions
5. **Academic Sources** - Papers, reports
6. **News Articles** - Particularly for destroyed heritage sites
### Location Resolution Metadata
When resolving XXX placeholders, update `location_resolution`:
```yaml
location_resolution:
method: MANUAL_RESEARCH # Previously was NAME_LOOKUP with XXX
country_code: PS
region_code: GZ
region_name: Gaza Strip
city_code: GAZ
city_name: Gaza City
geonames_id: 281133
research_date: "2025-12-06T00:00:00Z"
research_sources:
- type: wikidata
id: Q123456
claim: P131
- type: web_archive
url: https://web.archive.org/web/20231001/https://institution-website.org/contact
notes: "Located in Gaza City prior to destruction in 2024"
```
### File Renaming After Resolution
When GHCID changes due to XXX resolution, the file MUST be renamed:
```bash
# Before
data/custodian/PS-XX-XXX-A-NAPR.yaml
# After
data/custodian/PS-GZ-GAZ-A-NAPR.yaml
```
### Prohibited Practices
- ❌ Leaving XXX placeholders in production data
- ❌ Using "Online" or country name as location
- ❌ Skipping research because it's difficult
- ❌ Using XX/XXX for diaspora organizations
---
## Error Handling
### No GeoNames Match
If a settlement cannot be resolved via automated lookup:
1. Log warning with entry details
2. Set `settlement_code: XXX` (temporary placeholder)
3. Set `settlement_needs_review: true`
4. Do NOT skip the entry - generate GHCID with XXX placeholder
5. **IMMEDIATELY** begin manual research to resolve
### Multiple GeoNames Matches
When multiple settlements match a name:
1. Use coordinates to disambiguate (if available)
2. Use admin1/region context (if available)
3. Use population as tiebreaker (prefer larger settlement)
4. Flag for manual review if still ambiguous
### Coordinates Outside Country
If coordinates fall outside the expected country:
1. Log warning
2. Use nearest settlement within country
3. Flag for manual review
---
## Related Documentation
- `AGENTS.md` - Section on GHCID generation
- `docs/PERSISTENT_IDENTIFIERS.md` - Complete GHCID specification
- `docs/GHCID_PID_SCHEME.md` - PID scheme details
- `scripts/enrich_nde_entries_ghcid.py` - Implementation
---
## Changelog
### v1.1.0 (2025-12-01)
- **CRITICAL**: Added feature code filtering rules
- MUST filter for PPL, PPLA, PPLA2, PPLA3, PPLA4, PPLC, PPLS, PPLG
- MUST exclude PPLX (neighborhoods/districts)
- Example: Apeldoorn (PPL) not "Binnenstad" (PPLX)
- **CRITICAL**: Added country code detection rules
- Must determine country from entry data BEFORE reverse geocoding
- Priority: zcbs_enrichment.country > location.country > address parsing
- Example: Belgian institutions use BE, not NL
- Added Belgium admin1 code mapping (BRU, VLG, WAL)
### v1.0.0 (2025-12-01)
- Initial version
- Established GeoNames as authoritative source for settlement standardization
- Defined measurement point rule for historical custodians
- Documented admin1 code mapping for Netherlands

View file

@ -0,0 +1,346 @@
# Legal Form Filtering Rule for CustodianName
**Rule ID**: LEGAL-FORM-FILTER
**Status**: MANDATORY
**Applies To**: CustodianName standardization
**Created**: 2025-12-02
---
## Overview
**CRITICAL RULE**: Legal form designations MUST ALWAYS be filtered from `CustodianName`, even when the custodian self-identifies with them.
This is the **ONE EXCEPTION** to the emic (insider name) principle in the Heritage Custodian Ontology.
## Rationale
### Why Legal Forms Are NOT Part of Identity
1. **Legal Form ≠ Identity**: The legal structure is administrative metadata, not the custodian's core identity
- "Stichting Rijksmuseum" → Identity is "Rijksmuseum", legal form is "Stichting"
2. **Legal Forms Change Over Time**: Organizations transform while identity persists
- Association → Foundation → Corporation (same museum, different legal structures)
3. **Cross-Jurisdictional Consistency**: Same organization may have different legal forms in different countries
- "Getty Foundation" (US) = "Stichting Getty" (NL) = same identity
4. **Deduplication**: Prevents false duplicates
- "Museum X" and "Stichting Museum X" should NOT be separate entities
5. **ISO 20275 Alignment**: The Legal Entity Identifier (LEI) standard explicitly separates legal form from entity name
### Where Legal Form IS Stored
Legal form information is NOT discarded - it is stored in appropriate metadata fields:
| Field | Location | Purpose |
|-------|----------|---------|
| `legal_form` | `CustodianLegalStatus` | ISO 20275 legal form code |
| `legal_name` | `CustodianLegalStatus` | Full registered name including legal form |
| `observed_name` | `CustodianObservation` | Original name as observed in source (may include legal form) |
## Examples
### Dutch Examples
| Source Name | CustodianName | Legal Form | Notes |
|-------------|---------------|------------|-------|
| Stichting Rijksmuseum | Rijksmuseum | Stichting | Prefix removal |
| Hidde Nijland Stichting | Hidde Nijland | Stichting | Suffix removal |
| Stichting Het Loo | Het Loo | Stichting | Preserve article "Het" |
| Coöperatie Erfgoed | Erfgoed | Coöperatie | |
| Vereniging Ons Huis | Ons Huis | Vereniging | |
| Museum B.V. | Museum | B.V. | |
### International Examples
| Source Name | CustodianName | Legal Form | Language |
|-------------|---------------|------------|----------|
| The Getty Foundation | The Getty | Foundation | English |
| British Museum Trust Ltd | British Museum | Trust Ltd | English |
| Smithsonian Institution Inc. | Smithsonian Institution | Inc. | English |
| Fundação Biblioteca Nacional | Biblioteca Nacional | Fundação | Portuguese |
| Verein Deutsches Museum | Deutsches Museum | Verein | German |
| Association des Amis du Louvre | Amis du Louvre | Association | French |
| Fondazione Musei Civici | Musei Civici | Fondazione | Italian |
| Fundación Museo del Prado | Museo del Prado | Fundación | Spanish |
---
## Global Legal Form Terms Reference
### Dutch (Netherlands, Belgium-Flanders)
**Foundations and Non-Profits:**
- Stichting (foundation)
- Vereniging (association)
- Coöperatie, Coöperatieve (cooperative)
**Business Entities:**
- B.V., BV (besloten vennootschap - private limited company)
- N.V., NV (naamloze vennootschap - public limited company)
- V.O.F., VOF (vennootschap onder firma - general partnership)
- C.V., CV (commanditaire vennootschap - limited partnership)
- Maatschap (partnership)
- Eenmanszaak (sole proprietorship)
### English (UK, US, Ireland, Australia, etc.)
**Foundations and Non-Profits:**
- Foundation
- Trust
- Association
- Society
- Institute
- Institution (when followed by Inc./Ltd.)
- Charity
- Fund
**Business Entities:**
- Inc., Incorporated
- Ltd., Limited
- LLC, L.L.C. (limited liability company)
- LLP, L.L.P. (limited liability partnership)
- Corp., Corporation
- Co., Company
- PLC, plc (public limited company - UK)
- Pty Ltd (proprietary limited - Australia)
### German (Germany, Austria, Switzerland)
**Foundations and Non-Profits:**
- Stiftung (foundation)
- Verein (association)
- e.V., eingetragener Verein (registered association)
- gGmbH (gemeinnützige GmbH - charitable limited company)
**Business Entities:**
- GmbH (Gesellschaft mit beschränkter Haftung - limited liability company)
- AG (Aktiengesellschaft - stock corporation)
- KG (Kommanditgesellschaft - limited partnership)
- OHG (offene Handelsgesellschaft - general partnership)
- GmbH & Co. KG
- UG (Unternehmergesellschaft - mini-GmbH)
### French (France, Belgium-Wallonia, Switzerland, Canada-Quebec)
**Foundations and Non-Profits:**
- Fondation (foundation)
- Association (association)
- Fonds (fund)
**Business Entities:**
- S.A., SA (société anonyme - public limited company)
- S.A.R.L., SARL (société à responsabilité limitée - private limited company)
- S.A.S., SAS (société par actions simplifiée)
- S.C.I., SCI (société civile immobilière)
- S.N.C., SNC (société en nom collectif - general partnership)
- S.C.S., SCS (société en commandite simple)
- EURL (entreprise unipersonnelle à responsabilité limitée)
### Spanish (Spain, Latin America)
**Foundations and Non-Profits:**
- Fundación (foundation)
- Asociación (association)
- Sociedad (society) - when not followed by commercial designator
**Business Entities:**
- S.A., SA (sociedad anónima - public limited company)
- S.L., SL (sociedad limitada - private limited company)
- S.L.L., SLL (sociedad limitada laboral)
- S.Coop. (sociedad cooperativa)
- S.C., SC (sociedad colectiva - general partnership)
- S.Com., S. en C. (sociedad en comandita)
### Portuguese (Portugal, Brazil)
**Foundations and Non-Profits:**
- Fundação (foundation)
- Associação (association)
- Instituto (institute)
**Business Entities:**
- Ltda., Limitada (limited liability company)
- S.A., SA (sociedade anônima - corporation)
- S/A
- Cia., Companhia (company)
- ME (microempresa)
- EPP (empresa de pequeno porte)
### Italian (Italy, Switzerland-Ticino)
**Foundations and Non-Profits:**
- Fondazione (foundation)
- Associazione (association)
- Ente (entity/institution)
- Onlus (non-profit organization)
**Business Entities:**
- S.p.A., SpA (società per azioni - joint-stock company)
- S.r.l., Srl (società a responsabilità limitata - limited liability company)
- S.a.s., Sas (società in accomandita semplice)
- S.n.c., Snc (società in nome collettivo)
- S.c.a.r.l. (società cooperativa a responsabilità limitata)
### Scandinavian Languages
**Danish:**
- Fond (foundation)
- Forening (association)
- A/S (aktieselskab - public limited company)
- ApS (anpartsselskab - private limited company)
**Swedish:**
- Stiftelse (foundation)
- Förening (association)
- AB (aktiebolag - limited company)
**Norwegian:**
- Stiftelse (foundation)
- Forening (association)
- AS (aksjeselskap - limited company)
- ASA (allmennaksjeselskap - public limited company)
### Other European Languages
**Polish:**
- Fundacja (foundation)
- Stowarzyszenie (association)
- Sp. z o.o. (limited liability company)
- S.A. (joint-stock company)
**Czech:**
- Nadace (foundation)
- Spolek (association)
- s.r.o. (limited liability company)
- a.s. (joint-stock company)
**Hungarian:**
- Alapítvány (foundation)
- Egyesület (association)
- Kft. (limited liability company)
- Zrt. (private limited company)
- Nyrt. (public limited company)
**Greek:**
- Ίδρυμα (Idryma - foundation)
- Σύλλογος (Syllogos - association)
- Α.Ε., ΑΕ (Ανώνυμη Εταιρεία - corporation)
- Ε.Π.Ε., ΕΠΕ (limited liability company)
**Finnish:**
- Säätiö (foundation)
- Yhdistys (association)
- Oy (osakeyhtiö - limited company)
- Oyj (public limited company)
### Asian Languages
**Japanese:**
- 財団法人 (zaidan hōjin - incorporated foundation)
- 社団法人 (shadan hōjin - incorporated association)
- 株式会社, K.K. (kabushiki kaisha - corporation)
- 合同会社, G.K. (gōdō kaisha - LLC)
- 有限会社, Y.K. (yūgen kaisha - limited company)
**Chinese:**
- 基金会 (jījīn huì - foundation)
- 协会 (xiéhuì - association)
- 有限公司 (yǒuxiàn gōngsī - limited company)
- 股份有限公司 (gǔfèn yǒuxiàn gōngsī - joint-stock company)
**Korean:**
- 재단법인 (jaedan beobin - incorporated foundation)
- 사단법인 (sadan beobin - incorporated association)
- 주식회사 (jusik hoesa - corporation)
- 유한회사 (yuhan hoesa - limited company)
### Middle Eastern Languages
**Arabic:**
- مؤسسة (mu'assasa - foundation/institution)
- جمعية (jam'iyya - association)
- شركة (sharika - company)
- ش.م.م (limited liability company)
- ش.م.ع (public joint-stock company)
**Hebrew:**
- עמותה (amuta - non-profit association)
- חל"צ (company for public benefit)
- בע"מ (limited company)
**Turkish:**
- Vakıf (foundation)
- Dernek (association)
- A.Ş. (anonim şirket - joint-stock company)
- Ltd. Şti. (limited şirket - limited company)
### Latin American Specific
**Brazilian Portuguese:**
- OSCIP (organização da sociedade civil de interesse público)
- ONG (organização não governamental)
- EIRELI (empresa individual de responsabilidade limitada)
**Mexican Spanish:**
- A.C. (asociación civil - civil association)
- S.C. (sociedad civil)
- S. de R.L. (sociedad de responsabilidad limitada)
---
## Implementation Guidelines
### Filtering Algorithm
```python
def filter_legal_form(name: str, language: str = None) -> tuple[str, str | None]:
"""
Remove legal form terms from custodian name.
Returns:
tuple: (filtered_name, legal_form_found)
"""
# Apply language-specific patterns first if language known
# Then apply universal patterns
# Handle both prefix and suffix positions
# Preserve articles (the, het, de, la, le, etc.)
pass
```
### Position Handling
Legal forms can appear as:
1. **Prefix**: "Stichting Rijksmuseum" → Remove "Stichting "
2. **Suffix**: "British Museum Trust Ltd" → Remove " Trust Ltd"
3. **Infix** (rare): Handle case-by-case
### Edge Cases
1. **Multiple legal forms**: "Foundation Trust Ltd" → Remove all
2. **Abbreviation variations**: "Inc." = "Inc" = "Incorporated"
3. **Case insensitivity**: "STICHTING" = "Stichting" = "stichting"
4. **With punctuation**: "B.V." = "BV" = "B.V"
5. **Compound terms**: "GmbH & Co. KG" → Remove entire compound
### Validation Script
Use `scripts/validate_organization_names.py` to detect names that still contain legal form terms after filtering.
---
## References
- ISO 20275:2017 - Financial services — Entity legal forms (ELF)
- GLEIF Legal Entity Identifier documentation
- LinkML Schema: `schemas/20251121/linkml/modules/classes/CustodianName.yaml`
- AGENTS.md: Rule 8 (Legal Form Filtering)
---
**Last Updated**: 2025-12-02
**Maintained By**: GLAM Heritage Custodian Ontology Project

View file

@ -0,0 +1,156 @@
# Value Standardization Rules
**Location**: `schemas/20251121/linkml/rules/`
**Purpose**: Data transformation and processing rules for achieving standardized values required by Heritage Custodian (HC) classes.
---
## About These Rules
These rules are **formally outside the LinkML schema convention** but document HOW data values are:
- Transformed
- Converted
- Processed
- Normalized
to achieve the standardized values required by particular HC classes.
**IMPORTANT**: These are NOT LinkML validation rules. They are **processing instructions** for data pipelines and extraction agents.
---
## Rule Categories
### 1. Name Standardization Rules
| Rule ID | File | Applies To | Summary |
|---------|------|------------|---------|
| **LEGAL-FORM-FILTER** | [`LEGAL_FORM_FILTER.md`](LEGAL_FORM_FILTER.md) | `CustodianName` | Remove legal form terms (Stichting, Foundation, Inc.) from emic names |
| **ABBREV-CHAR-FILTER** | [`ABBREVIATION_RULES.md`](ABBREVIATION_RULES.md) | GHCID abbreviation | Remove special characters (&, /, +, @) and normalize diacritics to ASCII |
| **TRANSLIT-ISO** | [`TRANSLITERATION.md`](TRANSLITERATION.md) | GHCID abbreviation | Transliterate non-Latin scripts (Cyrillic, CJK, Arabic) using ISO standards |
### 2. Geographic Standardization Rules
| Rule ID | File | Applies To | Summary |
|---------|------|------------|---------|
| **GEONAMES-SETTLEMENT** | [`GEONAMES_SETTLEMENT.md`](GEONAMES_SETTLEMENT.md) | Settlement codes | Use GeoNames as single source for settlement names |
| **FEATURE-CODE-FILTER** | [`GEONAMES_SETTLEMENT.md`](GEONAMES_SETTLEMENT.md) | Reverse geocoding | Only use PPL* feature codes, never PPLX (neighborhoods) |
### 3. Web Observation Rules
| Rule ID | File | Applies To | Summary |
|---------|------|------------|---------|
| **XPATH-PROVENANCE** | [`XPATH_PROVENANCE.md`](XPATH_PROVENANCE.md) | `WebClaim` | Every web claim MUST have XPath pointer to archived HTML |
### 4. Schema Evolution Rules
| Rule ID | File | Applies To | Summary |
|---------|------|------------|---------|
| **ENUM-TO-CLASS** | [`ENUM_TO_CLASS.md`](ENUM_TO_CLASS.md) | Enums/Classes | When enum promoted to class hierarchy, delete original enum |
---
## GLAMORCUBESFIXPHDNT Taxonomy Applicability
Each rule primarily applies to certain custodian types:
| Rule | Primary Types | All Types |
|------|--------------|-----------|
| LEGAL-FORM-FILTER | All | ✅ |
| ABBREV-CHAR-FILTER (special characters) | All | ✅ |
| ABBREV-CHAR-FILTER (diacritics) | All | ✅ |
| TRANSLIT-ISO | International (non-Latin script countries) | Partial |
| GEONAMES-SETTLEMENT | All | ✅ |
| XPATH-PROVENANCE | D (Digital platforms) | Partial |
---
## Integration with bronhouder.nl
These rules are displayed under a separate "Regels" (Rules) category on the bronhouder.nl LinkML visualization page, distinct from:
- Classes
- Slots
- Enums
- Instances
Each rule includes:
- Rule ID (short identifier)
- Applicable class(es)
- GLAMORCUBESFIXPHDNT type indicator
- Transformation examples
- Implementation code (Python)
---
## Rule Template
New rules should follow this template:
```markdown
# Rule Title
**Rule ID**: SHORT-ID
**Status**: MANDATORY | RECOMMENDED | OPTIONAL
**Applies To**: Class or slot name
**Created**: YYYY-MM-DD
**Updated**: YYYY-MM-DD
---
## Summary
One-paragraph summary of what this rule does.
---
## Rationale
Why this rule exists (numbered list of reasons).
---
## Specification
Detailed specification with examples.
---
## Implementation
Python code showing how to implement this rule.
---
## Examples
| Input | Output | Explanation |
|-------|--------|-------------|
---
## Related Rules
- Other related rules
---
## Changelog
| Date | Change |
|------|--------|
```
---
## File List
```
rules/
├── README.md # This file (rule index)
├── ABBREVIATION_RULES.md # ABBREV-CHAR-FILTER: Special char + diacritics normalization
├── LEGAL_FORM_FILTER.md # LEGAL-FORM-FILTER: Legal form removal from emic names
├── GEONAMES_SETTLEMENT.md # GEONAMES-SETTLEMENT: Geographic standardization via GeoNames
├── XPATH_PROVENANCE.md # XPATH-PROVENANCE: WebClaim XPath requirements
├── TRANSLITERATION.md # TRANSLIT-ISO: Non-Latin script transliteration
└── ENUM_TO_CLASS.md # ENUM-TO-CLASS: Schema evolution pattern
```

View file

@ -0,0 +1,337 @@
# Transliteration Standards for Non-Latin Scripts
**Rule ID**: TRANSLIT-ISO
**Status**: MANDATORY
**Applies To**: GHCID abbreviation generation from emic names in non-Latin scripts
**Created**: 2025-12-08
---
## Summary
**When generating GHCID abbreviations from institution names written in non-Latin scripts, the emic name MUST first be transliterated to Latin characters using the designated ISO or recognized standard for that script.**
This rule affects **170 institutions** across **21 languages** with non-Latin writing systems.
### Key Principles
1. **Emic name is preserved** - The original script is stored in `custodian_name.emic_name`
2. **Transliteration is for processing only** - Used to generate abbreviations
3. **ISO/recognized standards required** - No ad-hoc romanization
4. **Deterministic output** - Same input always produces same Latin output
5. **Existing GHCIDs grandfathered** - Only applies to NEW custodians
---
## Transliteration Standards by Script/Language
### Cyrillic Scripts
| Language | ISO Code | Standard | Library/Tool | Notes |
|----------|----------|----------|--------------|-------|
| **Russian** | ru | ISO 9:1995 | `transliterate` | Scientific transliteration |
| **Ukrainian** | uk | ISO 9:1995 | `transliterate` | Includes Ukrainian-specific letters |
| **Bulgarian** | bg | ISO 9:1995 | `transliterate` | Uses same Cyrillic base |
| **Serbian** | sr | ISO 9:1995 | `transliterate` | Serbian Cyrillic variant |
| **Kazakh** | kk | ISO 9:1995 | `transliterate` | Cyrillic-based (pre-2023) |
**Example**:
```
Input: Институт восточных рукописей РАН
ISO 9: Institut vostočnyh rukopisej RAN
Abbrev: IVRRAN (after diacritic normalization)
```
---
### CJK Scripts
#### Chinese (Hanzi)
| Variant | Standard | Library/Tool | Notes |
|---------|----------|--------------|-------|
| Simplified | Hanyu Pinyin (ISO 7098) | `pypinyin` | Standard PRC romanization |
| Traditional | Hanyu Pinyin | `pypinyin` | Same standard applies |
**Pinyin Rules**:
- Tone marks are OMITTED for abbreviation (diacritics removed anyway)
- Word boundaries follow natural spacing
- Proper nouns capitalized
**Example**:
```
Input: 东巴文化博物院
Pinyin: Dongba Wenhua Bowuyuan
ASCII: Dongba Wenhua Bowuyuan
Abbrev: DWB
```
#### Japanese (Kanji/Kana)
| Standard | Library/Tool | Notes |
|----------|--------------|-------|
| Modified Hepburn | `pykakasi`, `romkan` | Most widely used internationally |
**Hepburn Rules**:
- Long vowels: ō, ū (normalized to o, u for abbreviation)
- Particles: は → wa, を → o, へ → e
- Syllabic ん: romanized as n (written n' before vowels)
**Example**:
```
Input: 国立中央博物館
Romaji: Kokuritsu Chuo Hakubutsukan
ASCII: Kokuritsu Chuo Hakubutsukan
Abbrev: KCH
```
#### Korean (Hangul)
| Standard | Library/Tool | Notes |
|----------|--------------|-------|
| Revised Romanization (RR) | `korean-romanizer`, `hangul-romanize` | Official South Korean standard (2000) |
**RR Rules**:
- No diacritics (unlike McCune-Reischauer)
- Consonant assimilation reflected in spelling
- Word boundaries at natural breaks
**Example**:
```
Input: 독립기념관
RR: Dongnip Ginyeomgwan
Abbrev: DG
```
---
### Arabic Script
| Language | ISO Code | Standard | Library/Tool | Notes |
|----------|----------|----------|--------------|-------|
| **Arabic** | ar | ISO 233-2:1993 | `arabic-transliteration` | Simplified standard |
| **Persian/Farsi** | fa | ISO 233-3:1999 | `persian-transliteration` | Persian extensions |
| **Urdu** | ur | ISO 233-3 + Urdu extensions | `urdu-transliteration` | Additional characters |
**Example (Arabic)**:
```
Input: المكتبة الوطنية للمملكة المغربية
ISO: al-Maktaba al-Wataniya lil-Mamlaka al-Maghribiya
ASCII: al-Maktaba al-Wataniya lil-Mamlaka al-Maghribiya
Abbrev: MWMM (skip "al-" articles)
```
---
### Hebrew Script
| Standard | Library/Tool | Notes |
|----------|--------------|-------|
| ISO 259-3:1999 | `hebrew-transliteration` | Simplified romanization |
**Example**:
```
Input: ארכיון הסיפור העממי בישראל
ISO: Arkhiyon ha-Sipur ha-Amami be-Yisrael
ASCII: Arkhiyon ha-Sipur ha-Amami be-Yisrael
Abbrev: ASAY (skip "ha-" and "be-" articles)
```
---
### Greek Script
| Standard | Library/Tool | Notes |
|----------|--------------|-------|
| ISO 843:1997 | `greek-transliteration` | Romanization of Greek |
**Example**:
```
Input: Αρχαιολογικό Μουσείο Θεσσαλονίκης
ISO: Archaiologiko Mouseio Thessalonikis
ASCII: Archaiologiko Mouseio Thessalonikis
Abbrev: AMT
```
---
### Indic Scripts
| Language | Script | Standard | Library/Tool |
|----------|--------|----------|--------------|
| **Hindi** | Devanagari | ISO 15919 | `indic-transliteration` |
| **Bengali** | Bengali | ISO 15919 | `indic-transliteration` |
| **Nepali** | Devanagari | ISO 15919 | `indic-transliteration` |
| **Sinhala** | Sinhala | ISO 15919 | `indic-transliteration` |
**Example (Hindi)**:
```
Input: राजस्थान प्राच्यविद्या प्रतिष्ठान
ISO: Rajasthana Pracyavidya Pratishthana
ASCII: Rajasthana Pracyavidya Pratishthana
Abbrev: RPP
```
---
### Southeast Asian Scripts
| Language | Script | Standard | Library/Tool |
|----------|--------|----------|--------------|
| **Thai** | Thai | ISO 11940-2 | `thai-romanization` |
| **Khmer** | Khmer | ALA-LC | `khmer-romanization` |
**Thai Example**:
```
Input: สำนักหอจดหมายเหตุแห่งชาติ
ISO: Samnak Ho Chotmaihet Haeng Chat
Abbrev: SHCHC
```
---
### Other Scripts
| Language | Script | Standard | Library/Tool |
|----------|--------|----------|--------------|
| **Armenian** | Armenian | ISO 9985 | `armenian-transliteration` |
| **Georgian** | Georgian | ISO 9984 | `georgian-transliteration` |
**Georgian Example**:
```
Input: ხელნაწერთა ეროვნული ცენტრი
ISO: Khelnawerti Erovnuli Centri
ASCII: Khelnawerti Erovnuli Centri
Abbrev: KEC
```
---
## Implementation
### Python Transliteration Utility
```python
import unicodedata
from typing import Optional
def detect_script(text: str) -> str:
    """
    Detect the primary script of the input text.

    Scans characters left to right and returns the script of the first
    character that falls into a known non-Latin block; defaults to 'latin'.

    Returns one of: 'latin', 'cyrillic', 'chinese', 'japanese',
    'korean', 'arabic', 'hebrew', 'greek', 'devanagari', 'thai'.
    """
    # Ordered (script, start, end) ranges. The Japanese kana blocks come
    # before the CJK Unified Ideographs block so kana-bearing text is
    # reported as 'japanese' rather than 'chinese'. Kanji-only text is
    # indistinguishable from Chinese hanzi and still maps to 'chinese'.
    script_ranges = [
        ('cyrillic', 0x0400, 0x04FF),
        ('arabic', 0x0600, 0x06FF),
        ('hebrew', 0x0590, 0x05FF),
        ('devanagari', 0x0900, 0x097F),
        ('thai', 0x0E00, 0x0E7F),
        ('greek', 0x0370, 0x03FF),
        ('korean', 0xAC00, 0xD7AF),      # Hangul syllables
        ('japanese', 0x3040, 0x309F),    # Hiragana
        ('japanese', 0x30A0, 0x30FF),    # Katakana
        ('chinese', 0x4E00, 0x9FFF),     # CJK Unified Ideographs
    ]
    for char in text:
        code = ord(char)
        for script, start, end in script_ranges:
            if start <= code <= end:
                return script
    return 'latin'
def transliterate_for_abbreviation(emic_name: str, lang: str) -> str:
    """
    Transliterate an emic name for GHCID abbreviation generation.

    Args:
        emic_name: Institution name in original script
        lang: ISO 639-1 language code

    Returns:
        Transliterated name ready for abbreviation extraction
    """
    import re

    # Romanize first; the concrete backend depends on the script/language
    # (see the standards tables above for the designated ISO standards).
    romanized = transliterate(emic_name, lang)

    # Strip diacritics: decompose (NFD), then drop combining marks (Mn).
    decomposed = unicodedata.normalize('NFD', romanized)
    stripped = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Replace everything except ASCII letters and whitespace with spaces,
    # then collapse whitespace runs into single spaces.
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', stripped)
    return ' '.join(letters_only.split())
```
---
## Skip Words by Language
When extracting abbreviations from transliterated text, skip these articles/prepositions:
### Arabic
- `al-` (the definite article)
- `bi-`, `li-`, `fi-` (prepositions)
### Hebrew
- `ha-` (the)
- `ve-` (and)
- `be-`, `le-`, `me-` (prepositions)
### Persian
- `-e`, `-ye` (ezafe connector)
- `va` (and)
### CJK Languages
- No skip words (particles are integral to meaning)
### Indic Languages
- `ka`, `ki`, `ke` (Hindi: of)
- `aur` (Hindi: and)
---
## Validation
### Check Transliteration Output
```python
def validate_transliteration(result: str) -> bool:
    """
    Validate that transliteration output contains only ASCII letters and spaces.

    An empty string is rejected (no letters to abbreviate).
    """
    import re
    return re.fullmatch(r'[a-zA-Z\s]+', result) is not None
```
### Manual Review Queue
Non-Latin institutions should be flagged for manual review if:
1. Transliteration library not available for that script
2. Confidence in transliteration is low
3. Institution has multiple official romanizations
---
## Related Documentation
- `AGENTS.md` - Rule 12: Transliteration Standards
- `rules/ABBREVIATION_RULES.md` - Character filtering after transliteration
- `docs/TRANSLITERATION_CONVENTIONS.md` - Extended examples and edge cases
- `scripts/transliterate_emic_names.py` - Production transliteration script
---
## Changelog
| Date | Change |
|------|--------|
| 2025-12-08 | Initial standards document created |

View file

@ -0,0 +1,210 @@
# WebObservation XPath Provenance Rules
**Rule ID**: XPATH-PROVENANCE
**Status**: MANDATORY
**Applies To**: WebClaim extraction from websites
**Created**: 2025-11-29
---
## Core Principle: Every Claim MUST Have Verifiable Provenance
**If a claim allegedly came from a webpage, it MUST have an XPath pointer to the exact location in the archived HTML where that value appears. Claims without XPath provenance are considered FABRICATED and must be removed.**
This is not about "confidence" or "uncertainty" - it's about **verifiability**. Either the claim value exists in the HTML at a specific XPath, or it was hallucinated/fabricated by an LLM.
---
## Required Fields for WebObservation Claims
Every claim in `web_enrichment.claims` MUST have:
| Field | Required | Description |
|-------|----------|-------------|
| `claim_type` | YES | Type of claim (full_name, description, email, etc.) |
| `claim_value` | YES | The extracted value |
| `source_url` | YES | URL the claim was extracted from |
| `retrieved_on` | YES | ISO 8601 timestamp when page was archived |
| `xpath` | YES | XPath to the element containing this value |
| `html_file` | YES | Relative path to archived HTML file |
| `xpath_match_score` | YES | 1.0 for exact match, <1.0 for fuzzy match |
### Example - CORRECT (Verifiable)
```yaml
web_enrichment:
claims:
- claim_type: full_name
claim_value: Historische Vereniging Nijeveen
source_url: https://historischeverenigingnijeveen.nl/
retrieved_on: "2025-11-29T12:28:00Z"
xpath: /[document][1]/html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[6]
html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
xpath_match_score: 1.0
```
### Example - WRONG (Fabricated - Must Be Removed)
```yaml
web_enrichment:
claims:
- claim_type: full_name
claim_value: Historische Vereniging Nijeveen
confidence: 0.95 # ← NO! This is meaningless without XPath
```
---
## Forbidden: Confidence Scores Without XPath
**NEVER use arbitrary confidence scores for web-extracted claims.**
Confidence scores like `0.95`, `0.90`, `0.85` are meaningless because:
1. There is NO methodology defining what these numbers mean
2. They cannot be verified or reproduced
3. They give false impression of rigor
4. They mask the fact that claims may be fabricated
If a value appears in the HTML → `xpath_match_score: 1.0`
If a value does NOT appear in the HTML → **REMOVE THE CLAIM**
---
## Website Archiving Workflow
### Step 1: Archive the Website
Use Playwright to archive websites with JavaScript rendering:
```bash
python scripts/fetch_website_playwright.py <entry_number> <url>
# Example:
python scripts/fetch_website_playwright.py 0021 https://historischeverenigingnijeveen.nl/
```
This creates:
```
data/nde/enriched/entries/web/{entry_number}/{domain}/
├── index.html # Raw HTML as received
├── rendered.html # HTML after JS execution
├── content.md # Markdown conversion
└── metadata.yaml # XPath extractions for provenance
```
### Step 2: Add XPath Provenance to Claims
Run the XPath migration script:
```bash
python scripts/add_xpath_provenance.py
# Or for specific entries:
python scripts/add_xpath_provenance.py --entries 0021,0022,0023
```
This script:
1. Reads each entry's `web_enrichment.claims`
2. Searches archived HTML for each claim value
3. Adds `xpath` + `html_file` if found
4. **REMOVES claims that cannot be verified** (stores in `removed_unverified_claims`)
### Step 3: Audit Removed Claims
Check `removed_unverified_claims` in each entry file:
```yaml
removed_unverified_claims:
- claim_type: phone
claim_value: "+31 6 12345678"
reason: "Value not found in archived HTML - likely fabricated"
removed_on: "2025-11-29T14:30:00Z"
```
These claims were NOT in the HTML and should NOT be restored without proper sourcing.
---
## Claim Types and Expected Sources
| Claim Type | Expected Source | Notes |
|------------|-----------------|-------|
| `full_name` | Page title, heading, logo text | Usually in `<h1>`, `<title>`, or prominent `<div>` |
| `description` | Meta description, about text | Check `<meta name="description">` first |
| `email` | Contact page, footer | Often in `<a href="mailto:...">` |
| `phone` | Contact page, footer | May need normalization |
| `address` | Contact page, footer | Check for structured data too |
| `social_media` | Footer, contact page | Links to social platforms |
| `opening_hours` | Contact/visit page | May be in structured data |
---
## XPath Matching Strategy
The `add_xpath_provenance.py` script uses this matching strategy:
1. **Exact match**: Claim value appears exactly in element text
2. **Normalized match**: After whitespace normalization
3. **Substring match**: Claim value is substring of element text (score < 1.0)
Priority order for matching:
1. `rendered.html` (after JS execution) - preferred
2. `index.html` (raw HTML) - fallback
---
## Integration with LinkML Schema
The `WebClaim` class in the LinkML schema requires:
```yaml
# schemas/20251121/linkml/modules/classes/WebClaim.yaml
WebClaim:
slots:
- source_url # Required
- retrieved_on # Required (timestamp)
- xpath # Required for claims
- html_archive_path # Path to archived HTML
```
---
## Rules for AI Agents
### When Extracting Claims from Websites
1. **ALWAYS archive the website first** using Playwright
2. **ALWAYS extract claims with XPath provenance** using the archived HTML
3. **NEVER invent or infer claims** not present in the HTML
4. **NEVER use confidence scores** without XPath backing
### When Processing Existing Claims
1. **Verify each claim** against archived HTML
2. **Add XPath provenance** to verified claims
3. **REMOVE fabricated claims** that cannot be verified
4. **Document removed claims** in `removed_unverified_claims`
### When Reviewing Data Quality
1. Claims with `xpath` + `html_file` = **VERIFIED**
2. Claims with only `confidence` = **SUSPECT** (migrate or remove)
3. Claims in `removed_unverified_claims` = **FABRICATED** (do not restore)
---
## Scripts Reference
| Script | Purpose |
|--------|---------|
| `scripts/fetch_website_playwright.py` | Archive website with Playwright |
| `scripts/add_xpath_provenance.py` | Add XPath to claims, remove fabricated |
| `scripts/batch_fetch_websites.py` | Batch archive multiple entries |
---
## Version History
- **2025-11-29**: Initial version - established XPath provenance requirement
- Replaced confidence scores with verifiable XPath pointers
- Established policy of removing fabricated claims

View file

@ -25,6 +25,7 @@ import NDEMapPage from './pages/NDEMapPageMapLibre';
import NDEStatsPage from './pages/NDEStatsPage';
import ProjectPlanPage from './pages/ProjectPlanPage';
import OverviewPage from './pages/OverviewPage';
import GesprekPage from './pages/GesprekPage';
import './App.css';
// Create router configuration with protected routes
@ -88,6 +89,10 @@ const router = createBrowserRouter([
path: 'overview',
element: <OverviewPage />,
},
{
path: 'gesprek',
element: <GesprekPage />,
},
],
},
]);

File diff suppressed because it is too large Load diff

View file

@ -18,6 +18,8 @@ import { useState, useCallback, useMemo } from 'react';
import { useQdrant } from '@/hooks/useQdrant';
import type { QdrantCollection, QdrantPoint } from '@/hooks/useQdrant';
import { useLanguage } from '@/contexts/LanguageContext';
import { EmbeddingProjector } from './EmbeddingProjector';
import type { EmbeddingPoint } from './EmbeddingProjector';
interface QdrantPanelProps {
compact?: boolean;
@ -85,65 +87,6 @@ const TEXT = {
dimensions: { nl: 'dimensies', en: 'dimensions' },
};
// Simple PCA implementation for initial visualization (UMAP/t-SNE would require additional libraries)
function computePCA(vectors: number[][], dimensions: number = 2): number[][] {
if (vectors.length === 0) return [];
const n = vectors.length;
const d = vectors[0].length;
// Center the data
const means = new Array(d).fill(0);
for (const vec of vectors) {
for (let i = 0; i < d; i++) {
means[i] += vec[i] / n;
}
}
const centered = vectors.map(vec => vec.map((v, i) => v - means[i]));
// Power iteration for top eigenvectors (simplified PCA)
const result: number[][] = [];
for (const vec of centered) {
// Simple projection using first `dimensions` components
const projected = vec.slice(0, dimensions);
result.push(projected);
}
// Normalize to [-1, 1] range
const mins = new Array(dimensions).fill(Infinity);
const maxs = new Array(dimensions).fill(-Infinity);
for (const point of result) {
for (let i = 0; i < dimensions; i++) {
mins[i] = Math.min(mins[i], point[i]);
maxs[i] = Math.max(maxs[i], point[i]);
}
}
return result.map(point =>
point.map((v, i) => {
const range = maxs[i] - mins[i];
return range > 0 ? ((v - mins[i]) / range) * 2 - 1 : 0;
})
);
}
// Color palette for categorical data
const COLORS = [
'#6366f1', '#8b5cf6', '#a855f7', '#d946ef', '#ec4899',
'#f43f5e', '#ef4444', '#f97316', '#f59e0b', '#eab308',
'#84cc16', '#22c55e', '#10b981', '#14b8a6', '#06b6d4',
'#0ea5e9', '#3b82f6', '#6366f1',
];
// Get color for a category
function getCategoryColor(value: string, categories: string[]): string {
const index = categories.indexOf(value);
return COLORS[index % COLORS.length];
}
// Get status icon
const getStatusIcon = (status: string): string => {
switch (status) {
@ -188,14 +131,21 @@ export function QdrantPanel({ compact = false }: QdrantPanelProps) {
const [isLoadingPoints, setIsLoadingPoints] = useState(false);
const [nextOffset, setNextOffset] = useState<string | number | null>(null);
// Visualization state
const [projectedPoints, setProjectedPoints] = useState<number[][]>([]);
const [projectionMethod, setProjectionMethod] = useState<'pca' | 'umap' | 'tsne'>('pca');
const [colorByField, setColorByField] = useState<string>('');
const [selectedPointIndex, setSelectedPointIndex] = useState<number | null>(null);
const [isComputing, setIsComputing] = useState(false);
// Visualization state (simplified - most logic moved to EmbeddingProjector)
const [colorByField, _setColorByField] = useState<string>('');
const [_selectedPointIndex, setSelectedPointIndex] = useState<number | null>(null);
// Extract unique payload fields for color coding
// Convert QdrantPoints to EmbeddingPoints for the projector
const embeddingPoints: EmbeddingPoint[] = useMemo(() => {
return points.map(p => ({
id: p.id,
vector: p.vector,
payload: p.payload,
}));
}, [points]);
// Extract unique payload fields for color coding (available for future use)
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const payloadFields = useMemo(() => {
const fields = new Set<string>();
for (const point of points) {
@ -205,19 +155,9 @@ export function QdrantPanel({ compact = false }: QdrantPanelProps) {
}
return Array.from(fields).sort();
}, [points]);
// Get unique values for selected field (for color legend)
const fieldCategories = useMemo(() => {
if (!colorByField) return [];
const values = new Set<string>();
for (const point of points) {
const value = point.payload[colorByField];
if (value !== undefined && value !== null) {
values.add(String(value));
}
}
return Array.from(values).slice(0, 20); // Limit to 20 categories
}, [points, colorByField]);
// Suppress unused variable warning - available for future features
void payloadFields;
// Load points from selected collection
const loadPoints = useCallback(async (append: boolean = false) => {
@ -244,33 +184,10 @@ export function QdrantPanel({ compact = false }: QdrantPanelProps) {
}
}, [selectedCollection, nextOffset, scrollPoints]);
// Compute 2D projection
const computeProjection = useCallback(() => {
if (points.length === 0) return;
setIsComputing(true);
// Use setTimeout to allow UI to update
setTimeout(() => {
const vectors = points.map(p => p.vector).filter(v => v.length > 0);
if (vectors.length === 0) {
setIsComputing(false);
return;
}
// For now, use PCA. UMAP/t-SNE would require additional libraries
const projected = computePCA(vectors, 2);
setProjectedPoints(projected);
setIsComputing(false);
}, 100);
}, [points]);
// Select a collection
const selectCollection = useCallback(async (collection: QdrantCollection) => {
setSelectedCollection(collection);
setPoints([]);
setProjectedPoints([]);
setSelectedPointIndex(null);
setExplorerView('data');
}, []);
@ -280,7 +197,6 @@ export function QdrantPanel({ compact = false }: QdrantPanelProps) {
setExplorerView('list');
setSelectedCollection(null);
setPoints([]);
setProjectedPoints([]);
setSelectedPointIndex(null);
};
@ -507,13 +423,8 @@ export function QdrantPanel({ compact = false }: QdrantPanelProps) {
{activeTab === 'visualize' && (
<div className="visualization-panel">
<div className="viz-header">
<h3>{t('embeddingVisualization')}</h3>
<p>{t('visualizationDescription')}</p>
</div>
{/* Controls */}
<div className="viz-controls">
{/* Collection selector for visualization */}
<div className="viz-collection-selector">
<div className="control-group">
<label>{t('collections')}:</label>
<select
@ -531,125 +442,61 @@ export function QdrantPanel({ compact = false }: QdrantPanelProps) {
))}
</select>
</div>
<div className="control-group">
<label>{t('projectionMethod')}:</label>
<select
value={projectionMethod}
onChange={(e) => setProjectionMethod(e.target.value as 'pca' | 'umap' | 'tsne')}
>
<option value="pca">{t('pca')}</option>
<option value="umap" disabled>{t('umap')} (coming soon)</option>
<option value="tsne" disabled>{t('tsne')} (coming soon)</option>
</select>
</div>
<div className="control-group">
<label>{t('colorBy')}:</label>
<select
value={colorByField}
onChange={(e) => setColorByField(e.target.value)}
>
<option value="">{t('noField')}</option>
{payloadFields.map(field => (
<option key={field} value={field}>{field}</option>
))}
</select>
</div>
<div className="control-actions">
<button
className="primary-button"
onClick={() => loadPoints(false)}
disabled={!selectedCollection || isLoadingPoints}
>
{isLoadingPoints ? t('loadingVectors') : t('loadVectors')}
</button>
<button
className="secondary-button"
onClick={computeProjection}
disabled={points.length === 0 || isComputing}
>
{isComputing ? t('computing') : t('computeProjection')}
</button>
</div>
</div>
{/* Visualization Canvas */}
<div className="viz-container">
{projectedPoints.length > 0 ? (
<div className="scatter-plot">
<svg viewBox="-1.2 -1.2 2.4 2.4" className="viz-svg">
{/* Grid lines */}
<line x1="-1" y1="0" x2="1" y2="0" stroke="#e2e8f0" strokeWidth="0.01" />
<line x1="0" y1="-1" x2="0" y2="1" stroke="#e2e8f0" strokeWidth="0.01" />
{/* Points */}
{projectedPoints.map((point, idx) => {
const payload = points[idx]?.payload || {};
const colorValue = colorByField ? String(payload[colorByField] ?? '') : '';
const color = colorByField && colorValue
? getCategoryColor(colorValue, fieldCategories)
: '#6366f1';
const isSelected = selectedPointIndex === idx;
return (
<circle
key={idx}
cx={point[0]}
cy={-point[1]} // Flip Y axis
r={isSelected ? 0.04 : 0.02}
fill={color}
opacity={isSelected ? 1 : 0.7}
stroke={isSelected ? '#1e293b' : 'none'}
strokeWidth="0.01"
className="viz-point"
onClick={() => setSelectedPointIndex(idx)}
/>
);
})}
</svg>
{/* Legend */}
{colorByField && fieldCategories.length > 0 && (
<div className="viz-legend">
<strong>{colorByField}:</strong>
{fieldCategories.map((value, idx) => (
<span key={value} className="legend-item">
<span
className="legend-color"
style={{ backgroundColor: COLORS[idx % COLORS.length] }}
/>
{value.length > 20 ? value.slice(0, 17) + '...' : value}
</span>
))}
</div>
{selectedCollection && (
<div className="viz-load-controls">
<button
className="primary-button"
onClick={() => loadPoints(false)}
disabled={isLoadingPoints}
>
{isLoadingPoints ? t('loadingVectors') : t('loadVectors')}
</button>
{points.length > 0 && (
<span className="loaded-count">
{points.length} {t('vectorsLoaded')}
</span>
)}
{nextOffset !== null && points.length > 0 && (
<button
className="secondary-button"
onClick={() => loadPoints(true)}
disabled={isLoadingPoints}
>
Load more
</button>
)}
</div>
) : (
<div className="viz-placeholder">
<p>{points.length > 0
? t('computeProjection')
: selectedCollection
? t('loadVectors')
: t('selectCollection')
}</p>
</div>
)}
</div>
{/* Selected Point Details */}
{selectedPointIndex !== null && points[selectedPointIndex] && (
<div className="selected-point-details">
<h4>{t('selectedPoint')}</h4>
<div className="point-info">
<div className="point-id">
<strong>{t('id')}:</strong> {String(points[selectedPointIndex].id)}
</div>
<div className="point-payload">
<strong>{t('payload')}:</strong>
<pre>{JSON.stringify(points[selectedPointIndex].payload, null, 2)}</pre>
</div>
{/* Embedding Projector */}
{points.length > 0 ? (
<EmbeddingProjector
points={embeddingPoints}
onPointSelect={(point) => {
if (point) {
const idx = points.findIndex(p => p.id === point.id);
setSelectedPointIndex(idx >= 0 ? idx : null);
} else {
setSelectedPointIndex(null);
}
}}
colorByField={colorByField}
height={600}
/>
) : (
<div className="viz-placeholder">
<div className="placeholder-content">
<span className="placeholder-icon"></span>
<h3>{t('embeddingVisualization')}</h3>
<p>{t('visualizationDescription')}</p>
<p className="placeholder-hint">
{selectedCollection
? t('loadVectors')
: t('selectCollection')
}
</p>
</div>
</div>
)}

View file

@ -10,3 +10,5 @@ export { PostgreSQLPanel } from './PostgreSQLPanel';
export { TypeDBPanel } from './TypeDBPanel';
export { OxigraphPanel } from './OxigraphPanel';
export { QdrantPanel } from './QdrantPanel';
export { EmbeddingProjector } from './EmbeddingProjector';
export type { EmbeddingPoint, ProjectedPoint, ProjectionMethod, ViewMode } from './EmbeddingProjector';

View file

@ -0,0 +1,434 @@
/**
* GesprekBarChart.tsx - D3 Bar Chart Visualization for Gesprek Page
*
* Features:
* - Horizontal and vertical bar charts
* - Animated transitions
* - Hover interactions with tooltips
* - Responsive sizing
* - NDE house style colors
*
* Uses D3.js v7 with React 19
*/
import React, { useRef, useEffect, useState, useMemo } from 'react';
import * as d3 from 'd3';
import type { ChartData } from '../../hooks/useMultiDatabaseRAG';
// NDE House Style Colors
const COLORS = {
primary: '#154273',
secondary: '#2E5A8B',
accent: '#3B82F6',
background: '#f8fafc',
text: '#1e293b',
textLight: '#64748b',
grid: '#e2e8f0',
barDefault: '#154273',
barHover: '#3B82F6',
};
// Default color palette for multiple datasets
const COLOR_PALETTE = [
'#154273', // Primary blue
'#ef4444', // Red (museum)
'#10b981', // Green (archive)
'#f59e0b', // Amber (gallery)
'#8b5cf6', // Purple (university)
'#ec4899', // Pink
'#06b6d4', // Cyan
'#84cc16', // Lime
];
/** Props for the GesprekBarChart D3 bar chart component. */
export interface GesprekBarChartProps {
  /** Labels and datasets to render. */
  data: ChartData;
  /** Outer SVG width in px (default 500). */
  width?: number;
  /** Outer SVG height in px (default 300). */
  height?: number;
  /** Bar direction (default 'vertical'); horizontal charts get a wider left margin for labels. */
  orientation?: 'vertical' | 'horizontal';
  /** Draw background grid lines (default true). */
  showGrid?: boolean;
  /** Show values on bars (default true). */
  showValues?: boolean;
  /** Animate transitions (default true). */
  animate?: boolean;
  /** Called when a bar is clicked, with the bar's label, value, and dataset index. */
  onBarClick?: (label: string, value: number, datasetIndex: number) => void;
  /** UI language (default 'nl'). */
  language?: 'nl' | 'en';
  /** Extra CSS class for the chart container. */
  className?: string;
  /** Optional title rendered above the plot (increases the top margin from 20 to 40 px). */
  title?: string;
}
/** State for the hover tooltip overlay (populated by bar hover handlers). */
interface TooltipState {
  /** Whether the tooltip is currently shown. */
  visible: boolean;
  /** Tooltip x position in px — coordinate space set by the hover handler; confirm against the setter. */
  x: number;
  /** Tooltip y position in px. */
  y: number;
  /** Label text shown in the tooltip. */
  label: string;
  /** Numeric value shown in the tooltip. */
  value: number;
  /** Name of the dataset the hovered bar belongs to. */
  dataset: string;
}
/**
 * GesprekBarChart — D3-rendered bar chart (vertical or horizontal) with
 * optional grid lines, value labels, hover tooltips, entry animations and a
 * legend when multiple datasets are plotted.
 *
 * Bug fix vs. the previous revision: with `animate={false}` bar sizes were
 * computed from `scale(0)` (the animation *start* state), which yields a
 * height/width of 0 — static charts rendered invisible bars. Non-animated
 * bars are now sized directly from each datum's value.
 */
export const GesprekBarChart: React.FC<GesprekBarChartProps> = ({
  data,
  width = 500,
  height = 300,
  orientation = 'vertical',
  showGrid = true,
  showValues = true,
  animate = true,
  onBarClick,
  language = 'nl',
  className,
  title,
}) => {
  const svgRef = useRef<SVGSVGElement>(null);
  const containerRef = useRef<HTMLDivElement>(null);
  const [tooltip, setTooltip] = useState<TooltipState>({
    visible: false,
    x: 0,
    y: 0,
    label: '',
    value: 0,
    dataset: '',
  });
  // Margins for axes: extra top room when a title is shown, extra left room
  // for category labels in horizontal orientation.
  const margin = useMemo(() => ({
    top: title ? 40 : 20,
    right: 20,
    bottom: 60,
    left: orientation === 'horizontal' ? 120 : 50,
  }), [orientation, title]);
  const innerWidth = width - margin.left - margin.right;
  const innerHeight = height - margin.top - margin.bottom;
  // Main D3 visualization — imperative render into the SVG ref.
  useEffect(() => {
    if (!svgRef.current || !data.labels.length || !data.datasets.length) return;
    // Clear previous content (full re-render on every dependency change)
    d3.select(svgRef.current).selectAll('*').remove();
    const svg = d3.select(svgRef.current)
      .attr('width', width)
      .attr('height', height)
      .attr('viewBox', [0, 0, width, height]);
    // Create main group with margins
    const g = svg.append('g')
      .attr('transform', `translate(${margin.left},${margin.top})`);
    // Add title if provided
    if (title) {
      svg.append('text')
        .attr('x', width / 2)
        .attr('y', 20)
        .attr('text-anchor', 'middle')
        .attr('font-size', '14px')
        .attr('font-weight', '600')
        .attr('fill', COLORS.text)
        .text(title);
    }
    // Calculate max value across all datasets (shared value axis)
    const maxValue = d3.max(data.datasets.flatMap(d => d.data)) || 0;
    // Create scales based on orientation: the category axis is a band scale,
    // the value axis a linear scale with 10% headroom.
    const xScale = orientation === 'vertical'
      ? d3.scaleBand()
          .domain(data.labels)
          .range([0, innerWidth])
          .padding(0.2)
      : d3.scaleLinear()
          .domain([0, maxValue * 1.1])
          .range([0, innerWidth])
          .nice();
    const yScale = orientation === 'vertical'
      ? d3.scaleLinear()
          .domain([0, maxValue * 1.1])
          .range([innerHeight, 0])
          .nice()
      : d3.scaleBand()
          .domain(data.labels)
          .range([0, innerHeight])
          .padding(0.2);
    // Add grid lines along the value axis
    if (showGrid) {
      const gridGroup = g.append('g').attr('class', 'grid');
      if (orientation === 'vertical') {
        gridGroup.selectAll('.grid-line')
          .data((yScale as d3.ScaleLinear<number, number>).ticks(5))
          .join('line')
          .attr('class', 'grid-line')
          .attr('x1', 0)
          .attr('x2', innerWidth)
          .attr('y1', d => (yScale as d3.ScaleLinear<number, number>)(d))
          .attr('y2', d => (yScale as d3.ScaleLinear<number, number>)(d))
          .attr('stroke', COLORS.grid)
          .attr('stroke-dasharray', '3,3');
      } else {
        gridGroup.selectAll('.grid-line')
          .data((xScale as d3.ScaleLinear<number, number>).ticks(5))
          .join('line')
          .attr('class', 'grid-line')
          .attr('x1', d => (xScale as d3.ScaleLinear<number, number>)(d))
          .attr('x2', d => (xScale as d3.ScaleLinear<number, number>)(d))
          .attr('y1', 0)
          .attr('y2', innerHeight)
          .attr('stroke', COLORS.grid)
          .attr('stroke-dasharray', '3,3');
      }
    }
    // Add axes (category labels are rotated 45° in vertical orientation)
    const xAxis = orientation === 'vertical'
      ? d3.axisBottom(xScale as d3.ScaleBand<string>)
      : d3.axisBottom(xScale as d3.ScaleLinear<number, number>).ticks(5);
    const yAxis = orientation === 'vertical'
      ? d3.axisLeft(yScale as d3.ScaleLinear<number, number>).ticks(5)
      : d3.axisLeft(yScale as d3.ScaleBand<string>);
    g.append('g')
      .attr('class', 'x-axis')
      .attr('transform', `translate(0,${innerHeight})`)
      .call(xAxis)
      .selectAll('text')
      .attr('font-size', '11px')
      .attr('fill', COLORS.textLight)
      .attr('transform', orientation === 'vertical' ? 'rotate(-45)' : null)
      .attr('text-anchor', orientation === 'vertical' ? 'end' : 'middle')
      .attr('dx', orientation === 'vertical' ? '-0.5em' : null)
      .attr('dy', orientation === 'vertical' ? '0.5em' : '1em');
    g.append('g')
      .attr('class', 'y-axis')
      .call(yAxis)
      .selectAll('text')
      .attr('font-size', '11px')
      .attr('fill', COLORS.textLight);
    // Draw bars for each dataset: the category band is subdivided evenly
    // among datasets (grouped bars).
    const numDatasets = data.datasets.length;
    const bandWidth = orientation === 'vertical'
      ? (xScale as d3.ScaleBand<string>).bandwidth()
      : (yScale as d3.ScaleBand<string>).bandwidth();
    const barWidth = bandWidth / numDatasets;
    data.datasets.forEach((dataset, datasetIndex) => {
      const barGroup = g.append('g')
        .attr('class', `bars-${datasetIndex}`);
      const color = Array.isArray(dataset.backgroundColor)
        ? dataset.backgroundColor
        : dataset.backgroundColor || COLOR_PALETTE[datasetIndex % COLOR_PALETTE.length];
      barGroup.selectAll('rect')
        .data(dataset.data.map((value, i) => ({
          value,
          label: data.labels[i],
          index: i,
        })))
        .join('rect')
        .attr('x', d => {
          if (orientation === 'vertical') {
            const bandX = (xScale as d3.ScaleBand<string>)(d.label) || 0;
            return bandX + barWidth * datasetIndex;
          }
          return 0;
        })
        .attr('y', d => {
          if (orientation === 'vertical') {
            // Animated bars start collapsed at the baseline and grow upward.
            return animate ? innerHeight : (yScale as d3.ScaleLinear<number, number>)(d.value);
          }
          const bandY = (yScale as d3.ScaleBand<string>)(d.label) || 0;
          return bandY + barWidth * datasetIndex;
        })
        // FIX: static (non-animated) bars must be sized from d.value, not
        // from scale(0) which collapses them to zero width/height.
        .attr('width', d => orientation === 'vertical'
          ? barWidth - 2
          : (animate ? 0 : (xScale as d3.ScaleLinear<number, number>)(d.value)))
        .attr('height', d => orientation === 'vertical'
          ? (animate ? 0 : innerHeight - (yScale as d3.ScaleLinear<number, number>)(d.value))
          : barWidth - 2)
        .attr('fill', d => Array.isArray(color) ? color[d.index % color.length] : color)
        .attr('rx', 2)
        .attr('ry', 2)
        .style('cursor', 'pointer')
        .on('mouseenter', function(event, d) {
          d3.select(this)
            .transition()
            .duration(200)
            .attr('opacity', 0.8);
          const [x, y] = d3.pointer(event, containerRef.current);
          setTooltip({
            visible: true,
            x: x + 10,
            y: y - 10,
            label: d.label,
            value: d.value,
            dataset: dataset.label,
          });
        })
        .on('mouseleave', function() {
          d3.select(this)
            .transition()
            .duration(200)
            .attr('opacity', 1);
          setTooltip(prev => ({ ...prev, visible: false }));
        })
        .on('click', (_event, d) => {
          if (onBarClick) {
            onBarClick(d.label, d.value, datasetIndex);
          }
        });
      // Animate bars from the collapsed start state to their final size,
      // staggered 50ms per bar.
      if (animate) {
        barGroup.selectAll<SVGRectElement, { value: number; label: string; index: number }>('rect')
          .transition()
          .duration(800)
          .delay((_d, i) => i * 50)
          .ease(d3.easeCubicOut)
          .attr('y', d => {
            if (orientation === 'vertical') {
              return (yScale as d3.ScaleLinear<number, number>)(d.value);
            }
            const bandY = (yScale as d3.ScaleBand<string>)(d.label) || 0;
            return bandY + barWidth * datasetIndex;
          })
          .attr('width', d => {
            if (orientation === 'horizontal') {
              return (xScale as d3.ScaleLinear<number, number>)(d.value);
            }
            return barWidth - 2;
          })
          .attr('height', d => {
            if (orientation === 'vertical') {
              return innerHeight - (yScale as d3.ScaleLinear<number, number>)(d.value);
            }
            return barWidth - 2;
          });
      }
      // Add value labels (above the bar when vertical, to the right when
      // horizontal), formatted with the active locale.
      if (showValues) {
        barGroup.selectAll('.value-label')
          .data(dataset.data.map((value, i) => ({
            value,
            label: data.labels[i],
            index: i,
          })))
          .join('text')
          .attr('class', 'value-label')
          .attr('x', d => {
            if (orientation === 'vertical') {
              const bandX = (xScale as d3.ScaleBand<string>)(d.label) || 0;
              return bandX + barWidth * datasetIndex + (barWidth - 2) / 2;
            }
            return (xScale as d3.ScaleLinear<number, number>)(d.value) + 5;
          })
          .attr('y', d => {
            if (orientation === 'vertical') {
              return (yScale as d3.ScaleLinear<number, number>)(d.value) - 5;
            }
            const bandY = (yScale as d3.ScaleBand<string>)(d.label) || 0;
            return bandY + barWidth * datasetIndex + (barWidth - 2) / 2;
          })
          .attr('text-anchor', orientation === 'vertical' ? 'middle' : 'start')
          .attr('dominant-baseline', orientation === 'horizontal' ? 'middle' : 'auto')
          .attr('font-size', '10px')
          .attr('font-weight', '500')
          .attr('fill', COLORS.text)
          .attr('opacity', animate ? 0 : 1)
          .text(d => d.value.toLocaleString(language === 'nl' ? 'nl-NL' : 'en-US'));
        // Animate value labels: fade in after the bars have mostly grown.
        if (animate) {
          barGroup.selectAll('.value-label')
            .transition()
            .duration(800)
            .delay((_d, i) => i * 50 + 400)
            .attr('opacity', 1);
        }
      }
    });
    // Add legend if multiple datasets
    if (numDatasets > 1) {
      const legend = svg.append('g')
        .attr('class', 'legend')
        .attr('transform', `translate(${margin.left}, ${height - 15})`);
      data.datasets.forEach((dataset, i) => {
        const legendItem = legend.append('g')
          .attr('transform', `translate(${i * 100}, 0)`);
        legendItem.append('rect')
          .attr('width', 12)
          .attr('height', 12)
          .attr('rx', 2)
          .attr('fill', Array.isArray(dataset.backgroundColor)
            ? dataset.backgroundColor[0]
            : dataset.backgroundColor || COLOR_PALETTE[i % COLOR_PALETTE.length]);
        legendItem.append('text')
          .attr('x', 16)
          .attr('y', 10)
          .attr('font-size', '11px')
          .attr('fill', COLORS.textLight)
          .text(dataset.label);
      });
    }
  }, [data, width, height, orientation, showGrid, showValues, animate, innerWidth, innerHeight, margin, language, onBarClick]);
  // Empty state
  if (!data.labels.length || !data.datasets.length) {
    return (
      <div className={`gesprek-bar-chart gesprek-bar-chart--empty ${className || ''}`}>
        <div className="gesprek-bar-chart__empty">
          <span>{language === 'nl' ? 'Geen gegevens beschikbaar' : 'No data available'}</span>
        </div>
      </div>
    );
  }
  return (
    <div
      ref={containerRef}
      className={`gesprek-bar-chart ${className || ''}`}
      style={{ position: 'relative' }}
    >
      <svg ref={svgRef} />
      {/* Tooltip */}
      {tooltip.visible && (
        <div
          className="gesprek-bar-chart__tooltip"
          style={{
            position: 'absolute',
            left: tooltip.x,
            top: tooltip.y,
            pointerEvents: 'none',
            zIndex: 100,
            backgroundColor: 'white',
            border: '1px solid #e2e8f0',
            borderRadius: '4px',
            padding: '8px 12px',
            boxShadow: '0 2px 8px rgba(0,0,0,0.1)',
            fontSize: '12px',
          }}
        >
          <div style={{ fontWeight: '600', color: COLORS.text }}>{tooltip.label}</div>
          {data.datasets.length > 1 && (
            <div style={{ color: COLORS.textLight, fontSize: '11px' }}>{tooltip.dataset}</div>
          )}
          <div style={{ color: COLORS.primary, fontWeight: '500', marginTop: '4px' }}>
            {tooltip.value.toLocaleString(language === 'nl' ? 'nl-NL' : 'en-US')}
          </div>
        </div>
      )}
    </div>
  );
};
export default GesprekBarChart;

View file

@ -0,0 +1,433 @@
/**
* GesprekGeoMap.tsx - D3 Geographic Map Visualization for Gesprek Page
*
* Features:
* - Netherlands province boundaries
* - Bubble map with institution markers
* - Clustering for dense areas
* - Zoom and pan
* - Tooltips with institution details
*
* Uses D3.js v7 with React 19
*/
import React, { useRef, useEffect, useState, useCallback } from 'react';
import * as d3 from 'd3';
import type { GeoCoordinate, InstitutionData } from '../../hooks/useMultiDatabaseRAG';
// NDE House Style Colors
const COLORS = {
  primary: '#154273',
  secondary: '#2E5A8B',
  accent: '#3B82F6',
  background: '#f8fafc',
  water: '#e0f2fe',
  land: '#f1f5f9',
  border: '#cbd5e1',
  marker: '#154273',
  markerHover: '#3B82F6',
  markerSelected: '#ef4444',
  text: '#1e293b',
};
// Institution type to color mapping (matched by substring, Dutch or
// English, in getMarkerColor)
const TYPE_COLORS: Record<string, string> = {
  museum: '#ef4444',
  library: '#3b82f6',
  archive: '#10b981',
  gallery: '#f59e0b',
  university: '#8b5cf6',
  default: '#154273',
};
export interface GesprekGeoMapProps {
  /** Institution locations (lat/lng plus optional metadata) to plot. */
  coordinates: GeoCoordinate[];
  /** Outer SVG width in pixels. */
  width?: number;
  /** Outer SVG height in pixels. */
  height?: number;
  /** Called when a marker is clicked (clusters without data are ignored). */
  onMarkerClick?: (data: InstitutionData) => void;
  /** Called with the hovered marker's data, or null on mouse-leave. */
  onMarkerHover?: (data: InstitutionData | null) => void;
  /** Id of the institution to highlight with a selection stroke. */
  selectedId?: string | null;
  /** Locale for labels and error messages. */
  language?: 'nl' | 'en';
  /** Grid-cluster markers when more than 100 valid points are shown. */
  showClustering?: boolean;
  className?: string;
}
// Internal state for the HTML tooltip overlaid on the SVG.
interface TooltipState {
  visible: boolean;
  x: number;
  y: number;
  data: InstitutionData | null;
}
/**
 * Resolve the marker fill color for an institution type.
 *
 * Matching is case-insensitive and substring-based; Dutch and English
 * type names map to the same category. Unknown or missing types fall
 * back to the default NDE blue.
 */
function getMarkerColor(type?: string): string {
  if (!type) return TYPE_COLORS.default;
  const haystack = type.toLowerCase();
  const rules: Array<[string[], string]> = [
    [['museum'], TYPE_COLORS.museum],
    [['bibliotheek', 'library'], TYPE_COLORS.library],
    [['archief', 'archive'], TYPE_COLORS.archive],
    [['galerie', 'gallery'], TYPE_COLORS.gallery],
    [['universiteit', 'university'], TYPE_COLORS.university],
  ];
  for (const [needles, color] of rules) {
    if (needles.some(needle => haystack.includes(needle))) {
      return color;
    }
  }
  return TYPE_COLORS.default;
}
/**
 * Compute a marker radius (in px) from an institution's popularity.
 *
 * Review count takes precedence (log-scaled, clamped to [4, 20]);
 * otherwise the rating is used (linear, clamped to [4, 15]); with no
 * usable data the radius defaults to 6.
 */
function getMarkerRadius(data?: InstitutionData): number {
  const clamp = (value: number, lo: number, hi: number): number =>
    Math.max(lo, Math.min(hi, value));
  if (data?.reviews && data.reviews > 0) {
    // Log scale keeps very popular institutions from dwarfing the map.
    return clamp(4 + Math.log10(data.reviews + 1) * 5, 4, 20);
  }
  if (data?.rating && data.rating > 0) {
    return clamp(4 + data.rating * 2, 4, 15);
  }
  return 6;
}
/**
 * GesprekGeoMap — zoomable D3 bubble map of the Netherlands.
 *
 * Renders province boundaries from a fetched GeoJSON file and one circle
 * per institution; marker color encodes type, radius encodes popularity.
 * Dense result sets (>100 valid points) are grid-clustered into centroid
 * bubbles. A "fit to markers" button zooms the view to the data extent.
 */
export const GesprekGeoMap: React.FC<GesprekGeoMapProps> = ({
  coordinates,
  width = 600,
  height = 500,
  onMarkerClick,
  onMarkerHover,
  selectedId,
  language = 'nl',
  showClustering = true,
  className,
}) => {
  const svgRef = useRef<SVGSVGElement>(null);
  const containerRef = useRef<HTMLDivElement>(null);
  const [tooltip, setTooltip] = useState<TooltipState>({
    visible: false,
    x: 0,
    y: 0,
    data: null,
  });
  const [geoData, setGeoData] = useState<GeoJSON.FeatureCollection | null>(null);
  const [loading, setLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);
  // Load Netherlands GeoJSON (served as a static asset; re-fetched only if
  // the language changes, since the error message is localized)
  useEffect(() => {
    const loadGeoJSON = async () => {
      try {
        setLoading(true);
        const response = await fetch('/data/netherlands_provinces.geojson');
        if (!response.ok) throw new Error('Failed to load map data');
        const data = await response.json();
        setGeoData(data);
        setError(null);
      } catch (err) {
        console.error('Failed to load GeoJSON:', err);
        setError(language === 'nl' ? 'Kaartgegevens laden mislukt' : 'Failed to load map data');
      } finally {
        setLoading(false);
      }
    };
    loadGeoJSON();
  }, [language]);
  // Fit to bounds function.
  // NOTE: this only fires a window-level CustomEvent; the actual zoom math
  // lives inside the render effect below, which owns the projection and
  // zoom behavior for the current render.
  const fitToBounds = useCallback(() => {
    if (!svgRef.current) return;
    window.dispatchEvent(new CustomEvent('gesprek-map-fit'));
  }, []);
  // Main D3 visualization — full imperative re-render on dependency change
  useEffect(() => {
    if (!svgRef.current || !geoData || loading) return;
    // Clear previous content
    d3.select(svgRef.current).selectAll('*').remove();
    const svg = d3.select(svgRef.current)
      .attr('width', width)
      .attr('height', height)
      .attr('viewBox', [0, 0, width, height]);
    // Create container group for zoom
    const g = svg.append('g');
    // Setup projection centered on Netherlands
    const projection = d3.geoMercator()
      .center([5.5, 52.2]) // Center of Netherlands
      .scale(width * 15)
      .translate([width / 2, height / 2]);
    const pathGenerator = d3.geoPath().projection(projection);
    // Setup zoom behavior (transform is applied to the inner group)
    const zoom = d3.zoom<SVGSVGElement, unknown>()
      .scaleExtent([0.5, 20])
      .on('zoom', (event) => {
        g.attr('transform', event.transform);
      });
    svg.call(zoom);
    // Listen for fit-to-bounds event (dispatched by the toolbar button)
    const handleFit = () => {
      if (coordinates.length === 0) return;
      // Calculate bounds of all markers in projected (pixel) space
      let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
      coordinates.forEach(coord => {
        const [x, y] = projection([coord.lng, coord.lat]) || [0, 0];
        minX = Math.min(minX, x);
        minY = Math.min(minY, y);
        maxX = Math.max(maxX, x);
        maxY = Math.max(maxY, y);
      });
      const padding = 50;
      const boundsWidth = maxX - minX + padding * 2;
      const boundsHeight = maxY - minY + padding * 2;
      // Cap zoom-in at 4x so a single marker doesn't fill the screen
      const scale = Math.min(width / boundsWidth, height / boundsHeight, 4);
      const translateX = (width - boundsWidth * scale) / 2 - (minX - padding) * scale;
      const translateY = (height - boundsHeight * scale) / 2 - (minY - padding) * scale;
      svg.transition()
        .duration(750)
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        .call(zoom.transform as any, d3.zoomIdentity.translate(translateX, translateY).scale(scale));
    };
    window.addEventListener('gesprek-map-fit', handleFit);
    // Draw water background (on the svg, not g, so it doesn't pan/zoom)
    svg.insert('rect', ':first-child')
      .attr('width', width)
      .attr('height', height)
      .attr('fill', COLORS.water);
    // Draw province boundaries
    g.append('g')
      .attr('class', 'provinces')
      .selectAll('path')
      .data(geoData.features)
      .join('path')
      .attr('d', pathGenerator as never)
      .attr('fill', COLORS.land)
      .attr('stroke', COLORS.border)
      .attr('stroke-width', 1)
      .attr('stroke-linejoin', 'round');
    // Draw markers
    const markersGroup = g.append('g').attr('class', 'markers');
    // Filter out invalid coordinates
    const validCoords = coordinates.filter(c =>
      c.lat && c.lng &&
      !isNaN(c.lat) && !isNaN(c.lng) &&
      c.lat >= 50 && c.lat <= 54 && // Netherlands bounds
      c.lng >= 3 && c.lng <= 8
    );
    // Simple clustering for dense areas (if enabled): bucket points into a
    // 0.1°×0.1° grid and replace multi-point cells with a centroid bubble
    let displayCoords = validCoords;
    if (showClustering && validCoords.length > 100) {
      // Use quadtree-based clustering
      const clustered: GeoCoordinate[] = [];
      const cellSize = 0.1; // degrees
      const cells = new Map<string, GeoCoordinate[]>();
      validCoords.forEach(coord => {
        const key = `${Math.floor(coord.lat / cellSize)},${Math.floor(coord.lng / cellSize)}`;
        if (!cells.has(key)) cells.set(key, []);
        cells.get(key)!.push(coord);
      });
      cells.forEach(cellCoords => {
        if (cellCoords.length === 1) {
          clustered.push(cellCoords[0]);
        } else {
          // Create cluster centroid
          const avgLat = cellCoords.reduce((s, c) => s + c.lat, 0) / cellCoords.length;
          const avgLng = cellCoords.reduce((s, c) => s + c.lng, 0) / cellCoords.length;
          clustered.push({
            lat: avgLat,
            lng: avgLng,
            label: `${cellCoords.length} ${language === 'nl' ? 'instellingen' : 'institutions'}`,
            data: {
              id: `cluster-${avgLat}-${avgLng}`,
              name: `${cellCoords.length} ${language === 'nl' ? 'instellingen' : 'institutions'}`,
              reviews: cellCoords.length * 10, // Use for sizing
            },
          });
        }
      });
      displayCoords = clustered;
    }
    // Draw marker circles
    markersGroup.selectAll('circle')
      .data(displayCoords)
      .join('circle')
      .attr('cx', d => {
        const [x] = projection([d.lng, d.lat]) || [0, 0];
        return x;
      })
      .attr('cy', d => {
        const [, y] = projection([d.lng, d.lat]) || [0, 0];
        return y;
      })
      .attr('r', d => getMarkerRadius(d.data))
      .attr('fill', d => getMarkerColor(d.type))
      .attr('fill-opacity', 0.7)
      .attr('stroke', d => selectedId && d.data?.id === selectedId ? COLORS.markerSelected : '#fff')
      .attr('stroke-width', d => selectedId && d.data?.id === selectedId ? 3 : 1.5)
      .style('cursor', 'pointer')
      .on('mouseenter', function(event, d) {
        // Enlarge the hovered marker and show the HTML tooltip
        d3.select(this)
          .transition()
          .duration(200)
          .attr('r', getMarkerRadius(d.data) * 1.3)
          .attr('fill-opacity', 1);
        const [x, y] = d3.pointer(event, containerRef.current);
        setTooltip({
          visible: true,
          x: x + 10,
          y: y - 10,
          data: d.data || null,
        });
        if (onMarkerHover && d.data) {
          onMarkerHover(d.data);
        }
      })
      .on('mouseleave', function(_event, d) {
        d3.select(this)
          .transition()
          .duration(200)
          .attr('r', getMarkerRadius(d.data))
          .attr('fill-opacity', 0.7);
        setTooltip(prev => ({ ...prev, visible: false }));
        if (onMarkerHover) {
          onMarkerHover(null);
        }
      })
      .on('click', (_event, d) => {
        if (onMarkerClick && d.data) {
          onMarkerClick(d.data);
        }
      });
    // Cleanup
    return () => {
      window.removeEventListener('gesprek-map-fit', handleFit);
    };
  }, [geoData, coordinates, width, height, loading, selectedId, showClustering, language, onMarkerClick, onMarkerHover]);
  if (loading) {
    return (
      <div className={`gesprek-geo-map gesprek-geo-map--loading ${className || ''}`}>
        <div className="gesprek-geo-map__loading">
          <div className="gesprek-geo-map__spinner" />
          <span>{language === 'nl' ? 'Kaart laden...' : 'Loading map...'}</span>
        </div>
      </div>
    );
  }
  if (error) {
    return (
      <div className={`gesprek-geo-map gesprek-geo-map--error ${className || ''}`}>
        <span>{error}</span>
      </div>
    );
  }
  return (
    <div
      ref={containerRef}
      className={`gesprek-geo-map ${className || ''}`}
      style={{ position: 'relative' }}
    >
      <svg ref={svgRef} />
      {/* Tooltip */}
      {tooltip.visible && tooltip.data && (
        <div
          className="gesprek-geo-map__tooltip"
          style={{
            position: 'absolute',
            left: tooltip.x,
            top: tooltip.y,
            pointerEvents: 'none',
            zIndex: 100,
          }}
        >
          <div className="gesprek-geo-map__tooltip-name">{tooltip.data.name}</div>
          {tooltip.data.type && (
            <div className="gesprek-geo-map__tooltip-type">{tooltip.data.type}</div>
          )}
          {tooltip.data.city && (
            <div className="gesprek-geo-map__tooltip-city">{tooltip.data.city}</div>
          )}
          {tooltip.data.rating && tooltip.data.rating > 0 && (
            <div className="gesprek-geo-map__tooltip-rating">
              {'★'.repeat(Math.round(tooltip.data.rating))}
              {' '}
              {tooltip.data.rating.toFixed(1)}
            </div>
          )}
        </div>
      )}
      {/* Map controls */}
      <div className="gesprek-geo-map__controls">
        <button
          className="gesprek-geo-map__control-btn"
          onClick={fitToBounds}
          title={language === 'nl' ? 'Zoom naar markers' : 'Fit to markers'}
        >
          <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2">
            <path d="M15 3h6v6M9 21H3v-6M21 3l-7 7M3 21l7-7" />
          </svg>
        </button>
      </div>
      {/* Legend */}
      <div className="gesprek-geo-map__legend">
        <div className="gesprek-geo-map__legend-item">
          <span className="gesprek-geo-map__legend-dot" style={{ backgroundColor: TYPE_COLORS.museum }} />
          <span>Museum</span>
        </div>
        <div className="gesprek-geo-map__legend-item">
          <span className="gesprek-geo-map__legend-dot" style={{ backgroundColor: TYPE_COLORS.library }} />
          <span>{language === 'nl' ? 'Bibliotheek' : 'Library'}</span>
        </div>
        <div className="gesprek-geo-map__legend-item">
          <span className="gesprek-geo-map__legend-dot" style={{ backgroundColor: TYPE_COLORS.archive }} />
          <span>{language === 'nl' ? 'Archief' : 'Archive'}</span>
        </div>
        <div className="gesprek-geo-map__legend-item">
          <span className="gesprek-geo-map__legend-dot" style={{ backgroundColor: TYPE_COLORS.gallery }} />
          <span>{language === 'nl' ? 'Galerie' : 'Gallery'}</span>
        </div>
      </div>
      {/* Stats */}
      <div className="gesprek-geo-map__stats">
        {coordinates.length} {language === 'nl' ? 'locaties' : 'locations'}
      </div>
    </div>
  );
};
export default GesprekGeoMap;

View file

@ -0,0 +1,549 @@
/**
* GesprekNetworkGraph.tsx - D3 Force-Directed Network Graph for Gesprek Page
*
* Features:
* - Force-directed layout with collision detection
* - Node dragging
* - Zoom and pan
* - Node and edge highlighting on hover
* - Tooltips with entity details
* - Edge labels
*
* Uses D3.js v7 with React 19
*/
import React, { useRef, useEffect, useState, useMemo } from 'react';
import * as d3 from 'd3';
import type { GraphVisualizationData } from '../../hooks/useMultiDatabaseRAG';
// NDE House Style Colors
const COLORS = {
  primary: '#154273',
  secondary: '#2E5A8B',
  accent: '#3B82F6',
  background: '#f8fafc',
  text: '#1e293b',
  textLight: '#64748b',
  link: '#94a3b8',
  linkHighlight: '#3B82F6',
  nodeStroke: '#fff',
};
// Node type to color mapping (matched by substring in getNodeColor;
// insertion order determines match priority)
const NODE_TYPE_COLORS: Record<string, string> = {
  institution: '#154273',
  museum: '#ef4444',
  library: '#3b82f6',
  archive: '#10b981',
  gallery: '#f59e0b',
  person: '#8b5cf6',
  collection: '#ec4899',
  event: '#06b6d4',
  place: '#84cc16',
  organization: '#154273',
  default: '#154273',
};
export interface GesprekNetworkGraphProps {
  /** Nodes and edges to lay out with the force simulation. */
  data: GraphVisualizationData;
  /** Outer SVG width in pixels. */
  width?: number;
  /** Outer SVG height in pixels. */
  height?: number;
  /** Called with the clicked node's id and its original data. */
  onNodeClick?: (nodeId: string, nodeData: GraphVisualizationData['nodes'][0]) => void;
  /** Called on hover with the node (or nulls on mouse-leave). */
  onNodeHover?: (nodeId: string | null, nodeData: GraphVisualizationData['nodes'][0] | null) => void;
  /** Id of the node to render with an accent selection stroke. */
  selectedNodeId?: string | null;
  /** Locale for the empty-state and control labels. */
  language?: 'nl' | 'en';
  /** Render a label under each node (truncated to 15 chars). */
  showLabels?: boolean;
  /** Render relation labels at edge midpoints. */
  showEdgeLabels?: boolean;
  className?: string;
}
// Node shape consumed by d3-force; x/y/fx/fy are added by the simulation.
interface SimulationNode extends d3.SimulationNodeDatum {
  id: string;
  label: string;
  type: string;
  attributes?: Record<string, unknown>;
}
// Edge shape consumed by d3-force; source/target are resolved to nodes.
interface SimulationLink extends d3.SimulationLinkDatum<SimulationNode> {
  id: string;
  label: string;
  type?: string;
}
// Internal state for the HTML tooltip overlaid on the SVG.
interface TooltipState {
  visible: boolean;
  x: number;
  y: number;
  node: GraphVisualizationData['nodes'][0] | null;
}
/**
 * Map a node's type string to its display color.
 *
 * Comparison is case-insensitive and substring-based, so e.g. "Art Museum"
 * resolves to the museum color. The first matching entry of
 * NODE_TYPE_COLORS (in insertion order) wins; unmatched or missing types
 * use the default blue.
 */
function getNodeColor(type?: string): string {
  if (!type) return NODE_TYPE_COLORS.default;
  const needle = type.toLowerCase();
  const match = Object.entries(NODE_TYPE_COLORS).find(([key]) => needle.includes(key));
  return match ? match[1] : NODE_TYPE_COLORS.default;
}
/**
 * Compute a node's radius (in px) from its degree.
 *
 * Each incident edge adds 2px to an 8px base, clamped to [8, 25] so hub
 * nodes stay readable without dominating the layout.
 */
function getNodeRadius(id: string, edges: GraphVisualizationData['edges']): number {
  let degree = 0;
  for (const edge of edges) {
    if (edge.source === id || edge.target === id) {
      degree += 1;
    }
  }
  return Math.max(8, Math.min(25, 8 + degree * 2));
}
export const GesprekNetworkGraph: React.FC<GesprekNetworkGraphProps> = ({
data,
width = 600,
height = 400,
onNodeClick,
onNodeHover,
selectedNodeId,
language = 'nl',
showLabels = true,
showEdgeLabels = false,
className,
}) => {
const svgRef = useRef<SVGSVGElement>(null);
const containerRef = useRef<HTMLDivElement>(null);
const simulationRef = useRef<d3.Simulation<SimulationNode, SimulationLink> | null>(null);
const [tooltip, setTooltip] = useState<TooltipState>({
visible: false,
x: 0,
y: 0,
node: null,
});
// Process data for D3 simulation
const { nodes, links } = useMemo(() => {
const simNodes: SimulationNode[] = data.nodes.map(node => ({
...node,
x: undefined,
y: undefined,
}));
const simLinks: SimulationLink[] = data.edges.map(edge => ({
...edge,
source: edge.source,
target: edge.target,
}));
return { nodes: simNodes, links: simLinks };
}, [data]);
// Main D3 visualization
useEffect(() => {
if (!svgRef.current || nodes.length === 0) return;
// Clear previous content
d3.select(svgRef.current).selectAll('*').remove();
const svg = d3.select(svgRef.current)
.attr('width', width)
.attr('height', height)
.attr('viewBox', [0, 0, width, height]);
// Create container group for zoom
const g = svg.append('g');
// Create arrow marker for directed edges
svg.append('defs')
.append('marker')
.attr('id', 'arrowhead')
.attr('viewBox', '0 -5 10 10')
.attr('refX', 20)
.attr('refY', 0)
.attr('markerWidth', 6)
.attr('markerHeight', 6)
.attr('orient', 'auto')
.append('path')
.attr('d', 'M0,-5L10,0L0,5')
.attr('fill', COLORS.link);
// Setup zoom behavior
const zoom = d3.zoom<SVGSVGElement, unknown>()
.scaleExtent([0.2, 4])
.on('zoom', (event) => {
g.attr('transform', event.transform);
});
svg.call(zoom);
// Create force simulation
const simulation = d3.forceSimulation<SimulationNode>(nodes)
.force('link', d3.forceLink<SimulationNode, SimulationLink>(links)
.id(d => d.id)
.distance(100)
.strength(0.5))
.force('charge', d3.forceManyBody().strength(-300))
.force('center', d3.forceCenter(width / 2, height / 2))
.force('collision', d3.forceCollide<SimulationNode>().radius(d => getNodeRadius(d.id, data.edges) + 10));
simulationRef.current = simulation;
// Draw links
const linksGroup = g.append('g').attr('class', 'links');
const link = linksGroup.selectAll('.link')
.data(links)
.join('g')
.attr('class', 'link');
const linkLine = link.append('line')
.attr('stroke', COLORS.link)
.attr('stroke-width', 1.5)
.attr('stroke-opacity', 0.6)
.attr('marker-end', 'url(#arrowhead)');
// Edge labels
let linkLabels: d3.Selection<SVGTextElement, SimulationLink, SVGGElement, unknown> | null = null;
if (showEdgeLabels) {
linkLabels = link.append('text')
.attr('class', 'link-label')
.attr('font-size', '9px')
.attr('fill', COLORS.textLight)
.attr('text-anchor', 'middle')
.attr('dy', '-5')
.text(d => d.label || '');
}
// Draw nodes
const nodesGroup = g.append('g').attr('class', 'nodes');
const node = nodesGroup.selectAll('.node')
.data(nodes)
.join('g')
.attr('class', 'node')
.style('cursor', 'pointer');
// Node circles
const nodeCircle = node.append('circle')
.attr('r', d => getNodeRadius(d.id, data.edges))
.attr('fill', d => getNodeColor(d.type))
.attr('stroke', d => selectedNodeId === d.id ? COLORS.accent : COLORS.nodeStroke)
.attr('stroke-width', d => selectedNodeId === d.id ? 3 : 2)
.attr('stroke-opacity', 0.9);
// Node labels
if (showLabels) {
node.append('text')
.attr('class', 'node-label')
.attr('dy', d => getNodeRadius(d.id, data.edges) + 12)
.attr('text-anchor', 'middle')
.attr('font-size', '10px')
.attr('font-weight', '500')
.attr('fill', COLORS.text)
.text(d => {
const maxLength = 15;
return d.label.length > maxLength
? d.label.substring(0, maxLength) + '...'
: d.label;
});
}
// Drag behavior
const drag = d3.drag<SVGGElement, SimulationNode>()
.on('start', (event, d) => {
if (!event.active) simulation.alphaTarget(0.3).restart();
d.fx = d.x;
d.fy = d.y;
})
.on('drag', (event, d) => {
d.fx = event.x;
d.fy = event.y;
})
.on('end', (event, d) => {
if (!event.active) simulation.alphaTarget(0);
d.fx = null;
d.fy = null;
});
// eslint-disable-next-line @typescript-eslint/no-explicit-any
node.call(drag as any);
// Hover interactions
node
.on('mouseenter', function(event, d) {
// Highlight node
d3.select(this).select('circle')
.transition()
.duration(200)
.attr('r', getNodeRadius(d.id, data.edges) * 1.2)
.attr('stroke', COLORS.accent)
.attr('stroke-width', 3);
// Highlight connected links
linkLine
.attr('stroke', l =>
(l.source as SimulationNode).id === d.id || (l.target as SimulationNode).id === d.id
? COLORS.linkHighlight
: COLORS.link
)
.attr('stroke-width', l =>
(l.source as SimulationNode).id === d.id || (l.target as SimulationNode).id === d.id
? 2.5
: 1.5
)
.attr('stroke-opacity', l =>
(l.source as SimulationNode).id === d.id || (l.target as SimulationNode).id === d.id
? 1
: 0.3
);
// Fade unconnected nodes
nodeCircle
.attr('opacity', n => {
if (n.id === d.id) return 1;
const connected = links.some(l =>
((l.source as SimulationNode).id === d.id && (l.target as SimulationNode).id === n.id) ||
((l.target as SimulationNode).id === d.id && (l.source as SimulationNode).id === n.id)
);
return connected ? 1 : 0.3;
});
// Show tooltip
const [x, y] = d3.pointer(event, containerRef.current);
setTooltip({
visible: true,
x: x + 10,
y: y - 10,
node: data.nodes.find(n => n.id === d.id) || null,
});
if (onNodeHover) {
onNodeHover(d.id, data.nodes.find(n => n.id === d.id) || null);
}
})
.on('mouseleave', function(_event, d) {
// Reset node
d3.select(this).select('circle')
.transition()
.duration(200)
.attr('r', getNodeRadius(d.id, data.edges))
.attr('stroke', selectedNodeId === d.id ? COLORS.accent : COLORS.nodeStroke)
.attr('stroke-width', selectedNodeId === d.id ? 3 : 2);
// Reset links
linkLine
.attr('stroke', COLORS.link)
.attr('stroke-width', 1.5)
.attr('stroke-opacity', 0.6);
// Reset nodes
nodeCircle.attr('opacity', 1);
setTooltip(prev => ({ ...prev, visible: false }));
if (onNodeHover) {
onNodeHover(null, null);
}
})
.on('click', (_event, d) => {
if (onNodeClick) {
onNodeClick(d.id, data.nodes.find(n => n.id === d.id)!);
}
});
// Update positions on simulation tick
simulation.on('tick', () => {
linkLine
.attr('x1', d => (d.source as SimulationNode).x || 0)
.attr('y1', d => (d.source as SimulationNode).y || 0)
.attr('x2', d => (d.target as SimulationNode).x || 0)
.attr('y2', d => (d.target as SimulationNode).y || 0);
if (linkLabels) {
linkLabels
.attr('x', d => (((d.source as SimulationNode).x || 0) + ((d.target as SimulationNode).x || 0)) / 2)
.attr('y', d => (((d.source as SimulationNode).y || 0) + ((d.target as SimulationNode).y || 0)) / 2);
}
node.attr('transform', d => `translate(${d.x || 0},${d.y || 0})`);
});
// Fit to bounds function
const handleFit = () => {
const bounds = g.node()?.getBBox();
if (!bounds) return;
const padding = 50;
const fullWidth = bounds.width + padding * 2;
const fullHeight = bounds.height + padding * 2;
const scale = Math.min(width / fullWidth, height / fullHeight, 1.5);
const translateX = (width - bounds.width * scale) / 2 - bounds.x * scale;
const translateY = (height - bounds.height * scale) / 2 - bounds.y * scale;
svg.transition()
.duration(750)
// eslint-disable-next-line @typescript-eslint/no-explicit-any
.call(zoom.transform as any, d3.zoomIdentity.translate(translateX, translateY).scale(scale));
};
window.addEventListener('gesprek-network-fit', handleFit);
// Initial fit after simulation stabilizes
simulation.on('end', () => {
setTimeout(handleFit, 100);
});
// Cleanup
return () => {
simulation.stop();
window.removeEventListener('gesprek-network-fit', handleFit);
};
}, [data, nodes, links, width, height, selectedNodeId, showLabels, showEdgeLabels, onNodeClick, onNodeHover]);
// Control handlers
const handleFitToBounds = () => {
window.dispatchEvent(new CustomEvent('gesprek-network-fit'));
};
const handleRestartSimulation = () => {
if (simulationRef.current) {
simulationRef.current.alpha(1).restart();
}
};
// Empty state
if (nodes.length === 0) {
return (
<div className={`gesprek-network-graph gesprek-network-graph--empty ${className || ''}`}>
<div className="gesprek-network-graph__empty">
<span>{language === 'nl' ? 'Geen netwerkgegevens beschikbaar' : 'No network data available'}</span>
</div>
</div>
);
}
return (
<div
ref={containerRef}
className={`gesprek-network-graph ${className || ''}`}
style={{ position: 'relative' }}
>
<svg ref={svgRef} />
{/* Tooltip */}
{tooltip.visible && tooltip.node && (
<div
className="gesprek-network-graph__tooltip"
style={{
position: 'absolute',
left: Math.min(tooltip.x, width - 180),
top: Math.max(tooltip.y, 10),
pointerEvents: 'none',
zIndex: 100,
backgroundColor: 'white',
border: '1px solid #e2e8f0',
borderRadius: '4px',
padding: '8px 12px',
boxShadow: '0 2px 8px rgba(0,0,0,0.1)',
fontSize: '12px',
maxWidth: '180px',
}}
>
<div style={{ fontWeight: '600', color: COLORS.text }}>{tooltip.node.label}</div>
{tooltip.node.type && (
<div style={{
display: 'inline-block',
marginTop: '4px',
padding: '2px 6px',
fontSize: '10px',
borderRadius: '3px',
backgroundColor: getNodeColor(tooltip.node.type) + '20',
color: getNodeColor(tooltip.node.type),
}}>
{tooltip.node.type}
</div>
)}
{tooltip.node.attributes && Object.keys(tooltip.node.attributes).length > 0 && (
<div style={{ marginTop: '6px', fontSize: '11px', color: COLORS.textLight }}>
{Object.entries(tooltip.node.attributes).slice(0, 3).map(([key, value]) => (
<div key={key}>
<span style={{ fontWeight: '500' }}>{key}:</span> {String(value)}
</div>
))}
</div>
)}
</div>
)}
{/* Controls */}
<div
className="gesprek-network-graph__controls"
style={{
position: 'absolute',
top: '5px',
right: '5px',
display: 'flex',
gap: '4px',
}}
>
<button
onClick={handleFitToBounds}
title={language === 'nl' ? 'Zoom aanpassen' : 'Fit to view'}
style={{
padding: '4px 8px',
fontSize: '11px',
backgroundColor: 'white',
border: '1px solid #e2e8f0',
borderRadius: '4px',
cursor: 'pointer',
color: COLORS.textLight,
}}
>
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2">
<path d="M15 3h6v6M9 21H3v-6M21 3l-7 7M3 21l7-7" />
</svg>
</button>
<button
onClick={handleRestartSimulation}
title={language === 'nl' ? 'Herstart simulatie' : 'Restart simulation'}
style={{
padding: '4px 8px',
fontSize: '11px',
backgroundColor: 'white',
border: '1px solid #e2e8f0',
borderRadius: '4px',
cursor: 'pointer',
color: COLORS.textLight,
}}
>
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2">
<path d="M1 4v6h6M23 20v-6h-6" />
<path d="M20.49 9A9 9 0 0 0 5.64 5.64L1 10m22 4l-4.64 4.36A9 9 0 0 1 3.51 15" />
</svg>
</button>
</div>
{/* Legend */}
<div
className="gesprek-network-graph__legend"
style={{
position: 'absolute',
bottom: '5px',
left: '5px',
display: 'flex',
gap: '12px',
fontSize: '10px',
color: COLORS.textLight,
}}
>
<span>{nodes.length} {language === 'nl' ? 'knopen' : 'nodes'}</span>
<span></span>
<span>{links.length} {language === 'nl' ? 'verbindingen' : 'edges'}</span>
</div>
</div>
);
};
export default GesprekNetworkGraph;

View file

@ -0,0 +1,497 @@
/**
* GesprekTimeline.tsx - D3 Timeline Visualization for Gesprek Page
*
* Features:
* - Horizontal timeline with event markers
* - Zoom and pan support
* - Event clustering for dense periods
* - Tooltips with event details
* - Animated transitions
*
* Uses D3.js v7 with React 19
*/
import React, { useRef, useEffect, useState, useMemo } from 'react';
import * as d3 from 'd3';
import type { TimelineEvent } from '../../hooks/useMultiDatabaseRAG';
// NDE House Style Colors — single palette shared by the SVG rendering,
// tooltip, controls and legend below.
const COLORS = {
  primary: '#154273',
  secondary: '#2E5A8B',
  accent: '#3B82F6',
  background: '#f8fafc',
  text: '#1e293b',       // main label / tooltip text
  textLight: '#64748b',  // secondary text (axis labels, legend, controls)
  axis: '#94a3b8',       // axis ticks and domain stroke
  axisLine: '#cbd5e1',   // horizontal timeline baseline
  marker: '#154273',
  markerHover: '#3B82F6',
};
// Event type to color mapping. Keys are matched case-insensitively and by
// substring in getEventColor below; unmatched types fall back to `default`.
const EVENT_TYPE_COLORS: Record<string, string> = {
  founding: '#10b981', // Green
  closure: '#ef4444', // Red
  merger: '#8b5cf6', // Purple
  relocation: '#f59e0b', // Amber
  name_change: '#06b6d4', // Cyan
  acquisition: '#ec4899', // Pink
  default: '#154273', // Primary blue
};
export interface GesprekTimelineProps {
  /** Events to render; entries whose `date` cannot be parsed are dropped */
  events: TimelineEvent[];
  /** SVG width in pixels (default: 800) */
  width?: number;
  /** SVG height in pixels (default: 200) */
  height?: number;
  /** Invoked when an event marker is clicked */
  onEventClick?: (event: TimelineEvent) => void;
  /** Invoked on marker hover; called with null when the pointer leaves */
  onEventHover?: (event: TimelineEvent | null) => void;
  /** NOTE(review): compared against TimelineEvent.date, not a separate id — confirm with callers */
  selectedEventId?: string | null;
  /** UI language for labels and date formatting (default: 'nl') */
  language?: 'nl' | 'en';
  /** Whether to draw per-event text labels (default: true) */
  showLabels?: boolean;
  /** Extra CSS class for the root element */
  className?: string;
}
/** TimelineEvent augmented with its successfully parsed Date. */
interface ParsedEvent extends TimelineEvent {
  parsedDate: Date;
}
/** Tooltip display state; x/y are pixel offsets inside the container. */
interface TooltipState {
  visible: boolean;
  x: number;
  y: number;
  event: TimelineEvent | null;
}
/**
 * Parse various date formats to Date objects.
 *
 * Supported formats, checked in order:
 * - Year-only ("1985") → local-time Jan 1 of that year
 * - Anything the native Date constructor accepts (ISO 8601 etc.)
 * - "Month YYYY" (English month names) → first of that month
 *
 * @param dateStr - Raw date string from a TimelineEvent
 * @returns Parsed Date, or null when the string is unrecognized
 */
function parseDate(dateStr: string): Date | null {
  if (!dateStr) return null;
  // Check year-only format FIRST. `new Date('1985')` would otherwise parse
  // successfully as UTC midnight, which displays as Dec 31 of the previous
  // year in timezones west of UTC; the explicit constructor below uses
  // local time, so the year renders as intended.
  const yearMatch = dateStr.match(/^(\d{4})$/);
  if (yearMatch) {
    return new Date(parseInt(yearMatch[1], 10), 0, 1);
  }
  // Try ISO / other native-parseable formats
  let date = new Date(dateStr);
  if (!isNaN(date.getTime())) return date;
  // Try "Month YYYY" format (English month names)
  const monthYearMatch = dateStr.match(/^(\w+)\s+(\d{4})$/);
  if (monthYearMatch) {
    date = new Date(`${monthYearMatch[1]} 1, ${monthYearMatch[2]}`);
    if (!isNaN(date.getTime())) return date;
  }
  return null;
}
/**
 * Get marker color based on event type.
 *
 * Matching is fuzzy: both the input and the palette keys are stripped of
 * '_'/'-' (input is also lowercased) before substring comparison in either
 * direction.
 *
 * @param type - Free-form event type string (may be undefined)
 * @returns A hex color; EVENT_TYPE_COLORS.default when nothing matches
 */
function getEventColor(type?: string): string {
  if (!type) return EVENT_TYPE_COLORS.default;
  const normalizedType = type.toLowerCase().replace(/[_-]/g, '');
  for (const [key, color] of Object.entries(EVENT_TYPE_COLORS)) {
    // BUG FIX: keys such as 'name_change' contain separators, so they could
    // never match the separator-stripped input. Normalize the key too.
    const normalizedKey = key.replace(/[_-]/g, '');
    if (normalizedType.includes(normalizedKey) || normalizedKey.includes(normalizedType)) {
      return color;
    }
  }
  return EVENT_TYPE_COLORS.default;
}
/**
 * Format a date for display in the requested language.
 *
 * The day-of-month is omitted when it is the 1st, because synthetic dates
 * built from year-only or "Month YYYY" inputs land on the first of a month.
 */
function formatDate(date: Date, language: 'nl' | 'en'): string {
  const locale = language === 'nl' ? 'nl-NL' : 'en-US';
  const includeDay = date.getDate() !== 1;
  const options: Intl.DateTimeFormatOptions = {
    year: 'numeric',
    month: 'short',
    day: includeDay ? 'numeric' : undefined,
  };
  return date.toLocaleDateString(locale, options);
}
export const GesprekTimeline: React.FC<GesprekTimelineProps> = ({
  events,
  width = 800,
  height = 200,
  onEventClick,
  onEventHover,
  selectedEventId,
  language = 'nl',
  showLabels = true,
  className,
}) => {
  // D3 renders imperatively into this SVG; React owns only the chrome around it.
  const svgRef = useRef<SVGSVGElement>(null);
  const containerRef = useRef<HTMLDivElement>(null);
  const [tooltip, setTooltip] = useState<TooltipState>({
    visible: false,
    x: 0,
    y: 0,
    event: null,
  });
  // Margins around the plot area (memoized so the main effect's deps stay stable)
  const margin = useMemo(() => ({
    top: 30,
    right: 30,
    bottom: 40,
    left: 30,
  }), []);
  const innerWidth = width - margin.left - margin.right;
  const innerHeight = height - margin.top - margin.bottom;
  // Parse and filter events: drop unparseable dates, then sort chronologically
  const parsedEvents = useMemo<ParsedEvent[]>(() => {
    return events
      .map(event => ({
        ...event,
        parsedDate: parseDate(event.date),
      }))
      .filter((event): event is ParsedEvent => event.parsedDate !== null)
      .sort((a, b) => a.parsedDate.getTime() - b.parsedDate.getTime());
  }, [events]);
  // Main D3 visualization — the SVG is rebuilt from scratch on any input change
  useEffect(() => {
    if (!svgRef.current || parsedEvents.length === 0) return;
    // Clear previous content
    d3.select(svgRef.current).selectAll('*').remove();
    const svg = d3.select(svgRef.current)
      .attr('width', width)
      .attr('height', height)
      .attr('viewBox', [0, 0, width, height]);
    // Create clip path so zoomed content stays inside the plot area
    svg.append('defs')
      .append('clipPath')
      .attr('id', 'timeline-clip')
      .append('rect')
      .attr('x', margin.left)
      .attr('y', margin.top)
      .attr('width', innerWidth)
      .attr('height', innerHeight);
    // Create main group with margins
    const g = svg.append('g')
      .attr('transform', `translate(${margin.left},${margin.top})`);
    // Calculate time extent with 5% padding on both ends
    const timeExtent = d3.extent(parsedEvents, d => d.parsedDate) as [Date, Date];
    const timePadding = (timeExtent[1].getTime() - timeExtent[0].getTime()) * 0.05;
    const xScale = d3.scaleTime()
      .domain([
        new Date(timeExtent[0].getTime() - timePadding),
        new Date(timeExtent[1].getTime() + timePadding),
      ])
      .range([0, innerWidth]);
    // Store original scale for zoom reset
    const xScaleOriginal = xScale.copy();
    // Create clipped group for content
    const content = g.append('g')
      .attr('clip-path', 'url(#timeline-clip)');
    // Draw timeline axis line (horizontal baseline through the middle)
    content.append('line')
      .attr('class', 'axis-line')
      .attr('x1', 0)
      .attr('x2', innerWidth)
      .attr('y1', innerHeight / 2)
      .attr('y2', innerHeight / 2)
      .attr('stroke', COLORS.axisLine)
      .attr('stroke-width', 2);
    // Draw axis
    const xAxis = d3.axisBottom(xScale)
      .ticks(Math.min(parsedEvents.length, 10))
      .tickFormat((d) => formatDate(d as Date, language));
    const axisGroup = g.append('g')
      .attr('class', 'x-axis')
      .attr('transform', `translate(0,${innerHeight})`)
      .call(xAxis);
    axisGroup.selectAll('text')
      .attr('font-size', '10px')
      .attr('fill', COLORS.textLight)
      .attr('transform', 'rotate(-30)')
      .attr('text-anchor', 'end')
      .attr('dx', '-0.5em')
      .attr('dy', '0.5em');
    axisGroup.selectAll('line')
      .attr('stroke', COLORS.axis);
    axisGroup.select('.domain')
      .attr('stroke', COLORS.axis);
    // Draw event markers
    const markersGroup = content.append('g').attr('class', 'markers');
    const markers = markersGroup.selectAll('.event-marker')
      .data(parsedEvents)
      .join('g')
      .attr('class', 'event-marker')
      .attr('transform', d => `translate(${xScale(d.parsedDate)},${innerHeight / 2})`)
      .style('cursor', 'pointer');
    // Marker circles, growing in with a staggered entrance transition.
    // NOTE: selection compares selectedEventId against the event's date string.
    markers.append('circle')
      .attr('r', 0)
      .attr('fill', d => getEventColor(d.type))
      .attr('stroke', '#fff')
      .attr('stroke-width', 2)
      .transition()
      .duration(500)
      .delay((_, i) => i * 50)
      .attr('r', d => selectedEventId === d.date ? 10 : 7);
    // Marker connectors (vertical dashed lines, alternating above/below)
    markers.append('line')
      .attr('class', 'connector')
      .attr('x1', 0)
      .attr('x2', 0)
      .attr('y1', 0)
      .attr('y2', 0)
      .attr('stroke', d => getEventColor(d.type))
      .attr('stroke-width', 1.5)
      .attr('stroke-dasharray', '3,3')
      .attr('opacity', 0)
      .transition()
      .duration(500)
      .delay((_, i) => i * 50 + 300)
      .attr('y2', (_, i) => (i % 2 === 0 ? -25 : 25))
      .attr('opacity', showLabels ? 0.7 : 0);
    // Event labels, alternating above/below the baseline to reduce overlap
    if (showLabels) {
      markers.append('text')
        .attr('class', 'event-label')
        .attr('x', 0)
        .attr('y', (_, i) => (i % 2 === 0 ? -32 : 40))
        .attr('text-anchor', 'middle')
        .attr('font-size', '10px')
        .attr('fill', COLORS.text)
        .attr('opacity', 0)
        .text(d => {
          // Truncate long labels
          const maxLength = 20;
          return d.label.length > maxLength
            ? d.label.substring(0, maxLength) + '...'
            : d.label;
        })
        .transition()
        .duration(500)
        .delay((_, i) => i * 50 + 500)
        .attr('opacity', 1);
    }
    // Interaction handlers (hover enlarges the circle and drives the React tooltip)
    markers
      .on('mouseenter', function(event, d) {
        d3.select(this).select('circle')
          .transition()
          .duration(200)
          .attr('r', 10);
        const [x, y] = d3.pointer(event, containerRef.current);
        setTooltip({
          visible: true,
          x: x + 10,
          y: y - 10,
          event: d,
        });
        if (onEventHover) {
          onEventHover(d);
        }
      })
      .on('mouseleave', function(_event, d) {
        d3.select(this).select('circle')
          .transition()
          .duration(200)
          .attr('r', selectedEventId === d.date ? 10 : 7);
        setTooltip(prev => ({ ...prev, visible: false }));
        if (onEventHover) {
          onEventHover(null);
        }
      })
      .on('click', (_event, d) => {
        if (onEventClick) {
          onEventClick(d);
        }
      });
    // Zoom behavior: horizontal rescale only; axis and markers follow the transform
    const zoom = d3.zoom<SVGSVGElement, unknown>()
      .scaleExtent([0.5, 10])
      .translateExtent([[-innerWidth, -innerHeight], [innerWidth * 2, innerHeight * 2]])
      .on('zoom', (event) => {
        const newXScale = event.transform.rescaleX(xScaleOriginal);
        // Update axis
        axisGroup.call(xAxis.scale(newXScale));
        // Update markers
        markers.attr('transform', d =>
          `translate(${newXScale(d.parsedDate)},${innerHeight / 2})`
        );
        // Re-apply axis text styling, which the .call(xAxis) above resets
        axisGroup.selectAll('text')
          .attr('font-size', '10px')
          .attr('fill', COLORS.textLight)
          .attr('transform', 'rotate(-30)')
          .attr('text-anchor', 'end');
      });
    svg.call(zoom);
    // Zoom reset handler, decoupled via a window event so the React button
    // below does not need access to the D3 zoom internals
    const handleReset = () => {
      svg.transition()
        .duration(750)
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        .call(zoom.transform as any, d3.zoomIdentity);
    };
    window.addEventListener('gesprek-timeline-reset', handleReset);
    return () => {
      window.removeEventListener('gesprek-timeline-reset', handleReset);
    };
  }, [parsedEvents, width, height, innerWidth, innerHeight, margin, selectedEventId, showLabels, language, onEventClick, onEventHover]);
  // Handle zoom reset (dispatches the event the effect above listens for)
  const handleResetZoom = () => {
    window.dispatchEvent(new CustomEvent('gesprek-timeline-reset'));
  };
  // Empty state
  if (parsedEvents.length === 0) {
    return (
      <div className={`gesprek-timeline gesprek-timeline--empty ${className || ''}`}>
        <div className="gesprek-timeline__empty">
          <span>{language === 'nl' ? 'Geen tijdlijngegevens beschikbaar' : 'No timeline data available'}</span>
        </div>
      </div>
    );
  }
  return (
    <div
      ref={containerRef}
      className={`gesprek-timeline ${className || ''}`}
      style={{ position: 'relative' }}
    >
      <svg ref={svgRef} />
      {/* Tooltip (clamped so it stays inside the component bounds) */}
      {tooltip.visible && tooltip.event && (
        <div
          className="gesprek-timeline__tooltip"
          style={{
            position: 'absolute',
            left: Math.min(tooltip.x, width - 200),
            top: Math.max(tooltip.y, 10),
            pointerEvents: 'none',
            zIndex: 100,
            backgroundColor: 'white',
            border: '1px solid #e2e8f0',
            borderRadius: '4px',
            padding: '8px 12px',
            boxShadow: '0 2px 8px rgba(0,0,0,0.1)',
            fontSize: '12px',
            maxWidth: '200px',
          }}
        >
          <div style={{ fontWeight: '600', color: COLORS.text }}>{tooltip.event.label}</div>
          <div style={{ color: COLORS.primary, fontSize: '11px', marginTop: '2px' }}>
            {formatDate(parseDate(tooltip.event.date)!, language)}
          </div>
          {tooltip.event.type && (
            <div style={{
              display: 'inline-block',
              marginTop: '4px',
              padding: '2px 6px',
              fontSize: '10px',
              borderRadius: '3px',
              backgroundColor: getEventColor(tooltip.event.type) + '20',
              color: getEventColor(tooltip.event.type),
            }}>
              {tooltip.event.type}
            </div>
          )}
          {tooltip.event.description && (
            <div style={{ color: COLORS.textLight, fontSize: '11px', marginTop: '4px' }}>
              {tooltip.event.description}
            </div>
          )}
        </div>
      )}
      {/* Controls */}
      <div
        className="gesprek-timeline__controls"
        style={{
          position: 'absolute',
          top: '5px',
          right: '5px',
          display: 'flex',
          gap: '4px',
        }}
      >
        <button
          onClick={handleResetZoom}
          title={language === 'nl' ? 'Zoom resetten' : 'Reset zoom'}
          style={{
            padding: '4px 8px',
            fontSize: '11px',
            backgroundColor: 'white',
            border: '1px solid #e2e8f0',
            borderRadius: '4px',
            cursor: 'pointer',
            color: COLORS.textLight,
          }}
        >
          {language === 'nl' ? 'Reset' : 'Reset'}
        </button>
      </div>
      {/* Legend */}
      <div
        className="gesprek-timeline__legend"
        style={{
          position: 'absolute',
          bottom: '5px',
          left: '5px',
          display: 'flex',
          gap: '12px',
          fontSize: '10px',
          color: COLORS.textLight,
        }}
      >
        <span>{parsedEvents.length} {language === 'nl' ? 'gebeurtenissen' : 'events'}</span>
        <span></span>
        <span>
          {formatDate(parsedEvents[0].parsedDate, language)} {formatDate(parsedEvents[parsedEvents.length - 1].parsedDate, language)}
        </span>
      </div>
    </div>
  );
};
export default GesprekTimeline;

View file

@ -0,0 +1,18 @@
/**
* Gesprek Components Index
*
* D3 visualization components for the Gesprek (Conversation) page.
* All components follow NDE house style and support Dutch/English.
*/
export { GesprekGeoMap } from './GesprekGeoMap';
export type { GesprekGeoMapProps } from './GesprekGeoMap';
export { GesprekBarChart } from './GesprekBarChart';
export type { GesprekBarChartProps } from './GesprekBarChart';
export { GesprekTimeline } from './GesprekTimeline';
export type { GesprekTimelineProps } from './GesprekTimeline';
export { GesprekNetworkGraph } from './GesprekNetworkGraph';
export type { GesprekNetworkGraphProps } from './GesprekNetworkGraph';

View file

@ -178,6 +178,12 @@ export function Navigation() {
>
{t('overview')}
</Link>
<Link
to="/gesprek"
className={`nav-link ${isActive('/gesprek') ? 'active' : ''}`}
>
{t('gesprek')}
</Link>
<Link
to="/settings"
className={`nav-link ${isActive('/settings') ? 'active' : ''}`}
@ -268,6 +274,9 @@ export function Navigation() {
<Link to="/overview" className={`nav-mobile-link ${isActive('/overview') ? 'active' : ''}`}>
{t('overview')}
</Link>
<Link to="/gesprek" className={`nav-mobile-link ${isActive('/gesprek') ? 'active' : ''}`}>
{t('gesprek')}
</Link>
<Link to="/settings" className={`nav-mobile-link ${isActive('/settings') ? 'active' : ''}`}>
{t('settings')}
</Link>

View file

@ -6,10 +6,26 @@
*/
import React, { useEffect, useRef, useState } from 'react';
import mermaid from 'mermaid';
import type { SparqlClient } from '../../lib/sparql/client';
import './OntologyVisualizer.css';
// Lazy load mermaid to avoid bundling issues
let mermaidInstance: typeof import('mermaid').default | null = null;
const getMermaid = async () => {
if (!mermaidInstance) {
const mod = await import('mermaid');
mermaidInstance = mod.default;
mermaidInstance.initialize({
startOnLoad: false,
theme: 'default',
securityLevel: 'loose',
fontFamily: 'Arial, sans-serif',
logLevel: 'error',
});
}
return mermaidInstance;
};
export interface OntologyVisualizerProps {
/** Pre-loaded Mermaid diagram source */
mermaidSource?: string;
@ -35,16 +51,11 @@ export const OntologyVisualizer: React.FC<OntologyVisualizerProps> = ({
const [error, setError] = useState<string | null>(null);
const [zoom, setZoom] = useState(1);
const [generatedSource, setGeneratedSource] = useState<string | null>(null);
const [mermaidReady, setMermaidReady] = useState(false);
// Initialize Mermaid
// Initialize Mermaid (lazy loaded)
useEffect(() => {
mermaid.initialize({
startOnLoad: true,
theme: 'default',
securityLevel: 'loose',
fontFamily: 'Arial, sans-serif',
logLevel: 'error',
});
getMermaid().then(() => setMermaidReady(true)).catch(console.error);
}, []);
// Generate Mermaid diagram from RDF data
@ -72,10 +83,11 @@ export const OntologyVisualizer: React.FC<OntologyVisualizerProps> = ({
// Render Mermaid diagram
useEffect(() => {
const source = mermaidSource || generatedSource;
if (!source || !containerRef.current) return;
if (!source || !containerRef.current || !mermaidReady) return;
const renderDiagram = async () => {
try {
const mermaid = await getMermaid();
const { svg } = await mermaid.render('mermaid-diagram', source);
if (containerRef.current) {
containerRef.current.innerHTML = svg;
@ -87,7 +99,7 @@ export const OntologyVisualizer: React.FC<OntologyVisualizerProps> = ({
};
renderDiagram();
}, [mermaidSource, generatedSource]);
}, [mermaidSource, generatedSource, mermaidReady]);
// Generate diagram when sparqlClient is provided
useEffect(() => {

View file

@ -0,0 +1,477 @@
/**
* CustodianTypeIndicator.tsx - Three.js 19-sided Polygon for Custodian Type Display
*
* Displays a 3D polygon badge showing which CustodianType(s) a schema element
* relates to. Uses the GLAMORCUBESFIXPHDNT taxonomy with color-coded polygons.
*
* The polygon has 19 sides - one for each letter in GLAMORCUBESFIXPHDNT:
* G-L-A-M-O-R-C-U-B-E-S-F-I-X-P-H-D-N-T
*
* Usage:
* - Pass one or more custodian type codes (e.g., ['M', 'A'] for Museum + Archive)
* - Component renders a rotating 3D 19-gon (enneadecagon) with the type letter(s)
* - Colors match the centralized custodian-types.ts configuration
*/
import React, { useEffect, useRef, useMemo } from 'react';
import * as THREE from 'three';
import {
getCustodianTypeByCode,
type CustodianTypeCode,
} from '@/lib/custodian-types';
import { useLanguage } from '@/contexts/LanguageContext';
// Total number of custodian types in GLAMORCUBESFIXPHDNT = 19
// (the badge polygon gets one side per taxonomy letter)
const POLYGON_SIDES = 19;
export interface CustodianTypeIndicatorProps {
  /** Array of custodian type codes (single letters from GLAMORCUBESFIXPHDNT) */
  types: CustodianTypeCode[];
  /** Size of the indicator in pixels (default: 32) */
  size?: number;
  /** Whether to animate the polygon rotation (default: false) */
  animate?: boolean;
  /** Show tooltip on hover (default: true) */
  showTooltip?: boolean;
  /** Custom CSS class */
  className?: string;
}
/**
 * Create a regular polygon geometry with the specified number of sides.
 *
 * For the GLAMORCUBESFIXPHDNT taxonomy this yields a 19-sided polygon
 * (enneadecagon) where each side represents one custodian type letter.
 *
 * @param sides - Number of sides (default: 19 for GLAMORCUBESFIXPHDNT)
 * @param radius - Radius of the circumscribed circle
 * @returns Extruded geometry with a slight bevel for a 3D look
 */
function createPolygonGeometry(sides: number = POLYGON_SIDES, radius: number = 1): THREE.BufferGeometry {
  // (debug console.log removed — this runs on every indicator mount)
  const shape = new THREE.Shape();
  // i runs to `sides` inclusive so the outline closes back on the start vertex
  for (let i = 0; i <= sides; i++) {
    const angle = (i / sides) * Math.PI * 2 - Math.PI / 2; // Start from top
    const x = Math.cos(angle) * radius;
    const y = Math.sin(angle) * radius;
    if (i === 0) {
      shape.moveTo(x, y);
    } else {
      shape.lineTo(x, y);
    }
  }
  const extrudeSettings = {
    depth: 0.15,
    bevelEnabled: true,
    bevelThickness: 0.02,
    bevelSize: 0.02,
    bevelSegments: 2,
  };
  return new THREE.ExtrudeGeometry(shape, extrudeSettings);
}
/**
 * Build a CanvasTexture containing a single centered letter.
 *
 * A 128×128 canvas with a transparent background is used so the letter can
 * be composited over the polygon as a sprite.
 */
function createLetterTexture(letter: string, color: string): THREE.CanvasTexture {
  const side = 128;
  const canvas = document.createElement('canvas');
  canvas.width = side;
  canvas.height = side;
  const ctx = canvas.getContext('2d')!;
  // Transparent background
  ctx.clearRect(0, 0, side, side);
  // Draw the letter centered; baseline nudged to 68 (vs 64) for optical centering
  ctx.fillStyle = color;
  ctx.font = 'bold 80px system-ui, -apple-system, sans-serif';
  ctx.textAlign = 'center';
  ctx.textBaseline = 'middle';
  ctx.fillText(letter, side / 2, 68);
  const texture = new THREE.CanvasTexture(canvas);
  texture.needsUpdate = true;
  return texture;
}
/**
 * CustodianTypeIndicator Component
 *
 * Renders a 3D 19-gon badge with the custodian type letter(s) composited on
 * top as a sprite. All WebGL resources are created inside an effect and
 * disposed on cleanup; when `animate` is set the polygon gently oscillates.
 *
 * (Fix: removed the pervasive debug console.log calls that fired on every
 * mount/render; the console.error for WebGL initialization failure is kept.)
 */
export const CustodianTypeIndicator: React.FC<CustodianTypeIndicatorProps> = ({
  types,
  size = 32,
  animate = false,
  showTooltip = true,
  className = '',
}) => {
  const { language } = useLanguage();
  const containerRef = useRef<HTMLDivElement>(null);
  const rendererRef = useRef<THREE.WebGLRenderer | null>(null);
  const sceneRef = useRef<THREE.Scene | null>(null);
  const cameraRef = useRef<THREE.OrthographicCamera | null>(null);
  const meshRef = useRef<THREE.Mesh | null>(null);
  const animationFrameRef = useRef<number | null>(null);
  // Resolve type codes to their configurations, dropping unknown codes
  const typeConfigs = useMemo(() => {
    return types
      .map(code => getCustodianTypeByCode(code))
      .filter((config): config is NonNullable<typeof config> => config !== undefined);
  }, [types]);
  // Primary type for color (first one if multiple)
  const primaryConfig = typeConfigs[0];
  // Tooltip text: localized labels of all resolved types
  const tooltipText = useMemo(() => {
    if (typeConfigs.length === 0) return '';
    return typeConfigs
      .map(config => config.label[language])
      .join(', ');
  }, [typeConfigs, language]);
  // Letters to display: up to 3 codes verbatim, otherwise first two + '+'
  const displayLetters = useMemo(() => {
    if (types.length === 0) return '?';
    if (types.length === 1) return types[0];
    if (types.length <= 3) return types.join('');
    return types.slice(0, 2).join('') + '+';
  }, [types]);
  useEffect(() => {
    // Without a mount point or a resolved type there is nothing to render
    if (!containerRef.current || !primaryConfig) {
      return;
    }
    const container = containerRef.current;
    // Initialize scene
    const scene = new THREE.Scene();
    sceneRef.current = scene;
    // Orthographic camera for flat 2D-like appearance
    const aspect = 1;
    const frustumSize = 2.5;
    const camera = new THREE.OrthographicCamera(
      -frustumSize * aspect / 2,
      frustumSize * aspect / 2,
      frustumSize / 2,
      -frustumSize / 2,
      0.1,
      100
    );
    camera.position.z = 5;
    cameraRef.current = camera;
    // Renderer with transparency; WebGL may be unavailable (e.g. headless)
    let renderer: THREE.WebGLRenderer;
    try {
      renderer = new THREE.WebGLRenderer({
        antialias: true,
        alpha: true,
      });
      renderer.setSize(size, size);
      renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
      renderer.setClearColor(0x000000, 0);
      container.appendChild(renderer.domElement);
      rendererRef.current = renderer;
    } catch (err) {
      console.error('[CustodianTypeIndicator] Failed to create WebGL renderer:', err);
      return;
    }
    // 19-sided polygon geometry (one side per GLAMORCUBESFIXPHDNT letter)
    const geometry = createPolygonGeometry(POLYGON_SIDES, 1);
    // Material with custodian type color
    const primaryColor = new THREE.Color(primaryConfig.color);
    const material = new THREE.MeshStandardMaterial({
      color: primaryColor,
      metalness: 0.3,
      roughness: 0.4,
      side: THREE.DoubleSide,
    });
    const mesh = new THREE.Mesh(geometry, material);
    mesh.rotation.x = 0.15; // Slight tilt for 3D effect
    scene.add(mesh);
    meshRef.current = mesh;
    // Text sprite with the display letter(s), floated in front of the polygon
    const letterTexture = createLetterTexture(displayLetters, '#ffffff');
    const spriteMaterial = new THREE.SpriteMaterial({
      map: letterTexture,
      transparent: true,
    });
    const sprite = new THREE.Sprite(spriteMaterial);
    sprite.scale.set(1.4, 1.4, 1);
    sprite.position.z = 0.2;
    scene.add(sprite);
    // Lighting
    const ambientLight = new THREE.AmbientLight(0xffffff, 0.8);
    scene.add(ambientLight);
    const directionalLight = new THREE.DirectionalLight(0xffffff, 0.6);
    directionalLight.position.set(2, 2, 5);
    scene.add(directionalLight);
    // Animation loop: gentle oscillating Y-rotation when `animate` is set
    let rotationAngle = 0;
    const animateScene = () => {
      animationFrameRef.current = requestAnimationFrame(animateScene);
      if (animate && mesh) {
        rotationAngle += 0.01;
        mesh.rotation.y = Math.sin(rotationAngle) * 0.3;
      }
      renderer.render(scene, camera);
    };
    animateScene();
    // Initial render
    renderer.render(scene, camera);
    // Cleanup: stop the RAF loop, detach the canvas and free all GPU resources
    return () => {
      if (animationFrameRef.current) {
        cancelAnimationFrame(animationFrameRef.current);
      }
      if (renderer) {
        container.removeChild(renderer.domElement);
        renderer.dispose();
      }
      geometry.dispose();
      material.dispose();
      letterTexture.dispose();
      spriteMaterial.dispose();
    };
  }, [primaryConfig, displayLetters, size, animate, types]);
  // No resolvable type codes: render nothing
  if (!primaryConfig) {
    return null;
  }
  return (
    <div
      ref={containerRef}
      className={`custodian-type-indicator ${className}`}
      title={showTooltip ? tooltipText : undefined}
      style={{
        width: size,
        height: size,
        display: 'inline-flex',
        alignItems: 'center',
        justifyContent: 'center',
        cursor: showTooltip ? 'help' : 'default',
      }}
    />
  );
};
/**
* Simplified 2D Badge version (CSS-based, no Three.js)
* For use in lists or where 3D is overkill
*/
export interface CustodianTypeBadgeProps {
  /** Array of custodian type codes */
  types: CustodianTypeCode[];
  /** Size variant (default: 'medium') */
  size?: 'small' | 'medium' | 'large';
  /** Show the localized label text next to the letter chip (default: false) */
  showLabel?: boolean;
  /** Custom CSS class */
  className?: string;
}
export const CustodianTypeBadge: React.FC<CustodianTypeBadgeProps> = ({
  types,
  size = 'medium',
  showLabel = false,
  className = '',
}) => {
  const { language } = useLanguage();
  // Resolve codes to configurations; unknown codes are silently dropped.
  const resolvedConfigs = useMemo(
    () =>
      types
        .map(code => getCustodianTypeByCode(code))
        .filter((config): config is NonNullable<typeof config> => config !== undefined),
    [types]
  );
  if (resolvedConfigs.length === 0) {
    return null;
  }
  // First resolved type drives the badge colors.
  const [primary] = resolvedConfigs;
  // Per-size typography/padding presets.
  const sizePresets = {
    small: { fontSize: '10px', padding: '2px 4px', minWidth: '16px' },
    medium: { fontSize: '12px', padding: '3px 6px', minWidth: '20px' },
    large: { fontSize: '14px', padding: '4px 8px', minWidth: '24px' },
  };
  // Show at most three codes verbatim; collapse longer lists to two + '+'.
  let letters: string;
  if (types.length <= 3) {
    letters = types.join('');
  } else {
    letters = types.slice(0, 2).join('') + '+';
  }
  return (
    <span
      className={`custodian-type-badge ${className}`}
      title={resolvedConfigs.map(c => c.label[language]).join(', ')}
      style={{
        display: 'inline-flex',
        alignItems: 'center',
        gap: '4px',
        backgroundColor: primary.bgColor,
        color: primary.textColor,
        border: `1px solid ${primary.borderColor}`,
        borderRadius: '4px',
        fontWeight: 600,
        fontFamily: 'system-ui, -apple-system, sans-serif',
        whiteSpace: 'nowrap',
        ...sizePresets[size],
      }}
    >
      <span
        style={{
          backgroundColor: primary.color,
          color: '#ffffff',
          borderRadius: '2px',
          padding: '1px 3px',
          fontSize: 'inherit',
          lineHeight: 1,
        }}
      >
        {letters}
      </span>
      {showLabel && (
        <span style={{ color: primary.textColor }}>
          {primary.label[language]}
        </span>
      )}
    </span>
  );
};
/**
* Multi-type indicator showing all types in a row
*/
export interface CustodianTypeRowProps {
  /** Array of custodian type codes */
  types: CustodianTypeCode[];
  /** Maximum types to show before collapsing into a "+N" chip (default: 5) */
  maxVisible?: number;
  /** Size variant (default: 'small') */
  size?: 'small' | 'medium' | 'large';
  /** Custom CSS class */
  className?: string;
}
export const CustodianTypeRow: React.FC<CustodianTypeRowProps> = ({
  types,
  maxVisible = 5,
  size = 'small',
  className = '',
}) => {
  const { language } = useLanguage();
  // Split into the visible prefix and the count of collapsed codes.
  const shownCodes = types.slice(0, maxVisible);
  const overflow = types.length - maxVisible;
  // Chip dimensions per size variant.
  const dims = {
    small: { width: '16px', height: '16px', fontSize: '10px' },
    medium: { width: '20px', height: '20px', fontSize: '12px' },
    large: { width: '24px', height: '24px', fontSize: '14px' },
  };
  const chip = dims[size];
  return (
    <div
      className={`custodian-type-row ${className}`}
      style={{
        display: 'inline-flex',
        alignItems: 'center',
        gap: '2px',
      }}
    >
      {shownCodes.map(code => {
        const config = getCustodianTypeByCode(code);
        // Unknown codes render nothing.
        if (!config) return null;
        return (
          <span
            key={code}
            title={config.label[language]}
            style={{
              display: 'inline-flex',
              alignItems: 'center',
              justifyContent: 'center',
              backgroundColor: config.color,
              color: '#ffffff',
              borderRadius: '3px',
              fontWeight: 700,
              fontFamily: 'system-ui, -apple-system, sans-serif',
              ...chip,
            }}
          >
            {code}
          </span>
        );
      })}
      {overflow > 0 && (
        <span
          style={{
            display: 'inline-flex',
            alignItems: 'center',
            justifyContent: 'center',
            backgroundColor: '#94a3b8',
            color: '#ffffff',
            borderRadius: '3px',
            fontWeight: 600,
            fontSize: chip.fontSize,
            padding: '0 3px',
            height: chip.height,
          }}
          title={`+${overflow} more`}
        >
          +{overflow}
        </span>
      )}
    </div>
  );
};
export default CustodianTypeIndicator;

View file

@ -75,6 +75,7 @@ export const translations = {
map: { nl: 'Kaart', en: 'Map' },
stats: { nl: 'Statistieken', en: 'Stats' },
overview: { nl: 'Overzicht', en: 'Overview' },
gesprek: { nl: 'Gesprek', en: 'Chat' },
settings: { nl: 'Instellingen', en: 'Settings' },
signOut: { nl: 'Uitloggen', en: 'Sign Out' },
},

View file

@ -0,0 +1,657 @@
/**
* useMultiDatabaseRAG.ts - Multi-Database RAG (Retrieval-Augmented Generation) Hook
*
* Orchestrates queries across multiple databases for conversational AI:
* - Qdrant: Vector similarity search for semantic retrieval
* - Oxigraph: SPARQL queries for structured RDF data
* - TypeDB: TypeQL queries for knowledge graph traversal
*
* Based on DSPy RAG patterns for heritage institution conversations.
* Self-hosted infrastructure - no external API keys required.
*
* @see https://dspy.ai/
*/
import { useState, useCallback } from 'react';
import type { QdrantSearchResult } from './useQdrant';
// Configuration - all services are reached through relative Caddy proxy
// paths, so no host names or API keys live in the client bundle.
const API_BASE = ''; // Relative URLs via Caddy proxy
const QDRANT_URL = '/qdrant'; // Qdrant vector similarity search
const SPARQL_URL = '/sparql'; // Oxigraph SPARQL endpoint
const TYPEDB_URL = '/api/typedb'; // TypeDB knowledge graph API
const DSPY_URL = '/api/dspy'; // DSPy RAG pipeline API
// ============================================================================
// Types
// ============================================================================
/** Aggregated retrieval results from the three databases for one question. */
export interface RAGContext {
  qdrantResults: QdrantSearchResult[];
  sparqlResults: Record<string, unknown>[];
  typedbResults: Record<string, unknown>[];
  // Sum of the three result-list lengths.
  totalRetrieved: number;
}
/** Final answer produced by the RAG pipeline, plus everything needed to render it. */
export interface RAGResponse {
  answer: string;
  sparqlQuery?: string;
  typeqlQuery?: string;
  context: RAGContext;
  visualizationType?: VisualizationType;
  visualizationData?: VisualizationData;
  sources: RAGSource[];
  confidence: number;
}
/** A single retrieved item cited as supporting evidence for the answer. */
export interface RAGSource {
  database: 'qdrant' | 'oxigraph' | 'typedb';
  id: string;
  name?: string;
  score?: number;
  snippet?: string;
}
export type VisualizationType =
  | 'none'
  | 'map'       // Geographic visualization
  | 'timeline'  // Temporal visualization
  | 'network'   // Graph/relationship visualization
  | 'chart'     // Bar/line charts
  | 'table'     // Tabular data
  | 'card'      // Institution cards
  | 'gallery';  // Image gallery
/** Payload for the chosen visualization; only fields relevant to `type` are filled. */
export interface VisualizationData {
  type: VisualizationType;
  institutions?: InstitutionData[];
  coordinates?: GeoCoordinate[];
  timeline?: TimelineEvent[];
  graphData?: GraphVisualizationData;
  chartData?: ChartData;
}
/** Flattened institution record extracted from a Qdrant payload. */
export interface InstitutionData {
  id: string;
  name: string;
  type?: string;
  city?: string;
  province?: string;
  country?: string;
  latitude?: number;
  longitude?: number;
  description?: string;
  website?: string;
  isil?: string;
  wikidata?: string;
  rating?: number;
  reviews?: number;
  photoCount?: number;
}
/** A single plottable point for the map visualization. */
export interface GeoCoordinate {
  lat: number;
  lng: number;
  label: string;
  type?: string;
  data?: InstitutionData;
}
/** A single event for the timeline visualization. */
export interface TimelineEvent {
  date: string;
  label: string;
  description?: string;
  type?: string;
}
/** Nodes and edges for the network/graph visualization. */
export interface GraphVisualizationData {
  nodes: Array<{
    id: string;
    label: string;
    type: string;
    attributes?: Record<string, unknown>;
  }>;
  edges: Array<{
    id: string;
    source: string;
    target: string;
    label: string;
    type?: string;
  }>;
}
/** Labels-plus-datasets structure for the chart visualization. */
export interface ChartData {
  labels: string[];
  datasets: Array<{
    label: string;
    data: number[];
    backgroundColor?: string | string[];
    borderColor?: string;
  }>;
}
/** One turn of the conversation; assistant turns may carry the full RAG response. */
export interface ConversationMessage {
  id: string;
  role: 'user' | 'assistant' | 'system';
  content: string;
  timestamp: Date;
  response?: RAGResponse;
  isLoading?: boolean;
  error?: string;
}
/** Public API returned by the useMultiDatabaseRAG hook. */
export interface UseMultiDatabaseRAGReturn {
  // State
  isLoading: boolean;
  error: Error | null;
  lastContext: RAGContext | null;
  // Core RAG function
  queryRAG: (
    question: string,
    options?: RAGOptions
  ) => Promise<RAGResponse>;
  // Individual database queries (for debugging/advanced use)
  searchQdrant: (query: string, limit?: number) => Promise<QdrantSearchResult[]>;
  querySparql: (sparql: string) => Promise<Record<string, unknown>[]>;
  queryTypeDB: (typeql: string) => Promise<Record<string, unknown>[]>;
  // Utility functions
  clearContext: () => void;
  detectVisualizationType: (question: string, results: RAGContext) => VisualizationType;
}
/** Tuning knobs for a single RAG query. */
export interface RAGOptions {
  model?: string;
  language?: 'nl' | 'en';
  maxQdrantResults?: number;
  maxSparqlResults?: number;
  maxTypeDBResults?: number;
  includeSparql?: boolean;
  includeTypeDB?: boolean;
  conversationHistory?: ConversationMessage[];
}
// ============================================================================
// Helper Functions
// ============================================================================
/**
 * Request a text embedding from the local embedding service.
 *
 * Returns null when the service is unreachable or responds with an error,
 * which signals callers to fall back to keyword-based search instead of
 * vector similarity.
 */
async function generateEmbedding(text: string): Promise<number[] | null> {
  try {
    const res = await fetch(`${API_BASE}/api/embed`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text }),
    });
    if (!res.ok) {
      return null;
    }
    const payload = await res.json();
    return payload.embedding;
  } catch {
    // Network failure -> caller falls back to keyword search.
    return null;
  }
}
/**
 * Search Qdrant for heritage custodians matching a free-text query.
 *
 * Preferred path: vector similarity search when an embedding is available.
 * Fallback path: scroll a batch of points and rank them by naive keyword
 * overlap against the JSON-serialized payload.
 */
async function qdrantSearch(
  query: string,
  limit: number = 10
): Promise<QdrantSearchResult[]> {
  const collectionName = 'heritage_custodians';
  // Try to get embedding for semantic search
  const embedding = await generateEmbedding(query);
  if (embedding) {
    const searchResponse = await fetch(`${QDRANT_URL}/collections/${collectionName}/points/search`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        vector: embedding,
        limit,
        with_payload: true,
      }),
    });
    if (searchResponse.ok) {
      const searchData = await searchResponse.json();
      return searchData.result || [];
    }
    // On search failure we fall through to the keyword path below.
  }
  // Keyword fallback: extract terms longer than 2 chars for matching.
  const keywords = query.toLowerCase().split(/\s+/).filter(w => w.length > 2);
  const scrollResponse = await fetch(`${QDRANT_URL}/collections/${collectionName}/points/scroll`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      limit: limit * 2, // over-fetch so filtering still yields enough hits
      with_payload: true,
      with_vector: false,
    }),
  });
  if (!scrollResponse.ok) {
    throw new Error(`Qdrant scroll failed: ${scrollResponse.status}`);
  }
  const scrollData = await scrollResponse.json();
  const points = scrollData.result?.points || [];
  // Score each point by the fraction of keywords found in its payload text.
  const ranked = (points as Array<{ id: string | number; payload: Record<string, unknown> }>).map(point => {
    const payload = point.payload || {};
    const haystack = JSON.stringify(payload).toLowerCase();
    let hits = 0;
    for (const kw of keywords) {
      if (haystack.includes(kw)) {
        hits += 1;
      }
    }
    return {
      id: point.id,
      score: hits / Math.max(keywords.length, 1),
      payload,
    };
  });
  return ranked
    .filter((p: { score: number }) => p.score > 0)
    .sort((a: { score: number }, b: { score: number }) => b.score - a.score)
    .slice(0, limit);
}
/**
 * Execute a SPARQL query against the Oxigraph endpoint.
 *
 * Returns the result bindings array; throws with the server's error text
 * on a non-OK response.
 */
async function sparqlQuery(query: string): Promise<Record<string, unknown>[]> {
  const response = await fetch(`${SPARQL_URL}/query`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/sparql-query',
      'Accept': 'application/sparql-results+json',
    },
    body: query,
  });
  if (response.ok) {
    const json = await response.json();
    return json.results?.bindings || [];
  }
  const detail = await response.text();
  throw new Error(`SPARQL query failed: ${response.status} - ${detail}`);
}
/**
 * Execute a read-only TypeQL query against the TypeDB service.
 *
 * Returns the results array; throws with the server's error text on a
 * non-OK response.
 */
async function typedbQuery(query: string): Promise<Record<string, unknown>[]> {
  const response = await fetch(`${TYPEDB_URL}/query`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ query, queryType: 'read' }),
  });
  if (response.ok) {
    const json = await response.json();
    return json.results || [];
  }
  const detail = await response.text();
  throw new Error(`TypeDB query failed: ${response.status} - ${detail}`);
}
/**
 * Ask the DSPy backend to generate an answer (and optional follow-up
 * queries) from the retrieved context.
 *
 * Degrades gracefully: when the service responds with a non-OK status,
 * a locally generated summary answer with confidence 0.5 is returned.
 */
async function callDSPy(
  question: string,
  context: RAGContext,
  options: RAGOptions
): Promise<{
  answer: string;
  sparqlQuery?: string;
  typeqlQuery?: string;
  visualizationType?: VisualizationType;
  confidence: number;
}> {
  const lang = options.language || 'nl';
  // Only the last 4 turns are forwarded, trimmed to role + content.
  const history = options.conversationHistory?.slice(-4).map(m => ({
    role: m.role,
    content: m.content,
  }));
  const response = await fetch(`${DSPY_URL}/rag-query`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      question,
      context: {
        qdrant_results: context.qdrantResults.slice(0, 5),
        sparql_results: context.sparqlResults.slice(0, 10),
        typedb_results: context.typedbResults.slice(0, 10),
      },
      language: lang,
      model: options.model || 'claude-sonnet-4-5-20250929',
      conversation_history: history,
    }),
  });
  if (response.ok) {
    return response.json();
  }
  // Fallback response if DSPy service unavailable
  return {
    answer: generateFallbackAnswer(question, context, lang),
    confidence: 0.5,
  };
}
/**
 * Generate a fallback answer when the DSPy service is unavailable.
 *
 * Produces a short bilingual summary naming up to five retrieved
 * institutions, or a "no results" message when nothing was retrieved.
 */
function generateFallbackAnswer(
  _question: string,
  context: RAGContext,
  language: 'nl' | 'en'
): string {
  const total = context.totalRetrieved;
  if (total === 0) {
    return language === 'nl'
      ? 'Geen resultaten gevonden voor uw vraag.'
      : 'No results found for your question.';
  }
  const listed = context.qdrantResults
    .slice(0, 5)
    .map(hit => hit.payload?.name || hit.payload?.custodian_name || 'Unknown')
    .join(', ');
  return language === 'nl'
    ? `Ik heb ${total} resultaten gevonden. Enkele relevante instellingen: ${listed}.`
    : `I found ${total} results. Some relevant institutions: ${listed}.`;
}
/**
 * Detect the most appropriate visualization type for a question.
 *
 * Checks bilingual (NL/EN) keyword groups in priority order
 * (map > timeline > network > chart), then falls back to a map when the
 * results carry coordinates, a card grid when there are Qdrant hits, and
 * a table otherwise.
 */
function detectVisualizationType(
  question: string,
  context: RAGContext
): VisualizationType {
  const q = question.toLowerCase();
  const mentionsAny = (terms: string[]): boolean => terms.some(t => q.includes(t));
  // Map visualization keywords
  if (mentionsAny(['kaart', 'map', 'waar', 'where', 'locatie', 'location',
      'provincie', 'province', 'stad', 'city', 'geografisch', 'geographic'])) {
    return 'map';
  }
  // Timeline keywords
  if (mentionsAny(['wanneer', 'when', 'geschiedenis', 'history', 'tijdlijn',
      'timeline', 'opgericht', 'founded', 'jaar', 'year'])) {
    return 'timeline';
  }
  // Network/graph keywords
  if (mentionsAny(['relatie', 'relationship', 'verbinding', 'connection',
      'netwerk', 'network', 'samenwer', 'collaborat'])) {
    return 'network';
  }
  // Chart keywords
  if (mentionsAny(['hoeveel', 'how many', 'aantal', 'count', 'statistiek',
      'statistic', 'verdeling', 'distribution', 'vergelijk', 'compare'])) {
    return 'chart';
  }
  // If we have location data, show map
  const hasCoordinates = context.qdrantResults.some(r =>
    r.payload?.latitude || r.payload?.coordinates
  );
  if (hasCoordinates && context.totalRetrieved > 0) {
    return 'map';
  }
  // Default to cards for institution results, table otherwise
  return context.qdrantResults.length > 0 ? 'card' : 'table';
}
/**
 * Extract visualization data from the RAG context.
 *
 * Flattens each Qdrant payload into an InstitutionData record and, for map
 * visualizations, derives the list of plottable coordinates.
 *
 * Fix: missing or non-numeric latitude/longitude values now become
 * `undefined` instead of `NaN` (previously `Number(undefined)` leaked NaN
 * into every institution record without coordinates).
 */
function extractVisualizationData(
  type: VisualizationType,
  context: RAGContext
): VisualizationData {
  const data: VisualizationData = { type };
  // First usable numeric value among the candidates, else undefined.
  const toCoord = (...candidates: unknown[]): number | undefined => {
    for (const value of candidates) {
      if (value === null || value === undefined || value === '') continue;
      const num = Number(value);
      if (!Number.isNaN(num)) return num;
    }
    return undefined;
  };
  // Extract institution data from Qdrant results
  data.institutions = context.qdrantResults.map(r => {
    const p = (r.payload || {}) as Record<string, unknown>;
    const location = (p.location || {}) as Record<string, unknown>;
    const coordinates = (p.coordinates || {}) as Record<string, unknown>;
    return {
      id: String(r.id),
      name: String(p.name || p.custodian_name || p.institution_name || 'Unknown'),
      type: String(p.type || p.institution_type || ''),
      city: String(p.city || location.city || ''),
      province: String(p.province || p.region || ''),
      country: String(p.country || 'NL'),
      latitude: toCoord(p.latitude, coordinates.lat, location.latitude),
      longitude: toCoord(p.longitude, coordinates.lng, location.longitude),
      description: String(p.description || ''),
      website: String(p.website || p.url || ''),
      isil: String(p.isil || p.isil_code || ''),
      wikidata: String(p.wikidata || p.wikidata_id || ''),
      rating: Number(p.rating || p.google_rating || 0),
      reviews: Number(p.reviews || p.review_count || 0),
      photoCount: Number(p.photoCount || p.photo_count || 0),
    };
  });
  // Extract coordinates for map. Zero/missing coordinates are treated as
  // absent, matching the previous truthiness-based filter.
  if (type === 'map') {
    data.coordinates = data.institutions
      .filter(i => i.latitude && i.longitude)
      .map(i => ({
        lat: i.latitude!,
        lng: i.longitude!,
        label: i.name,
        type: i.type,
        data: i,
      }));
  }
  return data;
}
// ============================================================================
// Hook Implementation
// ============================================================================
/**
 * React hook exposing the multi-database RAG pipeline.
 *
 * `queryRAG` fans out to Qdrant (always) plus Oxigraph/SPARQL and TypeDB
 * (optionally), feeds the merged context to DSPy for answer generation,
 * and packages the result with visualization hints and cited sources.
 */
export function useMultiDatabaseRAG(): UseMultiDatabaseRAGReturn {
  const [isLoading, setIsLoading] = useState(false);
  const [error, setError] = useState<Error | null>(null);
  const [lastContext, setLastContext] = useState<RAGContext | null>(null);
  /**
   * Main RAG query function - orchestrates multi-database retrieval
   */
  const queryRAG = useCallback(async (
    question: string,
    options: RAGOptions = {}
  ): Promise<RAGResponse> => {
    setIsLoading(true);
    setError(null);
    const {
      maxQdrantResults = 20,
      maxSparqlResults = 50,
      maxTypeDBResults = 50,
      includeSparql = true,
      includeTypeDB = false, // Disabled by default (may not be running)
    } = options;
    try {
      // Parallel retrieval from all databases. The index of each optional
      // retrieval is tracked explicitly so results cannot be misattributed
      // when one source is disabled (previously this relied on a fragile
      // `results[2] || results[1]` positional fallback).
      const retrievalPromises: Promise<unknown>[] = [
        qdrantSearch(question, maxQdrantResults),
      ];
      let sparqlIndex = -1;
      let typedbIndex = -1;
      // Add SPARQL if enabled (construct a basic query from keywords)
      if (includeSparql) {
        const keywords = question.split(/\s+/).filter(w => w.length > 2).slice(0, 3);
        const sparqlSearchQuery = `
          PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
          PREFIX schema: <http://schema.org/>
          PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
          SELECT ?s ?label ?type WHERE {
            ?s rdfs:label|schema:name|skos:prefLabel ?label .
            OPTIONAL { ?s a ?type }
            FILTER(CONTAINS(LCASE(STR(?label)), "${keywords[0]?.toLowerCase() || ''}"))
          }
          LIMIT ${maxSparqlResults}
        `;
        sparqlIndex = retrievalPromises.length;
        retrievalPromises.push(
          sparqlQuery(sparqlSearchQuery).catch(() => [])
        );
      }
      // Add TypeDB if enabled
      if (includeTypeDB) {
        const typeqlSearchQuery = `match $x isa heritage_custodian, has name $n; get $x, $n; limit ${maxTypeDBResults};`;
        typedbIndex = retrievalPromises.length;
        retrievalPromises.push(
          typedbQuery(typeqlSearchQuery).catch(() => [])
        );
      }
      // Wait for all retrievals
      const results = await Promise.all(retrievalPromises);
      const qdrantResults = results[0] as QdrantSearchResult[];
      const sparqlResults = (sparqlIndex >= 0 ? results[sparqlIndex] : []) as Record<string, unknown>[];
      const typedbResults = (typedbIndex >= 0 ? results[typedbIndex] : []) as Record<string, unknown>[];
      const context: RAGContext = {
        qdrantResults,
        sparqlResults,
        typedbResults,
        totalRetrieved: qdrantResults.length + sparqlResults.length + typedbResults.length,
      };
      setLastContext(context);
      // Call DSPy to generate response
      const dspyResponse = await callDSPy(question, context, options);
      // Detect visualization type (DSPy's suggestion wins over the heuristic)
      const vizType = dspyResponse.visualizationType || detectVisualizationType(question, context);
      // Extract visualization data
      const vizData = extractVisualizationData(vizType, context);
      // Build sources list (top Qdrant hits only)
      const sources: RAGSource[] = [
        ...qdrantResults.slice(0, 5).map(r => ({
          database: 'qdrant' as const,
          id: String(r.id),
          name: String(r.payload?.name || r.payload?.custodian_name || ''),
          score: r.score,
          snippet: String(r.payload?.description || '').slice(0, 200),
        })),
      ];
      return {
        answer: dspyResponse.answer,
        sparqlQuery: dspyResponse.sparqlQuery,
        typeqlQuery: dspyResponse.typeqlQuery,
        context,
        visualizationType: vizType,
        visualizationData: vizData,
        sources,
        confidence: dspyResponse.confidence,
      };
    } catch (err) {
      const error = err instanceof Error ? err : new Error('RAG query failed');
      setError(error);
      throw error;
    } finally {
      setIsLoading(false);
    }
  }, []);
  /**
   * Direct Qdrant search (for debugging/advanced use)
   */
  const searchQdrant = useCallback(async (
    query: string,
    limit: number = 10
  ): Promise<QdrantSearchResult[]> => {
    return qdrantSearch(query, limit);
  }, []);
  /**
   * Direct SPARQL query (for debugging/advanced use)
   */
  const querySparql = useCallback(async (
    sparql: string
  ): Promise<Record<string, unknown>[]> => {
    return sparqlQuery(sparql);
  }, []);
  /**
   * Direct TypeDB query (for debugging/advanced use)
   */
  const queryTypeDB = useCallback(async (
    typeql: string
  ): Promise<Record<string, unknown>[]> => {
    return typedbQuery(typeql);
  }, []);
  /**
   * Clear cached context and any stored error
   */
  const clearContext = useCallback(() => {
    setLastContext(null);
    setError(null);
  }, []);
  return {
    isLoading,
    error,
    lastContext,
    queryRAG,
    searchQdrant,
    querySparql,
    queryTypeDB,
    clearContext,
    detectVisualizationType,
  };
}
export default useMultiDatabaseRAG;

View file

@ -18,6 +18,7 @@
import { useState, useEffect, useCallback, useRef } from 'react';
import maplibregl from 'maplibre-gl';
import type { GeoJSONSource, LngLatLike } from 'maplibre-gl';
import type {
Archive,
WerkgebiedMapping,
@ -375,7 +376,7 @@ export function useWerkgebiedMapLibre(map: maplibregl.Map | null): WerkgebiedHoo
const hideWerkgebied = useCallback(() => {
if (!map) return;
const source = map.getSource(WERKGEBIED_SOURCE_ID) as maplibregl.GeoJSONSource | undefined;
const source = map.getSource(WERKGEBIED_SOURCE_ID) as GeoJSONSource | undefined;
if (source) {
source.setData({
type: 'FeatureCollection',
@ -486,7 +487,7 @@ export function useWerkgebiedMapLibre(map: maplibregl.Map | null): WerkgebiedHoo
}
// Update source data
const source = map.getSource(WERKGEBIED_SOURCE_ID) as maplibregl.GeoJSONSource | undefined;
const source = map.getSource(WERKGEBIED_SOURCE_ID) as GeoJSONSource | undefined;
if (source) {
source.setData({
type: 'FeatureCollection',
@ -509,12 +510,12 @@ export function useWerkgebiedMapLibre(map: maplibregl.Map | null): WerkgebiedHoo
const geometry = feature.geometry;
if (geometry.type === 'Polygon') {
geometry.coordinates[0].forEach((coord: number[]) => {
bounds.extend([coord[0], coord[1]] as maplibregl.LngLatLike);
bounds.extend([coord[0], coord[1]] as LngLatLike);
});
} else if (geometry.type === 'MultiPolygon') {
geometry.coordinates.forEach((polygon: number[][][]) => {
polygon[0].forEach((coord: number[]) => {
bounds.extend([coord[0], coord[1]] as maplibregl.LngLatLike);
bounds.extend([coord[0], coord[1]] as LngLatLike);
});
});
}
@ -843,7 +844,7 @@ export function useWerkgebiedMapLibre(map: maplibregl.Map | null): WerkgebiedHoo
}
// Update source
const source = map.getSource(WERKGEBIED_SOURCE_ID) as maplibregl.GeoJSONSource | undefined;
const source = map.getSource(WERKGEBIED_SOURCE_ID) as GeoJSONSource | undefined;
if (source) {
source.setData({
type: 'FeatureCollection',
@ -864,12 +865,12 @@ export function useWerkgebiedMapLibre(map: maplibregl.Map | null): WerkgebiedHoo
const geometry = feature.geometry;
if (geometry.type === 'Polygon') {
(geometry.coordinates[0] as number[][]).forEach((coord) => {
bounds.extend([coord[0], coord[1]] as maplibregl.LngLatLike);
bounds.extend([coord[0], coord[1]] as LngLatLike);
});
} else if (geometry.type === 'MultiPolygon') {
(geometry.coordinates as number[][][][]).forEach((polygon) => {
(polygon[0] as number[][]).forEach((coord) => {
bounds.extend([coord[0], coord[1]] as maplibregl.LngLatLike);
bounds.extend([coord[0], coord[1]] as LngLatLike);
});
});
}

View file

@ -0,0 +1,386 @@
/**
* GLAMORCUBESFIXPHDNT Taxonomy - Heritage Custodian Type Configuration
*
* This module provides centralized color, label, and metadata configuration
* for the 19-type GLAMORCUBESFIXPHDNT heritage custodian taxonomy.
*
* Mnemonic: Galleries, Libraries, Archives, Museums, Official institutions,
* Research centers, Corporations, Unknown, Botanical gardens/zoos,
* Education providers, Societies, Features, Intangible heritage groups,
* miXed, Personal collections, Holy sites, Digital platforms, NGOs,
* Taste/smell heritage
*
* @see AGENTS.md - Institution Type Taxonomy section
* @see schemas/20251121/linkml/modules/enums/CustodianTypeEnum.yaml
*/
/**
 * Single-letter codes for each custodian type (used in GHCID generation)
 */
export type CustodianTypeCode =
  | 'G' | 'L' | 'A' | 'M' | 'O' | 'R' | 'C' | 'U' | 'B' | 'E'
  | 'S' | 'F' | 'I' | 'X' | 'P' | 'H' | 'D' | 'N' | 'T';
/**
 * Full custodian type names (matches LinkML enum values)
 */
export type CustodianType =
  | 'GALLERY' | 'LIBRARY' | 'ARCHIVE' | 'MUSEUM' | 'OFFICIAL_INSTITUTION'
  | 'RESEARCH_CENTER' | 'CORPORATION' | 'UNKNOWN' | 'BOTANICAL_ZOO'
  | 'EDUCATION_PROVIDER' | 'COLLECTING_SOCIETY' | 'FEATURES'
  | 'INTANGIBLE_HERITAGE_GROUP' | 'MIXED' | 'PERSONAL_COLLECTION'
  | 'HOLY_SITES' | 'DIGITAL_PLATFORM' | 'NGO' | 'TASTE_SMELL';
/**
 * Bilingual labels for each custodian type
 */
export interface BilingualLabel {
  nl: string; // Dutch display text
  en: string; // English display text
}
/**
 * Complete configuration for a single custodian type
 */
export interface CustodianTypeConfig {
  /** Single-letter code for GHCID */
  code: CustodianTypeCode;
  /** Full enum name (LinkML) */
  name: CustodianType;
  /** Primary color (hex) - used for map markers, badges, etc. */
  color: string;
  /** Light background color (hex) - used for cards, highlights */
  bgColor: string;
  /** Border/accent color (hex) - used for outlines */
  borderColor: string;
  /** Text color for high contrast on bgColor */
  textColor: string;
  /** Bilingual display labels */
  label: BilingualLabel;
  /** Short description */
  description: BilingualLabel;
  /** Icon name (Lucide React) */
  icon: string;
}
/**
 * Complete GLAMORCUBESFIXPHDNT taxonomy configuration
 *
 * Colors are designed to be:
 * - Distinguishable from each other
 * - Colorblind-friendly where possible
 * - Consistent with the existing map page colors
 * - Suitable for both light and dark modes (primary colors work on both)
 *
 * Fix: H (Holy sites) previously reused X (Mixed)'s '#607d8b', which made
 * the two indistinguishable on maps and badges; it now uses a dark pink
 * consistent with its own bg/border/text palette.
 */
export const CUSTODIAN_TYPES: Record<CustodianTypeCode, CustodianTypeConfig> = {
  G: {
    code: 'G',
    name: 'GALLERY',
    color: '#00bcd4', // Cyan
    bgColor: '#e0f7fa',
    borderColor: '#0097a7',
    textColor: '#006064',
    label: { nl: 'Galerie', en: 'Gallery' },
    description: { nl: 'Kunstgalerij of tentoonstellingsruimte', en: 'Art gallery or exhibition space' },
    icon: 'Frame',
  },
  L: {
    code: 'L',
    name: 'LIBRARY',
    color: '#2ecc71', // Green
    bgColor: '#e8f5e9',
    borderColor: '#27ae60',
    textColor: '#1b5e20',
    label: { nl: 'Bibliotheek', en: 'Library' },
    description: { nl: 'Openbare, academische of gespecialiseerde bibliotheek', en: 'Public, academic, or specialized library' },
    icon: 'BookOpen',
  },
  A: {
    code: 'A',
    name: 'ARCHIVE',
    color: '#3498db', // Blue
    bgColor: '#e3f2fd',
    borderColor: '#2980b9',
    textColor: '#0d47a1',
    label: { nl: 'Archief', en: 'Archive' },
    description: { nl: 'Overheids-, bedrijfs- of persoonlijk archief', en: 'Government, corporate, or personal archive' },
    icon: 'Archive',
  },
  M: {
    code: 'M',
    name: 'MUSEUM',
    color: '#e74c3c', // Red
    bgColor: '#ffebee',
    borderColor: '#c0392b',
    textColor: '#b71c1c',
    label: { nl: 'Museum', en: 'Museum' },
    description: { nl: 'Kunst-, geschiedenis- of wetenschapsmuseum', en: 'Art, history, or science museum' },
    icon: 'Building2',
  },
  O: {
    code: 'O',
    name: 'OFFICIAL_INSTITUTION',
    color: '#f39c12', // Orange
    bgColor: '#fff8e1',
    borderColor: '#e67e22',
    textColor: '#e65100',
    label: { nl: 'Officieel', en: 'Official' },
    description: { nl: 'Overheidserfgoedinstantie of -platform', en: 'Government heritage agency or platform' },
    icon: 'Landmark',
  },
  R: {
    code: 'R',
    name: 'RESEARCH_CENTER',
    color: '#1abc9c', // Teal
    bgColor: '#e0f2f1',
    borderColor: '#16a085',
    textColor: '#004d40',
    label: { nl: 'Onderzoek', en: 'Research' },
    description: { nl: 'Onderzoeksinstituut of documentatiecentrum', en: 'Research institute or documentation center' },
    icon: 'Search',
  },
  C: {
    code: 'C',
    name: 'CORPORATION',
    color: '#795548', // Brown
    bgColor: '#efebe9',
    borderColor: '#5d4037',
    textColor: '#3e2723',
    label: { nl: 'Bedrijf', en: 'Corporation' },
    description: { nl: 'Bedrijfserfgoedcollectie', en: 'Corporate heritage collection' },
    icon: 'Building',
  },
  U: {
    code: 'U',
    name: 'UNKNOWN',
    color: '#9e9e9e', // Gray
    bgColor: '#f5f5f5',
    borderColor: '#757575',
    textColor: '#424242',
    label: { nl: 'Onbekend', en: 'Unknown' },
    description: { nl: 'Type kan niet worden bepaald', en: 'Type cannot be determined' },
    icon: 'HelpCircle',
  },
  B: {
    code: 'B',
    name: 'BOTANICAL_ZOO',
    color: '#4caf50', // Green (different shade)
    bgColor: '#e8f5e9',
    borderColor: '#388e3c',
    textColor: '#1b5e20',
    label: { nl: 'Botanisch', en: 'Botanical' },
    description: { nl: 'Botanische tuin of dierentuin', en: 'Botanical garden or zoo' },
    icon: 'Leaf',
  },
  E: {
    code: 'E',
    name: 'EDUCATION_PROVIDER',
    color: '#ff9800', // Amber
    bgColor: '#fff3e0',
    borderColor: '#f57c00',
    textColor: '#e65100',
    label: { nl: 'Onderwijs', en: 'Education' },
    description: { nl: 'Onderwijsinstelling met collecties', en: 'Educational institution with collections' },
    icon: 'GraduationCap',
  },
  S: {
    code: 'S',
    name: 'COLLECTING_SOCIETY',
    color: '#9b59b6', // Purple
    bgColor: '#f3e5f5',
    borderColor: '#8e24aa',
    textColor: '#4a148c',
    label: { nl: 'Vereniging', en: 'Society' },
    description: { nl: 'Vereniging die gespecialiseerde materialen verzamelt', en: 'Society collecting specialized materials' },
    icon: 'Users',
  },
  F: {
    code: 'F',
    name: 'FEATURES',
    color: '#95a5a6', // Gray-green
    bgColor: '#eceff1',
    borderColor: '#78909c',
    textColor: '#37474f',
    label: { nl: 'Monumenten', en: 'Features' },
    description: { nl: 'Fysieke landschapskenmerken met erfgoedwaarde', en: 'Physical landscape features with heritage significance' },
    icon: 'Map',
  },
  I: {
    code: 'I',
    name: 'INTANGIBLE_HERITAGE_GROUP',
    color: '#673ab7', // Deep purple
    bgColor: '#ede7f6',
    borderColor: '#5e35b1',
    textColor: '#311b92',
    label: { nl: 'Immaterieel', en: 'Intangible' },
    description: { nl: 'Organisatie die immaterieel erfgoed bewaart', en: 'Organization preserving intangible heritage' },
    icon: 'Music',
  },
  X: {
    code: 'X',
    name: 'MIXED',
    color: '#607d8b', // Blue-gray
    bgColor: '#eceff1',
    borderColor: '#546e7a',
    textColor: '#263238',
    label: { nl: 'Gemengd', en: 'Mixed' },
    description: { nl: 'Meerdere types (gecombineerde faciliteit)', en: 'Multiple types (combined facility)' },
    icon: 'Layers',
  },
  P: {
    code: 'P',
    name: 'PERSONAL_COLLECTION',
    color: '#8bc34a', // Light green
    bgColor: '#f1f8e9',
    borderColor: '#689f38',
    textColor: '#33691e',
    label: { nl: 'Persoonlijk', en: 'Personal' },
    description: { nl: 'Privé persoonlijke collectie', en: 'Private personal collection' },
    icon: 'User',
  },
  H: {
    code: 'H',
    name: 'HOLY_SITES',
    color: '#ad1457', // Dark pink - matches H's pink bg/border/text family, distinct from X and N
    bgColor: '#fce4ec',
    borderColor: '#c2185b',
    textColor: '#880e4f',
    label: { nl: 'Heilige plaatsen', en: 'Holy sites' },
    description: { nl: 'Religieuze erfgoedlocaties en -instellingen', en: 'Religious heritage sites and institutions' },
    icon: 'Church',
  },
  D: {
    code: 'D',
    name: 'DIGITAL_PLATFORM',
    color: '#34495e', // Dark gray-blue
    bgColor: '#e8eaf6',
    borderColor: '#3949ab',
    textColor: '#1a237e',
    label: { nl: 'Digitaal', en: 'Digital' },
    description: { nl: 'Digitale erfgoedplatforms en repositories', en: 'Digital heritage platforms and repositories' },
    icon: 'Monitor',
  },
  N: {
    code: 'N',
    name: 'NGO',
    color: '#e91e63', // Pink
    bgColor: '#fce4ec',
    borderColor: '#c2185b',
    textColor: '#880e4f',
    label: { nl: 'NGO', en: 'NGO' },
    description: { nl: 'Niet-gouvernementele erfgoedorganisatie', en: 'Non-governmental heritage organization' },
    icon: 'Heart',
  },
  T: {
    code: 'T',
    name: 'TASTE_SMELL',
    color: '#ff5722', // Deep orange
    bgColor: '#fbe9e7',
    borderColor: '#e64a19',
    textColor: '#bf360c',
    label: { nl: 'Smaak/geur', en: 'Taste/smell' },
    description: { nl: 'Culinair en olfactorisch erfgoedinstelling', en: 'Culinary and olfactory heritage institution' },
    icon: 'ChefHat',
  },
};
/**
 * Get custodian type configuration by single-letter code.
 * Case-insensitive; returns undefined for empty, non-string, or
 * unrecognized input.
 */
export function getCustodianTypeByCode(code: CustodianTypeCode | string): CustodianTypeConfig | undefined {
  if (typeof code !== 'string' || code === '') return undefined;
  const key = code.toUpperCase();
  return key in CUSTODIAN_TYPES
    ? CUSTODIAN_TYPES[key as CustodianTypeCode]
    : undefined;
}
/**
 * Get custodian type configuration by single-letter code, falling back to
 * the UNKNOWN config so a valid entry is always returned.
 */
export function getCustodianTypeByCodeSafe(code: CustodianTypeCode | string): CustodianTypeConfig {
  const found = getCustodianTypeByCode(code);
  return found !== undefined ? found : CUSTODIAN_TYPES.U;
}
/**
 * Get custodian type configuration by full (LinkML enum) name.
 * Returns undefined when no config carries that name.
 */
export function getCustodianTypeByName(name: CustodianType): CustodianTypeConfig | undefined {
  for (const config of Object.values(CUSTODIAN_TYPES)) {
    if (config.name === name) {
      return config;
    }
  }
  return undefined;
}
/**
 * Get all custodian type codes as array (in GLAMORCUBESFIXPHDNT order)
 */
// This ordering spells the mnemonic and is relied on by the derived maps below.
export const CUSTODIAN_TYPE_CODES: CustodianTypeCode[] = [
  'G', 'L', 'A', 'M', 'O', 'R', 'C', 'U', 'B', 'E',
  'S', 'F', 'I', 'X', 'P', 'H', 'D', 'N', 'T'
];
// The four maps below are derived from CUSTODIAN_TYPES at module load,
// so they stay in sync with the master configuration automatically.
/**
 * Color-only map for backwards compatibility with existing code
 * (Same as TYPE_COLORS in NDEMapPageMapLibre.tsx)
 */
export const CUSTODIAN_TYPE_COLORS: Record<CustodianTypeCode, string> = Object.fromEntries(
  CUSTODIAN_TYPE_CODES.map(code => [code, CUSTODIAN_TYPES[code].color])
) as Record<CustodianTypeCode, string>;
/**
 * Label-only map for backwards compatibility
 * (Same as TYPE_NAMES in NDEMapPageMapLibre.tsx)
 */
export const CUSTODIAN_TYPE_LABELS: Record<CustodianTypeCode, BilingualLabel> = Object.fromEntries(
  CUSTODIAN_TYPE_CODES.map(code => [code, CUSTODIAN_TYPES[code].label])
) as Record<CustodianTypeCode, BilingualLabel>;
/**
 * Full name to code mapping
 */
export const NAME_TO_CODE: Record<CustodianType, CustodianTypeCode> = Object.fromEntries(
  Object.entries(CUSTODIAN_TYPES).map(([code, config]) => [config.name, code as CustodianTypeCode])
) as Record<CustodianType, CustodianTypeCode>;
/**
 * Code to full name mapping
 */
export const CODE_TO_NAME: Record<CustodianTypeCode, CustodianType> = Object.fromEntries(
  CUSTODIAN_TYPE_CODES.map(code => [code, CUSTODIAN_TYPES[code].name])
) as Record<CustodianTypeCode, CustodianType>;
/**
 * Parse a custodian type string (single-letter code or full name) to a code.
 * Hyphens and whitespace in full names are normalized to underscores.
 * Returns undefined when the input is not recognized.
 */
export function parseCustodianType(input: string): CustodianTypeCode | undefined {
  // Single-letter code?
  const asCode = input.toUpperCase() as CustodianTypeCode;
  if (input.length === 1 && CUSTODIAN_TYPE_CODES.includes(asCode)) {
    return asCode;
  }
  // Full enum name (allowing "holy-sites" / "holy sites" spellings)?
  const normalized = input.toUpperCase().replace(/[-\s]/g, '_');
  return normalized in NAME_TO_CODE
    ? NAME_TO_CODE[normalized as CustodianType]
    : undefined;
}
/**
 * Get display color for a custodian type (by code or name).
 * Unrecognized input falls back to the UNKNOWN color.
 */
export function getCustodianTypeColor(input: string): string {
  const code = parseCustodianType(input);
  const config = code ? CUSTODIAN_TYPES[code] : CUSTODIAN_TYPES.U;
  return config.color;
}
/**
 * Get display label for a custodian type in the requested language.
 * Unrecognized input falls back to the UNKNOWN label.
 */
export function getCustodianTypeLabel(input: string, lang: 'nl' | 'en' = 'en'): string {
  const code = parseCustodianType(input);
  const config = code ? CUSTODIAN_TYPES[code] : CUSTODIAN_TYPES.U;
  return config.label[lang];
}

View file

@ -0,0 +1,237 @@
/**
* schema-custodian-mapping.ts - Maps LinkML schema elements to CustodianTypes
*
* This module provides mappings between schema elements (classes, slots, enums)
* and the GLAMORCUBESFIXPHDNT custodian types they primarily relate to.
*
* Used by the CustodianTypeIndicator component to show which type(s) a schema
* element is most relevant to.
*/
import type { CustodianTypeCode } from './custodian-types';
/**
 * Mapping of schema class names to relevant custodian types
 *
 * Key: Class name (as it appears in LinkML schema)
 * Value: Array of CustodianTypeCode(s) the class relates to
 *
 * Most classes relate to ALL types (universal), but some are type-specific.
 * Universal entries are derived from ALL_CUSTODIAN_TYPES below instead of
 * repeating the 19-element literal, so adding a type needs one change.
 */
/** Every custodian type code, in GLAMORCUBESFIXPHDNT order. */
const ALL_CUSTODIAN_TYPES: CustodianTypeCode[] = [
  'G', 'L', 'A', 'M', 'O', 'R', 'C', 'U', 'B', 'E',
  'S', 'F', 'I', 'X', 'P', 'H', 'D', 'N', 'T',
];
export const CLASS_TO_CUSTODIAN_TYPE: Record<string, CustodianTypeCode[]> = {
  // Universal classes (apply to all custodian types)
  'CustodianObservation': [...ALL_CUSTODIAN_TYPES],
  'CustodianName': [...ALL_CUSTODIAN_TYPES],
  'CustodianReconstruction': [...ALL_CUSTODIAN_TYPES],
  'Location': [...ALL_CUSTODIAN_TYPES],
  'GHCID': [...ALL_CUSTODIAN_TYPES],
  'Provenance': [...ALL_CUSTODIAN_TYPES],
  // Place-related classes
  'FeaturePlace': ['F'], // Features (monuments, statues)
  'FeaturePlaceClass': ['F'],
  // Collection-related classes
  'Collection': ['G', 'L', 'A', 'M', 'B', 'H'], // Galleries, Libraries, Archives, Museums, Botanical, Holy sites
  'CollectionItem': ['G', 'L', 'A', 'M', 'B', 'H'],
  // Digital platform classes
  'DigitalPlatform': ['D'], // Digital platforms
  'DigitalPlatformClass': ['D'],
  'WebObservation': ['D'],
  'WebClaim': ['D'],
  // Archive-specific
  'ArchivalFonds': ['A'], // Archives
  'ArchivalSeries': ['A'],
  'ArchivalRecord': ['A'],
  // Library-specific
  'BibliographicRecord': ['L'], // Libraries
  'Catalog': ['L'],
  // Museum-specific
  'Exhibition': ['M', 'G'], // Museums, Galleries
  'MuseumObject': ['M'],
  // Research-related
  'ResearchProject': ['R'], // Research centers
  'Publication': ['R', 'L'], // Research centers, Libraries
  // Education-related
  'Course': ['E'], // Education providers
  'LearningResource': ['E', 'D'], // Education, Digital platforms
  // Religious heritage
  'ReligiousCollection': ['H'], // Holy sites
  'LiturgicalObject': ['H'],
  // Botanical/Zoo
  'LivingCollection': ['B'], // Botanical gardens/zoos
  'Specimen': ['B'],
  // Intangible heritage
  'IntangibleHeritage': ['I'], // Intangible heritage groups
  'Performance': ['I'],
  'Tradition': ['I'],
  // Organizational
  'StaffRole': ['G', 'L', 'A', 'M', 'O', 'R', 'C', 'E', 'S', 'H', 'N'],
  'OrganizationalChange': ALL_CUSTODIAN_TYPES.filter(c => c !== 'F'), // all except landscape Features
  // Personal collections
  'PersonalCollection': ['P'], // Personal collections
  'PrivateArchive': ['P'],
  // Corporate
  'CorporateCollection': ['C'], // Corporations
  'CorporateArchive': ['C'],
  // Society-related
  'SocietyMembership': ['S'], // Collecting societies
  'HeemkundigeKring': ['S'],
  // Taste/Smell heritage
  'CulinaryHeritage': ['T'], // Taste/smell heritage
  'Recipe': ['T'],
  'Formulation': ['T'],
  // NGO-specific
  'AdvocacyOrganization': ['N'], // NGOs
  'HeritageInitiative': ['N'],
};
/**
 * Mapping of schema slot names to relevant custodian types
 *
 * Slots not listed here fall back to DEFAULT_CUSTODIAN_TYPES via
 * getCustodianTypesForSlot().
 */
export const SLOT_TO_CUSTODIAN_TYPE: Record<string, CustodianTypeCode[]> = {
  // Universal slots
  'custodian_name': ['G', 'L', 'A', 'M', 'O', 'R', 'C', 'U', 'B', 'E', 'S', 'F', 'I', 'X', 'P', 'H', 'D', 'N', 'T'],
  'location': ['G', 'L', 'A', 'M', 'O', 'R', 'C', 'U', 'B', 'E', 'S', 'F', 'I', 'X', 'P', 'H', 'D', 'N', 'T'],
  'ghcid': ['G', 'L', 'A', 'M', 'O', 'R', 'C', 'U', 'B', 'E', 'S', 'F', 'I', 'X', 'P', 'H', 'D', 'N', 'T'],
  'provenance': ['G', 'L', 'A', 'M', 'O', 'R', 'C', 'U', 'B', 'E', 'S', 'F', 'I', 'X', 'P', 'H', 'D', 'N', 'T'],
  // Archive-specific slots
  'fonds': ['A'],
  'finding_aid': ['A'],
  'archival_hierarchy': ['A'],
  // Library-specific slots
  'call_number': ['L'],
  'bibliographic_record': ['L'],
  'catalog_entry': ['L'],
  // Museum-specific slots
  'accession_number': ['M'],
  'exhibition_history': ['M', 'G'],
  'conservation_status': ['M', 'G'],
  // Digital platform slots
  'platform_url': ['D'],
  'api_endpoint': ['D'],
  'metadata_format': ['D', 'L', 'A', 'M'],
  // Religious heritage slots
  'denomination': ['H'],
  'consecration_date': ['H'],
  'liturgical_calendar': ['H'],
  // Botanical/Zoo slots
  'species': ['B'],
  'habitat': ['B'],
  'conservation_program': ['B'],
  // Intangible heritage slots
  'tradition_type': ['I'],
  'transmission_method': ['I'],
  'practitioners': ['I'],
  // Taste/Smell slots
  'recipe_origin': ['T'],
  'ingredients': ['T'],
  'preparation_method': ['T'],
};
/**
 * Mapping of enum names to relevant custodian types
 *
 * Enums not listed here fall back to DEFAULT_CUSTODIAN_TYPES via
 * getCustodianTypesForEnum().
 */
export const ENUM_TO_CUSTODIAN_TYPE: Record<string, CustodianTypeCode[]> = {
  // Universal enums
  'CustodianTypeEnum': ['G', 'L', 'A', 'M', 'O', 'R', 'C', 'U', 'B', 'E', 'S', 'F', 'I', 'X', 'P', 'H', 'D', 'N', 'T'],
  'DataTierEnum': ['G', 'L', 'A', 'M', 'O', 'R', 'C', 'U', 'B', 'E', 'S', 'F', 'I', 'X', 'P', 'H', 'D', 'N', 'T'],
  'DataSourceEnum': ['G', 'L', 'A', 'M', 'O', 'R', 'C', 'U', 'B', 'E', 'S', 'F', 'I', 'X', 'P', 'H', 'D', 'N', 'T'],
  'CountryCodeEnum': ['G', 'L', 'A', 'M', 'O', 'R', 'C', 'U', 'B', 'E', 'S', 'F', 'I', 'X', 'P', 'H', 'D', 'N', 'T'],
  'LanguageCodeEnum': ['G', 'L', 'A', 'M', 'O', 'R', 'C', 'U', 'B', 'E', 'S', 'F', 'I', 'X', 'P', 'H', 'D', 'N', 'T'],
  // Type-specific enums
  'ArchivalLevelEnum': ['A'],
  'BibliographicFormatEnum': ['L'],
  'ExhibitionTypeEnum': ['M', 'G'],
  'DigitalPlatformTypeEnum': ['D'],
  'ReligiousDenominationEnum': ['H'],
  'SpeciesClassificationEnum': ['B'],
  'IntangibleHeritageTypeEnum': ['I'],
  'CulinaryHeritageTypeEnum': ['T'],
  // NOTE(review): StaffRoleTypeEnum mirrors the StaffRole class mapping —
  // keep the two lists in sync when either changes.
  'StaffRoleTypeEnum': ['G', 'L', 'A', 'M', 'O', 'R', 'C', 'E', 'S', 'H', 'N'],
};
/**
 * Default types for elements not explicitly mapped (universal)
 *
 * Contains all 19 GLAMORCUBESFIXPHDNT codes; this matches the 19-count
 * threshold used by isUniversalElement().
 */
export const DEFAULT_CUSTODIAN_TYPES: CustodianTypeCode[] = [
  'G', 'L', 'A', 'M', 'O', 'R', 'C', 'U', 'B', 'E', 'S', 'F', 'I', 'X', 'P', 'H', 'D', 'N', 'T'
];
/**
 * Get custodian types for a schema class.
 * Unmapped class names resolve to the universal default list.
 */
export function getCustodianTypesForClass(className: string): CustodianTypeCode[] {
  const mapped = CLASS_TO_CUSTODIAN_TYPE[className];
  return mapped !== undefined ? mapped : DEFAULT_CUSTODIAN_TYPES;
}
/**
 * Get custodian types for a schema slot.
 * Unmapped slot names resolve to the universal default list.
 */
export function getCustodianTypesForSlot(slotName: string): CustodianTypeCode[] {
  const mapped = SLOT_TO_CUSTODIAN_TYPE[slotName];
  return mapped !== undefined ? mapped : DEFAULT_CUSTODIAN_TYPES;
}
/**
 * Get custodian types for a schema enum.
 * Unmapped enum names resolve to the universal default list.
 */
export function getCustodianTypesForEnum(enumName: string): CustodianTypeCode[] {
  const mapped = ENUM_TO_CUSTODIAN_TYPE[enumName];
  return mapped !== undefined ? mapped : DEFAULT_CUSTODIAN_TYPES;
}
/**
 * Check if a schema element is universal (applies to all 19 custodian types).
 *
 * Counts *distinct* codes rather than raw array length, so duplicate
 * entries in the input cannot spuriously qualify an element as universal.
 */
export function isUniversalElement(types: CustodianTypeCode[]): boolean {
  return new Set(types).size >= 19; // 19 = total GLAMORCUBESFIXPHDNT codes
}
/**
 * Get the primary custodian type for a schema element.
 *
 * Returns 'U' (Unknown) for an empty list, 'M' (Museum) as the
 * representative type for universal elements, and otherwise the first
 * listed type.
 */
export function getPrimaryCustodianType(types: CustodianTypeCode[]): CustodianTypeCode {
  if (types.length === 0) {
    return 'U';
  }
  // Universal elements get Museum as their default primary type.
  return isUniversalElement(types) ? 'M' : types[0];
}
/**
 * Get a compact representation of custodian types.
 *
 * Empty list -> "?"; universal -> "ALL"; up to four types -> the codes
 * joined together (e.g. "MAL"); otherwise the first two codes plus a
 * count suffix for the rest (e.g. "MA+5").
 */
export function getCompactTypeRepresentation(types: CustodianTypeCode[]): string {
  if (types.length === 0) {
    return '?';
  }
  if (isUniversalElement(types)) {
    return 'ALL';
  }
  if (types.length > 4) {
    const hiddenCount = types.length - 2;
    return `${types[0]}${types[1]}+${hiddenCount}`;
  }
  return types.join('');
}

View file

@ -3662,3 +3662,730 @@ body.resizing-row * {
color: #888;
}
}
/* ============================================
   EMBEDDING PROJECTOR STYLES
   ============================================ */
/* Root panel: column layout filling the host container */
.embedding-projector {
  display: flex;
  flex-direction: column;
  height: 100%;
  background: #fafafa;
  border-radius: 8px;
  overflow: hidden;
}
/* Top bar: title on the left, dataset statistics on the right */
.projector-header {
  display: flex;
  align-items: center;
  justify-content: space-between;
  padding: 1rem 1.25rem;
  background: #fff;
  border-bottom: 1px solid #e0e0e0;
  flex-shrink: 0;
}
.projector-header h3 {
  margin: 0;
  font-size: 1.1rem;
  font-weight: 600;
  color: #333;
}
.projector-stats {
  display: flex;
  gap: 1rem;
  font-size: 0.85rem;
  color: #666;
}
.projector-stats span {
  display: flex;
  align-items: center;
  gap: 0.35rem;
}
/* Control strip: projection settings plus the compute action */
.projector-controls {
  display: flex;
  align-items: center;
  gap: 1.5rem;
  padding: 0.75rem 1.25rem;
  background: #fff;
  border-bottom: 1px solid #e0e0e0;
  flex-shrink: 0;
  flex-wrap: wrap;
}
.control-section {
  display: flex;
  align-items: center;
  gap: 0.5rem;
}
.control-section label {
  font-size: 0.8rem;
  font-weight: 500;
  color: #555;
  white-space: nowrap;
}
.control-group {
  display: flex;
  align-items: center;
  gap: 0.5rem;
}
.control-group select,
.control-group input[type="number"] {
  padding: 0.4rem 0.6rem;
  font-size: 0.85rem;
  border: 1px solid #ddd;
  border-radius: 4px;
  background: #fff;
  color: #333;
  min-width: 80px;
}
.control-group select:focus,
.control-group input[type="number"]:focus {
  outline: none;
  border-color: #FFC107;
  box-shadow: 0 0 0 2px rgba(255, 193, 7, 0.2);
}
.control-group input[type="number"] {
  width: 70px;
}
/* Segmented button group (first/last children form a single pill) */
.button-group {
  display: flex;
  gap: 0.25rem;
}
.button-group button {
  padding: 0.4rem 0.75rem;
  font-size: 0.8rem;
  border: 1px solid #ddd;
  background: #fff;
  color: #555;
  cursor: pointer;
  transition: all 0.15s;
}
.button-group button:first-child {
  border-radius: 4px 0 0 4px;
}
.button-group button:last-child {
  border-radius: 0 4px 4px 0;
}
.button-group button:not(:last-child) {
  border-right: none;
}
.button-group button:hover {
  background: #f5f5f5;
}
.button-group button.active {
  background: #FFC107;
  border-color: #FFC107;
  color: #000;
  font-weight: 500;
}
.compute-btn {
  padding: 0.5rem 1rem;
  font-size: 0.85rem;
  font-weight: 500;
  background: #FFC107;
  border: none;
  border-radius: 4px;
  color: #000;
  cursor: pointer;
  transition: all 0.15s;
  display: flex;
  align-items: center;
  gap: 0.4rem;
}
.compute-btn:hover:not(:disabled) {
  background: #ffca2c;
  transform: translateY(-1px);
}
.compute-btn:disabled {
  opacity: 0.6;
  cursor: not-allowed;
}
/* Main split: canvas area on the left, details sidebar on the right */
.projector-body {
  display: flex;
  flex: 1;
  overflow: hidden;
}
.projector-main {
  flex: 1;
  display: flex;
  flex-direction: column;
  overflow: hidden;
  position: relative;
}
.projector-canvas {
  flex: 1;
  position: relative;
  background: #fff;
  border-right: 1px solid #e0e0e0;
  overflow: hidden;
}
.projector-canvas svg {
  width: 100%;
  height: 100%;
}
.projector-canvas .three-container {
  width: 100%;
  height: 100%;
  min-height: 500px;
}
.projector-canvas .three-container canvas {
  width: 100% !important;
  height: 100% !important;
}
/* Scatter points rendered as SVG circles; state classes drive highlighting */
.projector-canvas .point {
  cursor: pointer;
  transition: r 0.15s, opacity 0.15s;
}
.projector-canvas .point:hover {
  r: 6;
}
.projector-canvas .point.selected {
  stroke: #000;
  stroke-width: 2;
}
.projector-canvas .point.neighbor {
  stroke: #FFC107;
  stroke-width: 2;
}
.projector-canvas .point.dimmed {
  opacity: 0.15;
}
/* Sidebar: search box, color legend, and selection details */
.projector-sidebar {
  width: 280px;
  display: flex;
  flex-direction: column;
  background: #fff;
  border-left: 1px solid #e0e0e0;
  overflow: hidden;
  flex-shrink: 0;
}
.projector-search {
  padding: 0.75rem;
  border-bottom: 1px solid #e0e0e0;
}
.projector-search input {
  width: 100%;
  padding: 0.5rem 0.75rem;
  font-size: 0.85rem;
  border: 1px solid #ddd;
  border-radius: 4px;
  background: #f8f8f8;
}
.projector-search input:focus {
  outline: none;
  border-color: #FFC107;
  background: #fff;
}
.projector-legend {
  padding: 0.75rem;
  border-bottom: 1px solid #e0e0e0;
  max-height: 200px;
  overflow-y: auto;
}
.projector-legend h4 {
  margin: 0 0 0.5rem 0;
  font-size: 0.8rem;
  font-weight: 600;
  color: #555;
  text-transform: uppercase;
  letter-spacing: 0.5px;
}
.legend-items {
  display: flex;
  flex-direction: column;
  gap: 0.35rem;
}
.legend-item {
  display: flex;
  align-items: center;
  gap: 0.5rem;
  font-size: 0.8rem;
  color: #444;
  cursor: pointer;
  padding: 0.25rem;
  border-radius: 3px;
  transition: background 0.15s;
}
.legend-item:hover {
  background: #f5f5f5;
}
.legend-color {
  width: 12px;
  height: 12px;
  border-radius: 2px;
  flex-shrink: 0;
}
.legend-label {
  flex: 1;
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
}
.legend-count {
  font-size: 0.75rem;
  color: #888;
}
.projector-details {
  flex: 1;
  padding: 0.75rem;
  overflow-y: auto;
}
.projector-details h4 {
  margin: 0 0 0.75rem 0;
  font-size: 0.8rem;
  font-weight: 600;
  color: #555;
  text-transform: uppercase;
  letter-spacing: 0.5px;
}
.detail-section {
  margin-bottom: 1rem;
}
.detail-section:last-child {
  margin-bottom: 0;
}
.detail-label {
  font-size: 0.75rem;
  color: #888;
  margin-bottom: 0.25rem;
}
.detail-value {
  font-size: 0.85rem;
  color: #333;
  word-break: break-all;
}
.detail-value.id {
  font-family: 'SF Mono', 'Monaco', 'Inconsolata', monospace;
  font-size: 0.8rem;
  background: #f5f5f5;
  padding: 0.25rem 0.5rem;
  border-radius: 3px;
}
/* Nearest-neighbor list shown for the selected point */
.nearest-neighbors {
  margin-top: 1rem;
  padding-top: 1rem;
  border-top: 1px solid #e0e0e0;
}
.nearest-neighbors h5 {
  margin: 0 0 0.5rem 0;
  font-size: 0.8rem;
  font-weight: 600;
  color: #555;
}
.neighbor-list {
  display: flex;
  flex-direction: column;
  gap: 0.35rem;
}
.neighbor-item {
  display: flex;
  align-items: center;
  justify-content: space-between;
  padding: 0.35rem 0.5rem;
  background: #f8f8f8;
  border-radius: 3px;
  font-size: 0.8rem;
  cursor: pointer;
  transition: background 0.15s;
}
.neighbor-item:hover {
  background: #FFC107;
}
.neighbor-id {
  font-family: 'SF Mono', 'Monaco', 'Inconsolata', monospace;
  font-size: 0.75rem;
  color: #555;
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
  max-width: 140px;
}
.neighbor-distance {
  font-size: 0.75rem;
  color: #888;
}
.no-selection {
  text-align: center;
  padding: 2rem 1rem;
  color: #888;
  font-size: 0.85rem;
}
/* Placeholder shown before any projection has been computed */
.viz-placeholder {
  display: flex;
  flex-direction: column;
  align-items: center;
  justify-content: center;
  height: 100%;
  padding: 2rem;
  text-align: center;
  color: #666;
}
.viz-placeholder svg {
  width: 64px;
  height: 64px;
  margin-bottom: 1rem;
  opacity: 0.5;
}
.viz-placeholder p {
  margin: 0;
  font-size: 0.9rem;
}
.viz-placeholder p:first-of-type {
  font-weight: 500;
  color: #444;
  margin-bottom: 0.5rem;
}
.viz-collection-selector {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 1rem;
  margin-top: 1.5rem;
}
.viz-collection-selector select {
  padding: 0.5rem 1rem;
  font-size: 0.9rem;
  border: 1px solid #ddd;
  border-radius: 4px;
  background: #fff;
  min-width: 200px;
}
.viz-collection-selector button {
  padding: 0.5rem 1.5rem;
  font-size: 0.9rem;
  font-weight: 500;
  background: #FFC107;
  border: none;
  border-radius: 4px;
  color: #000;
  cursor: pointer;
  transition: all 0.15s;
}
.viz-collection-selector button:hover:not(:disabled) {
  background: #ffca2c;
}
.viz-collection-selector button:disabled {
  opacity: 0.6;
  cursor: not-allowed;
}
.variance-info {
  display: flex;
  align-items: center;
  gap: 0.5rem;
  font-size: 0.8rem;
  color: #666;
  padding: 0.5rem 1rem;
  background: #f8f8f8;
  border-radius: 4px;
  margin-left: auto;
}
.variance-info strong {
  color: #333;
}
/* Full-canvas overlay with spinner while a projection is being computed */
.computing-overlay {
  position: absolute;
  top: 0;
  left: 0;
  right: 0;
  bottom: 0;
  background: rgba(255, 255, 255, 0.9);
  display: flex;
  flex-direction: column;
  align-items: center;
  justify-content: center;
  z-index: 10;
}
.computing-spinner {
  width: 40px;
  height: 40px;
  border: 3px solid #e0e0e0;
  border-top-color: #FFC107;
  border-radius: 50%;
  animation: spin 1s linear infinite;
  margin-bottom: 1rem;
}
@keyframes spin {
  to { transform: rotate(360deg); }
}
.computing-overlay p {
  font-size: 0.9rem;
  color: #666;
  margin: 0;
}
/* Hover tooltip for scatter points (positioned by script) */
.tooltip {
  position: absolute;
  pointer-events: none;
  background: rgba(0, 0, 0, 0.85);
  color: #fff;
  padding: 0.5rem 0.75rem;
  border-radius: 4px;
  font-size: 0.8rem;
  max-width: 250px;
  z-index: 100;
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.2);
}
.tooltip-id {
  font-family: 'SF Mono', 'Monaco', 'Inconsolata', monospace;
  font-size: 0.75rem;
  opacity: 0.8;
  margin-bottom: 0.25rem;
}
.tooltip-payload {
  font-size: 0.75rem;
}
/* Dark mode for Embedding Projector */
[data-theme="dark"] .embedding-projector {
  background: #16161e;
}
[data-theme="dark"] .projector-header {
  background: #1a1a2e;
  border-color: #333;
}
[data-theme="dark"] .projector-header h3 {
  color: #e0e0e0;
}
[data-theme="dark"] .projector-stats {
  color: #888;
}
[data-theme="dark"] .projector-controls {
  background: #1a1a2e;
  border-color: #333;
}
[data-theme="dark"] .control-section label {
  color: #aaa;
}
[data-theme="dark"] .control-group select,
[data-theme="dark"] .control-group input[type="number"] {
  background: #252538;
  border-color: #404050;
  color: #e0e0e0;
}
[data-theme="dark"] .button-group button {
  background: #252538;
  border-color: #404050;
  color: #aaa;
}
[data-theme="dark"] .button-group button:hover {
  background: #3a3a4e;
}
[data-theme="dark"] .button-group button.active {
  background: #FFC107;
  border-color: #FFC107;
  color: #000;
}
[data-theme="dark"] .projector-canvas {
  background: #1a1a2e;
  border-color: #333;
}
[data-theme="dark"] .projector-canvas .three-container {
  background: #1a1a2e;
}
[data-theme="dark"] .projector-sidebar {
  background: #1a1a2e;
  border-color: #333;
}
[data-theme="dark"] .projector-search input {
  background: #252538;
  border-color: #404050;
  color: #e0e0e0;
}
[data-theme="dark"] .projector-search input:focus {
  background: #2a2a3e;
  border-color: #FFC107;
}
[data-theme="dark"] .projector-legend {
  border-color: #333;
}
[data-theme="dark"] .projector-legend h4 {
  color: #aaa;
}
[data-theme="dark"] .legend-item {
  color: #ccc;
}
[data-theme="dark"] .legend-item:hover {
  background: #252538;
}
[data-theme="dark"] .legend-count {
  color: #666;
}
[data-theme="dark"] .projector-details h4 {
  color: #aaa;
}
[data-theme="dark"] .detail-label {
  color: #666;
}
[data-theme="dark"] .detail-value {
  color: #e0e0e0;
}
[data-theme="dark"] .detail-value.id {
  background: #252538;
}
[data-theme="dark"] .nearest-neighbors {
  border-color: #333;
}
[data-theme="dark"] .nearest-neighbors h5 {
  color: #aaa;
}
[data-theme="dark"] .neighbor-item {
  background: #252538;
}
[data-theme="dark"] .neighbor-item:hover {
  background: #FFC107;
}
[data-theme="dark"] .neighbor-item:hover .neighbor-id,
[data-theme="dark"] .neighbor-item:hover .neighbor-distance {
  color: #000;
}
[data-theme="dark"] .neighbor-id {
  color: #aaa;
}
[data-theme="dark"] .neighbor-distance {
  color: #666;
}
[data-theme="dark"] .no-selection {
  color: #666;
}
[data-theme="dark"] .viz-placeholder {
  color: #888;
}
[data-theme="dark"] .viz-placeholder p:first-of-type {
  color: #aaa;
}
[data-theme="dark"] .viz-collection-selector select {
  background: #252538;
  border-color: #404050;
  color: #e0e0e0;
}
[data-theme="dark"] .variance-info {
  background: #252538;
  color: #888;
}
[data-theme="dark"] .variance-info strong {
  color: #e0e0e0;
}
[data-theme="dark"] .computing-overlay {
  background: rgba(22, 22, 30, 0.95);
}
[data-theme="dark"] .computing-spinner {
  border-color: #333;
  border-top-color: #FFC107;
}
[data-theme="dark"] .computing-overlay p {
  color: #888;
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -318,6 +318,28 @@
border-color: var(--primary-color, #1976d2);
}
/* Tab separator for 2D/3D toggle */
.linkml-viewer-page__tab-separator {
display: flex;
align-items: center;
color: var(--border-color, #e0e0e0);
padding: 0 0.25rem;
font-weight: 300;
}
/* 3D/2D indicator toggle button */
.linkml-viewer-page__tab--indicator {
font-size: 0.8125rem;
min-width: auto;
padding: 0.375rem 0.75rem;
}
/* 3D Custodian Type Indicator container */
.linkml-viewer__custodian-indicator {
margin-left: 0.5rem;
vertical-align: middle;
}
/* Content Area */
.linkml-viewer-page__content {
flex: 1;
@ -476,6 +498,20 @@
color: var(--warning-color, #f57c00);
}
/* Custodian Type Badge - shows which GLAMORCUBESFIXPHDNT types apply */
.linkml-viewer__custodian-badge {
margin-left: 0.5rem;
vertical-align: middle;
}
/* Item header with badges - flex layout for proper alignment */
.linkml-viewer__item-name {
display: flex;
flex-wrap: wrap;
align-items: center;
gap: 0.375rem;
}
/* URI and Range */
.linkml-viewer__uri,
.linkml-viewer__range,

View file

@ -29,6 +29,13 @@ import {
import { useLanguage } from '../contexts/LanguageContext';
import { useCollapsibleHeader } from '../hooks/useCollapsibleHeader';
import { ChevronUp, ChevronDown } from 'lucide-react';
import { CustodianTypeBadge, CustodianTypeIndicator } from '../components/uml/CustodianTypeIndicator';
import {
getCustodianTypesForClass,
getCustodianTypesForSlot,
getCustodianTypesForEnum,
isUniversalElement,
} from '../lib/schema-custodian-mapping';
import './LinkMLViewerPage.css';
import '../styles/collapsible.css';
@ -160,6 +167,8 @@ const TEXT = {
noMatchingSchemas: { nl: 'Geen overeenkomende schema\'s', en: 'No matching schemas' },
copyToClipboard: { nl: 'Kopieer naar klembord', en: 'Copy to clipboard' },
copied: { nl: 'Gekopieerd!', en: 'Copied!' },
use3DPolygon: { nl: '3D-polygoon', en: '3D Polygon' },
use2DBadge: { nl: '2D-badge', en: '2D Badge' },
};
// Dynamically discover schema files from the modules directory
@ -203,6 +212,10 @@ const LinkMLViewerPage: React.FC = () => {
// State for copy to clipboard feedback
const [copyFeedback, setCopyFeedback] = useState(false);
// State for 3D polygon indicator toggle (future feature)
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const [use3DIndicator, setUse3DIndicator] = useState(false);
// Collapsible header
const mainContentRef = useRef<HTMLElement>(null);
const { isCollapsed: isHeaderCollapsed, setIsCollapsed: setIsHeaderCollapsed } = useCollapsibleHeader(mainContentRef);
@ -490,6 +503,8 @@ const LinkMLViewerPage: React.FC = () => {
const renderClassDetails = (cls: LinkMLClass) => {
const isHighlighted = highlightedClass === cls.name;
const custodianTypes = getCustodianTypesForClass(cls.name);
const isUniversal = isUniversalElement(custodianTypes);
return (
<div
@ -500,6 +515,23 @@ const LinkMLViewerPage: React.FC = () => {
<h4 className="linkml-viewer__item-name">
{cls.name}
{cls.abstract && <span className="linkml-viewer__badge linkml-viewer__badge--abstract">{t('abstract')}</span>}
{!isUniversal && (
use3DIndicator ? (
<CustodianTypeIndicator
types={custodianTypes}
size={28}
animate={true}
showTooltip={true}
className="linkml-viewer__custodian-indicator"
/>
) : (
<CustodianTypeBadge
types={custodianTypes}
size="small"
className="linkml-viewer__custodian-badge"
/>
)
)}
</h4>
{cls.class_uri && (
<div className="linkml-viewer__uri">
@ -550,6 +582,8 @@ const LinkMLViewerPage: React.FC = () => {
const rangeIsEnum = slot.range && isEnumRange(slot.range);
const enumKey = slot.range ? `${slot.name}:${slot.range}` : '';
const isExpanded = expandedEnumRanges.has(enumKey);
const custodianTypes = getCustodianTypesForSlot(slot.name);
const isUniversal = isUniversalElement(custodianTypes);
return (
<div key={slot.name} className="linkml-viewer__item">
@ -557,6 +591,23 @@ const LinkMLViewerPage: React.FC = () => {
{slot.name}
{slot.required && <span className="linkml-viewer__badge linkml-viewer__badge--required">{t('required')}</span>}
{slot.multivalued && <span className="linkml-viewer__badge linkml-viewer__badge--multi">{t('multivalued')}</span>}
{!isUniversal && (
use3DIndicator ? (
<CustodianTypeIndicator
types={custodianTypes}
size={28}
animate={true}
showTooltip={true}
className="linkml-viewer__custodian-indicator"
/>
) : (
<CustodianTypeBadge
types={custodianTypes}
size="small"
className="linkml-viewer__custodian-badge"
/>
)
)}
</h4>
{slot.slot_uri && (
<div className="linkml-viewer__uri">
@ -604,6 +655,8 @@ const LinkMLViewerPage: React.FC = () => {
const searchFilter = enumSearchFilters[enumName] || '';
const showAll = enumShowAll[enumName] || false;
const displayCount = 20;
const custodianTypes = getCustodianTypesForEnum(enumDef.name);
const isUniversal = isUniversalElement(custodianTypes);
// Filter values based on search
const filteredValues = searchFilter
@ -622,7 +675,26 @@ const LinkMLViewerPage: React.FC = () => {
return (
<div key={enumDef.name} className="linkml-viewer__item">
<h4 className="linkml-viewer__item-name">{enumDef.name}</h4>
<h4 className="linkml-viewer__item-name">
{enumDef.name}
{!isUniversal && (
use3DIndicator ? (
<CustodianTypeIndicator
types={custodianTypes}
size={28}
animate={true}
showTooltip={true}
className="linkml-viewer__custodian-indicator"
/>
) : (
<CustodianTypeBadge
types={custodianTypes}
size="small"
className="linkml-viewer__custodian-badge"
/>
)
)}
</h4>
{enumDef.description && (
<div className="linkml-viewer__description linkml-viewer__markdown">
<ReactMarkdown remarkPlugins={[remarkGfm]} rehypePlugins={[rehypeRaw]}>{transformContent(enumDef.description)}</ReactMarkdown>
@ -1035,6 +1107,14 @@ const LinkMLViewerPage: React.FC = () => {
>
{t('rawYaml')}
</button>
<span className="linkml-viewer-page__tab-separator">|</span>
<button
className={`linkml-viewer-page__tab linkml-viewer-page__tab--indicator ${use3DIndicator ? 'linkml-viewer-page__tab--active' : ''}`}
onClick={() => setUse3DIndicator(!use3DIndicator)}
title={use3DIndicator ? t('use2DBadge') : t('use3DPolygon')}
>
{use3DIndicator ? '🔷 3D' : '🏷️ 2D'}
</button>
</div>
</header>

View file

@ -16,6 +16,7 @@
import { useEffect, useRef, useState, useMemo, useCallback } from 'react';
import { useSearchParams } from 'react-router-dom';
import maplibregl from 'maplibre-gl';
import type { StyleSpecification, MapLayerMouseEvent, GeoJSONSource } from 'maplibre-gl';
import 'maplibre-gl/dist/maplibre-gl.css';
import { useLanguage } from '../contexts/LanguageContext';
import { useUIState } from '../contexts/UIStateContext';
@ -89,7 +90,7 @@ const TYPE_NAMES: Record<string, { nl: string; en: string }> = {
};
// Map tile styles for light and dark modes
const getMapStyle = (isDarkMode: boolean): maplibregl.StyleSpecification => {
const getMapStyle = (isDarkMode: boolean): StyleSpecification => {
if (isDarkMode) {
// CartoDB Dark Matter - dark mode tiles
return {
@ -824,7 +825,8 @@ export default function NDEMapPage() {
zoom: 7,
});
map.addControl(new maplibregl.NavigationControl(), 'top-right');
// eslint-disable-next-line @typescript-eslint/no-explicit-any
map.addControl(new maplibregl.NavigationControl() as any, 'top-right');
map.on('load', () => {
mapInstanceRef.current = map;
@ -947,9 +949,9 @@ export default function NDEMapPage() {
}
// Check if source already exists (with safety check)
let existingSource: maplibregl.GeoJSONSource | undefined;
let existingSource: GeoJSONSource | undefined;
try {
existingSource = map.getSource('institutions') as maplibregl.GeoJSONSource | undefined;
existingSource = map.getSource('institutions') as GeoJSONSource | undefined;
} catch {
console.log('[Markers] Error getting source, map may be destroyed');
return;
@ -1056,12 +1058,12 @@ export default function NDEMapPage() {
const map = mapInstanceRef.current;
// Click handler using ref to always get current filtered data
const handleClick = (e: maplibregl.MapLayerMouseEvent) => {
const handleClick = (e: MapLayerMouseEvent) => {
if (!e.features || e.features.length === 0) return;
const feature = e.features[0];
const index = feature.properties?.index;
if (index === undefined) return;
const feature = e.features[0] as GeoJSON.Feature;
const index = feature.properties?.index as number | undefined;
if (index === undefined || typeof index !== 'number') return;
// Use ref to get current filtered institutions, not stale closure
const inst = filteredInstitutionsRef.current[index];

View file

@ -765,7 +765,7 @@ export default function ProjectPlanPage() {
<Paper sx={{ mb: 4 }}>
<Tabs
value={activeTab}
onChange={(_, newValue) => setActiveTab(newValue)}
onChange={(_: React.SyntheticEvent, newValue: number) => setActiveTab(newValue)}
variant="scrollable"
scrollButtons="auto"
>

560
frontend/src/vite-env.d.ts vendored Normal file
View file

@ -0,0 +1,560 @@
/// <reference types="vite/client" />
// MUI Icons Material module declaration
// The package exports individual icon components but lacks a proper barrel index.d.ts,
// so only the icons actually imported by this app are declared here.
declare module '@mui/icons-material' {
  import type { SvgIconComponent } from '@mui/icons-material/esm';
  export const ExpandMore: SvgIconComponent;
  export const CheckCircle: SvgIconComponent;
  export const RadioButtonUnchecked: SvgIconComponent;
  export const Schedule: SvgIconComponent;
  export const Assignment: SvgIconComponent;
  export const AccountTree: SvgIconComponent;
  export const Timeline: SvgIconComponent;
  export const Block: SvgIconComponent;
  export const Link: SvgIconComponent;
}
// For default icon component type
declare module '@mui/icons-material/esm' {
  import type { SvgIconProps } from '@mui/material/SvgIcon';
  import type React from 'react';
  // All MUI icons are plain function components over SvgIconProps.
  export type SvgIconComponent = React.FC<SvgIconProps>;
}
// Module declarations for packages without type definitions
// Ambient declaration for lucide-react: only the icons this app imports
// are declared. Fix: `LayoutGrid` was declared twice (duplicate identifier,
// TS2300); the second declaration has been removed.
declare module 'lucide-react' {
  import type { FC, SVGAttributes } from 'react';

  // Props accepted by every lucide icon component.
  interface LucideIconProps extends SVGAttributes<SVGElement> {
    size?: number | string;
    color?: string;
    strokeWidth?: number | string;
    absoluteStrokeWidth?: boolean;
  }
  type LucideIcon = FC<LucideIconProps>;
  export const ChevronUp: LucideIcon;
  export const ChevronDown: LucideIcon;
  export const ChevronRight: LucideIcon;
  export const ChevronLeft: LucideIcon;
  export const X: LucideIcon;
  export const Search: LucideIcon;
  export const Filter: LucideIcon;
  export const RefreshCw: LucideIcon;
  export const Download: LucideIcon;
  export const Upload: LucideIcon;
  export const Settings: LucideIcon;
  export const Info: LucideIcon;
  export const AlertTriangle: LucideIcon;
  export const AlertCircle: LucideIcon;
  export const CheckCircle: LucideIcon;
  export const XCircle: LucideIcon;
  export const HelpCircle: LucideIcon;
  export const Loader2: LucideIcon;
  export const Send: LucideIcon;
  export const MessageSquare: LucideIcon;
  export const Bot: LucideIcon;
  export const User: LucideIcon;
  export const Copy: LucideIcon;
  export const Check: LucideIcon;
  export const Play: LucideIcon;
  export const Pause: LucideIcon;
  export const RotateCcw: LucideIcon;
  export const ExternalLink: LucideIcon;
  export const Sparkles: LucideIcon;
  export const MapPin: LucideIcon;
  export const Map: LucideIcon;
  export const Layers: LucideIcon;
  export const Globe: LucideIcon;
  export const Building: LucideIcon;
  export const Building2: LucideIcon;
  export const Library: LucideIcon;
  export const Archive: LucideIcon;
  export const Landmark: LucideIcon;
  export const Database: LucideIcon;
  export const Network: LucideIcon;
  export const Share2: LucideIcon;
  export const ZoomIn: LucideIcon;
  export const ZoomOut: LucideIcon;
  export const Maximize: LucideIcon;
  export const Maximize2: LucideIcon;
  export const Minimize: LucideIcon;
  export const Minimize2: LucideIcon;
  export const Palette: LucideIcon;
  export const Image: LucideIcon;
  export const ImageIcon: LucideIcon;
  export const FileText: LucideIcon;
  export const Eye: LucideIcon;
  export const EyeOff: LucideIcon;
  export const Sun: LucideIcon;
  export const Moon: LucideIcon;
  export const Menu: LucideIcon;
  export const ArrowRight: LucideIcon;
  export const ArrowLeft: LucideIcon;
  export const ArrowUp: LucideIcon;
  export const ArrowDown: LucideIcon;
  export const Plus: LucideIcon;
  export const Minus: LucideIcon;
  export const Trash2: LucideIcon;
  export const Edit: LucideIcon;
  export const Save: LucideIcon;
  export const Clock: LucideIcon;
  export const Calendar: LucideIcon;
  export const Star: LucideIcon;
  export const Heart: LucideIcon;
  export const ThumbsUp: LucideIcon;
  export const ThumbsDown: LucideIcon;
  export const Flag: LucideIcon;
  export const Bookmark: LucideIcon;
  export const Tag: LucideIcon;
  export const Hash: LucideIcon;
  export const AtSign: LucideIcon;
  export const Link2: LucideIcon;
  export const Unlink: LucideIcon;
  export const Lock: LucideIcon;
  export const Unlock: LucideIcon;
  export const Key: LucideIcon;
  export const Shield: LucideIcon;
  export const Bell: LucideIcon;
  export const BellOff: LucideIcon;
  export const Volume2: LucideIcon;
  export const VolumeX: LucideIcon;
  export const Mic: LucideIcon;
  export const MicOff: LucideIcon;
  export const Camera: LucideIcon;
  export const Video: LucideIcon;
  export const Printer: LucideIcon;
  export const Mail: LucideIcon;
  export const Phone: LucideIcon;
  export const Home: LucideIcon;
  export const List: LucideIcon;
  export const Grid: LucideIcon;
  export const LayoutGrid: LucideIcon;
  export const LayoutList: LucideIcon;
  export const Columns: LucideIcon;
  export const Rows: LucideIcon;
  export const SlidersHorizontal: LucideIcon;
  export const History: LucideIcon;
  export const Languages: LucideIcon;
  export const BarChart3: LucideIcon;
  export const BarChart: LucideIcon;
  export const PieChart: LucideIcon;
  export const LineChart: LucideIcon;
  export const TrendingUp: LucideIcon;
  export const TrendingDown: LucideIcon;
  export const Activity: LucideIcon;
  export const Zap: LucideIcon;
  export const Terminal: LucideIcon;
  export const Code: LucideIcon;
  export const Code2: LucideIcon;
  export const FileCode: LucideIcon;
  export const Folder: LucideIcon;
  export const FolderOpen: LucideIcon;
  export const File: LucideIcon;
  export const Files: LucideIcon;
  export const MoreHorizontal: LucideIcon;
  export const MoreVertical: LucideIcon;
  export const Grip: LucideIcon;
  export const GripVertical: LucideIcon;
  export const Move: LucideIcon;
  export const Crosshair: LucideIcon;
  export const Target: LucideIcon;
  export const Compass: LucideIcon;
  export const Navigation: LucideIcon;
  export const Focus: LucideIcon;
  export const Scan: LucideIcon;
  export const QrCode: LucideIcon;
  export const Table2: LucideIcon;
  export const Table: LucideIcon;
  export const CreditCard: LucideIcon;
  export const IdCard: LucideIcon;
}
// Minimal ambient declarations for the subset of the mermaid API this app uses.
// NOTE(review): hand-written stub, not the library's own typings — keep in sync
// with the installed mermaid version when upgrading.
declare module 'mermaid' {
// Options accepted by mermaid.initialize(); all fields optional.
interface MermaidConfig {
startOnLoad?: boolean;
theme?: string;
securityLevel?: string;
fontFamily?: string;
logLevel?: string;
// Per-diagram-type option bags (flowchart, sequence, gantt, class diagrams).
flowchart?: Record<string, unknown>;
sequence?: Record<string, unknown>;
gantt?: Record<string, unknown>;
class?: Record<string, unknown>;
}
// The default-exported mermaid singleton surface.
interface MermaidAPI {
initialize: (config: MermaidConfig) => void;
// Renders diagram `text` to SVG markup; `id` must be a unique element id.
render: (id: string, text: string, svgContainingElement?: Element) => Promise<{ svg: string }>;
// Validates diagram source without rendering it.
parse: (text: string) => Promise<boolean>;
}
const mermaid: MermaidAPI;
export default mermaid;
}
// Minimal ambient declarations for the subset of maplibre-gl used by this app.
// NOTE(review): hand-written stub — several option bags are typed `unknown`
// on purpose; widen to real types only as call sites need them.
declare module 'maplibre-gl' {
// Contract for custom map controls passed to Map.addControl().
export interface IControl {
onAdd(map: Map): HTMLElement;
onRemove(map: Map): void;
getDefaultPosition?: () => string;
}
// The main map object. Most mutators return `this` for chaining.
export class Map {
constructor(options: MapOptions);
// Event registration: overloads with/without a layer id filter.
on(type: string, listener: (e: MapLayerMouseEvent) => void): this;
on(type: string, layerId: string, listener: (e: MapLayerMouseEvent) => void): this;
off(type: string, listener: (e: MapLayerMouseEvent) => void): this;
off(type: string, layerId: string, listener: (e: MapLayerMouseEvent) => void): this;
once(type: string, listener: () => void): this;
remove(): void;
// Source/layer management.
getSource(id: string): GeoJSONSource | undefined;
getLayer(id: string): unknown;
addSource(id: string, source: unknown): this;
addLayer(layer: unknown, before?: string): this;
removeLayer(id: string): this;
removeSource(id: string): this;
moveLayer(id: string, beforeId?: string): this;
setFilter(layerId: string, filter: unknown): this;
setPaintProperty(layerId: string, property: string, value: unknown): this;
setLayoutProperty(layerId: string, property: string, value: unknown): this;
// Camera control.
fitBounds(bounds: LngLatBoundsLike, options?: FitBoundsOptions): this;
flyTo(options: FlyToOptions): this;
getCenter(): LngLat;
getZoom(): number;
setCenter(center: LngLatLike): this;
setZoom(zoom: number): this;
resize(): this;
getBounds(): LngLatBounds;
getCanvas(): HTMLCanvasElement;
getContainer(): HTMLElement;
// Feature querying and coordinate <-> screen-pixel conversion.
queryRenderedFeatures(point?: PointLike, options?: unknown): MapGeoJSONFeature[];
project(lngLat: LngLatLike): Point;
unproject(point: PointLike): LngLat;
// Status probes.
loaded(): boolean;
isStyleLoaded(): boolean;
isMoving(): boolean;
isZooming(): boolean;
isRotating(): boolean;
triggerRepaint(): void;
// Animated/instant camera moves.
easeTo(options: unknown): this;
jumpTo(options: unknown): this;
panTo(lngLat: LngLatLike, options?: unknown): this;
zoomTo(zoom: number, options?: unknown): this;
addControl(control: IControl | NavigationControl | ScaleControl, position?: string): this;
removeControl(control: IControl): this;
setStyle(style: StyleSpecification | string, options?: { diff?: boolean }): this;
getStyle(): StyleSpecification;
}
// A rendered feature as returned by queryRenderedFeatures().
export interface MapGeoJSONFeature {
type: 'Feature';
geometry: GeoJSON.Geometry;
properties: Record<string, unknown>;
id?: string | number;
layer?: unknown;
source?: string;
sourceLayer?: string;
state?: Record<string, unknown>;
}
// Handle for a GeoJSON source; allows replacing its data in place.
export class GeoJSONSource {
setData(data: GeoJSON.GeoJSON): this;
}
// Popup anchored to a map coordinate.
export class Popup {
constructor(options?: PopupOptions);
setLngLat(lngLat: LngLatLike): this;
setHTML(html: string): this;
setText(text: string): this;
addTo(map: Map): this;
remove(): this;
isOpen(): boolean;
}
// DOM marker pinned to a map coordinate, optionally with an attached popup.
export class Marker {
constructor(options?: MarkerOptions);
setLngLat(lngLat: LngLatLike): this;
addTo(map: Map): this;
remove(): this;
getElement(): HTMLElement;
setPopup(popup: Popup): this;
getPopup(): Popup;
}
// Built-in zoom/compass control.
export class NavigationControl {
constructor(options?: NavigationControlOptions);
}
// Built-in scale-bar control.
export class ScaleControl {
constructor(options?: ScaleControlOptions);
}
// A longitude/latitude pair with common geo helpers.
export class LngLat {
constructor(lng: number, lat: number);
lng: number;
lat: number;
wrap(): LngLat;
toArray(): [number, number];
toString(): string;
distanceTo(lngLat: LngLat): number;
}
// Geographic bounding box (south-west / north-east corners).
export class LngLatBounds {
constructor(sw?: LngLatLike, ne?: LngLatLike);
extend(obj: LngLatLike | LngLatBoundsLike): this;
getCenter(): LngLat;
getSouthWest(): LngLat;
getNorthEast(): LngLat;
getNorthWest(): LngLat;
getSouthEast(): LngLat;
getWest(): number;
getSouth(): number;
getEast(): number;
getNorth(): number;
toArray(): [[number, number], [number, number]];
toString(): string;
isEmpty(): boolean;
contains(lngLat: LngLatLike): boolean;
}
// Screen-space pixel point.
export class Point {
constructor(x: number, y: number);
x: number;
y: number;
}
// Flexible input forms accepted wherever a coordinate/bounds/point is expected.
export type LngLatLike = LngLat | [number, number] | { lng: number; lat: number } | { lon: number; lat: number };
export type LngLatBoundsLike = LngLatBounds | [LngLatLike, LngLatLike] | [number, number, number, number];
export type PointLike = Point | [number, number];
// Constructor options for Map; only `container` and `style` are required.
export interface MapOptions {
container: HTMLElement | string;
style: StyleSpecification | string;
center?: LngLatLike;
zoom?: number;
bearing?: number;
pitch?: number;
bounds?: LngLatBoundsLike;
fitBoundsOptions?: FitBoundsOptions;
attributionControl?: boolean;
customAttribution?: string | string[];
interactive?: boolean;
hash?: boolean | string;
maxBounds?: LngLatBoundsLike;
maxZoom?: number;
minZoom?: number;
maxPitch?: number;
minPitch?: number;
scrollZoom?: boolean;
boxZoom?: boolean;
dragRotate?: boolean;
dragPan?: boolean;
keyboard?: boolean;
doubleClickZoom?: boolean;
touchZoomRotate?: boolean;
touchPitch?: boolean;
cooperativeGestures?: boolean;
trackResize?: boolean;
locale?: Record<string, string>;
fadeDuration?: number;
crossSourceCollisions?: boolean;
collectResourceTiming?: boolean;
clickTolerance?: number;
preserveDrawingBuffer?: boolean;
antialias?: boolean;
refreshExpiredTiles?: boolean;
maxTileCacheSize?: number;
transformRequest?: (url: string, resourceType: string) => unknown;
localIdeographFontFamily?: string;
pitchWithRotate?: boolean;
pixelRatio?: number;
validateStyle?: boolean;
}
// MapLibre style document (subset of the style specification).
export interface StyleSpecification {
version: number;
name?: string;
metadata?: unknown;
center?: [number, number];
zoom?: number;
bearing?: number;
pitch?: number;
light?: unknown;
sources: Record<string, unknown>;
sprite?: string;
glyphs?: string;
layers: unknown[];
terrain?: unknown;
fog?: unknown;
transition?: unknown;
}
export interface PopupOptions {
closeButton?: boolean;
closeOnClick?: boolean;
closeOnMove?: boolean;
focusAfterOpen?: boolean;
anchor?: string;
offset?: number | PointLike | Record<string, PointLike>;
className?: string;
maxWidth?: string;
}
export interface MarkerOptions {
element?: HTMLElement;
anchor?: string;
offset?: PointLike;
color?: string;
scale?: number;
draggable?: boolean;
clickTolerance?: number;
rotation?: number;
rotationAlignment?: string;
pitchAlignment?: string;
}
export interface NavigationControlOptions {
showCompass?: boolean;
showZoom?: boolean;
visualizePitch?: boolean;
}
export interface ScaleControlOptions {
maxWidth?: number;
unit?: string;
}
// Options for Map.fitBounds().
export interface FitBoundsOptions {
padding?: number | { top?: number; bottom?: number; left?: number; right?: number };
offset?: PointLike;
maxZoom?: number;
maxDuration?: number;
linear?: boolean;
easing?: (t: number) => number;
essential?: boolean;
}
// Options for Map.flyTo().
export interface FlyToOptions {
center?: LngLatLike;
zoom?: number;
bearing?: number;
pitch?: number;
duration?: number;
easing?: (t: number) => number;
offset?: PointLike;
animate?: boolean;
essential?: boolean;
padding?: number | { top?: number; bottom?: number; left?: number; right?: number };
}
// Mouse event delivered to Map.on() listeners; `features` is populated
// only for layer-scoped listeners.
export interface MapLayerMouseEvent {
type: string;
target: Map;
originalEvent: MouseEvent;
point: Point;
lngLat: LngLat;
preventDefault(): void;
defaultPrevented: boolean;
features?: MapGeoJSONFeature[];
}
}
// Loose ambient declarations for the @mui/material components this app uses.
// Every component is stubbed as React.FC<Record<string, unknown>>, which
// deliberately disables prop type-checking for MUI components.
// NOTE(review): `TabPanel` is declared here although upstream ships it from
// @mui/lab, not @mui/material — confirm against the actual import sites.
declare module '@mui/material' {
// Layout and surfaces.
export const Box: React.FC<Record<string, unknown>>;
export const Container: React.FC<Record<string, unknown>>;
export const Typography: React.FC<Record<string, unknown>>;
export const Paper: React.FC<Record<string, unknown>>;
export const Grid: React.FC<Record<string, unknown>>;
export const Card: React.FC<Record<string, unknown>>;
export const CardContent: React.FC<Record<string, unknown>>;
export const CardHeader: React.FC<Record<string, unknown>>;
export const CardActions: React.FC<Record<string, unknown>>;
// Inputs and form controls.
export const Button: React.FC<Record<string, unknown>>;
export const IconButton: React.FC<Record<string, unknown>>;
export const TextField: React.FC<Record<string, unknown>>;
export const Select: React.FC<Record<string, unknown>>;
export const MenuItem: React.FC<Record<string, unknown>>;
export const FormControl: React.FC<Record<string, unknown>>;
export const FormLabel: React.FC<Record<string, unknown>>;
export const FormHelperText: React.FC<Record<string, unknown>>;
export const InputLabel: React.FC<Record<string, unknown>>;
export const Input: React.FC<Record<string, unknown>>;
export const Checkbox: React.FC<Record<string, unknown>>;
export const Radio: React.FC<Record<string, unknown>>;
export const RadioGroup: React.FC<Record<string, unknown>>;
export const Switch: React.FC<Record<string, unknown>>;
export const Slider: React.FC<Record<string, unknown>>;
// Navigation.
export const Tabs: React.FC<Record<string, unknown>>;
export const Tab: React.FC<Record<string, unknown>>;
export const TabPanel: React.FC<Record<string, unknown>>;
export const AppBar: React.FC<Record<string, unknown>>;
export const Toolbar: React.FC<Record<string, unknown>>;
export const Drawer: React.FC<Record<string, unknown>>;
// Overlays and feedback.
export const Dialog: React.FC<Record<string, unknown>>;
export const DialogTitle: React.FC<Record<string, unknown>>;
export const DialogContent: React.FC<Record<string, unknown>>;
export const DialogActions: React.FC<Record<string, unknown>>;
export const Modal: React.FC<Record<string, unknown>>;
export const Tooltip: React.FC<Record<string, unknown>>;
export const Popover: React.FC<Record<string, unknown>>;
export const Menu: React.FC<Record<string, unknown>>;
// Lists.
export const List: React.FC<Record<string, unknown>>;
export const ListItem: React.FC<Record<string, unknown>>;
export const ListItemText: React.FC<Record<string, unknown>>;
export const ListItemIcon: React.FC<Record<string, unknown>>;
export const ListItemButton: React.FC<Record<string, unknown>>;
export const Divider: React.FC<Record<string, unknown>>;
// Data display and status indicators.
export const Avatar: React.FC<Record<string, unknown>>;
export const Badge: React.FC<Record<string, unknown>>;
export const Chip: React.FC<Record<string, unknown>>;
export const Alert: React.FC<Record<string, unknown>>;
export const AlertTitle: React.FC<Record<string, unknown>>;
export const Snackbar: React.FC<Record<string, unknown>>;
export const CircularProgress: React.FC<Record<string, unknown>>;
export const LinearProgress: React.FC<Record<string, unknown>>;
export const Skeleton: React.FC<Record<string, unknown>>;
// Tables.
export const Table: React.FC<Record<string, unknown>>;
export const TableBody: React.FC<Record<string, unknown>>;
export const TableCell: React.FC<Record<string, unknown>>;
export const TableContainer: React.FC<Record<string, unknown>>;
export const TableHead: React.FC<Record<string, unknown>>;
export const TableRow: React.FC<Record<string, unknown>>;
export const TablePagination: React.FC<Record<string, unknown>>;
export const TableSortLabel: React.FC<Record<string, unknown>>;
// Disclosure, navigation aids, steppers and misc widgets.
export const Accordion: React.FC<Record<string, unknown>>;
export const AccordionSummary: React.FC<Record<string, unknown>>;
export const AccordionDetails: React.FC<Record<string, unknown>>;
export const Breadcrumbs: React.FC<Record<string, unknown>>;
export const Link: React.FC<Record<string, unknown>>;
export const Pagination: React.FC<Record<string, unknown>>;
export const Rating: React.FC<Record<string, unknown>>;
export const Stepper: React.FC<Record<string, unknown>>;
export const Step: React.FC<Record<string, unknown>>;
export const StepLabel: React.FC<Record<string, unknown>>;
export const StepContent: React.FC<Record<string, unknown>>;
export const SpeedDial: React.FC<Record<string, unknown>>;
export const SpeedDialAction: React.FC<Record<string, unknown>>;
export const SpeedDialIcon: React.FC<Record<string, unknown>>;
export const ToggleButton: React.FC<Record<string, unknown>>;
export const ToggleButtonGroup: React.FC<Record<string, unknown>>;
export const Fab: React.FC<Record<string, unknown>>;
export const FormGroup: React.FC<Record<string, unknown>>;
export const FormControlLabel: React.FC<Record<string, unknown>>;
export const InputAdornment: React.FC<Record<string, unknown>>;
export const Autocomplete: React.FC<Record<string, unknown>>;
export const Stack: React.FC<Record<string, unknown>>;
// Transitions.
export const Collapse: React.FC<Record<string, unknown>>;
export const Fade: React.FC<Record<string, unknown>>;
export const Grow: React.FC<Record<string, unknown>>;
export const Slide: React.FC<Record<string, unknown>>;
export const Zoom: React.FC<Record<string, unknown>>;
// Theming and styling utilities.
export const useTheme: () => unknown;
export const useMediaQuery: (query: string) => boolean;
export const createTheme: (options: unknown) => unknown;
export const ThemeProvider: React.FC<Record<string, unknown>>;
export const CssBaseline: React.FC;
export const GlobalStyles: React.FC<Record<string, unknown>>;
export const styled: (component: unknown, options?: unknown) => unknown;
}

View file

@ -4,12 +4,30 @@ import path from 'path'
// https://vite.dev/config/
export default defineConfig({
logLevel: 'info',
plugins: [react()],
resolve: {
alias: {
'@': path.resolve(__dirname, './src'),
},
},
build: {
// Increase chunk size warning limit (mermaid is large)
chunkSizeWarningLimit: 2000,
rollupOptions: {
output: {
// Manual chunks to separate large dependencies
manualChunks: {
maplibre: ['maplibre-gl'],
},
},
},
},
optimizeDeps: {
include: ['maplibre-gl'],
// Exclude mermaid from pre-bundling - it's dynamically imported
exclude: ['mermaid'],
},
server: {
port: 5173,
proxy: {

View file

@ -185,7 +185,7 @@ imports:
- modules/enums/ReconstructionActivityTypeEnum
- modules/enums/SourceDocumentTypeEnum
# StaffRoleTypeEnum REMOVED - replaced by StaffRole class hierarchy
# See: .opencode/ENUM_TO_CLASS_PRINCIPLE.md for rationale
# See: rules/ENUM_TO_CLASS_PRINCIPLE.md for rationale
- modules/enums/CallForApplicationStatusEnum
- modules/enums/FundingRequirementTypeEnum
@ -242,7 +242,7 @@ imports:
- modules/classes/PersonObservation
# Staff role class hierarchy (replaces StaffRoleTypeEnum - Single Source of Truth)
# See: .opencode/ENUM_TO_CLASS_PRINCIPLE.md
# See: rules/ENUM_TO_CLASS_PRINCIPLE.md
- modules/classes/StaffRole
- modules/classes/StaffRoles

View file

@ -2,6 +2,9 @@ id: https://nde.nl/ontology/hc/class/Conservatoria
name: Conservatoria
title: Conservatória Type (Lusophone)
prefixes:
linkml: https://w3id.org/linkml/
imports:
- linkml:types
- ./ArchiveOrganizationType
@ -16,7 +19,8 @@ classes:
**Wikidata**: Q9854379
**Geographic Restriction**: Portugal, Brazil, and other Lusophone countries
**Geographic Restriction**: Lusophone countries (PT, BR, AO, MZ, CV, GW, ST, TL)
This constraint is enforced via LinkML `rules` with `postconditions`.
**CUSTODIAN-ONLY**: This type does NOT have a corresponding rico:RecordSetType
class. Conservatórias are administrative offices with registration functions,
@ -59,6 +63,7 @@ classes:
**Multilingual Labels**:
- pt: Conservatória
- pt-BR: Cartório de Registro
slot_usage:
primary_type:
@ -70,10 +75,49 @@ classes:
wikidata_entity:
description: |
Should be Q9854379 for Conservatórias.
MUST be Q9854379 for Conservatórias.
Lusophone civil/property registration offices.
pattern: "^Q[0-9]+$"
equals_string: "Q9854379"
applicable_countries:
description: |
**Geographic Restriction**: Lusophone countries only.
Conservatórias exist in Portuguese-speaking countries:
- PT (Portugal) - Conservatórias do Registo
- BR (Brazil) - Cartórios de Registro
- AO (Angola) - Conservatórias
- MZ (Mozambique) - Conservatórias
- CV (Cape Verde) - Conservatórias
- GW (Guinea-Bissau) - Conservatórias
- ST (São Tomé and Príncipe) - Conservatórias
- TL (Timor-Leste) - Conservatórias (Portuguese legal heritage)
The `rules` section below enforces this constraint during validation.
multivalued: true
required: true
minimum_cardinality: 1
# LinkML rules for geographic constraint validation
rules:
- description: >-
Conservatoria MUST have applicable_countries containing at least one
Lusophone country (PT, BR, AO, MZ, CV, GW, ST, TL).
This is a mandatory geographic restriction for Portuguese-speaking
civil registry and notarial archive offices.
postconditions:
slot_conditions:
applicable_countries:
any_of:
- equals_string: "PT"
- equals_string: "BR"
- equals_string: "AO"
- equals_string: "MZ"
- equals_string: "CV"
- equals_string: "GW"
- equals_string: "ST"
- equals_string: "TL"
exact_mappings:
- skos:Concept
@ -82,8 +126,10 @@ classes:
- rico:CorporateBody
comments:
- "Conservatória (pt)"
- "Cartório de Registro (pt-BR)"
- "CUSTODIAN-ONLY type: No corresponding rico:RecordSetType class"
- "Geographic restriction: Lusophone countries (Portugal, Brazil, etc.)"
- "Geographic restriction enforced via LinkML rules: Lusophone countries only"
- "Government registration office, not traditional archive"
- "Essential for genealogical and legal research"

View file

@ -2,21 +2,27 @@ id: https://nde.nl/ontology/hc/class/CountyRecordOffice
name: CountyRecordOffice
title: County Record Office Type
prefixes:
linkml: https://w3id.org/linkml/
org: http://www.w3.org/ns/org#
imports:
- linkml:types
- ./ArchiveOrganizationType
- ./OrganizationBranch
classes:
CountyRecordOffice:
is_a: ArchiveOrganizationType
class_uri: skos:Concept
description: |
Local authority repository in the United Kingdom and similar jurisdictions,
preserving historical records of the county and its communities.
Local authority repository in the United Kingdom, preserving historical
records of the county and its communities.
**Wikidata**: Q5177943
**Geographic Context**: Primarily United Kingdom
**Geographic Restriction**: United Kingdom (GB) only.
This constraint is enforced via LinkML `rules` with `postconditions`.
**CUSTODIAN-ONLY**: This type does NOT have a corresponding rico:RecordSetType
class. County Record Offices are institutional types, not collection
@ -40,16 +46,25 @@ classes:
- Often designated as place of deposit for public records
- Increasingly rebranded as "Archives and Local Studies"
In Scotland:
- Similar functions performed by local authority archives
- National Records of Scotland at national level
In Northern Ireland:
- Public Record Office of Northern Ireland (PRONI)
- Local council archives
**Related Types**:
- LocalGovernmentArchive (Q118281267) - Local authority records
- MunicipalArchive (Q604177) - City/town archives
- LocalHistoryArchive (Q12324798) - Local history focus
**Notable Examples**:
- The National Archives (Kew) - National level
- London Metropolitan Archives
- Oxfordshire History Centre
- Lancashire Archives
- West Yorkshire Archive Service
- Surrey History Centre
**Ontological Alignment**:
- **SKOS**: skos:Concept with skos:broader Q166118 (archive)
@ -57,6 +72,8 @@ classes:
- **RiC-O**: rico:CorporateBody (as agent)
**Multilingual Labels**:
- en: County Record Office
- en-GB: County Record Office
- it: archivio pubblico territoriale
slot_usage:
@ -67,7 +84,7 @@ classes:
wikidata_entity:
description: |
Should be Q5177943 for county record offices.
MUST be Q5177943 for county record offices.
UK local authority archive type.
pattern: "^Q[0-9]+$"
equals_string: "Q5177943"
@ -76,6 +93,66 @@ classes:
description: |
Typically 'county' or 'local' for this archive type.
Corresponds to UK county administrative level.
is_branch_of_authority:
description: |
**Organizational Relationship**: County Record Offices may be branches
of larger local authority structures.
**Common Parent Organizations**:
- County Councils (e.g., Oxfordshire County Council)
- Unitary Authorities (e.g., Bristol City Council)
- Combined Authorities (e.g., Greater Manchester)
- Joint Archive Services (e.g., East Sussex / Brighton & Hove)
**Legal Context**:
County Record Offices are typically:
- Designated "place of deposit" under Public Records Act 1958
- Part of local authority heritage/cultural services
- May share governance with local studies libraries
**Use org:unitOf pattern** from OrganizationBranch to link to parent
authority when modeled as formal organizational unit.
**Examples**:
- Oxfordshire History Centre → part of Oxfordshire County Council
- London Metropolitan Archives → part of City of London Corporation
- West Yorkshire Archive Service → joint service of five councils
range: uriorcurie
multivalued: false
required: false
examples:
- value: "https://nde.nl/ontology/hc/uk/oxfordshire-county-council"
description: "Parent local authority"
applicable_countries:
description: |
**Geographic Restriction**: United Kingdom (GB) only.
County Record Offices are a UK-specific institution type within
the local authority structure of England, Wales, Scotland, and
Northern Ireland.
Note: Uses ISO 3166-1 alpha-2 code "GB" for United Kingdom
(not "UK" which is not a valid ISO code).
The `rules` section below enforces this constraint during validation.
ifabsent: "string(GB)"
required: true
minimum_cardinality: 1
maximum_cardinality: 1
# LinkML rules for geographic constraint validation
rules:
- description: >-
CountyRecordOffice MUST have applicable_countries containing "GB"
(United Kingdom). This is a mandatory geographic restriction for
UK county record offices and local authority archives.
postconditions:
slot_conditions:
applicable_countries:
any_of:
- equals_string: "GB"
exact_mappings:
- skos:Concept
@ -84,7 +161,9 @@ classes:
- rico:CorporateBody
comments:
- "County Record Office (en-GB)"
- "CUSTODIAN-ONLY type: No corresponding rico:RecordSetType class"
- "Geographic restriction enforced via LinkML rules: United Kingdom (GB) only"
- "UK local authority archive institution type"
- "Often designated place of deposit for public records"
- "Key resource for local and family history research"
@ -93,3 +172,12 @@ classes:
- LocalGovernmentArchive
- MunicipalArchive
- LocalHistoryArchive
- OrganizationBranch
slots:
is_branch_of_authority:
slot_uri: org:unitOf
description: |
Parent local authority or governing body for this County Record Office.
Uses W3C Org ontology org:unitOf relationship.
range: uriorcurie

View file

@ -22,6 +22,7 @@ imports:
- linkml:types
- ./ArchiveOrganizationType
- ./CustodianAdministration
- ./CustodianArchive
classes:
CurrentArchive:
@ -63,6 +64,24 @@ classes:
- HistoricalArchive (Q3621673) - non-current permanent records
- RecordsCenter - semi-current storage facility
**RELATIONSHIP TO CustodianArchive**:
CurrentArchive (this class) is a TYPE classification (skos:Concept) for
archives managing records in the active/current phase of the lifecycle.
CustodianArchive is an INSTANCE class (rico:RecordSet) representing the
actual operational archives of a heritage custodian awaiting processing.
**Semantic Relationship**:
- CurrentArchive is a HYPERNYM (broader type) for the concept of active records
- CustodianArchive records MAY be typed as CurrentArchive when in active use
- When CustodianArchive.processing_status = "UNPROCESSED", records may still
be in the current/active phase conceptually
**SKOS Alignment**:
- skos:broader: CurrentArchive → DepositArchive (lifecycle progression)
- skos:narrower: CurrentArchive ← specific current archive types
**ONTOLOGICAL ALIGNMENT**:
- **SKOS**: skos:Concept (type classification)
- **RiC-O**: rico:RecordSet for active record groups
@ -74,6 +93,7 @@ classes:
- retention_schedule
- creating_organization
- transfer_policy
- has_narrower_instance
slot_usage:
wikidata_entity:
@ -101,6 +121,25 @@ classes:
Policy for transferring records to intermediate or permanent archives.
Describes triggers, timelines, and procedures for transfer.
range: string
has_narrower_instance:
slot_uri: skos:narrowerTransitive
description: |
Links this archive TYPE to specific CustodianArchive INSTANCES
that are classified under this lifecycle phase.
**SKOS**: skos:narrowerTransitive for type-instance relationship.
**Usage**:
When a CustodianArchive contains records in the "current/active" phase,
it can be linked from CurrentArchive via this property.
**Example**:
- CurrentArchive (type) → has_narrower_instance →
CustodianArchive "Director's Active Files 2020-2024" (instance)
range: CustodianArchive
multivalued: true
required: false
exact_mappings:
- wikidata:Q3621648
@ -145,3 +184,11 @@ slots:
transfer_policy:
description: Policy for transferring to permanent archive
range: string
has_narrower_instance:
slot_uri: skos:narrowerTransitive
description: |
Links archive TYPE to specific CustodianArchive INSTANCES.
SKOS narrowerTransitive for type-to-instance relationship.
range: CustodianArchive
multivalued: true

View file

@ -20,6 +20,7 @@ imports:
- ../slots/access_restrictions
- ../slots/storage_location
- ./ReconstructedEntity
- ./CurrentArchive
prefixes:
linkml: https://w3id.org/linkml/
@ -31,6 +32,8 @@ prefixes:
time: http://www.w3.org/2006/time#
org: http://www.w3.org/ns/org#
premis: http://www.loc.gov/premis/rdf/v3/
skos: http://www.w3.org/2004/02/skos/core#
wikidata: http://www.wikidata.org/entity/
classes:
CustodianArchive:
@ -122,6 +125,18 @@ classes:
- **Storage**: Physical location of unprocessed archives
- **OrganizationalStructure**: Unit responsible for processing
**RELATIONSHIP TO LIFECYCLE TYPE CLASSES**:
CustodianArchive (this class) is an INSTANCE class representing actual
operational archives. It can be TYPED using lifecycle phase classifications:
- **CurrentArchive** (Q3621648): Active records in daily use
- skos:broaderTransitive links CustodianArchive → CurrentArchive type
- **DepositArchive** (Q244904): Intermediate/semi-current records
- **HistoricalArchive** (Q3621673): Permanent archival records
Use `lifecycle_phase_type` slot to classify by lifecycle position.
exact_mappings:
- rico:RecordSet
@ -162,6 +177,7 @@ classes:
- was_generated_by
- valid_from
- valid_to
- lifecycle_phase_type
slot_usage:
id:
@ -591,6 +607,33 @@ classes:
required: false
description: |
End of validity period (typically = transfer_to_collection_date).
lifecycle_phase_type:
slot_uri: skos:broaderTransitive
range: uriorcurie
required: false
description: |
Links this CustodianArchive INSTANCE to its lifecycle phase TYPE.
**SKOS**: skos:broaderTransitive for instance-to-type relationship.
**Archive Lifecycle Types (Wikidata)**:
- Q3621648 (CurrentArchive) - Active records phase
- Q244904 (DepositArchive) - Intermediate/semi-current phase
- Q3621673 (HistoricalArchive) - Archival/permanent phase
**Usage**:
Classify this operational archive by its position in the records lifecycle.
Most CustodianArchive records are in the intermediate phase (awaiting processing).
**Example**:
- CustodianArchive "Ministry Records 2010-2020" → lifecycle_phase_type →
DepositArchive (Q244904) - semi-current, awaiting processing
examples:
- value: "wikidata:Q244904"
description: "Deposit archive / semi-current records"
- value: "wikidata:Q3621648"
description: "Current archive / active records"
comments:
- "Represents operational archives BEFORE integration into CustodianCollection"
@ -719,3 +762,12 @@ slots:
arrangement_notes:
description: Notes from arrangement process
range: string
lifecycle_phase_type:
slot_uri: skos:broaderTransitive
description: |
Links CustodianArchive INSTANCE to lifecycle phase TYPE.
SKOS broaderTransitive for instance-to-type relationship.
Values: CurrentArchive (Q3621648), DepositArchive (Q244904),
HistoricalArchive (Q3621673).
range: uriorcurie

View file

@ -61,7 +61,7 @@ classes:
- Portuguese: Fundação, Associação, Ltda., S.A.
- Italian: Fondazione, Associazione, S.p.A., S.r.l.
See: .opencode/LEGAL_FORM_FILTERING_RULE.md for comprehensive global list
See: rules/LEGAL_FORM_FILTERING_RULE.md for comprehensive global list
===========================================================================
MANDATORY RULE: Special Characters MUST Be Excluded from Abbreviations
@ -112,7 +112,7 @@ classes:
- "Heritage@Digital" → "HD" (not "H@D")
- "Archives (Historical)" → "AH" (not "A(H)")
See: .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md for complete documentation
See: rules/ABBREVIATION_SPECIAL_CHAR_RULE.md for complete documentation
===========================================================================
MANDATORY RULE: Diacritics MUST Be Normalized to ASCII in Abbreviations
@ -152,7 +152,7 @@ classes:
ascii_text = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
```
See: .opencode/ABBREVIATION_SPECIAL_CHAR_RULE.md for complete documentation
See: rules/ABBREVIATION_SPECIAL_CHAR_RULE.md for complete documentation
Can be generated by:
1. ReconstructionActivity (formal entity resolution) - was_generated_by link

View file

@ -470,7 +470,7 @@ classes:
- "Follows 4-stage GLAM-NER pipeline: recognition → layout → resolution → linking"
see_also:
- ".opencode/WEB_OBSERVATION_PROVENANCE_RULES.md"
- "rules/WEB_OBSERVATION_PROVENANCE_RULES.md"
- "scripts/fetch_website_playwright.py"
- "scripts/add_xpath_provenance.py"
- "docs/convention/schema/20251202/entity_annotation_rules_v1.6.0_unified.yaml"

View file

@ -0,0 +1,303 @@
# Abbreviation Character Filtering Rules
**Rule ID**: ABBREV-CHAR-FILTER
**Status**: MANDATORY
**Applies To**: GHCID abbreviation component generation
**Created**: 2025-12-07
**Updated**: 2025-12-08 (added diacritics rule)
---
## Summary
**When generating abbreviations for GHCID, ONLY ASCII uppercase letters (A-Z) are permitted. Both special characters AND diacritics MUST be removed/normalized.**
This is a **MANDATORY** rule. Abbreviations containing special characters or diacritics are INVALID and must be regenerated.
### Two Mandatory Sub-Rules:
1. **ABBREV-SPECIAL-CHAR**: Remove all special characters and symbols
2. **ABBREV-DIACRITICS**: Normalize all diacritics to ASCII equivalents
---
## Rule 1: Diacritics MUST Be Normalized to ASCII (ABBREV-DIACRITICS)
**Diacritics (accented characters) MUST be normalized to their ASCII base letter equivalents.**
### Example (Real Case)
```
❌ WRONG: CZ-VY-TEL-L-VHSPAOČRZS (contains Č)
✅ CORRECT: CZ-VY-TEL-L-VHSPAOCRZS (ASCII only)
```
### Diacritics Normalization Table
| Diacritic | ASCII | Example |
|-----------|-------|---------|
| Á, À, Â, Ã, Ä, Å, Ā | A | "Ålborg" → A |
| Č, Ć, Ç | C | "Český" → C |
| Ď | D | "Ďáblice" → D |
| É, È, Ê, Ë, Ě, Ē | E | "Éire" → E |
| Í, Ì, Î, Ï, Ī | I | "Ísland" → I |
| Ñ, Ń, Ň | N | "España" → N |
| Ó, Ò, Ô, Õ, Ö, Ø, Ō | O | "Österreich" → O |
| Ř | R | "Říčany" → R |
| Š, Ś, Ş | S | "Šumperk" → S |
| Ť | T | "Ťažký" → T |
| Ú, Ù, Û, Ü, Ů, Ū | U | "Ústí" → U |
| Ý, Ÿ | Y | "Ýmir" → Y |
| Ž, Ź, Ż | Z | "Žilina" → Z |
| Ł | L | "Łódź" → L |
| Æ | AE | "Ærø" → AE |
| Œ | OE | "Œuvre" → OE |
| ß | SS | "Straße" → SS |
### Implementation
```python
import unicodedata
def normalize_diacritics(text: str) -> str:
    """
    Normalize diacritics to ASCII equivalents.

    Examples:
        "Č" → "C"
        "Ř" → "R"
        "Ö" → "O"
        "ñ" → "n"
        "Ł" → "L"
        "Straße" → "Strasse"
    """
    # NFD decomposition only separates *combining* marks. Letters such as
    # Ł, Ø, Æ, Œ, ß and Đ have no decomposition and would pass through
    # unchanged, so they are mapped explicitly first (see the
    # normalization table above).
    special = str.maketrans({
        'Æ': 'AE', 'æ': 'ae', 'Œ': 'OE', 'œ': 'oe', 'ß': 'ss',
        'Ø': 'O', 'ø': 'o', 'Ł': 'L', 'ł': 'l', 'Đ': 'D', 'đ': 'd',
    })
    text = text.translate(special)
    # NFD decomposition separates base characters from combining marks
    normalized = unicodedata.normalize('NFD', text)
    # Remove combining marks (category 'Mn' = Mark, Nonspacing)
    ascii_text = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    return ascii_text
# Example
normalize_diacritics("VHSPAOČRZS")  # Returns "VHSPAOCRZS"
```
### Languages Commonly Affected
| Language | Common Diacritics | Example Institution |
|----------|-------------------|---------------------|
| **Czech** | Č, Ř, Š, Ž, Ě, Ů | Vlastivědné muzeum → VM (a word starting with Č would contribute C, never Č) |
| **Polish** | Ł, Ń, Ó, Ś, Ź, Ż, Ą, Ę | Biblioteka Łódzka → BL |
| **German** | Ä, Ö, Ü, ß | Österreichische Nationalbibliothek → ON |
| **French** | É, È, Ê, Ç, Ô | Bibliothèque nationale → BN |
| **Spanish** | Ñ, Á, É, Í, Ó, Ú | Museo Nacional → MN |
| **Portuguese** | Ã, Õ, Ç, Á, É | Biblioteca Nacional → BN |
| **Nordic** | Å, Ä, Ö, Ø, Æ | Nationalmuseet → N |
| **Turkish** | Ç, Ğ, İ, Ö, Ş, Ü | İstanbul Üniversitesi → IU |
| **Hungarian** | Á, É, Í, Ó, Ö, Ő, Ú, Ü, Ű | Országos Levéltár → OL |
| **Romanian** | Ă, Â, Î, Ș, Ț | Biblioteca Națională → BN |
---
## Rule 2: Special Characters MUST Be Removed (ABBREV-SPECIAL-CHAR)
---
## Rationale
### 1. URL/URI Safety
Special characters require percent-encoding in URIs. For example:
- `&` becomes `%26`
- `+` becomes `%2B`
This makes identifiers harder to share, copy, and verify.
### 2. Filename Safety
Many special characters are invalid in filenames across operating systems:
- Windows: `\ / : * ? " < > |`
- macOS/Linux: `/` and null bytes
Files like `SX-XX-PHI-O-DR&IMSM.yaml` may cause issues on some systems.
### 3. Parsing Consistency
Special characters can conflict with delimiters in data pipelines:
- `&` is used in query strings
- `:` is used in YAML, JSON
- `/` is a path separator
- `|` is a common CSV delimiter alternative
### 4. Cross-System Compatibility
Identifiers should work across all systems:
- Databases (SQL, TypeDB, Neo4j)
- RDF/SPARQL endpoints
- REST APIs
- Command-line tools
- Spreadsheets
### 5. Human Readability
Clean identifiers are easier to:
- Communicate verbally
- Type correctly
- Proofread
- Remember
---
## Characters to Remove
The following characters MUST be completely removed (not replaced) when generating abbreviations:
| Character | Name | Example Issue |
|-----------|------|---------------|
| `&` | Ampersand | "R&A" in URLs, HTML entities |
| `/` | Slash | Path separator confusion |
| `\` | Backslash | Escape sequence issues |
| `+` | Plus | URL encoding (`+` = space) |
| `@` | At sign | Email/handle confusion |
| `#` | Hash/Pound | Fragment identifier in URLs |
| `%` | Percent | URL encoding prefix |
| `$` | Dollar | Variable prefix in shells |
| `*` | Asterisk | Glob/wildcard character |
| `(` `)` | Parentheses | Grouping in regex, code |
| `[` `]` | Square brackets | Array notation |
| `{` `}` | Curly braces | Object notation |
| `\|` | Pipe | Command chaining, OR operator |
| `:` | Colon | YAML key-value, namespace separator |
| `;` | Semicolon | Statement terminator |
| `"` `'` `` ` `` | Quotes | String delimiters |
| `,` | Comma | List separator |
| `.` | Period | File extension, namespace |
| `-` | Hyphen | Already used as GHCID component separator |
| `_` | Underscore | Reserved for name suffix in collisions |
| `=` | Equals | Assignment operator |
| `?` | Question mark | Query string indicator |
| `!` | Exclamation | Negation, shell history |
| `~` | Tilde | Home directory, bitwise NOT |
| `^` | Caret | Regex anchor, power operator |
| `<` `>` | Angle brackets | HTML tags, redirects |
---
## Implementation
### Algorithm
When extracting abbreviation from institution name:
```python
import re
import unicodedata
def extract_abbreviation_from_name(name: str, skip_words: set) -> str:
    """
    Extract abbreviation from institution name.

    Args:
        name: Full institution name (emic)
        skip_words: Set of prepositions/articles to skip (lowercase)

    Returns:
        Uppercase abbreviation with only A-Z characters (max 10)
    """
    # Step 1: Normalize unicode (remove diacritics).
    # Letters with no NFD decomposition (Ł, Ø, Æ, Œ, ß, Đ) must be mapped
    # explicitly first — otherwise Step 2 would delete them and drop the
    # word's initial (e.g. "Łódzka" would contribute "O" instead of "L").
    special = str.maketrans({
        'Æ': 'AE', 'æ': 'ae', 'Œ': 'OE', 'œ': 'oe', 'ß': 'ss',
        'Ø': 'O', 'ø': 'o', 'Ł': 'L', 'ł': 'l', 'Đ': 'D', 'đ': 'd',
    })
    normalized = unicodedata.normalize('NFD', name.translate(special))
    ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
    # Step 2: Replace special characters with spaces (to split words)
    # This handles cases like "Records&Information" -> "Records Information"
    clean_name = re.sub(r'[^a-zA-Z\s]', ' ', ascii_name)
    # Step 3: Split into words (split() never yields empty strings)
    words = clean_name.split()
    # Step 4: Filter out skip words (prepositions, articles)
    significant_words = [w for w in words if w.lower() not in skip_words]
    # Step 5: Take first letter of each significant word
    abbreviation = ''.join(w[0].upper() for w in significant_words)
    # Step 6: Limit to 10 characters
    return abbreviation[:10]
```
### Handling Special Cases
**Case 1: "Records & Information Management"**
1. Input: `"Records & Information Management"`
2. After special char removal: `"Records Information Management"`
3. After split: `["Records", "Information", "Management"]`
4. Abbreviation: `RIM`
**Case 2: "Art/Design Museum"**
1. Input: `"Art/Design Museum"`
2. After special char removal: `"Art Design Museum"`
3. After split: `["Art", "Design", "Museum"]`
4. Abbreviation: `ADM`
**Case 3: "Culture+"**
1. Input: `"Culture+"`
2. After special char removal: `"Culture"`
3. After split: `["Culture"]`
4. Abbreviation: `C`
---
## Examples
| Institution Name | Correct | Incorrect |
|------------------|---------|-----------|
| Department of Records & Information Management | DRIM | DR&IM |
| Art + Culture Center | ACC | A+CC |
| Museum/Gallery Amsterdam | MGA | M/GA |
| Heritage@Digital | HD | H@D |
| Archives (Historical) | AH | A(H) |
| Research & Development Institute | RDI | R&DI |
| Sint Maarten Records & Information | SMRI | SMR&I |
---
## Validation
### Check for Invalid Abbreviations
```bash
# Find GHCID files with special characters in abbreviation
find data/custodian -name "*.yaml" | xargs grep -l '[&+@#%$*|:;?!=~^<>]' | head -20
# Specifically check for & in filenames
find data/custodian -name "*&*.yaml"
```
### Programmatic Validation
```python
import re
def validate_abbreviation(abbrev: str) -> bool:
    """
    Validate that abbreviation contains only A-Z.

    Returns True if valid, False if it is empty or contains any
    character outside the ASCII uppercase range.
    """
    pattern = re.compile(r'^[A-Z]+$')
    return pattern.match(abbrev) is not None
# Examples
validate_abbreviation("DRIMSM") # True - valid
validate_abbreviation("DR&IMSM") # False - contains &
validate_abbreviation("A+CC") # False - contains +
```
---
## Related Documentation
- `AGENTS.md` - Section "INSTITUTION ABBREVIATION: EMIC NAME FIRST-LETTER PROTOCOL"
- `schemas/20251121/linkml/modules/classes/CustodianName.yaml` - Schema description
- `rules/LEGAL_FORM_FILTER.md` - Related filtering rule for legal forms
- `docs/PERSISTENT_IDENTIFIERS.md` - GHCID specification
---
## Changelog
| Date | Change |
|------|--------|
| 2025-12-07 | Initial rule created after discovery of `&` in GHCID |
| 2025-12-08 | Added diacritics normalization rule |

View file

@ -0,0 +1,237 @@
# Enum-to-Class Principle: Single Source of Truth
**Rule ID**: ENUM-TO-CLASS
**Status**: ACTIVE
**Applies To**: Schema evolution decisions
**Version**: 1.0
**Last Updated**: 2025-12-06
---
## Core Principle
**Enums are TEMPORARY scaffolding. Once an enum is promoted to a class hierarchy, the enum MUST be deleted to maintain a Single Source of Truth.**
---
## Rationale
### The Problem: Dual Representation
When both an enum AND a class hierarchy exist for the same concept:
- **Data sync issues**: Enum values and class names can drift apart
- **Maintenance burden**: Changes must be made in two places
- **Developer confusion**: Which one should I use?
- **Validation conflicts**: Enum constraints vs class ranges may diverge
### The Solution: Single Source of Truth
- **Enums**: Use for simple, fixed value constraints (e.g., `DataTierEnum: TIER_1, TIER_2, TIER_3, TIER_4`)
- **Classes**: Use when the concept needs properties, relationships, or rich documentation
- **NEVER BOTH**: Once promoted to classes, DELETE the enum
---
## When to Promote Enum to Classes
**Promote when the concept needs**:
| Need | Enum Can Do? | Class Required? |
|------|-------------|-----------------|
| Fixed value constraint | Yes | Yes |
| Properties (e.g., `role_category`, `typical_domains`) | No | Yes |
| Rich description per value | Limited | Yes |
| Relationships to other entities | No | Yes |
| Inheritance hierarchy | No | Yes |
| Independent identity (URI) | Limited | Yes |
| Ontology class mapping (`class_uri`) | Via `meaning` | Native |
**Rule of thumb**: If you're adding detailed documentation to each enum value, or want to attach properties, it's time to promote to classes.
---
## Promotion Workflow
### Step 1: Create Class Hierarchy
```yaml
# modules/classes/StaffRole.yaml (base class)
StaffRole:
abstract: true
description: Base class for staff role categories
slots:
- role_id
- role_name
- role_category
- typical_domains
# modules/classes/StaffRoles.yaml (subclasses)
Curator:
is_a: StaffRole
description: Museum curator specializing in collection research...
Conservator:
is_a: StaffRole
description: Conservator specializing in preservation...
```
### Step 2: Update Slot Ranges
```yaml
# BEFORE (enum)
staff_role:
range: StaffRoleTypeEnum
# AFTER (class)
staff_role:
range: StaffRole
```
### Step 3: Update Modular Schema Imports
```yaml
# REMOVE enum import
# - modules/enums/StaffRoleTypeEnum # DELETED
# ADD class imports
- modules/classes/StaffRole
- modules/classes/StaffRoles
```
### Step 4: Archive the Enum
```bash
mkdir -p schemas/.../archive/enums
mv modules/enums/OldEnum.yaml archive/enums/OldEnum.yaml.archived_$(date +%Y%m%d)
```
### Step 5: Document the Change
- Update `archive/enums/README.md` with migration entry
- Add comment in modular schema explaining removal
- Update any documentation referencing the old enum
---
## Example: StaffRoleTypeEnum to StaffRole
**Before** (2025-12-05):
```yaml
# StaffRoleTypeEnum.yaml
StaffRoleTypeEnum:
permissible_values:
CURATOR:
description: Museum curator
CONSERVATOR:
description: Conservator
# ... 51 values with limited documentation
```
**After** (2025-12-06):
```yaml
# StaffRole.yaml (abstract base)
StaffRole:
abstract: true
slots:
- role_id
- role_name
- role_category
- typical_domains
- typical_responsibilities
- requires_qualification
# StaffRoles.yaml (51 subclasses)
Curator:
is_a: StaffRole
class_uri: schema:curator
description: |
Museum curator specializing in collection research...
**IMPORTANT - FORMAL TITLE vs DE FACTO WORK**:
This is the OFFICIAL job appellation/title. Actual work may differ.
slot_usage:
role_category:
equals_string: CURATORIAL
typical_domains:
equals_expression: "[Museums, Galleries]"
```
**Why the promotion?**
1. Need to distinguish FORMAL TITLE from DE FACTO WORK
2. Each role has `role_category`, `common_variants`, `typical_domains`, `typical_responsibilities`
3. Roles benefit from inheritance (`Curator is_a StaffRole`)
4. Richer documentation per role
---
## Enums That Should REMAIN Enums
Some enums are appropriate as permanent fixtures:
| Enum | Why Keep as Enum |
|------|------------------|
| `DataTierEnum` | Simple 4-value tier (TIER_1 through TIER_4), no properties needed |
| `DataSourceEnum` | Fixed source types, simple strings |
| `CountryCodeEnum` | ISO 3166-1 standard, no custom properties |
| `LanguageCodeEnum` | ISO 639 standard, no custom properties |
**Characteristics of "permanent" enums**:
- Based on external standards (ISO, etc.)
- Simple values with no need for properties
- Unlikely to require rich per-value documentation
- Used purely for validation/constraint
---
## Anti-Patterns
### WRONG: Keep Both Enum and Classes
```yaml
# modules/enums/StaffRoleTypeEnum.yaml # Still exists!
# modules/classes/StaffRole.yaml # Also exists!
# Which one is authoritative? CONFUSION!
```
### WRONG: Create Classes but Keep Enum "for backwards compatibility"
```yaml
# "Let's keep the enum for old code"
# Result: Two sources of truth, guaranteed drift
```
### CORRECT: Delete Enum After Creating Classes
```yaml
# modules/enums/StaffRoleTypeEnum.yaml # ARCHIVED
# modules/classes/StaffRole.yaml # Single source of truth
# modules/classes/StaffRoles.yaml # All 51 role subclasses
```
---
## Verification Checklist
After promoting an enum to classes:
- [ ] Old enum file moved to `archive/enums/`
- [ ] Modular schema import removed for enum
- [ ] Modular schema import added for new class(es)
- [ ] All slot ranges updated from enum to class
- [ ] No grep results for old enum name in active schema files
- [ ] `archive/enums/README.md` updated with migration entry
- [ ] Comment added in modular schema explaining removal
```bash
# Verify enum is fully removed (should return only archive hits)
grep -r "StaffRoleTypeEnum" schemas/20251121/linkml/
```
---
## See Also
- `docs/ENUM_CLASS_SINGLE_SOURCE.md` - Extended documentation
- `schemas/20251121/linkml/archive/enums/README.md` - Archive directory
- LinkML documentation on enums: https://linkml.io/linkml/schemas/enums.html
- LinkML documentation on classes: https://linkml.io/linkml/schemas/models.html

View file

@ -0,0 +1,436 @@
# GeoNames Settlement Standardization Rules
**Rule ID**: GEONAMES-SETTLEMENT
**Status**: MANDATORY
**Applies To**: GHCID settlement component generation
**Version**: 1.1.0
**Effective Date**: 2025-12-01
**Last Updated**: 2025-12-01
---
## Purpose
This document defines the rules for standardizing settlement names in GHCID (Global Heritage Custodian Identifier) generation using the GeoNames geographical database.
## Core Principle
**ALL settlement names in GHCID must be derived from GeoNames standardized names, not from source data.**
The GeoNames database serves as the **single source of truth** for:
- Settlement names (cities, towns, villages)
- Settlement abbreviations/codes
- Administrative region codes (admin1)
- Geographic coordinates validation
## Why GeoNames Standardization?
1. **Consistency**: Same settlement = same GHCID component, regardless of source data variations
2. **Disambiguation**: Handles duplicate city names across regions
3. **Internationalization**: Provides ASCII-safe names for identifiers
4. **Authority**: GeoNames is a well-maintained, CC-licensed geographic database
5. **Persistence**: Settlement names don't change frequently, ensuring GHCID stability
---
## CRITICAL: Feature Code Filtering
**NEVER use neighborhoods or districts (PPLX) for GHCID generation. ONLY use proper settlements (cities, towns, villages).**
GeoNames classifies populated places with feature codes. When reverse geocoding coordinates to find a settlement, you MUST filter by feature code.
### ALLOWED Feature Codes
| Code | Description | Example |
|------|-------------|---------|
| **PPL** | Populated place (city/town/village) | Apeldoorn, Hamont, Lelystad |
| **PPLA** | Seat of first-order admin division | Provincial capitals |
| **PPLA2** | Seat of second-order admin division | Municipal seats |
| **PPLA3** | Seat of third-order admin division | District seats |
| **PPLA4** | Seat of fourth-order admin division | Sub-district seats |
| **PPLC** | Capital of a political entity | Amsterdam, Brussels |
| **PPLS** | Populated places (multiple) | Settlement clusters |
| **PPLG** | Seat of government | The Hague |
### EXCLUDED Feature Codes
| Code | Description | Why Excluded |
|------|-------------|--------------|
| **PPLX** | Section of populated place | Neighborhoods, districts, quarters (e.g., "Binnenstad", "Amsterdam Binnenstad") |
### Implementation
```python
VALID_FEATURE_CODES = ('PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLS', 'PPLG')
query = """
SELECT name, feature_code, geonames_id, ...
FROM cities
WHERE country_code = ?
AND feature_code IN (?, ?, ?, ?, ?, ?, ?, ?)
ORDER BY distance_sq
LIMIT 1
"""
cursor.execute(query, (country_code, *VALID_FEATURE_CODES))
```
### Verification
Always check `feature_code` in location_resolution metadata:
```yaml
location_resolution:
geonames_name: Apeldoorn
feature_code: PPL # ← MUST be PPL, PPLA*, PPLC, PPLS, or PPLG
```
**If you see `feature_code: PPLX`**, the GHCID is WRONG and must be regenerated.
---
## CRITICAL: Country Code Detection
**Determine country code from entry data BEFORE calling GeoNames reverse geocoding.**
GeoNames queries are country-specific. Using the wrong country code will return incorrect results.
### Country Code Resolution Priority
1. `zcbs_enrichment.country` - Most explicit source
2. `location.country` - Direct location field
3. `locations[].country` - Array location field
4. `original_entry.country` - CSV source field
5. `google_maps_enrichment.address` - Parse from address string
6. `wikidata_enrichment.located_in.label` - Infer from Wikidata
7. Default: `"NL"` (Netherlands) - Only if no other source
### Example
```python
# Determine country code FIRST
country_code = "NL" # Default
if entry.get('zcbs_enrichment', {}).get('country'):
country_code = entry['zcbs_enrichment']['country']
elif entry.get('google_maps_enrichment', {}).get('address', ''):
address = entry['google_maps_enrichment']['address']
if ', Belgium' in address:
country_code = "BE"
elif ', Germany' in address:
country_code = "DE"
# THEN call reverse geocoding
result = reverse_geocode_to_city(latitude, longitude, country_code)
```
---
## Settlement Resolution Process
### Step 1: Coordinate-Based Resolution (Preferred)
When coordinates are available, use reverse geocoding to find the nearest GeoNames settlement:
```python
def resolve_settlement_from_coordinates(latitude: float, longitude: float, country_code: str = "NL") -> dict:
"""
Find the GeoNames settlement nearest to given coordinates.
Returns:
{
'settlement_name': 'Lelystad', # GeoNames standardized name
'settlement_code': 'LEL', # 3-letter abbreviation
'admin1_code': '16', # GeoNames admin1 code
'region_code': 'FL', # ISO 3166-2 region code
'geonames_id': 2751792, # GeoNames ID for provenance
'distance_km': 0.5 # Distance from coords to settlement center
}
"""
```
### Step 2: Name-Based Resolution (Fallback)
When only a settlement name is available (no coordinates), look up in GeoNames:
```python
def resolve_settlement_from_name(name: str, country_code: str = "NL") -> dict:
"""
Find the GeoNames settlement matching the given name.
Uses fuzzy matching and disambiguation when multiple matches exist.
"""
```
### Step 3: Manual Resolution (Last Resort)
If GeoNames lookup fails, flag the entry for manual review with:
- `settlement_source: MANUAL`
- `settlement_needs_review: true`
---
## GHCID Settlement Component Rules
### Format
The settlement component in GHCID uses a **3-letter uppercase code**:
```
NL-{REGION}-{SETTLEMENT}-{TYPE}-{ABBREV}
^^^^^^^^^^^
3-letter code from GeoNames
```
### Code Generation Rules
1. **Single-word settlements**: First 3 letters uppercase
- `Amsterdam``AMS`
- `Rotterdam``ROT`
- `Lelystad``LEL`
2. **Settlements with Dutch articles** (`de`, `het`, `den`, `'s`):
- First letter of article + first 2 letters of main word
- `Den Haag``DHA`
- `'s-Hertogenbosch``SHE`
- `De Bilt``DBI`
3. **Multi-word settlements** (no article):
- First letter of each word (up to 3)
- `Nieuw Amsterdam``NAM`
- `Oud Beijerland``OBE`
4. **GeoNames Disambiguation Database**:
- For known problematic settlements, use pre-defined codes from disambiguation table
- Example: Both `Zwolle` (OV) and `Zwolle` (LI) exist - use `ZWO` with region for uniqueness
### Measurement Point for Historical Custodians
**Rule**: For heritage custodians that no longer exist or have historical coordinates, the **modern-day settlement** (as of 2025-12-01) is used.
Rationale:
- GHCIDs should be stable over time
- Historical place names may have changed
- Modern settlements are easier to verify and look up
- GeoNames reflects current geographic reality
Example:
- A museum that operated 1900-1950 in what was then "Nieuw Land" (before Flevoland province existed)
- Modern coordinates fall within Lelystad municipality
- GHCID uses `LEL` (Lelystad) as settlement code, not historical name
---
## GeoNames Database Integration
### Database Location
```
/data/reference/geonames.db
```
### Required Tables
```sql
-- Cities/settlements table
CREATE TABLE cities (
geonames_id INTEGER PRIMARY KEY,
name TEXT, -- Local name (may have diacritics)
ascii_name TEXT, -- ASCII-safe name for identifiers
country_code TEXT, -- ISO 3166-1 alpha-2
admin1_code TEXT, -- First-level administrative division
admin1_name TEXT, -- Region/province name
latitude REAL,
longitude REAL,
population INTEGER,
feature_code TEXT -- PPL, PPLA, PPLC, etc.
);
-- Disambiguation table for problematic settlements
CREATE TABLE settlement_codes (
geonames_id INTEGER PRIMARY KEY,
country_code TEXT,
settlement_code TEXT, -- 3-letter code
is_primary BOOLEAN, -- Primary code for this settlement
notes TEXT
);
```
### Admin1 Code Mapping (Netherlands)
**IMPORTANT**: GeoNames admin1 codes differ from historical numbering. Use this mapping:
| GeoNames admin1 | Province | ISO 3166-2 |
|-----------------|----------|------------|
| 01 | Drenthe | NL-DR |
| 02 | Friesland | NL-FR |
| 03 | Gelderland | NL-GE |
| 04 | Groningen | NL-GR |
| 05 | Limburg | NL-LI |
| 06 | Noord-Brabant | NL-NB |
| 07 | Noord-Holland | NL-NH |
| 09 | Utrecht | NL-UT |
| 10 | Zeeland | NL-ZE |
| 11 | Zuid-Holland | NL-ZH |
| 15 | Overijssel | NL-OV |
| 16 | Flevoland | NL-FL |
**Note**: Code 08 is not used in Netherlands (was assigned to former region).
---
## Validation Requirements
### Before GHCID Generation
Every entry MUST have:
- [ ] Settlement name resolved via GeoNames
- [ ] `geonames_id` recorded in entry metadata
- [ ] Settlement code (3-letter) generated consistently
- [ ] Admin1/region code mapped correctly
### Provenance Tracking
Record GeoNames resolution in entry metadata:
```yaml
location_resolution:
method: REVERSE_GEOCODE # or NAME_LOOKUP or MANUAL
geonames_id: 2751792
geonames_name: Lelystad
settlement_code: LEL
admin1_code: "16"
region_code: FL
resolution_date: "2025-12-01T00:00:00Z"
source_coordinates:
latitude: 52.52111
longitude: 5.43722
distance_to_settlement_km: 0.5
```
---
## CRITICAL: XXX Placeholders Are TEMPORARY - Research Required
**XXX placeholders for region/settlement codes are NEVER acceptable as a final state.**
When an entry has `XX` (unknown region) or `XXX` (unknown settlement), the agent MUST conduct research to resolve the location.
### Resolution Strategy by Institution Type
| Institution Type | Location Resolution Method |
|------------------|---------------------------|
| **Destroyed institution** | Use last known physical location before destruction |
| **Historical (closed)** | Use last operating location |
| **Refugee/diaspora org** | Use current headquarters OR original founding location |
| **Digital-only platform** | Use parent/founding organization's headquarters |
| **Decentralized initiative** | Use founding location or primary organizer location |
| **Unknown city, known country** | Research via Wikidata, Google Maps, official website |
### Research Sources (Priority Order)
1. **Wikidata** - P131 (located in), P159 (headquarters location), P625 (coordinates)
2. **Google Maps** - Search institution name
3. **Official Website** - Contact page, about page
4. **Web Archive** - archive.org for destroyed/closed institutions
5. **Academic Sources** - Papers, reports
6. **News Articles** - Particularly for destroyed heritage sites
### Location Resolution Metadata
When resolving XXX placeholders, update `location_resolution`:
```yaml
location_resolution:
method: MANUAL_RESEARCH # Previously was NAME_LOOKUP with XXX
country_code: PS
region_code: GZ
region_name: Gaza Strip
city_code: GAZ
city_name: Gaza City
geonames_id: 281133
research_date: "2025-12-06T00:00:00Z"
research_sources:
- type: wikidata
id: Q123456
claim: P131
- type: web_archive
url: https://web.archive.org/web/20231001/https://institution-website.org/contact
notes: "Located in Gaza City prior to destruction in 2024"
```
### File Renaming After Resolution
When GHCID changes due to XXX resolution, the file MUST be renamed:
```bash
# Before
data/custodian/PS-XX-XXX-A-NAPR.yaml
# After
data/custodian/PS-GZ-GAZ-A-NAPR.yaml
```
### Prohibited Practices
- ❌ Leaving XXX placeholders in production data
- ❌ Using "Online" or country name as location
- ❌ Skipping research because it's difficult
- ❌ Using XX/XXX for diaspora organizations
---
## Error Handling
### No GeoNames Match
If a settlement cannot be resolved via automated lookup:
1. Log warning with entry details
2. Set `settlement_code: XXX` (temporary placeholder)
3. Set `settlement_needs_review: true`
4. Do NOT skip the entry - generate GHCID with XXX placeholder
5. **IMMEDIATELY** begin manual research to resolve
### Multiple GeoNames Matches
When multiple settlements match a name:
1. Use coordinates to disambiguate (if available)
2. Use admin1/region context (if available)
3. Use population as tiebreaker (prefer larger settlement)
4. Flag for manual review if still ambiguous
### Coordinates Outside Country
If coordinates fall outside the expected country:
1. Log warning
2. Use nearest settlement within country
3. Flag for manual review
---
## Related Documentation
- `AGENTS.md` - Section on GHCID generation
- `docs/PERSISTENT_IDENTIFIERS.md` - Complete GHCID specification
- `docs/GHCID_PID_SCHEME.md` - PID scheme details
- `scripts/enrich_nde_entries_ghcid.py` - Implementation
---
## Changelog
### v1.1.0 (2025-12-01)
- **CRITICAL**: Added feature code filtering rules
- MUST filter for PPL, PPLA, PPLA2, PPLA3, PPLA4, PPLC, PPLS, PPLG
- MUST exclude PPLX (neighborhoods/districts)
- Example: Apeldoorn (PPL) not "Binnenstad" (PPLX)
- **CRITICAL**: Added country code detection rules
- Must determine country from entry data BEFORE reverse geocoding
- Priority: zcbs_enrichment.country > location.country > address parsing
- Example: Belgian institutions use BE, not NL
- Added Belgium admin1 code mapping (BRU, VLG, WAL)
### v1.0.0 (2025-12-01)
- Initial version
- Established GeoNames as authoritative source for settlement standardization
- Defined measurement point rule for historical custodians
- Documented admin1 code mapping for Netherlands

View file

@ -0,0 +1,346 @@
# Legal Form Filtering Rule for CustodianName
**Rule ID**: LEGAL-FORM-FILTER
**Status**: MANDATORY
**Applies To**: CustodianName standardization
**Created**: 2025-12-02
---
## Overview
**CRITICAL RULE**: Legal form designations MUST ALWAYS be filtered from `CustodianName`, even when the custodian self-identifies with them.
This is the **ONE EXCEPTION** to the emic (insider name) principle in the Heritage Custodian Ontology.
## Rationale
### Why Legal Forms Are NOT Part of Identity
1. **Legal Form ≠ Identity**: The legal structure is administrative metadata, not the custodian's core identity
- "Stichting Rijksmuseum" → Identity is "Rijksmuseum", legal form is "Stichting"
2. **Legal Forms Change Over Time**: Organizations transform while identity persists
- Association → Foundation → Corporation (same museum, different legal structures)
3. **Cross-Jurisdictional Consistency**: Same organization may have different legal forms in different countries
- "Getty Foundation" (US) = "Stichting Getty" (NL) = same identity
4. **Deduplication**: Prevents false duplicates
- "Museum X" and "Stichting Museum X" should NOT be separate entities
5. **ISO 20275 Alignment**: The Legal Entity Identifier (LEI) standard explicitly separates legal form from entity name
### Where Legal Form IS Stored
Legal form information is NOT discarded - it is stored in appropriate metadata fields:
| Field | Location | Purpose |
|-------|----------|---------|
| `legal_form` | `CustodianLegalStatus` | ISO 20275 legal form code |
| `legal_name` | `CustodianLegalStatus` | Full registered name including legal form |
| `observed_name` | `CustodianObservation` | Original name as observed in source (may include legal form) |
## Examples
### Dutch Examples
| Source Name | CustodianName | Legal Form | Notes |
|-------------|---------------|------------|-------|
| Stichting Rijksmuseum | Rijksmuseum | Stichting | Prefix removal |
| Hidde Nijland Stichting | Hidde Nijland | Stichting | Suffix removal |
| Stichting Het Loo | Het Loo | Stichting | Preserve article "Het" |
| Coöperatie Erfgoed | Erfgoed | Coöperatie | |
| Vereniging Ons Huis | Ons Huis | Vereniging | |
| Museum B.V. | Museum | B.V. | |
### International Examples
| Source Name | CustodianName | Legal Form | Language |
|-------------|---------------|------------|----------|
| The Getty Foundation | The Getty | Foundation | English |
| British Museum Trust Ltd | British Museum | Trust Ltd | English |
| Smithsonian Institution Inc. | Smithsonian Institution | Inc. | English |
| Fundação Biblioteca Nacional | Biblioteca Nacional | Fundação | Portuguese |
| Verein Deutsches Museum | Deutsches Museum | Verein | German |
| Association des Amis du Louvre | Amis du Louvre | Association | French |
| Fondazione Musei Civici | Musei Civici | Fondazione | Italian |
| Fundación Museo del Prado | Museo del Prado | Fundación | Spanish |
---
## Global Legal Form Terms Reference
### Dutch (Netherlands, Belgium-Flanders)
**Foundations and Non-Profits:**
- Stichting (foundation)
- Vereniging (association)
- Coöperatie, Coöperatieve (cooperative)
**Business Entities:**
- B.V., BV (besloten vennootschap - private limited company)
- N.V., NV (naamloze vennootschap - public limited company)
- V.O.F., VOF (vennootschap onder firma - general partnership)
- C.V., CV (commanditaire vennootschap - limited partnership)
- Maatschap (partnership)
- Eenmanszaak (sole proprietorship)
### English (UK, US, Ireland, Australia, etc.)
**Foundations and Non-Profits:**
- Foundation
- Trust
- Association
- Society
- Institute
- Institution (when followed by Inc./Ltd.)
- Charity
- Fund
**Business Entities:**
- Inc., Incorporated
- Ltd., Limited
- LLC, L.L.C. (limited liability company)
- LLP, L.L.P. (limited liability partnership)
- Corp., Corporation
- Co., Company
- PLC, plc (public limited company - UK)
- Pty Ltd (proprietary limited - Australia)
### German (Germany, Austria, Switzerland)
**Foundations and Non-Profits:**
- Stiftung (foundation)
- Verein (association)
- e.V., eingetragener Verein (registered association)
- gGmbH (gemeinnützige GmbH - charitable limited company)
**Business Entities:**
- GmbH (Gesellschaft mit beschränkter Haftung - limited liability company)
- AG (Aktiengesellschaft - stock corporation)
- KG (Kommanditgesellschaft - limited partnership)
- OHG (offene Handelsgesellschaft - general partnership)
- GmbH & Co. KG
- UG (Unternehmergesellschaft - mini-GmbH)
### French (France, Belgium-Wallonia, Switzerland, Canada-Quebec)
**Foundations and Non-Profits:**
- Fondation (foundation)
- Association (association)
- Fonds (fund)
**Business Entities:**
- S.A., SA (société anonyme - public limited company)
- S.A.R.L., SARL (société à responsabilité limitée - private limited company)
- S.A.S., SAS (société par actions simplifiée)
- S.C.I., SCI (société civile immobilière)
- S.N.C., SNC (société en nom collectif - general partnership)
- S.C.S., SCS (société en commandite simple)
- EURL (entreprise unipersonnelle à responsabilité limitée)
### Spanish (Spain, Latin America)
**Foundations and Non-Profits:**
- Fundación (foundation)
- Asociación (association)
- Sociedad (society) - when not followed by commercial designator
**Business Entities:**
- S.A., SA (sociedad anónima - public limited company)
- S.L., SL (sociedad limitada - private limited company)
- S.L.L., SLL (sociedad limitada laboral)
- S.Coop. (sociedad cooperativa)
- S.C., SC (sociedad colectiva - general partnership)
- S.Com., S. en C. (sociedad en comandita)
### Portuguese (Portugal, Brazil)
**Foundations and Non-Profits:**
- Fundação (foundation)
- Associação (association)
- Instituto (institute)
**Business Entities:**
- Ltda., Limitada (limited liability company)
- S.A., SA (sociedade anônima - corporation)
- S/A
- Cia., Companhia (company)
- ME (microempresa)
- EPP (empresa de pequeno porte)
### Italian (Italy, Switzerland-Ticino)
**Foundations and Non-Profits:**
- Fondazione (foundation)
- Associazione (association)
- Ente (entity/institution)
- Onlus (non-profit organization)
**Business Entities:**
- S.p.A., SpA (società per azioni - joint-stock company)
- S.r.l., Srl (società a responsabilità limitata - limited liability company)
- S.a.s., Sas (società in accomandita semplice)
- S.n.c., Snc (società in nome collettivo)
- S.c.a.r.l. (società cooperativa a responsabilità limitata)
### Scandinavian Languages
**Danish:**
- Fond (foundation)
- Forening (association)
- A/S (aktieselskab - public limited company)
- ApS (anpartsselskab - private limited company)
**Swedish:**
- Stiftelse (foundation)
- Förening (association)
- AB (aktiebolag - limited company)
**Norwegian:**
- Stiftelse (foundation)
- Forening (association)
- AS (aksjeselskap - limited company)
- ASA (allmennaksjeselskap - public limited company)
### Other European Languages
**Polish:**
- Fundacja (foundation)
- Stowarzyszenie (association)
- Sp. z o.o. (limited liability company)
- S.A. (joint-stock company)
**Czech:**
- Nadace (foundation)
- Spolek (association)
- s.r.o. (limited liability company)
- a.s. (joint-stock company)
**Hungarian:**
- Alapítvány (foundation)
- Egyesület (association)
- Kft. (limited liability company)
- Zrt. (private limited company)
- Nyrt. (public limited company)
**Greek:**
- Ίδρυμα (Idryma - foundation)
- Σύλλογος (Syllogos - association)
- Α.Ε., ΑΕ (Ανώνυμη Εταιρεία - corporation)
- Ε.Π.Ε., ΕΠΕ (limited liability company)
**Finnish:**
- Säätiö (foundation)
- Yhdistys (association)
- Oy (osakeyhtiö - limited company)
- Oyj (public limited company)
### Asian Languages
**Japanese:**
- 財団法人 (zaidan hōjin - incorporated foundation)
- 社団法人 (shadan hōjin - incorporated association)
- 株式会社, K.K. (kabushiki kaisha - corporation)
- 合同会社, G.K. (gōdō kaisha - LLC)
- 有限会社, Y.K. (yūgen kaisha - limited company)
**Chinese:**
- 基金会 (jījīn huì - foundation)
- 协会 (xiéhuì - association)
- 有限公司 (yǒuxiàn gōngsī - limited company)
- 股份有限公司 (gǔfèn yǒuxiàn gōngsī - joint-stock company)
**Korean:**
- 재단법인 (jaedan beobin - incorporated foundation)
- 사단법인 (sadan beobin - incorporated association)
- 주식회사 (jusik hoesa - corporation)
- 유한회사 (yuhan hoesa - limited company)
### Middle Eastern Languages
**Arabic:**
- مؤسسة (mu'assasa - foundation/institution)
- جمعية (jam'iyya - association)
- شركة (sharika - company)
- ش.م.م (limited liability company)
- ش.م.ع (public joint-stock company)
**Hebrew:**
- עמותה (amuta - non-profit association)
- חל"צ (company for public benefit)
- בע"מ (limited company)
**Turkish:**
- Vakıf (foundation)
- Dernek (association)
- A.Ş. (anonim şirket - joint-stock company)
- Ltd. Şti. (limited şirket - limited company)
### Latin American Specific
**Brazilian Portuguese:**
- OSCIP (organização da sociedade civil de interesse público)
- ONG (organização não governamental)
- EIRELI (empresa individual de responsabilidade limitada)
**Mexican Spanish:**
- A.C. (asociación civil - civil association)
- S.C. (sociedad civil)
- S. de R.L. (sociedad de responsabilidad limitada)
---
## Implementation Guidelines
### Filtering Algorithm
```python
def filter_legal_form(name: str, language: str = None) -> tuple[str, str | None]:
"""
Remove legal form terms from custodian name.
Returns:
tuple: (filtered_name, legal_form_found)
"""
# Apply language-specific patterns first if language known
# Then apply universal patterns
# Handle both prefix and suffix positions
# Preserve articles (the, het, de, la, le, etc.)
pass
```
### Position Handling
Legal forms can appear as:
1. **Prefix**: "Stichting Rijksmuseum" → Remove "Stichting "
2. **Suffix**: "British Museum Trust Ltd" → Remove " Trust Ltd"
3. **Infix** (rare): Handle case-by-case
### Edge Cases
1. **Multiple legal forms**: "Foundation Trust Ltd" → Remove all
2. **Abbreviation variations**: "Inc." = "Inc" = "Incorporated"
3. **Case insensitivity**: "STICHTING" = "Stichting" = "stichting"
4. **With punctuation**: "B.V." = "BV" = "B.V"
5. **Compound terms**: "GmbH & Co. KG" → Remove entire compound
### Validation Script
Use `scripts/validate_organization_names.py` to detect names that still contain legal form terms after filtering.
---
## References
- ISO 20275:2017 - Financial services — Entity legal forms (ELF)
- GLEIF Legal Entity Identifier documentation
- LinkML Schema: `schemas/20251121/linkml/modules/classes/CustodianName.yaml`
- AGENTS.md: Rule 8 (Legal Form Filtering)
---
**Last Updated**: 2025-12-02
**Maintained By**: GLAM Heritage Custodian Ontology Project

View file

@ -0,0 +1,156 @@
# Value Standardization Rules
**Location**: `schemas/20251121/linkml/rules/`
**Purpose**: Data transformation and processing rules for achieving standardized values required by Heritage Custodian (HC) classes.
---
## About These Rules
These rules are **formally outside the LinkML schema convention** but document HOW data values are:
- Transformed
- Converted
- Processed
- Normalized
to achieve the standardized values required by particular HC classes.
**IMPORTANT**: These are NOT LinkML validation rules. They are **processing instructions** for data pipelines and extraction agents.
---
## Rule Categories
### 1. Name Standardization Rules
| Rule ID | File | Applies To | Summary |
|---------|------|------------|---------|
| **LEGAL-FORM-FILTER** | [`LEGAL_FORM_FILTER.md`](LEGAL_FORM_FILTER.md) | `CustodianName` | Remove legal form terms (Stichting, Foundation, Inc.) from emic names |
| **ABBREV-CHAR-FILTER** | [`ABBREVIATION_RULES.md`](ABBREVIATION_RULES.md) | GHCID abbreviation | Remove special characters (&, /, +, @) and normalize diacritics to ASCII |
| **TRANSLIT-ISO** | [`TRANSLITERATION.md`](TRANSLITERATION.md) | GHCID abbreviation | Transliterate non-Latin scripts (Cyrillic, CJK, Arabic) using ISO standards |
### 2. Geographic Standardization Rules
| Rule ID | File | Applies To | Summary |
|---------|------|------------|---------|
| **GEONAMES-SETTLEMENT** | [`GEONAMES_SETTLEMENT.md`](GEONAMES_SETTLEMENT.md) | Settlement codes | Use GeoNames as single source for settlement names |
| **FEATURE-CODE-FILTER** | [`GEONAMES_SETTLEMENT.md`](GEONAMES_SETTLEMENT.md) | Reverse geocoding | Only use PPL* feature codes, never PPLX (neighborhoods) |
### 3. Web Observation Rules
| Rule ID | File | Applies To | Summary |
|---------|------|------------|---------|
| **XPATH-PROVENANCE** | [`XPATH_PROVENANCE.md`](XPATH_PROVENANCE.md) | `WebClaim` | Every web claim MUST have XPath pointer to archived HTML |
### 4. Schema Evolution Rules
| Rule ID | File | Applies To | Summary |
|---------|------|------------|---------|
| **ENUM-TO-CLASS** | [`ENUM_TO_CLASS.md`](ENUM_TO_CLASS.md) | Enums/Classes | When enum promoted to class hierarchy, delete original enum |
---
## GLAMORCUBESFIXPHDNT Taxonomy Applicability
Each rule primarily applies to certain custodian types:
| Rule | Primary Types | All Types |
|------|--------------|-----------|
| LEGAL-FORM-FILTER | All | ✅ |
| ABBREV-CHAR-FILTER (special characters) | All | ✅ |
| ABBREV-CHAR-FILTER (diacritics) | All | ✅ |
| TRANSLIT-ISO | International (non-Latin script countries) | Partial |
| GEONAMES-SETTLEMENT | All | ✅ |
| XPATH-PROVENANCE | D (Digital platforms) | Partial |
---
## Integration with bronhouder.nl
These rules are displayed under a separate "Regels" (Rules) category on the bronhouder.nl LinkML visualization page, distinct from:
- Classes
- Slots
- Enums
- Instances
Each rule includes:
- Rule ID (short identifier)
- Applicable class(es)
- GLAMORCUBESFIXPHDNT type indicator
- Transformation examples
- Implementation code (Python)
---
## Rule Template
New rules should follow this template:
```markdown
# Rule Title
**Rule ID**: SHORT-ID
**Status**: MANDATORY | RECOMMENDED | OPTIONAL
**Applies To**: Class or slot name
**Created**: YYYY-MM-DD
**Updated**: YYYY-MM-DD
---
## Summary
One-paragraph summary of what this rule does.
---
## Rationale
Why this rule exists (numbered list of reasons).
---
## Specification
Detailed specification with examples.
---
## Implementation
Python code showing how to implement this rule.
---
## Examples
| Input | Output | Explanation |
|-------|--------|-------------|
---
## Related Rules
- Other related rules
---
## Changelog
| Date | Change |
|------|--------|
```
---
## File List
```
rules/
├── README.md # This file (rule index)
├── ABBREVIATION_RULES.md # ABBREV-CHAR-FILTER: Special char + diacritics normalization
├── LEGAL_FORM_FILTER.md # LEGAL-FORM-FILTER: Legal form removal from emic names
├── GEONAMES_SETTLEMENT.md # GEONAMES-SETTLEMENT: Geographic standardization via GeoNames
├── XPATH_PROVENANCE.md # XPATH-PROVENANCE: WebClaim XPath requirements
├── TRANSLITERATION.md # TRANSLIT-ISO: Non-Latin script transliteration
└── ENUM_TO_CLASS.md # ENUM-TO-CLASS: Schema evolution pattern
```

View file

@ -0,0 +1,337 @@
# Transliteration Standards for Non-Latin Scripts
**Rule ID**: TRANSLIT-ISO
**Status**: MANDATORY
**Applies To**: GHCID abbreviation generation from emic names in non-Latin scripts
**Created**: 2025-12-08
---
## Summary
**When generating GHCID abbreviations from institution names written in non-Latin scripts, the emic name MUST first be transliterated to Latin characters using the designated ISO or recognized standard for that script.**
This rule affects **170 institutions** across **21 languages** with non-Latin writing systems.
### Key Principles
1. **Emic name is preserved** - The original script is stored in `custodian_name.emic_name`
2. **Transliteration is for processing only** - Used to generate abbreviations
3. **ISO/recognized standards required** - No ad-hoc romanization
4. **Deterministic output** - Same input always produces same Latin output
5. **Existing GHCIDs grandfathered** - Only applies to NEW custodians
---
## Transliteration Standards by Script/Language
### Cyrillic Scripts
| Language | ISO Code | Standard | Library/Tool | Notes |
|----------|----------|----------|--------------|-------|
| **Russian** | ru | ISO 9:1995 | `transliterate` | Scientific transliteration |
| **Ukrainian** | uk | ISO 9:1995 | `transliterate` | Includes Ukrainian-specific letters |
| **Bulgarian** | bg | ISO 9:1995 | `transliterate` | Uses same Cyrillic base |
| **Serbian** | sr | ISO 9:1995 | `transliterate` | Serbian Cyrillic variant |
| **Kazakh** | kk | ISO 9:1995 | `transliterate` | Cyrillic-based (pre-2023) |
**Example**:
```
Input: Институт восточных рукописей РАН
ISO 9: Institut vostocnyh rukopisej RAN
Abbrev: IVRRAN (after diacritic normalization)
```
---
### CJK Scripts
#### Chinese (Hanzi)
| Variant | Standard | Library/Tool | Notes |
|---------|----------|--------------|-------|
| Simplified | Hanyu Pinyin (ISO 7098) | `pypinyin` | Standard PRC romanization |
| Traditional | Hanyu Pinyin | `pypinyin` | Same standard applies |
**Pinyin Rules**:
- Tone marks are OMITTED for abbreviation (diacritics removed anyway)
- Word boundaries follow natural spacing
- Proper nouns capitalized
**Example**:
```
Input: 东巴文化博物院
Pinyin: Dongba Wenhua Bowuyuan
ASCII: Dongba Wenhua Bowuyuan
Abbrev: DWB
```
#### Japanese (Kanji/Kana)
| Standard | Library/Tool | Notes |
|----------|--------------|-------|
| Modified Hepburn | `pykakasi`, `romkan` | Most widely used internationally |
**Hepburn Rules**:
- Long vowels: ō, ū (normalized to o, u for abbreviation)
- Particles: は (ha) → wa, を (wo) → o, へ (he) → e
- Syllabic ん: n (before vowels: n')
**Example**:
```
Input: 国立中央博物館
Romaji: Kokuritsu Chuo Hakubutsukan
ASCII: Kokuritsu Chuo Hakubutsukan
Abbrev: KCH
```
#### Korean (Hangul)
| Standard | Library/Tool | Notes |
|----------|--------------|-------|
| Revised Romanization (RR) | `korean-romanizer`, `hangul-romanize` | Official South Korean standard (2000) |
**RR Rules**:
- No diacritics (unlike McCune-Reischauer)
- Consonant assimilation reflected in spelling
- Word boundaries at natural breaks
**Example**:
```
Input: 독립기념관
RR: Dongnip Ginyeomgwan
Abbrev: DG
```
---
### Arabic Script
| Language | ISO Code | Standard | Library/Tool | Notes |
|----------|----------|----------|--------------|-------|
| **Arabic** | ar | ISO 233-2:1993 | `arabic-transliteration` | Simplified standard |
| **Persian/Farsi** | fa | ISO 233-3:1999 | `persian-transliteration` | Persian extensions |
| **Urdu** | ur | ISO 233-3 + Urdu extensions | `urdu-transliteration` | Additional characters |
**Example (Arabic)**:
```
Input: المكتبة الوطنية للمملكة المغربية
ISO: al-Maktaba al-Wataniya lil-Mamlaka al-Maghribiya
ASCII: al-Maktaba al-Wataniya lil-Mamlaka al-Maghribiya
Abbrev: MWMM (skip "al-" articles)
```
---
### Hebrew Script
| Standard | Library/Tool | Notes |
|----------|--------------|-------|
| ISO 259-3:1999 | `hebrew-transliteration` | Simplified romanization |
**Example**:
```
Input: ארכיון הסיפור העממי בישראל
ISO: Arkhiyon ha-Sipur ha-Amami be-Yisrael
ASCII: Arkhiyon ha-Sipur ha-Amami be-Yisrael
Abbrev: ASAY (skip "ha-" and "be-" articles)
```
---
### Greek Script
| Standard | Library/Tool | Notes |
|----------|--------------|-------|
| ISO 843:1997 | `greek-transliteration` | Romanization of Greek |
**Example**:
```
Input: Αρχαιολογικό Μουσείο Θεσσαλονίκης
ISO: Archaiologiko Mouseio Thessalonikis
ASCII: Archaiologiko Mouseio Thessalonikis
Abbrev: AMT
```
---
### Indic Scripts
| Language | Script | Standard | Library/Tool |
|----------|--------|----------|--------------|
| **Hindi** | Devanagari | ISO 15919 | `indic-transliteration` |
| **Bengali** | Bengali | ISO 15919 | `indic-transliteration` |
| **Nepali** | Devanagari | ISO 15919 | `indic-transliteration` |
| **Sinhala** | Sinhala | ISO 15919 | `indic-transliteration` |
**Example (Hindi)**:
```
Input: राजस्थान प्राच्यविद्या प्रतिष्ठान
ISO: Rajasthana Pracyavidya Pratishthana
ASCII: Rajasthana Pracyavidya Pratishthana
Abbrev: RPP
```
---
### Southeast Asian Scripts
| Language | Script | Standard | Library/Tool |
|----------|--------|----------|--------------|
| **Thai** | Thai | ISO 11940-2 | `thai-romanization` |
| **Khmer** | Khmer | ALA-LC | `khmer-romanization` |
**Thai Example**:
```
Input: สำนักหอจดหมายเหตุแห่งชาติ
ISO: Samnak Ho Chotmaihet Haeng Chat
Abbrev: SHCHC
```
---
### Other Scripts
| Language | Script | Standard | Library/Tool |
|----------|--------|----------|--------------|
| **Armenian** | Armenian | ISO 9985 | `armenian-transliteration` |
| **Georgian** | Georgian | ISO 9984 | `georgian-transliteration` |
**Georgian Example**:
```
Input: ხელნაწერთა ეროვნული ცენტრი
ISO: Khelnawerti Erovnuli Centri
ASCII: Khelnawerti Erovnuli Centri
Abbrev: KEC
```
---
## Implementation
### Python Transliteration Utility
```python
import unicodedata
from typing import Optional
def detect_script(text: str) -> str:
    """
    Detect the primary script of the input text.

    Returns the script name for the first non-Latin character found:
    one of 'latin', 'cyrillic', 'chinese', 'japanese', 'korean',
    'arabic', 'hebrew', 'greek', 'devanagari', 'thai'.
    Text with no character in any known range is reported as 'latin'.

    NOTE(review): CJK Unified Ideographs are reported as 'chinese', so
    Japanese text that begins with kanji (rather than kana) is also
    classified as 'chinese' — prefer an explicit language code upstream
    when it is known.
    """
    script_ranges = {
        'cyrillic': (0x0400, 0x04FF),
        'arabic': (0x0600, 0x06FF),
        'hebrew': (0x0590, 0x05FF),
        'devanagari': (0x0900, 0x097F),
        'thai': (0x0E00, 0x0E7F),
        'greek': (0x0370, 0x03FF),
        # Hiragana (U+3040-309F) + Katakana (U+30A0-30FF); without this
        # range, kana-only names fell through to 'latin'.
        'japanese': (0x3040, 0x30FF),
        'korean': (0xAC00, 0xD7AF),   # Hangul syllables
        'chinese': (0x4E00, 0x9FFF),  # CJK Unified Ideographs
    }
    for char in text:
        code = ord(char)
        for script, (start, end) in script_ranges.items():
            if start <= code <= end:
                return script
    return 'latin'
def transliterate_for_abbreviation(emic_name: str, lang: str) -> str:
    """
    Transliterate an emic (original-script) name for GHCID abbreviation
    generation.

    Args:
        emic_name: Institution name in its original script.
        lang: ISO 639-1 language code.

    Returns:
        A cleaned, ASCII-letters-and-spaces-only string ready for
        abbreviation extraction.
    """
    import re

    # 1. Romanize with the script-appropriate ISO standard; the actual
    #    per-script dispatch lives in the `transliterate` helper.
    romanized = transliterate(emic_name, lang)

    # 2. Strip diacritics: NFD-decompose, then drop combining marks (Mn).
    decomposed = unicodedata.normalize('NFD', romanized)
    ascii_only = ''.join(ch for ch in decomposed
                         if unicodedata.category(ch) != 'Mn')

    # 3. Turn every non-letter (except whitespace) into a space, then
    #    collapse whitespace runs into single spaces.
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', ascii_only)
    return ' '.join(letters_only.split())
```
---
## Skip Words by Language
When extracting abbreviations from transliterated text, skip these articles/prepositions:
### Arabic
- `al-` (the definite article)
- `bi-`, `li-`, `fi-` (prepositions)
### Hebrew
- `ha-` (the)
- `ve-` (and)
- `be-`, `le-`, `me-` (prepositions)
### Persian
- `-e`, `-ye` (ezafe connector)
- `va` (and)
### CJK Languages
- No skip words (particles are integral to meaning)
### Indic Languages
- `ka`, `ki`, `ke` (Hindi: of)
- `aur` (Hindi: and)
---
## Validation
### Check Transliteration Output
```python
def validate_transliteration(result: str) -> bool:
    """
    Check that a transliteration result is ready for abbreviation
    extraction: non-empty and made up solely of ASCII letters and
    whitespace.
    """
    import re
    return re.fullmatch(r'[a-zA-Z\s]+', result) is not None
```
### Manual Review Queue
Non-Latin institutions should be flagged for manual review if:
1. Transliteration library not available for that script
2. Confidence in transliteration is low
3. Institution has multiple official romanizations
---
## Related Documentation
- `AGENTS.md` - Rule 12: Transliteration Standards
- `rules/ABBREVIATION_RULES.md` - Character filtering after transliteration
- `docs/TRANSLITERATION_CONVENTIONS.md` - Extended examples and edge cases
- `scripts/transliterate_emic_names.py` - Production transliteration script
---
## Changelog
| Date | Change |
|------|--------|
| 2025-12-08 | Initial standards document created |

View file

@ -0,0 +1,210 @@
# WebObservation XPath Provenance Rules
**Rule ID**: XPATH-PROVENANCE
**Status**: MANDATORY
**Applies To**: WebClaim extraction from websites
**Created**: 2025-11-29
---
## Core Principle: Every Claim MUST Have Verifiable Provenance
**If a claim allegedly came from a webpage, it MUST have an XPath pointer to the exact location in the archived HTML where that value appears. Claims without XPath provenance are considered FABRICATED and must be removed.**
This is not about "confidence" or "uncertainty" - it's about **verifiability**. Either the claim value exists in the HTML at a specific XPath, or it was hallucinated/fabricated by an LLM.
---
## Required Fields for WebObservation Claims
Every claim in `web_enrichment.claims` MUST have:
| Field | Required | Description |
|-------|----------|-------------|
| `claim_type` | YES | Type of claim (full_name, description, email, etc.) |
| `claim_value` | YES | The extracted value |
| `source_url` | YES | URL the claim was extracted from |
| `retrieved_on` | YES | ISO 8601 timestamp when page was archived |
| `xpath` | YES | XPath to the element containing this value |
| `html_file` | YES | Relative path to archived HTML file |
| `xpath_match_score` | YES | 1.0 for exact match, <1.0 for fuzzy match |
### Example - CORRECT (Verifiable)
```yaml
web_enrichment:
claims:
- claim_type: full_name
claim_value: Historische Vereniging Nijeveen
source_url: https://historischeverenigingnijeveen.nl/
retrieved_on: "2025-11-29T12:28:00Z"
xpath: /[document][1]/html[1]/body[1]/div[6]/div[1]/table[3]/tbody[1]/tr[1]/td[1]/p[6]
html_file: web/0021/historischeverenigingnijeveen.nl/rendered.html
xpath_match_score: 1.0
```
### Example - WRONG (Fabricated - Must Be Removed)
```yaml
web_enrichment:
claims:
- claim_type: full_name
claim_value: Historische Vereniging Nijeveen
confidence: 0.95 # ← NO! This is meaningless without XPath
```
---
## Forbidden: Confidence Scores Without XPath
**NEVER use arbitrary confidence scores for web-extracted claims.**
Confidence scores like `0.95`, `0.90`, `0.85` are meaningless because:
1. There is NO methodology defining what these numbers mean
2. They cannot be verified or reproduced
3. They give false impression of rigor
4. They mask the fact that claims may be fabricated
If a value appears in the HTML → `xpath_match_score: 1.0`
If a value does NOT appear in the HTML → **REMOVE THE CLAIM**
---
## Website Archiving Workflow
### Step 1: Archive the Website
Use Playwright to archive websites with JavaScript rendering:
```bash
python scripts/fetch_website_playwright.py <entry_number> <url>
# Example:
python scripts/fetch_website_playwright.py 0021 https://historischeverenigingnijeveen.nl/
```
This creates:
```
data/nde/enriched/entries/web/{entry_number}/{domain}/
├── index.html # Raw HTML as received
├── rendered.html # HTML after JS execution
├── content.md # Markdown conversion
└── metadata.yaml # XPath extractions for provenance
```
### Step 2: Add XPath Provenance to Claims
Run the XPath migration script:
```bash
python scripts/add_xpath_provenance.py
# Or for specific entries:
python scripts/add_xpath_provenance.py --entries 0021,0022,0023
```
This script:
1. Reads each entry's `web_enrichment.claims`
2. Searches archived HTML for each claim value
3. Adds `xpath` + `html_file` if found
4. **REMOVES claims that cannot be verified** (stores in `removed_unverified_claims`)
### Step 3: Audit Removed Claims
Check `removed_unverified_claims` in each entry file:
```yaml
removed_unverified_claims:
- claim_type: phone
claim_value: "+31 6 12345678"
reason: "Value not found in archived HTML - likely fabricated"
removed_on: "2025-11-29T14:30:00Z"
```
These claims were NOT in the HTML and should NOT be restored without proper sourcing.
---
## Claim Types and Expected Sources
| Claim Type | Expected Source | Notes |
|------------|-----------------|-------|
| `full_name` | Page title, heading, logo text | Usually in `<h1>`, `<title>`, or prominent `<div>` |
| `description` | Meta description, about text | Check `<meta name="description">` first |
| `email` | Contact page, footer | Often in `<a href="mailto:...">` |
| `phone` | Contact page, footer | May need normalization |
| `address` | Contact page, footer | Check for structured data too |
| `social_media` | Footer, contact page | Links to social platforms |
| `opening_hours` | Contact/visit page | May be in structured data |
---
## XPath Matching Strategy
The `add_xpath_provenance.py` script uses this matching strategy:
1. **Exact match**: Claim value appears exactly in element text
2. **Normalized match**: After whitespace normalization
3. **Substring match**: Claim value is substring of element text (score < 1.0)
Priority order for matching:
1. `rendered.html` (after JS execution) - preferred
2. `index.html` (raw HTML) - fallback
---
## Integration with LinkML Schema
The `WebClaim` class in the LinkML schema requires:
```yaml
# schemas/20251121/linkml/modules/classes/WebClaim.yaml
WebClaim:
slots:
- source_url # Required
- retrieved_on # Required (timestamp)
- xpath # Required for claims
- html_archive_path # Path to archived HTML
```
---
## Rules for AI Agents
### When Extracting Claims from Websites
1. **ALWAYS archive the website first** using Playwright
2. **ALWAYS extract claims with XPath provenance** using the archived HTML
3. **NEVER invent or infer claims** not present in the HTML
4. **NEVER use confidence scores** without XPath backing
### When Processing Existing Claims
1. **Verify each claim** against archived HTML
2. **Add XPath provenance** to verified claims
3. **REMOVE fabricated claims** that cannot be verified
4. **Document removed claims** in `removed_unverified_claims`
### When Reviewing Data Quality
1. Claims with `xpath` + `html_file` = **VERIFIED**
2. Claims with only `confidence` = **SUSPECT** (migrate or remove)
3. Claims in `removed_unverified_claims` = **FABRICATED** (do not restore)
---
## Scripts Reference
| Script | Purpose |
|--------|---------|
| `scripts/fetch_website_playwright.py` | Archive website with Playwright |
| `scripts/add_xpath_provenance.py` | Add XPath to claims, remove fabricated |
| `scripts/batch_fetch_websites.py` | Batch archive multiple entries |
---
## Version History
- **2025-11-29**: Initial version - established XPath provenance requirement
- Replaced confidence scores with verifiable XPath pointers
- Established policy of removing fabricated claims

View file

@ -0,0 +1,124 @@
```mermaid
%%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#e3f2fd', 'primaryTextColor': '#1565c0', 'primaryBorderColor': '#1565c0', 'lineColor': '#424242', 'secondaryColor': '#fff3e0', 'tertiaryColor': '#e8f5e9'}}}%%
graph TB
%% Heritage Custodian Records Lifecycle
%% Generated: 2025-12-09 13:12:05
%% Three-tier model: Administration → Archive → Collection
%% For bronhouder.nl visual representation
subgraph "PHASE 1: ACTIVE RECORDS"
direction TB
ADMIN["<b>CustodianAdministration</b><br/><i>rico:RecordResource</i><br/>━━━━━━━━━━━━━━━━━<br/>ACTIVE records in daily use<br/>• Current correspondence<br/>• Personnel files<br/>• Financial records<br/>• Digital files on shared drives<br/>• Email, databases<br/>━━━━━━━━━━━━━━━━━<br/>Managed by: Business units<br/>Retention: Per schedule"]
style ADMIN fill:#c8e6c9,stroke:#2e7d32,stroke-width:3px
end
subgraph "PHASE 2: INACTIVE ARCHIVES"
direction TB
ARCHIVE["<b>CustodianArchive</b><br/><i>rico:RecordSet</i><br/>━━━━━━━━━━━━━━━━━<br/>INACTIVE records awaiting processing<br/>• Transferred from administration<br/>• In BACKLOG (may wait DECADES)<br/>• Basic accession-level description<br/>• NOT searchable by researchers<br/>• Tracked in CMS for inventory<br/>━━━━━━━━━━━━━━━━━<br/>Managed by: Archives staff<br/>Status: ArchiveProcessingStatusEnum"]
style ARCHIVE fill:#fff9c4,stroke:#f9a825,stroke-width:3px
end
subgraph "PHASE 3: HERITAGE COLLECTION"
direction TB
COLLECTION["<b>CustodianCollection</b><br/><i>crm:E78_Curated_Holding</i><br/>━━━━━━━━━━━━━━━━━<br/>PROCESSED heritage collection<br/>• Full finding aid available<br/>• Searchable by researchers<br/>• Arranged per archival standards<br/>• Integrated into public collection<br/>• Managed as cultural heritage<br/>━━━━━━━━━━━━━━━━━<br/>Managed by: Curators<br/>Access: Public/Restricted"]
style COLLECTION fill:#bbdefb,stroke:#1565c0,stroke-width:3px
end
%% Transitions between phases
ADMIN -->|"<b>TRANSFER</b><br/>Retention period ends<br/>Records closed<br/>prov:wasGeneratedBy"| ARCHIVE
ARCHIVE -->|"<b>PROCESSING</b><br/>Appraisal complete<br/>Finding aid created<br/>prov:hadDerivation"| COLLECTION
%% Lifecycle Type Classifications (SKOS)
subgraph "Archive Lifecycle Types (Wikidata)"
direction LR
TYPE_CURRENT["<b>CurrentArchive</b><br/>Q3621648<br/><i>Active phase</i>"]
TYPE_DEPOSIT["<b>DepositArchive</b><br/>Q244904<br/><i>Semi-current phase</i>"]
TYPE_HISTORICAL["<b>HistoricalArchive</b><br/>Q3621673<br/><i>Archival phase</i>"]
style TYPE_CURRENT fill:#c8e6c9,stroke:#2e7d32,stroke-width:2px,stroke-dasharray: 5 5
style TYPE_DEPOSIT fill:#fff9c4,stroke:#f9a825,stroke-width:2px,stroke-dasharray: 5 5
style TYPE_HISTORICAL fill:#bbdefb,stroke:#1565c0,stroke-width:2px,stroke-dasharray: 5 5
end
%% Type classifications link to phases
TYPE_CURRENT -.->|skos:narrower| ADMIN
TYPE_DEPOSIT -.->|skos:narrower| ARCHIVE
TYPE_HISTORICAL -.->|skos:narrower| COLLECTION
%% Timeline example
subgraph "Example: Ministry Records"
direction TB
EX_TIMELINE["<b>Temporal Reality</b><br/>━━━━━━━━━━━━━━━━━━━━━━━━<br/>2010-2020: Created (Administration)<br/>2021: Transferred to Archives<br/>2021-2024: In processing backlog<br/>2024: Archivist assigned<br/>2025: Finding aid complete<br/>2025: Available to researchers<br/>━━━━━━━━━━━━━━━━━━━━━━━━<br/><i>Total processing time: 4 years</i><br/><i>(Large archives: 30-50 years)</i>"]
style EX_TIMELINE fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
end
%% Processing Status Enum connection
subgraph "Processing Status"
direction LR
STATUS["<b>ArchiveProcessingStatusEnum</b><br/>UNPROCESSED → IN_APPRAISAL →<br/>IN_ARRANGEMENT → IN_DESCRIPTION →<br/>PROCESSED_PENDING_TRANSFER →<br/>TRANSFERRED_TO_COLLECTION"]
style STATUS fill:#e0e0e0,stroke:#616161,stroke-width:1px
end
ARCHIVE -.->|processing_status| STATUS
%% Custodian Hub connection
subgraph "Central Entity"
HUB["<b>Custodian</b><br/>(Hub Entity)<br/>All records belong to<br/>one heritage institution"]
style HUB fill:#ffeb3b,stroke:#f57f17,stroke-width:4px
end
ADMIN -.->|"refers_to_custodian<br/>crm:P46i"| HUB
ARCHIVE -.->|"refers_to_custodian<br/>crm:P46i"| HUB
COLLECTION -.->|"refers_to_custodian<br/>crm:P46i"| HUB
%% Legend
subgraph "Legend"
direction LR
L1["Solid arrow = Data flow/transition"]
L2["Dashed arrow = Reference/classification"]
L3["Green = Active | Yellow = Processing | Blue = Archived"]
end
```
---
## Records Lifecycle Model
This diagram shows the three-tier model for heritage custodian records:
### Phase 1: CustodianAdministration (Active Records)
- **Ontology**: `rico:RecordResource`
- **Status**: In daily operational use
- **Managed by**: Business units (not archives staff)
- **Examples**: Current correspondence, personnel files, financial records
### Phase 2: CustodianArchive (Inactive Archives)
- **Ontology**: `rico:RecordSet`
- **Status**: Awaiting archival processing (often DECADES)
- **Managed by**: Archives staff
- **Tracking**: `ArchiveProcessingStatusEnum`
- **Key insight**: NOT yet searchable by researchers
### Phase 3: CustodianCollection (Heritage Collection)
- **Ontology**: `crm:E78_Curated_Holding`
- **Status**: Fully processed, public/restricted access
- **Managed by**: Curators
- **Features**: Full finding aid, integrated into heritage collection
### Key Relationships
- `prov:wasGeneratedBy`: Links archive to transfer activity
- `prov:hadDerivation`: Links archive to resulting collection
- `crm:P46i_forms_part_of`: All phases belong to same Custodian hub
### Lifecycle Type Classifications (SKOS/Wikidata)
- **CurrentArchive** (Q3621648): Active records phase TYPE
- **DepositArchive** (Q244904): Semi-current/intermediate phase TYPE
- **HistoricalArchive** (Q3621673): Permanent archival phase TYPE
These are TYPE classifications (skos:Concept) that can be applied to INSTANCE records via `lifecycle_phase_type` slot using `skos:broaderTransitive`.

View file

@ -0,0 +1,388 @@
#!/usr/bin/env python3
"""
Geocode Missing Coordinates from GeoNames Database
This script geocodes custodian files that are missing coordinates using the local
GeoNames database. It's much faster than API-based geocoding (no rate limits).
Features:
- Uses local GeoNames SQLite database for instant lookups
- Fuzzy matching for city names
- Updates files in-place preserving YAML structure
- Batch processing with progress tracking
- Safe updates (additive only, preserves existing data)
Usage:
python scripts/geocode_missing_from_geonames.py --dry-run
python scripts/geocode_missing_from_geonames.py --country JP --limit 100
python scripts/geocode_missing_from_geonames.py --all
"""
import argparse
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import unicodedata
from ruamel.yaml import YAML
# Setup ruamel.yaml for round-trip preservation
yaml = YAML()
yaml.preserve_quotes = True
yaml.width = 120
# Configuration
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")
def normalize_city_name(name: Optional[str]) -> str:
    """Normalize a city name for fuzzy matching against GeoNames.

    Lowercases, strips surrounding whitespace, removes diacritics, and
    drops one trailing romanized Japanese administrative suffix
    (e.g. 市 shi, 区 ku, 町 machi, 県 ken). Returns "" for None/empty.
    """
    if not name:
        return ""
    # NFD decomposition splits accents into combining marks (Mn), which
    # are then dropped to get a plain-ASCII comparison form.
    folded = ''.join(
        ch for ch in unicodedata.normalize('NFD', name)
        if unicodedata.category(ch) != 'Mn'
    ).lower().strip()
    # Romanized forms of Japanese administrative units, attached either
    # with a space or a hyphen; at most one suffix is removed.
    for sep in (' ', '-'):
        for unit in ('shi', 'ku', 'machi', 'cho', 'ken', 'gun', 'son', 'mura'):
            suffix = sep + unit
            if folded.endswith(suffix):
                return folded[:-len(suffix)]
    return folded
class GeoNamesLookup:
    """Fast city coordinate lookup from the local GeoNames database."""

    # Columns returned by every lookup query.
    _COLUMNS = ("geonames_id, name, ascii_name, latitude, longitude, "
                "admin1_code, admin1_name, feature_code, population")

    def __init__(self, db_path: Path):
        self.conn = sqlite3.connect(db_path)
        self.conn.row_factory = sqlite3.Row

    def _first_match(self, where: str, params: tuple):
        """Run one lookup attempt; return the most populous match or None."""
        sql = (f"SELECT {self._COLUMNS} FROM cities "
               f"WHERE country_code = ? AND ({where}) "
               "ORDER BY population DESC LIMIT 1")
        return self.conn.execute(sql, params).fetchone()

    def lookup_city(self, city: str, country_code: str, region: str = None) -> Optional[dict]:
        """
        Look up city coordinates in the GeoNames database.

        Tries, in order: normalized case-insensitive exact match, raw-name
        exact match (for non-ASCII names), then a LIKE substring match.
        Ties are broken by population (largest wins).

        Returns a dict with latitude, longitude, geonames_id, etc.,
        or None if not found.
        """
        if not city or not country_code:
            return None

        city_norm = normalize_city_name(city)
        country_code = country_code.upper()

        attempts = (
            ("LOWER(name) = ? OR LOWER(ascii_name) = ?",
             (city_norm or "", city_norm or "")),
            ("name = ? OR ascii_name = ?",
             (city, city)),
            ("LOWER(name) LIKE ? OR LOWER(ascii_name) LIKE ?",
             (f"%{city_norm}%", f"%{city_norm}%")),
        )
        for where, params in attempts:
            row = self._first_match(where, (country_code,) + params)
            if row is not None:
                return self._row_to_dict(row)
        return None

    def _row_to_dict(self, row) -> dict:
        """Convert a database row to a plain dict ('name' → 'geonames_name')."""
        rename = {'name': 'geonames_name'}
        keys = ('geonames_id', 'name', 'latitude', 'longitude',
                'admin1_code', 'admin1_name', 'feature_code', 'population')
        return {rename.get(key, key): row[key] for key in keys}

    def close(self):
        self.conn.close()
def extract_city_country(data: dict) -> tuple[Optional[str], Optional[str]]:
    """Extract (city, country) from a custodian record.

    Lookup order: top-level ``location`` block, then
    ``ghcid.location_resolution``, then ``original_entry.locations[0]``,
    and finally the two-letter country prefix of the current GHCID.
    Either element may be None if no source provides it.
    """
    city: Optional[str] = None
    country: Optional[str] = None

    # Preferred source: the curated location block.
    loc = data.get('location', {})
    if loc:
        city = loc.get('city')
        country = loc.get('country')

    # Fallback: GHCID location resolution (several candidate city keys).
    if not city:
        ghcid_loc = data.get('ghcid', {}).get('location_resolution', {})
        if ghcid_loc:
            city = (ghcid_loc.get('city_name') or
                    ghcid_loc.get('city_label') or
                    ghcid_loc.get('geonames_name') or
                    ghcid_loc.get('google_maps_locality'))
            if not country:
                country = ghcid_loc.get('country_code')

    # Fallback: first location of the original source entry.
    if not city:
        orig_locs = data.get('original_entry', {}).get('locations', [])
        if orig_locs:
            city = orig_locs[0].get('city')
            # BUG FIX: only fill country if still unknown — previously this
            # unconditionally overwrote a country found earlier (possibly
            # with None).
            if not country:
                country = orig_locs[0].get('country')

    # Last resort: infer country from the GHCID's two-letter prefix.
    if not country:
        ghcid = data.get('ghcid', {}).get('ghcid_current', '')
        if ghcid and len(ghcid) >= 2:
            country = ghcid[:2]

    return city, country
def geocode_file(filepath: Path, geonames: GeoNamesLookup, dry_run: bool = False) -> dict:
    """Geocode a single custodian YAML file against the GeoNames database.

    Args:
        filepath: Path to the custodian YAML document.
        geonames: Open GeoNames lookup handle.
        dry_run: When True, resolve coordinates but never write the file.

    Returns:
        Result dictionary with keys:
        - success: bool (False only for hard failures)
        - geocoded: bool (True if coordinates were added)
        - already_has_coords: bool
        - city / country: the values used for the lookup (or None)
        - error: str or None
    """
    outcome = {
        'success': False,
        'geocoded': False,
        'already_has_coords': False,
        'city': None,
        'country': None,
        'error': None,
    }
    try:
        with open(filepath, 'r', encoding='utf-8') as fh:
            doc = yaml.load(fh)
        if not isinstance(doc, dict):
            outcome['error'] = "Invalid YAML structure"
            return outcome

        # Skip files that already carry coordinates.
        existing = doc.get('location', {})
        if (existing.get('latitude') is not None
                and existing.get('longitude') is not None):
            outcome['success'] = True
            outcome['already_has_coords'] = True
            return outcome

        # Determine what to look up.
        city, country = extract_city_country(doc)
        outcome['city'], outcome['country'] = city, country
        if not city or not country:
            outcome['error'] = f"Missing city ({city}) or country ({country})"
            outcome['success'] = True  # Not an error, just no data to geocode
            return outcome

        hit = geonames.lookup_city(city, country)
        if not hit:
            outcome['error'] = f"City not found in GeoNames: {city}, {country}"
            outcome['success'] = True  # Not a fatal error
            return outcome

        # Write coordinates plus provenance into the location block.
        loc_block = doc.setdefault('location', {})
        loc_block['latitude'] = hit['latitude']
        loc_block['longitude'] = hit['longitude']
        loc_block['coordinate_provenance'] = {
            'source_type': 'GEONAMES_LOCAL',
            'source_path': 'data/reference/geonames.db',
            'entity_id': hit['geonames_id'],
            'original_timestamp': datetime.now(timezone.utc).isoformat()
        }
        # Backfill GeoNames reference fields only when they are absent.
        for field, value in (('geonames_id', hit['geonames_id']),
                             ('geonames_name', hit['geonames_name']),
                             ('feature_code', hit['feature_code'])):
            if not loc_block.get(field):
                loc_block[field] = value
        loc_block['normalization_timestamp'] = datetime.now(timezone.utc).isoformat()

        if not dry_run:
            with open(filepath, 'w', encoding='utf-8') as fh:
                yaml.dump(doc, fh)
        outcome['success'] = True
        outcome['geocoded'] = True
        return outcome
    except Exception as exc:
        outcome['error'] = str(exc)
        return outcome
def main():
    """CLI entry point: geocode custodian YAML files that lack coordinates.

    Parses command-line options, selects custodian files (optionally by
    country prefix and/or limited in number), geocodes each one via the
    local GeoNames database, and prints a summary with per-country counts,
    unresolved cities, and errors.

    Returns:
        Process exit code: 0 on success, 1 when the GeoNames database
        is missing.
    """
    parser = argparse.ArgumentParser(
        description="Geocode missing coordinates using GeoNames database"
    )
    parser.add_argument('--dry-run', action='store_true', help="Preview without writing")
    parser.add_argument('--country', type=str, help="Only process specific country code (e.g., JP)")
    parser.add_argument('--limit', type=int, default=0, help="Limit number of files to process")
    parser.add_argument('--all', action='store_true', help="Process all files (no limit)")
    parser.add_argument('--verbose', action='store_true', help="Show detailed output")
    args = parser.parse_args()
    if args.dry_run:
        print("DRY RUN - No files will be modified\n")
    # Abort early when the GeoNames database has not been built yet.
    if not GEONAMES_DB.exists():
        print(f"Error: GeoNames database not found at {GEONAMES_DB}")
        return 1
    geonames = GeoNamesLookup(GEONAMES_DB)
    # Select the custodian files to process.
    if args.country:
        pattern = f"{args.country.upper()}-*.yaml"
        files = sorted(CUSTODIAN_DIR.glob(pattern))
        print(f"Processing {args.country.upper()} files: {len(files)} found")
    else:
        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
        print(f"Processing all files: {len(files)} found")
    if args.limit and not args.all:
        files = files[:args.limit]
        print(f"Limited to first {args.limit} files")
    # Statistics
    stats = {
        'total': len(files),
        'geocoded': 0,
        'already_has_coords': 0,
        'no_city_data': 0,
        'not_found': 0,
        'errors': 0,
        'by_country': {}
    }
    errors = []
    not_found = []
    for i, filepath in enumerate(files):
        result = geocode_file(filepath, geonames, dry_run=args.dry_run)
        # The country code is the filename prefix (e.g. "JP-....yaml").
        country = filepath.name[:2]
        if country not in stats['by_country']:
            stats['by_country'][country] = {'geocoded': 0, 'not_found': 0}
        if result['geocoded']:
            stats['geocoded'] += 1
            stats['by_country'][country]['geocoded'] += 1
        elif result['already_has_coords']:
            stats['already_has_coords'] += 1
        elif result['error'] and 'Missing city' in result['error']:
            stats['no_city_data'] += 1
        elif result['error'] and 'not found in GeoNames' in result['error']:
            stats['not_found'] += 1
            stats['by_country'][country]['not_found'] += 1
            if len(not_found) < 100:
                not_found.append((filepath.name, result['city'], result['country']))
        elif result['error']:
            stats['errors'] += 1
            if len(errors) < 20:
                errors.append((filepath.name, result['error']))
        if args.verbose:
            status = "GEOCODED" if result['geocoded'] else "SKIP" if result['already_has_coords'] else "FAIL"
            print(f"[{i+1}/{len(files)}] {filepath.name}: {status}")
        elif (i + 1) % 1000 == 0:
            print(f"Processed {i+1}/{len(files)} files... (geocoded: {stats['geocoded']})")
    # Print summary
    print("\n" + "=" * 60)
    print("GEOCODING SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {stats['total']}")
    print(f"Already had coordinates: {stats['already_has_coords']}")
    print(f"Successfully geocoded: {stats['geocoded']}")
    print(f"No city data available: {stats['no_city_data']}")
    print(f"City not found in GeoNames: {stats['not_found']}")
    print(f"Errors: {stats['errors']}")
    if stats['by_country']:
        print("\nResults by country:")
        for country, data in sorted(stats['by_country'].items(), key=lambda x: -x[1]['geocoded']):
            if data['geocoded'] > 0 or data['not_found'] > 0:
                print(f"  {country}: geocoded={data['geocoded']}, not_found={data['not_found']}")
    if not_found:
        # Bug fix: report the actual filename (the loop variable was unpacked
        # but a literal "(unknown)" placeholder was printed), and make the
        # header count match the number of entries actually shown (<= 20,
        # while the list itself is capped at 100).
        shown = not_found[:20]
        print(f"\nFirst {len(shown)} cities not found:")
        for filename, city, country in shown:
            print(f"  {filename}: {city}, {country}")
    if errors:
        # Bug fix: same placeholder problem — print the real filename.
        print(f"\nFirst {len(errors)} errors:")
        for filename, error in errors:
            print(f"  {filename}: {error}")
    if args.dry_run:
        print("\n(DRY RUN - No files were modified)")
    geonames.close()
    return 0
if __name__ == "__main__":
    # Raise SystemExit directly instead of calling the built-in exit():
    # exit() is injected by the `site` module and is not guaranteed to be
    # available (e.g. when running with `python -S`).
    raise SystemExit(main())

View file

@ -399,9 +399,26 @@ def extract_top_level_fields(data: dict) -> dict:
# Extract wikidata inception/dissolution
wd = data.get("wikidata_enrichment", {})
if wd:
record["wikidata_inception"] = wd.get("wikidata_inception", "") or wd.get("wikidata_founded", "")
if wd.get("wikidata_dissolution") and not record["dissolution_date"]:
record["dissolution_date"] = wd.get("wikidata_dissolution", "") or wd.get("wikidata_dissolved", "")
# Try multiple paths for inception date
wikidata_inception = (
wd.get("wikidata_inception", "") or
wd.get("wikidata_founded", "") or
wd.get("wikidata_temporal", {}).get("inception", "")
)
record["wikidata_inception"] = wikidata_inception
# Use wikidata_inception as founding_date fallback
if wikidata_inception and not record["founding_date"]:
record["founding_date"] = wikidata_inception
# Try multiple paths for dissolution date
wikidata_dissolution = (
wd.get("wikidata_dissolution", "") or
wd.get("wikidata_dissolved", "")
)
if wikidata_dissolution and not record["dissolution_date"]:
record["dissolution_date"] = wikidata_dissolution
record["wikidata_enrichment_json"] = json.dumps(wd, ensure_ascii=False, default=str)
# Extract service_area