clean up GHCID

This commit is contained in:
kempersc 2025-12-17 11:58:40 +01:00
parent 99430c2a70
commit 23b1d8ee5f
75 changed files with 13132 additions and 7905 deletions

1
.gitignore vendored
View file

@ -26,6 +26,7 @@ MANIFEST
docs/invoice
data/custodian/web/bu
data/custodian/weboj
data/custodian/person/affiliated/manual/
# Virtual environments
venv/

View file

@ -86,11 +86,16 @@ select_visualization: Any = None
VisualizationSelector: Any = None # type: ignore[no-redef]
generate_sparql: Any = None
configure_dspy: Any = None
get_province_code: Any = None # Province name to ISO 3166-2 code converter
try:
import sys
sys.path.insert(0, str(os.path.join(os.path.dirname(__file__), "..", "..", "src")))
from glam_extractor.api.hybrid_retriever import HybridRetriever as _HybridRetriever, create_hybrid_retriever as _create_hybrid_retriever
from glam_extractor.api.hybrid_retriever import (
HybridRetriever as _HybridRetriever,
create_hybrid_retriever as _create_hybrid_retriever,
get_province_code as _get_province_code,
)
from glam_extractor.api.qdrant_retriever import HeritageCustodianRetriever as _HeritageCustodianRetriever
from glam_extractor.api.typedb_retriever import TypeDBRetriever as _TypeDBRetriever, create_typedb_retriever as _create_typedb_retriever
from glam_extractor.api.visualization import select_visualization as _select_visualization, VisualizationSelector as _VisualizationSelector
@ -100,9 +105,14 @@ try:
create_typedb_retriever = _create_typedb_retriever
select_visualization = _select_visualization
VisualizationSelector = _VisualizationSelector
get_province_code = _get_province_code
RETRIEVERS_AVAILABLE = True
except ImportError as e:
logger.warning(f"Core retrievers not available: {e}")
# Provide a fallback get_province_code that returns None
def get_province_code(province_name: str | None) -> str | None:
"""Fallback when hybrid_retriever is not available."""
return None
# DSPy is optional - don't block retrievers if it's missing
try:
@ -112,6 +122,17 @@ try:
except ImportError as e:
logger.warning(f"DSPy SPARQL not available: {e}")
# Optional atomic query decomposer (used for geographic/type filtering).
# Both module-level names are always defined: they keep their "unavailable"
# defaults when the import fails, so callers can feature-test them.
decompose_query: Any = None
DECOMPOSER_AVAILABLE = False
try:
    from atomic_decomposer import decompose_query as _decompose_query
except ImportError as exc:
    # The decomposer is optional; log at info level and continue without it.
    logger.info(f"Query decomposer not available: {exc}")
else:
    decompose_query = _decompose_query
    DECOMPOSER_AVAILABLE = True
    logger.info("Query decomposer loaded successfully")
# Cost tracker is optional - gracefully degrades if unavailable
COST_TRACKER_AVAILABLE = False
get_tracker = None
@ -126,6 +147,98 @@ except ImportError as e:
logger.info(f"Cost tracker not available (optional): {e}")
# Province detection for geographic filtering.
# Lower-case Dutch and English spellings of the Dutch provinces.
DUTCH_PROVINCES = {
    "noord-holland", "noordholland", "north holland", "north-holland",
    "zuid-holland", "zuidholland", "south holland", "south-holland",
    "utrecht", "gelderland", "noord-brabant", "noordbrabant", "brabant",
    "north brabant", "limburg", "overijssel", "friesland", "fryslân",
    "fryslan", "groningen", "drenthe", "flevoland", "zeeland",
}

# Sub-provincial regions: neither a province nor a single city.
_DUTCH_REGIONS = {"randstad", "veluwe", "achterhoek", "twente", "de betuwe", "betuwe"}


def _location_key(name: str) -> str:
    """Return a lookup key for *name* that ignores case, hyphens and whitespace."""
    # "Noord-Holland", "noord holland" and "NoordHolland" all map to the
    # same key, so every spelling variant does not have to be listed.
    return "".join(name.casefold().replace("-", " ").split())


# Computed once at import time; later mutation of DUTCH_PROVINCES is not
# reflected here.
_PROVINCE_KEYS = frozenset(_location_key(name) for name in DUTCH_PROVINCES)
_REGION_KEYS = frozenset(_location_key(name) for name in _DUTCH_REGIONS)


def infer_location_level(location: str) -> str:
    """Infer whether a location name is a city, a province, or a region.

    Matching ignores case, surrounding whitespace, and whether the words of
    the name are separated by hyphens, spaces, or nothing at all.

    Args:
        location: Free-text location name, e.g. "Noord-Holland" or "Workum".

    Returns:
        'province' if the name matches an entry of DUTCH_PROVINCES,
        'region' if it matches a known sub-provincial region,
        'city' otherwise (including for an empty string).
    """
    key = _location_key(location)
    # Provinces are checked first, so a name listed as a province (for
    # example "utrecht" or "groningen") is always reported as 'province'.
    if key in _PROVINCE_KEYS:
        return "province"
    if key in _REGION_KEYS:
        return "region"
    return "city"
# Institution-type words (Dutch/English, singular/plural) mapped to the
# upper-case labels used as filter values. Built once instead of per call.
_INSTITUTION_TYPE_ALIASES = {
    "archive": "ARCHIVE",
    "archief": "ARCHIVE",
    "archieven": "ARCHIVE",
    "museum": "MUSEUM",
    "musea": "MUSEUM",
    "museums": "MUSEUM",
    "library": "LIBRARY",
    "bibliotheek": "LIBRARY",
    "bibliotheken": "LIBRARY",
    "gallery": "GALLERY",
    "galerie": "GALLERY",
}


def extract_geographic_filters(question: str) -> dict[str, list[str] | None]:
    """Extract geographic and institution-type filters from a question.

    The question is passed to the optional query decomposer; its ``location``
    and ``institution_type`` attributes are turned into filter lists.

    Args:
        question: The user's natural-language question.

    Returns:
        dict with keys ``region_codes``, ``cities`` and ``institution_types``.
        Each value is a one-element list when a filter was detected, else
        None. All values are None when the decomposer is unavailable.
        If decomposition raises, a warning is logged and whatever was
        extracted up to that point is returned.
    """
    filters: dict[str, list[str] | None] = {
        "region_codes": None,
        "cities": None,
        "institution_types": None,
    }
    if not DECOMPOSER_AVAILABLE or not decompose_query:
        return filters

    try:
        decomposed = decompose_query(question)

        # Use the same trimmed value for classification, code lookup and
        # the city filter, so stray whitespace cannot end up in a filter.
        location = (decomposed.location or "").strip()
        if location:
            level = infer_location_level(location)
            if level == "province":
                # Convert the province name to its code for filtering,
                # e.g. "Noord-Holland" → "NH". No filter is set when the
                # lookup yields nothing.
                province_code = get_province_code(location)
                if province_code:
                    filters["region_codes"] = [province_code]
                    logger.info(f"Province filter: {location} → {province_code}")
            elif level == "city":
                filters["cities"] = [location]
                logger.info(f"City filter: {location}")
            # level == "region": no filter is produced for sub-provincial
            # regions.

        inst_type = (decomposed.institution_type or "").strip().lower()
        if inst_type:
            # Unknown type words are passed through upper-cased.
            mapped_type = _INSTITUTION_TYPE_ALIASES.get(inst_type, inst_type.upper())
            filters["institution_types"] = [mapped_type]
            logger.info(f"Institution type filter: {mapped_type}")
    except Exception as e:
        # Deliberately best-effort: filter extraction must never fail the
        # query itself, so any decomposer error only costs the filters.
        logger.warning(f"Failed to extract geographic filters: {e}")
    return filters
# Configuration
class Settings:
"""Application settings from environment variables."""
@ -1235,12 +1348,20 @@ async def query_rag(request: QueryRequest) -> QueryResponse:
intent, sources = retriever.router.get_sources(request.question, request.sources)
logger.info(f"Query intent: {intent}, sources: {sources}")
# Extract geographic filters from question (province, city, institution type)
geo_filters = extract_geographic_filters(request.question)
if any(geo_filters.values()):
logger.info(f"Geographic filters extracted: {geo_filters}")
# Retrieve from all sources
results = await retriever.retrieve(
request.question,
sources,
request.k,
embedding_model=request.embedding_model,
region_codes=geo_filters["region_codes"],
cities=geo_filters["cities"],
institution_types=geo_filters["institution_types"],
)
# Merge results
@ -1664,10 +1785,14 @@ async def stream_query_response(
# Route query
intent, sources = retriever.router.get_sources(request.question, request.sources)
# Extract geographic filters from question (province, city, institution type)
geo_filters = extract_geographic_filters(request.question)
yield json.dumps({
"type": "status",
"message": f"Routing query to {len(sources)} sources...",
"intent": intent.value,
"geo_filters": {k: v for k, v in geo_filters.items() if v},
}) + "\n"
# Retrieve from sources and stream progress
@ -1683,6 +1808,9 @@ async def stream_query_response(
[source],
request.k,
embedding_model=request.embedding_model,
region_codes=geo_filters["region_codes"],
cities=geo_filters["cities"],
institution_types=geo_filters["institution_types"],
)
results.extend(source_results)

View file

@ -21,28 +21,39 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/v-z-w-archief-en-documentatiecentrum-erfgoed-binnenvaart.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
country: NL
city: Oudenburg
region: West-Vlaanderen
country: BE
address: Vaartdijk zuid 11, 8460 Oudenburg (aboard Museumschip Tordino)
ghcid:
ghcid_current: NL-XX-XXX-A-VZWADEB
ghcid_original: NL-XX-XXX-A-VZWADEB
ghcid_uuid: 1f4e98ec-143f-5448-90ef-a295fd4a1a6e
ghcid_uuid_sha256: f6cabc91-9be6-875f-ab66-ffc5be964330
ghcid_numeric: 17783233412197709663
ghcid_current: BE-VWV-OUD-A-VZWADEB
ghcid_original: BE-VWV-OUD-A-VZWADEB
ghcid_uuid: 2975de6a-8d00-51ac-8ef7-238dc217515a
ghcid_uuid_sha256: 865f764c-c704-803b-8338-b16c56fcdc45
ghcid_numeric: 9682587795998437435
record_id: fdcd0fb5-b8cf-453d-9a7c-1d0bc87be5d0
generation_timestamp: '2025-12-16T21:06:45.654173+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-A-VZWADEB
ghcid_numeric: 17783233412197709663
valid_from: '2025-12-16T21:06:45.654173+00:00'
valid_to: null
valid_to: '2025-12-17T09:41:23.616579+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: BE-VWV-OUD-A-VZWADEB
ghcid_numeric: 9682587795998437435
valid_from: '2025-12-17T09:41:23.616579+00:00'
valid_to: null
reason: 'Country code corrected: NL -> BE (Belgium). Location: Oudenburg, West-Vlaanderen'
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
country_code: NL
method: EXA_WEB_SEARCH
city_code: OUD
city_name: Oudenburg
region_code: VWV
region_name: West-Vlaanderen
country_code: BE
resolution_date: '2025-12-17T09:41:23.616579+00:00'
source_url: http://binnenvaarterfgoed.be/
notes: Belgian v.z.w. (vzw = Belgian non-profit), located aboard museum ship
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:45.654173+00:00'
@ -59,9 +70,21 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_2_VERIFIED
source_url: http://binnenvaarterfgoed.be/
extraction_timestamp: '2025-12-17T09:41:23.616579+00:00'
claims_extracted:
- country
- region
- city
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Country code corrected on 2025-12-17: NL was incorrect, institution is in Belgium
(BE)'

View file

@ -18,28 +18,41 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-of-batik-pekalongan.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
country: NL
city: Pekalongan
region: Jawa Tengah
country: ID
address: Jl. Jetayu No.1, Pekalongan 51152
ghcid:
ghcid_current: NL-XX-XXX-M-MBP
ghcid_original: NL-XX-XXX-M-MBP
ghcid_uuid: eb74d910-63b2-5e5e-9db8-d91073782abf
ghcid_uuid_sha256: 6e7ba504-8c91-8169-aeb8-137a6a295254
ghcid_numeric: 7961138205264294249
ghcid_current: ID-JT-PEK-M-MBP
ghcid_original: ID-JT-PEK-M-MBP
ghcid_uuid: c3b6fa1c-543c-509b-8200-9c3e55ea5917
ghcid_uuid_sha256: fe292b5f-2a03-82a6-8ea7-13192da4c6f8
ghcid_numeric: 18314217047405564582
record_id: 3e933428-e095-4b85-aeb4-ed7eaa57b11c
generation_timestamp: '2025-12-16T21:06:37.585649+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MBP
ghcid_numeric: 7961138205264294249
valid_from: '2025-12-16T21:06:37.585649+00:00'
valid_to: null
valid_to: '2025-12-17T09:53:29.196550+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: ID-JT-PEK-M-MBP
ghcid_numeric: 18314217047405564582
valid_from: '2025-12-17T09:53:29.196550+00:00'
valid_to: null
reason: 'Country code corrected: NL -> ID (Indonesia). Location: Pekalongan, Jawa
Tengah'
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
country_code: NL
method: EXA_WEB_SEARCH
city_code: PEK
city_name: Pekalongan
region_code: JT
region_name: Jawa Tengah
country_code: ID
resolution_date: '2025-12-17T09:53:29.196550+00:00'
source_url: https://id.wikipedia.org/wiki/Museum_Batik_Pekalongan
notes: UNESCO recognized museum for batik conservation, opened 12 July 2006 by
President SBY
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:37.585649+00:00'
@ -56,9 +69,21 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_2_VERIFIED
source_url: https://id.wikipedia.org/wiki/Museum_Batik_Pekalongan
extraction_timestamp: '2025-12-17T09:53:29.196550+00:00'
claims_extracted:
- country
- region
- city
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Country code corrected on 2025-12-17: NL was incorrect, institution is in Indonesia
(ID)'

View file

@ -2,7 +2,7 @@ custodian_name:
emic_name: Diorama Arsip Jogja
emic_name_source: linkedin
institution_type:
- M
- A
linkedin_enrichment:
linkedin_url: https://www.linkedin.com/company/diorama-arsip-jogja
linkedin_slug: diorama-arsip-jogja
@ -24,28 +24,41 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/diorama-arsip-jogja.yaml
schema_version: 1.0.0
location:
city: Technology
region: XX
country: NL
city: Bantul
region: Daerah Istimewa Yogyakarta
country: ID
address: LT 1 Gedung DEPO ARSIP, Jl. Janti, Banguntapan, Kabupaten Bantul, Yogyakarta
55198
ghcid:
ghcid_current: NL-XX-XXX-M-DAJ
ghcid_original: NL-XX-XXX-M-DAJ
ghcid_uuid: 70fabdbb-cfa2-579c-9ed6-715ed3a9961b
ghcid_uuid_sha256: a495caff-0e4e-8b6a-8166-888465abc0cd
ghcid_numeric: 11859608390555585386
ghcid_current: ID-YO-BAN-A-DAJ
ghcid_original: ID-YO-BAN-A-DAJ
ghcid_uuid: 059d21ea-5974-5a1e-8525-ea372adb2f57
ghcid_uuid_sha256: 2dcb25b6-bf85-86e0-820b-be86083fea2d
ghcid_numeric: 3299772618806470368
record_id: 7e4ea863-e058-47a7-ab46-85aa9b50ec7c
generation_timestamp: '2025-12-16T21:06:39.082344+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-DAJ
ghcid_numeric: 11859608390555585386
valid_from: '2025-12-16T21:06:39.082344+00:00'
valid_to: null
valid_to: '2025-12-17T09:41:23.625814+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: ID-YO-BAN-A-DAJ
ghcid_numeric: 3299772618806470368
valid_from: '2025-12-17T09:41:23.625814+00:00'
valid_to: null
reason: 'Country code corrected: NL -> ID (Indonesia). Location: Bantul, Daerah
Istimewa Yogyakarta'
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
country_code: NL
method: EXA_WEB_SEARCH
city_code: BAN
city_name: Bantul
region_code: YO
region_name: Daerah Istimewa Yogyakarta
country_code: ID
resolution_date: '2025-12-17T09:41:23.625814+00:00'
source_url: https://dioramaarsip.jogjaprov.go.id/home
notes: Digital archive diorama of Yogyakarta history, opened February 2022
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:39.082344+00:00'
@ -62,9 +75,21 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_2_VERIFIED
source_url: https://dioramaarsip.jogjaprov.go.id/home
extraction_timestamp: '2025-12-17T09:41:23.625814+00:00'
claims_extracted:
- country
- region
- city
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Country code corrected on 2025-12-17: NL was incorrect, institution is in Indonesia
(ID)'

View file

@ -86,23 +86,41 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-benteng-vredeburg.yaml
schema_version: 1.0.0
location:
city: null
region: null
country: NL
city: Yogyakarta
region: Daerah Istimewa Yogyakarta
country: ID
address: Jl. Margo Mulyo No.6, Ngupasan, Kec. Gondomanan, Kota Yogyakarta 55122
ghcid:
ghcid_current: NL-XX-XXX-M-MBV
ghcid_original: NL-XX-XXX-M-MBV
ghcid_uuid: f52e92b3-a191-56c0-95ca-c93ebfd1fa81
ghcid_uuid_sha256: 669bd4a6-699d-87ae-9590-77f5dd087af7
ghcid_numeric: 7393737024460408750
ghcid_current: ID-YO-YOG-M-MBV
ghcid_original: ID-YO-YOG-M-MBV
ghcid_uuid: 5d5e4910-7cd2-5ef8-a51c-c7dc54a055f0
ghcid_uuid_sha256: aedc6dac-9c94-8841-abd8-6a7416a4b795
ghcid_numeric: 12600066445604583489
record_id: 1eed48b4-a9a7-436e-a4ac-edfef3de4aee
generation_timestamp: '2025-12-16T21:06:42.973186+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MBV
ghcid_numeric: 7393737024460408750
valid_from: '2025-12-17T08:44:26.023035+00:00'
valid_to: null
valid_to: '2025-12-17T09:53:29.174813+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: ID-YO-YOG-M-MBV
ghcid_numeric: 12600066445604583489
valid_from: '2025-12-17T09:53:29.174813+00:00'
valid_to: null
reason: 'Country code corrected: NL -> ID (Indonesia). Location: Yogyakarta, Daerah
Istimewa Yogyakarta'
location_resolution:
method: EXA_WEB_SEARCH
city_code: YOG
city_name: Yogyakarta
region_code: YO
region_name: Daerah Istimewa Yogyakarta
country_code: ID
resolution_date: '2025-12-17T09:53:29.174813+00:00'
source_url: https://forevervacation.com/yogyakarta/museum-benteng-vredeburg
notes: Dutch colonial fortress converted to museum in 1992, documents Indonesian
independence struggle
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:42.973186+00:00'
@ -119,6 +137,16 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_2_VERIFIED
source_url: https://forevervacation.com/yogyakarta/museum-benteng-vredeburg
extraction_timestamp: '2025-12-17T09:53:29.174813+00:00'
claims_extracted:
- country
- region
- city
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -127,3 +155,5 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Country code corrected on 2025-12-17: NL was incorrect, institution is in Indonesia
(ID)'

View file

@ -18,28 +18,39 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/historical-archives-of-the-european-union.yaml
schema_version: 1.0.0
location:
city: Director
region: XX
country: NL
city: Firenze
region: Tuscany
country: IT
address: Via Bolognese 156, 50139 Firenze, Villa Salviati
ghcid:
ghcid_current: NL-XX-XXX-A-HAEU
ghcid_original: NL-XX-XXX-A-HAEU
ghcid_uuid: 8c50158d-ca20-52fa-a586-123d856d30ce
ghcid_uuid_sha256: a5122731-05a4-8550-91dc-709ae9537003
ghcid_numeric: 11894612657340421456
ghcid_current: IT-52-FIR-A-HAEU
ghcid_original: IT-52-FIR-A-HAEU
ghcid_uuid: f61c2f7c-d9e1-5ffe-b5d8-a79fddadc795
ghcid_uuid_sha256: b546a4f3-4270-80ca-a53a-00d1bf0d4469
ghcid_numeric: 13062309133933396170
record_id: 63749121-4b05-471e-b075-ec53cbbf0917
generation_timestamp: '2025-12-16T21:06:45.012969+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-A-HAEU
ghcid_numeric: 11894612657340421456
valid_from: '2025-12-16T21:06:45.012969+00:00'
valid_to: null
valid_to: '2025-12-17T09:41:23.611933+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: IT-52-FIR-A-HAEU
ghcid_numeric: 13062309133933396170
valid_from: '2025-12-17T09:41:23.611933+00:00'
valid_to: null
reason: 'Country code corrected: NL -> IT (Italy). Location: Firenze, Tuscany'
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
country_code: NL
method: EXA_WEB_SEARCH
city_code: FIR
city_name: Firenze
region_code: '52'
region_name: Tuscany
country_code: IT
resolution_date: '2025-12-17T09:41:23.611933+00:00'
source_url: https://archives.eui.eu/en/repositories/1
notes: Part of European University Institute, Florence
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:45.012969+00:00'
@ -56,9 +67,21 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_2_VERIFIED
source_url: https://archives.eui.eu/en/repositories/1
extraction_timestamp: '2025-12-17T09:41:23.611933+00:00'
claims_extracted:
- country
- region
- city
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Country code corrected on 2025-12-17: NL was incorrect, institution is in Italy
(IT)'

View file

@ -21,23 +21,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-janning.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Nieuw Schoonebeek
region: Drenthe
country: NL
address: Europaweg 143a, 7766 AE Nieuw Schoonebeek
ghcid:
ghcid_current: NL-XX-XXX-M-MJ-museum_janning
ghcid_current: NL-DR-NIS-M-MJ-museum_janning
ghcid_original: NL-XX-XXX-M-MJ-museum_janning
ghcid_uuid: 2da042ef-f83f-5b4c-9cf6-e95338656e75
ghcid_uuid_sha256: 2841359b-879c-88e5-8c7f-27f6e806a1d5
ghcid_numeric: 2900658577114687717
ghcid_uuid: b14501ab-9840-5df7-b1d5-599a1606b08d
ghcid_uuid_sha256: b8ef7355-6ab8-860e-b9b7-8a9de4f58c9d
ghcid_numeric: 13325996633112462862
record_id: ba018a83-8c5e-422b-a8a0-8685147c0268
generation_timestamp: '2025-12-16T21:06:42.719826+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MJ-museum_janning
ghcid_numeric: 2900658577114687717
valid_from: '2025-12-17T08:44:26.037456+00:00'
valid_to: null
valid_to: '2025-12-17T10:56:09.433656+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-DR-NIS-M-MJ-museum_janning
ghcid_numeric: 13325996633112462862
valid_from: '2025-12-17T10:56:09.433656+00:00'
valid_to: null
reason: Location enriched via Exa web search - Nieuw Schoonebeek, Drenthe
location_resolution:
method: EXA_WEB_SEARCH
city_code: NIS
city_name: Nieuw Schoonebeek
region_code: DR
region_name: Drenthe
country_code: NL
resolution_date: '2025-12-17T10:56:09.433656+00:00'
source_url: https://www.museumjanning.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:42.719826+00:00'
@ -54,6 +69,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.museumjanning.nl/
extraction_timestamp: '2025-12-17T10:56:09.433656+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -62,3 +86,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Nieuw Schoonebeek, Drenthe'

View file

@ -18,28 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/jopie-huismanmuseum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Workum
region: Friesland
country: NL
address: Noard 6, 8711 AH Workum
ghcid:
ghcid_current: NL-XX-XXX-M-JH
ghcid_current: NL-FR-WOR-M-JH
ghcid_original: NL-XX-XXX-M-JH
ghcid_uuid: d356866c-b69b-5f7c-9b0d-5be544a5a315
ghcid_uuid_sha256: 383eb391-9231-8e3c-b45b-2c9a8ff180df
ghcid_numeric: 4052874152484372028
ghcid_uuid: 3a0b508c-4200-5076-91aa-0c5296f68636
ghcid_uuid_sha256: 0ded076f-d63e-84a1-af3a-86c064d7338e
ghcid_numeric: 1003466468890657953
record_id: 70d3a7ac-504a-4bca-b45f-7feb1f7fce95
generation_timestamp: '2025-12-16T21:06:39.406621+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-JH
ghcid_numeric: 4052874152484372028
valid_from: '2025-12-16T21:06:39.406621+00:00'
valid_to: null
valid_to: '2025-12-17T09:25:04.169919+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-FR-WOR-M-JH
ghcid_numeric: 1003466468890657953
valid_from: '2025-12-17T09:25:04.169919+00:00'
valid_to: null
reason: Location enriched via Exa web search - Workum, Friesland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: WOR
city_name: Workum
region_code: FR
region_name: Friesland
country_code: NL
resolution_date: '2025-12-17T09:25:04.169919+00:00'
source_url: https://www.jopiehuismanmuseum.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:39.406621+00:00'
@ -56,9 +66,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.jopiehuismanmuseum.nl/
extraction_timestamp: '2025-12-17T09:25:04.169919+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Workum, Friesland'

View file

@ -18,23 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-de-grote-glind.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Barneveld
region: Gelderland
country: NL
address: Scherpenzeelseweg 158, 3772 MG Barneveld
ghcid:
ghcid_current: NL-XX-XXX-M-MGG
ghcid_current: NL-GE-BAR-M-MGG
ghcid_original: NL-XX-XXX-M-MGG
ghcid_uuid: 10736a2b-5d91-5b7a-b4b9-a4f66e26d978
ghcid_uuid_sha256: 3037e34d-a06d-8ce6-a605-466e1ce15bb0
ghcid_numeric: 3474495560083172582
ghcid_uuid: 1c2dc7dc-bc91-5fb7-bb54-1b2c846b2363
ghcid_uuid_sha256: c22fdb77-ceac-81a6-8d77-c89492e65c7b
ghcid_numeric: 13992643874878439846
record_id: b014bedb-05da-4f35-9192-e07ab708ed0e
generation_timestamp: '2025-12-16T21:06:43.269943+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MGG
ghcid_numeric: 3474495560083172582
valid_from: '2025-12-17T08:44:26.015666+00:00'
valid_to: null
valid_to: '2025-12-17T10:56:09.456702+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-GE-BAR-M-MGG
ghcid_numeric: 13992643874878439846
valid_from: '2025-12-17T10:56:09.456702+00:00'
valid_to: null
reason: Location enriched via Exa web search - Barneveld, Gelderland
location_resolution:
method: EXA_WEB_SEARCH
city_code: BAR
city_name: Barneveld
region_code: GE
region_name: Gelderland
country_code: NL
resolution_date: '2025-12-17T10:56:09.456702+00:00'
source_url: https://www.degroteglind.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:43.269943+00:00'
@ -51,6 +66,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.degroteglind.nl/
extraction_timestamp: '2025-12-17T10:56:09.456702+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -59,3 +83,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Barneveld, Gelderland'

View file

@ -18,23 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-1939-1945.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Harreveld
region: Gelderland
country: NL
address: Schurinkweg 14, 7135 KJ Harreveld
ghcid:
ghcid_current: NL-XX-XXX-M-M-museum_19391945
ghcid_current: NL-GE-HAR-M-M-museum_19391945
ghcid_original: NL-XX-XXX-M-M-museum_19391945
ghcid_uuid: e853e192-4695-59e4-87ee-4174351509b7
ghcid_uuid_sha256: 3b9218d3-0891-8043-bab3-18ddb7bb105f
ghcid_numeric: 4292520689498423363
ghcid_uuid: f7319b45-8862-5063-932b-0ad042cd196f
ghcid_uuid_sha256: 98c8af8e-ffc8-85e5-a162-937baa38ee40
ghcid_numeric: 11009242317818750437
record_id: f533f9a7-b9e4-40d9-9406-1003736c61ba
generation_timestamp: '2025-12-16T21:06:44.440421+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-M-museum_19391945
ghcid_numeric: 4292520689498423363
valid_from: '2025-12-17T08:44:26.038723+00:00'
valid_to: null
valid_to: '2025-12-17T10:15:28.609076+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-GE-HAR-M-M-museum_19391945
ghcid_numeric: 11009242317818750437
valid_from: '2025-12-17T10:15:28.609076+00:00'
valid_to: null
reason: Location enriched via Exa web search - Harreveld, Gelderland
location_resolution:
method: EXA_WEB_SEARCH
city_code: HAR
city_name: Harreveld
region_code: GE
region_name: Gelderland
country_code: NL
resolution_date: '2025-12-17T10:15:28.609076+00:00'
source_url: https://www.tracesofwar.nl/sights/157857/Museum-Opdat-wij-niet-Vergeten-1939-1945.htm
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:44.440421+00:00'
@ -51,6 +66,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.tracesofwar.nl/sights/157857/Museum-Opdat-wij-niet-Vergeten-1939-1945.htm
extraction_timestamp: '2025-12-17T10:15:28.609076+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -59,3 +83,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Harreveld, Gelderland'

View file

@ -22,28 +22,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-geelvinck-hinlopen-huis.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Heerde
region: Gelderland
country: NL
address: Kamperweg 23, 8181 CS Heerde
ghcid:
ghcid_current: NL-XX-XXX-M-MGHH
ghcid_current: NL-GE-HEE-M-MGHH
ghcid_original: NL-XX-XXX-M-MGHH
ghcid_uuid: 2f84f10f-a9d9-518a-867b-76fd5744c9c8
ghcid_uuid_sha256: dbc820ca-9861-8e25-9543-d30ccde30e84
ghcid_numeric: 15836944144160349733
ghcid_uuid: 26d6aef0-f3cd-5237-ac57-d1cdfa9b3ee1
ghcid_uuid_sha256: 76b6959c-be1e-84cb-b010-a9e54b718f30
ghcid_numeric: 8554189042673935563
record_id: 3d9547a0-45c3-4759-8b11-f8193c5abccb
generation_timestamp: '2025-12-16T21:06:38.518452+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MGHH
ghcid_numeric: 15836944144160349733
valid_from: '2025-12-16T21:06:38.518452+00:00'
valid_to: null
valid_to: '2025-12-17T10:56:09.444275+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-GE-HEE-M-MGHH
ghcid_numeric: 8554189042673935563
valid_from: '2025-12-17T10:56:09.444275+00:00'
valid_to: null
reason: Location enriched via Exa web search - Heerde, Gelderland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: HEE
city_name: Heerde
region_code: GE
region_name: Gelderland
country_code: NL
resolution_date: '2025-12-17T10:56:09.444275+00:00'
source_url: https://geelvinck.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:38.518452+00:00'
@ -60,9 +70,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://geelvinck.nl/
extraction_timestamp: '2025-12-17T10:56:09.444275+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Heerde, Gelderland'

View file

@ -21,23 +21,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-de-brandkas-van-henny.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Zutphen
region: Gelderland
country: NL
address: Ravenstraatje 3, 7201 DG Zutphen
ghcid:
ghcid_current: NL-XX-XXX-M-MBH
ghcid_current: NL-GE-ZUT-M-MBH
ghcid_original: NL-XX-XXX-M-MBH
ghcid_uuid: cfc24f04-4169-55af-891f-ed4942860a5b
ghcid_uuid_sha256: f10091e2-000c-8152-bb4a-9c6e2dc1c4bb
ghcid_numeric: 17366040562990059858
ghcid_uuid: 8f3789a5-dac1-584a-a369-881ca1fcb35a
ghcid_uuid_sha256: f0f4973c-c174-823c-8ec4-f5bba7997043
ghcid_numeric: 17362668750619554364
record_id: 7cfce701-5c47-477c-9973-7f9e578d177b
generation_timestamp: '2025-12-16T21:06:40.699134+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MBH
ghcid_numeric: 17366040562990059858
valid_from: '2025-12-17T08:44:26.075139+00:00'
valid_to: null
valid_to: '2025-12-17T10:15:28.621915+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-GE-ZUT-M-MBH
ghcid_numeric: 17362668750619554364
valid_from: '2025-12-17T10:15:28.621915+00:00'
valid_to: null
reason: Location enriched via Exa web search - Zutphen, Gelderland
location_resolution:
method: EXA_WEB_SEARCH
city_code: ZUT
city_name: Zutphen
region_code: GE
region_name: Gelderland
country_code: NL
resolution_date: '2025-12-17T10:15:28.621915+00:00'
source_url: https://brandkashenny.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:40.699134+00:00'
@ -54,6 +69,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://brandkashenny.nl/
extraction_timestamp: '2025-12-17T10:15:28.621915+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -62,3 +86,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Zutphen, Gelderland'

View file

@ -18,28 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/blik-trommel-en-oudheden-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Niezijl
region: Groningen
country: NL
address: Hoofdstraat 39, 9842 PC Niezijl
ghcid:
ghcid_current: NL-XX-XXX-M-BTOM
ghcid_current: NL-GR-NIE-M-BTOM
ghcid_original: NL-XX-XXX-M-BTOM
ghcid_uuid: 0e8e0112-b0fd-53c4-ad06-e5956c833244
ghcid_uuid_sha256: 5b616934-3786-8a12-b647-addf66bdd3bc
ghcid_numeric: 6584659803183176210
ghcid_uuid: 451234ac-ac1a-59ff-8906-8921a26babaa
ghcid_uuid_sha256: 8afdfde5-a833-8383-a403-9fbe787faf89
ghcid_numeric: 10015440309153149827
record_id: 8480c9b3-ebdf-47fe-8515-fb69b4a82c51
generation_timestamp: '2025-12-16T21:06:45.483497+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-BTOM
ghcid_numeric: 6584659803183176210
valid_from: '2025-12-16T21:06:45.483497+00:00'
valid_to: null
valid_to: '2025-12-17T09:33:15.814554+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-GR-NIE-M-BTOM
ghcid_numeric: 10015440309153149827
valid_from: '2025-12-17T09:33:15.814554+00:00'
valid_to: null
reason: Location enriched via Exa web search - Niezijl, Groningen
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: NIE
city_name: Niezijl
region_code: GR
region_name: Groningen
country_code: NL
resolution_date: '2025-12-17T09:33:15.814554+00:00'
source_url: https://www.blikentrommelmuseum.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:45.483497+00:00'
@ -56,9 +66,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.blikentrommelmuseum.nl/
extraction_timestamp: '2025-12-17T09:33:15.814554+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Niezijl, Groningen'

View file

@ -18,28 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/fortuna-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Sittard
region: Limburg
country: NL
address: Sittard, Limburg, Netherlands
ghcid:
ghcid_current: NL-XX-XXX-M-FM
ghcid_current: NL-LI-SIT-M-FM
ghcid_original: NL-XX-XXX-M-FM
ghcid_uuid: be219abf-cc81-5a14-aee9-dc9b1b72efcb
ghcid_uuid_sha256: ad64fc4c-380d-8a38-8268-9c0bc7b7af6f
ghcid_numeric: 12494388670520703544
ghcid_uuid: 87c8fa4f-7f89-567f-9bcb-fdaa02cefa47
ghcid_uuid_sha256: 05de3d19-96ff-8790-9b8c-8069d9f4891f
ghcid_numeric: 422842595136202640
record_id: 208c7f58-eaf3-47d8-8d44-f4d714e17ddd
generation_timestamp: '2025-12-16T21:06:36.149767+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-FM
ghcid_numeric: 12494388670520703544
valid_from: '2025-12-16T21:06:36.149767+00:00'
valid_to: null
valid_to: '2025-12-17T09:25:04.202624+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-LI-SIT-M-FM
ghcid_numeric: 422842595136202640
valid_from: '2025-12-17T09:25:04.202624+00:00'
valid_to: null
reason: Location enriched via Exa web search - Sittard, Limburg
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: SIT
city_name: Sittard
region_code: LI
region_name: Limburg
country_code: NL
resolution_date: '2025-12-17T09:25:04.202624+00:00'
source_url: https://www.fortunasittard.nl/museum/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:36.149767+00:00'
@ -56,9 +66,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.fortunasittard.nl/museum/
extraction_timestamp: '2025-12-17T09:25:04.202624+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Sittard, Limburg'

View file

@ -18,23 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/frans-maas-museum-verzameling.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Venlo
region: Limburg
country: NL
address: Groethofstraat 11L, 5916 PA Venlo
ghcid:
ghcid_current: NL-XX-XXX-M-FMMV
ghcid_current: NL-LI-VEN-M-FMMV
ghcid_original: NL-XX-XXX-M-FMMV
ghcid_uuid: f9f5904b-3f0e-589a-87d8-45c59c789c4f
ghcid_uuid_sha256: da43503b-8690-884b-8e60-946b38ab8434
ghcid_numeric: 15727502540298811467
ghcid_uuid: cdd8c1c8-7fd3-5f8b-babd-54c369e694ca
ghcid_uuid_sha256: dcb56ef8-9c2b-87f4-b515-6ce79a06fdf3
ghcid_numeric: 15903739673179830260
record_id: 0fe8c62f-329d-4dcd-b2d3-87f1ae6c591e
generation_timestamp: '2025-12-16T21:06:38.565300+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-FMMV
ghcid_numeric: 15727502540298811467
valid_from: '2025-12-17T08:44:26.042454+00:00'
valid_to: null
valid_to: '2025-12-17T10:18:03.391406+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-LI-VEN-M-FMMV
ghcid_numeric: 15903739673179830260
valid_from: '2025-12-17T10:18:03.391406+00:00'
valid_to: null
reason: Location enriched via Exa web search - Venlo, Limburg
location_resolution:
method: EXA_WEB_SEARCH
city_code: VEN
city_name: Venlo
region_code: LI
region_name: Limburg
country_code: NL
resolution_date: '2025-12-17T10:18:03.391406+00:00'
source_url: https://www.fransmaasvenlo.nl/contact/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:38.565300+00:00'
@ -51,6 +66,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.fransmaasvenlo.nl/contact/
extraction_timestamp: '2025-12-17T10:18:03.391406+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -59,3 +83,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Venlo, Limburg'

View file

@ -21,23 +21,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-canonije.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Boxtel
region: Noord-Brabant
country: NL
address: Duinendaal 9, 5281 AP Boxtel
ghcid:
ghcid_current: NL-XX-XXX-M-MC-museum_canonije
ghcid_current: NL-NB-BOX-M-MC-museum_canonije
ghcid_original: NL-XX-XXX-M-MC-museum_canonije
ghcid_uuid: 15100a44-415a-5ed9-afce-263513d42a1c
ghcid_uuid_sha256: 66912ce3-a33b-8452-ac7f-3980eb9a6eaf
ghcid_numeric: 7390737819699496018
ghcid_uuid: 3d464ce4-bba5-5dbd-8e3b-525c78016137
ghcid_uuid_sha256: 0a248f2b-2a81-8242-ad8c-f9badd60b151
ghcid_numeric: 730866455100334658
record_id: 90dc824a-f2bd-4406-9a5d-bfa30962be9b
generation_timestamp: '2025-12-16T21:06:45.443420+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MC-museum_canonije
ghcid_numeric: 7390737819699496018
valid_from: '2025-12-17T08:44:26.070346+00:00'
valid_to: null
valid_to: '2025-12-17T10:41:01.921443+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-NB-BOX-M-MC-museum_canonije
ghcid_numeric: 730866455100334658
valid_from: '2025-12-17T10:41:01.921443+00:00'
valid_to: null
reason: Location enriched via Exa web search - Boxtel, Noord-Brabant
location_resolution:
method: EXA_WEB_SEARCH
city_code: BOX
city_name: Boxtel
region_code: NB
region_name: Noord-Brabant
country_code: NL
resolution_date: '2025-12-17T10:41:01.921443+00:00'
source_url: https://www.museumgidsnederland.nl/en/boxtel/museum-de-canonije/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:45.443420+00:00'
@ -54,6 +69,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.museumgidsnederland.nl/en/boxtel/museum-de-canonije/
extraction_timestamp: '2025-12-17T10:41:01.921443+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -62,3 +86,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Boxtel, Noord-Brabant'

View file

@ -22,28 +22,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/bierreclame-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Breda
region: Noord-Brabant
country: NL
address: Haagweg 375, 4813 XC Breda
ghcid:
ghcid_current: NL-XX-XXX-M-BM
ghcid_current: NL-NB-BRE-M-BM
ghcid_original: NL-XX-XXX-M-BM
ghcid_uuid: 84ed37a1-9cef-55df-9e60-e2e684672c12
ghcid_uuid_sha256: c1231a5d-45dd-8add-a7d4-d95d70d66ab9
ghcid_numeric: 13916996261411379933
ghcid_uuid: a2dbc8ee-b755-55ee-84b1-0f99466ca820
ghcid_uuid_sha256: 8800c7b5-4e0d-8a90-8ee5-6e2e0ca99ede
ghcid_numeric: 9800052370670697104
record_id: a36adeaa-47da-4568-bf3d-798f7bdb1af0
generation_timestamp: '2025-12-16T21:06:43.291899+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-BM
ghcid_numeric: 13916996261411379933
valid_from: '2025-12-16T21:06:43.291899+00:00'
valid_to: null
valid_to: '2025-12-17T09:25:04.082081+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-NB-BRE-M-BM
ghcid_numeric: 9800052370670697104
valid_from: '2025-12-17T09:25:04.082081+00:00'
valid_to: null
reason: Location enriched via Exa web search - Breda, Noord-Brabant
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: BRE
city_name: Breda
region_code: NB
region_name: Noord-Brabant
country_code: NL
resolution_date: '2025-12-17T09:25:04.082081+00:00'
source_url: https://bierreclamemuseum.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:43.291899+00:00'
@ -60,9 +70,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://bierreclamemuseum.nl/
extraction_timestamp: '2025-12-17T09:25:04.082081+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Breda, Noord-Brabant'

View file

@ -18,23 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-ceuclum.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Cuijk
region: Noord-Brabant
country: NL
address: Castellum 1, 5431 EM Cuijk
ghcid:
ghcid_current: NL-XX-XXX-M-MC
ghcid_current: NL-NB-CUI-M-MC
ghcid_original: NL-XX-XXX-M-MC
ghcid_uuid: 5e2dd6dd-62ce-5e89-8ce7-31ad00950bc2
ghcid_uuid_sha256: 12ec5579-cc21-8718-8082-0a26ec3b1e28
ghcid_numeric: 1363558768790574872
ghcid_uuid: 97f8ab3c-cfe2-53fb-ba27-dd2b3d058a83
ghcid_uuid_sha256: 695826fd-daa1-82e0-9dbf-919364b88536
ghcid_numeric: 7590860043669570272
record_id: 8f7d0da9-11c4-4162-9cf3-3b848e24d9b2
generation_timestamp: '2025-12-16T21:06:38.217498+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MC
ghcid_numeric: 1363558768790574872
valid_from: '2025-12-17T08:44:26.063071+00:00'
valid_to: null
valid_to: '2025-12-17T10:56:09.487585+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-NB-CUI-M-MC
ghcid_numeric: 7590860043669570272
valid_from: '2025-12-17T10:56:09.487585+00:00'
valid_to: null
reason: Location enriched via Exa web search - Cuijk, Noord-Brabant
location_resolution:
method: EXA_WEB_SEARCH
city_code: CUI
city_name: Cuijk
region_code: NB
region_name: Noord-Brabant
country_code: NL
resolution_date: '2025-12-17T10:56:09.487585+00:00'
source_url: https://www.museumceuclum.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:38.217498+00:00'
@ -51,6 +66,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.museumceuclum.nl/
extraction_timestamp: '2025-12-17T10:56:09.487585+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -59,3 +83,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Cuijk, Noord-Brabant'

View file

@ -22,23 +22,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/crypto-museum.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Eindhoven
region: Noord-Brabant
country: NL
address: Crypto Museum, Eindhoven, Netherlands
ghcid:
ghcid_current: NL-XX-XXX-M-CM-crypto_museum
ghcid_current: NL-NB-EIN-M-CM-crypto_museum
ghcid_original: NL-XX-XXX-M-CM-crypto_museum
ghcid_uuid: e2d3a427-14ff-5a42-9504-f9e67b5cb6fb
ghcid_uuid_sha256: be8c4743-a6b9-86ac-bcab-ae0016ba508b
ghcid_numeric: 13730427719831340716
ghcid_uuid: e3b135ce-d513-5cf6-9e94-ecfba6686a53
ghcid_uuid_sha256: c7a798b7-74cc-85cc-b39a-69aa49b9fdb1
ghcid_numeric: 14386635448364312012
record_id: f760bfbd-3158-41b8-b25d-07e8218aff7c
generation_timestamp: '2025-12-16T21:06:45.197167+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-CM-crypto_museum
ghcid_numeric: 13730427719831340716
valid_from: '2025-12-17T08:44:25.987908+00:00'
valid_to: null
valid_to: '2025-12-17T09:17:11.063469+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-NB-EIN-M-CM-crypto_museum
ghcid_numeric: 14386635448364312012
valid_from: '2025-12-17T09:17:11.063469+00:00'
valid_to: null
reason: Location enriched via Exa web search - Eindhoven, Noord-Brabant
location_resolution:
method: EXA_WEB_SEARCH
city_code: EIN
city_name: Eindhoven
region_code: NB
region_name: Noord-Brabant
country_code: NL
resolution_date: '2025-12-17T09:17:11.063469+00:00'
source_url: https://www.cryptomuseum.com/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:45.197167+00:00'
@ -55,6 +70,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.cryptomuseum.com/
extraction_timestamp: '2025-12-17T09:17:11.063469+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -63,3 +87,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Eindhoven, Noord-Brabant'

View file

@ -18,28 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/ambachtelijke-zagerij-en-klompenmakerij-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Etten-Leur
region: Noord-Brabant
country: NL
address: Schuitvaartjaagpad 179, 4873 NS Etten-Leur
ghcid:
ghcid_current: NL-XX-XXX-M-AZKM
ghcid_current: NL-NB-ETL-M-AZKM
ghcid_original: NL-XX-XXX-M-AZKM
ghcid_uuid: 79baa8e8-7730-5cb3-9781-8a42082b4427
ghcid_uuid_sha256: 230c046b-3ca2-865d-bb53-98d69bb656cd
ghcid_numeric: 2525398349673322077
ghcid_uuid: 57483d0d-fba0-502f-89fd-9e2149db05e4
ghcid_uuid_sha256: 6922be22-4716-888a-82fb-d5098b9c0f1b
ghcid_numeric: 7575826577621440650
record_id: 47d673a1-1143-42aa-a27c-56387035a200
generation_timestamp: '2025-12-16T21:06:45.151954+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-AZKM
ghcid_numeric: 2525398349673322077
valid_from: '2025-12-16T21:06:45.151954+00:00'
valid_to: null
valid_to: '2025-12-17T09:33:15.831394+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-NB-ETL-M-AZKM
ghcid_numeric: 7575826577621440650
valid_from: '2025-12-17T09:33:15.831394+00:00'
valid_to: null
reason: Location enriched via Exa web search - Etten-Leur, Noord-Brabant
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: ETL
city_name: Etten-Leur
region_code: NB
region_name: Noord-Brabant
country_code: NL
resolution_date: '2025-12-17T09:33:15.831394+00:00'
source_url: https://www.klompenmakerij.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:45.151954+00:00'
@ -56,9 +66,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.klompenmakerij.nl/
extraction_timestamp: '2025-12-17T09:33:15.831394+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Etten-Leur, Noord-Brabant'

View file

@ -21,28 +21,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/edah-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Helmond
region: Noord-Brabant
country: NL
address: Helmond, Noord-Brabant, Netherlands
ghcid:
ghcid_current: NL-XX-XXX-M-EM
ghcid_current: NL-NB-HEL-M-EM
ghcid_original: NL-XX-XXX-M-EM
ghcid_uuid: abfeb5ac-0e66-5995-8d86-8a3cee6fbe6d
ghcid_uuid_sha256: 6fcf6f37-53e2-8880-ab6f-44b77364aa56
ghcid_numeric: 8056780541810337920
ghcid_uuid: 9568b2dc-355a-54bb-8e42-cc7ee9003173
ghcid_uuid_sha256: ae82fbff-3874-8406-9567-a9aae50c658c
ghcid_numeric: 12574890183154267142
record_id: d19ceb8a-e1ca-4669-8c0d-8584d562c8f5
generation_timestamp: '2025-12-16T21:06:37.295988+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-EM
ghcid_numeric: 8056780541810337920
valid_from: '2025-12-16T21:06:37.295988+00:00'
valid_to: null
valid_to: '2025-12-17T09:25:04.116169+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-NB-HEL-M-EM
ghcid_numeric: 12574890183154267142
valid_from: '2025-12-17T09:25:04.116169+00:00'
valid_to: null
reason: Location enriched via Exa web search - Helmond, Noord-Brabant
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: HEL
city_name: Helmond
region_code: NB
region_name: Noord-Brabant
country_code: NL
resolution_date: '2025-12-17T09:25:04.116169+00:00'
source_url: https://www.edahmuseum.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:37.295988+00:00'
@ -59,9 +69,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.edahmuseum.nl/
extraction_timestamp: '2025-12-17T09:25:04.116169+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Helmond, Noord-Brabant'

View file

@ -18,23 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/stichting-abrahamdag.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Oosterhout
region: Noord-Brabant
country: NL
address: Van Wijngaardestraat 46, 4901VM Oosterhout
ghcid:
ghcid_current: NL-XX-XXX-M-A
ghcid_current: NL-NB-OOS-M-A
ghcid_original: NL-XX-XXX-M-A
ghcid_uuid: 3dbf1031-5d5d-5640-b949-96e68e4df376
ghcid_uuid_sha256: b4cf931e-ab67-8939-b358-c1b70b1ed7f8
ghcid_numeric: 13028794006940141881
ghcid_uuid: 1b16490b-f37d-5b67-b035-37c14ae23c53
ghcid_uuid_sha256: 3c7e1c0c-6706-822a-bdac-1946550f8bcf
ghcid_numeric: 4358952328934863402
record_id: 13121906-9f48-4cd3-81e5-32e43aae4a5a
generation_timestamp: '2025-12-16T21:06:45.023838+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-A
ghcid_numeric: 13028794006940141881
valid_from: '2025-12-17T08:44:26.069013+00:00'
valid_to: null
valid_to: '2025-12-17T10:08:05.575717+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-NB-OOS-M-A
ghcid_numeric: 4358952328934863402
valid_from: '2025-12-17T10:08:05.575717+00:00'
valid_to: null
reason: Location enriched via Exa web search - Oosterhout, Noord-Brabant
location_resolution:
method: EXA_WEB_SEARCH
city_code: OOS
city_name: Oosterhout
region_code: NB
region_name: Noord-Brabant
country_code: NL
resolution_date: '2025-12-17T10:08:05.575717+00:00'
source_url: https://abrahamdag.com/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:45.023838+00:00'
@ -51,6 +66,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://abrahamdag.com/
extraction_timestamp: '2025-12-17T10:08:05.575717+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -59,3 +83,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Oosterhout, Noord-Brabant'

View file

@ -226,23 +226,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/huis73-nl.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: '''s-Hertogenbosch'
region: Noord-Brabant
country: NL
address: Hekellaan 2, 5211 LX 's-Hertogenbosch
ghcid:
ghcid_current: NL-XX-XXX-L-HN
ghcid_current: NL-NB-SHE-L-HN
ghcid_original: NL-XX-XXX-L-HN
ghcid_uuid: 3e812b40-08d1-5d01-b3f4-d3eb2b053f41
ghcid_uuid_sha256: 193b35bb-74cd-807d-81e9-68924d148c39
ghcid_numeric: 1818105953808932989
ghcid_uuid: 0f399e85-7c29-5e69-b963-c635aaa79cf6
ghcid_uuid_sha256: 5b7d526a-8eec-8b1b-8a68-416064436348
ghcid_numeric: 6592516047158119195
record_id: 348d146d-cf2e-4a95-be5a-2d5aa300ebaa
generation_timestamp: '2025-12-16T21:06:38.288769+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-L-HN
ghcid_numeric: 1818105953808932989
valid_from: '2025-12-17T08:44:26.044915+00:00'
valid_to: null
valid_to: '2025-12-17T10:06:03.418243+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-NB-SHE-L-HN
ghcid_numeric: 6592516047158119195
valid_from: '2025-12-17T10:06:03.418243+00:00'
valid_to: null
reason: Location enriched via Exa web search - 's-Hertogenbosch, Noord-Brabant
location_resolution:
method: EXA_WEB_SEARCH
city_code: SHE
city_name: '''s-Hertogenbosch'
region_code: NB
region_name: Noord-Brabant
country_code: NL
resolution_date: '2025-12-17T10:06:03.418243+00:00'
source_url: https://www.huis73.nl/locaties
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:38.288769+00:00'
@ -259,6 +274,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.huis73.nl/locaties
extraction_timestamp: '2025-12-17T10:06:03.418243+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -267,3 +291,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: ''s-Hertogenbosch, Noord-Brabant'

View file

@ -22,23 +22,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-van-brabantse-mutsen-en-poffers.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Sint-Oedenrode
region: Noord-Brabant
country: NL
address: Kerkstraat 20, 5492 AH Sint-Oedenrode
ghcid:
ghcid_current: NL-XX-XXX-M-MBMP
ghcid_current: NL-NB-SOR-M-MBMP
ghcid_original: NL-XX-XXX-M-MBMP
ghcid_uuid: 21336fc7-3662-5602-9119-7d73c036534e
ghcid_uuid_sha256: 593b5119-0fd3-86ae-90c6-c45eb4887beb
ghcid_numeric: 6429822061083035310
ghcid_uuid: 31deef24-1de5-533c-a94b-41cb77270114
ghcid_uuid_sha256: 797305b2-f764-8c7b-a93b-56cc8f1a6ab1
ghcid_numeric: 8751344767123889275
record_id: a2dc4dee-6368-4d74-a185-413ed2e74f2f
generation_timestamp: '2025-12-16T21:06:45.177876+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MBMP
ghcid_numeric: 6429822061083035310
valid_from: '2025-12-17T08:44:25.982154+00:00'
valid_to: null
valid_to: '2025-12-17T10:56:09.506688+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-NB-SOR-M-MBMP
ghcid_numeric: 8751344767123889275
valid_from: '2025-12-17T10:56:09.506688+00:00'
valid_to: null
reason: Location enriched via Exa web search - Sint-Oedenrode, Noord-Brabant
location_resolution:
method: EXA_WEB_SEARCH
city_code: SOR
city_name: Sint-Oedenrode
region_code: NB
region_name: Noord-Brabant
country_code: NL
resolution_date: '2025-12-17T10:56:09.506688+00:00'
source_url: https://mutsenmuseum.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:45.177876+00:00'
@ -55,6 +70,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://mutsenmuseum.nl/
extraction_timestamp: '2025-12-17T10:56:09.506688+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -63,3 +87,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Sint-Oedenrode, Noord-Brabant'

View file

@ -18,23 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-buitenlust.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Waalwijk
region: Noord-Brabant
country: NL
address: Jan de Rooystraat 14, 5141 EN Waalwijk
ghcid:
ghcid_current: NL-XX-XXX-M-MB-museum_buitenlust
ghcid_current: NL-NB-WAA-M-MB-museum_buitenlust
ghcid_original: NL-XX-XXX-M-MB-museum_buitenlust
ghcid_uuid: 78e29a5b-ff12-532e-b6a9-26b96527d012
ghcid_uuid_sha256: 5d687b60-5e1c-83ce-9fd3-be1ab3eda87c
ghcid_numeric: 6730765296931226574
ghcid_uuid: df4d5ef7-4ec4-57be-b445-576a334c5cfb
ghcid_uuid_sha256: e3bb7ab7-eaa4-8dd1-aadc-a05bb8f7a822
ghcid_numeric: 16409844597588803025
record_id: ed0fd5bd-9d38-4b94-bcbc-9b927711b645
generation_timestamp: '2025-12-16T21:06:42.414009+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MB-museum_buitenlust
ghcid_numeric: 6730765296931226574
valid_from: '2025-12-17T08:44:26.056155+00:00'
valid_to: null
valid_to: '2025-12-17T10:41:01.904174+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-NB-WAA-M-MB-museum_buitenlust
ghcid_numeric: 16409844597588803025
valid_from: '2025-12-17T10:41:01.904174+00:00'
valid_to: null
reason: Location enriched via Exa web search - Waalwijk, Noord-Brabant
location_resolution:
method: EXA_WEB_SEARCH
city_code: WAA
city_name: Waalwijk
region_code: NB
region_name: Noord-Brabant
country_code: NL
resolution_date: '2025-12-17T10:41:01.904174+00:00'
source_url: https://www.museumbuitenlust.nl/contact
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:42.414009+00:00'
@ -51,6 +66,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.museumbuitenlust.nl/contact
extraction_timestamp: '2025-12-17T10:41:01.904174+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -59,3 +83,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Waalwijk, Noord-Brabant'

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -18,23 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/allard-pierson-museum.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Amsterdam
region: Noord-Holland
country: NL
address: Oude Turfmarkt 127, 1012 GC Amsterdam
ghcid:
ghcid_current: NL-XX-XXX-M-APM-allard_pierson_museum
ghcid_current: NL-NH-AMS-M-APM-allard_pierson_museum
ghcid_original: NL-XX-XXX-M-APM-allard_pierson_museum
ghcid_uuid: 48392b8c-36a8-53a6-a03d-271eaf106bb4
ghcid_uuid_sha256: e9b99db6-f222-8dc7-bbbe-18ed689ad863
ghcid_numeric: 16841665690600619463
ghcid_uuid: 0ec4308c-0f54-5138-99c6-809709534df8
ghcid_uuid_sha256: 4f66990b-cd99-83cd-9376-1c589d0ffbb1
ghcid_numeric: 5721428652593849293
record_id: ffc23f4f-a760-406f-b103-46f70b81736a
generation_timestamp: '2025-12-16T21:06:45.602986+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-APM-allard_pierson_museum
ghcid_numeric: 16841665690600619463
valid_from: '2025-12-17T08:44:25.980684+00:00'
valid_to: null
valid_to: '2025-12-17T09:17:11.075305+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-NH-AMS-M-APM-allard_pierson_museum
ghcid_numeric: 5721428652593849293
valid_from: '2025-12-17T09:17:11.075305+00:00'
valid_to: null
reason: Location enriched via Exa web search - Amsterdam, Noord-Holland
location_resolution:
method: EXA_WEB_SEARCH
city_code: AMS
city_name: Amsterdam
region_code: NH
region_name: Noord-Holland
country_code: NL
resolution_date: '2025-12-17T09:17:11.075305+00:00'
source_url: https://www.allardpierson.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:45.602986+00:00'
@ -51,6 +66,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.allardpierson.nl/
extraction_timestamp: '2025-12-17T09:17:11.075305+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -59,3 +83,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Amsterdam, Noord-Holland'

View file

@ -18,28 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/cow-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Amsterdam
region: Noord-Holland
country: NL
address: Leliegracht 4, Amsterdam
ghcid:
ghcid_current: NL-XX-XXX-M-CM-cow_museum
ghcid_current: NL-NH-AMS-M-CM-cow_museum
ghcid_original: NL-XX-XXX-M-CM-cow_museum
ghcid_uuid: 9590fe8f-c06b-55d1-a4f0-8a497b9a9d6b
ghcid_uuid_sha256: 7748f0fe-3dc8-8f0f-b5d3-6e253897ec97
ghcid_numeric: 8595384863585521423
ghcid_uuid: 552df309-3f63-5bce-ba90-0a8cd7fab8c1
ghcid_uuid_sha256: 6e182c92-bd64-882a-9adc-0a40d4b7ab19
ghcid_numeric: 7933139752367454250
record_id: 5ae057ba-c6a2-43d0-a200-1285bcb507a7
generation_timestamp: '2025-12-16T21:06:42.129981+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-CM-cow_museum
ghcid_numeric: 8595384863585521423
valid_from: '2025-12-16T21:06:42.129981+00:00'
valid_to: null
valid_to: '2025-12-17T09:17:11.106062+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-NH-AMS-M-CM-cow_museum
ghcid_numeric: 7933139752367454250
valid_from: '2025-12-17T09:17:11.106062+00:00'
valid_to: null
reason: Location enriched via Exa web search - Amsterdam, Noord-Holland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: AMS
city_name: Amsterdam
region_code: NH
region_name: Noord-Holland
country_code: NL
resolution_date: '2025-12-17T09:17:11.106062+00:00'
source_url: https://cowmuseum.amsterdam/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:42.129981+00:00'
@ -56,9 +66,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://cowmuseum.amsterdam/
extraction_timestamp: '2025-12-17T09:17:11.106062+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Amsterdam, Noord-Holland'

View file

@ -18,28 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/cacao-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Amsterdam
region: Noord-Holland
country: NL
address: Amsterdam, Netherlands
ghcid:
ghcid_current: NL-XX-XXX-M-CM
ghcid_current: NL-NH-AMS-M-CM
ghcid_original: NL-XX-XXX-M-CM
ghcid_uuid: 583583d1-81e1-58c3-93fb-c5e26dd537b3
ghcid_uuid_sha256: 9a9161f8-d0d2-8a31-a4cd-762b71024dc4
ghcid_numeric: 11137791074747161137
ghcid_uuid: 47f14c04-b37a-5e60-aa25-de2ed442b7ff
ghcid_uuid_sha256: 3411844b-6dbc-869e-8f22-4cc95c99b917
ghcid_numeric: 3751925424074823326
record_id: 1241c462-d2b4-4afa-b46c-e2b0f3c22fde
generation_timestamp: '2025-12-16T21:06:40.671775+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-CM
ghcid_numeric: 11137791074747161137
valid_from: '2025-12-16T21:06:40.671775+00:00'
valid_to: null
valid_to: '2025-12-17T09:25:04.103278+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-NH-AMS-M-CM
ghcid_numeric: 3751925424074823326
valid_from: '2025-12-17T09:25:04.103278+00:00'
valid_to: null
reason: Location enriched via Exa web search - Amsterdam, Noord-Holland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: AMS
city_name: Amsterdam
region_code: NH
region_name: Noord-Holland
country_code: NL
resolution_date: '2025-12-17T09:25:04.103278+00:00'
source_url: https://www.cacaomuseum.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:40.671775+00:00'
@ -56,9 +66,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.cacaomuseum.nl/
extraction_timestamp: '2025-12-17T09:25:04.103278+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Amsterdam, Noord-Holland'

View file

@ -21,23 +21,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/dutch-directors-guild-ddg.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Amsterdam
region: Noord-Holland
country: NL
address: De Lairessestraat 125 sous, 1075 HH Amsterdam
ghcid:
ghcid_current: NL-XX-XXX-M-DDGD
ghcid_current: NL-NH-AMS-M-DDGD
ghcid_original: NL-XX-XXX-M-DDGD
ghcid_uuid: dfc8ca6f-3c25-5bd7-9444-fc482affad31
ghcid_uuid_sha256: 122993cf-061a-8925-b3a5-bdb49e87451f
ghcid_numeric: 1308739684097775909
ghcid_uuid: 918191cb-0b1b-52ef-9f89-a91876e7b672
ghcid_uuid_sha256: a132b2a0-d58a-8be6-b7e0-bf41b618d481
ghcid_numeric: 11615542792789044198
record_id: e3bb2498-22cf-48e3-8d5c-9c5dc0c398bb
generation_timestamp: '2025-12-16T21:06:46.065288+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-DDGD
ghcid_numeric: 1308739684097775909
valid_from: '2025-12-17T08:44:26.036192+00:00'
valid_to: null
valid_to: '2025-12-17T10:06:03.451255+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-NH-AMS-M-DDGD
ghcid_numeric: 11615542792789044198
valid_from: '2025-12-17T10:06:03.451255+00:00'
valid_to: null
reason: Location enriched via Exa web search - Amsterdam, Noord-Holland
location_resolution:
method: EXA_WEB_SEARCH
city_code: AMS
city_name: Amsterdam
region_code: NH
region_name: Noord-Holland
country_code: NL
resolution_date: '2025-12-17T10:06:03.451255+00:00'
source_url: https://www.directorsguild.nl/contact/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:46.065288+00:00'
@ -54,6 +69,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.directorsguild.nl/contact/
extraction_timestamp: '2025-12-17T10:06:03.451255+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -62,3 +86,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Amsterdam, Noord-Holland'

View file

@ -18,28 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/eddie-the-eagle-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Amsterdam
region: Noord-Holland
country: NL
address: Amsterdam, Netherlands
ghcid:
ghcid_current: NL-XX-XXX-M-EEM
ghcid_current: NL-NH-AMS-M-EEM
ghcid_original: NL-XX-XXX-M-EEM
ghcid_uuid: 13bc9959-7082-5d46-bd65-825b494a755f
ghcid_uuid_sha256: 4a81f985-bb1f-8275-98cf-b5550faf1698
ghcid_numeric: 5368846583567700597
ghcid_uuid: ca8ffd5a-ae44-5a97-aa5f-b470c2889849
ghcid_uuid_sha256: 0b19894f-1307-8a1a-828a-3f5550e5bc6d
ghcid_numeric: 799821381549623834
record_id: bd5b97ab-2d9b-411a-87ef-4190f7694ea5
generation_timestamp: '2025-12-16T21:06:36.344748+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-EEM
ghcid_numeric: 5368846583567700597
valid_from: '2025-12-16T21:06:36.344748+00:00'
valid_to: null
valid_to: '2025-12-17T09:25:04.153616+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-NH-AMS-M-EEM
ghcid_numeric: 799821381549623834
valid_from: '2025-12-17T09:25:04.153616+00:00'
valid_to: null
reason: Location enriched via Exa web search - Amsterdam, Noord-Holland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: AMS
city_name: Amsterdam
region_code: NH
region_name: Noord-Holland
country_code: NL
resolution_date: '2025-12-17T09:25:04.153616+00:00'
source_url: https://www.eddie-the-eagle-museum.com/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:36.344748+00:00'
@ -56,9 +66,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.eddie-the-eagle-museum.com/
extraction_timestamp: '2025-12-17T09:25:04.153616+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Amsterdam, Noord-Holland'

View file

@ -18,28 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/erotisch-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Amsterdam
region: Noord-Holland
country: NL
address: Oudezijds Achterburgwal 54, 1012 DP Amsterdam
ghcid:
ghcid_current: NL-XX-XXX-M-EM-erotisch_museum
ghcid_current: NL-NH-AMS-M-EM-erotisch_museum
ghcid_original: NL-XX-XXX-M-EM-erotisch_museum
ghcid_uuid: bb4cab24-67cd-5fd5-81ae-a23a875e15b6
ghcid_uuid_sha256: 7e6cc478-4325-8950-8c84-78ec611b00c8
ghcid_numeric: 9109872167065323856
ghcid_uuid: 5f62f68f-f9fa-5300-8be8-e189faf31ea4
ghcid_uuid_sha256: fff75562-9984-8ad1-87c7-cf7a85b16c89
ghcid_numeric: 18444304680889912017
record_id: b35f2b5c-438b-4f53-98e8-6836db080fdd
generation_timestamp: '2025-12-16T21:06:40.810369+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-EM-erotisch_museum
ghcid_numeric: 9109872167065323856
valid_from: '2025-12-16T21:06:40.810369+00:00'
valid_to: null
valid_to: '2025-12-17T09:57:11.842394+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-NH-AMS-M-EM-erotisch_museum
ghcid_numeric: 18444304680889912017
valid_from: '2025-12-17T09:57:11.842394+00:00'
valid_to: null
reason: Location enriched via Exa web search - Amsterdam, Noord-Holland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: AMS
city_name: Amsterdam
region_code: NH
region_name: Noord-Holland
country_code: NL
resolution_date: '2025-12-17T09:57:11.842394+00:00'
source_url: https://erotisch-museum.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:40.810369+00:00'
@ -56,9 +66,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://erotisch-museum.nl/
extraction_timestamp: '2025-12-17T09:57:11.842394+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Amsterdam, Noord-Holland'

View file

@ -18,28 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/van-gogh-museum-enterprises-b-v.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Amsterdam
region: Noord-Holland
country: NL
address: Gabriel Metsustraat 8, 1071 EA Amsterdam
ghcid:
ghcid_current: NL-XX-XXX-M-GMEBV
ghcid_current: NL-NH-AMS-M-GMEBV
ghcid_original: NL-XX-XXX-M-GMEBV
ghcid_uuid: c08ffbd9-798e-5759-99a9-6ebcca90eaf6
ghcid_uuid_sha256: d180b682-bda8-8c1b-a59b-23d28bbadf01
ghcid_numeric: 15096266623589833755
ghcid_uuid: 37b64938-f8fa-5c8e-9491-518df13f6217
ghcid_uuid_sha256: b111cb97-78ad-8caa-88f5-ee638ae4422c
ghcid_numeric: 12759203070742441130
record_id: 1b2f11f2-09a6-4238-9ece-7ecf802f2bf6
generation_timestamp: '2025-12-16T21:06:39.631236+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-GMEBV
ghcid_numeric: 15096266623589833755
valid_from: '2025-12-16T21:06:39.631236+00:00'
valid_to: null
valid_to: '2025-12-17T09:57:11.875121+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-NH-AMS-M-GMEBV
ghcid_numeric: 12759203070742441130
valid_from: '2025-12-17T09:57:11.875121+00:00'
valid_to: null
reason: Location enriched via Exa web search - Amsterdam, Noord-Holland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: AMS
city_name: Amsterdam
region_code: NH
region_name: Noord-Holland
country_code: NL
resolution_date: '2025-12-17T09:57:11.875121+00:00'
source_url: https://www.vangoghmuseum.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:39.631236+00:00'
@ -56,9 +66,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.vangoghmuseum.nl/
extraction_timestamp: '2025-12-17T09:57:11.875121+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Amsterdam, Noord-Holland'

View file

@ -21,23 +21,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-galerie-rat.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Den Burg
region: Noord-Holland
country: NL
address: Burgwal 20, 1791 Den Burg, Texel
ghcid:
ghcid_current: NL-XX-XXX-M-MGR
ghcid_current: NL-NH-DEB-M-MGR
ghcid_original: NL-XX-XXX-M-MGR
ghcid_uuid: 429368ac-f508-5f8a-85c9-45a0e2f17ede
ghcid_uuid_sha256: e316442c-29a1-85ba-8466-96b219533229
ghcid_numeric: 16363341252565001658
ghcid_uuid: 181eacab-1bc0-5d2e-9cb3-323d13d4af7d
ghcid_uuid_sha256: 62635b76-af7b-8348-a0c4-3e807eb3c11b
ghcid_numeric: 7089610803719668552
record_id: ac9f145d-7a7a-433d-871a-3f2b6d08b418
generation_timestamp: '2025-12-16T21:06:38.459660+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MGR
ghcid_numeric: 16363341252565001658
valid_from: '2025-12-17T08:44:26.014402+00:00'
valid_to: null
valid_to: '2025-12-17T10:56:09.466192+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-NH-DEB-M-MGR
ghcid_numeric: 7089610803719668552
valid_from: '2025-12-17T10:56:09.466192+00:00'
valid_to: null
reason: Location enriched via Exa web search - Den Burg, Noord-Holland
location_resolution:
method: EXA_WEB_SEARCH
city_code: DEB
city_name: Den Burg
region_code: NH
region_name: Noord-Holland
country_code: NL
resolution_date: '2025-12-17T10:56:09.466192+00:00'
source_url: https://www.mapquest.com/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:38.459660+00:00'
@ -54,6 +69,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.mapquest.com/
extraction_timestamp: '2025-12-17T10:56:09.466192+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -62,3 +86,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Den Burg, Noord-Holland'

View file

@ -18,28 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/flessenscheepjes-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Enkhuizen
region: Noord-Holland
country: NL
address: Zuiderspui 1, 1601 GN Enkhuizen
ghcid:
ghcid_current: NL-XX-XXX-M-FM-flessenscheepjes_museum
ghcid_current: NL-NH-ENK-M-FM-flessenscheepjes_museum
ghcid_original: NL-XX-XXX-M-FM-flessenscheepjes_museum
ghcid_uuid: 97c8b3fd-d3fe-56f8-8347-200b16870c5a
ghcid_uuid_sha256: ac6ed3ce-d178-8ef6-aa8b-809066354c99
ghcid_numeric: 12425101307192844022
ghcid_uuid: 9b9ca02a-fdf6-56d6-b459-606fbeb06876
ghcid_uuid_sha256: 200ffdc7-eea1-8fc5-8519-21259ce03cf6
ghcid_numeric: 2310344169008254917
record_id: cbc2e3fa-e583-45d3-a439-9068faa1ad07
generation_timestamp: '2025-12-16T21:06:40.027467+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-FM-flessenscheepjes_museum
ghcid_numeric: 12425101307192844022
valid_from: '2025-12-16T21:06:40.027467+00:00'
valid_to: null
valid_to: '2025-12-17T09:25:04.131278+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-NH-ENK-M-FM-flessenscheepjes_museum
ghcid_numeric: 2310344169008254917
valid_from: '2025-12-17T09:25:04.131278+00:00'
valid_to: null
reason: Location enriched via Exa web search - Enkhuizen, Noord-Holland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: ENK
city_name: Enkhuizen
region_code: NH
region_name: Noord-Holland
country_code: NL
resolution_date: '2025-12-17T09:25:04.131278+00:00'
source_url: https://www.flessenscheepjesmuseum.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:40.027467+00:00'
@ -56,9 +66,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.flessenscheepjesmuseum.nl/
extraction_timestamp: '2025-12-17T09:25:04.131278+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Enkhuizen, Noord-Holland'

View file

@ -18,28 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/c1000-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Hellendoorn
region: Overijssel
country: NL
address: Katenhorstweg 2, 7447 RN Hellendoorn
ghcid:
ghcid_current: NL-XX-XXX-M-CM-c1000_museum
ghcid_current: NL-OV-HEL-M-CM-c1000_museum
ghcid_original: NL-XX-XXX-M-CM-c1000_museum
ghcid_uuid: 1ca741dc-8ca8-5e04-a15f-3d12fbcef31d
ghcid_uuid_sha256: b92c4170-b851-81f2-8aa3-b844c73b7104
ghcid_numeric: 13343111748376396274
ghcid_uuid: 23ca8d87-2929-59d3-a67d-9a70e3a30e81
ghcid_uuid_sha256: 13ca320c-b106-895a-a8a2-7cf17e62a22b
ghcid_numeric: 1426007262107171162
record_id: 1428fc8d-3fcf-41a4-b9cf-f000c2cee234
generation_timestamp: '2025-12-16T21:06:46.862264+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-CM-c1000_museum
ghcid_numeric: 13343111748376396274
valid_from: '2025-12-16T21:06:46.862264+00:00'
valid_to: null
valid_to: '2025-12-17T09:33:15.763818+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-OV-HEL-M-CM-c1000_museum
ghcid_numeric: 1426007262107171162
valid_from: '2025-12-17T09:33:15.763818+00:00'
valid_to: null
reason: Location enriched via Exa web search - Hellendoorn, Overijssel
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: HEL
city_name: Hellendoorn
region_code: OV
region_name: Overijssel
country_code: NL
resolution_date: '2025-12-17T09:33:15.763818+00:00'
source_url: https://www.c1000museum.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:46.862264+00:00'
@ -56,9 +66,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.c1000museum.nl/
extraction_timestamp: '2025-12-17T09:33:15.763818+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Hellendoorn, Overijssel'

View file

@ -18,23 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-erve-hofman.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Hellendoorn
region: Overijssel
country: NL
address: Hofmanstraat 2, 7447 AS Hellendoorn
ghcid:
ghcid_current: NL-XX-XXX-M-MEH
ghcid_current: NL-OV-HEL-M-MEH
ghcid_original: NL-XX-XXX-M-MEH
ghcid_uuid: c3289aa2-9498-57b6-b7f9-02e6680143b4
ghcid_uuid_sha256: fb2403ee-f058-84a1-8014-d00e7832e0fc
ghcid_numeric: 18096593527450940577
ghcid_uuid: c48fb5e7-c5cc-544d-a446-cbaf423febc6
ghcid_uuid_sha256: f974021f-883b-8c6c-bb8f-18ee6041b691
ghcid_numeric: 17974994347195337836
record_id: aafc1760-7283-4a18-8aea-ff347f86bcd5
generation_timestamp: '2025-12-16T21:06:37.002843+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MEH
ghcid_numeric: 18096593527450940577
valid_from: '2025-12-17T08:44:26.021837+00:00'
valid_to: null
valid_to: '2025-12-17T10:41:01.945830+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-OV-HEL-M-MEH
ghcid_numeric: 17974994347195337836
valid_from: '2025-12-17T10:41:01.945830+00:00'
valid_to: null
reason: Location enriched via Exa web search - Hellendoorn, Overijssel
location_resolution:
method: EXA_WEB_SEARCH
city_code: HEL
city_name: Hellendoorn
region_code: OV
region_name: Overijssel
country_code: NL
resolution_date: '2025-12-17T10:41:01.945830+00:00'
source_url: https://www.oaldheldern.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:37.002843+00:00'
@ -51,6 +66,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.oaldheldern.nl/
extraction_timestamp: '2025-12-17T10:41:01.945830+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -59,3 +83,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Hellendoorn, Overijssel'

View file

@ -21,23 +21,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-averlo-frieswijk-schalkhaar.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Schalkhaar
region: Overijssel
country: NL
address: Frieswijkerweg 7, 7433 RB Schalkhaar
ghcid:
ghcid_current: NL-XX-XXX-M-MAFS
ghcid_current: NL-OV-SCK-M-MAFS
ghcid_original: NL-XX-XXX-M-MAFS
ghcid_uuid: a0095672-8164-5162-bbeb-1326d8c47c2e
ghcid_uuid_sha256: 418c9005-af59-8721-aff7-09d320097bdc
ghcid_numeric: 4723308463295911713
ghcid_uuid: 55dcd23c-2bde-524f-b846-b568f848c489
ghcid_uuid_sha256: 65d5a742-39a5-845f-81de-a188a31e9760
ghcid_numeric: 7337955070746895455
record_id: cf3d0896-1d2a-49d1-94a2-a5962c995d76
generation_timestamp: '2025-12-16T21:06:45.760441+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MAFS
ghcid_numeric: 4723308463295911713
valid_from: '2025-12-17T08:44:26.071718+00:00'
valid_to: null
valid_to: '2025-12-17T10:56:09.477040+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-OV-SCK-M-MAFS
ghcid_numeric: 7337955070746895455
valid_from: '2025-12-17T10:56:09.477040+00:00'
valid_to: null
reason: Location enriched via Exa web search - Schalkhaar, Overijssel
location_resolution:
method: EXA_WEB_SEARCH
city_code: SCK
city_name: Schalkhaar
region_code: OV
region_name: Overijssel
country_code: NL
resolution_date: '2025-12-17T10:56:09.477040+00:00'
source_url: https://www.museum-afs.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:45.760441+00:00'
@ -54,6 +69,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.museum-afs.nl/
extraction_timestamp: '2025-12-17T10:56:09.477040+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -62,3 +86,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Schalkhaar, Overijssel'

View file

@ -21,23 +21,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/museum-dijkmagazijn-de-heul.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Schalkwijk
region: Utrecht
country: NL
address: Provincialeweg 70, 3998 JK Schalkwijk
ghcid:
ghcid_current: NL-XX-XXX-M-MDH
ghcid_current: NL-UT-SCH-M-MDH
ghcid_original: NL-XX-XXX-M-MDH
ghcid_uuid: 9a5ac714-4ad1-59f8-9262-72cc6d85edeb
ghcid_uuid_sha256: f2883142-38d3-88b3-87c8-2eaa798c5d80
ghcid_numeric: 17476272514502215859
ghcid_uuid: c9455255-b55c-5df9-ac7d-b2dcebd80af4
ghcid_uuid_sha256: 9fc9707e-9dda-8a57-b694-011dc66da335
ghcid_numeric: 11513857611465906775
record_id: a8300219-47d6-4b17-b9da-5e8e5e96ce49
generation_timestamp: '2025-12-16T21:06:45.958015+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MDH
ghcid_numeric: 17476272514502215859
valid_from: '2025-12-17T08:44:26.064337+00:00'
valid_to: null
valid_to: '2025-12-17T10:41:01.934919+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-UT-SCH-M-MDH
ghcid_numeric: 11513857611465906775
valid_from: '2025-12-17T10:41:01.934919+00:00'
valid_to: null
reason: Location enriched via Exa web search - Schalkwijk, Utrecht
location_resolution:
method: EXA_WEB_SEARCH
city_code: SCH
city_name: Schalkwijk
region_code: UT
region_name: Utrecht
country_code: NL
resolution_date: '2025-12-17T10:41:01.934919+00:00'
source_url: https://www.museuminschalkwijk.nl/contact.html
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:45.958015+00:00'
@ -54,6 +69,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.museuminschalkwijk.nl/contact.html
extraction_timestamp: '2025-12-17T10:41:01.934919+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -62,3 +86,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Schalkwijk, Utrecht'

View file

@ -1,64 +0,0 @@
custodian_name:
emic_name: Ajax Museum
emic_name_source: linkedin
institution_type:
- M
linkedin_enrichment:
linkedin_url: https://www.linkedin.com/company/ajax-museum
linkedin_slug: ajax-museum
industry: Museums, Historical Sites, and Zoos
website: https://lnkd.in/ezz5r9nF
follower_count: 5,707
staff_count: 1
heritage_staff_count: 0
heritage_staff: []
enrichment_timestamp: '2025-12-16T21:06:39.152742+00:00'
provenance:
source: linkedin_company_scrape
original_file: data/custodian/linkedin/ajax-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
country: NL
ghcid:
ghcid_current: NL-XX-XXX-M-AM
ghcid_original: NL-XX-XXX-M-AM
ghcid_uuid: 44e6a20c-0e8e-509c-a126-749148b44831
ghcid_uuid_sha256: f2392c8d-320f-8be0-9a91-2e8d6d706953
ghcid_numeric: 17454030815792942048
record_id: 1bbc810e-b76d-4d89-8351-99e851dae39f
generation_timestamp: '2025-12-16T21:06:39.152742+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-AM
ghcid_numeric: 17454030815792942048
valid_from: '2025-12-16T21:06:39.152742+00:00'
valid_to: null
reason: Initial GHCID assignment from LinkedIn batch import
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
country_code: NL
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:39.152742+00:00'
sources:
linkedin:
- source_type: linkedin_company_profile
data_tier: TIER_4_INFERRED
source_file: data/custodian/linkedin/ajax-museum.yaml
extraction_timestamp: '2025-12-16T21:06:39.152742+00:00'
claims_extracted:
- name
- industry
- location
- website
- staff_count
- heritage_staff
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'

View file

@ -1,64 +0,0 @@
custodian_name:
emic_name: Hollands Kaas Museum
emic_name_source: linkedin
institution_type:
- M
linkedin_enrichment:
linkedin_url: https://www.linkedin.com/company/hollands-kaas-museum
linkedin_slug: hollands-kaas-museum
industry: Museums, Historical Sites, and Zoos
website: null
follower_count: 5,618,343
staff_count: 1
heritage_staff_count: 0
heritage_staff: []
enrichment_timestamp: '2025-12-16T21:06:41.385742+00:00'
provenance:
source: linkedin_company_scrape
original_file: data/custodian/linkedin/hollands-kaas-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
country: NL
ghcid:
ghcid_current: NL-XX-XXX-M-HKM
ghcid_original: NL-XX-XXX-M-HKM
ghcid_uuid: b936825d-f6f5-5f75-9693-59ffe0983b6e
ghcid_uuid_sha256: bfd2770a-451a-80ec-8095-a0315a56c4eb
ghcid_numeric: 13822241092346228972
record_id: b51e4aff-c0c7-40da-b064-3f308125b650
generation_timestamp: '2025-12-16T21:06:41.385742+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-HKM
ghcid_numeric: 13822241092346228972
valid_from: '2025-12-16T21:06:41.385742+00:00'
valid_to: null
reason: Initial GHCID assignment from LinkedIn batch import
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
country_code: NL
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:41.385742+00:00'
sources:
linkedin:
- source_type: linkedin_company_profile
data_tier: TIER_4_INFERRED
source_file: data/custodian/linkedin/hollands-kaas-museum.yaml
extraction_timestamp: '2025-12-16T21:06:41.385742+00:00'
claims_extracted:
- name
- industry
- location
- website
- staff_count
- heritage_staff
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'

View file

@ -18,28 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/het-kaas-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Bodegraven
region: Zuid-Holland
country: NL
address: Marktstraat 1, 2411 BE Bodegraven
ghcid:
ghcid_current: NL-XX-XXX-M-KM
ghcid_current: NL-ZH-BOD-M-KM
ghcid_original: NL-XX-XXX-M-KM
ghcid_uuid: 7846a559-0853-5206-b4b9-466f633e8f37
ghcid_uuid_sha256: cb461ec7-5b09-808f-9bc2-1c805d2534dd
ghcid_numeric: 14647428679643816079
ghcid_uuid: a36d9483-7116-5e2a-97fe-182b3517f1ef
ghcid_uuid_sha256: 2c53af73-8d88-8c89-b68d-cc86cc3491bb
ghcid_numeric: 3194089471566462089
record_id: 91fffa60-6f4f-4cb9-9579-385811b087a6
generation_timestamp: '2025-12-16T21:06:36.526045+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-KM
ghcid_numeric: 14647428679643816079
valid_from: '2025-12-16T21:06:36.526045+00:00'
valid_to: null
valid_to: '2025-12-17T10:08:05.564102+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-ZH-BOD-M-KM
ghcid_numeric: 3194089471566462089
valid_from: '2025-12-17T10:08:05.564102+00:00'
valid_to: null
reason: Location enriched via Exa web search - Bodegraven, Zuid-Holland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: BOD
city_name: Bodegraven
region_code: ZH
region_name: Zuid-Holland
country_code: NL
resolution_date: '2025-12-17T10:08:05.564102+00:00'
source_url: https://www.hetkaasmuseum.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:36.526045+00:00'
@ -56,9 +66,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.hetkaasmuseum.nl/
extraction_timestamp: '2025-12-17T10:08:05.564102+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Bodegraven, Zuid-Holland'

View file

@ -16,12 +16,14 @@ linkedin_enrichment:
headline: ''
heritage_type: M
- name: Oumaima Hajri أميمة حاجري
headline: AI Ethicist/Data Scientist | Sr. International Advisor, Dutch DPA | AI Ethics & Society, University of Cambridge
| Elsevier 30 under 30 | Public speaker
headline: AI Ethicist/Data Scientist | Sr. International Advisor, Dutch DPA |
AI Ethics & Society, University of Cambridge | Elsevier 30 under 30 | Public
speaker
linkedin_url: https://www.linkedin.com/in/oumaima-hajri
heritage_type: S
- name: Annemarie Bloemen-Patberg
headline: Senior Strategic Advisor AI law @Dutch DCA / AI for Business @Oxford University Saïd Business School
headline: Senior Strategic Advisor AI law @Dutch DCA / AI for Business @Oxford
University Saïd Business School
linkedin_url: https://www.linkedin.com/in/annemarie-bloemen-patberg-2203bb5
heritage_type: E
- name: Mies Beljaars - Snellen van Vollenhoven
@ -60,7 +62,8 @@ linkedin_enrichment:
linkedin_url: https://www.linkedin.com/in/samira-farouk-0b2520b6
heritage_type: R
- name: Emma-Julia Vuijk
headline: Double Master Student Law and Technology & International and European Law - Werkstudent Autoriteit Persoonsgegevens
headline: Double Master Student Law and Technology & International and European
Law - Werkstudent Autoriteit Persoonsgegevens
linkedin_url: https://www.linkedin.com/in/emmajuliavuijk
heritage_type: E
- name: Melike Yeniay
@ -140,7 +143,8 @@ linkedin_enrichment:
linkedin_url: https://www.linkedin.com/in/mr-anne-loes-van-den-brand-valk-73583557
heritage_type: R
- name: Ruqaya Zaki
headline: BSc Law Student at Leiden University | Judicial supporter @ Dutch Data Protection Authority
headline: BSc Law Student at Leiden University | Judicial supporter @ Dutch Data
Protection Authority
linkedin_url: https://www.linkedin.com/in/ruqaya-zaki
heritage_type: E
- name: Tessa van Wickevoort Crommelin-van Velzen
@ -160,28 +164,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/autoriteit-persoonsgegevens-ap-dutch-dpa.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Den Haag
region: Zuid-Holland
country: NL
address: Bezuidenhoutseweg 30, 2594 AV Den Haag
ghcid:
ghcid_current: NL-XX-XXX-E-APADD
ghcid_current: NL-ZH-DHA-E-APADD
ghcid_original: NL-XX-XXX-E-APADD
ghcid_uuid: d5ccddb8-f488-5cf8-98e1-bd7343c9a0fd
ghcid_uuid_sha256: c6433cf3-6117-812b-ae67-99c499d6eb42
ghcid_numeric: 14286329458952962347
ghcid_uuid: 370a68bc-969d-53a9-811a-ee3b1d2ef12b
ghcid_uuid_sha256: 237c1e87-0bb9-8f2e-a36d-36d950cbe0a3
ghcid_numeric: 2556952253805727534
record_id: acf11347-204a-40e8-8098-6ad5ea33e6f3
generation_timestamp: '2025-12-16T21:06:42.375776+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-E-APADD
ghcid_numeric: 14286329458952962347
valid_from: '2025-12-16T21:06:42.375776+00:00'
valid_to: null
valid_to: '2025-12-17T09:57:11.740348+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-ZH-DHA-E-APADD
ghcid_numeric: 2556952253805727534
valid_from: '2025-12-17T09:57:11.740348+00:00'
valid_to: null
reason: Location enriched via Exa web search - Den Haag, Zuid-Holland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: DHA
city_name: Den Haag
region_code: ZH
region_name: Zuid-Holland
country_code: NL
resolution_date: '2025-12-17T09:57:11.740348+00:00'
source_url: https://autoriteitpersoonsgegevens.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:42.375776+00:00'
@ -198,9 +212,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://autoriteitpersoonsgegevens.nl/
extraction_timestamp: '2025-12-17T09:57:11.740348+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Den Haag, Zuid-Holland'

View file

@ -35,7 +35,8 @@ linkedin_enrichment:
linkedin_url: https://www.linkedin.com/in/anna-holties-771939152
heritage_type: M
- name: Lianne Sleutjes
headline: Woordvoerder Inspectie Justitie en Veiligheid at Ministerie van Justitie en Veiligheid
headline: Woordvoerder Inspectie Justitie en Veiligheid at Ministerie van Justitie
en Veiligheid
linkedin_url: https://www.linkedin.com/in/lianne-sleutjes-a805338
heritage_type: O
- name: Daman Jafra
@ -43,7 +44,8 @@ linkedin_enrichment:
linkedin_url: https://www.linkedin.com/in/daman-jafra-32b1bb236
heritage_type: E
- name: Selene Fagel
headline: Coördinerend specialistisch inspecteur (PhD) Familie inspecteur Vertrouwenspersoon Inspectie Justitie en Veiligheid
headline: Coördinerend specialistisch inspecteur (PhD) Familie inspecteur Vertrouwenspersoon
Inspectie Justitie en Veiligheid
linkedin_url: https://www.linkedin.com/in/selene-fagel-1287645
heritage_type: E
- name: Kirsten Van Noort
@ -51,7 +53,8 @@ linkedin_enrichment:
linkedin_url: https://www.linkedin.com/in/kirsten-van-noort-4385812b7
heritage_type: M
- name: Wieger van der Heide
headline: Senior inspecteur bij het Ministerie van VenJ / Inspectie Veiligheid en Justitie
headline: Senior inspecteur bij het Ministerie van VenJ / Inspectie Veiligheid
en Justitie
linkedin_url: https://www.linkedin.com/in/wieger-van-der-heide-91857478
heritage_type: O
- name: Madhu R.
@ -72,28 +75,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/inspectie-justitie-en-veiligheid.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Den Haag
region: Zuid-Holland
country: NL
address: Hoge Nieuwstraat 8, 2514 EL Den Haag
ghcid:
ghcid_current: NL-XX-XXX-E-IJV
ghcid_current: NL-ZH-DHA-E-IJV
ghcid_original: NL-XX-XXX-E-IJV
ghcid_uuid: f17d729f-be1d-544f-846f-893be1caf69c
ghcid_uuid_sha256: 236e4c6e-f6da-885a-9aa7-2c0be92d8a2a
ghcid_numeric: 2553062078237304922
ghcid_uuid: f7fc9fb4-146c-51e9-a5e8-c8fea8c7c34c
ghcid_uuid_sha256: 1251c359-7bcf-8c90-8a5e-1718a915cafe
ghcid_numeric: 1320050954892979344
record_id: 4c890a50-1738-45fd-b173-a38eb7fffa3f
generation_timestamp: '2025-12-16T21:06:47.359970+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-E-IJV
ghcid_numeric: 2553062078237304922
valid_from: '2025-12-16T21:06:47.359970+00:00'
valid_to: null
valid_to: '2025-12-17T09:57:11.821589+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-ZH-DHA-E-IJV
ghcid_numeric: 1320050954892979344
valid_from: '2025-12-17T09:57:11.821589+00:00'
valid_to: null
reason: Location enriched via Exa web search - Den Haag, Zuid-Holland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: DHA
city_name: Den Haag
region_code: ZH
region_name: Zuid-Holland
country_code: NL
resolution_date: '2025-12-17T09:57:11.821589+00:00'
source_url: https://www.inspectie-jenv.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:47.359970+00:00'
@ -110,9 +123,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.inspectie-jenv.nl/
extraction_timestamp: '2025-12-17T09:57:11.821589+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Den Haag, Zuid-Holland'

View file

@ -30,7 +30,8 @@ linkedin_enrichment:
linkedin_url: https://www.linkedin.com/in/pieter-bots-78140a22
heritage_type: E
- name: Annet Pasveer
headline: Architectuurhistoricus | Senior beleidsadviseur Monumenten & Archeologie bij Raad voor Cultuur
headline: Architectuurhistoricus | Senior beleidsadviseur Monumenten & Archeologie
bij Raad voor Cultuur
linkedin_url: https://www.linkedin.com/in/annetpasveer
heritage_type: R
- name: Mirjam Sneeuwloper
@ -38,7 +39,8 @@ linkedin_enrichment:
linkedin_url: https://www.linkedin.com/in/mirjamsneeuwloper
heritage_type: M
- name: Kiran Sukul
headline: Sr. beleidsadviseur musea Raad voor Cultuur • Kunst, Cultuur & Erfgoed • Governance
headline: Sr. beleidsadviseur musea Raad voor Cultuur • Kunst, Cultuur & Erfgoed
• Governance
linkedin_url: https://www.linkedin.com/in/sukul13881
heritage_type: M
- name: Ronald Nijboer
@ -55,7 +57,8 @@ linkedin_enrichment:
linkedin_url: https://www.linkedin.com/in/philippine-jenster-15567156
heritage_type: L
- name: Kirsten de Graaff
headline: Specialist/adviseur cultuureducatie, cultuurmakers en woordkunst. Beleidsmatig, strategisch en uitvoerend.
headline: Specialist/adviseur cultuureducatie, cultuurmakers en woordkunst. Beleidsmatig,
strategisch en uitvoerend.
linkedin_url: https://www.linkedin.com/in/kirsten-de-graaff-76467a2
heritage_type: E
- name: Marc de Beyer
@ -66,7 +69,8 @@ linkedin_enrichment:
headline: Bureau of the Dutch Council for Culture, The Hague
linkedin_url: https://www.linkedin.com/in/marieke-van-ommeren-b6206438
- name: Christien Bok
headline: creatieve initiator van innovaties voor veilig en toegankelijk onderwijs van hoge kwaliteit
headline: creatieve initiator van innovaties voor veilig en toegankelijk onderwijs
van hoge kwaliteit
linkedin_url: https://www.linkedin.com/in/christienbok
heritage_type: E
- name: Lejo Schenk
@ -84,7 +88,8 @@ linkedin_enrichment:
linkedin_url: https://www.linkedin.com/in/evakleeman
heritage_type: M
- name: Judi Mesman
headline: Distinguished professor societal responsibility and impact, author of Leadership in color (Leiderschap in kleur)
headline: Distinguished professor societal responsibility and impact, author of
Leadership in color (Leiderschap in kleur)
linkedin_url: https://www.linkedin.com/in/judi-mesman-64279b140
heritage_type: E
- name: Madelon Van Wandelen
@ -125,7 +130,8 @@ linkedin_enrichment:
linkedin_url: https://www.linkedin.com/in/gwen-parry-7923ba13b
heritage_type: R
- name: Femke van Hest
headline: Strategisch adviseur cultuur gemeente s-Hertogenbosch | Adviseur Raad voor Cultuur
headline: Strategisch adviseur cultuur gemeente s-Hertogenbosch | Adviseur Raad
voor Cultuur
linkedin_url: https://www.linkedin.com/in/femkevanhest
heritage_type: O
- name: Ellen Hardy
@ -146,8 +152,8 @@ linkedin_enrichment:
headline: directeur-bestuurder bij Cultuurkwadraat / directeur Cultuurconsortium
linkedin_url: https://www.linkedin.com/in/josje-de-regt-0a018b52
- name: Wilbert Mutsaers
headline: CEO/Algemeen Directeur Agents After All | Kroonlid Raad voor Cultuur | ex-Spotify, Mojo/Live Nation, NPO 3FM/FunX,
Universal Music & Sony Music
headline: CEO/Algemeen Directeur Agents After All | Kroonlid Raad voor Cultuur
| ex-Spotify, Mojo/Live Nation, NPO 3FM/FunX, Universal Music & Sony Music
linkedin_url: https://www.linkedin.com/in/wilbert-mutsaers-22256a
- name: Lies Wijnterp PhD
headline: Researcher, cultural manager
@ -160,11 +166,13 @@ linkedin_enrichment:
headline: Zakelijk leider in de culturele sector | Lerende bij De Metselarij
linkedin_url: https://www.linkedin.com/in/elisiape%C3%A7as
- name: Otto Berg
headline: Auditor bij CBCT - Certificeringsorganisatie Bibliotheekwerk, Cultuur en Taal
headline: Auditor bij CBCT - Certificeringsorganisatie Bibliotheekwerk, Cultuur
en Taal
linkedin_url: https://www.linkedin.com/in/otto-berg-30686410
heritage_type: L
- name: Vanessa Lann
headline: Head of Composition for Film and Theatre, at ArtEZ University of the Arts
headline: Head of Composition for Film and Theatre, at ArtEZ University of the
Arts
linkedin_url: https://www.linkedin.com/in/vanessa-lann-a51724276
heritage_type: E
- name: Ilonka Kolthof
@ -179,7 +187,8 @@ linkedin_enrichment:
linkedin_url: https://www.linkedin.com/in/joepvossebeld
heritage_type: M
- name: Edo Righini
headline: Director of de Doelen; Crown Member of the Dutch Council for Culture. EMBA HEC Paris
headline: Director of de Doelen; Crown Member of the Dutch Council for Culture.
EMBA HEC Paris
linkedin_url: https://www.linkedin.com/in/edo-righini-7b303129
- name: Marc Versteeg
headline: voor de culturele sector
@ -254,28 +263,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/raad-voor-cultuur.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Den Haag
region: Zuid-Holland
country: NL
address: Prins Willem Alexanderhof 20, 2595 BE Den Haag
ghcid:
ghcid_current: NL-XX-XXX-E-RC
ghcid_current: NL-ZH-DHA-E-RC
ghcid_original: NL-XX-XXX-E-RC
ghcid_uuid: de024a36-abbe-53d1-b137-9a2518941e40
ghcid_uuid_sha256: dcb4a290-0d37-8352-a637-cacbd563e68f
ghcid_numeric: 15903514923732800338
ghcid_uuid: 1c22d683-b4d0-5fc4-afec-ad60ef82995f
ghcid_uuid_sha256: 7d58a26e-c63a-8a79-8643-8117147c1b10
ghcid_numeric: 9032147649347328633
record_id: 716f5d24-baa6-409c-a708-9c07f0241568
generation_timestamp: '2025-12-16T21:06:42.755062+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-E-RC
ghcid_numeric: 15903514923732800338
valid_from: '2025-12-16T21:06:42.755062+00:00'
valid_to: null
valid_to: '2025-12-17T09:57:11.795457+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-ZH-DHA-E-RC
ghcid_numeric: 9032147649347328633
valid_from: '2025-12-17T09:57:11.795457+00:00'
valid_to: null
reason: Location enriched via Exa web search - Den Haag, Zuid-Holland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: DHA
city_name: Den Haag
region_code: ZH
region_name: Zuid-Holland
country_code: NL
resolution_date: '2025-12-17T09:57:11.795457+00:00'
source_url: https://raadvoorcultuur.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:42.755062+00:00'
@ -292,9 +311,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://raadvoorcultuur.nl/
extraction_timestamp: '2025-12-17T09:57:11.795457+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Den Haag, Zuid-Holland'

View file

@ -28,7 +28,8 @@ linkedin_enrichment:
linkedin_url: https://www.linkedin.com/in/jannekevanasperen
heritage_type: M
- name: Tamar van Riessen
headline: PhD-Candidate Art History & Visual Culture | Curator 17th-Century Paintings | Art Historian
headline: PhD-Candidate Art History & Visual Culture | Curator 17th-Century Paintings
| Art Historian
linkedin_url: https://www.linkedin.com/in/tamar-van-riessen-959248177
heritage_type: M
- name: Femke van Leeuwen-Jansen
@ -44,28 +45,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/codart.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Den Haag
region: Zuid-Holland
country: NL
address: Koninginnegracht 15, 2514 AB Den Haag
ghcid:
ghcid_current: NL-XX-XXX-M-C-codart
ghcid_current: NL-ZH-DHA-M-C-codart
ghcid_original: NL-XX-XXX-M-C-codart
ghcid_uuid: 8cf5c2ed-b8fe-523f-89e5-e996147f2943
ghcid_uuid_sha256: 9d0c059d-9b9e-8965-9c08-705eef0556ee
ghcid_numeric: 11316426138154068325
ghcid_uuid: 932104a7-6e72-57c3-a4d1-f3c6218f277e
ghcid_uuid_sha256: 11cb3492-54b6-8435-ab01-46e83370b99f
ghcid_numeric: 1282176322008179765
record_id: 0afab665-c130-445d-a2e0-4c79f1ed1759
generation_timestamp: '2025-12-16T21:06:44.128751+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-C-codart
ghcid_numeric: 11316426138154068325
valid_from: '2025-12-16T21:06:44.128751+00:00'
valid_to: null
valid_to: '2025-12-17T09:33:15.796477+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-ZH-DHA-M-C-codart
ghcid_numeric: 1282176322008179765
valid_from: '2025-12-17T09:33:15.796477+00:00'
valid_to: null
reason: Location enriched via Exa web search - Den Haag, Zuid-Holland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: DHA
city_name: Den Haag
region_code: ZH
region_name: Zuid-Holland
country_code: NL
resolution_date: '2025-12-17T09:33:15.796477+00:00'
source_url: https://www.codart.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:44.128751+00:00'
@ -82,9 +93,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.codart.nl/
extraction_timestamp: '2025-12-17T09:33:15.796477+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Den Haag, Zuid-Holland'

View file

@ -103,23 +103,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/cultuurschakel.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Den Haag
region: Zuid-Holland
country: NL
address: Den Haag, Netherlands
ghcid:
ghcid_current: NL-XX-XXX-M-C
ghcid_current: NL-ZH-DHA-M-C
ghcid_original: NL-XX-XXX-M-C
ghcid_uuid: a1434c1d-73d9-5630-a0b7-2f2a87bdcdec
ghcid_uuid_sha256: e5e6fc5b-a7f2-8ea3-8625-eeab02101858
ghcid_numeric: 16566205749918891683
ghcid_uuid: 65f3c875-a26d-5d51-acdb-f534a44aeecc
ghcid_uuid_sha256: 426f1a86-c2b5-8d14-a981-320d7e1211da
ghcid_numeric: 4787074095012740372
record_id: a97e198d-10ca-4595-9b13-073a8b741f07
generation_timestamp: '2025-12-16T21:06:38.738101+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-C
ghcid_numeric: 16566205749918891683
valid_from: '2025-12-17T08:44:25.966336+00:00'
valid_to: null
valid_to: '2025-12-17T09:33:15.877796+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-ZH-DHA-M-C
ghcid_numeric: 4787074095012740372
valid_from: '2025-12-17T09:33:15.877796+00:00'
valid_to: null
reason: Location enriched via Exa web search - Den Haag, Zuid-Holland
location_resolution:
method: EXA_WEB_SEARCH
city_code: DHA
city_name: Den Haag
region_code: ZH
region_name: Zuid-Holland
country_code: NL
resolution_date: '2025-12-17T09:33:15.877796+00:00'
source_url: https://www.cultuurschakel.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:38.738101+00:00'
@ -136,6 +151,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.cultuurschakel.nl/
extraction_timestamp: '2025-12-17T09:33:15.877796+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -144,3 +168,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Den Haag, Zuid-Holland'

View file

@ -18,28 +18,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/kresse-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Gouda
region: Zuid-Holland
country: NL
address: Achter de Kerk 14, 2801 JX Gouda
ghcid:
ghcid_current: NL-XX-XXX-M-KM-kresse_museum
ghcid_current: NL-ZH-GOU-M-KM-kresse_museum
ghcid_original: NL-XX-XXX-M-KM-kresse_museum
ghcid_uuid: 6b8e1a47-41dc-5654-9b67-84c813805af0
ghcid_uuid_sha256: b0d1df63-6965-8cbe-8184-25ee491660e4
ghcid_numeric: 12741210438870891710
ghcid_uuid: f4fe63e9-9e94-5cbd-83b9-4744f5262e8b
ghcid_uuid_sha256: 8c0997a3-e194-8d2d-9beb-882f3022388a
ghcid_numeric: 10090763170220346669
record_id: 3c3397e5-cde3-4cd9-91e8-4b55728a88af
generation_timestamp: '2025-12-16T21:06:43.981200+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-KM-kresse_museum
ghcid_numeric: 12741210438870891710
valid_from: '2025-12-16T21:06:43.981200+00:00'
valid_to: null
valid_to: '2025-12-17T09:57:11.866127+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-ZH-GOU-M-KM-kresse_museum
ghcid_numeric: 10090763170220346669
valid_from: '2025-12-17T09:57:11.866127+00:00'
valid_to: null
reason: Location enriched via Exa web search - Gouda, Zuid-Holland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: GOU
city_name: Gouda
region_code: ZH
region_name: Zuid-Holland
country_code: NL
resolution_date: '2025-12-17T09:57:11.866127+00:00'
source_url: https://www.kressemuseum.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:43.981200+00:00'
@ -56,9 +66,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.kressemuseum.nl/
extraction_timestamp: '2025-12-17T09:57:11.866127+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Gouda, Zuid-Holland'

View file

@ -25,23 +25,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/atlantikwall-museum.yaml
schema_version: 1.0.0
location:
city: null
region: null
city: Noordwijk
region: Zuid-Holland
country: NL
address: Noordwijk, Zuid-Holland, Netherlands
ghcid:
ghcid_current: NL-XX-XXX-M-AM-atlantikwall_museum
ghcid_current: NL-ZH-NRW-M-AM-atlantikwall_museum
ghcid_original: NL-XX-XXX-M-AM-atlantikwall_museum
ghcid_uuid: 73fb4ed0-8683-5f49-871a-7e38d3876855
ghcid_uuid_sha256: 5203625a-77fe-8dd0-baa4-1154287a2d72
ghcid_numeric: 5909675276739976656
ghcid_uuid: af7ac4a4-ffe3-5edc-850f-b31a18c67b95
ghcid_uuid_sha256: 5b8a41e9-fc43-8522-80bc-a47b6e5ed0a6
ghcid_numeric: 6596157077453010210
record_id: e0c12c4e-b088-48d4-a5dd-d2a7a73ca493
generation_timestamp: '2025-12-16T21:06:41.396388+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-AM-atlantikwall_museum
ghcid_numeric: 5909675276739976656
valid_from: '2025-12-17T08:44:26.007952+00:00'
valid_to: null
valid_to: '2025-12-17T09:25:04.226988+00:00'
reason: Reverted incorrect LinkedIn HTML extraction - original XX-XXX restored
- ghcid: NL-ZH-NRW-M-AM-atlantikwall_museum
ghcid_numeric: 6596157077453010210
valid_from: '2025-12-17T09:25:04.226988+00:00'
valid_to: null
reason: Location enriched via Exa web search - Noordwijk, Zuid-Holland
location_resolution:
method: EXA_WEB_SEARCH
city_code: NRW
city_name: Noordwijk
region_code: ZH
region_name: Zuid-Holland
country_code: NL
resolution_date: '2025-12-17T09:25:04.226988+00:00'
source_url: https://www.atlantikwallmuseum.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:41.396388+00:00'
@ -58,6 +73,15 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://www.atlantikwallmuseum.nl/
extraction_timestamp: '2025-12-17T09:25:04.226988+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
@ -66,3 +90,4 @@ provenance:
- 'Location resolution method: UNRESOLVED'
- Reverted incorrect location enrichment on 2025-12-17 - LinkedIn HTML extraction
was extracting wrong company's data
- 'Location enriched on 2025-12-17 via Exa web search: Noordwijk, Zuid-Holland'

View file

@ -43,28 +43,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/moslim-archief.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Rotterdam
region: Zuid-Holland
country: NL
address: Rotterdam, Netherlands
ghcid:
ghcid_current: NL-XX-XXX-A-MA
ghcid_current: NL-ZH-ROT-A-MA
ghcid_original: NL-XX-XXX-A-MA
ghcid_uuid: 7215318b-74db-57ce-9921-09db2d6b8e20
ghcid_uuid_sha256: 50024496-b73c-85c0-938d-b77e01e80c85
ghcid_numeric: 5765245887097644480
ghcid_uuid: 18570ab5-c0f9-5bb4-8e4c-11e35cfdebb8
ghcid_uuid_sha256: 92b222fc-d21c-8eb2-8ff4-01400305db5e
ghcid_numeric: 10570549744644181682
record_id: d2e14ba0-55d0-42c1-b324-1b4b6226d836
generation_timestamp: '2025-12-16T21:06:36.624598+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-A-MA
ghcid_numeric: 5765245887097644480
valid_from: '2025-12-16T21:06:36.624598+00:00'
valid_to: null
valid_to: '2025-12-17T09:33:15.780333+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-ZH-ROT-A-MA
ghcid_numeric: 10570549744644181682
valid_from: '2025-12-17T09:33:15.780333+00:00'
valid_to: null
reason: Location enriched via Exa web search - Rotterdam, Zuid-Holland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: ROT
city_name: Rotterdam
region_code: ZH
region_name: Zuid-Holland
country_code: NL
resolution_date: '2025-12-17T09:33:15.780333+00:00'
source_url: https://moslimarchief.nl/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:36.624598+00:00'
@ -81,9 +91,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://moslimarchief.nl/
extraction_timestamp: '2025-12-17T09:33:15.780333+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Rotterdam, Zuid-Holland'

View file

@ -21,28 +21,38 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/dutch-pinball-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
city: Rotterdam
region: Zuid-Holland
country: NL
address: Voorhaven 17, 3024 RC Rotterdam
ghcid:
ghcid_current: NL-XX-XXX-M-DPM
ghcid_current: NL-ZH-ROT-M-DPM
ghcid_original: NL-XX-XXX-M-DPM
ghcid_uuid: 9c34f38d-168a-5223-9726-799e16461c0c
ghcid_uuid_sha256: 17c1e31a-45c8-8454-8fa4-db8bdc7d464f
ghcid_numeric: 1711899035356951636
ghcid_uuid: ec309d57-2eac-5371-a08c-360af34e4b08
ghcid_uuid_sha256: 43c4854a-6502-8ffb-a89c-bbf1b151ff96
ghcid_numeric: 4883174450545205243
record_id: 49ba58fd-88c2-47fb-9ca5-b2b694147e0e
generation_timestamp: '2025-12-16T21:06:43.663154+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-DPM
ghcid_numeric: 1711899035356951636
valid_from: '2025-12-16T21:06:43.663154+00:00'
valid_to: null
valid_to: '2025-12-17T09:17:11.091657+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: NL-ZH-ROT-M-DPM
ghcid_numeric: 4883174450545205243
valid_from: '2025-12-17T09:17:11.091657+00:00'
valid_to: null
reason: Location enriched via Exa web search - Rotterdam, Zuid-Holland
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
method: EXA_WEB_SEARCH
city_code: ROT
city_name: Rotterdam
region_code: ZH
region_name: Zuid-Holland
country_code: NL
resolution_date: '2025-12-17T09:17:11.091657+00:00'
source_url: https://dutchpinballmuseum.com/
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:43.663154+00:00'
@ -59,9 +69,19 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_3_CROWD_SOURCED
source_url: https://dutchpinballmuseum.com/
extraction_timestamp: '2025-12-17T09:17:11.091657+00:00'
claims_extracted:
- city
- region
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Location enriched on 2025-12-17 via Exa web search: Rotterdam, Zuid-Holland'

View file

@ -2,7 +2,7 @@ custodian_name:
emic_name: Municipality of Gaza
emic_name_source: linkedin
institution_type:
- M
- O
linkedin_enrichment:
linkedin_url: https://www.linkedin.com/company/municipality-of-gaza
linkedin_slug: municipality-of-gaza
@ -18,28 +18,40 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/municipality-of-gaza.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
country: NL
city: Gaza City
region: Gaza Strip
country: PS
ghcid:
ghcid_current: NL-XX-XXX-M-MG
ghcid_original: NL-XX-XXX-M-MG
ghcid_uuid: 38ca88a9-b9da-55b8-80b0-f3d223ceb625
ghcid_uuid_sha256: b26a272e-97aa-8e77-96fa-3a370b897bb8
ghcid_numeric: 12856131167348993655
ghcid_current: PS-GZ-GAZ-O-MG
ghcid_original: PS-GZ-GAZ-O-MG
ghcid_uuid: 2657ff95-d637-595a-88aa-541c7020c8d5
ghcid_uuid_sha256: 2e49ae93-4105-89bb-9667-65aac87ec550
ghcid_numeric: 3335388946518841787
record_id: ea2d7d50-d449-4959-9bf6-8f00e2bddeb1
generation_timestamp: '2025-12-16T21:06:41.550370+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-MG
ghcid_numeric: 12856131167348993655
valid_from: '2025-12-16T21:06:41.550370+00:00'
valid_to: null
valid_to: '2025-12-17T09:53:29.215327+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: PS-GZ-GAZ-O-MG
ghcid_numeric: 3335388946518841787
valid_from: '2025-12-17T09:53:29.215327+00:00'
valid_to: null
reason: 'Country code corrected: NL -> PS (Palestine). Location: Gaza City, Gaza
Strip'
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
country_code: NL
method: EXA_WEB_SEARCH
city_code: GAZ
city_name: Gaza City
region_code: GZ
region_name: Gaza Strip
country_code: PS
resolution_date: '2025-12-17T09:53:29.215327+00:00'
source_url: https://www.gaza-city.org
notes: Municipal government, founded 1898. Type corrected from M (Museum) to O
(Official Institution)
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:41.550370+00:00'
@ -56,9 +68,21 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_2_VERIFIED
source_url: https://www.gaza-city.org
extraction_timestamp: '2025-12-17T09:53:29.215327+00:00'
claims_extracted:
- country
- region
- city
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Country code corrected on 2025-12-17: NL was incorrect, institution is in Palestine
(PS)'

View file

@ -2,7 +2,7 @@ custodian_name:
emic_name: Ford museum
emic_name_source: linkedin
institution_type:
- M
- O
linkedin_enrichment:
linkedin_url: https://www.linkedin.com/company/ford-museum
linkedin_slug: ford-museum
@ -21,28 +21,40 @@ linkedin_enrichment:
original_file: data/custodian/linkedin/ford-museum.yaml
schema_version: 1.0.0
location:
city: ''
region: XX
country: NL
city: Grand Rapids
region: Michigan
country: US
address: 303 Pearl Street NW, Grand Rapids, MI 49504
ghcid:
ghcid_current: NL-XX-XXX-M-FM-ford_museum
ghcid_original: NL-XX-XXX-M-FM-ford_museum
ghcid_uuid: 4be08f3c-3b19-5820-9dc7-522026b51400
ghcid_uuid_sha256: cf7d92d4-f6c8-81d0-a27d-128f93e68e0b
ghcid_numeric: 14951267781310939600
ghcid_current: US-MI-GRA-O-FM-ford_museum
ghcid_original: US-MI-GRA-O-FM-ford_museum
ghcid_uuid: fb8e145b-80ae-5b68-9fbd-93240bd798a9
ghcid_uuid_sha256: 8dd63d17-5941-825f-a6b4-872358ea25eb
ghcid_numeric: 10220423574854939231
record_id: 89d83d98-3121-4969-bd7e-a06f14edc812
generation_timestamp: '2025-12-16T21:06:42.087370+00:00'
ghcid_history:
- ghcid: NL-XX-XXX-M-FM-ford_museum
ghcid_numeric: 14951267781310939600
valid_from: '2025-12-16T21:06:42.087370+00:00'
valid_to: null
valid_to: '2025-12-17T09:41:23.620522+00:00'
reason: Initial GHCID assignment from LinkedIn batch import
- ghcid: US-MI-GRA-O-FM-ford_museum
ghcid_numeric: 10220423574854939231
valid_from: '2025-12-17T09:41:23.620522+00:00'
valid_to: null
reason: 'Country code corrected: NL -> US (United States). Location: Grand Rapids,
Michigan'
location_resolution:
method: UNRESOLVED
city_code: XXX
region_code: XX
country_code: NL
method: EXA_WEB_SEARCH
city_code: GRA
city_name: Grand Rapids
region_code: MI
region_name: Michigan
country_code: US
resolution_date: '2025-12-17T09:41:23.620522+00:00'
source_url: https://www.fordlibrarymuseum.gov/visit/museum
notes: Part of National Archives system, commemorates 38th US President
provenance:
schema_version: 1.0.0
generated_at: '2025-12-16T21:06:42.087370+00:00'
@ -59,9 +71,21 @@ provenance:
- website
- staff_count
- heritage_staff
web_search:
- source_type: exa_web_search
data_tier: TIER_2_VERIFIED
source_url: https://www.fordlibrarymuseum.gov/visit/museum
extraction_timestamp: '2025-12-17T09:41:23.620522+00:00'
claims_extracted:
- country
- region
- city
- address
data_tier_summary:
TIER_4_INFERRED:
- linkedin_company_profile
notes:
- Created from unmatched LinkedIn company profile
- 'Location resolution method: UNRESOLVED'
- 'Country code corrected on 2025-12-17: NL was incorrect, institution is in United
States (US)'

View file

@ -1,5 +1,5 @@
{
"generated": "2025-12-17T08:54:52.876Z",
"generated": "2025-12-17T10:54:35.492Z",
"version": "1.0.0",
"categories": [
{

View file

@ -363,6 +363,10 @@
to { transform: rotate(360deg); }
}
.conversation-embedding-panel__spinning {
animation: spin 1s linear infinite;
}
.conversation-embedding-panel__empty svg {
color: #999;
}

View file

@ -28,6 +28,8 @@ import {
Info,
Globe,
Target,
Settings,
Download,
} from 'lucide-react';
import { EmbeddingProjector, type EmbeddingPoint } from '../database/EmbeddingProjector';
import { isTargetInsideAny } from '../../utils/dom';
@ -94,6 +96,12 @@ export interface ConversationEmbeddingPanelProps {
onModeChange?: (mode: 'global' | 'context') => void;
/** Number of context points available (for badge display) */
contextPointsCount?: number;
/** Total points in the collection (for Load All display) */
totalPointsCount?: number;
/** Called when user clicks Load All */
onLoadAll?: () => void;
/** Whether Load All is currently in progress */
isLoadingAll?: boolean;
}
/**
@ -116,6 +124,9 @@ const ConversationEmbeddingPanelComponent: React.FC<ConversationEmbeddingPanelPr
mode = 'global',
onModeChange,
contextPointsCount = 0,
totalPointsCount,
onLoadAll,
isLoadingAll = false,
}) => {
const panelRef = useRef<HTMLDivElement>(null);
@ -456,9 +467,26 @@ const ConversationEmbeddingPanelComponent: React.FC<ConversationEmbeddingPanelPr
: (language === 'nl' ? 'Eenvoudige weergave' : 'Simple view')}
aria-label={simpleMode ? 'Switch to advanced' : 'Switch to simple'}
>
<span className="conversation-embedding-panel__mode-icon">
{simpleMode ? '◧' : '▣'}
</span>
<Settings size={16} />
</button>
)}
{/* Load All button */}
{onLoadAll && mode === 'global' && (
<button
className="conversation-embedding-panel__control-btn"
onClick={onLoadAll}
disabled={isLoadingAll}
title={language === 'nl'
? `Laad alle punten${totalPointsCount ? ` (${totalPointsCount.toLocaleString()})` : ''}`
: `Load all points${totalPointsCount ? ` (${totalPointsCount.toLocaleString()})` : ''}`}
aria-label="Load all points"
>
{isLoadingAll ? (
<Loader2 size={18} className="conversation-embedding-panel__spinning" />
) : (
<Download size={18} />
)}
</button>
)}

View file

@ -1276,11 +1276,13 @@ export function EmbeddingProjector({
positions[i * 3 + 1] = point.y;
positions[i * 3 + 2] = point.z ?? 0;
// Get color for point (initial color without selection)
const color = new THREE.Color(getPointColor(point.originalIndex));
colors[i * 3] = color.r;
colors[i * 3 + 1] = color.g;
colors[i * 3 + 2] = color.b;
// Use default color - actual colors will be set by the color update effect
// This avoids having getPointColor as a dependency which would recreate
// the scene (and reset camera) when selection/highlighting changes
const defaultColor = new THREE.Color(COLORS[0]);
colors[i * 3] = defaultColor.r;
colors[i * 3 + 1] = defaultColor.g;
colors[i * 3 + 2] = defaultColor.b;
// Initial size (will be updated by selection effect)
sizes[i] = 4;
@ -1458,7 +1460,11 @@ export function EmbeddingProjector({
highlightedHalosRef.current = null;
}
};
}, [viewMode, projectedPoints, getPointColor]);
// Note: getPointColor is intentionally NOT a dependency here.
// Colors are initialized with a default and updated by the separate color/size effect.
// Including getPointColor would recreate the scene (reset camera) on selection changes.
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [viewMode, projectedPoints]);
// Update point sizes and colors when selection changes (without recreating the scene)
useEffect(() => {

View file

@ -207,6 +207,45 @@ export function QdrantPanel({ compact = false }: QdrantPanelProps) {
}
}, [selectedCollection, nextOffset, scrollPoints]);
// Load ALL points from the collection (for visualization/search).
// Pages through the selected collection with scrollPoints until no next offset
// is returned, republishing the accumulated list after every batch.
// Failures are logged to the console and not rethrown; the loading flag is
// always cleared in the finally block.
const loadAllPoints = useCallback(async () => {
// Nothing to load until a collection has been selected
if (!selectedCollection) return;
setIsLoadingPoints(true);
try {
const allPoints: QdrantPoint[] = [];
// The first request is issued with a null offset
let currentOffset: string | number | null = null;
const batchSize = 500; // Load in larger batches for efficiency
// Scroll through all points
while (true) {
const result = await scrollPoints(
selectedCollection.name,
batchSize,
currentOffset
);
allPoints.push(...result.points);
// Update UI with progress (a copy is passed because allPoints keeps being mutated)
setPoints([...allPoints]);
if (!result.nextOffset) {
break; // No more points
}
currentOffset = result.nextOffset;
}
setPoints(allPoints);
setNextOffset(null); // All loaded
console.log(`[QdrantPanel] Loaded all ${allPoints.length} points from collection`);
} catch (err) {
// Best effort: points published before the failure stay in state
console.error('Failed to load all points:', err);
} finally {
setIsLoadingPoints(false);
}
}, [selectedCollection, scrollPoints]);
// Select a collection
const selectCollection = useCallback(async (collection: QdrantCollection) => {
setSelectedCollection(collection);
@ -641,13 +680,23 @@ export function QdrantPanel({ compact = false }: QdrantPanelProps) {
</span>
)}
{nextOffset !== null && points.length > 0 && (
<button
className="secondary-button"
onClick={() => loadPoints(true)}
disabled={isLoadingPoints}
>
Load more
</button>
<>
<button
className="secondary-button"
onClick={() => loadPoints(true)}
disabled={isLoadingPoints}
>
Load more
</button>
<button
className="secondary-button"
onClick={loadAllPoints}
disabled={isLoadingPoints}
title="Load all points for comprehensive search"
>
{isLoadingPoints ? 'Loading...' : 'Load All'}
</button>
</>
)}
</div>
)}

View file

@ -96,6 +96,7 @@
font-family: 'Roboto', Helvetica, Arial, sans-serif;
cursor: pointer;
transition: border-color 0.2s;
min-width: 220px;
}
.mapping-explorer__category-select:focus {

View file

@ -17,11 +17,13 @@ import {
getCategoryForSourceByType,
getStatisticsForDataSource,
exportToLinkMLMapYaml,
CATEGORY_GROUP_LABELS,
type EnrichmentSourceMapping,
type FieldMapping,
type TransformationType,
type DataSourceType,
type MappingStatus,
type CategoryGroup,
} from '../../lib/linkml/custodian-data-mappings';
import './MappingExplorer.css';
@ -760,11 +762,19 @@ export const MappingExplorer: React.FC<MappingExplorerProps> = ({ language = 'en
onChange={(e) => setSelectedCategory(e.target.value || null)}
>
<option value="">{t('allCategories')}</option>
{currentCategories.map(cat => (
<option key={cat.id} value={cat.id}>
{cat.icon} {language === 'nl' ? cat.nameNl : cat.name}
</option>
))}
{(Object.keys(CATEGORY_GROUP_LABELS) as CategoryGroup[]).map(group => {
const groupCategories = currentCategories.filter(c => c.group === group);
if (groupCategories.length === 0) return null;
return (
<optgroup key={group} label={CATEGORY_GROUP_LABELS[group][language]}>
{groupCategories.map(cat => (
<option key={cat.id} value={cat.id}>
{cat.icon} {language === 'nl' ? cat.nameNl : cat.name}
</option>
))}
</optgroup>
);
})}
</select>
<select

File diff suppressed because it is too large Load diff

View file

@ -883,6 +883,9 @@ const ConversationPage: React.FC = () => {
// Projector mode: 'global' shows all 500 points, 'context' shows only RAG results with vectors
const [projectorMode, setProjectorMode] = useState<'global' | 'context'>('global');
const [contextEmbeddingPoints, setContextEmbeddingPoints] = useState<EmbeddingPoint[]>([]);
// Load All state for embedding projector
const [isLoadingAllEmbeddings, setIsLoadingAllEmbeddings] = useState(false);
const [totalEmbeddingPoints, setTotalEmbeddingPoints] = useState<number | undefined>(undefined);
// Knowledge Graph Projector state
const [showGraphProjector, setShowGraphProjector] = useState(false);
@ -1062,6 +1065,42 @@ const ConversationPage: React.FC = () => {
setContextEmbeddingPoints(pointsWithVectors);
}, [lastContext?.qdrantResults]);
// Load ALL embeddings from Qdrant (for comprehensive search).
// Scrolls the 'heritage_custodians' collection in batches of 500, keeps only
// id / vector / payload of each point, and replaces the projector's points in
// one state update once every batch has been fetched. Failures are logged to
// the console and not rethrown; the loading flag is always cleared.
const loadAllEmbeddings = useCallback(async () => {
// Skip when Qdrant is not connected or a load is already running
if (!qdrantStatus.isConnected || isLoadingAllEmbeddings) return;
setIsLoadingAllEmbeddings(true);
try {
const collectionName = 'heritage_custodians';
const allPoints: EmbeddingPoint[] = [];
// The first request is issued with a null offset
let offset: string | number | null = null;
// Scroll through all points in batches
while (true) {
const { points, nextOffset } = await scrollPoints(collectionName, 500, offset);
const batch: EmbeddingPoint[] = points.map(p => ({
id: p.id,
vector: p.vector,
payload: p.payload,
}));
allPoints.push(...batch);
// Stop when no further offset is returned or the batch came back empty
if (!nextOffset || points.length === 0) break;
offset = nextOffset;
}
setEmbeddingPoints(allPoints);
// The total is the number of points actually loaded, not a separate count query
setTotalEmbeddingPoints(allPoints.length);
console.log(`[EmbeddingProjector] Loaded all ${allPoints.length} points`);
} catch (err) {
console.error('Failed to load all embeddings:', err);
} finally {
setIsLoadingAllEmbeddings(false);
}
}, [qdrantStatus.isConnected, isLoadingAllEmbeddings, scrollPoints]);
// Handle panel resize with mouse
useEffect(() => {
if (!isResizing) return;
@ -1988,6 +2027,9 @@ const ConversationPage: React.FC = () => {
mode={projectorMode}
onModeChange={setProjectorMode}
contextPointsCount={contextEmbeddingPoints.length}
totalPointsCount={totalEmbeddingPoints}
onLoadAll={loadAllEmbeddings}
isLoadingAll={isLoadingAllEmbeddings}
/>
)}

View file

@ -921,6 +921,148 @@
font-weight: 500;
}
.query-limit-refresh-button {
display: flex;
align-items: center;
gap: 0.375rem;
margin-top: 0.5rem;
padding: 0.5rem 0.75rem;
background: #4a7dff;
color: white;
border: none;
border-radius: 6px;
font-size: 0.75rem;
font-weight: 500;
cursor: pointer;
transition: background 0.2s ease;
}
.query-limit-refresh-button:hover {
background: #3366e6;
}
.query-limit-refresh-button:disabled {
opacity: 0.6;
cursor: not-allowed;
}
/* Query Mode Section (inside query-limit-section) */
.query-mode-subsection {
margin-top: 1rem;
padding-top: 1rem;
border-top: 1px solid #e0e0e0;
}
.query-mode-subsection h4 {
margin: 0 0 0.5rem 0;
font-size: 0.8125rem;
color: #172a59;
font-weight: 600;
}
.query-mode-desc {
font-size: 0.75rem;
color: #666;
margin: 0 0 0.75rem 0;
}
.query-mode-options {
display: flex;
flex-direction: column;
gap: 0.5rem;
}
.query-mode-option {
display: flex;
align-items: center;
gap: 0.5rem;
padding: 0.625rem 0.75rem;
background: white;
border: 1px solid #e0e0e0;
border-radius: 6px;
cursor: pointer;
transition: all 0.15s ease;
}
.query-mode-option:hover {
border-color: #4a7dff;
background: #f8f9ff;
}
.query-mode-option.active {
border-color: #4a7dff;
background: #ebefff;
}
.query-mode-option input[type="radio"] {
margin: 0;
accent-color: #4a7dff;
}
.query-mode-content {
display: flex;
flex-direction: column;
gap: 0.125rem;
flex: 1;
}
.query-mode-label {
font-size: 0.8125rem;
font-weight: 500;
color: #172a59;
}
.query-mode-hint {
font-size: 0.6875rem;
color: #888;
}
.query-mode-option.active .query-mode-label {
color: #4a7dff;
}
.query-mode-option.query-mode-custom {
background: linear-gradient(135deg, #f8f9ff 0%, #fff 100%);
border-style: dashed;
}
.query-mode-option.query-mode-custom:hover {
background: linear-gradient(135deg, #ebefff 0%, #f8f9ff 100%);
}
.query-mode-external-icon {
font-size: 0.875rem;
color: #4a7dff;
margin-left: auto;
}
.query-mode-refresh-button {
margin-top: 0.75rem;
width: 100%;
padding: 0.5rem 0.75rem;
background: #4a7dff;
color: white;
border: none;
border-radius: 6px;
cursor: pointer;
font-weight: 500;
font-size: 0.8125rem;
display: flex;
align-items: center;
justify-content: center;
gap: 0.5rem;
transition: background 0.15s ease;
}
.query-mode-refresh-button:hover {
background: #3366e6;
}
.query-mode-refresh-button:disabled {
opacity: 0.6;
cursor: not-allowed;
}
/* Node Info Section */
.node-info-section {
padding: 1rem 1.5rem;
@ -2154,6 +2296,75 @@ body:has(.visualize-page.is-mobile .sidebar--mobile:not(.collapsed)) {
color: #f39c12;
}
[data-theme="dark"] .query-limit-refresh-button {
background: #4a7dff;
color: white;
}
[data-theme="dark"] .query-limit-refresh-button:hover {
background: #6b9eff;
}
/* Query Mode Section - Dark Mode */
[data-theme="dark"] .query-mode-subsection {
border-top-color: #3d3d5c;
}
[data-theme="dark"] .query-mode-subsection h4 {
color: #e0e0e0;
}
[data-theme="dark"] .query-mode-desc {
color: #a0a0b0;
}
[data-theme="dark"] .query-mode-option {
background: #1e1e32;
border-color: #3d3d5c;
}
[data-theme="dark"] .query-mode-option:hover {
border-color: #4a7dff;
background: #2d2d4a;
}
[data-theme="dark"] .query-mode-option.active {
border-color: #4a7dff;
background: #2d2d4a;
}
[data-theme="dark"] .query-mode-label {
color: #e0e0e0;
}
[data-theme="dark"] .query-mode-hint {
color: #888;
}
[data-theme="dark"] .query-mode-option.active .query-mode-label {
color: #6b9eff;
}
[data-theme="dark"] .query-mode-option.query-mode-custom {
background: linear-gradient(135deg, #1e1e32 0%, #252545 100%);
}
[data-theme="dark"] .query-mode-option.query-mode-custom:hover {
background: linear-gradient(135deg, #2d2d4a 0%, #1e1e32 100%);
}
[data-theme="dark"] .query-mode-external-icon {
color: #6b9eff;
}
[data-theme="dark"] .query-mode-refresh-button {
background: #4a7dff;
}
[data-theme="dark"] .query-mode-refresh-button:hover {
background: #6b9eff;
}
/* Node Info Section */
[data-theme="dark"] .node-info-section {
background: #2d2d4a;

View file

@ -114,6 +114,16 @@ const TEXT = {
custodiansAvailable: { nl: 'beschikbaar', en: 'available' },
performanceWarning: { nl: '⚠️ Meer dan 1000 kan traag zijn', en: '⚠️ More than 1000 may be slow' },
// RDF Query Mode
queryMode: { nl: 'Query Modus', en: 'Query Mode' },
queryModeDesc: { nl: 'Selecteer hoe data wordt opgehaald', en: 'Select how data is fetched' },
queryModeDetailed: { nl: 'Gedetailleerd (standaard)', en: 'Detailed (default)' },
queryModeDetailedHint: { nl: 'Specifieke eigenschappen, sneller', en: 'Specific properties, faster' },
queryModeGeneric: { nl: 'Generiek (alle relaties)', en: 'Generic (all relations)' },
queryModeGenericHint: { nl: 'Alle triples, meer connectiviteit', en: 'All triples, more connectivity' },
queryModeCustom: { nl: 'Aangepaste query...', en: 'Custom query...' },
queryModeCustomHint: { nl: 'Open Query Builder', en: 'Open Query Builder' },
// Selected node
selectedNode: { nl: 'Geselecteerd knooppunt', en: 'Selected Node' },
id: { nl: 'ID', en: 'ID' },
@ -368,14 +378,28 @@ export function Visualize() {
// RDF query limit - default to 500 to prevent browser overload
// Oxigraph contains 27,000+ custodians; rendering all at once crashes the browser
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const [rdfLimit, setRdfLimit] = useState<number>(() => {
const saved = localStorage.getItem('visualize-rdf-limit');
return saved ? parseInt(saved, 10) : 500;
});
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const [totalCustodiansAvailable, setTotalCustodiansAvailable] = useState<number | null>(null);
// Track the limit that was used for the current cached data
const cachedRdfLimitRef = useRef<number | null>(null);
// Prevent duplicate RDF fetch requests (React StrictMode protection)
const rdfFetchInProgressRef = useRef<boolean>(false);
// RDF Query Mode - allows switching between detailed (limited properties) and generic (all triples)
type RdfQueryMode = 'detailed' | 'generic' | 'custom';
const [rdfQueryMode, setRdfQueryMode] = useState<RdfQueryMode>(() => {
const saved = localStorage.getItem('visualize-rdf-query-mode');
return (saved === 'detailed' || saved === 'generic' || saved === 'custom') ? saved : 'detailed';
});
// Track the mode used for cached data (to show Apply button when changed)
const cachedRdfQueryModeRef = useRef<RdfQueryMode | null>(null);
// Hooks
const { isInitialized, isLoading: dbLoading, storageInfo } = useDatabase();
const { parse, isLoading: parserLoading, error: parserError } = useRdfParser();
@ -489,6 +513,13 @@ export function Visualize() {
// Generate RDF overview - Fetch and visualize all heritage custodian RDF data
const handleGenerateRdf = useCallback(async () => {
// Prevent duplicate concurrent fetches (React StrictMode protection)
if (rdfFetchInProgressRef.current) {
console.log('[RDF] Fetch already in progress, skipping duplicate request');
return;
}
rdfFetchInProgressRef.current = true;
setGeneratingRdf(true);
// Don't clear UML visualization - we want to keep both cached
setCurrentCategory('rdf');
@ -533,84 +564,116 @@ export function Visualize() {
console.warn('Could not fetch custodian count:', countErr);
}
// SPARQL query aligned with actual RDF generated by oxigraph_sync.py and oxigraph_person_sync.py
// Namespaces match the Python sync scripts:
// - nde: <https://nde.nl/ontology/hc/class/> for Custodian type
// - hc: <https://w3id.org/heritage/custodian/> for ghcid, isil predicates
// - hp: <https://w3id.org/heritage/person/> for person URIs
const constructQuery = `
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX schema: <http://schema.org/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX org: <http://www.w3.org/ns/org#>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX cidoc: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX nde: <https://nde.nl/ontology/hc/class/>
PREFIX hc: <https://w3id.org/heritage/custodian/>
PREFIX hp: <https://w3id.org/heritage/person/>
// SPARQL query - conditional based on rdfQueryMode
// - detailed: Specific properties only (faster, less connectivity)
// - generic: All triples for custodians (slower, full connectivity)
let constructQuery: string;
if (rdfQueryMode === 'generic') {
// Generic query: Returns ALL triples for custodians
// This provides maximum connectivity in the graph but may be slower
console.log('Using GENERIC query mode - fetching all triples');
constructQuery = `
PREFIX nde: <https://nde.nl/ontology/hc/class/>
CONSTRUCT { ?s ?p ?o }
WHERE {
{
# All triples where custodian is subject
?s a nde:Custodian .
?s ?p ?o .
}
UNION
{
# All triples where custodian is object (incoming links)
?o a nde:Custodian .
?s ?p ?o .
}
}
LIMIT ${rdfLimit * 10}
`;
// Note: Higher limit because generic query returns more triples per custodian
} else {
// Detailed query: Specific properties only (default)
// Namespaces match the Python sync scripts:
// - nde: <https://nde.nl/ontology/hc/class/> for Custodian type
// - hc: <https://w3id.org/heritage/custodian/> for ghcid, isil predicates
// - hp: <https://w3id.org/heritage/person/> for person URIs
console.log('Using DETAILED query mode - specific properties');
constructQuery = `
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX schema: <http://schema.org/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX org: <http://www.w3.org/ns/org#>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX cidoc: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX nde: <https://nde.nl/ontology/hc/class/>
PREFIX hc: <https://w3id.org/heritage/custodian/>
PREFIX hp: <https://w3id.org/heritage/person/>
CONSTRUCT {
# Custodians - core data
?custodian a nde:Custodian ;
rdfs:label ?label ;
skos:prefLabel ?prefLabel ;
schema:name ?name ;
hc:ghcid ?ghcid ;
hc:isil ?isil ;
schema:url ?website ;
foaf:homepage ?homepage ;
owl:sameAs ?wikidata .
# Location (schema:location, not crm:P53)
?custodian schema:location ?location .
?location geo:lat ?lat ;
geo:long ?lon .
# Persons linked to custodians
?person a schema:Person ;
rdfs:label ?personLabel ;
schema:name ?personName ;
schema:worksFor ?custodian ;
org:memberOf ?custodian ;
schema:jobTitle ?jobTitle .
}
WHERE {
# Get all custodians (nde:Custodian is the primary type)
?custodian a nde:Custodian .
OPTIONAL { ?custodian rdfs:label ?label }
OPTIONAL { ?custodian skos:prefLabel ?prefLabel }
OPTIONAL { ?custodian schema:name ?name }
OPTIONAL { ?custodian hc:ghcid ?ghcid }
OPTIONAL { ?custodian hc:isil ?isil }
OPTIONAL { ?custodian schema:url ?website }
OPTIONAL { ?custodian foaf:homepage ?homepage }
OPTIONAL {
?custodian owl:sameAs ?wikidata .
FILTER(STRSTARTS(STR(?wikidata), "http://www.wikidata.org/"))
}
# Location using schema:location (as generated by oxigraph_sync.py)
OPTIONAL {
CONSTRUCT {
# Custodians - core data
?custodian a nde:Custodian ;
rdfs:label ?label ;
skos:prefLabel ?prefLabel ;
schema:name ?name ;
hc:ghcid ?ghcid ;
hc:isil ?isil ;
schema:url ?website ;
foaf:homepage ?homepage ;
owl:sameAs ?wikidata .
# Location (schema:location, not crm:P53)
?custodian schema:location ?location .
OPTIONAL { ?location geo:lat ?lat }
OPTIONAL { ?location geo:long ?lon }
}
# Persons linked to custodians via schema:worksFor
OPTIONAL {
?location geo:lat ?lat ;
geo:long ?lon .
# Persons linked to custodians
?person a schema:Person ;
schema:worksFor ?custodian .
OPTIONAL { ?person rdfs:label ?personLabel }
OPTIONAL { ?person schema:name ?personName }
OPTIONAL { ?person schema:jobTitle ?jobTitle }
OPTIONAL { ?person org:memberOf ?custodian }
rdfs:label ?personLabel ;
schema:name ?personName ;
schema:worksFor ?custodian ;
org:memberOf ?custodian ;
schema:jobTitle ?jobTitle .
}
}
LIMIT ${rdfLimit}
`;
WHERE {
# Get all custodians (nde:Custodian is the primary type)
?custodian a nde:Custodian .
OPTIONAL { ?custodian rdfs:label ?label }
OPTIONAL { ?custodian skos:prefLabel ?prefLabel }
OPTIONAL { ?custodian schema:name ?name }
OPTIONAL { ?custodian hc:ghcid ?ghcid }
OPTIONAL { ?custodian hc:isil ?isil }
OPTIONAL { ?custodian schema:url ?website }
OPTIONAL { ?custodian foaf:homepage ?homepage }
OPTIONAL {
?custodian owl:sameAs ?wikidata .
FILTER(STRSTARTS(STR(?wikidata), "http://www.wikidata.org/"))
}
# Location using schema:location (as generated by oxigraph_sync.py)
OPTIONAL {
?custodian schema:location ?location .
OPTIONAL { ?location geo:lat ?lat }
OPTIONAL { ?location geo:long ?lon }
}
# Persons linked to custodians via schema:worksFor
OPTIONAL {
?person a schema:Person ;
schema:worksFor ?custodian .
OPTIONAL { ?person rdfs:label ?personLabel }
OPTIONAL { ?person schema:name ?personName }
OPTIONAL { ?person schema:jobTitle ?jobTitle }
OPTIONAL { ?person org:memberOf ?custodian }
}
}
LIMIT ${rdfLimit}
`;
}
let rdfData = '';
let dataFormat: 'application/n-triples' | 'text/turtle' = 'application/n-triples';
@ -678,12 +741,15 @@ export function Visualize() {
console.log(`RDF loaded: showing all ${result.nodes.length} nodes across ${typesArray.length} types: ${typesArray.join(', ')}`);
}
// Update cache state
// Update cache state and track the limit and mode used
setHasRdfCache(true);
setRdfNodeCount(result.nodes.length);
cachedRdfLimitRef.current = rdfLimit; // Remember which limit was used for this cache
cachedRdfQueryModeRef.current = rdfQueryMode; // Remember which mode was used
// Update filename with count
setFileName(`NDE Heritage Custodians (${result.nodes.length} entities)`);
// Update filename with count and mode indicator
const modeLabel = rdfQueryMode === 'generic' ? 'all triples' : 'detailed';
setFileName(`NDE Heritage Custodians (${result.nodes.length} entities, ${modeLabel})`);
} catch (err) {
console.error('Error generating RDF overview:', err);
@ -694,8 +760,9 @@ export function Visualize() {
);
} finally {
setGeneratingRdf(false);
rdfFetchInProgressRef.current = false; // Allow new fetches
}
}, [parse, loadGraphData]);
}, [rdfLimit, rdfQueryMode, parse, loadGraphData]);
// Close dropdowns when clicking outside
useEffect(() => {
@ -977,8 +1044,11 @@ export function Visualize() {
}, [hasUmlCache, handleGenerateUml]);
const handleSwitchToRdf = useCallback(() => {
if (!hasRdfCache) {
// No cache, generate it
// Check if we have cache AND it was fetched with the current limit setting
const cacheValid = hasRdfCache && cachedRdfLimitRef.current === rdfLimit;
if (!cacheValid) {
// No cache, or limit has changed - fetch fresh data
handleGenerateRdf();
} else {
// Switch to cached RDF view
@ -986,7 +1056,7 @@ export function Visualize() {
setLayoutType('force');
localStorage.setItem('visualize-layout-type', 'force');
}
}, [hasRdfCache, handleGenerateRdf]);
}, [hasRdfCache, rdfLimit, handleGenerateRdf]);
// Check if we have content to display
const hasRdfContent = filteredNodes.length > 0;
@ -1181,6 +1251,8 @@ export function Visualize() {
const newLimit = parseInt(e.target.value, 10);
setRdfLimit(newLimit);
localStorage.setItem('visualize-rdf-limit', String(newLimit));
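// Nothing is invalidated here: the cached data becomes stale implicitly because
// cachedRdfLimitRef.current no longer equals rdfLimit. The user applies the new
// limit with the "Apply" button below, or by switching away and back to RDF.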
}}
>
<option value="100">100</option>
@ -1192,6 +1264,17 @@ export function Visualize() {
<option value="10000">10,000</option>
<option value="50000">All (50,000+)</option>
</select>
{/* Show refresh prompt if limit changed from cached value */}
{hasRdfCache && cachedRdfLimitRef.current !== null && cachedRdfLimitRef.current !== rdfLimit && (
<button
className="query-limit-refresh-button"
onClick={handleGenerateRdf}
disabled={_generatingRdf}
>
<RefreshCw size={14} />
{language === 'nl' ? 'Toepassen' : 'Apply'}
</button>
)}
{totalCustodiansAvailable && (
<p className="query-limit-info">
{rdfNodeCount > 0 ? rdfNodeCount : rdfLimit} {t('showingOf')} {totalCustodiansAvailable.toLocaleString()} {t('custodiansAvailable')}
@ -1200,6 +1283,70 @@ export function Visualize() {
{rdfLimit > 1000 && (
<p className="query-limit-warning">{t('performanceWarning')}</p>
)}
{/* Query Mode Selector */}
<div className="query-mode-subsection">
<h4>{t('queryMode')}</h4>
<p className="query-mode-desc">{t('queryModeDesc')}</p>
<div className="query-mode-options">
<label className={`query-mode-option ${rdfQueryMode === 'detailed' ? 'active' : ''}`}>
<input
type="radio"
name="queryMode"
value="detailed"
checked={rdfQueryMode === 'detailed'}
onChange={() => {
setRdfQueryMode('detailed');
localStorage.setItem('visualize-rdf-query-mode', 'detailed');
}}
/>
<div className="query-mode-content">
<span className="query-mode-label">{t('queryModeDetailed')}</span>
<span className="query-mode-hint">{t('queryModeDetailedHint')}</span>
</div>
</label>
<label className={`query-mode-option ${rdfQueryMode === 'generic' ? 'active' : ''}`}>
<input
type="radio"
name="queryMode"
value="generic"
checked={rdfQueryMode === 'generic'}
onChange={() => {
setRdfQueryMode('generic');
localStorage.setItem('visualize-rdf-query-mode', 'generic');
}}
/>
<div className="query-mode-content">
<span className="query-mode-label">{t('queryModeGeneric')}</span>
<span className="query-mode-hint">{t('queryModeGenericHint')}</span>
</div>
</label>
<label
className="query-mode-option query-mode-custom"
onClick={(e) => {
e.preventDefault();
window.open('/query-builder', '_blank');
}}
>
<div className="query-mode-content">
<span className="query-mode-label">{t('queryModeCustom')}</span>
<span className="query-mode-hint">{t('queryModeCustomHint')}</span>
</div>
<span className="query-mode-external-icon"></span>
</label>
</div>
{/* Show Apply button if mode changed from cached value */}
{hasRdfCache && cachedRdfQueryModeRef.current !== null && cachedRdfQueryModeRef.current !== rdfQueryMode && (
<button
className="query-mode-refresh-button"
onClick={handleGenerateRdf}
disabled={_generatingRdf}
>
<RefreshCw size={14} />
{language === 'nl' ? 'Toepassen' : 'Apply'}
</button>
)}
</div>
</div>
)}

View file

@ -0,0 +1,291 @@
#!/usr/bin/env python3
"""
Apply verified location enrichments to XXX files and rename them.
This script:
1. Updates ghcid_current with the correct region/city codes
2. Updates location with city/region
3. Adds ghcid_history entry
4. Updates provenance notes
5. Renames file to match new GHCID
"""
import yaml
import os
import re
import uuid
import hashlib
from datetime import datetime, timezone
from pathlib import Path
# Verified enrichments from Exa web search
# History:
# - Batch 1 (2025-12-17): 4 files - Crypto Museum, Allard Pierson, DPM Rotterdam, Cow Museum
# - Batch 2 (2025-12-17): 8 files - Bierreclame, Cacao, Edah, Flessenscheepjes, Eddie the Eagle, Jopie Huisman, Fortuna, Atlantikwall
# - Batch 3 (2025-12-17): 7 files - Ajax, C1000, Moslim Archief, CODART, Blik Trommel, Klompenmakerij, CultuurSchakel
# - Batch 4 (2025-12-17): 7 files - Autoriteit Persoonsgegevens, Raad voor Cultuur, IJV, Erotisch Museum, Hollands Kaas Museum, Kresse Museum, Van Gogh Museum Enterprises
# - Batch 5 (2025-12-17): 4 files - Huis73, Dutch Directors Guild, Het Kaas Museum (Bodegraven), Stichting Abrahamdag
# - Batch 6 (2025-12-17): 2 files - Museum 1939-1945, Brandkas van Henny
# - Batch 7 (2025-12-17): 5 files - Frans Maas Museum, Museum Buitenlust, Museum De Canonije, Museum Dijkmagazijn De Heul, Museumboerderij Erve Hofman
# - Batch 8 (2025-12-17): 7 files - Museum Janning, Museum Geelvinck Hinlopen Huis, Museumboerderij De Grote Glind, Museum Galerie RAT, Museum Averlo-Frieswijk-Schalkhaar, Museum Ceuclum, Museum van Brabantse Mutsen en Poffers
# Total enriched: 44 files
# Remaining: ~133 NL-XX-XXX files
#
# All previously processed entries have been removed from VERIFIED_ENRICHMENTS.
# Only add new entries that have not been processed yet.
# Pending enrichment records: one dict per custodian file that still carries
# the NL-XX-XXX placeholder GHCID. Keys read by apply_enrichment:
#   old_filename  - file name looked up inside the custodian directory
#   region_code   - replaces the "XX" segment of the GHCID
#   city_code     - replaces the "XXX" segment of the GHCID
#   city, region  - written to the location block and the history reason
#   address       - optional; copied into the location block when present
# NOTE(review): institution_name, source and source_url are presumably used
# for logging / provenance notes - confirm against the rest of the script.
VERIFIED_ENRICHMENTS = [
# Batch 8 - 2025-12-17
{
'old_filename': 'NL-XX-XXX-M-MJ-museum_janning.yaml',
'institution_name': 'Museum Janning',
'city': 'Nieuw Schoonebeek',
'region': 'Drenthe',
'region_code': 'DR',
'city_code': 'NIS',
'address': 'Europaweg 143a, 7766 AE Nieuw Schoonebeek',
'source': 'exa_web_search',
'source_url': 'https://www.museumjanning.nl/',
},
{
'old_filename': 'NL-XX-XXX-M-MGHH.yaml',
'institution_name': 'Museum Geelvinck Hinlopen Huis',
'city': 'Heerde',
'region': 'Gelderland',
'region_code': 'GE',
'city_code': 'HEE',
'address': 'Kamperweg 23, 8181 CS Heerde',
'source': 'exa_web_search',
'source_url': 'https://geelvinck.nl/',
},
{
'old_filename': 'NL-XX-XXX-M-MGG.yaml',
'institution_name': 'Museumboerderij De Grote Glind',
'city': 'Barneveld',
'region': 'Gelderland',
'region_code': 'GE',
'city_code': 'BAR',
'address': 'Scherpenzeelseweg 158, 3772 MG Barneveld',
'source': 'exa_web_search',
'source_url': 'https://www.degroteglind.nl/',
},
{
'old_filename': 'NL-XX-XXX-M-MGR.yaml',
'institution_name': 'Museum Galerie RAT',
'city': 'Den Burg',
'region': 'Noord-Holland',
'region_code': 'NH',
'city_code': 'DEB',
'address': 'Burgwal 20, 1791 Den Burg, Texel',
'source': 'exa_web_search',
'source_url': 'https://www.mapquest.com/',
},
{
'old_filename': 'NL-XX-XXX-M-MAFS.yaml',
'institution_name': 'Museum Averlo-Frieswijk-Schalkhaar',
'city': 'Schalkhaar',
'region': 'Overijssel',
'region_code': 'OV',
'city_code': 'SCK',
'address': 'Frieswijkerweg 7, 7433 RB Schalkhaar',
'source': 'exa_web_search',
'source_url': 'https://www.museum-afs.nl/',
},
{
'old_filename': 'NL-XX-XXX-M-MC.yaml',
'institution_name': 'Museum Ceuclum',
'city': 'Cuijk',
'region': 'Noord-Brabant',
'region_code': 'NB',
'city_code': 'CUI',
'address': 'Castellum 1, 5431 EM Cuijk',
'source': 'exa_web_search',
'source_url': 'https://www.museumceuclum.nl/',
},
{
'old_filename': 'NL-XX-XXX-M-MBMP.yaml',
'institution_name': 'Museum van Brabantse Mutsen en Poffers',
'city': 'Sint-Oedenrode',
'region': 'Noord-Brabant',
'region_code': 'NB',
'city_code': 'SOR',
'address': 'Kerkstraat 20, 5492 AH Sint-Oedenrode',
'source': 'exa_web_search',
'source_url': 'https://mutsenmuseum.nl/',
},
]
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return the name-based (version 5) UUID of a GHCID string.

    The UUID is derived inside the standard DNS namespace
    (``6ba7b810-9dad-11d1-80b4-00c04fd430c8``), so a given GHCID always
    yields the same value.

    Args:
        ghcid_string: The GHCID to derive the UUID from.

    Returns:
        The canonical hyphenated string form of the UUID.
    """
    # uuid.NAMESPACE_DNS is exactly the namespace UUID spelled out above.
    derived = uuid.uuid5(uuid.NAMESPACE_DNS, ghcid_string)
    return str(derived)
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return an unsigned 64-bit integer identifier for a GHCID string.

    The value is the first 8 bytes of the SHA-256 digest of the encoded
    GHCID, read as a big-endian integer.

    Args:
        ghcid_string: The GHCID to hash.

    Returns:
        An integer in the range ``0 .. 2**64 - 1``.
    """
    hex_digest = hashlib.sha256(ghcid_string.encode()).hexdigest()
    # 16 hex characters == the leading 8 bytes, most significant first.
    return int(hex_digest[:16], 16)
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Return a version-8 style UUID built from the SHA-256 of a GHCID string.

    The first 16 bytes of the digest are used as the UUID payload; the
    version nibble is forced to 8 and the two variant bits to ``10``.

    Args:
        ghcid_string: The GHCID to hash.

    Returns:
        The canonical hyphenated string form of the UUID.
    """
    digest = hashlib.sha256(ghcid_string.encode()).digest()
    version_byte = (digest[6] & 0x0F) | 0x80  # high nibble -> version 8
    variant_byte = (digest[8] & 0x3F) | 0x80  # top two bits -> variant 10
    patched = (
        digest[:6]
        + bytes([version_byte])
        + digest[7:8]
        + bytes([variant_byte])
        + digest[9:16]
    )
    return str(uuid.UUID(bytes=patched))
def apply_enrichment(custodian_dir: Path, enrichment: dict) -> tuple[str | None, str | None]:
"""Apply enrichment to a file and return (old_path, new_path)."""
old_path = custodian_dir / enrichment['old_filename']
if not old_path.exists():
print(f" ❌ File not found: {old_path}")
return None, None
# Load YAML
with open(old_path, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
# Extract current GHCID components
old_ghcid = data['ghcid']['ghcid_current']
# Parse old GHCID to get type and abbreviation
# Format: NL-XX-XXX-{type}-{abbrev}[-{name_suffix}]
match = re.match(r'NL-XX-XXX-([A-Z])-([A-Z0-9]+)(?:-(.+))?', old_ghcid)
if not match:
print(f" ❌ Could not parse GHCID: {old_ghcid}")
return None, None
inst_type = match.group(1)
abbrev = match.group(2)
name_suffix = match.group(3) # May be None
# Build new GHCID
new_ghcid = f"NL-{enrichment['region_code']}-{enrichment['city_code']}-{inst_type}-{abbrev}"
if name_suffix:
new_ghcid += f"-{name_suffix}"
# Generate new identifiers
new_uuid = generate_ghcid_uuid(new_ghcid)
new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
new_numeric = generate_ghcid_numeric(new_ghcid)
timestamp = datetime.now(timezone.utc).isoformat()
# Update location
data['location'] = {
'city': enrichment['city'],
'region': enrichment['region'],
'country': 'NL',
}
if enrichment.get('address'):
data['location']['address'] = enrichment['address']
# Update ghcid
old_numeric = data['ghcid'].get('ghcid_numeric', 0)
# Add to ghcid_history - mark old as ended
if 'ghcid_history' not in data['ghcid']:
data['ghcid_history'] = []
# Close out the old entry
for entry in data['ghcid']['ghcid_history']:
if entry.get('valid_to') is None:
entry['valid_to'] = timestamp
# Add new history entry
data['ghcid']['ghcid_history'].append({
'ghcid': new_ghcid,
'ghcid_numeric': new_numeric,
'valid_from': timestamp,
'valid_to': None,
'reason': f"Location enriched via Exa web search - {enrichment['city']}, {enrichment['region']}"
})
# Update current GHCID
data['ghcid']['ghcid_current'] = new_ghcid
data['ghcid']['ghcid_uuid'] = new_uuid
data['ghcid']['ghcid_uuid_sha256'] = new_uuid_sha256
data['ghcid']['ghcid_numeric'] = new_numeric
# Add location_resolution
data['ghcid']['location_resolution'] = {
'method': 'EXA_WEB_SEARCH',
'city_code': enrichment['city_code'],
'city_name': enrichment['city'],
'region_code': enrichment['region_code'],
'region_name': enrichment['region'],
'country_code': 'NL',
'resolution_date': timestamp,
'source_url': enrichment.get('source_url'),
}
# Update provenance notes
if 'provenance' not in data:
data['provenance'] = {}
if 'notes' not in data['provenance']:
data['provenance']['notes'] = []
data['provenance']['notes'].append(
f"Location enriched on {timestamp[:10]} via Exa web search: {enrichment['city']}, {enrichment['region']}"
)
# Add web search source to provenance
if 'sources' not in data['provenance']:
data['provenance']['sources'] = {}
if 'web_search' not in data['provenance']['sources']:
data['provenance']['sources']['web_search'] = []
data['provenance']['sources']['web_search'].append({
'source_type': 'exa_web_search',
'data_tier': 'TIER_3_CROWD_SOURCED',
'source_url': enrichment.get('source_url'),
'extraction_timestamp': timestamp,
'claims_extracted': ['city', 'region', 'address'],
})
# Write updated YAML to new filename
new_filename = new_ghcid.replace('/', '_') + '.yaml'
new_path = custodian_dir / new_filename
with open(new_path, 'w', encoding='utf-8') as f:
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
# Remove old file
if old_path != new_path:
old_path.unlink()
return str(old_path), str(new_path)
def main():
    """Apply every entry in VERIFIED_ENRICHMENTS and print a summary."""
    custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian')
    rule = "=" * 60

    print(rule)
    print("Applying Verified Location Enrichments")
    print(rule)

    if not VERIFIED_ENRICHMENTS:
        print("\nNo enrichments to process. Add entries to VERIFIED_ENRICHMENTS list.")
        return

    success_count = 0
    for enrichment in VERIFIED_ENRICHMENTS:
        print(f"\nProcessing: {enrichment['old_filename']}")
        print(f"{enrichment['city']}, {enrichment['region']} ({enrichment['region_code']}-{enrichment['city_code']})")

        old_path, new_path = apply_enrichment(custodian_dir, enrichment)
        if not (old_path and new_path):
            # apply_enrichment already reported why this entry was skipped.
            continue

        old_name = os.path.basename(old_path)
        new_name = os.path.basename(new_path)
        print(f" ✅ Renamed: {old_name}")
        print(f"{new_name}")
        success_count += 1

    print("\n" + rule)
    print(f"Summary: {success_count}/{len(VERIFIED_ENRICHMENTS)} files enriched and renamed")
    print(rule)


if __name__ == '__main__':
    main()

View file

@ -0,0 +1,297 @@
#!/usr/bin/env python3
"""
Enrich NL-XX-XXX files with correct location data via web search.
The LinkedIn HTML extraction method was flawed - it extracted location data from
wrong companies in the HTML. This script uses web search to find correct locations.
Strategy:
1. Read custodian name and website from YAML file
2. Search web for "[name] Netherlands location address city"
3. Parse results to extract city/region
4. Update YAML file with correct location
5. Regenerate GHCID based on new location
"""
import os
import re
import yaml
import json
import subprocess
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Tuple
# Directory containing custodian files
# (absolute path on the author's machine; scanned for NL-XX-XXX-*.yaml files)
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# GeoNames database for settlement lookup
# NOTE(review): not referenced by any function in this script - confirm it is still needed.
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")
# Dutch province mapping
# Keys are lowercase name variants, values are 2-letter province codes.
# get_region_code() tests each key as a substring of the lowercased region
# text, in the order listed here, and returns the first hit.
PROVINCE_MAP = {
'drenthe': 'DR',
'friesland': 'FR', 'fryslân': 'FR',
'gelderland': 'GE',
'groningen': 'GR',
'limburg': 'LI',
'noord-brabant': 'NB', 'north brabant': 'NB', 'nordbrabant': 'NB', 'brabant': 'NB',
'noord-holland': 'NH', 'north holland': 'NH',
'overijssel': 'OV',
'utrecht': 'UT',
'zeeland': 'ZE',
'zuid-holland': 'ZH', 'south holland': 'ZH',
'flevoland': 'FL',
}
# Dutch city to 3-letter code mapping (common cities)
# Keys are lowercase city names (including alternative spellings).
# get_city_code() consults this table first and only falls back to the
# first three letters of the name when the city is not listed.
CITY_CODES = {
'amsterdam': 'AMS',
'rotterdam': 'ROT',
'den haag': 'DHA', 'the hague': 'DHA', "'s-gravenhage": 'DHA',
'utrecht': 'UTR',
'eindhoven': 'EIN',
'groningen': 'GRO',
'tilburg': 'TIL',
'almere': 'ALM',
'breda': 'BRE',
'nijmegen': 'NIJ',
'apeldoorn': 'APE',
'haarlem': 'HAA',
'arnhem': 'ARN',
'enschede': 'ENS',
'amersfoort': 'AME',
'zaanstad': 'ZAA',
'haarlemmermeer': 'HMM',
'zwolle': 'ZWO',
'leiden': 'LEI',
'maastricht': 'MAA',
'dordrecht': 'DOR',
'zoetermeer': 'ZOE',
'deventer': 'DEV',
'delft': 'DEL',
'alkmaar': 'ALK',
'venlo': 'VEN',
'leeuwarden': 'LEE',
'heerlen': 'HEE',
'hilversum': 'HIL',
'assen': 'ASS',
'schiedam': 'SCH',
'weert': 'WEE',
'duivendrecht': 'DUI',
'noordwijk': 'NOO',
}
def get_city_code(city: str) -> str:
    """Return the 3-letter code for a city name.

    Listed cities use their CITY_CODES entry. Any other name is reduced to
    its ASCII letters a-z and the first three are upper-cased; names with
    fewer than three such letters are right-padded with 'X'.
    """
    normalized = city.lower().strip()
    known_code = CITY_CODES.get(normalized)
    if known_code is not None:
        return known_code

    # Fallback: derive the code from the leading letters of the name.
    letters = re.sub(r'[^a-z]', '', normalized)
    if len(letters) >= 3:
        return letters[:3].upper()
    return letters.upper().ljust(3, 'X')
def get_region_code(region: str) -> Optional[str]:
    """Return the 2-letter province code for a region name, or None.

    The first PROVINCE_MAP key (in definition order) that occurs as a
    substring of the lowercased, stripped region text decides the code.
    """
    haystack = region.lower().strip()
    return next(
        (code for key, code in PROVINCE_MAP.items() if key in haystack),
        None,
    )
def extract_location_from_search_results(results: list) -> Optional[dict]:
    """Extract a city (and province code, when present) from search results.

    Each result's ``text`` and ``title`` are joined and scanned with a list
    of location patterns, all matched case-insensitively. The first pattern
    that matches in the first result that has any match wins.

    Args:
        results: Search-result mappings; ``text`` and ``title`` are optional
            and may be None.

    Returns:
        A dict with ``city``, ``region_code`` (None when the pattern carries
        no region or the region is not recognised) and ``source_text`` (the
        first 200 characters of the scanned text), or None when nothing
        matches.
    """
    # Each entry: (regex, group number of the city, group number of the
    # region or None). Naming the roles explicitly avoids guessing from the
    # captured text whether group 1 is a postal code.
    location_patterns = [
        # "City, Netherlands (Province)"
        (r'(\w+(?:\s+\w+)?)\s*,\s*Netherlands\s*\((\w+(?:\s+\w+)?)\)', 1, 2),
        # "in City, Province"
        (r'in\s+(\w+(?:\s+\w+)?)\s*,\s*(Noord-Holland|Zuid-Holland|Noord-Brabant|Gelderland|Limburg|Overijssel|Friesland|Drenthe|Groningen|Utrecht|Zeeland|Flevoland)', 1, 2),
        # "legal seat in City"
        (r'legal\s+seat\s+in\s+(\w+)', 1, None),
        # "Address: ..., 1234 AB City" (group 1 is the postal code)
        (r'Address[:\s]+[^,]+,\s*(\d{4}\s*[A-Z]{2})\s+(\w+)', 2, None),
        # "1234 AB City, Netherlands" (group 1 is the postal code)
        (r'(\d{4}\s*[A-Z]{2})\s+(\w+(?:\s+\w+)?)\s*,?\s*(?:Netherlands|NL)', 2, None),
    ]

    for result in results:
        # Treat a missing or null field as empty instead of failing on None.
        text = (result.get('text') or '') + ' ' + (result.get('title') or '')
        for pattern, city_group, region_group in location_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue
            city = match.group(city_group).strip()
            region = match.group(region_group) if region_group else None
            return {
                'city': city,
                'region_code': get_region_code(region) if region else None,
                'source_text': text[:200]
            }
    return None
def search_institution_location(name: str, website: Optional[str] = None) -> Optional[dict]:
    """Placeholder for an Exa web search of an institution's location.

    A search query is assembled from the name (and from the website's
    domain when the website is set and is not an ``lnkd.in`` short link),
    but no search is performed: the function always returns None.
    """
    has_usable_site = bool(website) and 'lnkd.in' not in website
    if has_usable_site:
        # Add website domain to query for better results
        domain = re.sub(r'https?://(www\.)?', '', website).split('/')[0]
        query = f'site:{domain} OR "{name}" Netherlands address city location'
    else:
        query = f'"{name}" Netherlands location address city'

    # Use Exa via subprocess (since we can't import the MCP client directly)
    # For now, return None - we'll use the MCP tool directly in the main flow
    return None
def find_xxx_files_needing_enrichment():
    """Collect the NL-XX-XXX custodian files that need location enrichment.

    Every ``NL-XX-XXX-*.yaml`` file in CUSTODIAN_DIR is parsed, in sorted
    order. Files that cannot be read or parsed are reported and skipped;
    empty files are skipped silently.

    Returns:
        A list of dicts with ``file`` (the Path), ``name`` (emic custodian
        name, '' when absent), ``website`` (None when absent), ``slug``
        (LinkedIn slug, '' when absent) and ``content`` (the parsed YAML).
    """
    files = []
    for f in sorted(CUSTODIAN_DIR.glob("NL-XX-XXX-*.yaml")):
        try:
            with open(f, 'r', encoding='utf-8') as file:
                content = yaml.safe_load(file)
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
            print(f"Error reading {f}: {e}")
            continue

        if not content:
            continue
        if not isinstance(content, dict):
            print(f"Error reading {f}: expected a YAML mapping, got {type(content).__name__}")
            continue

        # A section may be present but null (or not a mapping) in the YAML;
        # treat that the same as a missing section instead of failing.
        name_info = content.get('custodian_name')
        if not isinstance(name_info, dict):
            name_info = {}
        linkedin = content.get('linkedin_enrichment')
        if not isinstance(linkedin, dict):
            linkedin = {}

        files.append({
            'file': f,
            'name': name_info.get('emic_name', ''),
            'website': linkedin.get('website'),
            'slug': linkedin.get('linkedin_slug', ''),
            'content': content
        })
    return files
def update_file_with_location(file_info: dict, city: str, region_code: str, source: str):
    """Write a resolved location into a custodian YAML file, in place.

    The location block is replaced, and - when the file name follows the
    ``NL-XX-XXX-{TYPE}-{ABBREV}[-{name_suffix}]`` pattern - a new GHCID is
    derived from the region and city codes and recorded together with a
    history entry and a ``location_resolution`` block. A provenance note is
    always appended. The file is written back to its existing path.

    Args:
        file_info: Entry as produced by find_xxx_files_needing_enrichment();
            ``file`` and ``content`` are used.
        city: Resolved city name.
        region_code: Province code; also stored as ``location.region``.
        source: Description of where the location came from.

    Returns:
        The new GHCID, or None when the file name does not match the
        NL-XX-XXX pattern.
    """
    f = file_info['file']
    content = file_info['content']

    # One timestamp for the whole update so every recorded date agrees.
    now = datetime.now(timezone.utc)
    timestamp = now.isoformat()

    # Get city code
    city_code = get_city_code(city)

    # Update location
    content['location'] = {
        'city': city,
        'region': region_code,
        'country': 'NL'
    }

    # Generate new GHCID from the type and abbreviation in the filename
    # Pattern: NL-XX-XXX-{TYPE}-{ABBREV}[-{name_suffix}]
    filename = f.stem
    new_ghcid = None
    match = re.match(r'NL-XX-XXX-([A-Z])-(.+)', filename)
    if match:
        inst_type, abbrev_suffix = match.groups()
        new_ghcid = f"NL-{region_code}-{city_code}-{inst_type}-{abbrev_suffix}"

        # Update GHCID
        ghcid = content.setdefault('ghcid', {})
        old_ghcid = ghcid.get('ghcid_current', filename)
        ghcid['ghcid_current'] = new_ghcid
        ghcid['ghcid_original'] = old_ghcid

        # Keep the existing history: close any open entries and append the
        # new one rather than replacing the list.
        history = ghcid.get('ghcid_history')
        if not isinstance(history, list):
            history = []
        for entry in history:
            if entry.get('valid_to') is None:
                entry['valid_to'] = timestamp
        history.append({
            'ghcid': new_ghcid,
            # NOTE(review): this is the numeric id already stored in the
            # file, not one derived from new_ghcid - confirm whether the
            # derived identifiers should be regenerated here.
            'ghcid_numeric': ghcid.get('ghcid_numeric'),
            'valid_from': timestamp,
            'valid_to': None,
            'reason': f'Location enriched via web search: {city}, {region_code}'
        })
        ghcid['ghcid_history'] = history

        # Add location resolution
        ghcid['location_resolution'] = {
            'method': 'WEB_SEARCH',
            'city': city,
            'city_code': city_code,
            'region_code': region_code,
            'country_code': 'NL',
            'source': source,
            'resolution_date': timestamp
        }

    # Add provenance note
    if 'provenance' not in content:
        content['provenance'] = {}
    if 'notes' not in content['provenance']:
        content['provenance']['notes'] = []
    content['provenance']['notes'].append(
        f"Location enriched via web search on {now.strftime('%Y-%m-%d')}: {city}, {region_code}"
    )

    # Write back
    with open(f, 'w', encoding='utf-8') as file:
        yaml.dump(content, file, default_flow_style=False, allow_unicode=True, sort_keys=False)
    return new_ghcid
def main():
    """List the NL-XX-XXX files found and how many have a usable website."""
    print("Finding NL-XX-XXX files needing location enrichment...\n")
    files = find_xxx_files_needing_enrichment()
    print(f"Found {len(files)} files\n")

    # Split by whether the file has a website that is not an lnkd.in link
    with_website = []
    without_website = []
    for info in files:
        site = info['website']
        if site and 'lnkd.in' not in str(site):
            with_website.append(info)
        else:
            without_website.append(info)

    print(f"Files with valid website: {len(with_website)}")
    print(f"Files without valid website: {len(without_website)}")

    print("\n--- Sample files with websites (first 20) ---")
    for info in with_website[:20]:
        print(f" {info['name']}")
        print(f" Website: {info['website']}")
        print(f" File: {info['file'].name}")
        print()


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,299 @@
#!/usr/bin/env python3
"""
Fix institutions incorrectly assigned to NL (Netherlands) that are actually in other countries.
These institutions were imported from LinkedIn batch import but have wrong country codes.
"""
import yaml
import os
import re
import uuid
import hashlib
from datetime import datetime, timezone
from pathlib import Path
# Non-Dutch institutions to fix
# Verified via Exa web search 2025-12-17
#
# One dict per custodian file. fix_institution() reads:
#   old_filename          - current NL-XX-XXX file name inside the custodian directory
#   country, country_name - corrected country code and display name
#   region, region_code   - region name and the code used in the new GHCID
#   city, city_code       - city name and the code used in the new GHCID
#   address               - optional; only written to the location when truthy
#   source_url, notes     - copied into the location_resolution block
#   new_institution_type  - optional; replaces the type letter in the GHCID
#                           and the institution_type field
# institution_name is only printed by main().
NON_DUTCH_INSTITUTIONS = [
{
'old_filename': 'NL-XX-XXX-A-HAEU.yaml',
'institution_name': 'Historical Archives of the European Union',
'country': 'IT',
'country_name': 'Italy',
'region': 'Tuscany',
'region_code': '52', # Italian region code
'city': 'Firenze',
'city_code': 'FIR',
'address': 'Via Bolognese 156, 50139 Firenze, Villa Salviati',
'source_url': 'https://archives.eui.eu/en/repositories/1',
'notes': 'Part of European University Institute, Florence'
},
{
'old_filename': 'NL-XX-XXX-A-VZWADEB.yaml',
'institution_name': 'v.z.w. Archief- en Documentatiecentrum Erfgoed Binnenvaart',
'country': 'BE',
'country_name': 'Belgium',
'region': 'West-Vlaanderen',
'region_code': 'VWV',
'city': 'Oudenburg',
'city_code': 'OUD',
'address': 'Vaartdijk zuid 11, 8460 Oudenburg (aboard Museumschip Tordino)',
'source_url': 'http://binnenvaarterfgoed.be/',
'notes': 'Belgian v.z.w. (vzw = Belgian non-profit), located aboard museum ship'
},
{
'old_filename': 'NL-XX-XXX-M-FM-ford_museum.yaml',
'institution_name': 'Gerald R. Ford Presidential Museum',
'country': 'US',
'country_name': 'United States',
'region': 'Michigan',
'region_code': 'MI',
'city': 'Grand Rapids',
'city_code': 'GRA',
'address': '303 Pearl Street NW, Grand Rapids, MI 49504',
'source_url': 'https://www.fordlibrarymuseum.gov/visit/museum',
'notes': 'Part of National Archives system, commemorates 38th US President',
# Update institution_type from M to O (Official Institution - Presidential Library)
'new_institution_type': 'O',
},
{
'old_filename': 'NL-XX-XXX-M-DAJ.yaml',
'institution_name': 'Diorama Arsip Jogja',
'country': 'ID',
'country_name': 'Indonesia',
'region': 'Daerah Istimewa Yogyakarta',
'region_code': 'YO',
'city': 'Bantul',
'city_code': 'BAN',
'address': 'LT 1 Gedung DEPO ARSIP, Jl. Janti, Banguntapan, Kabupaten Bantul, Yogyakarta 55198',
'source_url': 'https://dioramaarsip.jogjaprov.go.id/home',
'notes': 'Digital archive diorama of Yogyakarta history, opened February 2022',
# It's actually an Archive (A), not Museum (M)
'new_institution_type': 'A',
},
# Batch 2: Added 2025-12-17 - More Indonesian and Palestinian institutions
{
'old_filename': 'NL-XX-XXX-M-MBV.yaml',
'institution_name': 'Museum Benteng Vredeburg',
'country': 'ID',
'country_name': 'Indonesia',
'region': 'Daerah Istimewa Yogyakarta',
'region_code': 'YO',
'city': 'Yogyakarta',
'city_code': 'YOG',
'address': 'Jl. Margo Mulyo No.6, Ngupasan, Kec. Gondomanan, Kota Yogyakarta 55122',
'source_url': 'https://forevervacation.com/yogyakarta/museum-benteng-vredeburg',
'notes': 'Dutch colonial fortress converted to museum in 1992, documents Indonesian independence struggle',
},
{
'old_filename': 'NL-XX-XXX-M-MBP.yaml',
'institution_name': 'Museum Batik Pekalongan',
'country': 'ID',
'country_name': 'Indonesia',
'region': 'Jawa Tengah',
'region_code': 'JT', # Central Java
'city': 'Pekalongan',
'city_code': 'PEK',
'address': 'Jl. Jetayu No.1, Pekalongan 51152',
'source_url': 'https://id.wikipedia.org/wiki/Museum_Batik_Pekalongan',
'notes': 'UNESCO recognized museum for batik conservation, opened 12 July 2006 by President SBY',
},
{
'old_filename': 'NL-XX-XXX-M-MG.yaml',
'institution_name': 'Municipality of Gaza',
'country': 'PS',
'country_name': 'Palestine',
'region': 'Gaza Strip',
'region_code': 'GZ',
'city': 'Gaza City',
'city_code': 'GAZ',
'address': None, # Address not verifiable due to current situation
'source_url': 'https://www.gaza-city.org',
'notes': 'Municipal government, founded 1898. Type corrected from M (Museum) to O (Official Institution)',
# It's a municipality (government), not a museum
'new_institution_type': 'O',
},
]
def generate_ghcid_uuid(ghcid_string: str) -> str:
    """Return the version-5 (name-based) UUID of a GHCID as a string.

    The standard DNS namespace UUID
    (6ba7b810-9dad-11d1-80b4-00c04fd430c8) is used as the namespace.
    """
    namespace = uuid.NAMESPACE_DNS
    derived = uuid.uuid5(namespace, ghcid_string)
    return str(derived)
def generate_ghcid_numeric(ghcid_string: str) -> int:
    """Return the leading 64 bits of the GHCID's SHA-256 as an integer.

    The first 8 digest bytes are interpreted as an unsigned big-endian
    number.
    """
    leading_hex = hashlib.sha256(ghcid_string.encode()).hexdigest()[:16]
    return int(leading_hex, 16)
def generate_ghcid_uuid_sha256(ghcid_string: str) -> str:
    """Return a version-8 style UUID taken from the GHCID's SHA-256 digest.

    The UUID is the first 16 digest bytes with the version nibble set to 8
    and the variant bits set to binary 10.
    """
    head = hashlib.sha256(ghcid_string.encode()).digest()[:16]
    as_int = int.from_bytes(head, byteorder='big')
    version_mask = 0xF << 76   # high nibble of byte 6
    variant_mask = 0x3 << 62   # top two bits of byte 8
    as_int = (as_int & ~version_mask) | (0x8 << 76)
    as_int = (as_int & ~variant_mask) | (0x2 << 62)
    return str(uuid.UUID(int=as_int))
def fix_institution(custodian_dir: Path, inst: dict) -> tuple[str | None, str | None]:
"""Fix a non-Dutch institution and return (old_path, new_path)."""
old_path = custodian_dir / inst['old_filename']
if not old_path.exists():
print(f" File not found: {old_path}")
return None, None
# Load YAML
with open(old_path, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
# Extract current GHCID components
old_ghcid = data['ghcid']['ghcid_current']
# Parse old GHCID to get type and abbreviation
# Format: NL-XX-XXX-{type}-{abbrev}[-{name_suffix}]
match = re.match(r'NL-XX-XXX-([A-Z])-([A-Z0-9]+)(?:-(.+))?', old_ghcid)
if not match:
print(f" Could not parse GHCID: {old_ghcid}")
return None, None
inst_type = match.group(1)
abbrev = match.group(2)
name_suffix = match.group(3) # May be None
# Check if we need to change institution type
if inst.get('new_institution_type'):
inst_type = inst['new_institution_type']
# Also update the institution_type field
data['institution_type'] = [inst_type]
# Build new GHCID with correct country
new_ghcid = f"{inst['country']}-{inst['region_code']}-{inst['city_code']}-{inst_type}-{abbrev}"
if name_suffix:
new_ghcid += f"-{name_suffix}"
# Generate new identifiers
new_uuid = generate_ghcid_uuid(new_ghcid)
new_uuid_sha256 = generate_ghcid_uuid_sha256(new_ghcid)
new_numeric = generate_ghcid_numeric(new_ghcid)
timestamp = datetime.now(timezone.utc).isoformat()
# Update location
data['location'] = {
'city': inst['city'],
'region': inst['region'],
'country': inst['country'],
}
if inst.get('address'):
data['location']['address'] = inst['address']
# Close out old ghcid_history entries
if 'ghcid_history' not in data['ghcid']:
data['ghcid']['ghcid_history'] = []
for entry in data['ghcid']['ghcid_history']:
if entry.get('valid_to') is None:
entry['valid_to'] = timestamp
# Add new history entry
data['ghcid']['ghcid_history'].append({
'ghcid': new_ghcid,
'ghcid_numeric': new_numeric,
'valid_from': timestamp,
'valid_to': None,
'reason': f"Country code corrected: NL -> {inst['country']} ({inst['country_name']}). "
f"Location: {inst['city']}, {inst['region']}"
})
# Update current GHCID
data['ghcid']['ghcid_current'] = new_ghcid
data['ghcid']['ghcid_original'] = new_ghcid # Also update original since NL was wrong
data['ghcid']['ghcid_uuid'] = new_uuid
data['ghcid']['ghcid_uuid_sha256'] = new_uuid_sha256
data['ghcid']['ghcid_numeric'] = new_numeric
# Add location_resolution
data['ghcid']['location_resolution'] = {
'method': 'EXA_WEB_SEARCH',
'city_code': inst['city_code'],
'city_name': inst['city'],
'region_code': inst['region_code'],
'region_name': inst['region'],
'country_code': inst['country'],
'resolution_date': timestamp,
'source_url': inst.get('source_url'),
'notes': inst.get('notes'),
}
# Update provenance
if 'provenance' not in data:
data['provenance'] = {}
if 'notes' not in data['provenance']:
data['provenance']['notes'] = []
data['provenance']['notes'].append(
f"Country code corrected on {timestamp[:10]}: NL was incorrect, "
f"institution is in {inst['country_name']} ({inst['country']})"
)
# Add web search source to provenance
if 'sources' not in data['provenance']:
data['provenance']['sources'] = {}
if 'web_search' not in data['provenance']['sources']:
data['provenance']['sources']['web_search'] = []
data['provenance']['sources']['web_search'].append({
'source_type': 'exa_web_search',
'data_tier': 'TIER_2_VERIFIED', # Higher tier since we verified country
'source_url': inst.get('source_url'),
'extraction_timestamp': timestamp,
'claims_extracted': ['country', 'region', 'city', 'address'],
})
# Write updated YAML to new filename
new_filename = new_ghcid.replace('/', '_') + '.yaml'
new_path = custodian_dir / new_filename
with open(new_path, 'w', encoding='utf-8') as f:
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
# Remove old file
if old_path != new_path:
old_path.unlink()
return str(old_path), str(new_path)
def main():
    """Correct every entry in NON_DUTCH_INSTITUTIONS and print a summary."""
    custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian')
    rule = "=" * 70

    print(rule)
    print("Fixing Non-Dutch Institutions (Country Code Corrections)")
    print(rule)

    success_count = 0
    for inst in NON_DUTCH_INSTITUTIONS:
        print(f"\nProcessing: {inst['old_filename']}")
        print(f" Institution: {inst['institution_name']}")
        print(f" Correction: NL -> {inst['country']} ({inst['country_name']})")
        print(f" Location: {inst['city']}, {inst['region']}")

        old_path, new_path = fix_institution(custodian_dir, inst)
        if not (old_path and new_path):
            # fix_institution already reported why this entry was skipped.
            continue

        old_name = os.path.basename(old_path)
        new_name = os.path.basename(new_path)
        print(f" Renamed: {old_name}")
        print(f" -> {new_name}")
        success_count += 1

    print("\n" + rule)
    print(f"Summary: {success_count}/{len(NON_DUTCH_INSTITUTIONS)} institutions corrected")
    print(rule)


if __name__ == '__main__':
    main()

View file

@ -202,14 +202,20 @@ def extract_metadata(data: dict[str, Any], filepath: Path) -> dict[str, Any]:
metadata["country"] = loc["country"]
if loc.get("city"):
metadata["city"] = loc["city"]
if loc.get("region"):
# Use region_code (ISO 3166-2) for filtering, fallback to region name
if loc.get("region_code"):
metadata["region"] = loc["region_code"] # e.g., "NH" not "Noord-Holland"
elif loc.get("region"):
metadata["region"] = loc["region"]
elif location:
if location.get("country"):
metadata["country"] = location["country"]
if location.get("city"):
metadata["city"] = location["city"]
if location.get("region"):
# Use region_code (ISO 3166-2) for filtering, fallback to region name
if location.get("region_code"):
metadata["region"] = location["region_code"] # e.g., "NH" not "Noord-Holland"
elif location.get("region"):
metadata["region"] = location["region"]
# Also extract country from GHCID if not found elsewhere
@ -290,24 +296,15 @@ def extract_metadata(data: dict[str, Any], filepath: Path) -> dict[str, Any]:
def find_institution_files(data_dir: Path) -> list[Path]:
"""Find all institution YAML files in the data directory."""
"""Find all institution YAML files in the data directory.
Optimized for large directories using os.listdir instead of glob.
"""
import os
files = []
# Look for YAML files in common patterns
patterns = [
"*.yaml",
"*.yml",
"**/*.yaml",
"**/*.yml",
]
for pattern in patterns:
files.extend(data_dir.glob(pattern))
# Deduplicate
files = list(set(files))
# Filter out non-institution files
# Filter patterns
excluded_patterns = [
"_schema",
"_config",
@ -316,12 +313,37 @@ def find_institution_files(data_dir: Path) -> list[Path]:
"example_",
]
filtered = []
for f in files:
if not any(excl in f.name.lower() for excl in excluded_patterns):
filtered.append(f)
def is_valid_file(name: str) -> bool:
"""Check if file is a valid institution YAML file."""
if not name.endswith(('.yaml', '.yml')):
return False
if name.startswith('.'):
return False
name_lower = name.lower()
return not any(excl in name_lower for excl in excluded_patterns)
return sorted(filtered)
# Get top-level YAML files (most common case - fast with os.listdir)
try:
for name in os.listdir(data_dir):
if is_valid_file(name):
filepath = data_dir / name
if filepath.is_file():
files.append(filepath)
except PermissionError:
logger.warning(f"Permission denied accessing {data_dir}")
# Check known subdirectories for additional files
known_subdirs = ["person", "web", "archived"]
for subdir in known_subdirs:
subdir_path = data_dir / subdir
if subdir_path.exists():
for root, _, filenames in os.walk(subdir_path):
root_path = Path(root)
for name in filenames:
if is_valid_file(name):
files.append(root_path / name)
return sorted(files)
def main():
@ -371,6 +393,12 @@ def main():
action="store_true",
help="Parse files but don't index"
)
parser.add_argument(
"--limit",
type=int,
default=None,
help="Limit number of files to process (for testing)"
)
args = parser.parse_args()
@ -384,6 +412,11 @@ def main():
files = find_institution_files(args.data_dir)
logger.info(f"Found {len(files)} institution files")
# Apply limit if specified
if args.limit:
files = files[:args.limit]
logger.info(f"Limited to {len(files)} files for processing")
if not files:
logger.warning("No institution files found")
sys.exit(0)

6643
scripts/sync/mappings.py Normal file

File diff suppressed because it is too large Load diff

View file

@ -179,14 +179,20 @@ def extract_metadata(data: dict[str, Any], filepath: Path) -> dict[str, Any]:
metadata["country"] = loc["country"]
if loc.get("city"):
metadata["city"] = loc["city"]
if loc.get("region"):
# Use region_code (ISO 3166-2) for filtering, fallback to region name
if loc.get("region_code"):
metadata["region"] = loc["region_code"] # e.g., "NH" not "Noord-Holland"
elif loc.get("region"):
metadata["region"] = loc["region"]
elif location:
if location.get("country"):
metadata["country"] = location["country"]
if location.get("city"):
metadata["city"] = location["city"]
if location.get("region"):
# Use region_code (ISO 3166-2) for filtering, fallback to region name
if location.get("region_code"):
metadata["region"] = location["region_code"] # e.g., "NH" not "Noord-Holland"
elif location.get("region"):
metadata["region"] = location["region"]
# Country from GHCID