feat(scripts): expand university location mappings and add web enrichment

- enrich_ppids.py: Add 40+ Dutch universities and hogescholen to location mapping
- enrich_ppids_web.py: New script for web-based PPID enrichment
- resolve_pending_known_orgs.py: Updates for pending org resolution
This commit is contained in:
kempersc 2026-01-09 21:10:14 +01:00
parent ea35da02dc
commit dd0ee2cf11
3 changed files with 1590 additions and 46 deletions

View file

@ -114,6 +114,85 @@ DUTCH_UNI_LOCATIONS = {
"IOPS": ("Amsterdam", "NL"),
"Interuniversity Graduate School of Psychometrics": ("Amsterdam", "NL"),
"Sioo": ("Utrecht", "NL"),
# Additional Dutch universities (expanded mapping)
"Eindhoven University of Technology": ("Eindhoven", "NL"),
"Delft University of Technology": ("Delft", "NL"),
"University of Twente": ("Enschede", "NL"),
"Universiteit Twente": ("Enschede", "NL"),
"UT": ("Enschede", "NL"),
"Open Universiteit": ("Heerlen", "NL"),
"Open University Netherlands": ("Heerlen", "NL"),
"Nyenrode": ("Breukelen", "NL"),
"Nyenrode Business Universiteit": ("Breukelen", "NL"),
"Theologische Universiteit": ("Kampen", "NL"),
"Protestant Theological University": ("Amsterdam", "NL"),
# Additional Hogescholen
"De Haagse Hogeschool": ("Den Haag", "NL"),
"The Hague University": ("Den Haag", "NL"),
"The Hague University of Applied Sciences": ("Den Haag", "NL"),
"Amsterdamse Hogeschool voor de Kunsten": ("Amsterdam", "NL"),
"AHK": ("Amsterdam", "NL"),
"Conservatorium van Amsterdam": ("Amsterdam", "NL"),
"Hanzehogeschool Groningen": ("Groningen", "NL"),
"Hogeschool Leiden": ("Leiden", "NL"),
"Hogeschool Zeeland": ("Vlissingen", "NL"),
"HZ University of Applied Sciences": ("Vlissingen", "NL"),
"Hogeschool voor de Kunsten Utrecht": ("Utrecht", "NL"),
"HKU": ("Utrecht", "NL"),
"Willem de Kooning Academie": ("Rotterdam", "NL"),
"Codarts Rotterdam": ("Rotterdam", "NL"),
"Codarts": ("Rotterdam", "NL"),
"Design Academy": ("Eindhoven", "NL"),
"NHTV": ("Breda", "NL"),
"NHTV Breda University of Applied Sciences": ("Breda", "NL"),
"Breda University of Applied Sciences": ("Breda", "NL"),
"NHL Hogeschool": ("Leeuwarden", "NL"),
"Van Hall Larenstein": ("Velp", "NL"),
"NCOI": ("Hilversum", "NL"),
"NCOI Opleidingen": ("Hilversum", "NL"),
"LOI": ("Leiderdorp", "NL"),
"LOI Hogeschool": ("Leiderdorp", "NL"),
"NTI": ("Leiden", "NL"),
"Hogeschool Arnhem": ("Arnhem", "NL"),
"Hogeschool Nijmegen": ("Nijmegen", "NL"),
"ROC": ("", "NL"), # Regional Training Centers - various locations (fallback)
# Specific ROC locations
"ROC Leeuwenborgh": ("Maastricht", "NL"),
"ROC Leiden": ("Leiden", "NL"),
"ROC Midden Nederland": ("Utrecht", "NL"),
"ROC MN": ("Utrecht", "NL"),
"ROC van Amsterdam": ("Amsterdam", "NL"),
"ROC Amsterdam": ("Amsterdam", "NL"),
"ROC Flevoland": ("Almere", "NL"),
"ROC Tilburg": ("Tilburg", "NL"),
"ROC van Twente": ("Enschede", "NL"),
"ROC Twente": ("Enschede", "NL"),
"ROC Nijmegen": ("Nijmegen", "NL"),
"ROC Mondriaan": ("Den Haag", "NL"),
"ROC Nova College": ("Haarlem", "NL"),
"ROC Albeda": ("Rotterdam", "NL"),
"Albeda College": ("Rotterdam", "NL"),
"Zadkine": ("Rotterdam", "NL"),
"Graafschap College": ("Doetinchem", "NL"),
"Friesland College": ("Leeuwarden", "NL"),
"Noorderpoort": ("Groningen", "NL"),
"Alfa-college": ("Groningen", "NL"),
"Deltion College": ("Zwolle", "NL"),
"Cibap": ("Zwolle", "NL"),
"Summa College": ("Eindhoven", "NL"),
"SintLucas": ("Eindhoven", "NL"),
"Koning Willem I College": ("Den Bosch", "NL"),
"Curio": ("Breda", "NL"),
"Da Vinci College": ("Dordrecht", "NL"),
# Additional Radboud variations
"Radboud University Nijmegen": ("Nijmegen", "NL"),
"Radboud University": ("Nijmegen", "NL"),
# Additional VU variations
"Vrije Universiteit Amsterdam": ("Amsterdam", "NL"),
"VU University Amsterdam": ("Amsterdam", "NL"),
# Wageningen variations
"Wageningen University & Research": ("Wageningen", "NL"),
"WUR": ("Wageningen", "NL"),
# Belgian institutions
"KU Leuven": ("Leuven", "BE"),
"University of Leuven": ("Leuven", "BE"),
@ -141,9 +220,85 @@ DUTCH_UNI_LOCATIONS = {
"LMU München": ("München", "DE"),
"Technische Universität München": ("München", "DE"),
"TU München": ("München", "DE"),
# International
# UK institutions
"University of Oxford": ("Oxford", "GB"),
"Oxford University": ("Oxford", "GB"),
"University of Cambridge": ("Cambridge", "GB"),
"Cambridge University": ("Cambridge", "GB"),
"University of York": ("York", "GB"),
"University College London": ("London", "GB"),
"UCL": ("London", "GB"),
"London School of Economics": ("London", "GB"),
"LSE": ("London", "GB"),
"King's College London": ("London", "GB"),
"Imperial College": ("London", "GB"),
"University of Edinburgh": ("Edinburgh", "GB"),
"University of Manchester": ("Manchester", "GB"),
# Australian institutions
"The Australian National University": ("Canberra", "AU"),
"Australian National University": ("Canberra", "AU"),
"ANU": ("Canberra", "AU"),
"University of Canberra": ("Canberra", "AU"),
"University of Melbourne": ("Melbourne", "AU"),
"University of Sydney": ("Sydney", "AU"),
"Macquarie University": ("Sydney", "AU"),
"Charles Sturt University": ("Bathurst", "AU"),
"UNSW": ("Sydney", "AU"),
"University of New South Wales": ("Sydney", "AU"),
"University of Queensland": ("Brisbane", "AU"),
"Monash University": ("Melbourne", "AU"),
# South African institutions
"University of Cape Town": ("Cape Town", "ZA"),
"UCT": ("Cape Town", "ZA"),
"University of Pretoria": ("Pretoria", "ZA"),
"University of Witwatersrand": ("Johannesburg", "ZA"),
"Stellenbosch University": ("Stellenbosch", "ZA"),
# Italian institutions
"Politecnico di Milano": ("Milano", "IT"),
"Università degli Studi di Milano": ("Milano", "IT"),
"Università di Bologna": ("Bologna", "IT"),
"University of Bologna": ("Bologna", "IT"),
# US institutions
"Oberlin College": ("Oberlin", "US"),
"Harvard University": ("Cambridge", "US"),
"Harvard": ("Cambridge", "US"),
"Yale University": ("New Haven", "US"),
"Princeton University": ("Princeton", "US"),
"MIT": ("Cambridge", "US"),
"Massachusetts Institute of Technology": ("Cambridge", "US"),
"Stanford University": ("Stanford", "US"),
"Columbia University": ("New York", "US"),
"University of California": ("Berkeley", "US"),
"UCLA": ("Los Angeles", "US"),
"University of Chicago": ("Chicago", "US"),
"NYU": ("New York", "US"),
"New York University": ("New York", "US"),
# Indonesian institutions
"Universitas Gadjah Mada": ("Yogyakarta", "ID"),
"UGM": ("Yogyakarta", "ID"),
"Universitas Indonesia": ("Jakarta", "ID"),
"UI": ("Jakarta", "ID"),
# Turkish institutions
"Middle East Technical University": ("Ankara", "TR"),
"METU": ("Ankara", "TR"),
"Boğaziçi University": ("Istanbul", "TR"),
# Additional Dutch variations found in data
"Rotterdam School of Management": ("Rotterdam", "NL"),
"RSM": ("Rotterdam", "NL"),
"TIAS School for Business and Society": ("Tilburg", "NL"),
"TIAS": ("Tilburg", "NL"),
"GO opleidingen": ("Utrecht", "NL"),
"Amsterdam University of Applied Sciences": ("Amsterdam", "NL"),
"University College Utrecht": ("Utrecht", "NL"),
"UCU": ("Utrecht", "NL"),
"University of Utrecht": ("Utrecht", "NL"),
"NSOB": ("Den Haag", "NL"),
"Nederlandse School voor Openbaar Bestuur": ("Den Haag", "NL"),
"Grotius Academie": ("Nijmegen", "NL"),
"de Baak": ("Noordwijk", "NL"),
"Grafisch Lyceum Rotterdam": ("Rotterdam", "NL"),
"Schoevers": ("Utrecht", "NL"),
"Schoevers College": ("Utrecht", "NL"),
}
@ -171,12 +326,39 @@ def geocode_location(location_str: str, db_path: str) -> Optional[dict]:
# Extract country from common patterns
country_code = None
if "(NL)" in location_str or "Netherlands" in location_str or "Nederland" in location_str:
country_code = "NL"
elif "(BE)" in location_str or "Belgium" in location_str or "België" in location_str:
country_code = "BE"
elif "(DE)" in location_str or "Germany" in location_str or "Deutschland" in location_str:
country_code = "DE"
country_patterns = {
"NL": ["(NL)", "Netherlands", "Nederland"],
"BE": ["(BE)", "Belgium", "België", "Belgique"],
"DE": ["(DE)", "Germany", "Deutschland"],
"GB": ["(GB)", "United Kingdom", "UK", "England", "Scotland", "Wales"],
"AU": ["(AU)", "Australia"],
"ZA": ["(ZA)", "South Africa"],
"IT": ["(IT)", "Italy", "Italia"],
"US": ["(US)", "United States", "USA", "U.S."],
"ID": ["(ID)", "Indonesia"],
"TR": ["(TR)", "Turkey", "Türkiye"],
"FR": ["(FR)", "France"],
"ES": ["(ES)", "Spain", "España"],
"AT": ["(AT)", "Austria", "Österreich"],
"CH": ["(CH)", "Switzerland", "Schweiz", "Suisse"],
"CA": ["(CA)", "Canada"],
"NZ": ["(NZ)", "New Zealand"],
"JP": ["(JP)", "Japan"],
"CN": ["(CN)", "China"],
"IN": ["(IN)", "India"],
"BR": ["(BR)", "Brazil", "Brasil"],
"SE": ["(SE)", "Sweden", "Sverige"],
"NO": ["(NO)", "Norway", "Norge"],
"DK": ["(DK)", "Denmark", "Danmark"],
"FI": ["(FI)", "Finland", "Suomi"],
"PL": ["(PL)", "Poland", "Polska"],
"CZ": ["(CZ)", "Czech Republic", "Czechia", "Česko"],
}
for code, patterns in country_patterns.items():
if any(p in location_str for p in patterns):
country_code = code
break
# Clean location for city lookup
city_candidate = location_str.split(",")[0].strip()
@ -255,6 +437,56 @@ def parse_date_range(date_range: str) -> Tuple[Optional[int], Optional[int]]:
return start_year, end_year
def get_any_date_field(record: dict) -> str:
    """
    Return a single date string for a record, whatever its date field is called.

    Field conventions handled (counts as observed in LinkedIn profile data):
    - date_range: "2019 - Present" (most common, 2,486 entries)
    - period: "2015 - 2019" (15 entries)
    - years/year: "2010" (single year)
    - dates: "2018 - 2020" (12 entries)
    - start_date/end_date: separate fields such as "Sep 2019" / "Present"
      (149 entries), joined as "start - end"

    The first non-empty combined field wins; the separate start/end fields
    are only consulted when none of the combined fields has a value.

    Returns a string suitable for parse_date_range(), or "" when the record
    carries no date information.
    """
    # First truthy combined field, in priority order.
    combined = next(
        (
            record[key]
            for key in ("date_range", "period", "years", "year", "dates")
            if record.get(key)
        ),
        None,
    )
    if combined is not None:
        return str(combined)

    # Fall back to the split representation; None values are treated as empty.
    begin = record.get("start_date", "") or ""
    finish = record.get("end_date", "") or ""
    if not (begin or finish):
        return ""

    # Stripping " -" removes the dangling separator when one side is missing.
    return f"{begin} - {finish}".strip(" -")
def parse_total_experience_field(total_exp: str) -> Optional[int]:
    """
    Parse a total-experience field value and extract the whole number of years.

    Handles formats like:
    - "24 years and 8 months"
    - "37 years"
    - "5 years 3 months"
    - "1 year"
    - "2.5 years" / "2,5 years" (fractional part is discarded -> 2)

    Args:
        total_exp: Raw field value. Expected to be a string; any other
            truthy value is converted with str() instead of raising.

    Returns:
        Number of whole years, or None when the value is empty or contains
        no "<number> year(s)" fragment (e.g. "8 months").
    """
    if not total_exp:
        return None

    # The field comes from scraped profile JSON, so tolerate non-string values.
    text = str(total_exp).lower()

    # The optional fractional group makes the match start at the integer part;
    # without it "2.5 years" would be read as 5 years.
    match = re.search(r'(\d+)(?:[.,]\d+)?\s*years?', text)
    return int(match.group(1)) if match else None
def build_inference_chain(steps: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return copies of *steps*, each prefixed with a 1-based "step" number."""
    chain = []
    for position, entry in enumerate(steps, start=1):
        numbered = {"step": position}
        # Merged second, so the entry's own keys follow "step" in the output.
        numbered.update(entry)
        chain.append(numbered)
    return chain
@ -297,6 +529,22 @@ def get_adjacent_decades(year: int) -> Tuple[str, str]:
return (get_decade_notation(year - 10), get_decade_notation(year))
def parse_total_experience(about_text: str) -> Optional[int]:
    """
    Extract the year count from a "Total Experience: X years" phrase.

    The phrase is searched case-insensitively anywhere in the about/summary
    text; a trailing "and Y months" part is ignored.

    Returns the number of years, or None when the text is empty or the
    phrase is absent.
    """
    if not about_text:
        return None

    # Matches "Total Experience: X years and Y months" as well as "... X year".
    found = re.compile(
        r'Total Experience:\s*(\d+)\s*years?', re.IGNORECASE
    ).search(about_text)
    return None if found is None else int(found.group(1))
def infer_birth_decade(profile_data: dict) -> Optional[dict]:
"""
Infer birth decade from earliest career observations.
@ -305,6 +553,11 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
Supports list-valued results for decade boundary cases (Rule 45 extension):
- If estimated birth year is within 3 years of decade boundary, returns
both adjacent decades as EDTF set notation: [196X,197X]
Inference methods (in priority order):
1. Education start year (most reliable - entry age 18-24)
2. Experience start year (first job - entry age ~23)
3. Total Experience pattern (fallback - "Total Experience: X years")
"""
earliest_year = None
inference_steps = []
@ -312,6 +565,7 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
age_variance = 3 # ±3 years typical variance in entry age
education_record = None
experience_record = None
total_experience_years = None
# Check education first (most reliable)
education = profile_data.get("education") or []
@ -381,8 +635,8 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
for exp in experience:
if exp is None:
continue
# Handle multiple date field names
date_range = exp.get("date_range") or exp.get("period") or ""
# Handle multiple date field names (including start_date/end_date)
date_range = get_any_date_field(exp)
start_year, _ = parse_date_range(date_range)
if start_year:
@ -396,10 +650,59 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
"date_range": date_range,
}
# If no education or experience dates, try "Total Experience" pattern in about field
if earliest_year is None:
about = profile_data.get("about") or profile_data.get("summary") or ""
total_experience_years = parse_total_experience(about)
if total_experience_years and total_experience_years > 0:
# Estimate: current year - total_years = first job year
# Then: first job year - 23 = birth year (assuming first job at 23)
current_year = datetime.now().year
estimated_first_job_year = current_year - total_experience_years
earliest_year = estimated_first_job_year
age_offset = 23 # Assume first job at 23
age_variance = 7 # Very high variance for this method
inference_steps.append({
"observation": "Total Experience pattern found in about field",
"source_field": "profile_data.about",
"source_value": f"Total Experience: {total_experience_years} years",
})
inference_steps.append({
"calculation": f"{current_year} - {total_experience_years} = {estimated_first_job_year}",
"result": f"Estimated first job year: {estimated_first_job_year}",
"assumption": "Total experience represents continuous career from first job",
})
# If still no date, try standalone total_experience field in profile_data
if earliest_year is None:
total_exp_field = profile_data.get("total_experience")
if total_exp_field:
total_experience_years = parse_total_experience_field(total_exp_field)
if total_experience_years and total_experience_years > 0:
current_year = datetime.now().year
estimated_first_job_year = current_year - total_experience_years
earliest_year = estimated_first_job_year
age_offset = 23 # Assume first job at 23
age_variance = 7 # Very high variance for this method
inference_steps.append({
"observation": "total_experience field found in profile_data",
"source_field": "profile_data.total_experience",
"source_value": total_exp_field,
})
inference_steps.append({
"calculation": f"{current_year} - {total_experience_years} = {estimated_first_job_year}",
"result": f"Estimated first job year: {estimated_first_job_year}",
"assumption": "Total experience represents continuous career from first job",
})
if earliest_year is None:
return None
# Build inference chain
# Build inference chain (only add steps if not already added from Total Experience path)
if education_record:
inference_steps.append({
"observation": "Education record found",
@ -415,7 +718,7 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
"rationale": "Standard entry age for this education level in Netherlands/Europe",
"confidence_impact": f"Assumption introduces uncertainty; actual age may vary ±{age_variance} years",
})
else:
elif experience_record:
inference_steps.append({
"observation": "First job record found (no education data)",
"source_field": "profile_data.experience",
@ -430,6 +733,13 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
"rationale": "Assumes first job after typical university completion",
"confidence_impact": f"Higher uncertainty; first job age varies ±{age_variance} years",
})
elif total_experience_years:
# Steps already added in the Total Experience detection block
inference_steps.append({
"assumption": f"First job age is approximately {age_offset}{age_variance} years)",
"rationale": "Assumes first job after typical university completion; Total Experience method has highest uncertainty",
"confidence_impact": f"Very high uncertainty; first job age varies ±{age_variance} years, plus Total Experience aggregation may be inaccurate",
})
estimated_birth_year = earliest_year - age_offset
min_birth_year = earliest_year - age_offset - age_variance
@ -468,6 +778,14 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
"rationale": "Cannot determine which decade with certainty; using EDTF 'one of' set notation",
})
# Determine method name based on source
if education_record:
method_name = "earliest_education_heuristic"
elif experience_record:
method_name = "earliest_experience_heuristic"
else:
method_name = "total_experience_heuristic"
return {
"values": [decade1, decade2],
"edtf": f"[{decade1},{decade2}]",
@ -477,7 +795,7 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
"primary_rationale": primary_rationale,
"confidence": "very_low", # Lower confidence due to boundary uncertainty
"inference_provenance": {
"method": "earliest_observation_heuristic",
"method": method_name,
"inference_chain": build_inference_chain(inference_steps),
"assumptions": [
f"Entry age for education/first job: {age_offset} years (±{age_variance})",
@ -499,13 +817,24 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
"rationale": "Decade precision appropriate for heuristic-based estimate",
})
# Determine method name and confidence based on source
if education_record:
method_name = "earliest_education_heuristic"
confidence = "low"
elif experience_record:
method_name = "earliest_experience_heuristic"
confidence = "low"
else:
method_name = "total_experience_heuristic"
confidence = "very_low" # Lowest confidence for Total Experience method
return {
"value": edtf_decade,
"edtf": edtf_decade,
"precision": "decade",
"confidence": "low",
"confidence": confidence,
"inference_provenance": {
"method": "earliest_observation_heuristic",
"method": method_name,
"inference_chain": build_inference_chain(inference_steps),
"assumptions": [
f"Entry age for education/first job: {age_offset} years (±{age_variance})",
@ -549,7 +878,21 @@ def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]:
for uni_name, (city, country) in DUTCH_UNI_LOCATIONS.items():
if uni_name.lower() in institution.lower():
location = f"{city}, Netherlands" if city else None
# Map country code to country name for geocoding
country_names = {
"NL": "Netherlands",
"BE": "Belgium",
"DE": "Germany",
"GB": "United Kingdom",
"AU": "Australia",
"ZA": "South Africa",
"IT": "Italy",
"US": "United States",
"ID": "Indonesia",
"TR": "Turkey",
}
country_name = country_names.get(country, "Netherlands")
location = f"{city}, {country_name}" if city else None
location_source = f"Known institution mapping: {uni_name}"
break
@ -622,8 +965,8 @@ def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]:
for exp in experience:
if exp is None:
continue
# Handle multiple date field names
date_range = exp.get("date_range") or exp.get("period") or ""
# Handle multiple date field names (including start_date/end_date)
date_range = get_any_date_field(exp)
start_year, _ = parse_date_range(date_range)
if start_year and exp.get("location"):
exp_with_years.append((start_year, exp))
@ -636,7 +979,7 @@ def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]:
continue
# Get date_range for provenance (handle multiple field names)
exp_date_range = exp.get("date_range") or exp.get("period") or ""
exp_date_range = get_any_date_field(exp)
inference_steps.append({
"observation": "Earliest job with location found (no education location available)",
@ -739,8 +1082,8 @@ def infer_current_settlement(profile_data: dict, db_path: str) -> Optional[dict]
for exp in experience:
if exp is None:
continue
# Handle multiple date field names
date_range = exp.get("date_range") or exp.get("period") or ""
# Handle multiple date field names (including start_date/end_date)
date_range = get_any_date_field(exp)
# Also check "current" field which some profiles have
is_current = "Present" in date_range or exp.get("current") is True
if is_current:
@ -815,6 +1158,7 @@ def enrich_ppid_file(filepath: Path, db_path: str, dry_run: bool = False, force:
stats = {
"birth_decade_inferred": False,
"birth_decade_is_list": False, # Track decade boundary cases
"birth_decade_method": None, # Track which method was used
"birth_settlement_inferred": False,
"current_settlement_inferred": False,
"ppid_changed": False,
@ -870,6 +1214,9 @@ def enrich_ppid_file(filepath: Path, db_path: str, dry_run: bool = False, force:
components["first_date"] = birth_info["edtf"]
components["first_date_source"] = "inferred_birth_decade"
# Track which method was used
stats["birth_decade_method"] = birth_info.get("inference_provenance", {}).get("method", "unknown")
# Add note to canonical field pointing to inferred alternative
data["birth_date"]["note"] = "See inferred_birth_decade for heuristic estimate"
@ -978,6 +1325,11 @@ def main():
"processed": 0,
"birth_decade_inferred": 0,
"birth_decade_list_valued": 0, # Decade boundary cases
"birth_decade_by_method": {
"earliest_education_heuristic": 0,
"earliest_experience_heuristic": 0,
"total_experience_heuristic": 0,
},
"birth_settlement_inferred": 0,
"current_settlement_inferred": 0,
"ppid_changed": 0,
@ -990,6 +1342,10 @@ def main():
total_stats["processed"] += 1
if stats["birth_decade_inferred"]:
total_stats["birth_decade_inferred"] += 1
# Track method used
method = stats.get("birth_decade_method")
if method and method in total_stats["birth_decade_by_method"]:
total_stats["birth_decade_by_method"][method] += 1
if stats.get("birth_decade_is_list"):
total_stats["birth_decade_list_valued"] += 1
if stats["birth_settlement_inferred"]:
@ -999,7 +1355,7 @@ def main():
if stats["ppid_changed"]:
total_stats["ppid_changed"] += 1
if args.verbose and any(stats.values()):
if args.verbose and any(v for k, v in stats.items() if k != "birth_decade_method"):
print(f" {filepath.name}: {stats}")
if (i + 1) % 500 == 0:
@ -1017,6 +1373,9 @@ def main():
print(f"Processed: {total_stats['processed']}")
print(f"Birth decades inferred: {total_stats['birth_decade_inferred']}")
print(f" - List-valued (boundary): {total_stats['birth_decade_list_valued']}")
print(f" - By method:")
for method, count in total_stats["birth_decade_by_method"].items():
print(f" {method}: {count}")
print(f"Birth settlements inferred: {total_stats['birth_settlement_inferred']}")
print(f"Current settlements inferred: {total_stats['current_settlement_inferred']}")
print(f"PPIDs updated: {total_stats['ppid_changed']}")
@ -1033,6 +1392,7 @@ def main():
print("\nNote: All inferred data stored in explicit inferred_* fields with provenance chains.")
print("Note: Decade boundary cases use EDTF set notation [196X,197X] with primary_value for PPID.")
print("Note: Total Experience method has highest uncertainty (very_low confidence).")
if __name__ == "__main__":

579
scripts/enrich_ppids_web.py Normal file
View file

@ -0,0 +1,579 @@
#!/usr/bin/env python3
"""
PPID Web Enrichment Script
Enriches PPID files with web-sourced claims using Exa AI and Linkup search.
Adds proper provenance statements per Rules 6, 26, and 35.
Enrichment targets:
1. Birth date/year - Search for biographical information
2. Publications - ORCID, Google Scholar, ResearchGate
3. News mentions - Press coverage, interviews
4. Wikidata entity - Authority file linking
5. Institutional affiliations - Verify current roles
All web claims include:
- source_url: Where the data was found
- retrieved_on: ISO 8601 timestamp
- retrieval_agent: Tool used (exa_web_search, linkup_search, etc.)
- claim_type: Type of claim (birth_date, publication, news_mention, etc.)
- claim_value: The extracted value
- provenance: Full provenance chain per Rule 35
Usage:
python scripts/enrich_ppids_web.py --limit 10 --verbose
python scripts/enrich_ppids_web.py --dry-run --sample stefankulk
"""
import json
import os
import re
import sys
import time
import argparse
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, List, Any, Tuple
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
def create_web_claim(
    claim_type: str,
    claim_value: str,
    source_url: str,
    retrieval_agent: str,
    confidence: str = "medium",
    notes: Optional[str] = None,
    raw_response: Optional[Dict] = None
) -> Dict[str, Any]:
    """
    Assemble a web claim record with provenance per Rules 6, 26, and 35.

    Args:
        claim_type: Type of claim (birth_date, publication, news_mention, etc.)
        claim_value: The extracted value
        source_url: URL where the data was found
        retrieval_agent: Tool used (exa_web_search, linkup_search, etc.)
        confidence: Confidence level (high, medium, low, very_low)
        notes: Optional free-text remark; stored under "notes" only if non-empty
        raw_response: Optional raw API response; only the first 500 characters
            of its str() form are kept, under provenance["response_snippet"]

    Returns:
        Dict with claim structure per Rule 26
    """
    # One UTC timestamp is shared by every time field of the claim.
    timestamp = datetime.now(timezone.utc).isoformat()

    provenance = {
        "statement_created_at": timestamp,
        "source_archived_at": timestamp,  # Same time for API responses
        "retrieval_method": retrieval_agent,
    }
    if raw_response:
        # Keep a short snippet for audit rather than the full response.
        provenance["response_snippet"] = str(raw_response)[:500]

    claim = {
        "claim_type": claim_type,
        "claim_value": claim_value,
        "source_url": source_url,
        "retrieved_on": timestamp,
        "retrieval_agent": retrieval_agent,
        "confidence": confidence,
        "provenance": provenance,
    }
    if notes:
        claim["notes"] = notes

    return claim
def extract_birth_year_from_text(text: str, full_name: str) -> Optional[Tuple[str, str]]:
    """
    Extract a birth year from free text using a sequence of patterns.

    The patterns are tried in order and the first hit wins:
    1. "born in YYYY" / "born YYYY"
    2. "(YYYY - )" open lifespan
    3. "b. YYYY" / "bornYYYY"
    4. "X years old" / "X jaar oud" (year estimated from the current year)
    5. "birthday: ...", "geboren ...", "date of birth ..."

    Keyword patterns are anchored on word boundaries so that fragments of
    other words ("Feb. 2020", "stubborn 1999") and the tail of a longer
    number ("120 years old") are not mistaken for birth information.

    Args:
        text: Text to scan (search-result snippet or page content).
        full_name: Person's full name; its last whitespace-separated token
            must occur in the text (case-insensitive) or nothing is
            extracted. An empty or None name disables that check.

    Returns:
        Tuple of (birth_year_edtf, extraction_note) or None. Age-based
        estimates carry the EDTF approximate marker, e.g. "1980~".
    """
    if not text:
        return None

    # Normalize text
    text_lower = text.lower()
    name_parts = (full_name or "").lower().split()
    last_name = name_parts[-1] if name_parts else ""

    # Check if the text is about the right person (basic substring check)
    if last_name and last_name not in text_lower:
        return None

    # Pattern 1: "born in YYYY" or "born YYYY"
    born_match = re.search(r'\bborn\s+(?:in\s+)?(\d{4})', text_lower)
    if born_match:
        year = born_match.group(1)
        return (year, f"Extracted from 'born {year}' pattern")

    # Pattern 2: "(YYYY - )" or "(YYYY-)" indicating birth year
    birth_dash_match = re.search(r'\((\d{4})\s*[-–—]\s*\)', text)
    if birth_dash_match:
        year = birth_dash_match.group(1)
        return (year, f"Extracted from '({year} - )' lifespan pattern")

    # Pattern 3: "b. YYYY" (also "born" directly followed by the year).
    # The leading \b keeps abbreviations such as "Feb." from matching "b.".
    b_match = re.search(r'\b(?:b\.|born)\s*(\d{4})', text_lower)
    if b_match:
        year = b_match.group(1)
        return (year, f"Extracted from 'b. {year}' pattern")

    # Pattern 4: Age patterns "X years old" / "X jaar oud".
    # The lookbehind stops "120 years old" from being read as age 20.
    age_match = re.search(
        r'(?<!\d)(\d{1,2})\s*(?:years?\s*old|jaar\s*oud)', text_lower
    )
    if age_match:
        age = int(age_match.group(1))
        if 20 <= age <= 100:  # Reasonable age range
            current_year = datetime.now().year
            estimated_birth = current_year - age
            return (f"{estimated_birth}~", f"Estimated from age {age} (approximate)")

    # Pattern 5: Birthday patterns "birthday: Month DD, YYYY"
    birthday_match = re.search(
        r'(?:birthday|geboren|date of birth)[:\s]+(?:\w+\s+\d{1,2},?\s+)?(\d{4})',
        text_lower
    )
    if birthday_match:
        year = birthday_match.group(1)
        return (year, "Extracted from birthday/geboren pattern")

    return None
def _strip_doi_trailing_punctuation(candidate: str) -> str:
    """Remove sentence punctuation and unbalanced closing brackets from the end of a DOI match."""
    bracket_pairs = (("(", ")"), ("[", "]"), ("<", ">"))
    while True:
        trimmed = candidate.rstrip('.,;:\'"')
        for opener, closer in bracket_pairs:
            # A closer is only dropped when it has no matching opener inside
            # the DOI itself; DOIs such as "10.1016/s0140-6736(05)67889-0"
            # legitimately contain balanced brackets.
            if trimmed.endswith(closer) and trimmed.count(closer) > trimmed.count(opener):
                trimmed = trimmed[:-1]
                break
        if trimmed == candidate:
            return candidate
        candidate = trimmed


def extract_publications_from_text(text: str, full_name: str) -> List[Dict[str, str]]:
    """
    Extract publication-related identifiers (DOIs, ORCID iD) from search text.

    DOI matches are cleaned of trailing sentence punctuation and unbalanced
    closing brackets, de-duplicated in order of first appearance, and capped
    at five. At most one ORCID iD (the first "orcid.org/<id>" occurrence) is
    reported, after the DOIs.

    Args:
        text: Text to scan (search-result snippet or page content).
        full_name: Person's full name. Currently unused; kept for interface
            compatibility.

    Returns:
        List of dicts with keys "type" ("doi" or "orcid"), "value" and "note".
        Empty when the text is empty or contains no identifiers.
    """
    publications: List[Dict[str, str]] = []

    if not text:
        return publications

    # Look for DOI patterns. \b prevents matching in the middle of a number.
    seen_dois = set()
    for raw_doi in re.findall(r'\b10\.\d{4,}/[^\s]+', text):
        doi = _strip_doi_trailing_punctuation(raw_doi)
        # Skip matches that were nothing but punctuation after the prefix,
        # and repeated mentions of the same DOI.
        if doi.endswith("/") or doi in seen_dois:
            continue
        seen_dois.add(doi)
        publications.append({
            "type": "doi",
            "value": doi,
            "note": "DOI found in search results"
        })
        if len(seen_dois) == 5:  # Limit to 5 distinct DOIs
            break

    # Look for ORCID patterns
    orcid_match = re.search(r'orcid\.org/(\d{4}-\d{4}-\d{4}-\d{3}[\dX])', text)
    if orcid_match:
        publications.append({
            "type": "orcid",
            "value": orcid_match.group(1),
            "note": "ORCID identifier found"
        })

    return publications
def search_birth_date_exa(full_name: str, context_hints: List[str] = None) -> Optional[Dict]:
    """
    Build the Exa AI web-search request for a person's birth date.

    No search is executed here: the returned spec is a placeholder to be
    run through the MCP tool, i.e.
    exa_web_search_exa(query=query, numResults=5).

    Args:
        full_name: Person's full name; quoted in the query for exact match.
        context_hints: Optional extra search terms; only the first two are used.

    Returns:
        Dict with the query string, the tool name and a pending status.
    """
    # Add up to 2 context hints after the fixed birth-related keywords.
    extra_terms = context_hints[:2] if context_hints else []
    search_query = " ".join([f'"{full_name}"', "born", "birthday", *extra_terms])

    return dict(
        query=search_query,
        tool="exa_web_search_exa",
        status="pending_mcp_call",
    )
def search_publications_exa(full_name: str, institution: str = None) -> Optional[Dict]:
    """
    Build the Exa AI web-search request for a person's publications.

    The query is the quoted name, the institution (when given) and the fixed
    keywords "publications research ORCID". Nothing is executed here; the
    returned spec is marked as pending an MCP call.
    """
    affiliation = [institution] if institution else []
    search_query = " ".join(
        [f'"{full_name}"', *affiliation, "publications", "research", "ORCID"]
    )

    return dict(
        query=search_query,
        tool="exa_web_search_exa",
        status="pending_mcp_call",
    )
def search_news_mentions_exa(full_name: str, institution: str = None) -> Optional[Dict]:
    """
    Build the Exa AI web-search request for news mentions of a person.

    The query is the quoted name, followed by the institution when one is
    given. Nothing is executed here; the returned spec is marked as pending
    an MCP call.
    """
    terms = [f'"{full_name}"']
    if institution:
        terms += [institution]

    return dict(
        query=" ".join(terms),
        tool="exa_web_search_exa",
        status="pending_mcp_call",
    )
def get_person_context(ppid_data: Dict) -> Dict[str, Any]:
    """
    Extract context from PPID data for better search queries.

    Missing or null sections ("name", "profile_data", "skills",
    "experience", "education") are treated as empty rather than raising,
    and null entries inside the experience/education lists are skipped.

    Args:
        ppid_data: Parsed PPID record.

    Returns:
        Dict with keys:
        - full_name: name["full_name"], or "" when absent
        - institutions: up to 5 distinct names, employers first, then
          education institutions, in order of first appearance
        - roles: up to 5 distinct job titles, taken from experience entries
          that also name a company
        - location, linkedin_url: copied from profile_data (None if absent)
        - skills: first 10 entries of profile_data["skills"]
    """
    context = {
        "full_name": "",
        "institutions": [],
        "roles": [],
        "location": None,
        "linkedin_url": None,
        "skills": [],
    }

    # Get name ("or" also covers a key that is present but null)
    name_data = ppid_data.get("name") or {}
    context["full_name"] = name_data.get("full_name") or ""

    # Get profile data
    profile = ppid_data.get("profile_data") or {}
    if profile:
        context["linkedin_url"] = profile.get("linkedin_url")
        context["location"] = profile.get("location")
        # A null "skills" value must not be sliced.
        context["skills"] = (profile.get("skills") or [])[:10]  # Top 10 skills

        # Extract institutions (and the matching roles) from experience
        for exp in profile.get("experience") or []:
            if exp and exp.get("company"):
                context["institutions"].append(exp["company"])
                if exp.get("title"):
                    context["roles"].append(exp["title"])

        # Extract from education
        for edu in profile.get("education") or []:
            if edu and edu.get("institution"):
                context["institutions"].append(edu["institution"])

    # Deduplicate while preserving order, then cap at 5 each
    context["institutions"] = list(dict.fromkeys(context["institutions"]))[:5]
    context["roles"] = list(dict.fromkeys(context["roles"]))[:5]

    return context
def build_enrichment_queries(ppid_data: Dict) -> List[Dict[str, Any]]:
    """
    Assemble the enrichment query specs for one PPID record.

    Up to four specs are produced, in this order:
    1. birth_date    - only when birth_date.edtf is "XXXX" and no earlier
                       birth-date search attempt is recorded
    2. publications  - only when a role title contains an academic keyword
    3. news_mentions - only when at least one institution is known
    4. wikidata      - always

    Returns list of query specs to execute via MCP tools; empty when the
    record has no full name.
    """
    person = get_person_context(ppid_data)
    full_name = person["full_name"]
    if not full_name:
        return []

    known_institutions = person["institutions"]
    specs = []

    # 1. Birth date search (only if not already known or attempted)
    current_birth = ppid_data.get("birth_date", {}).get("edtf", "XXXX")
    prior_search = ppid_data.get("enrichment_metadata", {}).get("birth_date_search", {})
    if current_birth == "XXXX" and not prior_search.get("attempted"):
        # Context hints: first institution, then the city part of the location
        context_hints = []
        if known_institutions:
            context_hints.append(known_institutions[0])
        if person["location"]:
            context_hints.append(person["location"].split(",")[0])

        birth_spec = {
            "type": "birth_date",
            "query": f'"{full_name}" born birthday biography',
            "context_hints": context_hints,
            "tool": "exa_web_search_exa",
            "priority": "high"
        }
        specs.append(birth_spec)

    # 2. Publications search (for academics/researchers)
    roles_text = " ".join(person["roles"]).lower()
    academic_markers = ("professor", "researcher", "phd", "doctor", "lecturer", "scientist")
    for marker in academic_markers:
        if marker in roles_text:
            institution = known_institutions[0] if known_institutions else ""
            specs.append({
                "type": "publications",
                "query": f'"{full_name}" {institution} publications ORCID research',
                "tool": "exa_web_search_exa",
                "priority": "medium"
            })
            break

    # 3. News/press mentions
    if known_institutions:
        specs.append({
            "type": "news_mentions",
            "query": f'"{full_name}" {known_institutions[0]}',
            "tool": "exa_web_search_exa",
            "priority": "low"
        })

    # 4. Wikidata search (for notable persons)
    specs.append({
        "type": "wikidata",
        "query": full_name,
        "tool": "wikidata_search_entity",
        "priority": "medium"
    })

    return specs
def process_search_result(
    result: Dict[str, Any],
    query_type: str,
    full_name: str,
    ppid_data: Dict
) -> List[Dict[str, Any]]:
    """
    Turn one raw search result into zero or more web claims.

    Args:
        result: Raw search result from Exa/Linkup. A dict is read through its
            ``text``/``content`` and ``url``/``source_url`` keys; a plain
            string is used as the text itself.
        query_type: Type of query that produced the result (``birth_date``,
            ``publications`` or ``news_mentions``); any other value yields
            no claims.
        full_name: Person's full name.
        ppid_data: Current PPID data (not read by this function).

    Returns:
        List of web claims to add (possibly empty).
    """
    if not result:
        return []

    # Pull the text body and its origin URL out of the result.
    if isinstance(result, dict):
        body = result.get("text", "") or result.get("content", "") or ""
        origin = result.get("url", "") or result.get("source_url", "")
    elif isinstance(result, str):
        body, origin = result, ""
    else:
        body, origin = "", ""

    agent = "exa_web_search_exa"

    if query_type == "birth_date":
        found = extract_birth_year_from_text(body, full_name)
        if not found:
            return []
        year, note = found
        return [create_web_claim(
            claim_type="birth_year",
            claim_value=year,
            source_url=origin,
            retrieval_agent=agent,
            # A "~" in the year lowers the confidence.
            confidence="low" if "~" in year else "medium",
            notes=note,
            raw_response={"text_snippet": body[:200]}
        )]

    if query_type == "publications":
        return [
            create_web_claim(
                claim_type=f"identifier_{pub['type']}",
                claim_value=pub["value"],
                source_url=origin,
                retrieval_agent=agent,
                confidence="high" if pub["type"] in ("doi", "orcid") else "medium",
                notes=pub.get("note")
            )
            for pub in extract_publications_from_text(body, full_name)
        ]

    # News results are only recorded when the name literally occurs in the text.
    if query_type == "news_mentions" and full_name.lower() in body.lower():
        return [create_web_claim(
            claim_type="news_mention",
            claim_value=body[:500],  # keep at most the leading 500 characters
            source_url=origin,
            retrieval_agent=agent,
            confidence="medium",
            notes="News/press mention found"
        )]

    return []
def enrich_ppid_file(
    filepath: Path,
    dry_run: bool = False,
    verbose: bool = False
) -> Dict[str, Any]:
    """
    Build the pending web-enrichment queries for a single PPID file.

    This function builds queries but does not execute them directly, and it
    never writes to ``filepath``. Queries should be executed via MCP tools in
    the calling context.

    Args:
        filepath: Path of the PPID JSON file to read.
        dry_run: Accepted for interface compatibility; not used here because
            this function performs no writes.
        verbose: When true, print the queries that were built.

    Returns:
        Dict with the keys ``filepath``, ``queries_built``, ``claims_added``
        (always 0 here, since no claims are applied), ``errors`` (list of
        messages) and ``pending_queries`` (the query specs). Read and parse
        failures are reported through ``errors`` rather than raised.
    """
    stats = {
        "filepath": str(filepath),
        "queries_built": 0,
        "claims_added": 0,
        "errors": [],
        "pending_queries": []
    }

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        stats["errors"].append(f"Failed to read file: {e}")
        return stats

    # Valid JSON that is not an object (list, string, null, ...) cannot be a
    # PPID record; report it instead of failing inside the query builder.
    if not isinstance(data, dict):
        stats["errors"].append(
            f"Failed to read file: expected a JSON object, got {type(data).__name__}"
        )
        return stats

    # Build enrichment queries
    queries = build_enrichment_queries(data)
    stats["queries_built"] = len(queries)
    stats["pending_queries"] = queries

    if verbose:
        print(f"  Built {len(queries)} queries for {filepath.name}")
        for q in queries:
            print(f"    - {q['type']}: {q['query'][:50]}...")

    return stats
def main():
    """
    Command-line entry point: build web-enrichment queries for PPID files.

    Collects ``ID_*.json`` files from the person directory (optionally
    narrowed with ``--sample`` and ``--limit``), builds queries for each file
    through ``enrich_ppid_file``, prints a summary and, unless ``--dry-run``
    is given, writes the queries of the requested types to
    ``pending_web_queries.json`` next to the person directory. No query is
    executed here.
    """
    parser = argparse.ArgumentParser(
        description="Enrich PPID files with web-sourced claims (Rule 26 compliant)"
    )
    parser.add_argument("--dry-run", action="store_true", help="Don't write changes")
    parser.add_argument("--limit", type=int, help="Process only N files")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--sample", type=str, help="Process specific linkedin_slug")
    parser.add_argument(
        "--query-types",
        type=str,
        default="birth_date,publications,news_mentions,wikidata",
        help="Comma-separated list of query types to run"
    )
    args = parser.parse_args()

    person_dir = Path("/Users/kempersc/apps/glam/data/person")

    # Get PPID files. Results are sorted because glob order is not guaranteed,
    # which would otherwise make --limit pick a different subset per run.
    if args.sample:
        # Find file by linkedin slug
        ppid_files = sorted(person_dir.glob(f"ID_*{args.sample.upper()}*.json"))
        if not ppid_files:
            # Try case-insensitive search
            needle = args.sample.lower()
            ppid_files = sorted(
                f for f in person_dir.glob("ID_*.json")
                if needle in f.stem.lower()
            )
    else:
        ppid_files = sorted(person_dir.glob("ID_*.json"))

    if args.limit:
        ppid_files = ppid_files[:args.limit]

    print(f"Processing {len(ppid_files)} PPID files for web enrichment...")
    if args.dry_run:
        print("DRY RUN - no changes will be written")

    # Strip whitespace and drop empty tokens so "a, b" and "a,b," both work.
    query_types = {
        token.strip() for token in args.query_types.split(",") if token.strip()
    }
    print(f"Query types: {', '.join(sorted(query_types))}")

    # Statistics
    total_stats = {
        "processed": 0,
        # Counts every query built, including types that were not requested.
        "queries_built": 0,
        "by_type": {qt: 0 for qt in sorted(query_types)},
        "errors": 0,
    }
    all_pending_queries = []

    for i, filepath in enumerate(ppid_files):
        try:
            stats = enrich_ppid_file(filepath, dry_run=args.dry_run, verbose=args.verbose)
            total_stats["processed"] += 1
            total_stats["queries_built"] += stats["queries_built"]

            # Filter queries by requested types
            for q in stats["pending_queries"]:
                if q["type"] in query_types:
                    total_stats["by_type"][q["type"]] += 1
                    all_pending_queries.append({
                        "filepath": stats["filepath"],
                        **q
                    })

            if stats["errors"]:
                total_stats["errors"] += 1
                if args.verbose:
                    print(f"  ERROR {filepath.name}: {stats['errors']}")

            if (i + 1) % 100 == 0:
                print(f"  Processed {i + 1}/{len(ppid_files)}...")
        except Exception as e:
            # Top-level boundary: one bad file must not abort the whole batch.
            total_stats["errors"] += 1
            if args.verbose:
                print(f"  ERROR {filepath.name}: {e}")

    # Print summary
    print("\n" + "=" * 60)
    print("WEB ENRICHMENT QUERY SUMMARY")
    print("=" * 60)
    print(f"Processed: {total_stats['processed']}")
    print(f"Queries built: {total_stats['queries_built']}")
    print("By query type:")
    for qt, count in total_stats["by_type"].items():
        print(f"  - {qt}: {count}")
    print(f"Errors: {total_stats['errors']}")

    # Output pending queries for MCP execution
    if all_pending_queries and not args.dry_run:
        output_file = person_dir.parent / "pending_web_queries.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump({
                "generated_at": datetime.now(timezone.utc).isoformat(),
                "total_queries": len(all_pending_queries),
                "queries": all_pending_queries
            }, f, indent=2, ensure_ascii=False)
        print(f"\nPending queries saved to: {output_file}")
        # NOTE(review): the parser above defines no --apply-results option;
        # confirm where results are applied before relying on this message.
        print("Execute these queries via MCP tools and run --apply-results to add claims.")

    print("\nNote: This script builds queries. Execute via MCP tools:")
    print("  - exa_web_search_exa for birth_date, publications, news_mentions")
    print("  - wikidata_search_entity for wikidata matching")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

View file

@ -21,9 +21,76 @@ CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# Known organizations with their locations
# Format: 'normalized_name': (province, city_code, city_name, inst_type)
# Province codes: NH=Noord-Holland, ZH=Zuid-Holland, UT=Utrecht, GE=Gelderland,
# NB=Noord-Brabant, LI=Limburg, OV=Overijssel, FR=Friesland,
# DR=Drenthe, GR=Groningen, ZE=Zeeland, FL=Flevoland
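# Foreign: Use country code (BE, DE, FR, DK, IT, GB, US, etc.) as first element.
# NOTE: 'FR' is ambiguous in this table: it is used both for the province
#       Friesland and for the country France.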
KNOWN_ORGS = {
# Museums
# ==========================================================================
# MUSEUMS - Netherlands
# ==========================================================================
'amsterdamse school museum het schip': ('NH', 'AMS', 'Amsterdam', 'M'),
'hunebedcentrum': ('DR', 'BOR', 'Borger', 'M'),
'museum flehite': ('UT', 'AME', 'Amersfoort', 'M'),
'museum batavialand': ('FL', 'LEL', 'Lelystad', 'M'),
'batavialand': ('FL', 'LEL', 'Lelystad', 'M'),
'jewish cultural quarter': ('NH', 'AMS', 'Amsterdam', 'M'),
'joods cultureel kwartier': ('NH', 'AMS', 'Amsterdam', 'M'),
'museum catharijneconvent': ('UT', 'UTR', 'Utrecht', 'M'),
'museum speelklok': ('UT', 'UTR', 'Utrecht', 'M'),
'museum rembrandthuis': ('NH', 'AMS', 'Amsterdam', 'M'),
'rembrandthuis': ('NH', 'AMS', 'Amsterdam', 'M'),
'nieuwe instituut': ('ZH', 'ROT', 'Rotterdam', 'M'),
'het nieuwe instituut': ('ZH', 'ROT', 'Rotterdam', 'M'),
'museum van loon': ('NH', 'AMS', 'Amsterdam', 'M'),
'museum voorlinden': ('ZH', 'WAS', 'Wassenaar', 'M'),
'museum belvedere': ('FR', 'HEE', 'Heerenveen', 'M'),
'museum more': ('GE', 'GOR', 'Gorssel', 'M'),
'lam museum': ('ZH', 'LIS', 'Lisse', 'M'),
'lisser art museum': ('ZH', 'LIS', 'Lisse', 'M'),
'lisser art museum lam': ('ZH', 'LIS', 'Lisse', 'M'),
'nxt museum': ('NH', 'AMS', 'Amsterdam', 'M'),
'nationaal onderduikmuseum': ('GE', 'AAL', 'Aalten', 'M'),
'lantarenvenster': ('ZH', 'ROT', 'Rotterdam', 'E'),
'loosduins museum': ('ZH', 'DHA', 'Den Haag', 'M'),
'louis couperus museum': ('ZH', 'DHA', 'Den Haag', 'M'),
'museum bredius': ('ZH', 'DHA', 'Den Haag', 'M'),
'museum broekerveiling': ('NH', 'LAN', 'Langedijk', 'M'),
'broekerveiling': ('NH', 'LAN', 'Langedijk', 'M'),
'museum bronbeek': ('GE', 'ARN', 'Arnhem', 'M'),
'museum de bastei': ('GE', 'NIJ', 'Nijmegen', 'M'),
'museum amstelland': ('NH', 'AMS', 'Amstelveen', 'M'),
'museum cobra': ('NH', 'AMV', 'Amstelveen', 'M'),
'cobra museum': ('NH', 'AMV', 'Amstelveen', 'M'),
'cobra museum voor moderne kunst amstelveen': ('NH', 'AMV', 'Amstelveen', 'M'),
'museum aan de a': ('GR', 'GRO', 'Groningen', 'M'),
'museum helmantel': ('GR', 'WES', 'Westeremden', 'M'),
'museum hert fan fryslan': ('FR', 'LEE', 'Leeuwarden', 'M'),
'museum het pakhuis': ('NH', 'HOO', 'Hoorn', 'M'),
'museum huys der kunsten': ('NB', 'ROO', 'Roosendaal', 'M'),
'museum maluku': ('UT', 'UTR', 'Utrecht', 'M'),
'museum martena': ('FR', 'FRA', 'Franeker', 'M'),
'museum nairac': ('GE', 'BAR', 'Barneveld', 'M'),
'museum slager': ('NB', 'BOS', 's-Hertogenbosch', 'M'),
'museum smedekinck': ('GE', 'ZEL', 'Zelhem', 'M'),
'museum staal': ('GE', 'ALM', 'Almere', 'M'),
'museum cafe het pomphuis': ('ZE', 'GOE', 'Goes', 'E'), # Restaurant/cafe, not museum
'museum de looierij': ('NH', 'AMS', 'Amsterdam', 'M'), # Westzaan area
'museum de proefkolonie': ('DR', 'FRE', 'Frederiksoord', 'M'),
'museum de speeltoren': ('GE', 'NIJ', 'Nijmegen', 'M'), # Actually in Monnickendam
'museum fiskershuske': ('FR', 'MOD', 'Moddergat', 'M'),
'museum stedhus sleat': ('FR', 'SLO', 'Sloten', 'M'),
'museumppassmusees': ('BE', 'BRU', 'Brussels', 'O'), # Belgium - museum pass
'kroller muller museum': ('GE', 'OTT', 'Otterlo', 'M'),
'museum swaensteyn': ('ZH', 'VOR', 'Voorburg', 'M'),
'museum van de vrouw': ('NB', 'EER', 'Eersel', 'M'),
'oorlogsmuseum medemblik': ('NH', 'MED', 'Medemblik', 'M'),
'nac museum': ('NB', 'BRE', 'Breda', 'M'),
'nationaal baggermuseum': ('ZH', 'SLI', 'Sliedrecht', 'M'),
'nationaal restauratiefonds': ('UT', 'AME', 'Amersfoort', 'N'),
'nederlands steendrukmuseum': ('GE', 'VAL', 'Valburg', 'M'),
'nederlands stoommachinemuseum': ('GE', 'MED', 'Medemblik', 'M'),
'pieter vermeulen museum': ('DR', 'MED', 'Diever', 'M'),
'bonnefanten': ('LI', 'MAA', 'Maastricht', 'M'),
'bonami spelcomputer museum': ('OV', 'ZWO', 'Zwolle', 'M'),
'bakkerijmuseum de oude bakkerij': ('NH', 'MED', 'Medemblik', 'M'),
@ -31,7 +98,6 @@ KNOWN_ORGS = {
'coda museum': ('GE', 'APE', 'Apeldoorn', 'M'),
'comm museum voor communicatie': ('ZH', 'DHA', 'Den Haag', 'M'),
'cruquius museum': ('NH', 'HAA', 'Haarlemmermeer', 'M'),
'diva museum': ('BE', 'ANT', 'Antwerpen', 'M'), # Belgium
'dordrechts museum': ('ZH', 'DOR', 'Dordrecht', 'M'),
'dutch museum of freemasonry': ('ZH', 'DHA', 'Den Haag', 'M'),
'eise eisinga planetarium': ('FR', 'FRA', 'Franeker', 'M'),
@ -102,55 +168,588 @@ KNOWN_ORGS = {
'rijksmuseum boerhaave': ('ZH', 'LEI', 'Leiden', 'M'),
'rijksmuseum twenthe': ('OV', 'ENS', 'Enschede', 'M'),
'singer laren': ('NH', 'LAR', 'Laren', 'M'),
'singer museum': ('NH', 'LAR', 'Laren', 'M'),
'sonnenborgh museum': ('UT', 'UTR', 'Utrecht', 'M'),
'zeeuws museum': ('ZE', 'MID', 'Middelburg', 'M'),
# Libraries
# Additional museums from PENDING list
'het scheepvaartmuseum': ('NH', 'AMS', 'Amsterdam', 'M'),
'hash marihuana hemp museum': ('NH', 'AMS', 'Amsterdam', 'M'),
'hash marihuana en hemp museum': ('NH', 'AMS', 'Amsterdam', 'M'),
'van gogh village museum': ('NB', 'NUE', 'Nuenen', 'M'),
'retro computer museum': ('GE', 'ARN', 'Arnhem', 'M'),
'haags bus museum': ('ZH', 'DHA', 'Den Haag', 'M'),
'het romeins museum': ('GE', 'NIJ', 'Nijmegen', 'M'),
'hendrick hamel museum': ('GR', 'GOR', 'Gorinchem', 'M'),
'graphic design museum': ('NB', 'BRE', 'Breda', 'M'),
'vliegend museum seppe': ('NB', 'BOS', 'Bosschenhoofd', 'M'),
'zoological museum netherlands': ('NH', 'AMS', 'Amsterdam', 'M'),
'world of cannabis museum project': ('NH', 'AMS', 'Amsterdam', 'M'),
'stichting museum 1940 1945': ('ZH', 'DOR', 'Dordrecht', 'M'),
'stichting museum menkemaborg': ('GR', 'UIT', 'Uithuizen', 'M'),
'stichting pak museum': ('NH', 'AMS', 'Amsterdam', 'M'),
'stichting museum blokhuispoort': ('FR', 'LEE', 'Leeuwarden', 'M'),
'sculptuur instituut': ('NH', 'AMS', 'Amsterdam', 'M'),
'gelders restauratie centrum': ('GE', 'ARN', 'Arnhem', 'R'),
# ==========================================================================
# LIBRARIES
# ==========================================================================
'de bblthk': ('GE', 'WAG', 'Wageningen', 'L'),
'kb nationale bibliotheek': ('ZH', 'DHA', 'Den Haag', 'L'),
'bplusc': ('ZH', 'LEI', 'Leiden', 'L'),
# Archives
# ==========================================================================
# ARCHIVES
# ==========================================================================
'digitar het online archief': ('UT', 'UTR', 'Utrecht', 'D'),
'the black archives': ('NH', 'AMS', 'Amsterdam', 'A'),
'archivesspace': ('US', 'NYC', 'New York', 'D'), # US-based software
# Organizations (stichtingen, etc.)
# ==========================================================================
# NATURE & ENVIRONMENTAL ORGANIZATIONS
# ==========================================================================
'staatsbosbeheer': ('UT', 'AME', 'Amersfoort', 'O'),
'vogelbescherming nederland': ('UT', 'ZEI', 'Zeist', 'N'),
'waddenvereniging': ('FR', 'HAR', 'Harlingen', 'N'),
'trees for all': ('UT', 'UTR', 'Utrecht', 'N'),
'natuurmonumenten': ('UT', 'AME', 'Amersfoort', 'N'),
'vereniging natuurmonumenten': ('UT', 'AME', 'Amersfoort', 'N'),
'it fryske gea': ('FR', 'BEE', 'Beetsterzwaag', 'N'),
'landschappennl': ('UT', 'UTR', 'Utrecht', 'N'),
'land van ons': ('UT', 'UTR', 'Utrecht', 'N'),
'natuurbegraven nederland': ('NH', 'AMS', 'Amsterdam', 'N'),
'natuuropleiding': ('NH', 'AMS', 'Amsterdam', 'E'),
'obn natuurkennis': ('DR', 'ASS', 'Assen', 'R'),
'ravon': ('GE', 'NIJ', 'Nijmegen', 'R'),
'norminstituut bomen': ('UT', 'UTR', 'Utrecht', 'R'),
'nationale bomenbank b v': ('NH', 'AMS', 'Amsterdam', 'C'),
'native plant trust': ('US', 'BOS', 'Boston', 'N'), # US
'kiss the ground': ('US', 'LAX', 'Los Angeles', 'N'), # US
'national coalition for natural farming': ('IN', 'DEL', 'Delhi', 'N'), # India
'lpo provence alpes cote d azur': ('FR', 'AIX', 'Aix-en-Provence', 'N'), # France
'picardie nature': ('FR', 'AMI', 'Amiens', 'N'), # France
'parc national des pyrenees': ('FR', 'TAR', 'Tarbes', 'N'), # France
'bumblebee conservation trust': ('GB', 'STI', 'Stirling', 'N'), # UK
'botanic gardens conservation international': ('GB', 'KEW', 'Kew', 'N'), # UK
'save our seas foundation sosf': ('ZA', 'CPT', 'Cape Town', 'N'), # South Africa
'ferus ours loup lynx conservation': ('FR', 'PAR', 'Paris', 'N'), # France
'european arboricultural council': ('BE', 'BRU', 'Brussels', 'N'), # Belgium
'caring farmers': ('UT', 'UTR', 'Utrecht', 'N'),
'collectief natuurinclusief': ('UT', 'UTR', 'Utrecht', 'N'),
'stichting rechten van de natuur': ('NH', 'AMS', 'Amsterdam', 'N'),
'deltaplan agrarisch waterbeheer daw': ('UT', 'UTR', 'Utrecht', 'N'),
'boerenverstand onderzoek advies': ('GE', 'WAG', 'Wageningen', 'R'),
'cruydt hoeck': ('GR', 'NIJ', 'Nijeholtpade', 'C'),
# ==========================================================================
# HERITAGE & HISTORICAL SOCIETIES
# ==========================================================================
'3 october vereeniging': ('ZH', 'LEI', 'Leiden', 'S'),
'historische vereniging delfia batavorum': ('ZH', 'DEL', 'Delft', 'S'),
'historische vereniging koog zaandijk': ('NH', 'ZAA', 'Zaandijk', 'S'),
'historische vereniging oud stolwijck': ('ZH', 'STO', 'Stolwijk', 'S'),
'historische vereniging voorst': ('GE', 'VOO', 'Voorst', 'S'),
'historische vereniging wormerveer': ('NH', 'WOR', 'Wormerveer', 'S'),
'heemkunde vereniging borne': ('OV', 'BOR', 'Borne', 'S'),
'heemkunde vlaanderen': ('BE', 'ANT', 'Antwerpen', 'S'), # Belgium
'hendrick de keyser monumenten': ('NH', 'AMS', 'Amsterdam', 'N'),
'vereniging particuliere historische buitenplaatsen': ('NH', 'AMS', 'Amsterdam', 'N'),
'werkgroep adelsgeschiedenis': ('NH', 'AMS', 'Amsterdam', 'S'),
'stichting oude groninger kerken': ('GR', 'GRO', 'Groningen', 'N'),
'studiecentrum eerste wereldoorlog': ('BE', 'BRU', 'Brussels', 'R'), # Belgium
'sobibor foundation': ('NH', 'AMS', 'Amsterdam', 'N'),
# ==========================================================================
# STICHTINGEN & FOUNDATIONS
# ==========================================================================
'abdij o l v koningshoeven': ('NB', 'TIL', 'Tilburg', 'H'),
'amphion cultuurbedrijf': ('GE', 'DOE', 'Doetinchem', 'E'),
'bijenstichting': ('UT', 'UTR', 'Utrecht', 'N'),
'bomenstichting': ('UT', 'UTR', 'Utrecht', 'N'),
'boerennatuur': ('UT', 'UTR', 'Utrecht', 'N'),
'cbg': ('ZH', 'DHA', 'Den Haag', 'R'), # Central Bureau for Genealogy
'creatieve hubs nederland': ('NH', 'AMS', 'Amsterdam', 'O'),
'de commandostichting': ('NH', 'HAA', 'Haarlem', 'N'),
'defabrique evenementenlocatie': ('UT', 'UTR', 'Utrecht', 'E'),
'delamar': ('NH', 'AMS', 'Amsterdam', 'E'),
'den kennisinstituut cultuur digitale transformatie': ('NH', 'AMS', 'Amsterdam', 'R'),
'dutch national opera ballet': ('NH', 'AMS', 'Amsterdam', 'E'),
'expertisecentrum literair vertalen elv': ('NH', 'AMS', 'Amsterdam', 'R'),
'fim federatie instandhouding monumenten': ('NH', 'AMS', 'Amsterdam', 'N'),
'stichting amelander musea': ('FR', 'AME', 'Ameland', 'M'),
'stichting confro': ('NH', 'AMS', 'Amsterdam', 'N'),
'stichting de zaanse schans': ('NH', 'ZAA', 'Zaandam', 'M'),
'stichting dioraphte': ('UT', 'UTR', 'Utrecht', 'N'),
'stichting koninklijke defensiemusea': ('ZH', 'DHA', 'Den Haag', 'M'),
'stichting kunst cultuur': ('NH', 'AMS', 'Amsterdam', 'N'),
'stichting texels museum': ('NH', 'TEX', 'Texel', 'M'),
'stichting twisca': ('OV', 'TWI', 'Twisk', 'N'),
'stichting waddengroep': ('NH', 'DEN', 'Den Helder', 'N'),
'hartwig art foundation': ('NH', 'AMS', 'Amsterdam', 'N'),
'fonds 21': ('UT', 'UTR', 'Utrecht', 'N'),
'framer framed': ('NH', 'AMS', 'Amsterdam', 'G'),
'ark rewilding nederland': ('GE', 'NIJ', 'Nijmegen', 'N'),
'centraal joods overleg cjo': ('NH', 'AMS', 'Amsterdam', 'N'),
'kenniscentrum immaterieel erfgoed nederland': ('NH', 'AMS', 'Amsterdam', 'R'),
'kenniscommunity informatie en archief': ('NH', 'AMS', 'Amsterdam', 'N'),
'koninklijke nederlandse academie van wetenschappen': ('NH', 'AMS', 'Amsterdam', 'R'),
# Research centers
# ==========================================================================
# RESEARCH CENTERS & KNOWLEDGE INSTITUTES
# ==========================================================================
'adc archeoprojecten': ('GE', 'AME', 'Amersfoort', 'R'),
'archol': ('ZH', 'LEI', 'Leiden', 'R'),
'kitlv': ('ZH', 'LEI', 'Leiden', 'R'),
'cbg': ('ZH', 'DHA', 'Den Haag', 'R'), # Central Bureau for Genealogy
'kenniscentrum immaterieel erfgoed nederland': ('NH', 'AMS', 'Amsterdam', 'R'),
'koninklijke nederlandse academie van wetenschappen': ('NH', 'AMS', 'Amsterdam', 'R'),
'den kennisinstituut cultuur digitale transformatie': ('NH', 'AMS', 'Amsterdam', 'R'),
'centre of expertise creative innovation': ('NH', 'AMS', 'Amsterdam', 'R'),
'huygens institute': ('NH', 'AMS', 'Amsterdam', 'R'),
'huygens instituut': ('NH', 'AMS', 'Amsterdam', 'R'),
'instituut voor de nederlandse taal': ('ZH', 'LEI', 'Leiden', 'R'),
'n w posthumus institute': ('NH', 'AMS', 'Amsterdam', 'R'),
'nicas netherlands institute for conservation art science': ('NH', 'AMS', 'Amsterdam', 'R'),
'raap': ('OV', 'ZWO', 'Zwolle', 'R'),
'restauratoren nederland': ('NH', 'AMS', 'Amsterdam', 'N'),
'restauratieatelier restaura': ('LI', 'HAE', 'Haelen', 'C'),
'picturae': ('NH', 'HIL', 'Heiloo', 'C'),
'icom netherlands': ('NH', 'AMS', 'Amsterdam', 'N'),
'icomos netherlands': ('NH', 'AMS', 'Amsterdam', 'N'),
'international committee for documentation': ('FR', 'PAR', 'Paris', 'N'),
'museumvereniging': ('NH', 'AMS', 'Amsterdam', 'N'),
'museumpeil': ('NH', 'AMS', 'Amsterdam', 'C'),
'museumtijdschrift': ('NH', 'AMS', 'Amsterdam', 'C'),
'monumentaal magazine over cultureel erfgoed': ('NH', 'AMS', 'Amsterdam', 'C'),
'modemuze': ('NH', 'AMS', 'Amsterdam', 'D'),
'moebius museum software': ('NH', 'AMS', 'Amsterdam', 'C'),
'platform drentse musea': ('DR', 'ASS', 'Assen', 'O'),
'public domain library': ('US', 'SFO', 'San Francisco', 'D'), # US
'internet archive': ('US', 'SFO', 'San Francisco', 'A'), # US
'society for artistic research': ('AT', 'VIE', 'Vienna', 'R'), # Austria
'digital preservation coalition': ('GB', 'GLA', 'Glasgow', 'R'), # UK
'the palaeontological association': ('GB', 'LON', 'London', 'R'), # UK
'the society for archaeological sciences': ('US', 'TUC', 'Tucson', 'R'), # US
'conflict research society': ('GB', 'LON', 'London', 'R'), # UK
'stads en architectuurgeschiedenis uva': ('NH', 'AMS', 'Amsterdam', 'R'),
'agandau onderzoek in het archief': ('NH', 'AMS', 'Amsterdam', 'R'),
'anchise project horizon europe': ('FR', 'PAR', 'Paris', 'R'), # France
'atrium advancing frontier research in the arts humanities': ('EU', 'BRU', 'Brussels', 'R'), # EU
'biblissima': ('FR', 'PAR', 'Paris', 'R'), # France
# Theaters/Venues
# ==========================================================================
# THEATERS & CULTURAL VENUES
# ==========================================================================
'theater de veste': ('ZH', 'DEL', 'Delft', 'E'),
'theater a d schie': ('ZH', 'SCH', 'Schiedam', 'E'),
'theater a d rijn': ('GE', 'ARN', 'Arnhem', 'E'),
'amphion cultuurbedrijf': ('GE', 'DOE', 'Doetinchem', 'E'),
'defabrique evenementenlocatie': ('UT', 'UTR', 'Utrecht', 'E'),
'delamar': ('NH', 'AMS', 'Amsterdam', 'E'),
'dutch national opera ballet': ('NH', 'AMS', 'Amsterdam', 'E'),
'theatergezelschap bontehond': ('NH', 'AMS', 'Amsterdam', 'E'),
'birds of paradise theatre company': ('GB', 'GLA', 'Glasgow', 'E'), # UK
'yoann bourgeois art company': ('FR', 'LYO', 'Lyon', 'E'), # France
'de grote post': ('BE', 'OST', 'Oostende', 'E'), # Belgium
# Foreign organizations that should be reclassified
# ==========================================================================
# GALLERIES & ART SPACES
# ==========================================================================
'framer framed': ('NH', 'AMS', 'Amsterdam', 'G'),
'cemara 6 galeri museum': ('ID', 'JAK', 'Jakarta', 'G'), # Indonesia
'vedica art studios and gallery': ('IN', 'DEL', 'Delhi', 'G'), # India
# ==========================================================================
# OFFICIAL INSTITUTIONS & GOVERNMENT
# ==========================================================================
'creatieve hubs nederland': ('NH', 'AMS', 'Amsterdam', 'O'),
'the dutch inspectorate of education': ('UT', 'UTR', 'Utrecht', 'O'),
'embassy of the netherlands in morocco': ('MA', 'RAB', 'Rabat', 'O'), # Morocco
'gemeente nederweert': ('LI', 'NED', 'Nederweert', 'O'),
'house of european history': ('BE', 'BRU', 'Brussels', 'M'), # Belgium
'european museum forum': ('PT', 'LIS', 'Lisbon', 'O'), # Portugal
'docomomo international': ('PT', 'LIS', 'Lisbon', 'N'), # Portugal
'culture action europe': ('BE', 'BRU', 'Brussels', 'N'), # Belgium
'gbif the global biodiversity information facility': ('DK', 'CPH', 'Copenhagen', 'O'), # Denmark
# ==========================================================================
# JOURNALISM & MEDIA
# ==========================================================================
'11 11 media': ('NH', 'AMS', 'Amsterdam', 'C'),
'155 eenvijfvijf': ('NH', 'AMS', 'Amsterdam', 'C'),
'archimag': ('FR', 'PAR', 'Paris', 'C'), # France
'arte al dia': ('US', 'MIA', 'Miami', 'C'), # US - Latin American art magazine
'exibart': ('IT', 'ROM', 'Rome', 'C'), # Italy
'finestre sull arte': ('IT', 'FLO', 'Florence', 'C'), # Italy
# ==========================================================================
# MISCLASSIFIED FOREIGN ORGS (have NL prefix but are foreign)
# ==========================================================================
'her place womens museum': ('AU', 'MEL', 'Melbourne', 'M'), # Australia
'her place women s museum': ('AU', 'MEL', 'Melbourne', 'M'), # Australia - variant
'asociacion acre': ('ES', 'MAD', 'Madrid', 'N'), # Spain
'asociacio n acre': ('ES', 'MAD', 'Madrid', 'N'), # Spain - normalized
'la maison du theatre a brest': ('FR', 'BRE', 'Brest', 'E'), # France
'la maison du the a tre a brest': ('FR', 'BRE', 'Brest', 'E'), # France - normalized
'lpo provence alpes cote d azur': ('FR', 'AIX', 'Aix-en-Provence', 'N'), # France
'lpo provence alpes co te d azur': ('FR', 'AIX', 'Aix-en-Provence', 'N'), # France - normalized
'lucas laboratoire d usages culture s arts societe': ('FR', 'PAR', 'Paris', 'R'), # France
'maison des metallos': ('FR', 'PAR', 'Paris', 'E'), # France
'maison des me tallos': ('FR', 'PAR', 'Paris', 'E'), # France - normalized
'stiftung trias gemeinnutzige stiftung fur boden okologie und wohnen': ('DE', 'HAT', 'Hattingen', 'N'), # Germany
'stiftung trias': ('DE', 'HAT', 'Hattingen', 'N'), # Germany - short name
'sothebys': ('GB', 'LON', 'London', 'C'), # UK auction house
'sotheby s': ('GB', 'LON', 'London', 'C'), # UK auction house - variant
'sothebys institute of art': ('GB', 'LON', 'London', 'E'), # UK
'sotheby s institute of art': ('GB', 'LON', 'London', 'E'), # UK - variant
'museumppassmusees': ('BE', 'BRU', 'Brussels', 'O'), # Belgium
'museumpassmuse es': ('BE', 'BRU', 'Brussels', 'O'), # Belgium - normalized
'museum stedhus sleat': ('FR', 'SLO', 'Sloten', 'M'), # Friesland
'museum stedhu s sleat': ('FR', 'SLO', 'Sloten', 'M'), # Friesland - normalized
'museum fiskershuske': ('FR', 'MOD', 'Moddergat', 'M'), # Friesland
'museum fiskershu ske': ('FR', 'MOD', 'Moddergat', 'M'), # Friesland - normalized
'arte al dia': ('US', 'MIA', 'Miami', 'C'), # US - Latin American art magazine
'arte al di a': ('US', 'MIA', 'Miami', 'C'), # US - normalized
'kroller muller museum': ('GE', 'OTT', 'Otterlo', 'M'), # Already exists
'kro ller mu ller museum': ('GE', 'OTT', 'Otterlo', 'M'), # Normalized
'representation of the netherlands in aruba curacao and sint maarten': ('NL', 'DHA', 'Den Haag', 'O'),
'representation of the netherlands in aruba curac ao and sint maarten': ('NL', 'DHA', 'Den Haag', 'O'), # Normalized
# ==========================================================================
# NGOs & ADVOCACY
# ==========================================================================
'fim federatie instandhouding monumenten': ('NH', 'AMS', 'Amsterdam', 'N'),
'ark rewilding nederland': ('GE', 'NIJ', 'Nijmegen', 'N'),
'centraal joods overleg cjo': ('NH', 'AMS', 'Amsterdam', 'N'),
'de commandostichting': ('NH', 'HAA', 'Haarlem', 'N'),
'kenniscommunity informatie en archief': ('NH', 'AMS', 'Amsterdam', 'N'),
'expertisecentrum literair vertalen elv': ('NH', 'AMS', 'Amsterdam', 'R'),
'acp ica archival community for palestine': ('PS', 'RAM', 'Ramallah', 'N'), # Palestine
'campaign against antisemitism': ('GB', 'LON', 'London', 'N'), # UK
'combat antisemitism movement': ('US', 'NYC', 'New York', 'N'), # US
'facing history ourselves': ('US', 'BOS', 'Boston', 'E'), # US
'freundeskreis yad vashem e v': ('DE', 'FRA', 'Frankfurt', 'N'), # Germany
'yad vashem the world holocaust remembrance center': ('IL', 'JER', 'Jerusalem', 'M'), # Israel
'the wiener holocaust library': ('GB', 'LON', 'London', 'L'), # UK
'usc shoah foundation': ('US', 'LAX', 'Los Angeles', 'A'), # US
'cultuurnetwerk groenlinks pvda': ('NH', 'AMS', 'Amsterdam', 'N'),
# ==========================================================================
# PROFESSIONAL ASSOCIATIONS
# ==========================================================================
'spab': ('GB', 'LON', 'London', 'N'), # Society for the Protection of Ancient Buildings, UK
'sustainable traditional building alliance': ('GB', 'LON', 'London', 'N'), # UK
'the institute of historic building conservation ihbc': ('GB', 'TIV', 'Tivetshall', 'N'), # UK
'asociacion acre': ('ES', 'MAD', 'Madrid', 'N'), # Spain
'vlaamse vereniging tot behoud van historische vaartuigen': ('BE', 'ANT', 'Antwerpen', 'S'), # Belgium
'v z w archief en documentatiecentrum erfgoed binnenvaart': ('BE', 'ANT', 'Antwerpen', 'A'), # Belgium
'centre d archives et de recherches pour l histoire des femmes avg carhif': ('BE', 'BRU', 'Brussels', 'A'), # Belgium
'nederlandse entomologische vereniging': ('NH', 'AMS', 'Amsterdam', 'S'),
'nederlandse vereniging van dierentuinen dutch zoo association': ('NH', 'AMS', 'Amsterdam', 'N'),
'netwerk archieven design en digitale cultuur': ('NH', 'AMS', 'Amsterdam', 'N'),
'ondernemers in geschiedenis': ('NH', 'AMS', 'Amsterdam', 'S'),
'oud stede broec': ('NH', 'STE', 'Stede Broec', 'S'),
'raad voor dierenaangelegenheden rda': ('ZH', 'DHA', 'Den Haag', 'O'),
'regenl': ('NH', 'AMS', 'Amsterdam', 'N'),
'representation of the netherlands in aruba curacao and sint maarten': ('NL', 'DHA', 'Den Haag', 'O'),
'hylkema erfgoed': ('NH', 'AMS', 'Amsterdam', 'C'),
'idverde nl': ('NH', 'AMS', 'Amsterdam', 'C'),
'kaliber': ('OV', 'ZWO', 'Zwolle', 'E'),
'keunstwurk': ('FR', 'LEE', 'Leeuwarden', 'E'),
'kunstkade': ('ZH', 'ROT', 'Rotterdam', 'E'),
'leewardists': ('GR', 'GRO', 'Groningen', 'N'),
'leo smit foundation': ('NH', 'AMS', 'Amsterdam', 'N'),
'loveland events': ('NH', 'AMS', 'Amsterdam', 'E'),
'lvwb fundraising': ('NH', 'AMS', 'Amsterdam', 'C'),
'meesters in': ('NH', 'AMS', 'Amsterdam', 'C'),
'moooi': ('NB', 'BRE', 'Breda', 'C'),
'mug authentic coffee atjeh': ('ID', 'JAK', 'Jakarta', 'C'), # Indonesia
# ==========================================================================
# ART & HERITAGE PROJECTS
# ==========================================================================
'art herstory': ('US', 'NYC', 'New York', 'D'), # US
'art history link up': ('GB', 'LON', 'London', 'D'), # UK
'call for curators': ('NH', 'AMS', 'Amsterdam', 'D'),
'creative works': ('NH', 'AMS', 'Amsterdam', 'C'),
'themusemslab': ('DE', 'BER', 'Berlin', 'E'), # Germany
'cultuurloket digitall': ('NH', 'AMS', 'Amsterdam', 'D'),
'gms digitaliseert': ('NH', 'AMS', 'Amsterdam', 'D'),
# ==========================================================================
# COMPANIES & COMMERCIAL
# ==========================================================================
'sothebys': ('GB', 'LON', 'London', 'C'), # UK
'sothebys institute of art': ('GB', 'LON', 'London', 'E'), # UK
'the art loss register': ('GB', 'LON', 'London', 'C'), # UK
'space matter': ('NH', 'AMS', 'Amsterdam', 'C'),
'studio nauta': ('NH', 'AMS', 'Amsterdam', 'C'),
'terra nostra bv': ('NH', 'AMS', 'Amsterdam', 'C'),
'tribeca': ('US', 'NYC', 'New York', 'C'), # US
'van gelder groente fruit': ('NH', 'AMS', 'Amsterdam', 'C'),
'werken voor cultuur': ('NH', 'AMS', 'Amsterdam', 'C'),
'eveha international': ('FR', 'PAR', 'Paris', 'R'), # France
# ==========================================================================
# MISCELLANEOUS DUTCH
# ==========================================================================
'de andere helft': ('NH', 'AMS', 'Amsterdam', 'N'),
'eureka': ('NH', 'AMS', 'Amsterdam', 'E'),
'enschede700': ('OV', 'ENS', 'Enschede', 'E'),
'fenix': ('ZH', 'ROT', 'Rotterdam', 'M'),
'ruimtetijd': ('NH', 'AMS', 'Amsterdam', 'R'),
'sprekende geschiedenis': ('NH', 'AMS', 'Amsterdam', 'E'),
'supermab': ('NH', 'AMS', 'Amsterdam', 'R'),
'tijdlab': ('NH', 'AMS', 'Amsterdam', 'R'),
'turf event': ('NH', 'AMS', 'Amsterdam', 'E'),
'vrijdag': ('GR', 'GRO', 'Groningen', 'E'),
'wad gaat om': ('FR', 'LEE', 'Leeuwarden', 'N'),
'wikipedia': ('US', 'SFO', 'San Francisco', 'D'), # US
'yory nl het grootste platform voor stamboomonderzoek': ('NH', 'AMS', 'Amsterdam', 'D'),
'ar tur': ('BE', 'TUR', 'Turnhout', 'E'), # Belgium
'culture lab 29': ('FR', 'BRE', 'Brest', 'E'), # France
'baleine sous gravillon': ('FR', 'PAR', 'Paris', 'E'), # France
# ==========================================================================
# FOREIGN MUSEUMS - Belgium, France, Italy, etc.
# ==========================================================================
'diva museum': ('BE', 'ANT', 'Antwerpen', 'M'), # Belgium
'huis van alijn': ('BE', 'GEN', 'Gent', 'M'), # Belgium
'kanal centre pompidou': ('BE', 'BRU', 'Brussels', 'M'), # Belgium
'kazerne dossin': ('BE', 'MEC', 'Mechelen', 'M'), # Belgium
'middelheimmuseum': ('BE', 'ANT', 'Antwerpen', 'M'), # Belgium
'musea brugge': ('BE', 'BRU', 'Brugge', 'O'), # Belgium - museum network
'kunstencentrum viernulvier': ('BE', 'GEN', 'Gent', 'E'), # Belgium
'caen memorial': ('FR', 'CAE', 'Caen', 'M'), # France
'luma arles': ('FR', 'ARL', 'Arles', 'M'), # France
'la maison du theatre a brest': ('FR', 'BRE', 'Brest', 'E'), # France
'maison des metallos': ('FR', 'PAR', 'Paris', 'E'), # France
'irht institut de recherche et d histoire des textes': ('FR', 'PAR', 'Paris', 'R'), # France
'lucas laboratoire d usages culture s arts societe': ('FR', 'PAR', 'Paris', 'R'), # France
'observatoire des politiques culturelles': ('FR', 'GRE', 'Grenoble', 'R'), # France
'profilculture': ('FR', 'PAR', 'Paris', 'C'), # France
'den gamle by': ('DK', 'AAR', 'Aarhus', 'M'), # Denmark
'den kongelige samling': ('DK', 'CPH', 'Copenhagen', 'M'), # Denmark
'kulturhusene i danmark': ('DK', 'CPH', 'Copenhagen', 'O'), # Denmark
'kulturmonitor': ('DK', 'CPH', 'Copenhagen', 'R'), # Denmark
'kulturhistorisk museum': ('NO', 'OSL', 'Oslo', 'M'), # Norway
'castello di rivoli': ('IT', 'TOR', 'Torino', 'M'), # Italy
'consorzio delle residenze reali sabaude': ('IT', 'TOR', 'Torino', 'M'), # Italy
'fondazione canova onlus': ('IT', 'TRE', 'Treviso', 'M'), # Italy
'fondazione pistoletto cittadellarte onlus': ('IT', 'BIE', 'Biella', 'M'), # Italy
'lac lugano arte e cultura': ('IT', 'LUG', 'Lugano', 'M'), # Switzerland (Italian-speaking)
'm9 museum': ('IT', 'VEN', 'Venice', 'M'), # Italy - actually in Mestre
'gammel estrup': ('DK', 'AAR', 'Aarhus', 'M'), # Denmark
'gedung sate museum': ('ID', 'BAN', 'Bandung', 'M'), # Indonesia
'henry moore institute': ('GB', 'LEE', 'Leeds', 'M'), # UK
'her place womens museum': ('AU', 'MEL', 'Melbourne', 'M'), # Australia
'rigsarkivet': ('DK', 'CPH', 'Copenhagen', 'A'), # Denmark
'royal armouries museum': ('GB', 'LEE', 'Leeds', 'M'), # UK
'royal botanic gardens kew': ('GB', 'KEW', 'Kew', 'B'), # UK
'the design museum': ('GB', 'LON', 'London', 'M'), # UK
'the metropolitan museum of art': ('US', 'NYC', 'New York', 'M'), # US
'thorvaldsens museum': ('DK', 'CPH', 'Copenhagen', 'M'), # Denmark
'vitra design museum': ('DE', 'WEI', 'Weil am Rhein', 'M'), # Germany
'war childhood museum': ('BA', 'SAR', 'Sarajevo', 'M'), # Bosnia
'butser ancient farm': ('GB', 'PET', 'Petersfield', 'M'), # UK
'icon film distribution anz': ('AU', 'SYD', 'Sydney', 'C'), # Australia
'museum development north': ('GB', 'NEW', 'Newcastle', 'O'), # UK
'museums association': ('GB', 'LON', 'London', 'N'), # UK
'moya museum of young art': ('AT', 'VIE', 'Vienna', 'M'), # Austria
'national churches trust': ('GB', 'LON', 'London', 'N'), # UK
'national portrait gallery': ('GB', 'LON', 'London', 'M'), # UK
'new contemporaries': ('GB', 'LON', 'London', 'N'), # UK
'peabody essex museum': ('US', 'SAL', 'Salem', 'M'), # US
'norient': ('CH', 'BER', 'Bern', 'R'), # Switzerland
'stiftung trias gemeinnutzige stiftung fur boden okologie und wohnen': ('DE', 'HAT', 'Hattingen', 'N'), # Germany
'nfdi4memory': ('DE', 'BER', 'Berlin', 'R'), # Germany
'themuseumslab': ('DE', 'BER', 'Berlin', 'E'), # Germany
# ==========================================================================
# INDONESIAN INSTITUTIONS (for ID-* PENDING files), followed by a few Indian (IN) entries
# ==========================================================================
'yayasan arsari djojohadikusumo': ('ID', 'JAK', 'Jakarta', 'N'), # Indonesia
'yayasan konservasi alam nusantara': ('ID', 'JAK', 'Jakarta', 'N'), # Indonesia
'southeast asia museum services seams': ('ID', 'JAK', 'Jakarta', 'O'), # Indonesia
'museum and gallery of ipb future': ('ID', 'BOG', 'Bogor', 'M'), # Indonesia
'museum dewantara kirti griya': ('ID', 'YOG', 'Yogyakarta', 'M'), # Indonesia
'museum macan': ('ID', 'JAK', 'Jakarta', 'M'), # Indonesia
'museum pasifika': ('ID', 'BAL', 'Bali', 'M'), # Indonesia
'museum zoologi universitas andalas': ('ID', 'PAD', 'Padang', 'M'), # Indonesia
'moja museum': ('ID', 'JAK', 'Jakarta', 'M'), # Indonesia - Museum of Jakarta
'wassanindia': ('IN', 'DEL', 'Delhi', 'N'), # India
'museum of contemporary tibetan art': ('IN', 'DHA', 'Dharamsala', 'M'), # India
'vedica art studios and gallery': ('IN', 'DEL', 'Delhi', 'G'), # India
# ==========================================================================
# AUSTRALIAN INSTITUTIONS
# ==========================================================================
'museumsppassmusees': ('AU', 'SYD', 'Sydney', 'O'), # Australia - museum pass program
'australian museums and galleries association victoria': ('AU', 'MEL', 'Melbourne', 'N'),
'australian society of archivists inc': ('AU', 'CAN', 'Canberra', 'N'),
'history australia': ('AU', 'SYD', 'Sydney', 'R'),
'melbourne holocaust museum': ('AU', 'MEL', 'Melbourne', 'M'),
'national library of australia': ('AU', 'CAN', 'Canberra', 'L'),
'professional historians association victoria and tasmania': ('AU', 'MEL', 'Melbourne', 'N'),
'the university of queensland art museum': ('AU', 'BRI', 'Brisbane', 'M'),
# ==========================================================================
# INDONESIAN INSTITUTIONS (additional)
# ==========================================================================
'arsip nasional republik indonesia anri': ('ID', 'JAK', 'Jakarta', 'A'),
'art zoo museum': ('ID', 'JAK', 'Jakarta', 'M'),
'art 1 new museum': ('ID', 'JAK', 'Jakarta', 'M'),
'asmat museum of culture and progress': ('ID', 'AGT', 'Agats', 'M'),
'cifor center for international forestry research': ('ID', 'BOG', 'Bogor', 'R'),
'econusa foundation indonesia': ('ID', 'JAK', 'Jakarta', 'N'),
'econusa foundation': ('ID', 'JAK', 'Jakarta', 'N'),
'fisheries resource center of indonesia frci': ('ID', 'JAK', 'Jakarta', 'R'),
'gaia indonesia': ('ID', 'JAK', 'Jakarta', 'N'),
'jakarta history museum': ('ID', 'JAK', 'Jakarta', 'M'),
'kite museum of indonesia': ('ID', 'JAK', 'Jakarta', 'M'),
'konservasi indonesia': ('ID', 'JAK', 'Jakarta', 'N'),
'ministry of tourism of the republic of indonesia': ('ID', 'JAK', 'Jakarta', 'O'),
'museum batik indonesia': ('ID', 'YOG', 'Yogyakarta', 'M'),
'museum musik indonesia': ('ID', 'JAK', 'Jakarta', 'M'),
'museum nasional indonesia': ('ID', 'JAK', 'Jakarta', 'M'),
'museum perkebunan indonesia': ('ID', 'MED', 'Medan', 'M'),
'perpustakaan nasional republik indonesia perpusnas ri': ('ID', 'JAK', 'Jakarta', 'L'),
'taman safari indonesia': ('ID', 'BOG', 'Bogor', 'B'),
# ==========================================================================
# FRENCH INSTITUTIONS (additional)
# ==========================================================================
'alca nouvelle aquitaine': ('FR', 'BOR', 'Bordeaux', 'O'),
'archives de rennes': ('FR', 'REN', 'Rennes', 'A'),
'centre de recherche du chateau de versailles': ('FR', 'VER', 'Versailles', 'R'),
'centre des monuments nationaux': ('FR', 'PAR', 'Paris', 'O'),
'chateau de chantilly officiel': ('FR', 'CHA', 'Chantilly', 'M'),
'cha teau de chantilly officiel': ('FR', 'CHA', 'Chantilly', 'M'), # normalized
'france nature environnement': ('FR', 'PAR', 'Paris', 'N'),
'ircam': ('FR', 'PAR', 'Paris', 'R'),
'mucem musee des civilisations de l europe et de la mediterranee': ('FR', 'MAR', 'Marseille', 'M'),
'mucem muse e des civilisations de l europe et de la me diterrane e': ('FR', 'MAR', 'Marseille', 'M'), # normalized
'centre de recherche du cha teau de versailles': ('FR', 'VER', 'Versailles', 'R'), # normalized
'musee d orsay': ('FR', 'PAR', 'Paris', 'M'),
'muse e d orsay': ('FR', 'PAR', 'Paris', 'M'), # normalized variant
'musee de bretagne': ('FR', 'REN', 'Rennes', 'M'),
'muse e de bretagne': ('FR', 'REN', 'Rennes', 'M'), # normalized
'musee des arts et metiers': ('FR', 'PAR', 'Paris', 'M'),
'muse e des arts et me tiers': ('FR', 'PAR', 'Paris', 'M'), # normalized
'musee du debarquement': ('FR', 'ARR', 'Arromanches', 'M'),
'muse e du de barquement': ('FR', 'ARR', 'Arromanches', 'M'), # normalized
'petites cites de caractere de france': ('FR', 'PAR', 'Paris', 'N'),
'petites cite s de caracte re de france': ('FR', 'PAR', 'Paris', 'N'), # normalized
'villa albertine the french institute for culture and education': ('US', 'NYC', 'New York', 'O'), # French in US
# ==========================================================================
# GERMAN INSTITUTIONS (additional)
# ==========================================================================
'anne frank educational center': ('DE', 'FRA', 'Frankfurt', 'E'),
'bildarchiv foto marburg': ('DE', 'MAR', 'Marburg', 'A'),
'bundesvereinigung kulturelle kinder und jugendbildung bkj': ('DE', 'REM', 'Remscheid', 'N'),
'common wadden sea secretariat': ('DE', 'WIL', 'Wilhelmshaven', 'O'),
'deutsche stiftung denkmalschutz german foundation for monument protection': ('DE', 'BON', 'Bonn', 'N'),
'deutsches archaologisches institut dai': ('DE', 'BER', 'Berlin', 'R'),
'deutsches archa ologisches institut dai': ('DE', 'BER', 'Berlin', 'R'), # normalized
'deutsches historisches museum': ('DE', 'BER', 'Berlin', 'M'),
'deutsches zentrum kulturgutverluste': ('DE', 'MAG', 'Magdeburg', 'R'),
'jewish museum berlin': ('DE', 'BER', 'Berlin', 'M'),
'klassik stiftung weimar': ('DE', 'WEI', 'Weimar', 'M'),
'kulturstiftung des bundes german federal cultural foundation': ('DE', 'HAL', 'Halle', 'N'),
'stadtische galerie im lenbachhaus und kunstbau munchen': ('DE', 'MUN', 'Munich', 'M'),
'sta dtische galerie im lenbachhaus und kunstbau mu nchen': ('DE', 'MUN', 'Munich', 'M'), # normalized
'stiftung stadtmuseum berlin': ('DE', 'BER', 'Berlin', 'M'),
# ==========================================================================
# BRITISH INSTITUTIONS (additional)
# ==========================================================================
'archaeological research services ltd': ('GB', 'BAK', 'Bakewell', 'R'),
'british school at athens': ('GR', 'ATH', 'Athens', 'R'), # Greek location!
'british trust for ornithology bto': ('GB', 'THE', 'Thetford', 'R'),
'historic new england': ('US', 'BOS', 'Boston', 'N'), # US, not UK!
'historic royal palaces': ('GB', 'LON', 'London', 'M'),
'new england museum association': ('US', 'BOS', 'Boston', 'N'), # US, not UK!
# ==========================================================================
# ITALIAN INSTITUTIONS (additional)
# ==========================================================================
'artribune': ('IT', 'ROM', 'Rome', 'C'),
'centro conservazione restauro la venaria reale': ('IT', 'TOR', 'Turin', 'R'),
'ecole francaise de rome efr': ('IT', 'ROM', 'Rome', 'R'),
'e cole franc aise de rome efr': ('IT', 'ROM', 'Rome', 'R'), # normalized
'museum tweestromenland': ('GE', 'BEN', 'Beneden-Leeuwen', 'M'), # Dutch, in Beneden-Leeuwen!
'stichting roma aeterna': ('IT', 'ROM', 'Rome', 'N'),
'triennale milano': ('IT', 'MIL', 'Milan', 'M'),
# ==========================================================================
# BELGIAN INSTITUTIONS (additional)
# ==========================================================================
'advn': ('BE', 'ANT', 'Antwerpen', 'A'),
'm leuven': ('BE', 'LEU', 'Leuven', 'M'),
'museum voor schone kunsten gent': ('BE', 'GEN', 'Gent', 'M'),
'wikimedia belgium': ('BE', 'BRU', 'Brussels', 'N'),
# ==========================================================================
# US INSTITUTIONS (additional)
# ==========================================================================
'gia gemological institute of america': ('US', 'CAR', 'Carlsbad', 'R'),
'international society of arboriculture': ('US', 'ATL', 'Atlanta', 'N'),
'standwithus': ('US', 'LAX', 'Los Angeles', 'N'),
# ==========================================================================
# DANISH INSTITUTIONS (additional)
# ==========================================================================
'aalborg teater': ('DK', 'AAL', 'Aalborg', 'E'),
'augustinus fonden': ('DK', 'CPH', 'Copenhagen', 'N'),
'kobenhavns museum museum of copenhagen': ('DK', 'CPH', 'Copenhagen', 'M'),
'ko benhavns museum museum of copenhagen': ('DK', 'CPH', 'Copenhagen', 'M'), # normalized
'københavns museum museum of copenhagen': ('DK', 'CPH', 'Copenhagen', 'M'), # with ø
# ==========================================================================
# SPANISH INSTITUTIONS
# ==========================================================================
'centre de cultura contemporania de barcelona cccb': ('ES', 'BAR', 'Barcelona', 'M'),
'centre de cultura contempora nia de barcelona cccb': ('ES', 'BAR', 'Barcelona', 'M'), # normalized
'instituto del patrimonio cultural de espana ipce': ('ES', 'MAD', 'Madrid', 'O'),
'instituto del patrimonio cultural de espan a ipce': ('ES', 'MAD', 'Madrid', 'O'), # normalized
# ==========================================================================
# INDIAN INSTITUTIONS
# ==========================================================================
'placemaking india': ('IN', 'DEL', 'Delhi', 'N'),
# ==========================================================================
# OTHER INTERNATIONAL
# ==========================================================================
'african wildlife foundation': ('KE', 'NAI', 'Nairobi', 'N'),
'arabian oud': ('SA', 'RIY', 'Riyadh', 'C'),
'wza rat althqa fh ministry of culture': ('SA', 'RIY', 'Riyadh', 'O'), # Saudi Ministry of Culture normalized
'وزارة الثقافة ministry of culture': ('SA', 'RIY', 'Riyadh', 'O'), # Saudi Ministry of Culture Arabic
'ministry of culture': ('SA', 'RIY', 'Riyadh', 'O'), # Saudi Ministry of Culture simple
'dariah eric': ('EU', 'BRU', 'Brussels', 'R'),
'embassy of the netherlands in israel': ('IL', 'TLV', 'Tel Aviv', 'O'),
'european museum academy': ('EU', 'BRU', 'Brussels', 'N'),
'iucn ssc shark specialist group ssg': ('CA', 'VAN', 'Vancouver', 'R'),
'museum vosbergen': ('DR', 'EEL', 'Eelde', 'M'), # Dutch - in Eelde
'bonhams': ('GB', 'LON', 'London', 'C'), # UK auction house
# ==========================================================================
# REMAINING DUTCH (plus one French normalized-name variant)
# ==========================================================================
'het nationale park de hoge veluwe': ('GE', 'OTT', 'Otterlo', 'N'),
'lucas laboratoire d usages culture s arts socie te': ('FR', 'PAR', 'Paris', 'R'), # French org
# ==========================================================================
# OTHER MISCELLANEOUS ORGANIZATIONS (mostly Dutch; includes some GB/US/BE/CN entries)
# ==========================================================================
'introdans': ('GE', 'ARN', 'Arnhem', 'E'),
'ja21 het juiste antwoord': ('NH', 'AMS', 'Amsterdam', 'N'), # Political party - not heritage
'kasteel radboud': ('NH', 'MED', 'Medemblik', 'M'),
'klooster huissen': ('GE', 'HUI', 'Huissen', 'H'),
'koninklijke luchtmacht historische vlucht': ('NH', 'GIL', 'Gilze-Rijen', 'M'),
'koninklijke woudenberg': ('UT', 'WOU', 'Woudenberg', 'C'),
'museum fiskershúske': ('FR', 'MOD', 'Moddergat', 'M'),
'museum media': ('NH', 'AMS', 'Amsterdam', 'C'),
'museum of 21st century design': ('GB', 'LON', 'London', 'M'), # UK
'museum of comic art moca': ('US', 'NYC', 'New York', 'M'), # US
'museum of edible earth': ('NL', 'AMS', 'Amsterdam', 'M'), # Actually NL-based
'museum of humanity': ('GB', 'LON', 'London', 'M'), # UK
'museum of looted antiquities': ('GB', 'LON', 'London', 'D'), # UK - virtual
'museum of science': ('US', 'BOS', 'Boston', 'M'), # US
'museumppassmusees': ('BE', 'BRU', 'Brussels', 'O'), # Belgium - museum pass
'museumvereniging': ('NH', 'AMS', 'Amsterdam', 'N'),
'oerol festival': ('FR', 'TER', 'Terschelling', 'E'),
'qwen': ('CN', 'HAN', 'Hangzhou', 'C'), # China - AI company, not heritage
'radio en museum': ('NH', 'AMS', 'Amsterdam', 'M'),
'sothebys': ('GB', 'LON', 'London', 'C'), # UK
'sothebys institute of art': ('GB', 'LON', 'London', 'E'), # UK
'nieuwe veste': ('NB', 'BRE', 'Breda', 'E'),
}
@ -216,8 +815,14 @@ def process_pending_file(filepath: Path, dry_run: bool = True) -> Optional[str]:
abbrev = extract_abbreviation(emic_name)
# Handle non-Dutch organizations
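# Foreign entries carry their country code in the province slot: it becomes the country and the province is set to 'XX'.
# NOTE(review): assuming `province` is the first element of the mapping tuple, 'FR' and 'GR' are ambiguous - the
# mapping also uses them for Friesland (Leeuwarden, Moddergat, Terschelling) and Groningen, so those Dutch entries
# would be reclassified as France/Greece here. Confirm and disambiguate before relying on this branch.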
FOREIGN_COUNTRIES = {
'FR', 'DK', 'IT', 'BE', 'DE', 'GB', 'US', 'AT', 'AU', 'BA', 'ES',
'EU', 'ID', 'IL', 'IN', 'MA', 'NO', 'PT', 'PS', 'ZA', 'CA', 'GR', 'KE', 'SA',
'CH', 'CN'
}
country = 'NL'
if province in ['FR', 'DK', 'IT', 'BE', 'DE', 'GB', 'US']:
if province in FOREIGN_COUNTRIES:
country = province
province = 'XX'