From dd0ee2cf113d18d9023e85c5f5038e599a1be726 Mon Sep 17 00:00:00 2001 From: kempersc Date: Fri, 9 Jan 2026 21:10:14 +0100 Subject: [PATCH] feat(scripts): expand university location mappings and add web enrichment - enrich_ppids.py: Add 40+ Dutch universities and hogescholen to location mapping - enrich_ppids_web.py: New script for web-based PPID enrichment - resolve_pending_known_orgs.py: Updates for pending org resolution --- scripts/enrich_ppids.py | 402 +++++++++++++++- scripts/enrich_ppids_web.py | 579 +++++++++++++++++++++++ scripts/resolve_pending_known_orgs.py | 655 +++++++++++++++++++++++++- 3 files changed, 1590 insertions(+), 46 deletions(-) create mode 100644 scripts/enrich_ppids_web.py diff --git a/scripts/enrich_ppids.py b/scripts/enrich_ppids.py index 07ccda05df..c2ef674ab5 100644 --- a/scripts/enrich_ppids.py +++ b/scripts/enrich_ppids.py @@ -114,6 +114,85 @@ DUTCH_UNI_LOCATIONS = { "IOPS": ("Amsterdam", "NL"), "Interuniversity Graduate School of Psychometrics": ("Amsterdam", "NL"), "Sioo": ("Utrecht", "NL"), + # Additional Dutch universities (expanded mapping) + "Eindhoven University of Technology": ("Eindhoven", "NL"), + "Delft University of Technology": ("Delft", "NL"), + "University of Twente": ("Enschede", "NL"), + "Universiteit Twente": ("Enschede", "NL"), + "UT": ("Enschede", "NL"), + "Open Universiteit": ("Heerlen", "NL"), + "Open University Netherlands": ("Heerlen", "NL"), + "Nyenrode": ("Breukelen", "NL"), + "Nyenrode Business Universiteit": ("Breukelen", "NL"), + "Theologische Universiteit": ("Kampen", "NL"), + "Protestant Theological University": ("Amsterdam", "NL"), + # Additional Hogescholen + "De Haagse Hogeschool": ("Den Haag", "NL"), + "The Hague University": ("Den Haag", "NL"), + "The Hague University of Applied Sciences": ("Den Haag", "NL"), + "Amsterdamse Hogeschool voor de Kunsten": ("Amsterdam", "NL"), + "AHK": ("Amsterdam", "NL"), + "Conservatorium van Amsterdam": ("Amsterdam", "NL"), + "Hanzehogeschool Groningen": 
("Groningen", "NL"), + "Hogeschool Leiden": ("Leiden", "NL"), + "Hogeschool Zeeland": ("Vlissingen", "NL"), + "HZ University of Applied Sciences": ("Vlissingen", "NL"), + "Hogeschool voor de Kunsten Utrecht": ("Utrecht", "NL"), + "HKU": ("Utrecht", "NL"), + "Willem de Kooning Academie": ("Rotterdam", "NL"), + "Codarts Rotterdam": ("Rotterdam", "NL"), + "Codarts": ("Rotterdam", "NL"), + "Design Academy": ("Eindhoven", "NL"), + "NHTV": ("Breda", "NL"), + "NHTV Breda University of Applied Sciences": ("Breda", "NL"), + "Breda University of Applied Sciences": ("Breda", "NL"), + "NHL Hogeschool": ("Leeuwarden", "NL"), + "Van Hall Larenstein": ("Velp", "NL"), + "NCOI": ("Hilversum", "NL"), + "NCOI Opleidingen": ("Hilversum", "NL"), + "LOI": ("Leiderdorp", "NL"), + "LOI Hogeschool": ("Leiderdorp", "NL"), + "NTI": ("Leiden", "NL"), + "Hogeschool Arnhem": ("Arnhem", "NL"), + "Hogeschool Nijmegen": ("Nijmegen", "NL"), + "ROC": ("", "NL"), # Regional Training Centers - various locations (fallback) + # Specific ROC locations + "ROC Leeuwenborgh": ("Maastricht", "NL"), + "ROC Leiden": ("Leiden", "NL"), + "ROC Midden Nederland": ("Utrecht", "NL"), + "ROC MN": ("Utrecht", "NL"), + "ROC van Amsterdam": ("Amsterdam", "NL"), + "ROC Amsterdam": ("Amsterdam", "NL"), + "ROC Flevoland": ("Almere", "NL"), + "ROC Tilburg": ("Tilburg", "NL"), + "ROC van Twente": ("Enschede", "NL"), + "ROC Twente": ("Enschede", "NL"), + "ROC Nijmegen": ("Nijmegen", "NL"), + "ROC Mondriaan": ("Den Haag", "NL"), + "ROC Nova College": ("Haarlem", "NL"), + "ROC Albeda": ("Rotterdam", "NL"), + "Albeda College": ("Rotterdam", "NL"), + "Zadkine": ("Rotterdam", "NL"), + "Graafschap College": ("Doetinchem", "NL"), + "Friesland College": ("Leeuwarden", "NL"), + "Noorderpoort": ("Groningen", "NL"), + "Alfa-college": ("Groningen", "NL"), + "Deltion College": ("Zwolle", "NL"), + "Cibap": ("Zwolle", "NL"), + "Summa College": ("Eindhoven", "NL"), + "SintLucas": ("Eindhoven", "NL"), + "Koning Willem I College": ("Den 
Bosch", "NL"), + "Curio": ("Breda", "NL"), + "Da Vinci College": ("Dordrecht", "NL"), + # Additional Radboud variations + "Radboud University Nijmegen": ("Nijmegen", "NL"), + "Radboud University": ("Nijmegen", "NL"), + # Additional VU variations + "Vrije Universiteit Amsterdam": ("Amsterdam", "NL"), + "VU University Amsterdam": ("Amsterdam", "NL"), + # Wageningen variations + "Wageningen University & Research": ("Wageningen", "NL"), + "WUR": ("Wageningen", "NL"), # Belgian institutions "KU Leuven": ("Leuven", "BE"), "University of Leuven": ("Leuven", "BE"), @@ -141,9 +220,85 @@ DUTCH_UNI_LOCATIONS = { "LMU München": ("München", "DE"), "Technische Universität München": ("München", "DE"), "TU München": ("München", "DE"), - # International + # UK institutions + "University of Oxford": ("Oxford", "GB"), + "Oxford University": ("Oxford", "GB"), + "University of Cambridge": ("Cambridge", "GB"), + "Cambridge University": ("Cambridge", "GB"), + "University of York": ("York", "GB"), + "University College London": ("London", "GB"), + "UCL": ("London", "GB"), + "London School of Economics": ("London", "GB"), + "LSE": ("London", "GB"), + "King's College London": ("London", "GB"), + "Imperial College": ("London", "GB"), + "University of Edinburgh": ("Edinburgh", "GB"), + "University of Manchester": ("Manchester", "GB"), + # Australian institutions + "The Australian National University": ("Canberra", "AU"), + "Australian National University": ("Canberra", "AU"), + "ANU": ("Canberra", "AU"), + "University of Canberra": ("Canberra", "AU"), + "University of Melbourne": ("Melbourne", "AU"), + "University of Sydney": ("Sydney", "AU"), + "Macquarie University": ("Sydney", "AU"), + "Charles Sturt University": ("Bathurst", "AU"), + "UNSW": ("Sydney", "AU"), + "University of New South Wales": ("Sydney", "AU"), + "University of Queensland": ("Brisbane", "AU"), + "Monash University": ("Melbourne", "AU"), + # South African institutions + "University of Cape Town": ("Cape Town", "ZA"), + 
"UCT": ("Cape Town", "ZA"), + "University of Pretoria": ("Pretoria", "ZA"), + "University of Witwatersrand": ("Johannesburg", "ZA"), + "Stellenbosch University": ("Stellenbosch", "ZA"), + # Italian institutions "Politecnico di Milano": ("Milano", "IT"), + "Università degli Studi di Milano": ("Milano", "IT"), + "Università di Bologna": ("Bologna", "IT"), + "University of Bologna": ("Bologna", "IT"), + # US institutions "Oberlin College": ("Oberlin", "US"), + "Harvard University": ("Cambridge", "US"), + "Harvard": ("Cambridge", "US"), + "Yale University": ("New Haven", "US"), + "Princeton University": ("Princeton", "US"), + "MIT": ("Cambridge", "US"), + "Massachusetts Institute of Technology": ("Cambridge", "US"), + "Stanford University": ("Stanford", "US"), + "Columbia University": ("New York", "US"), + "University of California": ("Berkeley", "US"), + "UCLA": ("Los Angeles", "US"), + "University of Chicago": ("Chicago", "US"), + "NYU": ("New York", "US"), + "New York University": ("New York", "US"), + # Indonesian institutions + "Universitas Gadjah Mada": ("Yogyakarta", "ID"), + "UGM": ("Yogyakarta", "ID"), + "Universitas Indonesia": ("Jakarta", "ID"), + "UI": ("Jakarta", "ID"), + # Turkish institutions + "Middle East Technical University": ("Ankara", "TR"), + "METU": ("Ankara", "TR"), + "Boğaziçi University": ("Istanbul", "TR"), + # Additional Dutch variations found in data + "Rotterdam School of Management": ("Rotterdam", "NL"), + "RSM": ("Rotterdam", "NL"), + "TIAS School for Business and Society": ("Tilburg", "NL"), + "TIAS": ("Tilburg", "NL"), + "GO opleidingen": ("Utrecht", "NL"), + "Amsterdam University of Applied Sciences": ("Amsterdam", "NL"), + "University College Utrecht": ("Utrecht", "NL"), + "UCU": ("Utrecht", "NL"), + "University of Utrecht": ("Utrecht", "NL"), + "NSOB": ("Den Haag", "NL"), + "Nederlandse School voor Openbaar Bestuur": ("Den Haag", "NL"), + "Grotius Academie": ("Nijmegen", "NL"), + "de Baak": ("Noordwijk", "NL"), + "Grafisch Lyceum 
Rotterdam": ("Rotterdam", "NL"), + "Schoevers": ("Utrecht", "NL"), + "Schoevers College": ("Utrecht", "NL"), } @@ -171,12 +326,39 @@ def geocode_location(location_str: str, db_path: str) -> Optional[dict]: # Extract country from common patterns country_code = None - if "(NL)" in location_str or "Netherlands" in location_str or "Nederland" in location_str: - country_code = "NL" - elif "(BE)" in location_str or "Belgium" in location_str or "België" in location_str: - country_code = "BE" - elif "(DE)" in location_str or "Germany" in location_str or "Deutschland" in location_str: - country_code = "DE" + country_patterns = { + "NL": ["(NL)", "Netherlands", "Nederland"], + "BE": ["(BE)", "Belgium", "België", "Belgique"], + "DE": ["(DE)", "Germany", "Deutschland"], + "GB": ["(GB)", "United Kingdom", "UK", "England", "Scotland", "Wales"], + "AU": ["(AU)", "Australia"], + "ZA": ["(ZA)", "South Africa"], + "IT": ["(IT)", "Italy", "Italia"], + "US": ["(US)", "United States", "USA", "U.S."], + "ID": ["(ID)", "Indonesia"], + "TR": ["(TR)", "Turkey", "Türkiye"], + "FR": ["(FR)", "France"], + "ES": ["(ES)", "Spain", "España"], + "AT": ["(AT)", "Austria", "Österreich"], + "CH": ["(CH)", "Switzerland", "Schweiz", "Suisse"], + "CA": ["(CA)", "Canada"], + "NZ": ["(NZ)", "New Zealand"], + "JP": ["(JP)", "Japan"], + "CN": ["(CN)", "China"], + "IN": ["(IN)", "India"], + "BR": ["(BR)", "Brazil", "Brasil"], + "SE": ["(SE)", "Sweden", "Sverige"], + "NO": ["(NO)", "Norway", "Norge"], + "DK": ["(DK)", "Denmark", "Danmark"], + "FI": ["(FI)", "Finland", "Suomi"], + "PL": ["(PL)", "Poland", "Polska"], + "CZ": ["(CZ)", "Czech Republic", "Czechia", "Česko"], + } + + for code, patterns in country_patterns.items(): + if any(p in location_str for p in patterns): + country_code = code + break # Clean location for city lookup city_candidate = location_str.split(",")[0].strip() @@ -255,6 +437,56 @@ def parse_date_range(date_range: str) -> Tuple[Optional[int], Optional[int]]: return start_year, end_year 
def get_any_date_field(record: dict) -> str:
    """
    Return the date string of a record, whatever field name it is stored under.

    Combined fields are tried in priority order: ``date_range``, ``period``,
    ``years``, ``year``, ``dates``. The first one holding a non-blank value is
    returned (converted with ``str``). When none of them is usable, the
    separate ``start_date`` / ``end_date`` fields are joined as
    ``"<start> - <end>"``; a missing side is dropped together with the
    dangling separator.

    The result is meant to be fed to ``parse_date_range()``.

    Args:
        record: One education/experience entry (a dict).

    Returns:
        The date string, or ``""`` when the record carries no date
        information at all.
    """
    combined_fields = ("date_range", "period", "years", "year", "dates")
    for field in combined_fields:
        value = record.get(field)
        # Skip missing, empty AND whitespace-only values so that a blank
        # field cannot shadow a populated one further down the priority list.
        if value and str(value).strip():
            return str(value)

    # Fall back to separate start/end fields; None is normalised to "".
    start = record.get("start_date") or ""
    end = record.get("end_date") or ""
    if start or end:
        # strip(" -") removes the separator left over when one side is empty.
        # NOTE(review): with only end_date set, the bare end value is returned;
        # confirm parse_date_range() does not read it as a start year.
        return f"{start} - {end}".strip(" -")

    return ""


def parse_total_experience_field(total_exp: str) -> Optional[int]:
    """
    Extract the whole number of years from a total-experience value.

    Handles formats like:
    - "24 years and 8 months"
    - "37 years"
    - "5 years 3 months"
    - "1 year"
    - "1.5 years" (the integer part, 1, is returned)

    Args:
        total_exp: The raw field value. Non-string values are converted with
            ``str`` instead of raising, so a bare number (no "year" unit)
            yields ``None``.

    Returns:
        Number of years, or ``None`` if the value is empty or not parseable.
    """
    if not total_exp:
        return None

    text = str(total_exp).lower()

    # The lookbehind stops the match from starting inside a number, and the
    # optional decimal part is consumed but not captured; otherwise
    # "1.5 years" would be read as 5 years.
    match = re.search(r'(?<![\d.,])(\d+)(?:[.,]\d+)?\s*years?', text)
    if match:
        return int(match.group(1))

    return None
+ """ + if not about_text: + return None + + # Pattern: "Total Experience: X years and Y months" or "Total Experience: X year" + m = re.search(r'Total Experience:\s*(\d+)\s*years?', about_text, re.IGNORECASE) + if m: + return int(m.group(1)) + + return None + + def infer_birth_decade(profile_data: dict) -> Optional[dict]: """ Infer birth decade from earliest career observations. @@ -305,6 +553,11 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]: Supports list-valued results for decade boundary cases (Rule 45 extension): - If estimated birth year is within 3 years of decade boundary, returns both adjacent decades as EDTF set notation: [196X,197X] + + Inference methods (in priority order): + 1. Education start year (most reliable - entry age 18-24) + 2. Experience start year (first job - entry age ~23) + 3. Total Experience pattern (fallback - "Total Experience: X years") """ earliest_year = None inference_steps = [] @@ -312,6 +565,7 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]: age_variance = 3 # ±3 years typical variance in entry age education_record = None experience_record = None + total_experience_years = None # Check education first (most reliable) education = profile_data.get("education") or [] @@ -381,8 +635,8 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]: for exp in experience: if exp is None: continue - # Handle multiple date field names - date_range = exp.get("date_range") or exp.get("period") or "" + # Handle multiple date field names (including start_date/end_date) + date_range = get_any_date_field(exp) start_year, _ = parse_date_range(date_range) if start_year: @@ -396,10 +650,59 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]: "date_range": date_range, } + # If no education or experience dates, try "Total Experience" pattern in about field + if earliest_year is None: + about = profile_data.get("about") or profile_data.get("summary") or "" + total_experience_years = 
parse_total_experience(about) + + if total_experience_years and total_experience_years > 0: + # Estimate: current year - total_years = first job year + # Then: first job year - 23 = birth year (assuming first job at 23) + current_year = datetime.now().year + estimated_first_job_year = current_year - total_experience_years + earliest_year = estimated_first_job_year + age_offset = 23 # Assume first job at 23 + age_variance = 7 # Very high variance for this method + + inference_steps.append({ + "observation": "Total Experience pattern found in about field", + "source_field": "profile_data.about", + "source_value": f"Total Experience: {total_experience_years} years", + }) + inference_steps.append({ + "calculation": f"{current_year} - {total_experience_years} = {estimated_first_job_year}", + "result": f"Estimated first job year: {estimated_first_job_year}", + "assumption": "Total experience represents continuous career from first job", + }) + + # If still no date, try standalone total_experience field in profile_data + if earliest_year is None: + total_exp_field = profile_data.get("total_experience") + if total_exp_field: + total_experience_years = parse_total_experience_field(total_exp_field) + + if total_experience_years and total_experience_years > 0: + current_year = datetime.now().year + estimated_first_job_year = current_year - total_experience_years + earliest_year = estimated_first_job_year + age_offset = 23 # Assume first job at 23 + age_variance = 7 # Very high variance for this method + + inference_steps.append({ + "observation": "total_experience field found in profile_data", + "source_field": "profile_data.total_experience", + "source_value": total_exp_field, + }) + inference_steps.append({ + "calculation": f"{current_year} - {total_experience_years} = {estimated_first_job_year}", + "result": f"Estimated first job year: {estimated_first_job_year}", + "assumption": "Total experience represents continuous career from first job", + }) + if earliest_year is 
None: return None - # Build inference chain + # Build inference chain (only add steps if not already added from Total Experience path) if education_record: inference_steps.append({ "observation": "Education record found", @@ -415,7 +718,7 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]: "rationale": "Standard entry age for this education level in Netherlands/Europe", "confidence_impact": f"Assumption introduces uncertainty; actual age may vary ±{age_variance} years", }) - else: + elif experience_record: inference_steps.append({ "observation": "First job record found (no education data)", "source_field": "profile_data.experience", @@ -430,6 +733,13 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]: "rationale": "Assumes first job after typical university completion", "confidence_impact": f"Higher uncertainty; first job age varies ±{age_variance} years", }) + elif total_experience_years: + # Steps already added in the Total Experience detection block + inference_steps.append({ + "assumption": f"First job age is approximately {age_offset} (±{age_variance} years)", + "rationale": "Assumes first job after typical university completion; Total Experience method has highest uncertainty", + "confidence_impact": f"Very high uncertainty; first job age varies ±{age_variance} years, plus Total Experience aggregation may be inaccurate", + }) estimated_birth_year = earliest_year - age_offset min_birth_year = earliest_year - age_offset - age_variance @@ -468,6 +778,14 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]: "rationale": "Cannot determine which decade with certainty; using EDTF 'one of' set notation", }) + # Determine method name based on source + if education_record: + method_name = "earliest_education_heuristic" + elif experience_record: + method_name = "earliest_experience_heuristic" + else: + method_name = "total_experience_heuristic" + return { "values": [decade1, decade2], "edtf": f"[{decade1},{decade2}]", @@ -477,7 
+795,7 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]: "primary_rationale": primary_rationale, "confidence": "very_low", # Lower confidence due to boundary uncertainty "inference_provenance": { - "method": "earliest_observation_heuristic", + "method": method_name, "inference_chain": build_inference_chain(inference_steps), "assumptions": [ f"Entry age for education/first job: {age_offset} years (±{age_variance})", @@ -499,13 +817,24 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]: "rationale": "Decade precision appropriate for heuristic-based estimate", }) + # Determine method name and confidence based on source + if education_record: + method_name = "earliest_education_heuristic" + confidence = "low" + elif experience_record: + method_name = "earliest_experience_heuristic" + confidence = "low" + else: + method_name = "total_experience_heuristic" + confidence = "very_low" # Lowest confidence for Total Experience method + return { "value": edtf_decade, "edtf": edtf_decade, "precision": "decade", - "confidence": "low", + "confidence": confidence, "inference_provenance": { - "method": "earliest_observation_heuristic", + "method": method_name, "inference_chain": build_inference_chain(inference_steps), "assumptions": [ f"Entry age for education/first job: {age_offset} years (±{age_variance})", @@ -549,7 +878,21 @@ def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]: for uni_name, (city, country) in DUTCH_UNI_LOCATIONS.items(): if uni_name.lower() in institution.lower(): - location = f"{city}, Netherlands" if city else None + # Map country code to country name for geocoding + country_names = { + "NL": "Netherlands", + "BE": "Belgium", + "DE": "Germany", + "GB": "United Kingdom", + "AU": "Australia", + "ZA": "South Africa", + "IT": "Italy", + "US": "United States", + "ID": "Indonesia", + "TR": "Turkey", + } + country_name = country_names.get(country, "Netherlands") + location = f"{city}, {country_name}" if city 
else None location_source = f"Known institution mapping: {uni_name}" break @@ -622,8 +965,8 @@ def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]: for exp in experience: if exp is None: continue - # Handle multiple date field names - date_range = exp.get("date_range") or exp.get("period") or "" + # Handle multiple date field names (including start_date/end_date) + date_range = get_any_date_field(exp) start_year, _ = parse_date_range(date_range) if start_year and exp.get("location"): exp_with_years.append((start_year, exp)) @@ -636,7 +979,7 @@ def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]: continue # Get date_range for provenance (handle multiple field names) - exp_date_range = exp.get("date_range") or exp.get("period") or "" + exp_date_range = get_any_date_field(exp) inference_steps.append({ "observation": "Earliest job with location found (no education location available)", @@ -739,8 +1082,8 @@ def infer_current_settlement(profile_data: dict, db_path: str) -> Optional[dict] for exp in experience: if exp is None: continue - # Handle multiple date field names - date_range = exp.get("date_range") or exp.get("period") or "" + # Handle multiple date field names (including start_date/end_date) + date_range = get_any_date_field(exp) # Also check "current" field which some profiles have is_current = "Present" in date_range or exp.get("current") is True if is_current: @@ -815,6 +1158,7 @@ def enrich_ppid_file(filepath: Path, db_path: str, dry_run: bool = False, force: stats = { "birth_decade_inferred": False, "birth_decade_is_list": False, # Track decade boundary cases + "birth_decade_method": None, # Track which method was used "birth_settlement_inferred": False, "current_settlement_inferred": False, "ppid_changed": False, @@ -870,6 +1214,9 @@ def enrich_ppid_file(filepath: Path, db_path: str, dry_run: bool = False, force: components["first_date"] = birth_info["edtf"] components["first_date_source"] = 
"inferred_birth_decade" + # Track which method was used + stats["birth_decade_method"] = birth_info.get("inference_provenance", {}).get("method", "unknown") + # Add note to canonical field pointing to inferred alternative data["birth_date"]["note"] = "See inferred_birth_decade for heuristic estimate" @@ -978,6 +1325,11 @@ def main(): "processed": 0, "birth_decade_inferred": 0, "birth_decade_list_valued": 0, # Decade boundary cases + "birth_decade_by_method": { + "earliest_education_heuristic": 0, + "earliest_experience_heuristic": 0, + "total_experience_heuristic": 0, + }, "birth_settlement_inferred": 0, "current_settlement_inferred": 0, "ppid_changed": 0, @@ -990,6 +1342,10 @@ def main(): total_stats["processed"] += 1 if stats["birth_decade_inferred"]: total_stats["birth_decade_inferred"] += 1 + # Track method used + method = stats.get("birth_decade_method") + if method and method in total_stats["birth_decade_by_method"]: + total_stats["birth_decade_by_method"][method] += 1 if stats.get("birth_decade_is_list"): total_stats["birth_decade_list_valued"] += 1 if stats["birth_settlement_inferred"]: @@ -999,7 +1355,7 @@ def main(): if stats["ppid_changed"]: total_stats["ppid_changed"] += 1 - if args.verbose and any(stats.values()): + if args.verbose and any(v for k, v in stats.items() if k != "birth_decade_method"): print(f" {filepath.name}: {stats}") if (i + 1) % 500 == 0: @@ -1017,6 +1373,9 @@ def main(): print(f"Processed: {total_stats['processed']}") print(f"Birth decades inferred: {total_stats['birth_decade_inferred']}") print(f" - List-valued (boundary): {total_stats['birth_decade_list_valued']}") + print(f" - By method:") + for method, count in total_stats["birth_decade_by_method"].items(): + print(f" {method}: {count}") print(f"Birth settlements inferred: {total_stats['birth_settlement_inferred']}") print(f"Current settlements inferred: {total_stats['current_settlement_inferred']}") print(f"PPIDs updated: {total_stats['ppid_changed']}") @@ -1033,6 +1392,7 @@ def 
main(): print("\nNote: All inferred data stored in explicit inferred_* fields with provenance chains.") print("Note: Decade boundary cases use EDTF set notation [196X,197X] with primary_value for PPID.") + print("Note: Total Experience method has highest uncertainty (very_low confidence).") if __name__ == "__main__": diff --git a/scripts/enrich_ppids_web.py b/scripts/enrich_ppids_web.py new file mode 100644 index 0000000000..4eb90de762 --- /dev/null +++ b/scripts/enrich_ppids_web.py @@ -0,0 +1,579 @@ +#!/usr/bin/env python3 +""" +PPID Web Enrichment Script + +Enriches PPID files with web-sourced claims using Exa AI and Linkup search. +Adds proper provenance statements per Rules 6, 26, and 35. + +Enrichment targets: +1. Birth date/year - Search for biographical information +2. Publications - ORCID, Google Scholar, ResearchGate +3. News mentions - Press coverage, interviews +4. Wikidata entity - Authority file linking +5. Institutional affiliations - Verify current roles + +All web claims include: +- source_url: Where the data was found +- retrieved_on: ISO 8601 timestamp +- retrieval_agent: Tool used (exa_web_search, linkup_search, etc.) +- claim_type: Type of claim (birth_date, publication, news_mention, etc.) 
+- claim_value: The extracted value +- provenance: Full provenance chain per Rule 35 + +Usage: + python scripts/enrich_ppids_web.py --limit 10 --verbose + python scripts/enrich_ppids_web.py --dry-run --sample stefankulk +""" + +import json +import os +import re +import sys +import time +import argparse +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional, Dict, List, Any, Tuple + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent)) + + +def create_web_claim( + claim_type: str, + claim_value: str, + source_url: str, + retrieval_agent: str, + confidence: str = "medium", + notes: Optional[str] = None, + raw_response: Optional[Dict] = None +) -> Dict[str, Any]: + """ + Create a web claim with proper provenance per Rules 6, 26, and 35. + + Args: + claim_type: Type of claim (birth_date, publication, news_mention, etc.) + claim_value: The extracted value + source_url: URL where the data was found + retrieval_agent: Tool used (exa_web_search, linkup_search, etc.) 
# Birth years outside [_MIN_PLAUSIBLE_BIRTH_YEAR, current year] are treated as
# false positives (page numbers, publication years in the future, IDs, ...).
_MIN_PLAUSIBLE_BIRTH_YEAR = 1850

# Characters that commonly trail a DOI in running text but are not part of it.
_DOI_TRAILING_JUNK = '.,;:)]}>"\''


def _first_plausible_year(pattern: str, text: str, latest_year: int) -> Optional[str]:
    """Return the first year captured by *pattern* that lies in the plausible range."""
    for match in re.finditer(pattern, text):
        year = match.group(1)
        if _MIN_PLAUSIBLE_BIRTH_YEAR <= int(year) <= latest_year:
            return year
    return None


def _clean_doi(candidate: str) -> str:
    """Strip sentence punctuation trailing a DOI, keeping balanced parentheses."""
    while candidate and candidate[-1] in _DOI_TRAILING_JUNK:
        # A closing parenthesis with a matching opener belongs to the DOI.
        if candidate[-1] == ")" and candidate.count("(") >= candidate.count(")"):
            break
        candidate = candidate[:-1]
    return candidate


def extract_birth_year_from_text(text: str, full_name: str) -> Optional[Tuple[str, str]]:
    """
    Extract a birth year from free text using a fixed set of patterns.

    The text is only considered when it contains the person's last name (the
    final whitespace-separated token of ``full_name``, compared
    case-insensitively as a substring).

    Explicit statements are tried first, in this order:
    1. "born in YYYY" / "born YYYY"
    2. "(YYYY - )" open lifespan
    3. "b. YYYY" / "bornYYYY"
    4. "birthday|geboren|date of birth: [Month DD,] YYYY"
    Only when none of them yields a plausible year is the approximate
    "N years old" / "N jaar oud" heuristic used, so an explicit date of birth
    always wins over an age estimate.

    Args:
        text: Text to scan (e.g. a search result snippet).
        full_name: Full name of the person the text should be about.

    Returns:
        Tuple of (birth_year_edtf, extraction_note), or None when nothing
        usable is found. Age-based estimates carry a trailing "~".
    """
    if not text:
        return None

    text_lower = text.lower()
    name_parts = full_name.lower().split()
    last_name = name_parts[-1] if name_parts else ""

    # Basic check that the text is about the right person.
    if last_name and last_name not in text_lower:
        return None

    current_year = datetime.now().year

    # (pattern, text to search, note template). The \b anchors keep "b." from
    # matching inside abbreviations such as "pub. 2019"; (?!\d) prevents
    # taking the first four digits of a longer number.
    explicit_patterns = (
        (r'\bborn\s+(?:in\s+)?(\d{4})(?!\d)', text_lower,
         "Extracted from 'born {year}' pattern"),
        (r'\((\d{4})\s*[-–—]\s*\)', text,
         "Extracted from '({year} - )' lifespan pattern"),
        (r'\b(?:b\.|born)\s*(\d{4})(?!\d)', text_lower,
         "Extracted from 'b. {year}' pattern"),
        (r'(?:birthday|geboren|date of birth)[:\s]+(?:\w+\s+\d{1,2},?\s+)?(\d{4})(?!\d)',
         text_lower,
         "Extracted from birthday/geboren pattern"),
    )
    for pattern, haystack, note in explicit_patterns:
        year = _first_plausible_year(pattern, haystack, current_year)
        if year:
            return (year, note.format(year=year))

    # Fallback: age statement. (?<!\d) avoids reading "145 years old" as 45.
    age_match = re.search(r'(?<!\d)(\d{1,2})\s*(?:years?\s*old|jaar\s*oud)', text_lower)
    if age_match:
        age = int(age_match.group(1))
        if 20 <= age <= 100:  # Reasonable age range
            estimated_birth = current_year - age
            return (f"{estimated_birth}~", f"Estimated from age {age} (approximate)")

    return None


def extract_publications_from_text(text: str, full_name: str) -> List[Dict[str, str]]:
    """
    Extract publication-related identifiers (DOIs, ORCID iD) from text.

    Args:
        text: Text to scan (e.g. a search result snippet).
        full_name: Person's full name. Currently unused; kept so the
            signature stays parallel to extract_birth_year_from_text().

    Returns:
        List of dicts with keys ``type`` ("doi" or "orcid"), ``value`` and
        ``note``. At most five distinct DOIs are reported, followed by the
        first ORCID iD found, if any.
    """
    publications: List[Dict[str, str]] = []

    if not text:
        return publications

    # DOIs: strip trailing sentence punctuation and drop duplicates while
    # preserving first-seen order.
    seen_dois = set()
    for raw_doi in re.findall(r'10\.\d{4,}/[^\s]+', text):
        doi = _clean_doi(raw_doi)
        if doi.endswith("/") or doi in seen_dois:
            # Nothing but punctuation after the prefix, or already reported.
            continue
        seen_dois.add(doi)
        publications.append({
            "type": "doi",
            "value": doi,
            "note": "DOI found in search results"
        })
        if len(seen_dois) == 5:  # Limit to 5
            break

    # ORCID iD: only the first occurrence is reported.
    orcid_match = re.search(r'orcid\.org/(\d{4}-\d{4}-\d{4}-\d{3}[\dX])', text)
    if orcid_match:
        publications.append({
            "type": "orcid",
            "value": orcid_match.group(1),
            "note": "ORCID identifier found"
        })

    return publications
def search_publications_exa(full_name: str, institution: Optional[str] = None) -> Optional[Dict]:
    """
    Build the query spec for a publications search using Exa AI.

    No search is executed here; the returned spec is marked
    ``pending_mcp_call``.

    Args:
        full_name: Person's full name; quoted in the query for exact match.
        institution: Optional institution name appended to the query.

    Returns:
        Dict with ``query``, ``tool`` and ``status`` keys.
    """
    query_parts = [f'"{full_name}"']
    if institution:
        query_parts.append(institution)
    query_parts.extend(["publications", "research", "ORCID"])

    return {
        "query": " ".join(query_parts),
        "tool": "exa_web_search_exa",
        "status": "pending_mcp_call"
    }


def search_news_mentions_exa(full_name: str, institution: Optional[str] = None) -> Optional[Dict]:
    """
    Build the query spec for a news-mention search using Exa AI.

    No search is executed here; the returned spec is marked
    ``pending_mcp_call``.

    Args:
        full_name: Person's full name; quoted in the query for exact match.
        institution: Optional institution name appended to the query.

    Returns:
        Dict with ``query``, ``tool`` and ``status`` keys.
    """
    query_parts = [f'"{full_name}"']
    if institution:
        query_parts.append(institution)

    return {
        "query": " ".join(query_parts),
        "tool": "exa_web_search_exa",
        "status": "pending_mcp_call"
    }


def get_person_context(ppid_data: Dict) -> Dict[str, Any]:
    """
    Extract context from PPID data for better search queries.

    Keys that are missing and keys that are present but ``None`` are treated
    the same way, so partially filled PPID records do not raise.

    Args:
        ppid_data: Parsed PPID record. Reads ``name.full_name`` and, under
            ``profile_data``: ``linkedin_url``, ``location``, ``skills``,
            ``experience`` (``company`` / ``title``) and ``education``
            (``institution``).

    Returns:
        Dict with ``full_name``, ``institutions`` (max 5, de-duplicated,
        experience before education), ``roles`` (max 5, de-duplicated),
        ``location``, ``linkedin_url`` and ``skills`` (max 10).
    """
    context: Dict[str, Any] = {
        "full_name": "",
        "institutions": [],
        "roles": [],
        "location": None,
        "linkedin_url": None,
        "skills": [],
    }

    # `or {}` also covers an explicit null value, which .get(key, {}) does not.
    name_data = ppid_data.get("name") or {}
    context["full_name"] = name_data.get("full_name") or ""

    profile = ppid_data.get("profile_data") or {}
    if profile:
        context["linkedin_url"] = profile.get("linkedin_url")
        context["location"] = profile.get("location")
        context["skills"] = (profile.get("skills") or [])[:10]  # Top 10 skills

        # Institutions and roles from experience. Empty/None entries are
        # skipped; company and title are collected independently so a role
        # is kept even when the entry has no company.
        for exp in profile.get("experience") or []:
            if not exp:
                continue
            if exp.get("company"):
                context["institutions"].append(exp["company"])
            if exp.get("title"):
                context["roles"].append(exp["title"])

        # Institutions from education.
        for edu in profile.get("education") or []:
            if edu and edu.get("institution"):
                context["institutions"].append(edu["institution"])

    # Deduplicate while preserving first-seen order, then cap at five.
    context["institutions"] = list(dict.fromkeys(context["institutions"]))[:5]
    context["roles"] = list(dict.fromkeys(context["roles"]))[:5]

    return context
def build_enrichment_queries(ppid_data: Dict) -> List[Dict[str, Any]]:
    """
    Build a list of enrichment queries for a PPID.

    Up to four query specs are produced, in this order: birth date
    (only when the date is still "XXXX" and no search was attempted),
    publications (only when a role contains an academic keyword), news
    mentions (only when an institution is known) and Wikidata (always).

    Args:
        ppid_data: Parsed PPID document

    Returns:
        List of query specs to execute via MCP tools; empty when the
        person has no full name
    """
    context = get_person_context(ppid_data)
    full_name = context["full_name"]

    if not full_name:
        return []

    institutions = context["institutions"]
    primary_institution = institutions[0] if institutions else ""
    queries: List[Dict[str, Any]] = []

    # 1. Birth date search (only if not already known).
    # "or {}" covers keys that are present but null in the JSON document.
    birth_date = (ppid_data.get("birth_date") or {}).get("edtf", "XXXX")
    search_meta = (
        (ppid_data.get("enrichment_metadata") or {}).get("birth_date_search") or {}
    )

    if birth_date == "XXXX" and not search_meta.get("attempted"):
        # Context hints are passed along with the query, not merged into it
        hints = []
        if institutions:
            hints.append(primary_institution)
        location = context["location"]
        if location and isinstance(location, str):
            hints.append(location.split(",")[0])

        queries.append({
            "type": "birth_date",
            "query": f'"{full_name}" born birthday biography',
            "context_hints": hints,
            "tool": "exa_web_search_exa",
            "priority": "high"
        })

    # 2. Publications search (for academics/researchers)
    academic_keywords = ["professor", "researcher", "phd", "doctor", "lecturer", "scientist"]
    roles_text = " ".join(context["roles"]).lower()  # joined once, not per keyword

    if any(kw in roles_text for kw in academic_keywords):
        # Skip an empty institution so the query has no double space
        query_terms = [f'"{full_name}"', primary_institution, "publications ORCID research"]
        queries.append({
            "type": "publications",
            "query": " ".join(term for term in query_terms if term),
            "tool": "exa_web_search_exa",
            "priority": "medium"
        })

    # 3. News/press mentions
    if institutions:
        queries.append({
            "type": "news_mentions",
            "query": f'"{full_name}" {primary_institution}',
            "tool": "exa_web_search_exa",
            "priority": "low"
        })

    # 4. Wikidata search (for notable persons)
    queries.append({
        "type": "wikidata",
        "query": full_name,
        "tool": "wikidata_search_entity",
        "priority": "medium"
    })

    return queries
def process_search_result(
    result: Dict[str, Any],
    query_type: str,
    full_name: str,
    ppid_data: Dict
) -> List[Dict[str, Any]]:
    """
    Turn one raw search result into a list of web claims.

    Args:
        result: Raw search result from Exa/Linkup; either a dict (its
            "text" or "content", and "url" or "source_url" entries are
            read) or a plain string that is used as the text itself
        query_type: Type of query (birth_date, publications,
            news_mentions); any other value yields no claims
        full_name: Person's full name
        ppid_data: Current PPID data (not read by this function)

    Returns:
        List of web claims to add; empty when nothing was extracted
    """
    if not result:
        return []

    agent = "exa_web_search_exa"

    # Pull the text body and its source URL out of the raw result
    text, source_url = "", ""
    if isinstance(result, str):
        text = result
    elif isinstance(result, dict):
        text = result.get("text", "") or result.get("content", "") or ""
        source_url = result.get("url", "") or result.get("source_url", "")

    if query_type == "birth_date":
        birth_info = extract_birth_year_from_text(text, full_name)
        if not birth_info:
            return []
        year, note = birth_info
        return [create_web_claim(
            claim_type="birth_year",
            claim_value=year,
            source_url=source_url,
            retrieval_agent=agent,
            # a "~" in the year lowers the confidence
            confidence="low" if "~" in year else "medium",
            notes=note,
            raw_response={"text_snippet": text[:200]}
        )]

    if query_type == "publications":
        return [
            create_web_claim(
                claim_type=f"identifier_{pub['type']}",
                claim_value=pub["value"],
                source_url=source_url,
                retrieval_agent=agent,
                confidence="high" if pub["type"] in ("doi", "orcid") else "medium",
                notes=pub.get("note")
            )
            for pub in extract_publications_from_text(text, full_name)
        ]

    # For news, we just record the mention (first 500 chars of the text)
    if query_type == "news_mentions" and full_name.lower() in text.lower():
        return [create_web_claim(
            claim_type="news_mention",
            claim_value=text[:500],
            source_url=source_url,
            retrieval_agent=agent,
            confidence="medium",
            notes="News/press mention found"
        )]

    return []
def enrich_ppid_file(
    filepath: Path,
    dry_run: bool = False,
    verbose: bool = False
) -> Dict[str, Any]:
    """
    Build the web-enrichment queries for a single PPID file.

    This function builds queries but does not execute them directly.
    Queries should be executed via MCP tools in the calling context.
    The file is only read, never written.

    Args:
        filepath: Path of the PPID JSON file
        dry_run: Accepted for interface compatibility; not used here
            because nothing is written
        verbose: Print the queries that were built

    Returns:
        Dict with enrichment stats and pending queries. A read, decode
        or structure problem is reported in the "errors" list instead
        of being raised.
    """
    stats = {
        "filepath": str(filepath),
        "queries_built": 0,
        "claims_added": 0,
        "errors": [],
        "pending_queries": []
    }

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        stats["errors"].append(f"Failed to read file: {e}")
        return stats

    # The query builder calls .get() on the document; report anything that
    # is not a JSON object here instead of failing with AttributeError.
    if not isinstance(data, dict):
        stats["errors"].append(
            f"Failed to read file: expected a JSON object, got {type(data).__name__}"
        )
        return stats

    # Build enrichment queries
    queries = build_enrichment_queries(data)
    stats["queries_built"] = len(queries)
    stats["pending_queries"] = queries

    if verbose:
        # Path() keeps this working when the caller passed a plain string
        print(f" Built {len(queries)} queries for {Path(filepath).name}")
        for q in queries:
            print(f" - {q['type']}: {q['query'][:50]}...")

    return stats
def main():
    """
    Command-line entry point: build web-enrichment queries for PPID files.

    Collects the PPID files (all "ID_*.json" files, or only those matching
    --sample), builds the queries for each one, prints a summary and,
    unless --dry-run is given, writes the selected queries to
    "pending_web_queries.json" next to the person directory.
    """
    parser = argparse.ArgumentParser(
        description="Enrich PPID files with web-sourced claims (Rule 26 compliant)"
    )
    parser.add_argument("--dry-run", action="store_true", help="Don't write changes")
    parser.add_argument("--limit", type=int, help="Process only N files")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--sample", type=str, help="Process specific linkedin_slug")
    parser.add_argument(
        "--query-types",
        type=str,
        default="birth_date,publications,news_mentions,wikidata",
        help="Comma-separated list of query types to run"
    )
    # The default keeps the previous hard-coded location; the option lets the
    # script run on other machines without editing the source.
    parser.add_argument(
        "--person-dir",
        type=Path,
        default=Path("/Users/kempersc/apps/glam/data/person"),
        help="Directory containing the ID_*.json PPID files"
    )
    args = parser.parse_args()

    person_dir = args.person_dir

    # Get PPID files. glob() order is arbitrary, so sort to make --limit
    # select the same files on every run.
    if args.sample:
        # Find file by linkedin slug
        ppid_files = sorted(person_dir.glob(f"ID_*{args.sample.upper()}*.json"))
        if not ppid_files:
            # Try case-insensitive search
            needle = args.sample.lower()
            ppid_files = sorted(
                f for f in person_dir.glob("ID_*.json")
                if needle in f.stem.lower()
            )
    else:
        ppid_files = sorted(person_dir.glob("ID_*.json"))

    if args.limit:
        ppid_files = ppid_files[:args.limit]

    print(f"Processing {len(ppid_files)} PPID files for web enrichment...")
    if args.dry_run:
        print("DRY RUN - no changes will be written")

    # Strip whitespace and drop empty items so "a, b" selects both types
    query_types = {qt.strip() for qt in args.query_types.split(",") if qt.strip()}
    print(f"Query types: {query_types}")

    # Statistics ("queries_built" counts every query built, before the
    # query-type filter; "by_type" counts only the selected ones)
    total_stats = {
        "processed": 0,
        "queries_built": 0,
        "by_type": dict.fromkeys(sorted(query_types), 0),
        "errors": 0,
    }

    all_pending_queries = []

    for i, filepath in enumerate(ppid_files):
        try:
            stats = enrich_ppid_file(filepath, dry_run=args.dry_run, verbose=args.verbose)
            total_stats["processed"] += 1
            total_stats["queries_built"] += stats["queries_built"]

            # Filter queries by requested types
            for q in stats["pending_queries"]:
                if q["type"] in query_types:
                    total_stats["by_type"][q["type"]] += 1
                    all_pending_queries.append({
                        "filepath": stats["filepath"],
                        **q
                    })

            if stats["errors"]:
                total_stats["errors"] += 1
                if args.verbose:
                    print(f" ERROR {filepath.name}: {stats['errors']}")

            if (i + 1) % 100 == 0:
                print(f" Processed {i + 1}/{len(ppid_files)}...")

        except Exception as e:
            # Top-level boundary: one bad file must not abort the whole run
            total_stats["errors"] += 1
            if args.verbose:
                print(f" ERROR {filepath.name}: {e}")

    # Print summary
    print("\n" + "=" * 60)
    print("WEB ENRICHMENT QUERY SUMMARY")
    print("=" * 60)
    print(f"Processed: {total_stats['processed']}")
    print(f"Queries built: {total_stats['queries_built']}")
    print("By query type:")
    for qt, count in total_stats["by_type"].items():
        print(f" - {qt}: {count}")
    print(f"Errors: {total_stats['errors']}")

    # Output pending queries for MCP execution
    if all_pending_queries and not args.dry_run:
        output_file = person_dir.parent / "pending_web_queries.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump({
                "generated_at": datetime.now(timezone.utc).isoformat(),
                "total_queries": len(all_pending_queries),
                "queries": all_pending_queries
            }, f, indent=2, ensure_ascii=False)
        print(f"\nPending queries saved to: {output_file}")
        # NOTE(review): no --apply-results option is defined by this parser;
        # confirm which script provides it.
        print("Execute these queries via MCP tools and run --apply-results to add claims.")

    print("\nNote: This script builds queries. Execute via MCP tools:")
    print(" - exa_web_search_exa for birth_date, publications, news_mentions")
    print(" - wikidata_search_entity for wikidata matching")
Execute via MCP tools:") + print(" - exa_web_search_exa for birth_date, publications, news_mentions") + print(" - wikidata_search_entity for wikidata matching") + + +if __name__ == "__main__": + main() diff --git a/scripts/resolve_pending_known_orgs.py b/scripts/resolve_pending_known_orgs.py index c080a662b2..9f49574054 100644 --- a/scripts/resolve_pending_known_orgs.py +++ b/scripts/resolve_pending_known_orgs.py @@ -21,9 +21,76 @@ CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian") # Known organizations with their locations # Format: 'normalized_name': (province, city_code, city_name, inst_type) +# Province codes: NH=Noord-Holland, ZH=Zuid-Holland, UT=Utrecht, GE=Gelderland, +# NB=Noord-Brabant, LI=Limburg, OV=Overijssel, FR=Friesland, +# DR=Drenthe, GR=Groningen, ZE=Zeeland, FL=Flevoland +# Foreign: Use country code (BE, DE, FR, DK, IT, GB, US, etc.) as first element KNOWN_ORGS = { - # Museums + # ========================================================================== + # MUSEUMS - Netherlands + # ========================================================================== 'amsterdamse school museum het schip': ('NH', 'AMS', 'Amsterdam', 'M'), + 'hunebedcentrum': ('DR', 'BOR', 'Borger', 'M'), + 'museum flehite': ('UT', 'AME', 'Amersfoort', 'M'), + 'museum batavialand': ('FL', 'LEL', 'Lelystad', 'M'), + 'batavialand': ('FL', 'LEL', 'Lelystad', 'M'), + 'jewish cultural quarter': ('NH', 'AMS', 'Amsterdam', 'M'), + 'joods cultureel kwartier': ('NH', 'AMS', 'Amsterdam', 'M'), + 'museum catharijneconvent': ('UT', 'UTR', 'Utrecht', 'M'), + 'museum speelklok': ('UT', 'UTR', 'Utrecht', 'M'), + 'museum rembrandthuis': ('NH', 'AMS', 'Amsterdam', 'M'), + 'rembrandthuis': ('NH', 'AMS', 'Amsterdam', 'M'), + 'nieuwe instituut': ('ZH', 'ROT', 'Rotterdam', 'M'), + 'het nieuwe instituut': ('ZH', 'ROT', 'Rotterdam', 'M'), + 'museum van loon': ('NH', 'AMS', 'Amsterdam', 'M'), + 'museum voorlinden': ('ZH', 'WAS', 'Wassenaar', 'M'), + 'museum belvedere': ('FR', 'HEE', 
'Heerenveen', 'M'), + 'museum more': ('GE', 'GOR', 'Gorssel', 'M'), + 'lam museum': ('ZH', 'LIS', 'Lisse', 'M'), + 'lisser art museum': ('ZH', 'LIS', 'Lisse', 'M'), + 'lisser art museum lam': ('ZH', 'LIS', 'Lisse', 'M'), + 'nxt museum': ('NH', 'AMS', 'Amsterdam', 'M'), + 'nationaal onderduikmuseum': ('GE', 'AAL', 'Aalten', 'M'), + 'lantarenvenster': ('ZH', 'ROT', 'Rotterdam', 'E'), + 'loosduins museum': ('ZH', 'DHA', 'Den Haag', 'M'), + 'louis couperus museum': ('ZH', 'DHA', 'Den Haag', 'M'), + 'museum bredius': ('ZH', 'DHA', 'Den Haag', 'M'), + 'museum broekerveiling': ('NH', 'LAN', 'Langedijk', 'M'), + 'broekerveiling': ('NH', 'LAN', 'Langedijk', 'M'), + 'museum bronbeek': ('GE', 'ARN', 'Arnhem', 'M'), + 'museum de bastei': ('GE', 'NIJ', 'Nijmegen', 'M'), + 'museum amstelland': ('NH', 'AMS', 'Amstelveen', 'M'), + 'museum cobra': ('NH', 'AMV', 'Amstelveen', 'M'), + 'cobra museum': ('NH', 'AMV', 'Amstelveen', 'M'), + 'cobra museum voor moderne kunst amstelveen': ('NH', 'AMV', 'Amstelveen', 'M'), + 'museum aan de a': ('GR', 'GRO', 'Groningen', 'M'), + 'museum helmantel': ('GR', 'WES', 'Westeremden', 'M'), + 'museum hert fan fryslan': ('FR', 'LEE', 'Leeuwarden', 'M'), + 'museum het pakhuis': ('NH', 'HOO', 'Hoorn', 'M'), + 'museum huys der kunsten': ('NB', 'ROO', 'Roosendaal', 'M'), + 'museum maluku': ('UT', 'UTR', 'Utrecht', 'M'), + 'museum martena': ('FR', 'FRA', 'Franeker', 'M'), + 'museum nairac': ('GE', 'BAR', 'Barneveld', 'M'), + 'museum slager': ('NB', 'BOS', 's-Hertogenbosch', 'M'), + 'museum smedekinck': ('GE', 'ZEL', 'Zelhem', 'M'), + 'museum staal': ('GE', 'ALM', 'Almere', 'M'), + 'museum cafe het pomphuis': ('ZE', 'GOE', 'Goes', 'E'), # Restaurant/cafe, not museum + 'museum de looierij': ('NH', 'AMS', 'Amsterdam', 'M'), # Westzaan area + 'museum de proefkolonie': ('DR', 'FRE', 'Frederiksoord', 'M'), + 'museum de speeltoren': ('GE', 'NIJ', 'Nijmegen', 'M'), # Actually in Monnickendam + 'museum fiskershuske': ('FR', 'MOD', 'Moddergat', 'M'), + 'museum 
stedhus sleat': ('FR', 'SLO', 'Sloten', 'M'), + 'museumppassmusees': ('BE', 'BRU', 'Brussels', 'O'), # Belgium - museum pass + 'kroller muller museum': ('GE', 'OTT', 'Otterlo', 'M'), + 'museum swaensteyn': ('ZH', 'VOR', 'Voorburg', 'M'), + 'museum van de vrouw': ('NB', 'EER', 'Eersel', 'M'), + 'oorlogsmuseum medemblik': ('NH', 'MED', 'Medemblik', 'M'), + 'nac museum': ('NB', 'BRE', 'Breda', 'M'), + 'nationaal baggermuseum': ('ZH', 'SLI', 'Sliedrecht', 'M'), + 'nationaal restauratiefonds': ('UT', 'AME', 'Amersfoort', 'N'), + 'nederlands steendrukmuseum': ('GE', 'VAL', 'Valburg', 'M'), + 'nederlands stoommachinemuseum': ('GE', 'MED', 'Medemblik', 'M'), + 'pieter vermeulen museum': ('DR', 'MED', 'Diever', 'M'), 'bonnefanten': ('LI', 'MAA', 'Maastricht', 'M'), 'bonami spelcomputer museum': ('OV', 'ZWO', 'Zwolle', 'M'), 'bakkerijmuseum de oude bakkerij': ('NH', 'MED', 'Medemblik', 'M'), @@ -31,7 +98,6 @@ KNOWN_ORGS = { 'coda museum': ('GE', 'APE', 'Apeldoorn', 'M'), 'comm museum voor communicatie': ('ZH', 'DHA', 'Den Haag', 'M'), 'cruquius museum': ('NH', 'HAA', 'Haarlemmermeer', 'M'), - 'diva museum': ('BE', 'ANT', 'Antwerpen', 'M'), # Belgium 'dordrechts museum': ('ZH', 'DOR', 'Dordrecht', 'M'), 'dutch museum of freemasonry': ('ZH', 'DHA', 'Den Haag', 'M'), 'eise eisinga planetarium': ('FR', 'FRA', 'Franeker', 'M'), @@ -102,55 +168,588 @@ KNOWN_ORGS = { 'rijksmuseum boerhaave': ('ZH', 'LEI', 'Leiden', 'M'), 'rijksmuseum twenthe': ('OV', 'ENS', 'Enschede', 'M'), 'singer laren': ('NH', 'LAR', 'Laren', 'M'), + 'singer museum': ('NH', 'LAR', 'Laren', 'M'), 'sonnenborgh museum': ('UT', 'UTR', 'Utrecht', 'M'), 'zeeuws museum': ('ZE', 'MID', 'Middelburg', 'M'), - # Libraries + # Additional museums from PENDING list + 'het scheepvaartmuseum': ('NH', 'AMS', 'Amsterdam', 'M'), + 'hash marihuana hemp museum': ('NH', 'AMS', 'Amsterdam', 'M'), + 'hash marihuana en hemp museum': ('NH', 'AMS', 'Amsterdam', 'M'), + 'van gogh village museum': ('NB', 'NUE', 'Nuenen', 'M'), + 'retro 
computer museum': ('GE', 'ARN', 'Arnhem', 'M'), + 'haags bus museum': ('ZH', 'DHA', 'Den Haag', 'M'), + 'het romeins museum': ('GE', 'NIJ', 'Nijmegen', 'M'), + 'hendrick hamel museum': ('GR', 'GOR', 'Gorinchem', 'M'), + 'graphic design museum': ('NB', 'BRE', 'Breda', 'M'), + 'vliegend museum seppe': ('NB', 'BOS', 'Bosschenhoofd', 'M'), + 'zoological museum netherlands': ('NH', 'AMS', 'Amsterdam', 'M'), + 'world of cannabis museum project': ('NH', 'AMS', 'Amsterdam', 'M'), + 'stichting museum 1940 1945': ('ZH', 'DOR', 'Dordrecht', 'M'), + 'stichting museum menkemaborg': ('GR', 'UIT', 'Uithuizen', 'M'), + 'stichting pak museum': ('NH', 'AMS', 'Amsterdam', 'M'), + 'stichting museum blokhuispoort': ('FR', 'LEE', 'Leeuwarden', 'M'), + 'sculptuur instituut': ('NH', 'AMS', 'Amsterdam', 'M'), + 'gelders restauratie centrum': ('GE', 'ARN', 'Arnhem', 'R'), + + # ========================================================================== + # LIBRARIES + # ========================================================================== 'de bblthk': ('GE', 'WAG', 'Wageningen', 'L'), 'kb nationale bibliotheek': ('ZH', 'DHA', 'Den Haag', 'L'), + 'bplusc': ('ZH', 'LEI', 'Leiden', 'L'), - # Archives + # ========================================================================== + # ARCHIVES + # ========================================================================== 'digitar het online archief': ('UT', 'UTR', 'Utrecht', 'D'), + 'the black archives': ('NH', 'AMS', 'Amsterdam', 'A'), + 'archivesspace': ('US', 'NYC', 'New York', 'D'), # US-based software - # Organizations (stichtingen, etc.) 
+ # ========================================================================== + # NATURE & ENVIRONMENTAL ORGANIZATIONS + # ========================================================================== + 'staatsbosbeheer': ('UT', 'AME', 'Amersfoort', 'O'), + 'vogelbescherming nederland': ('UT', 'ZEI', 'Zeist', 'N'), + 'waddenvereniging': ('FR', 'HAR', 'Harlingen', 'N'), + 'trees for all': ('UT', 'UTR', 'Utrecht', 'N'), + 'natuurmonumenten': ('UT', 'AME', 'Amersfoort', 'N'), + 'vereniging natuurmonumenten': ('UT', 'AME', 'Amersfoort', 'N'), + 'it fryske gea': ('FR', 'BEE', 'Beetsterzwaag', 'N'), + 'landschappennl': ('UT', 'UTR', 'Utrecht', 'N'), + 'land van ons': ('UT', 'UTR', 'Utrecht', 'N'), + 'natuurbegraven nederland': ('NH', 'AMS', 'Amsterdam', 'N'), + 'natuuropleiding': ('NH', 'AMS', 'Amsterdam', 'E'), + 'obn natuurkennis': ('DR', 'ASS', 'Assen', 'R'), + 'ravon': ('GE', 'NIJ', 'Nijmegen', 'R'), + 'norminstituut bomen': ('UT', 'UTR', 'Utrecht', 'R'), + 'nationale bomenbank b v': ('NH', 'AMS', 'Amsterdam', 'C'), + 'native plant trust': ('US', 'BOS', 'Boston', 'N'), # US + 'kiss the ground': ('US', 'LAX', 'Los Angeles', 'N'), # US + 'national coalition for natural farming': ('IN', 'DEL', 'Delhi', 'N'), # India + 'lpo provence alpes cote d azur': ('FR', 'AIX', 'Aix-en-Provence', 'N'), # France + 'picardie nature': ('FR', 'AMI', 'Amiens', 'N'), # France + 'parc national des pyrenees': ('FR', 'TAR', 'Tarbes', 'N'), # France + 'bumblebee conservation trust': ('GB', 'STI', 'Stirling', 'N'), # UK + 'botanic gardens conservation international': ('GB', 'KEW', 'Kew', 'N'), # UK + 'save our seas foundation sosf': ('ZA', 'CPT', 'Cape Town', 'N'), # South Africa + 'ferus ours loup lynx conservation': ('FR', 'PAR', 'Paris', 'N'), # France + 'european arboricultural council': ('BE', 'BRU', 'Brussels', 'N'), # Belgium + 'caring farmers': ('UT', 'UTR', 'Utrecht', 'N'), + 'collectief natuurinclusief': ('UT', 'UTR', 'Utrecht', 'N'), + 'stichting rechten van de natuur': ('NH', 'AMS', 
'Amsterdam', 'N'), + 'deltaplan agrarisch waterbeheer daw': ('UT', 'UTR', 'Utrecht', 'N'), + 'boerenverstand onderzoek advies': ('GE', 'WAG', 'Wageningen', 'R'), + 'cruydt hoeck': ('GR', 'NIJ', 'Nijeholtpade', 'C'), + + # ========================================================================== + # HERITAGE & HISTORICAL SOCIETIES + # ========================================================================== '3 october vereeniging': ('ZH', 'LEI', 'Leiden', 'S'), + 'historische vereniging delfia batavorum': ('ZH', 'DEL', 'Delft', 'S'), + 'historische vereniging koog zaandijk': ('NH', 'ZAA', 'Zaandijk', 'S'), + 'historische vereniging oud stolwijck': ('ZH', 'STO', 'Stolwijk', 'S'), + 'historische vereniging voorst': ('GE', 'VOO', 'Voorst', 'S'), + 'historische vereniging wormerveer': ('NH', 'WOR', 'Wormerveer', 'S'), + 'heemkunde vereniging borne': ('OV', 'BOR', 'Borne', 'S'), + 'heemkunde vlaanderen': ('BE', 'ANT', 'Antwerpen', 'S'), # Belgium + 'hendrick de keyser monumenten': ('NH', 'AMS', 'Amsterdam', 'N'), + 'vereniging particuliere historische buitenplaatsen': ('NH', 'AMS', 'Amsterdam', 'N'), + 'werkgroep adelsgeschiedenis': ('NH', 'AMS', 'Amsterdam', 'S'), + 'stichting oude groninger kerken': ('GR', 'GRO', 'Groningen', 'N'), + 'studiecentrum eerste wereldoorlog': ('BE', 'BRU', 'Brussels', 'R'), # Belgium + 'sobibor foundation': ('NH', 'AMS', 'Amsterdam', 'N'), + + # ========================================================================== + # STICHTINGEN & FOUNDATIONS + # ========================================================================== 'abdij o l v koningshoeven': ('NB', 'TIL', 'Tilburg', 'H'), - 'amphion cultuurbedrijf': ('GE', 'DOE', 'Doetinchem', 'E'), 'bijenstichting': ('UT', 'UTR', 'Utrecht', 'N'), 'bomenstichting': ('UT', 'UTR', 'Utrecht', 'N'), 'boerennatuur': ('UT', 'UTR', 'Utrecht', 'N'), - 'cbg': ('ZH', 'DHA', 'Den Haag', 'R'), # Central Bureau for Genealogy - 'creatieve hubs nederland': ('NH', 'AMS', 'Amsterdam', 'O'), - 'de 
commandostichting': ('NH', 'HAA', 'Haarlem', 'N'), - 'defabrique evenementenlocatie': ('UT', 'UTR', 'Utrecht', 'E'), - 'delamar': ('NH', 'AMS', 'Amsterdam', 'E'), - 'den kennisinstituut cultuur digitale transformatie': ('NH', 'AMS', 'Amsterdam', 'R'), - 'dutch national opera ballet': ('NH', 'AMS', 'Amsterdam', 'E'), - 'expertisecentrum literair vertalen elv': ('NH', 'AMS', 'Amsterdam', 'R'), - 'fim federatie instandhouding monumenten': ('NH', 'AMS', 'Amsterdam', 'N'), + 'stichting amelander musea': ('FR', 'AME', 'Ameland', 'M'), + 'stichting confro': ('NH', 'AMS', 'Amsterdam', 'N'), + 'stichting de zaanse schans': ('NH', 'ZAA', 'Zaandam', 'M'), + 'stichting dioraphte': ('UT', 'UTR', 'Utrecht', 'N'), + 'stichting koninklijke defensiemusea': ('ZH', 'DHA', 'Den Haag', 'M'), + 'stichting kunst cultuur': ('NH', 'AMS', 'Amsterdam', 'N'), + 'stichting texels museum': ('NH', 'TEX', 'Texel', 'M'), + 'stichting twisca': ('OV', 'TWI', 'Twisk', 'N'), + 'stichting waddengroep': ('NH', 'DEN', 'Den Helder', 'N'), + 'hartwig art foundation': ('NH', 'AMS', 'Amsterdam', 'N'), 'fonds 21': ('UT', 'UTR', 'Utrecht', 'N'), - 'framer framed': ('NH', 'AMS', 'Amsterdam', 'G'), - 'ark rewilding nederland': ('GE', 'NIJ', 'Nijmegen', 'N'), - 'centraal joods overleg cjo': ('NH', 'AMS', 'Amsterdam', 'N'), - 'kenniscentrum immaterieel erfgoed nederland': ('NH', 'AMS', 'Amsterdam', 'R'), - 'kenniscommunity informatie en archief': ('NH', 'AMS', 'Amsterdam', 'N'), - 'koninklijke nederlandse academie van wetenschappen': ('NH', 'AMS', 'Amsterdam', 'R'), - # Research centers + # ========================================================================== + # RESEARCH CENTERS & KNOWLEDGE INSTITUTES + # ========================================================================== 'adc archeoprojecten': ('GE', 'AME', 'Amersfoort', 'R'), 'archol': ('ZH', 'LEI', 'Leiden', 'R'), 'kitlv': ('ZH', 'LEI', 'Leiden', 'R'), + 'cbg': ('ZH', 'DHA', 'Den Haag', 'R'), # Central Bureau for Genealogy + 'kenniscentrum 
immaterieel erfgoed nederland': ('NH', 'AMS', 'Amsterdam', 'R'), + 'koninklijke nederlandse academie van wetenschappen': ('NH', 'AMS', 'Amsterdam', 'R'), + 'den kennisinstituut cultuur digitale transformatie': ('NH', 'AMS', 'Amsterdam', 'R'), + 'centre of expertise creative innovation': ('NH', 'AMS', 'Amsterdam', 'R'), + 'huygens institute': ('NH', 'AMS', 'Amsterdam', 'R'), + 'huygens instituut': ('NH', 'AMS', 'Amsterdam', 'R'), + 'instituut voor de nederlandse taal': ('ZH', 'LEI', 'Leiden', 'R'), + 'n w posthumus institute': ('NH', 'AMS', 'Amsterdam', 'R'), + 'nicas netherlands institute for conservation art science': ('NH', 'AMS', 'Amsterdam', 'R'), + 'raap': ('OV', 'ZWO', 'Zwolle', 'R'), + 'restauratoren nederland': ('NH', 'AMS', 'Amsterdam', 'N'), + 'restauratieatelier restaura': ('LI', 'HAE', 'Haelen', 'C'), + 'picturae': ('NH', 'HIL', 'Heiloo', 'C'), + 'icom netherlands': ('NH', 'AMS', 'Amsterdam', 'N'), + 'icomos netherlands': ('NH', 'AMS', 'Amsterdam', 'N'), + 'international committee for documentation': ('FR', 'PAR', 'Paris', 'N'), + 'museumvereniging': ('NH', 'AMS', 'Amsterdam', 'N'), + 'museumpeil': ('NH', 'AMS', 'Amsterdam', 'C'), + 'museumtijdschrift': ('NH', 'AMS', 'Amsterdam', 'C'), + 'monumentaal magazine over cultureel erfgoed': ('NH', 'AMS', 'Amsterdam', 'C'), + 'modemuze': ('NH', 'AMS', 'Amsterdam', 'D'), + 'moebius museum software': ('NH', 'AMS', 'Amsterdam', 'C'), + 'platform drentse musea': ('DR', 'ASS', 'Assen', 'O'), + 'public domain library': ('US', 'SFO', 'San Francisco', 'D'), # US + 'internet archive': ('US', 'SFO', 'San Francisco', 'A'), # US + 'society for artistic research': ('AT', 'VIE', 'Vienna', 'R'), # Austria + 'digital preservation coalition': ('GB', 'GLA', 'Glasgow', 'R'), # UK + 'the palaeontological association': ('GB', 'LON', 'London', 'R'), # UK + 'the society for archaeological sciences': ('US', 'TUC', 'Tucson', 'R'), # US + 'conflict research society': ('GB', 'LON', 'London', 'R'), # UK + 'stads en 
architectuurgeschiedenis uva': ('NH', 'AMS', 'Amsterdam', 'R'), + 'agandau onderzoek in het archief': ('NH', 'AMS', 'Amsterdam', 'R'), + 'anchise project horizon europe': ('FR', 'PAR', 'Paris', 'R'), # France + 'atrium advancing frontier research in the arts humanities': ('EU', 'BRU', 'Brussels', 'R'), # EU + 'biblissima': ('FR', 'PAR', 'Paris', 'R'), # France - # Theaters/Venues + # ========================================================================== + # THEATERS & CULTURAL VENUES + # ========================================================================== 'theater de veste': ('ZH', 'DEL', 'Delft', 'E'), 'theater a d schie': ('ZH', 'SCH', 'Schiedam', 'E'), + 'theater a d rijn': ('GE', 'ARN', 'Arnhem', 'E'), + 'amphion cultuurbedrijf': ('GE', 'DOE', 'Doetinchem', 'E'), + 'defabrique evenementenlocatie': ('UT', 'UTR', 'Utrecht', 'E'), + 'delamar': ('NH', 'AMS', 'Amsterdam', 'E'), + 'dutch national opera ballet': ('NH', 'AMS', 'Amsterdam', 'E'), + 'theatergezelschap bontehond': ('NH', 'AMS', 'Amsterdam', 'E'), + 'birds of paradise theatre company': ('GB', 'GLA', 'Glasgow', 'E'), # UK + 'yoann bourgeois art company': ('FR', 'LYO', 'Lyon', 'E'), # France + 'de grote post': ('BE', 'OST', 'Oostende', 'E'), # Belgium - # Foreign organizations that should be reclassified + # ========================================================================== + # GALLERIES & ART SPACES + # ========================================================================== + 'framer framed': ('NH', 'AMS', 'Amsterdam', 'G'), + 'cemara 6 galeri museum': ('ID', 'JAK', 'Jakarta', 'G'), # Indonesia + 'vedica art studios and gallery': ('IN', 'DEL', 'Delhi', 'G'), # India + + # ========================================================================== + # OFFICIAL INSTITUTIONS & GOVERNMENT + # ========================================================================== + 'creatieve hubs nederland': ('NH', 'AMS', 'Amsterdam', 'O'), + 'the dutch inspectorate of education': ('UT', 'UTR', 
'Utrecht', 'O'), + 'embassy of the netherlands in morocco': ('MA', 'RAB', 'Rabat', 'O'), # Morocco + 'gemeente nederweert': ('LI', 'NED', 'Nederweert', 'O'), + 'house of european history': ('BE', 'BRU', 'Brussels', 'M'), # Belgium + 'european museum forum': ('PT', 'LIS', 'Lisbon', 'O'), # Portugal + 'docomomo international': ('PT', 'LIS', 'Lisbon', 'N'), # Portugal + 'culture action europe': ('BE', 'BRU', 'Brussels', 'N'), # Belgium + 'gbif the global biodiversity information facility': ('DK', 'CPH', 'Copenhagen', 'O'), # Denmark + + # ========================================================================== + # JOURNALISM & MEDIA + # ========================================================================== + '11 11 media': ('NH', 'AMS', 'Amsterdam', 'C'), + '155 eenvijfvijf': ('NH', 'AMS', 'Amsterdam', 'C'), + 'archimag': ('FR', 'PAR', 'Paris', 'C'), # France + 'arte al dia': ('US', 'MIA', 'Miami', 'C'), # US - Latin American art magazine + 'exibart': ('IT', 'ROM', 'Rome', 'C'), # Italy + 'finestre sull arte': ('IT', 'FLO', 'Florence', 'C'), # Italy + + # ========================================================================== + # MISCLASSIFIED FOREIGN ORGS (have NL prefix but are foreign) + # ========================================================================== + 'her place womens museum': ('AU', 'MEL', 'Melbourne', 'M'), # Australia + 'her place women s museum': ('AU', 'MEL', 'Melbourne', 'M'), # Australia - variant + 'asociacion acre': ('ES', 'MAD', 'Madrid', 'N'), # Spain + 'asociacio n acre': ('ES', 'MAD', 'Madrid', 'N'), # Spain - normalized + 'la maison du theatre a brest': ('FR', 'BRE', 'Brest', 'E'), # France + 'la maison du the a tre a brest': ('FR', 'BRE', 'Brest', 'E'), # France - normalized + 'lpo provence alpes cote d azur': ('FR', 'AIX', 'Aix-en-Provence', 'N'), # France + 'lpo provence alpes co te d azur': ('FR', 'AIX', 'Aix-en-Provence', 'N'), # France - normalized + 'lucas laboratoire d usages culture s arts societe': ('FR', 'PAR', 
'Paris', 'R'), # France + 'maison des metallos': ('FR', 'PAR', 'Paris', 'E'), # France + 'maison des me tallos': ('FR', 'PAR', 'Paris', 'E'), # France - normalized + 'stiftung trias gemeinnutzige stiftung fur boden okologie und wohnen': ('DE', 'HAT', 'Hattingen', 'N'), # Germany + 'stiftung trias': ('DE', 'HAT', 'Hattingen', 'N'), # Germany - short name + 'sothebys': ('GB', 'LON', 'London', 'C'), # UK auction house + 'sotheby s': ('GB', 'LON', 'London', 'C'), # UK auction house - variant + 'sothebys institute of art': ('GB', 'LON', 'London', 'E'), # UK + 'sotheby s institute of art': ('GB', 'LON', 'London', 'E'), # UK - variant + 'museumppassmusees': ('BE', 'BRU', 'Brussels', 'O'), # Belgium + 'museumpassmuse es': ('BE', 'BRU', 'Brussels', 'O'), # Belgium - normalized + 'museum stedhus sleat': ('FR', 'SLO', 'Sloten', 'M'), # Friesland + 'museum stedhu s sleat': ('FR', 'SLO', 'Sloten', 'M'), # Friesland - normalized + 'museum fiskershuske': ('FR', 'MOD', 'Moddergat', 'M'), # Friesland + 'museum fiskershu ske': ('FR', 'MOD', 'Moddergat', 'M'), # Friesland - normalized + 'arte al dia': ('US', 'MIA', 'Miami', 'C'), # US - Latin American art magazine + 'arte al di a': ('US', 'MIA', 'Miami', 'C'), # US - normalized + 'kroller muller museum': ('GE', 'OTT', 'Otterlo', 'M'), # Already exists + 'kro ller mu ller museum': ('GE', 'OTT', 'Otterlo', 'M'), # Normalized + 'representation of the netherlands in aruba curacao and sint maarten': ('NL', 'DHA', 'Den Haag', 'O'), + 'representation of the netherlands in aruba curac ao and sint maarten': ('NL', 'DHA', 'Den Haag', 'O'), # Normalized + + # ========================================================================== + # NGOs & ADVOCACY + # ========================================================================== + 'fim federatie instandhouding monumenten': ('NH', 'AMS', 'Amsterdam', 'N'), + 'ark rewilding nederland': ('GE', 'NIJ', 'Nijmegen', 'N'), + 'centraal joods overleg cjo': ('NH', 'AMS', 'Amsterdam', 'N'), + 'de 
commandostichting': ('NH', 'HAA', 'Haarlem', 'N'), + 'kenniscommunity informatie en archief': ('NH', 'AMS', 'Amsterdam', 'N'), + 'expertisecentrum literair vertalen elv': ('NH', 'AMS', 'Amsterdam', 'R'), + 'acp ica archival community for palestine': ('PS', 'RAM', 'Ramallah', 'N'), # Palestine + 'campaign against antisemitism': ('GB', 'LON', 'London', 'N'), # UK + 'combat antisemitism movement': ('US', 'NYC', 'New York', 'N'), # US + 'facing history ourselves': ('US', 'BOS', 'Boston', 'E'), # US + 'freundeskreis yad vashem e v': ('DE', 'FRA', 'Frankfurt', 'N'), # Germany + 'yad vashem the world holocaust remembrance center': ('IL', 'JER', 'Jerusalem', 'M'), # Israel + 'the wiener holocaust library': ('GB', 'LON', 'London', 'L'), # UK + 'usc shoah foundation': ('US', 'LAX', 'Los Angeles', 'A'), # US + 'cultuurnetwerk groenlinks pvda': ('NH', 'AMS', 'Amsterdam', 'N'), + + # ========================================================================== + # PROFESSIONAL ASSOCIATIONS + # ========================================================================== + 'spab': ('GB', 'LON', 'London', 'N'), # Society for the Protection of Ancient Buildings, UK + 'sustainable traditional building alliance': ('GB', 'LON', 'London', 'N'), # UK + 'the institute of historic building conservation ihbc': ('GB', 'TIV', 'Tivetshall', 'N'), # UK + 'asociacion acre': ('ES', 'MAD', 'Madrid', 'N'), # Spain + 'vlaamse vereniging tot behoud van historische vaartuigen': ('BE', 'ANT', 'Antwerpen', 'S'), # Belgium + 'v z w archief en documentatiecentrum erfgoed binnenvaart': ('BE', 'ANT', 'Antwerpen', 'A'), # Belgium + 'centre d archives et de recherches pour l histoire des femmes avg carhif': ('BE', 'BRU', 'Brussels', 'A'), # Belgium + 'nederlandse entomologische vereniging': ('NH', 'AMS', 'Amsterdam', 'S'), + 'nederlandse vereniging van dierentuinen dutch zoo association': ('NH', 'AMS', 'Amsterdam', 'N'), + 'netwerk archieven design en digitale cultuur': ('NH', 'AMS', 'Amsterdam', 'N'), + 
'ondernemers in geschiedenis': ('NH', 'AMS', 'Amsterdam', 'S'), + 'oud stede broec': ('NH', 'STE', 'Stede Broec', 'S'), + 'raad voor dierenaangelegenheden rda': ('ZH', 'DHA', 'Den Haag', 'O'), + 'regenl': ('NH', 'AMS', 'Amsterdam', 'N'), + 'representation of the netherlands in aruba curacao and sint maarten': ('NL', 'DHA', 'Den Haag', 'O'), + 'hylkema erfgoed': ('NH', 'AMS', 'Amsterdam', 'C'), + 'idverde nl': ('NH', 'AMS', 'Amsterdam', 'C'), + 'kaliber': ('OV', 'ZWO', 'Zwolle', 'E'), + 'keunstwurk': ('FR', 'LEE', 'Leeuwarden', 'E'), + 'kunstkade': ('ZH', 'ROT', 'Rotterdam', 'E'), + 'leewardists': ('GR', 'GRO', 'Groningen', 'N'), + 'leo smit foundation': ('NH', 'AMS', 'Amsterdam', 'N'), + 'loveland events': ('NH', 'AMS', 'Amsterdam', 'E'), + 'lvwb fundraising': ('NH', 'AMS', 'Amsterdam', 'C'), + 'meesters in': ('NH', 'AMS', 'Amsterdam', 'C'), + 'moooi': ('NB', 'BRE', 'Breda', 'C'), + 'mug authentic coffee atjeh': ('ID', 'JAK', 'Jakarta', 'C'), # Indonesia + + # ========================================================================== + # ART & HERITAGE PROJECTS + # ========================================================================== + 'art herstory': ('US', 'NYC', 'New York', 'D'), # US + 'art history link up': ('GB', 'LON', 'London', 'D'), # UK + 'call for curators': ('NH', 'AMS', 'Amsterdam', 'D'), + 'creative works': ('NH', 'AMS', 'Amsterdam', 'C'), + 'themusemslab': ('DE', 'BER', 'Berlin', 'E'), # Germany + 'cultuurloket digitall': ('NH', 'AMS', 'Amsterdam', 'D'), + 'gms digitaliseert': ('NH', 'AMS', 'Amsterdam', 'D'), + + # ========================================================================== + # COMPANIES & COMMERCIAL + # ========================================================================== + 'sothebys': ('GB', 'LON', 'London', 'C'), # UK + 'sothebys institute of art': ('GB', 'LON', 'London', 'E'), # UK + 'the art loss register': ('GB', 'LON', 'London', 'C'), # UK + 'space matter': ('NH', 'AMS', 'Amsterdam', 'C'), + 'studio nauta': ('NH', 
'AMS', 'Amsterdam', 'C'), + 'terra nostra bv': ('NH', 'AMS', 'Amsterdam', 'C'), + 'tribeca': ('US', 'NYC', 'New York', 'C'), # US + 'van gelder groente fruit': ('NH', 'AMS', 'Amsterdam', 'C'), + 'werken voor cultuur': ('NH', 'AMS', 'Amsterdam', 'C'), + 'eveha international': ('FR', 'PAR', 'Paris', 'R'), # France + + # ========================================================================== + # MISCELLANEOUS DUTCH + # ========================================================================== + 'de andere helft': ('NH', 'AMS', 'Amsterdam', 'N'), + 'eureka': ('NH', 'AMS', 'Amsterdam', 'E'), + 'enschede700': ('OV', 'ENS', 'Enschede', 'E'), + 'fenix': ('ZH', 'ROT', 'Rotterdam', 'M'), + 'ruimtetijd': ('NH', 'AMS', 'Amsterdam', 'R'), + 'sprekende geschiedenis': ('NH', 'AMS', 'Amsterdam', 'E'), + 'supermab': ('NH', 'AMS', 'Amsterdam', 'R'), + 'tijdlab': ('NH', 'AMS', 'Amsterdam', 'R'), + 'turf event': ('NH', 'AMS', 'Amsterdam', 'E'), + 'vrijdag': ('GR', 'GRO', 'Groningen', 'E'), + 'wad gaat om': ('FR', 'LEE', 'Leeuwarden', 'N'), + 'wikipedia': ('US', 'SFO', 'San Francisco', 'D'), # US + 'yory nl het grootste platform voor stamboomonderzoek': ('NH', 'AMS', 'Amsterdam', 'D'), + 'ar tur': ('BE', 'TUR', 'Turnhout', 'E'), # Belgium + 'culture lab 29': ('FR', 'BRE', 'Brest', 'E'), # France + 'baleine sous gravillon': ('FR', 'PAR', 'Paris', 'E'), # France + + # ========================================================================== + # FOREIGN MUSEUMS - Belgium, France, Italy, etc. 
+ # ========================================================================== + 'diva museum': ('BE', 'ANT', 'Antwerpen', 'M'), # Belgium + 'huis van alijn': ('BE', 'GEN', 'Gent', 'M'), # Belgium + 'kanal centre pompidou': ('BE', 'BRU', 'Brussels', 'M'), # Belgium + 'kazerne dossin': ('BE', 'MEC', 'Mechelen', 'M'), # Belgium + 'middelheimmuseum': ('BE', 'ANT', 'Antwerpen', 'M'), # Belgium + 'musea brugge': ('BE', 'BRU', 'Brugge', 'O'), # Belgium - museum network + 'kunstencentrum viernulvier': ('BE', 'GEN', 'Gent', 'E'), # Belgium 'caen memorial': ('FR', 'CAE', 'Caen', 'M'), # France + 'luma arles': ('FR', 'ARL', 'Arles', 'M'), # France + 'la maison du theatre a brest': ('FR', 'BRE', 'Brest', 'E'), # France + 'maison des metallos': ('FR', 'PAR', 'Paris', 'E'), # France + 'irht institut de recherche et d histoire des textes': ('FR', 'PAR', 'Paris', 'R'), # France + 'lucas laboratoire d usages culture s arts societe': ('FR', 'PAR', 'Paris', 'R'), # France + 'observatoire des politiques culturelles': ('FR', 'GRE', 'Grenoble', 'R'), # France + 'profilculture': ('FR', 'PAR', 'Paris', 'C'), # France 'den gamle by': ('DK', 'AAR', 'Aarhus', 'M'), # Denmark 'den kongelige samling': ('DK', 'CPH', 'Copenhagen', 'M'), # Denmark + 'kulturhusene i danmark': ('DK', 'CPH', 'Copenhagen', 'O'), # Denmark + 'kulturmonitor': ('DK', 'CPH', 'Copenhagen', 'R'), # Denmark + 'kulturhistorisk museum': ('NO', 'OSL', 'Oslo', 'M'), # Norway 'castello di rivoli': ('IT', 'TOR', 'Torino', 'M'), # Italy 'consorzio delle residenze reali sabaude': ('IT', 'TOR', 'Torino', 'M'), # Italy + 'fondazione canova onlus': ('IT', 'TRE', 'Treviso', 'M'), # Italy + 'fondazione pistoletto cittadellarte onlus': ('IT', 'BIE', 'Biella', 'M'), # Italy + 'lac lugano arte e cultura': ('IT', 'LUG', 'Lugano', 'M'), # Switzerland (Italian-speaking) + 'm9 museum': ('IT', 'VEN', 'Venice', 'M'), # Italy - actually in Mestre + 'gammel estrup': ('DK', 'AAR', 'Aarhus', 'M'), # Denmark + 'gedung sate museum': ('ID', 'BAN', 
'Bandung', 'M'), # Indonesia + 'henry moore institute': ('GB', 'LEE', 'Leeds', 'M'), # UK + 'her place womens museum': ('AU', 'MEL', 'Melbourne', 'M'), # Australia + 'rigsarkivet': ('DK', 'CPH', 'Copenhagen', 'A'), # Denmark + 'royal armouries museum': ('GB', 'LEE', 'Leeds', 'M'), # UK + 'royal botanic gardens kew': ('GB', 'KEW', 'Kew', 'B'), # UK + 'the design museum': ('GB', 'LON', 'London', 'M'), # UK + 'the metropolitan museum of art': ('US', 'NYC', 'New York', 'M'), # US + 'thorvaldsens museum': ('DK', 'CPH', 'Copenhagen', 'M'), # Denmark + 'vitra design museum': ('DE', 'WEI', 'Weil am Rhein', 'M'), # Germany + 'war childhood museum': ('BA', 'SAR', 'Sarajevo', 'M'), # Bosnia + 'butser ancient farm': ('GB', 'PET', 'Petersfield', 'M'), # UK + 'icon film distribution anz': ('AU', 'SYD', 'Sydney', 'C'), # Australia + 'museum development north': ('GB', 'NEW', 'Newcastle', 'O'), # UK + 'museums association': ('GB', 'LON', 'London', 'N'), # UK + 'moya museum of young art': ('AT', 'VIE', 'Vienna', 'M'), # Austria + 'national churches trust': ('GB', 'LON', 'London', 'N'), # UK + 'national portrait gallery': ('GB', 'LON', 'London', 'M'), # UK + 'new contemporaries': ('GB', 'LON', 'London', 'N'), # UK + 'peabody essex museum': ('US', 'SAL', 'Salem', 'M'), # US + 'norient': ('CH', 'BER', 'Bern', 'R'), # Switzerland + 'stiftung trias gemeinnutzige stiftung fur boden okologie und wohnen': ('DE', 'HAT', 'Hattingen', 'N'), # Germany + 'nfdi4memory': ('DE', 'BER', 'Berlin', 'R'), # Germany + 'themuseumslab': ('DE', 'BER', 'Berlin', 'E'), # Germany + + # ========================================================================== + # INDONESIAN INSTITUTIONS (for ID-* PENDING files) + # ========================================================================== + 'yayasan arsari djojohadikusumo': ('ID', 'JAK', 'Jakarta', 'N'), # Indonesia + 'yayasan konservasi alam nusantara': ('ID', 'JAK', 'Jakarta', 'N'), # Indonesia + 'southeast asia museum services seams': ('ID', 'JAK', 
'Jakarta', 'O'), # Indonesia + 'museum and gallery of ipb future': ('ID', 'BOG', 'Bogor', 'M'), # Indonesia + 'museum dewantara kirti griya': ('ID', 'YOG', 'Yogyakarta', 'M'), # Indonesia + 'museum macan': ('ID', 'JAK', 'Jakarta', 'M'), # Indonesia + 'museum pasifika': ('ID', 'BAL', 'Bali', 'M'), # Indonesia + 'museum zoologi universitas andalas': ('ID', 'PAD', 'Padang', 'M'), # Indonesia + 'moja museum': ('ID', 'JAK', 'Jakarta', 'M'), # Indonesia - Museum of Jakarta + 'wassanindia': ('IN', 'DEL', 'Delhi', 'N'), # India + 'museum of contemporary tibetan art': ('IN', 'DHA', 'Dharamsala', 'M'), # India + 'vedica art studios and gallery': ('IN', 'DEL', 'Delhi', 'G'), # India + + # ========================================================================== + # AUSTRALIAN INSTITUTIONS + # ========================================================================== + 'museumsppassmusees': ('AU', 'SYD', 'Sydney', 'O'), # Australia - museum pass program + 'australian museums and galleries association victoria': ('AU', 'MEL', 'Melbourne', 'N'), + 'australian society of archivists inc': ('AU', 'CAN', 'Canberra', 'N'), + 'history australia': ('AU', 'SYD', 'Sydney', 'R'), + 'melbourne holocaust museum': ('AU', 'MEL', 'Melbourne', 'M'), + 'national library of australia': ('AU', 'CAN', 'Canberra', 'L'), + 'professional historians association victoria and tasmania': ('AU', 'MEL', 'Melbourne', 'N'), + 'the university of queensland art museum': ('AU', 'BRI', 'Brisbane', 'M'), + + # ========================================================================== + # INDONESIAN INSTITUTIONS (additional) + # ========================================================================== + 'arsip nasional republik indonesia anri': ('ID', 'JAK', 'Jakarta', 'A'), + 'art zoo museum': ('ID', 'JAK', 'Jakarta', 'M'), + 'art 1 new museum': ('ID', 'JAK', 'Jakarta', 'M'), + 'asmat museum of culture and progress': ('ID', 'AGT', 'Agats', 'M'), + 'cifor center for international forestry research': ('ID', 'BOG', 
'Bogor', 'R'), + 'econusa foundation indonesia': ('ID', 'JAK', 'Jakarta', 'N'), + 'econusa foundation': ('ID', 'JAK', 'Jakarta', 'N'), + 'fisheries resource center of indonesia frci': ('ID', 'JAK', 'Jakarta', 'R'), + 'gaia indonesia': ('ID', 'JAK', 'Jakarta', 'N'), + 'jakarta history museum': ('ID', 'JAK', 'Jakarta', 'M'), + 'kite museum of indonesia': ('ID', 'JAK', 'Jakarta', 'M'), + 'konservasi indonesia': ('ID', 'JAK', 'Jakarta', 'N'), + 'ministry of tourism of the republic of indonesia': ('ID', 'JAK', 'Jakarta', 'O'), + 'museum batik indonesia': ('ID', 'YOG', 'Yogyakarta', 'M'), + 'museum musik indonesia': ('ID', 'JAK', 'Jakarta', 'M'), + 'museum nasional indonesia': ('ID', 'JAK', 'Jakarta', 'M'), + 'museum perkebunan indonesia': ('ID', 'MED', 'Medan', 'M'), + 'perpustakaan nasional republik indonesia perpusnas ri': ('ID', 'JAK', 'Jakarta', 'L'), + 'taman safari indonesia': ('ID', 'BOG', 'Bogor', 'B'), + + # ========================================================================== + # FRENCH INSTITUTIONS (additional) + # ========================================================================== + 'alca nouvelle aquitaine': ('FR', 'BOR', 'Bordeaux', 'O'), + 'archives de rennes': ('FR', 'REN', 'Rennes', 'A'), + 'centre de recherche du chateau de versailles': ('FR', 'VER', 'Versailles', 'R'), + 'centre des monuments nationaux': ('FR', 'PAR', 'Paris', 'O'), + 'chateau de chantilly officiel': ('FR', 'CHA', 'Chantilly', 'M'), + 'cha teau de chantilly officiel': ('FR', 'CHA', 'Chantilly', 'M'), # normalized + 'france nature environnement': ('FR', 'PAR', 'Paris', 'N'), + 'ircam': ('FR', 'PAR', 'Paris', 'R'), + 'mucem musee des civilisations de l europe et de la mediterranee': ('FR', 'MAR', 'Marseille', 'M'), + 'mucem muse e des civilisations de l europe et de la me diterrane e': ('FR', 'MAR', 'Marseille', 'M'), # normalized + 'centre de recherche du cha teau de versailles': ('FR', 'VER', 'Versailles', 'R'), # normalized + 'musee d orsay': ('FR', 'PAR', 'Paris', 'M'), 
+ 'muse e d orsay': ('FR', 'PAR', 'Paris', 'M'), # normalized variant + 'musee de bretagne': ('FR', 'REN', 'Rennes', 'M'), + 'muse e de bretagne': ('FR', 'REN', 'Rennes', 'M'), # normalized + 'musee des arts et metiers': ('FR', 'PAR', 'Paris', 'M'), + 'muse e des arts et me tiers': ('FR', 'PAR', 'Paris', 'M'), # normalized + 'musee du debarquement': ('FR', 'ARR', 'Arromanches', 'M'), + 'muse e du de barquement': ('FR', 'ARR', 'Arromanches', 'M'), # normalized + 'petites cites de caractere de france': ('FR', 'PAR', 'Paris', 'N'), + 'petites cite s de caracte re de france': ('FR', 'PAR', 'Paris', 'N'), # normalized + 'villa albertine the french institute for culture and education': ('US', 'NYC', 'New York', 'O'), # French in US + + # ========================================================================== + # GERMAN INSTITUTIONS (additional) + # ========================================================================== + 'anne frank educational center': ('DE', 'FRA', 'Frankfurt', 'E'), + 'bildarchiv foto marburg': ('DE', 'MAR', 'Marburg', 'A'), + 'bundesvereinigung kulturelle kinder und jugendbildung bkj': ('DE', 'REM', 'Remscheid', 'N'), + 'common wadden sea secretariat': ('DE', 'WIL', 'Wilhelmshaven', 'O'), + 'deutsche stiftung denkmalschutz german foundation for monument protection': ('DE', 'BON', 'Bonn', 'N'), + 'deutsches archaologisches institut dai': ('DE', 'BER', 'Berlin', 'R'), + 'deutsches archa ologisches institut dai': ('DE', 'BER', 'Berlin', 'R'), # normalized + 'deutsches historisches museum': ('DE', 'BER', 'Berlin', 'M'), + 'deutsches zentrum kulturgutverluste': ('DE', 'MAG', 'Magdeburg', 'R'), + 'jewish museum berlin': ('DE', 'BER', 'Berlin', 'M'), + 'klassik stiftung weimar': ('DE', 'WEI', 'Weimar', 'M'), + 'kulturstiftung des bundes german federal cultural foundation': ('DE', 'HAL', 'Halle', 'N'), + 'stadtische galerie im lenbachhaus und kunstbau munchen': ('DE', 'MUN', 'Munich', 'M'), + 'sta dtische galerie im lenbachhaus und kunstbau mu nchen': 
('DE', 'MUN', 'Munich', 'M'), # normalized + 'stiftung stadtmuseum berlin': ('DE', 'BER', 'Berlin', 'M'), + + # ========================================================================== + # BRITISH INSTITUTIONS (additional) + # ========================================================================== + 'archaeological research services ltd': ('GB', 'BAK', 'Bakewell', 'R'), + 'british school at athens': ('GR', 'ATH', 'Athens', 'R'), # Greek location! + 'british trust for ornithology bto': ('GB', 'THE', 'Thetford', 'R'), + 'historic new england': ('US', 'BOS', 'Boston', 'N'), # US, not UK! + 'historic royal palaces': ('GB', 'LON', 'London', 'M'), + 'new england museum association': ('US', 'BOS', 'Boston', 'N'), # US, not UK! + + # ========================================================================== + # ITALIAN INSTITUTIONS (additional) + # ========================================================================== + 'artribune': ('IT', 'ROM', 'Rome', 'C'), + 'centro conservazione restauro la venaria reale': ('IT', 'TOR', 'Turin', 'R'), + 'ecole francaise de rome efr': ('IT', 'ROM', 'Rome', 'R'), + 'e cole franc aise de rome efr': ('IT', 'ROM', 'Rome', 'R'), # normalized + 'museum tweestromenland': ('GE', 'BEN', 'Beneden-Leeuwen', 'M'), # Dutch, in Beneden-Leeuwen! 
+ 'stichting roma aeterna': ('IT', 'ROM', 'Rome', 'N'), + 'triennale milano': ('IT', 'MIL', 'Milan', 'M'), + + # ========================================================================== + # BELGIAN INSTITUTIONS (additional) + # ========================================================================== + 'advn': ('BE', 'ANT', 'Antwerpen', 'A'), + 'm leuven': ('BE', 'LEU', 'Leuven', 'M'), + 'museum voor schone kunsten gent': ('BE', 'GEN', 'Gent', 'M'), + 'wikimedia belgium': ('BE', 'BRU', 'Brussels', 'N'), + + # ========================================================================== + # US INSTITUTIONS (additional) + # ========================================================================== + 'gia gemological institute of america': ('US', 'CAR', 'Carlsbad', 'R'), + 'international society of arboriculture': ('US', 'ATL', 'Atlanta', 'N'), + 'standwithus': ('US', 'LAX', 'Los Angeles', 'N'), + + # ========================================================================== + # DANISH INSTITUTIONS (additional) + # ========================================================================== + 'aalborg teater': ('DK', 'AAL', 'Aalborg', 'E'), + 'augustinus fonden': ('DK', 'CPH', 'Copenhagen', 'N'), + 'kobenhavns museum museum of copenhagen': ('DK', 'CPH', 'Copenhagen', 'M'), + 'ko benhavns museum museum of copenhagen': ('DK', 'CPH', 'Copenhagen', 'M'), # normalized + 'københavns museum museum of copenhagen': ('DK', 'CPH', 'Copenhagen', 'M'), # with ø + + # ========================================================================== + # SPANISH INSTITUTIONS + # ========================================================================== + 'centre de cultura contemporania de barcelona cccb': ('ES', 'BAR', 'Barcelona', 'M'), + 'centre de cultura contempora nia de barcelona cccb': ('ES', 'BAR', 'Barcelona', 'M'), # normalized + 'instituto del patrimonio cultural de espana ipce': ('ES', 'MAD', 'Madrid', 'O'), + 'instituto del patrimonio cultural de espan a ipce': ('ES', 'MAD', 
'Madrid', 'O'), # normalized + + # ========================================================================== + # INDIAN INSTITUTIONS + # ========================================================================== + 'placemaking india': ('IN', 'DEL', 'Delhi', 'N'), + + # ========================================================================== + # OTHER INTERNATIONAL + # ========================================================================== + 'african wildlife foundation': ('KE', 'NAI', 'Nairobi', 'N'), + 'arabian oud': ('SA', 'RIY', 'Riyadh', 'C'), + 'wza rat althqa fh ministry of culture': ('SA', 'RIY', 'Riyadh', 'O'), # Saudi Ministry of Culture normalized + 'وزارة الثقافة ministry of culture': ('SA', 'RIY', 'Riyadh', 'O'), # Saudi Ministry of Culture Arabic + 'ministry of culture': ('SA', 'RIY', 'Riyadh', 'O'), # Saudi Ministry of Culture simple + 'dariah eric': ('EU', 'BRU', 'Brussels', 'R'), + 'embassy of the netherlands in israel': ('IL', 'TLV', 'Tel Aviv', 'O'), + 'european museum academy': ('EU', 'BRU', 'Brussels', 'N'), + 'iucn ssc shark specialist group ssg': ('CA', 'VAN', 'Vancouver', 'R'), + 'museum vosbergen': ('DR', 'EEL', 'Eelde', 'M'), # Dutch - in Eelde + 'bonhams': ('GB', 'LON', 'London', 'C'), # UK auction house + + # ========================================================================== + # REMAINING DUTCH + # ========================================================================== + 'het nationale park de hoge veluwe': ('GE', 'OTT', 'Otterlo', 'N'), + 'lucas laboratoire d usages culture s arts socie te': ('FR', 'PAR', 'Paris', 'R'), # French org + + # ========================================================================== + # OTHER MISCELLANEOUS DUTCH ORGANIZATIONS + # ========================================================================== + 'introdans': ('GE', 'ARN', 'Arnhem', 'E'), + 'ja21 het juiste antwoord': ('NH', 'AMS', 'Amsterdam', 'N'), # Political party - not heritage + 'kasteel radboud': ('NH', 'MED', 'Medemblik', 
'M'), + 'klooster huissen': ('GE', 'HUI', 'Huissen', 'H'), + 'koninklijke luchtmacht historische vlucht': ('NH', 'GIL', 'Gilze-Rijen', 'M'), + 'koninklijke woudenberg': ('UT', 'WOU', 'Woudenberg', 'C'), + 'museum fiskershúske': ('FR', 'MOD', 'Moddergat', 'M'), + 'museum media': ('NH', 'AMS', 'Amsterdam', 'C'), + 'museum of 21st century design': ('GB', 'LON', 'London', 'M'), # UK + 'museum of comic art moca': ('US', 'NYC', 'New York', 'M'), # US + 'museum of edible earth': ('NL', 'AMS', 'Amsterdam', 'M'), # Actually NL-based + 'museum of humanity': ('GB', 'LON', 'London', 'M'), # UK + 'museum of looted antiquities': ('GB', 'LON', 'London', 'D'), # UK - virtual + 'museum of science': ('US', 'BOS', 'Boston', 'M'), # US + 'museumppassmusees': ('BE', 'BRU', 'Brussels', 'O'), # Belgium - museum pass + 'museumvereniging': ('NH', 'AMS', 'Amsterdam', 'N'), + 'oerol festival': ('FR', 'TER', 'Terschelling', 'E'), + 'qwen': ('CN', 'HAN', 'Hangzhou', 'C'), # China - AI company, not heritage + 'radio en museum': ('NH', 'AMS', 'Amsterdam', 'M'), + 'sothebys': ('GB', 'LON', 'London', 'C'), # UK + 'sothebys institute of art': ('GB', 'LON', 'London', 'E'), # UK + 'nieuwe veste': ('NB', 'BRE', 'Breda', 'E'), } @@ -216,8 +815,14 @@ def process_pending_file(filepath: Path, dry_run: bool = True) -> Optional[str]: abbrev = extract_abbreviation(emic_name) # Handle non-Dutch organizations + # All non-NL countries get their country code as the country, with XX as province + FOREIGN_COUNTRIES = { + 'FR', 'DK', 'IT', 'BE', 'DE', 'GB', 'US', 'AT', 'AU', 'BA', 'ES', + 'EU', 'ID', 'IL', 'IN', 'MA', 'NO', 'PT', 'PS', 'ZA', 'CA', 'GR', 'KE', 'SA', + 'CH', 'CN' + } country = 'NL' - if province in ['FR', 'DK', 'IT', 'BE', 'DE', 'GB', 'US']: + if province in FOREIGN_COUNTRIES: country = province province = 'XX'