feat(scripts): expand university location mappings and add web enrichment

- enrich_ppids.py: Add 40+ Dutch universities and hogescholen to location mapping
- enrich_ppids_web.py: New script for web-based PPID enrichment
- resolve_pending_known_orgs.py: Updates for pending org resolution
2026-01-09 21:10:14 +01:00 · 2026-01-09 21:10:14 +01:00 · dd0ee2cf11
commit dd0ee2cf11
parent ea35da02dc
3 changed files with 1590 additions and 46 deletions
--- a/scripts/enrich_ppids.py
+++ b/scripts/enrich_ppids.py
@ -114,6 +114,85 @@ DUTCH_UNI_LOCATIONS = {
    "IOPS": ("Amsterdam", "NL"),
    "Interuniversity Graduate School of Psychometrics": ("Amsterdam", "NL"),
    "Sioo": ("Utrecht", "NL"),
+    # Additional Dutch universities (expanded mapping)
+    "Eindhoven University of Technology": ("Eindhoven", "NL"),
+    "Delft University of Technology": ("Delft", "NL"),
+    "University of Twente": ("Enschede", "NL"),
+    "Universiteit Twente": ("Enschede", "NL"),
+    "UT": ("Enschede", "NL"),
+    "Open Universiteit": ("Heerlen", "NL"),
+    "Open University Netherlands": ("Heerlen", "NL"),
+    "Nyenrode": ("Breukelen", "NL"),
+    "Nyenrode Business Universiteit": ("Breukelen", "NL"),
+    "Theologische Universiteit": ("Kampen", "NL"),
+    "Protestant Theological University": ("Amsterdam", "NL"),
+    # Additional Hogescholen
+    "De Haagse Hogeschool": ("Den Haag", "NL"),
+    "The Hague University": ("Den Haag", "NL"),
+    "The Hague University of Applied Sciences": ("Den Haag", "NL"),
+    "Amsterdamse Hogeschool voor de Kunsten": ("Amsterdam", "NL"),
+    "AHK": ("Amsterdam", "NL"),
+    "Conservatorium van Amsterdam": ("Amsterdam", "NL"),
+    "Hanzehogeschool Groningen": ("Groningen", "NL"),
+    "Hogeschool Leiden": ("Leiden", "NL"),
+    "Hogeschool Zeeland": ("Vlissingen", "NL"),
+    "HZ University of Applied Sciences": ("Vlissingen", "NL"),
+    "Hogeschool voor de Kunsten Utrecht": ("Utrecht", "NL"),
+    "HKU": ("Utrecht", "NL"),
+    "Willem de Kooning Academie": ("Rotterdam", "NL"),
+    "Codarts Rotterdam": ("Rotterdam", "NL"),
+    "Codarts": ("Rotterdam", "NL"),
+    "Design Academy": ("Eindhoven", "NL"),
+    "NHTV": ("Breda", "NL"),
+    "NHTV Breda University of Applied Sciences": ("Breda", "NL"),
+    "Breda University of Applied Sciences": ("Breda", "NL"),
+    "NHL Hogeschool": ("Leeuwarden", "NL"),
+    "Van Hall Larenstein": ("Velp", "NL"),
+    "NCOI": ("Hilversum", "NL"),
+    "NCOI Opleidingen": ("Hilversum", "NL"),
+    "LOI": ("Leiderdorp", "NL"),
+    "LOI Hogeschool": ("Leiderdorp", "NL"),
+    "NTI": ("Leiden", "NL"),
+    "Hogeschool Arnhem": ("Arnhem", "NL"),
+    "Hogeschool Nijmegen": ("Nijmegen", "NL"),
+    "ROC": ("", "NL"),  # Regional Training Centers - various locations (fallback)
+    # Specific ROC locations
+    "ROC Leeuwenborgh": ("Maastricht", "NL"),
+    "ROC Leiden": ("Leiden", "NL"),
+    "ROC Midden Nederland": ("Utrecht", "NL"),
+    "ROC MN": ("Utrecht", "NL"),
+    "ROC van Amsterdam": ("Amsterdam", "NL"),
+    "ROC Amsterdam": ("Amsterdam", "NL"),
+    "ROC Flevoland": ("Almere", "NL"),
+    "ROC Tilburg": ("Tilburg", "NL"),
+    "ROC van Twente": ("Enschede", "NL"),
+    "ROC Twente": ("Enschede", "NL"),
+    "ROC Nijmegen": ("Nijmegen", "NL"),
+    "ROC Mondriaan": ("Den Haag", "NL"),
+    "ROC Nova College": ("Haarlem", "NL"),
+    "ROC Albeda": ("Rotterdam", "NL"),
+    "Albeda College": ("Rotterdam", "NL"),
+    "Zadkine": ("Rotterdam", "NL"),
+    "Graafschap College": ("Doetinchem", "NL"),
+    "Friesland College": ("Leeuwarden", "NL"),
+    "Noorderpoort": ("Groningen", "NL"),
+    "Alfa-college": ("Groningen", "NL"),
+    "Deltion College": ("Zwolle", "NL"),
+    "Cibap": ("Zwolle", "NL"),
+    "Summa College": ("Eindhoven", "NL"),
+    "SintLucas": ("Eindhoven", "NL"),
+    "Koning Willem I College": ("Den Bosch", "NL"),
+    "Curio": ("Breda", "NL"),
+    "Da Vinci College": ("Dordrecht", "NL"),
+    # Additional Radboud variations
+    "Radboud University Nijmegen": ("Nijmegen", "NL"),
+    "Radboud University": ("Nijmegen", "NL"),
+    # Additional VU variations
+    "Vrije Universiteit Amsterdam": ("Amsterdam", "NL"),
+    "VU University Amsterdam": ("Amsterdam", "NL"),
+    # Wageningen variations
+    "Wageningen University & Research": ("Wageningen", "NL"),
+    "WUR": ("Wageningen", "NL"),
    # Belgian institutions
    "KU Leuven": ("Leuven", "BE"),
    "University of Leuven": ("Leuven", "BE"),
@ -141,9 +220,85 @@ DUTCH_UNI_LOCATIONS = {
    "LMU München": ("München", "DE"),
    "Technische Universität München": ("München", "DE"),
    "TU München": ("München", "DE"),
-    # International
+    # UK institutions
+    "University of Oxford": ("Oxford", "GB"),
+    "Oxford University": ("Oxford", "GB"),
+    "University of Cambridge": ("Cambridge", "GB"),
+    "Cambridge University": ("Cambridge", "GB"),
+    "University of York": ("York", "GB"),
+    "University College London": ("London", "GB"),
+    "UCL": ("London", "GB"),
+    "London School of Economics": ("London", "GB"),
+    "LSE": ("London", "GB"),
+    "King's College London": ("London", "GB"),
+    "Imperial College": ("London", "GB"),
+    "University of Edinburgh": ("Edinburgh", "GB"),
+    "University of Manchester": ("Manchester", "GB"),
+    # Australian institutions
+    "The Australian National University": ("Canberra", "AU"),
+    "Australian National University": ("Canberra", "AU"),
+    "ANU": ("Canberra", "AU"),
+    "University of Canberra": ("Canberra", "AU"),
+    "University of Melbourne": ("Melbourne", "AU"),
+    "University of Sydney": ("Sydney", "AU"),
+    "Macquarie University": ("Sydney", "AU"),
+    "Charles Sturt University": ("Bathurst", "AU"),
+    "UNSW": ("Sydney", "AU"),
+    "University of New South Wales": ("Sydney", "AU"),
+    "University of Queensland": ("Brisbane", "AU"),
+    "Monash University": ("Melbourne", "AU"),
+    # South African institutions
+    "University of Cape Town": ("Cape Town", "ZA"),
+    "UCT": ("Cape Town", "ZA"),
+    "University of Pretoria": ("Pretoria", "ZA"),
+    "University of Witwatersrand": ("Johannesburg", "ZA"),
+    "Stellenbosch University": ("Stellenbosch", "ZA"),
+    # Italian institutions
    "Politecnico di Milano": ("Milano", "IT"),
+    "Università degli Studi di Milano": ("Milano", "IT"),
+    "Università di Bologna": ("Bologna", "IT"),
+    "University of Bologna": ("Bologna", "IT"),
+    # US institutions
    "Oberlin College": ("Oberlin", "US"),
+    "Harvard University": ("Cambridge", "US"),
+    "Harvard": ("Cambridge", "US"),
+    "Yale University": ("New Haven", "US"),
+    "Princeton University": ("Princeton", "US"),
+    "MIT": ("Cambridge", "US"),
+    "Massachusetts Institute of Technology": ("Cambridge", "US"),
+    "Stanford University": ("Stanford", "US"),
+    "Columbia University": ("New York", "US"),
+    "University of California": ("Berkeley", "US"),
+    "UCLA": ("Los Angeles", "US"),
+    "University of Chicago": ("Chicago", "US"),
+    "NYU": ("New York", "US"),
+    "New York University": ("New York", "US"),
+    # Indonesian institutions
+    "Universitas Gadjah Mada": ("Yogyakarta", "ID"),
+    "UGM": ("Yogyakarta", "ID"),
+    "Universitas Indonesia": ("Jakarta", "ID"),
+    "UI": ("Jakarta", "ID"),
+    # Turkish institutions
+    "Middle East Technical University": ("Ankara", "TR"),
+    "METU": ("Ankara", "TR"),
+    "Boğaziçi University": ("Istanbul", "TR"),
+    # Additional Dutch variations found in data
+    "Rotterdam School of Management": ("Rotterdam", "NL"),
+    "RSM": ("Rotterdam", "NL"),
+    "TIAS School for Business and Society": ("Tilburg", "NL"),
+    "TIAS": ("Tilburg", "NL"),
+    "GO opleidingen": ("Utrecht", "NL"),
+    "Amsterdam University of Applied Sciences": ("Amsterdam", "NL"),
+    "University College Utrecht": ("Utrecht", "NL"),
+    "UCU": ("Utrecht", "NL"),
+    "University of Utrecht": ("Utrecht", "NL"),
+    "NSOB": ("Den Haag", "NL"),
+    "Nederlandse School voor Openbaar Bestuur": ("Den Haag", "NL"),
+    "Grotius Academie": ("Nijmegen", "NL"),
+    "de Baak": ("Noordwijk", "NL"),
+    "Grafisch Lyceum Rotterdam": ("Rotterdam", "NL"),
+    "Schoevers": ("Utrecht", "NL"),
+    "Schoevers College": ("Utrecht", "NL"),
 }


@ -171,12 +326,39 @@ def geocode_location(location_str: str, db_path: str) -> Optional[dict]:
    
    # Extract country from common patterns
    country_code = None
-    if "(NL)" in location_str or "Netherlands" in location_str or "Nederland" in location_str:
-        country_code = "NL"
-    elif "(BE)" in location_str or "Belgium" in location_str or "België" in location_str:
-        country_code = "BE"
-    elif "(DE)" in location_str or "Germany" in location_str or "Deutschland" in location_str:
-        country_code = "DE"
+    country_patterns = {
+        "NL": ["(NL)", "Netherlands", "Nederland"],
+        "BE": ["(BE)", "Belgium", "België", "Belgique"],
+        "DE": ["(DE)", "Germany", "Deutschland"],
+        "GB": ["(GB)", "United Kingdom", "UK", "England", "Scotland", "Wales"],
+        "AU": ["(AU)", "Australia"],
+        "ZA": ["(ZA)", "South Africa"],
+        "IT": ["(IT)", "Italy", "Italia"],
+        "US": ["(US)", "United States", "USA", "U.S."],
+        "ID": ["(ID)", "Indonesia"],
+        "TR": ["(TR)", "Turkey", "Türkiye"],
+        "FR": ["(FR)", "France"],
+        "ES": ["(ES)", "Spain", "España"],
+        "AT": ["(AT)", "Austria", "Österreich"],
+        "CH": ["(CH)", "Switzerland", "Schweiz", "Suisse"],
+        "CA": ["(CA)", "Canada"],
+        "NZ": ["(NZ)", "New Zealand"],
+        "JP": ["(JP)", "Japan"],
+        "CN": ["(CN)", "China"],
+        "IN": ["(IN)", "India"],
+        "BR": ["(BR)", "Brazil", "Brasil"],
+        "SE": ["(SE)", "Sweden", "Sverige"],
+        "NO": ["(NO)", "Norway", "Norge"],
+        "DK": ["(DK)", "Denmark", "Danmark"],
+        "FI": ["(FI)", "Finland", "Suomi"],
+        "PL": ["(PL)", "Poland", "Polska"],
+        "CZ": ["(CZ)", "Czech Republic", "Czechia", "Česko"],
+    }
+    
+    for code, patterns in country_patterns.items():
+        if any(p in location_str for p in patterns):
+            country_code = code
+            break
    
    # Clean location for city lookup
    city_candidate = location_str.split(",")[0].strip()
@ -255,6 +437,56 @@ def parse_date_range(date_range: str) -> Tuple[Optional[int], Optional[int]]:
    return start_year, end_year


+def get_any_date_field(record: dict) -> str:
+    """
+    Return the date text of a record, whichever field name it is stored under.
+
+    Combined fields are consulted in this order and the first truthy one is
+    returned as a string:
+    - date_range, e.g. "2019 - Present"
+    - period, e.g. "2015 - 2019"
+    - years / year, e.g. "2010" (single year)
+    - dates, e.g. "2018 - 2020"
+
+    If none of those is set, start_date and end_date (e.g. "Sep 2019" /
+    "Present") are joined as "<start> - <end>", with leading and trailing
+    spaces and hyphens stripped so that a record holding only one of the two
+    yields just that value.
+
+    Returns an empty string when no date information is present. The result
+    is meant to be fed to parse_date_range().
+    """
+    combined_value = next(
+        (
+            record[name]
+            for name in ("date_range", "period", "years", "year", "dates")
+            if record.get(name)
+        ),
+        None,
+    )
+    if combined_value is not None:
+        return str(combined_value)
+
+    # Fall back to the split representation; a missing or None part becomes "".
+    begin = record.get("start_date", "") or ""
+    finish = record.get("end_date", "") or ""
+    if not (begin or finish):
+        return ""
+    return f"{begin} - {finish}".strip(" -")
+
+
+def parse_total_experience_field(total_exp: str) -> Optional[int]:
+    """
+    Extract the whole number of years from a total-experience value.
+
+    Handles formats like:
+    - "24 years and 8 months"
+    - "37 years"
+    - "5 years 3 months"
+    - "1 year"
+
+    Matching is case-insensitive and only the first "<digits> year(s)"
+    occurrence is used; any months part is ignored.
+
+    Args:
+        total_exp: Raw field value. Normally a string, but any value is
+            accepted and converted with str() before matching.
+
+    Returns:
+        The number of years as an int, or None when the value is empty/falsy
+        or contains no "<digits> year(s)" pattern.
+    """
+    if not total_exp:
+        return None
+
+    # The value is read straight from profile data and the caller only checks
+    # it for truthiness, so a non-string (e.g. a bare number) can arrive here.
+    # Coerce instead of crashing on .lower().
+    text = str(total_exp).lower()
+
+    # Pattern: digits followed by "year" or "years"
+    match = re.search(r'(\d+)\s*years?', text)
+    return int(match.group(1)) if match else None
+
+
 def build_inference_chain(steps: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Build a numbered inference chain."""
    return [{"step": i + 1, **step} for i, step in enumerate(steps)]
@ -297,6 +529,22 @@ def get_adjacent_decades(year: int) -> Tuple[str, str]:
        return (get_decade_notation(year - 10), get_decade_notation(year))


+def parse_total_experience(about_text: str) -> Optional[int]:
+    """
+    Find a "Total Experience: X years" statement in about/summary text.
+
+    The label is matched case-insensitively and both "year" and "years" are
+    accepted; anything after the year count (such as "and Y months") is
+    ignored.
+
+    Returns the year count as an int, or None when the text is empty or
+    contains no such statement.
+    """
+    if about_text:
+        found = re.search(
+            r'Total Experience:\s*(\d+)\s*years?', about_text, re.IGNORECASE
+        )
+        if found is not None:
+            return int(found.group(1))
+    return None
+
+
 def infer_birth_decade(profile_data: dict) -> Optional[dict]:
    """
    Infer birth decade from earliest career observations.
@ -305,6 +553,11 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
    Supports list-valued results for decade boundary cases (Rule 45 extension):
    - If estimated birth year is within 3 years of decade boundary, returns
      both adjacent decades as EDTF set notation: [196X,197X]
+    
+    Inference methods (in priority order):
+    1. Education start year (most reliable - entry age 18-24)
+    2. Experience start year (first job - entry age ~23)
+    3. Total Experience pattern (fallback - "Total Experience: X years")
    """
    earliest_year = None
    inference_steps = []
@ -312,6 +565,7 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
    age_variance = 3  # ±3 years typical variance in entry age
    education_record = None
    experience_record = None
+    total_experience_years = None
    
    # Check education first (most reliable)
    education = profile_data.get("education") or []
@ -381,8 +635,8 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
        for exp in experience:
            if exp is None:
                continue
-            # Handle multiple date field names
-            date_range = exp.get("date_range") or exp.get("period") or ""
+            # Handle multiple date field names (including start_date/end_date)
+            date_range = get_any_date_field(exp)
            start_year, _ = parse_date_range(date_range)
            
            if start_year:
@ -396,10 +650,59 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
                        "date_range": date_range,
                    }
    
+    # If no education or experience dates, try "Total Experience" pattern in about field
+    if earliest_year is None:
+        about = profile_data.get("about") or profile_data.get("summary") or ""
+        total_experience_years = parse_total_experience(about)
+        
+        if total_experience_years and total_experience_years > 0:
+            # Estimate: current year - total_years = first job year
+            # Then: first job year - 23 = birth year (assuming first job at 23)
+            current_year = datetime.now().year
+            estimated_first_job_year = current_year - total_experience_years
+            earliest_year = estimated_first_job_year
+            age_offset = 23  # Assume first job at 23
+            age_variance = 7  # Very high variance for this method
+            
+            inference_steps.append({
+                "observation": "Total Experience pattern found in about field",
+                "source_field": "profile_data.about",
+                "source_value": f"Total Experience: {total_experience_years} years",
+            })
+            inference_steps.append({
+                "calculation": f"{current_year} - {total_experience_years} = {estimated_first_job_year}",
+                "result": f"Estimated first job year: {estimated_first_job_year}",
+                "assumption": "Total experience represents continuous career from first job",
+            })
+    
+    # If still no date, try standalone total_experience field in profile_data
+    if earliest_year is None:
+        total_exp_field = profile_data.get("total_experience")
+        if total_exp_field:
+            total_experience_years = parse_total_experience_field(total_exp_field)
+            
+            if total_experience_years and total_experience_years > 0:
+                current_year = datetime.now().year
+                estimated_first_job_year = current_year - total_experience_years
+                earliest_year = estimated_first_job_year
+                age_offset = 23  # Assume first job at 23
+                age_variance = 7  # Very high variance for this method
+                
+                inference_steps.append({
+                    "observation": "total_experience field found in profile_data",
+                    "source_field": "profile_data.total_experience",
+                    "source_value": total_exp_field,
+                })
+                inference_steps.append({
+                    "calculation": f"{current_year} - {total_experience_years} = {estimated_first_job_year}",
+                    "result": f"Estimated first job year: {estimated_first_job_year}",
+                    "assumption": "Total experience represents continuous career from first job",
+                })
+    
    if earliest_year is None:
        return None
    
-    # Build inference chain
+    # Build inference chain (only add steps if not already added from Total Experience path)
    if education_record:
        inference_steps.append({
            "observation": "Education record found",
@ -415,7 +718,7 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
            "rationale": "Standard entry age for this education level in Netherlands/Europe",
            "confidence_impact": f"Assumption introduces uncertainty; actual age may vary ±{age_variance} years",
        })
-    else:
+    elif experience_record:
        inference_steps.append({
            "observation": "First job record found (no education data)",
            "source_field": "profile_data.experience",
@ -430,6 +733,13 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
            "rationale": "Assumes first job after typical university completion",
            "confidence_impact": f"Higher uncertainty; first job age varies ±{age_variance} years",
        })
+    elif total_experience_years:
+        # Steps already added in the Total Experience detection block
+        inference_steps.append({
+            "assumption": f"First job age is approximately {age_offset} (±{age_variance} years)",
+            "rationale": "Assumes first job after typical university completion; Total Experience method has highest uncertainty",
+            "confidence_impact": f"Very high uncertainty; first job age varies ±{age_variance} years, plus Total Experience aggregation may be inaccurate",
+        })
    
    estimated_birth_year = earliest_year - age_offset
    min_birth_year = earliest_year - age_offset - age_variance
@ -468,6 +778,14 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
            "rationale": "Cannot determine which decade with certainty; using EDTF 'one of' set notation",
        })
        
+        # Determine method name based on source
+        if education_record:
+            method_name = "earliest_education_heuristic"
+        elif experience_record:
+            method_name = "earliest_experience_heuristic"
+        else:
+            method_name = "total_experience_heuristic"
+        
        return {
            "values": [decade1, decade2],
            "edtf": f"[{decade1},{decade2}]",
@ -477,7 +795,7 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
            "primary_rationale": primary_rationale,
            "confidence": "very_low",  # Lower confidence due to boundary uncertainty
            "inference_provenance": {
-                "method": "earliest_observation_heuristic",
+                "method": method_name,
                "inference_chain": build_inference_chain(inference_steps),
                "assumptions": [
                    f"Entry age for education/first job: {age_offset} years (±{age_variance})",
@ -499,13 +817,24 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
            "rationale": "Decade precision appropriate for heuristic-based estimate",
        })
        
+        # Determine method name and confidence based on source
+        if education_record:
+            method_name = "earliest_education_heuristic"
+            confidence = "low"
+        elif experience_record:
+            method_name = "earliest_experience_heuristic"
+            confidence = "low"
+        else:
+            method_name = "total_experience_heuristic"
+            confidence = "very_low"  # Lowest confidence for Total Experience method
+        
        return {
            "value": edtf_decade,
            "edtf": edtf_decade,
            "precision": "decade",
-            "confidence": "low",
+            "confidence": confidence,
            "inference_provenance": {
-                "method": "earliest_observation_heuristic",
+                "method": method_name,
                "inference_chain": build_inference_chain(inference_steps),
                "assumptions": [
                    f"Entry age for education/first job: {age_offset} years (±{age_variance})",
@ -549,7 +878,21 @@ def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]:
        
        for uni_name, (city, country) in DUTCH_UNI_LOCATIONS.items():
            if uni_name.lower() in institution.lower():
-                location = f"{city}, Netherlands" if city else None
+                # Map country code to country name for geocoding
+                country_names = {
+                    "NL": "Netherlands",
+                    "BE": "Belgium",
+                    "DE": "Germany",
+                    "GB": "United Kingdom",
+                    "AU": "Australia",
+                    "ZA": "South Africa",
+                    "IT": "Italy",
+                    "US": "United States",
+                    "ID": "Indonesia",
+                    "TR": "Turkey",
+                }
+                country_name = country_names.get(country, "Netherlands")
+                location = f"{city}, {country_name}" if city else None
                location_source = f"Known institution mapping: {uni_name}"
                break
        
@ -622,8 +965,8 @@ def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]:
    for exp in experience:
        if exp is None:
            continue
-        # Handle multiple date field names
-        date_range = exp.get("date_range") or exp.get("period") or ""
+        # Handle multiple date field names (including start_date/end_date)
+        date_range = get_any_date_field(exp)
        start_year, _ = parse_date_range(date_range)
        if start_year and exp.get("location"):
            exp_with_years.append((start_year, exp))
@ -636,7 +979,7 @@ def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]:
            continue
        
        # Get date_range for provenance (handle multiple field names)
-        exp_date_range = exp.get("date_range") or exp.get("period") or ""
+        exp_date_range = get_any_date_field(exp)
        
        inference_steps.append({
            "observation": "Earliest job with location found (no education location available)",
@ -739,8 +1082,8 @@ def infer_current_settlement(profile_data: dict, db_path: str) -> Optional[dict]
    for exp in experience:
        if exp is None:
            continue
-        # Handle multiple date field names
-        date_range = exp.get("date_range") or exp.get("period") or ""
+        # Handle multiple date field names (including start_date/end_date)
+        date_range = get_any_date_field(exp)
        # Also check "current" field which some profiles have
        is_current = "Present" in date_range or exp.get("current") is True
        if is_current:
@ -815,6 +1158,7 @@ def enrich_ppid_file(filepath: Path, db_path: str, dry_run: bool = False, force:
    stats = {
        "birth_decade_inferred": False,
        "birth_decade_is_list": False,  # Track decade boundary cases
+        "birth_decade_method": None,  # Track which method was used
        "birth_settlement_inferred": False,
        "current_settlement_inferred": False,
        "ppid_changed": False,
@ -870,6 +1214,9 @@ def enrich_ppid_file(filepath: Path, db_path: str, dry_run: bool = False, force:
                components["first_date"] = birth_info["edtf"]
                components["first_date_source"] = "inferred_birth_decade"
            
+            # Track which method was used
+            stats["birth_decade_method"] = birth_info.get("inference_provenance", {}).get("method", "unknown")
+            
            # Add note to canonical field pointing to inferred alternative
            data["birth_date"]["note"] = "See inferred_birth_decade for heuristic estimate"
            
@ -978,6 +1325,11 @@ def main():
        "processed": 0,
        "birth_decade_inferred": 0,
        "birth_decade_list_valued": 0,  # Decade boundary cases
+        "birth_decade_by_method": {
+            "earliest_education_heuristic": 0,
+            "earliest_experience_heuristic": 0,
+            "total_experience_heuristic": 0,
+        },
        "birth_settlement_inferred": 0,
        "current_settlement_inferred": 0,
        "ppid_changed": 0,
@ -990,6 +1342,10 @@ def main():
            total_stats["processed"] += 1
            if stats["birth_decade_inferred"]:
                total_stats["birth_decade_inferred"] += 1
+                # Track method used
+                method = stats.get("birth_decade_method")
+                if method and method in total_stats["birth_decade_by_method"]:
+                    total_stats["birth_decade_by_method"][method] += 1
            if stats.get("birth_decade_is_list"):
                total_stats["birth_decade_list_valued"] += 1
            if stats["birth_settlement_inferred"]:
@ -999,7 +1355,7 @@ def main():
            if stats["ppid_changed"]:
                total_stats["ppid_changed"] += 1
            
-            if args.verbose and any(stats.values()):
+            if args.verbose and any(v for k, v in stats.items() if k != "birth_decade_method"):
                print(f"  {filepath.name}: {stats}")
            
            if (i + 1) % 500 == 0:
@ -1017,6 +1373,9 @@ def main():
    print(f"Processed:                    {total_stats['processed']}")
    print(f"Birth decades inferred:       {total_stats['birth_decade_inferred']}")
    print(f"  - List-valued (boundary):   {total_stats['birth_decade_list_valued']}")
+    print(f"  - By method:")
+    for method, count in total_stats["birth_decade_by_method"].items():
+        print(f"      {method}: {count}")
    print(f"Birth settlements inferred:   {total_stats['birth_settlement_inferred']}")
    print(f"Current settlements inferred: {total_stats['current_settlement_inferred']}")
    print(f"PPIDs updated:                {total_stats['ppid_changed']}")
@ -1033,6 +1392,7 @@ def main():
    
    print("\nNote: All inferred data stored in explicit inferred_* fields with provenance chains.")
    print("Note: Decade boundary cases use EDTF set notation [196X,197X] with primary_value for PPID.")
+    print("Note: Total Experience method has highest uncertainty (very_low confidence).")


 if __name__ == "__main__":
--- a/scripts/enrich_ppids_web.py
+++ b/scripts/enrich_ppids_web.py
@ -0,0 +1,579 @@
+#!/usr/bin/env python3
+"""
+PPID Web Enrichment Script
+
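+Prepares enrichment of PPID files with web-sourced claims using Exa AI and Linkup search.
+This script only builds the search queries; they are executed separately via MCP tools.
+Claims created from search results carry provenance statements per Rules 6, 26, and 35.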
+
+Enrichment targets:
+1. Birth date/year - Search for biographical information
+2. Publications - ORCID, Google Scholar, ResearchGate
+3. News mentions - Press coverage, interviews
+4. Wikidata entity - Authority file linking
+5. Institutional affiliations - Verify current roles
+
+All web claims include:
+- source_url: Where the data was found
+- retrieved_on: ISO 8601 timestamp
+- retrieval_agent: Tool used (exa_web_search, linkup_search, etc.)
+- claim_type: Type of claim (birth_date, publication, news_mention, etc.)
+- claim_value: The extracted value
+- provenance: Full provenance chain per Rule 35
+
+Usage:
+    python scripts/enrich_ppids_web.py --limit 10 --verbose
+    python scripts/enrich_ppids_web.py --dry-run --sample stefankulk
+"""
+
+import json
+import os
+import re
+import sys
+import time
+import argparse
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional, Dict, List, Any, Tuple
+
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+
+def create_web_claim(
+    claim_type: str,
+    claim_value: str,
+    source_url: str,
+    retrieval_agent: str,
+    confidence: str = "medium",
+    notes: Optional[str] = None,
+    raw_response: Optional[Dict] = None
+) -> Dict[str, Any]:
+    """
+    Assemble a web-claim record together with its provenance block.
+
+    A single UTC timestamp is taken once and reused for ``retrieved_on``,
+    ``statement_created_at`` and ``source_archived_at``.
+
+    Args:
+        claim_type: Type of claim (birth_date, publication, news_mention, etc.)
+        claim_value: The extracted value
+        source_url: URL where the data was found
+        retrieval_agent: Tool used (exa_web_search, linkup_search, etc.);
+            also recorded as the provenance ``retrieval_method``
+        confidence: Confidence level (high, medium, low, very_low)
+        notes: Optional free-text note; only stored when truthy
+        raw_response: Optional raw API response; only stored when truthy,
+            and only as the first 500 characters of its ``str()`` form
+
+    Returns:
+        Dict holding the claim fields plus a nested ``provenance`` dict
+    """
+    timestamp = datetime.now(timezone.utc).isoformat()
+
+    provenance: Dict[str, Any] = {
+        "statement_created_at": timestamp,
+        "source_archived_at": timestamp,  # Same time for API responses
+        "retrieval_method": retrieval_agent,
+    }
+    if raw_response:
+        # Keep only a short snippet for audit purposes to save space
+        provenance["response_snippet"] = str(raw_response)[:500]
+
+    record: Dict[str, Any] = {
+        "claim_type": claim_type,
+        "claim_value": claim_value,
+        "source_url": source_url,
+        "retrieved_on": timestamp,
+        "retrieval_agent": retrieval_agent,
+        "confidence": confidence,
+        "provenance": provenance,
+    }
+    if notes:
+        record["notes"] = notes
+
+    return record
+
+
+def extract_birth_year_from_text(text: str, full_name: str) -> Optional[Tuple[str, str]]:
+    """
+    Extract a birth year from free text using a fixed sequence of regex heuristics.
+
+    The patterns are tried in order and the first hit wins. All keyword and
+    number patterns are anchored (word boundaries / digit lookarounds) so that
+    they do not fire inside longer tokens such as "Feb. 2020", "stubborn 2020"
+    or "150 years old".
+
+    Args:
+        text: Text to scan (e.g. a search-result snippet).
+        full_name: Person's full name. Its last whitespace-separated token must
+            occur (case-insensitively, as a substring) in ``text``; otherwise
+            the text is treated as being about somebody else.
+
+    Returns:
+        Tuple of (birth_year_edtf, extraction_note), or None when nothing
+        matched. Age-based estimates carry a trailing ``~`` (EDTF approximate).
+    """
+    if not text:
+        return None
+
+    # Normalize text
+    text_lower = text.lower()
+    name_parts = full_name.lower().split()
+    last_name = name_parts[-1] if name_parts else ""
+
+    # Check if the text is about the right person (basic check)
+    if last_name and last_name not in text_lower:
+        return None
+
+    # Pattern 1: "born in YYYY" or "born YYYY"
+    born_match = re.search(r'\bborn\s+(?:in\s+)?(\d{4})(?!\d)', text_lower)
+    if born_match:
+        year = born_match.group(1)
+        return (year, f"Extracted from 'born {year}' pattern")
+
+    # Pattern 2: "(YYYY - )" or "(YYYY-)" indicating birth year
+    birth_dash_match = re.search(r'\((\d{4})\s*[-–—]\s*\)', text)
+    if birth_dash_match:
+        year = birth_dash_match.group(1)
+        return (year, f"Extracted from '({year} - )' lifespan pattern")
+
+    # Pattern 3: "b. YYYY" (or "born" directly followed by the year).
+    # The leading \b keeps abbreviations ending in "b." (e.g. "Feb.") from matching.
+    b_match = re.search(r'\b(?:b\.|born)\s*(\d{4})(?!\d)', text_lower)
+    if b_match:
+        year = b_match.group(1)
+        return (year, f"Extracted from 'b. {year}' pattern")
+
+    # Pattern 4: Age patterns "X years old" / "X jaar oud".
+    # The lookbehind prevents reading "50" out of "150 years old".
+    age_match = re.search(r'(?<!\d)(\d{1,2})\s*(?:years?\s*old|jaar\s*oud)', text_lower)
+    if age_match:
+        age = int(age_match.group(1))
+        if 20 <= age <= 100:  # Reasonable age range
+            # Relative to today, not to the text's publication date, hence "~"
+            current_year = datetime.now().year
+            estimated_birth = current_year - age
+            return (f"{estimated_birth}~", f"Estimated from age {age} (approximate)")
+
+    # Pattern 5: Birthday patterns "birthday: Month DD, YYYY"
+    birthday_match = re.search(
+        r'\b(?:birthday|geboren|date of birth)[:\s]+(?:\w+\s+\d{1,2},?\s+)?(\d{4})(?!\d)',
+        text_lower
+    )
+    if birthday_match:
+        year = birthday_match.group(1)
+        return (year, "Extracted from birthday/geboren pattern")
+
+    return None
+
+
+def extract_publications_from_text(text: str, full_name: str) -> List[Dict[str, str]]:
+    """
+    Extract publication-related identifiers (DOIs, ORCID iD) from text.
+
+    DOIs are cleaned of trailing sentence punctuation and unbalanced closing
+    brackets (so "(doi:10.1000/xyz)." yields "10.1000/xyz"), de-duplicated in
+    order of first appearance, and capped at five. At most one ORCID iD (the
+    first ``orcid.org/...`` occurrence) is reported.
+
+    Args:
+        text: Text to scan (e.g. a search-result snippet).
+        full_name: Person's full name. Currently unused; kept so all
+            extractors share the same call shape.
+
+    Returns:
+        List of dicts with keys ``type`` ("doi" or "orcid"), ``value``, ``note``
+    """
+    publications: List[Dict[str, str]] = []
+
+    if not text:
+        return publications
+
+    closing_to_opening = {")": "(", "]": "[", "}": "{", ">": "<"}
+
+    # Look for DOI patterns
+    unique_dois: List[str] = []
+    for raw_doi in re.findall(r'10\.\d{4,}/[^\s]+', text):
+        doi = raw_doi
+        while True:
+            # Drop punctuation that belongs to the surrounding sentence
+            trimmed = doi.rstrip(".,;:'\"")
+            last_char = trimmed[-1:]
+            opener = closing_to_opening.get(last_char)
+            # A closing bracket with no matching opener inside the DOI is
+            # part of the surrounding text, not of the identifier
+            if opener is not None and trimmed.count(last_char) > trimmed.count(opener):
+                trimmed = trimmed[:-1]
+            if trimmed == doi:
+                break
+            doi = trimmed
+        # Skip matches that have no suffix left after cleaning ("10.1234/")
+        if not doi.partition("/")[2]:
+            continue
+        if doi not in unique_dois:
+            unique_dois.append(doi)
+
+    for doi in unique_dois[:5]:  # Limit to 5
+        publications.append({
+            "type": "doi",
+            "value": doi,
+            "note": "DOI found in search results"
+        })
+
+    # Look for ORCID patterns
+    orcid_match = re.search(r'orcid\.org/(\d{4}-\d{4}-\d{4}-\d{3}[\dX])', text)
+    if orcid_match:
+        publications.append({
+            "type": "orcid",
+            "value": orcid_match.group(1),
+            "note": "ORCID identifier found"
+        })
+
+    return publications
+
+
+def search_birth_date_exa(full_name: str, context_hints: List[str] = None) -> Optional[Dict]:
+    """
+    Describe an Exa AI web search for a person's birth date.
+
+    No search is performed here: the function only returns a query spec
+    (query string, tool name, and a "pending_mcp_call" status) that is meant
+    to be executed through an MCP tool call.
+
+    Args:
+        full_name: Person's full name; quoted in the query for exact matching.
+        context_hints: Optional extra search terms; only the first two are used.
+    """
+    # Up to 2 context hints are appended after the fixed keywords
+    extra_terms = context_hints[:2] if context_hints else []
+    search_text = " ".join([f'"{full_name}"', "born", "birthday", *extra_terms])
+
+    # This would be replaced with actual MCP call:
+    # result = exa_web_search_exa(query=query, numResults=5)
+
+    return {
+        "query": search_text,
+        "tool": "exa_web_search_exa",
+        "status": "pending_mcp_call"
+    }
+
+
+def search_publications_exa(full_name: str, institution: str = None) -> Optional[Dict]:
+    """
+    Describe an Exa AI search for a person's publications.
+
+    Only builds and returns the query spec; nothing is executed here.
+    The institution, when given, is placed between the quoted name and the
+    fixed keywords.
+    """
+    institution_terms = [institution] if institution else []
+    search_text = " ".join(
+        [f'"{full_name}"', *institution_terms, "publications", "research", "ORCID"]
+    )
+
+    return {
+        "query": search_text,
+        "tool": "exa_web_search_exa",
+        "status": "pending_mcp_call"
+    }
+
+
+def search_news_mentions_exa(full_name: str, institution: str = None) -> Optional[Dict]:
+    """
+    Describe an Exa AI search for news mentions of a person.
+
+    Only builds and returns the query spec; nothing is executed here.
+    The query is the quoted name, followed by the institution when given.
+    """
+    quoted_name = f'"{full_name}"'
+    search_text = f"{quoted_name} {institution}" if institution else quoted_name
+
+    return {
+        "query": search_text,
+        "tool": "exa_web_search_exa",
+        "status": "pending_mcp_call"
+    }
+
+
+def get_person_context(ppid_data: Dict) -> Dict[str, Any]:
+    """
+    Extract context from PPID data for better search queries.
+
+    Every nested section is read null-safely: a key that is missing or whose
+    JSON value is ``null`` is treated as empty, matching how ``experience``
+    and ``education`` were already handled.
+
+    Args:
+        ppid_data: Parsed PPID record. Reads ``name.full_name`` and, under
+            ``profile_data``, ``linkedin_url``, ``location``, ``skills``,
+            ``experience[].company`` / ``.title`` and ``education[].institution``.
+
+    Returns:
+        Dict with keys ``full_name`` (str, "" when unknown), ``institutions``
+        and ``roles`` (de-duplicated, order-preserving, max 5 each),
+        ``location``, ``linkedin_url`` and ``skills`` (max 10).
+    """
+    name_data = ppid_data.get("name") or {}
+    profile = ppid_data.get("profile_data") or {}
+
+    institutions: List[Any] = []
+    roles: List[Any] = []
+
+    # Extract institutions (and the role held there) from experience
+    for exp in profile.get("experience") or []:
+        if exp and exp.get("company"):
+            institutions.append(exp["company"])
+            if exp.get("title"):
+                roles.append(exp["title"])
+
+    # Extract from education
+    for edu in profile.get("education") or []:
+        if edu and edu.get("institution"):
+            institutions.append(edu["institution"])
+
+    return {
+        "full_name": name_data.get("full_name") or "",
+        # dict.fromkeys de-duplicates while keeping first-seen order
+        "institutions": list(dict.fromkeys(institutions))[:5],
+        "roles": list(dict.fromkeys(roles))[:5],
+        "location": profile.get("location"),
+        "linkedin_url": profile.get("linkedin_url"),
+        "skills": (profile.get("skills") or [])[:10],  # Top 10 skills
+    }
+
+
+def build_enrichment_queries(ppid_data: Dict) -> List[Dict[str, Any]]:
+    """
+    Build a list of enrichment queries for a PPID.
+
+    Nothing is executed here; each returned dict is a query spec
+    (``type``, ``query``, ``tool``, ``priority`` and, for birth-date queries,
+    ``context_hints``) to be run via MCP tools.
+
+    Queries produced:
+        1. ``birth_date``    - only when the birth date is unknown ("XXXX",
+           missing or null) and no earlier search attempt is recorded.
+        2. ``publications``  - only when a role title contains an academic keyword.
+        3. ``news_mentions`` - only when at least one institution is known.
+        4. ``wikidata``      - always.
+
+    Args:
+        ppid_data: Parsed PPID record.
+
+    Returns:
+        List of query specs; empty when the record has no full name.
+    """
+    context = get_person_context(ppid_data)
+    full_name = context["full_name"]
+
+    if not full_name:
+        return []
+
+    queries = []
+    institutions = context["institutions"]
+
+    # 1. Birth date search (only if not already known).
+    # `or` fallbacks make JSON nulls behave like missing keys instead of crashing.
+    birth_date = (ppid_data.get("birth_date") or {}).get("edtf") or "XXXX"
+    enrichment_meta = (
+        (ppid_data.get("enrichment_metadata") or {}).get("birth_date_search") or {}
+    )
+
+    if birth_date == "XXXX" and not enrichment_meta.get("attempted"):
+        # Build birth date query with context
+        hints = []
+        if institutions:
+            hints.append(institutions[0])
+        location = context["location"]
+        if isinstance(location, str) and location:
+            # Keep only the part before the first comma
+            hints.append(location.split(",")[0])
+
+        queries.append({
+            "type": "birth_date",
+            "query": f'"{full_name}" born birthday biography',
+            "context_hints": hints,
+            "tool": "exa_web_search_exa",
+            "priority": "high"
+        })
+
+    # 2. Publications search (for academics/researchers)
+    academic_keywords = ["professor", "researcher", "phd", "doctor", "lecturer", "scientist"]
+    roles_text = " ".join(str(role) for role in context["roles"]).lower()
+    is_academic = any(kw in roles_text for kw in academic_keywords)
+
+    if is_academic:
+        institution = institutions[0] if institutions else ""
+        # Avoid a double space in the query when no institution is known
+        if institution:
+            publications_query = f'"{full_name}" {institution} publications ORCID research'
+        else:
+            publications_query = f'"{full_name}" publications ORCID research'
+        queries.append({
+            "type": "publications",
+            "query": publications_query,
+            "tool": "exa_web_search_exa",
+            "priority": "medium"
+        })
+
+    # 3. News/press mentions
+    if institutions:
+        queries.append({
+            "type": "news_mentions",
+            "query": f'"{full_name}" {institutions[0]}',
+            "tool": "exa_web_search_exa",
+            "priority": "low"
+        })
+
+    # 4. Wikidata search (for notable persons)
+    queries.append({
+        "type": "wikidata",
+        "query": full_name,
+        "tool": "wikidata_search_entity",
+        "priority": "medium"
+    })
+
+    return queries
+
+
+def process_search_result(
+    result: Dict[str, Any],
+    query_type: str,
+    full_name: str,
+    ppid_data: Dict
+) -> List[Dict[str, Any]]:
+    """
+    Process a search result and extract web claims.
+
+    Args:
+        result: Raw search result. A dict is read via its ``text``/``content``
+            and ``url``/``source_url`` keys; a plain string is used as the
+            text with an empty source URL.
+        query_type: Type of query. ``birth_date``, ``publications`` and
+            ``news_mentions`` are handled; any other value yields no claims.
+        full_name: Person's full name
+        ppid_data: Current PPID data (currently unused)
+
+    Returns:
+        List of web claims to add (possibly empty)
+    """
+    claims = []
+
+    if not result:
+        return claims
+
+    # Extract text content from result
+    text = ""
+    source_url = ""
+
+    if isinstance(result, dict):
+        text = result.get("text", "") or result.get("content", "") or ""
+        source_url = result.get("url", "") or result.get("source_url", "")
+    elif isinstance(result, str):
+        text = result
+
+    if query_type == "birth_date":
+        birth_info = extract_birth_year_from_text(text, full_name)
+        if birth_info:
+            year, note = birth_info
+            claims.append(create_web_claim(
+                claim_type="birth_year",
+                claim_value=year,
+                source_url=source_url,
+                retrieval_agent="exa_web_search_exa",
+                # A trailing "~" marks an approximate (age-derived) year
+                confidence="medium" if "~" not in year else "low",
+                notes=note,
+                raw_response={"text_snippet": text[:200]}
+            ))
+
+    elif query_type == "publications":
+        pubs = extract_publications_from_text(text, full_name)
+        for pub in pubs:
+            claims.append(create_web_claim(
+                claim_type=f"identifier_{pub['type']}",
+                claim_value=pub["value"],
+                source_url=source_url,
+                retrieval_agent="exa_web_search_exa",
+                confidence="high" if pub["type"] in ["doi", "orcid"] else "medium",
+                notes=pub.get("note")
+            ))
+
+    elif query_type == "news_mentions":
+        # For news, we just record the mention.
+        # An empty name must not count: "" is a substring of every text and
+        # would otherwise turn any result into a mention.
+        has_name = bool(full_name and full_name.strip())
+        if has_name and full_name.lower() in text.lower():
+            claims.append(create_web_claim(
+                claim_type="news_mention",
+                claim_value=text[:500],  # First 500 chars
+                source_url=source_url,
+                retrieval_agent="exa_web_search_exa",
+                confidence="medium",
+                notes="News/press mention found"
+            ))
+
+    return claims
+
+
+def enrich_ppid_file(
+    filepath: Path,
+    dry_run: bool = False,
+    verbose: bool = False
+) -> Dict[str, Any]:
+    """
+    Load one PPID JSON file and build its pending web-enrichment queries.
+
+    The queries are only built, never executed, and the file is never
+    modified; execution is expected to happen via MCP tools in the calling
+    context. ``dry_run`` is accepted but not consulted here.
+
+    Returns:
+        Dict with ``filepath``, ``queries_built``, ``claims_added`` (always 0
+        here), ``errors`` (read/parse failures) and ``pending_queries``
+    """
+    report: Dict[str, Any] = {
+        "filepath": str(filepath),
+        "queries_built": 0,
+        "claims_added": 0,
+        "errors": [],
+        "pending_queries": []
+    }
+
+    try:
+        with open(filepath, "r", encoding="utf-8") as handle:
+            payload = json.load(handle)
+    except Exception as exc:
+        # Read/parse problems are reported to the caller instead of raised
+        report["errors"].append(f"Failed to read file: {exc}")
+        return report
+
+    # Build enrichment queries
+    pending = build_enrichment_queries(payload)
+    report.update(queries_built=len(pending), pending_queries=pending)
+
+    if verbose:
+        print(f"  Built {len(pending)} queries for {filepath.name}")
+        for spec in pending:
+            print(f"    - {spec['type']}: {spec['query'][:50]}...")
+
+    return report
+
+
+def main():
+    """
+    CLI entry point: build web-enrichment queries for PPID files.
+
+    Scans the person directory for ``ID_*.json`` files (optionally narrowed
+    by ``--sample`` and ``--limit``), builds the enrichment queries for each
+    file, prints a summary, and - unless ``--dry-run`` is given - writes the
+    queries of the requested types to ``pending_web_queries.json`` next to
+    the person directory.
+    """
+    parser = argparse.ArgumentParser(
+        description="Enrich PPID files with web-sourced claims (Rule 26 compliant)"
+    )
+    parser.add_argument("--dry-run", action="store_true", help="Don't write changes")
+    parser.add_argument("--limit", type=int, help="Process only N files")
+    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+    parser.add_argument("--sample", type=str, help="Process specific linkedin_slug")
+    parser.add_argument(
+        "--query-types",
+        type=str,
+        default="birth_date,publications,news_mentions,wikidata",
+        help="Comma-separated list of query types to run"
+    )
+    args = parser.parse_args()
+
+    if args.limit is not None and args.limit < 0:
+        parser.error("--limit must be non-negative")
+
+    person_dir = Path("/Users/kempersc/apps/glam/data/person")
+
+    # Get PPID files
+    if args.sample:
+        # Find file by linkedin slug
+        ppid_files = list(person_dir.glob(f"ID_*{args.sample.upper()}*.json"))
+        if not ppid_files:
+            # Try case-insensitive search
+            ppid_files = [
+                f for f in person_dir.glob("ID_*.json")
+                if args.sample.lower() in f.stem.lower()
+            ]
+    else:
+        ppid_files = list(person_dir.glob("ID_*.json"))
+
+    # glob() order is filesystem-dependent; sort so --limit selects a
+    # reproducible subset
+    ppid_files.sort()
+
+    # `is not None` so that an explicit `--limit 0` is honoured
+    if args.limit is not None:
+        ppid_files = ppid_files[:args.limit]
+
+    print(f"Processing {len(ppid_files)} PPID files for web enrichment...")
+    if args.dry_run:
+        print("DRY RUN - no changes will be written")
+
+    # Tolerate whitespace and empty items, e.g. "birth_date, wikidata,"
+    query_types = {qt.strip() for qt in args.query_types.split(",") if qt.strip()}
+    print(f"Query types: {query_types}")
+
+    # Statistics
+    total_stats = {
+        "processed": 0,
+        "queries_built": 0,
+        "by_type": {qt: 0 for qt in sorted(query_types)},
+        "errors": 0,
+    }
+
+    all_pending_queries = []
+
+    for i, filepath in enumerate(ppid_files):
+        try:
+            stats = enrich_ppid_file(filepath, dry_run=args.dry_run, verbose=args.verbose)
+            total_stats["processed"] += 1
+            # Counts every query built, including types filtered out below
+            total_stats["queries_built"] += stats["queries_built"]
+
+            # Filter queries by requested types
+            for q in stats["pending_queries"]:
+                if q["type"] in query_types:
+                    total_stats["by_type"][q["type"]] += 1
+                    all_pending_queries.append({
+                        "filepath": stats["filepath"],
+                        **q
+                    })
+
+            if stats["errors"]:
+                total_stats["errors"] += 1
+                if args.verbose:
+                    print(f"  ERROR {filepath.name}: {stats['errors']}")
+
+            if (i + 1) % 100 == 0:
+                print(f"  Processed {i + 1}/{len(ppid_files)}...")
+
+        except Exception as e:
+            # Top-level boundary: one bad file must not abort the whole run
+            total_stats["errors"] += 1
+            if args.verbose:
+                print(f"  ERROR {filepath.name}: {e}")
+
+    # Print summary
+    print("\n" + "=" * 60)
+    print("WEB ENRICHMENT QUERY SUMMARY")
+    print("=" * 60)
+    print(f"Processed:        {total_stats['processed']}")
+    print(f"Queries built:    {total_stats['queries_built']}")
+    print("By query type:")
+    for qt, count in total_stats["by_type"].items():
+        print(f"  - {qt}: {count}")
+    print(f"Errors:           {total_stats['errors']}")
+
+    # Output pending queries for MCP execution
+    if all_pending_queries and not args.dry_run:
+        output_file = person_dir.parent / "pending_web_queries.json"
+        with open(output_file, "w", encoding="utf-8") as f:
+            json.dump({
+                "generated_at": datetime.now(timezone.utc).isoformat(),
+                "total_queries": len(all_pending_queries),
+                "queries": all_pending_queries
+            }, f, indent=2, ensure_ascii=False)
+        print(f"\nPending queries saved to: {output_file}")
+        # This parser defines no option for applying results, so none is advertised
+        print("Execute these queries via MCP tools; applying results is not yet implemented in this script.")
+
+    print("\nNote: This script builds queries. Execute via MCP tools:")
+    print("  - exa_web_search_exa for birth_date, publications, news_mentions")
+    print("  - wikidata_search_entity for wikidata matching")
+
+if __name__ == "__main__":
+    main()
--- a/scripts/resolve_pending_known_orgs.py
+++ b/scripts/resolve_pending_known_orgs.py
@ -21,9 +21,76 @@ CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

 # Known organizations with their locations
 # Format: 'normalized_name': (province, city_code, city_name, inst_type)
+# Province codes: NH=Noord-Holland, ZH=Zuid-Holland, UT=Utrecht, GE=Gelderland,
+#                 NB=Noord-Brabant, LI=Limburg, OV=Overijssel, FR=Friesland,
+#                 DR=Drenthe, GR=Groningen, ZE=Zeeland, FL=Flevoland
+# Foreign: Use country code (BE, DE, FR, DK, IT, GB, US, etc.) as first element
 KNOWN_ORGS = {
-    # Museums
+    # ==========================================================================
+    # MUSEUMS - Netherlands
+    # ==========================================================================
    'amsterdamse school museum het schip': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'hunebedcentrum': ('DR', 'BOR', 'Borger', 'M'),
+    'museum flehite': ('UT', 'AME', 'Amersfoort', 'M'),
+    'museum batavialand': ('FL', 'LEL', 'Lelystad', 'M'),
+    'batavialand': ('FL', 'LEL', 'Lelystad', 'M'),
+    'jewish cultural quarter': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'joods cultureel kwartier': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'museum catharijneconvent': ('UT', 'UTR', 'Utrecht', 'M'),
+    'museum speelklok': ('UT', 'UTR', 'Utrecht', 'M'),
+    'museum rembrandthuis': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'rembrandthuis': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'nieuwe instituut': ('ZH', 'ROT', 'Rotterdam', 'M'),
+    'het nieuwe instituut': ('ZH', 'ROT', 'Rotterdam', 'M'),
+    'museum van loon': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'museum voorlinden': ('ZH', 'WAS', 'Wassenaar', 'M'),
+    'museum belvedere': ('FR', 'HEE', 'Heerenveen', 'M'),
+    'museum more': ('GE', 'GOR', 'Gorssel', 'M'),
+    'lam museum': ('ZH', 'LIS', 'Lisse', 'M'),
+    'lisser art museum': ('ZH', 'LIS', 'Lisse', 'M'),
+    'lisser art museum lam': ('ZH', 'LIS', 'Lisse', 'M'),
+    'nxt museum': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'nationaal onderduikmuseum': ('GE', 'AAL', 'Aalten', 'M'),
+    'lantarenvenster': ('ZH', 'ROT', 'Rotterdam', 'E'),
+    'loosduins museum': ('ZH', 'DHA', 'Den Haag', 'M'),
+    'louis couperus museum': ('ZH', 'DHA', 'Den Haag', 'M'),
+    'museum bredius': ('ZH', 'DHA', 'Den Haag', 'M'),
+    'museum broekerveiling': ('NH', 'LAN', 'Langedijk', 'M'),
+    'broekerveiling': ('NH', 'LAN', 'Langedijk', 'M'),
+    'museum bronbeek': ('GE', 'ARN', 'Arnhem', 'M'),
+    'museum de bastei': ('GE', 'NIJ', 'Nijmegen', 'M'),
+    'museum amstelland': ('NH', 'AMS', 'Amstelveen', 'M'),
+    'museum cobra': ('NH', 'AMV', 'Amstelveen', 'M'),
+    'cobra museum': ('NH', 'AMV', 'Amstelveen', 'M'),
+    'cobra museum voor moderne kunst amstelveen': ('NH', 'AMV', 'Amstelveen', 'M'),
+    'museum aan de a': ('GR', 'GRO', 'Groningen', 'M'),
+    'museum helmantel': ('GR', 'WES', 'Westeremden', 'M'),
+    'museum hert fan fryslan': ('FR', 'LEE', 'Leeuwarden', 'M'),
+    'museum het pakhuis': ('NH', 'HOO', 'Hoorn', 'M'),
+    'museum huys der kunsten': ('NB', 'ROO', 'Roosendaal', 'M'),
+    'museum maluku': ('UT', 'UTR', 'Utrecht', 'M'),
+    'museum martena': ('FR', 'FRA', 'Franeker', 'M'),
+    'museum nairac': ('GE', 'BAR', 'Barneveld', 'M'),
+    'museum slager': ('NB', 'BOS', 's-Hertogenbosch', 'M'),
+    'museum smedekinck': ('GE', 'ZEL', 'Zelhem', 'M'),
+    'museum staal': ('GE', 'ALM', 'Almere', 'M'),
+    'museum cafe het pomphuis': ('ZE', 'GOE', 'Goes', 'E'),  # Restaurant/cafe, not museum
+    'museum de looierij': ('NH', 'AMS', 'Amsterdam', 'M'),  # Westzaan area
+    'museum de proefkolonie': ('DR', 'FRE', 'Frederiksoord', 'M'),
+    'museum de speeltoren': ('GE', 'NIJ', 'Nijmegen', 'M'),  # Actually in Monnickendam
+    'museum fiskershuske': ('FR', 'MOD', 'Moddergat', 'M'),
+    'museum stedhus sleat': ('FR', 'SLO', 'Sloten', 'M'),
+    'museumppassmusees': ('BE', 'BRU', 'Brussels', 'O'),  # Belgium - museum pass
+    'kroller muller museum': ('GE', 'OTT', 'Otterlo', 'M'),
+    'museum swaensteyn': ('ZH', 'VOR', 'Voorburg', 'M'),
+    'museum van de vrouw': ('NB', 'EER', 'Eersel', 'M'),
+    'oorlogsmuseum medemblik': ('NH', 'MED', 'Medemblik', 'M'),
+    'nac museum': ('NB', 'BRE', 'Breda', 'M'),
+    'nationaal baggermuseum': ('ZH', 'SLI', 'Sliedrecht', 'M'),
+    'nationaal restauratiefonds': ('UT', 'AME', 'Amersfoort', 'N'),
+    'nederlands steendrukmuseum': ('GE', 'VAL', 'Valburg', 'M'),
+    'nederlands stoommachinemuseum': ('GE', 'MED', 'Medemblik', 'M'),
+    'pieter vermeulen museum': ('DR', 'MED', 'Diever', 'M'),
    'bonnefanten': ('LI', 'MAA', 'Maastricht', 'M'),
    'bonami spelcomputer museum': ('OV', 'ZWO', 'Zwolle', 'M'),
    'bakkerijmuseum de oude bakkerij': ('NH', 'MED', 'Medemblik', 'M'),
@ -31,7 +98,6 @@ KNOWN_ORGS = {
    'coda museum': ('GE', 'APE', 'Apeldoorn', 'M'),
    'comm museum voor communicatie': ('ZH', 'DHA', 'Den Haag', 'M'),
    'cruquius museum': ('NH', 'HAA', 'Haarlemmermeer', 'M'),
-    'diva museum': ('BE', 'ANT', 'Antwerpen', 'M'),  # Belgium
    'dordrechts museum': ('ZH', 'DOR', 'Dordrecht', 'M'),
    'dutch museum of freemasonry': ('ZH', 'DHA', 'Den Haag', 'M'),
    'eise eisinga planetarium': ('FR', 'FRA', 'Franeker', 'M'),
@ -102,55 +168,588 @@ KNOWN_ORGS = {
    'rijksmuseum boerhaave': ('ZH', 'LEI', 'Leiden', 'M'),
    'rijksmuseum twenthe': ('OV', 'ENS', 'Enschede', 'M'),
    'singer laren': ('NH', 'LAR', 'Laren', 'M'),
+    'singer museum': ('NH', 'LAR', 'Laren', 'M'),
    'sonnenborgh museum': ('UT', 'UTR', 'Utrecht', 'M'),
    'zeeuws museum': ('ZE', 'MID', 'Middelburg', 'M'),
    
-    # Libraries
+    # Additional museums from PENDING list
+    'het scheepvaartmuseum': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'hash marihuana hemp museum': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'hash marihuana en hemp museum': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'van gogh village museum': ('NB', 'NUE', 'Nuenen', 'M'),
+    'retro computer museum': ('GE', 'ARN', 'Arnhem', 'M'),
+    'haags bus museum': ('ZH', 'DHA', 'Den Haag', 'M'),
+    'het romeins museum': ('GE', 'NIJ', 'Nijmegen', 'M'),
+    'hendrick hamel museum': ('GR', 'GOR', 'Gorinchem', 'M'),
+    'graphic design museum': ('NB', 'BRE', 'Breda', 'M'),
+    'vliegend museum seppe': ('NB', 'BOS', 'Bosschenhoofd', 'M'),
+    'zoological museum netherlands': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'world of cannabis museum project': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'stichting museum 1940 1945': ('ZH', 'DOR', 'Dordrecht', 'M'),
+    'stichting museum menkemaborg': ('GR', 'UIT', 'Uithuizen', 'M'),
+    'stichting pak museum': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'stichting museum blokhuispoort': ('FR', 'LEE', 'Leeuwarden', 'M'),
+    'sculptuur instituut': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'gelders restauratie centrum': ('GE', 'ARN', 'Arnhem', 'R'),
+    
+    # ==========================================================================
+    # LIBRARIES
+    # ==========================================================================
    'de bblthk': ('GE', 'WAG', 'Wageningen', 'L'),
    'kb nationale bibliotheek': ('ZH', 'DHA', 'Den Haag', 'L'),
+    'bplusc': ('ZH', 'LEI', 'Leiden', 'L'),
    
-    # Archives
+    # ==========================================================================
+    # ARCHIVES
+    # ==========================================================================
    'digitar het online archief': ('UT', 'UTR', 'Utrecht', 'D'),
+    'the black archives': ('NH', 'AMS', 'Amsterdam', 'A'),
+    'archivesspace': ('US', 'NYC', 'New York', 'D'),  # US-based software
    
-    # Organizations (stichtingen, etc.)
+    # ==========================================================================
+    # NATURE & ENVIRONMENTAL ORGANIZATIONS
+    # ==========================================================================
+    'staatsbosbeheer': ('UT', 'AME', 'Amersfoort', 'O'),
+    'vogelbescherming nederland': ('UT', 'ZEI', 'Zeist', 'N'),
+    'waddenvereniging': ('FR', 'HAR', 'Harlingen', 'N'),
+    'trees for all': ('UT', 'UTR', 'Utrecht', 'N'),
+    'natuurmonumenten': ('UT', 'AME', 'Amersfoort', 'N'),
+    'vereniging natuurmonumenten': ('UT', 'AME', 'Amersfoort', 'N'),
+    'it fryske gea': ('FR', 'BEE', 'Beetsterzwaag', 'N'),
+    'landschappennl': ('UT', 'UTR', 'Utrecht', 'N'),
+    'land van ons': ('UT', 'UTR', 'Utrecht', 'N'),
+    'natuurbegraven nederland': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'natuuropleiding': ('NH', 'AMS', 'Amsterdam', 'E'),
+    'obn natuurkennis': ('DR', 'ASS', 'Assen', 'R'),
+    'ravon': ('GE', 'NIJ', 'Nijmegen', 'R'),
+    'norminstituut bomen': ('UT', 'UTR', 'Utrecht', 'R'),
+    'nationale bomenbank b v': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'native plant trust': ('US', 'BOS', 'Boston', 'N'),  # US
+    'kiss the ground': ('US', 'LAX', 'Los Angeles', 'N'),  # US
+    'national coalition for natural farming': ('IN', 'DEL', 'Delhi', 'N'),  # India
+    'lpo provence alpes cote d azur': ('FR', 'AIX', 'Aix-en-Provence', 'N'),  # France
+    'picardie nature': ('FR', 'AMI', 'Amiens', 'N'),  # France
+    'parc national des pyrenees': ('FR', 'TAR', 'Tarbes', 'N'),  # France
+    'bumblebee conservation trust': ('GB', 'STI', 'Stirling', 'N'),  # UK
+    'botanic gardens conservation international': ('GB', 'KEW', 'Kew', 'N'),  # UK
+    'save our seas foundation sosf': ('ZA', 'CPT', 'Cape Town', 'N'),  # South Africa
+    'ferus ours loup lynx conservation': ('FR', 'PAR', 'Paris', 'N'),  # France
+    'european arboricultural council': ('BE', 'BRU', 'Brussels', 'N'),  # Belgium
+    'caring farmers': ('UT', 'UTR', 'Utrecht', 'N'),
+    'collectief natuurinclusief': ('UT', 'UTR', 'Utrecht', 'N'),
+    'stichting rechten van de natuur': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'deltaplan agrarisch waterbeheer daw': ('UT', 'UTR', 'Utrecht', 'N'),
+    'boerenverstand onderzoek advies': ('GE', 'WAG', 'Wageningen', 'R'),
+    'cruydt hoeck': ('GR', 'NIJ', 'Nijeholtpade', 'C'),
+    
+    # ==========================================================================
+    # HERITAGE & HISTORICAL SOCIETIES
+    # ==========================================================================
    '3 october vereeniging': ('ZH', 'LEI', 'Leiden', 'S'),
+    'historische vereniging delfia batavorum': ('ZH', 'DEL', 'Delft', 'S'),
+    'historische vereniging koog zaandijk': ('NH', 'ZAA', 'Zaandijk', 'S'),
+    'historische vereniging oud stolwijck': ('ZH', 'STO', 'Stolwijk', 'S'),
+    'historische vereniging voorst': ('GE', 'VOO', 'Voorst', 'S'),
+    'historische vereniging wormerveer': ('NH', 'WOR', 'Wormerveer', 'S'),
+    'heemkunde vereniging borne': ('OV', 'BOR', 'Borne', 'S'),
+    'heemkunde vlaanderen': ('BE', 'ANT', 'Antwerpen', 'S'),  # Belgium
+    'hendrick de keyser monumenten': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'vereniging particuliere historische buitenplaatsen': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'werkgroep adelsgeschiedenis': ('NH', 'AMS', 'Amsterdam', 'S'),
+    'stichting oude groninger kerken': ('GR', 'GRO', 'Groningen', 'N'),
+    'studiecentrum eerste wereldoorlog': ('BE', 'BRU', 'Brussels', 'R'),  # Belgium
+    'sobibor foundation': ('NH', 'AMS', 'Amsterdam', 'N'),
+    
+    # ==========================================================================
+    # STICHTINGEN & FOUNDATIONS
+    # ==========================================================================
    'abdij o l v koningshoeven': ('NB', 'TIL', 'Tilburg', 'H'),
-    'amphion cultuurbedrijf': ('GE', 'DOE', 'Doetinchem', 'E'),
    'bijenstichting': ('UT', 'UTR', 'Utrecht', 'N'),
    'bomenstichting': ('UT', 'UTR', 'Utrecht', 'N'),
    'boerennatuur': ('UT', 'UTR', 'Utrecht', 'N'),
-    'cbg': ('ZH', 'DHA', 'Den Haag', 'R'),  # Central Bureau for Genealogy
-    'creatieve hubs nederland': ('NH', 'AMS', 'Amsterdam', 'O'),
-    'de commandostichting': ('NH', 'HAA', 'Haarlem', 'N'),
-    'defabrique evenementenlocatie': ('UT', 'UTR', 'Utrecht', 'E'),
-    'delamar': ('NH', 'AMS', 'Amsterdam', 'E'),
-    'den kennisinstituut cultuur digitale transformatie': ('NH', 'AMS', 'Amsterdam', 'R'),
-    'dutch national opera ballet': ('NH', 'AMS', 'Amsterdam', 'E'),
-    'expertisecentrum literair vertalen elv': ('NH', 'AMS', 'Amsterdam', 'R'),
-    'fim federatie instandhouding monumenten': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'stichting amelander musea': ('FR', 'AME', 'Ameland', 'M'),
+    'stichting confro': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'stichting de zaanse schans': ('NH', 'ZAA', 'Zaandam', 'M'),
+    'stichting dioraphte': ('UT', 'UTR', 'Utrecht', 'N'),
+    'stichting koninklijke defensiemusea': ('ZH', 'DHA', 'Den Haag', 'M'),
+    'stichting kunst cultuur': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'stichting texels museum': ('NH', 'TEX', 'Texel', 'M'),
+    'stichting twisca': ('OV', 'TWI', 'Twisk', 'N'),
+    'stichting waddengroep': ('NH', 'DEN', 'Den Helder', 'N'),
+    'hartwig art foundation': ('NH', 'AMS', 'Amsterdam', 'N'),
    'fonds 21': ('UT', 'UTR', 'Utrecht', 'N'),
-    'framer framed': ('NH', 'AMS', 'Amsterdam', 'G'),
-    'ark rewilding nederland': ('GE', 'NIJ', 'Nijmegen', 'N'),
-    'centraal joods overleg cjo': ('NH', 'AMS', 'Amsterdam', 'N'),
-    'kenniscentrum immaterieel erfgoed nederland': ('NH', 'AMS', 'Amsterdam', 'R'),
-    'kenniscommunity informatie en archief': ('NH', 'AMS', 'Amsterdam', 'N'),
-    'koninklijke nederlandse academie van wetenschappen': ('NH', 'AMS', 'Amsterdam', 'R'),
    
-    # Research centers
+    # ==========================================================================
+    # RESEARCH CENTERS & KNOWLEDGE INSTITUTES
+    # ==========================================================================
    'adc archeoprojecten': ('GE', 'AME', 'Amersfoort', 'R'),
    'archol': ('ZH', 'LEI', 'Leiden', 'R'),
    'kitlv': ('ZH', 'LEI', 'Leiden', 'R'),
+    'cbg': ('ZH', 'DHA', 'Den Haag', 'R'),  # Central Bureau for Genealogy
+    'kenniscentrum immaterieel erfgoed nederland': ('NH', 'AMS', 'Amsterdam', 'R'),
+    'koninklijke nederlandse academie van wetenschappen': ('NH', 'AMS', 'Amsterdam', 'R'),
+    'den kennisinstituut cultuur digitale transformatie': ('NH', 'AMS', 'Amsterdam', 'R'),
+    'centre of expertise creative innovation': ('NH', 'AMS', 'Amsterdam', 'R'),
+    'huygens institute': ('NH', 'AMS', 'Amsterdam', 'R'),
+    'huygens instituut': ('NH', 'AMS', 'Amsterdam', 'R'),
+    'instituut voor de nederlandse taal': ('ZH', 'LEI', 'Leiden', 'R'),
+    'n w posthumus institute': ('NH', 'AMS', 'Amsterdam', 'R'),
+    'nicas netherlands institute for conservation art science': ('NH', 'AMS', 'Amsterdam', 'R'),
+    'raap': ('OV', 'ZWO', 'Zwolle', 'R'),
+    'restauratoren nederland': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'restauratieatelier restaura': ('LI', 'HAE', 'Haelen', 'C'),
+    'picturae': ('NH', 'HIL', 'Heiloo', 'C'),
+    'icom netherlands': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'icomos netherlands': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'international committee for documentation': ('FR', 'PAR', 'Paris', 'N'),
+    'museumvereniging': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'museumpeil': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'museumtijdschrift': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'monumentaal magazine over cultureel erfgoed': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'modemuze': ('NH', 'AMS', 'Amsterdam', 'D'),
+    'moebius museum software': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'platform drentse musea': ('DR', 'ASS', 'Assen', 'O'),
+    'public domain library': ('US', 'SFO', 'San Francisco', 'D'),  # US
+    'internet archive': ('US', 'SFO', 'San Francisco', 'A'),  # US
+    'society for artistic research': ('AT', 'VIE', 'Vienna', 'R'),  # Austria
+    'digital preservation coalition': ('GB', 'GLA', 'Glasgow', 'R'),  # UK
+    'the palaeontological association': ('GB', 'LON', 'London', 'R'),  # UK
+    'the society for archaeological sciences': ('US', 'TUC', 'Tucson', 'R'),  # US
+    'conflict research society': ('GB', 'LON', 'London', 'R'),  # UK
+    'stads en architectuurgeschiedenis uva': ('NH', 'AMS', 'Amsterdam', 'R'),
+    'agandau onderzoek in het archief': ('NH', 'AMS', 'Amsterdam', 'R'),
+    'anchise project horizon europe': ('FR', 'PAR', 'Paris', 'R'),  # France
+    'atrium advancing frontier research in the arts humanities': ('EU', 'BRU', 'Brussels', 'R'),  # EU
+    'biblissima': ('FR', 'PAR', 'Paris', 'R'),  # France
    
-    # Theaters/Venues
+    # ==========================================================================
+    # THEATERS & CULTURAL VENUES
+    # ==========================================================================
    'theater de veste': ('ZH', 'DEL', 'Delft', 'E'),
    'theater a d schie': ('ZH', 'SCH', 'Schiedam', 'E'),
+    'theater a d rijn': ('GE', 'ARN', 'Arnhem', 'E'),
+    'amphion cultuurbedrijf': ('GE', 'DOE', 'Doetinchem', 'E'),
+    'defabrique evenementenlocatie': ('UT', 'UTR', 'Utrecht', 'E'),
+    'delamar': ('NH', 'AMS', 'Amsterdam', 'E'),
+    'dutch national opera ballet': ('NH', 'AMS', 'Amsterdam', 'E'),
+    'theatergezelschap bontehond': ('NH', 'AMS', 'Amsterdam', 'E'),
+    'birds of paradise theatre company': ('GB', 'GLA', 'Glasgow', 'E'),  # UK
+    'yoann bourgeois art company': ('FR', 'LYO', 'Lyon', 'E'),  # France
+    'de grote post': ('BE', 'OST', 'Oostende', 'E'),  # Belgium
    
-    # Foreign organizations that should be reclassified
+    # ==========================================================================
+    # GALLERIES & ART SPACES
+    # ==========================================================================
+    'framer framed': ('NH', 'AMS', 'Amsterdam', 'G'),
+    'cemara 6 galeri museum': ('ID', 'JAK', 'Jakarta', 'G'),  # Indonesia
+    'vedica art studios and gallery': ('IN', 'DEL', 'Delhi', 'G'),  # India
+    
+    # ==========================================================================
+    # OFFICIAL INSTITUTIONS & GOVERNMENT
+    # ==========================================================================
+    'creatieve hubs nederland': ('NH', 'AMS', 'Amsterdam', 'O'),
+    'the dutch inspectorate of education': ('UT', 'UTR', 'Utrecht', 'O'),
+    'embassy of the netherlands in morocco': ('MA', 'RAB', 'Rabat', 'O'),  # Morocco
+    'gemeente nederweert': ('LI', 'NED', 'Nederweert', 'O'),
+    'house of european history': ('BE', 'BRU', 'Brussels', 'M'),  # Belgium
+    'european museum forum': ('PT', 'LIS', 'Lisbon', 'O'),  # Portugal
+    'docomomo international': ('PT', 'LIS', 'Lisbon', 'N'),  # Portugal
+    'culture action europe': ('BE', 'BRU', 'Brussels', 'N'),  # Belgium
+    'gbif the global biodiversity information facility': ('DK', 'CPH', 'Copenhagen', 'O'),  # Denmark
+    
+    # ==========================================================================
+    # JOURNALISM & MEDIA
+    # ==========================================================================
+    '11 11 media': ('NH', 'AMS', 'Amsterdam', 'C'),
+    '155 eenvijfvijf': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'archimag': ('FR', 'PAR', 'Paris', 'C'),  # France
+    'arte al dia': ('US', 'MIA', 'Miami', 'C'),  # US - Latin American art magazine
+    'exibart': ('IT', 'ROM', 'Rome', 'C'),  # Italy
+    'finestre sull arte': ('IT', 'FLO', 'Florence', 'C'),  # Italy
+
+    # ==========================================================================
+    # MISCLASSIFIED FOREIGN ORGS (NL prefix but foreign) and normalized-name variants (some Dutch)
+    # ==========================================================================
+    'her place womens museum': ('AU', 'MEL', 'Melbourne', 'M'),  # Australia
+    'her place women s museum': ('AU', 'MEL', 'Melbourne', 'M'),  # Australia - variant
+    'asociacion acre': ('ES', 'MAD', 'Madrid', 'N'),  # Spain
+    'asociacio n acre': ('ES', 'MAD', 'Madrid', 'N'),  # Spain - normalized
+    'la maison du theatre a brest': ('FR', 'BRE', 'Brest', 'E'),  # France
+    'la maison du the a tre a brest': ('FR', 'BRE', 'Brest', 'E'),  # France - normalized
+    'lpo provence alpes cote d azur': ('FR', 'AIX', 'Aix-en-Provence', 'N'),  # France
+    'lpo provence alpes co te d azur': ('FR', 'AIX', 'Aix-en-Provence', 'N'),  # France - normalized
+    'lucas laboratoire d usages culture s arts societe': ('FR', 'PAR', 'Paris', 'R'),  # France
+    'maison des metallos': ('FR', 'PAR', 'Paris', 'E'),  # France
+    'maison des me tallos': ('FR', 'PAR', 'Paris', 'E'),  # France - normalized
+    'stiftung trias gemeinnutzige stiftung fur boden okologie und wohnen': ('DE', 'HAT', 'Hattingen', 'N'),  # Germany
+    'stiftung trias': ('DE', 'HAT', 'Hattingen', 'N'),  # Germany - short name
+    'sothebys': ('GB', 'LON', 'London', 'C'),  # UK auction house
+    'sotheby s': ('GB', 'LON', 'London', 'C'),  # UK auction house - variant
+    'sothebys institute of art': ('GB', 'LON', 'London', 'E'),  # UK
+    'sotheby s institute of art': ('GB', 'LON', 'London', 'E'),  # UK - variant
+    'museumppassmusees': ('BE', 'BRU', 'Brussels', 'O'),  # Belgium
+    'museumpassmuse es': ('BE', 'BRU', 'Brussels', 'O'),  # Belgium - normalized
+    'museum stedhus sleat': ('FR', 'SLO', 'Sloten', 'M'),  # Friesland
+    'museum stedhu s sleat': ('FR', 'SLO', 'Sloten', 'M'),  # Friesland - normalized
+    'museum fiskershuske': ('FR', 'MOD', 'Moddergat', 'M'),  # Friesland
+    'museum fiskershu ske': ('FR', 'MOD', 'Moddergat', 'M'),  # Friesland - normalized
+    'arte al dia': ('US', 'MIA', 'Miami', 'C'),  # US - Latin American art magazine
+    'arte al di a': ('US', 'MIA', 'Miami', 'C'),  # US - normalized
+    'kroller muller museum': ('GE', 'OTT', 'Otterlo', 'M'),  # Already exists
+    'kro ller mu ller museum': ('GE', 'OTT', 'Otterlo', 'M'),  # Normalized
+    'representation of the netherlands in aruba curacao and sint maarten': ('NL', 'DHA', 'Den Haag', 'O'),
+    'representation of the netherlands in aruba curac ao and sint maarten': ('NL', 'DHA', 'Den Haag', 'O'),  # Normalized
+    
+    # ==========================================================================
+    # NGOs & ADVOCACY
+    # ==========================================================================
+    'fim federatie instandhouding monumenten': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'ark rewilding nederland': ('GE', 'NIJ', 'Nijmegen', 'N'),
+    'centraal joods overleg cjo': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'de commandostichting': ('NH', 'HAA', 'Haarlem', 'N'),
+    'kenniscommunity informatie en archief': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'expertisecentrum literair vertalen elv': ('NH', 'AMS', 'Amsterdam', 'R'),
+    'acp ica archival community for palestine': ('PS', 'RAM', 'Ramallah', 'N'),  # Palestine
+    'campaign against antisemitism': ('GB', 'LON', 'London', 'N'),  # UK
+    'combat antisemitism movement': ('US', 'NYC', 'New York', 'N'),  # US
+    'facing history ourselves': ('US', 'BOS', 'Boston', 'E'),  # US
+    'freundeskreis yad vashem e v': ('DE', 'FRA', 'Frankfurt', 'N'),  # Germany
+    'yad vashem the world holocaust remembrance center': ('IL', 'JER', 'Jerusalem', 'M'),  # Israel
+    'the wiener holocaust library': ('GB', 'LON', 'London', 'L'),  # UK
+    'usc shoah foundation': ('US', 'LAX', 'Los Angeles', 'A'),  # US
+    'cultuurnetwerk groenlinks pvda': ('NH', 'AMS', 'Amsterdam', 'N'),
+    
+    # ==========================================================================
+    # PROFESSIONAL ASSOCIATIONS
+    # ==========================================================================
+    'spab': ('GB', 'LON', 'London', 'N'),  # Society for the Protection of Ancient Buildings, UK
+    'sustainable traditional building alliance': ('GB', 'LON', 'London', 'N'),  # UK
+    'the institute of historic building conservation ihbc': ('GB', 'TIV', 'Tivetshall', 'N'),  # UK
+    'asociacion acre': ('ES', 'MAD', 'Madrid', 'N'),  # Spain
+    'vlaamse vereniging tot behoud van historische vaartuigen': ('BE', 'ANT', 'Antwerpen', 'S'),  # Belgium
+    'v z w archief en documentatiecentrum erfgoed binnenvaart': ('BE', 'ANT', 'Antwerpen', 'A'),  # Belgium
+    'centre d archives et de recherches pour l histoire des femmes avg carhif': ('BE', 'BRU', 'Brussels', 'A'),  # Belgium
+    'nederlandse entomologische vereniging': ('NH', 'AMS', 'Amsterdam', 'S'),
+    'nederlandse vereniging van dierentuinen dutch zoo association': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'netwerk archieven design en digitale cultuur': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'ondernemers in geschiedenis': ('NH', 'AMS', 'Amsterdam', 'S'),
+    'oud stede broec': ('NH', 'STE', 'Stede Broec', 'S'),
+    'raad voor dierenaangelegenheden rda': ('ZH', 'DHA', 'Den Haag', 'O'),
+    'regenl': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'representation of the netherlands in aruba curacao and sint maarten': ('NL', 'DHA', 'Den Haag', 'O'),
+    'hylkema erfgoed': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'idverde nl': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'kaliber': ('OV', 'ZWO', 'Zwolle', 'E'),
+    'keunstwurk': ('FR', 'LEE', 'Leeuwarden', 'E'),
+    'kunstkade': ('ZH', 'ROT', 'Rotterdam', 'E'),
+    'leewardists': ('GR', 'GRO', 'Groningen', 'N'),
+    'leo smit foundation': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'loveland events': ('NH', 'AMS', 'Amsterdam', 'E'),
+    'lvwb fundraising': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'meesters in': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'moooi': ('NB', 'BRE', 'Breda', 'C'),
+    'mug authentic coffee atjeh': ('ID', 'JAK', 'Jakarta', 'C'),  # Indonesia
+    
+    # ==========================================================================
+    # ART & HERITAGE PROJECTS
+    # ==========================================================================
+    'art herstory': ('US', 'NYC', 'New York', 'D'),  # US
+    'art history link up': ('GB', 'LON', 'London', 'D'),  # UK
+    'call for curators': ('NH', 'AMS', 'Amsterdam', 'D'),
+    'creative works': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'themusemslab': ('DE', 'BER', 'Berlin', 'E'),  # Germany
+    'cultuurloket digitall': ('NH', 'AMS', 'Amsterdam', 'D'),
+    'gms digitaliseert': ('NH', 'AMS', 'Amsterdam', 'D'),
+    
+    # ==========================================================================
+    # COMPANIES & COMMERCIAL
+    # ==========================================================================
+    'sothebys': ('GB', 'LON', 'London', 'C'),  # UK
+    'sothebys institute of art': ('GB', 'LON', 'London', 'E'),  # UK
+    'the art loss register': ('GB', 'LON', 'London', 'C'),  # UK
+    'space matter': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'studio nauta': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'terra nostra bv': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'tribeca': ('US', 'NYC', 'New York', 'C'),  # US
+    'van gelder groente fruit': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'werken voor cultuur': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'eveha international': ('FR', 'PAR', 'Paris', 'R'),  # France
+    
+    # ==========================================================================
+    # MISCELLANEOUS (mostly Dutch, plus a few foreign entries)
+    # ==========================================================================
+    'de andere helft': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'eureka': ('NH', 'AMS', 'Amsterdam', 'E'),
+    'enschede700': ('OV', 'ENS', 'Enschede', 'E'),
+    'fenix': ('ZH', 'ROT', 'Rotterdam', 'M'),
+    'ruimtetijd': ('NH', 'AMS', 'Amsterdam', 'R'),
+    'sprekende geschiedenis': ('NH', 'AMS', 'Amsterdam', 'E'),
+    'supermab': ('NH', 'AMS', 'Amsterdam', 'R'),
+    'tijdlab': ('NH', 'AMS', 'Amsterdam', 'R'),
+    'turf event': ('NH', 'AMS', 'Amsterdam', 'E'),
+    'vrijdag': ('GR', 'GRO', 'Groningen', 'E'),
+    'wad gaat om': ('FR', 'LEE', 'Leeuwarden', 'N'),
+    'wikipedia': ('US', 'SFO', 'San Francisco', 'D'),  # US
+    'yory nl het grootste platform voor stamboomonderzoek': ('NH', 'AMS', 'Amsterdam', 'D'),
+    'ar tur': ('BE', 'TUR', 'Turnhout', 'E'),  # Belgium
+    'culture lab 29': ('FR', 'BRE', 'Brest', 'E'),  # France
+    'baleine sous gravillon': ('FR', 'PAR', 'Paris', 'E'),  # France
+    
+    # ==========================================================================
+    # FOREIGN MUSEUMS - Belgium, France, Italy, etc.
+    # ==========================================================================
+    'diva museum': ('BE', 'ANT', 'Antwerpen', 'M'),  # Belgium
+    'huis van alijn': ('BE', 'GEN', 'Gent', 'M'),  # Belgium
+    'kanal centre pompidou': ('BE', 'BRU', 'Brussels', 'M'),  # Belgium
+    'kazerne dossin': ('BE', 'MEC', 'Mechelen', 'M'),  # Belgium
+    'middelheimmuseum': ('BE', 'ANT', 'Antwerpen', 'M'),  # Belgium
+    'musea brugge': ('BE', 'BRU', 'Brugge', 'O'),  # Belgium - museum network
+    'kunstencentrum viernulvier': ('BE', 'GEN', 'Gent', 'E'),  # Belgium
    'caen memorial': ('FR', 'CAE', 'Caen', 'M'),  # France
+    'luma arles': ('FR', 'ARL', 'Arles', 'M'),  # France
+    'la maison du theatre a brest': ('FR', 'BRE', 'Brest', 'E'),  # France
+    'maison des metallos': ('FR', 'PAR', 'Paris', 'E'),  # France
+    'irht institut de recherche et d histoire des textes': ('FR', 'PAR', 'Paris', 'R'),  # France
+    'lucas laboratoire d usages culture s arts societe': ('FR', 'PAR', 'Paris', 'R'),  # France
+    'observatoire des politiques culturelles': ('FR', 'GRE', 'Grenoble', 'R'),  # France
+    'profilculture': ('FR', 'PAR', 'Paris', 'C'),  # France
    'den gamle by': ('DK', 'AAR', 'Aarhus', 'M'),  # Denmark
    'den kongelige samling': ('DK', 'CPH', 'Copenhagen', 'M'),  # Denmark
+    'kulturhusene i danmark': ('DK', 'CPH', 'Copenhagen', 'O'),  # Denmark
+    'kulturmonitor': ('DK', 'CPH', 'Copenhagen', 'R'),  # Denmark
+    'kulturhistorisk museum': ('NO', 'OSL', 'Oslo', 'M'),  # Norway
    'castello di rivoli': ('IT', 'TOR', 'Torino', 'M'),  # Italy
    'consorzio delle residenze reali sabaude': ('IT', 'TOR', 'Torino', 'M'),  # Italy
+    'fondazione canova onlus': ('IT', 'TRE', 'Treviso', 'M'),  # Italy
+    'fondazione pistoletto cittadellarte onlus': ('IT', 'BIE', 'Biella', 'M'),  # Italy
+    'lac lugano arte e cultura': ('IT', 'LUG', 'Lugano', 'M'),  # Switzerland (Italian-speaking)
+    'm9 museum': ('IT', 'VEN', 'Venice', 'M'),  # Italy - actually in Mestre
+    'gammel estrup': ('DK', 'AAR', 'Aarhus', 'M'),  # Denmark
+    'gedung sate museum': ('ID', 'BAN', 'Bandung', 'M'),  # Indonesia
+    'henry moore institute': ('GB', 'LEE', 'Leeds', 'M'),  # UK
+    'her place womens museum': ('AU', 'MEL', 'Melbourne', 'M'),  # Australia
+    'rigsarkivet': ('DK', 'CPH', 'Copenhagen', 'A'),  # Denmark
+    'royal armouries museum': ('GB', 'LEE', 'Leeds', 'M'),  # UK
+    'royal botanic gardens kew': ('GB', 'KEW', 'Kew', 'B'),  # UK
+    'the design museum': ('GB', 'LON', 'London', 'M'),  # UK
+    'the metropolitan museum of art': ('US', 'NYC', 'New York', 'M'),  # US
+    'thorvaldsens museum': ('DK', 'CPH', 'Copenhagen', 'M'),  # Denmark
+    'vitra design museum': ('DE', 'WEI', 'Weil am Rhein', 'M'),  # Germany
+    'war childhood museum': ('BA', 'SAR', 'Sarajevo', 'M'),  # Bosnia
+    'butser ancient farm': ('GB', 'PET', 'Petersfield', 'M'),  # UK
+    'icon film distribution anz': ('AU', 'SYD', 'Sydney', 'C'),  # Australia
+    'museum development north': ('GB', 'NEW', 'Newcastle', 'O'),  # UK
+    'museums association': ('GB', 'LON', 'London', 'N'),  # UK
+    'moya museum of young art': ('AT', 'VIE', 'Vienna', 'M'),  # Austria
+    'national churches trust': ('GB', 'LON', 'London', 'N'),  # UK
+    'national portrait gallery': ('GB', 'LON', 'London', 'M'),  # UK
+    'new contemporaries': ('GB', 'LON', 'London', 'N'),  # UK
+    'peabody essex museum': ('US', 'SAL', 'Salem', 'M'),  # US
+    'norient': ('CH', 'BER', 'Bern', 'R'),  # Switzerland
+    'stiftung trias gemeinnutzige stiftung fur boden okologie und wohnen': ('DE', 'HAT', 'Hattingen', 'N'),  # Germany
+    'nfdi4memory': ('DE', 'BER', 'Berlin', 'R'),  # Germany
+    'themuseumslab': ('DE', 'BER', 'Berlin', 'E'),  # Germany
+    
+    # ==========================================================================
+    # INDONESIAN INSTITUTIONS (for ID-* PENDING files); Indian entries at the end
+    # ==========================================================================
+    'yayasan arsari djojohadikusumo': ('ID', 'JAK', 'Jakarta', 'N'),  # Indonesia
+    'yayasan konservasi alam nusantara': ('ID', 'JAK', 'Jakarta', 'N'),  # Indonesia
+    'southeast asia museum services seams': ('ID', 'JAK', 'Jakarta', 'O'),  # Indonesia
+    'museum and gallery of ipb future': ('ID', 'BOG', 'Bogor', 'M'),  # Indonesia
+    'museum dewantara kirti griya': ('ID', 'YOG', 'Yogyakarta', 'M'),  # Indonesia
+    'museum macan': ('ID', 'JAK', 'Jakarta', 'M'),  # Indonesia
+    'museum pasifika': ('ID', 'BAL', 'Bali', 'M'),  # Indonesia
+    'museum zoologi universitas andalas': ('ID', 'PAD', 'Padang', 'M'),  # Indonesia
+    'moja museum': ('ID', 'JAK', 'Jakarta', 'M'),  # Indonesia - Museum of Jakarta
+    'wassanindia': ('IN', 'DEL', 'Delhi', 'N'),  # India
+    'museum of contemporary tibetan art': ('IN', 'DHA', 'Dharamsala', 'M'),  # India
+    'vedica art studios and gallery': ('IN', 'DEL', 'Delhi', 'G'),  # India
+
+    # ==========================================================================
+    # AUSTRALIAN INSTITUTIONS
+    # ==========================================================================
+    'museumsppassmusees': ('AU', 'SYD', 'Sydney', 'O'),  # Australia - museum pass program
+    'australian museums and galleries association victoria': ('AU', 'MEL', 'Melbourne', 'N'),
+    'australian society of archivists inc': ('AU', 'CAN', 'Canberra', 'N'),
+    'history australia': ('AU', 'SYD', 'Sydney', 'R'),
+    'melbourne holocaust museum': ('AU', 'MEL', 'Melbourne', 'M'),
+    'national library of australia': ('AU', 'CAN', 'Canberra', 'L'),
+    'professional historians association victoria and tasmania': ('AU', 'MEL', 'Melbourne', 'N'),
+    'the university of queensland art museum': ('AU', 'BRI', 'Brisbane', 'M'),
+    
+    # ==========================================================================
+    # INDONESIAN INSTITUTIONS (additional)
+    # ==========================================================================
+    'arsip nasional republik indonesia anri': ('ID', 'JAK', 'Jakarta', 'A'),
+    'art zoo museum': ('ID', 'JAK', 'Jakarta', 'M'),
+    'art 1 new museum': ('ID', 'JAK', 'Jakarta', 'M'),
+    'asmat museum of culture and progress': ('ID', 'AGT', 'Agats', 'M'),
+    'cifor center for international forestry research': ('ID', 'BOG', 'Bogor', 'R'),
+    'econusa foundation indonesia': ('ID', 'JAK', 'Jakarta', 'N'),
+    'econusa foundation': ('ID', 'JAK', 'Jakarta', 'N'),
+    'fisheries resource center of indonesia frci': ('ID', 'JAK', 'Jakarta', 'R'),
+    'gaia indonesia': ('ID', 'JAK', 'Jakarta', 'N'),
+    'jakarta history museum': ('ID', 'JAK', 'Jakarta', 'M'),
+    'kite museum of indonesia': ('ID', 'JAK', 'Jakarta', 'M'),
+    'konservasi indonesia': ('ID', 'JAK', 'Jakarta', 'N'),
+    'ministry of tourism of the republic of indonesia': ('ID', 'JAK', 'Jakarta', 'O'),
+    'museum batik indonesia': ('ID', 'YOG', 'Yogyakarta', 'M'),
+    'museum musik indonesia': ('ID', 'JAK', 'Jakarta', 'M'),
+    'museum nasional indonesia': ('ID', 'JAK', 'Jakarta', 'M'),
+    'museum perkebunan indonesia': ('ID', 'MED', 'Medan', 'M'),
+    'perpustakaan nasional republik indonesia perpusnas ri': ('ID', 'JAK', 'Jakarta', 'L'),
+    'taman safari indonesia': ('ID', 'BOG', 'Bogor', 'B'),
+    
+    # ==========================================================================
+    # FRENCH INSTITUTIONS (additional)
+    # ==========================================================================
+    'alca nouvelle aquitaine': ('FR', 'BOR', 'Bordeaux', 'O'),
+    'archives de rennes': ('FR', 'REN', 'Rennes', 'A'),
+    'centre de recherche du chateau de versailles': ('FR', 'VER', 'Versailles', 'R'),
+    'centre des monuments nationaux': ('FR', 'PAR', 'Paris', 'O'),
+    'chateau de chantilly officiel': ('FR', 'CHA', 'Chantilly', 'M'),
+    'cha teau de chantilly officiel': ('FR', 'CHA', 'Chantilly', 'M'),  # normalized
+    'france nature environnement': ('FR', 'PAR', 'Paris', 'N'),
+    'ircam': ('FR', 'PAR', 'Paris', 'R'),
+    'mucem musee des civilisations de l europe et de la mediterranee': ('FR', 'MAR', 'Marseille', 'M'),
+    'mucem muse e des civilisations de l europe et de la me diterrane e': ('FR', 'MAR', 'Marseille', 'M'),  # normalized
+    'centre de recherche du cha teau de versailles': ('FR', 'VER', 'Versailles', 'R'),  # normalized
+    'musee d orsay': ('FR', 'PAR', 'Paris', 'M'),
+    'muse e d orsay': ('FR', 'PAR', 'Paris', 'M'),  # normalized variant
+    'musee de bretagne': ('FR', 'REN', 'Rennes', 'M'),
+    'muse e de bretagne': ('FR', 'REN', 'Rennes', 'M'),  # normalized
+    'musee des arts et metiers': ('FR', 'PAR', 'Paris', 'M'),
+    'muse e des arts et me tiers': ('FR', 'PAR', 'Paris', 'M'),  # normalized
+    'musee du debarquement': ('FR', 'ARR', 'Arromanches', 'M'),
+    'muse e du de barquement': ('FR', 'ARR', 'Arromanches', 'M'),  # normalized
+    'petites cites de caractere de france': ('FR', 'PAR', 'Paris', 'N'),
+    'petites cite s de caracte re de france': ('FR', 'PAR', 'Paris', 'N'),  # normalized
+    'villa albertine the french institute for culture and education': ('US', 'NYC', 'New York', 'O'),  # French in US
+    
+    # ==========================================================================
+    # GERMAN INSTITUTIONS (additional)
+    # ==========================================================================
+    'anne frank educational center': ('DE', 'FRA', 'Frankfurt', 'E'),
+    'bildarchiv foto marburg': ('DE', 'MAR', 'Marburg', 'A'),
+    'bundesvereinigung kulturelle kinder und jugendbildung bkj': ('DE', 'REM', 'Remscheid', 'N'),
+    'common wadden sea secretariat': ('DE', 'WIL', 'Wilhelmshaven', 'O'),
+    'deutsche stiftung denkmalschutz german foundation for monument protection': ('DE', 'BON', 'Bonn', 'N'),
+    'deutsches archaologisches institut dai': ('DE', 'BER', 'Berlin', 'R'),
+    'deutsches archa ologisches institut dai': ('DE', 'BER', 'Berlin', 'R'),  # normalized
+    'deutsches historisches museum': ('DE', 'BER', 'Berlin', 'M'),
+    'deutsches zentrum kulturgutverluste': ('DE', 'MAG', 'Magdeburg', 'R'),
+    'jewish museum berlin': ('DE', 'BER', 'Berlin', 'M'),
+    'klassik stiftung weimar': ('DE', 'WEI', 'Weimar', 'M'),
+    'kulturstiftung des bundes german federal cultural foundation': ('DE', 'HAL', 'Halle', 'N'),
+    'stadtische galerie im lenbachhaus und kunstbau munchen': ('DE', 'MUN', 'Munich', 'M'),
+    'sta dtische galerie im lenbachhaus und kunstbau mu nchen': ('DE', 'MUN', 'Munich', 'M'),  # normalized
+    'stiftung stadtmuseum berlin': ('DE', 'BER', 'Berlin', 'M'),
+    
+    # ==========================================================================
+    # BRITISH INSTITUTIONS (additional)
+    # ==========================================================================
+    'archaeological research services ltd': ('GB', 'BAK', 'Bakewell', 'R'),
+    'british school at athens': ('GR', 'ATH', 'Athens', 'R'),  # Greek location!
+    'british trust for ornithology bto': ('GB', 'THE', 'Thetford', 'R'),
+    'historic new england': ('US', 'BOS', 'Boston', 'N'),  # US, not UK!
+    'historic royal palaces': ('GB', 'LON', 'London', 'M'),
+    'new england museum association': ('US', 'BOS', 'Boston', 'N'),  # US, not UK!
+    
+    # ==========================================================================
+    # ITALIAN INSTITUTIONS (additional)
+    # ==========================================================================
+    'artribune': ('IT', 'ROM', 'Rome', 'C'),
+    'centro conservazione restauro la venaria reale': ('IT', 'TOR', 'Turin', 'R'),
+    'ecole francaise de rome efr': ('IT', 'ROM', 'Rome', 'R'),
+    'e cole franc aise de rome efr': ('IT', 'ROM', 'Rome', 'R'),  # normalized
+    'museum tweestromenland': ('GE', 'BEN', 'Beneden-Leeuwen', 'M'),  # Dutch, in Beneden-Leeuwen!
+    'stichting roma aeterna': ('IT', 'ROM', 'Rome', 'N'),
+    'triennale milano': ('IT', 'MIL', 'Milan', 'M'),
+    
+    # ==========================================================================
+    # BELGIAN INSTITUTIONS (additional)
+    # ==========================================================================
+    'advn': ('BE', 'ANT', 'Antwerpen', 'A'),
+    'm leuven': ('BE', 'LEU', 'Leuven', 'M'),
+    'museum voor schone kunsten gent': ('BE', 'GEN', 'Gent', 'M'),
+    'wikimedia belgium': ('BE', 'BRU', 'Brussels', 'N'),
+    
+    # ==========================================================================
+    # US INSTITUTIONS (additional)
+    # ==========================================================================
+    'gia gemological institute of america': ('US', 'CAR', 'Carlsbad', 'R'),
+    'international society of arboriculture': ('US', 'ATL', 'Atlanta', 'N'),
+    'standwithus': ('US', 'LAX', 'Los Angeles', 'N'),
+    
+    # ==========================================================================
+    # DANISH INSTITUTIONS (additional)
+    # ==========================================================================
+    'aalborg teater': ('DK', 'AAL', 'Aalborg', 'E'),
+    'augustinus fonden': ('DK', 'CPH', 'Copenhagen', 'N'),
+    'kobenhavns museum museum of copenhagen': ('DK', 'CPH', 'Copenhagen', 'M'),
+    'ko benhavns museum museum of copenhagen': ('DK', 'CPH', 'Copenhagen', 'M'),  # normalized
+    'københavns museum museum of copenhagen': ('DK', 'CPH', 'Copenhagen', 'M'),  # with ø
+    
+    # ==========================================================================
+    # SPANISH INSTITUTIONS
+    # ==========================================================================
+    'centre de cultura contemporania de barcelona cccb': ('ES', 'BAR', 'Barcelona', 'M'),
+    'centre de cultura contempora nia de barcelona cccb': ('ES', 'BAR', 'Barcelona', 'M'),  # normalized
+    'instituto del patrimonio cultural de espana ipce': ('ES', 'MAD', 'Madrid', 'O'),
+    'instituto del patrimonio cultural de espan a ipce': ('ES', 'MAD', 'Madrid', 'O'),  # normalized
+    
+    # ==========================================================================
+    # INDIAN INSTITUTIONS
+    # ==========================================================================
+    'placemaking india': ('IN', 'DEL', 'Delhi', 'N'),
+    
+    # ==========================================================================
+    # OTHER INTERNATIONAL
+    # ==========================================================================
+    'african wildlife foundation': ('KE', 'NAI', 'Nairobi', 'N'),
+    'arabian oud': ('SA', 'RIY', 'Riyadh', 'C'),
+    'wza rat althqa fh ministry of culture': ('SA', 'RIY', 'Riyadh', 'O'),  # Saudi Ministry of Culture normalized
+    'وزارة الثقافة ministry of culture': ('SA', 'RIY', 'Riyadh', 'O'),  # Saudi Ministry of Culture Arabic
+    'ministry of culture': ('SA', 'RIY', 'Riyadh', 'O'),  # Saudi Ministry of Culture simple
+    'dariah eric': ('EU', 'BRU', 'Brussels', 'R'),
+    'embassy of the netherlands in israel': ('IL', 'TLV', 'Tel Aviv', 'O'),
+    'european museum academy': ('EU', 'BRU', 'Brussels', 'N'),
+    'iucn ssc shark specialist group ssg': ('CA', 'VAN', 'Vancouver', 'R'),
+    'museum vosbergen': ('DR', 'EEL', 'Eelde', 'M'),  # Dutch - in Eelde
+    'bonhams': ('GB', 'LON', 'London', 'C'),  # UK auction house
+    
+    # ==========================================================================
+    # REMAINING DUTCH (plus one French organization)
+    # ==========================================================================
+    'het nationale park de hoge veluwe': ('GE', 'OTT', 'Otterlo', 'N'),
+    'lucas laboratoire d usages culture s arts socie te': ('FR', 'PAR', 'Paris', 'R'),  # French org
+
+    # ==========================================================================
+    # OTHER MISCELLANEOUS ORGANIZATIONS (Dutch and international)
+    # ==========================================================================
+    'introdans': ('GE', 'ARN', 'Arnhem', 'E'),
+    'ja21 het juiste antwoord': ('NH', 'AMS', 'Amsterdam', 'N'),  # Political party - not heritage
+    'kasteel radboud': ('NH', 'MED', 'Medemblik', 'M'),
+    'klooster huissen': ('GE', 'HUI', 'Huissen', 'H'),
+    'koninklijke luchtmacht historische vlucht': ('NH', 'GIL', 'Gilze-Rijen', 'M'),
+    'koninklijke woudenberg': ('UT', 'WOU', 'Woudenberg', 'C'),
+    'museum fiskershúske': ('FR', 'MOD', 'Moddergat', 'M'),
+    'museum media': ('NH', 'AMS', 'Amsterdam', 'C'),
+    'museum of 21st century design': ('GB', 'LON', 'London', 'M'),  # UK
+    'museum of comic art moca': ('US', 'NYC', 'New York', 'M'),  # US
+    'museum of edible earth': ('NL', 'AMS', 'Amsterdam', 'M'),  # Actually NL-based
+    'museum of humanity': ('GB', 'LON', 'London', 'M'),  # UK
+    'museum of looted antiquities': ('GB', 'LON', 'London', 'D'),  # UK - virtual
+    'museum of science': ('US', 'BOS', 'Boston', 'M'),  # US
+    'museumppassmusees': ('BE', 'BRU', 'Brussels', 'O'),  # Belgium - museum pass
+    'museumvereniging': ('NH', 'AMS', 'Amsterdam', 'N'),
+    'oerol festival': ('FR', 'TER', 'Terschelling', 'E'),
+    'qwen': ('CN', 'HAN', 'Hangzhou', 'C'),  # China - AI company, not heritage
+    'radio en museum': ('NH', 'AMS', 'Amsterdam', 'M'),
+    'sothebys': ('GB', 'LON', 'London', 'C'),  # UK
+    'sothebys institute of art': ('GB', 'LON', 'London', 'E'),  # UK
+    'nieuwe veste': ('NB', 'BRE', 'Breda', 'E'),
 }


@ -216,8 +815,14 @@ def process_pending_file(filepath: Path, dry_run: bool = True) -> Optional[str]:
        abbrev = extract_abbreviation(emic_name)
        
        # Handle non-Dutch organizations
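+        # A foreign country code found in the province slot becomes the country, and province is set to 'XX'. NOTE(review): 'FR' also appears on the Moddergat/Terschelling entries in this file's mapping (Friesland?) - confirm they are not misread as France.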
+        FOREIGN_COUNTRIES = {
+            'FR', 'DK', 'IT', 'BE', 'DE', 'GB', 'US', 'AT', 'AU', 'BA', 'ES', 
+            'EU', 'ID', 'IL', 'IN', 'MA', 'NO', 'PT', 'PS', 'ZA', 'CA', 'GR', 'KE', 'SA',
+            'CH', 'CN'
+        }
        country = 'NL'
-        if province in ['FR', 'DK', 'IT', 'BE', 'DE', 'GB', 'US']:
+        if province in FOREIGN_COUNTRIES:
            country = province
            province = 'XX'