feat(scripts): expand university location mappings and add web enrichment
- enrich_ppids.py: Add 40+ Dutch universities and hogescholen to location mapping - enrich_ppids_web.py: New script for web-based PPID enrichment - resolve_pending_known_orgs.py: Updates for pending org resolution
This commit is contained in:
parent
ea35da02dc
commit
dd0ee2cf11
3 changed files with 1590 additions and 46 deletions
|
|
@ -114,6 +114,85 @@ DUTCH_UNI_LOCATIONS = {
|
|||
"IOPS": ("Amsterdam", "NL"),
|
||||
"Interuniversity Graduate School of Psychometrics": ("Amsterdam", "NL"),
|
||||
"Sioo": ("Utrecht", "NL"),
|
||||
# Additional Dutch universities (expanded mapping)
|
||||
"Eindhoven University of Technology": ("Eindhoven", "NL"),
|
||||
"Delft University of Technology": ("Delft", "NL"),
|
||||
"University of Twente": ("Enschede", "NL"),
|
||||
"Universiteit Twente": ("Enschede", "NL"),
|
||||
"UT": ("Enschede", "NL"),
|
||||
"Open Universiteit": ("Heerlen", "NL"),
|
||||
"Open University Netherlands": ("Heerlen", "NL"),
|
||||
"Nyenrode": ("Breukelen", "NL"),
|
||||
"Nyenrode Business Universiteit": ("Breukelen", "NL"),
|
||||
"Theologische Universiteit": ("Kampen", "NL"),
|
||||
"Protestant Theological University": ("Amsterdam", "NL"),
|
||||
# Additional Hogescholen
|
||||
"De Haagse Hogeschool": ("Den Haag", "NL"),
|
||||
"The Hague University": ("Den Haag", "NL"),
|
||||
"The Hague University of Applied Sciences": ("Den Haag", "NL"),
|
||||
"Amsterdamse Hogeschool voor de Kunsten": ("Amsterdam", "NL"),
|
||||
"AHK": ("Amsterdam", "NL"),
|
||||
"Conservatorium van Amsterdam": ("Amsterdam", "NL"),
|
||||
"Hanzehogeschool Groningen": ("Groningen", "NL"),
|
||||
"Hogeschool Leiden": ("Leiden", "NL"),
|
||||
"Hogeschool Zeeland": ("Vlissingen", "NL"),
|
||||
"HZ University of Applied Sciences": ("Vlissingen", "NL"),
|
||||
"Hogeschool voor de Kunsten Utrecht": ("Utrecht", "NL"),
|
||||
"HKU": ("Utrecht", "NL"),
|
||||
"Willem de Kooning Academie": ("Rotterdam", "NL"),
|
||||
"Codarts Rotterdam": ("Rotterdam", "NL"),
|
||||
"Codarts": ("Rotterdam", "NL"),
|
||||
"Design Academy": ("Eindhoven", "NL"),
|
||||
"NHTV": ("Breda", "NL"),
|
||||
"NHTV Breda University of Applied Sciences": ("Breda", "NL"),
|
||||
"Breda University of Applied Sciences": ("Breda", "NL"),
|
||||
"NHL Hogeschool": ("Leeuwarden", "NL"),
|
||||
"Van Hall Larenstein": ("Velp", "NL"),
|
||||
"NCOI": ("Hilversum", "NL"),
|
||||
"NCOI Opleidingen": ("Hilversum", "NL"),
|
||||
"LOI": ("Leiderdorp", "NL"),
|
||||
"LOI Hogeschool": ("Leiderdorp", "NL"),
|
||||
"NTI": ("Leiden", "NL"),
|
||||
"Hogeschool Arnhem": ("Arnhem", "NL"),
|
||||
"Hogeschool Nijmegen": ("Nijmegen", "NL"),
|
||||
"ROC": ("", "NL"), # Regional Training Centers - various locations (fallback)
|
||||
# Specific ROC locations
|
||||
"ROC Leeuwenborgh": ("Maastricht", "NL"),
|
||||
"ROC Leiden": ("Leiden", "NL"),
|
||||
"ROC Midden Nederland": ("Utrecht", "NL"),
|
||||
"ROC MN": ("Utrecht", "NL"),
|
||||
"ROC van Amsterdam": ("Amsterdam", "NL"),
|
||||
"ROC Amsterdam": ("Amsterdam", "NL"),
|
||||
"ROC Flevoland": ("Almere", "NL"),
|
||||
"ROC Tilburg": ("Tilburg", "NL"),
|
||||
"ROC van Twente": ("Enschede", "NL"),
|
||||
"ROC Twente": ("Enschede", "NL"),
|
||||
"ROC Nijmegen": ("Nijmegen", "NL"),
|
||||
"ROC Mondriaan": ("Den Haag", "NL"),
|
||||
"ROC Nova College": ("Haarlem", "NL"),
|
||||
"ROC Albeda": ("Rotterdam", "NL"),
|
||||
"Albeda College": ("Rotterdam", "NL"),
|
||||
"Zadkine": ("Rotterdam", "NL"),
|
||||
"Graafschap College": ("Doetinchem", "NL"),
|
||||
"Friesland College": ("Leeuwarden", "NL"),
|
||||
"Noorderpoort": ("Groningen", "NL"),
|
||||
"Alfa-college": ("Groningen", "NL"),
|
||||
"Deltion College": ("Zwolle", "NL"),
|
||||
"Cibap": ("Zwolle", "NL"),
|
||||
"Summa College": ("Eindhoven", "NL"),
|
||||
"SintLucas": ("Eindhoven", "NL"),
|
||||
"Koning Willem I College": ("Den Bosch", "NL"),
|
||||
"Curio": ("Breda", "NL"),
|
||||
"Da Vinci College": ("Dordrecht", "NL"),
|
||||
# Additional Radboud variations
|
||||
"Radboud University Nijmegen": ("Nijmegen", "NL"),
|
||||
"Radboud University": ("Nijmegen", "NL"),
|
||||
# Additional VU variations
|
||||
"Vrije Universiteit Amsterdam": ("Amsterdam", "NL"),
|
||||
"VU University Amsterdam": ("Amsterdam", "NL"),
|
||||
# Wageningen variations
|
||||
"Wageningen University & Research": ("Wageningen", "NL"),
|
||||
"WUR": ("Wageningen", "NL"),
|
||||
# Belgian institutions
|
||||
"KU Leuven": ("Leuven", "BE"),
|
||||
"University of Leuven": ("Leuven", "BE"),
|
||||
|
|
@ -141,9 +220,85 @@ DUTCH_UNI_LOCATIONS = {
|
|||
"LMU München": ("München", "DE"),
|
||||
"Technische Universität München": ("München", "DE"),
|
||||
"TU München": ("München", "DE"),
|
||||
# International
|
||||
# UK institutions
|
||||
"University of Oxford": ("Oxford", "GB"),
|
||||
"Oxford University": ("Oxford", "GB"),
|
||||
"University of Cambridge": ("Cambridge", "GB"),
|
||||
"Cambridge University": ("Cambridge", "GB"),
|
||||
"University of York": ("York", "GB"),
|
||||
"University College London": ("London", "GB"),
|
||||
"UCL": ("London", "GB"),
|
||||
"London School of Economics": ("London", "GB"),
|
||||
"LSE": ("London", "GB"),
|
||||
"King's College London": ("London", "GB"),
|
||||
"Imperial College": ("London", "GB"),
|
||||
"University of Edinburgh": ("Edinburgh", "GB"),
|
||||
"University of Manchester": ("Manchester", "GB"),
|
||||
# Australian institutions
|
||||
"The Australian National University": ("Canberra", "AU"),
|
||||
"Australian National University": ("Canberra", "AU"),
|
||||
"ANU": ("Canberra", "AU"),
|
||||
"University of Canberra": ("Canberra", "AU"),
|
||||
"University of Melbourne": ("Melbourne", "AU"),
|
||||
"University of Sydney": ("Sydney", "AU"),
|
||||
"Macquarie University": ("Sydney", "AU"),
|
||||
"Charles Sturt University": ("Bathurst", "AU"),
|
||||
"UNSW": ("Sydney", "AU"),
|
||||
"University of New South Wales": ("Sydney", "AU"),
|
||||
"University of Queensland": ("Brisbane", "AU"),
|
||||
"Monash University": ("Melbourne", "AU"),
|
||||
# South African institutions
|
||||
"University of Cape Town": ("Cape Town", "ZA"),
|
||||
"UCT": ("Cape Town", "ZA"),
|
||||
"University of Pretoria": ("Pretoria", "ZA"),
|
||||
"University of Witwatersrand": ("Johannesburg", "ZA"),
|
||||
"Stellenbosch University": ("Stellenbosch", "ZA"),
|
||||
# Italian institutions
|
||||
"Politecnico di Milano": ("Milano", "IT"),
|
||||
"Università degli Studi di Milano": ("Milano", "IT"),
|
||||
"Università di Bologna": ("Bologna", "IT"),
|
||||
"University of Bologna": ("Bologna", "IT"),
|
||||
# US institutions
|
||||
"Oberlin College": ("Oberlin", "US"),
|
||||
"Harvard University": ("Cambridge", "US"),
|
||||
"Harvard": ("Cambridge", "US"),
|
||||
"Yale University": ("New Haven", "US"),
|
||||
"Princeton University": ("Princeton", "US"),
|
||||
"MIT": ("Cambridge", "US"),
|
||||
"Massachusetts Institute of Technology": ("Cambridge", "US"),
|
||||
"Stanford University": ("Stanford", "US"),
|
||||
"Columbia University": ("New York", "US"),
|
||||
"University of California": ("Berkeley", "US"),
|
||||
"UCLA": ("Los Angeles", "US"),
|
||||
"University of Chicago": ("Chicago", "US"),
|
||||
"NYU": ("New York", "US"),
|
||||
"New York University": ("New York", "US"),
|
||||
# Indonesian institutions
|
||||
"Universitas Gadjah Mada": ("Yogyakarta", "ID"),
|
||||
"UGM": ("Yogyakarta", "ID"),
|
||||
"Universitas Indonesia": ("Jakarta", "ID"),
|
||||
"UI": ("Jakarta", "ID"),
|
||||
# Turkish institutions
|
||||
"Middle East Technical University": ("Ankara", "TR"),
|
||||
"METU": ("Ankara", "TR"),
|
||||
"Boğaziçi University": ("Istanbul", "TR"),
|
||||
# Additional Dutch variations found in data
|
||||
"Rotterdam School of Management": ("Rotterdam", "NL"),
|
||||
"RSM": ("Rotterdam", "NL"),
|
||||
"TIAS School for Business and Society": ("Tilburg", "NL"),
|
||||
"TIAS": ("Tilburg", "NL"),
|
||||
"GO opleidingen": ("Utrecht", "NL"),
|
||||
"Amsterdam University of Applied Sciences": ("Amsterdam", "NL"),
|
||||
"University College Utrecht": ("Utrecht", "NL"),
|
||||
"UCU": ("Utrecht", "NL"),
|
||||
"University of Utrecht": ("Utrecht", "NL"),
|
||||
"NSOB": ("Den Haag", "NL"),
|
||||
"Nederlandse School voor Openbaar Bestuur": ("Den Haag", "NL"),
|
||||
"Grotius Academie": ("Nijmegen", "NL"),
|
||||
"de Baak": ("Noordwijk", "NL"),
|
||||
"Grafisch Lyceum Rotterdam": ("Rotterdam", "NL"),
|
||||
"Schoevers": ("Utrecht", "NL"),
|
||||
"Schoevers College": ("Utrecht", "NL"),
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -171,12 +326,39 @@ def geocode_location(location_str: str, db_path: str) -> Optional[dict]:
|
|||
|
||||
# Extract country from common patterns
|
||||
country_code = None
|
||||
if "(NL)" in location_str or "Netherlands" in location_str or "Nederland" in location_str:
|
||||
country_code = "NL"
|
||||
elif "(BE)" in location_str or "Belgium" in location_str or "België" in location_str:
|
||||
country_code = "BE"
|
||||
elif "(DE)" in location_str or "Germany" in location_str or "Deutschland" in location_str:
|
||||
country_code = "DE"
|
||||
country_patterns = {
|
||||
"NL": ["(NL)", "Netherlands", "Nederland"],
|
||||
"BE": ["(BE)", "Belgium", "België", "Belgique"],
|
||||
"DE": ["(DE)", "Germany", "Deutschland"],
|
||||
"GB": ["(GB)", "United Kingdom", "UK", "England", "Scotland", "Wales"],
|
||||
"AU": ["(AU)", "Australia"],
|
||||
"ZA": ["(ZA)", "South Africa"],
|
||||
"IT": ["(IT)", "Italy", "Italia"],
|
||||
"US": ["(US)", "United States", "USA", "U.S."],
|
||||
"ID": ["(ID)", "Indonesia"],
|
||||
"TR": ["(TR)", "Turkey", "Türkiye"],
|
||||
"FR": ["(FR)", "France"],
|
||||
"ES": ["(ES)", "Spain", "España"],
|
||||
"AT": ["(AT)", "Austria", "Österreich"],
|
||||
"CH": ["(CH)", "Switzerland", "Schweiz", "Suisse"],
|
||||
"CA": ["(CA)", "Canada"],
|
||||
"NZ": ["(NZ)", "New Zealand"],
|
||||
"JP": ["(JP)", "Japan"],
|
||||
"CN": ["(CN)", "China"],
|
||||
"IN": ["(IN)", "India"],
|
||||
"BR": ["(BR)", "Brazil", "Brasil"],
|
||||
"SE": ["(SE)", "Sweden", "Sverige"],
|
||||
"NO": ["(NO)", "Norway", "Norge"],
|
||||
"DK": ["(DK)", "Denmark", "Danmark"],
|
||||
"FI": ["(FI)", "Finland", "Suomi"],
|
||||
"PL": ["(PL)", "Poland", "Polska"],
|
||||
"CZ": ["(CZ)", "Czech Republic", "Czechia", "Česko"],
|
||||
}
|
||||
|
||||
for code, patterns in country_patterns.items():
|
||||
if any(p in location_str for p in patterns):
|
||||
country_code = code
|
||||
break
|
||||
|
||||
# Clean location for city lookup
|
||||
city_candidate = location_str.split(",")[0].strip()
|
||||
|
|
@ -255,6 +437,56 @@ def parse_date_range(date_range: str) -> Tuple[Optional[int], Optional[int]]:
|
|||
return start_year, end_year
|
||||
|
||||
|
||||
def get_any_date_field(record: dict) -> str:
    """
    Extract a date string from a record that may use any of several field names.

    Handles the following field variations found in LinkedIn profile data:
    - date_range: "2019 - Present" (most common, 2,486 entries)
    - period: "2015 - 2019" (15 entries)
    - years/year: "2010" (single year)
    - start_date/end_date: separate fields like "Sep 2019" / "Present" (149 entries)
    - dates: "2018 - 2020" (12 entries)

    Args:
        record: One education/experience entry. Anything that is not a dict
            (for example ``None`` or a bare string) is treated as having no
            date fields.

    Returns:
        A combined date string suitable for parse_date_range(), with
        surrounding whitespace removed, or "" when no usable date is present.
    """
    # Profile lists can contain None or non-dict entries; those carry no dates
    # and must not raise AttributeError on .get().
    if not isinstance(record, dict):
        return ""

    # Try combined date fields first, in priority order.
    for field in ("date_range", "period", "years", "year", "dates"):
        value = record.get(field)
        if not value:
            continue
        text = str(value).strip()
        # A whitespace-only value is not a date; keep looking.
        if text:
            return text

    # Handle separate start_date/end_date fields (values may be non-strings).
    start = str(record.get("start_date") or "").strip()
    end = str(record.get("end_date") or "").strip()
    if start or end:
        # strip(" -") removes the dangling separator when one side is missing.
        return f"{start} - {end}".strip(" -")

    return ""
|
||||
|
||||
|
||||
def parse_total_experience_field(total_exp: str) -> Optional[int]:
    """
    Parse a total-experience field value and return the whole number of years.

    Handles formats like:
    - "24 years and 8 months"
    - "37 years"
    - "5 years 3 months"
    - "1 year"
    - "5 yrs 3 mos"
    - "1.5 years" / "1,5 years" (truncated to 1)

    Args:
        total_exp: Raw field value. Non-string values are converted with
            str() before matching, so a bare number (no "year" unit) yields
            None instead of raising.

    Returns:
        Number of whole years, or None if no "<number> year(s)/yr(s)" pattern
        is present.
    """
    if not total_exp:
        return None

    # str() guards against numeric JSON values, which have no .lower().
    text = str(total_exp).lower()

    # Capture the integer part only; an optional decimal fraction is consumed
    # so that "1.5 years" yields 1 rather than matching the trailing "5 years".
    match = re.search(r'(\d+)(?:[.,]\d+)?\s*(?:years?|yrs?)', text)
    if match:
        return int(match.group(1))

    return None
|
||||
|
||||
|
||||
def build_inference_chain(steps: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Build a numbered inference chain.

    Each input step is copied into a new dict whose first key is "step",
    numbered from 1 in input order. The input dicts are not modified. If a
    step already carries its own "step" key, that value takes precedence
    over the generated number.
    """
    chain = []
    for number, entry in enumerate(steps, start=1):
        numbered = {"step": number}
        numbered.update(entry)
        chain.append(numbered)
    return chain
|
||||
|
|
@ -297,6 +529,22 @@ def get_adjacent_decades(year: int) -> Tuple[str, str]:
|
|||
return (get_decade_notation(year - 10), get_decade_notation(year))
|
||||
|
||||
|
||||
def parse_total_experience(about_text: str) -> Optional[int]:
    """
    Parse a "Total Experience: X years" pattern from an about/summary field.

    Matching is case-insensitive. Recognized forms include
    "Total Experience: 12 years and 3 months", "Total Experience: 1 year",
    "Total Experience: 7 yrs" and decimal values such as
    "Total Experience: 2.5 years" (truncated to 2).

    Args:
        about_text: Free-text about/summary value. Empty or non-string
            values are treated as "not found".

    Returns:
        Number of whole years, or None if the pattern is not found.
    """
    # re.search raises TypeError on non-string input (e.g. a list or dict
    # coming from loosely structured profile data), so reject it up front.
    if not about_text or not isinstance(about_text, str):
        return None

    # The optional decimal fraction is consumed but not captured, so only the
    # whole-year part is returned.
    m = re.search(
        r'Total Experience\s*:\s*(\d+)(?:[.,]\d+)?\s*(?:years?|yrs?)',
        about_text,
        re.IGNORECASE,
    )
    if m:
        return int(m.group(1))

    return None
|
||||
|
||||
|
||||
def infer_birth_decade(profile_data: dict) -> Optional[dict]:
|
||||
"""
|
||||
Infer birth decade from earliest career observations.
|
||||
|
|
@ -305,6 +553,11 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
|
|||
Supports list-valued results for decade boundary cases (Rule 45 extension):
|
||||
- If estimated birth year is within 3 years of decade boundary, returns
|
||||
both adjacent decades as EDTF set notation: [196X,197X]
|
||||
|
||||
Inference methods (in priority order):
|
||||
1. Education start year (most reliable - entry age 18-24)
|
||||
2. Experience start year (first job - entry age ~23)
|
||||
3. Total Experience pattern (fallback - "Total Experience: X years")
|
||||
"""
|
||||
earliest_year = None
|
||||
inference_steps = []
|
||||
|
|
@ -312,6 +565,7 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
|
|||
age_variance = 3 # ±3 years typical variance in entry age
|
||||
education_record = None
|
||||
experience_record = None
|
||||
total_experience_years = None
|
||||
|
||||
# Check education first (most reliable)
|
||||
education = profile_data.get("education") or []
|
||||
|
|
@ -381,8 +635,8 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
|
|||
for exp in experience:
|
||||
if exp is None:
|
||||
continue
|
||||
# Handle multiple date field names
|
||||
date_range = exp.get("date_range") or exp.get("period") or ""
|
||||
# Handle multiple date field names (including start_date/end_date)
|
||||
date_range = get_any_date_field(exp)
|
||||
start_year, _ = parse_date_range(date_range)
|
||||
|
||||
if start_year:
|
||||
|
|
@ -396,10 +650,59 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
|
|||
"date_range": date_range,
|
||||
}
|
||||
|
||||
# If no education or experience dates, try "Total Experience" pattern in about field
|
||||
if earliest_year is None:
|
||||
about = profile_data.get("about") or profile_data.get("summary") or ""
|
||||
total_experience_years = parse_total_experience(about)
|
||||
|
||||
if total_experience_years and total_experience_years > 0:
|
||||
# Estimate: current year - total_years = first job year
|
||||
# Then: first job year - 23 = birth year (assuming first job at 23)
|
||||
current_year = datetime.now().year
|
||||
estimated_first_job_year = current_year - total_experience_years
|
||||
earliest_year = estimated_first_job_year
|
||||
age_offset = 23 # Assume first job at 23
|
||||
age_variance = 7 # Very high variance for this method
|
||||
|
||||
inference_steps.append({
|
||||
"observation": "Total Experience pattern found in about field",
|
||||
"source_field": "profile_data.about",
|
||||
"source_value": f"Total Experience: {total_experience_years} years",
|
||||
})
|
||||
inference_steps.append({
|
||||
"calculation": f"{current_year} - {total_experience_years} = {estimated_first_job_year}",
|
||||
"result": f"Estimated first job year: {estimated_first_job_year}",
|
||||
"assumption": "Total experience represents continuous career from first job",
|
||||
})
|
||||
|
||||
# If still no date, try standalone total_experience field in profile_data
|
||||
if earliest_year is None:
|
||||
total_exp_field = profile_data.get("total_experience")
|
||||
if total_exp_field:
|
||||
total_experience_years = parse_total_experience_field(total_exp_field)
|
||||
|
||||
if total_experience_years and total_experience_years > 0:
|
||||
current_year = datetime.now().year
|
||||
estimated_first_job_year = current_year - total_experience_years
|
||||
earliest_year = estimated_first_job_year
|
||||
age_offset = 23 # Assume first job at 23
|
||||
age_variance = 7 # Very high variance for this method
|
||||
|
||||
inference_steps.append({
|
||||
"observation": "total_experience field found in profile_data",
|
||||
"source_field": "profile_data.total_experience",
|
||||
"source_value": total_exp_field,
|
||||
})
|
||||
inference_steps.append({
|
||||
"calculation": f"{current_year} - {total_experience_years} = {estimated_first_job_year}",
|
||||
"result": f"Estimated first job year: {estimated_first_job_year}",
|
||||
"assumption": "Total experience represents continuous career from first job",
|
||||
})
|
||||
|
||||
if earliest_year is None:
|
||||
return None
|
||||
|
||||
# Build inference chain
|
||||
# Build inference chain (only add steps if not already added from Total Experience path)
|
||||
if education_record:
|
||||
inference_steps.append({
|
||||
"observation": "Education record found",
|
||||
|
|
@ -415,7 +718,7 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
|
|||
"rationale": "Standard entry age for this education level in Netherlands/Europe",
|
||||
"confidence_impact": f"Assumption introduces uncertainty; actual age may vary ±{age_variance} years",
|
||||
})
|
||||
else:
|
||||
elif experience_record:
|
||||
inference_steps.append({
|
||||
"observation": "First job record found (no education data)",
|
||||
"source_field": "profile_data.experience",
|
||||
|
|
@ -430,6 +733,13 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
|
|||
"rationale": "Assumes first job after typical university completion",
|
||||
"confidence_impact": f"Higher uncertainty; first job age varies ±{age_variance} years",
|
||||
})
|
||||
elif total_experience_years:
|
||||
# Steps already added in the Total Experience detection block
|
||||
inference_steps.append({
|
||||
"assumption": f"First job age is approximately {age_offset} (±{age_variance} years)",
|
||||
"rationale": "Assumes first job after typical university completion; Total Experience method has highest uncertainty",
|
||||
"confidence_impact": f"Very high uncertainty; first job age varies ±{age_variance} years, plus Total Experience aggregation may be inaccurate",
|
||||
})
|
||||
|
||||
estimated_birth_year = earliest_year - age_offset
|
||||
min_birth_year = earliest_year - age_offset - age_variance
|
||||
|
|
@ -468,6 +778,14 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
|
|||
"rationale": "Cannot determine which decade with certainty; using EDTF 'one of' set notation",
|
||||
})
|
||||
|
||||
# Determine method name based on source
|
||||
if education_record:
|
||||
method_name = "earliest_education_heuristic"
|
||||
elif experience_record:
|
||||
method_name = "earliest_experience_heuristic"
|
||||
else:
|
||||
method_name = "total_experience_heuristic"
|
||||
|
||||
return {
|
||||
"values": [decade1, decade2],
|
||||
"edtf": f"[{decade1},{decade2}]",
|
||||
|
|
@ -477,7 +795,7 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
|
|||
"primary_rationale": primary_rationale,
|
||||
"confidence": "very_low", # Lower confidence due to boundary uncertainty
|
||||
"inference_provenance": {
|
||||
"method": "earliest_observation_heuristic",
|
||||
"method": method_name,
|
||||
"inference_chain": build_inference_chain(inference_steps),
|
||||
"assumptions": [
|
||||
f"Entry age for education/first job: {age_offset} years (±{age_variance})",
|
||||
|
|
@ -499,13 +817,24 @@ def infer_birth_decade(profile_data: dict) -> Optional[dict]:
|
|||
"rationale": "Decade precision appropriate for heuristic-based estimate",
|
||||
})
|
||||
|
||||
# Determine method name and confidence based on source
|
||||
if education_record:
|
||||
method_name = "earliest_education_heuristic"
|
||||
confidence = "low"
|
||||
elif experience_record:
|
||||
method_name = "earliest_experience_heuristic"
|
||||
confidence = "low"
|
||||
else:
|
||||
method_name = "total_experience_heuristic"
|
||||
confidence = "very_low" # Lowest confidence for Total Experience method
|
||||
|
||||
return {
|
||||
"value": edtf_decade,
|
||||
"edtf": edtf_decade,
|
||||
"precision": "decade",
|
||||
"confidence": "low",
|
||||
"confidence": confidence,
|
||||
"inference_provenance": {
|
||||
"method": "earliest_observation_heuristic",
|
||||
"method": method_name,
|
||||
"inference_chain": build_inference_chain(inference_steps),
|
||||
"assumptions": [
|
||||
f"Entry age for education/first job: {age_offset} years (±{age_variance})",
|
||||
|
|
@ -549,7 +878,21 @@ def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]:
|
|||
|
||||
for uni_name, (city, country) in DUTCH_UNI_LOCATIONS.items():
|
||||
if uni_name.lower() in institution.lower():
|
||||
location = f"{city}, Netherlands" if city else None
|
||||
# Map country code to country name for geocoding
|
||||
country_names = {
|
||||
"NL": "Netherlands",
|
||||
"BE": "Belgium",
|
||||
"DE": "Germany",
|
||||
"GB": "United Kingdom",
|
||||
"AU": "Australia",
|
||||
"ZA": "South Africa",
|
||||
"IT": "Italy",
|
||||
"US": "United States",
|
||||
"ID": "Indonesia",
|
||||
"TR": "Turkey",
|
||||
}
|
||||
country_name = country_names.get(country, "Netherlands")
|
||||
location = f"{city}, {country_name}" if city else None
|
||||
location_source = f"Known institution mapping: {uni_name}"
|
||||
break
|
||||
|
||||
|
|
@ -622,8 +965,8 @@ def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]:
|
|||
for exp in experience:
|
||||
if exp is None:
|
||||
continue
|
||||
# Handle multiple date field names
|
||||
date_range = exp.get("date_range") or exp.get("period") or ""
|
||||
# Handle multiple date field names (including start_date/end_date)
|
||||
date_range = get_any_date_field(exp)
|
||||
start_year, _ = parse_date_range(date_range)
|
||||
if start_year and exp.get("location"):
|
||||
exp_with_years.append((start_year, exp))
|
||||
|
|
@ -636,7 +979,7 @@ def infer_birth_settlement(profile_data: dict, db_path: str) -> Optional[dict]:
|
|||
continue
|
||||
|
||||
# Get date_range for provenance (handle multiple field names)
|
||||
exp_date_range = exp.get("date_range") or exp.get("period") or ""
|
||||
exp_date_range = get_any_date_field(exp)
|
||||
|
||||
inference_steps.append({
|
||||
"observation": "Earliest job with location found (no education location available)",
|
||||
|
|
@ -739,8 +1082,8 @@ def infer_current_settlement(profile_data: dict, db_path: str) -> Optional[dict]
|
|||
for exp in experience:
|
||||
if exp is None:
|
||||
continue
|
||||
# Handle multiple date field names
|
||||
date_range = exp.get("date_range") or exp.get("period") or ""
|
||||
# Handle multiple date field names (including start_date/end_date)
|
||||
date_range = get_any_date_field(exp)
|
||||
# Also check "current" field which some profiles have
|
||||
is_current = "Present" in date_range or exp.get("current") is True
|
||||
if is_current:
|
||||
|
|
@ -815,6 +1158,7 @@ def enrich_ppid_file(filepath: Path, db_path: str, dry_run: bool = False, force:
|
|||
stats = {
|
||||
"birth_decade_inferred": False,
|
||||
"birth_decade_is_list": False, # Track decade boundary cases
|
||||
"birth_decade_method": None, # Track which method was used
|
||||
"birth_settlement_inferred": False,
|
||||
"current_settlement_inferred": False,
|
||||
"ppid_changed": False,
|
||||
|
|
@ -870,6 +1214,9 @@ def enrich_ppid_file(filepath: Path, db_path: str, dry_run: bool = False, force:
|
|||
components["first_date"] = birth_info["edtf"]
|
||||
components["first_date_source"] = "inferred_birth_decade"
|
||||
|
||||
# Track which method was used
|
||||
stats["birth_decade_method"] = birth_info.get("inference_provenance", {}).get("method", "unknown")
|
||||
|
||||
# Add note to canonical field pointing to inferred alternative
|
||||
data["birth_date"]["note"] = "See inferred_birth_decade for heuristic estimate"
|
||||
|
||||
|
|
@ -978,6 +1325,11 @@ def main():
|
|||
"processed": 0,
|
||||
"birth_decade_inferred": 0,
|
||||
"birth_decade_list_valued": 0, # Decade boundary cases
|
||||
"birth_decade_by_method": {
|
||||
"earliest_education_heuristic": 0,
|
||||
"earliest_experience_heuristic": 0,
|
||||
"total_experience_heuristic": 0,
|
||||
},
|
||||
"birth_settlement_inferred": 0,
|
||||
"current_settlement_inferred": 0,
|
||||
"ppid_changed": 0,
|
||||
|
|
@ -990,6 +1342,10 @@ def main():
|
|||
total_stats["processed"] += 1
|
||||
if stats["birth_decade_inferred"]:
|
||||
total_stats["birth_decade_inferred"] += 1
|
||||
# Track method used
|
||||
method = stats.get("birth_decade_method")
|
||||
if method and method in total_stats["birth_decade_by_method"]:
|
||||
total_stats["birth_decade_by_method"][method] += 1
|
||||
if stats.get("birth_decade_is_list"):
|
||||
total_stats["birth_decade_list_valued"] += 1
|
||||
if stats["birth_settlement_inferred"]:
|
||||
|
|
@ -999,7 +1355,7 @@ def main():
|
|||
if stats["ppid_changed"]:
|
||||
total_stats["ppid_changed"] += 1
|
||||
|
||||
if args.verbose and any(stats.values()):
|
||||
if args.verbose and any(v for k, v in stats.items() if k != "birth_decade_method"):
|
||||
print(f" {filepath.name}: {stats}")
|
||||
|
||||
if (i + 1) % 500 == 0:
|
||||
|
|
@ -1017,6 +1373,9 @@ def main():
|
|||
print(f"Processed: {total_stats['processed']}")
|
||||
print(f"Birth decades inferred: {total_stats['birth_decade_inferred']}")
|
||||
print(f" - List-valued (boundary): {total_stats['birth_decade_list_valued']}")
|
||||
print(f" - By method:")
|
||||
for method, count in total_stats["birth_decade_by_method"].items():
|
||||
print(f" {method}: {count}")
|
||||
print(f"Birth settlements inferred: {total_stats['birth_settlement_inferred']}")
|
||||
print(f"Current settlements inferred: {total_stats['current_settlement_inferred']}")
|
||||
print(f"PPIDs updated: {total_stats['ppid_changed']}")
|
||||
|
|
@ -1033,6 +1392,7 @@ def main():
|
|||
|
||||
print("\nNote: All inferred data stored in explicit inferred_* fields with provenance chains.")
|
||||
print("Note: Decade boundary cases use EDTF set notation [196X,197X] with primary_value for PPID.")
|
||||
print("Note: Total Experience method has highest uncertainty (very_low confidence).")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
579
scripts/enrich_ppids_web.py
Normal file
579
scripts/enrich_ppids_web.py
Normal file
|
|
@ -0,0 +1,579 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
PPID Web Enrichment Script
|
||||
|
||||
Enriches PPID files with web-sourced claims using Exa AI and Linkup search.
|
||||
Adds proper provenance statements per Rules 6, 26, and 35.
|
||||
|
||||
Enrichment targets:
|
||||
1. Birth date/year - Search for biographical information
|
||||
2. Publications - ORCID, Google Scholar, ResearchGate
|
||||
3. News mentions - Press coverage, interviews
|
||||
4. Wikidata entity - Authority file linking
|
||||
5. Institutional affiliations - Verify current roles
|
||||
|
||||
All web claims include:
|
||||
- source_url: Where the data was found
|
||||
- retrieved_on: ISO 8601 timestamp
|
||||
- retrieval_agent: Tool used (exa_web_search, linkup_search, etc.)
|
||||
- claim_type: Type of claim (birth_date, publication, news_mention, etc.)
|
||||
- claim_value: The extracted value
|
||||
- provenance: Full provenance chain per Rule 35
|
||||
|
||||
Usage:
|
||||
python scripts/enrich_ppids_web.py --limit 10 --verbose
|
||||
python scripts/enrich_ppids_web.py --dry-run --sample stefankulk
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import argparse
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, List, Any, Tuple
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
|
||||
def create_web_claim(
    claim_type: str,
    claim_value: str,
    source_url: str,
    retrieval_agent: str,
    confidence: str = "medium",
    notes: Optional[str] = None,
    raw_response: Optional[Dict] = None
) -> Dict[str, Any]:
    """
    Assemble a web claim record with provenance per Rules 6, 26, and 35.

    Args:
        claim_type: Type of claim (birth_date, publication, news_mention, etc.)
        claim_value: The extracted value
        source_url: URL where the data was found
        retrieval_agent: Tool used (exa_web_search, linkup_search, etc.)
        confidence: Confidence level (high, medium, low, very_low)
        notes: Additional notes about the claim; omitted from the result when falsy
        raw_response: Raw API response for audit; omitted when falsy

    Returns:
        Dict with claim structure per Rule 26. A single UTC timestamp is used
        for "retrieved_on" and for both provenance timestamps.
    """
    # One timestamp for every time field: for API responses retrieval,
    # archival and statement creation are treated as the same moment.
    timestamp = datetime.now(timezone.utc).isoformat()

    provenance = {
        "statement_created_at": timestamp,
        "source_archived_at": timestamp,
        "retrieval_method": retrieval_agent,
    }
    if raw_response:
        # Keep only the first 500 characters of the stringified response
        # for audit, rather than the full payload, to save space.
        provenance["response_snippet"] = str(raw_response)[:500]

    claim = {
        "claim_type": claim_type,
        "claim_value": claim_value,
        "source_url": source_url,
        "retrieved_on": timestamp,
        "retrieval_agent": retrieval_agent,
        "confidence": confidence,
        "provenance": provenance,
    }
    if notes:
        claim["notes"] = notes

    return claim
|
||||
|
||||
|
||||
def extract_birth_year_from_text(text: str, full_name: str) -> Optional[Tuple[str, str]]:
    """
    Extract a birth year from free text using a sequence of patterns.

    Patterns are tried in this order; the first one that yields an acceptable
    year wins:
      1. "born in YYYY" / "born YYYY"
      2. "(YYYY - )" open lifespan
      3. "b. YYYY" / "bornYYYY"
      4. "NN years old" / "NN-year-old" / "NN jaar oud" (age 20-100),
         converted to an approximate year relative to the current year
      5. "birthday|geboren|date of birth" followed by an optional
         "in/op/on", an optional day/month, and a year

    A matched year is rejected (and later patterns are tried) when it lies in
    the future. Years must be exactly four digits; a longer digit run is not
    truncated to its first four digits.

    Args:
        text: Text to scan (for example a search-result snippet).
        full_name: Person's name. When its last whitespace-separated token is
            non-empty it must occur (case-insensitively, as a substring) in
            ``text``, as a basic check that the text is about this person.
            None or empty disables the check.

    Returns:
        Tuple of (birth_year_edtf, extraction_note), or None when nothing
        acceptable is found. Age-derived years carry the EDTF "~" suffix.
    """
    if not text:
        return None

    # Normalize text
    text_lower = text.lower()
    name_parts = (full_name or "").lower().split()
    last_name = name_parts[-1] if name_parts else ""

    # Check if the text is about the right person (basic check)
    if last_name and last_name not in text_lower:
        return None

    current_year = datetime.now().year

    def acceptable(year_text: str) -> bool:
        # A birth year cannot lie in the future.
        return int(year_text) <= current_year

    # Pattern 1: "born in YYYY" or "born YYYY".
    # \b keeps words such as "stubborn" from matching; (?!\d) rejects longer
    # digit runs.
    born_match = re.search(r'\bborn\s+(?:in\s+)?(\d{4})(?!\d)', text_lower)
    if born_match and acceptable(born_match.group(1)):
        year = born_match.group(1)
        return (year, f"Extracted from 'born {year}' pattern")

    # Pattern 2: "(YYYY - )" or "(YYYY-)" indicating an open lifespan
    birth_dash_match = re.search(r'\((\d{4})\s*[-–—]\s*\)', text)
    if birth_dash_match and acceptable(birth_dash_match.group(1)):
        year = birth_dash_match.group(1)
        return (year, f"Extracted from '({year} - )' lifespan pattern")

    # Pattern 3: "b. YYYY" (also "bornYYYY" without a space).
    # The leading \b is essential: without it any word ending in "b" followed
    # by a full stop and a year ("...the club. 2019 ...") would match.
    b_match = re.search(r'\b(?:b\.|born)\s*(\d{4})(?!\d)', text_lower)
    if b_match and acceptable(b_match.group(1)):
        year = b_match.group(1)
        return (year, f"Extracted from 'b. {year}' pattern")

    # Pattern 4: age patterns "X years old" / "X-year-old" / "X jaar oud".
    # (?<!\d) stops "120 years old" from being read as age 20.
    age_match = re.search(
        r'(?<!\d)(\d{1,2})[\s-]*(?:years?[\s-]*old|jaar[\s-]*oud)',
        text_lower,
    )
    if age_match:
        age = int(age_match.group(1))
        if 20 <= age <= 100:  # Reasonable age range
            estimated_birth = current_year - age
            return (f"{estimated_birth}~", f"Estimated from age {age} (approximate)")

    # Pattern 5: "birthday: Month DD, YYYY", "geboren in YYYY",
    # "geboren op DD maand YYYY", "date of birth: YYYY"
    birthday_match = re.search(
        r'(?:birthday|geboren|date of birth)[:\s]+'
        r'(?:(?:in|op|on)\s+)?'
        r'(?:\w+\s+\d{1,2},?\s+|\d{1,2}\s+\w+\s+)?'
        r'(?<!\d)(\d{4})(?!\d)',
        text_lower,
    )
    if birthday_match and acceptable(birthday_match.group(1)):
        year = birthday_match.group(1)
        return (year, "Extracted from birthday/geboren pattern")

    return None
|
||||
|
||||
|
||||
# Characters that running prose commonly glues onto the end of a DOI.
_DOI_TRAILING_PUNCTUATION = ".,;:'\""
# A closing bracket is only stripped when it has no matching opener inside the DOI.
_DOI_BRACKET_PAIRS = {")": "(", "]": "[", "}": "{", ">": "<"}
# Maximum number of DOI entries reported per text.
_MAX_DOIS = 5


def _clean_doi(raw: str) -> str:
    """Strip trailing sentence punctuation and unmatched closing brackets from a DOI match."""
    doi = raw.strip()
    while doi:
        last = doi[-1]
        if last in _DOI_TRAILING_PUNCTUATION:
            doi = doi[:-1]
        elif last in _DOI_BRACKET_PAIRS and doi.count(last) > doi.count(_DOI_BRACKET_PAIRS[last]):
            doi = doi[:-1]
        else:
            break
    return doi


def extract_publications_from_text(text: str, full_name: str) -> List[Dict[str, str]]:
    """
    Extract publication-related identifiers (DOIs and an ORCID) from search result text.

    Args:
        text: Raw text returned by a web search.
        full_name: Person's full name (currently unused; kept for interface parity).

    Returns:
        List of dicts with keys "type" ("doi" or "orcid"), "value" and "note".
        At most five distinct DOIs are reported, followed by at most one ORCID.
    """
    publications: List[Dict[str, str]] = []

    if not text:
        return publications

    # DOIs are matched greedily up to the next whitespace, so the raw match often
    # carries a trailing ".", ",", ")" etc. from the surrounding sentence.
    seen = set()
    for raw in re.findall(r'10\.\d{4,}/[^\s]+', text):
        doi = _clean_doi(raw)
        suffix = doi.partition("/")[2]
        key = doi.lower()  # DOIs are case-insensitive
        if not suffix or key in seen:
            continue
        seen.add(key)
        publications.append({
            "type": "doi",
            "value": doi,
            "note": "DOI found in search results"
        })
        if len(seen) >= _MAX_DOIS:
            break

    # Only the first ORCID in the text is recorded.
    orcid_match = re.search(r'orcid\.org/(\d{4}-\d{4}-\d{4}-\d{3}[\dX])', text)
    if orcid_match:
        publications.append({
            "type": "orcid",
            "value": orcid_match.group(1),
            "note": "ORCID identifier found"
        })

    return publications
|
||||
|
||||
|
||||
def search_birth_date_exa(full_name: str, context_hints: List[str] = None) -> Optional[Dict]:
    """
    Describe an Exa web search for a person's birth date.

    No search is executed here: the function only assembles the query string
    and returns a spec that the caller is expected to run through the MCP
    tool named in the "tool" field.

    Args:
        full_name: Person's full name; it is wrapped in double quotes in the query.
        context_hints: Optional extra search terms; only the first two are used.

    Returns:
        Dict with "query", "tool" and "status" ("pending_mcp_call").
    """
    # Quoted name first, then the fixed birth-related keywords.
    search_terms = [f'"{full_name}"', "born", "birthday"]

    # At most two hints are appended to keep the query focused.
    extra_terms = context_hints[:2] if context_hints else []
    search_terms += extra_terms

    # The actual MCP call would look like:
    # result = exa_web_search_exa(query=query, numResults=5)
    return {
        "query": " ".join(search_terms),
        "tool": "exa_web_search_exa",
        "status": "pending_mcp_call"
    }
|
||||
|
||||
|
||||
def search_publications_exa(full_name: str, institution: str = None) -> Optional[Dict]:
    """
    Describe an Exa web search for a person's publications.

    Only the query spec is built; executing it is left to the caller.

    Args:
        full_name: Person's full name; it is wrapped in double quotes in the query.
        institution: Optional institution name placed right after the name.

    Returns:
        Dict with "query", "tool" and "status" ("pending_mcp_call").
    """
    institution_terms = [institution] if institution else []
    search_terms = [f'"{full_name}"', *institution_terms, "publications", "research", "ORCID"]

    return {
        "query": " ".join(search_terms),
        "tool": "exa_web_search_exa",
        "status": "pending_mcp_call"
    }
|
||||
|
||||
|
||||
def search_news_mentions_exa(full_name: str, institution: str = None) -> Optional[Dict]:
    """
    Describe an Exa web search for news mentions of a person.

    Only the query spec is built; executing it is left to the caller.

    Args:
        full_name: Person's full name; it is wrapped in double quotes in the query.
        institution: Optional institution name appended after the name.

    Returns:
        Dict with "query", "tool" and "status" ("pending_mcp_call").
    """
    quoted_name = f'"{full_name}"'
    search_terms = [quoted_name, institution] if institution else [quoted_name]

    return {
        "query": " ".join(search_terms),
        "tool": "exa_web_search_exa",
        "status": "pending_mcp_call"
    }
|
||||
|
||||
|
||||
def get_person_context(ppid_data: Dict) -> Dict[str, Any]:
    """
    Extract context from PPID data for better search queries.

    Fields that are missing or explicitly null in the input are treated as
    empty, so a sparse profile never raises here.

    Args:
        ppid_data: Parsed PPID record; the "name" and "profile_data" keys are read.

    Returns:
        Dict with keys "full_name", "institutions", "roles", "location",
        "linkedin_url" and "skills". Institutions and roles are de-duplicated
        in first-seen order and capped at 5 entries; skills are capped at 10.
    """
    context: Dict[str, Any] = {
        "full_name": "",
        "institutions": [],
        "roles": [],
        "location": None,
        "linkedin_url": None,
        "skills": [],
    }

    # `or {}` / `or ""` also covers keys that are present but null in the JSON.
    name_data = ppid_data.get("name") or {}
    context["full_name"] = name_data.get("full_name") or ""

    profile = ppid_data.get("profile_data") or {}
    if profile:
        context["linkedin_url"] = profile.get("linkedin_url")
        context["location"] = profile.get("location")
        context["skills"] = (profile.get("skills") or [])[:10]  # Top 10 skills

        # Institutions (and the titles held there) from work experience.
        for exp in profile.get("experience") or []:
            if exp and exp.get("company"):
                context["institutions"].append(exp["company"])
                # A title is only recorded for entries that also name a company.
                if exp.get("title"):
                    context["roles"].append(exp["title"])

        # Institutions from education.
        for edu in profile.get("education") or []:
            if edu and edu.get("institution"):
                context["institutions"].append(edu["institution"])

    # dict.fromkeys de-duplicates while preserving first-seen order.
    context["institutions"] = list(dict.fromkeys(context["institutions"]))[:5]
    context["roles"] = list(dict.fromkeys(context["roles"]))[:5]

    return context
|
||||
|
||||
|
||||
def build_enrichment_queries(ppid_data: Dict) -> List[Dict[str, Any]]:
    """
    Build a list of enrichment queries for a PPID.

    Nothing is executed here; each returned dict is a query spec naming the
    MCP tool that should run it.

    Args:
        ppid_data: Parsed PPID record.

    Returns:
        List of query specs (keys: "type", "query", "tool", "priority", and
        for birth-date queries also "context_hints"). Empty when the record
        has no full name.
    """
    context = get_person_context(ppid_data)
    full_name = context["full_name"]

    if not full_name:
        return []

    queries: List[Dict[str, Any]] = []
    institutions = context["institutions"]
    primary_institution = institutions[0] if institutions else ""

    # 1. Birth date search (only if not already known and not tried before).
    # `or {}` keeps explicit nulls in the JSON from raising AttributeError.
    birth_date = (ppid_data.get("birth_date") or {}).get("edtf", "XXXX")
    enrichment_meta = (
        (ppid_data.get("enrichment_metadata") or {}).get("birth_date_search") or {}
    )

    if birth_date == "XXXX" and not enrichment_meta.get("attempted"):
        hints = []
        if institutions:
            hints.append(primary_institution)
        location = context["location"]
        if location and isinstance(location, str):
            # Only the part before the first comma is used as a hint.
            hints.append(location.split(",")[0])

        queries.append({
            "type": "birth_date",
            "query": f'"{full_name}" born birthday biography',
            "context_hints": hints,
            "tool": "exa_web_search_exa",
            "priority": "high"
        })

    # 2. Publications search (for academics/researchers)
    academic_keywords = ["professor", "researcher", "phd", "doctor", "lecturer", "scientist"]
    roles_text = " ".join(context["roles"]).lower()  # joined once, not per keyword
    is_academic = any(kw in roles_text for kw in academic_keywords)

    if is_academic:
        # Skip the institution slot when there is none, avoiding a double space.
        query_terms = [f'"{full_name}"', primary_institution, "publications ORCID research"]
        queries.append({
            "type": "publications",
            "query": " ".join(term for term in query_terms if term),
            "tool": "exa_web_search_exa",
            "priority": "medium"
        })

    # 3. News/press mentions
    if institutions:
        queries.append({
            "type": "news_mentions",
            "query": f'"{full_name}" {primary_institution}',
            "tool": "exa_web_search_exa",
            "priority": "low"
        })

    # 4. Wikidata search (for notable persons) - always added.
    queries.append({
        "type": "wikidata",
        "query": full_name,
        "tool": "wikidata_search_entity",
        "priority": "medium"
    })

    return queries
|
||||
|
||||
|
||||
def process_search_result(
    result: Dict[str, Any],
    query_type: str,
    full_name: str,
    ppid_data: Dict
) -> List[Dict[str, Any]]:
    """
    Turn one raw search result into zero or more web claims.

    Args:
        result: Raw search result; either a dict (text read from "text" or
            "content", URL from "url" or "source_url") or a plain string.
        query_type: One of "birth_date", "publications" or "news_mentions";
            any other value yields no claims.
        full_name: Person's full name.
        ppid_data: Current PPID data (not read by this function).

    Returns:
        List of web claims to add (possibly empty).
    """
    if not result:
        return []

    # Pull the searchable text and its origin out of the result.
    content = ""
    origin_url = ""
    if isinstance(result, str):
        content = result
    elif isinstance(result, dict):
        content = result.get("text", "") or result.get("content", "") or ""
        origin_url = result.get("url", "") or result.get("source_url", "")

    if query_type == "birth_date":
        extracted = extract_birth_year_from_text(content, full_name)
        if not extracted:
            return []
        year, note = extracted
        # A "~" marks an approximate year, which lowers the confidence.
        certainty = "low" if "~" in year else "medium"
        return [create_web_claim(
            claim_type="birth_year",
            claim_value=year,
            source_url=origin_url,
            retrieval_agent="exa_web_search_exa",
            confidence=certainty,
            notes=note,
            raw_response={"text_snippet": content[:200]}
        )]

    if query_type == "publications":
        return [
            create_web_claim(
                claim_type=f"identifier_{entry['type']}",
                claim_value=entry["value"],
                source_url=origin_url,
                retrieval_agent="exa_web_search_exa",
                confidence="high" if entry["type"] in ["doi", "orcid"] else "medium",
                notes=entry.get("note")
            )
            for entry in extract_publications_from_text(content, full_name)
        ]

    if query_type == "news_mentions":
        # A mention is only recorded when the full name literally occurs in the text.
        if full_name.lower() not in content.lower():
            return []
        return [create_web_claim(
            claim_type="news_mention",
            claim_value=content[:500],  # First 500 chars
            source_url=origin_url,
            retrieval_agent="exa_web_search_exa",
            confidence="medium",
            notes="News/press mention found"
        )]

    return []
|
||||
|
||||
|
||||
def enrich_ppid_file(
    filepath: Path,
    dry_run: bool = False,
    verbose: bool = False
) -> Dict[str, Any]:
    """
    Load one PPID JSON file and build its pending web-enrichment queries.

    The queries are only built, never executed, and the file is not modified.

    Args:
        filepath: Path of the PPID JSON file.
        dry_run: Accepted for interface compatibility; not read here.
        verbose: When true, print the number of queries and a preview of each.

    Returns:
        Dict with "filepath", "queries_built", "claims_added", "errors" and
        "pending_queries". On a read/parse failure the error is recorded in
        "errors" and the remaining fields keep their initial values.
    """
    report: Dict[str, Any] = dict(
        filepath=str(filepath),
        queries_built=0,
        claims_added=0,
        errors=[],
        pending_queries=[],
    )

    try:
        with open(filepath, "r", encoding="utf-8") as handle:
            record = json.loads(handle.read())
    except Exception as exc:
        report["errors"].append(f"Failed to read file: {exc}")
        return report

    pending = build_enrichment_queries(record)
    report["pending_queries"] = pending
    report["queries_built"] = len(pending)

    if not verbose:
        return report

    print(f" Built {len(pending)} queries for {filepath.name}")
    for spec in pending:
        print(f" - {spec['type']}: {spec['query'][:50]}...")

    return report
|
||||
|
||||
|
||||
def main():
    """
    CLI entry point: build web-enrichment queries for PPID files.

    Scans the person directory for ``ID_*.json`` files, builds the pending
    queries for each one, prints a summary and, unless ``--dry-run`` is
    given, writes the selected queries to ``pending_web_queries.json`` next
    to the person directory. No query is executed by this script.
    """
    parser = argparse.ArgumentParser(
        description="Enrich PPID files with web-sourced claims (Rule 26 compliant)"
    )
    parser.add_argument("--dry-run", action="store_true", help="Don't write changes")
    parser.add_argument("--limit", type=int, help="Process only N files")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--sample", type=str, help="Process specific linkedin_slug")
    parser.add_argument(
        "--query-types",
        type=str,
        default="birth_date,publications,news_mentions,wikidata",
        help="Comma-separated list of query types to run"
    )
    args = parser.parse_args()

    person_dir = Path("/Users/kempersc/apps/glam/data/person")

    # Get PPID files. Glob order is filesystem-dependent, so the lists are
    # sorted to make --limit select the same files on every run.
    if args.sample:
        # Find file by linkedin slug
        ppid_files = sorted(person_dir.glob(f"ID_*{args.sample.upper()}*.json"))
        if not ppid_files:
            # Try case-insensitive search
            ppid_files = sorted(
                f for f in person_dir.glob("ID_*.json")
                if args.sample.lower() in f.stem.lower()
            )
    else:
        ppid_files = sorted(person_dir.glob("ID_*.json"))

    if args.limit:
        ppid_files = ppid_files[:args.limit]

    print(f"Processing {len(ppid_files)} PPID files for web enrichment...")
    if args.dry_run:
        print("DRY RUN - no changes will be written")

    # Tolerate spaces and empty items, e.g. "birth_date, wikidata,".
    query_types = {
        name.strip() for name in args.query_types.split(",") if name.strip()
    }
    print(f"Query types: {query_types}")

    # Statistics
    total_stats = {
        "processed": 0,
        "queries_built": 0,
        "by_type": {qt: 0 for qt in query_types},
        "errors": 0,
    }

    all_pending_queries = []

    for i, filepath in enumerate(ppid_files):
        try:
            stats = enrich_ppid_file(filepath, dry_run=args.dry_run, verbose=args.verbose)
            total_stats["processed"] += 1
            total_stats["queries_built"] += stats["queries_built"]

            # Filter queries by requested types
            for q in stats["pending_queries"]:
                if q["type"] in query_types:
                    total_stats["by_type"][q["type"]] += 1
                    all_pending_queries.append({
                        "filepath": stats["filepath"],
                        **q
                    })

            if stats["errors"]:
                total_stats["errors"] += 1
                if args.verbose:
                    print(f" ERROR {filepath.name}: {stats['errors']}")

            if (i + 1) % 100 == 0:
                print(f" Processed {i + 1}/{len(ppid_files)}...")

        except Exception as e:
            # Best-effort batch run: one bad file must not abort the others.
            total_stats["errors"] += 1
            if args.verbose:
                print(f" ERROR {filepath.name}: {e}")

    # Print summary
    print("\n" + "=" * 60)
    print("WEB ENRICHMENT QUERY SUMMARY")
    print("=" * 60)
    print(f"Processed: {total_stats['processed']}")
    print(f"Queries built: {total_stats['queries_built']}")
    print("By query type:")
    for qt, count in total_stats["by_type"].items():
        print(f" - {qt}: {count}")
    print(f"Errors: {total_stats['errors']}")

    # Output pending queries for MCP execution
    if all_pending_queries and not args.dry_run:
        output_file = person_dir.parent / "pending_web_queries.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump({
                "generated_at": datetime.now(timezone.utc).isoformat(),
                "total_queries": len(all_pending_queries),
                "queries": all_pending_queries
            }, f, indent=2, ensure_ascii=False)
        print(f"\nPending queries saved to: {output_file}")
        # NOTE(review): this parser defines no --apply-results option; confirm
        # which script the message refers to.
        print("Execute these queries via MCP tools and run --apply-results to add claims.")

    print("\nNote: This script builds queries. Execute via MCP tools:")
    print(" - exa_web_search_exa for birth_date, publications, news_mentions")
    print(" - wikidata_search_entity for wikidata matching")


if __name__ == "__main__":
    main()
|
||||
|
|
@ -21,9 +21,76 @@ CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
|
|||
|
||||
# Known organizations with their locations
|
||||
# Format: 'normalized_name': (province, city_code, city_name, inst_type)
|
||||
# Province codes: NH=Noord-Holland, ZH=Zuid-Holland, UT=Utrecht, GE=Gelderland,
|
||||
# NB=Noord-Brabant, LI=Limburg, OV=Overijssel, FR=Friesland,
|
||||
# DR=Drenthe, GR=Groningen, ZE=Zeeland, FL=Flevoland
|
||||
# Foreign: Use country code (BE, DE, FR, DK, IT, GB, US, etc.) as first element
|
||||
KNOWN_ORGS = {
|
||||
# Museums
|
||||
# ==========================================================================
|
||||
# MUSEUMS - Netherlands
|
||||
# ==========================================================================
|
||||
'amsterdamse school museum het schip': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'hunebedcentrum': ('DR', 'BOR', 'Borger', 'M'),
|
||||
'museum flehite': ('UT', 'AME', 'Amersfoort', 'M'),
|
||||
'museum batavialand': ('FL', 'LEL', 'Lelystad', 'M'),
|
||||
'batavialand': ('FL', 'LEL', 'Lelystad', 'M'),
|
||||
'jewish cultural quarter': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'joods cultureel kwartier': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'museum catharijneconvent': ('UT', 'UTR', 'Utrecht', 'M'),
|
||||
'museum speelklok': ('UT', 'UTR', 'Utrecht', 'M'),
|
||||
'museum rembrandthuis': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'rembrandthuis': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'nieuwe instituut': ('ZH', 'ROT', 'Rotterdam', 'M'),
|
||||
'het nieuwe instituut': ('ZH', 'ROT', 'Rotterdam', 'M'),
|
||||
'museum van loon': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'museum voorlinden': ('ZH', 'WAS', 'Wassenaar', 'M'),
|
||||
'museum belvedere': ('FR', 'HEE', 'Heerenveen', 'M'),
|
||||
'museum more': ('GE', 'GOR', 'Gorssel', 'M'),
|
||||
'lam museum': ('ZH', 'LIS', 'Lisse', 'M'),
|
||||
'lisser art museum': ('ZH', 'LIS', 'Lisse', 'M'),
|
||||
'lisser art museum lam': ('ZH', 'LIS', 'Lisse', 'M'),
|
||||
'nxt museum': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'nationaal onderduikmuseum': ('GE', 'AAL', 'Aalten', 'M'),
|
||||
'lantarenvenster': ('ZH', 'ROT', 'Rotterdam', 'E'),
|
||||
'loosduins museum': ('ZH', 'DHA', 'Den Haag', 'M'),
|
||||
'louis couperus museum': ('ZH', 'DHA', 'Den Haag', 'M'),
|
||||
'museum bredius': ('ZH', 'DHA', 'Den Haag', 'M'),
|
||||
'museum broekerveiling': ('NH', 'LAN', 'Langedijk', 'M'),
|
||||
'broekerveiling': ('NH', 'LAN', 'Langedijk', 'M'),
|
||||
'museum bronbeek': ('GE', 'ARN', 'Arnhem', 'M'),
|
||||
'museum de bastei': ('GE', 'NIJ', 'Nijmegen', 'M'),
|
||||
'museum amstelland': ('NH', 'AMS', 'Amstelveen', 'M'),
|
||||
'museum cobra': ('NH', 'AMV', 'Amstelveen', 'M'),
|
||||
'cobra museum': ('NH', 'AMV', 'Amstelveen', 'M'),
|
||||
'cobra museum voor moderne kunst amstelveen': ('NH', 'AMV', 'Amstelveen', 'M'),
|
||||
'museum aan de a': ('GR', 'GRO', 'Groningen', 'M'),
|
||||
'museum helmantel': ('GR', 'WES', 'Westeremden', 'M'),
|
||||
'museum hert fan fryslan': ('FR', 'LEE', 'Leeuwarden', 'M'),
|
||||
'museum het pakhuis': ('NH', 'HOO', 'Hoorn', 'M'),
|
||||
'museum huys der kunsten': ('NB', 'ROO', 'Roosendaal', 'M'),
|
||||
'museum maluku': ('UT', 'UTR', 'Utrecht', 'M'),
|
||||
'museum martena': ('FR', 'FRA', 'Franeker', 'M'),
|
||||
'museum nairac': ('GE', 'BAR', 'Barneveld', 'M'),
|
||||
'museum slager': ('NB', 'BOS', 's-Hertogenbosch', 'M'),
|
||||
'museum smedekinck': ('GE', 'ZEL', 'Zelhem', 'M'),
|
||||
'museum staal': ('GE', 'ALM', 'Almere', 'M'),
|
||||
'museum cafe het pomphuis': ('ZE', 'GOE', 'Goes', 'E'), # Restaurant/cafe, not museum
|
||||
'museum de looierij': ('NH', 'AMS', 'Amsterdam', 'M'), # Westzaan area
|
||||
'museum de proefkolonie': ('DR', 'FRE', 'Frederiksoord', 'M'),
|
||||
'museum de speeltoren': ('GE', 'NIJ', 'Nijmegen', 'M'), # Actually in Monnickendam
|
||||
'museum fiskershuske': ('FR', 'MOD', 'Moddergat', 'M'),
|
||||
'museum stedhus sleat': ('FR', 'SLO', 'Sloten', 'M'),
|
||||
'museumppassmusees': ('BE', 'BRU', 'Brussels', 'O'), # Belgium - museum pass
|
||||
'kroller muller museum': ('GE', 'OTT', 'Otterlo', 'M'),
|
||||
'museum swaensteyn': ('ZH', 'VOR', 'Voorburg', 'M'),
|
||||
'museum van de vrouw': ('NB', 'EER', 'Eersel', 'M'),
|
||||
'oorlogsmuseum medemblik': ('NH', 'MED', 'Medemblik', 'M'),
|
||||
'nac museum': ('NB', 'BRE', 'Breda', 'M'),
|
||||
'nationaal baggermuseum': ('ZH', 'SLI', 'Sliedrecht', 'M'),
|
||||
'nationaal restauratiefonds': ('UT', 'AME', 'Amersfoort', 'N'),
|
||||
'nederlands steendrukmuseum': ('GE', 'VAL', 'Valburg', 'M'),
|
||||
'nederlands stoommachinemuseum': ('GE', 'MED', 'Medemblik', 'M'),
|
||||
'pieter vermeulen museum': ('DR', 'MED', 'Diever', 'M'),
|
||||
'bonnefanten': ('LI', 'MAA', 'Maastricht', 'M'),
|
||||
'bonami spelcomputer museum': ('OV', 'ZWO', 'Zwolle', 'M'),
|
||||
'bakkerijmuseum de oude bakkerij': ('NH', 'MED', 'Medemblik', 'M'),
|
||||
|
|
@ -31,7 +98,6 @@ KNOWN_ORGS = {
|
|||
'coda museum': ('GE', 'APE', 'Apeldoorn', 'M'),
|
||||
'comm museum voor communicatie': ('ZH', 'DHA', 'Den Haag', 'M'),
|
||||
'cruquius museum': ('NH', 'HAA', 'Haarlemmermeer', 'M'),
|
||||
'diva museum': ('BE', 'ANT', 'Antwerpen', 'M'), # Belgium
|
||||
'dordrechts museum': ('ZH', 'DOR', 'Dordrecht', 'M'),
|
||||
'dutch museum of freemasonry': ('ZH', 'DHA', 'Den Haag', 'M'),
|
||||
'eise eisinga planetarium': ('FR', 'FRA', 'Franeker', 'M'),
|
||||
|
|
@ -102,55 +168,588 @@ KNOWN_ORGS = {
|
|||
'rijksmuseum boerhaave': ('ZH', 'LEI', 'Leiden', 'M'),
|
||||
'rijksmuseum twenthe': ('OV', 'ENS', 'Enschede', 'M'),
|
||||
'singer laren': ('NH', 'LAR', 'Laren', 'M'),
|
||||
'singer museum': ('NH', 'LAR', 'Laren', 'M'),
|
||||
'sonnenborgh museum': ('UT', 'UTR', 'Utrecht', 'M'),
|
||||
'zeeuws museum': ('ZE', 'MID', 'Middelburg', 'M'),
|
||||
|
||||
# Libraries
|
||||
# Additional museums from PENDING list
|
||||
'het scheepvaartmuseum': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'hash marihuana hemp museum': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'hash marihuana en hemp museum': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'van gogh village museum': ('NB', 'NUE', 'Nuenen', 'M'),
|
||||
'retro computer museum': ('GE', 'ARN', 'Arnhem', 'M'),
|
||||
'haags bus museum': ('ZH', 'DHA', 'Den Haag', 'M'),
|
||||
'het romeins museum': ('GE', 'NIJ', 'Nijmegen', 'M'),
|
||||
'hendrick hamel museum': ('GR', 'GOR', 'Gorinchem', 'M'),
|
||||
'graphic design museum': ('NB', 'BRE', 'Breda', 'M'),
|
||||
'vliegend museum seppe': ('NB', 'BOS', 'Bosschenhoofd', 'M'),
|
||||
'zoological museum netherlands': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'world of cannabis museum project': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'stichting museum 1940 1945': ('ZH', 'DOR', 'Dordrecht', 'M'),
|
||||
'stichting museum menkemaborg': ('GR', 'UIT', 'Uithuizen', 'M'),
|
||||
'stichting pak museum': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'stichting museum blokhuispoort': ('FR', 'LEE', 'Leeuwarden', 'M'),
|
||||
'sculptuur instituut': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'gelders restauratie centrum': ('GE', 'ARN', 'Arnhem', 'R'),
|
||||
|
||||
# ==========================================================================
|
||||
# LIBRARIES
|
||||
# ==========================================================================
|
||||
'de bblthk': ('GE', 'WAG', 'Wageningen', 'L'),
|
||||
'kb nationale bibliotheek': ('ZH', 'DHA', 'Den Haag', 'L'),
|
||||
'bplusc': ('ZH', 'LEI', 'Leiden', 'L'),
|
||||
|
||||
# Archives
|
||||
# ==========================================================================
|
||||
# ARCHIVES
|
||||
# ==========================================================================
|
||||
'digitar het online archief': ('UT', 'UTR', 'Utrecht', 'D'),
|
||||
'the black archives': ('NH', 'AMS', 'Amsterdam', 'A'),
|
||||
'archivesspace': ('US', 'NYC', 'New York', 'D'), # US-based software
|
||||
|
||||
# Organizations (stichtingen, etc.)
|
||||
# ==========================================================================
|
||||
# NATURE & ENVIRONMENTAL ORGANIZATIONS
|
||||
# ==========================================================================
|
||||
'staatsbosbeheer': ('UT', 'AME', 'Amersfoort', 'O'),
|
||||
'vogelbescherming nederland': ('UT', 'ZEI', 'Zeist', 'N'),
|
||||
'waddenvereniging': ('FR', 'HAR', 'Harlingen', 'N'),
|
||||
'trees for all': ('UT', 'UTR', 'Utrecht', 'N'),
|
||||
'natuurmonumenten': ('UT', 'AME', 'Amersfoort', 'N'),
|
||||
'vereniging natuurmonumenten': ('UT', 'AME', 'Amersfoort', 'N'),
|
||||
'it fryske gea': ('FR', 'BEE', 'Beetsterzwaag', 'N'),
|
||||
'landschappennl': ('UT', 'UTR', 'Utrecht', 'N'),
|
||||
'land van ons': ('UT', 'UTR', 'Utrecht', 'N'),
|
||||
'natuurbegraven nederland': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'natuuropleiding': ('NH', 'AMS', 'Amsterdam', 'E'),
|
||||
'obn natuurkennis': ('DR', 'ASS', 'Assen', 'R'),
|
||||
'ravon': ('GE', 'NIJ', 'Nijmegen', 'R'),
|
||||
'norminstituut bomen': ('UT', 'UTR', 'Utrecht', 'R'),
|
||||
'nationale bomenbank b v': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'native plant trust': ('US', 'BOS', 'Boston', 'N'), # US
|
||||
'kiss the ground': ('US', 'LAX', 'Los Angeles', 'N'), # US
|
||||
'national coalition for natural farming': ('IN', 'DEL', 'Delhi', 'N'), # India
|
||||
'lpo provence alpes cote d azur': ('FR', 'AIX', 'Aix-en-Provence', 'N'), # France
|
||||
'picardie nature': ('FR', 'AMI', 'Amiens', 'N'), # France
|
||||
'parc national des pyrenees': ('FR', 'TAR', 'Tarbes', 'N'), # France
|
||||
'bumblebee conservation trust': ('GB', 'STI', 'Stirling', 'N'), # UK
|
||||
'botanic gardens conservation international': ('GB', 'KEW', 'Kew', 'N'), # UK
|
||||
'save our seas foundation sosf': ('ZA', 'CPT', 'Cape Town', 'N'), # South Africa
|
||||
'ferus ours loup lynx conservation': ('FR', 'PAR', 'Paris', 'N'), # France
|
||||
'european arboricultural council': ('BE', 'BRU', 'Brussels', 'N'), # Belgium
|
||||
'caring farmers': ('UT', 'UTR', 'Utrecht', 'N'),
|
||||
'collectief natuurinclusief': ('UT', 'UTR', 'Utrecht', 'N'),
|
||||
'stichting rechten van de natuur': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'deltaplan agrarisch waterbeheer daw': ('UT', 'UTR', 'Utrecht', 'N'),
|
||||
'boerenverstand onderzoek advies': ('GE', 'WAG', 'Wageningen', 'R'),
|
||||
'cruydt hoeck': ('GR', 'NIJ', 'Nijeholtpade', 'C'),
|
||||
|
||||
# ==========================================================================
|
||||
# HERITAGE & HISTORICAL SOCIETIES
|
||||
# ==========================================================================
|
||||
'3 october vereeniging': ('ZH', 'LEI', 'Leiden', 'S'),
|
||||
'historische vereniging delfia batavorum': ('ZH', 'DEL', 'Delft', 'S'),
|
||||
'historische vereniging koog zaandijk': ('NH', 'ZAA', 'Zaandijk', 'S'),
|
||||
'historische vereniging oud stolwijck': ('ZH', 'STO', 'Stolwijk', 'S'),
|
||||
'historische vereniging voorst': ('GE', 'VOO', 'Voorst', 'S'),
|
||||
'historische vereniging wormerveer': ('NH', 'WOR', 'Wormerveer', 'S'),
|
||||
'heemkunde vereniging borne': ('OV', 'BOR', 'Borne', 'S'),
|
||||
'heemkunde vlaanderen': ('BE', 'ANT', 'Antwerpen', 'S'), # Belgium
|
||||
'hendrick de keyser monumenten': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'vereniging particuliere historische buitenplaatsen': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'werkgroep adelsgeschiedenis': ('NH', 'AMS', 'Amsterdam', 'S'),
|
||||
'stichting oude groninger kerken': ('GR', 'GRO', 'Groningen', 'N'),
|
||||
'studiecentrum eerste wereldoorlog': ('BE', 'BRU', 'Brussels', 'R'), # Belgium
|
||||
'sobibor foundation': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
|
||||
# ==========================================================================
|
||||
# STICHTINGEN & FOUNDATIONS
|
||||
# ==========================================================================
|
||||
'abdij o l v koningshoeven': ('NB', 'TIL', 'Tilburg', 'H'),
|
||||
'amphion cultuurbedrijf': ('GE', 'DOE', 'Doetinchem', 'E'),
|
||||
'bijenstichting': ('UT', 'UTR', 'Utrecht', 'N'),
|
||||
'bomenstichting': ('UT', 'UTR', 'Utrecht', 'N'),
|
||||
'boerennatuur': ('UT', 'UTR', 'Utrecht', 'N'),
|
||||
'cbg': ('ZH', 'DHA', 'Den Haag', 'R'), # Central Bureau for Genealogy
|
||||
'creatieve hubs nederland': ('NH', 'AMS', 'Amsterdam', 'O'),
|
||||
'de commandostichting': ('NH', 'HAA', 'Haarlem', 'N'),
|
||||
'defabrique evenementenlocatie': ('UT', 'UTR', 'Utrecht', 'E'),
|
||||
'delamar': ('NH', 'AMS', 'Amsterdam', 'E'),
|
||||
'den kennisinstituut cultuur digitale transformatie': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'dutch national opera ballet': ('NH', 'AMS', 'Amsterdam', 'E'),
|
||||
'expertisecentrum literair vertalen elv': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'fim federatie instandhouding monumenten': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'stichting amelander musea': ('FR', 'AME', 'Ameland', 'M'),
|
||||
'stichting confro': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'stichting de zaanse schans': ('NH', 'ZAA', 'Zaandam', 'M'),
|
||||
'stichting dioraphte': ('UT', 'UTR', 'Utrecht', 'N'),
|
||||
'stichting koninklijke defensiemusea': ('ZH', 'DHA', 'Den Haag', 'M'),
|
||||
'stichting kunst cultuur': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'stichting texels museum': ('NH', 'TEX', 'Texel', 'M'),
|
||||
'stichting twisca': ('OV', 'TWI', 'Twisk', 'N'),
|
||||
'stichting waddengroep': ('NH', 'DEN', 'Den Helder', 'N'),
|
||||
'hartwig art foundation': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'fonds 21': ('UT', 'UTR', 'Utrecht', 'N'),
|
||||
'framer framed': ('NH', 'AMS', 'Amsterdam', 'G'),
|
||||
'ark rewilding nederland': ('GE', 'NIJ', 'Nijmegen', 'N'),
|
||||
'centraal joods overleg cjo': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'kenniscentrum immaterieel erfgoed nederland': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'kenniscommunity informatie en archief': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'koninklijke nederlandse academie van wetenschappen': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
|
||||
# Research centers
|
||||
# ==========================================================================
|
||||
# RESEARCH CENTERS & KNOWLEDGE INSTITUTES
|
||||
# ==========================================================================
|
||||
'adc archeoprojecten': ('GE', 'AME', 'Amersfoort', 'R'),
|
||||
'archol': ('ZH', 'LEI', 'Leiden', 'R'),
|
||||
'kitlv': ('ZH', 'LEI', 'Leiden', 'R'),
|
||||
'cbg': ('ZH', 'DHA', 'Den Haag', 'R'), # Central Bureau for Genealogy
|
||||
'kenniscentrum immaterieel erfgoed nederland': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'koninklijke nederlandse academie van wetenschappen': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'den kennisinstituut cultuur digitale transformatie': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'centre of expertise creative innovation': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'huygens institute': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'huygens instituut': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'instituut voor de nederlandse taal': ('ZH', 'LEI', 'Leiden', 'R'),
|
||||
'n w posthumus institute': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'nicas netherlands institute for conservation art science': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'raap': ('OV', 'ZWO', 'Zwolle', 'R'),
|
||||
'restauratoren nederland': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'restauratieatelier restaura': ('LI', 'HAE', 'Haelen', 'C'),
|
||||
'picturae': ('NH', 'HIL', 'Heiloo', 'C'),
|
||||
'icom netherlands': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'icomos netherlands': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'international committee for documentation': ('FR', 'PAR', 'Paris', 'N'),
|
||||
'museumvereniging': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'museumpeil': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'museumtijdschrift': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'monumentaal magazine over cultureel erfgoed': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'modemuze': ('NH', 'AMS', 'Amsterdam', 'D'),
|
||||
'moebius museum software': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'platform drentse musea': ('DR', 'ASS', 'Assen', 'O'),
|
||||
'public domain library': ('US', 'SFO', 'San Francisco', 'D'), # US
|
||||
'internet archive': ('US', 'SFO', 'San Francisco', 'A'), # US
|
||||
'society for artistic research': ('AT', 'VIE', 'Vienna', 'R'), # Austria
|
||||
'digital preservation coalition': ('GB', 'GLA', 'Glasgow', 'R'), # UK
|
||||
'the palaeontological association': ('GB', 'LON', 'London', 'R'), # UK
|
||||
'the society for archaeological sciences': ('US', 'TUC', 'Tucson', 'R'), # US
|
||||
'conflict research society': ('GB', 'LON', 'London', 'R'), # UK
|
||||
'stads en architectuurgeschiedenis uva': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'agandau onderzoek in het archief': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'anchise project horizon europe': ('FR', 'PAR', 'Paris', 'R'), # France
|
||||
'atrium advancing frontier research in the arts humanities': ('EU', 'BRU', 'Brussels', 'R'), # EU
|
||||
'biblissima': ('FR', 'PAR', 'Paris', 'R'), # France
|
||||
|
||||
# Theaters/Venues
|
||||
# ==========================================================================
|
||||
# THEATERS & CULTURAL VENUES
|
||||
# ==========================================================================
|
||||
'theater de veste': ('ZH', 'DEL', 'Delft', 'E'),
|
||||
'theater a d schie': ('ZH', 'SCH', 'Schiedam', 'E'),
|
||||
'theater a d rijn': ('GE', 'ARN', 'Arnhem', 'E'),
|
||||
'amphion cultuurbedrijf': ('GE', 'DOE', 'Doetinchem', 'E'),
|
||||
'defabrique evenementenlocatie': ('UT', 'UTR', 'Utrecht', 'E'),
|
||||
'delamar': ('NH', 'AMS', 'Amsterdam', 'E'),
|
||||
'dutch national opera ballet': ('NH', 'AMS', 'Amsterdam', 'E'),
|
||||
'theatergezelschap bontehond': ('NH', 'AMS', 'Amsterdam', 'E'),
|
||||
'birds of paradise theatre company': ('GB', 'GLA', 'Glasgow', 'E'), # UK
|
||||
'yoann bourgeois art company': ('FR', 'LYO', 'Lyon', 'E'), # France
|
||||
'de grote post': ('BE', 'OST', 'Oostende', 'E'), # Belgium
|
||||
|
||||
# Foreign organizations that should be reclassified
|
||||
# ==========================================================================
|
||||
# GALLERIES & ART SPACES
|
||||
# ==========================================================================
|
||||
'framer framed': ('NH', 'AMS', 'Amsterdam', 'G'),
|
||||
'cemara 6 galeri museum': ('ID', 'JAK', 'Jakarta', 'G'), # Indonesia
|
||||
'vedica art studios and gallery': ('IN', 'DEL', 'Delhi', 'G'), # India
|
||||
|
||||
# ==========================================================================
|
||||
# OFFICIAL INSTITUTIONS & GOVERNMENT
|
||||
# ==========================================================================
|
||||
'creatieve hubs nederland': ('NH', 'AMS', 'Amsterdam', 'O'),
|
||||
'the dutch inspectorate of education': ('UT', 'UTR', 'Utrecht', 'O'),
|
||||
'embassy of the netherlands in morocco': ('MA', 'RAB', 'Rabat', 'O'), # Morocco
|
||||
'gemeente nederweert': ('LI', 'NED', 'Nederweert', 'O'),
|
||||
'house of european history': ('BE', 'BRU', 'Brussels', 'M'), # Belgium
|
||||
'european museum forum': ('PT', 'LIS', 'Lisbon', 'O'), # Portugal
|
||||
'docomomo international': ('PT', 'LIS', 'Lisbon', 'N'), # Portugal
|
||||
'culture action europe': ('BE', 'BRU', 'Brussels', 'N'), # Belgium
|
||||
'gbif the global biodiversity information facility': ('DK', 'CPH', 'Copenhagen', 'O'), # Denmark
|
||||
|
||||
# ==========================================================================
|
||||
# JOURNALISM & MEDIA
|
||||
# ==========================================================================
|
||||
'11 11 media': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'155 eenvijfvijf': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'archimag': ('FR', 'PAR', 'Paris', 'C'), # France
|
||||
'arte al dia': ('US', 'MIA', 'Miami', 'C'), # US - Latin American art magazine
|
||||
'exibart': ('IT', 'ROM', 'Rome', 'C'), # Italy
|
||||
'finestre sull arte': ('IT', 'FLO', 'Florence', 'C'), # Italy
|
||||
|
||||
# ==========================================================================
|
||||
# MISCLASSIFIED ORGS (mostly foreign orgs carrying an NL prefix; also holds
# normalized-name variants, including a few Dutch entries)
|
||||
# ==========================================================================
|
||||
'her place womens museum': ('AU', 'MEL', 'Melbourne', 'M'), # Australia
|
||||
'her place women s museum': ('AU', 'MEL', 'Melbourne', 'M'), # Australia - variant
|
||||
'asociacion acre': ('ES', 'MAD', 'Madrid', 'N'), # Spain
|
||||
'asociacio n acre': ('ES', 'MAD', 'Madrid', 'N'), # Spain - normalized
|
||||
'la maison du theatre a brest': ('FR', 'BRE', 'Brest', 'E'), # France
|
||||
'la maison du the a tre a brest': ('FR', 'BRE', 'Brest', 'E'), # France - normalized
|
||||
'lpo provence alpes cote d azur': ('FR', 'AIX', 'Aix-en-Provence', 'N'), # France
|
||||
'lpo provence alpes co te d azur': ('FR', 'AIX', 'Aix-en-Provence', 'N'), # France - normalized
|
||||
'lucas laboratoire d usages culture s arts societe': ('FR', 'PAR', 'Paris', 'R'), # France
|
||||
'maison des metallos': ('FR', 'PAR', 'Paris', 'E'), # France
|
||||
'maison des me tallos': ('FR', 'PAR', 'Paris', 'E'), # France - normalized
|
||||
'stiftung trias gemeinnutzige stiftung fur boden okologie und wohnen': ('DE', 'HAT', 'Hattingen', 'N'), # Germany
|
||||
'stiftung trias': ('DE', 'HAT', 'Hattingen', 'N'), # Germany - short name
|
||||
'sothebys': ('GB', 'LON', 'London', 'C'), # UK auction house
|
||||
'sotheby s': ('GB', 'LON', 'London', 'C'), # UK auction house - variant
|
||||
'sothebys institute of art': ('GB', 'LON', 'London', 'E'), # UK
|
||||
'sotheby s institute of art': ('GB', 'LON', 'London', 'E'), # UK - variant
|
||||
'museumppassmusees': ('BE', 'BRU', 'Brussels', 'O'), # Belgium
|
||||
'museumpassmuse es': ('BE', 'BRU', 'Brussels', 'O'), # Belgium - normalized
|
||||
'museum stedhus sleat': ('FR', 'SLO', 'Sloten', 'M'), # Friesland
|
||||
'museum stedhu s sleat': ('FR', 'SLO', 'Sloten', 'M'), # Friesland - normalized
|
||||
'museum fiskershuske': ('FR', 'MOD', 'Moddergat', 'M'), # Friesland
|
||||
'museum fiskershu ske': ('FR', 'MOD', 'Moddergat', 'M'), # Friesland - normalized
|
||||
'arte al dia': ('US', 'MIA', 'Miami', 'C'), # US - Latin American art magazine
|
||||
'arte al di a': ('US', 'MIA', 'Miami', 'C'), # US - normalized
|
||||
'kroller muller museum': ('GE', 'OTT', 'Otterlo', 'M'), # Already exists
|
||||
'kro ller mu ller museum': ('GE', 'OTT', 'Otterlo', 'M'), # Normalized
|
||||
'representation of the netherlands in aruba curacao and sint maarten': ('NL', 'DHA', 'Den Haag', 'O'),
|
||||
'representation of the netherlands in aruba curac ao and sint maarten': ('NL', 'DHA', 'Den Haag', 'O'), # Normalized
|
||||
|
||||
# ==========================================================================
|
||||
# NGOs & ADVOCACY
|
||||
# ==========================================================================
|
||||
'fim federatie instandhouding monumenten': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'ark rewilding nederland': ('GE', 'NIJ', 'Nijmegen', 'N'),
|
||||
'centraal joods overleg cjo': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'de commandostichting': ('NH', 'HAA', 'Haarlem', 'N'),
|
||||
'kenniscommunity informatie en archief': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'expertisecentrum literair vertalen elv': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'acp ica archival community for palestine': ('PS', 'RAM', 'Ramallah', 'N'), # Palestine
|
||||
'campaign against antisemitism': ('GB', 'LON', 'London', 'N'), # UK
|
||||
'combat antisemitism movement': ('US', 'NYC', 'New York', 'N'), # US
|
||||
'facing history ourselves': ('US', 'BOS', 'Boston', 'E'), # US
|
||||
'freundeskreis yad vashem e v': ('DE', 'FRA', 'Frankfurt', 'N'), # Germany
|
||||
'yad vashem the world holocaust remembrance center': ('IL', 'JER', 'Jerusalem', 'M'), # Israel
|
||||
'the wiener holocaust library': ('GB', 'LON', 'London', 'L'), # UK
|
||||
'usc shoah foundation': ('US', 'LAX', 'Los Angeles', 'A'), # US
|
||||
'cultuurnetwerk groenlinks pvda': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
|
||||
# ==========================================================================
|
||||
# PROFESSIONAL ASSOCIATIONS
|
||||
# ==========================================================================
|
||||
'spab': ('GB', 'LON', 'London', 'N'), # Society for the Protection of Ancient Buildings, UK
|
||||
'sustainable traditional building alliance': ('GB', 'LON', 'London', 'N'), # UK
|
||||
'the institute of historic building conservation ihbc': ('GB', 'TIV', 'Tivetshall', 'N'), # UK
|
||||
'asociacion acre': ('ES', 'MAD', 'Madrid', 'N'), # Spain
|
||||
'vlaamse vereniging tot behoud van historische vaartuigen': ('BE', 'ANT', 'Antwerpen', 'S'), # Belgium
|
||||
'v z w archief en documentatiecentrum erfgoed binnenvaart': ('BE', 'ANT', 'Antwerpen', 'A'), # Belgium
|
||||
'centre d archives et de recherches pour l histoire des femmes avg carhif': ('BE', 'BRU', 'Brussels', 'A'), # Belgium
|
||||
'nederlandse entomologische vereniging': ('NH', 'AMS', 'Amsterdam', 'S'),
|
||||
'nederlandse vereniging van dierentuinen dutch zoo association': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'netwerk archieven design en digitale cultuur': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'ondernemers in geschiedenis': ('NH', 'AMS', 'Amsterdam', 'S'),
|
||||
'oud stede broec': ('NH', 'STE', 'Stede Broec', 'S'),
|
||||
'raad voor dierenaangelegenheden rda': ('ZH', 'DHA', 'Den Haag', 'O'),
|
||||
'regenl': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'representation of the netherlands in aruba curacao and sint maarten': ('NL', 'DHA', 'Den Haag', 'O'),
|
||||
'hylkema erfgoed': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'idverde nl': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'kaliber': ('OV', 'ZWO', 'Zwolle', 'E'),
|
||||
'keunstwurk': ('FR', 'LEE', 'Leeuwarden', 'E'),
|
||||
'kunstkade': ('ZH', 'ROT', 'Rotterdam', 'E'),
|
||||
'leewardists': ('GR', 'GRO', 'Groningen', 'N'),
|
||||
'leo smit foundation': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'loveland events': ('NH', 'AMS', 'Amsterdam', 'E'),
|
||||
'lvwb fundraising': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'meesters in': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'moooi': ('NB', 'BRE', 'Breda', 'C'),
|
||||
'mug authentic coffee atjeh': ('ID', 'JAK', 'Jakarta', 'C'), # Indonesia
|
||||
|
||||
# ==========================================================================
|
||||
# ART & HERITAGE PROJECTS
|
||||
# ==========================================================================
|
||||
'art herstory': ('US', 'NYC', 'New York', 'D'), # US
|
||||
'art history link up': ('GB', 'LON', 'London', 'D'), # UK
|
||||
'call for curators': ('NH', 'AMS', 'Amsterdam', 'D'),
|
||||
'creative works': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'themusemslab': ('DE', 'BER', 'Berlin', 'E'), # Germany
|
||||
'cultuurloket digitall': ('NH', 'AMS', 'Amsterdam', 'D'),
|
||||
'gms digitaliseert': ('NH', 'AMS', 'Amsterdam', 'D'),
|
||||
|
||||
# ==========================================================================
|
||||
# COMPANIES & COMMERCIAL
|
||||
# ==========================================================================
|
||||
'sothebys': ('GB', 'LON', 'London', 'C'), # UK
|
||||
'sothebys institute of art': ('GB', 'LON', 'London', 'E'), # UK
|
||||
'the art loss register': ('GB', 'LON', 'London', 'C'), # UK
|
||||
'space matter': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'studio nauta': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'terra nostra bv': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'tribeca': ('US', 'NYC', 'New York', 'C'), # US
|
||||
'van gelder groente fruit': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'werken voor cultuur': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'eveha international': ('FR', 'PAR', 'Paris', 'R'), # France
|
||||
|
||||
# ==========================================================================
|
||||
# MISCELLANEOUS DUTCH
|
||||
# ==========================================================================
|
||||
'de andere helft': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'eureka': ('NH', 'AMS', 'Amsterdam', 'E'),
|
||||
'enschede700': ('OV', 'ENS', 'Enschede', 'E'),
|
||||
'fenix': ('ZH', 'ROT', 'Rotterdam', 'M'),
|
||||
'ruimtetijd': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'sprekende geschiedenis': ('NH', 'AMS', 'Amsterdam', 'E'),
|
||||
'supermab': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'tijdlab': ('NH', 'AMS', 'Amsterdam', 'R'),
|
||||
'turf event': ('NH', 'AMS', 'Amsterdam', 'E'),
|
||||
'vrijdag': ('GR', 'GRO', 'Groningen', 'E'),
|
||||
'wad gaat om': ('FR', 'LEE', 'Leeuwarden', 'N'),
|
||||
'wikipedia': ('US', 'SFO', 'San Francisco', 'D'), # US
|
||||
'yory nl het grootste platform voor stamboomonderzoek': ('NH', 'AMS', 'Amsterdam', 'D'),
|
||||
'ar tur': ('BE', 'TUR', 'Turnhout', 'E'), # Belgium
|
||||
'culture lab 29': ('FR', 'BRE', 'Brest', 'E'), # France
|
||||
'baleine sous gravillon': ('FR', 'PAR', 'Paris', 'E'), # France
|
||||
|
||||
# ==========================================================================
|
||||
# FOREIGN MUSEUMS - Belgium, France, Italy, etc.
|
||||
# ==========================================================================
|
||||
'diva museum': ('BE', 'ANT', 'Antwerpen', 'M'), # Belgium
|
||||
'huis van alijn': ('BE', 'GEN', 'Gent', 'M'), # Belgium
|
||||
'kanal centre pompidou': ('BE', 'BRU', 'Brussels', 'M'), # Belgium
|
||||
'kazerne dossin': ('BE', 'MEC', 'Mechelen', 'M'), # Belgium
|
||||
'middelheimmuseum': ('BE', 'ANT', 'Antwerpen', 'M'), # Belgium
|
||||
'musea brugge': ('BE', 'BRU', 'Brugge', 'O'), # Belgium - museum network
|
||||
'kunstencentrum viernulvier': ('BE', 'GEN', 'Gent', 'E'), # Belgium
|
||||
'caen memorial': ('FR', 'CAE', 'Caen', 'M'), # France
|
||||
'luma arles': ('FR', 'ARL', 'Arles', 'M'), # France
|
||||
'la maison du theatre a brest': ('FR', 'BRE', 'Brest', 'E'), # France
|
||||
'maison des metallos': ('FR', 'PAR', 'Paris', 'E'), # France
|
||||
'irht institut de recherche et d histoire des textes': ('FR', 'PAR', 'Paris', 'R'), # France
|
||||
'lucas laboratoire d usages culture s arts societe': ('FR', 'PAR', 'Paris', 'R'), # France
|
||||
'observatoire des politiques culturelles': ('FR', 'GRE', 'Grenoble', 'R'), # France
|
||||
'profilculture': ('FR', 'PAR', 'Paris', 'C'), # France
|
||||
'den gamle by': ('DK', 'AAR', 'Aarhus', 'M'), # Denmark
|
||||
'den kongelige samling': ('DK', 'CPH', 'Copenhagen', 'M'), # Denmark
|
||||
'kulturhusene i danmark': ('DK', 'CPH', 'Copenhagen', 'O'), # Denmark
|
||||
'kulturmonitor': ('DK', 'CPH', 'Copenhagen', 'R'), # Denmark
|
||||
'kulturhistorisk museum': ('NO', 'OSL', 'Oslo', 'M'), # Norway
|
||||
'castello di rivoli': ('IT', 'TOR', 'Torino', 'M'), # Italy
|
||||
'consorzio delle residenze reali sabaude': ('IT', 'TOR', 'Torino', 'M'), # Italy
|
||||
'fondazione canova onlus': ('IT', 'TRE', 'Treviso', 'M'), # Italy
|
||||
'fondazione pistoletto cittadellarte onlus': ('IT', 'BIE', 'Biella', 'M'), # Italy
|
||||
'lac lugano arte e cultura': ('IT', 'LUG', 'Lugano', 'M'), # Switzerland (Italian-speaking)
|
||||
'm9 museum': ('IT', 'VEN', 'Venice', 'M'), # Italy - actually in Mestre
|
||||
'gammel estrup': ('DK', 'AAR', 'Aarhus', 'M'), # Denmark
|
||||
'gedung sate museum': ('ID', 'BAN', 'Bandung', 'M'), # Indonesia
|
||||
'henry moore institute': ('GB', 'LEE', 'Leeds', 'M'), # UK
|
||||
'her place womens museum': ('AU', 'MEL', 'Melbourne', 'M'), # Australia
|
||||
'rigsarkivet': ('DK', 'CPH', 'Copenhagen', 'A'), # Denmark
|
||||
'royal armouries museum': ('GB', 'LEE', 'Leeds', 'M'), # UK
|
||||
'royal botanic gardens kew': ('GB', 'KEW', 'Kew', 'B'), # UK
|
||||
'the design museum': ('GB', 'LON', 'London', 'M'), # UK
|
||||
'the metropolitan museum of art': ('US', 'NYC', 'New York', 'M'), # US
|
||||
'thorvaldsens museum': ('DK', 'CPH', 'Copenhagen', 'M'), # Denmark
|
||||
'vitra design museum': ('DE', 'WEI', 'Weil am Rhein', 'M'), # Germany
|
||||
'war childhood museum': ('BA', 'SAR', 'Sarajevo', 'M'), # Bosnia
|
||||
'butser ancient farm': ('GB', 'PET', 'Petersfield', 'M'), # UK
|
||||
'icon film distribution anz': ('AU', 'SYD', 'Sydney', 'C'), # Australia
|
||||
'museum development north': ('GB', 'NEW', 'Newcastle', 'O'), # UK
|
||||
'museums association': ('GB', 'LON', 'London', 'N'), # UK
|
||||
'moya museum of young art': ('AT', 'VIE', 'Vienna', 'M'), # Austria
|
||||
'national churches trust': ('GB', 'LON', 'London', 'N'), # UK
|
||||
'national portrait gallery': ('GB', 'LON', 'London', 'M'), # UK
|
||||
'new contemporaries': ('GB', 'LON', 'London', 'N'), # UK
|
||||
'peabody essex museum': ('US', 'SAL', 'Salem', 'M'), # US
|
||||
'norient': ('CH', 'BER', 'Bern', 'R'), # Switzerland
|
||||
'stiftung trias gemeinnutzige stiftung fur boden okologie und wohnen': ('DE', 'HAT', 'Hattingen', 'N'), # Germany
|
||||
'nfdi4memory': ('DE', 'BER', 'Berlin', 'R'), # Germany
|
||||
'themuseumslab': ('DE', 'BER', 'Berlin', 'E'), # Germany
|
||||
|
||||
# ==========================================================================
|
||||
# INDONESIAN INSTITUTIONS (for ID-* PENDING files), followed by a few Indian entries
|
||||
# ==========================================================================
|
||||
'yayasan arsari djojohadikusumo': ('ID', 'JAK', 'Jakarta', 'N'), # Indonesia
|
||||
'yayasan konservasi alam nusantara': ('ID', 'JAK', 'Jakarta', 'N'), # Indonesia
|
||||
'southeast asia museum services seams': ('ID', 'JAK', 'Jakarta', 'O'), # Indonesia
|
||||
'museum and gallery of ipb future': ('ID', 'BOG', 'Bogor', 'M'), # Indonesia
|
||||
'museum dewantara kirti griya': ('ID', 'YOG', 'Yogyakarta', 'M'), # Indonesia
|
||||
'museum macan': ('ID', 'JAK', 'Jakarta', 'M'), # Indonesia
|
||||
'museum pasifika': ('ID', 'BAL', 'Bali', 'M'), # Indonesia
|
||||
'museum zoologi universitas andalas': ('ID', 'PAD', 'Padang', 'M'), # Indonesia
|
||||
'moja museum': ('ID', 'JAK', 'Jakarta', 'M'), # Indonesia - Museum of Jakarta
|
||||
'wassanindia': ('IN', 'DEL', 'Delhi', 'N'), # India
|
||||
'museum of contemporary tibetan art': ('IN', 'DHA', 'Dharamsala', 'M'), # India
|
||||
'vedica art studios and gallery': ('IN', 'DEL', 'Delhi', 'G'), # India
|
||||
|
||||
# ==========================================================================
|
||||
# AUSTRALIAN INSTITUTIONS
|
||||
# ==========================================================================
|
||||
'museumsppassmusees': ('AU', 'SYD', 'Sydney', 'O'), # Australia - museum pass program
|
||||
'australian museums and galleries association victoria': ('AU', 'MEL', 'Melbourne', 'N'),
|
||||
'australian society of archivists inc': ('AU', 'CAN', 'Canberra', 'N'),
|
||||
'history australia': ('AU', 'SYD', 'Sydney', 'R'),
|
||||
'melbourne holocaust museum': ('AU', 'MEL', 'Melbourne', 'M'),
|
||||
'national library of australia': ('AU', 'CAN', 'Canberra', 'L'),
|
||||
'professional historians association victoria and tasmania': ('AU', 'MEL', 'Melbourne', 'N'),
|
||||
'the university of queensland art museum': ('AU', 'BRI', 'Brisbane', 'M'),
|
||||
|
||||
# ==========================================================================
|
||||
# INDONESIAN INSTITUTIONS (additional)
|
||||
# ==========================================================================
|
||||
'arsip nasional republik indonesia anri': ('ID', 'JAK', 'Jakarta', 'A'),
|
||||
'art zoo museum': ('ID', 'JAK', 'Jakarta', 'M'),
|
||||
'art 1 new museum': ('ID', 'JAK', 'Jakarta', 'M'),
|
||||
'asmat museum of culture and progress': ('ID', 'AGT', 'Agats', 'M'),
|
||||
'cifor center for international forestry research': ('ID', 'BOG', 'Bogor', 'R'),
|
||||
'econusa foundation indonesia': ('ID', 'JAK', 'Jakarta', 'N'),
|
||||
'econusa foundation': ('ID', 'JAK', 'Jakarta', 'N'),
|
||||
'fisheries resource center of indonesia frci': ('ID', 'JAK', 'Jakarta', 'R'),
|
||||
'gaia indonesia': ('ID', 'JAK', 'Jakarta', 'N'),
|
||||
'jakarta history museum': ('ID', 'JAK', 'Jakarta', 'M'),
|
||||
'kite museum of indonesia': ('ID', 'JAK', 'Jakarta', 'M'),
|
||||
'konservasi indonesia': ('ID', 'JAK', 'Jakarta', 'N'),
|
||||
'ministry of tourism of the republic of indonesia': ('ID', 'JAK', 'Jakarta', 'O'),
|
||||
'museum batik indonesia': ('ID', 'YOG', 'Yogyakarta', 'M'),
|
||||
'museum musik indonesia': ('ID', 'JAK', 'Jakarta', 'M'),
|
||||
'museum nasional indonesia': ('ID', 'JAK', 'Jakarta', 'M'),
|
||||
'museum perkebunan indonesia': ('ID', 'MED', 'Medan', 'M'),
|
||||
'perpustakaan nasional republik indonesia perpusnas ri': ('ID', 'JAK', 'Jakarta', 'L'),
|
||||
'taman safari indonesia': ('ID', 'BOG', 'Bogor', 'B'),
|
||||
|
||||
# ==========================================================================
|
||||
# FRENCH INSTITUTIONS (additional)
|
||||
# ==========================================================================
|
||||
'alca nouvelle aquitaine': ('FR', 'BOR', 'Bordeaux', 'O'),
|
||||
'archives de rennes': ('FR', 'REN', 'Rennes', 'A'),
|
||||
'centre de recherche du chateau de versailles': ('FR', 'VER', 'Versailles', 'R'),
|
||||
'centre des monuments nationaux': ('FR', 'PAR', 'Paris', 'O'),
|
||||
'chateau de chantilly officiel': ('FR', 'CHA', 'Chantilly', 'M'),
|
||||
'cha teau de chantilly officiel': ('FR', 'CHA', 'Chantilly', 'M'), # normalized
|
||||
'france nature environnement': ('FR', 'PAR', 'Paris', 'N'),
|
||||
'ircam': ('FR', 'PAR', 'Paris', 'R'),
|
||||
'mucem musee des civilisations de l europe et de la mediterranee': ('FR', 'MAR', 'Marseille', 'M'),
|
||||
'mucem muse e des civilisations de l europe et de la me diterrane e': ('FR', 'MAR', 'Marseille', 'M'), # normalized
|
||||
'centre de recherche du cha teau de versailles': ('FR', 'VER', 'Versailles', 'R'), # normalized
|
||||
'musee d orsay': ('FR', 'PAR', 'Paris', 'M'),
|
||||
'muse e d orsay': ('FR', 'PAR', 'Paris', 'M'), # normalized variant
|
||||
'musee de bretagne': ('FR', 'REN', 'Rennes', 'M'),
|
||||
'muse e de bretagne': ('FR', 'REN', 'Rennes', 'M'), # normalized
|
||||
'musee des arts et metiers': ('FR', 'PAR', 'Paris', 'M'),
|
||||
'muse e des arts et me tiers': ('FR', 'PAR', 'Paris', 'M'), # normalized
|
||||
'musee du debarquement': ('FR', 'ARR', 'Arromanches', 'M'),
|
||||
'muse e du de barquement': ('FR', 'ARR', 'Arromanches', 'M'), # normalized
|
||||
'petites cites de caractere de france': ('FR', 'PAR', 'Paris', 'N'),
|
||||
'petites cite s de caracte re de france': ('FR', 'PAR', 'Paris', 'N'), # normalized
|
||||
'villa albertine the french institute for culture and education': ('US', 'NYC', 'New York', 'O'), # French in US
|
||||
|
||||
# ==========================================================================
|
||||
# GERMAN INSTITUTIONS (additional)
|
||||
# ==========================================================================
|
||||
'anne frank educational center': ('DE', 'FRA', 'Frankfurt', 'E'),
|
||||
'bildarchiv foto marburg': ('DE', 'MAR', 'Marburg', 'A'),
|
||||
'bundesvereinigung kulturelle kinder und jugendbildung bkj': ('DE', 'REM', 'Remscheid', 'N'),
|
||||
'common wadden sea secretariat': ('DE', 'WIL', 'Wilhelmshaven', 'O'),
|
||||
'deutsche stiftung denkmalschutz german foundation for monument protection': ('DE', 'BON', 'Bonn', 'N'),
|
||||
'deutsches archaologisches institut dai': ('DE', 'BER', 'Berlin', 'R'),
|
||||
'deutsches archa ologisches institut dai': ('DE', 'BER', 'Berlin', 'R'), # normalized
|
||||
'deutsches historisches museum': ('DE', 'BER', 'Berlin', 'M'),
|
||||
'deutsches zentrum kulturgutverluste': ('DE', 'MAG', 'Magdeburg', 'R'),
|
||||
'jewish museum berlin': ('DE', 'BER', 'Berlin', 'M'),
|
||||
'klassik stiftung weimar': ('DE', 'WEI', 'Weimar', 'M'),
|
||||
'kulturstiftung des bundes german federal cultural foundation': ('DE', 'HAL', 'Halle', 'N'),
|
||||
'stadtische galerie im lenbachhaus und kunstbau munchen': ('DE', 'MUN', 'Munich', 'M'),
|
||||
'sta dtische galerie im lenbachhaus und kunstbau mu nchen': ('DE', 'MUN', 'Munich', 'M'), # normalized
|
||||
'stiftung stadtmuseum berlin': ('DE', 'BER', 'Berlin', 'M'),
|
||||
|
||||
# ==========================================================================
|
||||
# BRITISH INSTITUTIONS (additional)
|
||||
# ==========================================================================
|
||||
'archaeological research services ltd': ('GB', 'BAK', 'Bakewell', 'R'),
|
||||
'british school at athens': ('GR', 'ATH', 'Athens', 'R'), # Greek location!
|
||||
'british trust for ornithology bto': ('GB', 'THE', 'Thetford', 'R'),
|
||||
'historic new england': ('US', 'BOS', 'Boston', 'N'), # US, not UK!
|
||||
'historic royal palaces': ('GB', 'LON', 'London', 'M'),
|
||||
'new england museum association': ('US', 'BOS', 'Boston', 'N'), # US, not UK!
|
||||
|
||||
# ==========================================================================
|
||||
# ITALIAN INSTITUTIONS (additional)
|
||||
# ==========================================================================
|
||||
'artribune': ('IT', 'ROM', 'Rome', 'C'),
|
||||
'centro conservazione restauro la venaria reale': ('IT', 'TOR', 'Turin', 'R'),
|
||||
'ecole francaise de rome efr': ('IT', 'ROM', 'Rome', 'R'),
|
||||
'e cole franc aise de rome efr': ('IT', 'ROM', 'Rome', 'R'), # normalized
|
||||
'museum tweestromenland': ('GE', 'BEN', 'Beneden-Leeuwen', 'M'), # Dutch, in Beneden-Leeuwen!
|
||||
'stichting roma aeterna': ('IT', 'ROM', 'Rome', 'N'),
|
||||
'triennale milano': ('IT', 'MIL', 'Milan', 'M'),
|
||||
|
||||
# ==========================================================================
|
||||
# BELGIAN INSTITUTIONS (additional)
|
||||
# ==========================================================================
|
||||
'advn': ('BE', 'ANT', 'Antwerpen', 'A'),
|
||||
'm leuven': ('BE', 'LEU', 'Leuven', 'M'),
|
||||
'museum voor schone kunsten gent': ('BE', 'GEN', 'Gent', 'M'),
|
||||
'wikimedia belgium': ('BE', 'BRU', 'Brussels', 'N'),
|
||||
|
||||
# ==========================================================================
|
||||
# US INSTITUTIONS (additional)
|
||||
# ==========================================================================
|
||||
'gia gemological institute of america': ('US', 'CAR', 'Carlsbad', 'R'),
|
||||
'international society of arboriculture': ('US', 'ATL', 'Atlanta', 'N'),
|
||||
'standwithus': ('US', 'LAX', 'Los Angeles', 'N'),
|
||||
|
||||
# ==========================================================================
|
||||
# DANISH INSTITUTIONS (additional)
|
||||
# ==========================================================================
|
||||
'aalborg teater': ('DK', 'AAL', 'Aalborg', 'E'),
|
||||
'augustinus fonden': ('DK', 'CPH', 'Copenhagen', 'N'),
|
||||
'kobenhavns museum museum of copenhagen': ('DK', 'CPH', 'Copenhagen', 'M'),
|
||||
'ko benhavns museum museum of copenhagen': ('DK', 'CPH', 'Copenhagen', 'M'), # normalized
|
||||
'københavns museum museum of copenhagen': ('DK', 'CPH', 'Copenhagen', 'M'), # with ø
|
||||
|
||||
# ==========================================================================
|
||||
# SPANISH INSTITUTIONS
|
||||
# ==========================================================================
|
||||
'centre de cultura contemporania de barcelona cccb': ('ES', 'BAR', 'Barcelona', 'M'),
|
||||
'centre de cultura contempora nia de barcelona cccb': ('ES', 'BAR', 'Barcelona', 'M'), # normalized
|
||||
'instituto del patrimonio cultural de espana ipce': ('ES', 'MAD', 'Madrid', 'O'),
|
||||
'instituto del patrimonio cultural de espan a ipce': ('ES', 'MAD', 'Madrid', 'O'), # normalized
|
||||
|
||||
# ==========================================================================
|
||||
# INDIAN INSTITUTIONS
|
||||
# ==========================================================================
|
||||
'placemaking india': ('IN', 'DEL', 'Delhi', 'N'),
|
||||
|
||||
# ==========================================================================
|
||||
# OTHER INTERNATIONAL
|
||||
# ==========================================================================
|
||||
'african wildlife foundation': ('KE', 'NAI', 'Nairobi', 'N'),
|
||||
'arabian oud': ('SA', 'RIY', 'Riyadh', 'C'),
|
||||
'wza rat althqa fh ministry of culture': ('SA', 'RIY', 'Riyadh', 'O'), # Saudi Ministry of Culture normalized
|
||||
'وزارة الثقافة ministry of culture': ('SA', 'RIY', 'Riyadh', 'O'), # Saudi Ministry of Culture Arabic
|
||||
'ministry of culture': ('SA', 'RIY', 'Riyadh', 'O'), # Saudi Ministry of Culture simple
|
||||
'dariah eric': ('EU', 'BRU', 'Brussels', 'R'),
|
||||
'embassy of the netherlands in israel': ('IL', 'TLV', 'Tel Aviv', 'O'),
|
||||
'european museum academy': ('EU', 'BRU', 'Brussels', 'N'),
|
||||
'iucn ssc shark specialist group ssg': ('CA', 'VAN', 'Vancouver', 'R'),
|
||||
'museum vosbergen': ('DR', 'EEL', 'Eelde', 'M'), # Dutch - in Eelde
|
||||
'bonhams': ('GB', 'LON', 'London', 'C'), # UK auction house
|
||||
|
||||
# ==========================================================================
|
||||
# REMAINING DUTCH
|
||||
# ==========================================================================
|
||||
'het nationale park de hoge veluwe': ('GE', 'OTT', 'Otterlo', 'N'),
|
||||
'lucas laboratoire d usages culture s arts socie te': ('FR', 'PAR', 'Paris', 'R'), # French org
|
||||
|
||||
# ==========================================================================
|
||||
# OTHER MISCELLANEOUS ORGANIZATIONS (mostly Dutch; some foreign entries are marked inline)
|
||||
# ==========================================================================
|
||||
'introdans': ('GE', 'ARN', 'Arnhem', 'E'),
|
||||
'ja21 het juiste antwoord': ('NH', 'AMS', 'Amsterdam', 'N'), # Political party - not heritage
|
||||
'kasteel radboud': ('NH', 'MED', 'Medemblik', 'M'),
|
||||
'klooster huissen': ('GE', 'HUI', 'Huissen', 'H'),
|
||||
'koninklijke luchtmacht historische vlucht': ('NH', 'GIL', 'Gilze-Rijen', 'M'),
|
||||
'koninklijke woudenberg': ('UT', 'WOU', 'Woudenberg', 'C'),
|
||||
'museum fiskershúske': ('FR', 'MOD', 'Moddergat', 'M'),
|
||||
'museum media': ('NH', 'AMS', 'Amsterdam', 'C'),
|
||||
'museum of 21st century design': ('GB', 'LON', 'London', 'M'), # UK
|
||||
'museum of comic art moca': ('US', 'NYC', 'New York', 'M'), # US
|
||||
'museum of edible earth': ('NL', 'AMS', 'Amsterdam', 'M'), # Actually NL-based
|
||||
'museum of humanity': ('GB', 'LON', 'London', 'M'), # UK
|
||||
'museum of looted antiquities': ('GB', 'LON', 'London', 'D'), # UK - virtual
|
||||
'museum of science': ('US', 'BOS', 'Boston', 'M'), # US
|
||||
'museumppassmusees': ('BE', 'BRU', 'Brussels', 'O'), # Belgium - museum pass
|
||||
'museumvereniging': ('NH', 'AMS', 'Amsterdam', 'N'),
|
||||
'oerol festival': ('FR', 'TER', 'Terschelling', 'E'),
|
||||
'qwen': ('CN', 'HAN', 'Hangzhou', 'C'), # China - AI company, not heritage
|
||||
'radio en museum': ('NH', 'AMS', 'Amsterdam', 'M'),
|
||||
'sothebys': ('GB', 'LON', 'London', 'C'), # UK
|
||||
'sothebys institute of art': ('GB', 'LON', 'London', 'E'), # UK
|
||||
'nieuwe veste': ('NB', 'BRE', 'Breda', 'E'),
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -216,8 +815,14 @@ def process_pending_file(filepath: Path, dry_run: bool = True) -> Optional[str]:
|
|||
abbrev = extract_abbreviation(emic_name)
|
||||
|
||||
# Handle non-Dutch organizations
|
||||
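# All non-NL countries get their country code as the country, with XX as province.
# NOTE(review): 'FR' and 'GR' also occur in the mapping as the first tuple element
# of Dutch entries (Friesland, e.g. Leeuwarden/Terschelling; Groningen), so those
# entries appear to be reclassified as France/Greece by this check - confirm how
# `province` is derived and disambiguate the codes if so.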
|
||||
FOREIGN_COUNTRIES = {
|
||||
'FR', 'DK', 'IT', 'BE', 'DE', 'GB', 'US', 'AT', 'AU', 'BA', 'ES',
|
||||
'EU', 'ID', 'IL', 'IN', 'MA', 'NO', 'PT', 'PS', 'ZA', 'CA', 'GR', 'KE', 'SA',
|
||||
'CH', 'CN'
|
||||
}
|
||||
country = 'NL'
|
||||
if province in ['FR', 'DK', 'IT', 'BE', 'DE', 'GB', 'US']:
|
||||
if province in FOREIGN_COUNTRIES:
|
||||
country = province
|
||||
province = 'XX'
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue