#!/usr/bin/env python3
"""Enrich NL-XX-XXX files with correct location data via web search.

The LinkedIn HTML extraction method was flawed - it extracted location data
from wrong companies in the HTML. This script uses web search to find correct
locations.

Strategy:
1. Read custodian name and website from YAML file
2. Search web for "[name] Netherlands location address city"
3. Parse results to extract city/region
4. Update YAML file with correct location
5. Regenerate GHCID based on new location
"""

# Standard library
import json
import os
import re
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple

# Third party
import yaml

# Directory containing custodian files
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# GeoNames database for settlement lookup
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")

# Dutch province name (lowercase substring) -> ISO-style 2-letter code.
# Order matters: more specific names ('noord-brabant') must precede their
# substrings ('brabant') because get_region_code matches by containment
# in insertion order.
PROVINCE_MAP = {
    'drenthe': 'DR',
    'friesland': 'FR',
    'fryslân': 'FR',
    'gelderland': 'GE',
    'groningen': 'GR',
    'limburg': 'LI',
    'noord-brabant': 'NB',
    'north brabant': 'NB',
    'nordbrabant': 'NB',
    'brabant': 'NB',
    'noord-holland': 'NH',
    'north holland': 'NH',
    'overijssel': 'OV',
    'utrecht': 'UT',
    'zeeland': 'ZE',
    'zuid-holland': 'ZH',
    'south holland': 'ZH',
    'flevoland': 'FL',
}

# Dutch city (lowercase) -> fixed 3-letter code for common cities.
# Cities not listed here get a code derived from their first three letters
# (see get_city_code).
CITY_CODES = {
    'amsterdam': 'AMS',
    'rotterdam': 'ROT',
    'den haag': 'DHA',
    'the hague': 'DHA',
    "'s-gravenhage": 'DHA',
    'utrecht': 'UTR',
    'eindhoven': 'EIN',
    'groningen': 'GRO',
    'tilburg': 'TIL',
    'almere': 'ALM',
    'breda': 'BRE',
    'nijmegen': 'NIJ',
    'apeldoorn': 'APE',
    'haarlem': 'HAA',
    'arnhem': 'ARN',
    'enschede': 'ENS',
    'amersfoort': 'AME',
    'zaanstad': 'ZAA',
    'haarlemmermeer': 'HMM',
    'zwolle': 'ZWO',
    'leiden': 'LEI',
    'maastricht': 'MAA',
    'dordrecht': 'DOR',
    'zoetermeer': 'ZOE',
    'deventer': 'DEV',
    'delft': 'DEL',
    'alkmaar': 'ALK',
    'venlo': 'VEN',
    'leeuwarden': 'LEE',
    'heerlen': 'HEE',
    'hilversum': 'HIL',
    'assen': 'ASS',
    'schiedam': 'SCH',
    'weert': 'WEE',
    'duivendrecht': 'DUI',
    'noordwijk': 'NOO',
}

# Pre-compiled location-extraction patterns, hoisted to module level so they
# are compiled once rather than rebuilt on every call to
# extract_location_from_search_results.
_LOCATION_PATTERNS = [
    re.compile(p, re.IGNORECASE)
    for p in (
        # "City, Netherlands (Province)"
        r'(\w+(?:\s+\w+)?)\s*,\s*Netherlands\s*\((\w+(?:\s+\w+)?)\)',
        # "in City, Province"
        r'in\s+(\w+(?:\s+\w+)?)\s*,\s*(Noord-Holland|Zuid-Holland|Noord-Brabant|Gelderland|Limburg|Overijssel|Friesland|Drenthe|Groningen|Utrecht|Zeeland|Flevoland)',
        # "legal seat in City" (single capture group)
        r'legal\s+seat\s+in\s+(\w+)',
        # "Address: ..., 1234 AB City" (Dutch postal code, then city)
        r'Address[:\s]+[^,]+,\s*(\d{4}\s*[A-Z]{2})\s+(\w+)',
        # Dutch postal code pattern: "1234 AB City, Netherlands"
        r'(\d{4}\s*[A-Z]{2})\s+(\w+(?:\s+\w+)?)\s*,?\s*(?:Netherlands|NL)',
    )
]

_POSTAL_CODE_RE = re.compile(r'\d{4}\s*[A-Z]{2}')


def get_city_code(city: str) -> str:
    """Return the 3-letter code for *city*.

    Known cities come from CITY_CODES; unknown cities get the first three
    alphabetic characters upper-cased, right-padded with 'X' when the name
    has fewer than three letters.
    """
    city_lower = city.lower().strip()
    if city_lower in CITY_CODES:
        return CITY_CODES[city_lower]
    # Generate code from first 3 letters (letters only, padded with 'X')
    clean = re.sub(r'[^a-z]', '', city_lower)
    return clean[:3].upper() if len(clean) >= 3 else clean.upper().ljust(3, 'X')


def get_region_code(region: str) -> Optional[str]:
    """Return the 2-letter province code for *region*, or None if unknown.

    Matches by substring containment so inputs like "Provincie Utrecht"
    still resolve; relies on PROVINCE_MAP's insertion order to test more
    specific names first.
    """
    region_lower = region.lower().strip()
    for key, code in PROVINCE_MAP.items():
        if key in region_lower:
            return code
    return None


def extract_location_from_search_results(results: list) -> Optional[dict]:
    """Extract city and region from Exa search results.

    Parameters
    ----------
    results : list of dict
        Each dict may carry 'text' and 'title' keys (missing keys are
        treated as empty strings).

    Returns
    -------
    dict with keys 'city', 'region_code' (may be None) and 'source_text'
    (first 200 chars of the matched result) for the first result/pattern
    that matches, or None when nothing matches.
    """
    for result in results:
        text = result.get('text', '') + ' ' + result.get('title', '')
        for pattern in _LOCATION_PATTERNS:
            match = pattern.search(text)
            if not match:
                continue
            groups = match.groups()
            region = None
            if len(groups) >= 2 and _POSTAL_CODE_RE.match(groups[0]):
                # First group is a Dutch postal code -> city is group 2.
                city = groups[1]
            elif len(groups) >= 2:
                city = groups[0]
                region = groups[1]
            else:
                # Single-group pattern ("legal seat in City").
                city = groups[0]
            region_code = get_region_code(region) if region else None
            return {
                'city': city.strip(),
                'region_code': region_code,
                'source_text': text[:200],
            }
    return None


def search_institution_location(name: str, website: Optional[str] = None) -> Optional[dict]:
    """Search the web for an institution's location using Exa.

    NOTE: this is currently a stub — it builds the query string but always
    returns None, because the Exa MCP client cannot be imported here; the
    actual search is performed via the MCP tool in the main flow.
    """
    # Build search query
    query = f'"{name}" Netherlands location address city'
    if website and 'lnkd.in' not in website:
        # Add website domain to the query for better results
        domain = re.sub(r'https?://(www\.)?', '', website).split('/')[0]
        query = f'site:{domain} OR "{name}" Netherlands address city location'
    # Use Exa via subprocess (since we can't import the MCP client directly)
    # For now, return None - we'll use the MCP tool directly in the main flow
    return None


def find_xxx_files_needing_enrichment():
    """Collect NL-XX-XXX custodian files that need location enrichment.

    Returns a list of dicts with keys: 'file' (Path), 'name', 'website',
    'slug', and the parsed YAML 'content'. Unreadable files are reported
    and skipped; empty YAML documents are skipped silently.
    """
    files = []
    for f in sorted(CUSTODIAN_DIR.glob("NL-XX-XXX-*.yaml")):
        try:
            with open(f, 'r', encoding='utf-8') as file:
                content = yaml.safe_load(file)
            if not content:
                continue
            # Get institution name
            name = content.get('custodian_name', {}).get('emic_name', '')
            # Get website
            website = content.get('linkedin_enrichment', {}).get('website')
            # Get LinkedIn slug
            slug = content.get('linkedin_enrichment', {}).get('linkedin_slug', '')
            files.append({
                'file': f,
                'name': name,
                'website': website,
                'slug': slug,
                'content': content,
            })
        except Exception as e:
            # Best-effort scan: report and continue with the remaining files.
            print(f"Error reading {f}: {e}")
    return files


def update_file_with_location(file_info: dict, city: str, region_code: str, source: str):
    """Update a YAML file with corrected location data and regenerate its GHCID.

    Parameters
    ----------
    file_info : dict
        Entry from find_xxx_files_needing_enrichment ('file', 'content', 'name').
    city : str
        Resolved city name.
    region_code : str
        2-letter province code.
        NOTE(review): a None/empty region_code would produce a GHCID like
        "NL-None-..." — callers should resolve the region first; confirm.
    source : str
        Provenance string describing where the location came from.

    Returns the new GHCID string, or None when the filename does not match
    the expected NL-XX-XXX-{TYPE}-{ABBREV} pattern (the location and
    provenance note are still written in that case).
    """
    f = file_info['file']
    content = file_info['content']
    name = file_info['name']

    # Get city code
    city_code = get_city_code(city)

    # Update location
    content['location'] = {
        'city': city,
        'region': region_code,
        'country': 'NL',
    }

    # Generate new GHCID: extract type and abbreviation from the filename.
    # Pattern: NL-XX-XXX-{TYPE}-{ABBREV}[-{name_suffix}]
    filename = f.stem
    match = re.match(r'NL-XX-XXX-([A-Z])-(.+)', filename)
    if match:
        inst_type = match.group(1)
        abbrev_suffix = match.group(2)
        new_ghcid = f"NL-{region_code}-{city_code}-{inst_type}-{abbrev_suffix}"

        # Update GHCID
        if 'ghcid' not in content:
            content['ghcid'] = {}
        old_ghcid = content['ghcid'].get('ghcid_current', filename)
        content['ghcid']['ghcid_current'] = new_ghcid
        content['ghcid']['ghcid_original'] = old_ghcid

        # Update history
        content['ghcid']['ghcid_history'] = [{
            'ghcid': new_ghcid,
            'ghcid_numeric': content['ghcid'].get('ghcid_numeric'),
            'valid_from': datetime.now(timezone.utc).isoformat(),
            'valid_to': None,
            'reason': f'Location enriched via web search: {city}, {region_code}',
        }]

        # Add location resolution
        content['ghcid']['location_resolution'] = {
            'method': 'WEB_SEARCH',
            'city': city,
            'city_code': city_code,
            'region_code': region_code,
            'country_code': 'NL',
            'source': source,
            'resolution_date': datetime.now(timezone.utc).isoformat(),
        }

    # Add provenance note
    if 'provenance' not in content:
        content['provenance'] = {}
    if 'notes' not in content['provenance']:
        content['provenance']['notes'] = []
    content['provenance']['notes'].append(
        f"Location enriched via web search on {datetime.now(timezone.utc).strftime('%Y-%m-%d')}: {city}, {region_code}"
    )

    # Write back
    with open(f, 'w', encoding='utf-8') as file:
        yaml.dump(content, file, default_flow_style=False,
                  allow_unicode=True, sort_keys=False)

    return new_ghcid if match else None


def main():
    """List NL-XX-XXX files needing location enrichment, grouped by website validity."""
    print("Finding NL-XX-XXX files needing location enrichment...\n")
    files = find_xxx_files_needing_enrichment()
    print(f"Found {len(files)} files\n")

    # Group by whether they have a usable website (lnkd.in shortlinks don't count)
    with_website = [f for f in files
                    if f['website'] and 'lnkd.in' not in str(f['website'])]
    without_website = [f for f in files
                       if not f['website'] or 'lnkd.in' in str(f['website'])]

    print(f"Files with valid website: {len(with_website)}")
    print(f"Files without valid website: {len(without_website)}")

    print("\n--- Sample files with websites (first 20) ---")
    for f in with_website[:20]:
        print(f"  {f['name']}")
        print(f"    Website: {f['website']}")
        print(f"    File: {f['file'].name}")
        print()


if __name__ == "__main__":
    main()