glam/scripts/enrich_xxx_via_web_search.py
2025-12-17 11:58:40 +01:00

297 lines
9.6 KiB
Python

#!/usr/bin/env python3
"""
Enrich NL-XX-XXX files with correct location data via web search.
The LinkedIn HTML extraction method was flawed - it extracted location data from
wrong companies in the HTML. This script uses web search to find correct locations.
Strategy:
1. Read custodian name and website from YAML file
2. Search web for "[name] Netherlands location address city"
3. Parse results to extract city/region
4. Update YAML file with correct location
5. Regenerate GHCID based on new location
"""
import os
import re
import yaml
import json
import subprocess
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Tuple
# Directory containing custodian YAML files to be enriched.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# GeoNames database for settlement lookup.
# NOTE(review): not referenced anywhere in this script — presumably reserved
# for a future settlement-validation step; confirm before removing.
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")

# Dutch province mapping: lowercase province name (including English and
# Frisian spelling variants) -> 2-letter province code used in GHCIDs.
# Keys are matched by substring in get_region_code(), so the bare 'brabant'
# entry also catches otherwise-unlisted compound spellings.
PROVINCE_MAP = {
    'drenthe': 'DR',
    'friesland': 'FR', 'fryslân': 'FR',
    'gelderland': 'GE',
    'groningen': 'GR',
    'limburg': 'LI',
    'noord-brabant': 'NB', 'north brabant': 'NB', 'nordbrabant': 'NB', 'brabant': 'NB',
    'noord-holland': 'NH', 'north holland': 'NH',
    'overijssel': 'OV',
    'utrecht': 'UT',
    'zeeland': 'ZE',
    'zuid-holland': 'ZH', 'south holland': 'ZH',
    'flevoland': 'FL',
}
# Dutch city to 3-letter code mapping (common cities).
# Keys are lowercase city names (with English/official spelling variants for
# Den Haag).  Cities not listed here get a code generated from their first
# three letters in get_city_code().
CITY_CODES = {
    'amsterdam': 'AMS',
    'rotterdam': 'ROT',
    'den haag': 'DHA', 'the hague': 'DHA', "'s-gravenhage": 'DHA',
    'utrecht': 'UTR',
    'eindhoven': 'EIN',
    'groningen': 'GRO',
    'tilburg': 'TIL',
    'almere': 'ALM',
    'breda': 'BRE',
    'nijmegen': 'NIJ',
    'apeldoorn': 'APE',
    'haarlem': 'HAA',
    'arnhem': 'ARN',
    'enschede': 'ENS',
    'amersfoort': 'AME',
    'zaanstad': 'ZAA',
    'haarlemmermeer': 'HMM',
    'zwolle': 'ZWO',
    'leiden': 'LEI',
    'maastricht': 'MAA',
    'dordrecht': 'DOR',
    'zoetermeer': 'ZOE',
    'deventer': 'DEV',
    'delft': 'DEL',
    'alkmaar': 'ALK',
    'venlo': 'VEN',
    'leeuwarden': 'LEE',
    'heerlen': 'HEE',
    'hilversum': 'HIL',
    'assen': 'ASS',
    'schiedam': 'SCH',
    'weert': 'WEE',
    'duivendrecht': 'DUI',
    'noordwijk': 'NOO',
}
def get_city_code(city: str) -> str:
    """Return the 3-letter GHCID code for *city*.

    Known cities come from the CITY_CODES table.  Anything else falls back to
    the first three alphabetic (a-z) characters of the lowercased name,
    upper-cased, right-padded with 'X' when fewer than three letters remain.
    """
    key = city.lower().strip()
    known = CITY_CODES.get(key)
    if known is not None:
        return known
    # Derive a fallback code from the alphabetic characters only.
    letters = re.sub(r'[^a-z]', '', key)
    if len(letters) >= 3:
        return letters[:3].upper()
    return letters.upper().ljust(3, 'X')
def get_region_code(region: str) -> Optional[str]:
    """Map a free-text province/region name to its 2-letter code.

    Returns the code of the first PROVINCE_MAP key that occurs as a
    substring of the lowercased input, or None when nothing matches.
    """
    needle = region.lower().strip()
    return next(
        (code for key, code in PROVINCE_MAP.items() if key in needle),
        None,
    )
def extract_location_from_search_results(results: list) -> Optional[dict]:
    """Extract city and region from Exa search results.

    Scans each result's text+title against a series of Dutch-location
    patterns and returns the first hit.

    Args:
        results: list of dicts with optional 'text' and 'title' string keys.

    Returns:
        {'city': str, 'region_code': str | None, 'source_text': str}
        for the first match, or None when no result matches any pattern.
    """
    # Patterns to match Dutch locations
    patterns = [
        # "City, Netherlands (Province)"
        r'(\w+(?:\s+\w+)?)\s*,\s*Netherlands\s*\((\w+(?:\s+\w+)?)\)',
        # "in City, Province"
        r'in\s+(\w+(?:\s+\w+)?)\s*,\s*(Noord-Holland|Zuid-Holland|Noord-Brabant|Gelderland|Limburg|Overijssel|Friesland|Drenthe|Groningen|Utrecht|Zeeland|Flevoland)',
        # "legal seat in City"
        r'legal\s+seat\s+in\s+(\w+)',
        # "Address: ... City"
        r'Address[:\s]+[^,]+,\s*(\d{4}\s*[A-Z]{2})\s+(\w+)',
        # Dutch postal code pattern
        r'(\d{4}\s*[A-Z]{2})\s+(\w+(?:\s+\w+)?)\s*,?\s*(?:Netherlands|NL)',
    ]
    # BUGFIX: this check must be case-insensitive, like the searches below.
    # Previously a lowercase postal code ("1015 cj") failed the check and
    # was returned as the city name.
    postal_code_re = re.compile(r'\d{4}\s*[A-Z]{2}', re.IGNORECASE)
    for result in results:
        text = result.get('text', '') + ' ' + result.get('title', '')
        # Try each pattern in priority order; first match wins.
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue
            groups = match.groups()
            if len(groups) >= 2:
                if postal_code_re.match(groups[0]):
                    # First group is a postal code: city follows, no region.
                    city = groups[1]
                    region = None
                else:
                    city = groups[0]
                    region = groups[1]
            else:
                # Single-group pattern ("legal seat in City").
                city = groups[0]
                region = None
            region_code = get_region_code(region) if region else None
            return {
                'city': city.strip(),
                'region_code': region_code,
                'source_text': text[:200]
            }
    return None
def search_institution_location(name: str, website: Optional[str] = None) -> Optional[dict]:
    """Search web for institution location using Exa.

    Currently a placeholder: it only assembles the search query.  The Exa
    MCP client cannot be imported here, so the actual lookup is driven by
    the MCP tool in the main flow and this function always returns None.
    """
    query = f'"{name}" Netherlands location address city'
    # LinkedIn shortlink domains (lnkd.in) are useless for site: queries.
    if website and 'lnkd.in' not in website:
        # Reduce the URL to its bare domain (drop scheme, www., and path).
        bare = re.sub(r'https?://(www\.)?', '', website)
        domain = bare.split('/')[0]
        query = f'site:{domain} OR "{name}" Netherlands address city location'
    # Use Exa via subprocess (since we can't import the MCP client directly)
    # For now, return None - we'll use the MCP tool directly in the main flow
    return None
def find_xxx_files_needing_enrichment():
    """Find NL-XX-XXX files that need location enrichment.

    Scans CUSTODIAN_DIR for NL-XX-XXX-*.yaml files and returns a list of
    dicts with keys 'file', 'name', 'website', 'slug' and 'content'
    (the parsed YAML).  Empty files are skipped; unreadable ones are
    reported to stdout and skipped.
    """
    found = []
    for path in sorted(CUSTODIAN_DIR.glob("NL-XX-XXX-*.yaml")):
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                data = yaml.safe_load(handle)
            if not data:
                continue
            # Name and LinkedIn metadata may be absent; default gracefully.
            enrichment = data.get('linkedin_enrichment', {})
            found.append({
                'file': path,
                'name': data.get('custodian_name', {}).get('emic_name', ''),
                'website': enrichment.get('website'),
                'slug': enrichment.get('linkedin_slug', ''),
                'content': data,
            })
        except Exception as exc:
            # Best-effort scan: report and keep going.
            print(f"Error reading {path}: {exc}")
    return found
def update_file_with_location(file_info: dict, city: str, region_code: str, source: str):
    """Update a YAML file with correct location data.

    Overwrites the file's `location` block with the resolved city/region and,
    when the filename matches the NL-XX-XXX-{TYPE}-{ABBREV} pattern, rewrites
    the GHCID, records how the location was resolved, appends a provenance
    note, and writes the YAML back to disk.

    Args:
        file_info: dict from find_xxx_files_needing_enrichment() with keys
            'file' (Path), 'content' (parsed YAML dict) and 'name'.
        city: resolved city name.
        region_code: 2-letter Dutch province code.
        source: description of where the location was found (stored in
            ghcid.location_resolution.source).

    Returns:
        The new GHCID string, or None when the filename did not match the
        expected pattern.  The file is written back in either case.
    """
    f = file_info['file']
    content = file_info['content']
    name = file_info['name']  # NOTE(review): read but never used below — confirm
    # Get city code
    city_code = get_city_code(city)
    # Overwrite the location block wholesale with the resolved values.
    content['location'] = {
        'city': city,
        'region': region_code,
        'country': 'NL'
    }
    # Generate new GHCID
    # Extract type and abbreviation from filename
    filename = f.stem
    # Pattern: NL-XX-XXX-{TYPE}-{ABBREV}[-{name_suffix}]
    match = re.match(r'NL-XX-XXX-([A-Z])-(.+)', filename)
    if match:
        inst_type = match.group(1)
        abbrev_suffix = match.group(2)
        new_ghcid = f"NL-{region_code}-{city_code}-{inst_type}-{abbrev_suffix}"
        # Update GHCID
        if 'ghcid' not in content:
            content['ghcid'] = {}
        old_ghcid = content['ghcid'].get('ghcid_current', filename)
        content['ghcid']['ghcid_current'] = new_ghcid
        # NOTE(review): ghcid_original is overwritten on every run, so a second
        # enrichment pass would record the previous enriched id as "original" —
        # confirm whether re-runs are expected.
        content['ghcid']['ghcid_original'] = old_ghcid
        # Update history
        # NOTE(review): the history list is replaced, not appended to — any
        # earlier entries are discarded; confirm this is intended.
        content['ghcid']['ghcid_history'] = [{
            'ghcid': new_ghcid,
            'ghcid_numeric': content['ghcid'].get('ghcid_numeric'),
            'valid_from': datetime.now(timezone.utc).isoformat(),
            'valid_to': None,
            'reason': f'Location enriched via web search: {city}, {region_code}'
        }]
        # Add location resolution
        content['ghcid']['location_resolution'] = {
            'method': 'WEB_SEARCH',
            'city': city,
            'city_code': city_code,
            'region_code': region_code,
            'country_code': 'NL',
            'source': source,
            'resolution_date': datetime.now(timezone.utc).isoformat()
        }
    # Add provenance note
    # NOTE(review): indentation reconstructed — the provenance note and the
    # write-back are assumed to run even when the filename pattern does not
    # match (the conditional return below implies the write is unconditional).
    if 'provenance' not in content:
        content['provenance'] = {}
    if 'notes' not in content['provenance']:
        content['provenance']['notes'] = []
    content['provenance']['notes'].append(
        f"Location enriched via web search on {datetime.now(timezone.utc).strftime('%Y-%m-%d')}: {city}, {region_code}"
    )
    # Write back
    with open(f, 'w', encoding='utf-8') as file:
        yaml.dump(content, file, default_flow_style=False, allow_unicode=True, sort_keys=False)
    return new_ghcid if match else None
def main():
    """List the NL-XX-XXX custodian files that still need enrichment.

    Prints a summary split by website availability plus a sample of the
    first 20 files that have a usable (non-LinkedIn-shortlink) website.
    """
    print("Finding NL-XX-XXX files needing location enrichment...\n")
    files = find_xxx_files_needing_enrichment()
    print(f"Found {len(files)} files\n")

    def has_valid_website(info):
        # A lnkd.in shortlink is not a usable institution website.
        site = info['website']
        return bool(site) and 'lnkd.in' not in str(site)

    with_website = [info for info in files if has_valid_website(info)]
    without_website = [info for info in files if not has_valid_website(info)]
    print(f"Files with valid website: {len(with_website)}")
    print(f"Files without valid website: {len(without_website)}")
    print("\n--- Sample files with websites (first 20) ---")
    for info in with_website[:20]:
        print(f" {info['name']}")
        print(f" Website: {info['website']}")
        print(f" File: {info['file'].name}")
        print()


if __name__ == "__main__":
    main()