297 lines
9.6 KiB
Python
297 lines
9.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich NL-XX-XXX files with correct location data via web search.
|
|
|
|
The LinkedIn HTML extraction method was flawed - it extracted location data from
|
|
wrong companies in the HTML. This script uses web search to find correct locations.
|
|
|
|
Strategy:
|
|
1. Read custodian name and website from YAML file
|
|
2. Search web for "[name] Netherlands location address city"
|
|
3. Parse results to extract city/region
|
|
4. Update YAML file with correct location
|
|
5. Regenerate GHCID based on new location
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import yaml
|
|
import json
|
|
import subprocess
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Optional, Tuple
|
|
|
|
# Directory containing custodian files
# NOTE(review): absolute, user-specific path — breaks on any other machine;
# consider deriving it from an environment variable or a project-relative root.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# GeoNames database for settlement lookup
# NOTE(review): not referenced anywhere in this file — presumably used by a
# sibling script or a later enrichment step; confirm before removing.
GEONAMES_DB = Path("/Users/kempersc/apps/glam/data/reference/geonames.db")
|
|
|
|
# Dutch province mapping
# Maps lower-case province names (including English and alternate spellings)
# to the 2-letter codes used in GHCIDs. get_region_code matches these keys by
# substring, so the bare 'brabant' alias catches Brabant variants not listed
# explicitly; the compound keys ('noord-brabant', ...) come first so they win
# under insertion-order iteration.
PROVINCE_MAP = {
    'drenthe': 'DR',
    'friesland': 'FR', 'fryslân': 'FR',
    'gelderland': 'GE',
    'groningen': 'GR',
    'limburg': 'LI',
    'noord-brabant': 'NB', 'north brabant': 'NB', 'nordbrabant': 'NB', 'brabant': 'NB',
    'noord-holland': 'NH', 'north holland': 'NH',
    'overijssel': 'OV',
    'utrecht': 'UT',
    'zeeland': 'ZE',
    'zuid-holland': 'ZH', 'south holland': 'ZH',
    'flevoland': 'FL',
}
|
|
|
|
# Dutch city to 3-letter code mapping (common cities)
# Lower-case city names (plus English/archaic aliases) -> 3-letter GHCID codes.
CITY_CODES = {
    'amsterdam': 'AMS',
    'rotterdam': 'ROT',
    'den haag': 'DHA', 'the hague': 'DHA', "'s-gravenhage": 'DHA',
    'utrecht': 'UTR',
    'eindhoven': 'EIN',
    'groningen': 'GRO',
    'tilburg': 'TIL',
    'almere': 'ALM',
    'breda': 'BRE',
    'nijmegen': 'NIJ',
    'apeldoorn': 'APE',
    'haarlem': 'HAA',
    'arnhem': 'ARN',
    'enschede': 'ENS',
    'amersfoort': 'AME',
    'zaanstad': 'ZAA',
    'haarlemmermeer': 'HMM',
    'zwolle': 'ZWO',
    'leiden': 'LEI',
    'maastricht': 'MAA',
    'dordrecht': 'DOR',
    'zoetermeer': 'ZOE',
    'deventer': 'DEV',
    'delft': 'DEL',
    'alkmaar': 'ALK',
    'venlo': 'VEN',
    'leeuwarden': 'LEE',
    'heerlen': 'HEE',
    'hilversum': 'HIL',
    'assen': 'ASS',
    'schiedam': 'SCH',
    'weert': 'WEE',
    'duivendrecht': 'DUI',
    'noordwijk': 'NOO',
}


def get_city_code(city: str) -> str:
    """Return the 3-letter GHCID code for *city*.

    Known cities are looked up in CITY_CODES; unknown names fall back to
    the first three a-z letters of the normalized name, upper-cased and
    right-padded with 'X' when fewer than three letters remain.
    """
    normalized = city.strip().lower()
    known = CITY_CODES.get(normalized)
    if known is not None:
        return known
    # Derive a code: keep only plain letters, then take (or pad to) three.
    letters = re.sub(r'[^a-z]', '', normalized)
    if len(letters) >= 3:
        return letters[:3].upper()
    return letters.upper().ljust(3, 'X')
|
|
|
|
|
|
def get_region_code(region: str) -> Optional[str]:
    """Return the 2-letter province code whose name occurs in *region*.

    Matching is case-insensitive and by substring (e.g. "provincie
    Noord-Holland" resolves to 'NH'); PROVINCE_MAP insertion order decides
    ties. Returns None when no known province name is found.
    """
    needle = region.strip().lower()
    return next(
        (code for province, code in PROVINCE_MAP.items() if province in needle),
        None,
    )
|
|
|
|
|
|
def extract_location_from_search_results(results: list) -> Optional[dict]:
    """Extract a Dutch city (and optionally province) from Exa search results.

    Args:
        results: Search-result dicts; 'text' and 'title' values (when
            present) are concatenated and scanned against a fixed set of
            location patterns.

    Returns:
        On the first match: a dict with 'city', 'region_code' (2-letter
        province code or None) and 'source_text' (first 200 chars of the
        matched text). None when nothing matches.
    """
    # Patterns to match Dutch locations, tried in order per result.
    patterns = [
        # "City, Netherlands (Province)"
        r'(\w+(?:\s+\w+)?)\s*,\s*Netherlands\s*\((\w+(?:\s+\w+)?)\)',
        # "in City, Province"
        r'in\s+(\w+(?:\s+\w+)?)\s*,\s*(Noord-Holland|Zuid-Holland|Noord-Brabant|Gelderland|Limburg|Overijssel|Friesland|Drenthe|Groningen|Utrecht|Zeeland|Flevoland)',
        # "legal seat in City" (single capture: city only)
        r'legal\s+seat\s+in\s+(\w+)',
        # "Address: ..., 1234 AB City"
        r'Address[:\s]+[^,]+,\s*(\d{4}\s*[A-Z]{2})\s+(\w+)',
        # Dutch postal code pattern: "1234 AB City, Netherlands"
        r'(\d{4}\s*[A-Z]{2})\s+(\w+(?:\s+\w+)?)\s*,?\s*(?:Netherlands|NL)',
    ]

    # The searches below run with re.IGNORECASE, so a captured postal code
    # may be lower-case ("1017 ab"); this check must be case-insensitive
    # too. Previously it was case-sensitive, so a lower-case postal code
    # leaked through as the "city".
    postal_re = re.compile(r'\d{4}\s*[A-Z]{2}', re.IGNORECASE)

    for result in results:
        text = result.get('text', '') + ' ' + result.get('title', '')

        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue

            groups = match.groups()
            if len(groups) >= 2:
                if postal_re.match(groups[0]):
                    # First capture is the postal code; the city follows it.
                    city = groups[1]
                    region = None
                else:
                    city = groups[0]
                    region = groups[1]
            else:
                # Single-capture pattern ("legal seat in City").
                city = groups[0]
                region = None

            city = city.strip()
            region_code = get_region_code(region) if region else None

            return {
                'city': city,
                'region_code': region_code,
                'source_text': text[:200]
            }

    return None
|
|
|
|
|
|
def search_institution_location(name: str, website: Optional[str] = None) -> Optional[dict]:
    """Placeholder web search for an institution's location.

    Assembles the Exa query string — adding a site: filter when a usable,
    non-LinkedIn-shortener website is known — but performs no lookup: the
    MCP search client cannot be imported here, so the actual search is
    driven from the main flow. Always returns None.
    """
    usable_site = bool(website) and 'lnkd.in' not in website
    if usable_site:
        # Prefer results from the institution's own domain.
        domain = re.sub(r'https?://(www\.)?', '', website).split('/')[0]
        query = f'site:{domain} OR "{name}" Netherlands address city location'
    else:
        query = f'"{name}" Netherlands location address city'

    # Intentionally unexecuted: the Exa MCP tool is invoked by the caller.
    return None
|
|
|
|
|
|
def find_xxx_files_needing_enrichment():
    """Collect NL-XX-XXX custodian YAML files that need location enrichment.

    Scans CUSTODIAN_DIR for files matching NL-XX-XXX-*.yaml and returns a
    list of dicts with keys 'file' (Path), 'name' (emic name or ''),
    'website', 'slug' and 'content' (the parsed YAML document).
    Unreadable or invalid files are reported to stdout and skipped.
    """
    files = []

    for f in sorted(CUSTODIAN_DIR.glob("NL-XX-XXX-*.yaml")):
        try:
            with open(f, 'r', encoding='utf-8') as file:
                content = yaml.safe_load(file)

            # Empty document (or YAML null) -> nothing to enrich.
            if not content:
                continue

            # A key present with a YAML null loads as None, and dict.get's
            # default is ignored when the key exists — so guard each nested
            # block with `or {}` to avoid AttributeError on None.
            name_block = content.get('custodian_name') or {}
            enrichment = content.get('linkedin_enrichment') or {}

            files.append({
                'file': f,
                'name': name_block.get('emic_name', ''),
                'website': enrichment.get('website'),
                'slug': enrichment.get('linkedin_slug', ''),
                'content': content
            })
        except Exception as e:
            # Best-effort scan: report the bad file and keep going.
            print(f"Error reading {f}: {e}")

    return files
|
|
|
|
|
|
def update_file_with_location(file_info: dict, city: str, region_code: str, source: str):
    """Update a YAML file with correct location data.

    Writes the resolved location into the parsed YAML, regenerates the
    GHCID from the filename pattern when possible, appends a provenance
    note, and rewrites the file in place.

    Args:
        file_info: Dict from find_xxx_files_needing_enrichment with
            'file' (Path), 'content' (parsed YAML) and 'name'.
        city: Resolved city name.
        region_code: 2-letter Dutch province code.
        source: Where the location was found (recorded in
            location_resolution).

    Returns:
        The new GHCID string when the filename matched the expected
        NL-XX-XXX pattern, otherwise None. The file is rewritten with the
        updated location either way.
    """
    f = file_info['file']
    content = file_info['content']
    name = file_info['name']  # NOTE(review): bound but never used below

    # Get city code
    city_code = get_city_code(city)

    # Update location
    content['location'] = {
        'city': city,
        'region': region_code,
        'country': 'NL'
    }

    # Generate new GHCID
    # Extract type and abbreviation from filename
    filename = f.stem
    # Pattern: NL-XX-XXX-{TYPE}-{ABBREV}[-{name_suffix}]
    match = re.match(r'NL-XX-XXX-([A-Z])-(.+)', filename)
    if match:
        inst_type = match.group(1)
        abbrev_suffix = match.group(2)

        new_ghcid = f"NL-{region_code}-{city_code}-{inst_type}-{abbrev_suffix}"

        # Update GHCID
        if 'ghcid' not in content:
            content['ghcid'] = {}

        # Preserve the previous id as ghcid_original (falls back to the
        # filename when no ghcid_current was recorded).
        old_ghcid = content['ghcid'].get('ghcid_current', filename)
        content['ghcid']['ghcid_current'] = new_ghcid
        content['ghcid']['ghcid_original'] = old_ghcid

        # Update history
        # NOTE(review): this replaces any existing ghcid_history with a
        # single fresh entry rather than appending — confirm the wholesale
        # reset is intended.
        content['ghcid']['ghcid_history'] = [{
            'ghcid': new_ghcid,
            'ghcid_numeric': content['ghcid'].get('ghcid_numeric'),
            'valid_from': datetime.now(timezone.utc).isoformat(),
            'valid_to': None,
            'reason': f'Location enriched via web search: {city}, {region_code}'
        }]

        # Add location resolution
        content['ghcid']['location_resolution'] = {
            'method': 'WEB_SEARCH',
            'city': city,
            'city_code': city_code,
            'region_code': region_code,
            'country_code': 'NL',
            'source': source,
            'resolution_date': datetime.now(timezone.utc).isoformat()
        }

    # Add provenance note (done even when the filename didn't match, since
    # the location itself was still updated above).
    if 'provenance' not in content:
        content['provenance'] = {}
    if 'notes' not in content['provenance']:
        content['provenance']['notes'] = []
    content['provenance']['notes'].append(
        f"Location enriched via web search on {datetime.now(timezone.utc).strftime('%Y-%m-%d')}: {city}, {region_code}"
    )

    # Write back
    with open(f, 'w', encoding='utf-8') as file:
        yaml.dump(content, file, default_flow_style=False, allow_unicode=True, sort_keys=False)

    # new_ghcid only exists when the filename matched; the conditional
    # expression short-circuits, so there is no NameError on the
    # no-match path.
    return new_ghcid if match else None
|
|
|
|
|
|
def main():
    """Report which NL-XX-XXX files still need location enrichment."""
    print("Finding NL-XX-XXX files needing location enrichment...\n")

    files = find_xxx_files_needing_enrichment()
    print(f"Found {len(files)} files\n")

    # Partition on whether a usable (non-LinkedIn-shortener) website is known.
    with_website = []
    without_website = []
    for entry in files:
        site = entry['website']
        if site and 'lnkd.in' not in str(site):
            with_website.append(entry)
        else:
            without_website.append(entry)

    print(f"Files with valid website: {len(with_website)}")
    print(f"Files without valid website: {len(without_website)}")

    print("\n--- Sample files with websites (first 20) ---")
    for entry in with_website[:20]:
        print(f"  {entry['name']}")
        print(f"    Website: {entry['website']}")
        print(f"    File: {entry['file'].name}")
        print()


if __name__ == "__main__":
    main()
|