#!/usr/bin/env python3
"""
Enrich NL-XX-XXX custodian files with location data from LinkedIn HTML files.

This script:
1. Parses LinkedIn HTML files to extract company names and headquarters locations
2. Matches custodian files by emic_name
3. Updates custodian files with correct country, region (province), and city codes
4. Regenerates GHCID based on new location data
"""
import json
import os
import re
import unicodedata
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

import yaml

# Dutch province code mapping
|
|
DUTCH_PROVINCE_CODES = {
|
|
'drenthe': 'DR',
|
|
'flevoland': 'FL',
|
|
'friesland': 'FR',
|
|
'fryslân': 'FR',
|
|
'gelderland': 'GE',
|
|
'groningen': 'GR',
|
|
'limburg': 'LI',
|
|
'noord-brabant': 'NB',
|
|
'noord brabant': 'NB',
|
|
'north brabant': 'NB',
|
|
'noord-holland': 'NH',
|
|
'noord holland': 'NH',
|
|
'north holland': 'NH',
|
|
'overijssel': 'OV',
|
|
'utrecht': 'UT',
|
|
'zeeland': 'ZE',
|
|
'zuid-holland': 'ZH',
|
|
'zuid holland': 'ZH',
|
|
'south holland': 'ZH',
|
|
}
|
|
|
|
# City to province mapping for common Dutch cities
|
|
DUTCH_CITY_TO_PROVINCE = {
|
|
'amsterdam': 'NH',
|
|
'rotterdam': 'ZH',
|
|
'den haag': 'ZH',
|
|
'the hague': 'ZH',
|
|
"'s-gravenhage": 'ZH',
|
|
's-gravenhage': 'ZH',
|
|
'utrecht': 'UT',
|
|
'eindhoven': 'NB',
|
|
'tilburg': 'NB',
|
|
'groningen': 'GR',
|
|
'almere': 'FL',
|
|
'breda': 'NB',
|
|
'nijmegen': 'GE',
|
|
'arnhem': 'GE',
|
|
'haarlem': 'NH',
|
|
'enschede': 'OV',
|
|
'maastricht': 'LI',
|
|
'leiden': 'ZH',
|
|
'dordrecht': 'ZH',
|
|
'apeldoorn': 'GE',
|
|
'zwolle': 'OV',
|
|
'amersfoort': 'UT',
|
|
'delft': 'ZH',
|
|
'alkmaar': 'NH',
|
|
'zaandam': 'NH',
|
|
'leeuwarden': 'FR',
|
|
'hilversum': 'NH',
|
|
'deventer': 'OV',
|
|
'middelburg': 'ZE',
|
|
'assen': 'DR',
|
|
'wageningen': 'GE',
|
|
'lelystad': 'FL',
|
|
'venlo': 'LI',
|
|
'heerlen': 'LI',
|
|
'sittard': 'LI',
|
|
'oss': 'NB',
|
|
"'s-hertogenbosch": 'NB',
|
|
's-hertogenbosch': 'NB',
|
|
'den bosch': 'NB',
|
|
'gouda': 'ZH',
|
|
'schiedam': 'ZH',
|
|
'zoetermeer': 'ZH',
|
|
'alphen aan den rijn': 'ZH',
|
|
'emmen': 'DR',
|
|
'kampen': 'OV',
|
|
'harderwijk': 'GE',
|
|
'hoorn': 'NH',
|
|
'purmerend': 'NH',
|
|
'vlaardingen': 'ZH',
|
|
'beverwijk': 'NH',
|
|
'hoofddorp': 'NH',
|
|
'amstelveen': 'NH',
|
|
'diemen': 'NH',
|
|
'nieuwegein': 'UT',
|
|
'zeist': 'UT',
|
|
'veenendaal': 'UT',
|
|
'helmond': 'NB',
|
|
'roosendaal': 'NB',
|
|
'bergen op zoom': 'NB',
|
|
'waalwijk': 'NB',
|
|
'vlissingen': 'ZE',
|
|
'goes': 'ZE',
|
|
'terneuzen': 'ZE',
|
|
'roermond': 'LI',
|
|
'weert': 'LI',
|
|
'kerkrade': 'LI',
|
|
'geleen': 'LI',
|
|
'doetinchem': 'GE',
|
|
'tiel': 'GE',
|
|
'ede': 'GE',
|
|
'barneveld': 'GE',
|
|
'winterswijk': 'GE',
|
|
'almelo': 'OV',
|
|
'hengelo': 'OV',
|
|
'oldenzaal': 'OV',
|
|
'steenwijk': 'OV',
|
|
'meppel': 'DR',
|
|
'hoogeveen': 'DR',
|
|
'coevorden': 'DR',
|
|
'drachten': 'FR',
|
|
'sneek': 'FR',
|
|
'heerenveen': 'FR',
|
|
'harlingen': 'FR',
|
|
'franeker': 'FR',
|
|
}
|
|
|
|
|
|
def normalize_name(name: str) -> str:
|
|
"""Normalize a name for matching."""
|
|
if not name:
|
|
return ""
|
|
# NFD decomposition to separate base characters from diacritics
|
|
normalized = unicodedata.normalize('NFD', name.lower())
|
|
# Remove diacritics
|
|
ascii_name = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
|
|
# Remove punctuation and extra spaces
|
|
ascii_name = re.sub(r'[^\w\s]', ' ', ascii_name)
|
|
ascii_name = re.sub(r'\s+', ' ', ascii_name).strip()
|
|
return ascii_name
|
|
|
|
|
|
def generate_city_code(city: str) -> str:
|
|
"""Generate a 3-letter city code from city name."""
|
|
if not city:
|
|
return "XXX"
|
|
|
|
city = city.strip()
|
|
words = city.split()
|
|
|
|
# Handle Dutch articles
|
|
dutch_articles = {'de', 'het', 'den', "'s", 's'}
|
|
|
|
if len(words) == 1:
|
|
# Single word: first 3 letters
|
|
return city[:3].upper()
|
|
elif words[0].lower() in dutch_articles:
|
|
# Dutch article + word: article initial + 2 from main word
|
|
if len(words) >= 2:
|
|
return (words[0][0] + words[1][:2]).upper()
|
|
else:
|
|
# Multi-word: initials
|
|
initials = ''.join(w[0] for w in words if w.lower() not in dutch_articles)
|
|
return initials[:3].upper()
|
|
|
|
return city[:3].upper()
|
|
|
|
|
|
def extract_linkedin_locations(html_dir: Path) -> Dict[str, Dict[str, Any]]:
|
|
"""
|
|
Extract company names and locations from LinkedIn HTML files.
|
|
|
|
Returns dict mapping normalized company name -> location info
|
|
"""
|
|
locations = {}
|
|
|
|
html_files = list(html_dir.glob("*.html"))
|
|
print(f"Found {len(html_files)} HTML files to process")
|
|
|
|
for html_file in html_files:
|
|
try:
|
|
content = html_file.read_text(encoding='utf-8', errors='ignore')
|
|
|
|
# Extract company name from filename
|
|
# Format: "(N) Company Name_ People _ LinkedIn.html"
|
|
filename = html_file.stem
|
|
match = re.match(r'\(\d+\)\s*(.+?)_\s*People\s*_\s*LinkedIn', filename)
|
|
if match:
|
|
company_name = match.group(1).strip()
|
|
else:
|
|
# Try without the number prefix
|
|
match = re.match(r'(.+?)_\s*People\s*_\s*LinkedIn', filename)
|
|
if match:
|
|
company_name = match.group(1).strip()
|
|
else:
|
|
continue
|
|
|
|
# Extract headquarter location from JSON embedded in HTML
|
|
# Pattern: "headquarter":{"streetAddressOptOut":...,"address":{"country":"NL",...,"city":"Amsterdam",...}}
|
|
hq_pattern = r'"headquarter":\s*\{[^}]*"address":\s*\{([^}]+)\}'
|
|
hq_matches = re.findall(hq_pattern, content)
|
|
|
|
if hq_matches:
|
|
# Get the first headquarter (usually the main one)
|
|
address_json = '{' + hq_matches[0] + '}'
|
|
try:
|
|
# Clean up JSON (remove $recipeTypes etc.)
|
|
address_json = re.sub(r'"\$[^"]*":\s*\[[^\]]*\]', '', address_json)
|
|
address_json = re.sub(r'"\$[^"]*":\s*"[^"]*"', '', address_json)
|
|
address_json = re.sub(r',\s*,', ',', address_json)
|
|
address_json = re.sub(r',\s*}', '}', address_json)
|
|
|
|
address_data = json.loads(address_json)
|
|
|
|
country = address_data.get('country', '')
|
|
city = address_data.get('city', '')
|
|
region = address_data.get('geographicArea', '')
|
|
|
|
if country or city:
|
|
normalized = normalize_name(company_name)
|
|
locations[normalized] = {
|
|
'original_name': company_name,
|
|
'country': country,
|
|
'city': city,
|
|
'region': region,
|
|
'source_file': str(html_file.name),
|
|
}
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
except Exception as e:
|
|
print(f"Error processing {html_file.name}: {e}")
|
|
continue
|
|
|
|
return locations
|
|
|
|
|
|
def get_province_code(city: str, region: str, country: str) -> str:
|
|
"""Get Dutch province code from city or region."""
|
|
if country != 'NL':
|
|
return 'XX'
|
|
|
|
# Try region first
|
|
if region:
|
|
region_lower = region.lower().strip()
|
|
if region_lower in DUTCH_PROVINCE_CODES:
|
|
return DUTCH_PROVINCE_CODES[region_lower]
|
|
|
|
# Try city
|
|
if city:
|
|
city_lower = city.lower().strip()
|
|
if city_lower in DUTCH_CITY_TO_PROVINCE:
|
|
return DUTCH_CITY_TO_PROVINCE[city_lower]
|
|
|
|
return 'XX'
|
|
|
|
|
|
def generate_abbreviation(name: str) -> str:
|
|
"""Generate abbreviation from institution name."""
|
|
if not name:
|
|
return "XXX"
|
|
|
|
# Skip words (articles, prepositions)
|
|
skip_words = {
|
|
'de', 'het', 'een', 'van', 'voor', 'in', 'op', 'te', 'den', 'der', 'des', "'s",
|
|
'aan', 'bij', 'met', 'naar', 'om', 'tot', 'uit', 'over', 'onder', 'door', 'en', 'of',
|
|
'a', 'an', 'the', 'of', 'at', 'on', 'to', 'for', 'with', 'from', 'by', 'as', 'and', 'or',
|
|
'stichting', 'vereniging', 'foundation', 'museum', 'archief', 'bibliotheek',
|
|
}
|
|
|
|
words = name.split()
|
|
initials = []
|
|
|
|
for word in words:
|
|
# Skip if word is in skip list
|
|
word_lower = word.lower().strip('.,;:!?()')
|
|
if word_lower in skip_words:
|
|
continue
|
|
# Skip numbers
|
|
if word.isdigit():
|
|
continue
|
|
# Get first letter, normalize to ASCII
|
|
if word:
|
|
first = word[0]
|
|
# Normalize diacritics
|
|
first = unicodedata.normalize('NFD', first)
|
|
first = ''.join(c for c in first if unicodedata.category(c) != 'Mn')
|
|
if first.isalpha():
|
|
initials.append(first.upper())
|
|
|
|
if not initials:
|
|
# Fallback: first 3 letters of first word
|
|
return name[:3].upper()
|
|
|
|
return ''.join(initials[:10]) # Max 10 chars
|
|
|
|
|
|
def update_custodian_file(
|
|
file_path: Path,
|
|
country: str,
|
|
region_code: str,
|
|
city: str,
|
|
city_code: str,
|
|
source_file: str
|
|
) -> Tuple[str, str]:
|
|
"""
|
|
Update a custodian YAML file with new location data.
|
|
|
|
Returns (old_ghcid, new_ghcid)
|
|
"""
|
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
|
|
# Get current GHCID
|
|
old_ghcid = data.get('ghcid', {}).get('ghcid_current', '')
|
|
|
|
# Get institution type
|
|
inst_types = data.get('institution_type', ['M'])
|
|
inst_type = inst_types[0] if inst_types else 'M'
|
|
|
|
# Get emic name for abbreviation
|
|
emic_name = data.get('custodian_name', {}).get('emic_name', '')
|
|
abbreviation = generate_abbreviation(emic_name)
|
|
|
|
# Generate new GHCID
|
|
new_ghcid = f"{country}-{region_code}-{city_code}-{inst_type}-{abbreviation}"
|
|
|
|
# Check for name suffix in old GHCID
|
|
old_parts = old_ghcid.split('-')
|
|
if len(old_parts) > 5:
|
|
# Has name suffix
|
|
name_suffix = '-'.join(old_parts[5:])
|
|
new_ghcid = f"{new_ghcid}-{name_suffix}"
|
|
|
|
# Update location
|
|
if 'location' not in data:
|
|
data['location'] = {}
|
|
|
|
data['location']['city'] = city
|
|
data['location']['region'] = region_code
|
|
data['location']['country'] = country
|
|
|
|
# Update GHCID
|
|
if 'ghcid' not in data:
|
|
data['ghcid'] = {}
|
|
|
|
data['ghcid']['ghcid_current'] = new_ghcid
|
|
data['ghcid']['location_resolution'] = {
|
|
'method': 'LINKEDIN_HTML_EXTRACTION',
|
|
'source_file': source_file,
|
|
'city_code': city_code,
|
|
'region_code': region_code,
|
|
'country_code': country,
|
|
}
|
|
|
|
# Write back
|
|
with open(file_path, 'w', encoding='utf-8') as f:
|
|
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
|
|
|
return old_ghcid, new_ghcid
|
|
|
|
|
|
def main():
|
|
custodian_dir = Path('/Users/kempersc/apps/glam/data/custodian')
|
|
linkedin_dir = Path('/Users/kempersc/apps/glam/data/custodian/person/affiliated/manual')
|
|
|
|
# Step 1: Extract locations from LinkedIn HTML files
|
|
print("Extracting locations from LinkedIn HTML files...")
|
|
linkedin_locations = extract_linkedin_locations(linkedin_dir)
|
|
print(f"Extracted locations for {len(linkedin_locations)} companies")
|
|
|
|
# Step 2: Find XXX custodian files
|
|
xxx_files = list(custodian_dir.glob('NL-XX-XXX-*.yaml'))
|
|
print(f"\nFound {len(xxx_files)} NL-XX-XXX files to process")
|
|
|
|
# Step 3: Match and update
|
|
matched = 0
|
|
not_matched = []
|
|
updates = []
|
|
non_nl_files = []
|
|
|
|
for file_path in xxx_files:
|
|
try:
|
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
|
|
emic_name = data.get('custodian_name', {}).get('emic_name', '')
|
|
if not emic_name:
|
|
not_matched.append((file_path.name, "No emic_name"))
|
|
continue
|
|
|
|
normalized = normalize_name(emic_name)
|
|
|
|
# Try to find a match
|
|
location = linkedin_locations.get(normalized)
|
|
|
|
if not location:
|
|
# Try partial matching
|
|
for key, loc in linkedin_locations.items():
|
|
if normalized in key or key in normalized:
|
|
location = loc
|
|
break
|
|
|
|
if location:
|
|
country = location['country']
|
|
city = location['city']
|
|
region = location.get('region', '')
|
|
source_file = location['source_file']
|
|
|
|
if country != 'NL':
|
|
# Non-Dutch institution - mark for later
|
|
non_nl_files.append({
|
|
'file': file_path.name,
|
|
'emic_name': emic_name,
|
|
'country': country,
|
|
'city': city,
|
|
})
|
|
continue
|
|
|
|
# Get province code
|
|
region_code = get_province_code(city, region, country)
|
|
city_code = generate_city_code(city)
|
|
|
|
if region_code == 'XX' and city_code == 'XXX':
|
|
not_matched.append((file_path.name, f"No province/city for {city}"))
|
|
continue
|
|
|
|
# Update the file
|
|
old_ghcid, new_ghcid = update_custodian_file(
|
|
file_path, country, region_code, city, city_code, source_file
|
|
)
|
|
|
|
updates.append({
|
|
'file': file_path.name,
|
|
'emic_name': emic_name,
|
|
'old_ghcid': old_ghcid,
|
|
'new_ghcid': new_ghcid,
|
|
'city': city,
|
|
'region': region_code,
|
|
})
|
|
matched += 1
|
|
else:
|
|
not_matched.append((file_path.name, emic_name))
|
|
|
|
except Exception as e:
|
|
print(f"Error processing {file_path.name}: {e}")
|
|
|
|
# Report results
|
|
print(f"\n{'='*60}")
|
|
print(f"RESULTS")
|
|
print(f"{'='*60}")
|
|
print(f"Total XXX files: {len(xxx_files)}")
|
|
print(f"Matched and updated: {matched}")
|
|
print(f"Not matched: {len(not_matched)}")
|
|
print(f"Non-NL institutions found: {len(non_nl_files)}")
|
|
|
|
if updates:
|
|
print(f"\n--- UPDATES ({len(updates)}) ---")
|
|
for u in updates[:20]:
|
|
print(f" {u['emic_name'][:40]:<40} | {u['city']:<15} | {u['region']} | {u['new_ghcid']}")
|
|
if len(updates) > 20:
|
|
print(f" ... and {len(updates) - 20} more")
|
|
|
|
if non_nl_files:
|
|
print(f"\n--- NON-NL INSTITUTIONS ({len(non_nl_files)}) ---")
|
|
for nf in non_nl_files[:10]:
|
|
print(f" {nf['emic_name'][:40]:<40} | {nf['country']} | {nf['city']}")
|
|
if len(non_nl_files) > 10:
|
|
print(f" ... and {len(non_nl_files) - 10} more")
|
|
|
|
if not_matched:
|
|
print(f"\n--- NOT MATCHED ({len(not_matched)}) ---")
|
|
for nm in not_matched[:20]:
|
|
print(f" {nm[0]:<50} | {nm[1][:40]}")
|
|
if len(not_matched) > 20:
|
|
print(f" ... and {len(not_matched) - 20} more")
|
|
|
|
return updates, not_matched, non_nl_files
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|