# glam/scripts/enrich_belgian_locations.py

#!/usr/bin/env python3
"""
Infer city names from Belgian institution names and enrich location data.

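This script:
1. Loads Belgian ISIL institutions from the detailed CSV export
   (data/isil/belgian_isil_detailed.csv)
2. Extracts city names from institution names using pattern matching
3. Updates Location objects with inferred cities
4. Prints the city distribution and the institutions left without a city
   (geocoding addresses with Nominatim is not implemented in this script)
5. Exports the enriched records as YAML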

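Pattern strategies:
- "Bibliotheek [City]" → City
- "Bibliotheek van [City]" → City
- "Bib [City]" → City (short form)
- "(Bibliotheek [City])" / "(Bib [City])" in parentheses → City
- "Archief [City]" / "Archive [City]" (optionally with "van") → City
- "Stadsbibliotheek [City]" → City
- "Museum [City]" / "Museum van [City]" → City
- "[City] - ..." / "[City]: ..." at the start of the name → City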
"""

import sys
import re
from pathlib import Path
from typing import Optional, List, Tuple
sys.path.insert(0, 'src')

from glam_extractor.parsers.belgian_isil import BelgianISILParser
from glam_extractor.models import Location
from linkml_runtime.dumpers import yaml_dumper

# Belgian city name patterns
#
# A city "word" is two or more letters. ``[^\W\d_]`` is the Unicode-aware
# spelling of "any letter" (a word character that is neither a digit nor an
# underscore), so accented names such as "Liège" are captured in full instead
# of being cut off at the first non-ASCII letter.
_CITY_WORD = r"[^\W\d_]{2,}"

# One or more city words joined by hyphens, e.g. "Sint-Niklaas".
_CITY_NAME = _CITY_WORD + r"(?:-" + _CITY_WORD + r")*"

# Each entry is (compiled pattern, number of the group holding the city).
# Entries are ordered by priority: consumers try them from first to last.
# All patterns are case-insensitive, so the regex itself does not enforce a
# leading capital letter; code that needs one must check the captured text.
CITY_PATTERNS = [
    # Pattern 1: "Bibliotheek [City]" or "Bibliotheek van [City]"
    # (also keeps the multi-word suffix of names like "Heist op den Berg")
    (re.compile(r'Bibliotheek(?:\s+van)?\s+(' + _CITY_NAME + r'(?:\s+op\s+den\s+Berg)?)', re.IGNORECASE), 1),

    # Pattern 2: "Bib [City]" (short form)
    (re.compile(r'\bBib\s+(' + _CITY_NAME + r')', re.IGNORECASE), 1),

    # Pattern 3: Parentheses with city info
    (re.compile(r'\((?:Bibliotheek|Bib)\s+(' + _CITY_NAME + r')\)', re.IGNORECASE), 1),

    # Pattern 4: Archive patterns
    (re.compile(r'(?:Archief|Archive)(?:\s+van)?\s+(' + _CITY_NAME + r')', re.IGNORECASE), 1),

    # Pattern 5: Stadsbibliotheek
    (re.compile(r'Stadsbibliotheek\s+(' + _CITY_NAME + r')', re.IGNORECASE), 1),

    # Pattern 6: Museum patterns
    (re.compile(r'Museum\s+(?:van\s+)?(' + _CITY_NAME + r')', re.IGNORECASE), 1),

    # Pattern 7: City at start with separator
    (re.compile(r'^(' + _CITY_NAME + r')\s*[-:]', re.IGNORECASE), 1),
]

# Words the city patterns may capture that are never a city name.
# Membership is case-sensitive, which is why the Dutch function words are
# listed in both lower-case and capitalised form.
FALSE_POSITIVES = {
    # Dutch articles and prepositions
    'de',
    'De',
    'het',
    'Het',
    'van',
    'Van',
    # Person name, not city
    'Vondel',
    # Abbreviation
    'AS',
    # Institutional qualifiers (Dutch / English)
    'Koninklijk',
    'Royal',
    'Provinciale',
    'Provincial',
}

def extract_city_from_name(institution_name: str) -> Optional[str]:
    """
    Extract city name from Belgian institution name using pattern matching.

    The patterns in ``CITY_PATTERNS`` are tried in order. For each pattern
    every occurrence in the name is examined, so a rejected first hit (for
    example "Bibliotheek de ...") does not hide a later valid hit of the same
    pattern. The first candidate that survives filtering is returned.

    Args:
        institution_name: Institution name to parse. An empty or missing
            name yields no city.

    Returns:
        City name if found, None otherwise
    """
    # Nothing to search in; also avoids a TypeError from the regex on None.
    if not institution_name:
        return None

    for pattern, group_num in CITY_PATTERNS:
        for match in pattern.finditer(institution_name):
            city = match.group(group_num).strip()

            # Filter false positives (exact, case-sensitive membership)
            if not city or city in FALSE_POSITIVES:
                continue

            # The patterns are case-insensitive, so the capital initial
            # expected of a place name has to be checked here.
            if not city[0].isupper():
                continue

            return city

    return None


def _percent(part: int, total: int) -> float:
    """Return ``part`` as a percentage of ``total``, or 0.0 when ``total`` is 0."""
    return part / total * 100 if total else 0.0


def _has_city(custodian) -> bool:
    """Return True when the custodian's first location carries a city."""
    return bool(custodian.locations and custodian.locations[0].city)


def enrich_belgian_locations():
    """
    Infer cities for Belgian ISIL institutions and export the result as YAML.

    Steps, each reported on stdout:
    1. Parse ``data/isil/belgian_isil_detailed.csv`` with ``BelgianISILParser``.
    2. Run ``extract_city_from_name`` on every institution name. A new
       ``Location(city=..., country="BE")`` is added when the institution has
       no location; otherwise the city of its first location is filled in
       only if it is empty.
    3. Print the 15 most frequent inferred cities.
    4. Print up to five records that have a city.
    5. Print up to five records that still lack a city.
    6. Write all records to
       ``data/instances/belgium_isil_institutions_enriched.yaml``.

    Both paths are relative to the current working directory. The output
    directory is created if it does not exist, and an empty input produces a
    header-only file with 0.0% statistics instead of a division error.
    """
    print("=" * 70)
    print("Belgian Institution Location Enrichment")
    print("=" * 70)

    # Load Belgian institutions
    print("\n1. Loading Belgian institutions...")
    parser = BelgianISILParser()
    custodians = parser.parse_and_convert('data/isil/belgian_isil_detailed.csv')
    total = len(custodians)
    print(f"   ✓ Loaded {total} institutions")

    # Extract cities
    print("\n2. Extracting city names from institution names...")
    enriched_count = 0
    city_counts = {}

    for custodian in custodians:
        city = extract_city_from_name(custodian.name)
        if not city:
            continue

        if not custodian.locations:
            # No existing location - create new one
            custodian.locations = [Location(city=city, country="BE")]
            enriched_count += 1
        elif not custodian.locations[0].city:
            # Update existing location if city is missing
            custodian.locations[0].city = city
            enriched_count += 1

        # Track city frequency. Every inferred city is counted, including
        # those not applied because the record already had a city.
        city_counts[city] = city_counts.get(city, 0) + 1

    # The records do not change after this point, so the figures reused by
    # the report, the file header and the summary are computed once.
    with_locations = sum(1 for c in custodians if c.locations)
    with_city = [c for c in custodians if _has_city(c)]
    no_location = [c for c in custodians if not _has_city(c)]

    print(f"   ✓ Enriched {enriched_count} institutions with city data")
    print(f"   ✓ Total institutions with locations: {with_locations}")

    # Show city distribution
    print("\n3. City distribution (top 15):")
    sorted_cities = sorted(city_counts.items(), key=lambda x: x[1], reverse=True)
    for city, count in sorted_cities[:15]:
        print(f"   {city:20} : {count} institutions")

    # Show enrichment examples
    print("\n4. Sample enriched records:")
    for c in with_city[:5]:
        print(f"   {c.id}: {c.name[:45]:45} → {c.locations[0].city}")

    # Institutions without cities
    print(f"\n5. Institutions without city data: {len(no_location)}")
    if no_location:
        print("   Sample (first 5):")
        for c in no_location[:5]:
            print(f"   {c.id}: {c.name[:60]}")

    # Export enriched data
    print("\n6. Exporting enriched YAML...")
    output_file = Path("data/instances/belgium_isil_institutions_enriched.yaml")
    # open() does not create missing directories, so make sure it exists.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Belgian ISIL Registry Institutions (Location Enriched)\n")
        f.write("# Scraped from https://isil.kbr.be/ (Royal Library of Belgium)\n")
        f.write(f"# Total institutions: {total}\n")
        f.write(f"# Institutions with location data: {with_locations}\n")
        f.write("# Data tier: TIER_1_AUTHORITATIVE\n")
        f.write("#\n")
        f.write("---\n\n")

        # NOTE(review): records are written back to back after a single
        # "---" marker, with only a blank line between them. If
        # yaml_dumper.dumps() emits one top-level mapping per record, the
        # file repeats the same keys inside one document - confirm that the
        # consumers of this file expect this layout.
        for idx, custodian in enumerate(custodians, 1):
            f.write(yaml_dumper.dumps(custodian))
            f.write("\n")

            if idx % 50 == 0:
                print(f"   ... exported {idx} institutions")

    file_size_kb = output_file.stat().st_size / 1024
    print(f"   ✓ Exported to: {output_file}")
    print(f"   ✓ File size: {file_size_kb:.1f} KB")

    # Summary statistics
    print("\n" + "=" * 70)
    print("Enrichment Summary")
    print("=" * 70)
    print(f"Total institutions:           {total}")
    print(f"With location data:           {with_locations} ({_percent(with_locations, total):.1f}%)")
    print(f"With city names:              {len(with_city)} ({_percent(len(with_city), total):.1f}%)")
    print(f"Unique cities:                {len(city_counts)}")
    print(f"Without location data:        {len(no_location)} ({_percent(len(no_location), total):.1f}%)")

    print("\n✓ Location enrichment complete!")


# Script entry point: run the enrichment only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    enrich_belgian_locations()