#!/usr/bin/env python3
"""
Infer city names from Belgian institution names and enrich location data.

This script:
1. Loads Belgian ISIL institutions via BelgianISILParser (CSV input)
2. Extracts city names from institution names using pattern matching
3. Updates Location objects with inferred cities
4. Re-exports enriched YAML

NOTE(review): an earlier revision advertised optional Nominatim geocoding;
no geocoding is performed by this script.

Pattern strategies:
- "Bibliotheek [City]" → City
- "Bibliotheek van [City]" → City
- "Bib [City]" → City (short form)
- "Stadsbibliotheek [City]" → City
- "Archief [City]" → City
- Institution name contains parentheses with city info
"""

import re
import sys
from collections import Counter
from pathlib import Path
from typing import List, Optional, Tuple

sys.path.insert(0, 'src')

from glam_extractor.parsers.belgian_isil import BelgianISILParser
from glam_extractor.models import Location
from linkml_runtime.dumpers import yaml_dumper

# Belgian city name patterns.
# NOTE(review): with re.IGNORECASE the [A-Z]/[a-z] classes match either case,
# so capitalization is NOT enforced here; extract_city_from_name() enforces
# it afterwards via the city[0].isupper() check.
CITY_PATTERNS: List[Tuple[re.Pattern, int]] = [
    # Pattern 1: "Bibliotheek [City]" or "Bibliotheek van [City]"
    (re.compile(r'Bibliotheek(?:\s+van)?\s+([A-Z][a-z]+(?:-[A-Z][a-z]+)*(?:\s+op\s+den\s+Berg)?)', re.IGNORECASE), 1),
    # Pattern 2: "Bib [City]" (short form)
    (re.compile(r'\bBib\s+([A-Z][a-z]+(?:-[A-Z][a-z]+)*)', re.IGNORECASE), 1),
    # Pattern 3: Parentheses with city info
    (re.compile(r'\((?:Bibliotheek|Bib)\s+([A-Z][a-z]+(?:-[A-Z][a-z]+)*)\)', re.IGNORECASE), 1),
    # Pattern 4: Archive patterns
    (re.compile(r'(?:Archief|Archive)(?:\s+van)?\s+([A-Z][a-z]+(?:-[A-Z][a-z]+)*)', re.IGNORECASE), 1),
    # Pattern 5: Stadsbibliotheek
    (re.compile(r'Stadsbibliotheek\s+([A-Z][a-z]+(?:-[A-Z][a-z]+)*)', re.IGNORECASE), 1),
    # Pattern 6: Museum patterns
    (re.compile(r'Museum\s+(?:van\s+)?([A-Z][a-z]+(?:-[A-Z][a-z]+)*)', re.IGNORECASE), 1),
    # Pattern 7: City at start with separator
    (re.compile(r'^([A-Z][a-z]+(?:-[A-Z][a-z]+)*)\s*[-:]', re.IGNORECASE), 1),
]

# Common false positives to filter out: Dutch articles/prepositions and
# non-city words the patterns can capture by mistake.
FALSE_POSITIVES = {
    'de', 'De', 'het', 'Het', 'van', 'Van',
    'Vondel',  # Person name, not city
    'AS',  # Abbreviation
    'Koninklijk', 'Royal',
    'Provinciale', 'Provincial',
}


def extract_city_from_name(institution_name: str) -> Optional[str]:
    """
    Extract a city name from a Belgian institution name using pattern matching.

    Patterns are tried in CITY_PATTERNS order; the first capture that survives
    the false-positive filter and the capitalization check wins.

    Args:
        institution_name: Institution name to parse

    Returns:
        City name if found, None otherwise
    """
    for pattern, group_num in CITY_PATTERNS:
        match = pattern.search(institution_name)
        if not match:
            continue
        city = match.group(group_num).strip()
        # Filter known false positives
        if city in FALSE_POSITIVES:
            continue
        # Basic validation: city should start with uppercase.  The regexes run
        # with IGNORECASE, so capitalization is enforced here, not in the regex.
        if not city[0].isupper():
            continue
        return city
    return None


def enrich_belgian_locations():
    """
    Main enrichment function.

    Loads the Belgian ISIL registry, infers a city for each institution from
    its name, fills in missing Location.city fields (creating a Location with
    country "BE" when none exists), prints progress/statistics, and writes the
    enriched records to a YAML file.
    """
    print("=" * 70)
    print("Belgian Institution Location Enrichment")
    print("=" * 70)

    # Load Belgian institutions
    print("\n1. Loading Belgian institutions...")
    parser = BelgianISILParser()
    custodians = parser.parse_and_convert('data/isil/belgian_isil_detailed.csv')
    print(f" ✓ Loaded {len(custodians)} institutions")

    # Extract cities
    print("\n2. Extracting city names from institution names...")
    enriched_count = 0
    city_counts = Counter()  # frequency of every extracted city name

    for custodian in custodians:
        city = extract_city_from_name(custodian.name)
        if not city:
            continue
        if not custodian.locations:
            # No existing location (None or empty list) - create a new one.
            # Country is hard-coded to "BE": this script only handles the
            # Belgian ISIL registry.
            custodian.locations = [Location(city=city, country="BE")]
            enriched_count += 1
        else:
            # Update existing location only if its city is missing
            location = custodian.locations[0]
            if not location.city:
                location.city = city
                enriched_count += 1
        # Track city frequency even when no field was changed (matches the
        # original behavior: counts reflect extraction, not enrichment)
        city_counts[city] += 1

    print(f" ✓ Enriched {enriched_count} institutions with city data")
    print(f" ✓ Total institutions with locations: {sum(1 for c in custodians if c.locations)}")

    # Show city distribution
    print(f"\n3. City distribution (top 15):")
    for city, count in city_counts.most_common(15):
        print(f" {city:20} : {count} institutions")

    # Show enrichment examples
    print(f"\n4. Sample enriched records:")
    enriched_samples = [c for c in custodians if c.locations and c.locations[0].city][:5]
    for c in enriched_samples:
        print(f" {c.id}: {c.name[:45]:45} → {c.locations[0].city}")

    # Institutions without cities (no location at all, or first location
    # lacks a city)
    no_location = [c for c in custodians if not c.locations or not c.locations[0].city]
    print(f"\n5. Institutions without city data: {len(no_location)}")
    if no_location:
        print(f" Sample (first 5):")
        for c in no_location[:5]:
            print(f" {c.id}: {c.name[:60]}")

    # Export enriched data
    print(f"\n6. Exporting enriched YAML...")
    output_file = Path("data/instances/belgium_isil_institutions_enriched.yaml")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Belgian ISIL Registry Institutions (Location Enriched)\n")
        f.write("# Scraped from https://isil.kbr.be/ (Royal Library of Belgium)\n")
        f.write(f"# Total institutions: {len(custodians)}\n")
        f.write(f"# Institutions with location data: {sum(1 for c in custodians if c.locations)}\n")
        f.write(f"# Data tier: TIER_1_AUTHORITATIVE\n")
        f.write("#\n")
        f.write("---\n\n")

        for idx, custodian in enumerate(custodians, 1):
            yaml_str = yaml_dumper.dumps(custodian)
            f.write(yaml_str)
            f.write("\n")
            if idx % 50 == 0:
                print(f" ... exported {idx} institutions")

    file_size_kb = output_file.stat().st_size / 1024
    print(f" ✓ Exported to: {output_file}")
    print(f" ✓ File size: {file_size_kb:.1f} KB")

    # Summary statistics (counts hoisted so each sum is computed once)
    total = len(custodians)
    with_locations = sum(1 for c in custodians if c.locations)
    with_cities = sum(1 for c in custodians if c.locations and c.locations[0].city)
    print("\n" + "=" * 70)
    print("Enrichment Summary")
    print("=" * 70)
    print(f"Total institutions: {total}")
    print(f"With location data: {with_locations} ({with_locations/total*100:.1f}%)")
    print(f"With city names: {with_cities} ({with_cities/total*100:.1f}%)")
    print(f"Unique cities: {len(city_counts)}")
    # NOTE(review): this counts institutions lacking a *city*, not lacking a
    # location entirely — label kept as-is to preserve output byte-for-byte.
    print(f"Without location data: {len(no_location)} ({len(no_location)/total*100:.1f}%)")

    print("\n✓ Location enrichment complete!")


if __name__ == "__main__":
    enrich_belgian_locations()