glam/scripts/enrich_belgian_locations.py
2025-11-19 23:25:22 +01:00

199 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""
Infer city names from Belgian institution names and enrich location data.
This script:
1. Loads Belgian ISIL institutions from the ISIL CSV export (data/isil/belgian_isil_detailed.csv)
2. Extracts city names from institution names using pattern matching
3. Updates Location objects with inferred cities
4. Geocodes addresses using Nominatim (optional; not implemented in this script)
5. Re-exports enriched YAML
Pattern strategies:
- "Bibliotheek [City]" → City
- "Bibliotheek van [City]" → City
- "Bib [City]" → City (short form)
- "Stadsbibliotheek [City]" → City
- "Archief [City]" → City
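- Institution name contains parentheses with city info, e.g. "(Bib [City])" → City
- "Museum [City]" or "Museum van [City]" → City
- "[City] - ..." or "[City]: ..." (city at the start, followed by a separator) → City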
"""
import sys
import re
from pathlib import Path
from typing import Optional, List, Tuple
sys.path.insert(0, 'src')
from glam_extractor.parsers.belgian_isil import BelgianISILParser
from glam_extractor.models import Location
from linkml_runtime.dumpers import yaml_dumper
# Belgian city name patterns.
#
# Each entry is (compiled_regex, group_number); the numbered group captures the
# candidate city name. extract_city_from_name() tries the entries in list order.
#
# Every pattern is compiled with re.IGNORECASE so that keywords such as
# "bibliotheek" match in any casing. A side effect is that the regexes cannot
# enforce a leading capital on the captured name; extract_city_from_name()
# performs that check on the captured text instead.

# One word of a city name: two or more Unicode letters. "[^\W\d_]" means "a
# word character that is neither a digit nor an underscore", so accented
# letters are included. An ASCII-only class such as "[A-Z][a-z]+" would stop at
# the first accented letter and capture "Li" from "Liège".
_CITY_WORD = r'[^\W\d_]{2,}'

# A complete city name: one word, optionally followed by hyphen-joined words
# (e.g. "Knokke-Heist", "Sint-Niklaas").
_CITY_NAME = _CITY_WORD + r'(?:-' + _CITY_WORD + r')*'

CITY_PATTERNS = [
    # Pattern 1: "Bibliotheek [City]" or "Bibliotheek van [City]"
    # (also accepts the "... op den Berg" suffix, as in "Heist op den Berg")
    (re.compile(
        r'Bibliotheek(?:\s+van)?\s+(' + _CITY_NAME + r'(?:\s+op\s+den\s+Berg)?)',
        re.IGNORECASE,
    ), 1),
    # Pattern 2: "Bib [City]" (short form)
    (re.compile(
        r'\bBib\s+(' + _CITY_NAME + r')',
        re.IGNORECASE,
    ), 1),
    # Pattern 3: Parentheses with city info
    (re.compile(
        r'\((?:Bibliotheek|Bib)\s+(' + _CITY_NAME + r')\)',
        re.IGNORECASE,
    ), 1),
    # Pattern 4: Archive patterns
    (re.compile(
        r'(?:Archief|Archive)(?:\s+van)?\s+(' + _CITY_NAME + r')',
        re.IGNORECASE,
    ), 1),
    # Pattern 5: Stadsbibliotheek
    # (Pattern 1 has no leading word boundary, so it also matches inside
    # "Stadsbibliotheek"; this entry is kept as an explicit fallback.)
    (re.compile(
        r'Stadsbibliotheek\s+(' + _CITY_NAME + r')',
        re.IGNORECASE,
    ), 1),
    # Pattern 6: Museum patterns
    (re.compile(
        r'Museum\s+(?:van\s+)?(' + _CITY_NAME + r')',
        re.IGNORECASE,
    ), 1),
    # Pattern 7: City at start with separator ("-" or ":")
    (re.compile(
        r'^(' + _CITY_NAME + r')\s*[-:]',
        re.IGNORECASE,
    ), 1),
]
# Words the city patterns are known to capture that are not city names.
# Membership is tested with an exact, case-sensitive comparison, which is why
# the function words appear in both lower and title case.
FALSE_POSITIVES = {
    # Dutch articles / prepositions that follow the institution keyword
    "de",
    "De",
    "het",
    "Het",
    "van",
    "Van",
    # Person name, not city
    "Vondel",
    # Abbreviation
    "AS",
    # Institutional adjectives (Dutch and English forms)
    "Koninklijk",
    "Royal",
    "Provinciale",
    "Provincial",
}
def extract_city_from_name(institution_name: Optional[str]) -> Optional[str]:
    """
    Extract city name from Belgian institution name using pattern matching.

    The entries of CITY_PATTERNS are tried in order. For each pattern only its
    first match in the name is examined. A capture is skipped if it is listed
    in FALSE_POSITIVES or does not start with an uppercase letter, in which
    case the next pattern is tried.

    Args:
        institution_name: Institution name to parse. ``None`` or an empty
            string is accepted and yields ``None``.

    Returns:
        City name if found, None otherwise
    """
    # Nothing to search; also keeps a missing name from reaching
    # pattern.search(), which raises TypeError on None.
    if not institution_name:
        return None

    for pattern, group_num in CITY_PATTERNS:
        match = pattern.search(institution_name)
        if match is None:
            continue

        city = match.group(group_num).strip()

        # Filter false positives
        if city in FALSE_POSITIVES:
            continue

        # The patterns are compiled case-insensitively, so a lowercase word
        # can be captured; a city name must start with an uppercase letter.
        # Slicing (rather than indexing) keeps an empty capture from raising.
        if not city[:1].isupper():
            continue

        return city

    return None
def _percentage(part: int, total: int) -> float:
    """Return ``part`` as a percentage of ``total``, or 0.0 when ``total`` is 0."""
    return part / total * 100 if total else 0.0


def enrich_belgian_locations():
    """
    Main enrichment function.

    Steps, each reported on stdout:
    1. Parse the Belgian ISIL CSV into custodian records.
    2. Infer a city from each institution name and store it on the record's
       first location (creating a location with country "BE" if there is none).
       An existing city value is never overwritten.
    3. Print the 15 most frequently inferred cities.
    4. Print up to five records that have a city.
    5. Print up to five records that still lack a city.
    6. Write all records to the enriched YAML file and print a summary.

    Input and output paths are relative to the current working directory.
    """
    print("=" * 70)
    print("Belgian Institution Location Enrichment")
    print("=" * 70)

    # Load Belgian institutions
    print("\n1. Loading Belgian institutions...")
    parser = BelgianISILParser()
    custodians = parser.parse_and_convert('data/isil/belgian_isil_detailed.csv')
    total = len(custodians)
    print(f" ✓ Loaded {total} institutions")

    # Extract cities
    print("\n2. Extracting city names from institution names...")
    enriched_count = 0
    city_counts = {}
    for custodian in custodians:
        city = extract_city_from_name(custodian.name)
        if not city:
            continue

        if not custodian.locations:
            # No existing location - create new one
            custodian.locations = []
            custodian.locations.append(Location(city=city, country="BE"))
            enriched_count += 1
        else:
            # Update existing location if city is missing
            location = custodian.locations[0]
            if not location.city:
                location.city = city
                enriched_count += 1

        # Track city frequency. This counts every inferred city, including
        # ones that were not written because the record already had a city.
        city_counts[city] = city_counts.get(city, 0) + 1

    # Records are not modified past this point, so these counts are computed
    # once and reused by the report, the YAML header and the summary.
    with_location = sum(1 for c in custodians if c.locations)
    with_city = sum(1 for c in custodians if c.locations and c.locations[0].city)

    print(f" ✓ Enriched {enriched_count} institutions with city data")
    print(f" ✓ Total institutions with locations: {with_location}")

    # Show city distribution
    print("\n3. City distribution (top 15):")
    sorted_cities = sorted(city_counts.items(), key=lambda x: x[1], reverse=True)
    for city, count in sorted_cities[:15]:
        print(f" {city:20} : {count} institutions")

    # Show enrichment examples (first five records that have a city, whether
    # it was inferred here or already present)
    print("\n4. Sample enriched records:")
    enriched_samples = [c for c in custodians if c.locations and c.locations[0].city][:5]
    for c in enriched_samples:
        print(f" {c.id}: {c.name[:45]:45}{c.locations[0].city}")

    # Institutions without cities
    no_location = [c for c in custodians if not c.locations or not c.locations[0].city]
    print(f"\n5. Institutions without city data: {len(no_location)}")
    if no_location:
        print(" Sample (first 5):")
        for c in no_location[:5]:
            print(f" {c.id}: {c.name[:60]}")

    # Export enriched data
    print("\n6. Exporting enriched YAML...")
    output_file = Path("data/instances/belgium_isil_institutions_enriched.yaml")
    # Make sure the target directory exists so open() cannot fail on a fresh
    # checkout where data/instances has not been created yet.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Belgian ISIL Registry Institutions (Location Enriched)\n")
        f.write("# Scraped from https://isil.kbr.be/ (Royal Library of Belgium)\n")
        f.write(f"# Total institutions: {total}\n")
        f.write(f"# Institutions with location data: {with_location}\n")
        f.write("# Data tier: TIER_1_AUTHORITATIVE\n")
        f.write("#\n")
        f.write("---\n\n")
        # NOTE(review): each record is dumped on its own and appended after a
        # single "---" marker; confirm that downstream readers expect this
        # layout before changing it.
        for idx, custodian in enumerate(custodians, 1):
            yaml_str = yaml_dumper.dumps(custodian)
            f.write(yaml_str)
            f.write("\n")
            if idx % 50 == 0:
                print(f" ... exported {idx} institutions")

    file_size_kb = output_file.stat().st_size / 1024
    print(f" ✓ Exported to: {output_file}")
    print(f" ✓ File size: {file_size_kb:.1f} KB")

    # Summary statistics. _percentage() returns 0.0 for an empty input set
    # instead of dividing by zero.
    print("\n" + "=" * 70)
    print("Enrichment Summary")
    print("=" * 70)
    print(f"Total institutions: {total}")
    print(f"With location data: {with_location} ({_percentage(with_location, total):.1f}%)")
    print(f"With city names: {with_city} ({_percentage(with_city, total):.1f}%)")
    print(f"Unique cities: {len(city_counts)}")
    print(f"Without location data: {len(no_location)} ({_percentage(len(no_location), total):.1f}%)")
    print("\n✓ Location enrichment complete!")
# Run the enrichment only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    enrich_belgian_locations()