199 lines
7.2 KiB
Python
199 lines
7.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Infer city names from Belgian institution names and enrich location data.
|
|
|
|
This script:
|
|
1. Loads Belgian ISIL institutions from the detailed ISIL CSV (data/isil/belgian_isil_detailed.csv)
|
|
2. Extracts city names from institution names using pattern matching
|
|
3. Updates Location objects with inferred cities
|
|
4. Geocodes addresses using Nominatim (optional; not implemented in the code below)
|
|
5. Re-exports enriched YAML
|
|
|
|
Pattern strategies:
|
|
- "Bibliotheek [City]" → City
|
|
- "Bibliotheek van [City]" → City
|
|
- "Bib [City]" → City (short form)
|
|
- "Stadsbibliotheek [City]" → City
|
|
- "Archief [City]" → City
|
|
- Institution name contains parentheses with city info
|
|
"""
|
|
|
|
import sys
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Optional, List, Tuple
|
|
sys.path.insert(0, 'src')
|
|
|
|
from glam_extractor.parsers.belgian_isil import BelgianISILParser
|
|
from glam_extractor.models import Location
|
|
from linkml_runtime.dumpers import yaml_dumper
|
|
|
|
# Belgian city name patterns
#
# Each entry is a pair: (compiled regex, number of the capture group that
# holds the city). extract_city_from_name() tries them in list order, so
# earlier entries take precedence over later ones.
#
# All patterns are compiled with re.IGNORECASE, so they do NOT enforce
# capitalisation by themselves; the caller checks the first character.

# One "letter": any Unicode word character except digits and underscore.
# The previous ASCII-only class ([A-Z][a-z]+) cut names at the first accented
# character, e.g. "Liège" was captured as "Li".
_LETTER = r'[^\W\d_]'

# One city name: two or more letters, optionally followed by hyphenated parts
# of two or more letters each (e.g. "Sint-Niklaas").
_CITY_NAME = rf'{_LETTER}{{2,}}(?:-{_LETTER}{{2,}})*'

CITY_PATTERNS = [
    # Pattern 1: "Bibliotheek [City]" or "Bibliotheek van [City]"
    # (also accepts the multi-word suffix "op den Berg").
    # There is no leading word boundary, so this also matches inside
    # compound words such as "Stadsbibliotheek".
    (re.compile(rf'Bibliotheek(?:\s+van)?\s+({_CITY_NAME}(?:\s+op\s+den\s+Berg)?)', re.IGNORECASE), 1),

    # Pattern 2: "Bib [City]" (short form)
    (re.compile(rf'\bBib\s+({_CITY_NAME})', re.IGNORECASE), 1),

    # Pattern 3: Parentheses with city info, e.g. "(Bib Gent)"
    (re.compile(rf'\((?:Bibliotheek|Bib)\s+({_CITY_NAME})\)', re.IGNORECASE), 1),

    # Pattern 4: Archive patterns
    (re.compile(rf'(?:Archief|Archive)(?:\s+van)?\s+({_CITY_NAME})', re.IGNORECASE), 1),

    # Pattern 5: Stadsbibliotheek
    (re.compile(rf'Stadsbibliotheek\s+({_CITY_NAME})', re.IGNORECASE), 1),

    # Pattern 6: Museum patterns
    (re.compile(rf'Museum\s+(?:van\s+)?({_CITY_NAME})', re.IGNORECASE), 1),

    # Pattern 7: City at start with separator ("-" or ":")
    (re.compile(rf'^({_CITY_NAME})\s*[-:]', re.IGNORECASE), 1),
]
|
|
|
|
# Tokens the city patterns may capture that are not city names.
# Lookups against this set are exact (case-sensitive) string matches.
FALSE_POSITIVES = {
    # Dutch articles and prepositions
    'de',
    'De',
    'het',
    'Het',
    'van',
    'Van',
    # Person name, not a city
    'Vondel',
    # Abbreviation
    'AS',
    # Institutional adjectives
    'Koninklijk',
    'Royal',
    'Provinciale',
    'Provincial',
}
|
|
|
|
def extract_city_from_name(institution_name: str) -> Optional[str]:
    """
    Extract city name from Belgian institution name using pattern matching.

    The patterns in CITY_PATTERNS are tried in list order. For each pattern
    only its first match in the name is examined; if that candidate is
    rejected, the next pattern is tried. The first accepted candidate wins.

    A candidate is rejected when it is listed in FALSE_POSITIVES (exact,
    case-sensitive comparison) or when it does not start with an uppercase
    character.

    Args:
        institution_name: Institution name to parse. None or an empty
            string is accepted and yields None.

    Returns:
        City name if found, None otherwise
    """
    # Records without a name would otherwise make pattern.search() raise
    # TypeError; treat them as "no city found".
    if not institution_name:
        return None

    for pattern, group_num in CITY_PATTERNS:
        match = pattern.search(institution_name)
        if match is None:
            continue

        city = match.group(group_num).strip()

        # Filter false positives
        if not city or city in FALSE_POSITIVES:
            continue

        # Basic validation: city should start with uppercase. The patterns
        # are case-insensitive, so this is where capitalisation is enforced.
        if not city[0].isupper():
            continue

        return city

    return None
|
|
|
|
|
|
def enrich_belgian_locations():
    """
    Main enrichment function.

    Steps, each reported on stdout:
      1. Parse institutions from 'data/isil/belgian_isil_detailed.csv'.
      2. Infer a city from every institution name and store it on the
         institution's first location (creating one with country "BE" when
         the institution has no location yet). An existing city value is
         never overwritten.
      3. Print the 15 most frequently inferred cities.
      4. Print up to five institutions that have a city.
      5. Print up to five institutions that still have no city.
      6. Write all institutions to
         'data/instances/belgium_isil_institutions_enriched.yaml'.
    A summary with totals and percentages is printed at the end.

    Returns:
        None
    """

    def _percent(part, total):
        """Return part/total as a percentage; 0.0 when total is zero."""
        return part / total * 100 if total else 0.0

    print("=" * 70)
    print("Belgian Institution Location Enrichment")
    print("=" * 70)

    # Load Belgian institutions
    print("\n1. Loading Belgian institutions...")
    parser = BelgianISILParser()
    custodians = parser.parse_and_convert('data/isil/belgian_isil_detailed.csv')
    total = len(custodians)
    print(f" ✓ Loaded {total} institutions")

    # Extract cities
    print("\n2. Extracting city names from institution names...")
    enriched_count = 0
    city_counts = {}

    for custodian in custodians:
        city = extract_city_from_name(custodian.name)
        if not city:
            continue

        if not custodian.locations:
            # No existing location - create new one
            custodian.locations = []
            custodian.locations.append(Location(city=city, country="BE"))
            enriched_count += 1
        else:
            # Update existing location if city is missing
            location = custodian.locations[0]
            if not location.city:
                location.city = city
                enriched_count += 1

        # Track city frequency. This counts every inferred city, including
        # those not written because the location already had a city.
        city_counts[city] = city_counts.get(city, 0) + 1

    # Nothing below mutates the institutions, so these counts are computed
    # once and reused for the progress line, the file header and the summary.
    with_locations = sum(1 for c in custodians if c.locations)
    with_city = sum(1 for c in custodians if c.locations and c.locations[0].city)

    print(f" ✓ Enriched {enriched_count} institutions with city data")
    print(f" ✓ Total institutions with locations: {with_locations}")

    # Show city distribution
    print("\n3. City distribution (top 15):")
    sorted_cities = sorted(city_counts.items(), key=lambda x: x[1], reverse=True)
    for city_name, count in sorted_cities[:15]:
        print(f" {city_name:20} : {count} institutions")

    # Show enrichment examples
    print("\n4. Sample enriched records:")
    enriched_samples = [c for c in custodians if c.locations and c.locations[0].city][:5]
    for c in enriched_samples:
        print(f" {c.id}: {c.name[:45]:45} → {c.locations[0].city}")

    # Institutions without cities
    no_location = [c for c in custodians if not c.locations or not c.locations[0].city]
    print(f"\n5. Institutions without city data: {len(no_location)}")
    if no_location:
        print(" Sample (first 5):")
        for c in no_location[:5]:
            print(f" {c.id}: {c.name[:60]}")

    # Export enriched data
    print("\n6. Exporting enriched YAML...")
    output_file = Path("data/instances/belgium_isil_institutions_enriched.yaml")
    # Create the target directory so a fresh checkout does not fail on open().
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Belgian ISIL Registry Institutions (Location Enriched)\n")
        f.write("# Scraped from https://isil.kbr.be/ (Royal Library of Belgium)\n")
        f.write(f"# Total institutions: {total}\n")
        f.write(f"# Institutions with location data: {with_locations}\n")
        f.write("# Data tier: TIER_1_AUTHORITATIVE\n")
        f.write("#\n")
        f.write("---\n\n")

        for idx, custodian in enumerate(custodians, 1):
            yaml_str = yaml_dumper.dumps(custodian)
            f.write(yaml_str)
            f.write("\n")

            if idx % 50 == 0:
                print(f" ... exported {idx} institutions")

    file_size_kb = output_file.stat().st_size / 1024
    print(f" ✓ Exported to: {output_file}")
    print(f" ✓ File size: {file_size_kb:.1f} KB")

    # Summary statistics (percentages fall back to 0.0 for an empty dataset
    # instead of raising ZeroDivisionError)
    print("\n" + "=" * 70)
    print("Enrichment Summary")
    print("=" * 70)
    print(f"Total institutions: {total}")
    print(f"With location data: {with_locations} ({_percent(with_locations, total):.1f}%)")
    print(f"With city names: {with_city} ({_percent(with_city, total):.1f}%)")
    print(f"Unique cities: {len(city_counts)}")
    print(f"Without location data: {len(no_location)} ({_percent(len(no_location), total):.1f}%)")

    print("\n✓ Location enrichment complete!")
|
|
|
|
|
|
# Script entry point: run the enrichment only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    enrich_belgian_locations()
|