3391 lines
125 KiB
Python
3391 lines
125 KiB
Python
"""
|
|
PostGIS Geo API for Heritage Custodian Map
|
|
FastAPI backend providing spatial queries for bronhouder.nl map
|
|
|
|
Mounted at /api/geo/ via Caddy reverse proxy.
|
|
|
|
Endpoints:
|
|
- GET / - Health check and geo statistics
|
|
- GET /countries - Get all countries as GeoJSON with institution counts
|
|
- GET /provinces - Get all provinces as GeoJSON
|
|
- GET /municipalities - Get municipalities (with filters)
|
|
- GET /institutions - Get institutions as GeoJSON (with bbox/type filters)
|
|
- GET /institution/:ghcid - Get single institution details
|
|
- GET /historical - Get historical boundaries
|
|
- GET /search - Search institutions by name
|
|
- GET /admin/point - Find admin unit for a point
|
|
- GET /nearby - Find institutions near a point
|
|
- GET /stats/by-type - Institution counts by type
|
|
- GET /stats/by-province - Institution counts by province
|
|
|
|
Person Endpoints (Beschermers):
|
|
- GET /persons - List persons with filters (custodian, type, country)
|
|
- GET /persons/count - Get total person count for stats
|
|
- GET /persons/search - Search persons by name/headline/custodian
|
|
- GET /person/:staff_id - Get single person details
|
|
"""
|
|
|
|
import os
|
|
import asyncio
|
|
import json
|
|
from datetime import datetime
|
|
from typing import Optional, List, Dict, Any
|
|
from contextlib import asynccontextmanager
|
|
from decimal import Decimal
|
|
|
|
from fastapi import FastAPI, HTTPException, Query, APIRouter
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from fastapi.middleware.gzip import GZipMiddleware
|
|
from fastapi.responses import JSONResponse, Response
|
|
from pydantic import BaseModel, Field
|
|
import asyncpg
|
|
import httpx
|
|
import hashlib
|
|
import aiosqlite
|
|
from urllib.parse import urlparse
|
|
from pathlib import Path
|
|
|
|
|
|
# ============================================================================
|
|
# Configuration
|
|
# ============================================================================
|
|
|
|
class GeoSettings(BaseModel):
    """PostGIS geo database settings - connects to glam_geo with PostGIS boundaries.

    All values are read from environment variables at import time, with
    local-development defaults. The password intentionally defaults to empty.
    """
    # PostgreSQL/PostGIS connection parameters
    host: str = os.getenv("GEO_POSTGRES_HOST", "localhost")
    port: int = int(os.getenv("GEO_POSTGRES_PORT", "5432"))
    database: str = os.getenv("GEO_POSTGRES_DB", "glam_geo")  # glam_geo has boundary data
    user: str = os.getenv("GEO_POSTGRES_USER", "glam_api")
    password: str = os.getenv("GEO_POSTGRES_PASSWORD", "")

    # Server settings (bind address/port for the API process)
    api_host: str = os.getenv("GEO_API_HOST", "0.0.0.0")
    api_port: int = int(os.getenv("GEO_API_PORT", "8002"))

    # GeoNames database path (SQLite); default resolves relative to this file:
    # <repo>/data/reference/geonames.db
    geonames_db: str = os.getenv(
        "GEONAMES_DB_PATH",
        str(Path(__file__).parent.parent.parent / "data" / "reference" / "geonames.db")
    )


# Module-level singleton; environment is read once at import time
settings = GeoSettings()
|
|
|
|
|
|
# ============================================================================
|
|
# Pydantic Models
|
|
# ============================================================================
|
|
|
|
class GeoStatsResponse(BaseModel):
    """Geo database statistics.

    Returned by the health-check endpoint (GET /) per the module docstring.
    The four integer fields are row counts for the main spatial tables.
    """
    status: str
    database: str
    provinces: int
    municipalities: int
    institutions: int
    historical_boundaries: int
    postgres_version: str
|
|
|
|
|
|
class InstitutionDetail(BaseModel):
    """Detailed institution information.

    Optional fields carry an explicit ``= None`` default so the model
    validates when a source row lacks those values. (In Pydantic v2 an
    ``Optional[...]`` annotation without a default is a *required* field;
    only v1 treated it as implicitly defaulting to None.)
    """
    ghcid: str   # institution identifier (see GET /institution/:ghcid)
    name: str
    type: str    # single-letter type code; type_name is the readable label
    type_name: Optional[str] = None
    lat: Optional[float] = None
    lon: Optional[float] = None
    address: Optional[str] = None
    city: Optional[str] = None
    province: Optional[str] = None
    website: Optional[str] = None
    phone: Optional[str] = None
    wikidata_id: Optional[str] = None
    rating: Optional[float] = None
    total_ratings: Optional[int] = None
    description: Optional[str] = None
    reviews: Optional[List[Dict]] = None
    genealogiewerkbalk: Optional[Dict] = None
|
|
|
|
|
|
class AdminPoint(BaseModel):
    """Admin unit for a point.

    Every field defaults to None: a queried point may fall outside any
    known province or municipality boundary. Explicit defaults also keep
    the fields optional under Pydantic v2 (where Optional without a
    default is required).
    """
    province_code: Optional[str] = None
    province_name: Optional[str] = None
    municipality_code: Optional[str] = None
    municipality_name: Optional[str] = None
|
|
|
|
|
|
class NearbyInstitution(BaseModel):
    """Institution with distance from the query point (GET /nearby).

    Optional fields default to None explicitly so they stay optional
    under Pydantic v2.
    """
    ghcid: str
    name: str
    type: str
    type_name: Optional[str] = None
    distance_km: float              # distance from the query point
    city: Optional[str] = None
    province: Optional[str] = None
    rating: Optional[float] = None
|
|
|
|
|
|
class PersonSummary(BaseModel):
    """Summary person information for list views (GET /persons).

    Optional fields default to None explicitly (required otherwise under
    Pydantic v2). The list field uses ``default_factory`` — the idiomatic
    way to declare a mutable default.
    """
    staff_id: str
    name: str
    headline: Optional[str] = None
    location: Optional[str] = None
    country_code: Optional[str] = None
    custodian_slug: Optional[str] = None
    custodian_name: Optional[str] = None
    linkedin_url: Optional[str] = None
    profile_image_url: Optional[str] = None
    heritage_relevant: bool = True
    heritage_types: List[str] = Field(default_factory=list)
|
|
|
|
|
|
class PersonDetail(BaseModel):
    """Detailed person information (GET /person/:staff_id).

    Superset of PersonSummary. Optional fields default to None explicitly
    (required otherwise under Pydantic v2); list fields use
    ``default_factory`` — the idiomatic way to declare a mutable default.
    """
    staff_id: str
    name: str
    headline: Optional[str] = None
    location: Optional[str] = None
    country_code: Optional[str] = None
    custodian_slug: Optional[str] = None
    custodian_name: Optional[str] = None
    linkedin_url: Optional[str] = None
    profile_image_url: Optional[str] = None
    heritage_relevant: bool = True
    heritage_types: List[str] = Field(default_factory=list)
    experience: List[Dict] = Field(default_factory=list)
    education: List[Dict] = Field(default_factory=list)
    skills: List[str] = Field(default_factory=list)
    languages: List[Dict] = Field(default_factory=list)
    about: Optional[str] = None
    connections: Optional[str] = None
    # Provenance of the scraped/extracted profile data
    extraction_date: Optional[str] = None
    extraction_method: Optional[str] = None
    source_file: Optional[str] = None
|
|
|
|
|
|
class PlaceLookupResponse(BaseModel):
    """Response from GeoNames place lookup.

    Coordinates are WGS84; bbox is [minLon, minLat, maxLon, maxLat].
    """
    name: str
    latitude: float
    longitude: float
    province: Optional[str] = None
    population: Optional[int] = None
    # No default on purpose: bbox is a required field; Field() is used only
    # to attach the description
    bbox: List[float] = Field(
        description="Bounding box [minLon, minLat, maxLon, maxLat]"
    )
    # Lookup currently defaults to the Netherlands
    country_code: str = "NL"
|
|
|
|
|
|
# ============================================================================
|
|
# Heritage Classification (copied from main.py for experience item classification)
|
|
# ============================================================================
|
|
|
|
import re
|
|
|
|
# Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy
# Complete 19-type taxonomy extracted from Wikidata hyponyms + domain expertise
# G=Gallery, L=Library, A=Archive, M=Museum, O=Official, R=Research, C=Corporation,
# U=Unspecified, B=Bio-custodian, E=Education, S=Society, F=Feature, I=Intangible,
# X=Mixed, P=Personal, H=Holy, D=Digital, N=Non-profit, T=Taste/Scent
#
# NOTE: detect_heritage_type() matches these as case-insensitive *substrings*
# of "role company". Entries with trailing spaces (e.g. 'KB ', 'EYE ', 'IT ')
# appear to rely on that space as a crude word boundary — confirm before editing.
# 'NIOD' is intentionally listed under both 'A' and 'R'; 'A' wins because it is
# checked first in the type order.
HERITAGE_KEYWORDS = {
    # G - Gallery: Art galleries, exhibition spaces, kunsthallen
    'G': ['gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery', 'exhibition space',
          'art center', 'art centre', 'art collection', 'art collective', 'art colony',
          'kunstgalerie', 'kunstcentrum', 'tentoonstellingsruimte', 'alternative space',
          'sculpture garden', 'beeldentuin', 'kunsthuis', 'artist-run'],

    # L - Library: Public, academic, and specialized libraries
    'L': ['library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris', 'KB ',
          'national library', 'academic library', 'public library', 'openbare bibliotheek',
          'university library', 'universiteitsbibliotheek', 'reading room', 'leeszaal',
          'book collection', 'boekencollectie', 'mediatheek', 'mediatheque', 'lending library',
          'special collections', 'bijzondere collecties', 'manuscript library'],

    # A - Archive: Government, corporate, and personal archives
    'A': ['archive', 'archief', 'archivist', 'archivaris', 'archival', 'beeld en geluid',
          'beeld & geluid', 'NISV', 'filmmuseum', 'eye film', 'EYE ', 'audiovisual',
          'nationaal archief', 'stadsarchief', 'gemeentearchief', 'rijksarchief', 'NIOD',
          'IISH', 'IISG', 'archiefspecialist', 'records management', 'documentbeheer',
          'film archive', 'filmarchief', 'sound archive', 'geluidsarchief', 'photo archive',
          'fotoarchief', 'regional archive', 'regionaal archief', 'provincial archive',
          'provinciaal archief', 'family archive', 'familiearchief', 'ecclesiastical archive',
          'kerkelijk archief', 'notarial archive', 'notarieel archief', 'depot'],

    # M - Museum: All museum types (art, history, science, etc.)
    'M': ['museum', 'musea', 'curator', 'conservator', 'collection manager', 'rijksmuseum',
          'van gogh', 'stedelijk', 'mauritshuis', 'tropenmuseum', 'allard pierson', 'museale',
          'collectiebeheerder', 'collectiespecialist', 'collectie', 'archaeological museum',
          'archeologisch museum', 'natural history museum', 'natuurhistorisch museum',
          'science museum', 'wetenschapsmuseum', 'art museum', 'kunstmuseum', 'history museum',
          'historisch museum', 'maritime museum', 'maritiem museum', 'military museum',
          'militair museum', 'open air museum', 'openluchtmuseum', 'heritage museum',
          'erfgoedmuseum', 'volkskundemuseum', 'ethnographic museum'],

    # O - Official Institution: Government heritage agencies, platforms
    'O': ['ministry', 'ministerie', 'government', 'overheid', 'gemeente', 'province',
          'provincie', 'OCW', 'cultural ministry', 'heritage agency', 'erfgoedinstelling',
          'rijksdienst', 'cultural heritage agency', 'RCE', 'rijksdienst cultureel erfgoed',
          'heritage platform', 'erfgoedplatform', 'state archives', 'public records office',
          'national heritage', 'monumentenzorg', 'heritage council', 'erfgoedraad'],

    # R - Research Center: Research institutes, documentation centers
    'R': ['research', 'onderzoek', 'researcher', 'onderzoeker', 'KNAW', 'humanities cluster',
          'NWO', 'documentatie', 'documentation', 'kenniscentrum', 'historicus',
          'research institute', 'onderzoeksinstituut', 'documentation center',
          'documentatiecentrum', 'scientific archive', 'wetenschappelijk archief',
          'study center', 'studiecentrum', 'research library', 'onderzoeksbibliotheek',
          'think tank', 'knowledge institute', 'kennisinstituut', 'meertens', 'huygens',
          'NIOD', 'fryske akademy'],

    # C - Corporation: Corporate heritage collections, company archives
    'C': ['corporate archive', 'bedrijfsarchief', 'company history', 'company museum',
          'bedrijfsmuseum', 'industrial heritage', 'industrieel erfgoed', 'brand archive',
          'merkarchief', 'corporate collection', 'bedrijfscollectie', 'factory museum',
          'fabrieksmuseum', 'business archive', 'ondernemingsarchief'],

    # U - Unspecified: Data quality flag, no keywords (intentional)
    'U': [],

    # B - Bio-custodian: Botanical gardens, zoos, aquariums, nature reserves
    'B': ['botanical garden', 'botanische tuin', 'hortus', 'arboretum', 'zoo', 'dierentuin',
          'aquarium', 'vivarium', 'terrarium', 'insectarium', 'butterfly house', 'vlindertuin',
          'nature reserve', 'natuurreservaat', 'wildlife park', 'wildpark', 'safari park',
          'bird sanctuary', 'vogelreservaat', 'marine park', 'oceanarium', 'dolphinarium',
          'seed bank', 'zaadbank', 'herbarium', 'alpine garden', 'alpentuin', 'reptile house',
          'reptielenhuis', 'animal park', 'dierenpark', 'nature museum', 'natuurmuseum',
          'plant collection', 'plantencollectie', 'living collection', 'levende collectie'],

    # E - Education Provider: Universities, academies, training centers
    'E': ['university', 'universiteit', 'professor', 'lecturer', 'docent', 'hogeschool',
          'academy', 'academie', 'PhD', 'phd candidate', 'student', 'teacher', 'onderwijs',
          'education', 'UvA', 'VU ', 'leiden university', 'reinwardt', 'film academy',
          'graduate', 'assistant professor', 'associate professor', 'hoogleraar', 'educatie',
          'educator', 'art school', 'kunstacademie', 'music conservatory', 'conservatorium',
          'training center', 'opleidingscentrum', 'museum studies', 'archival science',
          'archiefwetenschap', 'library science', 'bibliotheconomie', 'heritage studies'],

    # S - Society: Historical societies, collecting societies, associations
    'S': ['society', 'vereniging', 'genootschap', 'historical society', 'historische vereniging',
          'heemkundige kring', 'oudheidkundige kring', 'archaeological society',
          'archeologische vereniging', 'numismatic society', 'numismatische vereniging',
          'philatelic society', 'filatelistenvereniging', 'friends of', 'vrienden van',
          'heritage association', 'erfgoedvereniging', 'collectors club', 'verzamelclub',
          'antiquarian society', 'local history society', 'family history society',
          'genealogische vereniging', 'alumni association', 'oud-ledenvereniging'],

    # F - Feature: Monuments, sculptures, heritage sites, landmarks
    'F': ['monument', 'memorial', 'statue', 'sculpture', 'landmark', 'heritage site',
          'erfgoedlocatie', 'archaeological site', 'archeologische vindplaats', 'castle',
          'kasteel', 'manor', 'landgoed', 'mansion', 'landhuis', 'buitenplaats', 'palace',
          'paleis', 'fortress', 'fort', 'vesting', 'ruins', 'ruïne', 'cemetery', 'begraafplaats',
          'war memorial', 'oorlogsmonument', 'historic building', 'historisch gebouw',
          'windmill', 'molen', 'industrial monument', 'industrieel monument', 'watermill',
          'watermolen', 'lighthouse', 'vuurtoren', 'bunker', 'aqueduct', 'aquaduct'],

    # I - Intangible Heritage: Folklore, oral history, performing traditions
    'I': ['intangible heritage', 'immaterieel erfgoed', 'oral history', 'oral tradition',
          'folklore', 'volkscultuur', 'traditional music', 'traditionele muziek',
          'folk dance', 'volksdans', 'ritual', 'ritueel', 'craft tradition', 'ambachtstraditie',
          'storytelling', 'verteltraditie', 'carnival', 'carnaval', 'festival', 'feast',
          'feest', 'procession', 'processie', 'dialect', 'streektaal', 'living heritage',
          'levend erfgoed', 'performing arts', 'podiumkunsten', 'traditional craft',
          'traditioneel ambacht', 'rederijkerskamer', 'chamber of rhetoric'],

    # X - Mixed: Umbrella organizations, federations, multi-type institutions
    'X': ['umbrella organization', 'koepelorganisatie', 'federation', 'federatie',
          'consortium', 'netwerk', 'network', 'platform', 'overkoepelende organisatie',
          'dachverband', 'roof organization', 'combined museum and archive',
          'gecombineerde instelling', 'multi-disciplinary', 'multidisciplinair'],

    # P - Personal Collection: Private collections, estates, personal archives
    'P': ['personal collection', 'persoonlijke collectie', 'private collection',
          'privécollectie', 'private archive', 'privéarchief', 'personal archive',
          'persoonlijk archief', 'estate', 'nalatenschap', 'nachlass', 'literary estate',
          'literaire nalatenschap', 'private library', 'privébibliotheek', 'home library',
          'thuisbibliotheek', 'collector', 'verzamelaar', 'private museum', 'huismuseum'],

    # H - Holy Site: Religious heritage sites, sacred places with collections
    'H': ['church', 'kerk', 'cathedral', 'kathedraal', 'monastery', 'klooster', 'abbey',
          'abdij', 'temple', 'tempel', 'mosque', 'moskee', 'synagogue', 'synagoge',
          'shrine', 'schrijn', 'chapel', 'kapel', 'pilgrimage', 'bedevaart', 'convent',
          'religious heritage', 'religieus erfgoed', 'sacred site', 'heilige plaats',
          'treasury', 'schatkamer', 'church archive', 'kerkarchief', 'parish records',
          'parochieregister', 'diocesan', 'bisdom', 'ecclesiastical', 'kerkelijk'],

    # D - Digital Platform: Digital heritage platforms, repositories.
    # Only applied when the organization is independently recognized as a
    # heritage org (see detect_heritage_type) — prevents generic IT roles
    # from classifying as heritage.
    'D': ['digital', 'digitaal', 'platform', 'software', 'IT ', 'tech', 'developer',
          'engineer', 'data ', 'AI ', 'machine learning', 'digitalisering', 'datamanagement',
          'data analist', 'digital archive', 'digitaal archief', 'digital library',
          'digitale bibliotheek', 'digital repository', 'online collection', 'aggregator',
          'portal', 'portaal', 'database', 'metadata', 'linked data', 'semantic web',
          'digitization', 'digitalisatie', 'digital preservation', 'digitale conservering',
          'web archive', 'webarchief', 'born digital'],

    # N - Non-profit: Heritage NGOs, advocacy groups, foundations
    'N': ['foundation', 'stichting', 'non-profit', 'NGO', 'charity', 'trust',
          'advocacy', 'belangenbehartiging', 'heritage foundation', 'erfgoedstichting',
          'cultural foundation', 'culturele stichting', 'preservation society',
          'monumentenstichting', 'heritage trust', 'conservation trust', 'friends foundation',
          'vriendenstichting', 'support organization', 'steunorganisatie', 'ANBI',
          'algemeen nut beogende instelling', 'cultural council', 'cultuurfonds'],

    # T - Taste/Scent Heritage: Culinary, brewing, olfactory heritage
    'T': ['brewery', 'brouwerij', 'distillery', 'distilleerderij', 'winery', 'wijnhuis',
          'chocolate factory', 'chocoladefabriek', 'culinary heritage', 'culinair erfgoed',
          'food museum', 'voedselmuseum', 'perfumery', 'parfumerie', 'spice museum',
          'specerijmuseum', 'coffee museum', 'koffiemuseum', 'tea museum', 'theemuseum',
          'cheese museum', 'kaasmuseum', 'wine museum', 'wijnmuseum', 'beer museum',
          'biermuseum', 'gastronomy', 'gastronomie', 'traditional recipe', 'historic recipe',
          'jenever', 'genever', 'liqueur', 'likeur'],
}
|
|
|
|
# Role/industry terms that disqualify a position from heritage relevance.
# detect_heritage_type() strips each entry and matches it with a
# \b-bounded regex, so trailing spaces here (e.g. 'HR ') are effectively
# ignored at match time.
NON_HERITAGE_KEYWORDS = [
    'marketing', 'sales', 'HR ', 'human resources', 'recruiter', 'finance', 'accounting',
    'legal', 'lawyer', 'advocaat', 'consultant', 'coach', 'therapy', 'health', 'medical',
    'food', 'restaurant', 'retail', 'fashion', 'real estate', 'insurance', 'banking',
    'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse'
]
|
|
|
|
# Organizations that are explicitly NOT heritage institutions (blocklist).
# detect_heritage_type() strips each entry and matches it with a
# \b-bounded regex against the lowercased "role company" text, so the
# trailing spaces below (e.g. 'ing ', 'sp ') only serve as visual
# disambiguation in this list.
NON_HERITAGE_ORGANIZATIONS = [
    # Banks & Financial
    'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos',
    # Security companies
    'i-sec', 'g4s', 'securitas', 'trigion', 'chubb',
    # Police/Government (non-cultural)
    'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie',
    # Political parties
    'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt',
    'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ',
    # Tech companies (non-heritage)
    'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix',
    'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird',
    'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat',
    # Telecom
    'kpn', 'vodafone', 't-mobile', 'ziggo',
    # Postal / Logistics
    'postnl', 'postkantoren', 'dhl', 'ups', 'fedex',
    # Healthcare
    'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg',
    # Retail
    'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action',
    # Consulting / Professional services
    'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg',
    'accenture', 'capgemini', 'ordina', 'atos', 'cgi ',
    # Recruitment / HR
    'randstad', 'tempo-team', 'manpower', 'hays', 'brunel',
    # Energy / Utilities
    'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon',
    # Transport
    'ns ', 'prorail', 'schiphol', 'klm', 'transavia',
    # Other (self-employment markers)
    'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf',
]
|
|
|
|
# Heritage organization keywords - organizations that ARE heritage institutions.
# Matched as case-insensitive substrings in detect_heritage_type(); a hit here
# is what allows the 'D' (Digital) type to apply at all.
HERITAGE_ORGANIZATION_KEYWORDS = [
    # Archives
    'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief',
    'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg',
    # Museums
    'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis',
    'tropenmuseum', 'allard pierson', 'kröller', 'boijmans',
    # Libraries
    'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ',
    # Film/AV heritage
    'eye film', 'filmmuseum', 'eye ', 'sound and vision',
    # Heritage platforms
    'erfgoed', 'heritage', 'cultural', 'cultureel',
    # Research institutes (heritage-focused)
    'knaw', 'humanities cluster', 'meertens', 'huygens',
]
|
|
|
|
|
|
def detect_heritage_type(role: Optional[str], company: Optional[str]) -> tuple:
    """
    Detect whether a position is heritage-relevant and, if so, which type.

    Two-stage classification:
    1. Reject anything matching the explicit non-heritage blocklists
       (organizations, then role keywords), using word-boundary regexes
       to avoid false positives like "sharing" matching "ing".
    2. Scan the heritage keyword lists in a fixed priority order; 'D'
       (Digital) is handled separately and only fires when the text also
       matches a known heritage organization — generic IT workers at
       banks/police must not classify as heritage.

    Args:
        role: Job title/role text (may be None)
        company: Company/organization name (may be None)

    Returns:
        Tuple of (heritage_relevant: bool, heritage_type: Optional[str]);
        (True, None) means generic heritage relevance without a specific type.
    """
    haystack = f"{role or ''} {company or ''}".lower()
    if not haystack.strip():
        return (False, None)

    def _blocked(term: str) -> bool:
        # Word-boundary match of a (stripped, lowercased) blocklist entry
        return re.search(r'\b' + re.escape(term.lower().strip()) + r'\b', haystack) is not None

    # Stage 1: explicit non-heritage organizations
    if any(_blocked(entry) for entry in NON_HERITAGE_ORGANIZATIONS):
        return (False, None)

    # Stage 2: non-heritage role indicators
    if any(_blocked(entry) for entry in NON_HERITAGE_KEYWORDS):
        return (False, None)

    # Stage 3: is the organization itself a known heritage org?
    heritage_org = any(term.lower() in haystack for term in HERITAGE_ORGANIZATION_KEYWORDS)

    # Typed heritage keywords, most specific types first ('D' deliberately
    # excluded from this pass)
    for code in ('A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E'):
        if any(kw.lower() in haystack for kw in HERITAGE_KEYWORDS.get(code, [])):
            return (True, code)

    # 'D' (Digital) only counts inside a heritage organization
    if heritage_org and any(kw.lower() in haystack for kw in HERITAGE_KEYWORDS.get('D', [])):
        return (True, 'D')

    # Generic heritage terms: relevant, but no specific type
    for kw in ('heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema',
               'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation',
               'collection'):
        if kw in haystack:
            return (True, None)

    return (False, None)
|
|
|
|
|
|
# ============================================================================
|
|
# Embedding-Based Heritage Type Classification
|
|
# ============================================================================
|
|
# Uses pre-computed prototype embeddings from Wikidata hyponyms to classify
|
|
# institution names/descriptions into GLAMORCUBESFIXPHDNT types via semantic
|
|
# similarity. Complements keyword-based detection for ambiguous/multilingual text.
|
|
#
|
|
# TODO: In future, generate prototypes from LinkML enums (e.g., ArchiveOrganisationType)
|
|
# instead of static NPZ file, to automatically pick up schema changes.
|
|
# ============================================================================
|
|
|
|
import numpy as np
|
|
from functools import lru_cache
|
|
|
|
# Global variables for lazy-loaded embedding resources. All start as None
# and are populated on first use by _load_type_prototypes() /
# _get_embedding_model(); each load happens at most once per process.
_type_prototypes: dict | None = None          # {type_code: 384-dim embedding vector}
_type_codes: list | None = None               # type codes in prototype-matrix row order
_prototype_matrix: np.ndarray | None = None   # Pre-stacked for fast batch comparison
_st_model = None  # Sentence transformer model (all-MiniLM-L6-v2)
|
|
|
|
|
|
def _load_type_prototypes() -> tuple[dict[str, np.ndarray], list[str], np.ndarray]:
    """
    Load (once) the pre-computed type prototype embeddings from the NPZ file.

    Returns:
        Tuple of (prototypes_dict, type_codes_list, prototype_matrix):
        - prototypes_dict: {type_code: embedding_vector}
        - type_codes_list: ['A', 'B', 'C', ...]
        - prototype_matrix: (19, 384) array for vectorized comparison

    Raises:
        FileNotFoundError: if the NPZ artifact has not been generated yet.

    TODO: In future, generate prototypes from LinkML enums (e.g., ArchiveOrganisationType)
    instead of static NPZ file, to automatically pick up schema changes.
    """
    global _type_prototypes, _type_codes, _prototype_matrix

    if _type_prototypes is None:
        npz_path = (Path(__file__).parent.parent.parent
                    / "data/wikidata/GLAMORCUBEPSXHFN/type_prototypes_minilm_384.npz")
        if not npz_path.exists():
            raise FileNotFoundError(
                f"Type prototypes not found at {npz_path}. "
                "Run: python scripts/generate_type_prototypes.py"
            )

        archive = np.load(npz_path)
        codes: list[str] = archive["type_codes"].tolist()
        matrix: np.ndarray = archive["prototypes"]  # Shape: (19, 384)

        # Cache in all three shapes: per-code dict, ordered code list, and
        # the raw matrix for vectorized similarity computation
        _type_prototypes = dict(zip(codes, matrix))
        _type_codes = codes
        _prototype_matrix = matrix

    # Narrow Optionals for the type checker — all three were set above
    assert _type_prototypes is not None
    assert _type_codes is not None
    assert _prototype_matrix is not None

    return _type_prototypes, _type_codes, _prototype_matrix
|
|
|
|
|
|
def _get_embedding_model():
    """
    Return the process-wide sentence-transformers model, loading it lazily.

    Uses all-MiniLM-L6-v2 (384-dim embeddings); the instance is cached in a
    module global so later calls are free.

    Raises:
        ImportError: with an install hint when sentence-transformers is missing.
    """
    global _st_model

    if _st_model is not None:
        return _st_model

    try:
        from sentence_transformers import SentenceTransformer
        _st_model = SentenceTransformer("all-MiniLM-L6-v2")
    except ImportError:
        raise ImportError(
            "sentence-transformers not installed. "
            "Run: pip install sentence-transformers"
        )

    return _st_model
|
|
|
|
|
|
def classify_type_embedding(
    text: str,
    threshold: float = 0.35,
    return_scores: bool = False
) -> tuple[str | None, float, dict[str, float] | None]:
    """
    Classify text into a GLAMORCUBESFIXPHDNT type via embedding similarity.

    The input is embedded with the shared MiniLM model, L2-normalized, and
    compared against every pre-computed type prototype in one matrix
    multiply (prototypes are already normalized by the generation script,
    so the dot products are cosine similarities).

    Args:
        text: Institution name, description, or role/company text to classify
        threshold: Minimum cosine similarity to return a type (default 0.35)
        return_scores: If True, also return dict of all type scores (otherwise None)

    Returns:
        Always a 3-tuple (type_code, best_score, all_scores):
        - type_code: Best matching type code, or None if below threshold
        - best_score: Cosine similarity of the best match (0.0-1.0)
        - all_scores: Dict of all type scores if return_scores=True, else None

    Example:
        >>> type_code, score, _ = classify_type_embedding("Rijksmuseum Amsterdam")
        >>> print(type_code, score)
        M 0.67
    """
    if not text or not text.strip():
        return (None, 0.0, {} if return_scores else None)

    _, type_codes, prototype_matrix = _load_type_prototypes()
    model = _get_embedding_model()

    # Embed and L2-normalize the query so dot products become cosines
    query = model.encode(text, convert_to_numpy=True)
    query_norm = np.linalg.norm(query)
    if query_norm > 0:
        query = query / query_norm

    # One vectorized pass over all prototypes -> shape (19,)
    similarities = prototype_matrix @ query

    all_scores: dict[str, float] | None = None
    if return_scores:
        all_scores = {code: float(similarities[i]) for i, code in enumerate(type_codes)}

    winner = int(np.argmax(similarities))
    winner_score = float(similarities[winner])

    if winner_score >= threshold:
        return (type_codes[winner], winner_score, all_scores)
    return (None, winner_score, all_scores)
|
|
|
|
|
|
@lru_cache(maxsize=2000)
def detect_heritage_type_hybrid(
    role: str | None,
    company: str | None,
    embedding_threshold: float = 0.35
) -> tuple[bool, str | None, str]:
    """
    Hybrid heritage type detection combining keywords and embeddings.

    Strategy: keyword rules first (fast, high precision); when they yield no
    specific type, fall back to embedding classification. Results are cached
    via lru_cache for repeated (role, company) lookups.

    Args:
        role: Job title/role text
        company: Company/organization name
        embedding_threshold: Minimum similarity for an embedding match
            (default 0.35 — balances recall/precision; use 0.40+ for
            higher precision)

    Returns:
        Tuple of (is_heritage, type_code, method) where method is:
        - 'keyword': matched via keyword rules
        - 'keyword_generic': generic heritage terms only (no specific type)
        - 'embedding:0.XXX': matched via embedding with similarity score
        - 'none': no heritage relevance detected

    Example:
        >>> detect_heritage_type_hybrid("Curator", "Rijksmuseum")
        (True, 'M', 'keyword')
        >>> detect_heritage_type_hybrid("Software Engineer", "Google")
        (False, None, 'none')
    """
    role = role or ''
    company = company or ''

    # Fast path: keyword rules produced a specific type
    is_heritage, type_code = detect_heritage_type(role, company)
    if is_heritage and type_code:
        return (True, type_code, "keyword")

    # Embedding fallback — both for generic-heritage matches (to pin down a
    # specific type) and for texts the keyword rules missed entirely
    combined = f"{role} {company}".strip()
    if combined:
        emb_type, emb_score, _ = classify_type_embedding(combined, threshold=embedding_threshold)
        if emb_type:
            return (True, emb_type, f"embedding:{emb_score:.3f}")

    if is_heritage:
        # Keyword rules said "heritage" but neither they nor the embedding
        # could assign a specific type
        return (True, None, "keyword_generic")

    return (False, None, "none")
|
|
|
|
|
|
def enrich_experience_with_heritage(experience: List) -> List[Dict]:
    """
    Annotate each experience item with heritage_relevant / heritage_type.

    Accepts both dicts and JSON strings (asyncpg returns jsonb array
    elements as strings that need parsing); items that cannot be parsed
    into a dict are silently dropped.

    Args:
        experience: List of experience items (dicts or JSON strings)

    Returns:
        New list of dicts, each with heritage_relevant and heritage_type added
    """
    if not experience:
        return []

    annotated: List[Dict] = []
    for entry in experience:
        # asyncpg may deliver jsonb array elements as JSON strings
        if isinstance(entry, str):
            try:
                entry = json.loads(entry)
            except json.JSONDecodeError:
                continue
        if not isinstance(entry, dict):
            continue

        # Classify on whichever role/organization keys are present
        position = entry.get('title') or entry.get('role') or ''
        organization = entry.get('company') or entry.get('organization') or ''
        relevant, h_type = detect_heritage_type(position, organization)

        annotated.append({**entry,
                          'heritage_relevant': relevant,
                          'heritage_type': h_type})

    return annotated
|
|
|
|
|
|
def parse_jsonb_list(data) -> List:
    """
    Normalise a jsonb list value coming back from PostgreSQL.

    asyncpg may hand a jsonb column back as:
      - a proper Python list with object elements
      - a JSON string encoding a list
      - a list whose elements are JSON strings
      - a list whose elements are Python repr strings (single quotes)

    Args:
        data: A list, a JSON string representing a list, or None

    Returns:
        A list whose elements are decoded Python objects; elements that
        cannot be decoded are kept as plain strings (e.g. skill names).
        Empty list for None or invalid input.
    """
    import ast

    if data is None:
        return []

    # A top-level string must itself decode to a list.
    if isinstance(data, str):
        try:
            data = json.loads(data)
        except json.JSONDecodeError:
            return []

    if not isinstance(data, list):
        return []

    def _decode(element):
        # JSON (double quotes) first, then Python literal (single quotes,
        # handles malformed data); otherwise keep the raw string.
        if not isinstance(element, str):
            return element
        try:
            return json.loads(element)
        except json.JSONDecodeError:
            pass
        try:
            return ast.literal_eval(element)
        except (ValueError, SyntaxError):
            return element

    return [_decode(element) for element in data]
|
|
|
|
|
|
# ============================================================================
# Global State
# ============================================================================

# Shared asyncpg pool; created lazily by get_pool(), closed in lifespan().
_pool: Optional[asyncpg.Pool] = None
# Process start timestamp (informational, e.g. for uptime reporting).
_start_time: datetime = datetime.now()
|
|
|
|
|
|
async def get_pool() -> asyncpg.Pool:
    """Return the shared connection pool, creating it on first use."""
    global _pool

    if _pool is not None:
        return _pool

    _pool = await asyncpg.create_pool(
        host=settings.host,
        port=settings.port,
        database=settings.database,
        user=settings.user,
        password=settings.password,
        min_size=2,
        max_size=10,
    )
    return _pool
|
|
|
|
|
|
# ============================================================================
|
|
# FastAPI App
|
|
# ============================================================================
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan handler.

    Warms the connection pool at startup and guarantees it is closed at
    shutdown — the try/finally ensures connections are released even if
    the application body raises while running (the original skipped the
    close in that case, leaking the pool).
    """
    global _pool
    await get_pool()
    try:
        yield
    finally:
        if _pool:
            await _pool.close()
            _pool = None
|
|
|
|
|
|
# FastAPI application; the lifespan handler above manages the asyncpg pool.
app = FastAPI(
    title="PostGIS Geo API",
    description="Spatial REST API for heritage institution map",
    version="1.0.0",
    lifespan=lifespan,
)

# CORS middleware
# NOTE(review): wide-open CORS ("*" origins combined with credentials) —
# acceptable for a public read-only API, but confirm this is intentional.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# GZip compression middleware - compresses responses >1KB
# Reduces ~126MB JSON payload to ~20-30MB (70-80% reduction)
app.add_middleware(GZipMiddleware, minimum_size=1000)
|
|
|
|
|
|
# ============================================================================
|
|
# Helper Functions
|
|
# ============================================================================
|
|
|
|
def serialize_value(val: Any) -> Any:
    """Coerce a PostgreSQL-sourced value into a JSON-serializable one.

    None and containers pass straight through; datetimes become ISO-8601
    strings, Decimals become floats, bytes are decoded as UTF-8 (invalid
    sequences replaced). Anything else is returned unchanged.
    """
    if val is None or isinstance(val, (dict, list)):
        return val
    if isinstance(val, datetime):
        return val.isoformat()
    if isinstance(val, Decimal):
        return float(val)
    if isinstance(val, bytes):
        return val.decode('utf-8', errors='replace')
    return val
|
|
|
|
|
|
def row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
    """Convert an asyncpg row into a plain dict of JSON-serializable values."""
    out: Dict[str, Any] = {}
    for column in row.keys():
        out[column] = serialize_value(row[column])
    return out
|
|
|
|
|
|
# ============================================================================
|
|
# API Endpoints
|
|
# ============================================================================
|
|
|
|
@app.get("/", response_model=GeoStatsResponse)
async def get_geo_status() -> GeoStatsResponse:
    """Health check: report database reachability and basic row counts."""
    pool = await get_pool()

    count_sql = {
        "provinces": "SELECT COUNT(*) FROM provinces",
        "municipalities": "SELECT COUNT(*) FROM municipalities",
        "institutions": "SELECT COUNT(*) FROM institutions",
        "historical": "SELECT COUNT(*) FROM historical_boundaries",
    }

    async with pool.acquire() as conn:
        version = await conn.fetchval("SELECT version()")
        counts = {name: await conn.fetchval(sql) for name, sql in count_sql.items()}

    return GeoStatsResponse(
        status="healthy",
        database=settings.database,
        provinces=counts["provinces"] or 0,
        municipalities=counts["municipalities"] or 0,
        institutions=counts["institutions"] or 0,
        historical_boundaries=counts["historical"] or 0,
        postgres_version=version.split(',')[0] if version else "unknown",
    )
|
|
|
|
|
|
@app.get("/provinces")
async def get_provinces(
    simplified: bool = Query(True, description="Return simplified geometries")
):
    """
    Get all provinces as a GeoJSON FeatureCollection.

    Args:
        simplified: When true, geometries are simplified with ST_Simplify
                    (tolerance 0.001 deg) to shrink the payload.
    """
    pool = await get_pool()

    tolerance = 0.001 if simplified else 0
    geom_expr = f'ST_Simplify(geom, {tolerance})' if simplified else 'geom'

    async with pool.acquire() as conn:
        rows = await conn.fetch(f"""
            SELECT
                id, province_code, iso_code, name,
                ST_AsGeoJSON({geom_expr})::json as geometry,
                ST_X(centroid) as centroid_lon,
                ST_Y(centroid) as centroid_lat,
                area_km2
            FROM provinces
            ORDER BY name
        """)

    features = []
    for row in rows:
        features.append({
            "type": "Feature",
            "id": row['province_code'],
            "geometry": row['geometry'],
            "properties": {
                "id": row['id'],
                "province_code": row['province_code'],
                "iso_code": row['iso_code'],
                "name": row['name'],
                # `is not None` (not truthiness) so a legitimate 0.0
                # coordinate or area is not collapsed to null.
                "centroid_lon": float(row['centroid_lon']) if row['centroid_lon'] is not None else None,
                "centroid_lat": float(row['centroid_lat']) if row['centroid_lat'] is not None else None,
                "area_km2": float(row['area_km2']) if row['area_km2'] is not None else None,
            }
        })

    return {
        "type": "FeatureCollection",
        "features": features
    }
|
|
|
|
|
|
@app.get("/countries")
async def get_countries(
    simplified: bool = Query(True, description="Return simplified geometries"),
    with_counts: bool = Query(False, description="Include institution counts per country"),
):
    """
    Get all countries as a GeoJSON FeatureCollection.

    Args:
        simplified: Apply aggressive ST_Simplify (0.01 deg), appropriate
                    for a world-level view.
        with_counts: Join against custodians to report institution_count
                     per country (always 0 when false).
    """
    pool = await get_pool()

    # Use more aggressive simplification for countries (world view)
    tolerance = 0.01 if simplified else 0

    async with pool.acquire() as conn:
        if with_counts:
            # Join with custodians to get counts per country
            geom_expr = f'ST_Simplify(bc.geom, {tolerance})' if simplified else 'bc.geom'
            rows = await conn.fetch(f"""
                SELECT
                    bc.id,
                    bc.iso_a2 as country_code,
                    bc.iso_a3,
                    bc.country_name as name,
                    ST_AsGeoJSON({geom_expr}) as geometry,
                    ST_X(bc.centroid) as centroid_lon,
                    ST_Y(bc.centroid) as centroid_lat,
                    bc.area_km2,
                    COALESCE(counts.institution_count, 0) as institution_count
                FROM boundary_countries bc
                LEFT JOIN (
                    SELECT country_code, COUNT(*) as institution_count
                    FROM custodians
                    WHERE country_code IS NOT NULL
                    GROUP BY country_code
                ) counts ON bc.iso_a2 = counts.country_code
                WHERE bc.geom IS NOT NULL
                ORDER BY bc.country_name
            """)
        else:
            geom_expr = f'ST_Simplify(geom, {tolerance})' if simplified else 'geom'
            rows = await conn.fetch(f"""
                SELECT
                    id,
                    iso_a2 as country_code,
                    iso_a3,
                    country_name as name,
                    ST_AsGeoJSON({geom_expr}) as geometry,
                    ST_X(centroid) as centroid_lon,
                    ST_Y(centroid) as centroid_lat,
                    area_km2
                FROM boundary_countries
                WHERE geom IS NOT NULL
                ORDER BY country_name
            """)

    features = []
    total_institutions = 0
    countries_with_data = 0

    for row in rows:
        # ST_AsGeoJSON returns text; parse it and skip rows with no usable
        # geometry (e.g. Vatican City).
        geometry = row['geometry']
        if geometry is None:
            continue
        if isinstance(geometry, str):
            geometry = json.loads(geometry)
        if not isinstance(geometry, dict) or 'type' not in geometry or 'coordinates' not in geometry:
            continue

        iso_a2 = row['country_code'].strip() if row['country_code'] else None
        iso_a3 = row['iso_a3'].strip() if row['iso_a3'] else None
        institution_count = row['institution_count'] if with_counts else 0

        # Track totals for the response metadata
        if with_counts:
            total_institutions += institution_count
            if institution_count > 0:
                countries_with_data += 1

        features.append({
            "type": "Feature",
            "id": iso_a2,
            "geometry": geometry,
            "properties": {
                "id": row['id'],
                "iso_a2": iso_a2,  # Frontend expects iso_a2
                "iso_a3": iso_a3,
                "name": row['name'],
                "institution_count": institution_count,
                # `is not None` keeps legitimate 0.0 values (centroids on the
                # equator / prime meridian) from being nulled out.
                "centroid": [
                    float(row['centroid_lon']) if row['centroid_lon'] is not None else None,
                    float(row['centroid_lat']) if row['centroid_lat'] is not None else None,
                ],
                "area_km2": float(row['area_km2']) if row['area_km2'] is not None else None,
            }
        })

    return {
        "type": "FeatureCollection",
        "features": features,
        "metadata": {
            "count": len(features),
            "total_institutions": total_institutions,
            "countries_with_data": countries_with_data,
            "type_filter": None,
            "simplified": simplified,
        }
    }
|
|
|
|
|
|
@app.get("/municipalities")
async def get_municipalities(
    province: Optional[str] = Query(None, description="Filter by province ISO code (e.g., NH)"),
    simplified: bool = Query(True, description="Return simplified geometries"),
    limit: int = Query(500, ge=1, le=1000, description="Maximum results")
):
    """
    Get municipalities as a GeoJSON FeatureCollection.

    Args:
        province: Optional province ISO code filter (matched uppercase).
        simplified: Apply ST_Simplify (0.0005 deg) to shrink geometries.
        limit: Maximum number of municipalities returned.
    """
    pool = await get_pool()

    tolerance = 0.0005 if simplified else 0
    geom_expr = f'ST_Simplify(m.geom, {tolerance})' if simplified else 'm.geom'

    # Placeholder numbering shifts depending on whether the province filter
    # is present ($1 = province, $2 = limit) or absent ($1 = limit).
    query = f"""
        SELECT
            m.id, m.municipality_code, m.name,
            p.iso_code as province_iso, p.name as province_name,
            ST_AsGeoJSON({geom_expr})::json as geometry,
            ST_X(m.centroid) as centroid_lon,
            ST_Y(m.centroid) as centroid_lat,
            m.area_km2
        FROM municipalities m
        LEFT JOIN provinces p ON m.province_id = p.id
        {'WHERE p.iso_code = $1' if province else ''}
        ORDER BY m.name
        LIMIT {'$2' if province else '$1'}
    """

    async with pool.acquire() as conn:
        if province:
            rows = await conn.fetch(query, province.upper(), limit)
        else:
            rows = await conn.fetch(query, limit)

    features = []
    for row in rows:
        features.append({
            "type": "Feature",
            "id": row['municipality_code'],
            "geometry": row['geometry'],
            "properties": {
                "id": row['id'],
                "code": row['municipality_code'],
                "name": row['name'],
                "province_iso": row['province_iso'],
                "province_name": row['province_name'],
                # `is not None` so zero-valued coordinates/areas survive
                # instead of being coerced to null by truthiness.
                "centroid_lon": float(row['centroid_lon']) if row['centroid_lon'] is not None else None,
                "centroid_lat": float(row['centroid_lat']) if row['centroid_lat'] is not None else None,
                "area_km2": float(row['area_km2']) if row['area_km2'] is not None else None,
            }
        })

    return {
        "type": "FeatureCollection",
        "features": features
    }
|
|
|
|
|
|
@app.get("/institutions")
async def get_institutions(
    bbox: Optional[str] = Query(None, description="Bounding box: minLon,minLat,maxLon,maxLat"),
    province: Optional[str] = Query(None, description="Filter by province ISO code (e.g., NH, ZH)"),
    country: Optional[str] = Query(None, description="Filter by country code (e.g., NL, DE, JP)"),
    type: Optional[str] = Query(None, description="Filter by institution type (G,L,A,M,O,R,C,U,B,E,S,F,I,X,P,H,D,N,T)"),
    limit: int = Query(50000, ge=1, le=100000, description="Maximum results")
):
    """
    Get institutions as a GeoJSON FeatureCollection with full metadata
    from the custodians table.

    Filters (bbox, province, type, country) combine with AND; only rows
    with both lat and lon set are returned.

    Raises:
        HTTPException 400 for a malformed bbox string.
    """
    pool = await get_pool()

    # Build WHERE clauses - query custodians table directly.
    # param_count tracks the next $n placeholder index.
    conditions = ["lat IS NOT NULL AND lon IS NOT NULL"]
    params = []
    param_count = 0

    if bbox:
        try:
            min_lon, min_lat, max_lon, max_lat = map(float, bbox.split(','))
            param_count += 4
            conditions.append(f"""
                lon >= ${param_count-3} AND lat >= ${param_count-2}
                AND lon <= ${param_count-1} AND lat <= ${param_count}
            """)
            params.extend([min_lon, min_lat, max_lon, max_lat])
        except ValueError:
            raise HTTPException(status_code=400, detail="Invalid bbox format. Use: minLon,minLat,maxLon,maxLat")

    if province:
        param_count += 1
        conditions.append(f"region_code = ${param_count}")
        params.append(province.upper())

    if type:
        param_count += 1
        conditions.append(f"type = ${param_count}")
        params.append(type.upper())

    if country:
        param_count += 1
        conditions.append(f"country_code = ${param_count}")
        params.append(country.upper())

    # Final placeholder is the LIMIT.
    param_count += 1
    where_clause = " AND ".join(conditions)

    # Query custodians table with all rich metadata fields
    query = f"""
        SELECT
            ghcid,
            name,
            emic_name,
            type,
            type_name,
            lon,
            lat,
            city,
            region as province,
            region_code as province_iso,
            country_code,
            formatted_address,
            street_address,
            postal_code,
            rating,
            total_ratings,
            wikidata_id,
            website,
            phone,
            email,
            isil_code,
            google_place_id,
            description,
            opening_hours,
            reviews,
            photos,
            photo_urls,
            business_status,
            street_view_url,
            founding_year,
            dissolution_year,
            temporal_extent,
            museum_register,
            youtube_channel_url,
            youtube_subscriber_count,
            youtube_video_count,
            youtube_enrichment,
            social_facebook,
            social_twitter,
            social_instagram,
            wikidata_label_en,
            wikidata_description_en,
            logo_url,
            web_claims
        FROM custodians
        WHERE {where_clause}
        ORDER BY name
        LIMIT ${param_count}
    """

    params.append(limit)

    async with pool.acquire() as conn:
        rows = await conn.fetch(query, *params)

    features = []
    for row in rows:
        # Core scalar metadata (always present in the properties dict)
        props = {
            "ghcid": row['ghcid'],
            "name": row['name'],
            "emic_name": row['emic_name'],
            "type": row['type'],
            "type_name": row['type_name'],
            "city": row['city'],
            "province": row['province'],
            "province_iso": row['province_iso'],
            "country_code": row['country_code'],
            "formatted_address": row['formatted_address'],
            "rating": float(row['rating']) if row['rating'] else None,
            "total_ratings": row['total_ratings'],
            "wikidata_id": row['wikidata_id'],
            "website": row['website'],
            "phone": row['phone'],
            "email": row['email'],
            "isil_code": row['isil_code'],
            "google_place_id": row['google_place_id'],
            "description": row['description'],
            "business_status": row['business_status'],
            "street_view_url": row['street_view_url'],
            "founding_year": row['founding_year'],
            "dissolution_year": row['dissolution_year'],
        }

        # JSONB passthrough fields - only include when present
        for key in ('opening_hours', 'reviews', 'photos', 'photo_urls',
                    'temporal_extent', 'museum_register'):
            if row[key]:
                props[key] = row[key]

        if row['youtube_enrichment']:
            props["youtube_enrichment"] = row['youtube_enrichment']
        elif row['youtube_channel_url']:
            # Build minimal YouTube data if enrichment not present
            props["youtube"] = {
                "channel_url": row['youtube_channel_url'],
                "subscriber_count": row['youtube_subscriber_count'],
                "video_count": row['youtube_video_count'],
            }

        # Social media
        social = {}
        for network in ('facebook', 'twitter', 'instagram'):
            if row[f'social_{network}']:
                social[network] = row[f'social_{network}']
        if social:
            props["social_media"] = social

        # Wikidata labels
        if row['wikidata_label_en']:
            props["wikidata_label"] = row['wikidata_label_en']
        if row['wikidata_description_en']:
            props["wikidata_description"] = row['wikidata_description_en']

        # Logo URL from web claims extraction
        if row['logo_url']:
            props["logo_url"] = row['logo_url']

        # Web claims (financial documents, etc.)
        if row['web_claims']:
            props["web_claims"] = row['web_claims']

        features.append({
            "type": "Feature",
            "geometry": {
                "type": "Point",
                "coordinates": [float(row['lon']), float(row['lat'])]
            },
            "properties": props
        })

    return {
        "type": "FeatureCollection",
        "features": features,
        "metadata": {
            "count": len(features),
            "limit": limit,
            # Report every accepted filter (country was previously omitted).
            "filters": {
                "bbox": bbox,
                "province": province,
                "country": country,
                "type": type
            }
        }
    }
|
|
|
|
|
|
@app.get("/institution/{ghcid}")
async def get_institution(ghcid: str):
    """
    Get detailed information for a single institution with full metadata.

    Returns core scalar fields plus optional grouped sections (temporal,
    youtube, social_media, wikidata, provenance, ...) that are only
    included when the underlying columns are populated.

    Raises:
        HTTPException 404 when no custodian matches the GHCID.
    """
    pool = await get_pool()

    async with pool.acquire() as conn:
        row = await conn.fetchrow("""
            SELECT
                ghcid,
                name,
                emic_name,
                verified_name,
                type,
                type_name,
                lon,
                lat,
                city,
                region as province,
                region_code as province_iso,
                country_code,
                formatted_address,
                street_address,
                postal_code,
                website,
                phone,
                email,
                wikidata_id,
                isil_code,
                google_place_id,
                rating,
                total_ratings,
                description,
                business_status,
                street_view_url,
                google_maps_url,
                opening_hours,
                reviews,
                photos,
                photo_urls,
                founding_year,
                founding_date,
                dissolution_year,
                dissolution_date,
                temporal_extent,
                museum_register,
                youtube_channel_id,
                youtube_channel_url,
                youtube_subscriber_count,
                youtube_video_count,
                youtube_view_count,
                youtube_enrichment,
                social_facebook,
                social_twitter,
                social_instagram,
                social_linkedin,
                social_youtube,
                logo_url,
                wikidata_label_nl,
                wikidata_label_en,
                wikidata_description_nl,
                wikidata_description_en,
                wikidata_types,
                wikidata_inception,
                wikidata_enrichment,
                genealogiewerkbalk,
                nan_isil_enrichment,
                kb_enrichment,
                zcbs_enrichment,
                web_claims,
                ghcid_uuid,
                ghcid_numeric,
                identifiers,
                data_source,
                data_tier,
                provenance
            FROM custodians
            WHERE ghcid = $1
        """, ghcid)

    if not row:
        raise HTTPException(status_code=404, detail=f"Institution '{ghcid}' not found")

    # Core scalar metadata, always present (possibly null).
    result = {
        "ghcid": row['ghcid'],
        "name": row['name'],
        "emic_name": row['emic_name'],
        "verified_name": row['verified_name'],
        "type": row['type'],
        "type_name": row['type_name'],
        # `is not None` so coordinates of exactly 0.0 (equator / prime
        # meridian) are not silently turned into null.
        "lat": float(row['lat']) if row['lat'] is not None else None,
        "lon": float(row['lon']) if row['lon'] is not None else None,
        "city": row['city'],
        "province": row['province'],
        "province_iso": row['province_iso'],
        "country_code": row['country_code'],
        "formatted_address": row['formatted_address'],
        "street_address": row['street_address'],
        "postal_code": row['postal_code'],
        "website": row['website'],
        "phone": row['phone'],
        "email": row['email'],
        "wikidata_id": row['wikidata_id'],
        "isil_code": row['isil_code'],
        "google_place_id": row['google_place_id'],
        "rating": float(row['rating']) if row['rating'] else None,
        "total_ratings": row['total_ratings'],
        "description": row['description'],
        "business_status": row['business_status'],
        "street_view_url": row['street_view_url'],
        "google_maps_url": row['google_maps_url'],
    }

    # JSONB passthrough fields - only include if present
    for key in ('opening_hours', 'reviews', 'photos', 'photo_urls', 'identifiers'):
        if row[key]:
            result[key] = row[key]

    # Temporal data (dates serialized to ISO-8601)
    temporal = {}
    if row['founding_year']:
        temporal["founding_year"] = row['founding_year']
    if row['founding_date']:
        temporal["founding_date"] = row['founding_date'].isoformat()
    if row['dissolution_year']:
        temporal["dissolution_year"] = row['dissolution_year']
    if row['dissolution_date']:
        temporal["dissolution_date"] = row['dissolution_date'].isoformat()
    if row['temporal_extent']:
        temporal["extent"] = row['temporal_extent']
    if temporal:
        result["temporal"] = temporal

    # Museum register
    if row['museum_register']:
        result["museum_register"] = row['museum_register']

    # YouTube enrichment
    youtube = {
        out_key: row[col]
        for out_key, col in (
            ("channel_id", 'youtube_channel_id'),
            ("channel_url", 'youtube_channel_url'),
            ("subscriber_count", 'youtube_subscriber_count'),
            ("video_count", 'youtube_video_count'),
            ("view_count", 'youtube_view_count'),
            ("enrichment", 'youtube_enrichment'),
        )
        if row[col]
    }
    if youtube:
        result["youtube"] = youtube

    # Social media
    social = {
        network: row[f'social_{network}']
        for network in ('facebook', 'twitter', 'instagram', 'linkedin', 'youtube')
        if row[f'social_{network}']
    }
    if social:
        result["social_media"] = social

    # Wikidata
    wikidata = {
        out_key: row[col]
        for out_key, col in (
            ("label_nl", 'wikidata_label_nl'),
            ("label_en", 'wikidata_label_en'),
            ("description_nl", 'wikidata_description_nl'),
            ("description_en", 'wikidata_description_en'),
            ("types", 'wikidata_types'),
            ("inception", 'wikidata_inception'),
            ("enrichment", 'wikidata_enrichment'),
        )
        if row[col]
    }
    if wikidata:
        result["wikidata"] = wikidata

    # Logo
    if row['logo_url']:
        result["logo_url"] = row['logo_url']

    # Other enrichment data
    for key in ('genealogiewerkbalk', 'nan_isil_enrichment', 'kb_enrichment',
                'zcbs_enrichment', 'web_claims'):
        if row[key]:
            result[key] = row[key]

    # GHCID details
    ghcid_data = {"current": row['ghcid']}
    if row['ghcid_uuid']:
        ghcid_data["uuid"] = str(row['ghcid_uuid'])
    if row['ghcid_numeric']:
        ghcid_data["numeric"] = int(row['ghcid_numeric'])
    result["ghcid_details"] = ghcid_data

    # Provenance
    if row['data_source'] or row['data_tier'] or row['provenance']:
        result["provenance"] = {
            "data_source": row['data_source'],
            "data_tier": row['data_tier'],
            "details": row['provenance'],
        }

    return result
|
|
|
|
|
|
@app.get("/search")
async def search_institutions(
    q: str = Query(..., min_length=2, description="Search query (name, GHCID, or description)"),
    type: Optional[str] = Query(None, description="Filter by institution type"),
    include_persons: bool = Query(False, description="Also search for associated persons"),
    limit: int = Query(50, ge=1, le=200, description="Maximum results")
):
    """
    Search institutions by name, GHCID, or description.

    Match priority (first tier that yields rows wins):
    1. Exact GHCID match (e.g. NL-OV-ZWO-D-WIWO)
    2. Partial GHCID match (e.g. NL-OV or WIWO)
    3. ILIKE text search over name, emic_name, description, and city
    Optionally also searches persons when institutions matched.
    """
    pool = await get_pool()
    q_upper = q.upper()
    q_pattern = f"%{q}%"

    # A GHCID-style query contains a hyphen plus at least one letter;
    # lowercase input (e.g. "nl-nh-ams") is accepted and uppercased.
    is_ghcid_query = '-' in q and any(c.isalpha() for c in q)

    async with pool.acquire() as conn:
        rows = []

        # Priority 1: Exact GHCID match
        if is_ghcid_query:
            exact_query = """
                SELECT
                    c.ghcid as ghcid,
                    c.name,
                    c.type,
                    c.type_name,
                    c.lon,
                    c.lat,
                    c.city,
                    c.region_code as province_iso,
                    c.rating,
                    1000 as rank
                FROM custodians c
                WHERE c.ghcid = $1
            """
            rows = await conn.fetch(exact_query, q_upper)

        # Priority 2: Partial GHCID match (if no exact match)
        if not rows and is_ghcid_query:
            partial_ghcid_query = """
                SELECT
                    c.ghcid as ghcid,
                    c.name,
                    c.type,
                    c.type_name,
                    c.lon,
                    c.lat,
                    c.city,
                    c.region_code as province_iso,
                    c.rating,
                    500 as rank
                FROM custodians c
                WHERE c.ghcid ILIKE $1
            """
            params = [f"%{q_upper}%"]

            if type:
                partial_ghcid_query += " AND c.type = $2"
                params.append(type.upper())

            partial_ghcid_query += f" ORDER BY c.ghcid LIMIT ${len(params) + 1}"
            params.append(limit)

            rows = await conn.fetch(partial_ghcid_query, *params)

        # Priority 3: Name/description text search
        if not rows:
            # ILIKE search on name, description, city, and emic_name;
            # prefix matches on name rank highest.
            text_query = """
                SELECT
                    c.ghcid as ghcid,
                    c.name,
                    c.type,
                    c.type_name,
                    c.lon,
                    c.lat,
                    c.city,
                    c.region_code as province_iso,
                    c.rating,
                    CASE
                        WHEN c.name ILIKE $2 THEN 100
                        WHEN c.name ILIKE $1 THEN 50
                        WHEN c.emic_name ILIKE $1 THEN 40
                        WHEN c.city ILIKE $1 THEN 30
                        ELSE 10
                    END as rank
                FROM custodians c
                WHERE c.name ILIKE $1
                   OR c.emic_name ILIKE $1
                   OR c.description ILIKE $1
                   OR c.city ILIKE $1
            """
            params = [q_pattern, f"{q}%"]
            param_count = 2

            if type:
                param_count += 1
                text_query += f" AND c.type = ${param_count}"
                params.append(type.upper())

            param_count += 1
            text_query += f" ORDER BY rank DESC, c.name LIMIT ${param_count}"
            params.append(limit)

            rows = await conn.fetch(text_query, *params)

        # Map the numeric rank back to the tier that produced the row.
        def get_match_type(rank: int) -> str:
            if rank >= 1000:
                return "exact_ghcid"
            elif rank >= 500:
                return "partial_ghcid"
            else:
                return "text"

        institutions = [
            {
                "ghcid": row['ghcid'],
                "name": row['name'],
                "type": row['type'],
                "type_name": row['type_name'],
                # `is not None` so 0.0 coordinates are not nulled out.
                "lon": float(row['lon']) if row['lon'] is not None else None,
                "lat": float(row['lat']) if row['lat'] is not None else None,
                "city": row['city'],
                "province_iso": row['province_iso'],
                "rating": float(row['rating']) if row['rating'] else None,
                "match_type": get_match_type(row['rank']),
            }
            for row in rows
        ]

        # Optionally search for persons
        persons = []
        if include_persons and institutions:
            # NOTE: institution GHCIDs only gate whether the person search
            # runs; the query itself matches persons by name/headline/
            # custodian name, not by GHCID.
            ghcids = [inst['ghcid'] for inst in institutions if inst['ghcid']]

            if ghcids:
                person_query = """
                    SELECT DISTINCT ON (p.staff_id)
                        p.staff_id,
                        p.name,
                        p.headline,
                        p.location,
                        p.country_code,
                        p.custodian_slug,
                        p.custodian_name,
                        p.linkedin_url,
                        p.profile_image_url,
                        p.heritage_relevant,
                        p.heritage_types
                    FROM persons p
                    WHERE p.name ILIKE $1
                       OR p.headline ILIKE $1
                       OR p.custodian_name ILIKE $1
                    ORDER BY p.staff_id, p.name
                    LIMIT $2
                """
                person_rows = await conn.fetch(person_query, q_pattern, limit)

                persons = [
                    {
                        "staff_id": row['staff_id'],
                        "name": row['name'],
                        "headline": row['headline'],
                        "location": row['location'],
                        "country_code": row['country_code'],
                        "custodian_slug": row['custodian_slug'],
                        "custodian_name": row['custodian_name'],
                        "linkedin_url": row['linkedin_url'],
                        "profile_image_url": row['profile_image_url'],
                        "heritage_relevant": row['heritage_relevant'],
                        "heritage_types": row['heritage_types'] or [],
                    }
                    for row in person_rows
                ]

    result = {
        "query": q,
        "count": len(institutions),
        "results": institutions,
    }

    if include_persons:
        result["persons"] = persons
        result["persons_count"] = len(persons)

    return result
|
|
|
|
|
|
@app.get("/nearby", response_model=List[NearbyInstitution])
async def find_nearby(
    lon: float = Query(..., description="Longitude"),
    lat: float = Query(..., description="Latitude"),
    radius_km: float = Query(10, ge=0.1, le=100, description="Search radius in km"),
    type: Optional[str] = Query(None, description="Filter by institution type"),
    limit: int = Query(50, ge=1, le=200, description="Maximum results")
):
    """
    Find institutions near a point.

    Uses a spherical-law-of-cosines distance over the custodians table's
    plain lat/lon columns (no PostGIS geometry). The acos() argument is
    clamped to [-1, 1]: floating-point rounding can push it fractionally
    above 1 when the query point coincides with an institution, and
    PostgreSQL's acos() raises an error outside that domain.
    """
    pool = await get_pool()

    query = """
        SELECT
            c.ghcid as ghcid,
            c.name,
            c.type,
            c.type_name,
            (
                6371 * acos(LEAST(1.0, GREATEST(-1.0,
                    cos(radians($2)) * cos(radians(c.lat)) *
                    cos(radians(c.lon) - radians($1)) +
                    sin(radians($2)) * sin(radians(c.lat))
                )))
            ) as distance_km,
            c.city,
            c.region as province,
            c.rating
        FROM custodians c
        WHERE c.lat IS NOT NULL AND c.lon IS NOT NULL
        AND (
            6371 * acos(LEAST(1.0, GREATEST(-1.0,
                cos(radians($2)) * cos(radians(c.lat)) *
                cos(radians(c.lon) - radians($1)) +
                sin(radians($2)) * sin(radians(c.lat))
            )))
        ) <= $3
    """

    params: list = [lon, lat, radius_km]
    param_count = 3

    if type:
        param_count += 1
        query += f" AND c.type = ${param_count}"
        params.append(type.upper())

    param_count += 1
    query += f" ORDER BY distance_km LIMIT ${param_count}"
    params.append(limit)

    async with pool.acquire() as conn:
        rows = await conn.fetch(query, *params)

    return [
        NearbyInstitution(
            ghcid=row['ghcid'],
            name=row['name'],
            type=row['type'],
            type_name=row['type_name'],
            distance_km=round(float(row['distance_km']), 2),
            city=row['city'],
            province=row['province'],
            rating=float(row['rating']) if row['rating'] else None,
        )
        for row in rows
    ]
|
|
|
|
|
|
@app.get("/admin/point", response_model=AdminPoint)
async def get_admin_for_point(
    lon: float = Query(..., description="Longitude"),
    lat: float = Query(..., description="Latitude")
):
    """
    Find which municipality/province contains a point.

    Tries a municipality-level point-in-polygon first; falls back to a
    province-only lookup, and finally to an all-None response when the
    point is outside every known boundary.
    """
    pool = await get_pool()

    async with pool.acquire() as conn:
        row = await conn.fetchrow("""
            SELECT
                p.province_code,
                p.name as province_name,
                m.municipality_code,
                m.name as municipality_name
            FROM municipalities m
            JOIN provinces p ON m.province_id = p.id
            WHERE ST_Contains(m.geom, ST_SetSRID(ST_Point($1, $2), 4326))
            LIMIT 1
        """, lon, lat)

        if not row:
            # Province-only fallback, reusing the already-held connection
            # (the original checked out a second pool connection here).
            row = await conn.fetchrow("""
                SELECT
                    province_code,
                    name as province_name,
                    NULL as municipality_code,
                    NULL as municipality_name
                FROM provinces
                WHERE ST_Contains(geom, ST_SetSRID(ST_Point($1, $2), 4326))
                LIMIT 1
            """, lon, lat)

    if not row:
        return AdminPoint(
            province_code=None,
            province_name=None,
            municipality_code=None,
            municipality_name=None
        )

    return AdminPoint(
        province_code=row['province_code'],
        province_name=row['province_name'],
        municipality_code=row['municipality_code'],
        municipality_name=row['municipality_name']
    )
|
|
|
|
|
|
@app.get("/historical")
|
|
async def get_historical_boundaries(
|
|
year: int = Query(1500, description="Reference year"),
|
|
boundary_type: Optional[str] = Query(None, description="Boundary type filter"),
|
|
simplified: bool = Query(True, description="Return simplified geometries"),
|
|
limit: int = Query(1000, ge=1, le=10000, description="Maximum results")
|
|
):
|
|
"""Get historical boundaries as GeoJSON"""
|
|
pool = await get_pool()
|
|
|
|
tolerance = 0.001 if simplified else 0
|
|
|
|
conditions = ["reference_year = $1"]
|
|
params = [year]
|
|
param_count = 1
|
|
|
|
if boundary_type:
|
|
param_count += 1
|
|
conditions.append(f"boundary_type = ${param_count}")
|
|
params.append(boundary_type)
|
|
|
|
param_count += 1
|
|
where_clause = " AND ".join(conditions)
|
|
|
|
query = f"""
|
|
SELECT
|
|
id, boundary_code, name, boundary_type, reference_year,
|
|
ST_AsGeoJSON(
|
|
{'ST_Simplify(geom, ' + str(tolerance) + ')' if simplified else 'geom'}
|
|
)::json as geometry,
|
|
ST_X(centroid) as centroid_lon,
|
|
ST_Y(centroid) as centroid_lat,
|
|
area_km2
|
|
FROM historical_boundaries
|
|
WHERE {where_clause}
|
|
ORDER BY name
|
|
LIMIT ${param_count}
|
|
"""
|
|
params.append(limit)
|
|
|
|
async with pool.acquire() as conn:
|
|
rows = await conn.fetch(query, *params)
|
|
|
|
features = []
|
|
for row in rows:
|
|
if row['geometry']:
|
|
features.append({
|
|
"type": "Feature",
|
|
"id": row['boundary_code'],
|
|
"geometry": row['geometry'],
|
|
"properties": {
|
|
"id": row['id'],
|
|
"code": row['boundary_code'],
|
|
"name": row['name'],
|
|
"type": row['boundary_type'],
|
|
"year": row['reference_year'],
|
|
"centroid_lon": float(row['centroid_lon']) if row['centroid_lon'] else None,
|
|
"centroid_lat": float(row['centroid_lat']) if row['centroid_lat'] else None,
|
|
"area_km2": float(row['area_km2']) if row['area_km2'] else None,
|
|
}
|
|
})
|
|
|
|
return {
|
|
"type": "FeatureCollection",
|
|
"features": features,
|
|
"metadata": {
|
|
"year": year,
|
|
"boundary_type": boundary_type,
|
|
"count": len(features)
|
|
}
|
|
}
|
|
|
|
|
|
@app.get("/stats/by-type")
|
|
async def get_stats_by_type():
|
|
"""Get institution counts by type"""
|
|
pool = await get_pool()
|
|
|
|
async with pool.acquire() as conn:
|
|
rows = await conn.fetch("""
|
|
SELECT
|
|
institution_type as type,
|
|
type_name,
|
|
COUNT(*) as count,
|
|
ROUND(AVG(rating)::numeric, 2) as avg_rating
|
|
FROM institutions
|
|
WHERE geom IS NOT NULL
|
|
GROUP BY institution_type, type_name
|
|
ORDER BY count DESC
|
|
""")
|
|
|
|
return {
|
|
"stats": [
|
|
{
|
|
"type": row['type'],
|
|
"type_name": row['type_name'],
|
|
"count": row['count'],
|
|
"avg_rating": float(row['avg_rating']) if row['avg_rating'] else None
|
|
}
|
|
for row in rows
|
|
]
|
|
}
|
|
|
|
|
|
@app.get("/stats/by-province")
|
|
async def get_stats_by_province():
|
|
"""Get institution counts by province"""
|
|
pool = await get_pool()
|
|
|
|
async with pool.acquire() as conn:
|
|
rows = await conn.fetch("""
|
|
SELECT
|
|
p.iso_code,
|
|
p.name as province_name,
|
|
COUNT(i.id) as count,
|
|
ROUND(AVG(i.rating)::numeric, 2) as avg_rating
|
|
FROM provinces p
|
|
LEFT JOIN institutions i ON i.province_id = p.id
|
|
GROUP BY p.id, p.iso_code, p.name
|
|
ORDER BY count DESC
|
|
""")
|
|
|
|
return {
|
|
"stats": [
|
|
{
|
|
"province_iso": row['iso_code'],
|
|
"province_name": row['province_name'],
|
|
"count": row['count'],
|
|
"avg_rating": float(row['avg_rating']) if row['avg_rating'] else None
|
|
}
|
|
for row in rows
|
|
]
|
|
}
|
|
|
|
|
|
# ============================================================================
|
|
# Heritage Type Classification Endpoint
|
|
# ============================================================================
|
|
|
|
# Response model for type classification
|
|
class TypeClassifyResponse(BaseModel):
    """Response model for heritage type classification.

    Returned by GET /types/classify. `type_code` and `type_name` are None
    when the input text is not recognized as heritage-relevant.
    """
    text: str = Field(description="Input text that was classified")
    is_heritage: bool = Field(description="Whether the text is heritage-relevant")
    # Single letter from the GLAMORCUBESFIXPHDNT scheme (see TYPE_CODE_NAMES)
    type_code: str | None = Field(description="GLAMORCUBESFIXPHDNT type code (e.g., 'M', 'A', 'L')")
    type_name: str | None = Field(description="Full type name (e.g., 'Museum', 'Archive')")
    # 'keyword', 'embedding:0.XXX' (with similarity score baked in), or 'none'
    method: str = Field(description="Classification method used: 'keyword', 'embedding:0.XXX', 'none'")
    confidence: float | None = Field(description="Similarity score for embedding-based classification")
    # Only populated when the caller passes include_scores=true
    all_scores: dict | None = Field(default=None, description="Scores for all types (if requested)")
|
|
|
|
|
|
# Type code to full name mapping
|
|
# Maps single-letter heritage type codes to human-readable names.
# Keys appear in mnemonic order: G-L-A-M-O-R-C-U-B-E-S-F-I-X-P-H-D-N-T
# ("GLAMORCUBESFIXPHDNT", 19 types — see /types/list).
TYPE_CODE_NAMES = {
    'G': 'Gallery',
    'L': 'Library',
    'A': 'Archive',
    'M': 'Museum',
    'O': 'Official Institution',
    'R': 'Research Center',
    'C': 'Corporation',
    'U': 'Unspecified',
    'B': 'Bio-custodian (Zoo/Botanical)',
    'E': 'Education Provider',
    'S': 'Society',
    'F': 'Feature (Monument/Site)',
    'I': 'Intangible Heritage',
    'X': 'Mixed',
    'P': 'Personal Collection',
    'H': 'Holy Site',
    'D': 'Digital Platform',
    'N': 'Non-profit',
    'T': 'Taste/Scent Heritage',
}
|
|
|
|
|
|
@app.get("/types/classify", response_model=TypeClassifyResponse)
|
|
async def classify_heritage_type(
|
|
text: str = Query(..., description="Text to classify (institution name, role, company)"),
|
|
method: str = Query("hybrid", description="Classification method: 'keyword', 'embedding', 'hybrid'"),
|
|
threshold: float = Query(0.40, ge=0.0, le=1.0, description="Similarity threshold for embedding"),
|
|
include_scores: bool = Query(False, description="Include all type scores in response")
|
|
):
|
|
"""
|
|
Classify text into GLAMORCUBESFIXPHDNT heritage type.
|
|
|
|
Supports three classification methods:
|
|
- **keyword**: Fast rule-based matching using predefined keyword lists
|
|
- **embedding**: Semantic similarity using pre-computed type prototype embeddings
|
|
- **hybrid** (default): Keywords first, embedding fallback
|
|
|
|
Types: G=Gallery, L=Library, A=Archive, M=Museum, O=Official, R=Research,
|
|
C=Corporation, U=Unspecified, B=Bio, E=Education, S=Society, F=Feature,
|
|
I=Intangible, X=Mixed, P=Personal, H=Holy, D=Digital, N=Non-profit, T=Taste
|
|
"""
|
|
if not text.strip():
|
|
raise HTTPException(status_code=400, detail="Text cannot be empty")
|
|
|
|
all_scores = None
|
|
confidence = None
|
|
|
|
if method == "keyword":
|
|
# Keyword-only classification
|
|
is_heritage, type_code = detect_heritage_type(text, "")
|
|
method_used = "keyword" if is_heritage else "none"
|
|
|
|
elif method == "embedding":
|
|
# Embedding-only classification
|
|
if include_scores:
|
|
type_code, score, all_scores = classify_type_embedding(
|
|
text, threshold=threshold, return_scores=True
|
|
)
|
|
else:
|
|
type_code, score, _ = classify_type_embedding(text, threshold=threshold)
|
|
|
|
is_heritage = type_code is not None
|
|
confidence = score
|
|
method_used = f"embedding:{score:.3f}" if is_heritage else "none"
|
|
|
|
else: # hybrid (default)
|
|
# Use hybrid detection
|
|
is_heritage, type_code, method_used = detect_heritage_type_hybrid(
|
|
text, "", embedding_threshold=threshold
|
|
)
|
|
|
|
# Extract confidence from method string if embedding was used
|
|
if method_used.startswith("embedding:"):
|
|
confidence = float(method_used.split(":")[1])
|
|
|
|
# Get all scores if requested and embedding path was used
|
|
if include_scores and (method_used.startswith("embedding") or method_used == "none"):
|
|
_, _, all_scores = classify_type_embedding(text, threshold=threshold, return_scores=True)
|
|
|
|
return TypeClassifyResponse(
|
|
text=text,
|
|
is_heritage=is_heritage,
|
|
type_code=type_code,
|
|
type_name=TYPE_CODE_NAMES.get(type_code) if type_code else None,
|
|
method=method_used,
|
|
confidence=confidence,
|
|
all_scores=all_scores
|
|
)
|
|
|
|
|
|
@app.get("/types/list")
|
|
async def list_heritage_types():
|
|
"""
|
|
List all GLAMORCUBESFIXPHDNT heritage types with descriptions.
|
|
"""
|
|
return {
|
|
"types": [
|
|
{"code": code, "name": name}
|
|
for code, name in TYPE_CODE_NAMES.items()
|
|
],
|
|
"mnemonic": "GLAMORCUBESFIXPHDNT",
|
|
"count": len(TYPE_CODE_NAMES)
|
|
}
|
|
|
|
|
|
# ============================================================================
|
|
# Optimized Loading Endpoints (Pagination, Viewport, Lite)
|
|
# ============================================================================
|
|
|
|
@app.get("/institutions/lite")
|
|
async def get_institutions_lite(
|
|
bbox: Optional[str] = Query(None, description="Bounding box: minLon,minLat,maxLon,maxLat"),
|
|
country: Optional[str] = Query(None, description="Filter by country code (e.g., NL, DE, JP)"),
|
|
type: Optional[str] = Query(None, description="Filter by institution type"),
|
|
limit: int = Query(100000, ge=1, le=200000, description="Maximum results")
|
|
):
|
|
"""
|
|
Get lightweight institution data for map markers.
|
|
Returns only essential fields (~5-10MB instead of ~126MB).
|
|
Use /institution/{ghcid} for full details on click.
|
|
"""
|
|
pool = await get_pool()
|
|
|
|
conditions = ["lat IS NOT NULL AND lon IS NOT NULL"]
|
|
params = []
|
|
param_count = 0
|
|
|
|
if bbox:
|
|
try:
|
|
min_lon, min_lat, max_lon, max_lat = map(float, bbox.split(','))
|
|
param_count += 4
|
|
conditions.append(f"""
|
|
lon >= ${param_count-3} AND lat >= ${param_count-2}
|
|
AND lon <= ${param_count-1} AND lat <= ${param_count}
|
|
""")
|
|
params.extend([min_lon, min_lat, max_lon, max_lat])
|
|
except ValueError:
|
|
raise HTTPException(status_code=400, detail="Invalid bbox format")
|
|
|
|
if country:
|
|
param_count += 1
|
|
conditions.append(f"country_code = ${param_count}")
|
|
params.append(country.upper())
|
|
|
|
if type:
|
|
param_count += 1
|
|
conditions.append(f"type = ${param_count}")
|
|
params.append(type.upper())
|
|
|
|
param_count += 1
|
|
where_clause = " AND ".join(conditions)
|
|
|
|
# Minimal fields for markers - dramatically reduces payload
|
|
query = f"""
|
|
SELECT
|
|
ghcid,
|
|
name,
|
|
type,
|
|
lon,
|
|
lat,
|
|
city,
|
|
country_code,
|
|
rating
|
|
FROM custodians
|
|
WHERE {where_clause}
|
|
ORDER BY name
|
|
LIMIT ${param_count}
|
|
"""
|
|
params.append(limit)
|
|
|
|
async with pool.acquire() as conn:
|
|
rows = await conn.fetch(query, *params)
|
|
|
|
features = []
|
|
for row in rows:
|
|
features.append({
|
|
"type": "Feature",
|
|
"geometry": {
|
|
"type": "Point",
|
|
"coordinates": [float(row['lon']), float(row['lat'])]
|
|
},
|
|
"properties": {
|
|
"ghcid": row['ghcid'],
|
|
"name": row['name'],
|
|
"type": row['type'],
|
|
"city": row['city'],
|
|
"country_code": row['country_code'],
|
|
"rating": float(row['rating']) if row['rating'] else None,
|
|
}
|
|
})
|
|
|
|
return {
|
|
"type": "FeatureCollection",
|
|
"features": features,
|
|
"metadata": {
|
|
"count": len(features),
|
|
"mode": "lite",
|
|
"filters": {"bbox": bbox, "country": country, "type": type}
|
|
}
|
|
}
|
|
|
|
|
|
@app.get("/institutions/page")
|
|
async def get_institutions_paginated(
|
|
page: int = Query(1, ge=1, description="Page number (1-indexed)"),
|
|
page_size: int = Query(1000, ge=100, le=5000, description="Items per page"),
|
|
country: Optional[str] = Query(None, description="Filter by country code"),
|
|
type: Optional[str] = Query(None, description="Filter by institution type"),
|
|
sort_by: str = Query("name", description="Sort field: name, city, rating, type"),
|
|
sort_order: str = Query("asc", description="Sort order: asc, desc"),
|
|
):
|
|
"""
|
|
Get paginated institutions with full metadata.
|
|
~1MB per page instead of ~126MB full download.
|
|
Supports cursor-based iteration for large datasets.
|
|
"""
|
|
pool = await get_pool()
|
|
|
|
# Validate sort parameters
|
|
valid_sort_fields = {"name", "city", "rating", "type", "country_code"}
|
|
if sort_by not in valid_sort_fields:
|
|
sort_by = "name"
|
|
sort_direction = "DESC" if sort_order.lower() == "desc" else "ASC"
|
|
|
|
conditions = ["lat IS NOT NULL AND lon IS NOT NULL"]
|
|
params = []
|
|
param_count = 0
|
|
|
|
if country:
|
|
param_count += 1
|
|
conditions.append(f"country_code = ${param_count}")
|
|
params.append(country.upper())
|
|
|
|
if type:
|
|
param_count += 1
|
|
conditions.append(f"type = ${param_count}")
|
|
params.append(type.upper())
|
|
|
|
where_clause = " AND ".join(conditions)
|
|
|
|
# Get total count for pagination metadata
|
|
count_query = f"SELECT COUNT(*) FROM custodians WHERE {where_clause}"
|
|
|
|
# Calculate offset
|
|
offset = (page - 1) * page_size
|
|
param_count += 1
|
|
limit_param = param_count
|
|
param_count += 1
|
|
offset_param = param_count
|
|
|
|
# Full metadata query with pagination
|
|
query = f"""
|
|
SELECT
|
|
ghcid, name, emic_name, type, type_name,
|
|
lon, lat, city, region as province, region_code as province_iso,
|
|
country_code, formatted_address, street_address, postal_code,
|
|
rating, total_ratings, wikidata_id, website, phone, email,
|
|
isil_code, google_place_id, description, opening_hours,
|
|
reviews, photos, photo_urls, business_status, street_view_url,
|
|
founding_year, dissolution_year, temporal_extent, museum_register,
|
|
youtube_channel_url, youtube_subscriber_count, youtube_video_count,
|
|
youtube_enrichment, social_facebook, social_twitter, social_instagram,
|
|
wikidata_label_en, wikidata_description_en
|
|
FROM custodians
|
|
WHERE {where_clause}
|
|
ORDER BY {sort_by} {sort_direction}
|
|
LIMIT ${limit_param} OFFSET ${offset_param}
|
|
"""
|
|
params.extend([page_size, offset])
|
|
|
|
async with pool.acquire() as conn:
|
|
total_count = await conn.fetchval(count_query, *params[:param_count-2] if params else [])
|
|
rows = await conn.fetch(query, *params)
|
|
|
|
total_pages = (total_count + page_size - 1) // page_size
|
|
|
|
features = []
|
|
for row in rows:
|
|
props = {
|
|
"ghcid": row['ghcid'],
|
|
"name": row['name'],
|
|
"emic_name": row['emic_name'],
|
|
"type": row['type'],
|
|
"type_name": row['type_name'],
|
|
"city": row['city'],
|
|
"province": row['province'],
|
|
"province_iso": row['province_iso'],
|
|
"country_code": row['country_code'],
|
|
"formatted_address": row['formatted_address'],
|
|
"rating": float(row['rating']) if row['rating'] else None,
|
|
"total_ratings": row['total_ratings'],
|
|
"wikidata_id": row['wikidata_id'],
|
|
"website": row['website'],
|
|
"phone": row['phone'],
|
|
"email": row['email'],
|
|
"isil_code": row['isil_code'],
|
|
"google_place_id": row['google_place_id'],
|
|
"description": row['description'],
|
|
"business_status": row['business_status'],
|
|
"street_view_url": row['street_view_url'],
|
|
"founding_year": row['founding_year'],
|
|
"dissolution_year": row['dissolution_year'],
|
|
}
|
|
|
|
# Add JSONB fields if present
|
|
if row['opening_hours']:
|
|
props["opening_hours"] = row['opening_hours']
|
|
if row['reviews']:
|
|
props["reviews"] = row['reviews']
|
|
if row['photos']:
|
|
props["photos"] = row['photos']
|
|
if row['photo_urls']:
|
|
props["photo_urls"] = row['photo_urls']
|
|
if row['temporal_extent']:
|
|
props["temporal_extent"] = row['temporal_extent']
|
|
if row['museum_register']:
|
|
props["museum_register"] = row['museum_register']
|
|
if row['youtube_enrichment']:
|
|
props["youtube_enrichment"] = row['youtube_enrichment']
|
|
elif row['youtube_channel_url']:
|
|
props["youtube"] = {
|
|
"channel_url": row['youtube_channel_url'],
|
|
"subscriber_count": row['youtube_subscriber_count'],
|
|
"video_count": row['youtube_video_count'],
|
|
}
|
|
|
|
social = {}
|
|
if row['social_facebook']:
|
|
social['facebook'] = row['social_facebook']
|
|
if row['social_twitter']:
|
|
social['twitter'] = row['social_twitter']
|
|
if row['social_instagram']:
|
|
social['instagram'] = row['social_instagram']
|
|
if social:
|
|
props["social_media"] = social
|
|
|
|
if row['wikidata_label_en']:
|
|
props["wikidata_label"] = row['wikidata_label_en']
|
|
if row['wikidata_description_en']:
|
|
props["wikidata_description"] = row['wikidata_description_en']
|
|
|
|
features.append({
|
|
"type": "Feature",
|
|
"geometry": {
|
|
"type": "Point",
|
|
"coordinates": [float(row['lon']), float(row['lat'])]
|
|
},
|
|
"properties": props
|
|
})
|
|
|
|
return {
|
|
"type": "FeatureCollection",
|
|
"features": features,
|
|
"pagination": {
|
|
"page": page,
|
|
"page_size": page_size,
|
|
"total_count": total_count,
|
|
"total_pages": total_pages,
|
|
"has_next": page < total_pages,
|
|
"has_prev": page > 1,
|
|
"next_page": page + 1 if page < total_pages else None,
|
|
"prev_page": page - 1 if page > 1 else None,
|
|
},
|
|
"metadata": {
|
|
"mode": "paginated",
|
|
"filters": {"country": country, "type": type},
|
|
"sort": {"field": sort_by, "order": sort_order}
|
|
}
|
|
}
|
|
|
|
|
|
@app.get("/institutions/viewport")
|
|
async def get_institutions_viewport(
|
|
bbox: str = Query(..., description="Bounding box: minLon,minLat,maxLon,maxLat (REQUIRED)"),
|
|
zoom: int = Query(10, ge=1, le=20, description="Map zoom level (affects detail)"),
|
|
country: Optional[str] = Query(None, description="Filter by country code"),
|
|
type: Optional[str] = Query(None, description="Filter by institution type"),
|
|
limit: int = Query(2000, ge=100, le=10000, description="Maximum results for viewport"),
|
|
):
|
|
"""
|
|
Get institutions visible in current map viewport.
|
|
Returns lite data at low zoom, full data at high zoom.
|
|
Optimized for map pan/zoom interactions.
|
|
"""
|
|
pool = await get_pool()
|
|
|
|
# Parse bbox
|
|
try:
|
|
min_lon, min_lat, max_lon, max_lat = map(float, bbox.split(','))
|
|
except ValueError:
|
|
raise HTTPException(status_code=400, detail="Invalid bbox format. Use: minLon,minLat,maxLon,maxLat")
|
|
|
|
conditions = [
|
|
"lat IS NOT NULL AND lon IS NOT NULL",
|
|
f"lon >= $1 AND lat >= $2 AND lon <= $3 AND lat <= $4"
|
|
]
|
|
params = [min_lon, min_lat, max_lon, max_lat]
|
|
param_count = 4
|
|
|
|
if country:
|
|
param_count += 1
|
|
conditions.append(f"country_code = ${param_count}")
|
|
params.append(country.upper())
|
|
|
|
if type:
|
|
param_count += 1
|
|
conditions.append(f"type = ${param_count}")
|
|
params.append(type.upper())
|
|
|
|
where_clause = " AND ".join(conditions)
|
|
|
|
# Adaptive detail based on zoom level
|
|
# Low zoom (world/continent view): minimal fields
|
|
# Medium zoom (country view): basic fields
|
|
# High zoom (city view): full fields
|
|
if zoom <= 6:
|
|
# World/continent view - just markers
|
|
select_fields = "ghcid, name, type, lon, lat, country_code"
|
|
detail_level = "minimal"
|
|
elif zoom <= 10:
|
|
# Country view - basic info
|
|
select_fields = "ghcid, name, type, type_name, lon, lat, city, country_code, rating"
|
|
detail_level = "basic"
|
|
elif zoom <= 14:
|
|
# Region view - moderate detail
|
|
select_fields = """
|
|
ghcid, name, emic_name, type, type_name, lon, lat, city,
|
|
region as province, country_code, rating, total_ratings,
|
|
website, wikidata_id, description
|
|
"""
|
|
detail_level = "moderate"
|
|
else:
|
|
# City/street view - full detail
|
|
select_fields = """
|
|
ghcid, name, emic_name, type, type_name, lon, lat, city,
|
|
region as province, region_code as province_iso, country_code,
|
|
formatted_address, street_address, postal_code, rating, total_ratings,
|
|
wikidata_id, website, phone, email, isil_code, google_place_id,
|
|
description, opening_hours, reviews, photos, photo_urls,
|
|
business_status, street_view_url, founding_year, dissolution_year,
|
|
social_facebook, social_twitter, social_instagram,
|
|
wikidata_label_en, wikidata_description_en
|
|
"""
|
|
detail_level = "full"
|
|
|
|
param_count += 1
|
|
query = f"""
|
|
SELECT {select_fields}
|
|
FROM custodians
|
|
WHERE {where_clause}
|
|
ORDER BY rating DESC NULLS LAST, name
|
|
LIMIT ${param_count}
|
|
"""
|
|
params.append(limit)
|
|
|
|
async with pool.acquire() as conn:
|
|
rows = await conn.fetch(query, *params)
|
|
|
|
features = []
|
|
for row in rows:
|
|
row_dict = dict(row)
|
|
props = {"ghcid": row_dict['ghcid'], "name": row_dict['name'], "type": row_dict['type']}
|
|
|
|
# Add fields based on detail level
|
|
if 'type_name' in row_dict:
|
|
props['type_name'] = row_dict['type_name']
|
|
if 'city' in row_dict:
|
|
props['city'] = row_dict['city']
|
|
if 'country_code' in row_dict:
|
|
props['country_code'] = row_dict['country_code']
|
|
if 'rating' in row_dict and row_dict['rating']:
|
|
props['rating'] = float(row_dict['rating'])
|
|
if 'total_ratings' in row_dict:
|
|
props['total_ratings'] = row_dict['total_ratings']
|
|
if 'province' in row_dict:
|
|
props['province'] = row_dict['province']
|
|
if 'province_iso' in row_dict:
|
|
props['province_iso'] = row_dict['province_iso']
|
|
if 'emic_name' in row_dict:
|
|
props['emic_name'] = row_dict['emic_name']
|
|
if 'website' in row_dict:
|
|
props['website'] = row_dict['website']
|
|
if 'wikidata_id' in row_dict:
|
|
props['wikidata_id'] = row_dict['wikidata_id']
|
|
if 'description' in row_dict:
|
|
props['description'] = row_dict['description']
|
|
if 'formatted_address' in row_dict:
|
|
props['formatted_address'] = row_dict['formatted_address']
|
|
if 'street_address' in row_dict:
|
|
props['street_address'] = row_dict['street_address']
|
|
if 'postal_code' in row_dict:
|
|
props['postal_code'] = row_dict['postal_code']
|
|
if 'phone' in row_dict:
|
|
props['phone'] = row_dict['phone']
|
|
if 'email' in row_dict:
|
|
props['email'] = row_dict['email']
|
|
if 'isil_code' in row_dict:
|
|
props['isil_code'] = row_dict['isil_code']
|
|
if 'google_place_id' in row_dict:
|
|
props['google_place_id'] = row_dict['google_place_id']
|
|
if 'business_status' in row_dict:
|
|
props['business_status'] = row_dict['business_status']
|
|
if 'street_view_url' in row_dict:
|
|
props['street_view_url'] = row_dict['street_view_url']
|
|
if 'founding_year' in row_dict:
|
|
props['founding_year'] = row_dict['founding_year']
|
|
if 'dissolution_year' in row_dict:
|
|
props['dissolution_year'] = row_dict['dissolution_year']
|
|
|
|
# JSONB fields at full detail
|
|
if 'opening_hours' in row_dict and row_dict['opening_hours']:
|
|
props['opening_hours'] = row_dict['opening_hours']
|
|
if 'reviews' in row_dict and row_dict['reviews']:
|
|
props['reviews'] = row_dict['reviews']
|
|
if 'photos' in row_dict and row_dict['photos']:
|
|
props['photos'] = row_dict['photos']
|
|
if 'photo_urls' in row_dict and row_dict['photo_urls']:
|
|
props['photo_urls'] = row_dict['photo_urls']
|
|
|
|
# Social media at full detail
|
|
social = {}
|
|
if 'social_facebook' in row_dict and row_dict['social_facebook']:
|
|
social['facebook'] = row_dict['social_facebook']
|
|
if 'social_twitter' in row_dict and row_dict['social_twitter']:
|
|
social['twitter'] = row_dict['social_twitter']
|
|
if 'social_instagram' in row_dict and row_dict['social_instagram']:
|
|
social['instagram'] = row_dict['social_instagram']
|
|
if social:
|
|
props['social_media'] = social
|
|
|
|
if 'wikidata_label_en' in row_dict and row_dict['wikidata_label_en']:
|
|
props['wikidata_label'] = row_dict['wikidata_label_en']
|
|
if 'wikidata_description_en' in row_dict and row_dict['wikidata_description_en']:
|
|
props['wikidata_description'] = row_dict['wikidata_description_en']
|
|
|
|
features.append({
|
|
"type": "Feature",
|
|
"geometry": {
|
|
"type": "Point",
|
|
"coordinates": [float(row_dict['lon']), float(row_dict['lat'])]
|
|
},
|
|
"properties": props
|
|
})
|
|
|
|
return {
|
|
"type": "FeatureCollection",
|
|
"features": features,
|
|
"metadata": {
|
|
"count": len(features),
|
|
"mode": "viewport",
|
|
"detail_level": detail_level,
|
|
"zoom": zoom,
|
|
"bbox": {"min_lon": min_lon, "min_lat": min_lat, "max_lon": max_lon, "max_lat": max_lat},
|
|
"filters": {"country": country, "type": type},
|
|
"limit": limit,
|
|
"truncated": len(features) >= limit
|
|
}
|
|
}
|
|
|
|
|
|
@app.get("/institutions/cluster")
|
|
async def get_institutions_clustered(
|
|
bbox: str = Query(..., description="Bounding box: minLon,minLat,maxLon,maxLat"),
|
|
zoom: int = Query(5, ge=1, le=20, description="Map zoom level"),
|
|
grid_size: Optional[float] = Query(None, description="Grid cell size in degrees (auto if not set)"),
|
|
):
|
|
"""
|
|
Get clustered institution counts for overview maps.
|
|
Returns grid cells with counts instead of individual markers.
|
|
Ideal for world/country views with 10k+ institutions.
|
|
"""
|
|
pool = await get_pool()
|
|
|
|
try:
|
|
min_lon, min_lat, max_lon, max_lat = map(float, bbox.split(','))
|
|
except ValueError:
|
|
raise HTTPException(status_code=400, detail="Invalid bbox format")
|
|
|
|
# Auto-calculate grid size based on zoom
|
|
# Higher zoom = smaller grid cells = more detail
|
|
if grid_size is None:
|
|
if zoom <= 3:
|
|
grid_size = 10.0 # ~1000km cells
|
|
elif zoom <= 5:
|
|
grid_size = 5.0 # ~500km cells
|
|
elif zoom <= 7:
|
|
grid_size = 2.0 # ~200km cells
|
|
elif zoom <= 9:
|
|
grid_size = 1.0 # ~100km cells
|
|
elif zoom <= 11:
|
|
grid_size = 0.5 # ~50km cells
|
|
elif zoom <= 13:
|
|
grid_size = 0.2 # ~20km cells
|
|
else:
|
|
grid_size = 0.1 # ~10km cells
|
|
|
|
# Use PostgreSQL to cluster into grid cells
|
|
query = """
|
|
SELECT
|
|
FLOOR(lon / $5) * $5 + $5/2 as cell_lon,
|
|
FLOOR(lat / $5) * $5 + $5/2 as cell_lat,
|
|
COUNT(*) as count,
|
|
array_agg(DISTINCT type) as types,
|
|
AVG(rating) as avg_rating
|
|
FROM custodians
|
|
WHERE lat IS NOT NULL AND lon IS NOT NULL
|
|
AND lon >= $1 AND lat >= $2 AND lon <= $3 AND lat <= $4
|
|
GROUP BY FLOOR(lon / $5), FLOOR(lat / $5)
|
|
ORDER BY count DESC
|
|
"""
|
|
|
|
async with pool.acquire() as conn:
|
|
rows = await conn.fetch(query, min_lon, min_lat, max_lon, max_lat, grid_size)
|
|
|
|
features = []
|
|
total_count = 0
|
|
for row in rows:
|
|
count = row['count']
|
|
total_count += count
|
|
|
|
features.append({
|
|
"type": "Feature",
|
|
"geometry": {
|
|
"type": "Point",
|
|
"coordinates": [float(row['cell_lon']), float(row['cell_lat'])]
|
|
},
|
|
"properties": {
|
|
"cluster": True,
|
|
"count": count,
|
|
"types": row['types'],
|
|
"avg_rating": round(float(row['avg_rating']), 2) if row['avg_rating'] else None,
|
|
}
|
|
})
|
|
|
|
return {
|
|
"type": "FeatureCollection",
|
|
"features": features,
|
|
"metadata": {
|
|
"mode": "clustered",
|
|
"cluster_count": len(features),
|
|
"total_institutions": total_count,
|
|
"grid_size": grid_size,
|
|
"zoom": zoom,
|
|
"bbox": {"min_lon": min_lon, "min_lat": min_lat, "max_lon": max_lon, "max_lat": max_lat}
|
|
}
|
|
}
|
|
|
|
|
|
# ============================================================================
|
|
# Person Endpoints (Beschermers)
|
|
# ============================================================================
|
|
|
|
@app.get("/persons", response_model=Dict[str, Any])
|
|
async def list_persons(
|
|
custodian_slug: Optional[str] = Query(None, description="Filter by custodian slug"),
|
|
heritage_type: Optional[str] = Query(None, description="Filter by heritage type (A, L, M, etc.)"),
|
|
country_code: Optional[str] = Query(None, description="Filter by country code"),
|
|
heritage_relevant: Optional[bool] = Query(None, description="Filter by heritage relevance (true/false)"),
|
|
limit: int = Query(50, ge=1, le=500, description="Max results to return"),
|
|
offset: int = Query(0, ge=0, description="Offset for pagination"),
|
|
):
|
|
"""
|
|
List persons with optional filters.
|
|
Returns paginated list of heritage professionals (beschermers).
|
|
"""
|
|
pool = await get_pool()
|
|
|
|
# Build query with optional filters
|
|
conditions = []
|
|
params = []
|
|
param_idx = 1
|
|
|
|
if custodian_slug:
|
|
conditions.append(f"custodian_slug = ${param_idx}")
|
|
params.append(custodian_slug)
|
|
param_idx += 1
|
|
|
|
if heritage_type:
|
|
conditions.append(f"${param_idx} = ANY(heritage_types)")
|
|
params.append(heritage_type)
|
|
param_idx += 1
|
|
|
|
if country_code:
|
|
conditions.append(f"country_code = ${param_idx}")
|
|
params.append(country_code)
|
|
param_idx += 1
|
|
|
|
if heritage_relevant is not None:
|
|
conditions.append(f"heritage_relevant = ${param_idx}")
|
|
params.append(heritage_relevant)
|
|
param_idx += 1
|
|
|
|
where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
|
|
|
|
# Add pagination params
|
|
params.extend([limit, offset])
|
|
|
|
query = f"""
|
|
SELECT staff_id, name, headline, location, country_code,
|
|
custodian_slug, custodian_name, linkedin_url, profile_image_url,
|
|
heritage_relevant, heritage_types
|
|
FROM persons
|
|
{where_clause}
|
|
ORDER BY name
|
|
LIMIT ${param_idx} OFFSET ${param_idx + 1}
|
|
"""
|
|
|
|
# Count query
|
|
count_query = f"SELECT COUNT(*) FROM persons {where_clause}"
|
|
|
|
async with pool.acquire() as conn:
|
|
rows = await conn.fetch(query, *params)
|
|
count_params = params[:-2] if params else [] # Remove limit/offset for count
|
|
total = await conn.fetchval(count_query, *count_params) if count_params else await conn.fetchval(count_query)
|
|
|
|
persons = []
|
|
for row in rows:
|
|
persons.append(PersonSummary(
|
|
staff_id=row['staff_id'],
|
|
name=row['name'],
|
|
headline=row['headline'],
|
|
location=row['location'],
|
|
country_code=row['country_code'],
|
|
custodian_slug=row['custodian_slug'],
|
|
custodian_name=row['custodian_name'],
|
|
linkedin_url=row['linkedin_url'],
|
|
profile_image_url=row['profile_image_url'],
|
|
heritage_relevant=row['heritage_relevant'] if row['heritage_relevant'] is not None else True,
|
|
heritage_types=row['heritage_types'] if row['heritage_types'] else [],
|
|
))
|
|
|
|
return {
|
|
"persons": [p.model_dump() for p in persons],
|
|
"total": total,
|
|
"limit": limit,
|
|
"offset": offset,
|
|
"has_more": offset + len(persons) < total,
|
|
}
|
|
|
|
|
|
@app.get("/persons/count")
|
|
async def get_persons_count():
|
|
"""Get total person count for stats display."""
|
|
pool = await get_pool()
|
|
|
|
async with pool.acquire() as conn:
|
|
total = await conn.fetchval("SELECT COUNT(*) FROM persons")
|
|
heritage_relevant = await conn.fetchval("SELECT COUNT(*) FROM persons WHERE heritage_relevant = true")
|
|
|
|
return {
|
|
"total": total,
|
|
"heritage_relevant": heritage_relevant,
|
|
}
|
|
|
|
|
|
@app.get("/persons/search", response_model=Dict[str, Any])
async def search_persons(
    q: str = Query(..., min_length=2, description="Search query"),
    limit: int = Query(20, ge=1, le=100, description="Max results"),
):
    """
    Search persons by name, headline, or custodian name.

    Uses case-insensitive ILIKE substring matching; rows whose name starts
    with the query are ranked first, then alphabetically by name.

    Args:
        q: Search text (min 2 chars). LIKE wildcards in the input are
           escaped so they are matched literally.
        limit: Maximum number of results (1-100).

    Returns:
        Dict with "persons" (PersonSummary dicts), "count", and the
        original "query" echoed back.
    """
    pool = await get_pool()

    # Escape LIKE wildcards so user input matches literally; PostgreSQL's
    # default LIKE escape character is backslash, so escape that first.
    escaped_q = q.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    search_pattern = f"%{escaped_q}%"

    query = """
        SELECT staff_id, name, headline, location, country_code,
               custodian_slug, custodian_name, linkedin_url, profile_image_url,
               heritage_relevant, heritage_types
        FROM persons
        WHERE name ILIKE $1
           OR headline ILIKE $1
           OR custodian_name ILIKE $1
        ORDER BY
            CASE WHEN name ILIKE $2 THEN 0 ELSE 1 END,
            name
        LIMIT $3
    """

    async with pool.acquire() as conn:
        # $2 is the prefix pattern used purely for ranking exact-prefix hits first.
        rows = await conn.fetch(query, search_pattern, f"{escaped_q}%", limit)

    persons = []
    for row in rows:
        persons.append(PersonSummary(
            staff_id=row['staff_id'],
            name=row['name'],
            headline=row['headline'],
            location=row['location'],
            country_code=row['country_code'],
            custodian_slug=row['custodian_slug'],
            custodian_name=row['custodian_name'],
            linkedin_url=row['linkedin_url'],
            profile_image_url=row['profile_image_url'],
            # Unclassified rows (NULL flag) default to heritage-relevant.
            heritage_relevant=row['heritage_relevant'] if row['heritage_relevant'] is not None else True,
            heritage_types=row['heritage_types'] if row['heritage_types'] else [],
        ))

    return {
        "persons": [p.model_dump() for p in persons],
        "count": len(persons),
        "query": q,
    }
|
|
|
|
|
|
@app.get("/person/{staff_id}", response_model=PersonDetail)
async def get_person(staff_id: str):
    """Fetch the full profile for one person; 404 when the id is unknown."""
    pool = await get_pool()

    query = """
        SELECT staff_id, name, headline, location, country_code,
               custodian_slug, custodian_name, linkedin_url, profile_image_url,
               heritage_relevant, heritage_types, experience, education,
               skills, languages, about, connections,
               extraction_date, extraction_method, source_file
        FROM persons
        WHERE staff_id = $1
    """

    async with pool.acquire() as conn:
        record = await conn.fetchrow(query, staff_id)

    if not record:
        raise HTTPException(status_code=404, detail=f"Person not found: {staff_id}")

    relevant_flag = record['heritage_relevant']
    extracted_at = record['extraction_date']

    return PersonDetail(
        staff_id=record['staff_id'],
        name=record['name'],
        headline=record['headline'],
        location=record['location'],
        country_code=record['country_code'],
        custodian_slug=record['custodian_slug'],
        custodian_name=record['custodian_name'],
        linkedin_url=record['linkedin_url'],
        profile_image_url=record['profile_image_url'],
        # Unclassified profiles (NULL flag) default to heritage-relevant.
        heritage_relevant=True if relevant_flag is None else relevant_flag,
        heritage_types=parse_jsonb_list(record['heritage_types']),
        experience=enrich_experience_with_heritage(parse_jsonb_list(record['experience'])),
        education=parse_jsonb_list(record['education']),
        skills=parse_jsonb_list(record['skills']),
        languages=parse_jsonb_list(record['languages']),
        about=record['about'],
        connections=record['connections'],
        extraction_date=extracted_at.isoformat() if extracted_at else None,
        extraction_method=record['extraction_method'],
        source_file=record['source_file'],
    )
|
|
|
|
|
|
# ============================================================================
|
|
# Image Proxy (Avoid Hotlinking Issues)
|
|
# ============================================================================
|
|
|
|
# In-memory cache for proxied images (simple TTL-based).
# Keyed by the md5 hex digest of the source URL; values are
# (raw image bytes, Content-Type header value, fetch timestamp in unix seconds).
_image_cache: Dict[str, tuple] = {}  # hash -> (content, content_type, timestamp)
IMAGE_CACHE_TTL = 3600  # 1 hour

# Exact-match allow-list of image hosts, consulted by is_allowed_image_url()
# (which additionally allows whole TLDs such as .nl / .org / .museum).
ALLOWED_IMAGE_DOMAINS = {
    # Google Maps
    'lh3.googleusercontent.com',
    'lh4.googleusercontent.com',
    'lh5.googleusercontent.com',
    'lh6.googleusercontent.com',
    'maps.gstatic.com',
    'maps.googleapis.com',
    # Wikidata/Wikimedia
    'upload.wikimedia.org',
    'commons.wikimedia.org',
    # Institution domains (add as needed)
    # Generic patterns handled in is_allowed_image_url() below
}
|
|
|
|
|
|
def is_allowed_image_url(url: str) -> bool:
    """Check if URL is from an allowed domain for proxying.

    Policy:
    - any host equal to or under googleusercontent.com (Google photo CDN)
    - any host under .nl (Dutch institutions), .org (heritage orgs),
      or .museum
    - otherwise an exact match against ALLOWED_IMAGE_DOMAINS

    Returns False (never raises) for unparseable input.
    """
    try:
        # Use .hostname rather than .netloc so embedded credentials
        # ("user@host") or a port (":8080") cannot defeat the suffix checks.
        domain = (urlparse(url).hostname or '').lower()

        # Google user-content subdomains. Strict suffix match: the previous
        # substring test ('googleusercontent.com' in domain) also accepted
        # hostile hosts such as "evilgoogleusercontent.com" or
        # "googleusercontent.com.evil.net".
        if domain == 'googleusercontent.com' or domain.endswith('.googleusercontent.com'):
            return True

        # Broad TLD allowances: Dutch institutions (.nl), many heritage
        # institutions (.org), and the dedicated .museum TLD.
        if domain.endswith(('.nl', '.org', '.museum')):
            return True

        # Finally, check the curated exact-match allow-list.
        if domain in ALLOWED_IMAGE_DOMAINS:
            return True

        return False
    except Exception:
        return False
|
|
|
|
|
|
@app.get("/image-proxy")
async def proxy_image(url: str = Query(..., description="Image URL to proxy")):
    """
    Proxy external images to avoid hotlinking issues.

    Many external servers block direct embedding (hotlinking) of their images.
    This endpoint fetches the image server-side and returns it with proper headers.

    Features:
    - Validates URL is from allowed domains (security)
    - Caches images in memory for 1 hour (performance)
    - Sets proper Content-Type headers
    - Avoids CORS issues

    Usage: /image-proxy?url=https://example.com/logo.png

    Raises:
        400: URL is not http(s), or upstream content is not an image.
        403: Host is outside the proxy allow-list.
        502/504: Upstream fetch failed or timed out.
    """
    # Security: validate scheme first, then the host allow-list.
    if not url or not url.startswith(('http://', 'https://')):
        raise HTTPException(status_code=400, detail="Invalid URL")

    if not is_allowed_image_url(url):
        raise HTTPException(status_code=403, detail="Domain not allowed for proxying")

    # Check cache. md5 is used only as a cache key here, not for security.
    url_hash = hashlib.md5(url.encode()).hexdigest()
    if url_hash in _image_cache:
        content, content_type, timestamp = _image_cache[url_hash]
        # Serve from memory only while within the TTL; stale entries are
        # simply overwritten by the fresh fetch below.
        if datetime.now().timestamp() - timestamp < IMAGE_CACHE_TTL:
            return Response(
                content=content,
                media_type=content_type,
                headers={
                    "Cache-Control": "public, max-age=3600",
                    "X-Proxy-Cache": "HIT",
                }
            )

    # Fetch image from the origin server.
    try:
        async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
            response = await client.get(
                url,
                headers={
                    # Spoof headers to avoid hotlink detection
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9,nl;q=0.8",
                    # Self-referer: pretend the image is embedded on its own site.
                    "Referer": urlparse(url).scheme + "://" + urlparse(url).netloc + "/",
                }
            )

            # NOTE: HTTPExceptions raised here are not caught by the httpx
            # handlers at the bottom — they propagate straight to FastAPI.
            if response.status_code != 200:
                raise HTTPException(status_code=502, detail=f"Failed to fetch image: {response.status_code}")

            content = response.content
            content_type = response.headers.get("content-type", "image/png")

            # Validate it's actually an image
            if not content_type.startswith("image/"):
                raise HTTPException(status_code=400, detail="URL does not point to an image")

            # Cache the result
            _image_cache[url_hash] = (content, content_type, datetime.now().timestamp())

            # Limit cache size (simple LRU-like cleanup): beyond 1000
            # entries, evict the 500 oldest by fetch timestamp.
            if len(_image_cache) > 1000:
                # Remove oldest entries
                sorted_entries = sorted(_image_cache.items(), key=lambda x: x[1][2])
                for key, _ in sorted_entries[:500]:
                    del _image_cache[key]

            return Response(
                content=content,
                media_type=content_type,
                headers={
                    "Cache-Control": "public, max-age=3600",
                    "X-Proxy-Cache": "MISS",
                }
            )

    except httpx.TimeoutException:
        raise HTTPException(status_code=504, detail="Timeout fetching image")
    except httpx.RequestError as e:
        raise HTTPException(status_code=502, detail=f"Error fetching image: {str(e)}")
|
|
|
|
|
|
# ============================================================================
|
|
# GeoNames Place Lookup (for RAG geospatial filtering)
|
|
# ============================================================================
|
|
|
|
# In-memory LRU cache for place lookups.
# NOTE(review): the lru_cache further below actually keys on the tuple
# (name_lower, country, db_path), not on a "country:lowername" string.
# This dramatically speeds up repeated queries for the same places.
from functools import lru_cache
import time as time_module

# Cache statistics for monitoring: API-level hits/misses observed by
# /places/lookup, plus the unix timestamp when counting started.
_place_cache_stats = {"hits": 0, "misses": 0, "last_reset": time_module.time()}
|
|
|
|
|
|
def _calculate_bbox(lat: float, lon: float, population: int) -> list[float]:
|
|
"""Calculate bounding box based on population size."""
|
|
if population > 200000:
|
|
delta = 0.15 # ~17km for large cities
|
|
elif population > 50000:
|
|
delta = 0.08 # ~9km for medium cities
|
|
else:
|
|
delta = 0.04 # ~4.5km for small places
|
|
return [lon - delta, lat - delta, lon + delta, lat + delta]
|
|
|
|
|
|
# Dutch to English province name mapping for GeoNames compatibility.
# GeoNames stores English names, but users query in Dutch.
# Keys are lower-case; the caller (_cached_place_lookup_sync) replaces
# spaces with hyphens before lookup, and hyphen-less spellings are
# listed as well to catch both user styles.
DUTCH_PROVINCE_MAPPING = {
    "noord-holland": "North Holland",
    "zuid-holland": "South Holland",
    "noord-brabant": "North Brabant",
    "noordholland": "North Holland",
    "zuidholland": "South Holland",
    "noordbrabant": "North Brabant",
    # Belgian provinces (Dutch/Flemish names -> English)
    "oost-vlaanderen": "East Flanders",
    "west-vlaanderen": "West Flanders",
    "vlaams-brabant": "Flemish Brabant",
    "waals-brabant": "Walloon Brabant",
}
|
|
|
|
|
|
# LRU cache for place lookups - caches up to 500 places
|
|
# This avoids repeated SQLite queries for the same place
|
|
@lru_cache(maxsize=500)
def _cached_place_lookup_sync(name_lower: str, country: str, db_path: str) -> dict | None:
    """
    Synchronous cached place lookup using tiered search strategy.

    Tier 1: Exact match with Title Case (uses index, ~0.03ms)
    Tier 2: Case-insensitive COLLATE NOCASE (~2ms)
    Tier 3: Fuzzy match on alternate_names (~12ms)
    Tier 4: Province/region lookup using admin1_name

    Args:
        name_lower: Place name, already lower-cased/stripped by the caller.
        country: ISO 3166-1 alpha-2 country code (upper case).
        db_path: Filesystem path to the GeoNames SQLite database.

    Returns dict with place data or None if not found.

    NOTE: because of @lru_cache, negative results (None) are cached too,
    and the cache key includes db_path.
    """
    # Imported locally: this runs in a thread-pool executor and keeps the
    # blocking sqlite3 dependency out of the async module top level.
    import sqlite3

    # Normalize Dutch province names to English (GeoNames uses English);
    # spaces are converted to hyphens to match the mapping's key style.
    normalized_name = DUTCH_PROVINCE_MAPPING.get(name_lower.replace(" ", "-"), name_lower)

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row

    row = None

    try:
        # Tier 1: Exact match with Title Case (fastest - uses index)
        # Most place names from frontend are already normalized
        title_name = name_lower.title()
        cursor = conn.execute("""
            SELECT name, latitude, longitude, admin1_name, population
            FROM cities
            WHERE name = ? AND country_code = ?
            ORDER BY population DESC
            LIMIT 1
        """, (title_name, country))
        row = cursor.fetchone()

        if not row:
            # Tier 2: Case-insensitive match with COLLATE NOCASE
            cursor = conn.execute("""
                SELECT name, latitude, longitude, admin1_name, population
                FROM cities
                WHERE name = ? COLLATE NOCASE AND country_code = ?
                ORDER BY population DESC
                LIMIT 1
            """, (name_lower, country))
            row = cursor.fetchone()

        if not row:
            # Tier 3: Check alternate_names and ascii_name (slowest but comprehensive)
            # NOTE(review): the LIKE substring test can over-match names that
            # merely contain the query — ORDER BY population mitigates this
            # by preferring the largest place.
            cursor = conn.execute("""
                SELECT name, latitude, longitude, admin1_name, population
                FROM cities
                WHERE (
                    alternate_names LIKE '%' || ? || '%'
                    OR ascii_name = ? COLLATE NOCASE
                )
                AND country_code = ?
                ORDER BY population DESC
                LIMIT 1
            """, (name_lower, name_lower, country))
            row = cursor.fetchone()

        if row:
            # Fall back to a nominal 10k population when the column is NULL/0
            # so the bbox calculation still produces a sensible small box.
            pop = row["population"] or 10000
            lat = float(row["latitude"])
            lon = float(row["longitude"])

            return {
                "name": row["name"],
                "latitude": lat,
                "longitude": lon,
                "province": row["admin1_name"],
                "population": pop,
                "bbox": _calculate_bbox(lat, lon, pop),
                "country_code": country,
            }

        # Tier 4: Province/region lookup - search admin1_name column
        # For queries like "Noord-Holland", "North Holland", "Gelderland"
        # Use normalized_name which maps Dutch names to English (GeoNames uses English)
        province_search_names = [normalized_name]
        if normalized_name != name_lower:
            province_search_names.append(name_lower.title())  # Also try original

        for search_name in province_search_names:
            # Aggregate all cities of the province: centroid of city
            # coordinates, summed population, and min/max extent for the bbox.
            province_cursor = conn.execute("""
                SELECT admin1_name as province,
                       AVG(latitude) as center_lat,
                       AVG(longitude) as center_lon,
                       SUM(population) as total_pop,
                       MIN(latitude) as min_lat,
                       MAX(latitude) as max_lat,
                       MIN(longitude) as min_lon,
                       MAX(longitude) as max_lon
                FROM cities
                WHERE admin1_name = ? COLLATE NOCASE AND country_code = ?
                GROUP BY admin1_name
            """, (search_name, country))
            province_row = province_cursor.fetchone()

            if province_row and province_row["province"]:
                lat = float(province_row["center_lat"])
                lon = float(province_row["center_lon"])
                pop = province_row["total_pop"] or 1000000  # Province population

                # Calculate bbox from actual province extent (with ~0.1 degree padding)
                min_lat = float(province_row["min_lat"]) - 0.1
                max_lat = float(province_row["max_lat"]) + 0.1
                min_lon = float(province_row["min_lon"]) - 0.1
                max_lon = float(province_row["max_lon"]) + 0.1

                return {
                    "name": province_row["province"],
                    "latitude": lat,
                    "longitude": lon,
                    "province": province_row["province"],  # It IS the province
                    "population": pop,
                    "bbox": [min_lon, min_lat, max_lon, max_lat],
                    "country_code": country,
                }

        return None

    finally:
        # Always release the SQLite handle, whichever tier returned.
        conn.close()
|
|
|
|
|
|
@app.get("/places/lookup", response_model=PlaceLookupResponse)
async def lookup_place(
    name: str = Query(..., min_length=2, description="Place name to look up"),
    country: str = Query("NL", description="Country code filter (ISO 3166-1 alpha-2)"),
) -> PlaceLookupResponse:
    """
    Look up a place by name and return coordinates with bounding box.

    Used by RAG to resolve place names mentioned in user queries and apply
    geospatial filtering to Qdrant vector search results.

    OPTIMIZED with tiered lookup strategy:
    - Tier 1: Exact match (uses index, ~0.03ms)
    - Tier 2: Case-insensitive COLLATE NOCASE (~2ms)
    - Tier 3: Fuzzy match on alternate_names (~12ms)

    Results are cached in-memory (LRU cache, 500 entries) for instant
    repeated lookups.

    Bounding box is calculated based on population:
    - Large cities (>200k): +/- 0.15 degrees (~17km)
    - Medium cities (50k-200k): +/- 0.08 degrees (~9km)
    - Small places (<50k): +/- 0.04 degrees (~4.5km)

    Examples:
    - /places/lookup?name=Amsterdam -> Returns Amsterdam with large bbox
    - /places/lookup?name=Leiden -> Returns Leiden with medium bbox
    - /places/lookup?name=Aalten -> Returns Aalten with small bbox

    Raises:
        404: Place not found in the given country.
        500: GeoNames database missing or a database error occurred.
    """
    db_path = settings.geonames_db

    # Check if database exists
    if not Path(db_path).exists():
        raise HTTPException(
            status_code=500,
            detail=f"GeoNames database not found at {db_path}"
        )

    # Normalize name for cache key
    name_lower = name.lower().strip()

    # Snapshot LRU stats so we can tell afterwards whether this call was a hit.
    cache_info_before = _cached_place_lookup_sync.cache_info()

    try:
        # Run the blocking SQLite lookup in the default thread pool.
        # get_running_loop() is the supported way to reach the loop from
        # inside a coroutine; get_event_loop() here is deprecated since 3.10.
        result = await asyncio.get_running_loop().run_in_executor(
            None,
            _cached_place_lookup_sync,
            name_lower,
            country.upper(),
            db_path
        )

        # Update API-level cache stats (dict mutation — no `global` needed).
        cache_info_after = _cached_place_lookup_sync.cache_info()
        if cache_info_after.hits > cache_info_before.hits:
            _place_cache_stats["hits"] += 1
        else:
            _place_cache_stats["misses"] += 1

        if not result:
            raise HTTPException(
                status_code=404,
                detail=f"Place '{name}' not found in {country}"
            )

        return PlaceLookupResponse(
            name=result["name"],
            latitude=result["latitude"],
            longitude=result["longitude"],
            province=result["province"],
            population=result["population"],
            bbox=result["bbox"],
            country_code=result["country_code"],
        )

    except HTTPException:
        # Re-raise our own 404/500 untouched.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Database error: {str(e)}"
        )
|
|
|
|
|
|
@app.get("/places/cache-stats")
async def get_place_cache_stats() -> dict:
    """Report hit/miss statistics for the place-lookup LRU cache."""
    info = _cached_place_lookup_sync.cache_info()
    lookups = info.hits + info.misses
    return {
        "cache_size": info.currsize,
        "max_size": info.maxsize,
        "hits": info.hits,
        "misses": info.misses,
        # Guard against division by zero before any lookup has happened.
        "hit_rate": info.hits / lookups if lookups > 0 else 0,
        "api_hits": _place_cache_stats["hits"],
        "api_misses": _place_cache_stats["misses"],
    }
|
|
|
|
|
|
@app.get("/places/provinces")
async def list_provinces(
    country: str = Query("NL", description="Country code filter"),
) -> List[Dict[str, Any]]:
    """
    List all provinces for a country from GeoNames database.
    Useful for province-level geospatial filtering.

    Each entry carries the admin1 name/code, the centroid of its cities'
    coordinates, and the summed city population.

    Raises:
        500: GeoNames database missing or a database error occurred.
    """
    db_path = settings.geonames_db

    if not Path(db_path).exists():
        # Include the path, consistent with the /places/lookup error detail
        # (the previous message was an f-string with no placeholder).
        raise HTTPException(
            status_code=500,
            detail=f"GeoNames database not found at {db_path}"
        )

    try:
        async with aiosqlite.connect(db_path) as db:
            db.row_factory = aiosqlite.Row

            # Get unique provinces with centroid/population aggregates.
            cursor = await db.execute("""
                SELECT DISTINCT admin1_name as province,
                       admin1_code,
                       AVG(latitude) as center_lat,
                       AVG(longitude) as center_lon,
                       SUM(population) as total_pop
                FROM cities
                WHERE country_code = ? AND admin1_name IS NOT NULL
                GROUP BY admin1_name, admin1_code
                ORDER BY admin1_name
            """, (country,))

            rows = await cursor.fetchall()

            return [
                {
                    "name": row["province"],
                    "code": row["admin1_code"],
                    "center_lat": row["center_lat"],
                    "center_lon": row["center_lon"],
                    "total_population": row["total_pop"],
                }
                for row in rows
            ]

    except aiosqlite.Error as e:
        raise HTTPException(
            status_code=500,
            detail=f"Database error: {str(e)}"
        )
|
|
|
|
|
|
# ============================================================================
|
|
# Main
|
|
# ============================================================================
|
|
|
|
if __name__ == "__main__":
    # Local development entry point. In production the app is served
    # behind a Caddy reverse proxy (see module docstring); reload=True
    # enables auto-restart on code changes and is dev-only.
    import uvicorn
    uvicorn.run(
        "geo_api:app",
        host=settings.api_host,
        port=settings.api_port,
        reload=True,
    )
|