""" PostGIS Geo API for Heritage Custodian Map FastAPI backend providing spatial queries for bronhouder.nl map Mounted at /api/geo/ via Caddy reverse proxy. Endpoints: - GET / - Health check and geo statistics - GET /countries - Get all countries as GeoJSON with institution counts - GET /provinces - Get all provinces as GeoJSON - GET /municipalities - Get municipalities (with filters) - GET /institutions - Get institutions as GeoJSON (with bbox/type filters) - GET /institution/:ghcid - Get single institution details - GET /historical - Get historical boundaries - GET /search - Search institutions by name - GET /admin/point - Find admin unit for a point - GET /nearby - Find institutions near a point - GET /stats/by-type - Institution counts by type - GET /stats/by-province - Institution counts by province Person Endpoints (Beschermers): - GET /persons - List persons with filters (custodian, type, country) - GET /persons/count - Get total person count for stats - GET /persons/search - Search persons by name/headline/custodian - GET /person/:staff_id - Get single person details """ import os import asyncio import json from datetime import datetime from typing import Optional, List, Dict, Any from contextlib import asynccontextmanager from decimal import Decimal from fastapi import FastAPI, HTTPException, Query, APIRouter from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.gzip import GZipMiddleware from fastapi.responses import JSONResponse, Response from pydantic import BaseModel, Field import asyncpg import httpx import hashlib import aiosqlite from urllib.parse import urlparse from pathlib import Path # ============================================================================ # Configuration # ============================================================================ class GeoSettings(BaseModel): """PostGIS geo database settings - connects to glam_geo with PostGIS boundaries""" host: str = os.getenv("GEO_POSTGRES_HOST", "localhost") port: int = int(os.getenv("GEO_POSTGRES_PORT", "5432")) database: str = os.getenv("GEO_POSTGRES_DB", "glam_geo") # glam_geo has boundary data user: str = os.getenv("GEO_POSTGRES_USER", "glam_api") password: str = os.getenv("GEO_POSTGRES_PASSWORD", "") # Server settings api_host: str = os.getenv("GEO_API_HOST", "0.0.0.0") api_port: int = int(os.getenv("GEO_API_PORT", "8002")) # GeoNames database path (SQLite) geonames_db: str = os.getenv( "GEONAMES_DB_PATH", str(Path(__file__).parent.parent.parent / "data" / "reference" / "geonames.db") ) settings = GeoSettings() # ============================================================================ # Pydantic Models # ============================================================================ class GeoStatsResponse(BaseModel): """Geo database statistics""" status: str database: str provinces: int municipalities: int institutions: int historical_boundaries: int postgres_version: str class InstitutionDetail(BaseModel): """Detailed institution information""" ghcid: str name: str type: str type_name: Optional[str] lat: Optional[float] lon: Optional[float] address: Optional[str] city: Optional[str] province: Optional[str] website: Optional[str] phone: Optional[str] wikidata_id: Optional[str] rating: Optional[float] total_ratings: Optional[int] description: Optional[str] reviews: Optional[List[Dict]] genealogiewerkbalk: Optional[Dict] class AdminPoint(BaseModel): """Admin unit for a point""" province_code: Optional[str] province_name: Optional[str] municipality_code: Optional[str] municipality_name: Optional[str] class NearbyInstitution(BaseModel): """Institution with distance""" ghcid: str name: str type: str type_name: Optional[str] distance_km: float city: Optional[str] province: Optional[str] rating: Optional[float] class PersonSummary(BaseModel): """Summary person information for list views""" staff_id: str name: str headline: Optional[str] location: Optional[str] country_code: Optional[str] custodian_slug: Optional[str] custodian_name: Optional[str] linkedin_url: Optional[str] profile_image_url: Optional[str] heritage_relevant: bool = True heritage_types: List[str] = [] class PersonDetail(BaseModel): """Detailed person information""" staff_id: str name: str headline: Optional[str] location: Optional[str] country_code: Optional[str] custodian_slug: Optional[str] custodian_name: Optional[str] linkedin_url: Optional[str] profile_image_url: Optional[str] heritage_relevant: bool = True heritage_types: List[str] = [] experience: List[Dict] = [] education: List[Dict] = [] skills: List[str] = [] languages: List[Dict] = [] about: Optional[str] connections: Optional[str] extraction_date: Optional[str] extraction_method: Optional[str] source_file: Optional[str] class PlaceLookupResponse(BaseModel): """Response from GeoNames place lookup""" name: str latitude: float longitude: float province: Optional[str] = None population: Optional[int] = None bbox: List[float] = Field( description="Bounding box [minLon, minLat, maxLon, maxLat]" ) country_code: str = "NL" # ============================================================================ # Heritage Classification (copied from main.py for experience item classification) # ============================================================================ import re # Heritage type detection keywords for GLAMORCUBESFIXPHDNT taxonomy # Complete 19-type taxonomy extracted from Wikidata hyponyms + domain expertise # G=Gallery, L=Library, A=Archive, M=Museum, O=Official, R=Research, C=Corporation, # U=Unspecified, B=Bio-custodian, E=Education, S=Society, F=Feature, I=Intangible, # X=Mixed, P=Personal, H=Holy, D=Digital, N=Non-profit, T=Taste/Scent HERITAGE_KEYWORDS = { # G - Gallery: Art galleries, exhibition spaces, kunsthallen 'G': ['gallery', 'galerie', 'kunsthal', 'art dealer', 'art gallery', 'exhibition space', 'art center', 'art centre', 'art collection', 'art collective', 'art colony', 'kunstgalerie', 'kunstcentrum', 'tentoonstellingsruimte', 'alternative space', 'sculpture garden', 'beeldentuin', 'kunsthuis', 'artist-run'], # L - Library: Public, academic, and specialized libraries 'L': ['library', 'bibliotheek', 'bibliothek', 'librarian', 'bibliothecaris', 'KB ', 'national library', 'academic library', 'public library', 'openbare bibliotheek', 'university library', 'universiteitsbibliotheek', 'reading room', 'leeszaal', 'book collection', 'boekencollectie', 'mediatheek', 'mediatheque', 'lending library', 'special collections', 'bijzondere collecties', 'manuscript library'], # A - Archive: Government, corporate, and personal archives 'A': ['archive', 'archief', 'archivist', 'archivaris', 'archival', 'beeld en geluid', 'beeld & geluid', 'NISV', 'filmmuseum', 'eye film', 'EYE ', 'audiovisual', 'nationaal archief', 'stadsarchief', 'gemeentearchief', 'rijksarchief', 'NIOD', 'IISH', 'IISG', 'archiefspecialist', 'records management', 'documentbeheer', 'film archive', 'filmarchief', 'sound archive', 'geluidsarchief', 'photo archive', 'fotoarchief', 'regional archive', 'regionaal archief', 'provincial archive', 'provinciaal archief', 'family archive', 'familiearchief', 'ecclesiastical archive', 'kerkelijk archief', 'notarial archive', 'notarieel archief', 'depot'], # M - Museum: All museum types (art, history, science, etc.) 'M': ['museum', 'musea', 'curator', 'conservator', 'collection manager', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis', 'tropenmuseum', 'allard pierson', 'museale', 'collectiebeheerder', 'collectiespecialist', 'collectie', 'archaeological museum', 'archeologisch museum', 'natural history museum', 'natuurhistorisch museum', 'science museum', 'wetenschapsmuseum', 'art museum', 'kunstmuseum', 'history museum', 'historisch museum', 'maritime museum', 'maritiem museum', 'military museum', 'militair museum', 'open air museum', 'openluchtmuseum', 'heritage museum', 'erfgoedmuseum', 'volkskundemuseum', 'ethnographic museum'], # O - Official Institution: Government heritage agencies, platforms 'O': ['ministry', 'ministerie', 'government', 'overheid', 'gemeente', 'province', 'provincie', 'OCW', 'cultural ministry', 'heritage agency', 'erfgoedinstelling', 'rijksdienst', 'cultural heritage agency', 'RCE', 'rijksdienst cultureel erfgoed', 'heritage platform', 'erfgoedplatform', 'state archives', 'public records office', 'national heritage', 'monumentenzorg', 'heritage council', 'erfgoedraad'], # R - Research Center: Research institutes, documentation centers 'R': ['research', 'onderzoek', 'researcher', 'onderzoeker', 'KNAW', 'humanities cluster', 'NWO', 'documentatie', 'documentation', 'kenniscentrum', 'historicus', 'research institute', 'onderzoeksinstituut', 'documentation center', 'documentatiecentrum', 'scientific archive', 'wetenschappelijk archief', 'study center', 'studiecentrum', 'research library', 'onderzoeksbibliotheek', 'think tank', 'knowledge institute', 'kennisinstituut', 'meertens', 'huygens', 'NIOD', 'fryske akademy'], # C - Corporation: Corporate heritage collections, company archives 'C': ['corporate archive', 'bedrijfsarchief', 'company history', 'company museum', 'bedrijfsmuseum', 'industrial heritage', 'industrieel erfgoed', 'brand archive', 'merkarchief', 'corporate collection', 'bedrijfscollectie', 'factory museum', 'fabrieksmuseum', 'business archive', 'ondernemingsarchief'], # U - Unspecified: Data quality flag, no keywords (intentional) 'U': [], # B - Bio-custodian: Botanical gardens, zoos, aquariums, nature reserves 'B': ['botanical garden', 'botanische tuin', 'hortus', 'arboretum', 'zoo', 'dierentuin', 'aquarium', 'vivarium', 'terrarium', 'insectarium', 'butterfly house', 'vlindertuin', 'nature reserve', 'natuurreservaat', 'wildlife park', 'wildpark', 'safari park', 'bird sanctuary', 'vogelreservaat', 'marine park', 'oceanarium', 'dolphinarium', 'seed bank', 'zaadbank', 'herbarium', 'alpine garden', 'alpentuin', 'reptile house', 'reptielenhuis', 'animal park', 'dierenpark', 'nature museum', 'natuurmuseum', 'plant collection', 'plantencollectie', 'living collection', 'levende collectie'], # E - Education Provider: Universities, academies, training centers 'E': ['university', 'universiteit', 'professor', 'lecturer', 'docent', 'hogeschool', 'academy', 'academie', 'PhD', 'phd candidate', 'student', 'teacher', 'onderwijs', 'education', 'UvA', 'VU ', 'leiden university', 'reinwardt', 'film academy', 'graduate', 'assistant professor', 'associate professor', 'hoogleraar', 'educatie', 'educator', 'art school', 'kunstacademie', 'music conservatory', 'conservatorium', 'training center', 'opleidingscentrum', 'museum studies', 'archival science', 'archiefwetenschap', 'library science', 'bibliotheconomie', 'heritage studies'], # S - Society: Historical societies, collecting societies, associations 'S': ['society', 'vereniging', 'genootschap', 'historical society', 'historische vereniging', 'heemkundige kring', 'oudheidkundige kring', 'archaeological society', 'archeologische vereniging', 'numismatic society', 'numismatische vereniging', 'philatelic society', 'filatelistenvereniging', 'friends of', 'vrienden van', 'heritage association', 'erfgoedvereniging', 'collectors club', 'verzamelclub', 'antiquarian society', 'local history society', 'family history society', 'genealogische vereniging', 'alumni association', 'oud-ledenvereniging'], # F - Feature: Monuments, sculptures, heritage sites, landmarks 'F': ['monument', 'memorial', 'statue', 'sculpture', 'landmark', 'heritage site', 'erfgoedlocatie', 'archaeological site', 'archeologische vindplaats', 'castle', 'kasteel', 'manor', 'landgoed', 'mansion', 'landhuis', 'buitenplaats', 'palace', 'paleis', 'fortress', 'fort', 'vesting', 'ruins', 'ruïne', 'cemetery', 'begraafplaats', 'war memorial', 'oorlogsmonument', 'historic building', 'historisch gebouw', 'windmill', 'molen', 'industrial monument', 'industrieel monument', 'watermill', 'watermolen', 'lighthouse', 'vuurtoren', 'bunker', 'aqueduct', 'aquaduct'], # I - Intangible Heritage: Folklore, oral history, performing traditions 'I': ['intangible heritage', 'immaterieel erfgoed', 'oral history', 'oral tradition', 'folklore', 'volkscultuur', 'traditional music', 'traditionele muziek', 'folk dance', 'volksdans', 'ritual', 'ritueel', 'craft tradition', 'ambachtstraditie', 'storytelling', 'verteltraditie', 'carnival', 'carnaval', 'festival', 'feast', 'feest', 'procession', 'processie', 'dialect', 'streektaal', 'living heritage', 'levend erfgoed', 'performing arts', 'podiumkunsten', 'traditional craft', 'traditioneel ambacht', 'rederijkerskamer', 'chamber of rhetoric'], # X - Mixed: Umbrella organizations, federations, multi-type institutions 'X': ['umbrella organization', 'koepelorganisatie', 'federation', 'federatie', 'consortium', 'netwerk', 'network', 'platform', 'overkoepelende organisatie', 'dachverband', 'roof organization', 'combined museum and archive', 'gecombineerde instelling', 'multi-disciplinary', 'multidisciplinair'], # P - Personal Collection: Private collections, estates, personal archives 'P': ['personal collection', 'persoonlijke collectie', 'private collection', 'privécollectie', 'private archive', 'privéarchief', 'personal archive', 'persoonlijk archief', 'estate', 'nalatenschap', 'nachlass', 'literary estate', 'literaire nalatenschap', 'private library', 'privébibliotheek', 'home library', 'thuisbibliotheek', 'collector', 'verzamelaar', 'private museum', 'huismuseum'], # H - Holy Site: Religious heritage sites, sacred places with collections 'H': ['church', 'kerk', 'cathedral', 'kathedraal', 'monastery', 'klooster', 'abbey', 'abdij', 'temple', 'tempel', 'mosque', 'moskee', 'synagogue', 'synagoge', 'shrine', 'schrijn', 'chapel', 'kapel', 'pilgrimage', 'bedevaart', 'convent', 'religious heritage', 'religieus erfgoed', 'sacred site', 'heilige plaats', 'treasury', 'schatkamer', 'church archive', 'kerkarchief', 'parish records', 'parochieregister', 'diocesan', 'bisdom', 'ecclesiastical', 'kerkelijk'], # D - Digital Platform: Digital heritage platforms, repositories 'D': ['digital', 'digitaal', 'platform', 'software', 'IT ', 'tech', 'developer', 'engineer', 'data ', 'AI ', 'machine learning', 'digitalisering', 'datamanagement', 'data analist', 'digital archive', 'digitaal archief', 'digital library', 'digitale bibliotheek', 'digital repository', 'online collection', 'aggregator', 'portal', 'portaal', 'database', 'metadata', 'linked data', 'semantic web', 'digitization', 'digitalisatie', 'digital preservation', 'digitale conservering', 'web archive', 'webarchief', 'born digital'], # N - Non-profit: Heritage NGOs, advocacy groups, foundations 'N': ['foundation', 'stichting', 'non-profit', 'NGO', 'charity', 'trust', 'advocacy', 'belangenbehartiging', 'heritage foundation', 'erfgoedstichting', 'cultural foundation', 'culturele stichting', 'preservation society', 'monumentenstichting', 'heritage trust', 'conservation trust', 'friends foundation', 'vriendenstichting', 'support organization', 'steunorganisatie', 'ANBI', 'algemeen nut beogende instelling', 'cultural council', 'cultuurfonds'], # T - Taste/Scent Heritage: Culinary, brewing, olfactory heritage 'T': ['brewery', 'brouwerij', 'distillery', 'distilleerderij', 'winery', 'wijnhuis', 'chocolate factory', 'chocoladefabriek', 'culinary heritage', 'culinair erfgoed', 'food museum', 'voedselmuseum', 'perfumery', 'parfumerie', 'spice museum', 'specerijmuseum', 'coffee museum', 'koffiemuseum', 'tea museum', 'theemuseum', 'cheese museum', 'kaasmuseum', 'wine museum', 'wijnmuseum', 'beer museum', 'biermuseum', 'gastronomy', 'gastronomie', 'traditional recipe', 'historic recipe', 'jenever', 'genever', 'liqueur', 'likeur'], } NON_HERITAGE_KEYWORDS = [ 'marketing', 'sales', 'HR ', 'human resources', 'recruiter', 'finance', 'accounting', 'legal', 'lawyer', 'advocaat', 'consultant', 'coach', 'therapy', 'health', 'medical', 'food', 'restaurant', 'retail', 'fashion', 'real estate', 'insurance', 'banking', 'investment', 'e-commerce', 'organiser', 'opruimhulp', 'verpleeg', 'nurse' ] # Organizations that are explicitly NOT heritage institutions NON_HERITAGE_ORGANIZATIONS = [ # Banks & Financial 'ing ', 'ing nederland', 'rabobank', 'abn amro', 'postbank', 'triodos', # Security companies 'i-sec', 'g4s', 'securitas', 'trigion', 'chubb', # Police/Government (non-cultural) 'politie', 'police', 'rijkswaterstaat', 'belastingdienst', 'douane', 'defensie', # Political parties 'vvd', 'pvda', 'cda', 'd66', 'groenlinks', 'pvv', 'bbb', 'nsc', 'volt', 'sp ', 'forum voor democratie', 'ja21', 'bij1', 'denk', 'sgp', 'cu ', # Tech companies (non-heritage) 'google', 'microsoft', 'amazon', 'meta', 'facebook', 'apple', 'netflix', 'uber', 'airbnb', 'booking.com', 'adyen', 'mollie', 'messagebird', 'coolblue', 'bol.com', 'picnic', 'takeaway', 'just eat', # Telecom 'kpn', 'vodafone', 't-mobile', 'ziggo', # Postal / Logistics 'postnl', 'postkantoren', 'dhl', 'ups', 'fedex', # Healthcare 'ziekenhuis', 'hospital', 'ggz', 'ggd', 'thuiszorg', # Retail 'albert heijn', 'jumbo', 'lidl', 'aldi', 'ikea', 'hema', 'action', # Consulting / Professional services 'deloitte', 'kpmg', 'pwc', 'ey ', 'ernst & young', 'mckinsey', 'bcg', 'accenture', 'capgemini', 'ordina', 'atos', 'cgi ', # Recruitment / HR 'randstad', 'tempo-team', 'manpower', 'hays', 'brunel', # Energy / Utilities 'shell', 'bp ', 'eneco', 'vattenfall', 'essent', 'nuon', # Transport 'ns ', 'prorail', 'schiphol', 'klm', 'transavia', # Other 'freelance', 'zelfstandig', 'zzp', 'eigen bedrijf', ] # Heritage organization keywords - organizations that ARE heritage institutions HERITAGE_ORGANIZATION_KEYWORDS = [ # Archives 'archief', 'archive', 'nationaal archief', 'stadsarchief', 'regionaal archief', 'beeld en geluid', 'beeld & geluid', 'niod', 'iish', 'iisg', # Museums 'museum', 'musea', 'rijksmuseum', 'van gogh', 'stedelijk', 'mauritshuis', 'tropenmuseum', 'allard pierson', 'kröller', 'boijmans', # Libraries 'bibliotheek', 'library', 'koninklijke bibliotheek', 'kb ', # Film/AV heritage 'eye film', 'filmmuseum', 'eye ', 'sound and vision', # Heritage platforms 'erfgoed', 'heritage', 'cultural', 'cultureel', # Research institutes (heritage-focused) 'knaw', 'humanities cluster', 'meertens', 'huygens', ] def detect_heritage_type(role: Optional[str], company: Optional[str]) -> tuple: """ Detect if a position is heritage-relevant and what type. Two-stage classification: 1. Check if organization is explicitly non-heritage (blocklist) 2. Check if role/organization matches heritage patterns For 'D' (Digital) type, require BOTH a tech role AND a heritage organization. This prevents generic IT workers at banks/police from being classified as heritage. Args: role: Job title/role text company: Company/organization name Returns: Tuple of (heritage_relevant: bool, heritage_type: Optional[str]) """ # Combine role and company for full context role_text = role or '' company_text = company or '' combined = f"{role_text} {company_text}".lower() if not combined.strip(): return (False, None) # Stage 1: Check for non-heritage organizations (blocklist) # Use word boundary matching to avoid false positives like "sharing" matching "ing " for org in NON_HERITAGE_ORGANIZATIONS: org_pattern = org.lower().strip() # Use word boundary regex for patterns that could have false positives if re.search(r'\b' + re.escape(org_pattern) + r'\b', combined): return (False, None) # Stage 2: Check for non-heritage role indicators for keyword in NON_HERITAGE_KEYWORDS: keyword_pattern = keyword.lower().strip() if re.search(r'\b' + re.escape(keyword_pattern) + r'\b', combined): return (False, None) # Stage 3: Check if this is a heritage organization is_heritage_org = False for org_keyword in HERITAGE_ORGANIZATION_KEYWORDS: if org_keyword.lower() in combined: is_heritage_org = True break # Check heritage keywords by type (order matters - more specific first) # 'D' (Digital) is checked last and requires heritage org validation type_order = ['A', 'M', 'L', 'G', 'S', 'C', 'O', 'R', 'E'] # D removed from main loop for heritage_type in type_order: keywords = HERITAGE_KEYWORDS.get(heritage_type, []) for keyword in keywords: if keyword.lower() in combined: return (True, heritage_type) # Special handling for 'D' (Digital) - ONLY if at a heritage organization if is_heritage_org: digital_keywords = HERITAGE_KEYWORDS.get('D', []) for keyword in digital_keywords: if keyword.lower() in combined: return (True, 'D') # Generic heritage terms (without specific type) generic = ['heritage', 'erfgoed', 'culture', 'cultuur', 'cultural', 'film', 'cinema', 'media', 'arts', 'kunst', 'creative', 'preservation', 'conservation', 'collection'] for keyword in generic: if keyword in combined: return (True, None) return (False, None) # ============================================================================ # Embedding-Based Heritage Type Classification # ============================================================================ # Uses pre-computed prototype embeddings from Wikidata hyponyms to classify # institution names/descriptions into GLAMORCUBESFIXPHDNT types via semantic # similarity. Complements keyword-based detection for ambiguous/multilingual text. # # TODO: In future, generate prototypes from LinkML enums (e.g., ArchiveOrganisationType) # instead of static NPZ file, to automatically pick up schema changes. # ============================================================================ import numpy as np from functools import lru_cache # Global variables for lazy-loaded embedding resources _type_prototypes: dict | None = None _type_codes: list | None = None _prototype_matrix: np.ndarray | None = None # Pre-stacked for fast batch comparison _st_model = None # Sentence transformer model def _load_type_prototypes() -> tuple[dict[str, np.ndarray], list[str], np.ndarray]: """ Load pre-computed type prototype embeddings from NPZ file. Returns: Tuple of (prototypes_dict, type_codes_list, prototype_matrix) - prototypes_dict: {type_code: embedding_vector} - type_codes_list: ['A', 'B', 'C', ...] - prototype_matrix: (19, 384) array for vectorized comparison TODO: In future, generate prototypes from LinkML enums (e.g., ArchiveOrganisationType) instead of static NPZ file, to automatically pick up schema changes. """ global _type_prototypes, _type_codes, _prototype_matrix if _type_prototypes is None: npz_path = Path(__file__).parent.parent.parent / "data/wikidata/GLAMORCUBEPSXHFN/type_prototypes_minilm_384.npz" if not npz_path.exists(): raise FileNotFoundError( f"Type prototypes not found at {npz_path}. " "Run: python scripts/generate_type_prototypes.py" ) data = np.load(npz_path) codes: list[str] = data["type_codes"].tolist() prototypes: np.ndarray = data["prototypes"] # Shape: (19, 384) # Store as dict for lookup by type code _type_prototypes = {code: prototypes[i] for i, code in enumerate(codes)} _type_codes = codes # Store as matrix for vectorized similarity computation _prototype_matrix = prototypes # Assert not None for type checker (we've loaded them above) assert _type_prototypes is not None assert _type_codes is not None assert _prototype_matrix is not None return _type_prototypes, _type_codes, _prototype_matrix def _get_embedding_model(): """ Lazy-load sentence-transformers model. Uses all-MiniLM-L6-v2 (384-dim) for fast inference. Model is loaded once and cached for subsequent calls. """ global _st_model if _st_model is None: try: from sentence_transformers import SentenceTransformer _st_model = SentenceTransformer("all-MiniLM-L6-v2") except ImportError: raise ImportError( "sentence-transformers not installed. " "Run: pip install sentence-transformers" ) return _st_model def classify_type_embedding( text: str, threshold: float = 0.35, return_scores: bool = False ) -> tuple[str | None, float, dict[str, float] | None]: """ Classify text into GLAMORCUBESFIXPHDNT type using semantic similarity. Embeds the input text and compares against pre-computed prototype embeddings for each of the 19 heritage custodian types. Returns the best matching type if similarity exceeds threshold. Args: text: Institution name, description, or role/company text to classify threshold: Minimum cosine similarity to return a type (default 0.35) return_scores: If True, also return dict of all type scores (otherwise None) Returns: Always returns 3-tuple of (type_code, best_score, all_scores): - type_code: Best matching type code, or None if below threshold - best_score: Cosine similarity of best match (0.0-1.0) - all_scores: Dict of all type scores if return_scores=True, else None Example: >>> type_code, score, _ = classify_type_embedding("Rijksmuseum Amsterdam") >>> print(type_code, score) M 0.67 >>> type_code, score, all_scores = classify_type_embedding( ... "Botanical Garden Utrecht", return_scores=True) >>> print(type_code, all_scores['B']) B 0.72 """ if not text or not text.strip(): return (None, 0.0, {} if return_scores else None) # Load prototypes and model prototypes, type_codes, prototype_matrix = _load_type_prototypes() model = _get_embedding_model() # Embed the input text text_embedding = model.encode(text, convert_to_numpy=True) # Normalize for cosine similarity text_norm = np.linalg.norm(text_embedding) if text_norm > 0: text_embedding = text_embedding / text_norm # Compute cosine similarities with all prototypes (vectorized) # prototype_matrix is already L2-normalized from generation script similarities = prototype_matrix @ text_embedding # Shape: (19,) # Find best match best_idx = int(np.argmax(similarities)) best_score = float(similarities[best_idx]) best_type = type_codes[best_idx] # Build scores dict if requested all_scores: dict[str, float] | None = None if return_scores: all_scores = {code: float(similarities[i]) for i, code in enumerate(type_codes)} # Return best type if above threshold, otherwise None if best_score >= threshold: return (best_type, best_score, all_scores) return (None, best_score, all_scores) @lru_cache(maxsize=2000) def detect_heritage_type_hybrid( role: str | None, company: str | None, embedding_threshold: float = 0.35 ) -> tuple[bool, str | None, str]: """ Hybrid heritage type detection combining keywords and embeddings. Strategy: 1. Try keyword-based detection first (fast, high precision) 2. Fall back to embedding-based classification for unmatched text 3. Cache results for repeated lookups Args: role: Job title/role text company: Company/organization name embedding_threshold: Minimum similarity for embedding match (default 0.35) Note: 0.35 balances recall/precision. Use 0.40+ for higher precision. Returns: Tuple of (is_heritage, type_code, method) where method is: - 'keyword': Matched via keyword rules - 'keyword_generic': Matched generic heritage terms (no specific type) - 'embedding:0.XXX': Matched via embedding with similarity score - 'none': No heritage relevance detected Example: >>> detect_heritage_type_hybrid("Curator", "Rijksmuseum") (True, 'M', 'keyword') >>> detect_heritage_type_hybrid("Zookeeper", "Artis Zoo") (True, 'B', 'embedding:0.68') >>> detect_heritage_type_hybrid("Software Engineer", "Google") (False, None, 'none') """ # Convert None to empty string for caching (None isn't hashable in all cases) role = role or '' company = company or '' # Try keyword-based detection first (existing system) is_heritage, type_code = detect_heritage_type(role, company) if is_heritage and type_code: return (True, type_code, "keyword") if is_heritage and type_code is None: # Matched generic heritage terms but no specific type # Try embedding to get specific type combined = f"{role} {company}".strip() emb_type, emb_score, _ = classify_type_embedding(combined, threshold=embedding_threshold) if emb_type: return (True, emb_type, f"embedding:{emb_score:.3f}") return (True, None, "keyword_generic") # Fall back to embedding-based classification combined = f"{role} {company}".strip() if not combined: return (False, None, "none") emb_type, emb_score, _ = classify_type_embedding(combined, threshold=embedding_threshold) if emb_type: return (True, emb_type, f"embedding:{emb_score:.3f}") return (False, None, "none") def enrich_experience_with_heritage(experience: List) -> List[Dict]: """ Add heritage_relevant and heritage_type fields to each experience item. Handles both dict and JSON string inputs (asyncpg returns jsonb array elements as strings that need parsing). Args: experience: List of experience items (dicts or JSON strings) Returns: Same list with heritage_relevant and heritage_type added to each item """ if not experience: return [] enriched = [] for exp in experience: # Handle case where exp is a JSON string instead of dict # (asyncpg returns jsonb array elements as strings) if isinstance(exp, str): try: exp = json.loads(exp) except json.JSONDecodeError: continue # Skip if still not a dict if not isinstance(exp, dict): continue # Get role and company for classification role = exp.get('title') or exp.get('role') or '' company = exp.get('company') or exp.get('organization') or '' # Detect heritage relevance heritage_relevant, heritage_type = detect_heritage_type(role, company) # Create new dict with heritage fields added enriched_exp = {**exp} enriched_exp['heritage_relevant'] = heritage_relevant enriched_exp['heritage_type'] = heritage_type enriched.append(enriched_exp) return enriched def parse_jsonb_list(data) -> List: """ Parse a jsonb list field from PostgreSQL. asyncpg returns jsonb columns in various forms: - Sometimes as a proper Python list with dict elements - Sometimes as a JSON string that needs parsing - Sometimes as a list where each element is a JSON string - Sometimes as a list where each element is a Python repr string (single quotes) This function handles all these cases. Args: data: Either a list, a JSON string representing a list, or None Returns: Parsed list with all elements as proper Python objects (empty list if None or invalid) """ import ast if data is None: return [] result = [] # If it's a string, try to parse the whole thing as JSON first if isinstance(data, str): try: data = json.loads(data) except json.JSONDecodeError: return [] # Now data should be a list if not isinstance(data, list): return [] # Parse each element if it's a string for item in data: if isinstance(item, str): # Try JSON first (double quotes) try: parsed_item = json.loads(item) result.append(parsed_item) continue except json.JSONDecodeError: pass # Try Python literal (single quotes) - handles malformed data try: parsed_item = ast.literal_eval(item) result.append(parsed_item) continue except (ValueError, SyntaxError): pass # Keep as string if neither works (e.g., plain skill strings) result.append(item) else: result.append(item) return result # ============================================================================ # Global State # ============================================================================ _pool: Optional[asyncpg.Pool] = None _start_time: datetime = datetime.now() async def get_pool() -> asyncpg.Pool: """Get or create connection pool""" global _pool if _pool is None: _pool = await asyncpg.create_pool( host=settings.host, port=settings.port, database=settings.database, user=settings.user, password=settings.password, min_size=2, max_size=10, ) return _pool # ============================================================================ # FastAPI App # ============================================================================ @asynccontextmanager async def lifespan(app: FastAPI): """Application lifespan handler""" await get_pool() yield global _pool if _pool: await _pool.close() _pool = None app = FastAPI( title="PostGIS Geo API", description="Spatial REST API for heritage institution map", version="1.0.0", lifespan=lifespan, ) # CORS middleware app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # GZip compression middleware - compresses responses >1KB # Reduces ~126MB JSON payload to ~20-30MB (70-80% reduction) app.add_middleware(GZipMiddleware, minimum_size=1000) # ============================================================================ # Helper Functions # ============================================================================ def serialize_value(val: Any) -> Any: """Convert PostgreSQL values to JSON-serializable format""" if val is None: return None elif isinstance(val, datetime): return val.isoformat() elif isinstance(val, Decimal): return float(val) elif isinstance(val, (dict, list)): return val elif isinstance(val, bytes): return val.decode('utf-8', errors='replace') else: return val def row_to_dict(row: asyncpg.Record) -> Dict[str, Any]: """Convert asyncpg row to dict with serialization""" return {key: serialize_value(row[key]) for key in row.keys()} # ============================================================================ # API Endpoints # ============================================================================ @app.get("/", response_model=GeoStatsResponse) async def get_geo_status() -> GeoStatsResponse: """Get geo database status and statistics""" pool = await get_pool() async with pool.acquire() as conn: version = await conn.fetchval("SELECT version()") provinces = await conn.fetchval("SELECT COUNT(*) FROM provinces") municipalities = await conn.fetchval("SELECT COUNT(*) FROM municipalities") institutions = await conn.fetchval("SELECT COUNT(*) FROM institutions") historical = await conn.fetchval("SELECT COUNT(*) FROM historical_boundaries") return GeoStatsResponse( status="healthy", database=settings.database, provinces=provinces or 0, municipalities=municipalities or 0, institutions=institutions or 0, historical_boundaries=historical or 0, postgres_version=version.split(',')[0] if version else "unknown", ) @app.get("/provinces") async def get_provinces( simplified: bool = Query(True, description="Return simplified geometries") ): """Get all provinces as GeoJSON FeatureCollection""" pool = await get_pool() tolerance = 0.001 if simplified else 0 async with pool.acquire() as conn: rows = await conn.fetch(f""" SELECT id, province_code, iso_code, name, ST_AsGeoJSON( {'ST_Simplify(geom, ' + str(tolerance) + ')' if simplified else 'geom'} )::json as geometry, ST_X(centroid) as centroid_lon, ST_Y(centroid) as centroid_lat, area_km2 FROM provinces ORDER BY name """) features = [] for row in rows: features.append({ "type": "Feature", "id": row['province_code'], "geometry": row['geometry'], "properties": { "id": row['id'], "province_code": row['province_code'], "iso_code": row['iso_code'], "name": row['name'], "centroid_lon": float(row['centroid_lon']) if row['centroid_lon'] else None, "centroid_lat": float(row['centroid_lat']) if row['centroid_lat'] else None, "area_km2": float(row['area_km2']) if row['area_km2'] else None, } }) return { "type": "FeatureCollection", "features": features } @app.get("/countries") async def get_countries( simplified: bool = Query(True, description="Return simplified geometries"), with_counts: bool = Query(False, description="Include institution counts per country"), ): """Get all countries as GeoJSON FeatureCollection with optional institution counts""" pool = await get_pool() # Use more aggressive simplification for countries (world view) tolerance = 0.01 if simplified else 0 async with pool.acquire() as conn: if with_counts: # Join with custodians to get counts per country rows = await conn.fetch(f""" SELECT bc.id, bc.iso_a2 as country_code, bc.iso_a3, bc.country_name as name, ST_AsGeoJSON( {'ST_Simplify(bc.geom, ' + str(tolerance) + ')' if simplified else 'bc.geom'} ) as geometry, ST_X(bc.centroid) as centroid_lon, ST_Y(bc.centroid) as centroid_lat, bc.area_km2, COALESCE(counts.institution_count, 0) as institution_count FROM boundary_countries bc LEFT JOIN ( SELECT country_code, COUNT(*) as institution_count FROM custodians WHERE country_code IS NOT NULL GROUP BY country_code ) counts ON bc.iso_a2 = counts.country_code WHERE bc.geom IS NOT NULL ORDER BY bc.country_name """) else: rows = await conn.fetch(f""" SELECT id, iso_a2 as country_code, iso_a3, country_name as name, ST_AsGeoJSON( {'ST_Simplify(geom, ' + str(tolerance) + ')' if simplified else 'geom'} ) as geometry, ST_X(centroid) as centroid_lon, ST_Y(centroid) as centroid_lat, area_km2 FROM boundary_countries WHERE geom IS NOT NULL ORDER BY country_name """) features = [] total_institutions = 0 countries_with_data = 0 for row in rows: # Parse geometry from string to dict (ST_AsGeoJSON returns text) geometry = row['geometry'] if geometry is None: # Skip countries with no geometry (e.g., Vatican City) continue if isinstance(geometry, str): geometry = json.loads(geometry) # Ensure geometry has required structure if not isinstance(geometry, dict) or 'type' not in geometry or 'coordinates' not in geometry: continue iso_a2 = row['country_code'].strip() if row['country_code'] else None iso_a3 = row['iso_a3'].strip() if row['iso_a3'] else None institution_count = row['institution_count'] if with_counts else 0 # Track totals if with_counts: total_institutions += institution_count if institution_count > 0: countries_with_data += 1 # Build properties with frontend-expected field names properties = { "id": row['id'], "iso_a2": iso_a2, # Frontend expects iso_a2 "iso_a3": iso_a3, "name": row['name'], "institution_count": institution_count, "centroid": [ float(row['centroid_lon']) if row['centroid_lon'] else None, float(row['centroid_lat']) if row['centroid_lat'] else None, ], "area_km2": float(row['area_km2']) if row['area_km2'] else None, } features.append({ "type": "Feature", "id": iso_a2, "geometry": geometry, "properties": properties }) return { "type": "FeatureCollection", "features": features, "metadata": { "count": len(features), "total_institutions": total_institutions, "countries_with_data": countries_with_data, "type_filter": None, "simplified": simplified, } } @app.get("/municipalities") async def get_municipalities( province: Optional[str] = Query(None, description="Filter by province ISO code (e.g., NH)"), simplified: bool = Query(True, description="Return simplified geometries"), limit: int = Query(500, ge=1, le=1000, description="Maximum results") ): """Get municipalities as GeoJSON FeatureCollection""" pool = await get_pool() tolerance = 0.0005 if simplified else 0 query = f""" SELECT m.id, m.municipality_code, m.name, p.iso_code as province_iso, p.name as province_name, ST_AsGeoJSON( {'ST_Simplify(m.geom, ' + str(tolerance) + ')' if simplified else 'm.geom'} )::json as geometry, ST_X(m.centroid) as centroid_lon, ST_Y(m.centroid) as centroid_lat, m.area_km2 FROM municipalities m LEFT JOIN provinces p ON m.province_id = p.id {'WHERE p.iso_code = $1' if province else ''} ORDER BY m.name LIMIT {'$2' if province else '$1'} """ async with pool.acquire() as conn: if province: rows = await conn.fetch(query, province.upper(), limit) else: rows = await conn.fetch(query, limit) features = [] for row in rows: features.append({ "type": "Feature", "id": row['municipality_code'], "geometry": row['geometry'], "properties": { "id": row['id'], "code": row['municipality_code'], "name": row['name'], "province_iso": row['province_iso'], "province_name": row['province_name'], "centroid_lon": float(row['centroid_lon']) if row['centroid_lon'] else None, "centroid_lat": float(row['centroid_lat']) if row['centroid_lat'] else None, "area_km2": float(row['area_km2']) if row['area_km2'] else None, } }) return { "type": "FeatureCollection", "features": features } @app.get("/institutions") async def get_institutions( bbox: Optional[str] = Query(None, description="Bounding box: minLon,minLat,maxLon,maxLat"), province: Optional[str] = Query(None, description="Filter by province ISO code (e.g., NH, ZH)"), country: Optional[str] = Query(None, description="Filter by country code (e.g., NL, DE, JP)"), type: Optional[str] = Query(None, description="Filter by institution type (G,L,A,M,O,R,C,U,B,E,S,F,I,X,P,H,D,N,T)"), limit: int = Query(50000, ge=1, le=100000, description="Maximum results") ): """Get institutions as GeoJSON FeatureCollection with full metadata from custodians table""" pool = await get_pool() # Build WHERE clauses - query custodians table directly conditions = ["lat IS NOT NULL AND lon IS NOT NULL"] params = [] param_count = 0 if bbox: try: min_lon, min_lat, max_lon, max_lat = map(float, bbox.split(',')) param_count += 4 conditions.append(f""" lon >= ${param_count-3} AND lat >= ${param_count-2} AND lon <= ${param_count-1} AND lat <= ${param_count} """) params.extend([min_lon, min_lat, max_lon, max_lat]) except ValueError: raise HTTPException(status_code=400, detail="Invalid bbox format. Use: minLon,minLat,maxLon,maxLat") if province: param_count += 1 conditions.append(f"region_code = ${param_count}") params.append(province.upper()) if type: param_count += 1 conditions.append(f"type = ${param_count}") params.append(type.upper()) if country: param_count += 1 conditions.append(f"country_code = ${param_count}") params.append(country.upper()) param_count += 1 where_clause = " AND ".join(conditions) # Query custodians table with all rich metadata fields query = f""" SELECT ghcid, name, emic_name, type, type_name, lon, lat, city, region as province, region_code as province_iso, country_code, formatted_address, street_address, postal_code, rating, total_ratings, wikidata_id, website, phone, email, isil_code, google_place_id, description, opening_hours, reviews, photos, photo_urls, business_status, street_view_url, founding_year, dissolution_year, temporal_extent, museum_register, youtube_channel_url, youtube_subscriber_count, youtube_video_count, youtube_enrichment, social_facebook, social_twitter, social_instagram, wikidata_label_en, wikidata_description_en, logo_url, web_claims FROM custodians WHERE {where_clause} ORDER BY name LIMIT ${param_count} """ params.append(limit) async with pool.acquire() as conn: rows = await conn.fetch(query, *params) features = [] for row in rows: # Build properties with all available metadata props = { "ghcid": row['ghcid'], "name": row['name'], "emic_name": row['emic_name'], "type": row['type'], "type_name": row['type_name'], "city": row['city'], "province": row['province'], "province_iso": row['province_iso'], "country_code": row['country_code'], "formatted_address": row['formatted_address'], "rating": float(row['rating']) if row['rating'] else None, "total_ratings": row['total_ratings'], "wikidata_id": row['wikidata_id'], "website": row['website'], "phone": row['phone'], "email": row['email'], "isil_code": row['isil_code'], "google_place_id": row['google_place_id'], "description": row['description'], "business_status": row['business_status'], "street_view_url": row['street_view_url'], "founding_year": row['founding_year'], "dissolution_year": row['dissolution_year'], } # Add JSONB fields (handle potential None values) if row['opening_hours']: props["opening_hours"] = row['opening_hours'] if row['reviews']: props["reviews"] = row['reviews'] if row['photos']: props["photos"] = row['photos'] if row['photo_urls']: props["photo_urls"] = row['photo_urls'] if row['temporal_extent']: props["temporal_extent"] = row['temporal_extent'] if row['museum_register']: props["museum_register"] = row['museum_register'] if row['youtube_enrichment']: props["youtube_enrichment"] = row['youtube_enrichment'] elif row['youtube_channel_url']: # Build minimal YouTube data if enrichment not present props["youtube"] = { "channel_url": row['youtube_channel_url'], "subscriber_count": row['youtube_subscriber_count'], "video_count": row['youtube_video_count'], } # Social media social = {} if row['social_facebook']: social['facebook'] = row['social_facebook'] if row['social_twitter']: social['twitter'] = row['social_twitter'] if row['social_instagram']: social['instagram'] = row['social_instagram'] if social: props["social_media"] = social # Wikidata labels if row['wikidata_label_en']: props["wikidata_label"] = row['wikidata_label_en'] if row['wikidata_description_en']: props["wikidata_description"] = row['wikidata_description_en'] # Logo URL from web claims extraction if row['logo_url']: props["logo_url"] = row['logo_url'] # Web claims (financial documents, etc.) if row['web_claims']: props["web_claims"] = row['web_claims'] features.append({ "type": "Feature", "geometry": { "type": "Point", "coordinates": [float(row['lon']), float(row['lat'])] }, "properties": props }) return { "type": "FeatureCollection", "features": features, "metadata": { "count": len(features), "limit": limit, "filters": { "bbox": bbox, "province": province, "type": type } } } @app.get("/institution/{ghcid}") async def get_institution(ghcid: str): """Get detailed information for a single institution with full metadata""" pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow(""" SELECT ghcid, name, emic_name, verified_name, type, type_name, lon, lat, city, region as province, region_code as province_iso, country_code, formatted_address, street_address, postal_code, website, phone, email, wikidata_id, isil_code, google_place_id, rating, total_ratings, description, business_status, street_view_url, google_maps_url, opening_hours, reviews, photos, photo_urls, founding_year, founding_date, dissolution_year, dissolution_date, temporal_extent, museum_register, youtube_channel_id, youtube_channel_url, youtube_subscriber_count, youtube_video_count, youtube_view_count, youtube_enrichment, social_facebook, social_twitter, social_instagram, social_linkedin, social_youtube, logo_url, wikidata_label_nl, wikidata_label_en, wikidata_description_nl, wikidata_description_en, wikidata_types, wikidata_inception, wikidata_enrichment, genealogiewerkbalk, nan_isil_enrichment, kb_enrichment, zcbs_enrichment, web_claims, ghcid_uuid, ghcid_numeric, identifiers, data_source, data_tier, provenance FROM custodians WHERE ghcid = $1 """, ghcid) if not row: raise HTTPException(status_code=404, detail=f"Institution '{ghcid}' not found") # Build comprehensive response with all metadata result = { "ghcid": row['ghcid'], "name": row['name'], "emic_name": row['emic_name'], "verified_name": row['verified_name'], "type": row['type'], "type_name": row['type_name'], "lat": float(row['lat']) if row['lat'] else None, "lon": float(row['lon']) if row['lon'] else None, "city": row['city'], "province": row['province'], "province_iso": row['province_iso'], "country_code": row['country_code'], "formatted_address": row['formatted_address'], "street_address": row['street_address'], "postal_code": row['postal_code'], "website": row['website'], "phone": row['phone'], "email": row['email'], "wikidata_id": row['wikidata_id'], "isil_code": row['isil_code'], "google_place_id": row['google_place_id'], "rating": float(row['rating']) if row['rating'] else None, "total_ratings": row['total_ratings'], "description": row['description'], "business_status": row['business_status'], "street_view_url": row['street_view_url'], "google_maps_url": row['google_maps_url'], } # JSONB fields - only include if present if row['opening_hours']: result["opening_hours"] = row['opening_hours'] if row['reviews']: result["reviews"] = row['reviews'] if row['photos']: result["photos"] = row['photos'] if row['photo_urls']: result["photo_urls"] = row['photo_urls'] if row['identifiers']: result["identifiers"] = row['identifiers'] # Temporal data temporal = {} if row['founding_year']: temporal["founding_year"] = row['founding_year'] if row['founding_date']: temporal["founding_date"] = row['founding_date'].isoformat() if row['founding_date'] else None if row['dissolution_year']: temporal["dissolution_year"] = row['dissolution_year'] if row['dissolution_date']: temporal["dissolution_date"] = row['dissolution_date'].isoformat() if row['dissolution_date'] else None if row['temporal_extent']: temporal["extent"] = row['temporal_extent'] if temporal: result["temporal"] = temporal # Museum register if row['museum_register']: result["museum_register"] = row['museum_register'] # YouTube enrichment youtube = {} if row['youtube_channel_id']: youtube["channel_id"] = row['youtube_channel_id'] if row['youtube_channel_url']: youtube["channel_url"] = row['youtube_channel_url'] if row['youtube_subscriber_count']: youtube["subscriber_count"] = row['youtube_subscriber_count'] if row['youtube_video_count']: youtube["video_count"] = row['youtube_video_count'] if row['youtube_view_count']: youtube["view_count"] = row['youtube_view_count'] if row['youtube_enrichment']: youtube["enrichment"] = row['youtube_enrichment'] if youtube: result["youtube"] = youtube # Social media social = {} if row['social_facebook']: social["facebook"] = row['social_facebook'] if row['social_twitter']: social["twitter"] = row['social_twitter'] if row['social_instagram']: social["instagram"] = row['social_instagram'] if row['social_linkedin']: social["linkedin"] = row['social_linkedin'] if row['social_youtube']: social["youtube"] = row['social_youtube'] if social: result["social_media"] = social # Wikidata wikidata = {} if row['wikidata_label_nl']: wikidata["label_nl"] = row['wikidata_label_nl'] if row['wikidata_label_en']: wikidata["label_en"] = row['wikidata_label_en'] if row['wikidata_description_nl']: wikidata["description_nl"] = row['wikidata_description_nl'] if row['wikidata_description_en']: wikidata["description_en"] = row['wikidata_description_en'] if row['wikidata_types']: wikidata["types"] = row['wikidata_types'] if row['wikidata_inception']: wikidata["inception"] = row['wikidata_inception'] if row['wikidata_enrichment']: wikidata["enrichment"] = row['wikidata_enrichment'] if wikidata: result["wikidata"] = wikidata # Logo if row['logo_url']: result["logo_url"] = row['logo_url'] # Other enrichment data if row['genealogiewerkbalk']: result["genealogiewerkbalk"] = row['genealogiewerkbalk'] if row['nan_isil_enrichment']: result["nan_isil_enrichment"] = row['nan_isil_enrichment'] if row['kb_enrichment']: result["kb_enrichment"] = row['kb_enrichment'] if row['zcbs_enrichment']: result["zcbs_enrichment"] = row['zcbs_enrichment'] if row['web_claims']: result["web_claims"] = row['web_claims'] # GHCID details ghcid_data = {"current": row['ghcid']} if row['ghcid_uuid']: ghcid_data["uuid"] = str(row['ghcid_uuid']) if row['ghcid_numeric']: ghcid_data["numeric"] = int(row['ghcid_numeric']) result["ghcid_details"] = ghcid_data # Provenance if row['data_source'] or row['data_tier'] or row['provenance']: result["provenance"] = { "data_source": row['data_source'], "data_tier": row['data_tier'], "details": row['provenance'], } return result @app.get("/search") async def search_institutions( q: str = Query(..., min_length=2, description="Search query (name, GHCID, or description)"), type: Optional[str] = Query(None, description="Filter by institution type"), include_persons: bool = Query(False, description="Also search for associated persons"), limit: int = Query(50, ge=1, le=200, description="Maximum results") ): """ Search institutions by name, GHCID, or description. Supports: - Full-text search on name and description - Exact GHCID matching (e.g., NL-OV-ZWO-D-WIWO) - Partial GHCID matching (e.g., NL-OV or WIWO) - Optionally includes persons associated with matching custodians """ pool = await get_pool() q_upper = q.upper() q_pattern = f"%{q}%" # Check if query looks like a GHCID pattern (contains hyphen and letter-hyphen pattern) # Accept both uppercase and lowercase input (e.g., "nl-nh-ams" or "NL-NH-AMS") is_ghcid_query = '-' in q and any(c.isalpha() for c in q) async with pool.acquire() as conn: rows = [] # Priority 1: Exact GHCID match if is_ghcid_query: exact_query = """ SELECT c.ghcid as ghcid, c.name, c.type, c.type_name, c.lon, c.lat, c.city, c.region_code as province_iso, c.rating, 1000 as rank FROM custodians c WHERE c.ghcid = $1 """ rows = await conn.fetch(exact_query, q_upper) # Priority 2: Partial GHCID match (if no exact match) if not rows and is_ghcid_query: partial_ghcid_query = """ SELECT c.ghcid as ghcid, c.name, c.type, c.type_name, c.lon, c.lat, c.city, c.region_code as province_iso, c.rating, 500 as rank FROM custodians c WHERE c.ghcid ILIKE $1 """ params = [f"%{q_upper}%"] if type: partial_ghcid_query += " AND c.type = $2" params.append(type.upper()) partial_ghcid_query += f" ORDER BY c.ghcid LIMIT ${len(params) + 1}" params.append(limit) rows = await conn.fetch(partial_ghcid_query, *params) # Priority 3: Name/description text search if not rows: # Try ILIKE search on name, description, city, and emic_name text_query = """ SELECT c.ghcid as ghcid, c.name, c.type, c.type_name, c.lon, c.lat, c.city, c.region_code as province_iso, c.rating, CASE WHEN c.name ILIKE $2 THEN 100 WHEN c.name ILIKE $1 THEN 50 WHEN c.emic_name ILIKE $1 THEN 40 WHEN c.city ILIKE $1 THEN 30 ELSE 10 END as rank FROM custodians c WHERE c.name ILIKE $1 OR c.emic_name ILIKE $1 OR c.description ILIKE $1 OR c.city ILIKE $1 """ params = [q_pattern, f"{q}%"] param_count = 2 if type: param_count += 1 text_query += f" AND c.type = ${param_count}" params.append(type.upper()) param_count += 1 text_query += f" ORDER BY rank DESC, c.name LIMIT ${param_count}" params.append(limit) rows = await conn.fetch(text_query, *params) # Build institution results with match type indicator def get_match_type(rank: int) -> str: if rank >= 1000: return "exact_ghcid" elif rank >= 500: return "partial_ghcid" else: return "text" institutions = [ { "ghcid": row['ghcid'], "name": row['name'], "type": row['type'], "type_name": row['type_name'], "lon": float(row['lon']) if row['lon'] else None, "lat": float(row['lat']) if row['lat'] else None, "city": row['city'], "province_iso": row['province_iso'], "rating": float(row['rating']) if row['rating'] else None, "match_type": get_match_type(row['rank']), } for row in rows ] # Optionally search for persons persons = [] if include_persons and institutions: # Get custodian slugs from results to find associated persons ghcids = [inst['ghcid'] for inst in institutions if inst['ghcid']] if ghcids: # Search persons by custodian association or by name person_query = """ SELECT DISTINCT ON (p.staff_id) p.staff_id, p.name, p.headline, p.location, p.country_code, p.custodian_slug, p.custodian_name, p.linkedin_url, p.profile_image_url, p.heritage_relevant, p.heritage_types FROM persons p WHERE p.name ILIKE $1 OR p.headline ILIKE $1 OR p.custodian_name ILIKE $1 ORDER BY p.staff_id, p.name LIMIT $2 """ person_rows = await conn.fetch(person_query, q_pattern, limit) persons = [ { "staff_id": row['staff_id'], "name": row['name'], "headline": row['headline'], "location": row['location'], "country_code": row['country_code'], "custodian_slug": row['custodian_slug'], "custodian_name": row['custodian_name'], "linkedin_url": row['linkedin_url'], "profile_image_url": row['profile_image_url'], "heritage_relevant": row['heritage_relevant'], "heritage_types": row['heritage_types'] or [], } for row in person_rows ] result = { "query": q, "count": len(institutions), "results": institutions, } if include_persons: result["persons"] = persons result["persons_count"] = len(persons) return result @app.get("/nearby", response_model=List[NearbyInstitution]) async def find_nearby( lon: float = Query(..., description="Longitude"), lat: float = Query(..., description="Latitude"), radius_km: float = Query(10, ge=0.1, le=100, description="Search radius in km"), type: Optional[str] = Query(None, description="Filter by institution type"), limit: int = Query(50, ge=1, le=200, description="Maximum results") ): """Find institutions near a point""" pool = await get_pool() # Use custodians table with lat/lon columns (no PostGIS geometry) query = """ SELECT c.ghcid as ghcid, c.name, c.type, c.type_name, ( 6371 * acos( cos(radians($2)) * cos(radians(c.lat)) * cos(radians(c.lon) - radians($1)) + sin(radians($2)) * sin(radians(c.lat)) ) ) as distance_km, c.city, c.region as province, c.rating FROM custodians c WHERE c.lat IS NOT NULL AND c.lon IS NOT NULL AND ( 6371 * acos( cos(radians($2)) * cos(radians(c.lat)) * cos(radians(c.lon) - radians($1)) + sin(radians($2)) * sin(radians(c.lat)) ) ) <= $3 """ params: list = [lon, lat, radius_km] param_count = 3 if type: param_count += 1 query += f" AND c.type = ${param_count}" params.append(type.upper()) param_count += 1 query += f" ORDER BY distance_km LIMIT ${param_count}" params.append(limit) async with pool.acquire() as conn: rows = await conn.fetch(query, *params) return [ NearbyInstitution( ghcid=row['ghcid'], name=row['name'], type=row['type'], type_name=row['type_name'], distance_km=round(float(row['distance_km']), 2), city=row['city'], province=row['province'], rating=float(row['rating']) if row['rating'] else None, ) for row in rows ] @app.get("/admin/point", response_model=AdminPoint) async def get_admin_for_point( lon: float = Query(..., description="Longitude"), lat: float = Query(..., description="Latitude") ): """Find which municipality/province contains a point""" pool = await get_pool() async with pool.acquire() as conn: row = await conn.fetchrow(""" SELECT p.province_code, p.name as province_name, m.municipality_code, m.name as municipality_name FROM municipalities m JOIN provinces p ON m.province_id = p.id WHERE ST_Contains(m.geom, ST_SetSRID(ST_Point($1, $2), 4326)) LIMIT 1 """, lon, lat) if not row: # Try province only async with pool.acquire() as conn: row = await conn.fetchrow(""" SELECT province_code, name as province_name, NULL as municipality_code, NULL as municipality_name FROM provinces WHERE ST_Contains(geom, ST_SetSRID(ST_Point($1, $2), 4326)) LIMIT 1 """, lon, lat) if not row: return AdminPoint( province_code=None, province_name=None, municipality_code=None, municipality_name=None ) return AdminPoint( province_code=row['province_code'], province_name=row['province_name'], municipality_code=row['municipality_code'], municipality_name=row['municipality_name'] ) @app.get("/historical") async def get_historical_boundaries( year: int = Query(1500, description="Reference year"), boundary_type: Optional[str] = Query(None, description="Boundary type filter"), simplified: bool = Query(True, description="Return simplified geometries"), limit: int = Query(1000, ge=1, le=10000, description="Maximum results") ): """Get historical boundaries as GeoJSON""" pool = await get_pool() tolerance = 0.001 if simplified else 0 conditions = ["reference_year = $1"] params = [year] param_count = 1 if boundary_type: param_count += 1 conditions.append(f"boundary_type = ${param_count}") params.append(boundary_type) param_count += 1 where_clause = " AND ".join(conditions) query = f""" SELECT id, boundary_code, name, boundary_type, reference_year, ST_AsGeoJSON( {'ST_Simplify(geom, ' + str(tolerance) + ')' if simplified else 'geom'} )::json as geometry, ST_X(centroid) as centroid_lon, ST_Y(centroid) as centroid_lat, area_km2 FROM historical_boundaries WHERE {where_clause} ORDER BY name LIMIT ${param_count} """ params.append(limit) async with pool.acquire() as conn: rows = await conn.fetch(query, *params) features = [] for row in rows: if row['geometry']: features.append({ "type": "Feature", "id": row['boundary_code'], "geometry": row['geometry'], "properties": { "id": row['id'], "code": row['boundary_code'], "name": row['name'], "type": row['boundary_type'], "year": row['reference_year'], "centroid_lon": float(row['centroid_lon']) if row['centroid_lon'] else None, "centroid_lat": float(row['centroid_lat']) if row['centroid_lat'] else None, "area_km2": float(row['area_km2']) if row['area_km2'] else None, } }) return { "type": "FeatureCollection", "features": features, "metadata": { "year": year, "boundary_type": boundary_type, "count": len(features) } } @app.get("/stats/by-type") async def get_stats_by_type(): """Get institution counts by type""" pool = await get_pool() async with pool.acquire() as conn: rows = await conn.fetch(""" SELECT institution_type as type, type_name, COUNT(*) as count, ROUND(AVG(rating)::numeric, 2) as avg_rating FROM institutions WHERE geom IS NOT NULL GROUP BY institution_type, type_name ORDER BY count DESC """) return { "stats": [ { "type": row['type'], "type_name": row['type_name'], "count": row['count'], "avg_rating": float(row['avg_rating']) if row['avg_rating'] else None } for row in rows ] } @app.get("/stats/by-province") async def get_stats_by_province(): """Get institution counts by province""" pool = await get_pool() async with pool.acquire() as conn: rows = await conn.fetch(""" SELECT p.iso_code, p.name as province_name, COUNT(i.id) as count, ROUND(AVG(i.rating)::numeric, 2) as avg_rating FROM provinces p LEFT JOIN institutions i ON i.province_id = p.id GROUP BY p.id, p.iso_code, p.name ORDER BY count DESC """) return { "stats": [ { "province_iso": row['iso_code'], "province_name": row['province_name'], "count": row['count'], "avg_rating": float(row['avg_rating']) if row['avg_rating'] else None } for row in rows ] } # ============================================================================ # Heritage Type Classification Endpoint # ============================================================================ # Response model for type classification class TypeClassifyResponse(BaseModel): """Response model for heritage type classification""" text: str = Field(description="Input text that was classified") is_heritage: bool = Field(description="Whether the text is heritage-relevant") type_code: str | None = Field(description="GLAMORCUBESFIXPHDNT type code (e.g., 'M', 'A', 'L')") type_name: str | None = Field(description="Full type name (e.g., 'Museum', 'Archive')") method: str = Field(description="Classification method used: 'keyword', 'embedding:0.XXX', 'none'") confidence: float | None = Field(description="Similarity score for embedding-based classification") all_scores: dict | None = Field(default=None, description="Scores for all types (if requested)") # Type code to full name mapping TYPE_CODE_NAMES = { 'G': 'Gallery', 'L': 'Library', 'A': 'Archive', 'M': 'Museum', 'O': 'Official Institution', 'R': 'Research Center', 'C': 'Corporation', 'U': 'Unspecified', 'B': 'Bio-custodian (Zoo/Botanical)', 'E': 'Education Provider', 'S': 'Society', 'F': 'Feature (Monument/Site)', 'I': 'Intangible Heritage', 'X': 'Mixed', 'P': 'Personal Collection', 'H': 'Holy Site', 'D': 'Digital Platform', 'N': 'Non-profit', 'T': 'Taste/Scent Heritage', } @app.get("/types/classify", response_model=TypeClassifyResponse) async def classify_heritage_type( text: str = Query(..., description="Text to classify (institution name, role, company)"), method: str = Query("hybrid", description="Classification method: 'keyword', 'embedding', 'hybrid'"), threshold: float = Query(0.40, ge=0.0, le=1.0, description="Similarity threshold for embedding"), include_scores: bool = Query(False, description="Include all type scores in response") ): """ Classify text into GLAMORCUBESFIXPHDNT heritage type. Supports three classification methods: - **keyword**: Fast rule-based matching using predefined keyword lists - **embedding**: Semantic similarity using pre-computed type prototype embeddings - **hybrid** (default): Keywords first, embedding fallback Types: G=Gallery, L=Library, A=Archive, M=Museum, O=Official, R=Research, C=Corporation, U=Unspecified, B=Bio, E=Education, S=Society, F=Feature, I=Intangible, X=Mixed, P=Personal, H=Holy, D=Digital, N=Non-profit, T=Taste """ if not text.strip(): raise HTTPException(status_code=400, detail="Text cannot be empty") all_scores = None confidence = None if method == "keyword": # Keyword-only classification is_heritage, type_code = detect_heritage_type(text, "") method_used = "keyword" if is_heritage else "none" elif method == "embedding": # Embedding-only classification if include_scores: type_code, score, all_scores = classify_type_embedding( text, threshold=threshold, return_scores=True ) else: type_code, score, _ = classify_type_embedding(text, threshold=threshold) is_heritage = type_code is not None confidence = score method_used = f"embedding:{score:.3f}" if is_heritage else "none" else: # hybrid (default) # Use hybrid detection is_heritage, type_code, method_used = detect_heritage_type_hybrid( text, "", embedding_threshold=threshold ) # Extract confidence from method string if embedding was used if method_used.startswith("embedding:"): confidence = float(method_used.split(":")[1]) # Get all scores if requested and embedding path was used if include_scores and (method_used.startswith("embedding") or method_used == "none"): _, _, all_scores = classify_type_embedding(text, threshold=threshold, return_scores=True) return TypeClassifyResponse( text=text, is_heritage=is_heritage, type_code=type_code, type_name=TYPE_CODE_NAMES.get(type_code) if type_code else None, method=method_used, confidence=confidence, all_scores=all_scores ) @app.get("/types/list") async def list_heritage_types(): """ List all GLAMORCUBESFIXPHDNT heritage types with descriptions. """ return { "types": [ {"code": code, "name": name} for code, name in TYPE_CODE_NAMES.items() ], "mnemonic": "GLAMORCUBESFIXPHDNT", "count": len(TYPE_CODE_NAMES) } # ============================================================================ # Optimized Loading Endpoints (Pagination, Viewport, Lite) # ============================================================================ @app.get("/institutions/lite") async def get_institutions_lite( bbox: Optional[str] = Query(None, description="Bounding box: minLon,minLat,maxLon,maxLat"), country: Optional[str] = Query(None, description="Filter by country code (e.g., NL, DE, JP)"), type: Optional[str] = Query(None, description="Filter by institution type"), limit: int = Query(100000, ge=1, le=200000, description="Maximum results") ): """ Get lightweight institution data for map markers. Returns only essential fields (~5-10MB instead of ~126MB). Use /institution/{ghcid} for full details on click. """ pool = await get_pool() conditions = ["lat IS NOT NULL AND lon IS NOT NULL"] params = [] param_count = 0 if bbox: try: min_lon, min_lat, max_lon, max_lat = map(float, bbox.split(',')) param_count += 4 conditions.append(f""" lon >= ${param_count-3} AND lat >= ${param_count-2} AND lon <= ${param_count-1} AND lat <= ${param_count} """) params.extend([min_lon, min_lat, max_lon, max_lat]) except ValueError: raise HTTPException(status_code=400, detail="Invalid bbox format") if country: param_count += 1 conditions.append(f"country_code = ${param_count}") params.append(country.upper()) if type: param_count += 1 conditions.append(f"type = ${param_count}") params.append(type.upper()) param_count += 1 where_clause = " AND ".join(conditions) # Minimal fields for markers - dramatically reduces payload query = f""" SELECT ghcid, name, type, lon, lat, city, country_code, rating FROM custodians WHERE {where_clause} ORDER BY name LIMIT ${param_count} """ params.append(limit) async with pool.acquire() as conn: rows = await conn.fetch(query, *params) features = [] for row in rows: features.append({ "type": "Feature", "geometry": { "type": "Point", "coordinates": [float(row['lon']), float(row['lat'])] }, "properties": { "ghcid": row['ghcid'], "name": row['name'], "type": row['type'], "city": row['city'], "country_code": row['country_code'], "rating": float(row['rating']) if row['rating'] else None, } }) return { "type": "FeatureCollection", "features": features, "metadata": { "count": len(features), "mode": "lite", "filters": {"bbox": bbox, "country": country, "type": type} } } @app.get("/institutions/page") async def get_institutions_paginated( page: int = Query(1, ge=1, description="Page number (1-indexed)"), page_size: int = Query(1000, ge=100, le=5000, description="Items per page"), country: Optional[str] = Query(None, description="Filter by country code"), type: Optional[str] = Query(None, description="Filter by institution type"), sort_by: str = Query("name", description="Sort field: name, city, rating, type"), sort_order: str = Query("asc", description="Sort order: asc, desc"), ): """ Get paginated institutions with full metadata. ~1MB per page instead of ~126MB full download. Supports cursor-based iteration for large datasets. """ pool = await get_pool() # Validate sort parameters valid_sort_fields = {"name", "city", "rating", "type", "country_code"} if sort_by not in valid_sort_fields: sort_by = "name" sort_direction = "DESC" if sort_order.lower() == "desc" else "ASC" conditions = ["lat IS NOT NULL AND lon IS NOT NULL"] params = [] param_count = 0 if country: param_count += 1 conditions.append(f"country_code = ${param_count}") params.append(country.upper()) if type: param_count += 1 conditions.append(f"type = ${param_count}") params.append(type.upper()) where_clause = " AND ".join(conditions) # Get total count for pagination metadata count_query = f"SELECT COUNT(*) FROM custodians WHERE {where_clause}" # Calculate offset offset = (page - 1) * page_size param_count += 1 limit_param = param_count param_count += 1 offset_param = param_count # Full metadata query with pagination query = f""" SELECT ghcid, name, emic_name, type, type_name, lon, lat, city, region as province, region_code as province_iso, country_code, formatted_address, street_address, postal_code, rating, total_ratings, wikidata_id, website, phone, email, isil_code, google_place_id, description, opening_hours, reviews, photos, photo_urls, business_status, street_view_url, founding_year, dissolution_year, temporal_extent, museum_register, youtube_channel_url, youtube_subscriber_count, youtube_video_count, youtube_enrichment, social_facebook, social_twitter, social_instagram, wikidata_label_en, wikidata_description_en FROM custodians WHERE {where_clause} ORDER BY {sort_by} {sort_direction} LIMIT ${limit_param} OFFSET ${offset_param} """ params.extend([page_size, offset]) async with pool.acquire() as conn: total_count = await conn.fetchval(count_query, *params[:param_count-2] if params else []) rows = await conn.fetch(query, *params) total_pages = (total_count + page_size - 1) // page_size features = [] for row in rows: props = { "ghcid": row['ghcid'], "name": row['name'], "emic_name": row['emic_name'], "type": row['type'], "type_name": row['type_name'], "city": row['city'], "province": row['province'], "province_iso": row['province_iso'], "country_code": row['country_code'], "formatted_address": row['formatted_address'], "rating": float(row['rating']) if row['rating'] else None, "total_ratings": row['total_ratings'], "wikidata_id": row['wikidata_id'], "website": row['website'], "phone": row['phone'], "email": row['email'], "isil_code": row['isil_code'], "google_place_id": row['google_place_id'], "description": row['description'], "business_status": row['business_status'], "street_view_url": row['street_view_url'], "founding_year": row['founding_year'], "dissolution_year": row['dissolution_year'], } # Add JSONB fields if present if row['opening_hours']: props["opening_hours"] = row['opening_hours'] if row['reviews']: props["reviews"] = row['reviews'] if row['photos']: props["photos"] = row['photos'] if row['photo_urls']: props["photo_urls"] = row['photo_urls'] if row['temporal_extent']: props["temporal_extent"] = row['temporal_extent'] if row['museum_register']: props["museum_register"] = row['museum_register'] if row['youtube_enrichment']: props["youtube_enrichment"] = row['youtube_enrichment'] elif row['youtube_channel_url']: props["youtube"] = { "channel_url": row['youtube_channel_url'], "subscriber_count": row['youtube_subscriber_count'], "video_count": row['youtube_video_count'], } social = {} if row['social_facebook']: social['facebook'] = row['social_facebook'] if row['social_twitter']: social['twitter'] = row['social_twitter'] if row['social_instagram']: social['instagram'] = row['social_instagram'] if social: props["social_media"] = social if row['wikidata_label_en']: props["wikidata_label"] = row['wikidata_label_en'] if row['wikidata_description_en']: props["wikidata_description"] = row['wikidata_description_en'] features.append({ "type": "Feature", "geometry": { "type": "Point", "coordinates": [float(row['lon']), float(row['lat'])] }, "properties": props }) return { "type": "FeatureCollection", "features": features, "pagination": { "page": page, "page_size": page_size, "total_count": total_count, "total_pages": total_pages, "has_next": page < total_pages, "has_prev": page > 1, "next_page": page + 1 if page < total_pages else None, "prev_page": page - 1 if page > 1 else None, }, "metadata": { "mode": "paginated", "filters": {"country": country, "type": type}, "sort": {"field": sort_by, "order": sort_order} } } @app.get("/institutions/viewport") async def get_institutions_viewport( bbox: str = Query(..., description="Bounding box: minLon,minLat,maxLon,maxLat (REQUIRED)"), zoom: int = Query(10, ge=1, le=20, description="Map zoom level (affects detail)"), country: Optional[str] = Query(None, description="Filter by country code"), type: Optional[str] = Query(None, description="Filter by institution type"), limit: int = Query(2000, ge=100, le=10000, description="Maximum results for viewport"), ): """ Get institutions visible in current map viewport. Returns lite data at low zoom, full data at high zoom. Optimized for map pan/zoom interactions. """ pool = await get_pool() # Parse bbox try: min_lon, min_lat, max_lon, max_lat = map(float, bbox.split(',')) except ValueError: raise HTTPException(status_code=400, detail="Invalid bbox format. Use: minLon,minLat,maxLon,maxLat") conditions = [ "lat IS NOT NULL AND lon IS NOT NULL", f"lon >= $1 AND lat >= $2 AND lon <= $3 AND lat <= $4" ] params = [min_lon, min_lat, max_lon, max_lat] param_count = 4 if country: param_count += 1 conditions.append(f"country_code = ${param_count}") params.append(country.upper()) if type: param_count += 1 conditions.append(f"type = ${param_count}") params.append(type.upper()) where_clause = " AND ".join(conditions) # Adaptive detail based on zoom level # Low zoom (world/continent view): minimal fields # Medium zoom (country view): basic fields # High zoom (city view): full fields if zoom <= 6: # World/continent view - just markers select_fields = "ghcid, name, type, lon, lat, country_code" detail_level = "minimal" elif zoom <= 10: # Country view - basic info select_fields = "ghcid, name, type, type_name, lon, lat, city, country_code, rating" detail_level = "basic" elif zoom <= 14: # Region view - moderate detail select_fields = """ ghcid, name, emic_name, type, type_name, lon, lat, city, region as province, country_code, rating, total_ratings, website, wikidata_id, description """ detail_level = "moderate" else: # City/street view - full detail select_fields = """ ghcid, name, emic_name, type, type_name, lon, lat, city, region as province, region_code as province_iso, country_code, formatted_address, street_address, postal_code, rating, total_ratings, wikidata_id, website, phone, email, isil_code, google_place_id, description, opening_hours, reviews, photos, photo_urls, business_status, street_view_url, founding_year, dissolution_year, social_facebook, social_twitter, social_instagram, wikidata_label_en, wikidata_description_en """ detail_level = "full" param_count += 1 query = f""" SELECT {select_fields} FROM custodians WHERE {where_clause} ORDER BY rating DESC NULLS LAST, name LIMIT ${param_count} """ params.append(limit) async with pool.acquire() as conn: rows = await conn.fetch(query, *params) features = [] for row in rows: row_dict = dict(row) props = {"ghcid": row_dict['ghcid'], "name": row_dict['name'], "type": row_dict['type']} # Add fields based on detail level if 'type_name' in row_dict: props['type_name'] = row_dict['type_name'] if 'city' in row_dict: props['city'] = row_dict['city'] if 'country_code' in row_dict: props['country_code'] = row_dict['country_code'] if 'rating' in row_dict and row_dict['rating']: props['rating'] = float(row_dict['rating']) if 'total_ratings' in row_dict: props['total_ratings'] = row_dict['total_ratings'] if 'province' in row_dict: props['province'] = row_dict['province'] if 'province_iso' in row_dict: props['province_iso'] = row_dict['province_iso'] if 'emic_name' in row_dict: props['emic_name'] = row_dict['emic_name'] if 'website' in row_dict: props['website'] = row_dict['website'] if 'wikidata_id' in row_dict: props['wikidata_id'] = row_dict['wikidata_id'] if 'description' in row_dict: props['description'] = row_dict['description'] if 'formatted_address' in row_dict: props['formatted_address'] = row_dict['formatted_address'] if 'street_address' in row_dict: props['street_address'] = row_dict['street_address'] if 'postal_code' in row_dict: props['postal_code'] = row_dict['postal_code'] if 'phone' in row_dict: props['phone'] = row_dict['phone'] if 'email' in row_dict: props['email'] = row_dict['email'] if 'isil_code' in row_dict: props['isil_code'] = row_dict['isil_code'] if 'google_place_id' in row_dict: props['google_place_id'] = row_dict['google_place_id'] if 'business_status' in row_dict: props['business_status'] = row_dict['business_status'] if 'street_view_url' in row_dict: props['street_view_url'] = row_dict['street_view_url'] if 'founding_year' in row_dict: props['founding_year'] = row_dict['founding_year'] if 'dissolution_year' in row_dict: props['dissolution_year'] = row_dict['dissolution_year'] # JSONB fields at full detail if 'opening_hours' in row_dict and row_dict['opening_hours']: props['opening_hours'] = row_dict['opening_hours'] if 'reviews' in row_dict and row_dict['reviews']: props['reviews'] = row_dict['reviews'] if 'photos' in row_dict and row_dict['photos']: props['photos'] = row_dict['photos'] if 'photo_urls' in row_dict and row_dict['photo_urls']: props['photo_urls'] = row_dict['photo_urls'] # Social media at full detail social = {} if 'social_facebook' in row_dict and row_dict['social_facebook']: social['facebook'] = row_dict['social_facebook'] if 'social_twitter' in row_dict and row_dict['social_twitter']: social['twitter'] = row_dict['social_twitter'] if 'social_instagram' in row_dict and row_dict['social_instagram']: social['instagram'] = row_dict['social_instagram'] if social: props['social_media'] = social if 'wikidata_label_en' in row_dict and row_dict['wikidata_label_en']: props['wikidata_label'] = row_dict['wikidata_label_en'] if 'wikidata_description_en' in row_dict and row_dict['wikidata_description_en']: props['wikidata_description'] = row_dict['wikidata_description_en'] features.append({ "type": "Feature", "geometry": { "type": "Point", "coordinates": [float(row_dict['lon']), float(row_dict['lat'])] }, "properties": props }) return { "type": "FeatureCollection", "features": features, "metadata": { "count": len(features), "mode": "viewport", "detail_level": detail_level, "zoom": zoom, "bbox": {"min_lon": min_lon, "min_lat": min_lat, "max_lon": max_lon, "max_lat": max_lat}, "filters": {"country": country, "type": type}, "limit": limit, "truncated": len(features) >= limit } } @app.get("/institutions/cluster") async def get_institutions_clustered( bbox: str = Query(..., description="Bounding box: minLon,minLat,maxLon,maxLat"), zoom: int = Query(5, ge=1, le=20, description="Map zoom level"), grid_size: Optional[float] = Query(None, description="Grid cell size in degrees (auto if not set)"), ): """ Get clustered institution counts for overview maps. Returns grid cells with counts instead of individual markers. Ideal for world/country views with 10k+ institutions. """ pool = await get_pool() try: min_lon, min_lat, max_lon, max_lat = map(float, bbox.split(',')) except ValueError: raise HTTPException(status_code=400, detail="Invalid bbox format") # Auto-calculate grid size based on zoom # Higher zoom = smaller grid cells = more detail if grid_size is None: if zoom <= 3: grid_size = 10.0 # ~1000km cells elif zoom <= 5: grid_size = 5.0 # ~500km cells elif zoom <= 7: grid_size = 2.0 # ~200km cells elif zoom <= 9: grid_size = 1.0 # ~100km cells elif zoom <= 11: grid_size = 0.5 # ~50km cells elif zoom <= 13: grid_size = 0.2 # ~20km cells else: grid_size = 0.1 # ~10km cells # Use PostgreSQL to cluster into grid cells query = """ SELECT FLOOR(lon / $5) * $5 + $5/2 as cell_lon, FLOOR(lat / $5) * $5 + $5/2 as cell_lat, COUNT(*) as count, array_agg(DISTINCT type) as types, AVG(rating) as avg_rating FROM custodians WHERE lat IS NOT NULL AND lon IS NOT NULL AND lon >= $1 AND lat >= $2 AND lon <= $3 AND lat <= $4 GROUP BY FLOOR(lon / $5), FLOOR(lat / $5) ORDER BY count DESC """ async with pool.acquire() as conn: rows = await conn.fetch(query, min_lon, min_lat, max_lon, max_lat, grid_size) features = [] total_count = 0 for row in rows: count = row['count'] total_count += count features.append({ "type": "Feature", "geometry": { "type": "Point", "coordinates": [float(row['cell_lon']), float(row['cell_lat'])] }, "properties": { "cluster": True, "count": count, "types": row['types'], "avg_rating": round(float(row['avg_rating']), 2) if row['avg_rating'] else None, } }) return { "type": "FeatureCollection", "features": features, "metadata": { "mode": "clustered", "cluster_count": len(features), "total_institutions": total_count, "grid_size": grid_size, "zoom": zoom, "bbox": {"min_lon": min_lon, "min_lat": min_lat, "max_lon": max_lon, "max_lat": max_lat} } } # ============================================================================ # Person Endpoints (Beschermers) # ============================================================================ @app.get("/persons", response_model=Dict[str, Any]) async def list_persons( custodian_slug: Optional[str] = Query(None, description="Filter by custodian slug"), heritage_type: Optional[str] = Query(None, description="Filter by heritage type (A, L, M, etc.)"), country_code: Optional[str] = Query(None, description="Filter by country code"), heritage_relevant: Optional[bool] = Query(None, description="Filter by heritage relevance (true/false)"), limit: int = Query(50, ge=1, le=500, description="Max results to return"), offset: int = Query(0, ge=0, description="Offset for pagination"), ): """ List persons with optional filters. Returns paginated list of heritage professionals (beschermers). """ pool = await get_pool() # Build query with optional filters conditions = [] params = [] param_idx = 1 if custodian_slug: conditions.append(f"custodian_slug = ${param_idx}") params.append(custodian_slug) param_idx += 1 if heritage_type: conditions.append(f"${param_idx} = ANY(heritage_types)") params.append(heritage_type) param_idx += 1 if country_code: conditions.append(f"country_code = ${param_idx}") params.append(country_code) param_idx += 1 if heritage_relevant is not None: conditions.append(f"heritage_relevant = ${param_idx}") params.append(heritage_relevant) param_idx += 1 where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else "" # Add pagination params params.extend([limit, offset]) query = f""" SELECT staff_id, name, headline, location, country_code, custodian_slug, custodian_name, linkedin_url, profile_image_url, heritage_relevant, heritage_types FROM persons {where_clause} ORDER BY name LIMIT ${param_idx} OFFSET ${param_idx + 1} """ # Count query count_query = f"SELECT COUNT(*) FROM persons {where_clause}" async with pool.acquire() as conn: rows = await conn.fetch(query, *params) count_params = params[:-2] if params else [] # Remove limit/offset for count total = await conn.fetchval(count_query, *count_params) if count_params else await conn.fetchval(count_query) persons = [] for row in rows: persons.append(PersonSummary( staff_id=row['staff_id'], name=row['name'], headline=row['headline'], location=row['location'], country_code=row['country_code'], custodian_slug=row['custodian_slug'], custodian_name=row['custodian_name'], linkedin_url=row['linkedin_url'], profile_image_url=row['profile_image_url'], heritage_relevant=row['heritage_relevant'] if row['heritage_relevant'] is not None else True, heritage_types=row['heritage_types'] if row['heritage_types'] else [], )) return { "persons": [p.model_dump() for p in persons], "total": total, "limit": limit, "offset": offset, "has_more": offset + len(persons) < total, } @app.get("/persons/count") async def get_persons_count(): """Get total person count for stats display.""" pool = await get_pool() async with pool.acquire() as conn: total = await conn.fetchval("SELECT COUNT(*) FROM persons") heritage_relevant = await conn.fetchval("SELECT COUNT(*) FROM persons WHERE heritage_relevant = true") return { "total": total, "heritage_relevant": heritage_relevant, } @app.get("/persons/search", response_model=Dict[str, Any]) async def search_persons( q: str = Query(..., min_length=2, description="Search query"), limit: int = Query(20, ge=1, le=100, description="Max results"), ): """ Search persons by name, headline, or custodian name. Uses PostgreSQL full-text search. """ pool = await get_pool() # Use ILIKE for simple search (full-text search can be added later if index exists) search_pattern = f"%{q}%" query = """ SELECT staff_id, name, headline, location, country_code, custodian_slug, custodian_name, linkedin_url, profile_image_url, heritage_relevant, heritage_types FROM persons WHERE name ILIKE $1 OR headline ILIKE $1 OR custodian_name ILIKE $1 ORDER BY CASE WHEN name ILIKE $2 THEN 0 ELSE 1 END, name LIMIT $3 """ async with pool.acquire() as conn: rows = await conn.fetch(query, search_pattern, f"{q}%", limit) persons = [] for row in rows: persons.append(PersonSummary( staff_id=row['staff_id'], name=row['name'], headline=row['headline'], location=row['location'], country_code=row['country_code'], custodian_slug=row['custodian_slug'], custodian_name=row['custodian_name'], linkedin_url=row['linkedin_url'], profile_image_url=row['profile_image_url'], heritage_relevant=row['heritage_relevant'] if row['heritage_relevant'] is not None else True, heritage_types=row['heritage_types'] if row['heritage_types'] else [], )) return { "persons": [p.model_dump() for p in persons], "count": len(persons), "query": q, } @app.get("/person/{staff_id}", response_model=PersonDetail) async def get_person(staff_id: str): """Get detailed information for a single person.""" pool = await get_pool() query = """ SELECT staff_id, name, headline, location, country_code, custodian_slug, custodian_name, linkedin_url, profile_image_url, heritage_relevant, heritage_types, experience, education, skills, languages, about, connections, extraction_date, extraction_method, source_file FROM persons WHERE staff_id = $1 """ async with pool.acquire() as conn: row = await conn.fetchrow(query, staff_id) if not row: raise HTTPException(status_code=404, detail=f"Person not found: {staff_id}") return PersonDetail( staff_id=row['staff_id'], name=row['name'], headline=row['headline'], location=row['location'], country_code=row['country_code'], custodian_slug=row['custodian_slug'], custodian_name=row['custodian_name'], linkedin_url=row['linkedin_url'], profile_image_url=row['profile_image_url'], heritage_relevant=row['heritage_relevant'] if row['heritage_relevant'] is not None else True, heritage_types=parse_jsonb_list(row['heritage_types']), experience=enrich_experience_with_heritage(parse_jsonb_list(row['experience'])), education=parse_jsonb_list(row['education']), skills=parse_jsonb_list(row['skills']), languages=parse_jsonb_list(row['languages']), about=row['about'], connections=row['connections'], extraction_date=row['extraction_date'].isoformat() if row['extraction_date'] else None, extraction_method=row['extraction_method'], source_file=row['source_file'], ) # ============================================================================ # Image Proxy (Avoid Hotlinking Issues) # ============================================================================ # In-memory cache for proxied images (simple TTL-based) _image_cache: Dict[str, tuple] = {} # hash -> (content, content_type, timestamp) IMAGE_CACHE_TTL = 3600 # 1 hour # Allowed image domains for security ALLOWED_IMAGE_DOMAINS = { # Google Maps 'lh3.googleusercontent.com', 'lh4.googleusercontent.com', 'lh5.googleusercontent.com', 'lh6.googleusercontent.com', 'maps.gstatic.com', 'maps.googleapis.com', # Wikidata/Wikimedia 'upload.wikimedia.org', 'commons.wikimedia.org', # Institution domains (add as needed) # Generic patterns handled below } def is_allowed_image_url(url: str) -> bool: """Check if URL is from an allowed domain for proxying.""" try: parsed = urlparse(url) domain = parsed.netloc.lower() # Check exact matches if domain in ALLOWED_IMAGE_DOMAINS: return True # Allow any .nl domain (Dutch institutions) if domain.endswith('.nl'): return True # Allow any .org domain (many heritage institutions) if domain.endswith('.org'): return True # Allow any .museum domain if domain.endswith('.museum'): return True # Check for Google user content subdomains if 'googleusercontent.com' in domain: return True return False except Exception: return False @app.get("/image-proxy") async def proxy_image(url: str = Query(..., description="Image URL to proxy")): """ Proxy external images to avoid hotlinking issues. Many external servers block direct embedding (hotlinking) of their images. This endpoint fetches the image server-side and returns it with proper headers. Features: - Validates URL is from allowed domains (security) - Caches images in memory for 1 hour (performance) - Sets proper Content-Type headers - Avoids CORS issues Usage: /image-proxy?url=https://example.com/logo.png """ # Security: validate URL if not url or not url.startswith(('http://', 'https://')): raise HTTPException(status_code=400, detail="Invalid URL") if not is_allowed_image_url(url): raise HTTPException(status_code=403, detail="Domain not allowed for proxying") # Check cache url_hash = hashlib.md5(url.encode()).hexdigest() if url_hash in _image_cache: content, content_type, timestamp = _image_cache[url_hash] if datetime.now().timestamp() - timestamp < IMAGE_CACHE_TTL: return Response( content=content, media_type=content_type, headers={ "Cache-Control": "public, max-age=3600", "X-Proxy-Cache": "HIT", } ) # Fetch image try: async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client: response = await client.get( url, headers={ # Spoof headers to avoid hotlink detection "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.9,nl;q=0.8", "Referer": urlparse(url).scheme + "://" + urlparse(url).netloc + "/", } ) if response.status_code != 200: raise HTTPException(status_code=502, detail=f"Failed to fetch image: {response.status_code}") content = response.content content_type = response.headers.get("content-type", "image/png") # Validate it's actually an image if not content_type.startswith("image/"): raise HTTPException(status_code=400, detail="URL does not point to an image") # Cache the result _image_cache[url_hash] = (content, content_type, datetime.now().timestamp()) # Limit cache size (simple LRU-like cleanup) if len(_image_cache) > 1000: # Remove oldest entries sorted_entries = sorted(_image_cache.items(), key=lambda x: x[1][2]) for key, _ in sorted_entries[:500]: del _image_cache[key] return Response( content=content, media_type=content_type, headers={ "Cache-Control": "public, max-age=3600", "X-Proxy-Cache": "MISS", } ) except httpx.TimeoutException: raise HTTPException(status_code=504, detail="Timeout fetching image") except httpx.RequestError as e: raise HTTPException(status_code=502, detail=f"Error fetching image: {str(e)}") # ============================================================================ # GeoNames Place Lookup (for RAG geospatial filtering) # ============================================================================ # In-memory LRU cache for place lookups (key: "country:lowername", value: PlaceLookupResponse dict) # This dramatically speeds up repeated queries for the same places from functools import lru_cache import time as time_module # Cache statistics for monitoring _place_cache_stats = {"hits": 0, "misses": 0, "last_reset": time_module.time()} def _calculate_bbox(lat: float, lon: float, population: int) -> list[float]: """Calculate bounding box based on population size.""" if population > 200000: delta = 0.15 # ~17km for large cities elif population > 50000: delta = 0.08 # ~9km for medium cities else: delta = 0.04 # ~4.5km for small places return [lon - delta, lat - delta, lon + delta, lat + delta] # LRU cache for place lookups - caches up to 500 places # This avoids repeated SQLite queries for the same place @lru_cache(maxsize=500) def _cached_place_lookup_sync(name_lower: str, country: str, db_path: str) -> dict | None: """ Synchronous cached place lookup using tiered search strategy. Tier 1: Exact match with Title Case (uses index, ~0.03ms) Tier 2: Case-insensitive COLLATE NOCASE (~2ms) Tier 3: Fuzzy match on alternate_names (~12ms) Returns dict with place data or None if not found. """ import sqlite3 conn = sqlite3.connect(db_path) conn.row_factory = sqlite3.Row row = None try: # Tier 1: Exact match with Title Case (fastest - uses index) # Most place names from frontend are already normalized title_name = name_lower.title() cursor = conn.execute(""" SELECT name, latitude, longitude, admin1_name, population FROM cities WHERE name = ? AND country_code = ? ORDER BY population DESC LIMIT 1 """, (title_name, country)) row = cursor.fetchone() if not row: # Tier 2: Case-insensitive match with COLLATE NOCASE cursor = conn.execute(""" SELECT name, latitude, longitude, admin1_name, population FROM cities WHERE name = ? COLLATE NOCASE AND country_code = ? ORDER BY population DESC LIMIT 1 """, (name_lower, country)) row = cursor.fetchone() if not row: # Tier 3: Check alternate_names and ascii_name (slowest but comprehensive) cursor = conn.execute(""" SELECT name, latitude, longitude, admin1_name, population FROM cities WHERE ( alternate_names LIKE '%' || ? || '%' OR ascii_name = ? COLLATE NOCASE ) AND country_code = ? ORDER BY population DESC LIMIT 1 """, (name_lower, name_lower, country)) row = cursor.fetchone() if row: pop = row["population"] or 10000 lat = float(row["latitude"]) lon = float(row["longitude"]) return { "name": row["name"], "latitude": lat, "longitude": lon, "province": row["admin1_name"], "population": pop, "bbox": _calculate_bbox(lat, lon, pop), "country_code": country, } return None finally: conn.close() @app.get("/places/lookup", response_model=PlaceLookupResponse) async def lookup_place( name: str = Query(..., min_length=2, description="Place name to look up"), country: str = Query("NL", description="Country code filter (ISO 3166-1 alpha-2)"), ) -> PlaceLookupResponse: """ Look up a place by name and return coordinates with bounding box. Used by RAG to resolve place names mentioned in user queries and apply geospatial filtering to Qdrant vector search results. OPTIMIZED with tiered lookup strategy: - Tier 1: Exact match (uses index, ~0.03ms) - Tier 2: Case-insensitive COLLATE NOCASE (~2ms) - Tier 3: Fuzzy match on alternate_names (~12ms) Results are cached in-memory (LRU cache, 500 entries) for instant repeated lookups. Bounding box is calculated based on population: - Large cities (>200k): +/- 0.15 degrees (~17km) - Medium cities (50k-200k): +/- 0.08 degrees (~9km) - Small places (<50k): +/- 0.04 degrees (~4.5km) Examples: - /places/lookup?name=Amsterdam -> Returns Amsterdam with large bbox - /places/lookup?name=Leiden -> Returns Leiden with medium bbox - /places/lookup?name=Aalten -> Returns Aalten with small bbox """ global _place_cache_stats db_path = settings.geonames_db # Check if database exists if not Path(db_path).exists(): raise HTTPException( status_code=500, detail=f"GeoNames database not found at {db_path}" ) # Normalize name for cache key name_lower = name.lower().strip() # Check cache stats before lookup cache_info_before = _cached_place_lookup_sync.cache_info() try: # Use cached synchronous lookup (runs in thread pool via asyncio) # The LRU cache makes repeated lookups instant result = await asyncio.get_event_loop().run_in_executor( None, _cached_place_lookup_sync, name_lower, country.upper(), db_path ) # Update cache stats cache_info_after = _cached_place_lookup_sync.cache_info() if cache_info_after.hits > cache_info_before.hits: _place_cache_stats["hits"] += 1 else: _place_cache_stats["misses"] += 1 if not result: raise HTTPException( status_code=404, detail=f"Place '{name}' not found in {country}" ) return PlaceLookupResponse( name=result["name"], latitude=result["latitude"], longitude=result["longitude"], province=result["province"], population=result["population"], bbox=result["bbox"], country_code=result["country_code"], ) except HTTPException: raise except Exception as e: raise HTTPException( status_code=500, detail=f"Database error: {str(e)}" ) @app.get("/places/cache-stats") async def get_place_cache_stats() -> dict: """Get statistics about the place lookup cache.""" cache_info = _cached_place_lookup_sync.cache_info() return { "cache_size": cache_info.currsize, "max_size": cache_info.maxsize, "hits": cache_info.hits, "misses": cache_info.misses, "hit_rate": cache_info.hits / (cache_info.hits + cache_info.misses) if (cache_info.hits + cache_info.misses) > 0 else 0, "api_hits": _place_cache_stats["hits"], "api_misses": _place_cache_stats["misses"], } @app.get("/places/provinces") async def list_provinces( country: str = Query("NL", description="Country code filter"), ) -> List[Dict[str, Any]]: """ List all provinces for a country from GeoNames database. Useful for province-level geospatial filtering. """ db_path = settings.geonames_db if not Path(db_path).exists(): raise HTTPException( status_code=500, detail=f"GeoNames database not found" ) try: async with aiosqlite.connect(db_path) as db: db.row_factory = aiosqlite.Row # Get unique provinces with representative city for bbox calculation cursor = await db.execute(""" SELECT DISTINCT admin1_name as province, admin1_code, AVG(latitude) as center_lat, AVG(longitude) as center_lon, SUM(population) as total_pop FROM cities WHERE country_code = ? AND admin1_name IS NOT NULL GROUP BY admin1_name, admin1_code ORDER BY admin1_name """, (country,)) rows = await cursor.fetchall() provinces = [] for row in rows: provinces.append({ "name": row["province"], "code": row["admin1_code"], "center_lat": row["center_lat"], "center_lon": row["center_lon"], "total_population": row["total_pop"], }) return provinces except aiosqlite.Error as e: raise HTTPException( status_code=500, detail=f"Database error: {str(e)}" ) # ============================================================================ # Main # ============================================================================ if __name__ == "__main__": import uvicorn uvicorn.run( "geo_api:app", host=settings.api_host, port=settings.api_port, reload=True, )