# - Layer 1: 35 unit tests (no LLM required)
# - Layer 2: 56 DSPy module tests with LLM
# - Layer 3: 10 integration tests with Oxigraph
# - Layer 4: Comprehensive evaluation suite
#
# Fixed:
# - Coordinate queries to use schema:location -> blank node pattern
# - Golden query expected intent for location questions
# - Health check test filtering in Layer 4
#
# Added GitHub Actions workflow for CI/CD evaluation
#
# 234 lines, 5.8 KiB, TOML
# Distribution metadata that Poetry reads when building the package.
[tool.poetry]
name = "glam-extractor"
version = "0.1.0"
description = """\
Extract and standardize global GLAM (Galleries, Libraries, Archives, Museums) \
institutional data from conversation transcripts and authoritative registries\
"""
license = "MIT"
readme = "README.md"
# NOTE(review): the author entry and both URLs below look like template
# placeholders -- confirm the real values before publishing.
authors = ["Your Name <your.email@example.com>"]
homepage = "https://github.com/yourusername/glam-extractor"
repository = "https://github.com/yourusername/glam-extractor"
keywords = [
    "glam",
    "heritage",
    "museums",
    "libraries",
    "archives",
    "linkml",
    "nlp",
    "data-extraction",
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Information Analysis",
    "Topic :: Sociology :: History",
]
# The importable package is located under the src/ directory.
packages = [{ include = "glam_extractor", from = "src" }]
# Runtime dependencies installed with the package.
[tool.poetry.dependencies]
python = "^3.11"

# Core data processing
pandas = "^2.1.0"
numpy = ">=2.0.0"

# Text processing (direct dependencies only)
# NOTE: NLP extraction (NER) is handled by coding subagents via Task tool
# spaCy, transformers, torch are NOT direct dependencies
rapidfuzz = "^3.5.0"  # Fuzzy string matching for deduplication
langdetect = "^1.0.9"  # Language detection (fallback)
fast-langdetect = "^1.0.0"  # FastText-based language detection (primary, more accurate)
unidecode = "^1.3.7"  # Unicode transliteration

# Web crawling and scraping
crawl4ai = "^0.7.0"
httpx = "^0.27.0"
beautifulsoup4 = "^4.12.0"
lxml = ">=4.9.0,<6.0.0"

# LinkML schema and validation
linkml = "^1.9.0"
linkml-runtime = "^1.9.0"
# NOTE: Upgraded to Pydantic v2 for DSPy compatibility (LinkML 1.9+ supports v2)
pydantic = "^2.0.0"

# RDF and semantic web
rdflib = "^7.0.0"
SPARQLWrapper = "^2.0.0"

# Database and storage
duckdb = ">=1.0.0"
sqlalchemy = "^2.0.0"
# Floor raised from ^14.0.0: pyarrow 14.x wheels are compiled against the
# NumPy 1.x ABI and declare no upper bound on numpy, so the resolver would
# pair them with numpy >=2.0.0 (required above) and the import fails at runtime.
# NOTE(review): 16.0.0 is taken as the first NumPy 2-compatible release -- confirm
# against the Arrow release notes and regenerate the lock file.
pyarrow = ">=16.0.0"

# Geographic data
geopy = "^2.4.0"
pycountry = "^23.12.0"

# Utilities
click = "^8.1.0"
tqdm = "^4.66.0"
python-dotenv = "^1.0.0"
pyyaml = "^6.0.0"
python-dateutil = "^2.8.0"
requests = "^2.31.0"

# API Server
fastapi = "^0.115.0"
uvicorn = { extras = ["standard"], version = "^0.32.0" }
pydantic-settings = "^2.0.0"

# DSPy for LLM-powered SPARQL generation
dspy-ai = "^2.5.0"
openai = "^1.0.0"  # DSPy backend for OpenAI/Anthropic
qdrant-client = "^1.16.2"
sentence-transformers = "^5.2.0"
typedb-driver = "^3.0.0"
# Development-only dependencies (Poetry "dev" group); not installed for end users.
[tool.poetry.group.dev.dependencies]

# Test runner and plugins
hypothesis = "^6.92.0"
pytest = "^7.4.0"
pytest-asyncio = "^0.21.0"
pytest-cov = "^4.1.0"

# Linting, formatting, type checking
black = "^23.11.0"
mypy = "^1.7.0"
pre-commit = "^3.5.0"
ruff = "^0.8.0"
# Type stubs for PyYAML, kept next to mypy.
types-pyyaml = "^6.0.12.20250915"

# Documentation site
mkdocs = "^1.5.0"
mkdocs-material = "^9.4.0"
mkdocstrings = { extras = ["python"], version = "^0.24.0" }

# Notebooks and plotting for exploration
ipykernel = "^6.27.0"
jupyter = "^1.0.0"
matplotlib = "^3.8.0"
seaborn = "^0.13.0"
# Console commands; each name maps to a "module:callable" target in glam_extractor.cli.
[tool.poetry.scripts]
# Top-level entry point
glam = "glam_extractor.cli:main"
# Per-task shortcuts
glam-crawl = "glam_extractor.cli:crawl_command"
glam-export = "glam_extractor.cli:export_command"
glam-extract = "glam_extractor.cli:extract_command"
glam-validate = "glam_extractor.cli:validate_command"
# Black formatter settings.
[tool.black]
line-length = 100
target-version = ['py311']
# Only .py and .pyi files are formatted.
include = '\.pyi?$'
# Directories skipped in addition to Black's defaults. The pattern spans
# several lines, so every alternative sits on its own line; stray bare `|`
# lines must not appear here, since each one adds an empty alternative.
extend-exclude = '''
/(
  # directories
  \.eggs
  | \.git
  | \.hg
  | \.mypy_cache
  | \.tox
  | \.venv
  | build
  | dist
)/
'''
# Ruff settings shared by the linter and formatter.
[tool.ruff]
line-length = 100
target-version = "py311"

# Linter-specific settings belong under `lint`: the top-level `select`,
# `ignore`, `per-file-ignores` and `isort` forms are deprecated since
# ruff 0.2, and this project requires ruff ^0.8.0.
[tool.ruff.lint]
select = [
    "E",   # pycodestyle errors
    "W",   # pycodestyle warnings
    "F",   # pyflakes
    "I",   # isort
    "B",   # flake8-bugbear
    "C4",  # flake8-comprehensions
    "UP",  # pyupgrade
]
ignore = [
    "E501",  # line too long (handled by black)
    "B008",  # do not perform function calls in argument defaults
    "C901",  # too complex
]

# Unused imports are allowed in package __init__ files.
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"]

[tool.ruff.lint.isort]
known-first-party = ["glam_extractor"]
# mypy type-checker settings.
[tool.mypy]
python_version = "3.11"
plugins = ["pydantic.mypy"]

# Annotation requirements
check_untyped_defs = true
disallow_incomplete_defs = true
disallow_untyped_defs = true
no_implicit_optional = true
strict_equality = true

# Extra warnings
warn_no_return = true
warn_redundant_casts = true
warn_return_any = true
warn_unreachable = true
warn_unused_configs = true
warn_unused_ignores = true
# Options for the pydantic.mypy plugin that is enabled in [tool.mypy].
[tool.pydantic-mypy]
init_typed = true
init_forbid_extra = true
warn_required_dynamic_aliases = true
# Imports from these third-party packages are not reported as missing by mypy.
[[tool.mypy.overrides]]
ignore_missing_imports = true
module = ["crawl4ai.*", "geopy.*", "linkml.*", "rdflib.*"]
# pytest configuration.
[tool.pytest.ini_options]
minversion = "7.0"
testpaths = ["tests"]
# Makes the src-layout package importable during test runs.
pythonpath = ["src"]
addopts = [
    # Reporting and strictness
    "-ra",
    "--strict-markers",
    "--strict-config",
    # Coverage of the glam_extractor package, in three report formats
    "--cov=glam_extractor",
    "--cov-report=term-missing:skip-covered",
    "--cov-report=html",
    "--cov-report=xml",
]
# --strict-markers is set above, so every marker used in tests must be listed here.
markers = [
    # General-purpose markers
    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
    "integration: marks tests as integration tests",
    "unit: marks tests as unit tests",
    "subagent: marks tests that use coding subagents for NER",
    "web: marks tests that require internet connection",
    "performance: marks tests that measure performance metrics",
    # DSPy GitOps test layers
    "layer1: fast unit tests without LLM (DSPy GitOps)",
    "layer2: DSPy module tests with LLM (DSPy GitOps)",
    "layer3: integration tests with live Oxigraph (DSPy GitOps)",
    "layer4: comprehensive evaluation (DSPy GitOps)",
    # CI selection and external requirements
    "smoke: quick smoke tests for CI",
    "requires_oxigraph: tests that need Oxigraph connection",
    "requires_llm: tests that need LLM API access",
]
# coverage.py measurement settings.
[tool.coverage.run]
branch = true
source = ["src"]
# Paths left out of measurement.
omit = ["*/tests/*", "*/__pycache__/*", "*/site-packages/*"]
# coverage.py reporting settings.
[tool.coverage.report]
# Each entry is a regex; matching lines are left out of the coverage report.
# Literal strings are used so the regex backslashes need no TOML escaping.
exclude_lines = [
    'pragma: no cover',
    'def __repr__',
    'raise AssertionError',
    'raise NotImplementedError',
    'if __name__ == .__main__.:',
    'if TYPE_CHECKING:',
    'class .*\bProtocol\):',
    '@(abc\.)?abstractmethod',
]
# PEP 517 build configuration: the package is built with Poetry's core backend.
[build-system]
# A version floor is given, following the form shown in Poetry's documentation,
# instead of leaving the build requirement unconstrained.
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"