# glam/pyproject.toml
# Exported 2025-12-10 13:01:13 +01:00 — 222 lines, 5.2 KiB, TOML.
# (File-listing metadata from the export tool, commented out so the file parses as valid TOML.)

# Poetry 1.x project metadata ([tool.poetry] rather than PEP 621 [project]).
[tool.poetry]
name = "glam-extractor"
version = "0.1.0"
description = "Extract and standardize global GLAM (Galleries, Libraries, Archives, Museums) institutional data from conversation transcripts and authoritative registries"
# NOTE(review): placeholder author/homepage/repository values — replace before publishing.
authors = ["Your Name <your.email@example.com>"]
license = "MIT"
readme = "README.md"
homepage = "https://github.com/yourusername/glam-extractor"
repository = "https://github.com/yourusername/glam-extractor"
keywords = ["glam", "heritage", "museums", "libraries", "archives", "linkml", "nlp", "data-extraction"]
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Information Analysis",
"Topic :: Sociology :: History",
]
# src/ layout: the importable package lives at src/glam_extractor.
packages = [{include = "glam_extractor", from = "src"}]
# Runtime dependencies. Caret (^) constraints allow compatible minor/patch upgrades.
[tool.poetry.dependencies]
python = "^3.11"
# Core data processing
pandas = "^2.1.0"
numpy = "^1.26.0"
# Text processing (direct dependencies only)
# NOTE: NLP extraction (NER) is handled by coding subagents via Task tool
# spaCy, transformers, torch are NOT direct dependencies
rapidfuzz = "^3.5.0" # Fuzzy string matching for deduplication
langdetect = "^1.0.9" # Language detection
unidecode = "^1.3.7" # Unicode transliteration
# Web crawling and scraping
crawl4ai = "^0.7.0"
httpx = "^0.27.0"
beautifulsoup4 = "^4.12.0"
lxml = ">=4.9.0,<6.0.0"
# LinkML schema and validation
linkml = "^1.9.0"
linkml-runtime = "^1.9.0"
# NOTE: Upgraded to Pydantic v2 for DSPy compatibility (LinkML 1.9+ supports v2)
pydantic = "^2.0.0"
# RDF and semantic web
rdflib = "^7.0.0"
SPARQLWrapper = "^2.0.0"
# Database and storage
duckdb = "^0.9.0" # NOTE(review): pre-1.0 pin; DuckDB 1.x changed the storage format — confirm before bumping
sqlalchemy = "^2.0.0"
pyarrow = "^14.0.0"
# Geographic data
geopy = "^2.4.0"
pycountry = "^23.12.0"
# Utilities
click = "^8.1.0"
tqdm = "^4.66.0"
python-dotenv = "^1.0.0"
pyyaml = "^6.0.0"
python-dateutil = "^2.8.0"
requests = "^2.31.0"
# API Server
fastapi = "^0.115.0"
uvicorn = {extras = ["standard"], version = "^0.32.0"}
pydantic-settings = "^2.0.0"
# DSPy for LLM-powered SPARQL generation
# NOTE(review): the PyPI distribution was renamed from "dspy-ai" to "dspy" around
# the 2.5 release — confirm which distribution is intended here.
dspy-ai = "^2.5.0"
openai = "^1.0.0" # DSPy backend for OpenAI/Anthropic
# Development-only dependencies (installed with `poetry install --with dev`).
[tool.poetry.group.dev.dependencies]
# Testing
pytest = "^7.4.0"
pytest-cov = "^4.1.0"
pytest-asyncio = "^0.21.0"
hypothesis = "^6.92.0"
# Code quality
ruff = "^0.8.0"
black = "^23.11.0"
mypy = "^1.7.0"
pre-commit = "^3.5.0"
# Documentation
mkdocs = "^1.5.0"
mkdocs-material = "^9.4.0"
mkdocstrings = {extras = ["python"], version = "^0.24.0"}
# Jupyter for exploration
jupyter = "^1.0.0"
ipykernel = "^6.27.0"
matplotlib = "^3.8.0"
seaborn = "^0.13.0"
# Console entry points: each maps a shell command to a callable in glam_extractor.cli.
[tool.poetry.scripts]
glam = "glam_extractor.cli:main"
glam-extract = "glam_extractor.cli:extract_command"
glam-validate = "glam_extractor.cli:validate_command"
glam-export = "glam_extractor.cli:export_command"
glam-crawl = "glam_extractor.cli:crawl_command"
# Black formatter config. line-length deliberately matches [tool.ruff] (100).
[tool.black]
line-length = 100
target-version = ['py311']
include = '\.pyi?$'
# Regex of paths (beyond Black's defaults) to skip when formatting.
extend-exclude = '''
/(
# directories
\.eggs
| \.git
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| build
| dist
)/
'''
# Ruff linter/formatter config. line-length matches [tool.black].
[tool.ruff]
line-length = 100
target-version = "py311"

# Ruff 0.2 moved lint settings under [tool.ruff.lint]; the deprecated top-level
# `select`/`ignore` keys (and [tool.ruff.per-file-ignores] / [tool.ruff.isort])
# were subsequently removed, so ruff ^0.8.0 rejects the old layout.
[tool.ruff.lint]
select = [
"E", # pycodestyle errors
"W", # pycodestyle warnings
"F", # pyflakes
"I", # isort
"B", # flake8-bugbear
"C4", # flake8-comprehensions
"UP", # pyupgrade
]
ignore = [
"E501", # line too long (handled by black)
"B008", # do not perform function calls in argument defaults
"C901", # too complex
]

[tool.ruff.lint.per-file-ignores]
# Re-exports in package __init__ files are intentional.
"__init__.py" = ["F401"]

[tool.ruff.lint.isort]
known-first-party = ["glam_extractor"]
# Strict-leaning mypy configuration with the pydantic plugin enabled.
[tool.mypy]
python_version = "3.11"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
check_untyped_defs = true
no_implicit_optional = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_no_return = true
warn_unreachable = true
strict_equality = true
plugins = ["pydantic.mypy"]

# Per-module overrides for third-party packages that ship no type stubs.
# Kept directly after [tool.mypy]: the TOML spec discourages defining a
# table's sub-tables out of order (previously [tool.pydantic-mypy] was
# interleaved between them).
[[tool.mypy.overrides]]
module = [
"crawl4ai.*",
"linkml.*",
"rdflib.*",
"geopy.*",
]
ignore_missing_imports = true

# Settings consumed by the pydantic mypy plugin enabled above.
[tool.pydantic-mypy]
init_forbid_extra = true
init_typed = true
warn_required_dynamic_aliases = true
# Pytest configuration. Coverage flags here mean every `pytest` run produces
# terminal, HTML, and XML coverage reports.
# NOTE(review): pytest-asyncio is a dev dependency but no `asyncio_mode` is set
# here — confirm whether async tests rely on the default (strict) mode.
[tool.pytest.ini_options]
minversion = "7.0"
addopts = [
"-ra",
"--strict-markers",
"--strict-config",
"--cov=glam_extractor",
"--cov-report=term-missing:skip-covered",
"--cov-report=html",
"--cov-report=xml",
]
testpaths = ["tests"]
pythonpath = ["src"]
markers = [
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
"integration: marks tests as integration tests",
"unit: marks tests as unit tests",
"subagent: marks tests that use coding subagents for NER",
"web: marks tests that require internet connection",
"performance: marks tests that measure performance metrics",
]
# Coverage.py measurement settings (used via pytest-cov, see addopts above).
[tool.coverage.run]
source = ["src"]
branch = true
omit = [
"*/tests/*",
"*/__pycache__/*",
"*/site-packages/*",
]

# Regex patterns for lines excluded from coverage reporting.
[tool.coverage.report]
exclude_lines = [
"pragma: no cover",
"def __repr__",
"raise AssertionError",
"raise NotImplementedError",
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
"class .*\\bProtocol\\):",
"@(abc\\.)?abstractmethod",
]
# PEP 517 build backend: poetry-core builds the package without requiring
# the full Poetry tool at install time.
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"