# glam/pyproject.toml

# Package metadata read by Poetry.
[tool.poetry]
name = "glam-extractor"
version = "0.1.0"
description = "Extract and standardize global GLAM (Galleries, Libraries, Archives, Museums) institutional data from conversation transcripts and authoritative registries"
license = "MIT"
readme = "README.md"
authors = ["Your Name <your.email@example.com>"]

# Project links
homepage = "https://github.com/yourusername/glam-extractor"
repository = "https://github.com/yourusername/glam-extractor"

# Discovery metadata
keywords = [
    "glam",
    "heritage",
    "museums",
    "libraries",
    "archives",
    "linkml",
    "nlp",
    "data-extraction",
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Information Analysis",
    "Topic :: Sociology :: History",
]

# The importable package lives under src/ (src layout).
packages = [{ include = "glam_extractor", from = "src" }]

# Runtime dependencies. Caret (^) constraints follow Poetry's
# semver-compatible range rules; ">=" constraints have no upper bound.
[tool.poetry.dependencies]
python = "^3.11"

# Core data processing
pandas = "^2.1.0"
numpy = ">=2.0.0"

# Text processing (direct dependencies only)
# NOTE: NLP extraction (NER) is handled by coding subagents via Task tool
# spaCy, transformers, torch are NOT direct dependencies
rapidfuzz = "^3.5.0"  # Fuzzy string matching for deduplication
langdetect = "^1.0.9"  # Language detection (fallback)
fast-langdetect = "^1.0.0"  # FastText-based language detection (primary, more accurate)
unidecode = "^1.3.7"  # Unicode transliteration

# Web crawling and scraping
crawl4ai = "^0.7.0"
httpx = "^0.27.0"
beautifulsoup4 = "^4.12.0"
lxml = ">=4.9.0,<6.0.0"

# LinkML schema and validation
linkml = "^1.9.0"
linkml-runtime = "^1.9.0"
# NOTE: Upgraded to Pydantic v2 for DSPy compatibility (LinkML 1.9+ supports v2)
pydantic = "^2.0.0"

# RDF and semantic web
rdflib = "^7.0.0"
SPARQLWrapper = "^2.0.0"

# Database and storage
duckdb = ">=1.0.0"
sqlalchemy = "^2.0.0"
# pyarrow 14.x wheels were built against the NumPy 1.x ABI and fail to import
# under the NumPy 2 requirement above; 17.0.0+ is NumPy 2 compatible.
pyarrow = ">=17.0.0"

# Geographic data
geopy = "^2.4.0"
pycountry = "^23.12.0"

# Utilities
click = "^8.1.0"
tqdm = "^4.66.0"
python-dotenv = "^1.0.0"
pyyaml = "^6.0.0"
python-dateutil = "^2.8.0"
requests = "^2.31.0"

# API Server
fastapi = "^0.115.0"
uvicorn = {extras = ["standard"], version = "^0.32.0"}
pydantic-settings = "^2.0.0"

# DSPy for LLM-powered SPARQL generation
dspy-ai = "^2.5.0"
openai = "^1.0.0"  # DSPy backend for OpenAI/Anthropic
qdrant-client = "^1.16.2"
sentence-transformers = "^5.2.0"
typedb-driver = "^3.0.0"

# Development-only tooling, declared in the "dev" dependency group.
[tool.poetry.group.dev.dependencies]
# Test runner and plugins
pytest = "^7.4.0"
pytest-asyncio = "^0.21.0"
pytest-cov = "^4.1.0"
hypothesis = "^6.92.0"

# Formatting, linting and git hooks
black = "^23.11.0"
ruff = "^0.8.0"
pre-commit = "^3.5.0"

# Static typing: checker plus stub packages
mypy = "^1.7.0"
types-pyyaml = "^6.0.12.20250915"

# Documentation site
mkdocs = "^1.5.0"
mkdocs-material = "^9.4.0"
mkdocstrings = { version = "^0.24.0", extras = ["python"] }

# Notebooks and plotting for exploration
jupyter = "^1.0.0"
ipykernel = "^6.27.0"
matplotlib = "^3.8.0"
seaborn = "^0.13.0"

# Console entry points: each key is the installed command name and each value
# is the "module:function" target it runs.
[tool.poetry.scripts]
# Main command
glam = "glam_extractor.cli:main"
# Per-task commands
glam-crawl = "glam_extractor.cli:crawl_command"
glam-export = "glam_extractor.cli:export_command"
glam-extract = "glam_extractor.cli:extract_command"
glam-validate = "glam_extractor.cli:validate_command"

# Black code formatter settings.
[tool.black]
# Same value as `line-length` under [tool.ruff].
line-length = 100
target-version = ['py311']
# Regex for files to format: paths ending in .py or .pyi.
include = '\.pyi?$'
# Extra paths to skip, written as a multi-line regex. Everything between the
# triple quotes, including the "# directories" line, is part of the string
# value and not a TOML comment.
extend-exclude = '''
/(
  # directories
  \.eggs
  | \.git
  | \.hg
  | \.mypy_cache
  | \.tox
  | \.venv
  | build
  | dist
)/
'''

# Ruff settings shared by the linter and formatter.
[tool.ruff]
line-length = 100
target-version = "py311"

# Linter-only settings. Ruff 0.2+ deprecates `select`, `ignore`,
# `per-file-ignores` and `isort` at the top level of [tool.ruff] and expects
# them under the `lint` table.
[tool.ruff.lint]
select = [
    "E",   # pycodestyle errors
    "W",   # pycodestyle warnings
    "F",   # pyflakes
    "I",   # isort
    "B",   # flake8-bugbear
    "C4",  # flake8-comprehensions
    "UP",  # pyupgrade
]
ignore = [
    "E501",  # line too long (handled by black)
    "B008",  # do not perform function calls in argument defaults
    "C901",  # too complex
]

[tool.ruff.lint.per-file-ignores]
# Allow unused imports in package __init__ files.
"__init__.py" = ["F401"]

[tool.ruff.lint.isort]
known-first-party = ["glam_extractor"]

# mypy static type checking.
[tool.mypy]
python_version = "3.11"
plugins = ["pydantic.mypy"]

# Untyped and partially typed definitions
disallow_untyped_defs = true
disallow_incomplete_defs = true
check_untyped_defs = true

# Optional handling and comparisons
no_implicit_optional = true
strict_equality = true

# Warnings
warn_return_any = true
warn_no_return = true
warn_unreachable = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_unused_configs = true

# Third-party packages for which missing-import errors are silenced.
[[tool.mypy.overrides]]
module = ["crawl4ai.*", "linkml.*", "rdflib.*", "geopy.*"]
ignore_missing_imports = true

# Options for the pydantic.mypy plugin listed in [tool.mypy].
[tool.pydantic-mypy]
init_forbid_extra = true
init_typed = true
warn_required_dynamic_aliases = true

# pytest configuration.
[tool.pytest.ini_options]
minversion = "7.0"
testpaths = ["tests"]
pythonpath = ["src"]

# Default command-line options.
addopts = [
    # Reporting
    "-ra",
    # Strictness
    "--strict-markers",
    "--strict-config",
    # Coverage
    "--cov=glam_extractor",
    "--cov-report=term-missing:skip-covered",
    "--cov-report=html",
    "--cov-report=xml",
]

# Registered markers (required because of --strict-markers).
markers = [
    # General categories
    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
    "integration: marks tests as integration tests",
    "unit: marks tests as unit tests",
    "subagent: marks tests that use coding subagents for NER",
    "web: marks tests that require internet connection",
    "performance: marks tests that measure performance metrics",
    # DSPy GitOps layers
    "layer1: fast unit tests without LLM (DSPy GitOps)",
    "layer2: DSPy module tests with LLM (DSPy GitOps)",
    "layer3: integration tests with live Oxigraph (DSPy GitOps)",
    "layer4: comprehensive evaluation (DSPy GitOps)",
    # CI selection and external requirements
    "smoke: quick smoke tests for CI",
    "requires_oxigraph: tests that need Oxigraph connection",
    "requires_llm: tests that need LLM API access",
]

# coverage.py measurement settings.
[tool.coverage.run]
branch = true
source = ["src"]
# Path globs left out of measurement.
omit = ["*/tests/*", "*/__pycache__/*", "*/site-packages/*"]

# coverage.py reporting settings.
[tool.coverage.report]
# Regexes for lines to leave out of the report. Literal (single-quoted)
# strings keep the backslashes unescaped.
exclude_lines = [
    'pragma: no cover',
    'def __repr__',
    'raise AssertionError',
    'raise NotImplementedError',
    'if __name__ == .__main__.:',
    'if TYPE_CHECKING:',
    'class .*\bProtocol\):',
    '@(abc\.)?abstractmethod',
]

# PEP 517 build backend. poetry-core is bounded so that builds do not pick up
# a pre-1.0 release or a future major version that may stop reading the
# legacy [tool.poetry] metadata this file relies on.
[build-system]
requires = ["poetry-core>=1.0.0,<3.0.0"]
build-backend = "poetry.core.masonry.api"