[tool.poetry]
name = "glam-extractor"
version = "0.1.0"
description = "Extract and standardize global GLAM (Galleries, Libraries, Archives, Museums) institutional data from conversation transcripts and authoritative registries"
authors = ["Your Name"]
license = "MIT"
readme = "README.md"
homepage = "https://github.com/yourusername/glam-extractor"
repository = "https://github.com/yourusername/glam-extractor"
keywords = ["glam", "heritage", "museums", "libraries", "archives", "linkml", "nlp", "data-extraction"]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Information Analysis",
    "Topic :: Sociology :: History",
]
packages = [{include = "glam_extractor", from = "src"}]

[tool.poetry.dependencies]
python = "^3.11"

# Core data processing
pandas = "^2.1.0"
numpy = "^1.26.0"

# Text processing (direct dependencies only)
# NOTE: NLP extraction (NER) is handled by coding subagents via Task tool;
# spaCy, transformers, torch are NOT direct dependencies.
rapidfuzz = "^3.5.0"   # Fuzzy string matching for deduplication
langdetect = "^1.0.9"  # Language detection
unidecode = "^1.3.7"   # Unicode transliteration

# Web crawling and scraping
crawl4ai = "^0.7.0"
httpx = "^0.27.0"
beautifulsoup4 = "^4.12.0"
lxml = ">=4.9.0,<6.0.0"

# LinkML schema and validation
linkml = "^1.9.0"
linkml-runtime = "^1.9.0"
# NOTE: Upgraded to Pydantic v2 for DSPy compatibility (LinkML 1.9+ supports v2)
pydantic = "^2.0.0"

# RDF and semantic web
rdflib = "^7.0.0"
SPARQLWrapper = "^2.0.0"

# Database and storage
duckdb = "^0.9.0"
sqlalchemy = "^2.0.0"
pyarrow = "^14.0.0"

# Geographic data
geopy = "^2.4.0"
pycountry = "^23.12.0"

# Utilities
click = "^8.1.0"
tqdm = "^4.66.0"
python-dotenv = "^1.0.0"
pyyaml = "^6.0.0"
python-dateutil = "^2.8.0"
requests = "^2.31.0"

# API Server
fastapi = "^0.115.0"
uvicorn = {extras = ["standard"], version = "^0.32.0"}
pydantic-settings = "^2.0.0"

# DSPy for LLM-powered SPARQL generation
dspy-ai = "^2.5.0"
openai = "^1.0.0"  # DSPy backend for OpenAI/Anthropic

[tool.poetry.group.dev.dependencies]
# Testing
pytest = "^7.4.0"
pytest-cov = "^4.1.0"
pytest-asyncio = "^0.21.0"
hypothesis = "^6.92.0"

# Code quality
ruff = "^0.1.0"
black = "^23.11.0"
mypy = "^1.7.0"
pre-commit = "^3.5.0"

# Documentation
mkdocs = "^1.5.0"
mkdocs-material = "^9.4.0"
mkdocstrings = {extras = ["python"], version = "^0.24.0"}

# Jupyter for exploration
jupyter = "^1.0.0"
ipykernel = "^6.27.0"
matplotlib = "^3.8.0"
seaborn = "^0.13.0"

[tool.poetry.scripts]
glam = "glam_extractor.cli:main"
glam-extract = "glam_extractor.cli:extract_command"
glam-validate = "glam_extractor.cli:validate_command"
glam-export = "glam_extractor.cli:export_command"
glam-crawl = "glam_extractor.cli:crawl_command"

[tool.black]
line-length = 100
target-version = ['py311']
include = '\.pyi?$'
extend-exclude = '''
/(
  # directories
  \.eggs
  | \.git
  | \.hg
  | \.mypy_cache
  | \.tox
  | \.venv
  | build
  | dist
)/
'''

[tool.ruff]
line-length = 100
target-version = "py311"
select = [
    "E",   # pycodestyle errors
    "W",   # pycodestyle warnings
    "F",   # pyflakes
    "I",   # isort
    "B",   # flake8-bugbear
    "C4",  # flake8-comprehensions
    "UP",  # pyupgrade
]
ignore = [
    "E501",  # line too long (handled by black)
    "B008",  # do not perform function calls in argument defaults
    "C901",  # too complex
]

[tool.ruff.per-file-ignores]
"__init__.py" = ["F401"]

[tool.ruff.isort]
known-first-party = ["glam_extractor"]

[tool.mypy]
python_version = "3.11"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
check_untyped_defs = true
no_implicit_optional = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_no_return = true
warn_unreachable = true
strict_equality = true
plugins = ["pydantic.mypy"]

# Third-party packages without type stubs
[[tool.mypy.overrides]]
module = [
    "crawl4ai.*",
    "linkml.*",
    "rdflib.*",
    "geopy.*",
]
ignore_missing_imports = true

[tool.pydantic-mypy]
init_forbid_extra = true
init_typed = true
warn_required_dynamic_aliases = true

[tool.pytest.ini_options]
minversion = "7.0"
addopts = [
    "-ra",
    "--strict-markers",
    "--strict-config",
    "--cov=glam_extractor",
    "--cov-report=term-missing:skip-covered",
    "--cov-report=html",
    "--cov-report=xml",
]
testpaths = ["tests"]
pythonpath = ["src"]
markers = [
    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
    "integration: marks tests as integration tests",
    "unit: marks tests as unit tests",
    "subagent: marks tests that use coding subagents for NER",
    "web: marks tests that require internet connection",
    "performance: marks tests that measure performance metrics",
]

[tool.coverage.run]
source = ["src"]
branch = true
omit = [
    "*/tests/*",
    "*/__pycache__/*",
    "*/site-packages/*",
]

[tool.coverage.report]
exclude_lines = [
    "pragma: no cover",
    "def __repr__",
    "raise AssertionError",
    "raise NotImplementedError",
    "if __name__ == .__main__.:",
    "if TYPE_CHECKING:",
    "class .*\\bProtocol\\):",
    "@(abc\\.)?abstractmethod",
]

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"