Commit 4485f4e
Author: Hartorn

feat(giskard-llm): add LLMClient, Azure providers, typed ToolCall, message validation

Introduce LLMClient as the core entry point with named provider aliases, os.environ/ resolution, and lazy instance caching. Replace the global provider cache and if/elif routing with a registry pattern. Add Azure OpenAI and Azure AI Foundry providers. Type tool_calls with Pydantic models, add ChatMessage TypedDict, and per-provider message validation.

- Fix Google provider system message bug (was reading from params)
- Add gemini/ as alias for google/ prefix (backward compat)
- Bare model names default to openai (e.g. "gpt-4o")
- 67 functional tests parametrized across all 5 providers
- Per-provider CI matrix with conditional env var injection
- Comprehensive provider docstrings per 04-provider-docs.mdc

Made-with: Cursor
1 parent 668345b commit 4485f4e

30 files changed: 1921 additions, 252 deletions
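The commit message describes typing `tool_calls` with Pydantic models and adding a `ChatMessage` TypedDict. A hedged sketch of what those shapes could look like, in the OpenAI wire style; field and class names beyond `ToolCall` and `ChatMessage` are assumptions, not the actual giskard-llm definitions:

```python
from typing import Literal, TypedDict

from pydantic import BaseModel


class FunctionCall(BaseModel):
    name: str
    arguments: str  # JSON-encoded arguments, as in the OpenAI wire format


class ToolCall(BaseModel):
    id: str
    type: Literal["function"] = "function"
    function: FunctionCall


class ChatMessage(TypedDict, total=False):
    role: Literal["system", "user", "assistant", "tool"]
    content: str
    tool_calls: list[ToolCall]  # present on assistant messages that call tools
    tool_call_id: str  # present on tool-result messages


msg: ChatMessage = {"role": "user", "content": "Hello!"}
call = ToolCall(
    id="call_0",
    function=FunctionCall(name="get_weather", arguments='{"city": "Paris"}'),
)
```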

.github/workflows/integration-tests.yml

71 additions, 8 deletions

```diff
@@ -50,19 +50,17 @@ jobs:
       echo "::error::External contributors require a maintainer to add the 'safe for build' label."
       exit 1
 
-  test-functional:
+  test-agents-functional:
     needs: authorize
     runs-on: ubuntu-latest
     timeout-minutes: 30
     strategy:
       fail-fast: false
       matrix:
         python-version: ["3.12", "3.13", "3.14"]
-        package: [giskard-core, giskard-agents, giskard-checks]
         provider: [google]
-    name: test-functional / ${{ matrix.package }} / ${{ matrix.provider }} / ${{ matrix.python-version }}
+    name: agents / ${{ matrix.provider }} / ${{ matrix.python-version }}
     env:
-      PACKAGE: ${{ matrix.package }}
       PROVIDER: ${{ matrix.provider }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
@@ -77,7 +75,72 @@ jobs:
         run: uv pip install "giskard-llm[$PROVIDER]"
       - name: Run functional tests
         env:
-          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
-          TEST_MODEL: "gemini/gemini-2.0-flash"
-          TEST_EMBEDDING_MODEL: "gemini/gemini-embedding-001"
-        run: make test-functional PACKAGE=$PACKAGE PROVIDER=$PROVIDER
+          GOOGLE_API_KEY: ${{ matrix.provider == 'google' && secrets.GEMINI_API_KEY || '' }}
+          TEST_MODEL: "google/gemini-2.0-flash"
+          TEST_EMBEDDING_MODEL: "google/gemini-embedding-001"
+        run: make test-functional PACKAGE=giskard-agents PROVIDER=$PROVIDER
+
+  test-llm-functional:
+    needs: authorize
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.12"]
+        provider: [openai, google, anthropic, azure, azure_ai]
+    name: llm / ${{ matrix.provider }} / ${{ matrix.python-version }}
+    env:
+      PROVIDER: ${{ matrix.provider }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+        with:
+          ref: ${{ github.event.pull_request.head.sha || github.ref }}
+      - uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7
+        with:
+          enable-cache: true
+          python-version: ${{ matrix.python-version }}
+      - run: make install
+      - name: Install provider SDK
+        run: uv pip install "giskard-llm[$PROVIDER]"
+      - name: Run functional tests
+        env:
+          OPENAI_API_KEY: ${{ matrix.provider == 'openai' && secrets.OPENAI_API_KEY || '' }}
+          GOOGLE_API_KEY: ${{ matrix.provider == 'google' && secrets.GEMINI_API_KEY || '' }}
+          ANTHROPIC_API_KEY: ${{ matrix.provider == 'anthropic' && secrets.ANTHROPIC_API_KEY || '' }}
+          AZURE_API_KEY: ${{ matrix.provider == 'azure' && secrets.AZURE_API_KEY || '' }}
+          AZURE_API_BASE: ${{ matrix.provider == 'azure' && secrets.AZURE_API_BASE || '' }}
+          AZURE_API_VERSION: ${{ matrix.provider == 'azure' && secrets.AZURE_API_VERSION || '' }}
+          AZURE_AI_API_KEY: ${{ matrix.provider == 'azure_ai' && secrets.AZURE_AI_API_KEY || '' }}
+          AZURE_AI_ENDPOINT: ${{ matrix.provider == 'azure_ai' && secrets.AZURE_AI_ENDPOINT || '' }}
+        run: make test-functional PACKAGE=giskard-llm PROVIDER=$PROVIDER
+
+  test-checks-functional:
+    needs: authorize
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.12", "3.13", "3.14"]
+        provider: [google]
+    name: checks / ${{ matrix.provider }} / ${{ matrix.python-version }}
+    env:
+      PROVIDER: ${{ matrix.provider }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+        with:
+          ref: ${{ github.event.pull_request.head.sha || github.ref }}
+      - uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7
+        with:
+          enable-cache: true
+          python-version: ${{ matrix.python-version }}
+      - run: make install
+      - name: Install provider SDK
+        run: uv pip install "giskard-llm[$PROVIDER]"
+      - name: Run functional tests
+        env:
+          GOOGLE_API_KEY: ${{ matrix.provider == 'google' && secrets.GEMINI_API_KEY || '' }}
+          TEST_MODEL: "google/gemini-2.0-flash"
+          TEST_EMBEDDING_MODEL: "google/gemini-embedding-001"
+        run: make test-functional PACKAGE=giskard-checks PROVIDER=$PROVIDER
```

libs/giskard-agents/src/giskard/agents/embeddings/litellm_embedding_model.py

1 addition, 1 deletion

```diff
@@ -9,7 +9,7 @@
 class LitellmEmbeddingModel(BaseEmbeddingModel):
     """An embedding model backed by giskard-llm."""
 
-    model: str = Field(default="gemini/gemini-embedding-001")
+    model: str = Field(default="google/gemini-embedding-001")
 
     async def _embed(
         self, texts: list[str], params: EmbeddingParams | None = None
```

libs/giskard-agents/src/giskard/agents/generators/litellm_generator.py

1 addition, 1 deletion

```diff
@@ -24,7 +24,7 @@ class LiteLLMGenerator(BaseGenerator):
     """A generator for creating chat completion pipelines."""
 
     model: str = Field(
-        description="The model identifier to use (e.g. 'gemini/gemini-2.0-flash')"
+        description="The model identifier to use (e.g. 'google/gemini-2.0-flash')"
     )
     retry_policy: RetryPolicy | None = Field(default_factory=RetryPolicy)
```
libs/giskard-agents/tests/conftest.py

2 additions, 11 deletions

```diff
@@ -5,7 +5,6 @@
 from giskard.agents.embeddings import EmbeddingModel
 from giskard.agents.embeddings.base import EmbeddingParams
 from giskard.agents.generators import Generator
-from giskard.llm.routing import _provider_cache
 
 _PROVIDER_PACKAGES = {
     "openai": "openai",
@@ -38,24 +37,16 @@ def pytest_collection_modifyitems(items: list[pytest.Item]) -> None:
     )
 
 
-@pytest.fixture(autouse=True)
-def _clear_provider_cache():
-    """Prevent stale async clients across event-loop boundaries."""
-    _provider_cache.clear()
-    yield
-    _provider_cache.clear()
-
-
 @pytest.fixture
 async def generator():
     """Fixture providing a configured generator for tests."""
-    return Generator(model=os.getenv("TEST_MODEL", "gemini/gemini-2.0-flash"))
+    return Generator(model=os.getenv("TEST_MODEL", "google/gemini-2.0-flash"))
 
 
 @pytest.fixture
 def embedding_model():
     """Fixture providing a configured embedding model for tests."""
     return EmbeddingModel(
-        model=os.getenv("TEST_EMBEDDING_MODEL", "gemini/gemini-embedding-001"),
+        model=os.getenv("TEST_EMBEDDING_MODEL", "google/gemini-embedding-001"),
         params=EmbeddingParams(dimensions=1536),
     )
```
New rule file (name not shown in this capture): 34 additions

---
description: Architecture and design principles for giskard-llm. Always read before modifying providers, routing, or error handling.
alwaysApply: true
---

# giskard-llm Architecture

## Overview

`giskard-llm` is a lightweight routing layer that dispatches `"provider/model"` strings to native LLM SDKs. It replaces litellm with direct SDK calls while presenting a unified response shape to consumers.

## Architecture

The public API (`acompletion`, `aembedding`) routes `"provider/model"` strings through a lazy-loading registry to provider implementations. Providers are self-contained modules under `providers/`, each subclassing a shared ABC and responsible for: role mapping, message validation, tool conversion, error mapping, and response normalization.

Shared contracts live at the package root: Pydantic v2 response types (OpenAI-shaped), a unified error hierarchy with `status_code` for consumer retry logic, and a `should_retry` helper.

## Core Principles

1. **Lazy imports.** Importing `giskard.llm` must never require any provider SDK. SDKs are imported inside provider modules only.
2. **Strict message validation.** Providers validate messages before calling the SDK and raise `BadRequestError` with a clear message. Invalid input must not silently pass through to opaque SDK errors. Opt-in relaxation (e.g., `merge_system=True`) is explicit.
3. **Unified error boundary.** Raw SDK exceptions never escape a provider. Every provider maps its SDK errors to the `errors.py` hierarchy.
4. **Response normalization.** All providers convert native responses to the shared `types.py` models. Consumers never see provider-specific shapes.
5. **Provider config via env vars or `**params`.** The public API has no provider-specific kwargs. Configuration flows through environment variables or pass-through params.
6. **Provider behavior is self-documented.** Each provider class must have a comprehensive docstring covering: env vars, role mapping, error mapping, supported features, and provider-specific kwargs. This is the source of truth for provider behavior.

## Adding a New Provider

1. Create `providers/<name>.py` subclassing `BaseProvider`.
2. Register it in the provider registry in `routing.py`.
3. Add the SDK as an optional dependency extra in `pyproject.toml`.
4. Implement role mapping, message validation, tool conversion, error mapping, and response conversion.
5. Write the class docstring following the provider documentation template (see `04-provider-docs.mdc`).
6. Add unit tests (mocked SDK) and functional test scenarios.
New rule file (name not shown in this capture): 20 additions

---
description: Testing conventions for giskard-llm. Read before writing or modifying tests.
globs: "**/test*.py,**/conftest.py"
---

# giskard-llm Testing Conventions

## Test Structure

- **Unit tests** (`tests/`): Mocked SDK calls, no API keys needed. Cover routing, message conversion, error mapping, response conversion, validation.
- **Functional tests** (`tests/functional/`): Real API calls. Cover end-to-end scenarios across all providers.

## Provider Marks and Auto-Skip

Every functional test is marked with its provider (`@pytest.mark.google`, `@pytest.mark.openai`, etc.). Auto-skip logic in `conftest.py` skips tests whose provider SDK is not installed, so unit test runs never fail due to missing optional dependencies.

## Scenario Design

- **Assert on structure, not content.** Tests must pass with even the weakest model. Assert non-empty responses, correct roles, correct types, parseable JSON — never assert on specific wording.
- **Meaningful inputs and outputs.** Test scenarios should exercise real behavior: system prompts that produce verifiable effects, tool calls with checkable arguments, structured output that validates against a schema. The goal is a test that fails for the right reasons.
New rule file (name not shown in this capture): 26 additions

---
description: Development workflow for giskard-llm. Read before creating a PR.
alwaysApply: false
---

# giskard-llm Development Workflow

## Dependencies

- **Core**: `pydantic>=2.0` only. No other runtime dependencies.
- **Provider SDKs**: Optional extras in `pyproject.toml` (`openai`, `google`, `anthropic`, `azure`). Never add a provider SDK to core.
- **Package manager**: `uv`.

## Format and Lint

`ruff format` and `ruff check` via pre-commit hooks. If `basedpyright` fails on provider files due to uninstalled SDKs, add `# pyright: reportMissingImports=false` at the top of the provider file.

## CI

- **Unit tests**: run on every PR, no SDK required — all provider interactions are mocked.
- **Functional tests**: run per-provider in a matrix. Each matrix entry installs only its SDK and injects only its env vars at the step level.
- **`workflow_dispatch`**: allows manual triggering with org-membership check.

## Commit Messages

Semantic format scoped to the lib: `feat(giskard-llm): add azure provider`, `fix(giskard-llm): google system message extraction`.
04-provider-docs.mdc (new rule file): 71 additions

---
description: Provider documentation standards. Read when creating or modifying a provider.
globs: "**/providers/*.py"
---

# Provider Documentation

Each provider class must have a comprehensive docstring that serves as the single source of truth for that provider's behavior. This replaces external documentation that would go stale.

## Required Docstring Sections

Every provider class docstring must cover:

1. **Overview** — SDK used, what model prefix routes here (e.g., `"google/"`, `"azure/"`).
2. **Authentication** — Required env vars (e.g., `GOOGLE_API_KEY`) and alternative kwargs.
3. **Role mapping** — How canonical roles (`system`, `user`, `assistant`, `tool`) map to the SDK format.
4. **Message constraints** — Provider-specific validation rules (alternation, system message handling, etc.).
5. **Tool call format** — How tool definitions and tool results are converted.
6. **Error mapping** — Which SDK exceptions map to which `LLMError` subclasses.
7. **Supported features** — Completion, embeddings, structured output, and any limitations.
8. **Provider-specific kwargs** — Any pass-through params unique to this provider (e.g., `merge_system` for Anthropic).

## Example

```python
class GoogleProvider(BaseProvider):
    """Google Gemini provider using the ``google-genai`` SDK.

    Routing prefix: ``google/``

    Authentication:
    - Env: ``GOOGLE_API_KEY`` (or ``GEMINI_API_KEY``)
    - Kwargs: ``api_key``

    Role mapping:
    - ``system`` -> extracted to ``system_instruction`` config (accepts a list)
    - ``assistant`` -> ``model``
    - ``tool`` -> ``function_response`` part
    - ``user`` -> ``user``

    Message constraints:
    - Multiple system messages: supported natively (passed as list)
    - System-only messages: raises ``BadRequestError``
    - No strict alternation required

    Tool call format:
    - Tool definitions: converted to ``FunctionDeclaration``
    - Tool results: converted to ``function_response`` parts
    - Tool call IDs: synthetic (``call_<index>``) since Gemini doesn't provide them

    Error mapping:
    - ``google.genai.errors.ClientError`` (400) -> ``BadRequestError``
    - ``google.genai.errors.ClientError`` (401/403) -> ``AuthenticationError``
    - ``google.genai.errors.ClientError`` (429) -> ``RateLimitError``
    - ``google.genai.errors.ServerError`` -> ``ServerError``

    Supported features:
    - Completion: yes
    - Embeddings: yes
    - Structured output (response_format): yes, via ``response_schema``

    Provider-specific kwargs:
    - ``safety_settings``: override default safety settings
    """
```

## Rules

- **The docstring is the spec.** When behavior changes, update the docstring in the same commit.
- **No external provider docs.** Don't maintain separate markdown files per provider. The class docstring is authoritative.
- **README references providers briefly.** The top-level README has a summary table; details link to the source.

libs/giskard-llm/README.md

45 additions, 8 deletions

````diff
@@ -1,29 +1,66 @@
 # giskard-llm
 
-Lightweight LLM routing layer over native provider SDKs. Routes `provider/model` strings to OpenAI, Google Gemini, or Anthropic using their native async SDKs.
+Lightweight LLM routing layer over native provider SDKs. Routes `provider/model` strings to the correct async SDK (OpenAI, Google Gemini, Anthropic, Azure OpenAI, Azure AI Foundry).
 
 ## Installation
 
 ```bash
-pip install giskard-llm[openai]     # OpenAI only
-pip install giskard-llm[google]     # Google Gemini only
-pip install giskard-llm[anthropic]  # Anthropic only
+pip install giskard-llm[openai]     # OpenAI + Azure OpenAI + Azure AI Foundry
+pip install giskard-llm[google]     # Google Gemini
+pip install giskard-llm[anthropic]  # Anthropic
 pip install giskard-llm[all]        # All providers
 ```
 
-## Usage
+> **Note:** Azure OpenAI (`azure/`) and Azure AI Foundry (`azure_ai/`) use the `openai` SDK.
+> Installing `giskard-llm[openai]` (or `giskard-llm[azure]`) covers all three.
+
+## Quick start
 
 ```python
 from giskard.llm import acompletion, aembedding
 
+# Module-level functions use env vars automatically
 response = await acompletion(
     model="openai/gpt-4o",
     messages=[{"role": "user", "content": "Hello!"}],
 )
 print(response.choices[0].message.content)
 
-embeddings = await aembedding(
-    model="openai/text-embedding-3-small",
-    input=["hello world"],
+# Bare model names default to OpenAI
+response = await acompletion(model="gpt-4o", messages=[...])
+```
+
+## LLMClient (programmatic configuration)
+
+```python
+from giskard.llm import LLMClient
+
+client = LLMClient()
+
+# Configure with explicit values or env var references
+client.configure("openai", api_key="sk-...")  # pragma: allowlist secret
+client.configure("azure-prod", provider="azure",
+    api_key="os.environ/AZURE_PROD_KEY",  # pragma: allowlist secret
+    base_url="os.environ/AZURE_PROD_ENDPOINT",
+    api_version="2024-02-01",
+)
+client.configure("anthropic-relaxed", provider="anthropic",
+    api_key="os.environ/ANTHROPIC_API_KEY",  # pragma: allowlist secret
+    merge_system=True,
 )
+
+response = await client.acompletion("azure-prod/gpt-4o", messages)
+response = await client.acompletion("anthropic-relaxed/claude-3-5-haiku-latest", messages)
 ```
+
+## Provider reference
+
+| Prefix | SDK | Auth env var | Completion | Embeddings | Notable kwargs |
+|---|---|---|---|---|---|
+| `openai/` (default) | `openai` | `OPENAI_API_KEY` | yes | yes | `base_url`, `timeout` |
+| `google/` | `google-genai` | `GOOGLE_API_KEY` / `GEMINI_API_KEY` | yes | yes | |
+| `anthropic/` | `anthropic` | `ANTHROPIC_API_KEY` | yes | no | `merge_system`, `timeout` |
+| `azure/` | `openai` | `AZURE_API_KEY`, `AZURE_API_BASE` | yes | yes | `api_version`, `base_url` |
+| `azure_ai/` | `openai` | `AZURE_AI_API_KEY`, `AZURE_AI_ENDPOINT` | yes | model-dependent | `base_url` |
+
+For detailed per-provider documentation (role mapping, message constraints, tool format, error mapping), see the provider class docstrings in `src/giskard/llm/providers/`.
````
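The `os.environ/VAR` strings in the `LLMClient` configuration examples defer secret lookup to the environment. A minimal sketch of such a resolver, assuming the convention is a plain string prefix (this is not the actual giskard-llm implementation):

```python
import os


def resolve_config_value(value: str) -> str:
    """Resolve 'os.environ/NAME' references to the live env var; pass other values through."""
    if value.startswith("os.environ/"):
        name = value.removeprefix("os.environ/")
        try:
            return os.environ[name]
        except KeyError:
            # Fail loudly at resolution time rather than with an opaque auth error later
            raise RuntimeError(f"Environment variable {name!r} is not set") from None
    return value
```

Deferring resolution this way keeps secrets out of serialized configuration while still letting literal values (e.g. a fixed `api_version`) pass through unchanged.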
