Skip to content

Python SDK

The EvalHub Python SDK provides synchronous and asynchronous clients for the EvalHub REST API, plus an adapter framework for building evaluation integrations.

Installation

pip install eval-hub-sdk[client]    # Client only
pip install eval-hub-sdk[adapter]   # Adapter SDK (includes mlflow, oras, olot)
pip install eval-hub-sdk[all]       # Everything

Quick Start

from evalhub import SyncEvalHubClient
from evalhub.models.api import ModelConfig, BenchmarkConfig, JobSubmissionRequest

with SyncEvalHubClient(base_url="http://localhost:8080") as client:
    job = client.jobs.submit(JobSubmissionRequest(
        model=ModelConfig(url="http://vllm:8000/v1", name="llama-3-8b"),
        benchmarks=[BenchmarkConfig(id="mmlu", provider_id="lm_evaluation_harness")]
    ))

    result = client.jobs.wait_for_completion(job.id, timeout=3600)
    print(f"Status: {result.status}, Results: {result.results}")

Or, using the asynchronous client:

from evalhub import AsyncEvalHubClient
from evalhub.models.api import ModelConfig, BenchmarkConfig, JobSubmissionRequest

async with AsyncEvalHubClient(base_url="http://localhost:8080") as client:
    job = await client.jobs.submit(JobSubmissionRequest(
        model=ModelConfig(url="http://vllm:8000/v1", name="llama-3-8b"),
        benchmarks=[BenchmarkConfig(id="mmlu", provider_id="lm_evaluation_harness")]
    ))

    result = await client.jobs.wait_for_completion(job.id, timeout=3600)

Client Configuration

client = SyncEvalHubClient(
    base_url="http://localhost:8080",
    auth_token=None,              # Bearer token (or use auth_token_path)
    auth_token_path=None,         # Path to token file
    ca_bundle_path=None,          # CA bundle for TLS
    insecure=False,               # Skip TLS verification
    timeout=30.0,                 # Request timeout (seconds)
    max_retries=3,                # Retry attempts for 5xx/timeout/connection errors
    retry_initial_delay=1.0,      # Initial retry delay (seconds)
    retry_max_delay=60.0,         # Max retry delay
    retry_backoff_factor=2.0,     # Exponential backoff multiplier
    retry_randomization=True,     # Add jitter to retries
)

Auth resolution order: explicit token → token file → Kubernetes ServiceAccount token.

Resource Operations

Providers

providers = client.providers.list()
provider = client.providers.get("lm_evaluation_harness")

Benchmarks

benchmarks = client.benchmarks.list()
benchmarks = client.benchmarks.list(provider_id="lm_evaluation_harness")
benchmarks = client.benchmarks.list(category="math")

Collections

collections = client.collections.list()
collection = client.collections.get("healthcare_safety_v1")

Jobs

from evalhub.models.api import JobSubmissionRequest, ModelConfig, BenchmarkConfig, JobStatus

job = client.jobs.submit(JobSubmissionRequest(
    model=ModelConfig(url="http://vllm:8000/v1", name="llama-3-8b"),
    benchmarks=[
        BenchmarkConfig(id="mmlu", provider_id="lm_evaluation_harness"),
        BenchmarkConfig(id="hellaswag", provider_id="lighteval"),
    ]
))

status = client.jobs.get(job.id)
all_jobs = client.jobs.list(status=JobStatus.RUNNING)

result = client.jobs.wait_for_completion(job.id, timeout=3600, poll_interval=5.0)

client.jobs.cancel(job.id)
client.jobs.cancel(job.id, hard_delete=True)

Async Concurrency

import asyncio
from evalhub import AsyncEvalHubClient
from evalhub.models.api import ModelConfig, BenchmarkConfig, JobSubmissionRequest

async def main():
    async with AsyncEvalHubClient(base_url="http://localhost:8080") as client:
        benchmarks = ["mmlu", "hellaswag", "truthfulqa"]

        jobs = await asyncio.gather(*[
            client.jobs.submit(JobSubmissionRequest(
                model=ModelConfig(url="http://vllm:8000/v1", name="llama-3-8b"),
                benchmarks=[BenchmarkConfig(id=b, provider_id="lm_evaluation_harness")]
            ))
            for b in benchmarks
        ])

        results = await asyncio.gather(*[
            client.jobs.wait_for_completion(j.id, timeout=3600)
            for j in jobs
        ])

asyncio.run(main())

Error Handling

import httpx
from evalhub.client.base import ClientError

try:
    job = client.jobs.get("nonexistent-id")
except httpx.HTTPStatusError as e:
    print(f"HTTP {e.response.status_code}")
except httpx.RequestError as e:
    print(f"Connection error: {e}")
except ClientError as e:
    print(f"Client error: {e}")

API Reference

Client Classes

  • SyncEvalHubClient - Synchronous client
  • AsyncEvalHubClient - Asynchronous client
  • EvalHubClient - Alias for AsyncEvalHubClient

Resources

| Resource | Methods |
| --- | --- |
| client.providers | list(), get(id) |
| client.benchmarks | list(provider_id?, category?, limit?) |
| client.collections | list(), get(id) |
| client.jobs | submit(request), get(id), list(status?, limit?), cancel(id, hard_delete?), wait_for_completion(id, timeout?, poll_interval?) |
| client.health() | Health check |

Key Models (evalhub.models.api)

| Model | Description |
| --- | --- |
| JobSubmissionRequest | Job submission (model + benchmarks or collection) |
| ModelConfig | Model endpoint (url, name, auth) |
| BenchmarkConfig | Benchmark reference (id, provider_id, parameters) |
| EvaluationJob | Job status and results |
| JobStatus | Enum: PENDING, RUNNING, COMPLETED, FAILED, CANCELLED |
| Provider | Provider metadata and benchmarks |
| Collection | Benchmark collection |

See Also