Skip to content

Python SDK

The EvalHub Python SDK provides synchronous and asynchronous clients for the EvalHub REST API, plus an adapter framework for building evaluation integrations.

Terminal window
pip install eval-hub-sdk[client] # Client only
pip install eval-hub-sdk[adapter] # Adapter SDK (includes mlflow, oras, olot)
pip install eval-hub-sdk[server] # Includes eval-hub-server binary (~20 MB)
pip install eval-hub-sdk[all] # Everything (excludes server)
from evalhub import SyncEvalHubClient
from evalhub.models.api import ModelConfig, BenchmarkConfig, JobSubmissionRequest
with SyncEvalHubClient(base_url="http://localhost:8080") as client:
job = client.jobs.submit(JobSubmissionRequest(
name="llama3-mmlu-eval",
model=ModelConfig(url="http://vllm:8000/v1", name="llama-3-8b"),
benchmarks=[BenchmarkConfig(id="mmlu", provider_id="lm_evaluation_harness")]
))
result = client.jobs.wait_for_completion(job.id, timeout=3600)
print(f"Status: {result.status}, Results: {result.results}")
client = SyncEvalHubClient(
base_url="http://localhost:8080",
auth_token=None, # Bearer token (or use auth_token_path)
auth_token_path=None, # Path to token file
ca_bundle_path=None, # CA bundle for TLS
insecure=False, # Skip TLS verification
tenant=None, # Namespace for multi-tenant deployments (X-Tenant header)
timeout=30.0, # Request timeout (seconds)
max_retries=3, # Retry attempts for 5xx/timeout/connection errors
retry_initial_delay=1.0, # Initial retry delay (seconds)
retry_max_delay=60.0, # Max retry delay
retry_backoff_factor=2.0, # Exponential backoff multiplier
retry_randomization=True, # Add jitter to retries
)

Auth resolution order: explicit token → token file → Kubernetes ServiceAccount token.

providers = client.providers.list()
provider = client.providers.get("lm_evaluation_harness")
# Filter by agent metadata (client-side)
safety_providers = client.providers.list(evaluates="safety", target_type="model")

Providers and benchmarks may include an optional agent metadata block returned by the API. Use it to discover evaluations by capability, read operational hints, and interpret results.

See Agent Discoverability for the full metadata model.

import os
from evalhub import SyncEvalHubClient
with SyncEvalHubClient(
base_url=os.environ["EVALHUB_BASE_URL"],
auth_token=os.environ["EVALHUB_TOKEN"],
tenant=os.environ["EVALHUB_TENANT"],
) as client:
for p in client.providers.list(evaluates="safety", target_type="model"):
if p.agent:
print(f"{p.resource.id}: {p.agent.summary}")
for hint in p.agent.hints:
print(f" hint: {hint}")
provider = client.providers.get("garak")
if provider.agent:
for line in provider.agent.result_interpretation:
print(line)
benchmarks = client.benchmarks.list()
benchmarks = client.benchmarks.list(provider_id="lm_evaluation_harness")
benchmarks = client.benchmarks.list(category="math")
collections = client.collections.list()
collection = client.collections.get("healthcare_safety_v1")
from evalhub.models.api import JobSubmissionRequest, ModelConfig, BenchmarkConfig, JobStatus
job = client.jobs.submit(JobSubmissionRequest(
name="llama3-multi-benchmark",
model=ModelConfig(url="http://vllm:8000/v1", name="llama-3-8b"),
benchmarks=[
BenchmarkConfig(id="mmlu", provider_id="lm_evaluation_harness"),
BenchmarkConfig(id="hellaswag", provider_id="lighteval"),
]
))
status = client.jobs.get(job.id)
all_jobs = client.jobs.list(status=JobStatus.RUNNING)
result = client.jobs.wait_for_completion(job.id, timeout=3600, poll_interval=5.0)
client.jobs.cancel(job.id)
client.jobs.cancel(job.id, hard_delete=True)
import asyncio
from evalhub import AsyncEvalHubClient
from evalhub.models.api import ModelConfig, BenchmarkConfig, JobSubmissionRequest
async def main():
async with AsyncEvalHubClient(base_url="http://localhost:8080") as client:
benchmarks = ["mmlu", "hellaswag", "truthfulqa"]
jobs = await asyncio.gather(*[
client.jobs.submit(JobSubmissionRequest(
name=f"llama3-{b}",
model=ModelConfig(url="http://vllm:8000/v1", name="llama-3-8b"),
benchmarks=[BenchmarkConfig(id=b, provider_id="lm_evaluation_harness")]
))
for b in benchmarks
])
results = await asyncio.gather(*[
client.jobs.wait_for_completion(j.id, timeout=3600)
for j in jobs
])
asyncio.run(main())
import httpx
from evalhub.client.base import ClientError
try:
job = client.jobs.get("nonexistent-id")
except httpx.HTTPStatusError as e:
print(f"HTTP {e.response.status_code}")
except httpx.RequestError as e:
print(f"Connection error: {e}")
except ClientError as e:
print(f"Client error: {e}")
  • SyncEvalHubClient - Synchronous client
  • AsyncEvalHubClient - Asynchronous client
  • EvalHubClient - Alias for AsyncEvalHubClient
| Resource | Methods |
| --- | --- |
| `client.providers` | `list(target_type?, evaluates?)`, `get(id)` |
| `client.benchmarks` | `list(provider_id?, category?, limit?)` |
| `client.collections` | `list()`, `get(id)`, `create(request)`, `delete(id)` |
| `client.jobs` | `submit(request)`, `get(id)`, `list(status?, limit?)`, `cancel(id, hard_delete?)`, `wait_for_completion(id, timeout?, poll_interval?)` |
| `client.health()` | Health check |
| Model | Description |
| --- | --- |
| `JobSubmissionRequest` | Job submission (model + benchmarks or collection) |
| `ModelConfig` | Model endpoint (url, name, auth) |
| `BenchmarkConfig` | Benchmark reference (id, provider_id, parameters) |
| `EvaluationJob` | Job status and results |
| `JobStatus` | Enum: PENDING, RUNNING, COMPLETED, FAILED, CANCELLED |
| `Provider` | Provider metadata and benchmarks |
| `AgentMetadata` | Provider-level agent discoverability metadata |
| `BenchmarkAgentMetadata` | Benchmark-level agent metadata (result interpretation, score ranges) |
| `Collection` | Benchmark collection |