Python SDK
The EvalHub Python SDK provides synchronous and asynchronous clients for the EvalHub REST API, plus an adapter framework for building evaluation integrations.
Installation
Section titled “Installation”pip install eval-hub-sdk[client] # Client onlypip install eval-hub-sdk[adapter] # Adapter SDK (includes mlflow, oras, olot)pip install eval-hub-sdk[server] # Includes eval-hub-server binary (~20 MB)pip install eval-hub-sdk[all] # Everything (excludes server)Quick Start
Section titled “Quick Start”from evalhub import SyncEvalHubClientfrom evalhub.models.api import ModelConfig, BenchmarkConfig, JobSubmissionRequest
with SyncEvalHubClient(base_url="http://localhost:8080") as client: job = client.jobs.submit(JobSubmissionRequest( name="llama3-mmlu-eval", model=ModelConfig(url="http://vllm:8000/v1", name="llama-3-8b"), benchmarks=[BenchmarkConfig(id="mmlu", provider_id="lm_evaluation_harness")] ))
result = client.jobs.wait_for_completion(job.id, timeout=3600) print(f"Status: {result.status}, Results: {result.results}")from evalhub import AsyncEvalHubClientfrom evalhub.models.api import ModelConfig, BenchmarkConfig, JobSubmissionRequest
async with AsyncEvalHubClient(base_url="http://localhost:8080") as client: job = await client.jobs.submit(JobSubmissionRequest( name="llama3-mmlu-eval", model=ModelConfig(url="http://vllm:8000/v1", name="llama-3-8b"), benchmarks=[BenchmarkConfig(id="mmlu", provider_id="lm_evaluation_harness")] ))
result = await client.jobs.wait_for_completion(job.id, timeout=3600)Client Configuration
Section titled “Client Configuration”client = SyncEvalHubClient( base_url="http://localhost:8080", auth_token=None, # Bearer token (or use auth_token_path) auth_token_path=None, # Path to token file ca_bundle_path=None, # CA bundle for TLS insecure=False, # Skip TLS verification tenant=None, # Namespace for multi-tenant deployments (X-Tenant header) timeout=30.0, # Request timeout (seconds) max_retries=3, # Retry attempts for 5xx/timeout/connection errors retry_initial_delay=1.0, # Initial retry delay (seconds) retry_max_delay=60.0, # Max retry delay retry_backoff_factor=2.0, # Exponential backoff multiplier retry_randomization=True, # Add jitter to retries)Auth resolution order: explicit token → token file → Kubernetes ServiceAccount token.
Resource Operations
Section titled “Resource Operations”Providers
Section titled “Providers”providers = client.providers.list()provider = client.providers.get("lm_evaluation_harness")
# Filter by agent metadata (client-side)safety_providers = client.providers.list(evaluates="safety", target_type="model")Agent discoverability
Section titled “Agent discoverability”Providers and benchmarks may include an optional agent metadata block returned by the API. Use it to discover evaluations by capability, read operational hints, and interpret results.
See Agent Discoverability for the full metadata model.
import osfrom evalhub import SyncEvalHubClient
with SyncEvalHubClient( base_url=os.environ["EVALHUB_BASE_URL"], auth_token=os.environ["EVALHUB_TOKEN"], tenant=os.environ["EVALHUB_TENANT"],) as client: for p in client.providers.list(evaluates="safety", target_type="model"): if p.agent: print(f"{p.resource.id}: {p.agent.summary}") for hint in p.agent.hints: print(f" hint: {hint}")
provider = client.providers.get("garak") if provider.agent: for line in provider.agent.result_interpretation: print(line)import osfrom evalhub import AsyncEvalHubClient
async with AsyncEvalHubClient( base_url=os.environ["EVALHUB_BASE_URL"], auth_token=os.environ["EVALHUB_TOKEN"], tenant=os.environ["EVALHUB_TENANT"],) as client: providers = await client.providers.list(evaluates="safety", target_type="model") for p in providers: if p.agent: print(p.resource.id, p.agent.summary)Benchmarks
Section titled “Benchmarks”benchmarks = client.benchmarks.list()benchmarks = client.benchmarks.list(provider_id="lm_evaluation_harness")benchmarks = client.benchmarks.list(category="math")Collections
Section titled “Collections”collections = client.collections.list()collection = client.collections.get("healthcare_safety_v1")from evalhub.models.api import JobSubmissionRequest, ModelConfig, BenchmarkConfig, JobStatus
job = client.jobs.submit(JobSubmissionRequest( name="llama3-multi-benchmark", model=ModelConfig(url="http://vllm:8000/v1", name="llama-3-8b"), benchmarks=[ BenchmarkConfig(id="mmlu", provider_id="lm_evaluation_harness"), BenchmarkConfig(id="hellaswag", provider_id="lighteval"), ]))
status = client.jobs.get(job.id)all_jobs = client.jobs.list(status=JobStatus.RUNNING)
result = client.jobs.wait_for_completion(job.id, timeout=3600, poll_interval=5.0)
client.jobs.cancel(job.id)client.jobs.cancel(job.id, hard_delete=True)Async Concurrency
Section titled “Async Concurrency”import asynciofrom evalhub import AsyncEvalHubClientfrom evalhub.models.api import ModelConfig, BenchmarkConfig, JobSubmissionRequest
async def main(): async with AsyncEvalHubClient(base_url="http://localhost:8080") as client: benchmarks = ["mmlu", "hellaswag", "truthfulqa"]
jobs = await asyncio.gather(*[ client.jobs.submit(JobSubmissionRequest( name=f"llama3-{b}", model=ModelConfig(url="http://vllm:8000/v1", name="llama-3-8b"), benchmarks=[BenchmarkConfig(id=b, provider_id="lm_evaluation_harness")] )) for b in benchmarks ])
results = await asyncio.gather(*[ client.jobs.wait_for_completion(j.id, timeout=3600) for j in jobs ])
asyncio.run(main())Error Handling
Section titled “Error Handling”import httpxfrom evalhub.client.base import ClientError
try: job = client.jobs.get("nonexistent-id")except httpx.HTTPStatusError as e: print(f"HTTP {e.response.status_code}")except httpx.RequestError as e: print(f"Connection error: {e}")except ClientError as e: print(f"Client error: {e}")API Reference
Section titled “API Reference”Client Classes
Section titled “Client Classes”SyncEvalHubClient- Synchronous clientAsyncEvalHubClient- Asynchronous clientEvalHubClient- Alias forAsyncEvalHubClient
Resources
Section titled “Resources”| Resource | Methods |
|---|---|
client.providers | list(target_type?, evaluates?), get(id) |
client.benchmarks | list(provider_id?, category?, limit?) |
client.collections | list(), get(id), create(request), delete(id) |
client.jobs | submit(request), get(id), list(status?, limit?), cancel(id, hard_delete?), wait_for_completion(id, timeout?, poll_interval?) |
client.health() | Health check |
Key Models (evalhub.models.api)
Section titled “Key Models (evalhub.models.api)”| Model | Description |
|---|---|
JobSubmissionRequest | Job submission (model + benchmarks or collection) |
ModelConfig | Model endpoint (url, name, auth) |
BenchmarkConfig | Benchmark reference (id, provider_id, parameters) |
EvaluationJob | Job status and results |
JobStatus | Enum: PENDING, RUNNING, COMPLETED, FAILED, CANCELLED |
Provider | Provider metadata and benchmarks |
AgentMetadata | Provider-level agent discoverability metadata |
BenchmarkAgentMetadata | Benchmark-level agent metadata (result interpretation, score ranges) |
Collection | Benchmark collection |