Download OpenAPI specification:
REST API server for evaluation backend orchestration.
Create and execute an evaluation request using the simplified benchmark schema.
| name required | string The evaluation job name. |
| description | string The evaluation job description. |
| tags | Array of strings The evaluation job tags. |
| model required | object (ModelRef) The model to evaluate. |
| benchmarks required | Array of objects (EvaluationBenchmarkConfig) The evaluation benchmarks to run. |
| pass_criteria | object (PassCriteriaWithDefault) The overall pass criteria for the evaluation job. |
object (ExperimentConfig) The MLFlow experiment configuration. When provided, the evaluation job will be tracked in MLFlow. | |
object (EvaluationExports) Optional exports configuration for the evaluation job. When provided, the evaluation job results will be exported to the specified location. | |
object (QueueConfig) Optional scheduling queue for Kubernetes-backed evaluation jobs (e.g. Kueue). | |
object Custom request data. This can be used for user specific job data. |
{- "name": "granite-3.1-8b-safety-eval",
- "description": "Safety and reasoning evaluation for Granite 3.1 8B Instruct",
- "tags": [
- "nightly",
- "granite"
], - "model": {
- "name": "granite-3.1-8b-instruct"
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}, - "parameters": {
- "num_fewshot": 0,
- "limit": 100
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
], - "pass_criteria": {
- "threshold": 0.5
}
}
{- "resource": {
- "id": "a1b2c3d4-5678-9abc-def0-1234567890ab",
- "tenant": "default",
- "created_at": "2026-01-15T09:30:00Z",
- "updated_at": "2026-01-15T09:30:00Z",
- "owner": "user@example.com"
}, - "status": {
- "state": "pending",
- "message": {
- "message": "Evaluation job created.",
- "message_code": "evaluation_job_created"
}
}, - "name": "granite-3.1-8b-safety-eval",
- "description": "Safety and reasoning evaluation for Granite 3.1 8B Instruct",
- "tags": [
- "nightly",
- "granite"
], - "model": {
- "name": "granite-3.1-8b-instruct"
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}, - "parameters": {
- "num_fewshot": 0,
- "limit": 100
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
], - "pass_criteria": {
- "threshold": 0.5
}
}
List all evaluation requests.
| limit | integer (Limit) [ 1 .. 100 ] Default: 50 Maximum number of evaluations to return |
| offset | integer (Offset) >= 0 Default: 0 Offset for pagination |
| status | string (Status Filter) Filter by status |
| name | string (Name) Name to search for |
| tags | string (Tags) Tags to search for |
{- "first": {
- "href": "/api/v1/evaluations/jobs?limit=50&offset=0"
}, - "next": {
- "href": "/api/v1/evaluations/jobs?limit=50&offset=50"
}, - "limit": 50,
- "total_count": 73,
- "items": [
- {
- "resource": {
- "id": "a1b2c3d4-5678-9abc-def0-1234567890ab",
- "tenant": "default",
- "created_at": "2026-01-15T09:30:00Z",
- "updated_at": "2026-01-15T09:42:15Z",
- "owner": "user@example.com"
}, - "status": {
- "state": "completed",
- "message": {
- "message": "Evaluation job completed.",
- "message_code": "evaluation_job_updated"
}
}, - "results": {
- "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "benchmark_index": 0,
- "metrics": {
- "acc": 0.82,
- "acc_norm": 0.85
}, - "test": {
- "primary_score": 0.85,
- "threshold": 0.25,
- "pass": true
}
}
], - "test": {
- "score": 0.85,
- "threshold": 0.5,
- "pass": true
}
}, - "name": "granite-3.1-8b-safety-eval",
- "model": {
- "name": "granite-3.1-8b-instruct"
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6
}
], - "pass_criteria": {
- "threshold": 0.5
}
}
]
}
Returns the evaluation job resource with the current status and results.
| id required | string (Id) |
{- "resource": {
- "id": "a1b2c3d4-5678-9abc-def0-1234567890ab",
- "tenant": "default",
- "created_at": "2026-01-15T09:30:00Z",
- "updated_at": "2026-01-15T09:42:15Z",
- "owner": "user@example.com"
}, - "status": {
- "state": "completed",
- "message": {
- "message": "Evaluation job completed.",
- "message_code": "evaluation_job_updated"
}, - "benchmarks": [
- {
- "provider_id": "lm_evaluation_harness",
- "id": "arc_easy",
- "benchmark_index": 0,
- "status": "completed",
- "started_at": "2026-01-15T09:31:00Z",
- "completed_at": "2026-01-15T09:38:45Z"
}, - {
- "provider_id": "garak",
- "id": "owasp_llm_top10",
- "benchmark_index": 1,
- "status": "completed",
- "started_at": "2026-01-15T09:31:00Z",
- "completed_at": "2026-01-15T09:42:15Z"
}
]
}, - "results": {
- "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "benchmark_index": 0,
- "metrics": {
- "acc": 0.82,
- "acc_norm": 0.85
}, - "mlflow_run_id": "run-7f3a1b2c",
- "logs_path": "/data/logs/a1b2c3d4.log",
- "test": {
- "primary_score": 0.85,
- "threshold": 0.25,
- "pass": true
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "benchmark_index": 1,
- "metrics": {
- "attack_success_rate": 0.12
}, - "mlflow_run_id": "run-9e8d7c6b",
- "logs_path": "/data/logs/a1b2c3d4-garak.log",
- "test": {
- "primary_score": 0.12,
- "threshold": 0.3,
- "pass": true
}
}
], - "test": {
- "score": 0.85,
- "threshold": 0.5,
- "pass": true
}
}, - "name": "granite-3.1-8b-safety-eval",
- "description": "Safety and reasoning evaluation for Granite 3.1 8B Instruct",
- "tags": [
- "nightly",
- "granite"
], - "model": {
- "name": "granite-3.1-8b-instruct"
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}, - "parameters": {
- "num_fewshot": 0,
- "limit": 100
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
], - "pass_criteria": {
- "threshold": 0.5
}
}
Cancel a running evaluation.
| id required | string (Id) |
| hard_delete | boolean (Hard Delete) Default: false If |
{
  "message": "The field 'state' is not valid.",
  "message_code": "invalid_value",
  "trace": "b12692e1-8582-4628-88ca-7a13fefb73e2"
}
Send an evaluation job status or results. Note that this endpoint is internal and should not be used by clients; eventually, it will not be possible to call this endpoint without a special token.
| id required | string (Id) |
| benchmark_status_event required | object (BenchmarkStatusEvent) Event payload to update benchmark status at runtime |
A simple request update with a completed status.
{
  "benchmark_status_event": {
    "id": "toxicity",
    "provider_id": "garak",
    "status": "completed",
    "metrics": {
      "accuracy": 0.95
    },
    "artifacts": {
      "logs": "/data/logs/1234567890.log"
    },
    "started_at": "2026-01-12T10:45:32Z",
    "completed_at": "2026-01-12T10:47:12Z"
  }
}
{
  "message": "The field 'state' is not valid.",
  "message_code": "invalid_value",
  "trace": "b12692e1-8582-4628-88ca-7a13fefb73e2"
}
List all benchmark collections.
| limit | integer (Limit) [ 1 .. 100 ] Default: 50 Maximum number of collections to return |
| offset | integer (Offset) >= 0 Default: 0 Offset for pagination |
| name | string (Name) Name to search for |
| category | string (Category) Category to search for |
| tags | string (Tags) Tags to search for |
| scope | string (Scope of collections) Enum: "system" "tenant" Set to |
{- "first": {
- "href": "/api/v1/evaluations/collections?limit=50&offset=0"
}, - "limit": 50,
- "total_count": 2,
- "items": [
- {
- "resource": {
- "id": "e5f6a7b8-9012-3456-cdef-0123456789ab",
- "tenant": "default",
- "created_at": "2025-12-01T10:00:00Z",
- "updated_at": "2025-12-01T10:00:00Z"
}, - "name": "llm-safety-suite",
- "category": "safety",
- "description": "Comprehensive safety evaluation combining reasoning accuracy and vulnerability scanning",
- "tags": [
- "safety",
- "nightly"
], - "pass_criteria": {
- "threshold": 0.5
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
]
}
Create a new collection.
| name required | string Collection name. |
| category required | string Collection category. |
| description | string Optional description. |
| tags | Array of strings Tags. |
object Custom key-value data. | |
| pass_criteria | object (PassCriteria) Pass criteria for the collection. |
| benchmarks required | Array of objects (CollectionBenchmarkConfig) Benchmarks in the collection. |
{- "name": "release-gate-safety",
- "category": "safety",
- "description": "Release-gate collection combining reasoning and red-teaming benchmarks",
- "tags": [
- "release-gate",
- "safety"
], - "pass_criteria": {
- "threshold": 0.5
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}, - "parameters": {
- "num_fewshot": 0,
- "limit": 100
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
{- "resource": {
- "id": "f6a7b8c9-0123-4567-def0-123456789abc",
- "tenant": "default",
- "created_at": "2026-02-01T09:00:00Z",
- "updated_at": "2026-02-01T09:00:00Z",
- "owner": "user@example.com"
}, - "name": "release-gate-safety",
- "category": "safety",
- "description": "Release-gate collection combining reasoning and red-teaming benchmarks",
- "tags": [
- "release-gate",
- "safety"
], - "pass_criteria": {
- "threshold": 0.5
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}, - "parameters": {
- "num_fewshot": 0,
- "limit": 100
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
Get details of a specific collection.
| id required | string (Collection Id) |
{- "resource": {
- "id": "e5f6a7b8-9012-3456-cdef-0123456789ab",
- "tenant": "default",
- "created_at": "2025-12-01T10:00:00Z",
- "updated_at": "2025-12-01T10:00:00Z"
}, - "name": "llm-safety-suite",
- "category": "safety",
- "description": "Comprehensive safety evaluation combining reasoning accuracy and vulnerability scanning",
- "tags": [
- "safety",
- "nightly"
], - "pass_criteria": {
- "threshold": 0.5
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
Update an existing collection.
| id required | string (Collection Id) |
| name required | string Collection name. |
| category required | string Collection category. |
| description | string Optional description. |
| tags | Array of strings Tags. |
object Custom key-value data. | |
| pass_criteria | object (PassCriteria) Pass criteria for the collection. |
| benchmarks required | Array of objects (CollectionBenchmarkConfig) Benchmarks in the collection. |
{- "name": "llm-safety-suite",
- "category": "safety",
- "description": "Safety evaluation with reasoning, OWASP risks, and content quality",
- "tags": [
- "safety",
- "nightly",
- "updated"
], - "pass_criteria": {
- "threshold": 0.5
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.5,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.3,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}, - {
- "id": "quality",
- "provider_id": "garak",
- "weight": 0.2,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
{- "resource": {
- "id": "e5f6a7b8-9012-3456-cdef-0123456789ab",
- "tenant": "default",
- "created_at": "2025-12-01T10:00:00Z",
- "updated_at": "2026-02-10T11:00:00Z"
}, - "name": "llm-safety-suite",
- "category": "safety",
- "description": "Safety evaluation with reasoning, OWASP risks, and content quality",
- "tags": [
- "safety",
- "nightly",
- "updated"
], - "pass_criteria": {
- "threshold": 0.5
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.5,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.3,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}, - {
- "id": "quality",
- "provider_id": "garak",
- "weight": 0.2,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
Partially update an existing collection.
| id required | string (Collection Id) |
| op required | string (PatchOp) Enum: "replace" "add" "remove" Patch operation type |
| path required | string JSON Pointer path |
| value | any Value for add/replace (omit for remove) |
[
  {
    "op": "replace",
    "path": "/pass_criteria/threshold",
    "value": 0.6
  },
  {
    "op": "replace",
    "path": "/description",
    "value": "Safety evaluation with stricter pass threshold"
  }
]
{- "resource": {
- "id": "e5f6a7b8-9012-3456-cdef-0123456789ab",
- "tenant": "default",
- "created_at": "2025-12-01T10:00:00Z",
- "updated_at": "2026-02-10T12:30:00Z"
}, - "name": "llm-safety-suite",
- "category": "safety",
- "description": "Safety evaluation with stricter pass threshold",
- "tags": [
- "safety",
- "nightly"
], - "pass_criteria": {
- "threshold": 0.6
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "provider_id": "lm_evaluation_harness",
- "weight": 0.6,
- "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}
}, - {
- "id": "owasp_llm_top10",
- "provider_id": "garak",
- "weight": 0.4,
- "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
List all registered evaluation providers.
| limit | integer (Limit) [ 1 .. 100 ] Default: 50 Maximum number of providers to return |
| offset | integer (Offset) >= 0 Default: 0 Offset for pagination |
| benchmarks | boolean (Benchmarks) Default: true Include or exclude benchmarks supported by this provider in the response |
| name | string (Name) Name to search for |
| tags | string (Tags) Tags to search for |
| scope | string (Scope of providers) Enum: "system" "tenant" Set to |
{- "first": {
- "href": "/api/v1/evaluations/providers?limit=50&offset=0"
}, - "limit": 50,
- "total_count": 3,
- "items": [
- {
- "resource": {
- "id": "b3f1a2c4-1234-5678-abcd-ef0123456789",
- "tenant": "default",
- "created_at": "2025-10-01T00:00:00Z",
- "updated_at": "2025-10-01T00:00:00Z"
}, - "name": "lm_evaluation_harness",
- "title": "LM Evaluation Harness",
- "description": "Comprehensive evaluation framework for language models with 180 benchmarks",
- "tags": [
- "reasoning",
- "science",
- "lm_eval"
], - "runtime": {
- "k8s": {
- "image": "quay.io/opendatahub/ta-lmes-job:odh-3.4-ea2",
- "entrypoint": [
- "/opt/app-root/bin/python",
- "/opt/app-root/src/main.py"
], - "cpu_request": "100m",
- "memory_request": "128Mi",
- "cpu_limit": "500m",
- "memory_limit": "4Gi"
}
}, - "benchmarks": [
- {
- "id": "arc_easy",
- "name": "Basic science Q&A",
- "description": "Grade-school science questions testing basic reasoning and scientific knowledge (AI2 Reasoning Challenge, easy split).",
- "category": "reasoning",
- "metrics": [
- "acc",
- "acc_norm"
], - "num_few_shot": 0,
- "dataset_size": 2376,
- "tags": [
- "reasoning",
- "science",
- "lm_eval"
], - "primary_score": {
- "metric": "acc_norm",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.25
}
}
]
}
]
}
Create a new provider scoped to the current tenant (Bring Your Own Provider).
| name required | string Provider name |
| title | string Provider display title |
| description | string Provider description |
| tags | Array of strings Provider tags |
object (AgentMetadata) Agent discoverability metadata for this provider | |
| runtime required | object (Runtime) Provider runtime configuration |
| benchmarks required | Array of objects (BenchmarkResource) Benchmarks offered by this provider |
{- "name": "my-custom-evaluator",
- "title": "Custom Internal Evaluator",
- "description": "Internal evaluation adapter for domain-specific benchmarks",
- "tags": [
- "custom",
- "internal"
], - "runtime": {
- "k8s": {
- "image": "registry.internal.example.com/eval/custom-adapter:v1.2",
- "entrypoint": [
- "/opt/app-root/bin/python",
- "/opt/app-root/src/main.py"
], - "cpu_request": "250m",
- "memory_request": "512Mi",
- "cpu_limit": "1",
- "memory_limit": "2Gi"
}
}, - "benchmarks": [
- {
- "id": "domain-qa",
- "name": "Domain Q&A Accuracy",
- "description": "Measures accuracy on domain-specific question answering",
- "category": "reasoning",
- "metrics": [
- "acc",
- "f1"
], - "primary_score": {
- "metric": "acc",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.5
}
}
]
}
{- "resource": {
- "id": "c4d5e6f7-8901-2345-bcde-f67890123456",
- "tenant": "default",
- "created_at": "2026-01-20T10:00:00Z",
- "updated_at": "2026-01-20T10:00:00Z",
- "owner": "user@example.com"
}, - "name": "my-custom-evaluator",
- "title": "Custom Internal Evaluator",
- "description": "Internal evaluation adapter for domain-specific benchmarks",
- "tags": [
- "custom",
- "internal"
], - "runtime": {
- "k8s": {
- "image": "registry.internal.example.com/eval/custom-adapter:v1.2",
- "entrypoint": [
- "/opt/app-root/bin/python",
- "/opt/app-root/src/main.py"
], - "cpu_request": "250m",
- "memory_request": "512Mi",
- "cpu_limit": "1",
- "memory_limit": "2Gi"
}
}, - "benchmarks": [
- {
- "id": "domain-qa",
- "name": "Domain Q&A Accuracy",
- "description": "Measures accuracy on domain-specific question answering",
- "category": "reasoning",
- "metrics": [
- "acc",
- "f1"
], - "primary_score": {
- "metric": "acc",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.5
}
}
]
}
Get a provider by ID.
| id required | string (Provider Id) Provider ID |
{- "resource": {
- "id": "d8e9f0a1-2345-6789-cdef-012345678901",
- "tenant": "default",
- "created_at": "2025-10-01T00:00:00Z",
- "updated_at": "2025-10-01T00:00:00Z"
}, - "name": "garak",
- "title": "Garak",
- "description": "LLM vulnerability scanner and red-teaming framework",
- "tags": [
- "security",
- "red_team"
], - "runtime": {
- "k8s": {
- "image": "quay.io/trustyai/trustyai-garak-lls-provider-dsp:latest",
- "entrypoint": [
- "python",
- "-m",
- "llama_stack_provider_trustyai_garak.evalhub"
], - "cpu_request": "500m",
- "memory_request": "512Mi",
- "cpu_limit": "2000m",
- "memory_limit": "4Gi"
}
}, - "benchmarks": [
- {
- "id": "owasp_llm_top10",
- "name": "OWASP LLM top 10 risk scan",
- "description": "Tests against the top 10 security risks specific to LLM applications.",
- "category": "security",
- "metrics": [
- "attack_success_rate"
], - "tags": [
- "security",
- "owasp",
- "red_team"
], - "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}, - {
- "id": "quality",
- "name": "Toxic & harmful content scan",
- "description": "Scans for violence, profanity, toxicity, hate speech, and integrity issues.",
- "category": "safety",
- "metrics": [
- "attack_success_rate"
], - "tags": [
- "safety",
- "quality",
- "toxicity",
- "red_team"
], - "primary_score": {
- "metric": "attack_success_rate",
- "lower_is_better": true
}, - "pass_criteria": {
- "threshold": 0.3
}
}
]
}
Update an existing provider.
| id required | string (Provider Id) Provider ID |
| name required | string Provider name |
| title | string Provider display title |
| description | string Provider description |
| tags | Array of strings Provider tags |
object (AgentMetadata) Agent discoverability metadata for this provider | |
| runtime required | object (Runtime) Provider runtime configuration |
| benchmarks required | Array of objects (BenchmarkResource) Benchmarks offered by this provider |
{- "name": "my-custom-evaluator",
- "title": "Custom Internal Evaluator",
- "description": "Updated evaluation adapter with improved tokenization",
- "tags": [
- "custom",
- "internal"
], - "runtime": {
- "k8s": {
- "image": "registry.internal.example.com/eval/custom-adapter:v2.0",
- "entrypoint": [
- "/opt/app-root/bin/python",
- "/opt/app-root/src/main.py"
], - "cpu_request": "500m",
- "memory_request": "1Gi",
- "cpu_limit": "2",
- "memory_limit": "4Gi"
}
}, - "benchmarks": [
- {
- "id": "domain-qa",
- "name": "Domain Q&A Accuracy",
- "description": "Measures accuracy on domain-specific question answering",
- "category": "reasoning",
- "metrics": [
- "acc",
- "f1"
], - "primary_score": {
- "metric": "acc",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.5
}
}
]
}
{- "resource": {
- "id": "c4d5e6f7-8901-2345-bcde-f67890123456",
- "tenant": "default",
- "created_at": "2026-01-20T10:00:00Z",
- "updated_at": "2026-02-05T14:30:00Z"
}, - "name": "my-custom-evaluator",
- "title": "Custom Internal Evaluator",
- "description": "Updated evaluation adapter with improved tokenization",
- "tags": [
- "custom",
- "internal"
], - "runtime": {
- "k8s": {
- "image": "registry.internal.example.com/eval/custom-adapter:v2.0",
- "entrypoint": [
- "/opt/app-root/bin/python",
- "/opt/app-root/src/main.py"
], - "cpu_request": "500m",
- "memory_request": "1Gi",
- "cpu_limit": "2",
- "memory_limit": "4Gi"
}
}, - "benchmarks": [
- {
- "id": "domain-qa",
- "name": "Domain Q&A Accuracy",
- "description": "Measures accuracy on domain-specific question answering",
- "category": "reasoning",
- "metrics": [
- "acc",
- "f1"
], - "primary_score": {
- "metric": "acc",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.5
}
}
]
}
Partially update an existing provider.
| id required | string (Provider Id) |
| op required | string (PatchOp) Enum: "replace" "add" "remove" Patch operation type |
| path required | string JSON Pointer path |
| value | any Value for add/replace (omit for remove) |
[
  {
    "op": "replace",
    "path": "/runtime/k8s/image",
    "value": "registry.internal.example.com/eval/custom-adapter:v2.1"
  },
  {
    "op": "replace",
    "path": "/description",
    "value": "Updated evaluation adapter with bug fixes"
  }
]
{- "resource": {
- "id": "c4d5e6f7-8901-2345-bcde-f67890123456",
- "tenant": "default",
- "created_at": "2026-01-20T10:00:00Z",
- "updated_at": "2026-02-06T09:15:00Z"
}, - "name": "my-custom-evaluator",
- "title": "Custom Internal Evaluator",
- "description": "Updated evaluation adapter with bug fixes",
- "tags": [
- "custom",
- "internal"
], - "runtime": {
- "k8s": {
- "image": "registry.internal.example.com/eval/custom-adapter:v2.1",
- "entrypoint": [
- "/opt/app-root/bin/python",
- "/opt/app-root/src/main.py"
], - "cpu_request": "500m",
- "memory_request": "1Gi",
- "cpu_limit": "2",
- "memory_limit": "4Gi"
}
}, - "benchmarks": [
- {
- "id": "domain-qa",
- "name": "Domain Q&A Accuracy",
- "description": "Measures accuracy on domain-specific question answering",
- "category": "reasoning",
- "metrics": [
- "acc",
- "f1"
], - "primary_score": {
- "metric": "acc",
- "lower_is_better": false
}, - "pass_criteria": {
- "threshold": 0.5
}
}
]
}