Add harness app: agent orchestrator with cluster deployment

- Next.js app for orchestrating coding agent benchmarks (Claude Code, Codex, OpenCode)
- Dockerfile installs git, gh CLI, and agent CLIs for headless execution
- K8s deployment with workspace volume, sealed credentials for Claude + OpenCode
- Traefik IngressRoute at harness.coreworlds.io with internal-only middleware + TLS
- CI pipeline path filter for harness builds
- Fix OpenCode runtime flags (subcommand-based headless mode)
This commit is contained in:
Julia McGhee
2026-03-21 15:26:09 +00:00
parent 9e7077cd82
commit 6dde7c8aef
46 changed files with 4675 additions and 0 deletions

View File

@@ -0,0 +1,154 @@
// Agent runtime definitions and configuration
// Identifiers for the supported coding-agent CLIs.
export type AgentRuntime = "claude-code" | "codex" | "opencode";
// Static description of how to drive one runtime from the command line.
// Consumed by buildAgentCommand to assemble the headless argv.
export interface AgentRuntimeInfo {
id: AgentRuntime;
name: string;
description: string;
defaultProviders: string[]; // which AI providers this runtime supports
cliCommand: string; // base CLI command
headlessFlag: string; // flag (or subcommand — opencode uses `run`) to run headless
modelFlag: string; // flag to specify model
promptFlag: string; // flag to pass the prompt/task; "" means prompt is positional
}
// Registry of CLI invocation details per runtime.
export const AGENT_RUNTIMES: Record<AgentRuntime, AgentRuntimeInfo> = {
"claude-code": {
id: "claude-code",
name: "Claude Code",
description: "Anthropic's agentic coding CLI. Supports Claude models via Anthropic API or Bedrock.",
defaultProviders: ["anthropic"],
cliCommand: "claude",
headlessFlag: "--print",
modelFlag: "--model",
// NOTE(review): Claude Code's headless mode is typically `claude --print "<prompt>"`
// with the prompt positional — confirm a `--prompt` flag exists in the pinned CLI version.
promptFlag: "--prompt",
},
"codex": {
id: "codex",
name: "Codex CLI",
description: "OpenAI's open-source coding agent. Supports OpenAI models.",
defaultProviders: ["openai"],
cliCommand: "codex",
headlessFlag: "--quiet",
modelFlag: "--model",
promptFlag: "", // prompt is positional
},
"opencode": {
id: "opencode",
name: "OpenCode",
description: "Open-source multi-provider coding agent. Supports Anthropic, OpenAI, Google, OpenRouter.",
defaultProviders: ["anthropic", "openai", "google", "openrouter", "opencode-zen"],
cliCommand: "opencode",
headlessFlag: "run", // subcommand, not a flag
modelFlag: "--model",
promptFlag: "", // prompt is positional (like codex)
},
};
// ─── AGENT CONFIGURATIONS ────────────────────────────────────
// A benchmarkable agent: which runtime to launch and which model/provider
// it should use.
export interface AgentConfig {
id: string;
name: string;
runtime: AgentRuntime;
modelId: string;
provider: string;
maxTokens?: number;
env?: Record<string, string>; // additional env vars for the agent process
}
// In-memory registry, keyed by config id.
const configs: Map<string, AgentConfig> = new Map();

/** Returns every registered agent configuration. */
export function getAllAgentConfigs(): AgentConfig[] {
return [...configs.values()];
}

/** Looks up one configuration by id; undefined when absent. */
export function getAgentConfig(id: string): AgentConfig | undefined {
return configs.get(id);
}

/** Inserts or replaces a configuration and echoes it back. */
export function upsertAgentConfig(config: AgentConfig): AgentConfig {
configs.set(config.id, config);
return config;
}

/** Removes a configuration; true when something was deleted. */
export function deleteAgentConfig(id: string): boolean {
return configs.delete(id);
}
// ─── SEED DATA ──────────────────────────────────────────────
// Default agent configurations available out of the box; editable at
// runtime via upsertAgentConfig/deleteAgentConfig.
const SEED_CONFIGS: AgentConfig[] = [
{
id: "agent-claude-opus",
name: "Claude Code · Opus 4",
runtime: "claude-code",
modelId: "claude-opus-4-20250514",
provider: "anthropic",
},
{
id: "agent-claude-sonnet",
name: "Claude Code · Sonnet 4",
runtime: "claude-code",
modelId: "claude-sonnet-4-20250514",
provider: "anthropic",
},
{
id: "agent-codex-o3",
name: "Codex · o3",
runtime: "codex",
modelId: "o3",
provider: "openai",
},
{
id: "agent-codex-o4mini",
name: "Codex · o4-mini",
runtime: "codex",
modelId: "o4-mini",
provider: "openai",
},
{
id: "agent-opencode-sonnet",
name: "OpenCode · Sonnet 4",
runtime: "opencode",
modelId: "claude-sonnet-4-20250514",
provider: "anthropic",
},
{
id: "agent-opencode-gemini",
name: "OpenCode · Gemini 2.5 Pro",
runtime: "opencode",
modelId: "gemini-2.5-pro",
provider: "google",
},
];
// Populates the registry once; no-op when configs already exist.
function seedAgents() {
if (configs.size > 0) return;
for (const c of SEED_CONFIGS) {
configs.set(c.id, c);
}
}
// Seed at module load.
seedAgents();
// ─── CLI BUILDER ────────────────────────────────────────────
/**
 * Builds the argv (command followed by args) to invoke an agent headlessly.
 * Runtimes with an empty promptFlag (codex, opencode) receive the prompt as
 * a positional argument; the others get `<flag> <prompt>`.
 * workDir is part of the interface but unused here — the caller sets cwd.
 */
export function buildAgentCommand(config: AgentConfig, prompt: string, workDir: string): string[] {
const runtime = AGENT_RUNTIMES[config.runtime];
const args: string[] = [runtime.cliCommand];
if (runtime.headlessFlag) {
args.push(runtime.headlessFlag);
}
if (runtime.modelFlag && config.modelId) {
args.push(runtime.modelFlag, config.modelId);
}
if (!runtime.promptFlag) {
// Positional prompt (codex; opencode's `run` subcommand).
args.push(prompt);
} else {
args.push(runtime.promptFlag, prompt);
}
return args;
}

View File

@@ -0,0 +1,57 @@
// Supported credential providers: git hosting plus AI model providers.
export type Provider =
| "github" | "gitlab"
| "anthropic" | "openai" | "openrouter" | "google" | "opencode-zen";
// Providers used for repository access.
export const GIT_PROVIDERS: Provider[] = ["github", "gitlab"];
// Providers used for model API access.
export const AI_PROVIDERS: Provider[] = ["anthropic", "openai", "openrouter", "google", "opencode-zen"];
// A stored secret for one provider. `token` is held raw in memory.
export interface Credential {
id: string;
provider: Provider;
label: string;
token: string;
baseUrl?: string; // for self-hosted GitLab or custom endpoints
}
// In-memory store. Will be replaced with encrypted persistent storage.
const credentials: Map<string, Credential> = new Map();

/** All credentials with tokens masked for display. */
export function getAllCredentials(): Credential[] {
  return Array.from(credentials.values()).map(c => ({
    ...c,
    token: maskToken(c.token),
  }));
}

/** Credentials of one kind (git hosting vs AI provider), tokens masked. */
export function getCredentialsByKind(kind: "git" | "ai"): Credential[] {
  const providers = kind === "git" ? GIT_PROVIDERS : AI_PROVIDERS;
  return Array.from(credentials.values())
    .filter(c => providers.includes(c.provider))
    .map(c => ({ ...c, token: maskToken(c.token) }));
}

/** Single credential by id with the RAW token — do not send to clients. */
export function getCredential(id: string): Credential | undefined {
  return credentials.get(id);
}

/**
 * Credentials for one provider with tokens masked (display-safe).
 * Fix: this previously returned raw tokens, making it byte-identical to
 * getRawCredentialsByProvider and leaking secrets to display callers.
 */
export function getCredentialsByProvider(provider: Provider): Credential[] {
  return Array.from(credentials.values())
    .filter(c => c.provider === provider)
    .map(c => ({ ...c, token: maskToken(c.token) }));
}

/** Credentials for one provider with RAW tokens — for env injection only. */
export function getRawCredentialsByProvider(provider: Provider): Credential[] {
  return Array.from(credentials.values()).filter(c => c.provider === provider);
}

/** Inserts or replaces a credential; returns a masked copy for display. */
export function upsertCredential(cred: Credential): Credential {
  credentials.set(cred.id, cred);
  return { ...cred, token: maskToken(cred.token) };
}

/** Deletes a credential; true when it existed. */
export function deleteCredential(id: string): boolean {
  return credentials.delete(id);
}

/**
 * Masks a token for display. Tokens of 12 chars or fewer are fully hidden
 * (the old <=8 threshold revealed 8 of 9 chars of a 9-char token); longer
 * tokens keep the first and last 4 chars so users can tell them apart.
 */
function maskToken(token: string): string {
  if (token.length <= 12) return "••••••••";
  return token.slice(0, 4) + "••••" + token.slice(-4);
}

View File

@@ -0,0 +1,99 @@
import { Task, Eval } from "./types";
import { hasDiff, getDiffStats } from "./git-ops";
// Aggregate evaluation outcome for one iteration.
export interface EvalResult {
evals: Record<string, Eval>; // keyed by criterion label
allPassed: boolean;
diagnosis: string; // human-readable pass/fail summary
diffStats: string; // `git diff --stat` output for the workdir
}
// Simple target DSL:
// exitCode:0 — exit code equals value
// contains:<text> — agent output contains text
// filesChanged:>0 — git diff has changes
/**
 * Evaluates one criterion from the target DSL against the iteration
 * context. Supported targets: "exitCode:N", "contains:<text>",
 * "filesChanged:>0". Any unrecognized target fails closed.
 */
function evaluateCriterion(
  criterion: { label: string; target: string },
  context: { exitCode: number; agentOutput: string; hasChanges: boolean },
): Eval {
  const { label, target } = criterion;
  const base = { label, target };

  // exitCode:N — compare the process exit code to the expected value.
  const exitMatch = /^exitCode:(\d+)$/.exec(target);
  if (exitMatch) {
    return {
      ...base,
      value: context.exitCode,
      unit: "exit code",
      pass: context.exitCode === parseInt(exitMatch[1], 10),
    };
  }

  // contains:<text> — substring search over the agent's output.
  const containsMatch = /^contains:(.+)$/.exec(target);
  if (containsMatch) {
    const found = context.agentOutput.includes(containsMatch[1]);
    return {
      ...base,
      value: found ? "found" : "not found",
      unit: "",
      pass: found,
    };
  }

  // filesChanged:>0 — did the working tree change at all?
  if (target === "filesChanged:>0") {
    return {
      ...base,
      value: context.hasChanges ? ">0" : "0",
      unit: "files",
      pass: context.hasChanges,
    };
  }

  // Unknown target — always fail.
  return { ...base, value: "unknown", unit: "", pass: false };
}
/**
 * Runs every criterion in the task spec against the agent's output and the
 * git state of the work directory, producing per-criterion evals plus an
 * overall pass/fail diagnosis.
 */
export async function evaluate(opts: {
  task: Task;
  iterationNumber: number;
  agentOutput: string;
  exitCode: number;
  workDir: string;
}): Promise<EvalResult> {
  const { task, agentOutput, exitCode, workDir } = opts;
  const hasChanges = await hasDiff(workDir);
  const diffStats = await getDiffStats(workDir);
  const context = { exitCode, agentOutput, hasChanges };

  const evals: Record<string, Eval> = {};
  const failures: string[] = [];
  task.spec.criteria.forEach((criterion) => {
    const result = evaluateCriterion(criterion, context);
    evals[criterion.label] = result;
    if (!result.pass) {
      failures.push(`${criterion.label}: expected ${criterion.target}, got ${result.value}`);
    }
  });

  const allPassed = failures.length === 0;
  return {
    evals,
    allPassed,
    diagnosis: allPassed
      ? "All criteria passed."
      : `Failed criteria:\n${failures.map((f) => `- ${f}`).join("\n")}`,
    diffStats,
  };
}

View File

@@ -0,0 +1,158 @@
import { spawn, ChildProcess } from "node:child_process";
import { getAgentConfig, buildAgentCommand, AGENT_RUNTIMES } from "./agents";
import { getRawCredentialsByProvider, Provider } from "./credentials";
import { ExecutionResult } from "./types";
// Hard cap on how long one agent invocation may run before SIGTERM.
const DEFAULT_TIMEOUT_MS = 10 * 60 * 1000; // 10 minutes
// Maps AI providers to their env var names
const PROVIDER_ENV_VARS: Record<string, string> = {
anthropic: "ANTHROPIC_API_KEY",
openai: "OPENAI_API_KEY",
google: "GOOGLE_API_KEY",
openrouter: "OPENROUTER_API_KEY",
"opencode-zen": "OPENCODE_ZEN_API_KEY",
};
// Best-effort token extraction regexes per runtime.
// Currently identical across runtimes; kept per-runtime so they can
// diverge as each CLI's output format changes.
const TOKEN_PATTERNS: Record<string, { input: RegExp; output: RegExp }> = {
"claude-code": {
input: /input[_\s]tokens?[:\s]+(\d[\d,]*)/i,
output: /output[_\s]tokens?[:\s]+(\d[\d,]*)/i,
},
codex: {
input: /input[_\s]tokens?[:\s]+(\d[\d,]*)/i,
output: /output[_\s]tokens?[:\s]+(\d[\d,]*)/i,
},
opencode: {
input: /input[_\s]tokens?[:\s]+(\d[\d,]*)/i,
output: /output[_\s]tokens?[:\s]+(\d[\d,]*)/i,
},
};
/**
 * Extracts a token count from CLI output using the given pattern.
 * Comma separators are stripped ("1,234" -> 1234); returns 0 when the
 * pattern does not match.
 */
function parseTokenCount(text: string, pattern: RegExp): number {
  const match = pattern.exec(text);
  if (match === null) return 0;
  const digits = match[1].replace(/,/g, "");
  return parseInt(digits, 10);
}
/**
 * Spawns a coding-agent CLI headlessly in workDir and resolves with its
 * captured output, exit code, wall-clock duration, and best-effort token
 * counts parsed from the combined output.
 *
 * Credential injection: OpenCode is multi-provider, so it receives every
 * available AI provider key; other runtimes only receive their configured
 * provider's key. GitHub tokens are exposed as GITHUB_TOKEN/GH_TOKEN for
 * git operations the agent may perform. config.env overrides win last.
 *
 * The returned promise never rejects once spawning begins: spawn errors
 * resolve with exitCode 1 and the error message appended to stderr.
 * Throws only when the agent config id is unknown.
 */
export async function executeAgent(opts: {
  agentId: string;
  prompt: string;
  workDir: string;
  timeoutMs?: number;
  signal?: AbortSignal;
}): Promise<ExecutionResult> {
  const config = getAgentConfig(opts.agentId);
  if (!config) {
    throw new Error(`Agent config not found: ${opts.agentId}`);
  }
  const args = buildAgentCommand(config, opts.prompt, opts.workDir);
  const command = args[0];
  const commandArgs = args.slice(1);
  // Build environment with credentials
  const env: NodeJS.ProcessEnv = { ...process.env };
  // OpenCode is multi-provider: inject all available keys; other runtimes
  // only need their configured provider's key.
  const providersToInject =
    config.runtime === "opencode"
      ? Object.keys(PROVIDER_ENV_VARS)
      : [config.provider];
  for (const provider of providersToInject) {
    const envVar = PROVIDER_ENV_VARS[provider];
    if (!envVar) continue;
    const creds = getRawCredentialsByProvider(provider as Provider);
    if (creds.length > 0) {
      env[envVar] = creds[0].token;
    }
  }
  // Set GitHub token for git operations within agent
  const ghCreds = getRawCredentialsByProvider("github" as Provider);
  if (ghCreds.length > 0) {
    env.GITHUB_TOKEN = ghCreds[0].token;
    env.GH_TOKEN = ghCreds[0].token;
  }
  // Agent-specific env from the config takes precedence over injected keys.
  if (config.env) {
    Object.assign(env, config.env);
  }
  const timeout = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const startTime = Date.now();
  return new Promise<ExecutionResult>((resolve) => {
    const child: ChildProcess = spawn(command, commandArgs, {
      cwd: opts.workDir,
      env,
      stdio: ["ignore", "pipe", "pipe"],
    });
    let stdout = "";
    let stderr = "";
    let killed = false;
    // SIGKILL escalation timer, armed when we send SIGTERM. Fix: this was
    // previously never cleared, so it kept the event loop alive for 5s
    // after a clean exit (and fired against an already-dead process).
    let killTimer: ReturnType<typeof setTimeout> | null = null;
    const terminate = () => {
      killed = true;
      child.kill("SIGTERM");
      killTimer = setTimeout(() => child.kill("SIGKILL"), 5000);
    };
    child.stdout!.on("data", (chunk: Buffer) => {
      stdout += chunk.toString();
    });
    child.stderr!.on("data", (chunk: Buffer) => {
      stderr += chunk.toString();
    });
    // Timeout
    const timer = setTimeout(terminate, timeout);
    // Cancellation via AbortSignal. Fix: a signal that was already aborted
    // before spawn was previously ignored and the task ran to completion.
    const onAbort = () => terminate();
    if (opts.signal?.aborted) {
      terminate();
    } else {
      opts.signal?.addEventListener("abort", onAbort, { once: true });
    }
    const cleanup = () => {
      clearTimeout(timer);
      if (killTimer !== null) clearTimeout(killTimer);
      opts.signal?.removeEventListener("abort", onAbort);
    };
    child.on("close", (code: number | null) => {
      cleanup();
      const durationMs = Date.now() - startTime;
      const combined = stdout + "\n" + stderr;
      const runtime = AGENT_RUNTIMES[config.runtime];
      const patterns = TOKEN_PATTERNS[runtime.id] ?? TOKEN_PATTERNS["claude-code"];
      resolve({
        exitCode: code ?? 1,
        stdout,
        stderr,
        durationMs,
        inputTokens: parseTokenCount(combined, patterns.input),
        outputTokens: parseTokenCount(combined, patterns.output),
        killed,
      });
    });
    child.on("error", (err: Error) => {
      cleanup();
      resolve({
        exitCode: 1,
        stdout,
        stderr: stderr + "\n" + err.message,
        durationMs: Date.now() - startTime,
        inputTokens: 0,
        outputTokens: 0,
        killed: false,
      });
    });
  });
}

View File

@@ -0,0 +1,149 @@
import { execFile } from "node:child_process";
import { promisify } from "node:util";
import { mkdir } from "node:fs/promises";
import path from "node:path";
const exec = promisify(execFile);
// Root directory for all harness state; overridable for cluster volumes.
const WORK_DIR = process.env.HARNESS_WORK_DIR || "/tmp/harness";
// Directory holding the shared bare clones.
function reposDir(): string {
return path.join(WORK_DIR, "repos");
}
// Per-task state directory.
export function taskDir(taskId: string): string {
return path.join(WORK_DIR, "tasks", taskId);
}
// Worktree directory for one iteration of a task.
export function iterationDir(taskId: string, iteration: number): string {
return path.join(taskDir(taskId), `iter-${iteration}`);
}
/**
 * Builds an HTTPS clone URL with the access token embedded as userinfo.
 * repo format: "owner/name".
 *
 * Fix: the token is now URL-encoded — previously it was interpolated raw,
 * so a token containing ':', '@' or '/' would corrupt the URL.
 */
export function buildAuthenticatedCloneUrl(
  repo: string,
  provider: "github" | "gitlab",
  token: string,
): string {
  const encoded = encodeURIComponent(token);
  if (provider === "gitlab") {
    return `https://oauth2:${encoded}@gitlab.com/${repo}.git`;
  }
  return `https://x-access-token:${encoded}@github.com/${repo}.git`;
}
// Filesystem location of the shared bare clone for a repo slug.
function bareClonePath(slug: string): string {
return path.join(reposDir(), `${slug}.git`);
}
// Ensures an up-to-date bare clone of the repo exists and returns its path.
// Fetch-first: a successful `git fetch --all` means the clone already
// exists and is now current; any failure falls back to a fresh bare clone.
// NOTE(review): if the fetch fails transiently (e.g. network) while the
// clone directory exists, the fallback clone into that directory will also
// fail — consider distinguishing "missing clone" from "fetch error".
export async function ensureBareClone(
repoUrl: string,
slug: string,
): Promise<string> {
const clonePath = bareClonePath(slug);
await mkdir(reposDir(), { recursive: true });
try {
// Try fetching first (repo already cloned)
await exec("git", ["fetch", "--all"], { cwd: clonePath });
} catch {
// Clone bare
await exec("git", ["clone", "--bare", repoUrl, clonePath]);
}
return clonePath;
}
// Creates a git worktree at worktreePath on a NEW branch `branch`,
// optionally starting from `base` (a commit-ish).
// NOTE(review): `-b` fails if the branch already exists, so callers must
// use a unique branch name per worktree.
export async function createWorktree(
bareClone: string,
worktreePath: string,
branch: string,
base?: string,
): Promise<void> {
await mkdir(path.dirname(worktreePath), { recursive: true });
const args = ["worktree", "add", worktreePath, "-b", branch];
if (base) args.push(base);
await exec("git", args, { cwd: bareClone });
}
// Removes a worktree, discarding uncommitted changes (--force). Failures
// are swallowed deliberately: cleanup is best-effort and a missing or
// locked worktree must not fail the surrounding task pipeline.
export async function removeWorktree(
bareClone: string,
worktreePath: string,
): Promise<void> {
try {
await exec("git", ["worktree", "remove", "--force", worktreePath], {
cwd: bareClone,
});
} catch {
// Best-effort cleanup
}
}
// Returns `git diff --stat HEAD` for the work directory, or "" on any git
// error (e.g. workDir is not a repository). Covers tracked files only.
export async function getDiffStats(workDir: string): Promise<string> {
try {
const { stdout } = await exec("git", ["diff", "--stat", "HEAD"], {
cwd: workDir,
});
return stdout.trim();
} catch {
return "";
}
}
/**
 * True when the work directory contains any change the agent produced:
 * modified/staged tracked files OR newly created untracked files.
 *
 * Fix: previously used `git diff --name-only HEAD`, which ignores
 * untracked files — an agent that only created new files was scored as
 * having changed nothing. `git status --porcelain` reports both.
 * Returns false on any git error (e.g. workDir is not a repository).
 */
export async function hasDiff(workDir: string): Promise<boolean> {
  try {
    const { stdout } = await exec(
      "git",
      ["status", "--porcelain"],
      { cwd: workDir },
    );
    return stdout.trim().length > 0;
  } catch {
    return false;
  }
}
// Stages everything (additions, modifications, deletions) and commits.
// --allow-empty ensures a commit is created even when nothing changed.
export async function commitAll(
workDir: string,
message: string,
): Promise<void> {
await exec("git", ["add", "-A"], { cwd: workDir });
await exec("git", ["commit", "-m", message, "--allow-empty"], {
cwd: workDir,
});
}
// Pushes the branch to origin. --force-with-lease allows re-pushing a
// rewritten branch while refusing to clobber remote commits we haven't seen.
export async function pushBranch(
workDir: string,
branch: string,
): Promise<void> {
await exec("git", ["push", "origin", branch, "--force-with-lease"], {
cwd: workDir,
});
}
/**
 * Opens a pull request via the GitHub CLI and returns its number and URL.
 *
 * Fix: `gh pr create` does not support a `--json` flag (that exists on
 * `gh pr view`/`list`), so the old invocation always failed. On success
 * `gh pr create` prints the new PR's URL on stdout; we parse the number
 * from the trailing `/pull/<n>` path segment.
 *
 * @throws when gh exits non-zero or the output contains no PR URL.
 */
export async function createPullRequest(opts: {
  repo: string;
  head: string;
  title: string;
  body: string;
  token: string;
}): Promise<{ number: number; url: string }> {
  const { stdout } = await exec(
    "gh",
    [
      "pr",
      "create",
      "--repo", opts.repo,
      "--head", opts.head,
      "--title", opts.title,
      "--body", opts.body,
    ],
    {
      env: { ...process.env, GH_TOKEN: opts.token },
    },
  );
  // The URL is the last non-empty line of stdout.
  const url = stdout.trim().split("\n").pop() ?? "";
  const match = url.match(/\/pull\/(\d+)\s*$/);
  if (!match) {
    throw new Error(`Could not parse PR URL from gh output: ${stdout}`);
  }
  return { number: parseInt(match[1], 10), url };
}

View File

@@ -0,0 +1,135 @@
import { getRawCredentialsByProvider } from "./credentials";
// Normalized model metadata as returned by each provider's list endpoint.
export interface ModelInfo {
id: string; // provider-native model id (e.g. "gpt-4o", "gemini-2.5-pro")
name: string; // display name; falls back to id when the API gives none
provider: string;
contextWindow?: number; // tokens; only populated where the API reports it
}
/**
 * Fetches model listings from every configured provider in parallel.
 * Providers that fail (no credential, network/API error) are skipped so a
 * single outage never empties the whole list.
 */
export async function fetchAllModels(): Promise<ModelInfo[]> {
  const fetchers = [
    fetchAnthropicModels,
    fetchOpenAIModels,
    fetchOpenRouterModels,
    fetchGoogleModels,
  ];
  const settled = await Promise.allSettled(fetchers.map((f) => f()));
  const models: ModelInfo[] = [];
  for (const outcome of settled) {
    if (outcome.status === "fulfilled") {
      models.push(...outcome.value);
    }
  }
  return models;
}
/**
 * Lists models from the Anthropic API. Each stored anthropic credential is
 * tried in turn; the first successful response wins. Returns [] when no
 * credential exists or none works.
 */
async function fetchAnthropicModels(): Promise<ModelInfo[]> {
  for (const cred of getRawCredentialsByProvider("anthropic")) {
    try {
      const res = await fetch("https://api.anthropic.com/v1/models", {
        headers: {
          "x-api-key": cred.token,
          "anthropic-version": "2023-06-01",
        },
      });
      if (!res.ok) continue;
      const data = await res.json();
      const models: { id: string; display_name?: string }[] = data.data || [];
      return models.map((m) => ({
        id: m.id,
        name: m.display_name || m.id,
        provider: "anthropic",
      }));
    } catch {
      // Network/parse failure — fall through to the next credential.
    }
  }
  return [];
}
// Matches chat/agent-capable OpenAI model ids: gpt-*, chatgpt-*, and the
// o-series reasoning models (o1, o3, o4-mini, ...).
const OPENAI_CHAT_MODEL = /^(gpt-|chatgpt-|o\d)/;
/**
 * Lists chat-capable models from the OpenAI API (or a compatible endpoint
 * when the credential carries a custom baseUrl).
 *
 * Fix: the previous filter used `id.startsWith("o")`, which matched every
 * id beginning with the letter "o" (e.g. "omni-moderation-latest"), not
 * just the o-series reasoning models.
 */
async function fetchOpenAIModels(): Promise<ModelInfo[]> {
  const creds = getRawCredentialsByProvider("openai");
  if (creds.length === 0) return [];
  for (const cred of creds) {
    try {
      const baseUrl = cred.baseUrl || "https://api.openai.com";
      const res = await fetch(`${baseUrl}/v1/models`, {
        headers: { Authorization: `Bearer ${cred.token}` },
      });
      if (!res.ok) continue;
      const data = await res.json();
      return (data.data || [])
        .filter((m: { id: string }) => OPENAI_CHAT_MODEL.test(m.id))
        .map((m: { id: string }) => ({
          id: m.id,
          name: m.id,
          provider: "openai",
        }));
    } catch {
      continue;
    }
  }
  return [];
}
/**
 * Lists models from OpenRouter, including context window sizes. Each stored
 * openrouter credential is tried in turn; the first successful listing wins.
 */
async function fetchOpenRouterModels(): Promise<ModelInfo[]> {
  for (const cred of getRawCredentialsByProvider("openrouter")) {
    try {
      const res = await fetch("https://openrouter.ai/api/v1/models", {
        headers: { Authorization: `Bearer ${cred.token}` },
      });
      if (!res.ok) continue;
      const data = await res.json();
      const models: { id: string; name?: string; context_length?: number }[] = data.data || [];
      return models.map((m) => ({
        id: m.id,
        name: m.name || m.id,
        provider: "openrouter",
        contextWindow: m.context_length,
      }));
    } catch {
      // Network/parse failure — fall through to the next credential.
    }
  }
  return [];
}
/**
 * Lists Gemini models from the Google Generative Language API.
 *
 * Fix: the API key is now sent via the `x-goog-api-key` request header
 * instead of a `?key=` query parameter, keeping the secret out of URLs
 * (and therefore out of proxy and server logs).
 */
async function fetchGoogleModels(): Promise<ModelInfo[]> {
  const creds = getRawCredentialsByProvider("google");
  if (creds.length === 0) return [];
  for (const cred of creds) {
    try {
      const res = await fetch(
        "https://generativelanguage.googleapis.com/v1beta/models",
        { headers: { "x-goog-api-key": cred.token } }
      );
      if (!res.ok) continue;
      const data = await res.json();
      return (data.models || [])
        .filter((m: { name: string }) => m.name.includes("gemini"))
        .map((m: { name: string; displayName?: string; inputTokenLimit?: number }) => ({
          // API names are "models/<id>" — strip the prefix for a bare id.
          id: m.name.replace("models/", ""),
          name: m.displayName || m.name,
          provider: "google",
          contextWindow: m.inputTokenLimit,
        }));
    } catch {
      continue;
    }
  }
  return [];
}

View File

@@ -0,0 +1,154 @@
// Curated model list and usage tracking
// A model the harness knows about, with optional pricing for cost rollups.
export interface CuratedModel {
id: string;
name: string;
provider: string;
enabled: boolean; // disabled models are hidden from selection (getEnabledModels)
contextWindow?: number;
costPer1kInput?: number; // USD per 1k input tokens
costPer1kOutput?: number; // USD per 1k output tokens
}
// One recorded agent invocation's token usage for a task iteration.
export interface ModelUsageEntry {
modelId: string;
provider: string;
taskId: string;
taskSlug: string;
iteration: number;
inputTokens: number;
outputTokens: number;
durationMs: number;
timestamp: number; // epoch millis
}
// Per provider:model aggregate computed from the usage log.
export interface ModelUsageSummary {
modelId: string;
provider: string;
totalInputTokens: number;
totalOutputTokens: number;
totalCost: number; // USD, priced from the curated model's costs (0 if unknown)
totalRequests: number;
totalDurationMs: number;
}
// In-memory stores (reset on process restart).
const curatedModels: Map<string, CuratedModel> = new Map();
const usageLog: ModelUsageEntry[] = [];
// ─── CURATED MODELS ─────────────────────────────────────────

/** Every curated model, enabled or not. */
export function getCuratedModels(): CuratedModel[] {
  return [...curatedModels.values()];
}

/** Only the models currently enabled for selection. */
export function getEnabledModels(): CuratedModel[] {
  return [...curatedModels.values()].filter((m) => m.enabled);
}

/** Adds or replaces a curated model entry. */
export function upsertCuratedModel(model: CuratedModel): CuratedModel {
  curatedModels.set(model.id, model);
  return model;
}

/** Removes a curated model; true when it existed. */
export function removeCuratedModel(id: string): boolean {
  return curatedModels.delete(id);
}

/** Flips a model's enabled flag; undefined when the id is unknown. */
export function toggleModelEnabled(id: string): CuratedModel | undefined {
  const model = curatedModels.get(id);
  if (model === undefined) return undefined;
  model.enabled = !model.enabled;
  curatedModels.set(id, model);
  return model;
}

/** Updates per-1k-token pricing; undefined when the id is unknown. */
export function updateModelCost(id: string, costPer1kInput: number, costPer1kOutput: number): CuratedModel | undefined {
  const model = curatedModels.get(id);
  if (model === undefined) return undefined;
  model.costPer1kInput = costPer1kInput;
  model.costPer1kOutput = costPer1kOutput;
  curatedModels.set(id, model);
  return model;
}
// ─── USAGE TRACKING ─────────────────────────────────────────

/** Appends one usage record to the in-memory log. */
export function recordUsage(entry: ModelUsageEntry): void {
  usageLog.push(entry);
}

/** Snapshot copy of the usage log. */
export function getUsageLog(): ModelUsageEntry[] {
  return usageLog.slice();
}
/**
 * Aggregates the usage log per provider:model pair, pricing each entry via
 * the curated model's per-1k-token costs (0 when no pricing is known).
 * Sorted by total cost, most expensive first.
 */
export function getUsageSummary(): ModelUsageSummary[] {
  const grouped = new Map<string, ModelUsageSummary>();
  for (const entry of usageLog) {
    const key = `${entry.provider}:${entry.modelId}`;
    const model = curatedModels.get(entry.modelId);
    const inputCost = model?.costPer1kInput ? (entry.inputTokens / 1000) * model.costPer1kInput : 0;
    const outputCost = model?.costPer1kOutput ? (entry.outputTokens / 1000) * model.costPer1kOutput : 0;
    let summary = grouped.get(key);
    if (summary === undefined) {
      summary = {
        modelId: entry.modelId,
        provider: entry.provider,
        totalInputTokens: 0,
        totalOutputTokens: 0,
        totalCost: 0,
        totalRequests: 0,
        totalDurationMs: 0,
      };
      grouped.set(key, summary);
    }
    summary.totalInputTokens += entry.inputTokens;
    summary.totalOutputTokens += entry.outputTokens;
    summary.totalCost += inputCost + outputCost;
    summary.totalRequests += 1;
    summary.totalDurationMs += entry.durationMs;
  }
  return [...grouped.values()].sort((a, b) => b.totalCost - a.totalCost);
}
// ─── SEED DATA ──────────────────────────────────────────────
// Pre-populate with well-known models and pricing
// NOTE(review): prices are hard-coded snapshots — verify against current
// provider pricing before relying on cost summaries.
const SEED_MODELS: Omit<CuratedModel, "enabled">[] = [
{ id: "claude-opus-4-20250514", name: "Claude Opus 4", provider: "anthropic", contextWindow: 200000, costPer1kInput: 0.015, costPer1kOutput: 0.075 },
{ id: "claude-sonnet-4-20250514", name: "Claude Sonnet 4", provider: "anthropic", contextWindow: 200000, costPer1kInput: 0.003, costPer1kOutput: 0.015 },
{ id: "claude-haiku-4-20250514", name: "Claude Haiku 4", provider: "anthropic", contextWindow: 200000, costPer1kInput: 0.0008, costPer1kOutput: 0.004 },
{ id: "gpt-4o", name: "GPT-4o", provider: "openai", contextWindow: 128000, costPer1kInput: 0.0025, costPer1kOutput: 0.01 },
{ id: "gpt-4o-mini", name: "GPT-4o Mini", provider: "openai", contextWindow: 128000, costPer1kInput: 0.00015,costPer1kOutput: 0.0006 },
{ id: "o3", name: "o3", provider: "openai", contextWindow: 200000, costPer1kInput: 0.01, costPer1kOutput: 0.04 },
{ id: "o4-mini", name: "o4 Mini", provider: "openai", contextWindow: 200000, costPer1kInput: 0.0011, costPer1kOutput: 0.0044 },
{ id: "gemini-2.5-pro", name: "Gemini 2.5 Pro", provider: "google", contextWindow: 1048576,costPer1kInput: 0.00125,costPer1kOutput: 0.01 },
{ id: "gemini-2.5-flash", name: "Gemini 2.5 Flash", provider: "google", contextWindow: 1048576,costPer1kInput: 0.00015,costPer1kOutput: 0.0006 },
];
// Sample usage history so dashboards have data before any real runs.
const SEED_USAGE: Omit<ModelUsageEntry, "timestamp">[] = [
{ modelId: "claude-sonnet-4-20250514", provider: "anthropic", taskId: "task-002", taskSlug: "haiku-moderation-tier2", iteration: 1, inputTokens: 48200, outputTokens: 12400, durationMs: 34000 },
{ modelId: "claude-sonnet-4-20250514", provider: "anthropic", taskId: "task-002", taskSlug: "haiku-moderation-tier2", iteration: 2, inputTokens: 52100, outputTokens: 15800, durationMs: 41000 },
{ modelId: "claude-sonnet-4-20250514", provider: "anthropic", taskId: "task-002", taskSlug: "haiku-moderation-tier2", iteration: 3, inputTokens: 61300, outputTokens: 18200, durationMs: 45000 },
{ modelId: "claude-sonnet-4-20250514", provider: "anthropic", taskId: "task-002", taskSlug: "haiku-moderation-tier2", iteration: 4, inputTokens: 55000, outputTokens: 14600, durationMs: 38000 },
{ modelId: "claude-opus-4-20250514", provider: "anthropic", taskId: "task-001", taskSlug: "pubsub-pipeline-migration", iteration: 1, inputTokens: 85400, outputTokens: 28900, durationMs: 92000 },
{ modelId: "claude-opus-4-20250514", provider: "anthropic", taskId: "task-001", taskSlug: "pubsub-pipeline-migration", iteration: 2, inputTokens: 91200, outputTokens: 31400, durationMs: 98000 },
{ modelId: "claude-opus-4-20250514", provider: "anthropic", taskId: "task-001", taskSlug: "pubsub-pipeline-migration", iteration: 3, inputTokens: 78600, outputTokens: 22100, durationMs: 85000 },
{ modelId: "gpt-4o", provider: "openai", taskId: "task-001", taskSlug: "pubsub-pipeline-migration", iteration: 1, inputTokens: 42000, outputTokens: 9800, durationMs: 28000 },
];
// Idempotent seeding: all seed models start enabled; usage entries are
// timestamped 30 minutes apart, ending near "now".
export function seedData() {
if (curatedModels.size > 0) return; // already seeded
for (const m of SEED_MODELS) {
curatedModels.set(m.id, { ...m, enabled: true });
}
const now = Date.now();
for (let i = 0; i < SEED_USAGE.length; i++) {
usageLog.push({ ...SEED_USAGE[i], timestamp: now - (SEED_USAGE.length - i) * 1000 * 60 * 30 });
}
}
seedData();

View File

@@ -0,0 +1,316 @@
import {
getTask,
updateTask,
appendIteration,
updateIteration,
getFirstPendingTask,
getRunningTasks,
} from "./store";
import { recordUsage } from "./model-store";
import { getAgentConfig } from "./agents";
import { getRawCredentialsByProvider } from "./credentials";
import {
ensureBareClone,
createWorktree,
removeWorktree,
iterationDir,
buildAuthenticatedCloneUrl,
commitAll,
pushBranch,
createPullRequest,
} from "./git-ops";
import { executeAgent } from "./executor";
import { buildPrompt } from "./prompt-builder";
import { evaluate } from "./evaluator";
import { Task, Iteration } from "./types";
const POLL_INTERVAL_MS = 2000;
// Module-level orchestrator state: at most one task executes at a time.
let pollTimer: ReturnType<typeof setInterval> | null = null;
let running = false; // whether the polling loop has been started
let currentTaskId: string | null = null; // id of the task in flight, if any
let currentAbort: AbortController | null = null; // abort handle for the task in flight
// Whether the orchestrator loop has been started.
export function isRunning(): boolean {
return running;
}
// Id of the task currently being processed, or null when idle.
export function currentRunningTaskId(): string | null {
return currentTaskId;
}
/**
 * Starts the background polling loop. Idempotent — calling while already
 * running is a no-op. Tasks left in "running" state by a previous process
 * are first marked failed.
 */
export function startOrchestrator(): void {
  if (running) return;
  running = true;
  // Mark any crashed running tasks as failed on startup
  recoverCrashedTasks();
  pollTimer = setInterval(() => {
    // Skip ticks while a task is in flight; poll() re-checks this too.
    if (currentTaskId === null) {
      poll();
    }
  }, POLL_INTERVAL_MS);
  // Kick off the first poll without waiting for the interval.
  poll();
}
/** Stops the polling loop. A task already in flight is not interrupted. */
export function stopOrchestrator(): void {
  running = false;
  if (pollTimer !== null) {
    clearInterval(pollTimer);
    pollTimer = null;
  }
}
/**
 * Requests cancellation of the given task. Only the currently-executing
 * task can be cancelled; returns false for any other id.
 */
export function cancelTask(taskId: string): boolean {
  if (currentTaskId !== taskId) {
    return false;
  }
  currentAbort?.abort();
  return true;
}
function recoverCrashedTasks(): void {
const runningTasks = getRunningTasks();
for (const task of runningTasks) {
// Mark running iterations as failed
const updatedIterations = task.iterations.map((iter) =>
iter.status === "running"
? { ...iter, status: "failed" as const, diagnosis: "Interrupted — server restarted", completedAt: Date.now() }
: iter,
);
updateTask(task.id, {
status: "failed",
iterations: updatedIterations,
completedAt: Date.now(),
});
}
}
// Picks up the oldest pending task, if any, and runs it to completion.
// currentTaskId is claimed synchronously (before the first await), so a
// concurrent interval tick cannot start a second task.
async function poll(): Promise<void> {
if (!running || currentTaskId) return;
const task = getFirstPendingTask();
if (!task) return;
currentTaskId = task.id;
currentAbort = new AbortController();
try {
await runTask(task);
} catch (err) {
// runTask handles its own expected failures; this is the last-resort net.
console.error(`[orchestrator] Task ${task.id} failed with error:`, err);
updateTask(task.id, {
status: "failed",
completedAt: Date.now(),
});
} finally {
// Release the slot so the next interval tick can pick up another task.
currentTaskId = null;
currentAbort = null;
}
}
// Runs one task end to end: clone, iterate the agent until the criteria
// pass or maxIterations is exhausted, then (on convergence) push the branch
// and open a PR. All failure paths mark the task failed; worktrees are
// cleaned up best-effort at the end either way.
async function runTask(task: Task): Promise<void> {
const agentConfig = getAgentConfig(task.spec.agentId);
if (!agentConfig) {
updateTask(task.id, {
status: "failed",
completedAt: Date.now(),
});
return;
}
// Determine git credentials and repo URL — the first stored GitHub token is used.
const gitCreds = getRawCredentialsByProvider("github");
const gitToken = gitCreds[0]?.token;
if (!gitToken) {
updateTask(task.id, {
status: "failed",
completedAt: Date.now(),
});
return;
}
const repoUrl = buildAuthenticatedCloneUrl(task.project, "github", gitToken);
updateTask(task.id, {
status: "running",
startedAt: Date.now(),
});
// Ensure bare clone
let bareClone: string;
try {
bareClone = await ensureBareClone(repoUrl, task.slug);
} catch (err) {
console.error(`[orchestrator] Failed to clone repo for task ${task.id}:`, err);
updateTask(task.id, {
status: "failed",
completedAt: Date.now(),
});
return;
}
const branchName = `harness/${task.slug}`;
let converged = false;
for (let n = 1; n <= task.maxIterations; n++) {
// Check for user cancellation between iterations.
if (currentAbort?.signal.aborted) {
updateTask(task.id, {
status: "failed",
completedAt: Date.now(),
});
return;
}
const result = await runIteration(task, n, bareClone, branchName);
if (!result) {
// Iteration was cancelled or errored fatally (runIteration already updated state)
return;
}
if (result.allPassed) {
converged = true;
break;
}
}
if (converged) {
// Push and create PR
try {
// NOTE(review): reads `.iteration` (singular) off the task, but the task
// carries an `iterations` array elsewhere — confirm this counter field
// exists on Task and points at the converged iteration.
const lastIterN = getTask(task.id)!.iteration;
const workDir = iterationDir(task.id, lastIterN);
await commitAll(workDir, `harness: ${task.goal}`);
await pushBranch(workDir, branchName);
const pr = await createPullRequest({
repo: task.project,
head: branchName,
title: `[harness] ${task.goal}`,
body: `Automated by harness orchestrator.\n\nTask: ${task.slug}\nIterations: ${lastIterN}`,
token: gitToken,
});
updateTask(task.id, {
status: "completed",
completedAt: Date.now(),
pr: { number: pr.number, title: `[harness] ${task.goal}`, status: "open" },
});
} catch (err) {
// Publishing is best-effort: the work converged, so the task is still
// marked completed even when push/PR creation fails (no `pr` recorded).
console.error(`[orchestrator] Failed to create PR for task ${task.id}:`, err);
updateTask(task.id, {
status: "completed",
completedAt: Date.now(),
});
}
} else {
updateTask(task.id, {
status: "failed",
completedAt: Date.now(),
});
}
// Cleanup worktrees (removeWorktree swallows failures)
const finalTask = getTask(task.id)!;
for (const iter of finalTask.iterations) {
await removeWorktree(bareClone, iterationDir(task.id, iter.n));
}
}
// Runs a single agent iteration: create a worktree, build a prompt that
// includes prior iteration feedback, execute the agent, evaluate the
// criteria, and record usage. Returns the pass verdict, or null when the
// iteration was cancelled or failed before evaluation (state already updated).
async function runIteration(
task: Task,
n: number,
bareClone: string,
branchName: string,
): Promise<{ allPassed: boolean } | null> {
const iteration: Iteration = {
n,
status: "running",
diagnosis: null,
startedAt: Date.now(),
};
appendIteration(task.id, iteration);
const workDir = iterationDir(task.id, n);
try {
// Iteration 1 uses the task branch; later iterations get a "-iter-N" suffixed
// branch, each created from HEAD.
// NOTE(review): because every worktree starts at "HEAD", iteration N does
// not build on iteration N-1's uncommitted work — confirm this reset-per-
// iteration behavior is intended.
const branchForWorktree = n === 1 ? branchName : `${branchName}-iter-${n}`;
await createWorktree(bareClone, workDir, branchForWorktree, "HEAD");
} catch (err) {
console.error(`[orchestrator] Failed to create worktree for iteration ${n}:`, err);
updateIteration(task.id, n, {
status: "failed",
diagnosis: `Failed to create worktree: ${err}`,
completedAt: Date.now(),
});
return null;
}
// Build prompt with prior iterations' diagnoses as feedback
const currentTask = getTask(task.id)!;
const priorIterations = currentTask.iterations.filter((i) => i.n < n);
const prompt = await buildPrompt({
task: currentTask,
iterationNumber: n,
priorIterations,
});
// Execute agent
const execResult = await executeAgent({
agentId: task.spec.agentId,
prompt,
workDir,
signal: currentAbort?.signal,
});
// Distinguish user cancellation (abort) from a timeout kill: only the
// former aborts the whole task here.
if (execResult.killed && currentAbort?.signal.aborted) {
updateIteration(task.id, n, {
status: "failed",
diagnosis: "Cancelled by user",
completedAt: Date.now(),
});
updateTask(task.id, { status: "failed", completedAt: Date.now() });
return null;
}
// Evaluate
const evalResult = await evaluate({
task: currentTask,
iterationNumber: n,
agentOutput: execResult.stdout,
exitCode: execResult.exitCode,
workDir,
});
// Record usage (best-effort token counts parsed from agent output)
const agentConfig = getAgentConfig(task.spec.agentId);
if (agentConfig) {
recordUsage({
modelId: agentConfig.modelId,
provider: agentConfig.provider,
taskId: task.id,
taskSlug: task.slug,
iteration: n,
inputTokens: execResult.inputTokens,
outputTokens: execResult.outputTokens,
durationMs: execResult.durationMs,
timestamp: Date.now(),
});
}
// Update iteration
updateIteration(task.id, n, {
status: evalResult.allPassed ? "passed" : "failed",
diagnosis: evalResult.diagnosis,
agentOutput: execResult.stdout.slice(-8000), // keep last 8k chars
evals: evalResult.evals,
diffStats: evalResult.diffStats,
completedAt: Date.now(),
});
// Update task-level evals (latest iteration's results win)
updateTask(task.id, { evals: evalResult.evals });
return { allPassed: evalResult.allPassed };
}

View File

@@ -0,0 +1,94 @@
import { readFile, readdir } from "node:fs/promises";
import path from "node:path";
import { Task, Iteration } from "./types";
// Root directory for reference docs that tasks can pull into their prompts.
// When unset, knowledge refs are silently ignored.
const KNOWLEDGE_DIR = process.env.HARNESS_KNOWLEDGE_DIR || "";
// Max characters of prior agent output quoted back into the prompt.
const MAX_AGENT_OUTPUT_LENGTH = 4000;
// Only the most recent N prior iterations are summarized in the prompt.
const MAX_PRIOR_ITERATIONS = 3;

/**
 * Assemble the markdown prompt handed to the coding agent for one iteration.
 *
 * Sections (joined by horizontal rules): task goal, success criteria,
 * constraints, reference material loaded from the knowledge dir, a summary
 * of the most recent prior iterations (diagnosis + evals, plus truncated
 * agent output for the latest one), and closing instructions.
 *
 * @param opts.task             Task whose goal/spec drive the prompt.
 * @param opts.iterationNumber  1-based number of the iteration about to run.
 * @param opts.priorIterations  Iterations that ran before this one.
 * @returns The complete prompt as a markdown string.
 */
export async function buildPrompt(opts: {
  task: Task;
  iterationNumber: number;
  priorIterations: Iteration[];
}): Promise<string> {
  const { task, iterationNumber, priorIterations } = opts;
  const sections: string[] = [];
  // Task goal
  sections.push(`# Task\n\n${task.goal}`);
  // Success criteria
  if (task.spec.criteria.length > 0) {
    const criteriaLines = task.spec.criteria
      .map((c) => `- **${c.label}**: ${c.target}`)
      .join("\n");
    sections.push(`# Success Criteria\n\n${criteriaLines}`);
  }
  // Constraints
  if (task.spec.constraints.length > 0) {
    const constraintLines = task.spec.constraints
      .map((c) => `- ${c}`)
      .join("\n");
    sections.push(`# Constraints\n\n${constraintLines}`);
  }
  // Knowledge references
  const knowledgeContent = await loadKnowledge(task.spec.knowledgeRefs);
  if (knowledgeContent) {
    sections.push(`# Reference Material\n\n${knowledgeContent}`);
  }
  // Prior iterations (most recent MAX_PRIOR_ITERATIONS only)
  if (priorIterations.length > 0) {
    const recentIterations = priorIterations.slice(-MAX_PRIOR_ITERATIONS);
    const priorLines = recentIterations.map((iter) => {
      // Heading reads e.g. "## Iteration 1 — failed" (the separator was
      // previously missing, rendering "Iteration 1failed").
      const parts = [`## Iteration ${iter.n} — ${iter.status}`];
      if (iter.diagnosis) {
        parts.push(`**Diagnosis:** ${iter.diagnosis}`);
      }
      if (iter.evals) {
        const evalSummary = Object.entries(iter.evals)
          .map(([key, ev]) => `- ${key}: ${ev.pass ? "PASS" : "FAIL"} (${ev.value} ${ev.unit}, target: ${ev.target})`)
          .join("\n");
        parts.push(`**Evals:**\n${evalSummary}`);
      }
      // Include truncated agent output only for the most recent iteration,
      // to keep the prompt small while still showing the latest attempt.
      if (iter === recentIterations[recentIterations.length - 1] && iter.agentOutput) {
        const truncated = iter.agentOutput.length > MAX_AGENT_OUTPUT_LENGTH
          ? iter.agentOutput.slice(-MAX_AGENT_OUTPUT_LENGTH) + "\n... (truncated)"
          : iter.agentOutput;
        parts.push(`**Agent Output (truncated):**\n\`\`\`\n${truncated}\n\`\`\``);
      }
      return parts.join("\n");
    });
    sections.push(`# Prior Iterations\n\n${priorLines.join("\n\n")}`);
  }
  // Instructions
  sections.push(
    `# Instructions\n\n` +
    `This is iteration ${iterationNumber} of ${task.maxIterations}.\n` +
    `Work in the current directory. Make all necessary changes to satisfy the success criteria.\n` +
    `If prior iterations failed, analyze the diagnosis and try a different approach.`,
  );
  return sections.join("\n\n---\n\n");
}

/**
 * Load the referenced knowledge files and concatenate them as markdown
 * sections. Returns "" when no knowledge dir is configured or `refs` is
 * empty; individual files that cannot be read are skipped (best-effort).
 */
async function loadKnowledge(refs: string[]): Promise<string> {
  if (!KNOWLEDGE_DIR || refs.length === 0) return "";
  const parts: string[] = [];
  for (const ref of refs) {
    try {
      // NOTE(review): path.resolve lets a "../" ref escape KNOWLEDGE_DIR —
      // confirm refs are trusted, or reject refs resolving outside the dir.
      const filePath = path.resolve(KNOWLEDGE_DIR, ref);
      const content = await readFile(filePath, "utf-8");
      parts.push(`## ${ref}\n\n${content}`);
    } catch {
      // Skip missing/unreadable knowledge files
    }
  }
  return parts.join("\n\n");
}

View File

@@ -0,0 +1,100 @@
import { getCredentialsByProvider } from "./credentials";
/** Normalized repository search result from GitHub or GitLab. */
export interface RepoResult {
  provider: "github" | "gitlab";
  fullName: string; // owner/name (GitHub) or full namespace path (GitLab)
  url: string; // web URL of the repository
  description: string; // empty string when the repo has no description
  defaultBranch: string; // falls back to "main" when the API omits it
  private: boolean; // GitLab: derived from visibility === "private"
}
/**
 * Search repositories across all configured providers.
 *
 * GitHub and GitLab are queried concurrently; a provider that fails simply
 * contributes no results. Queries shorter than two characters return nothing.
 */
export async function searchRepos(query: string): Promise<RepoResult[]> {
  if (!query || query.length < 2) {
    return [];
  }
  const outcomes = await Promise.allSettled([
    searchGitHub(query),
    searchGitLab(query),
  ]);
  const merged: RepoResult[] = [];
  for (const outcome of outcomes) {
    if (outcome.status === "fulfilled") {
      merged.push(...outcome.value);
    }
  }
  return merged;
}
/**
 * Search GitHub repositories visible to each stored GitHub credential.
 *
 * Queries the REST search API once per credential; failing credentials and
 * non-2xx responses are skipped so one bad token cannot break the search.
 * Results are de-duplicated by repository URL, since several credentials
 * can see the same repository (previously it appeared once per credential).
 */
async function searchGitHub(query: string): Promise<RepoResult[]> {
  const creds = getCredentialsByProvider("github");
  if (creds.length === 0) return [];
  const byUrl = new Map<string, RepoResult>();
  for (const cred of creds) {
    try {
      const res = await fetch(
        `https://api.github.com/search/repositories?q=${encodeURIComponent(query)}&per_page=10&sort=updated`,
        {
          headers: {
            Authorization: `Bearer ${cred.token}`,
            Accept: "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
          },
        }
      );
      if (!res.ok) continue; // bad token / rate limit — skip this credential
      const data = await res.json();
      for (const repo of data.items || []) {
        if (byUrl.has(repo.html_url)) continue; // already found via another credential
        byUrl.set(repo.html_url, {
          provider: "github",
          fullName: repo.full_name,
          url: repo.html_url,
          description: repo.description || "",
          defaultBranch: repo.default_branch || "main",
          private: repo.private,
        });
      }
    } catch {
      // Network/parse failure for this credential — skip it.
    }
  }
  return Array.from(byUrl.values());
}
/**
 * Search GitLab projects visible to each stored GitLab credential.
 *
 * Supports self-hosted instances via the credential's baseUrl (defaults to
 * gitlab.com). Failing credentials and non-2xx responses are skipped, and
 * results are de-duplicated by project web URL since multiple credentials
 * for the same instance can return the same project (previously it appeared
 * once per credential).
 */
async function searchGitLab(query: string): Promise<RepoResult[]> {
  const creds = getCredentialsByProvider("gitlab");
  if (creds.length === 0) return [];
  const byUrl = new Map<string, RepoResult>();
  for (const cred of creds) {
    const baseUrl = cred.baseUrl || "https://gitlab.com";
    try {
      const res = await fetch(
        `${baseUrl}/api/v4/projects?search=${encodeURIComponent(query)}&per_page=10&order_by=updated_at&membership=true`,
        {
          headers: {
            "PRIVATE-TOKEN": cred.token,
          },
        }
      );
      if (!res.ok) continue; // bad token / rate limit — skip this credential
      const data = await res.json();
      for (const project of data) {
        if (byUrl.has(project.web_url)) continue; // already found via another credential
        byUrl.set(project.web_url, {
          provider: "gitlab",
          fullName: project.path_with_namespace,
          url: project.web_url,
          description: project.description || "",
          defaultBranch: project.default_branch || "main",
          private: project.visibility === "private",
        });
      }
    } catch {
      // Network/parse failure for this credential — skip it.
    }
  }
  return Array.from(byUrl.values());
}

View File

@@ -0,0 +1,61 @@
import { Task } from "./types";
// In-memory task store. Will be replaced with persistent storage (CloudNativePG)
// once the orchestrator loop is wired up.
// In-memory task store keyed by task id. Map iteration order is insertion
// order, which getFirstPendingTask relies on for FIFO scheduling.
const tasks: Map<string, Task> = new Map();

/** All tasks in insertion order. */
export function getAllTasks(): Task[] {
  return Array.from(tasks.values());
}

/** Look up a task by id, or undefined if unknown. */
export function getTask(id: string): Task | undefined {
  return tasks.get(id);
}

/** Register a task (overwrites any existing task with the same id). */
export function createTask(task: Task): Task {
  tasks.set(task.id, task);
  return task;
}

/**
 * Shallow-merge `updates` into the stored task.
 * @returns The updated task, or undefined when the id is unknown.
 */
export function updateTask(id: string, updates: Partial<Task>): Task | undefined {
  const existing = tasks.get(id);
  if (!existing) return undefined;
  const updated = { ...existing, ...updates };
  tasks.set(id, updated);
  return updated;
}

/** Remove a task; returns true if it existed. */
export function deleteTask(id: string): boolean {
  return tasks.delete(id);
}

/**
 * Append an iteration record and advance the task's iteration counter.
 *
 * Stores a fresh task object (copy-on-write, consistent with updateTask)
 * instead of mutating the stored one in place, so references handed out by
 * getTask earlier are not silently changed underneath callers.
 */
export function appendIteration(id: string, iteration: import("./types").Iteration): Task | undefined {
  const existing = tasks.get(id);
  if (!existing) return undefined;
  const updated: Task = {
    ...existing,
    iterations: [...existing.iterations, iteration],
    iteration: iteration.n,
  };
  tasks.set(id, updated);
  return updated;
}

/**
 * Shallow-merge `updates` into the iteration numbered `iterationN`.
 * Copy-on-write like updateTask/appendIteration. When no iteration matches,
 * the task is stored unchanged and still returned.
 */
export function updateIteration(
  id: string,
  iterationN: number,
  updates: Partial<import("./types").Iteration>,
): Task | undefined {
  const existing = tasks.get(id);
  if (!existing) return undefined;
  const updated: Task = {
    ...existing,
    iterations: existing.iterations.map((iter) =>
      iter.n === iterationN ? { ...iter, ...updates } : iter,
    ),
  };
  tasks.set(id, updated);
  return updated;
}

/** Oldest pending task (FIFO by insertion order), if any. */
export function getFirstPendingTask(): Task | undefined {
  return Array.from(tasks.values()).find((t) => t.status === "pending");
}

/** All tasks currently executing. */
export function getRunningTasks(): Task[] {
  return Array.from(tasks.values()).filter((t) => t.status === "running");
}

View File

@@ -0,0 +1,68 @@
/** User-authored specification of a benchmark task. */
export interface TaskSpec {
  slug: string; // short identifier for the task
  goal: string; // natural-language objective, becomes the "# Task" prompt section
  project: string; // target project the task runs against
  agentId: string; // which configured agent runtime executes the task
  maxIterations: number; // retry budget before the task is given up
  criteria: { label: string; target: string }[]; // success criteria shown to the agent and evaluated per iteration
  constraints: string[]; // hard constraints included verbatim in the agent prompt
  knowledgeRefs: string[]; // file refs loaded from the knowledge dir into the prompt
}
/** Result of checking one success criterion. */
export interface Eval {
  label: string; // human-readable criterion name
  value: number | string; // measured value
  unit: string; // unit for `value` (may be empty)
  pass: boolean; // whether the measured value met the target
  target: string; // target the value was compared against
}
/** State of one agent attempt at a task. */
export interface Iteration {
  n: number; // 1-based iteration number
  status: "pending" | "running" | "passed" | "failed";
  diagnosis: string | null; // evaluator's explanation of the outcome (null while running)
  agentOutput?: string; // tail-truncated agent stdout, set on completion
  evals?: Record<string, Eval>; // per-criterion results
  diffStats?: string; // summary of the diff this iteration produced
  startedAt?: number; // epoch ms
  completedAt?: number; // epoch ms
}
/** Outcome of one headless agent CLI run. */
export interface ExecutionResult {
  exitCode: number; // process exit code
  stdout: string; // full captured stdout (fed to the evaluator)
  stderr: string; // full captured stderr
  durationMs: number; // wall-clock run time
  inputTokens: number; // token usage as reported by the agent runtime
  outputTokens: number;
  killed: boolean; // true when the process was terminated (e.g. on abort)
}
/** Runtime state of a benchmark task, including its iteration history. */
export interface Task {
  id: string; // unique task id (store key)
  slug: string; // short identifier (recorded with usage data)
  goal: string; // natural-language objective
  status: "pending" | "running" | "completed" | "failed";
  iteration: number; // number of the most recently started iteration
  maxIterations: number; // retry budget
  startedAt: number | null; // epoch ms; null until the task first runs — confirm
  completedAt?: number; // epoch ms, set on terminal status
  project: string; // target project
  evals: Record<string, Eval>; // latest per-criterion results, mirrored from the last iteration
  iterations: Iteration[]; // full history, oldest first
  pr?: { // pull request opened for this task, if any
    number: number;
    title: string;
    status: string; // PR lifecycle state — confirm expected values
  };
  spec: TaskSpec; // the user-authored spec this task was created from
}
/** Metadata describing a document in the knowledge directory. */
export interface KnowledgeDoc {
  path: string; // doc path — presumably relative to the knowledge dir; confirm
  title: string;
  verificationStatus: string; // free-form status string — confirm expected values
  lastUpdated: string;
  project: string; // project the doc belongs to
  preview: string; // short excerpt for list views
}