// AgentTests — capability test runner panel. // // Calls POST /api/agents/test to run the test suite (backend/agent_tests.py) // against the live editor agent. Each test fires a fixed question, validates // the response satisfies a contract (right tool called, required keywords // present, citation discipline, length budget), and reports pass/fail with // an approximate token cost. Use as the SET STANDARD any token-efficiency // change must preserve. const { useState: atUseState, useEffect: atUseEffect, useCallback: atUseCallback } = React; function AgentTests({ open, onClose }) { if (!open) return null; const [manifest, setManifest] = atUseState([]); const [running, setRunning] = atUseState(false); const [report, setReport] = atUseState(null); const [err, setErr] = atUseState(''); const [selected, setSelected] = atUseState(new Set()); // Load manifest on open atUseEffect(() => { fetch('/api/agents/test/manifest') .then(r => r.ok ? r.json() : null) .then(j => setManifest(j?.tests || [])) .catch(() => {}); }, []); const run = atUseCallback(async (idsArg = null) => { if (running) return; setRunning(true); setReport(null); setErr(''); try { const body = {}; if (idsArg && idsArg.length > 0) body.ids = idsArg; const r = await fetch('/api/agents/test', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(body), }); if (!r.ok) { throw new Error(`HTTP ${r.status}: ${(await r.text()).slice(0, 100)}`); } setReport(await r.json()); } catch (e) { setErr(String(e.message || e)); } finally { setRunning(false); } }, [running]); const toggle = (id) => { setSelected(s => { const n = new Set(s); n.has(id) ? n.delete(id) : n.add(id); return n; }); }; return (
e.stopPropagation()} style={{ width: '100%', maxWidth: 880, maxHeight: 'calc(100vh - 80px)', overflowY: 'auto', background: 'var(--bg)', border: '1px solid var(--line-2)', borderRadius: 6, padding: 20, fontFamily: 'var(--font-body)', }}>

Agent capability tests

brain + API + recall + citation discipline · {manifest.length} tests
{err && (
{err}
)} {/* Run controls */}
{report && ( = 0.7 ? '#d9a85f' : '#d96a6a', }}> {report.passed}/{report.n} pass · {(report.pass_rate * 100).toFixed(0)}% · {report.duration_s}s · ~{report.approx_tokens_input + report.approx_tokens_output} tokens )}
{/* Manifest with results merged in */} {manifest.map(t => { const r = report?.results?.find(x => x.id === t.id); const isSel = selected.has(t.id); const status = r ? (r.passed ? 'pass' : 'fail') : 'idle'; const accent = status === 'pass' ? '#5fb37c' : status === 'fail' ? '#d96a6a' : 'var(--line)'; return (
toggle(t.id)} style={{ marginRight: 4, cursor: 'pointer' }} /> {t.id} {t.tag} {r && ( <> {r.tool_pass ? '✓' : '✗'} tool {r.content_pass ? '✓' : '✗'} content {t.must_cite_path && ( {r.citation_pass ? '✓' : '✗'} cite )} {r.duration_s}s · {r.tools_called.join(',') || '—'} )}
{t.question}
{t.must_call_tool && (
must call: {t.must_call_tool} {t.must_contain.length > 0 && ` · must contain: ${t.must_contain.join(', ')}`} {t.must_cite_path && ' · must cite path'}
)} {r && r.notes.length > 0 && (
{r.notes.map((n, i) =>
⚠ {n}
)}
)} {r && r.response_preview && (
{r.response_preview}
)}
); })}
); } window.AgentTests = AgentTests;