Automated AI tests via GitHub Actions (#19880)
* ci: ai tests * chore(ai-commands): update readme * fix: dont load .env.local in ci environment * fix: pass openai env variable into ci job * feat(ai): llm evaluated tests * chore(ai): remove unused jest snapshots
This commit is contained in:
50
.github/workflows/ai-tests.yml
vendored
Normal file
50
.github/workflows/ai-tests.yml
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
name: AI Unit Tests & Type Check

# Runs the ai-commands test suite. Note: the tests call the OpenAI API
# (see packages/ai-commands/test), so the OPENAI_KEY secret must be set.
on:
  # Only trigger when the ai-commands package itself changes.
  push:
    branches: [master]
    paths:
      - 'packages/ai-commands/**'
  pull_request:
    branches: [master]
    paths:
      - 'packages/ai-commands/**'
  # Scheduled run catches regressions caused by upstream model changes
  # even when no code changed.
  schedule:
    - cron: '15 0 * * 1' # Every Monday @ 12:15am UTC (off the hour to avoid heavy load times)

# Cancel old builds on new commit for same workflow + branch/PR
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  test:
    runs-on: ubuntu-latest

    env:
      # Consumed by test/setup.ts and test/extensions.ts (LLM-evaluated assertions).
      OPENAI_KEY: ${{ secrets.OPENAI_KEY }}

    strategy:
      matrix:
        node-version: [18.x]

    defaults:
      run:
        working-directory: ./packages/ai-commands

    steps:
      # Sparse checkout keeps the clone small; cone mode still includes
      # top-level files (e.g. the root lockfile needed by npm ci).
      - uses: actions/checkout@v4
        with:
          sparse-checkout: |
            packages
      - name: Use Node.js ${{ matrix.node-version }}
        uses: actions/setup-node@v3
        with:
          node-version: ${{ matrix.node-version }}
          cache: 'npm'
      - name: Install deps
        run: npm ci
      - name: Type check
        run: npm run typecheck
      - name: Run tests
        run: npm run test
|
||||
@@ -2,11 +2,11 @@
|
||||
|
||||
## Main purpose
|
||||
|
||||
This package contains all features involving AI and LLMs (e.g. via OpenAI API).
Each feature is implemented as a function which can be easily tested for regressions.
|
||||
|
||||
The streaming functions only work on Edge runtime so they can only be imported via a special `edge` subpath like so:
|
||||
|
||||
```ts
|
||||
import { chatRlsPolicy } from 'ai-commands/edge'
|
||||
```
|
||||
|
||||
@@ -7,6 +7,7 @@ module.exports = {
|
||||
'^.+\\.(js|jsx)$': 'babel-jest',
|
||||
},
|
||||
setupFiles: ['./test/setup.ts'],
|
||||
setupFilesAfterEnv: ['./test/extensions.ts'],
|
||||
testTimeout: 15000,
|
||||
transformIgnorePatterns: [
|
||||
'node_modules/(?!(mdast-.*|micromark|micromark-.*|unist-.*|decode-named-character-reference|character-entities)/)',
|
||||
|
||||
@@ -38,14 +38,8 @@ exports[`generate single table with specified columns 1`] = `
|
||||
);"
|
||||
`;
|
||||
|
||||
exports[`generate single table with specified columns 2`] = `"Employee Tracking Table"`;
|
||||
|
||||
exports[`rls chat select policy using table definition 1`] = `
|
||||
"create policy select_todo_policy on todos for
|
||||
select
|
||||
using (user_id = auth.uid ());"
|
||||
`;
|
||||
|
||||
exports[`title title matches content 1`] = `"Employee and Department Tables"`;
|
||||
|
||||
exports[`title title matches content 2`] = `"Tables to track employees and their respective departments"`;
|
||||
|
||||
@@ -16,7 +16,7 @@ describe('generate', () => {
|
||||
)
|
||||
|
||||
expect(formatSql(sql)).toMatchSnapshot()
|
||||
expect(title).toMatchSnapshot()
|
||||
await expect(title).toMatchCriteria('relates to employees')
|
||||
})
|
||||
})
|
||||
|
||||
@@ -94,8 +94,8 @@ describe('title', () => {
|
||||
`
|
||||
)
|
||||
|
||||
expect(title).toMatchSnapshot()
|
||||
expect(description).toMatchSnapshot()
|
||||
await expect(title).toMatchCriteria('relates to employees and departments')
|
||||
await expect(description).toMatchCriteria('describes employees and departments')
|
||||
})
|
||||
})
|
||||
|
||||
|
||||
11
packages/ai-commands/test/extensions.d.ts
vendored
Normal file
11
packages/ai-commands/test/extensions.d.ts
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
import 'expect'
|
||||
|
||||
declare module 'expect' {
|
||||
interface Matchers {
|
||||
/**
|
||||
* Check that a string matches a natural language criteria
|
||||
* describing the expected output. Uses a LLM to evaluate.
|
||||
*/
|
||||
toMatchCriteria(criteria: string): Promise<void>
|
||||
}
|
||||
}
|
||||
72
packages/ai-commands/test/extensions.ts
Normal file
72
packages/ai-commands/test/extensions.ts
Normal file
@@ -0,0 +1,72 @@
|
||||
import { expect } from '@jest/globals'
|
||||
import { codeBlock } from 'common-tags'
|
||||
import OpenAI from 'openai'
|
||||
|
||||
const openAiKey = process.env.OPENAI_KEY
|
||||
const openai = new OpenAI({ apiKey: openAiKey })
|
||||
|
||||
expect.extend({
|
||||
async toMatchCriteria(received: string, criteria: string) {
|
||||
const model = 'gpt-4-1106-preview'
|
||||
|
||||
const completionResponse = await openai.chat.completions.create({
|
||||
model,
|
||||
messages: [
|
||||
{
|
||||
role: 'system',
|
||||
content: codeBlock`
|
||||
You are a test runner. Your job is to evaluate whether 'Received' adheres to the test 'Criteria'.
|
||||
|
||||
You must output JSON, specifically an object containing a "pass" boolean and "reason" string:
|
||||
- \`{ "pass": true, "reason": "<reason>" }\` if 'Received' adheres to the test 'Criteria'
|
||||
- \`{ "pass": false, "reason": "<reason>" }\` if 'Received' does not adhere to the test 'Criteria'
|
||||
|
||||
The "reason" must explain exactly which part of 'Received' did or did not pass the test 'Criteria'.
|
||||
`,
|
||||
},
|
||||
{
|
||||
role: 'user',
|
||||
content: codeBlock`
|
||||
Received:
|
||||
${received}
|
||||
|
||||
Criteria:
|
||||
${criteria}
|
||||
`,
|
||||
},
|
||||
],
|
||||
max_tokens: 256,
|
||||
temperature: 0,
|
||||
response_format: {
|
||||
type: 'json_object',
|
||||
},
|
||||
stream: false,
|
||||
})
|
||||
|
||||
const [choice] = completionResponse.choices
|
||||
|
||||
if (!choice.message.content) {
|
||||
throw new Error('LLM evaluator returned invalid response')
|
||||
}
|
||||
|
||||
const { pass, reason }: { pass?: boolean; reason?: string } = JSON.parse(choice.message.content)
|
||||
|
||||
if (pass === undefined) {
|
||||
throw new Error('LLM evaluator returned invalid response')
|
||||
}
|
||||
|
||||
return {
|
||||
message: () =>
|
||||
codeBlock`
|
||||
${this.utils.matcherHint('toMatchCriteria', received, criteria, {
|
||||
comment: `evaluated by LLM '${model}'`,
|
||||
isNot: this.isNot,
|
||||
promise: this.promise,
|
||||
})}
|
||||
|
||||
${reason}
|
||||
`,
|
||||
pass,
|
||||
}
|
||||
},
|
||||
})
|
||||
@@ -1,8 +1,10 @@
|
||||
import { config } from 'dotenv'
|
||||
import { statSync } from 'fs'
|
||||
|
||||
// Use studio .env.local for now
|
||||
const envPath = '../../apps/studio/.env.local'
|
||||
if (!process.env.CI) {
|
||||
// Use keys from studio .env.local for local tests
|
||||
const envPath = '../../apps/studio/.env.local'
|
||||
|
||||
statSync(envPath)
|
||||
config({ path: envPath })
|
||||
statSync(envPath)
|
||||
config({ path: envPath })
|
||||
}
|
||||
|
||||
@@ -1,5 +1,16 @@
|
||||
{
  "extends": "tsconfig/react-library.json",
  "include": [
    "."
  ],
  "exclude": [
    "dist",
    "build",
    "node_modules"
  ],
  "compilerOptions": {
    "types": [
      "./test/extensions.d.ts"
    ]
  }
}
|
||||
Reference in New Issue
Block a user