Automated AI tests via GitHub Actions (#19880)
* ci: ai tests * chore(ai-commands): update readme * fix: dont load .env.local in ci environment * fix: pass openai env variable into ci job * feat(ai): llm evaluated tests * chore(ai): remove unused jest snapshots
This commit is contained in:
50
.github/workflows/ai-tests.yml
vendored
Normal file
50
.github/workflows/ai-tests.yml
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
name: AI Unit Tests & Type Check

# Runs the ai-commands test suite. Note: the tests call the OpenAI API
# (see packages/ai-commands/test), so the OPENAI_KEY secret must be set.
on:
  # Only trigger when the ai-commands package itself changes.
  push:
    branches: [master]
    paths:
      - 'packages/ai-commands/**'
  pull_request:
    branches: [master]
    paths:
      - 'packages/ai-commands/**'
  # Scheduled run catches regressions caused by upstream model changes
  # even when no code changed.
  schedule:
    - cron: '15 0 * * 1' # Every Monday @ 12:15am UTC (off the hour to avoid heavy load times)

# Cancel old builds on new commit for same workflow + branch/PR
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  test:
    runs-on: ubuntu-latest

    env:
      # Consumed by test/setup.ts and test/extensions.ts (LLM-evaluated assertions).
      OPENAI_KEY: ${{ secrets.OPENAI_KEY }}

    strategy:
      matrix:
        node-version: [18.x]

    defaults:
      run:
        working-directory: ./packages/ai-commands

    steps:
      # Sparse checkout keeps the clone small; cone mode still includes
      # top-level files (e.g. the root lockfile needed by npm ci).
      - uses: actions/checkout@v4
        with:
          sparse-checkout: |
            packages
      - name: Use Node.js ${{ matrix.node-version }}
        uses: actions/setup-node@v3
        with:
          node-version: ${{ matrix.node-version }}
          cache: 'npm'
      - name: Install deps
        run: npm ci
      - name: Type check
        run: npm run typecheck
      - name: Run tests
        run: npm run test
|
||||
@@ -2,11 +2,11 @@
|
||||
|
||||
## Main purpose
|
||||
|
||||
This package contains all features involving AI and LLMs (e.g. via OpenAI API).
Each feature is implemented as a function which can be easily tested for regressions.
|
||||
|
||||
The streaming functions only work on Edge runtime so they can only be imported via a special `edge` subpath like so:
|
||||
|
||||
```ts
|
||||
import { chatRlsPolicy } from 'ai-commands/edge'
|
||||
```
|
||||
|
||||
@@ -7,6 +7,7 @@ module.exports = {
|
||||
'^.+\\.(js|jsx)$': 'babel-jest',
|
||||
},
|
||||
setupFiles: ['./test/setup.ts'],
|
||||
setupFilesAfterEnv: ['./test/extensions.ts'],
|
||||
testTimeout: 15000,
|
||||
transformIgnorePatterns: [
|
||||
'node_modules/(?!(mdast-.*|micromark|micromark-.*|unist-.*|decode-named-character-reference|character-entities)/)',
|
||||
|
||||
@@ -38,14 +38,8 @@ exports[`generate single table with specified columns 1`] = `
|
||||
);"
|
||||
`;
|
||||
|
||||
exports[`generate single table with specified columns 2`] = `"Employee Tracking Table"`;
|
||||
|
||||
exports[`rls chat select policy using table definition 1`] = `
|
||||
"create policy select_todo_policy on todos for
|
||||
select
|
||||
using (user_id = auth.uid ());"
|
||||
`;
|
||||
|
||||
exports[`title title matches content 1`] = `"Employee and Department Tables"`;
|
||||
|
||||
exports[`title title matches content 2`] = `"Tables to track employees and their respective departments"`;
|
||||
|
||||
@@ -16,7 +16,7 @@ describe('generate', () => {
|
||||
)
|
||||
|
||||
expect(formatSql(sql)).toMatchSnapshot()
|
||||
expect(title).toMatchSnapshot()
|
||||
await expect(title).toMatchCriteria('relates to employees')
|
||||
})
|
||||
})
|
||||
|
||||
@@ -94,8 +94,8 @@ describe('title', () => {
|
||||
`
|
||||
)
|
||||
|
||||
expect(title).toMatchSnapshot()
|
||||
expect(description).toMatchSnapshot()
|
||||
await expect(title).toMatchCriteria('relates to employees and departments')
|
||||
await expect(description).toMatchCriteria('describes employees and departments')
|
||||
})
|
||||
})
|
||||
|
||||
|
||||
11
packages/ai-commands/test/extensions.d.ts
vendored
Normal file
11
packages/ai-commands/test/extensions.d.ts
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
import 'expect'
|
||||
|
||||
declare module 'expect' {
|
||||
interface Matchers {
|
||||
/**
|
||||
* Check that a string matches a natural language criteria
|
||||
* describing the expected output. Uses a LLM to evaluate.
|
||||
*/
|
||||
toMatchCriteria(criteria: string): Promise<void>
|
||||
}
|
||||
}
|
||||
72
packages/ai-commands/test/extensions.ts
Normal file
72
packages/ai-commands/test/extensions.ts
Normal file
@@ -0,0 +1,72 @@
|
||||
import { expect } from '@jest/globals'
|
||||
import { codeBlock } from 'common-tags'
|
||||
import OpenAI from 'openai'
|
||||
|
||||
const openAiKey = process.env.OPENAI_KEY
|
||||
const openai = new OpenAI({ apiKey: openAiKey })
|
||||
|
||||
expect.extend({
|
||||
async toMatchCriteria(received: string, criteria: string) {
|
||||
const model = 'gpt-4-1106-preview'
|
||||
|
||||
const completionResponse = await openai.chat.completions.create({
|
||||
model,
|
||||
messages: [
|
||||
{
|
||||
role: 'system',
|
||||
content: codeBlock`
|
||||
You are a test runner. Your job is to evaluate whether 'Received' adheres to the test 'Criteria'.
|
||||
|
||||
You must output JSON, specifically an object containing a "pass" boolean and "reason" string:
|
||||
- \`{ "pass": true, "reason": "<reason>" }\` if 'Received' adheres to the test 'Criteria'
|
||||
- \`{ "pass": false, "reason": "<reason>" }\` if 'Received' does not adhere to the test 'Criteria'
|
||||
|
||||
The "reason" must explain exactly which part of 'Received' did or did not pass the test 'Criteria'.
|
||||
`,
|
||||
},
|
||||
{
|
||||
role: 'user',
|
||||
content: codeBlock`
|
||||
Received:
|
||||
${received}
|
||||
|
||||
Criteria:
|
||||
${criteria}
|
||||
`,
|
||||
},
|
||||
],
|
||||
max_tokens: 256,
|
||||
temperature: 0,
|
||||
response_format: {
|
||||
type: 'json_object',
|
||||
},
|
||||
stream: false,
|
||||
})
|
||||
|
||||
const [choice] = completionResponse.choices
|
||||
|
||||
if (!choice.message.content) {
|
||||
throw new Error('LLM evaluator returned invalid response')
|
||||
}
|
||||
|
||||
const { pass, reason }: { pass?: boolean; reason?: string } = JSON.parse(choice.message.content)
|
||||
|
||||
if (pass === undefined) {
|
||||
throw new Error('LLM evaluator returned invalid response')
|
||||
}
|
||||
|
||||
return {
|
||||
message: () =>
|
||||
codeBlock`
|
||||
${this.utils.matcherHint('toMatchCriteria', received, criteria, {
|
||||
comment: `evaluated by LLM '${model}'`,
|
||||
isNot: this.isNot,
|
||||
promise: this.promise,
|
||||
})}
|
||||
|
||||
${reason}
|
||||
`,
|
||||
pass,
|
||||
}
|
||||
},
|
||||
})
|
||||
@@ -1,8 +1,10 @@
|
||||
import { config } from 'dotenv'
|
||||
import { statSync } from 'fs'
|
||||
|
||||
// Use studio .env.local for now
|
||||
const envPath = '../../apps/studio/.env.local'
|
||||
if (!process.env.CI) {
|
||||
// Use keys from studio .env.local for local tests
|
||||
const envPath = '../../apps/studio/.env.local'
|
||||
|
||||
statSync(envPath)
|
||||
config({ path: envPath })
|
||||
statSync(envPath)
|
||||
config({ path: envPath })
|
||||
}
|
||||
|
||||
@@ -1,5 +1,16 @@
|
||||
{
  "extends": "tsconfig/react-library.json",
  "include": [
    "."
  ],
  "exclude": [
    "dist",
    "build",
    "node_modules"
  ],
  "compilerOptions": {
    "types": [
      "./test/extensions.d.ts"
    ]
  }
}
|
||||
Reference in New Issue
Block a user