feat: alternate search index for nimbus (#38662)

* feat: alternate search index for nimbus

Create an alternate search index for Nimbus that filters out
feature-flagged pages (equivalent to setting all feature flags to
false).

Notes:
- Creates two new DB tables, `page_nimbus` and `page_section_nimbus`,
  which are filtered versions of `page` and `page_section`
- Makes `nimbus` versions of all the DB search functions
- Refactors the embedding upload script to make it faster (it was
  painfully slow during testing), add retries, and produce better
  summary logs
- When run with the environment variable
  ENABLED_FEATURES_OVERRIDE_DISABLE_ALL set to true, the upload script
  produces and uploads the alternate search index
- Changes all the search calls in the frontend/API to check
  `isFeatureEnabled('search:fullIndex')` to determine whether to search
  the full or the alternate index (see the sketch after this list)
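
Roughly, every search entry point now follows the same routing pattern. Below is a minimal TypeScript sketch, assuming the `isFeatureEnabled` helper and `supabase()` client from this repo; the `searchDocs` wrapper and its arguments are illustrative, not an exact call site:

```ts
import { isFeatureEnabled } from 'common/enabled-features'
import { supabase } from '~/lib/supabase'

type SearchFunction = 'search_content' | 'search_content_nimbus'

// Illustrative wrapper: picks the index once per request based on the flag.
async function searchDocs(embedding: number[], limit?: number) {
  // When 'search:fullIndex' is off, fall back to the filtered (nimbus) index.
  const useAltSearchIndex = !isFeatureEnabled('search:fullIndex')
  const searchFunction: SearchFunction = useAltSearchIndex
    ? 'search_content_nimbus'
    : 'search_content'

  return supabase().rpc(searchFunction, {
    embedding,
    include_full_content: false,
    max_result: limit,
  })
}
```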

* ci: produce nimbus search indexes on merge

* fix: turn full search index on
Authored by Charis on 2025-09-16 12:37:53 -04:00, committed by GitHub
parent 9907fb25f4
commit 8cd5e10038
17 changed files with 1256 additions and 229 deletions

View File

@@ -68,8 +68,12 @@ jobs:
run: |
pnpm run codegen:examples
pnpm run embeddings
pnpm run embeddings:nimbus
- name: Refresh embeddings
working-directory: ./apps/docs
if: ${{ inputs.refresh }}
run: pnpm run embeddings:refresh
run: |
pnpm run codegen:examples
pnpm run embeddings:refresh
pnpm run embeddings:nimbus:refresh

View File

@@ -3,6 +3,8 @@ import { ApplicationError, UserError, clippy } from 'ai-commands/edge'
import { NextRequest, NextResponse } from 'next/server'
import OpenAI from 'openai'
import { isFeatureEnabled } from 'common/enabled-features'
export const runtime = 'edge'
/* To avoid OpenAI errors, restrict to the Vercel Edge Function regions that
overlap with the OpenAI API regions.
@@ -54,7 +56,10 @@ export async function POST(req: NextRequest) {
throw new UserError('Missing messages in request data')
}
const response = await clippy(openai, supabaseClient, messages)
const useAltSearchIndex = !isFeatureEnabled('search:fullIndex')
const response = await clippy(openai, supabaseClient, messages, {
useAltSearchIndex,
})
// Proxy the streamed SSE response from OpenAI
return new NextResponse(response.body, {

View File

@@ -17,7 +17,9 @@
"dev:secrets:pull": "AWS_PROFILE=supa-dev node ../../scripts/getSecrets.js -n local/docs",
"dev:watch:troubleshooting": "node ./scripts/troubleshooting/watch.mjs",
"embeddings": "tsx --conditions=react-server scripts/search/generate-embeddings.ts",
"embeddings:nimbus": "ENABLED_FEATURES_OVERRIDE_DISABLE_ALL=true pnpm run embeddings",
"embeddings:refresh": "pnpm run embeddings --refresh",
"embeddings:nimbus:refresh": "ENABLED_FEATURES_OVERRIDE_DISABLE_ALL=true pnpm run embeddings:refresh",
"last-changed": "tsx scripts/last-changed.ts",
"last-changed:reset": "pnpm run last-changed -- --reset",
"lint": "next lint",

View File

@@ -3,6 +3,8 @@ import { convertPostgrestToApiError, type ApiErrorGeneric } from '~/app/api/util
import { Result } from '~/features/helpers.fn'
import { openAI } from '~/lib/openAi'
import { supabase, type DatabaseCorrected } from '~/lib/supabase'
import { isFeatureEnabled } from 'common/enabled-features'
import { GuideModel } from '../guide/guideModel'
import {
DB_METADATA_TAG_PLATFORM_CLI,
@@ -13,6 +15,9 @@ import { ReferenceSDKFunctionModel, SDKLanguageValues } from '../reference/refer
import { TroubleshootingModel } from '../troubleshooting/troubleshootingModel'
import { SearchResultInterface } from './globalSearchInterface'
type SearchFunction = 'search_content' | 'search_content_nimbus'
type SearchHybridFunction = 'search_content_hybrid' | 'search_content_hybrid_nimbus'
export abstract class SearchResultModel {
static async search(
args: RootQueryTypeSearchDocsArgs,
@@ -22,9 +27,14 @@ export abstract class SearchResultModel {
const includeFullContent = requestedFields.includes('content')
const embeddingResult = await openAI().createContentEmbedding(query)
const useAltSearchIndex = !isFeatureEnabled('search:fullIndex')
const searchFunction: SearchFunction = useAltSearchIndex
? 'search_content_nimbus'
: 'search_content'
return embeddingResult.flatMapAsync(async ({ embedding }) => {
const matchResult = new Result(
await supabase().rpc('search_content', {
await supabase().rpc(searchFunction, {
embedding,
include_full_content: includeFullContent,
max_result: args.limit ?? undefined,
@@ -49,9 +59,14 @@ export abstract class SearchResultModel {
const includeFullContent = requestedFields.includes('content')
const embeddingResult = await openAI().createContentEmbedding(query)
const useAltSearchIndex = !isFeatureEnabled('search:fullIndex')
const searchFunction: SearchHybridFunction = useAltSearchIndex
? 'search_content_hybrid_nimbus'
: 'search_content_hybrid'
return embeddingResult.flatMapAsync(async ({ embedding }) => {
const matchResult = new Result(
await supabase().rpc('search_content_hybrid', {
await supabase().rpc(searchFunction, {
query_text: query,
query_embedding: embedding,
include_full_content: includeFullContent,

View File

@@ -0,0 +1,106 @@
export interface PageInfo {
pageId: number
path: string
checksum: string
sectionsCount: number
}
export interface PageSectionForEmbedding {
pageId: number
path: string
slug?: string
heading?: string
content: string
input: string
ragIgnore: boolean
}
export interface PageSectionWithEmbedding extends PageSectionForEmbedding {
embedding: number[]
}
export interface ProcessingResult {
successfulPages: Set<number>
failedPages: Set<number>
totalSectionsProcessed: number
totalSectionsInserted: number
}
export function createBatches<T>(array: T[], batchSize: number): T[][] {
const batches: T[][] = []
for (let i = 0; i < array.length; i += batchSize) {
batches.push(array.slice(i, i + batchSize))
}
return batches
}
export function mapEmbeddingsToSections(
batch: PageSectionForEmbedding[],
data: Array<{ embedding?: number[] }>,
batchNumber: number
): {
sectionsWithEmbeddings: PageSectionWithEmbedding[]
failedSectionIndexes: Set<number>
} {
const sectionsWithEmbeddings: PageSectionWithEmbedding[] = []
const failedSectionIndexes: Set<number> = new Set()
if (batch.length !== data.length) {
console.error(
`Ignoring all embeddings returned from batch ${batchNumber} because returned number doesn't match input number`
)
batch.forEach((_, index) => {
failedSectionIndexes.add(index)
})
// Return early so we never index past the end of the shorter array below
return { sectionsWithEmbeddings, failedSectionIndexes }
}
for (let i = 0; i < batch.length; i++) {
if (data[i].embedding) {
sectionsWithEmbeddings.push({ ...batch[i], embedding: data[i].embedding! })
} else {
failedSectionIndexes.add(i)
}
}
return { sectionsWithEmbeddings, failedSectionIndexes }
}
export function updatePageInsertionCounts(
pageSectionsInserted: Map<number, number>,
sectionsWithEmbeddings: PageSectionWithEmbedding[]
) {
sectionsWithEmbeddings.forEach((section) => {
const current = pageSectionsInserted.get(section.pageId) || 0
pageSectionsInserted.set(section.pageId, current + 1)
})
}
export function computePageResults(
pageInfoMap: Map<number, PageInfo>,
pageSectionsInserted: Map<number, number>,
result: ProcessingResult
) {
for (const [pageId, pageInfo] of pageInfoMap) {
const insertedCount = pageSectionsInserted.get(pageId) || 0
if (insertedCount === pageInfo.sectionsCount && !result.failedPages.has(pageId)) {
result.successfulPages.add(pageId)
} else {
result.failedPages.add(pageId)
console.warn(
`Page ${pageInfo.path}: inserted ${insertedCount}/${pageInfo.sectionsCount} sections`
)
}
}
}
export function logFailedSections(
batch: PageSectionForEmbedding[],
inputs: string[],
failedSectionIndexes: Set<number>
) {
failedSectionIndexes.forEach((i) => {
console.error(
`Failed to process section: ${batch[i].path}#${batch[i].slug} (content: "${inputs[i]?.slice(0, 50)}...")`
)
})
}

View File

@@ -1,24 +1,453 @@
import '../utils/dotenv.js'
import { createClient } from '@supabase/supabase-js'
import { createClient, type SupabaseClient } from '@supabase/supabase-js'
import { parseArgs } from 'node:util'
import { OpenAI } from 'openai'
import { v4 as uuidv4 } from 'uuid'
import type { Section } from '../helpers.mdx.js'
import {
type PageInfo,
type PageSectionForEmbedding,
type PageSectionWithEmbedding,
type ProcessingResult,
createBatches,
mapEmbeddingsToSections,
updatePageInsertionCounts,
computePageResults,
logFailedSections,
} from './embeddings/utils.js'
import { fetchAllSources } from './sources/index.js'
const CONFIG = {
// OpenAI settings
EMBEDDING_MODEL: 'text-embedding-ada-002' as const,
EMBEDDING_DIMENSION: 1536, // Keep in sync with EMBEDDING_MODEL
OPENAI_BATCH_SIZE: 128,
OPENAI_MAX_RETRIES: 3,
OPENAI_BASE_DELAY_MS: 500,
/**
* If context length is exceeded, truncate inputs over this character length
* and retry. This is a character-based heuristic, not token-exact.
*/
EMBEDDING_TRUNCATE_CHAR_LIMIT: 16_000,
// Supabase settings
SUPABASE_MAX_RETRIES: 2,
SUPABASE_BASE_DELAY_MS: 100,
// Processing settings
SOURCE_CONCURRENCY: 10,
} as const
function delay(ms: number): Promise<void> {
return new Promise((resolve) => setTimeout(resolve, ms))
}
function exponentialBackoff(attempt: number, baseDelay: number, maxDelay: number = 30_000): number {
const exponentialDelay = baseDelay * Math.pow(2, attempt)
const jitter = (Math.random() - 0.5) * 0.1 * exponentialDelay
return Math.min(Math.max(0, exponentialDelay + jitter), maxDelay)
}
async function withRetry<T>(
operation: () => Promise<T>,
maxRetries: number,
baseDelay: number,
operationName: string,
shouldRetryOnError: (error: unknown) => boolean = () => true
): Promise<T> {
let lastError: Error
for (let attempt = 0; attempt <= maxRetries; attempt++) {
try {
return await operation()
} catch (error) {
lastError = error as Error
// Allow caller to prevent redundant retries for specific errors
if (!shouldRetryOnError?.(error)) {
console.warn(`${operationName} encountered non-retryable error:`, lastError.message)
throw lastError
}
if (attempt === maxRetries) {
console.error(`${operationName} failed after ${maxRetries + 1} attempts:`, lastError)
throw lastError
}
const delayMs = exponentialBackoff(attempt, baseDelay)
console.warn(
`${operationName} attempt ${attempt + 1} failed, retrying in ${delayMs}ms:`,
lastError.message
)
await delay(delayMs)
}
}
throw lastError!
}
function isNimbusMode(): boolean {
return process.env.ENABLED_FEATURES_OVERRIDE_DISABLE_ALL === 'true'
}
function getPageTables() {
const nimbus = isNimbusMode()
return {
pageTable: nimbus ? 'page_nimbus' : 'page',
pageSectionTable: nimbus ? 'page_section_nimbus' : 'page_section',
} as const
}
function requireEnvOrThrow(names: string[]): void {
const missing = names.filter((n) => !process.env[n])
if (missing.length) {
throw new Error(
`Environment variables ${missing.join(', ')} are required: skipping embeddings generation`
)
}
}
function initSupabase(): SupabaseClient {
return createClient(process.env.NEXT_PUBLIC_SUPABASE_URL!, process.env.SUPABASE_SECRET_KEY!, {
auth: { persistSession: false, autoRefreshToken: false },
})
}
type PreparedSections = {
allSectionsToProcess: PageSectionForEmbedding[]
pageInfoMap: Map<number, PageInfo>
}
async function prepareSections(
supabaseClient: SupabaseClient,
pageTable: string,
pageSectionTable: string,
shouldRefresh: boolean,
refreshVersion: string,
refreshDate: Date,
fullIndex = true,
debug = false
): Promise<PreparedSections> {
const embeddingSources = await fetchAllSources(fullIndex)
console.log(`Discovered ${embeddingSources.length} sources`)
const allSectionsToProcess: PageSectionForEmbedding[] = []
const pageInfoMap = new Map<number, PageInfo>()
for (const sourceBatch of createBatches(embeddingSources, CONFIG.SOURCE_CONCURRENCY)) {
await Promise.all(
sourceBatch.map(async (embeddingSource) => {
const { type, source, path } = embeddingSource
try {
const {
checksum,
sections,
meta = {},
ragIgnore = false,
}: {
checksum: string
sections: Section[]
ragIgnore?: boolean
meta?: Record<string, unknown>
} = await embeddingSource.process()
const { error: fetchPageError, data: existingPage } = await supabaseClient
.from(pageTable)
.select('id, path, checksum')
.filter('path', 'eq', path)
.limit(1)
.maybeSingle()
if (fetchPageError) throw fetchPageError
if (!shouldRefresh && existingPage?.checksum === checksum) {
const { error: updatePageError } = await supabaseClient
.from(pageTable)
.update({
type,
source,
meta,
version: refreshVersion,
last_refresh: refreshDate,
})
.filter('id', 'eq', existingPage.id)
if (updatePageError) throw updatePageError
return
}
if (existingPage) {
if (debug) {
console.log(
!shouldRefresh
? `[${path}] Docs have changed, removing old page sections and their embeddings`
: `[${path}] Refresh flag set, removing old page sections and their embeddings`
)
}
const { error: deletePageSectionError } = await supabaseClient
.from(pageSectionTable)
.delete()
.filter('page_id', 'eq', existingPage.id)
if (deletePageSectionError) throw deletePageSectionError
}
const { error: upsertPageError, data: page } = await supabaseClient
.from(pageTable)
.upsert(
{
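// Intentionally clear checksum until all page sections are generated successfully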
checksum: null,
path,
type,
source,
meta,
content: embeddingSource.extractIndexedContent(),
version: refreshVersion,
last_refresh: refreshDate,
},
{ onConflict: 'path' }
)
.select()
.limit(1)
.single()
if (upsertPageError) throw upsertPageError
if (debug) {
console.log(`[${path}] Preparing ${sections.length} page sections for processing`)
}
pageInfoMap.set(page.id, {
pageId: page.id,
path,
checksum,
sectionsCount: sections.length,
})
const sectionsForBatching = sections.map(({ slug, heading, content }) => ({
pageId: page.id,
path,
slug,
heading,
content,
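// OpenAI recommends replacing newlines with spaces for best embedding results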
input: content.replace(/\n/g, ' '),
ragIgnore,
}))
allSectionsToProcess.push(...sectionsForBatching)
} catch (err) {
console.error(`Error preparing path '${path}' for processing.`)
console.error(err)
}
})
)
}
console.log(
`Prepared ${allSectionsToProcess.length} sections for processing from ${pageInfoMap.size} pages`
)
return { allSectionsToProcess, pageInfoMap }
}
async function processAndInsertEmbeddings(
openai: OpenAI,
supabaseClient: SupabaseClient,
pageSectionTable: string,
allSections: PageSectionForEmbedding[],
pageInfoMap: Map<number, PageInfo>
): Promise<ProcessingResult> {
if (allSections.length === 0) {
return {
successfulPages: new Set(),
failedPages: new Set(),
totalSectionsProcessed: 0,
totalSectionsInserted: 0,
}
}
console.log(`Processing ${allSections.length} sections with embeddings + insertion`)
const embeddingBatches = createBatches(allSections, CONFIG.OPENAI_BATCH_SIZE)
const result: ProcessingResult = {
successfulPages: new Set(),
failedPages: new Set(),
totalSectionsProcessed: 0,
totalSectionsInserted: 0,
}
// Track sections inserted per page
const pageSectionsInserted = new Map<number, number>()
for (let batchIndex = 0; batchIndex < embeddingBatches.length; batchIndex++) {
const batch = embeddingBatches[batchIndex]
try {
const batchResult = await processEmbeddingBatch(
openai,
batch,
batchIndex,
embeddingBatches.length
)
result.totalSectionsProcessed += batchResult.processedCount
if (batchResult.sectionsWithEmbeddings.length > 0) {
const insertedCount = await insertSectionBatch(
supabaseClient,
pageSectionTable,
batchResult.sectionsWithEmbeddings
)
result.totalSectionsInserted += insertedCount
updatePageInsertionCounts(pageSectionsInserted, batchResult.sectionsWithEmbeddings)
}
// Mark failed section pages
batchResult.failedSectionIndexes.forEach((i) => {
result.failedPages.add(batch[i].pageId)
})
} catch (error) {
console.error(`Batch ${batchIndex + 1} completely failed:`, error)
batch.forEach((section) => result.failedPages.add(section.pageId))
}
if (batchIndex < embeddingBatches.length - 1) {
await delay(CONFIG.OPENAI_BASE_DELAY_MS)
}
}
computePageResults(pageInfoMap, pageSectionsInserted, result)
return result
}
type BatchEmbeddingResult = {
sectionsWithEmbeddings: PageSectionWithEmbedding[]
failedSectionIndexes: Set<number>
processedCount: number
}
async function processEmbeddingBatch(
openai: OpenAI,
batch: PageSectionForEmbedding[],
batchIndex: number,
totalBatches: number
): Promise<BatchEmbeddingResult> {
const inputs = batch.map((section) => section.input)
console.log(
`Processing embedding batch ${batchIndex + 1}/${totalBatches} (${inputs.length} sections)`
)
// Helper to identify context length exceeded errors from OpenAI
const isContextLengthError = (err: unknown) => {
if (!(err instanceof OpenAI.APIError)) return false
const message = (err.error?.message as string | undefined) ?? ''
const status = err.status
return status === 400 && message.toLowerCase().includes('context')
}
let embeddingResponse: OpenAI.Embeddings.CreateEmbeddingResponse
try {
embeddingResponse = await withRetry(
() =>
openai.embeddings.create({
model: CONFIG.EMBEDDING_MODEL,
input: inputs,
}),
CONFIG.OPENAI_MAX_RETRIES,
CONFIG.OPENAI_BASE_DELAY_MS,
`OpenAI embedding batch ${batchIndex + 1}`,
(err) => !isContextLengthError(err)
)
} catch (err) {
if (!isContextLengthError(err)) {
throw err
}
// Context length exceeded: truncate problematic sections and try once more
const limit = CONFIG.EMBEDDING_TRUNCATE_CHAR_LIMIT
const truncatedInputs = inputs.map((s) => (s.length > limit ? s.slice(0, limit) : s))
const truncatedCount = truncatedInputs.filter((s, i) => s !== inputs[i]).length
console.warn(
`OpenAI embedding batch ${batchIndex + 1}: context length exceeded. ` +
`Truncating ${truncatedCount} overly long section(s) to ${limit} chars and retrying once.`
)
embeddingResponse = await openai.embeddings.create({
model: CONFIG.EMBEDDING_MODEL,
input: truncatedInputs,
})
// Replace inputs with truncated inputs for downstream bookkeeping
for (let i = 0; i < inputs.length; i++) inputs[i] = truncatedInputs[i]
}
const { sectionsWithEmbeddings, failedSectionIndexes } = mapEmbeddingsToSections(
batch,
embeddingResponse.data,
batchIndex
)
logFailedSections(batch, inputs, failedSectionIndexes)
return {
sectionsWithEmbeddings,
failedSectionIndexes,
processedCount: inputs.length,
}
}
async function insertSectionBatch(
supabaseClient: SupabaseClient,
pageSectionTable: string,
sectionsWithEmbeddings: PageSectionWithEmbedding[]
): Promise<number> {
if (sectionsWithEmbeddings.length === 0) {
return 0
}
const pageSectionsToInsert = sectionsWithEmbeddings.map((section) => ({
page_id: section.pageId,
slug: section.slug,
heading: section.heading,
content: section.content,
embedding: section.embedding,
rag_ignore: section.ragIgnore,
}))
await withRetry(
async () => {
const { error } = await supabaseClient.from(pageSectionTable).insert(pageSectionsToInsert)
if (error) {
throw new Error(`Supabase insert error: ${error.message}`)
}
},
CONFIG.SUPABASE_MAX_RETRIES,
CONFIG.SUPABASE_BASE_DELAY_MS,
`Insert batch of ${sectionsWithEmbeddings.length} sections`
)
return sectionsWithEmbeddings.length
}
const args = parseArgs({
options: {
refresh: {
type: 'boolean',
},
debug: {
type: 'boolean',
},
},
})
async function generateEmbeddings() {
const shouldRefresh = Boolean(args.values.refresh)
const debug = Boolean(args.values.debug)
const requiredEnvVars = [
const nimbus = isNimbusMode()
if (nimbus) {
console.log('Running in Nimbus mode - will filter content based on disabled feature flags')
}
requireEnvOrThrow([
'DOCS_GITHUB_APP_ID',
'DOCS_GITHUB_APP_INSTALLATION_ID',
'DOCS_GITHUB_APP_PRIVATE_KEY',
@@ -27,217 +456,136 @@ async function generateEmbeddings() {
'NEXT_PUBLIC_SUPABASE_URL',
'OPENAI_API_KEY',
'SUPABASE_SECRET_KEY',
]
])
const missingEnvVars = requiredEnvVars.filter((name) => !process.env[name])
if (missingEnvVars.length > 0) {
throw new Error(
`Environment variables ${missingEnvVars.join(
', '
)} are required: skipping embeddings generation`
)
}
const supabaseClient = initSupabase()
const supabaseClient = createClient(
process.env.NEXT_PUBLIC_SUPABASE_URL!,
process.env.SUPABASE_SECRET_KEY!,
{
auth: {
persistSession: false,
autoRefreshToken: false,
},
}
)
// Use this version to track which pages to purge
// after the refresh
const refreshVersion = uuidv4()
const refreshDate = new Date()
const embeddingSources = await fetchAllSources()
const { pageTable, pageSectionTable } = getPageTables()
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY })
console.log(`Discovered ${embeddingSources.length} pages`)
console.log(
shouldRefresh
? 'Refresh flag set, re-generating all pages'
: 'Checking which pages are new or have changed'
)
if (!shouldRefresh) {
console.log('Checking which pages are new or have changed')
} else {
console.log('Refresh flag set, re-generating all pages')
const { allSectionsToProcess, pageInfoMap } = await prepareSections(
supabaseClient,
pageTable,
pageSectionTable,
shouldRefresh,
refreshVersion,
refreshDate,
!nimbus,
debug
)
let processingResult: ProcessingResult
try {
processingResult = await processAndInsertEmbeddings(
openai,
supabaseClient,
pageSectionTable,
allSectionsToProcess,
pageInfoMap
)
console.log(
`Processing complete: ${processingResult.totalSectionsInserted}/${processingResult.totalSectionsProcessed} sections inserted successfully`
)
console.log(
`Page summary: ${processingResult.successfulPages.size} successful, ${processingResult.failedPages.size} failed`
)
} catch (error) {
console.error('Critical error during embedding processing:', error)
console.log('Exiting due to complete processing failure')
return
}
for (const embeddingSource of embeddingSources) {
const { type, source, path } = embeddingSource
console.log(`\nUpdating checksums for ${processingResult.successfulPages.size} successful pages`)
const successfulChecksumUpdates = await updateSuccessfulChecksums(
supabaseClient,
pageTable,
pageInfoMap,
processingResult
)
console.log(
`Successfully updated checksums for ${successfulChecksumUpdates}/${processingResult.successfulPages.size} successful pages`
)
try {
const {
checksum,
sections,
meta = {},
ragIgnore = false,
}: {
checksum: string
sections: Section[]
ragIgnore?: boolean
meta?: Record<string, unknown>
} = await embeddingSource.process()
logFailedPages(pageInfoMap, processingResult)
// Check for existing page in DB and compare checksums
const { error: fetchPageError, data: existingPage } = await supabaseClient
.from('page')
.select('id, path, checksum')
.filter('path', 'eq', path)
.limit(1)
.maybeSingle()
if (fetchPageError) {
throw fetchPageError
}
// We use checksum to determine if this page & its sections need to be regenerated
if (!shouldRefresh && existingPage?.checksum === checksum) {
// No content/embedding update required on this page
// Update other meta info
const { error: updatePageError } = await supabaseClient
.from('page')
.update({
type,
source,
meta,
version: refreshVersion,
last_refresh: refreshDate,
})
.filter('id', 'eq', existingPage.id)
if (updatePageError) {
throw updatePageError
}
continue
}
if (existingPage) {
if (!shouldRefresh) {
console.log(
`[${path}] Docs have changed, removing old page sections and their embeddings`
)
} else {
console.log(`[${path}] Refresh flag set, removing old page sections and their embeddings`)
}
const { error: deletePageSectionError } = await supabaseClient
.from('page_section')
.delete()
.filter('page_id', 'eq', existingPage.id)
if (deletePageSectionError) {
throw deletePageSectionError
}
}
// Create/update page record. Intentionally clear checksum until we
// have successfully generated all page sections.
const { error: upsertPageError, data: page } = await supabaseClient
.from('page')
.upsert(
{
checksum: null,
path,
type,
source,
meta,
content: embeddingSource.extractIndexedContent(),
version: refreshVersion,
last_refresh: refreshDate,
},
{ onConflict: 'path' }
)
.select()
.limit(1)
.single()
if (upsertPageError) {
throw upsertPageError
}
console.log(`[${path}] Adding ${sections.length} page sections (with embeddings)`)
for (const { slug, heading, content } of sections) {
// OpenAI recommends replacing newlines with spaces for best results (specific to embeddings)
// force a redeploy
const input = content.replace(/\n/g, ' ')
try {
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY })
const embeddingResponse = await openai.embeddings.create({
model: 'text-embedding-ada-002',
input,
})
const [responseData] = embeddingResponse.data
const { error: insertPageSectionError } = await supabaseClient
.from('page_section')
.insert({
page_id: page.id,
slug,
heading,
content,
token_count: embeddingResponse.usage.total_tokens,
embedding: responseData.embedding,
rag_ignore: ragIgnore,
})
.select()
.limit(1)
.single()
if (insertPageSectionError) {
throw insertPageSectionError
}
} catch (err) {
// TODO: decide how to better handle failed embeddings
console.error(
`Failed to generate embeddings for '${path}' page section starting with '${input.slice(
0,
40
)}...'`
)
throw err
}
}
// Set page checksum so that we know this page was stored successfully
const { error: updatePageError } = await supabaseClient
.from('page')
.update({ checksum })
.filter('id', 'eq', page.id)
if (updatePageError) {
throw updatePageError
}
} catch (err) {
console.error(
`Page '${path}' or one/multiple of its page sections failed to store properly. Page has been marked with null checksum to indicate that it needs to be re-generated.`
)
console.error(err)
}
}
console.log(`Removing old pages and their sections`)
// Delete pages that have been removed (and their sections via cascade)
const { error: deletePageError } = await supabaseClient
.from('page')
.delete()
.filter('version', 'neq', refreshVersion)
if (deletePageError) {
throw deletePageError
}
await purgeOldPages(supabaseClient, pageTable, refreshVersion)
console.log('Embedding generation complete')
}
async function updateSuccessfulChecksums(
supabaseClient: SupabaseClient,
pageTable: string,
pageInfoMap: Map<number, PageInfo>,
processingResult: ProcessingResult
): Promise<number> {
let successfulChecksumUpdates = 0
const pageIds = Array.from(processingResult.successfulPages)
const batches = createBatches(pageIds, CONFIG.SOURCE_CONCURRENCY)
for (const batch of batches) {
const results = await Promise.all(
batch.map(async (pageId) => {
const pageInfo = pageInfoMap.get(pageId)
if (!pageInfo) {
console.error(`Missing page info for pageId ${pageId}`)
return 0
}
try {
const { error: updatePageError } = await supabaseClient
.from(pageTable)
.update({ checksum: pageInfo.checksum })
.eq('id', pageId)
if (updatePageError) {
console.error(`Failed to update checksum for page ${pageInfo.path}:`, updatePageError)
return 0
}
return 1
} catch (error) {
console.error(`Error updating checksum for page ${pageInfo.path}:`, error)
return 0
}
})
)
successfulChecksumUpdates += results.reduce((sum, x) => sum + x, 0)
}
return successfulChecksumUpdates
}
function logFailedPages(pageInfoMap: Map<number, PageInfo>, processingResult: ProcessingResult) {
if (processingResult.failedPages.size === 0) return
console.log(`\nFailed pages:`)
for (const pageId of processingResult.failedPages) {
const pageInfo = pageInfoMap.get(pageId)
if (pageInfo) console.log(` - ${pageInfo.path}`)
}
}
async function purgeOldPages(
supabaseClient: SupabaseClient,
pageTable: string,
refreshVersion: string
) {
console.log(`Removing old pages and their sections`)
const { error: deletePageError } = await supabaseClient
.from(pageTable)
.delete()
.filter('version', 'neq', refreshVersion)
if (deletePageError) throw deletePageError
}
async function main() {
await generateEmbeddings()
}

View File

@@ -1,3 +1,4 @@
import { type GuideModel } from '../../../resources/guide/guideModel.js'
import { GuideModelLoader } from '../../../resources/guide/guideModelLoader.js'
import {
GitHubDiscussionLoader,
@@ -28,7 +29,7 @@ export type SearchSource =
export async function fetchGuideSources() {
const guides = (await GuideModelLoader.allFromFs()).unwrapLeft()
return guides.map((guide) => MarkdownLoader.fromGuideModel('guide', guide))
return guides.map((guide: GuideModel) => MarkdownLoader.fromGuideModel('guide', guide))
}
export async function fetchOpenApiReferenceSource() {
@@ -125,27 +126,29 @@ export async function fetchLintWarningsGuideSources() {
/**
* Fetches all the sources we want to index for search
*/
export async function fetchAllSources() {
export async function fetchAllSources(fullIndex: boolean) {
const guideSources = fetchGuideSources()
const lintWarningsGuideSources = fetchLintWarningsGuideSources()
const openApiReferenceSource = fetchOpenApiReferenceSource()
const jsLibReferenceSource = fetchJsLibReferenceSource()
const dartLibReferenceSource = fetchDartLibReferenceSource()
const pythonLibReferenceSource = fetchPythonLibReferenceSource()
const cSharpLibReferenceSource = fetchCSharpLibReferenceSource()
const swiftLibReferenceSource = fetchSwiftLibReferenceSource()
const ktLibReferenceSource = fetchKtLibReferenceSource()
const cliReferenceSource = fetchCliLibReferenceSource()
const dartLibReferenceSource = fullIndex ? fetchDartLibReferenceSource() : []
const pythonLibReferenceSource = fullIndex ? fetchPythonLibReferenceSource() : []
const cSharpLibReferenceSource = fullIndex ? fetchCSharpLibReferenceSource() : []
const swiftLibReferenceSource = fullIndex ? fetchSwiftLibReferenceSource() : []
const ktLibReferenceSource = fullIndex ? fetchKtLibReferenceSource() : []
const cliReferenceSource = fullIndex ? fetchCliLibReferenceSource() : []
const partnerIntegrationSources = fetchPartners()
.then((partners) =>
partners
? Promise.all(
partners.map((partner) => new IntegrationLoader(partner.slug, partner).load())
)
: []
)
.then((data) => data.flat())
const partnerIntegrationSources = fullIndex
? fetchPartners()
.then((partners) =>
partners
? Promise.all(
partners.map((partner) => new IntegrationLoader(partner.slug, partner).load())
)
: []
)
.then((data) => data.flat())
: []
const githubDiscussionSources = fetchDiscussions(
'supabase',

View File

@@ -820,7 +820,7 @@
{
"id": "supabase-postgres-config",
"title": "Manage Postgres configurations",
"slug": "supabase-ssl-enforcement",
"slug": "supabase-postgres-config",
"type": "cli-command"
},
{

View File

@@ -46,6 +46,7 @@
"DOCS_GITHUB_APP_PRIVATE_KEY",
"DOCS_REVALIDATION_KEYS",
"DOCS_REVALIDATION_OVERRIDE_KEYS",
"ENABLED_FEATURES_OVERRIDE_DISABLE_ALL",
"GITHUB_ACTIONS",
"FORCE_ASSET_CDN",
"LOGFLARE_INGESTION_API_KEY",

View File

@@ -13,10 +13,13 @@ interface PageSection {
rag_ignore?: boolean
}
type MatchPageSectionsFunction = 'match_page_sections_v2' | 'match_page_sections_v2_nimbus'
export async function clippy(
openai: OpenAI,
supabaseClient: SupabaseClient<any, 'public', any>,
messages: Message[]
messages: Message[],
options?: { useAltSearchIndex?: boolean }
) {
// TODO: better sanitization
const contextMessages = messages.map(({ role, content }) => {
@@ -63,14 +66,19 @@ export async function clippy(
const [{ embedding }] = embeddingResponse.data
const searchFunction = options?.useAltSearchIndex
? 'match_page_sections_v2_nimbus'
: 'match_page_sections_v2'
const joinedTable = options?.useAltSearchIndex ? 'page_nimbus' : 'page'
const { error: matchError, data: pageSections } = (await supabaseClient
.rpc('match_page_sections_v2', {
.rpc(searchFunction, {
embedding,
match_threshold: 0.78,
min_content_length: 50,
})
.neq('rag_ignore', true)
.select('content,page!inner(path),rag_ignore')
.select(`content,${joinedTable}!inner(path),rag_ignore`)
.limit(10)) as { error: any; data: PageSection[] | null }
if (matchError || !pageSections) {

View File

@@ -294,6 +294,48 @@ export type Database = {
}
Relationships: []
}
page_nimbus: {
Row: {
checksum: string | null
content: string | null
fts_tokens: unknown | null
id: number
last_refresh: string | null
meta: Json | null
path: string
source: string | null
title_tokens: unknown | null
type: string | null
version: string | null
}
Insert: {
checksum?: string | null
content?: string | null
fts_tokens?: unknown | null
id?: never
last_refresh?: string | null
meta?: Json | null
path: string
source?: string | null
title_tokens?: unknown | null
type?: string | null
version?: string | null
}
Update: {
checksum?: string | null
content?: string | null
fts_tokens?: unknown | null
id?: never
last_refresh?: string | null
meta?: Json | null
path?: string
source?: string | null
title_tokens?: unknown | null
type?: string | null
version?: string | null
}
Relationships: []
}
page_section: {
Row: {
content: string | null
@@ -335,6 +377,47 @@ export type Database = {
},
]
}
page_section_nimbus: {
Row: {
content: string | null
embedding: string | null
heading: string | null
id: number
page_id: number
rag_ignore: boolean | null
slug: string | null
token_count: number | null
}
Insert: {
content?: string | null
embedding?: string | null
heading?: string | null
id?: never
page_id: number
rag_ignore?: boolean | null
slug?: string | null
token_count?: number | null
}
Update: {
content?: string | null
embedding?: string | null
heading?: string | null
id?: never
page_id?: number
rag_ignore?: boolean | null
slug?: string | null
token_count?: number | null
}
Relationships: [
{
foreignKeyName: 'page_section_nimbus_page_id_fkey'
columns: ['page_id']
isOneToOne: false
referencedRelation: 'page_nimbus'
referencedColumns: ['id']
},
]
}
tickets: {
Row: {
company: string | null
@@ -526,6 +609,22 @@ export type Database = {
slugs: string[]
}[]
}
docs_search_embeddings_nimbus: {
Args: {
embedding: string
match_threshold: number
}
Returns: {
id: number
path: string
type: string
title: string
subtitle: string
description: string
headings: string[]
slugs: string[]
}[]
}
docs_search_fts: {
Args: {
query: string
@@ -539,6 +638,19 @@ export type Database = {
description: string
}[]
}
docs_search_fts_nimbus: {
Args: {
query: string
}
Returns: {
id: number
path: string
type: string
title: string
subtitle: string
description: string
}[]
}
get_full_content_url: {
Args: {
type: string
@@ -621,6 +733,23 @@ export type Database = {
token_count: number | null
}[]
}
match_embedding_nimbus: {
Args: {
embedding: string
match_threshold?: number
max_results?: number
}
Returns: {
content: string | null
embedding: string | null
heading: string | null
id: number
page_id: number
rag_ignore: boolean | null
slug: string | null
token_count: number | null
}[]
}
match_page_sections_v2: {
Args: {
embedding: string
@@ -638,6 +767,23 @@ export type Database = {
token_count: number | null
}[]
}
match_page_sections_v2_nimbus: {
Args: {
embedding: string
match_threshold: number
min_content_length: number
}
Returns: {
content: string | null
embedding: string | null
heading: string | null
id: number
page_id: number
rag_ignore: boolean | null
slug: string | null
token_count: number | null
}[]
}
search_content: {
Args: {
embedding: string
@@ -676,6 +822,44 @@ export type Database = {
subsections: Json[]
}[]
}
search_content_hybrid_nimbus: {
Args: {
query_text: string
query_embedding: string
max_result?: number
full_text_weight?: number
semantic_weight?: number
rrf_k?: number
match_threshold?: number
include_full_content?: boolean
}
Returns: {
id: number
page_title: string
type: string
href: string
content: string
metadata: Json
subsections: Json[]
}[]
}
search_content_nimbus: {
Args: {
embedding: string
include_full_content?: boolean
match_threshold?: number
max_result?: number
}
Returns: {
id: number
page_title: string
type: string
href: string
content: string
metadata: Json
subsections: Json[]
}[]
}
update_last_changed_checksum: {
Args: {
new_parent_page: string

View File

@@ -76,5 +76,7 @@
"sdk:dart": true,
"sdk:kotlin": true,
"sdk:python": true,
"sdk:swift": true
"sdk:swift": true,
"search:fullIndex": true
}

View File

@@ -255,6 +255,11 @@
"sdk:swift": {
"type": "boolean",
"description": "Enable the Swift SDK"
},
"search:fullIndex": {
"type": "boolean",
"description": "Enable the full search index. When true, uses the full search; when false, uses the alternate search index."
}
},
"required": [
@@ -314,7 +319,8 @@
"sdk:dart",
"sdk:kotlin",
"sdk:python",
"sdk:swift"
"sdk:swift",
"search:fullIndex"
],
"additionalProperties": false
}

View File

@@ -40,6 +40,15 @@ function isFeatureEnabled<T extends Feature | Feature[]>(
features: T,
runtimeDisabledFeatures?: Feature[]
) {
// Override is used to produce a filtered version of the docs search index
// using the same sync setup as our normal search index
if (process.env.ENABLED_FEATURES_OVERRIDE_DISABLE_ALL === 'true') {
if (Array.isArray(features)) {
return Object.fromEntries(features.map((feature) => [featureToCamelCase(feature), false]))
}
return false
}
const disabledFeatures = new Set([
...(runtimeDisabledFeatures ?? []),
...disabledFeaturesStaticArray,

View File

@@ -3,6 +3,8 @@
import { compact, debounce, uniqBy } from 'lodash'
import { useCallback, useMemo, useReducer, useRef } from 'react'
import { isFeatureEnabled } from '../enabled-features'
const NUMBER_SOURCES = 2
const SUPABASE_URL = process.env.NEXT_PUBLIC_SUPABASE_URL
@@ -200,7 +202,10 @@ const useDocsSearch = () => {
let sourcesLoaded = 0
fetch(`${SUPABASE_URL}/rest/v1/rpc/docs_search_fts`, {
const useAlternateSearchIndex = !isFeatureEnabled('search:fullIndex')
const searchEndpoint = useAlternateSearchIndex ? 'docs_search_fts_nimbus' : 'docs_search_fts'
fetch(`${SUPABASE_URL}/rest/v1/rpc/${searchEndpoint}`, {
method: 'POST',
headers: {
'content-type': 'application/json',
@@ -244,7 +249,7 @@ const useDocsSearch = () => {
fetch(`${SUPABASE_URL}${FUNCTIONS_URL}search-embeddings`, {
method: 'POST',
body: JSON.stringify({ query }),
body: JSON.stringify({ query, useAlternateSearchIndex }),
})
.then((response) => response.json())
.then((results) => {

View File

@@ -38,7 +38,7 @@ Deno.serve(async (req) => {
throw new UserError('Missing request data')
}
const { query } = requestData
const { query, useAlternateSearchIndex } = requestData
if (!query) {
throw new UserError('Missing query in request data')
@@ -76,7 +76,11 @@ Deno.serve(async (req) => {
}
const [{ embedding }] = embeddingResponse.data.data
const { error: matchError, data: pages } = await supabaseClient.rpc('docs_search_embeddings', {
const searchFunction = useAlternateSearchIndex
? 'docs_search_embeddings_nimbus'
: 'docs_search_embeddings'
const { error: matchError, data: pages } = await supabaseClient.rpc(searchFunction, {
embedding,
match_threshold: 0.78,
})

View File

@@ -0,0 +1,325 @@
-- Create nimbus tables for feature-flag-filtered search
-- These tables mirror the structure of page and page_section but contain only content
-- that should be visible when all feature flags are disabled
create table "public"."page_nimbus" (
id bigint primary key generated always as identity,
path text not null unique,
checksum text,
meta jsonb,
type text,
source text,
content text,
version uuid,
last_refresh timestamptz,
fts_tokens tsvector generated always as (to_tsvector('english', content)) stored,
title_tokens tsvector generated always as (to_tsvector('english', coalesce(meta ->> 'title', ''))) stored
);
alter table "public"."page_nimbus"
enable row level security;
create policy "anon can read page_nimbus"
on public.page_nimbus
for select
to anon
using (true);
create policy "authenticated can read page_nimbus"
on public.page_nimbus
for select
to authenticated
using (true);
create table "public"."page_section_nimbus" (
id bigint primary key generated always as identity,
page_id bigint not null references public.page_nimbus (id) on delete cascade,
content text,
token_count int,
embedding vector(1536),
slug text,
heading text,
rag_ignore boolean default false
);
alter table "public"."page_section_nimbus"
enable row level security;
create policy "anon can read page_section_nimbus"
on public.page_section_nimbus
for select
to anon
using (true);
create policy "authenticated can read page_section_nimbus"
on public.page_section_nimbus
for select
to authenticated
using (true);
-- Create indexes for nimbus tables (matching the regular tables)
create index fts_search_index_content_nimbus
on page_nimbus
using gin(fts_tokens);
create index fts_search_index_title_nimbus
on page_nimbus
using gin(title_tokens);
-- Create search function for nimbus tables (FTS search)
create or replace function docs_search_fts_nimbus(query text)
returns table (
id bigint,
path text,
type text,
title text,
subtitle text,
description text
)
set search_path = ''
language plpgsql
as $$
#variable_conflict use_variable
begin
return query
select
page_nimbus.id,
page_nimbus.path,
page_nimbus.type,
page_nimbus.meta ->> 'title' as title,
page_nimbus.meta ->> 'subtitle' as subtitle,
page_nimbus.meta ->> 'description' as description
from public.page_nimbus
where title_tokens @@ websearch_to_tsquery(query) or fts_tokens @@ websearch_to_tsquery(query)
order by greatest(
-- Title is more important than body, so use 10 as the weighting factor
-- Cut off at max rank of 1
least(10 * ts_rank(title_tokens, websearch_to_tsquery(query)), 1),
ts_rank(fts_tokens, websearch_to_tsquery(query))
) desc
limit 10;
end;
$$;
-- Create embedding matching function for nimbus tables
create or replace function match_embedding_nimbus(
embedding vector(1536),
match_threshold float default 0.78,
max_results int default 30
)
returns setof public.page_section_nimbus
set search_path = ''
language plpgsql
as $$
#variable_conflict use_variable
begin
return query
select *
from public.page_section_nimbus
where (page_section_nimbus.embedding operator(public.<#>) embedding) <= -match_threshold
order by page_section_nimbus.embedding operator(public.<#>) embedding
limit max_results;
end;
$$;
-- Create hybrid search function for nimbus tables
create or replace function search_content_hybrid_nimbus(
query_text text,
query_embedding vector(1536),
max_result int default 30,
full_text_weight float default 1,
semantic_weight float default 1,
rrf_k int default 50,
match_threshold float default 0.78,
include_full_content boolean default false
)
returns table (
id bigint,
page_title text,
type text,
href text,
content text,
metadata json,
subsections json[]
)
language sql
set search_path = ''
as $$
with full_text as (
select
id,
row_number() over(order by greatest(
least(10 * ts_rank(title_tokens, websearch_to_tsquery(query_text)), 1),
ts_rank(fts_tokens, websearch_to_tsquery(query_text))
) desc) as rank_ix
from public.page_nimbus
where title_tokens @@ websearch_to_tsquery(query_text) or fts_tokens @@ websearch_to_tsquery(query_text)
order by rank_ix
limit least(max_result, 30) * 2
),
semantic as (
select
page_id as id,
row_number() over () as rank_ix
from public.match_embedding_nimbus(query_embedding, match_threshold, max_result * 2)
),
rrf as (
select
coalesce(full_text.id, semantic.id) as id,
coalesce(1.0 / (rrf_k + full_text.rank_ix), 0.0) * full_text_weight +
coalesce(1.0 / (rrf_k + semantic.rank_ix), 0.0) * semantic_weight as rrf_score
from full_text
full outer join semantic on full_text.id = semantic.id
)
select
page_nimbus.id,
page_nimbus.meta ->> 'title' as page_title,
page_nimbus.type,
public.get_full_content_url(page_nimbus.type, page_nimbus.path, null) as href,
case when include_full_content then page_nimbus.content else null end as content,
page_nimbus.meta as metadata,
array_agg(json_build_object(
'title', page_section_nimbus.heading,
'href', public.get_full_content_url(page_nimbus.type, page_nimbus.path, page_section_nimbus.slug),
'content', page_section_nimbus.content
)) as subsections
from rrf
join public.page_nimbus on page_nimbus.id = rrf.id
left join public.page_section_nimbus on page_section_nimbus.page_id = page_nimbus.id
where rrf.rrf_score > 0
group by page_nimbus.id
order by max(rrf.rrf_score) desc
limit max_result;
$$;
create or replace function match_page_sections_v2_nimbus(
embedding vector(1536),
match_threshold float,
min_content_length int
)
returns setof page_section_nimbus
set search_path = ''
language plpgsql
as $$
#variable_conflict use_variable
begin
return query
select *
from public.page_section_nimbus
-- We only care about sections that have a useful amount of content
where length(page_section_nimbus.content) >= min_content_length
-- The dot product is negative because of a Postgres limitation, so we negate it
and (page_section_nimbus.embedding operator(public.<#>) embedding) * -1 > match_threshold
-- OpenAI embeddings are normalized to length 1, so
-- cosine similarity and dot product will produce the same results.
-- Using dot product which can be computed slightly faster.
--
-- For the different syntaxes, see https://github.com/pgvector/pgvector
order by page_section_nimbus.embedding operator(public.<#>) embedding;
end;
$$;
create or replace function docs_search_embeddings_nimbus(
embedding vector(1536),
match_threshold float
)
returns table (
id bigint,
path text,
type text,
title text,
subtitle text,
description text,
headings text[],
slugs text[]
)
set search_path = ''
language plpgsql
as $$
#variable_conflict use_variable
begin
return query
with match as(
select *
from public.page_section_nimbus
-- The dot product is negative because of a Postgres limitation, so we negate it
where (page_section_nimbus.embedding operator(public.<#>) embedding) * -1 > match_threshold
-- OpenAI embeddings are normalized to length 1, so
-- cosine similarity and dot product will produce the same results.
-- Using dot product which can be computed slightly faster.
--
-- For the different syntaxes, see https://github.com/pgvector/pgvector
order by page_section_nimbus.embedding operator(public.<#>) embedding
limit 10
)
select
page_nimbus.id,
page_nimbus.path,
page_nimbus.type,
page_nimbus.meta ->> 'title' as title,
page_nimbus.meta ->> 'subtitle' as subtitle,
page_nimbus.meta ->> 'description' as description,
array_agg(match.heading) as headings,
array_agg(match.slug) as slugs
from public.page_nimbus
join match on match.page_id = page_nimbus.id
group by page_nimbus.id;
end;
$$;
create or replace function search_content_nimbus(
embedding vector(1536),
include_full_content boolean default false,
match_threshold float default 0.78,
max_result int default 30
)
returns table (
id bigint,
page_title text,
type text,
href text,
content text,
metadata json,
subsections json[]
)
set search_path = ''
language sql
as $$
with matched_section as (
select
*,
row_number() over () as ranking
from public.match_embedding_nimbus(
embedding,
match_threshold,
max_result
)
)
select
page_nimbus.id,
meta ->> 'title' as page_title,
type,
public.get_full_content_url(type, path, null) as href,
case
when include_full_content
then page_nimbus.content
else
null
end as content,
meta as metadata,
array_agg(
json_build_object(
'title', heading,
'href', public.get_full_content_url(type, path, slug),
'content', matched_section.content
)
)
from matched_section
join public.page_nimbus on matched_section.page_id = page_nimbus.id
group by page_nimbus.id
order by min(ranking);
$$;