feat: alternate search index for nimbus (#38662)
* feat: alternate search index for nimbus
Create an alternate search index for Nimbus that filters out
feature-flagged pages (equivalent to setting all feature flags to
false).
Notes:
- Creates two new DB tables, `page_nimbus` and `page_section_nimbus`,
  which are filtered versions of `page` and `page_section`
- Adds `nimbus` versions of all the DB search functions
- Refactors the embedding upload script to make it faster (it was
  annoyingly slow when testing), add retries, and produce better
  summary logs
- When run with the environment variable
  ENABLED_FEATURES_OVERRIDE_DISABLE_ALL, the upload script produces and
  uploads the alternate search index
- Changes all the search calls in the frontend/API to check
  `isFeatureEnabled('search:fullIndex')` to determine whether to search
  the full or alternate index
* ci: produce nimbus search indexes on merge
* fix: turn full search index on
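
In outline, every touched call site selects the index the same way (a condensed sketch of the code in the diff below):

    import { isFeatureEnabled } from 'common/enabled-features'

    type SearchFunction = 'search_content' | 'search_content_nimbus'

    // When the `search:fullIndex` flag is off, route queries to the
    // filtered `_nimbus` database function instead of the full index.
    const useAltSearchIndex = !isFeatureEnabled('search:fullIndex')
    const searchFunction: SearchFunction = useAltSearchIndex
      ? 'search_content_nimbus'
      : 'search_content'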
.github/workflows/search.yml (vendored, 6 lines changed)
@@ -68,8 +68,12 @@ jobs:
         run: |
           pnpm run codegen:examples
           pnpm run embeddings
+          pnpm run embeddings:nimbus

       - name: Refresh embeddings
         working-directory: ./apps/docs
         if: ${{ inputs.refresh }}
-        run: pnpm run embeddings:refresh
+        run: |
+          pnpm run codegen:examples
+          pnpm run embeddings:refresh
+          pnpm run embeddings:nimbus:refresh
@@ -3,6 +3,8 @@ import { ApplicationError, UserError, clippy } from 'ai-commands/edge'
 import { NextRequest, NextResponse } from 'next/server'
 import OpenAI from 'openai'
+
+import { isFeatureEnabled } from 'common/enabled-features'

 export const runtime = 'edge'

 /* To avoid OpenAI errors, restrict to the Vercel Edge Function regions that
    overlap with the OpenAI API regions.
@@ -54,7 +56,10 @@ export async function POST(req: NextRequest) {
     throw new UserError('Missing messages in request data')
   }

-  const response = await clippy(openai, supabaseClient, messages)
+  const useAltSearchIndex = !isFeatureEnabled('search:fullIndex')
+  const response = await clippy(openai, supabaseClient, messages, {
+    useAltSearchIndex,
+  })

   // Proxy the streamed SSE response from OpenAI
   return new NextResponse(response.body, {
@@ -17,7 +17,9 @@
     "dev:secrets:pull": "AWS_PROFILE=supa-dev node ../../scripts/getSecrets.js -n local/docs",
     "dev:watch:troubleshooting": "node ./scripts/troubleshooting/watch.mjs",
     "embeddings": "tsx --conditions=react-server scripts/search/generate-embeddings.ts",
+    "embeddings:nimbus": "ENABLED_FEATURES_OVERRIDE_DISABLE_ALL=true pnpm run embeddings",
     "embeddings:refresh": "pnpm run embeddings --refresh",
+    "embeddings:nimbus:refresh": "ENABLED_FEATURES_OVERRIDE_DISABLE_ALL=true pnpm run embeddings:refresh",
     "last-changed": "tsx scripts/last-changed.ts",
     "last-changed:reset": "pnpm run last-changed -- --reset",
     "lint": "next lint",
@@ -3,6 +3,8 @@ import { convertPostgrestToApiError, type ApiErrorGeneric } from '~/app/api/util
 import { Result } from '~/features/helpers.fn'
 import { openAI } from '~/lib/openAi'
 import { supabase, type DatabaseCorrected } from '~/lib/supabase'
+
+import { isFeatureEnabled } from 'common/enabled-features'
 import { GuideModel } from '../guide/guideModel'
 import {
   DB_METADATA_TAG_PLATFORM_CLI,
@@ -13,6 +15,9 @@ import { ReferenceSDKFunctionModel, SDKLanguageValues } from '../reference/refer
 import { TroubleshootingModel } from '../troubleshooting/troubleshootingModel'
 import { SearchResultInterface } from './globalSearchInterface'

+type SearchFunction = 'search_content' | 'search_content_nimbus'
+type SearchHybridFunction = 'search_content_hybrid' | 'search_content_hybrid_nimbus'
+
 export abstract class SearchResultModel {
   static async search(
     args: RootQueryTypeSearchDocsArgs,
@@ -22,9 +27,14 @@ export abstract class SearchResultModel {
     const includeFullContent = requestedFields.includes('content')
     const embeddingResult = await openAI().createContentEmbedding(query)

+    const useAltSearchIndex = !isFeatureEnabled('search:fullIndex')
+    const searchFunction: SearchFunction = useAltSearchIndex
+      ? 'search_content_nimbus'
+      : 'search_content'
+
     return embeddingResult.flatMapAsync(async ({ embedding }) => {
       const matchResult = new Result(
-        await supabase().rpc('search_content', {
+        await supabase().rpc(searchFunction, {
           embedding,
           include_full_content: includeFullContent,
           max_result: args.limit ?? undefined,
@@ -49,9 +59,14 @@
     const includeFullContent = requestedFields.includes('content')
     const embeddingResult = await openAI().createContentEmbedding(query)

+    const useAltSearchIndex = !isFeatureEnabled('search:fullIndex')
+    const searchFunction: SearchHybridFunction = useAltSearchIndex
+      ? 'search_content_hybrid_nimbus'
+      : 'search_content_hybrid'
+
     return embeddingResult.flatMapAsync(async ({ embedding }) => {
       const matchResult = new Result(
-        await supabase().rpc('search_content_hybrid', {
+        await supabase().rpc(searchFunction, {
           query_text: query,
           query_embedding: embedding,
           include_full_content: includeFullContent,
apps/docs/scripts/search/embeddings/utils.ts (new file, 106 lines)
@@ -0,0 +1,106 @@
export interface PageInfo {
  pageId: number
  path: string
  checksum: string
  sectionsCount: number
}

export interface PageSectionForEmbedding {
  pageId: number
  path: string
  slug?: string
  heading?: string
  content: string
  input: string
  ragIgnore: boolean
}

export interface PageSectionWithEmbedding extends PageSectionForEmbedding {
  embedding: number[]
}

export interface ProcessingResult {
  successfulPages: Set<number>
  failedPages: Set<number>
  totalSectionsProcessed: number
  totalSectionsInserted: number
}

export function createBatches<T>(array: T[], batchSize: number): T[][] {
  const batches: T[][] = []
  for (let i = 0; i < array.length; i += batchSize) {
    batches.push(array.slice(i, i + batchSize))
  }
  return batches
}

export function mapEmbeddingsToSections(
  batch: PageSectionForEmbedding[],
  data: Array<{ embedding?: number[] }>,
  batchNumber: number
): {
  sectionsWithEmbeddings: PageSectionWithEmbedding[]
  failedSectionIndexes: Set<number>
} {
  const sectionsWithEmbeddings: PageSectionWithEmbedding[] = []
  const failedSectionIndexes: Set<number> = new Set()

  if (batch.length !== data.length) {
    console.error(
      `Ignoring all embeddings returned from batch ${batchNumber} because returned number doesn't match input number`
    )
    batch.forEach((_, index) => {
      failedSectionIndexes.add(index)
    })
    // Return early: with mismatched lengths, data[i] may be undefined below,
    // and these embeddings are meant to be ignored wholesale.
    return { sectionsWithEmbeddings, failedSectionIndexes }
  }

  for (let i = 0; i < batch.length; i++) {
    if (data[i].embedding) {
      sectionsWithEmbeddings.push({ ...batch[i], embedding: data[i].embedding! })
    } else {
      failedSectionIndexes.add(i)
    }
  }

  return { sectionsWithEmbeddings, failedSectionIndexes }
}

export function updatePageInsertionCounts(
  pageSectionsInserted: Map<number, number>,
  sectionsWithEmbeddings: PageSectionWithEmbedding[]
) {
  sectionsWithEmbeddings.forEach((section) => {
    const current = pageSectionsInserted.get(section.pageId) || 0
    pageSectionsInserted.set(section.pageId, current + 1)
  })
}

export function computePageResults(
  pageInfoMap: Map<number, PageInfo>,
  pageSectionsInserted: Map<number, number>,
  result: ProcessingResult
) {
  for (const [pageId, pageInfo] of pageInfoMap) {
    const insertedCount = pageSectionsInserted.get(pageId) || 0
    if (insertedCount === pageInfo.sectionsCount && !result.failedPages.has(pageId)) {
      result.successfulPages.add(pageId)
    } else {
      result.failedPages.add(pageId)
      console.warn(
        `Page ${pageInfo.path}: inserted ${insertedCount}/${pageInfo.sectionsCount} sections`
      )
    }
  }
}

export function logFailedSections(
  batch: PageSectionForEmbedding[],
  inputs: string[],
  failedSectionIndexes: Set<number>
) {
  failedSectionIndexes.forEach((i) => {
    console.error(
      `Failed to process section: ${batch[i].path}#${batch[i].slug} (content: "${inputs[i]?.slice(0, 50)}...")`
    )
  })
}
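
A hypothetical caller of these helpers, pairing one batch of sections with the embedding rows returned for it (`sections` and `response` are stand-ins for the caller's data, not part of this diff):

    // Batch size 128 matches OPENAI_BATCH_SIZE in generate-embeddings.ts below.
    const batches = createBatches(sections, 128)
    const { sectionsWithEmbeddings, failedSectionIndexes } = mapEmbeddingsToSections(
      batches[0],
      response.data, // embedding rows, in the same order as the inputs
      0
    )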
@@ -1,24 +1,453 @@
 import '../utils/dotenv.js'

-import { createClient } from '@supabase/supabase-js'
+import { createClient, type SupabaseClient } from '@supabase/supabase-js'
 import { parseArgs } from 'node:util'
 import { OpenAI } from 'openai'
 import { v4 as uuidv4 } from 'uuid'

 import type { Section } from '../helpers.mdx.js'
+import {
+  type PageInfo,
+  type PageSectionForEmbedding,
+  type PageSectionWithEmbedding,
+  type ProcessingResult,
+  createBatches,
+  mapEmbeddingsToSections,
+  updatePageInsertionCounts,
+  computePageResults,
+  logFailedSections,
+} from './embeddings/utils.js'
 import { fetchAllSources } from './sources/index.js'

+const CONFIG = {
+  // OpenAI settings
+  EMBEDDING_MODEL: 'text-embedding-ada-002' as const,
+  EMBEDDING_DIMENSION: 1536, // Keep in sync with EMBEDDING_MODEL
+  OPENAI_BATCH_SIZE: 128,
+  OPENAI_MAX_RETRIES: 3,
+  OPENAI_BASE_DELAY_MS: 500,
+  /**
+   * If context length is exceeded, truncate inputs over this character length
+   * and retry. This is a character-based heuristic, not token-exact.
+   */
+  EMBEDDING_TRUNCATE_CHAR_LIMIT: 16_000,
+
+  // Supabase settings
+  SUPABASE_MAX_RETRIES: 2,
+  SUPABASE_BASE_DELAY_MS: 100,
+
+  // Processing settings
+  SOURCE_CONCURRENCY: 10,
+} as const
+
+function delay(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms))
+}
+
+function exponentialBackoff(attempt: number, baseDelay: number, maxDelay: number = 30_000): number {
+  const exponentialDelay = baseDelay * Math.pow(2, attempt)
+  const jitter = (Math.random() - 0.5) * 0.1 * exponentialDelay
+  return Math.min(Math.max(0, exponentialDelay + jitter), maxDelay)
+}
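
As a sanity check on the backoff schedule (illustrative, using the function defined just above):

    // With baseDelay = 500 ms (OPENAI_BASE_DELAY_MS), successive waits are
    // ~500, ~1000, ~2000 ms, each within ±5% jitter and capped at 30 000 ms.
    for (let attempt = 0; attempt < 3; attempt++) {
      console.log(Math.round(exponentialBackoff(attempt, 500)))
    }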
+async function withRetry<T>(
+  operation: () => Promise<T>,
+  maxRetries: number,
+  baseDelay: number,
+  operationName: string,
+  shouldRetryOnError: (error: unknown) => boolean = () => true
+): Promise<T> {
+  let lastError: Error
+
+  for (let attempt = 0; attempt <= maxRetries; attempt++) {
+    try {
+      return await operation()
+    } catch (error) {
+      lastError = error as Error
+
+      // Allow caller to prevent redundant retries for specific errors
+      if (!shouldRetryOnError?.(error)) {
+        console.warn(`${operationName} encountered non-retryable error:`, lastError.message)
+        throw lastError
+      }
+
+      if (attempt === maxRetries) {
+        console.error(`${operationName} failed after ${maxRetries + 1} attempts:`, lastError)
+        throw lastError
+      }
+
+      const delayMs = exponentialBackoff(attempt, baseDelay)
+      console.warn(
+        `${operationName} attempt ${attempt + 1} failed, retrying in ${delayMs}ms:`,
+        lastError.message
+      )
+      await delay(delayMs)
+    }
+  }
+
+  throw lastError!
+}
+
+function isNimbusMode(): boolean {
+  return process.env.ENABLED_FEATURES_OVERRIDE_DISABLE_ALL === 'true'
+}
+
+function getPageTables() {
+  const nimbus = isNimbusMode()
+  return {
+    pageTable: nimbus ? 'page_nimbus' : 'page',
+    pageSectionTable: nimbus ? 'page_section_nimbus' : 'page_section',
+  } as const
+}
+
+function requireEnvOrThrow(names: string[]): void {
+  const missing = names.filter((n) => !process.env[n])
+  if (missing.length) {
+    throw new Error(
+      `Environment variables ${missing.join(', ')} are required: skipping embeddings generation`
+    )
+  }
+}
+
+function initSupabase(): SupabaseClient {
+  return createClient(process.env.NEXT_PUBLIC_SUPABASE_URL!, process.env.SUPABASE_SECRET_KEY!, {
+    auth: { persistSession: false, autoRefreshToken: false },
+  })
+}
+
+type PreparedSections = {
+  allSectionsToProcess: PageSectionForEmbedding[]
+  pageInfoMap: Map<number, PageInfo>
+}
+
+async function prepareSections(
+  supabaseClient: SupabaseClient,
+  pageTable: string,
+  pageSectionTable: string,
+  shouldRefresh: boolean,
+  refreshVersion: string,
+  refreshDate: Date,
+  fullIndex = true,
+  debug = false
+): Promise<PreparedSections> {
+  const embeddingSources = await fetchAllSources(fullIndex)
+  console.log(`Discovered ${embeddingSources.length} sources`)
+
+  const allSectionsToProcess: PageSectionForEmbedding[] = []
+  const pageInfoMap = new Map<number, PageInfo>()
+
+  for (const sourceBatch of createBatches(embeddingSources, CONFIG.SOURCE_CONCURRENCY)) {
+    await Promise.all(
+      sourceBatch.map(async (embeddingSource) => {
+        const { type, source, path } = embeddingSource
+
+        try {
+          const {
+            checksum,
+            sections,
+            meta = {},
+            ragIgnore = false,
+          }: {
+            checksum: string
+            sections: Section[]
+            ragIgnore?: boolean
+            meta?: Record<string, unknown>
+          } = await embeddingSource.process()
+
+          const { error: fetchPageError, data: existingPage } = await supabaseClient
+            .from(pageTable)
+            .select('id, path, checksum')
+            .filter('path', 'eq', path)
+            .limit(1)
+            .maybeSingle()
+
+          if (fetchPageError) throw fetchPageError
+
+          if (!shouldRefresh && existingPage?.checksum === checksum) {
+            const { error: updatePageError } = await supabaseClient
+              .from(pageTable)
+              .update({
+                type,
+                source,
+                meta,
+                version: refreshVersion,
+                last_refresh: refreshDate,
+              })
+              .filter('id', 'eq', existingPage.id)
+            if (updatePageError) throw updatePageError
+            return
+          }
+
+          if (existingPage) {
+            if (debug) {
+              console.log(
+                !shouldRefresh
+                  ? `[${path}] Docs have changed, removing old page sections and their embeddings`
+                  : `[${path}] Refresh flag set, removing old page sections and their embeddings`
+              )
+            }
+
+            const { error: deletePageSectionError } = await supabaseClient
+              .from(pageSectionTable)
+              .delete()
+              .filter('page_id', 'eq', existingPage.id)
+            if (deletePageSectionError) throw deletePageSectionError
+          }
+
+          const { error: upsertPageError, data: page } = await supabaseClient
+            .from(pageTable)
+            .upsert(
+              {
+                checksum: null,
+                path,
+                type,
+                source,
+                meta,
+                content: embeddingSource.extractIndexedContent(),
+                version: refreshVersion,
+                last_refresh: refreshDate,
+              },
+              { onConflict: 'path' }
+            )
+            .select()
+            .limit(1)
+            .single()
+          if (upsertPageError) throw upsertPageError
+
+          if (debug) {
+            console.log(`[${path}] Preparing ${sections.length} page sections for processing`)
+          }
+
+          pageInfoMap.set(page.id, {
+            pageId: page.id,
+            path,
+            checksum,
+            sectionsCount: sections.length,
+          })
+
+          const sectionsForBatching = sections.map(({ slug, heading, content }) => ({
+            pageId: page.id,
+            path,
+            slug,
+            heading,
+            content,
+            input: content.replace(/\n/g, ' '),
+            ragIgnore,
+          }))
+          allSectionsToProcess.push(...sectionsForBatching)
+        } catch (err) {
+          console.error(`Error preparing path '${path}' for processing.`)
+          console.error(err)
+        }
+      })
+    )
+  }
+
+  console.log(
+    `Prepared ${allSectionsToProcess.length} sections for processing from ${pageInfoMap.size} pages`
+  )
+  return { allSectionsToProcess, pageInfoMap }
+}
+
+async function processAndInsertEmbeddings(
+  openai: OpenAI,
+  supabaseClient: SupabaseClient,
+  pageSectionTable: string,
+  allSections: PageSectionForEmbedding[],
+  pageInfoMap: Map<number, PageInfo>
+): Promise<ProcessingResult> {
+  if (allSections.length === 0) {
+    return {
+      successfulPages: new Set(),
+      failedPages: new Set(),
+      totalSectionsProcessed: 0,
+      totalSectionsInserted: 0,
+    }
+  }
+
+  console.log(`Processing ${allSections.length} sections with embeddings + insertion`)
+
+  const embeddingBatches = createBatches(allSections, CONFIG.OPENAI_BATCH_SIZE)
+  const result: ProcessingResult = {
+    successfulPages: new Set(),
+    failedPages: new Set(),
+    totalSectionsProcessed: 0,
+    totalSectionsInserted: 0,
+  }
+
+  // Track sections inserted per page
+  const pageSectionsInserted = new Map<number, number>()
+
+  for (let batchIndex = 0; batchIndex < embeddingBatches.length; batchIndex++) {
+    const batch = embeddingBatches[batchIndex]
+    try {
+      const batchResult = await processEmbeddingBatch(
+        openai,
+        batch,
+        batchIndex,
+        embeddingBatches.length
+      )
+
+      result.totalSectionsProcessed += batchResult.processedCount
+
+      if (batchResult.sectionsWithEmbeddings.length > 0) {
+        const insertedCount = await insertSectionBatch(
+          supabaseClient,
+          pageSectionTable,
+          batchResult.sectionsWithEmbeddings
+        )
+        result.totalSectionsInserted += insertedCount
+        updatePageInsertionCounts(pageSectionsInserted, batchResult.sectionsWithEmbeddings)
+      }
+
+      // Mark failed section pages
+      batchResult.failedSectionIndexes.forEach((i) => {
+        result.failedPages.add(batch[i].pageId)
+      })
+    } catch (error) {
+      console.error(`Batch ${batchIndex + 1} completely failed:`, error)
+      batch.forEach((section) => result.failedPages.add(section.pageId))
+    }
+
+    if (batchIndex < embeddingBatches.length - 1) {
+      await delay(CONFIG.OPENAI_BASE_DELAY_MS)
+    }
+  }
+
+  computePageResults(pageInfoMap, pageSectionsInserted, result)
+
+  return result
+}
+
+type BatchEmbeddingResult = {
+  sectionsWithEmbeddings: PageSectionWithEmbedding[]
+  failedSectionIndexes: Set<number>
+  processedCount: number
+}
+
+async function processEmbeddingBatch(
+  openai: OpenAI,
+  batch: PageSectionForEmbedding[],
+  batchIndex: number,
+  totalBatches: number
+): Promise<BatchEmbeddingResult> {
+  const inputs = batch.map((section) => section.input)
+  console.log(
+    `Processing embedding batch ${batchIndex + 1}/${totalBatches} (${inputs.length} sections)`
+  )
+
+  // Helper to identify context length exceeded errors from OpenAI
+  const isContextLengthError = (err: unknown) => {
+    if (!(err instanceof OpenAI.APIError)) return false
+
+    const message = err.error?.message as string
+    const status = err.status
+    return status === 400 && message?.toLowerCase().includes('context')
+  }
+
+  let embeddingResponse: OpenAI.Embeddings.CreateEmbeddingResponse
+  try {
+    embeddingResponse = await withRetry(
+      () =>
+        openai.embeddings.create({
+          model: CONFIG.EMBEDDING_MODEL,
+          input: inputs,
+        }),
+      CONFIG.OPENAI_MAX_RETRIES,
+      CONFIG.OPENAI_BASE_DELAY_MS,
+      `OpenAI embedding batch ${batchIndex + 1}`,
+      (err) => !isContextLengthError(err)
+    )
+  } catch (err) {
+    if (!isContextLengthError(err)) {
+      throw err
+    }
+
+    // Context length exceeded: truncate problematic sections and try once more
+    const limit = CONFIG.EMBEDDING_TRUNCATE_CHAR_LIMIT
+    const truncatedInputs = inputs.map((s) => (s.length > limit ? s.slice(0, limit) : s))
+    const truncatedCount = truncatedInputs.filter((s, i) => s !== inputs[i]).length
+    console.warn(
+      `OpenAI embedding batch ${batchIndex + 1}: context length exceeded. ` +
+        `Truncating ${truncatedCount} overly long section(s) to ${limit} chars and retrying once.`
+    )
+
+    embeddingResponse = await openai.embeddings.create({
+      model: CONFIG.EMBEDDING_MODEL,
+      input: truncatedInputs,
+    })
+
+    // Replace inputs with truncated inputs for downstream bookkeeping
+    for (let i = 0; i < inputs.length; i++) inputs[i] = truncatedInputs[i]
+  }
+
+  const { sectionsWithEmbeddings, failedSectionIndexes } = mapEmbeddingsToSections(
+    batch,
+    embeddingResponse.data,
+    batchIndex
+  )
+  logFailedSections(batch, inputs, failedSectionIndexes)
+
+  return {
+    sectionsWithEmbeddings,
+    failedSectionIndexes,
+    processedCount: inputs.length,
+  }
+}
+
+async function insertSectionBatch(
+  supabaseClient: SupabaseClient,
+  pageSectionTable: string,
+  sectionsWithEmbeddings: PageSectionWithEmbedding[]
+): Promise<number> {
+  if (sectionsWithEmbeddings.length === 0) {
+    return 0
+  }
+
+  const pageSectionsToInsert = sectionsWithEmbeddings.map((section) => ({
+    page_id: section.pageId,
+    slug: section.slug,
+    heading: section.heading,
+    content: section.content,
+    embedding: section.embedding,
+    rag_ignore: section.ragIgnore,
+  }))
+
+  await withRetry(
+    async () => {
+      const { error } = await supabaseClient.from(pageSectionTable).insert(pageSectionsToInsert)
+
+      if (error) {
+        throw new Error(`Supabase insert error: ${error.message}`)
+      }
+    },
+    CONFIG.SUPABASE_MAX_RETRIES,
+    CONFIG.SUPABASE_BASE_DELAY_MS,
+    `Insert batch of ${sectionsWithEmbeddings.length} sections`
+  )
+
+  return sectionsWithEmbeddings.length
+}

 const args = parseArgs({
   options: {
     refresh: {
       type: 'boolean',
     },
     debug: {
       type: 'boolean',
     },
   },
 })

 async function generateEmbeddings() {
   const shouldRefresh = Boolean(args.values.refresh)
   const debug = Boolean(args.values.debug)

-  const requiredEnvVars = [
+  const nimbus = isNimbusMode()
+  if (nimbus) {
+    console.log('Running in Nimbus mode - will filter content based on disabled feature flags')
+  }
+
+  requireEnvOrThrow([
     'DOCS_GITHUB_APP_ID',
     'DOCS_GITHUB_APP_INSTALLATION_ID',
     'DOCS_GITHUB_APP_PRIVATE_KEY',
@@ -27,217 +456,136 @@ async function generateEmbeddings() {
     'NEXT_PUBLIC_SUPABASE_URL',
     'OPENAI_API_KEY',
     'SUPABASE_SECRET_KEY',
-  ]
+  ])

-  const missingEnvVars = requiredEnvVars.filter((name) => !process.env[name])
-  if (missingEnvVars.length > 0) {
-    throw new Error(
-      `Environment variables ${missingEnvVars.join(
-        ', '
-      )} are required: skipping embeddings generation`
-    )
-  }
-
-  const supabaseClient = createClient(
-    process.env.NEXT_PUBLIC_SUPABASE_URL!,
-    process.env.SUPABASE_SECRET_KEY!,
-    {
-      auth: {
-        persistSession: false,
-        autoRefreshToken: false,
-      },
-    }
-  )
+  const supabaseClient = initSupabase()

   // Use this version to track which pages to purge
   // after the refresh
   const refreshVersion = uuidv4()

   const refreshDate = new Date()

-  const embeddingSources = await fetchAllSources()
+  const { pageTable, pageSectionTable } = getPageTables()
+  const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY })

-  console.log(`Discovered ${embeddingSources.length} pages`)
-
-  if (!shouldRefresh) {
-    console.log('Checking which pages are new or have changed')
-  } else {
-    console.log('Refresh flag set, re-generating all pages')
-  }
+  console.log(
+    shouldRefresh
+      ? 'Refresh flag set, re-generating all pages'
+      : 'Checking which pages are new or have changed'
+  )

+  const { allSectionsToProcess, pageInfoMap } = await prepareSections(
+    supabaseClient,
+    pageTable,
+    pageSectionTable,
+    shouldRefresh,
+    refreshVersion,
+    refreshDate,
+    !nimbus,
+    debug
+  )
+
+  let processingResult: ProcessingResult
+  try {
+    processingResult = await processAndInsertEmbeddings(
+      openai,
+      supabaseClient,
+      pageSectionTable,
+      allSectionsToProcess,
+      pageInfoMap
+    )
+    console.log(
+      `Processing complete: ${processingResult.totalSectionsInserted}/${processingResult.totalSectionsProcessed} sections inserted successfully`
+    )
+    console.log(
+      `Page summary: ${processingResult.successfulPages.size} successful, ${processingResult.failedPages.size} failed`
+    )
+  } catch (error) {
+    console.error('Critical error during embedding processing:', error)
+    console.log('Exiting due to complete processing failure')
+    return
+  }

-  for (const embeddingSource of embeddingSources) {
-    const { type, source, path } = embeddingSource
-
-    try {
-      const {
-        checksum,
-        sections,
-        meta = {},
-        ragIgnore = false,
-      }: {
-        checksum: string
-        sections: Section[]
-        ragIgnore?: boolean
-        meta?: Record<string, unknown>
-      } = await embeddingSource.process()
-
-      // Check for existing page in DB and compare checksums
-      const { error: fetchPageError, data: existingPage } = await supabaseClient
-        .from('page')
-        .select('id, path, checksum')
-        .filter('path', 'eq', path)
-        .limit(1)
-        .maybeSingle()
-
-      if (fetchPageError) {
-        throw fetchPageError
-      }
-
-      // We use checksum to determine if this page & its sections need to be regenerated
-      if (!shouldRefresh && existingPage?.checksum === checksum) {
-        // No content/embedding update required on this page
-        // Update other meta info
-        const { error: updatePageError } = await supabaseClient
-          .from('page')
-          .update({
-            type,
-            source,
-            meta,
-            version: refreshVersion,
-            last_refresh: refreshDate,
-          })
-          .filter('id', 'eq', existingPage.id)
-
-        if (updatePageError) {
-          throw updatePageError
-        }
-
-        continue
-      }
-
-      if (existingPage) {
-        if (!shouldRefresh) {
-          console.log(
-            `[${path}] Docs have changed, removing old page sections and their embeddings`
-          )
-        } else {
-          console.log(`[${path}] Refresh flag set, removing old page sections and their embeddings`)
-        }
-
-        const { error: deletePageSectionError } = await supabaseClient
-          .from('page_section')
-          .delete()
-          .filter('page_id', 'eq', existingPage.id)
-
-        if (deletePageSectionError) {
-          throw deletePageSectionError
-        }
-      }
-
-      // Create/update page record. Intentionally clear checksum until we
-      // have successfully generated all page sections.
-      const { error: upsertPageError, data: page } = await supabaseClient
-        .from('page')
-        .upsert(
-          {
-            checksum: null,
-            path,
-            type,
-            source,
-            meta,
-            content: embeddingSource.extractIndexedContent(),
-            version: refreshVersion,
-            last_refresh: refreshDate,
-          },
-          { onConflict: 'path' }
-        )
-        .select()
-        .limit(1)
-        .single()
-
-      if (upsertPageError) {
-        throw upsertPageError
-      }
-
-      console.log(`[${path}] Adding ${sections.length} page sections (with embeddings)`)
-      for (const { slug, heading, content } of sections) {
-        // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings)
-        // force a redeploy
-        const input = content.replace(/\n/g, ' ')
-
-        try {
-          const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY })
-
-          const embeddingResponse = await openai.embeddings.create({
-            model: 'text-embedding-ada-002',
-            input,
-          })
-
-          const [responseData] = embeddingResponse.data
-
-          const { error: insertPageSectionError } = await supabaseClient
-            .from('page_section')
-            .insert({
-              page_id: page.id,
-              slug,
-              heading,
-              content,
-              token_count: embeddingResponse.usage.total_tokens,
-              embedding: responseData.embedding,
-              rag_ignore: ragIgnore,
-            })
-            .select()
-            .limit(1)
-            .single()
-
-          if (insertPageSectionError) {
-            throw insertPageSectionError
-          }
-        } catch (err) {
-          // TODO: decide how to better handle failed embeddings
-          console.error(
-            `Failed to generate embeddings for '${path}' page section starting with '${input.slice(
-              0,
-              40
-            )}...'`
-          )
-
-          throw err
-        }
-      }
-
-      // Set page checksum so that we know this page was stored successfully
-      const { error: updatePageError } = await supabaseClient
-        .from('page')
-        .update({ checksum })
-        .filter('id', 'eq', page.id)
-
-      if (updatePageError) {
-        throw updatePageError
-      }
-    } catch (err) {
-      console.error(
-        `Page '${path}' or one/multiple of its page sections failed to store properly. Page has been marked with null checksum to indicate that it needs to be re-generated.`
-      )
-      console.error(err)
-    }
-  }
+  console.log(`\nUpdating checksums for ${processingResult.successfulPages.size} successful pages`)
+  const successfulChecksumUpdates = await updateSuccessfulChecksums(
+    supabaseClient,
+    pageTable,
+    pageInfoMap,
+    processingResult
+  )
+  console.log(
+    `Successfully updated checksums for ${successfulChecksumUpdates}/${processingResult.successfulPages.size} successful pages`
+  )

-  console.log(`Removing old pages and their sections`)
-
-  // Delete pages that have been removed (and their sections via cascade)
-  const { error: deletePageError } = await supabaseClient
-    .from('page')
-    .delete()
-    .filter('version', 'neq', refreshVersion)
-
-  if (deletePageError) {
-    throw deletePageError
-  }
+  logFailedPages(pageInfoMap, processingResult)
+
+  await purgeOldPages(supabaseClient, pageTable, refreshVersion)

   console.log('Embedding generation complete')
 }

+async function updateSuccessfulChecksums(
+  supabaseClient: SupabaseClient,
+  pageTable: string,
+  pageInfoMap: Map<number, PageInfo>,
+  processingResult: ProcessingResult
+): Promise<number> {
+  let successfulChecksumUpdates = 0
+  const pageIds = Array.from(processingResult.successfulPages)
+  const batches = createBatches(pageIds, CONFIG.SOURCE_CONCURRENCY)
+
+  for (const batch of batches) {
+    const results = await Promise.all(
+      batch.map(async (pageId) => {
+        const pageInfo = pageInfoMap.get(pageId)
+        if (!pageInfo) {
+          console.error(`Missing page info for pageId ${pageId}`)
+          return 0
+        }
+
+        try {
+          const { error: updatePageError } = await supabaseClient
+            .from(pageTable)
+            .update({ checksum: pageInfo.checksum })
+            .eq('id', pageId)
+          if (updatePageError) {
+            console.error(`Failed to update checksum for page ${pageInfo.path}:`, updatePageError)
+            return 0
+          }
+          return 1
+        } catch (error) {
+          console.error(`Error updating checksum for page ${pageInfo.path}:`, error)
+          return 0
+        }
+      })
+    )
+
+    successfulChecksumUpdates += results.reduce((sum, x) => sum + x, 0)
+  }
+
+  return successfulChecksumUpdates
+}
+
+function logFailedPages(pageInfoMap: Map<number, PageInfo>, processingResult: ProcessingResult) {
+  if (processingResult.failedPages.size === 0) return
+  console.log(`\nFailed pages:`)
+  for (const pageId of processingResult.failedPages) {
+    const pageInfo = pageInfoMap.get(pageId)
+    if (pageInfo) console.log(`  - ${pageInfo.path}`)
+  }
+}
+
+async function purgeOldPages(
+  supabaseClient: SupabaseClient,
+  pageTable: string,
+  refreshVersion: string
+) {
+  console.log(`Removing old pages and their sections`)
+  const { error: deletePageError } = await supabaseClient
+    .from(pageTable)
+    .delete()
+    .filter('version', 'neq', refreshVersion)
+  if (deletePageError) throw deletePageError
+}

 async function main() {
   await generateEmbeddings()
 }
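
The retry helper above is generic; a hypothetical standalone use (`fetchPage` is a made-up stand-in, not part of this diff):

    // Retry up to 3 times with a 500 ms base delay, but treat TypeErrors
    // (programming errors) as non-retryable.
    const page = await withRetry(
      () => fetchPage('/guides/auth'),
      3,
      500,
      'fetch page',
      (err) => !(err instanceof TypeError)
    )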
@@ -1,3 +1,4 @@
+import { type GuideModel } from '../../../resources/guide/guideModel.js'
 import { GuideModelLoader } from '../../../resources/guide/guideModelLoader.js'
 import {
   GitHubDiscussionLoader,
@@ -28,7 +29,7 @@ export type SearchSource =
 export async function fetchGuideSources() {
   const guides = (await GuideModelLoader.allFromFs()).unwrapLeft()

-  return guides.map((guide) => MarkdownLoader.fromGuideModel('guide', guide))
+  return guides.map((guide: GuideModel) => MarkdownLoader.fromGuideModel('guide', guide))
 }

 export async function fetchOpenApiReferenceSource() {
@@ -125,27 +126,29 @@ export async function fetchLintWarningsGuideSources() {
 /**
  * Fetches all the sources we want to index for search
  */
-export async function fetchAllSources() {
+export async function fetchAllSources(fullIndex: boolean) {
   const guideSources = fetchGuideSources()
   const lintWarningsGuideSources = fetchLintWarningsGuideSources()
   const openApiReferenceSource = fetchOpenApiReferenceSource()
   const jsLibReferenceSource = fetchJsLibReferenceSource()
-  const dartLibReferenceSource = fetchDartLibReferenceSource()
-  const pythonLibReferenceSource = fetchPythonLibReferenceSource()
-  const cSharpLibReferenceSource = fetchCSharpLibReferenceSource()
-  const swiftLibReferenceSource = fetchSwiftLibReferenceSource()
-  const ktLibReferenceSource = fetchKtLibReferenceSource()
-  const cliReferenceSource = fetchCliLibReferenceSource()
+  const dartLibReferenceSource = fullIndex ? fetchDartLibReferenceSource() : []
+  const pythonLibReferenceSource = fullIndex ? fetchPythonLibReferenceSource() : []
+  const cSharpLibReferenceSource = fullIndex ? fetchCSharpLibReferenceSource() : []
+  const swiftLibReferenceSource = fullIndex ? fetchSwiftLibReferenceSource() : []
+  const ktLibReferenceSource = fullIndex ? fetchKtLibReferenceSource() : []
+  const cliReferenceSource = fullIndex ? fetchCliLibReferenceSource() : []

-  const partnerIntegrationSources = fetchPartners()
-    .then((partners) =>
-      partners
-        ? Promise.all(
-            partners.map((partner) => new IntegrationLoader(partner.slug, partner).load())
-          )
-        : []
-    )
-    .then((data) => data.flat())
+  const partnerIntegrationSources = fullIndex
+    ? fetchPartners()
+        .then((partners) =>
+          partners
+            ? Promise.all(
+                partners.map((partner) => new IntegrationLoader(partner.slug, partner).load())
+              )
+            : []
+        )
+        .then((data) => data.flat())
+    : []

   const githubDiscussionSources = fetchDiscussions(
     'supabase',
@@ -820,7 +820,7 @@
     {
       "id": "supabase-postgres-config",
       "title": "Manage Postgres configurations",
-      "slug": "supabase-ssl-enforcement",
+      "slug": "supabase-postgres-config",
       "type": "cli-command"
     },
     {
@@ -46,6 +46,7 @@
       "DOCS_GITHUB_APP_PRIVATE_KEY",
       "DOCS_REVALIDATION_KEYS",
       "DOCS_REVALIDATION_OVERRIDE_KEYS",
+      "ENABLED_FEATURES_OVERRIDE_DISABLE_ALL",
      "GITHUB_ACTIONS",
       "FORCE_ASSET_CDN",
       "LOGFLARE_INGESTION_API_KEY",
@@ -13,10 +13,13 @@ interface PageSection {
   rag_ignore?: boolean
 }

+type MatchPageSectionsFunction = 'match_page_sections_v2' | 'match_page_sections_v2_nimbus'
+
 export async function clippy(
   openai: OpenAI,
   supabaseClient: SupabaseClient<any, 'public', any>,
-  messages: Message[]
+  messages: Message[],
+  options?: { useAltSearchIndex?: boolean }
 ) {
   // TODO: better sanitization
   const contextMessages = messages.map(({ role, content }) => {
@@ -63,14 +66,19 @@ export async function clippy(

   const [{ embedding }] = embeddingResponse.data

+  const searchFunction: MatchPageSectionsFunction = options?.useAltSearchIndex
+    ? 'match_page_sections_v2_nimbus'
+    : 'match_page_sections_v2'
+  const joinedTable = options?.useAltSearchIndex ? 'page_nimbus' : 'page'
+
   const { error: matchError, data: pageSections } = (await supabaseClient
-    .rpc('match_page_sections_v2', {
+    .rpc(searchFunction, {
       embedding,
       match_threshold: 0.78,
       min_content_length: 50,
     })
     .neq('rag_ignore', true)
-    .select('content,page!inner(path),rag_ignore')
+    .select(`content,${joinedTable}!inner(path),rag_ignore`)
     .limit(10)) as { error: any; data: PageSection[] | null }

   if (matchError || !pageSections) {
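
Note that the PostgREST embedded-resource name in `.select()` has to follow the table the nimbus function actually returns rows from (`page_section_nimbus` references `page_nimbus`, not `page`), which is why the select string switches together with the RPC name; condensed from the diff above:

    const joinedTable = options?.useAltSearchIndex ? 'page_nimbus' : 'page'
    const columns = `content,${joinedTable}!inner(path),rag_ignore` // passed to .select()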
@@ -294,6 +294,48 @@ export type Database = {
         }
         Relationships: []
       }
+      page_nimbus: {
+        Row: {
+          checksum: string | null
+          content: string | null
+          fts_tokens: unknown | null
+          id: number
+          last_refresh: string | null
+          meta: Json | null
+          path: string
+          source: string | null
+          title_tokens: unknown | null
+          type: string | null
+          version: string | null
+        }
+        Insert: {
+          checksum?: string | null
+          content?: string | null
+          fts_tokens?: unknown | null
+          id?: never
+          last_refresh?: string | null
+          meta?: Json | null
+          path: string
+          source?: string | null
+          title_tokens?: unknown | null
+          type?: string | null
+          version?: string | null
+        }
+        Update: {
+          checksum?: string | null
+          content?: string | null
+          fts_tokens?: unknown | null
+          id?: never
+          last_refresh?: string | null
+          meta?: Json | null
+          path?: string
+          source?: string | null
+          title_tokens?: unknown | null
+          type?: string | null
+          version?: string | null
+        }
+        Relationships: []
+      }
       page_section: {
         Row: {
           content: string | null
@@ -335,6 +377,47 @@ export type Database = {
           },
         ]
       }
+      page_section_nimbus: {
+        Row: {
+          content: string | null
+          embedding: string | null
+          heading: string | null
+          id: number
+          page_id: number
+          rag_ignore: boolean | null
+          slug: string | null
+          token_count: number | null
+        }
+        Insert: {
+          content?: string | null
+          embedding?: string | null
+          heading?: string | null
+          id?: never
+          page_id: number
+          rag_ignore?: boolean | null
+          slug?: string | null
+          token_count?: number | null
+        }
+        Update: {
+          content?: string | null
+          embedding?: string | null
+          heading?: string | null
+          id?: never
+          page_id?: number
+          rag_ignore?: boolean | null
+          slug?: string | null
+          token_count?: number | null
+        }
+        Relationships: [
+          {
+            foreignKeyName: 'page_section_nimbus_page_id_fkey'
+            columns: ['page_id']
+            isOneToOne: false
+            referencedRelation: 'page_nimbus'
+            referencedColumns: ['id']
+          },
+        ]
+      }
       tickets: {
         Row: {
           company: string | null
@@ -526,6 +609,22 @@ export type Database = {
           slugs: string[]
         }[]
       }
+      docs_search_embeddings_nimbus: {
+        Args: {
+          embedding: string
+          match_threshold: number
+        }
+        Returns: {
+          id: number
+          path: string
+          type: string
+          title: string
+          subtitle: string
+          description: string
+          headings: string[]
+          slugs: string[]
+        }[]
+      }
       docs_search_fts: {
         Args: {
           query: string
@@ -539,6 +638,19 @@ export type Database = {
           description: string
         }[]
       }
+      docs_search_fts_nimbus: {
+        Args: {
+          query: string
+        }
+        Returns: {
+          id: number
+          path: string
+          type: string
+          title: string
+          subtitle: string
+          description: string
+        }[]
+      }
       get_full_content_url: {
         Args: {
           type: string
@@ -621,6 +733,23 @@ export type Database = {
           token_count: number | null
         }[]
       }
+      match_embedding_nimbus: {
+        Args: {
+          embedding: string
+          match_threshold?: number
+          max_results?: number
+        }
+        Returns: {
+          content: string | null
+          embedding: string | null
+          heading: string | null
+          id: number
+          page_id: number
+          rag_ignore: boolean | null
+          slug: string | null
+          token_count: number | null
+        }[]
+      }
       match_page_sections_v2: {
         Args: {
           embedding: string
@@ -638,6 +767,23 @@ export type Database = {
           token_count: number | null
         }[]
       }
+      match_page_sections_v2_nimbus: {
+        Args: {
+          embedding: string
+          match_threshold: number
+          min_content_length: number
+        }
+        Returns: {
+          content: string | null
+          embedding: string | null
+          heading: string | null
+          id: number
+          page_id: number
+          rag_ignore: boolean | null
+          slug: string | null
+          token_count: number | null
+        }[]
+      }
       search_content: {
         Args: {
           embedding: string
@@ -676,6 +822,44 @@ export type Database = {
           subsections: Json[]
         }[]
       }
+      search_content_hybrid_nimbus: {
+        Args: {
+          query_text: string
+          query_embedding: string
+          max_result?: number
+          full_text_weight?: number
+          semantic_weight?: number
+          rrf_k?: number
+          match_threshold?: number
+          include_full_content?: boolean
+        }
+        Returns: {
+          id: number
+          page_title: string
+          type: string
+          href: string
+          content: string
+          metadata: Json
+          subsections: Json[]
+        }[]
+      }
+      search_content_nimbus: {
+        Args: {
+          embedding: string
+          include_full_content?: boolean
+          match_threshold?: number
+          max_result?: number
+        }
+        Returns: {
+          id: number
+          page_title: string
+          type: string
+          href: string
+          content: string
+          metadata: Json
+          subsections: Json[]
+        }[]
+      }
       update_last_changed_checksum: {
         Args: {
           new_parent_page: string
@@ -76,5 +76,7 @@
   "sdk:dart": true,
   "sdk:kotlin": true,
   "sdk:python": true,
-  "sdk:swift": true
+  "sdk:swift": true,
+
+  "search:fullIndex": true
 }

@@ -255,6 +255,11 @@
     "sdk:swift": {
       "type": "boolean",
       "description": "Enable the Swift SDK"
     },
+
+    "search:fullIndex": {
+      "type": "boolean",
+      "description": "Enable the full search index. When true, uses the full search index; when false, uses the alternate search index."
+    }
   },
   "required": [
@@ -314,7 +319,8 @@
     "sdk:dart",
     "sdk:kotlin",
     "sdk:python",
-    "sdk:swift"
+    "sdk:swift",
+    "search:fullIndex"
   ],
   "additionalProperties": false
 }
@@ -40,6 +40,15 @@ function isFeatureEnabled<T extends Feature | Feature[]>(
   features: T,
   runtimeDisabledFeatures?: Feature[]
 ) {
+  // Override is used to produce a filtered version of the docs search index
+  // using the same sync setup as our normal search index
+  if (process.env.ENABLED_FEATURES_OVERRIDE_DISABLE_ALL === 'true') {
+    if (Array.isArray(features)) {
+      return Object.fromEntries(features.map((feature) => [featureToCamelCase(feature), false]))
+    }
+    return false
+  }
+
   const disabledFeatures = new Set([
     ...(runtimeDisabledFeatures ?? []),
     ...disabledFeaturesStaticArray,
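
For reference, the override changes the return shape with the call style; assuming `featureToCamelCase` maps e.g. 'sdk:dart' to 'sdkDart', a hypothetical session with ENABLED_FEATURES_OVERRIDE_DISABLE_ALL=true:

    isFeatureEnabled('search:fullIndex')         // -> false
    isFeatureEnabled(['sdk:dart', 'sdk:python']) // -> { sdkDart: false, sdkPython: false }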
@@ -3,6 +3,8 @@
 import { compact, debounce, uniqBy } from 'lodash'
 import { useCallback, useMemo, useReducer, useRef } from 'react'

+import { isFeatureEnabled } from '../enabled-features'
+
 const NUMBER_SOURCES = 2

 const SUPABASE_URL = process.env.NEXT_PUBLIC_SUPABASE_URL
@@ -200,7 +202,10 @@ const useDocsSearch = () => {

   let sourcesLoaded = 0

-  fetch(`${SUPABASE_URL}/rest/v1/rpc/docs_search_fts`, {
+  const useAlternateSearchIndex = !isFeatureEnabled('search:fullIndex')
+
+  const searchEndpoint = useAlternateSearchIndex ? 'docs_search_fts_nimbus' : 'docs_search_fts'
+  fetch(`${SUPABASE_URL}/rest/v1/rpc/${searchEndpoint}`, {
     method: 'POST',
     headers: {
       'content-type': 'application/json',
@@ -244,7 +249,7 @@ const useDocsSearch = () => {

   fetch(`${SUPABASE_URL}${FUNCTIONS_URL}search-embeddings`, {
     method: 'POST',
-    body: JSON.stringify({ query }),
+    body: JSON.stringify({ query, useAlternateSearchIndex }),
   })
     .then((response) => response.json())
     .then((results) => {
@@ -38,7 +38,7 @@ Deno.serve(async (req) => {
     throw new UserError('Missing request data')
   }

-  const { query } = requestData
+  const { query, useAlternateSearchIndex } = requestData

   if (!query) {
     throw new UserError('Missing query in request data')
@@ -76,7 +76,11 @@ Deno.serve(async (req) => {
   }

   const [{ embedding }] = embeddingResponse.data.data
-  const { error: matchError, data: pages } = await supabaseClient.rpc('docs_search_embeddings', {
+
+  const searchFunction = useAlternateSearchIndex
+    ? 'docs_search_embeddings_nimbus'
+    : 'docs_search_embeddings'
+  const { error: matchError, data: pages } = await supabaseClient.rpc(searchFunction, {
     embedding,
     match_threshold: 0.78,
   })
@@ -0,0 +1,325 @@
-- Create nimbus tables for feature-flag-filtered search
-- These tables mirror the structure of page and page_section but contain only content
-- that should be visible when all feature flags are disabled

create table "public"."page_nimbus" (
  id bigint primary key generated always as identity,
  path text not null unique,
  checksum text,
  meta jsonb,
  type text,
  source text,
  content text,
  version uuid,
  last_refresh timestamptz,
  fts_tokens tsvector generated always as (to_tsvector('english', content)) stored,
  title_tokens tsvector generated always as (to_tsvector('english', coalesce(meta ->> 'title', ''))) stored
);

alter table "public"."page_nimbus"
  enable row level security;

create policy "anon can read page_nimbus"
  on public.page_nimbus
  for select
  to anon
  using (true);

create policy "authenticated can read page_nimbus"
  on public.page_nimbus
  for select
  to authenticated
  using (true);

create table "public"."page_section_nimbus" (
  id bigint primary key generated always as identity,
  page_id bigint not null references public.page_nimbus (id) on delete cascade,
  content text,
  token_count int,
  embedding vector(1536),
  slug text,
  heading text,
  rag_ignore boolean default false
);

alter table "public"."page_section_nimbus"
  enable row level security;

create policy "anon can read page_section_nimbus"
  on public.page_section_nimbus
  for select
  to anon
  using (true);

create policy "authenticated can read page_section_nimbus"
  on public.page_section_nimbus
  for select
  to authenticated
  using (true);

-- Create indexes for nimbus tables (matching the regular tables)
create index fts_search_index_content_nimbus
  on page_nimbus
  using gin(fts_tokens);

create index fts_search_index_title_nimbus
  on page_nimbus
  using gin(title_tokens);

-- Create search function for nimbus tables (FTS search)
create or replace function docs_search_fts_nimbus(query text)
returns table (
  id bigint,
  path text,
  type text,
  title text,
  subtitle text,
  description text
)
set search_path = ''
language plpgsql
as $$
  #variable_conflict use_variable
  begin
    return query
    select
      page_nimbus.id,
      page_nimbus.path,
      page_nimbus.type,
      page_nimbus.meta ->> 'title' as title,
      page_nimbus.meta ->> 'subtitle' as subtitle,
      page_nimbus.meta ->> 'description' as description
    from public.page_nimbus
    where title_tokens @@ websearch_to_tsquery(query) or fts_tokens @@ websearch_to_tsquery(query)
    order by greatest(
      -- Title is more important than body, so use 10 as the weighting factor
      -- Cut off at max rank of 1
      least(10 * ts_rank(title_tokens, websearch_to_tsquery(query)), 1),
      ts_rank(fts_tokens, websearch_to_tsquery(query))
    ) desc
    limit 10;
  end;
$$;

-- Create embedding matching function for nimbus tables
create or replace function match_embedding_nimbus(
  embedding vector(1536),
  match_threshold float default 0.78,
  max_results int default 30
)
returns setof public.page_section_nimbus
set search_path = ''
language plpgsql
as $$
  #variable_conflict use_variable
  begin
    return query
    select *
    from public.page_section_nimbus
    where (page_section_nimbus.embedding operator(public.<#>) embedding) <= -match_threshold
    order by page_section_nimbus.embedding operator(public.<#>) embedding
    limit max_results;
  end;
$$;

-- Create hybrid search function for nimbus tables
create or replace function search_content_hybrid_nimbus(
  query_text text,
  query_embedding vector(1536),
  max_result int default 30,
  full_text_weight float default 1,
  semantic_weight float default 1,
  rrf_k int default 50,
  match_threshold float default 0.78,
  include_full_content boolean default false
)
returns table (
  id bigint,
  page_title text,
  type text,
  href text,
  content text,
  metadata json,
  subsections json[]
)
language sql
set search_path = ''
as $$
with full_text as (
  select
    id,
    row_number() over(order by greatest(
      least(10 * ts_rank(title_tokens, websearch_to_tsquery(query_text)), 1),
      ts_rank(fts_tokens, websearch_to_tsquery(query_text))
    ) desc) as rank_ix
  from public.page_nimbus
  where title_tokens @@ websearch_to_tsquery(query_text) or fts_tokens @@ websearch_to_tsquery(query_text)
  order by rank_ix
  limit least(max_result, 30) * 2
),
semantic as (
  select
    page_id as id,
    row_number() over () as rank_ix
  from public.match_embedding_nimbus(query_embedding, match_threshold, max_result * 2)
),
rrf as (
  select
    coalesce(full_text.id, semantic.id) as id,
    coalesce(1.0 / (rrf_k + full_text.rank_ix), 0.0) * full_text_weight +
    coalesce(1.0 / (rrf_k + semantic.rank_ix), 0.0) * semantic_weight as rrf_score
  from full_text
  full outer join semantic on full_text.id = semantic.id
)
select
  page_nimbus.id,
  page_nimbus.meta ->> 'title' as page_title,
  page_nimbus.type,
  public.get_full_content_url(page_nimbus.type, page_nimbus.path, null) as href,
  case when include_full_content then page_nimbus.content else null end as content,
  page_nimbus.meta as metadata,
  array_agg(json_build_object(
    'title', page_section_nimbus.heading,
    'href', public.get_full_content_url(page_nimbus.type, page_nimbus.path, page_section_nimbus.slug),
    'content', page_section_nimbus.content
  )) as subsections
from rrf
join public.page_nimbus on page_nimbus.id = rrf.id
left join public.page_section_nimbus on page_section_nimbus.page_id = page_nimbus.id
where rrf.rrf_score > 0
group by page_nimbus.id
order by max(rrf.rrf_score) desc
limit max_result;
$$;
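-- For reference, the score computed in the rrf CTE above is standard
-- reciprocal rank fusion:
--   rrf_score = full_text_weight / (rrf_k + rank_fts)
--             + semantic_weight / (rrf_k + rank_semantic)
-- With the defaults (both weights 1, rrf_k = 50), a page ranked first by
-- both signals scores 2/51 ~ 0.039; a page found by only one signal at
-- rank 1 scores 1/51 ~ 0.0196.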
create or replace function match_page_sections_v2_nimbus(
  embedding vector(1536),
  match_threshold float,
  min_content_length int
)
returns setof page_section_nimbus
set search_path = ''
language plpgsql
as $$
  #variable_conflict use_variable
  begin
    return query
    select *
    from public.page_section_nimbus

    -- We only care about sections that have a useful amount of content
    where length(page_section_nimbus.content) >= min_content_length

    -- The dot product is negative because of a Postgres limitation, so we negate it
    and (page_section_nimbus.embedding operator(public.<#>) embedding) * -1 > match_threshold

    -- OpenAI embeddings are normalized to length 1, so
    -- cosine similarity and dot product will produce the same results.
    -- Using dot product which can be computed slightly faster.
    --
    -- For the different syntaxes, see https://github.com/pgvector/pgvector
    order by page_section_nimbus.embedding operator(public.<#>) embedding;
  end;
$$;

create or replace function docs_search_embeddings_nimbus(
  embedding vector(1536),
  match_threshold float
)
returns table (
  id bigint,
  path text,
  type text,
  title text,
  subtitle text,
  description text,
  headings text[],
  slugs text[]
)
set search_path = ''
language plpgsql
as $$
  #variable_conflict use_variable
  begin
    return query
    with match as (
      select *
      from public.page_section_nimbus
      -- The dot product is negative because of a Postgres limitation, so we negate it
      where (page_section_nimbus.embedding operator(public.<#>) embedding) * -1 > match_threshold
      -- OpenAI embeddings are normalized to length 1, so
      -- cosine similarity and dot product will produce the same results.
      -- Using dot product which can be computed slightly faster.
      --
      -- For the different syntaxes, see https://github.com/pgvector/pgvector
      order by page_section_nimbus.embedding operator(public.<#>) embedding
      limit 10
    )
    select
      page_nimbus.id,
      page_nimbus.path,
      page_nimbus.type,
      page_nimbus.meta ->> 'title' as title,
      page_nimbus.meta ->> 'subtitle' as subtitle,
      page_nimbus.meta ->> 'description' as description,
      array_agg(match.heading) as headings,
      array_agg(match.slug) as slugs
    from public.page_nimbus
    join match on match.page_id = page_nimbus.id
    group by page_nimbus.id;
  end;
$$;

create or replace function search_content_nimbus(
  embedding vector(1536),
  include_full_content boolean default false,
  match_threshold float default 0.78,
  max_result int default 30
)
returns table (
  id bigint,
  page_title text,
  type text,
  href text,
  content text,
  metadata json,
  subsections json[]
)
set search_path = ''
language sql
as $$
with matched_section as (
  select
    *,
    row_number() over () as ranking
  from public.match_embedding_nimbus(
    embedding,
    match_threshold,
    max_result
  )
)
select
  page_nimbus.id,
  meta ->> 'title' as page_title,
  type,
  public.get_full_content_url(type, path, null) as href,
  case
    when include_full_content
    then page_nimbus.content
    else
      null
  end as content,
  meta as metadata,
  array_agg(
    json_build_object(
      'title', heading,
      'href', public.get_full_content_url(type, path, slug),
      'content', matched_section.content
    )
  ) as subsections
from matched_section
join public.page_nimbus on matched_section.page_id = page_nimbus.id
group by page_nimbus.id
order by min(ranking);
$$;