From 8cd5e100387d8f3c360553f71c5234e1786690f5 Mon Sep 17 00:00:00 2001
From: Charis <26616127+charislam@users.noreply.github.com>
Date: Tue, 16 Sep 2025 12:37:53 -0400
Subject: [PATCH] feat: alternate search index for nimbus (#38662)

* feat: alternate search index for nimbus

Create an alternate search index for Nimbus that filters out
feature-flagged pages (equivalent to setting all feature flags to false).

Notes:

- Creates two new DB tables, `page_nimbus` and `page_section_nimbus`,
  which are filtered versions of `page` and `page_section`
- Makes `nimbus` versions of all the DB search functions
- Refactors the embedding upload script to make it faster (it was
  painfully slow to test against), retry failed batches, and produce
  better summary logs
- When run with the environment variable
  ENABLED_FEATURES_OVERRIDE_DISABLE_ALL, the upload script produces and
  uploads the alternate search index
- Changes all the search calls in frontend/API to check
  `isFeatureEnabled('search:fullIndex')` to determine whether to search
  the full or alternate index

* ci: produce nimbus search indexes on merge

* fix: turn full search index on
---
 .github/workflows/search.yml                  |   6 +-
 apps/docs/app/api/ai/docs/route.ts            |   7 +-
 apps/docs/package.json                        |   2 +
 .../globalSearch/globalSearchModel.ts         |  19 +-
 apps/docs/scripts/search/embeddings/utils.ts  | 106 +++
 .../scripts/search/generate-embeddings.ts     | 744 +++++++++++++-----
 apps/docs/scripts/search/sources/index.ts     |  37 +-
 apps/docs/spec/common-cli-sections.json       |   2 +-
 apps/docs/turbo.json                          |   1 +
 packages/ai-commands/src/docs.ts              |  14 +-
 packages/common/database-types.ts             | 184 +++++
 .../enabled-features/enabled-features.json    |   4 +-
 .../enabled-features.schema.json              |   8 +-
 packages/common/enabled-features/index.ts     |   9 +
 packages/common/hooks/useDocsSearch.ts        |   9 +-
 supabase/functions/search-embeddings/index.ts |   8 +-
 ...0910155912_create_nimbus_search_tables.sql | 325 ++++++++
 17 files changed, 1256 insertions(+), 229 deletions(-)
 create mode 100644 apps/docs/scripts/search/embeddings/utils.ts
 create mode 100644 supabase/migrations/20250910155912_create_nimbus_search_tables.sql

diff --git a/.github/workflows/search.yml b/.github/workflows/search.yml
index ade8c38a93..cdafa84f2b 100644
--- a/.github/workflows/search.yml
+++ b/.github/workflows/search.yml
@@ -68,8 +68,12 @@ jobs:
         run: |
           pnpm run codegen:examples
           pnpm run embeddings
+          pnpm run embeddings:nimbus
 
       - name: Refresh embeddings
         working-directory: ./apps/docs
         if: ${{ inputs.refresh }}
-        run: pnpm run embeddings:refresh
+        run: |
+          pnpm run codegen:examples
+          pnpm run embeddings:refresh
+          pnpm run embeddings:nimbus:refresh
diff --git a/apps/docs/app/api/ai/docs/route.ts b/apps/docs/app/api/ai/docs/route.ts
index b88d7837f8..8e3df4206a 100644
--- a/apps/docs/app/api/ai/docs/route.ts
+++ b/apps/docs/app/api/ai/docs/route.ts
@@ -3,6 +3,8 @@ import { ApplicationError, UserError, clippy } from 'ai-commands/edge'
 import { NextRequest, NextResponse } from 'next/server'
 import OpenAI from 'openai'
 
+import { isFeatureEnabled } from 'common/enabled-features'
+
 export const runtime = 'edge'
 
 /* To avoid OpenAI errors, restrict to the Vercel Edge Function regions that overlap with the OpenAI API regions.
@@ -54,7 +56,10 @@ export async function POST(req: NextRequest) { throw new UserError('Missing messages in request data') } - const response = await clippy(openai, supabaseClient, messages) + const useAltSearchIndex = !isFeatureEnabled('search:fullIndex') + const response = await clippy(openai, supabaseClient, messages, { + useAltSearchIndex, + }) // Proxy the streamed SSE response from OpenAI return new NextResponse(response.body, { diff --git a/apps/docs/package.json b/apps/docs/package.json index fbf1faf4ba..5f9eb55ac1 100644 --- a/apps/docs/package.json +++ b/apps/docs/package.json @@ -17,7 +17,9 @@ "dev:secrets:pull": "AWS_PROFILE=supa-dev node ../../scripts/getSecrets.js -n local/docs", "dev:watch:troubleshooting": "node ./scripts/troubleshooting/watch.mjs", "embeddings": "tsx --conditions=react-server scripts/search/generate-embeddings.ts", + "embeddings:nimbus": "ENABLED_FEATURES_OVERRIDE_DISABLE_ALL=true pnpm run embeddings", "embeddings:refresh": "pnpm run embeddings --refresh", + "embeddings:nimbus:refresh": "ENABLED_FEATURES_OVERRIDE_DISABLE_ALL=true pnpm run embeddings:refresh", "last-changed": "tsx scripts/last-changed.ts", "last-changed:reset": "pnpm run last-changed -- --reset", "lint": "next lint", diff --git a/apps/docs/resources/globalSearch/globalSearchModel.ts b/apps/docs/resources/globalSearch/globalSearchModel.ts index c6243d530c..8b46f5d870 100644 --- a/apps/docs/resources/globalSearch/globalSearchModel.ts +++ b/apps/docs/resources/globalSearch/globalSearchModel.ts @@ -3,6 +3,8 @@ import { convertPostgrestToApiError, type ApiErrorGeneric } from '~/app/api/util import { Result } from '~/features/helpers.fn' import { openAI } from '~/lib/openAi' import { supabase, type DatabaseCorrected } from '~/lib/supabase' + +import { isFeatureEnabled } from 'common/enabled-features' import { GuideModel } from '../guide/guideModel' import { DB_METADATA_TAG_PLATFORM_CLI, @@ -13,6 +15,9 @@ import { ReferenceSDKFunctionModel, SDKLanguageValues } from '../reference/refer import { TroubleshootingModel } from '../troubleshooting/troubleshootingModel' import { SearchResultInterface } from './globalSearchInterface' +type SearchFunction = 'search_content' | 'search_content_nimbus' +type SearchHybridFunction = 'search_content_hybrid' | 'search_content_hybrid_nimbus' + export abstract class SearchResultModel { static async search( args: RootQueryTypeSearchDocsArgs, @@ -22,9 +27,14 @@ export abstract class SearchResultModel { const includeFullContent = requestedFields.includes('content') const embeddingResult = await openAI().createContentEmbedding(query) + const useAltSearchIndex = !isFeatureEnabled('search:fullIndex') + const searchFunction: SearchFunction = useAltSearchIndex + ? 'search_content_nimbus' + : 'search_content' + return embeddingResult.flatMapAsync(async ({ embedding }) => { const matchResult = new Result( - await supabase().rpc('search_content', { + await supabase().rpc(searchFunction, { embedding, include_full_content: includeFullContent, max_result: args.limit ?? undefined, @@ -49,9 +59,14 @@ export abstract class SearchResultModel { const includeFullContent = requestedFields.includes('content') const embeddingResult = await openAI().createContentEmbedding(query) + const useAltSearchIndex = !isFeatureEnabled('search:fullIndex') + const searchFunction: SearchHybridFunction = useAltSearchIndex + ? 
'search_content_hybrid_nimbus'
+      : 'search_content_hybrid'
+
     return embeddingResult.flatMapAsync(async ({ embedding }) => {
       const matchResult = new Result(
-        await supabase().rpc('search_content_hybrid', {
+        await supabase().rpc(searchFunction, {
           query_text: query,
           query_embedding: embedding,
           include_full_content: includeFullContent,
diff --git a/apps/docs/scripts/search/embeddings/utils.ts b/apps/docs/scripts/search/embeddings/utils.ts
new file mode 100644
index 0000000000..63f8578d2b
--- /dev/null
+++ b/apps/docs/scripts/search/embeddings/utils.ts
@@ -0,0 +1,106 @@
+export interface PageInfo {
+  pageId: number
+  path: string
+  checksum: string
+  sectionsCount: number
+}
+
+export interface PageSectionForEmbedding {
+  pageId: number
+  path: string
+  slug?: string
+  heading?: string
+  content: string
+  input: string
+  ragIgnore: boolean
+}
+
+export interface PageSectionWithEmbedding extends PageSectionForEmbedding {
+  embedding: number[]
+}
+
+export interface ProcessingResult {
+  successfulPages: Set<number>
+  failedPages: Set<number>
+  totalSectionsProcessed: number
+  totalSectionsInserted: number
+}
+
+export function createBatches<T>(array: T[], batchSize: number): T[][] {
+  const batches: T[][] = []
+  for (let i = 0; i < array.length; i += batchSize) {
+    batches.push(array.slice(i, i + batchSize))
+  }
+  return batches
+}
+
+export function mapEmbeddingsToSections(
+  batch: PageSectionForEmbedding[],
+  data: Array<{ embedding?: number[] }>,
+  batchNumber: number
+): {
+  sectionsWithEmbeddings: PageSectionWithEmbedding[]
+  failedSectionIndexes: Set<number>
+} {
+  const sectionsWithEmbeddings: PageSectionWithEmbedding[] = []
+  const failedSectionIndexes: Set<number> = new Set()
+
+  if (batch.length !== data.length) {
+    console.error(
+      `Ignoring all embeddings returned from batch ${batchNumber} because returned number doesn't match input number`
+    )
+    batch.forEach((_, index) => {
+      failedSectionIndexes.add(index)
+    })
+    // Bail out early: indexing into `data` below would be unsafe here
+    return { sectionsWithEmbeddings, failedSectionIndexes }
+  }
+
+  for (let i = 0; i < batch.length; i++) {
+    if (data[i].embedding) {
+      sectionsWithEmbeddings.push({ ...batch[i], embedding: data[i].embedding! })
+    } else {
+      failedSectionIndexes.add(i)
+    }
+  }
+
+  return { sectionsWithEmbeddings, failedSectionIndexes }
+}
+
+export function updatePageInsertionCounts(
+  pageSectionsInserted: Map<number, number>,
+  sectionsWithEmbeddings: PageSectionWithEmbedding[]
+) {
+  sectionsWithEmbeddings.forEach((section) => {
+    const current = pageSectionsInserted.get(section.pageId) || 0
+    pageSectionsInserted.set(section.pageId, current + 1)
+  })
+}
+
+export function computePageResults(
+  pageInfoMap: Map<number, PageInfo>,
+  pageSectionsInserted: Map<number, number>,
+  result: ProcessingResult
+) {
+  for (const [pageId, pageInfo] of pageInfoMap) {
+    const insertedCount = pageSectionsInserted.get(pageId) || 0
+    if (insertedCount === pageInfo.sectionsCount && !result.failedPages.has(pageId)) {
+      result.successfulPages.add(pageId)
+    } else {
+      result.failedPages.add(pageId)
+      console.warn(
+        `Page ${pageInfo.path}: inserted ${insertedCount}/${pageInfo.sectionsCount} sections`
+      )
+    }
+  }
+}
+
+export function logFailedSections(
+  batch: PageSectionForEmbedding[],
+  inputs: string[],
+  failedSectionIndexes: Set<number>
+) {
+  failedSectionIndexes.forEach((i) => {
+    console.error(
+      `Failed to process section: ${batch[i].path}#${batch[i].slug} (content: "${inputs[i]?.slice(0, 50)}...")`
+    )
+  })
+}
diff --git a/apps/docs/scripts/search/generate-embeddings.ts b/apps/docs/scripts/search/generate-embeddings.ts
index d8a7bfdf90..a5abac6401 100644
--- a/apps/docs/scripts/search/generate-embeddings.ts
+++ b/apps/docs/scripts/search/generate-embeddings.ts
@@ -1,24 +1,453 @@
 import '../utils/dotenv.js'
 
-import { createClient } from '@supabase/supabase-js'
+import { createClient, type SupabaseClient } from '@supabase/supabase-js'
 import { parseArgs } from 'node:util'
 import { OpenAI } from 'openai'
 import { v4 as uuidv4 } from 'uuid'
+
 import type { Section } from '../helpers.mdx.js'
+import {
+  type PageInfo,
+  type PageSectionForEmbedding,
+  type PageSectionWithEmbedding,
+  type ProcessingResult,
+  createBatches,
+  mapEmbeddingsToSections,
+  updatePageInsertionCounts,
+  computePageResults,
+  logFailedSections,
+} from './embeddings/utils.js'
 import { fetchAllSources } from './sources/index.js'
 
+const CONFIG = {
+  // OpenAI settings
+  EMBEDDING_MODEL: 'text-embedding-ada-002' as const,
+  EMBEDDING_DIMENSION: 1536, // Keep in sync with EMBEDDING_MODEL
+  OPENAI_BATCH_SIZE: 128,
+  OPENAI_MAX_RETRIES: 3,
+  OPENAI_BASE_DELAY_MS: 500,
+  /**
+   * If context length is exceeded, truncate inputs over this character length
+   * and retry. This is a character-based heuristic, not token-exact.
+   */
+  EMBEDDING_TRUNCATE_CHAR_LIMIT: 16_000,
+
+  // Supabase settings
+  SUPABASE_MAX_RETRIES: 2,
+  SUPABASE_BASE_DELAY_MS: 100,
+
+  // Processing settings
+  SOURCE_CONCURRENCY: 10,
+} as const
+
+function delay(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms))
+}
+
+function exponentialBackoff(attempt: number, baseDelay: number, maxDelay: number = 30_000): number {
+  const exponentialDelay = baseDelay * Math.pow(2, attempt)
+  const jitter = (Math.random() - 0.5) * 0.1 * exponentialDelay
+  return Math.min(Math.max(0, exponentialDelay + jitter), maxDelay)
+}
+
+async function withRetry<T>(
+  operation: () => Promise<T>,
+  maxRetries: number,
+  baseDelay: number,
+  operationName: string,
+  shouldRetryOnError: (error: unknown) => boolean = () => true
+): Promise<T> {
+  let lastError: Error
+
+  for (let attempt = 0; attempt <= maxRetries; attempt++) {
+    try {
+      return await operation()
+    } catch (error) {
+      lastError = error as Error
+
+      // Allow caller to prevent redundant retries for specific errors
+      if (!shouldRetryOnError(error)) {
+        console.warn(`${operationName} encountered non-retryable error:`, lastError.message)
+        throw lastError
+      }
+
+      if (attempt === maxRetries) {
+        console.error(`${operationName} failed after ${maxRetries + 1} attempts:`, lastError)
+        throw lastError
+      }
+
+      const delayMs = exponentialBackoff(attempt, baseDelay)
+      console.warn(
+        `${operationName} attempt ${attempt + 1} failed, retrying in ${delayMs}ms:`,
+        lastError.message
+      )
+      await delay(delayMs)
+    }
+  }
+
+  throw lastError!
+}
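+
+// Example (illustrative only): retry a flaky operation up to 3 times with a
+// 500 ms starting backoff, skipping retries for errors the caller knows are
+// permanent. `fetchPage` and `isNotFound` are hypothetical helpers, not part
+// of this script:
+//   await withRetry(() => fetchPage(url), 3, 500, 'fetch page', (err) => !isNotFound(err))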
+
+function isNimbusMode(): boolean {
+  return process.env.ENABLED_FEATURES_OVERRIDE_DISABLE_ALL === 'true'
+}
+
+function getPageTables() {
+  const nimbus = isNimbusMode()
+  return {
+    pageTable: nimbus ? 'page_nimbus' : 'page',
+    pageSectionTable: nimbus ? 'page_section_nimbus' : 'page_section',
+  } as const
+}
+
+function requireEnvOrThrow(names: string[]): void {
+  const missing = names.filter((n) => !process.env[n])
+  if (missing.length) {
+    throw new Error(
+      `Environment variables ${missing.join(', ')} are required: skipping embeddings generation`
+    )
+  }
+}
+
+function initSupabase(): SupabaseClient {
+  return createClient(process.env.NEXT_PUBLIC_SUPABASE_URL!, process.env.SUPABASE_SECRET_KEY!, {
+    auth: { persistSession: false, autoRefreshToken: false },
+  })
+}
+
+type PreparedSections = {
+  allSectionsToProcess: PageSectionForEmbedding[]
+  pageInfoMap: Map<number, PageInfo>
+}
+
+async function prepareSections(
+  supabaseClient: SupabaseClient,
+  pageTable: string,
+  pageSectionTable: string,
+  shouldRefresh: boolean,
+  refreshVersion: string,
+  refreshDate: Date,
+  fullIndex = true,
+  debug = false
+): Promise<PreparedSections> {
+  const embeddingSources = await fetchAllSources(fullIndex)
+  console.log(`Discovered ${embeddingSources.length} sources`)
+
+  const allSectionsToProcess: PageSectionForEmbedding[] = []
+  const pageInfoMap = new Map<number, PageInfo>()
+
+  for (const sourceBatch of createBatches(embeddingSources, CONFIG.SOURCE_CONCURRENCY)) {
+    await Promise.all(
+      sourceBatch.map(async (embeddingSource) => {
+        const { type, source, path } = embeddingSource
+
+        try {
+          const {
+            checksum,
+            sections,
+            meta = {},
+            ragIgnore = false,
+          }: {
+            checksum: string
+            sections: Section[]
+            ragIgnore?: boolean
+            meta?: Record<string, unknown>
+          } = await embeddingSource.process()
+
+          const { error: fetchPageError, data: existingPage } = await supabaseClient
+            .from(pageTable)
+            .select('id, path, checksum')
+            .filter('path', 'eq', path)
+            .limit(1)
+            .maybeSingle()
+
+          if (fetchPageError) throw fetchPageError
+
+          if (!shouldRefresh && existingPage?.checksum === checksum) {
+            const { error: updatePageError } = await supabaseClient
+              .from(pageTable)
+              .update({
+                type,
+                source,
+                meta,
+                version: refreshVersion,
+                last_refresh: refreshDate,
+              })
+              .filter('id', 'eq', existingPage.id)
+            if (updatePageError) throw updatePageError
+            return
+          }
+
+          if (existingPage) {
+            if (debug) {
+              console.log(
+                !shouldRefresh
+                  ? `[${path}] Docs have changed, removing old page sections and their embeddings`
+                  : `[${path}] Refresh flag set, removing old page sections and their embeddings`
+              )
+            }
+
+            const { error: deletePageSectionError } = await supabaseClient
+              .from(pageSectionTable)
+              .delete()
+              .filter('page_id', 'eq', existingPage.id)
+            if (deletePageSectionError) throw deletePageSectionError
+          }
+
+          const { error: upsertPageError, data: page } = await supabaseClient
+            .from(pageTable)
+            .upsert(
+              {
+                checksum: null,
+                path,
+                type,
+                source,
+                meta,
+                content: embeddingSource.extractIndexedContent(),
+                version: refreshVersion,
+                last_refresh: refreshDate,
+              },
+              { onConflict: 'path' }
+            )
+            .select()
+            .limit(1)
+            .single()
+          if (upsertPageError) throw upsertPageError
+
+          if (debug) {
+            console.log(`[${path}] Preparing ${sections.length} page sections for processing`)
+          }
+
+          pageInfoMap.set(page.id, {
+            pageId: page.id,
+            path,
+            checksum,
+            sectionsCount: sections.length,
+          })
+
+          const sectionsForBatching = sections.map(({ slug, heading, content }) => ({
+            pageId: page.id,
+            path,
+            slug,
+            heading,
+            content,
+            input: content.replace(/\n/g, ' '),
+            ragIgnore,
+          }))
+          allSectionsToProcess.push(...sectionsForBatching)
+        } catch (err) {
+          console.error(`Error preparing path '${path}' for processing.`)
+          console.error(err)
+        }
+      })
+    )
+  }
+
+  console.log(
+    `Prepared ${allSectionsToProcess.length} sections for processing from ${pageInfoMap.size} pages`
+  )
+  return { allSectionsToProcess, pageInfoMap }
+}
+
+async function processAndInsertEmbeddings(
+  openai: OpenAI,
+  supabaseClient: SupabaseClient,
+  pageSectionTable: string,
+  allSections: PageSectionForEmbedding[],
+  pageInfoMap: Map<number, PageInfo>
+): Promise<ProcessingResult> {
+  if (allSections.length === 0) {
+    return {
+      successfulPages: new Set(),
+      failedPages: new Set(),
+      totalSectionsProcessed: 0,
+      totalSectionsInserted: 0,
+    }
+  }
+
+  console.log(`Processing ${allSections.length} sections with embeddings + insertion`)
+
+  const embeddingBatches = createBatches(allSections, CONFIG.OPENAI_BATCH_SIZE)
+  const result: ProcessingResult = {
+    successfulPages: new Set(),
+    failedPages: new Set(),
+    totalSectionsProcessed: 0,
+    totalSectionsInserted: 0,
+  }
+
+  // Track sections inserted per page
+  const pageSectionsInserted = new Map<number, number>()
+
+  for (let batchIndex = 0; batchIndex < embeddingBatches.length; batchIndex++) {
+    const batch = embeddingBatches[batchIndex]
+    try {
+      const batchResult = await processEmbeddingBatch(
+        openai,
+        batch,
+        batchIndex,
+        embeddingBatches.length
+      )
+
+      result.totalSectionsProcessed += batchResult.processedCount
+
+      if (batchResult.sectionsWithEmbeddings.length > 0) {
+        const insertedCount = await insertSectionBatch(
+          supabaseClient,
+          pageSectionTable,
+          batchResult.sectionsWithEmbeddings
+        )
+        result.totalSectionsInserted += insertedCount
+        updatePageInsertionCounts(pageSectionsInserted, batchResult.sectionsWithEmbeddings)
+      }
+
+      // Mark failed section pages
+      batchResult.failedSectionIndexes.forEach((i) => {
+        result.failedPages.add(batch[i].pageId)
+      })
+    } catch (error) {
+      console.error(`Batch ${batchIndex + 1} completely failed:`, error)
+      batch.forEach((section) => result.failedPages.add(section.pageId))
+    }
+
+    if (batchIndex < embeddingBatches.length - 1) {
+      await delay(CONFIG.OPENAI_BASE_DELAY_MS)
+    }
+  }
+
+  computePageResults(pageInfoMap, pageSectionsInserted, result)
+
+  return result
+}
+
+type BatchEmbeddingResult = {
+  sectionsWithEmbeddings: PageSectionWithEmbedding[]
+  failedSectionIndexes: Set<number>
+  processedCount: number
+}
+
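+/**
+ * Generates embeddings for one batch of page sections. Transient OpenAI errors
+ * are retried with exponential backoff; if the API reports that the context
+ * length was exceeded, overly long inputs are truncated once (a character-based
+ * heuristic) and the batch is retried a single time without further backoff.
+ */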
+async function processEmbeddingBatch(
+  openai: OpenAI,
+  batch: PageSectionForEmbedding[],
+  batchIndex: number,
+  totalBatches: number
+): Promise<BatchEmbeddingResult> {
+  const inputs = batch.map((section) => section.input)
+  console.log(
+    `Processing embedding batch ${batchIndex + 1}/${totalBatches} (${inputs.length} sections)`
+  )
+
+  // Helper to identify context length exceeded errors from OpenAI
+  const isContextLengthError = (err: unknown) => {
+    if (!(err instanceof OpenAI.APIError)) return false
+
+    const message = (err.error as { message?: unknown } | undefined)?.message
+    const status = err.status
+    return status === 400 && typeof message === 'string' && message.toLowerCase().includes('context')
+  }
+
+  let embeddingResponse: OpenAI.Embeddings.CreateEmbeddingResponse
+  try {
+    embeddingResponse = await withRetry(
+      () =>
+        openai.embeddings.create({
+          model: CONFIG.EMBEDDING_MODEL,
+          input: inputs,
+        }),
+      CONFIG.OPENAI_MAX_RETRIES,
+      CONFIG.OPENAI_BASE_DELAY_MS,
+      `OpenAI embedding batch ${batchIndex + 1}`,
+      (err) => !isContextLengthError(err)
+    )
+  } catch (err) {
+    if (!isContextLengthError(err)) {
+      throw err
+    }
+
+    // Context length exceeded: truncate problematic sections and try once more
+    const limit = CONFIG.EMBEDDING_TRUNCATE_CHAR_LIMIT
+    const truncatedInputs = inputs.map((s) => (s.length > limit ? s.slice(0, limit) : s))
+    const truncatedCount = truncatedInputs.filter((s, i) => s !== inputs[i]).length
+    console.warn(
+      `OpenAI embedding batch ${batchIndex + 1}: context length exceeded. ` +
+        `Truncating ${truncatedCount} overly long section(s) to ${limit} chars and retrying once.`
+    )
+
+    embeddingResponse = await openai.embeddings.create({
+      model: CONFIG.EMBEDDING_MODEL,
+      input: truncatedInputs,
+    })
+
+    // Replace inputs with truncated inputs for downstream bookkeeping
+    for (let i = 0; i < inputs.length; i++) inputs[i] = truncatedInputs[i]
+  }
+
+  const { sectionsWithEmbeddings, failedSectionIndexes } = mapEmbeddingsToSections(
+    batch,
+    embeddingResponse.data,
+    batchIndex
+  )
+  logFailedSections(batch, inputs, failedSectionIndexes)
+
+  return {
+    sectionsWithEmbeddings,
+    failedSectionIndexes,
+    processedCount: inputs.length,
+  }
+}
+
+async function insertSectionBatch(
+  supabaseClient: SupabaseClient,
+  pageSectionTable: string,
+  sectionsWithEmbeddings: PageSectionWithEmbedding[]
+): Promise<number> {
+  if (sectionsWithEmbeddings.length === 0) {
+    return 0
+  }
+
+  const pageSectionsToInsert = sectionsWithEmbeddings.map((section) => ({
+    page_id: section.pageId,
+    slug: section.slug,
+    heading: section.heading,
+    content: section.content,
+    embedding: section.embedding,
+    rag_ignore: section.ragIgnore,
+  }))
+
+  await withRetry(
+    async () => {
+      const { error } = await supabaseClient.from(pageSectionTable).insert(pageSectionsToInsert)
+
+      if (error) {
+        throw new Error(`Supabase insert error: ${error.message}`)
+      }
+    },
+    CONFIG.SUPABASE_MAX_RETRIES,
+    CONFIG.SUPABASE_BASE_DELAY_MS,
+    `Insert batch of ${sectionsWithEmbeddings.length} sections`
+  )
+
+  return sectionsWithEmbeddings.length
+}
+
 const args = parseArgs({
   options: {
     refresh: {
       type: 'boolean',
     },
+    debug: {
+      type: 'boolean',
+    },
   },
 })
 
 async function generateEmbeddings() {
   const shouldRefresh = Boolean(args.values.refresh)
+  const debug = Boolean(args.values.debug)
 
-  const requiredEnvVars = [
+  const nimbus = isNimbusMode()
+  if (nimbus) {
+    console.log('Running in Nimbus mode - will filter content based on disabled feature flags')
+  }
+
+  requireEnvOrThrow([
     'DOCS_GITHUB_APP_ID',
     'DOCS_GITHUB_APP_INSTALLATION_ID',
     'DOCS_GITHUB_APP_PRIVATE_KEY',
@@ -27,217 +456,136 @@ async function generateEmbeddings() {
   'NEXT_PUBLIC_SUPABASE_URL',
   'OPENAI_API_KEY',
   'SUPABASE_SECRET_KEY',
-  ]
+  ])
 
-  const missingEnvVars = requiredEnvVars.filter((name) => !process.env[name])
-  if (missingEnvVars.length > 0) {
-    throw new Error(
-      `Environment variables ${missingEnvVars.join(
-        ', '
-      )} are required: skipping embeddings generation`
-    )
-  }
+  const supabaseClient = initSupabase()
 
-  const supabaseClient = createClient(
-    process.env.NEXT_PUBLIC_SUPABASE_URL!,
-    process.env.SUPABASE_SECRET_KEY!,
-    {
-      auth: {
-        persistSession: false,
-        autoRefreshToken: false,
-      },
-    }
-  )
-
-  // Use this version to track which pages to purge
-  // after the refresh
   const refreshVersion = uuidv4()
-
   const refreshDate = new Date()
 
-  const embeddingSources = await fetchAllSources()
+  const { pageTable, pageSectionTable } = getPageTables()
+  const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY })
 
-  console.log(`Discovered ${embeddingSources.length} pages`)
+  console.log(
+    shouldRefresh
+      ? 'Refresh flag set, re-generating all pages'
+      : 'Checking which pages are new or have changed'
+  )
 
-  if (!shouldRefresh) {
-    console.log('Checking which pages are new or have changed')
-  } else {
-    console.log('Refresh flag set, re-generating all pages')
-  }
+  const { allSectionsToProcess, pageInfoMap } = await prepareSections(
+    supabaseClient,
+    pageTable,
+    pageSectionTable,
+    shouldRefresh,
+    refreshVersion,
+    refreshDate,
+    !nimbus,
+    debug
+  )
+
+  let processingResult: ProcessingResult
+  try {
+    processingResult = await processAndInsertEmbeddings(
+      openai,
+      supabaseClient,
+      pageSectionTable,
+      allSectionsToProcess,
+      pageInfoMap
+    )
+    console.log(
+      `Processing complete: ${processingResult.totalSectionsInserted}/${processingResult.totalSectionsProcessed} sections inserted successfully`
+    )
+    console.log(
+      `Page summary: ${processingResult.successfulPages.size} successful, ${processingResult.failedPages.size} failed`
+    )
+  } catch (error) {
+    console.error('Critical error during embedding processing:', error)
+    console.log('Exiting due to complete processing failure')
+    return
+  }
 
-  for (const embeddingSource of embeddingSources) {
-    const { type, source, path } = embeddingSource
+  console.log(`\nUpdating checksums for ${processingResult.successfulPages.size} successful pages`)
+  const successfulChecksumUpdates = await updateSuccessfulChecksums(
+    supabaseClient,
+    pageTable,
+    pageInfoMap,
+    processingResult
+  )
+  console.log(
+    `Successfully updated checksums for ${successfulChecksumUpdates}/${processingResult.successfulPages.size} successful pages`
+  )
 
-    try {
-      const {
-        checksum,
-        sections,
-        meta = {},
-        ragIgnore = false,
-      }: {
-        checksum: string
-        sections: Section[]
-        ragIgnore?: boolean
-        meta?: Record<string, unknown>
-      } = await embeddingSource.process()
+  logFailedPages(pageInfoMap, processingResult)
 
-      // Check for existing page in DB and compare checksums
-      const { error: fetchPageError, data: existingPage } = await supabaseClient
-        .from('page')
-        .select('id, path, checksum')
-        .filter('path', 'eq', path)
-        .limit(1)
-        .maybeSingle()
-
-      if (fetchPageError) {
-        throw fetchPageError
-      }
-
-      // We use checksum to determine if this page & its sections need to be regenerated
-      if (!shouldRefresh && existingPage?.checksum === checksum) {
-        // No content/embedding update required on this page
-        // Update other meta info
-        const { error: updatePageError } = await supabaseClient
-          .from('page')
-          .update({
-            type,
-            source,
-            meta,
-            version: refreshVersion,
-            last_refresh: refreshDate,
-          })
-          .filter('id',
'eq', existingPage.id) - - if (updatePageError) { - throw updatePageError - } - - continue - } - - if (existingPage) { - if (!shouldRefresh) { - console.log( - `[${path}] Docs have changed, removing old page sections and their embeddings` - ) - } else { - console.log(`[${path}] Refresh flag set, removing old page sections and their embeddings`) - } - - const { error: deletePageSectionError } = await supabaseClient - .from('page_section') - .delete() - .filter('page_id', 'eq', existingPage.id) - - if (deletePageSectionError) { - throw deletePageSectionError - } - } - - // Create/update page record. Intentionally clear checksum until we - // have successfully generated all page sections. - const { error: upsertPageError, data: page } = await supabaseClient - .from('page') - .upsert( - { - checksum: null, - path, - type, - source, - meta, - content: embeddingSource.extractIndexedContent(), - version: refreshVersion, - last_refresh: refreshDate, - }, - { onConflict: 'path' } - ) - .select() - .limit(1) - .single() - - if (upsertPageError) { - throw upsertPageError - } - - console.log(`[${path}] Adding ${sections.length} page sections (with embeddings)`) - for (const { slug, heading, content } of sections) { - // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings) - // force a redeploy - const input = content.replace(/\n/g, ' ') - - try { - const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }) - - const embeddingResponse = await openai.embeddings.create({ - model: 'text-embedding-ada-002', - input, - }) - - const [responseData] = embeddingResponse.data - - const { error: insertPageSectionError } = await supabaseClient - .from('page_section') - .insert({ - page_id: page.id, - slug, - heading, - content, - token_count: embeddingResponse.usage.total_tokens, - embedding: responseData.embedding, - rag_ignore: ragIgnore, - }) - .select() - .limit(1) - .single() - - if (insertPageSectionError) { - throw insertPageSectionError - } - } catch (err) { - // TODO: decide how to better handle failed embeddings - console.error( - `Failed to generate embeddings for '${path}' page section starting with '${input.slice( - 0, - 40 - )}...'` - ) - - throw err - } - } - - // Set page checksum so that we know this page was stored successfully - const { error: updatePageError } = await supabaseClient - .from('page') - .update({ checksum }) - .filter('id', 'eq', page.id) - - if (updatePageError) { - throw updatePageError - } - } catch (err) { - console.error( - `Page '${path}' or one/multiple of its page sections failed to store properly. 
Page has been marked with null checksum to indicate that it needs to be re-generated.`
-      )
-      console.error(err)
-    }
-  }
-
-  console.log(`Removing old pages and their sections`)
-
-  // Delete pages that have been removed (and their sections via cascade)
-  const { error: deletePageError } = await supabaseClient
-    .from('page')
-    .delete()
-    .filter('version', 'neq', refreshVersion)
-
-  if (deletePageError) {
-    throw deletePageError
-  }
+  await purgeOldPages(supabaseClient, pageTable, refreshVersion)
 
   console.log('Embedding generation complete')
 }
 
+async function updateSuccessfulChecksums(
+  supabaseClient: SupabaseClient,
+  pageTable: string,
+  pageInfoMap: Map<number, PageInfo>,
+  processingResult: ProcessingResult
+): Promise<number> {
+  let successfulChecksumUpdates = 0
+  const pageIds = Array.from(processingResult.successfulPages)
+  const batches = createBatches(pageIds, CONFIG.SOURCE_CONCURRENCY)
+
+  for (const batch of batches) {
+    const results = await Promise.all(
+      batch.map(async (pageId) => {
+        const pageInfo = pageInfoMap.get(pageId)
+        if (!pageInfo) {
+          console.error(`Missing page info for pageId ${pageId}`)
+          return 0
+        }
+
+        try {
+          const { error: updatePageError } = await supabaseClient
+            .from(pageTable)
+            .update({ checksum: pageInfo.checksum })
+            .eq('id', pageId)
+          if (updatePageError) {
+            console.error(`Failed to update checksum for page ${pageInfo.path}:`, updatePageError)
+            return 0
+          }
+          return 1
+        } catch (error) {
+          console.error(`Error updating checksum for page ${pageInfo.path}:`, error)
+          return 0
+        }
+      })
+    )
+
+    successfulChecksumUpdates += results.reduce((sum, x) => sum + x, 0)
+  }
+
+  return successfulChecksumUpdates
+}
+
+function logFailedPages(pageInfoMap: Map<number, PageInfo>, processingResult: ProcessingResult) {
+  if (processingResult.failedPages.size === 0) return
+  console.log(`\nFailed pages:`)
+  for (const pageId of processingResult.failedPages) {
+    const pageInfo = pageInfoMap.get(pageId)
+    if (pageInfo) console.log(`  - ${pageInfo.path}`)
+  }
+}
+
+async function purgeOldPages(
+  supabaseClient: SupabaseClient,
+  pageTable: string,
+  refreshVersion: string
+) {
+  console.log(`Removing old pages and their sections`)
+  const { error: deletePageError } = await supabaseClient
+    .from(pageTable)
+    .delete()
+    .filter('version', 'neq', refreshVersion)
+  if (deletePageError) throw deletePageError
+}
+
 async function main() {
   await generateEmbeddings()
 }
diff --git a/apps/docs/scripts/search/sources/index.ts b/apps/docs/scripts/search/sources/index.ts
index 9dc5541b27..88a1de69c2 100644
--- a/apps/docs/scripts/search/sources/index.ts
+++ b/apps/docs/scripts/search/sources/index.ts
@@ -1,3 +1,4 @@
+import { type GuideModel } from '../../../resources/guide/guideModel.js'
 import { GuideModelLoader } from '../../../resources/guide/guideModelLoader.js'
 import {
   GitHubDiscussionLoader,
@@ -28,7 +29,7 @@ export type SearchSource =
 
 export async function fetchGuideSources() {
   const guides = (await GuideModelLoader.allFromFs()).unwrapLeft()
-  return guides.map((guide) => MarkdownLoader.fromGuideModel('guide', guide))
+  return guides.map((guide: GuideModel) => MarkdownLoader.fromGuideModel('guide', guide))
 }
 
 export async function fetchOpenApiReferenceSource() {
@@ -125,27 +126,29 @@ export async function fetchLintWarningsGuideSources() {
 /**
  * Fetches all the sources we want to index for search
  */
-export async function fetchAllSources() {
+export async function fetchAllSources(fullIndex: boolean) {
   const guideSources = fetchGuideSources()
   const lintWarningsGuideSources = fetchLintWarningsGuideSources()
   const openApiReferenceSource = fetchOpenApiReferenceSource()
   const jsLibReferenceSource = fetchJsLibReferenceSource()
-  const dartLibReferenceSource = fetchDartLibReferenceSource()
-  const pythonLibReferenceSource = fetchPythonLibReferenceSource()
-  const cSharpLibReferenceSource = fetchCSharpLibReferenceSource()
-  const swiftLibReferenceSource = fetchSwiftLibReferenceSource()
-  const ktLibReferenceSource = fetchKtLibReferenceSource()
-  const cliReferenceSource = fetchCliLibReferenceSource()
+  const dartLibReferenceSource = fullIndex ? fetchDartLibReferenceSource() : []
+  const pythonLibReferenceSource = fullIndex ? fetchPythonLibReferenceSource() : []
+  const cSharpLibReferenceSource = fullIndex ? fetchCSharpLibReferenceSource() : []
+  const swiftLibReferenceSource = fullIndex ? fetchSwiftLibReferenceSource() : []
+  const ktLibReferenceSource = fullIndex ? fetchKtLibReferenceSource() : []
+  const cliReferenceSource = fullIndex ? fetchCliLibReferenceSource() : []
 
-  const partnerIntegrationSources = fetchPartners()
-    .then((partners) =>
-      partners
-        ? Promise.all(
-            partners.map((partner) => new IntegrationLoader(partner.slug, partner).load())
-          )
-        : []
-    )
-    .then((data) => data.flat())
+  const partnerIntegrationSources = fullIndex
+    ? fetchPartners()
+        .then((partners) =>
+          partners
+            ? Promise.all(
+                partners.map((partner) => new IntegrationLoader(partner.slug, partner).load())
+              )
+            : []
+        )
+        .then((data) => data.flat())
+    : []
 
   const githubDiscussionSources = fetchDiscussions(
     'supabase',
diff --git a/apps/docs/spec/common-cli-sections.json b/apps/docs/spec/common-cli-sections.json
index 1f658681d3..b6cd3b5c09 100644
--- a/apps/docs/spec/common-cli-sections.json
+++ b/apps/docs/spec/common-cli-sections.json
@@ -820,7 +820,7 @@
   {
     "id": "supabase-postgres-config",
     "title": "Manage Postgres configurations",
-    "slug": "supabase-ssl-enforcement",
+    "slug": "supabase-postgres-config",
     "type": "cli-command"
   },
   {
diff --git a/apps/docs/turbo.json b/apps/docs/turbo.json
index 1cf047c8b0..d980120ebd 100644
--- a/apps/docs/turbo.json
+++ b/apps/docs/turbo.json
@@ -46,6 +46,7 @@
         "DOCS_GITHUB_APP_PRIVATE_KEY",
         "DOCS_REVALIDATION_KEYS",
         "DOCS_REVALIDATION_OVERRIDE_KEYS",
+        "ENABLED_FEATURES_OVERRIDE_DISABLE_ALL",
         "GITHUB_ACTIONS",
         "FORCE_ASSET_CDN",
         "LOGFLARE_INGESTION_API_KEY",
diff --git a/packages/ai-commands/src/docs.ts b/packages/ai-commands/src/docs.ts
index 07111da4cb..f77095aaa5 100644
--- a/packages/ai-commands/src/docs.ts
+++ b/packages/ai-commands/src/docs.ts
@@ -13,10 +13,13 @@ interface PageSection {
   rag_ignore?: boolean
 }
 
+type MatchPageSectionsFunction = 'match_page_sections_v2' | 'match_page_sections_v2_nimbus'
+
 export async function clippy(
   openai: OpenAI,
   supabaseClient: SupabaseClient,
-  messages: Message[]
+  messages: Message[],
+  options?: { useAltSearchIndex?: boolean }
 ) {
   // TODO: better sanitization
   const contextMessages = messages.map(({ role, content }) => {
@@ -63,14 +66,19 @@ export async function clippy(
 
   const [{ embedding }] = embeddingResponse.data
 
+  const searchFunction: MatchPageSectionsFunction = options?.useAltSearchIndex
+    ? 'match_page_sections_v2_nimbus'
+    : 'match_page_sections_v2'
+  const joinedTable = options?.useAltSearchIndex ?
'page_nimbus' : 'page' + const { error: matchError, data: pageSections } = (await supabaseClient - .rpc('match_page_sections_v2', { + .rpc(searchFunction, { embedding, match_threshold: 0.78, min_content_length: 50, }) .neq('rag_ignore', true) - .select('content,page!inner(path),rag_ignore') + .select(`content,${joinedTable}!inner(path),rag_ignore`) .limit(10)) as { error: any; data: PageSection[] | null } if (matchError || !pageSections) { diff --git a/packages/common/database-types.ts b/packages/common/database-types.ts index 80ad326b92..1fe0d208dc 100644 --- a/packages/common/database-types.ts +++ b/packages/common/database-types.ts @@ -294,6 +294,48 @@ export type Database = { } Relationships: [] } + page_nimbus: { + Row: { + checksum: string | null + content: string | null + fts_tokens: unknown | null + id: number + last_refresh: string | null + meta: Json | null + path: string + source: string | null + title_tokens: unknown | null + type: string | null + version: string | null + } + Insert: { + checksum?: string | null + content?: string | null + fts_tokens?: unknown | null + id?: never + last_refresh?: string | null + meta?: Json | null + path: string + source?: string | null + title_tokens?: unknown | null + type?: string | null + version?: string | null + } + Update: { + checksum?: string | null + content?: string | null + fts_tokens?: unknown | null + id?: never + last_refresh?: string | null + meta?: Json | null + path?: string + source?: string | null + title_tokens?: unknown | null + type?: string | null + version?: string | null + } + Relationships: [] + } page_section: { Row: { content: string | null @@ -335,6 +377,47 @@ export type Database = { }, ] } + page_section_nimbus: { + Row: { + content: string | null + embedding: string | null + heading: string | null + id: number + page_id: number + rag_ignore: boolean | null + slug: string | null + token_count: number | null + } + Insert: { + content?: string | null + embedding?: string | null + heading?: string | null + id?: never + page_id: number + rag_ignore?: boolean | null + slug?: string | null + token_count?: number | null + } + Update: { + content?: string | null + embedding?: string | null + heading?: string | null + id?: never + page_id?: number + rag_ignore?: boolean | null + slug?: string | null + token_count?: number | null + } + Relationships: [ + { + foreignKeyName: 'page_section_nimbus_page_id_fkey' + columns: ['page_id'] + isOneToOne: false + referencedRelation: 'page_nimbus' + referencedColumns: ['id'] + }, + ] + } tickets: { Row: { company: string | null @@ -526,6 +609,22 @@ export type Database = { slugs: string[] }[] } + docs_search_embeddings_nimbus: { + Args: { + embedding: string + match_threshold: number + } + Returns: { + id: number + path: string + type: string + title: string + subtitle: string + description: string + headings: string[] + slugs: string[] + }[] + } docs_search_fts: { Args: { query: string @@ -539,6 +638,19 @@ export type Database = { description: string }[] } + docs_search_fts_nimbus: { + Args: { + query: string + } + Returns: { + id: number + path: string + type: string + title: string + subtitle: string + description: string + }[] + } get_full_content_url: { Args: { type: string @@ -621,6 +733,23 @@ export type Database = { token_count: number | null }[] } + match_embedding_nimbus: { + Args: { + embedding: string + match_threshold?: number + max_results?: number + } + Returns: { + content: string | null + embedding: string | null + heading: string | null + id: number + page_id: number 
+ rag_ignore: boolean | null + slug: string | null + token_count: number | null + }[] + } match_page_sections_v2: { Args: { embedding: string @@ -638,6 +767,23 @@ export type Database = { token_count: number | null }[] } + match_page_sections_v2_nimbus: { + Args: { + embedding: string + match_threshold: number + min_content_length: number + } + Returns: { + content: string | null + embedding: string | null + heading: string | null + id: number + page_id: number + rag_ignore: boolean | null + slug: string | null + token_count: number | null + }[] + } search_content: { Args: { embedding: string @@ -676,6 +822,44 @@ export type Database = { subsections: Json[] }[] } + search_content_hybrid_nimbus: { + Args: { + query_text: string + query_embedding: string + max_result?: number + full_text_weight?: number + semantic_weight?: number + rrf_k?: number + match_threshold?: number + include_full_content?: boolean + } + Returns: { + id: number + page_title: string + type: string + href: string + content: string + metadata: Json + subsections: Json[] + }[] + } + search_content_nimbus: { + Args: { + embedding: string + include_full_content?: boolean + match_threshold?: number + max_result?: number + } + Returns: { + id: number + page_title: string + type: string + href: string + content: string + metadata: Json + subsections: Json[] + }[] + } update_last_changed_checksum: { Args: { new_parent_page: string diff --git a/packages/common/enabled-features/enabled-features.json b/packages/common/enabled-features/enabled-features.json index 30ca1cb6f7..e39c1ba682 100644 --- a/packages/common/enabled-features/enabled-features.json +++ b/packages/common/enabled-features/enabled-features.json @@ -76,5 +76,7 @@ "sdk:dart": true, "sdk:kotlin": true, "sdk:python": true, - "sdk:swift": true + "sdk:swift": true, + + "search:fullIndex": true } diff --git a/packages/common/enabled-features/enabled-features.schema.json b/packages/common/enabled-features/enabled-features.schema.json index 313975ad61..c34c1c1a22 100644 --- a/packages/common/enabled-features/enabled-features.schema.json +++ b/packages/common/enabled-features/enabled-features.schema.json @@ -255,6 +255,11 @@ "sdk:swift": { "type": "boolean", "description": "Enable the Swift SDK" + }, + + "search:fullIndex": { + "type": "boolean", + "description": "Enable the full search index. When true, uses the full search; when false, uses the alternate search index." } }, "required": [ @@ -314,7 +319,8 @@ "sdk:dart", "sdk:kotlin", "sdk:python", - "sdk:swift" + "sdk:swift", + "search:fullIndex" ], "additionalProperties": false } diff --git a/packages/common/enabled-features/index.ts b/packages/common/enabled-features/index.ts index 8dba6caf46..5439a102d6 100644 --- a/packages/common/enabled-features/index.ts +++ b/packages/common/enabled-features/index.ts @@ -40,6 +40,15 @@ function isFeatureEnabled( features: T, runtimeDisabledFeatures?: Feature[] ) { + // Override is used to produce a filtered version of the docs search index + // using the same sync setup as our normal search index + if (process.env.ENABLED_FEATURES_OVERRIDE_DISABLE_ALL === 'true') { + if (Array.isArray(features)) { + return Object.fromEntries(features.map((feature) => [featureToCamelCase(feature), false])) + } + return false + } + const disabledFeatures = new Set([ ...(runtimeDisabledFeatures ?? 
[]), ...disabledFeaturesStaticArray, diff --git a/packages/common/hooks/useDocsSearch.ts b/packages/common/hooks/useDocsSearch.ts index 99e7e8462f..1b02bb4477 100644 --- a/packages/common/hooks/useDocsSearch.ts +++ b/packages/common/hooks/useDocsSearch.ts @@ -3,6 +3,8 @@ import { compact, debounce, uniqBy } from 'lodash' import { useCallback, useMemo, useReducer, useRef } from 'react' +import { isFeatureEnabled } from '../enabled-features' + const NUMBER_SOURCES = 2 const SUPABASE_URL = process.env.NEXT_PUBLIC_SUPABASE_URL @@ -200,7 +202,10 @@ const useDocsSearch = () => { let sourcesLoaded = 0 - fetch(`${SUPABASE_URL}/rest/v1/rpc/docs_search_fts`, { + const useAlternateSearchIndex = !isFeatureEnabled('search:fullIndex') + + const searchEndpoint = useAlternateSearchIndex ? 'docs_search_fts_nimbus' : 'docs_search_fts' + fetch(`${SUPABASE_URL}/rest/v1/rpc/${searchEndpoint}`, { method: 'POST', headers: { 'content-type': 'application/json', @@ -244,7 +249,7 @@ const useDocsSearch = () => { fetch(`${SUPABASE_URL}${FUNCTIONS_URL}search-embeddings`, { method: 'POST', - body: JSON.stringify({ query }), + body: JSON.stringify({ query, useAlternateSearchIndex }), }) .then((response) => response.json()) .then((results) => { diff --git a/supabase/functions/search-embeddings/index.ts b/supabase/functions/search-embeddings/index.ts index 91a912fe3f..5d1a98adce 100644 --- a/supabase/functions/search-embeddings/index.ts +++ b/supabase/functions/search-embeddings/index.ts @@ -38,7 +38,7 @@ Deno.serve(async (req) => { throw new UserError('Missing request data') } - const { query } = requestData + const { query, useAlternateSearchIndex } = requestData if (!query) { throw new UserError('Missing query in request data') @@ -76,7 +76,11 @@ Deno.serve(async (req) => { } const [{ embedding }] = embeddingResponse.data.data - const { error: matchError, data: pages } = await supabaseClient.rpc('docs_search_embeddings', { + + const searchFunction = useAlternateSearchIndex + ? 
'docs_search_embeddings_nimbus' + : 'docs_search_embeddings' + const { error: matchError, data: pages } = await supabaseClient.rpc(searchFunction, { embedding, match_threshold: 0.78, }) diff --git a/supabase/migrations/20250910155912_create_nimbus_search_tables.sql b/supabase/migrations/20250910155912_create_nimbus_search_tables.sql new file mode 100644 index 0000000000..2f305b7fa2 --- /dev/null +++ b/supabase/migrations/20250910155912_create_nimbus_search_tables.sql @@ -0,0 +1,325 @@ +-- Create nimbus tables for feature-flag-filtered search +-- These tables mirror the structure of page and page_section but contain only content +-- that should be visible when all feature flags are disabled + +create table "public"."page_nimbus" ( + id bigint primary key generated always as identity, + path text not null unique, + checksum text, + meta jsonb, + type text, + source text, + content text, + version uuid, + last_refresh timestamptz, + fts_tokens tsvector generated always as (to_tsvector('english', content)) stored, + title_tokens tsvector generated always as (to_tsvector('english', coalesce(meta ->> 'title', ''))) stored +); + +alter table "public"."page_nimbus" +enable row level security; + +create policy "anon can read page_nimbus" +on public.page_nimbus +for select +to anon +using (true); + +create policy "authenticated can read page_nimbus" +on public.page_nimbus +for select +to authenticated +using (true); + +create table "public"."page_section_nimbus" ( + id bigint primary key generated always as identity, + page_id bigint not null references public.page_nimbus (id) on delete cascade, + content text, + token_count int, + embedding vector(1536), + slug text, + heading text, + rag_ignore boolean default false +); + +alter table "public"."page_section_nimbus" +enable row level security; + +create policy "anon can read page_section_nimbus" +on public.page_section_nimbus +for select +to anon +using (true); + +create policy "authenticated can read page_section_nimbus" +on public.page_section_nimbus +for select +to authenticated +using (true); + +-- Create indexes for nimbus tables (matching the regular tables) +create index fts_search_index_content_nimbus +on page_nimbus +using gin(fts_tokens); + +create index fts_search_index_title_nimbus +on page_nimbus +using gin(title_tokens); + +-- Create search function for nimbus tables (FTS search) +create or replace function docs_search_fts_nimbus(query text) +returns table ( + id bigint, + path text, + type text, + title text, + subtitle text, + description text +) +set search_path = '' +language plpgsql +as $$ +#variable_conflict use_variable +begin + return query + select + page_nimbus.id, + page_nimbus.path, + page_nimbus.type, + page_nimbus.meta ->> 'title' as title, + page_nimbus.meta ->> 'subtitle' as subtitle, + page_nimbus.meta ->> 'description' as description + from public.page_nimbus + where title_tokens @@ websearch_to_tsquery(query) or fts_tokens @@ websearch_to_tsquery(query) + order by greatest( + -- Title is more important than body, so use 10 as the weighting factor + -- Cut off at max rank of 1 + least(10 * ts_rank(title_tokens, websearch_to_tsquery(query)), 1), + ts_rank(fts_tokens, websearch_to_tsquery(query)) + ) desc + limit 10; +end; +$$; + +-- Create embedding matching function for nimbus tables +create or replace function match_embedding_nimbus( + embedding vector(1536), + match_threshold float default 0.78, + max_results int default 30 +) +returns setof public.page_section_nimbus +set search_path = '' +language plpgsql +as $$ 
+#variable_conflict use_variable +begin + return query + select * + from public.page_section_nimbus + where (page_section_nimbus.embedding operator(public.<#>) embedding) <= -match_threshold + order by page_section_nimbus.embedding operator(public.<#>) embedding + limit max_results; +end; +$$; + +-- Create hybrid search function for nimbus tables +create or replace function search_content_hybrid_nimbus( + query_text text, + query_embedding vector(1536), + max_result int default 30, + full_text_weight float default 1, + semantic_weight float default 1, + rrf_k int default 50, + match_threshold float default 0.78, + include_full_content boolean default false +) +returns table ( + id bigint, + page_title text, + type text, + href text, + content text, + metadata json, + subsections json[] +) +language sql +set search_path = '' +as $$ +with full_text as ( + select + id, + row_number() over(order by greatest( + least(10 * ts_rank(title_tokens, websearch_to_tsquery(query_text)), 1), + ts_rank(fts_tokens, websearch_to_tsquery(query_text)) + ) desc) as rank_ix + from public.page_nimbus + where title_tokens @@ websearch_to_tsquery(query_text) or fts_tokens @@ websearch_to_tsquery(query_text) + order by rank_ix + limit least(max_result, 30) * 2 +), +semantic as ( + select + page_id as id, + row_number() over () as rank_ix + from public.match_embedding_nimbus(query_embedding, match_threshold, max_result * 2) +), +rrf as ( + select + coalesce(full_text.id, semantic.id) as id, + coalesce(1.0 / (rrf_k + full_text.rank_ix), 0.0) * full_text_weight + + coalesce(1.0 / (rrf_k + semantic.rank_ix), 0.0) * semantic_weight as rrf_score + from full_text + full outer join semantic on full_text.id = semantic.id +) +select + page_nimbus.id, + page_nimbus.meta ->> 'title' as page_title, + page_nimbus.type, + public.get_full_content_url(page_nimbus.type, page_nimbus.path, null) as href, + case when include_full_content then page_nimbus.content else null end as content, + page_nimbus.meta as metadata, + array_agg(json_build_object( + 'title', page_section_nimbus.heading, + 'href', public.get_full_content_url(page_nimbus.type, page_nimbus.path, page_section_nimbus.slug), + 'content', page_section_nimbus.content + )) as subsections +from rrf +join public.page_nimbus on page_nimbus.id = rrf.id +left join public.page_section_nimbus on page_section_nimbus.page_id = page_nimbus.id +where rrf.rrf_score > 0 +group by page_nimbus.id +order by max(rrf.rrf_score) desc +limit max_result; +$$; + +create or replace function match_page_sections_v2_nimbus( + embedding vector(1536), + match_threshold float, + min_content_length int +) +returns setof page_section_nimbus +set search_path = '' +language plpgsql +as $$ +#variable_conflict use_variable +begin + return query + select * + from public.page_section_nimbus + + -- We only care about sections that have a useful amount of content + where length(page_section_nimbus.content) >= min_content_length + + -- The dot product is negative because of a Postgres limitation, so we negate it + and (page_section_nimbus.embedding operator(public.<#>) embedding) * -1 > match_threshold + + -- OpenAI embeddings are normalized to length 1, so + -- cosine similarity and dot product will produce the same results. + -- Using dot product which can be computed slightly faster. 
+  --
+  -- For the different syntaxes, see https://github.com/pgvector/pgvector
+  order by page_section_nimbus.embedding operator(public.<#>) embedding;
+end;
+$$;
+
+create or replace function docs_search_embeddings_nimbus(
+  embedding vector(1536),
+  match_threshold float
+)
+returns table (
+  id bigint,
+  path text,
+  type text,
+  title text,
+  subtitle text,
+  description text,
+  headings text[],
+  slugs text[]
+)
+set search_path = ''
+language plpgsql
+as $$
+#variable_conflict use_variable
+begin
+  return query
+  with match as(
+    select *
+    from public.page_section_nimbus
+    -- The dot product is negative because of a Postgres limitation, so we negate it
+    where (page_section_nimbus.embedding operator(public.<#>) embedding) * -1 > match_threshold
+    -- OpenAI embeddings are normalized to length 1, so
+    -- cosine similarity and dot product will produce the same results.
+    -- Using dot product which can be computed slightly faster.
+    --
+    -- For the different syntaxes, see https://github.com/pgvector/pgvector
+    order by page_section_nimbus.embedding operator(public.<#>) embedding
+    limit 10
+  )
+  select
+    page_nimbus.id,
+    page_nimbus.path,
+    page_nimbus.type,
+    page_nimbus.meta ->> 'title' as title,
+    page_nimbus.meta ->> 'subtitle' as subtitle,
+    page_nimbus.meta ->> 'description' as description,
+    array_agg(match.heading) as headings,
+    array_agg(match.slug) as slugs
+  from public.page_nimbus
+  join match on match.page_id = page_nimbus.id
+  group by page_nimbus.id;
+end;
+$$;
+
+create or replace function search_content_nimbus(
+  embedding vector(1536),
+  include_full_content boolean default false,
+  match_threshold float default 0.78,
+  max_result int default 30
+)
+returns table (
+  id bigint,
+  page_title text,
+  type text,
+  href text,
+  content text,
+  metadata json,
+  subsections json[]
+)
+set search_path = ''
+language sql
+as $$
+  with matched_section as (
+    select
+      *,
+      row_number() over () as ranking
+    from public.match_embedding_nimbus(
+      embedding,
+      match_threshold,
+      max_result
+    )
+  )
+  select
+    page_nimbus.id,
+    meta ->> 'title' as page_title,
+    type,
+    public.get_full_content_url(type, path, null) as href,
+    case
+      when include_full_content
+      then page_nimbus.content
+      else
+        null
+    end as content,
+    meta as metadata,
+    array_agg(
+      json_build_object(
+        'title', heading,
+        'href', public.get_full_content_url(type, path, slug),
+        'content', matched_section.content
+      )
+    ) as subsections
+  from matched_section
+  join public.page_nimbus on matched_section.page_id = page_nimbus.id
+  group by page_nimbus.id
+  order by min(ranking);
+$$;
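+
+-- Usage sketch (illustrative only; the vector literal below is a placeholder,
+-- not a real 1536-dimension embedding). Callers embed the query text first,
+-- then pass the embedding to the nimbus search function:
+--
+--   select id, page_title, href
+--   from search_content_nimbus(
+--     embedding := '[0.01, -0.02, ...]'::vector(1536),
+--     include_full_content := false,
+--     match_threshold := 0.78,
+--     max_result := 10
+--   );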