Files
supabase/apps/docs/scripts/search/embeddings/utils.ts
Charis 8cd5e10038 feat: alternate search index for nimbus (#38662)
* feat: alternate search index for nimbus

Create an alternate search index for Nimbus that filters out
feature-flagged pages (equivalent to setting all feature flags to
false).

Notes:
- Creates two new DB tables, `page_nimbus` and `page_section_nimbus`,
  which are filtered versions of `page` and `page_section`
- Makes `nimbus` versions of all the DB search functions
- Refactored the embedding upload script. Changes to make it faster (got
  annoyed by how slow it was when testing...), incorporate retries, and
  produce better summary logs.
- Upload script, when run with the environment variable
  ENABLED_FEATURES_OVERRIDE_DISABLE_ALL, produces and uploads the
  alternate search index
- Changed all the search calls in frontend/API to check for
  `isFeatureEnabled('search:fullIndex')` to determine whether to search
  the full or alternate index

* ci: produce nimbus search indexes on merge

* fix: turn full search index on
2025-09-16 12:37:53 -04:00

107 lines
2.8 KiB
TypeScript

/**
 * Per-page bookkeeping record used when deciding whether a page was fully
 * processed (see `computePageResults`).
 */
export interface PageInfo {
  /** Numeric identifier of the page. */
  pageId: number
  /** Page path; printed in the warning emitted for incomplete pages. */
  path: string
  /** Checksum string for the page (presumably of its content — confirm with the producer). */
  checksum: string
  /** Number of sections that must be inserted for the page to count as successful. */
  sectionsCount: number
}
/**
 * A page section that is waiting to have an embedding attached.
 */
export interface PageSectionForEmbedding {
  /** Identifier of the page this section belongs to; insertion counts are tallied per `pageId`. */
  pageId: number
  /** Path of the owning page; shown as `path#slug` in failure logs. */
  path: string
  /** Optional section slug; shown after the `#` in failure logs. */
  slug?: string
  /** Optional heading for the section. */
  heading?: string
  /** Section content. */
  content: string
  /** Presumably the text submitted to the embedding model — TODO confirm against the caller. */
  input: string
  /** Flag carried along with the section (presumably excludes it from RAG use — confirm). */
  ragIgnore: boolean
}
/**
 * A page section together with the embedding vector produced for it.
 */
export interface PageSectionWithEmbedding extends PageSectionForEmbedding {
  /** Embedding vector associated with this section. */
  embedding: number[]
}
/**
 * Aggregate outcome of a processing run.
 */
export interface ProcessingResult {
  /** IDs of pages for which every expected section was inserted. */
  successfulPages: Set<number>
  /** IDs of pages that came up short, or that were already marked as failed. */
  failedPages: Set<number>
  /** Counter of sections processed (presumably maintained by the caller — confirm). */
  totalSectionsProcessed: number
  /** Counter of sections inserted (presumably maintained by the caller — confirm). */
  totalSectionsInserted: number
}
/**
 * Splits `array` into consecutive slices of at most `batchSize` elements,
 * preserving order. The last batch may be shorter than the others.
 *
 * @param array - Items to split; the array itself is not modified.
 * @param batchSize - Maximum number of items per batch. Must be greater than zero.
 * @returns The batches in order. An empty input always yields an empty list.
 * @throws RangeError when `array` is non-empty and `batchSize` is zero, negative or NaN.
 */
export function createBatches<T>(array: T[], batchSize: number): T[][] {
  if (array.length === 0) {
    return []
  }

  // A zero or negative step never moves the cursor past the end of the array
  // (the loop would push batches forever), and NaN yields one empty batch that
  // silently drops every item. Fail loudly instead.
  if (!(batchSize > 0)) {
    throw new RangeError(`batchSize must be a positive number, received ${batchSize}`)
  }

  const batches: T[][] = []
  for (let start = 0; start < array.length; start += batchSize) {
    batches.push(array.slice(start, start + batchSize))
  }
  return batches
}
/**
 * Pairs each section in `batch` with the embedding at the same position in
 * `data`.
 *
 * If the two arrays differ in length the positions cannot be trusted, so the
 * whole batch is reported as failed and no embeddings are used.
 *
 * @param batch - Sections that were submitted for embedding.
 * @param data - Embedding results, expected to line up with `batch` by index.
 * @param batchNumber - Only used to identify the batch in the error log.
 * @returns `sectionsWithEmbeddings` holds the sections that received an
 *   embedding; `failedSectionIndexes` holds the indexes into `batch` of the
 *   sections that did not.
 */
export function mapEmbeddingsToSections(
  batch: PageSectionForEmbedding[],
  data: Array<{ embedding?: number[] }>,
  batchNumber: number
): {
  sectionsWithEmbeddings: PageSectionWithEmbedding[]
  failedSectionIndexes: Set<number>
} {
  const sectionsWithEmbeddings: PageSectionWithEmbedding[] = []
  const failedSectionIndexes: Set<number> = new Set()

  if (batch.length !== data.length) {
    console.error(
      `Ignoring all embeddings returned from batch ${batchNumber} because returned number doesn't match input number`
    )
    batch.forEach((_, index) => {
      failedSectionIndexes.add(index)
    })
    // Stop here: continuing would either read past the end of `data` or accept
    // embeddings that were just reported as ignored.
    return { sectionsWithEmbeddings, failedSectionIndexes }
  }

  for (let i = 0; i < batch.length; i++) {
    // Optional chaining treats a null/undefined entry as a failed section
    // rather than throwing.
    const embedding = data[i]?.embedding
    if (embedding) {
      sectionsWithEmbeddings.push({ ...batch[i], embedding })
    } else {
      failedSectionIndexes.add(i)
    }
  }

  return { sectionsWithEmbeddings, failedSectionIndexes }
}
/**
 * Adds one to the per-page counter in `pageSectionsInserted` for every section
 * in `sectionsWithEmbeddings`. Pages not yet in the map start from zero.
 *
 * @param pageSectionsInserted - Page ID → inserted-section count; updated in place.
 * @param sectionsWithEmbeddings - Sections to count, keyed by their `pageId`.
 */
export function updatePageInsertionCounts(
  pageSectionsInserted: Map<number, number>,
  sectionsWithEmbeddings: PageSectionWithEmbedding[]
) {
  for (const { pageId } of sectionsWithEmbeddings) {
    const previousCount = pageSectionsInserted.get(pageId) || 0
    pageSectionsInserted.set(pageId, previousCount + 1)
  }
}
/**
 * Classifies every page in `pageInfoMap` as successful or failed and records
 * the verdict on `result`.
 *
 * A page is successful only when its inserted-section count equals its
 * `sectionsCount` and it is not already listed in `result.failedPages`.
 * Every other page is added to `result.failedPages` and a warning with its
 * inserted/expected counts is logged.
 *
 * @param pageInfoMap - Page ID → page info for the pages to classify.
 * @param pageSectionsInserted - Page ID → number of sections inserted; a missing entry counts as 0.
 * @param result - Result object whose page sets are updated in place.
 */
export function computePageResults(
  pageInfoMap: Map<number, PageInfo>,
  pageSectionsInserted: Map<number, number>,
  result: ProcessingResult
) {
  pageInfoMap.forEach((info, id) => {
    const inserted = pageSectionsInserted.get(id) || 0
    const allSectionsInserted = inserted === info.sectionsCount

    if (allSectionsInserted && !result.failedPages.has(id)) {
      result.successfulPages.add(id)
      return
    }

    result.failedPages.add(id)
    console.warn(`Page ${info.path}: inserted ${inserted}/${info.sectionsCount} sections`)
  })
}
/**
 * Writes one `console.error` line per failed section, identifying it as
 * `path#slug` and quoting the first 50 characters of its input.
 *
 * @param batch - Sections of the batch; `failedSectionIndexes` index into it.
 * @param inputs - Input strings aligned with `batch` by index; a missing entry prints as `undefined`.
 * @param failedSectionIndexes - Indexes of the sections to report.
 */
export function logFailedSections(
  batch: PageSectionForEmbedding[],
  inputs: string[],
  failedSectionIndexes: Set<number>
) {
  for (const index of failedSectionIndexes) {
    const section = batch[index]
    const location = `${section.path}#${section.slug}`
    const preview = inputs[index]?.slice(0, 50)
    console.error(`Failed to process section: ${location} (content: "${preview}...")`)
  }
}