feat(content api): add management api references to semantic search (#36289)

* docs: add cursor rule for embedding generation process

Add documentation for the Cursor IDE describing how docs embeddings are generated,
including the workflow for creating and uploading semantic search content.

* feat: improve API reference metadata upload with descriptive content

- Add preembeddings script to run codegen before embedding generation
- Enhance OpenApiReferenceSource to generate more descriptive content including
  parameters, responses, path information, and better structured documentation

* feat: add Management API references to searchDocs GraphQL query

- Add ManagementApiReference GraphQL type and model for API endpoint search results
- Integrate Management API references into global search results
- Update test snapshots and add comprehensive test coverage for Management API search

* style: format
This commit is contained in:
Charis
2025-06-18 09:12:03 -04:00
committed by GitHub
parent 97d80a7166
commit 68cb1a1870
11 changed files with 313 additions and 29 deletions

View File

@@ -0,0 +1,59 @@
# Documentation Embeddings Generation System
## Overview
The documentation embeddings generation system processes various documentation sources and uploads their metadata to a database for semantic search functionality. The system is located in `apps/docs/scripts/search/` and works by:
1. **Discovering content sources** from multiple types of documentation
2. **Processing content** into structured sections with checksums
3. **Generating embeddings** using OpenAI's text-embedding-ada-002 model
4. **Storing results in the database** alongside their vector embeddings for semantic search
## Architecture
### Main Entry Point
- `generate-embeddings.ts` - Main script that orchestrates the entire process
- Supports `--refresh` flag to force regeneration of all content
### Content Sources (`sources/` directory)
#### Base Classes
- `BaseLoader` - Abstract class for loading content from different sources
- `BaseSource` - Abstract class for processing and formatting content
#### Source Types
1. **Markdown Sources** (`markdown.ts`)
- Processes `.mdx` files from guides and documentation
- Extracts frontmatter metadata and content sections
2. **Reference Documentation** (`reference-doc.ts`)
- **OpenAPI References** - Management API documentation from OpenAPI specs
- **Client Library References** - JavaScript, Dart, Python, C#, Swift, Kotlin SDKs
- **CLI References** - Command-line interface documentation
- Processes YAML/JSON specs and matches with common sections
3. **GitHub Discussions** (`github-discussion.ts`)
- Fetches troubleshooting discussions from GitHub using GraphQL API
- Uses GitHub App authentication for access
4. **Partner Integrations** (`partner-integrations.ts`)
- Fetches approved partner integration documentation from Supabase database
- Technology integrations only (excludes agencies)
### Processing Flow
1. **Content Discovery**: Each source loader discovers and loads content files/data
2. **Content Processing**: Each source processes content into:
- Checksum for change detection
- Metadata (title, subtitle, etc.)
- Sections with headings and content
3. **Change Detection**: Compares checksums against existing database records
4. **Embedding Generation**: Uses OpenAI to generate embeddings for new/changed content
5. **Database Storage**: Stores pages and their sections in the `page` and `page_section` tables, together with the generated embeddings
6. **Cleanup**: Removes outdated pages using version tracking
### Database Schema
- **`page`** table: Stores page metadata, content, checksum, version
- **`page_section`** table: Stores individual sections with embeddings, token counts

View File

@@ -84,6 +84,20 @@ type CLICommandReference implements SearchResult {
content: String
}
"""
A reference document containing a description of a Supabase Management API endpoint
"""
type ManagementApiReference implements SearchResult {
"""The title of the document"""
title: String
"""The URL of the document"""
href: String
"""The content of the reference document, as text"""
content: String
}
"""
A reference document containing a description of a function from a Supabase client library
"""

View File

@@ -204,4 +204,40 @@ describe('prod smoke test: graphql: searchDocs', () => {
expect(guideNode).toHaveProperty('href')
expect(guideNode).toHaveProperty('content')
})
it('searchDocs query includes Management API references', async () => {
  // The query selects fields only on ManagementApiReference, so the test
  // identifies such a node by the presence of a truthy `title`.
  const searchDocsQuery = `
query SearchDocsQuery($query: String!) {
searchDocs(query: $query) {
nodes {
...on ManagementApiReference {
title
href
content
}
}
}
}
`
  const requestBody = JSON.stringify({
    query: searchDocsQuery,
    variables: { query: 'create SSO provider' },
  })
  const response = await fetch(GRAPHQL_URL, { method: 'POST', body: requestBody })
  expect(response.status).toBe(200)

  const payload = await response.json()
  expect(payload.errors).toBeUndefined()

  const nodes = payload.data.searchDocs.nodes
  expect(Array.isArray(nodes)).toBe(true)
  expect(nodes.length).toBeGreaterThan(0)

  const managementApiNode = nodes.find((node: any) => Boolean(node.title))
  expect(managementApiNode).toBeDefined()
  // Every field requested in the fragment must be present on the match.
  for (const field of ['title', 'href', 'content']) {
    expect(managementApiNode).toHaveProperty(field)
  }
})
})

View File

@@ -33,6 +33,16 @@ const rpcSpy = vi.fn().mockImplementation((funcName, params) => {
content: params?.include_full_content ? 'Another content' : null,
subsections: [{ title: 'Getting Started', content: 'Getting Started content' }],
},
{
type: 'reference',
page_title: 'Create a SSO provider',
href: 'https://supabase.com/docs/reference/api/v1-create-a-sso-provider',
content: params?.include_full_content ? 'Creates a new SSO provider for a project' : null,
metadata: {
title: 'Create a SSO provider',
subtitle: 'Management API Reference: Create a SSO provider',
},
},
]
return Promise.resolve({ data: mockResults.slice(0, limit), error: null })
}
@@ -190,4 +200,40 @@ describe('/api/graphql searchDocs', () => {
expect(json.errors).toBeDefined()
expect(json.errors[0].message).toContain('required')
})
it('should return Management API references with proper fields', async () => {
  const gqlQuery = `
query {
searchDocs(query: "SSO provider", limit: 3) {
nodes {
... on ManagementApiReference {
title
href
content
}
}
}
}
`
  const response = await POST(
    new Request('http://localhost/api/graphql', {
      method: 'POST',
      body: JSON.stringify({ query: gqlQuery }),
    })
  )
  const { data, errors } = await response.json()

  expect(errors).toBeUndefined()
  expect(data).toBeDefined()
  expect(data.searchDocs).toBeDefined()

  const { nodes } = data.searchDocs
  expect(nodes).toBeInstanceOf(Array)
  expect(nodes).toHaveLength(3)

  // NOTE(review): relies on the Management API entry being the third mocked
  // search result - confirm against the rpc mock's result ordering.
  const [, , managementApiNode] = nodes
  expect(managementApiNode).toMatchObject({
    title: 'Create a SSO provider',
    href: 'https://supabase.com/docs/reference/api/v1-create-a-sso-provider',
    content: 'Creates a new SSO provider for a project',
  })
})
})

View File

@@ -18,7 +18,12 @@ type Database = {
DatabaseGenerated['public']['Functions']['search_content']['Returns'][number],
'subsections' | 'metadata'
> & {
metadata: { language?: string; methodName?: string; platform?: string }
metadata: {
subtitle?: string
language?: string
methodName?: string
platform?: string
}
subsections: Array<{ title?: string; href?: string; content?: string }>
}
>

View File

@@ -25,6 +25,7 @@
"postbuild": "pnpm run build:sitemap && pnpm run build:llms && ./../../scripts/upload-static-assets.sh",
"prebuild": "pnpm run codegen:graphql && pnpm run codegen:references && pnpm run codegen:examples",
"predev": "pnpm run codegen:graphql && pnpm run codegen:references && pnpm run codegen:examples",
"preembeddings": "pnpm run codegen:references",
"preinstall": "npx only-allow pnpm",
"presync": "pnpm run codegen:graphql",
"pretest": "pnpm run codegen:examples",

View File

@@ -8,6 +8,7 @@ import {
DB_METADATA_TAG_PLATFORM_CLI,
ReferenceCLICommandModel,
} from '../reference/referenceCLIModel'
import { ReferenceManagementApiModel } from '../reference/referenceManagementApiModel'
import { ReferenceSDKFunctionModel, SDKLanguageValues } from '../reference/referenceSDKModel'
import { TroubleshootingModel } from '../troubleshooting/troubleshootingModel'
import { SearchResultInterface } from './globalSearchInterface'
@@ -74,6 +75,13 @@ function createModelFromMatch({
content,
subsections,
})
// TODO [Charis 2025-06-09] replace with less hacky check
} else if (metadata.subtitle?.startsWith('Management API Reference')) {
return new ReferenceManagementApiModel({
title: page_title,
href,
content,
})
} else {
return null
}

View File

@@ -0,0 +1,13 @@
import { type SearchResultInterface } from '../globalSearch/globalSearchInterface'
/**
 * Search-result model for a single Management API reference document.
 *
 * Holds the three optional text fields copied from the constructor argument:
 * the document title, its URL, and its content.
 */
export class ReferenceManagementApiModel implements SearchResultInterface {
  public title?: string
  public href?: string
  public content?: string

  /**
   * @param fields - Optional `title`, `href` and `content` values; any field
   *   left out is stored as `undefined`.
   */
  constructor(fields: { title?: string; href?: string; content?: string }) {
    this.title = fields.title
    this.href = fields.href
    this.content = fields.content
  }
}

View File

@@ -0,0 +1,25 @@
import { GraphQLObjectType, GraphQLString } from 'graphql'
import { GraphQLInterfaceTypeSearchResult } from '../globalSearch/globalSearchSchema'
import { ReferenceManagementApiModel } from './referenceManagementApiModel'
/**
 * GraphQL object type `ManagementApiReference`.
 *
 * Implements the shared search-result interface and is selected for any
 * runtime value that is an instance of `ReferenceManagementApiModel`.
 * Exposes three nullable string fields: `title`, `href` and `content`.
 */
export const GraphQLObjectTypeReferenceManagementApi = new GraphQLObjectType({
  name: 'ManagementApiReference',
  description:
    'A reference document containing a description of a Supabase Management API endpoint',
  interfaces: [GraphQLInterfaceTypeSearchResult],
  // Type resolution is done purely by class identity of the resolved value.
  isTypeOf(value: unknown) {
    return value instanceof ReferenceManagementApiModel
  },
  fields: {
    title: { type: GraphQLString, description: 'The title of the document' },
    href: { type: GraphQLString, description: 'The URL of the document' },
    content: {
      type: GraphQLString,
      description: 'The content of the reference document, as text',
    },
  },
})

View File

@@ -10,6 +10,7 @@ import { errorRoot, errorsRoot } from './error/errorResolver'
import { searchRoot } from './globalSearch/globalSearchResolver'
import { GraphQLObjectTypeGuide } from './guide/guideSchema'
import { GraphQLObjectTypeReferenceCLICommand } from './reference/referenceCLISchema'
import { GraphQLObjectTypeReferenceManagementApi } from './reference/referenceManagementApiSchema'
import { GraphQLObjectTypeReferenceSDKFunction } from './reference/referenceSDKSchema'
import { GraphQLObjectTypeTroubleshooting } from './troubleshooting/troubleshootingSchema'
@@ -43,6 +44,7 @@ export const rootGraphQLSchema = new GraphQLSchema({
types: [
GraphQLObjectTypeGuide,
GraphQLObjectTypeReferenceCLICommand,
GraphQLObjectTypeReferenceManagementApi,
GraphQLObjectTypeReferenceSDKFunction,
GraphQLObjectTypeTroubleshooting,
],

View File

@@ -8,6 +8,7 @@ import type {
IFunctionDefinition,
ISpec,
} from '../../../components/reference/Reference.types.js'
import { getApiEndpointById } from '../../../features/docs/Reference.generated.singleton.js'
import type { CliCommand, CliSpec } from '../../../generator/types/CliSpec.js'
import { flattenSections } from '../../../lib/helpers.js'
import { enrichedOperation, gen_v3 } from '../../../lib/refGenerator/helpers.js'
@@ -39,30 +40,35 @@ export abstract class ReferenceLoader<SpecSection> extends BaseLoader {
const specSections = this.getSpecSections(specContents)
const sections = flattenedRefSections
.map((refSection) => {
const specSection = this.matchSpecSection(specSections, refSection.id)
const sections = (
await Promise.all(
flattenedRefSections.map(async (refSection) => {
const specSection = await this.matchSpecSection(specSections, refSection.id)
if (!specSection) {
return
}
if (!specSection) {
return
}
return this.sourceConstructor(
this.source,
`${this.path}/${refSection.slug}`,
refSection,
specSection,
this.enhanceMeta(specSection)
)
})
.filter((item): item is ReferenceSource<SpecSection> => item !== undefined)
return this.sourceConstructor(
this.source,
`${this.path}/${refSection.slug}`,
refSection,
specSection,
this.enhanceMeta(specSection)
)
})
)
).filter((item): item is ReferenceSource<SpecSection> => item !== undefined)
return sections as BaseSource[]
}
abstract getSpecSections(specContents: string): SpecSection[]
abstract matchSpecSection(specSections: SpecSection[], id: string): SpecSection | undefined
enhanceMeta(section: SpecSection): Json {
abstract matchSpecSection(
specSections: SpecSection[],
id: string
): SpecSection | undefined | Promise<SpecSection | undefined>
enhanceMeta(_section: SpecSection): Json {
return this.meta
}
}
@@ -115,7 +121,7 @@ export abstract class ReferenceSource<SpecSection> extends BaseSource {
abstract extractSubtitle(): string
}
export class OpenApiReferenceLoader extends ReferenceLoader<enrichedOperation> {
export class OpenApiReferenceLoader extends ReferenceLoader<Partial<enrichedOperation>> {
constructor(
source: string,
path: string,
@@ -136,39 +142,108 @@ export class OpenApiReferenceLoader extends ReferenceLoader<enrichedOperation> {
return generatedSpec.operations
}
matchSpecSection(operations: enrichedOperation[], id: string): enrichedOperation | undefined {
return operations.find((operation) => operation.operationId === id)
async matchSpecSection(
_operations: enrichedOperation[],
id: string
): Promise<Partial<enrichedOperation> | undefined> {
const apiEndpoint = await getApiEndpointById(id)
if (!apiEndpoint) return undefined
const enrichedOp: Partial<enrichedOperation> = {
operationId: apiEndpoint.id,
operation: apiEndpoint.method,
path: apiEndpoint.path,
summary: apiEndpoint.summary,
description: apiEndpoint.description,
deprecated: apiEndpoint.deprecated,
parameters: apiEndpoint.parameters as any,
requestBody: apiEndpoint.requestBody as any,
responses: apiEndpoint.responses as any,
}
return enrichedOp
}
}
export class OpenApiReferenceSource extends ReferenceSource<enrichedOperation> {
export class OpenApiReferenceSource extends ReferenceSource<Partial<enrichedOperation>> {
formatSection(specOperation: enrichedOperation, _: ICommonItem) {
const { summary, description, operation, path, tags } = specOperation
const { summary, description, operation, path, tags, parameters, responses, operationId } =
specOperation
return JSON.stringify({
summary,
description,
operation,
path,
tags,
parameters,
responses,
operationId,
})
}
extractSubtitle() {
return `${this.meta.title}: ${this.specSection.description}`
return `${this.meta.title}: ${this.specSection.description || this.specSection.operationId || ''}`
}
extractTitle() {
return (
this.specSection.summary ||
(typeof this.meta.title === 'string' ? this.meta.title : this.specSection.operation)
(typeof this.meta.title === 'string' ? this.meta.title : this.specSection.operation) ||
''
)
}
extractIndexedContent(): string {
const { summary, description, operation, tags } = this.specSection
return `# ${this.meta.title ?? ''}\n\n${summary ?? ''}\n\n${description ?? ''}\n\n${operation ?? ''}\n\n${
tags?.join(', ') ?? ''
}`
const { summary, description, operation, tags, path, parameters, responses } = this.specSection
const sections: string[] = []
// Title
sections.push(`# ${this.meta.title ?? ''}`)
// Summary
if (summary) {
sections.push(summary)
}
// Description
if (description) {
sections.push(`Description: ${description}`)
}
// Path and Method
if (path) {
sections.push(`Path: ${operation?.toUpperCase() || 'GET'} ${path}`)
}
// Parameters
if (parameters && parameters.length > 0) {
const paramList = parameters
.map((param: any) => {
const required = param.required ? 'required' : 'optional'
return `- ${param.name} (${param.schema?.type || 'string'}, ${required}): ${param.description || ''}`
})
.join('\n')
sections.push(`Parameters:\n${paramList}`)
}
// Response Types
if (responses) {
const responseList = Object.entries(responses)
.map(([code, response]: [string, any]) => {
const desc = response.description || 'No description'
return `- ${code}: ${desc}`
})
.join('\n')
sections.push(`Responses:\n${responseList}`)
}
// Tags
if (tags && tags.length > 0) {
sections.push(`Tags: ${tags.join(', ')}`)
}
return sections.filter(Boolean).join('\n\n')
}
}