From c281dd0eb1774e5119a6c453cd101094eae91e97 Mon Sep 17 00:00:00 2001
From: Charis <26616127+charislam@users.noreply.github.com>
Date: Thu, 6 Jun 2024 15:20:05 -0400
Subject: [PATCH] ci(docs): update last_changed table action (#27038)

Keeps track of fine-grained (per section) edit times for docs content.
Once daily, a GitHub Action runs that:

- Checks whether content hashes have changed for each section
- Updates the table that tracks content edit times if the hashes have changed

Note: The cron job isn't scheduled yet. I'll run the Action manually a few
times to validate it, then turn it on in another PR.
---
 .github/workflows/docs-last-changed.yml      |  46 ++++++
 apps/docs/package.json                       |   2 +
 apps/docs/scripts/last-changed.ts            | 152 ++++++++++--------
 .../20240605171314_last_changed_update.sql   | 101 ++++++++++++
 4 files changed, 234 insertions(+), 67 deletions(-)
 create mode 100644 .github/workflows/docs-last-changed.yml
 create mode 100644 supabase/migrations/20240605171314_last_changed_update.sql

diff --git a/.github/workflows/docs-last-changed.yml b/.github/workflows/docs-last-changed.yml
new file mode 100644
index 0000000000..188a9c7afd
--- /dev/null
+++ b/.github/workflows/docs-last-changed.yml
@@ -0,0 +1,46 @@
+name: "[Docs] Update last-changed dates"
+
+on:
+  # Temporarily turned off until the action is validated manually
+  #
+  # schedule:
+  #   - cron: '0 0 * * *'
+  workflow_dispatch:
+    inputs:
+      reset:
+        description: 'Reset last-updated dates using Git commit dates'
+        required: false
+        type: boolean
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    env:
+      NEXT_PUBLIC_SUPABASE_URL: ${{ secrets.SEARCH_SUPABASE_URL }}
+      SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SEARCH_SUPABASE_SERVICE_ROLE_KEY }}
+
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: |
+            apps/docs
+
+      - name: Setup node
+        uses: actions/setup-node@v4
+        with:
+          node-version-file: '.nvmrc'
+
+      - name: Download dependencies
+        run: npm ci
+
+      - name: Update last-changed dates
+        working-directory: ./apps/docs
+        if: ${{ !inputs.reset }}
+        run: npm run last-changed
+
+      - name: Reset last-changed dates
+        working-directory: ./apps/docs
+        if: ${{ inputs.reset }}
+        run: npm run last-changed:reset
diff --git a/apps/docs/package.json b/apps/docs/package.json
index 332686ad90..35fd98186c 100644
--- a/apps/docs/package.json
+++ b/apps/docs/package.json
@@ -14,6 +14,8 @@
     "build:sitemap": "node ./internals/generate-sitemap.mjs",
     "embeddings": "tsx scripts/search/generate-embeddings.ts",
     "embeddings:refresh": "npm run embeddings -- --refresh",
+    "last-changed": "tsx scripts/last-changed.ts",
+    "last-changed:reset": "npm run last-changed -- --reset",
     "codemod:frontmatter": "node ./scripts/codemod/mdx-meta.mjs && prettier --write \"content/**/*.mdx\"",
     "postbuild": "node ./internals/generate-sitemap.mjs"
   },
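The two npm scripts above feed a single `--reset` flag into `scripts/last-changed.ts`, which reads it with `parseArgs` from `node:util` (the import is visible in the next diff). The option-parsing code itself is untouched by this patch, so the following is only a sketch of how such a flag is typically wired up, not the file's actual `parseOptions()` implementation:

```ts
import { parseArgs } from 'node:util'

// Sketch only: how a boolean --reset flag is typically read with node:util's
// parseArgs. The real parseOptions() in last-changed.ts is unchanged by this
// patch and is not shown in the diff below.
function parseOptions(): { reset: boolean } {
  const {
    values: { reset },
  } = parseArgs({
    options: {
      reset: { type: 'boolean', default: false },
    },
  })
  return { reset: reset ?? false }
}
```

With this shape, `npm run last-changed:reset` resolves to `tsx scripts/last-changed.ts --reset`, and the script takes the Git-commit-date path instead of the checksum-comparison path.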
diff --git a/apps/docs/scripts/last-changed.ts b/apps/docs/scripts/last-changed.ts
index 1151d52d24..44ca06d8ad 100644
--- a/apps/docs/scripts/last-changed.ts
+++ b/apps/docs/scripts/last-changed.ts
@@ -20,7 +20,7 @@ import { readdirSync } from 'node:fs'
 import { readFile } from 'node:fs/promises'
 import { join } from 'node:path'
 import { parseArgs } from 'node:util'
-import { simpleGit } from 'simple-git'
+import { SimpleGit, simpleGit } from 'simple-git'
 
 import { Section } from './helpers.mdx'
 
@@ -31,6 +31,13 @@ interface Options {
 interface Stats {
   sectionsUpdated: number
   sectionsRemoved: number
+  sectionsErrored: number
+}
+
+interface Ctx {
+  supabase: SupabaseClient
+  git: SimpleGit
+  stats: Stats
 }
 
 type SectionWithChecksum = Omit<Section, 'heading'> &
@@ -50,17 +57,22 @@ async function main() {
   const { reset } = parseOptions()
 
   const supabase = createSupabaseClient()
+  const git = simpleGit()
 
   const stats: Stats = {
     sectionsUpdated: 0,
     sectionsRemoved: 0,
+    sectionsErrored: 0,
   }
 
-  await updateContentDates({ reset, supabase, stats })
+  const ctx: Ctx = { supabase, git, stats }
+
+  await updateContentDates({ reset, ctx })
 
   console.log('Content timestamps successfully updated')
   console.log(`  - ${stats.sectionsUpdated} sections updated`)
   console.log(`  - ${stats.sectionsRemoved} old sections removed`)
+  console.log(`  - ${stats.sectionsErrored} sections errored when updating`)
 }
 
 function checkEnv() {
@@ -100,15 +112,7 @@ function createSupabaseClient()
   )
 }
 
-async function updateContentDates({
-  supabase,
-  reset,
-  stats,
-}: {
-  supabase: SupabaseClient
-  reset: boolean
-  stats: Stats
-}) {
+async function updateContentDates({ reset, ctx }: { reset: boolean; ctx: Ctx }) {
   const CONTENT_DIR = getContentDir()
   const mdxFiles = await walkDir(CONTENT_DIR)
 
@@ -116,12 +120,12 @@ async function updateContentDates({
   const updateTasks: Array<Promise<void>> = []
 
   for (const file of mdxFiles) {
-    const tasks = await updateTimestamps(file, { supabase, reset, timestamp, stats })
+    const tasks = await updateTimestamps(file, { reset, timestamp, ctx })
     updateTasks.push(...tasks)
   }
 
   await Promise.all(updateTasks)
-  await cleanupObsoleteRows(timestamp, supabase, stats)
+  await cleanupObsoleteRows(ctx)
 }
 
 function getContentDir() {
@@ -138,21 +142,16 @@ async function walkDir(fullPath: string) {
 
 async function updateTimestamps(
   filePath: string,
-  {
-    supabase,
-    reset,
-    timestamp,
-    stats,
-  }: { supabase: SupabaseClient; reset: boolean; timestamp: Date; stats: Stats }
+  { reset, timestamp, ctx }: { reset: boolean; timestamp: Date; ctx: Ctx }
 ) {
   try {
     const content = await readFile(filePath, 'utf-8')
     const sections = processMdx(content)
     return sections.map((section) => {
       if (reset) {
-        return updateTimestampsWithLastCommitDate(filePath, section, timestamp, supabase, stats)
+        return updateTimestampsWithLastCommitDate(filePath, section, timestamp, ctx)
       } else {
-        throw Error('not implemented')
+        return updateTimestampsWithChecksumMatch(filePath, section, timestamp, ctx)
       }
     })
   } catch (err) {
@@ -176,7 +175,7 @@ function processMdx(rawContent: string): Array<SectionWithChecksum> {
     if (seenHeadings.has(rawHeading)) {
       const idx = seenHeadings.get(rawHeading) + 1
       seenHeadings.set(rawHeading, idx)
-      heading = `${rawHeading} (${idx})`
+      heading = `${rawHeading} (__UNIQUE_MARKER__${idx})`
     } else {
       seenHeadings.set(rawHeading, 1)
     }
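The `__UNIQUE_MARKER__` change in the hunk above exists so that repeated headings on one page still map to distinct `(parent_page, heading)` rows. A minimal sketch of that de-duplication, with an illustrative standalone function (the real logic lives inside `processMdx`):

```ts
// Sketch of the heading de-duplication the hunk above feeds into: when a page
// repeats a heading, later occurrences get a __UNIQUE_MARKER__ suffix so each
// section keeps its own row key. The function name and shape are illustrative.
function dedupeHeadings(rawHeadings: string[]): string[] {
  const seenHeadings = new Map<string, number>()
  return rawHeadings.map((rawHeading) => {
    if (seenHeadings.has(rawHeading)) {
      const idx = seenHeadings.get(rawHeading)! + 1
      seenHeadings.set(rawHeading, idx)
      return `${rawHeading} (__UNIQUE_MARKER__${idx})`
    }
    seenHeadings.set(rawHeading, 1)
    return rawHeading
  })
}

// dedupeHeadings(['Usage', 'Usage', 'Examples'])
// => ['Usage', 'Usage (__UNIQUE_MARKER__2)', 'Examples']
```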
@@ -201,69 +200,88 @@ async function updateTimestampsWithLastCommitDate(
   filePath: string,
   section: SectionWithChecksum,
   timestamp: Date,
-  supabase: SupabaseClient,
-  stats: Stats
+  ctx: Ctx
 ) {
+  const parentPage = getContentDirParentPage(filePath)
+
   try {
-    const git = simpleGit()
-    const updatedAt = (await git.raw('log', '-1', '--format=%cI', filePath)).trim()
+    const updatedAt = await getGitUpdatedAt(filePath, ctx)
 
-    const contentDir = getContentDir()
-    const parentPage = `/content${filePath.replace(contentDir, '')}`
-
-    const { data, error } = await supabase
+    const { error } = await ctx.supabase
       .from('last_changed')
-      .select('id')
-      .eq('parent_page', parentPage)
-      .eq('heading', section.heading)
-      .maybeSingle()
-    if (error) {
-      throw Error(error.message ?? 'Failed to fetch section from database')
-    }
-
-    if (data && 'id' in data) {
-      const { error } = await supabase
-        .from('last_changed')
-        .update({
+      .upsert(
+        {
+          parent_page: parentPage,
+          heading: section.heading,
           checksum: section.checksum,
           last_updated: updatedAt,
           last_checked: timestamp,
-        })
-        .eq('id', data.id)
-      if (error) {
-        throw Error(error.message ?? 'Failed to update row')
-      }
-      stats.sectionsUpdated++
-    } else {
-      const { error } = await supabase.from('last_changed').insert({
-        parent_page: parentPage,
-        heading: section.heading,
-        checksum: section.checksum,
-        last_updated: updatedAt,
-        last_checked: timestamp,
-      })
-      if (error) {
-        throw Error(error.message ?? 'Failed to insert row')
-      }
-      stats.sectionsUpdated++
+        },
+        {
+          onConflict: 'parent_page,heading',
+        }
+      )
+      .lt('last_checked', timestamp)
+    if (error) {
+      throw Error(error.message ?? 'Failed to upsert')
     }
+    ctx.stats.sectionsUpdated++
   } catch (err) {
     console.error(
-      `Failed to update timestamp with last commit date for section ${filePath}:${section.heading}: ${err}`
+      `Failed to update timestamp with last commit date for section ${parentPage}:${section.heading}:\n${err}`
    )
+    ctx.stats.sectionsErrored++
   }
 }
 
-async function cleanupObsoleteRows(timestamp: Date, supabase: SupabaseClient, stats: Stats) {
+async function updateTimestampsWithChecksumMatch(
+  filePath: string,
+  section: SectionWithChecksum,
+  timestamp: Date,
+  ctx: Ctx
+) {
+  const parentPage = getContentDirParentPage(filePath)
+
   try {
-    const { count, error } = await supabase
-      .from('last_changed')
-      .delete({ count: 'exact' })
-      .neq('last_checked', timestamp.toISOString())
-    stats.sectionsRemoved = count
+    const gitUpdatedAt = await getGitUpdatedAt(filePath, ctx)
+
+    const { data, error } = await ctx.supabase.rpc('update_last_changed_checksum', {
+      new_parent_page: parentPage,
+      new_heading: section.heading,
+      new_checksum: section.checksum,
+      git_update_time: gitUpdatedAt,
+      check_time: timestamp,
+    })
+    if (error) {
+      throw Error(error.message || 'Error running function to update checksum')
+    }
+    if (timestamp.toISOString() === new Date(data ?? null).toISOString()) {
+      ctx.stats.sectionsUpdated++
+    }
+  } catch (err) {
+    console.error(
+      `Failed to update timestamp with checksum for section ${parentPage}:${section.heading}:\n${err}`
+    )
+    ctx.stats.sectionsErrored++
+  }
+}
+
+async function getGitUpdatedAt(filePath: string, { git }: { git: SimpleGit }) {
+  return (await git.raw('log', '-1', '--format=%cI', filePath)).trim()
+}
+
+function getContentDirParentPage(filePath: string) {
+  const contentDir = getContentDir()
+  return `/content${filePath.replace(contentDir, '')}`
+}
+
+async function cleanupObsoleteRows(ctx: Ctx) {
+  try {
+    const { data: count, error } = await ctx.supabase.rpc('cleanup_last_changed_pages')
     if (error) {
       throw Error(error.message ?? 'Failed to delete rows')
     }
+    ctx.stats.sectionsRemoved = count
   } catch (err) {
     console.error(`Error cleanup obsolete rows: ${err}`)
   }
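The `checksum` sent to the database above is computed per section inside `processMdx`/`helpers.mdx`, which this patch does not touch. As an assumption about the general approach (the actual hashing code is not shown here), a stable content digest along these lines is all that the change detection needs:

```ts
import { createHash } from 'node:crypto'

// Assumed shape of the per-section checksum: any stable digest of the section
// content works, because the SQL function in the next file only compares the
// stored value against the newly computed one for equality. The real hashing
// lives in processMdx/helpers.mdx, outside this diff.
function sectionChecksum(sectionContent: string): string {
  return createHash('sha256').update(sectionContent).digest('base64')
}

// sectionChecksum('## Usage\nCall the client like so...')
// => a base64 digest that changes whenever the section content changes
```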
diff --git a/supabase/migrations/20240605171314_last_changed_update.sql b/supabase/migrations/20240605171314_last_changed_update.sql
new file mode 100644
index 0000000000..05213443d5
--- /dev/null
+++ b/supabase/migrations/20240605171314_last_changed_update.sql
@@ -0,0 +1,101 @@
+create or replace function update_last_changed_checksum(
+  new_parent_page text,
+  new_heading text,
+  new_checksum text,
+  git_update_time timestamp with time zone,
+  check_time timestamp with time zone
+)
+returns timestamp with time zone
+language plpgsql
+as $$
+declare
+  existing_id bigint;
+  previous_checksum text;
+  updated_check_time timestamp with time zone;
+begin
+  select id, checksum into existing_id, previous_checksum
+  from public.last_changed
+  where
+    parent_page = new_parent_page
+    and heading = new_heading
+  ;
+
+  if existing_id is not null
+    and previous_checksum is not null
+    and previous_checksum = new_checksum
+
+  then
+    update public.last_changed set
+      last_checked = check_time
+    where
+      last_changed.id = existing_id
+      and last_changed.last_checked < check_time
+    returning last_checked into updated_check_time
+    ;
+
+  else
+    insert into public.last_changed (
+      parent_page,
+      heading,
+      checksum,
+      last_updated,
+      last_checked
+    ) values (
+      new_parent_page,
+      new_heading,
+      new_checksum,
+      git_update_time,
+      check_time
+    )
+    on conflict
+      on constraint last_changed_parent_page_heading_key
+    do update set
+      checksum = new_checksum,
+      last_updated = git_update_time,
+      last_checked = check_time
+    where
+      last_changed.id = existing_id
+      and last_changed.last_checked < check_time
+    returning last_checked into updated_check_time
+    ;
+
+  end if;
+
+  return updated_check_time;
+end;
+$$
+;
+
+revoke all on function public.update_last_changed_checksum
+from public, anon, authenticated;
+
+create or replace function cleanup_last_changed_pages()
+returns integer
+language plpgsql
+as $$
+declare
+  newest_check_time timestamp with time zone;
+  number_deleted integer;
+begin
+  select last_checked into newest_check_time
+  from public.last_changed
+  order by last_checked desc
+  limit 1
+  ;
+
+  with deleted as (
+    delete from public.last_changed
+    where last_checked <> newest_check_time
+    returning id
+  )
+  select count(*)
+  from deleted
+  into number_deleted;
+
+  return number_deleted;
+end;
+$$
+;
+
+revoke all on function public.cleanup_last_changed_pages
+from public, anon, authenticated;
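Taken together, the columns touched by this patch imply a `last_changed` row shape roughly like the one below; the table itself is created in an earlier migration that is not part of this diff, so treat both the interface and the read helper as illustrative assumptions rather than the docs app's real API.

```ts
import type { SupabaseClient } from '@supabase/supabase-js'

// Row shape implied by the columns referenced in this patch; not the actual
// table definition, which lives in an earlier migration.
interface LastChangedRow {
  id: number
  parent_page: string
  heading: string
  checksum: string
  last_updated: string // timestamptz as an ISO 8601 string
  last_checked: string // timestamptz as an ISO 8601 string
}

// Hypothetical consumer: fetch the newest edit time recorded for a docs page.
async function getLastUpdated(
  supabase: SupabaseClient,
  parentPage: string
): Promise<string | null> {
  const { data, error } = await supabase
    .from('last_changed')
    .select('last_updated')
    .eq('parent_page', parentPage)
    .order('last_updated', { ascending: false })
    .limit(1)
    .maybeSingle()
  if (error) throw error
  return (data as Pick<LastChangedRow, 'last_updated'> | null)?.last_updated ?? null
}
```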