ci(docs): update last_changed table action (#27038)

Keeps track of fine-grained (per-section) edit times for docs content.

Once daily, a GitHub Action runs that:

- Checks whether content hashes have changed for each section (see the sketch below)
- Updates the table that tracks content edit times if the hashes have changed

Note: The cron job isn't scheduled yet. I'll run the Action manually a few times to validate it, then turn it on in another PR.
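
The change detection boils down to hashing each parsed section and comparing against the stored checksum. A minimal sketch of that idea, assuming a SHA-256 digest over the raw section text (the real parsing and checksumming live in apps/docs/scripts/last-changed.ts and its helpers, which aren't shown in full in this diff):

import { createHash } from 'node:crypto'

// Hypothetical stand-in for the per-section checksum; the actual
// implementation may normalize the MDX before hashing.
function sectionChecksum(rawSection: string): string {
  return createHash('sha256').update(rawSection).digest('hex')
}

// A section needs a new last_updated only when its hash differs
// from the checksum stored in the last_changed table.
function hasChanged(storedChecksum: string | null, rawSection: string): boolean {
  return storedChecksum !== sectionChecksum(rawSection)
}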
Author: Charis
Date: 2024-06-06 15:20:05 -04:00 (committed by GitHub)
Parent: a210d3a15e
Commit: c281dd0eb1

4 changed files with 234 additions and 67 deletions

.github/workflows/docs-last-changed.yml (new file)

@@ -0,0 +1,46 @@
name: "[Docs] Update last-changed dates"
on:
# Temporarily turned off until the action is validated manually
#
# schedule:
# - cron: '0 0 * * *'
workflow_dispatch:
inputs:
reset:
description: 'Reset last-updated dates using Git commit dates'
required: false
type: boolean
jobs:
deploy:
runs-on: ubuntu-latest
env:
NEXT_PUBLIC_SUPABASE_URL: ${{ secrets.SEARCH_SUPABASE_URL }}
SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SEARCH_SUPABASE_SERVICE_ROLE_KEY }}
steps:
- name: Check out repo
uses: actions/checkout@v4
with:
sparse-checkout: |
apps/docs
- name: Setup node
uses: actions/setup-node@v4
with:
node-version-file: '.nvmrc'
- name: Download dependencies
run: npm ci
- name: Update last-changed dates
working-directory: ./apps/docs
if: ${{ !inputs.reset }}
run: npm run last-changed
- name: Reset last-changed dates
working-directory: ./apps/docs
if: ${{ inputs.reset }}
run: npm run last-changed:reset
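
Until the cron schedule is enabled, the workflow can only be triggered by hand via workflow_dispatch, for example with the GitHub CLI: `gh workflow run docs-last-changed.yml` (add `-f reset=true` to rebuild all dates from Git commit history instead of checksums).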

apps/docs/package.json

@@ -14,6 +14,8 @@
"build:sitemap": "node ./internals/generate-sitemap.mjs",
"embeddings": "tsx scripts/search/generate-embeddings.ts",
"embeddings:refresh": "npm run embeddings -- --refresh",
"last-changed": "tsx scripts/last-changed.ts",
"last-changed:reset": "npm run last-changed -- --reset",
"codemod:frontmatter": "node ./scripts/codemod/mdx-meta.mjs && prettier --write \"content/**/*.mdx\"",
"postbuild": "node ./internals/generate-sitemap.mjs"
},
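
These are the entry points the workflow calls; from apps/docs, `npm run last-changed` performs the checksum-based update and `npm run last-changed:reset` (equivalent to `npm run last-changed -- --reset`) rebuilds every date from Git commit history.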

apps/docs/scripts/last-changed.ts

@@ -20,7 +20,7 @@ import { readdirSync } from 'node:fs'
import { readFile } from 'node:fs/promises'
import { join } from 'node:path'
import { parseArgs } from 'node:util'
import { simpleGit } from 'simple-git'
import { SimpleGit, simpleGit } from 'simple-git'
import { Section } from './helpers.mdx'
@@ -31,6 +31,13 @@ interface Options {
interface Stats {
sectionsUpdated: number
sectionsRemoved: number
sectionsErrored: number
}
interface Ctx {
supabase: SupabaseClient
git: SimpleGit
stats: Stats
}
type SectionWithChecksum = Omit<Section, 'heading'> &
@@ -50,17 +57,22 @@ async function main() {
const { reset } = parseOptions()
const supabase = createSupabaseClient()
const git = simpleGit()
const stats: Stats = {
sectionsUpdated: 0,
sectionsRemoved: 0,
sectionsErrored: 0,
}
await updateContentDates({ reset, supabase, stats })
const ctx: Ctx = { supabase, git, stats }
await updateContentDates({ reset, ctx })
console.log('Content timestamps successfully updated')
console.log(` - ${stats.sectionsUpdated} sections updated`)
console.log(` - ${stats.sectionsRemoved} old sections removed`)
console.log(` - ${stats.sectionsErrored} sections errored when updating`)
}
function checkEnv() {
@@ -100,15 +112,7 @@ function createSupabaseClient() {
)
}
async function updateContentDates({
supabase,
reset,
stats,
}: {
supabase: SupabaseClient
reset: boolean
stats: Stats
}) {
async function updateContentDates({ reset, ctx }: { reset: boolean; ctx: Ctx }) {
const CONTENT_DIR = getContentDir()
const mdxFiles = await walkDir(CONTENT_DIR)
@@ -116,12 +120,12 @@ async function updateContentDates({
const updateTasks: Array<Promise<void>> = []
for (const file of mdxFiles) {
const tasks = await updateTimestamps(file, { supabase, reset, timestamp, stats })
const tasks = await updateTimestamps(file, { reset, timestamp, ctx })
updateTasks.push(...tasks)
}
await Promise.all(updateTasks)
await cleanupObsoleteRows(timestamp, supabase, stats)
await cleanupObsoleteRows(ctx)
}
function getContentDir() {
@@ -138,21 +142,16 @@ async function walkDir(fullPath: string) {
async function updateTimestamps(
filePath: string,
{
supabase,
reset,
timestamp,
stats,
}: { supabase: SupabaseClient; reset: boolean; timestamp: Date; stats: Stats }
{ reset, timestamp, ctx }: { reset: boolean; timestamp: Date; ctx: Ctx }
) {
try {
const content = await readFile(filePath, 'utf-8')
const sections = processMdx(content)
return sections.map((section) => {
if (reset) {
return updateTimestampsWithLastCommitDate(filePath, section, timestamp, supabase, stats)
return updateTimestampsWithLastCommitDate(filePath, section, timestamp, ctx)
} else {
throw Error('not implemented')
return updateTimestampsWithChecksumMatch(filePath, section, timestamp, ctx)
}
})
} catch (err) {
@@ -176,7 +175,7 @@ function processMdx(rawContent: string): Array<SectionWithChecksum> {
if (seenHeadings.has(rawHeading)) {
const idx = seenHeadings.get(rawHeading) + 1
seenHeadings.set(rawHeading, idx)
heading = `${rawHeading} (${idx})`
heading = `${rawHeading} (__UNIQUE_MARKER__${idx})`
} else {
seenHeadings.set(rawHeading, 1)
}
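// The `__UNIQUE_MARKER__` suffix disambiguates repeated headings within one
// file (e.g. a second 'Usage' heading becomes 'Usage (__UNIQUE_MARKER__2)'),
// so the (parent_page, heading) pair stays unique in the last_changed table.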
@@ -201,69 +200,88 @@ async function updateTimestampsWithLastCommitDate(
filePath: string,
section: SectionWithChecksum,
timestamp: Date,
supabase: SupabaseClient,
stats: Stats
ctx: Ctx
) {
const parentPage = getContentDirParentPage(filePath)
try {
const git = simpleGit()
const updatedAt = (await git.raw('log', '-1', '--format=%cI', filePath)).trim()
const updatedAt = await getGitUpdatedAt(filePath, ctx)
const contentDir = getContentDir()
const parentPage = `/content${filePath.replace(contentDir, '')}`
const { data, error } = await supabase
const { error } = await ctx.supabase
.from('last_changed')
.select('id')
.eq('parent_page', parentPage)
.eq('heading', section.heading)
.maybeSingle()
if (error) {
throw Error(error.message ?? 'Failed to fetch section from database')
}
if (data && 'id' in data) {
const { error } = await supabase
.from('last_changed')
.update({
.upsert(
{
parent_page: parentPage,
heading: section.heading,
checksum: section.checksum,
last_updated: updatedAt,
last_checked: timestamp,
})
.eq('id', data.id)
if (error) {
throw Error(error.message ?? 'Failed to update row')
}
stats.sectionsUpdated++
} else {
const { error } = await supabase.from('last_changed').insert({
parent_page: parentPage,
heading: section.heading,
checksum: section.checksum,
last_updated: updatedAt,
last_checked: timestamp,
})
if (error) {
throw Error(error.message ?? 'Failed to insert row')
}
stats.sectionsUpdated++
},
{
onConflict: 'parent_page,heading',
}
)
.lt('last_checked', timestamp)
if (error) {
throw Error(error.message ?? 'Failed to upsert')
}
ctx.stats.sectionsUpdated++
} catch (err) {
console.error(
`Failed to update timestamp with last commit date for section ${filePath}:${section.heading}: ${err}`
`Failed to update timestamp with last commit date for section ${parentPage}:${section.heading}:\n${err}`
)
ctx.stats.sectionsErrored++
}
}
async function cleanupObsoleteRows(timestamp: Date, supabase: SupabaseClient, stats: Stats) {
async function updateTimestampsWithChecksumMatch(
filePath: string,
section: SectionWithChecksum,
timestamp: Date,
ctx: Ctx
) {
const parentPage = getContentDirParentPage(filePath)
try {
const { count, error } = await supabase
.from('last_changed')
.delete({ count: 'exact' })
.neq('last_checked', timestamp.toISOString())
stats.sectionsRemoved = count
const gitUpdatedAt = await getGitUpdatedAt(filePath, ctx)
const { data, error } = await ctx.supabase.rpc('update_last_changed_checksum', {
new_parent_page: parentPage,
new_heading: section.heading,
new_checksum: section.checksum,
git_update_time: gitUpdatedAt,
check_time: timestamp,
})
if (error) {
throw Error(error.message || 'Error running function to update checksum')
}
if (timestamp.toISOString() === new Date(data ?? null).toISOString()) {
ctx.stats.sectionsUpdated++
}
} catch (err) {
console.error(
`Failed to update timestamp with checksum for section ${parentPage}:${section.heading}:\n${err}`
)
ctx.stats.sectionsErrored++
}
}
async function getGitUpdatedAt(filePath: string, { git }: { git: SimpleGit }) {
return (await git.raw('log', '-1', '--format=%cI', filePath)).trim()
}
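// Note: `git log -1 --format=%cI <path>` prints the committer date of the most
// recent commit touching the file, in strict ISO 8601 form (e.g.
// 2024-06-06T15:20:05-04:00), so the trimmed string can be stored directly in
// a timestamptz column.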
function getContentDirParentPage(filePath: string) {
const contentDir = getContentDir()
return `/content${filePath.replace(contentDir, '')}`
}
async function cleanupObsoleteRows(ctx: Ctx) {
try {
const { data: count, error } = await ctx.supabase.rpc('cleanup_last_changed_pages')
if (error) {
throw Error(error.message ?? 'Failed to delete rows')
}
ctx.stats.sectionsRemoved = count
} catch (err) {
console.error(`Error cleaning up obsolete rows: ${err}`)
}

New SQL migration (path not shown)

@@ -0,0 +1,101 @@
create or replace function update_last_changed_checksum(
new_parent_page text,
new_heading text,
new_checksum text,
git_update_time timestamp with time zone,
check_time timestamp with time zone
)
returns timestamp with time zone
language plpgsql
as $$
declare
existing_id bigint;
previous_checksum text;
updated_check_time timestamp with time zone;
begin
select id, checksum into existing_id, previous_checksum
from public.last_changed
where
parent_page = new_parent_page
and heading = new_heading
;
if existing_id is not null
and previous_checksum is not null
and previous_checksum = new_checksum
then
update public.last_changed set
last_checked = check_time
where
last_changed.id = existing_id
and last_changed.last_checked < check_time
returning last_checked into updated_check_time
;
else
insert into public.last_changed (
parent_page,
heading,
checksum,
last_updated,
last_checked
) values (
new_parent_page,
new_heading,
new_checksum,
git_update_time,
check_time
)
on conflict
on constraint last_changed_parent_page_heading_key
do update set
checksum = new_checksum,
last_updated = git_update_time,
last_checked = check_time
where
last_changed.id = existing_id
and last_changed.last_checked < check_time
returning last_checked into updated_check_time
;
end if;
return updated_check_time;
end;
$$
;
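-- update_last_changed_checksum returns the row's new last_checked only when a
-- row was actually inserted or updated (null otherwise, since both guarded
-- writes require last_checked < check_time); the TypeScript caller compares
-- the returned value against its own check_time to decide whether to count
-- the section as updated.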
revoke all on function public.update_last_changed_checksum
from public, anon, authenticated;
create or replace function cleanup_last_changed_pages()
returns integer
language plpgsql
as $$
declare
newest_check_time timestamp with time zone;
number_deleted integer;
begin
select last_checked into newest_check_time
from public.last_changed
order by last_checked desc
limit 1
;
with deleted as (
delete from public.last_changed
where last_checked <> newest_check_time
returning id
)
select count(*)
from deleted
into number_deleted;
return number_deleted;
end;
$$
;
revoke all on function public.cleanup_last_changed_pages
from public, anon, authenticated;
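-- Revoking execute from public, anon, and authenticated means these functions
-- are not callable by ordinary API clients; the Action authenticates with the
-- service role key (SUPABASE_SERVICE_ROLE_KEY) instead.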