From c90dbacab05d2ceab84b53c66c7be77b4dda47ef Mon Sep 17 00:00:00 2001
From: Anton Bulakh <him@necauq.ua>
Date: Tue, 07 Jan 2025 20:33:34 +0000
Subject: [PATCH] chore(build): separate markdown and html handling into two separate stages (#1675)

---
 quartz/plugins/vfile.ts     |    8 +-
 quartz/bootstrap-worker.mjs |    5 +
 quartz/worker.ts            |   37 ++++++++++--
 quartz/processors/parse.ts  |   90 +++++++++++++++++++++--------
 4 files changed, 103 insertions(+), 37 deletions(-)

diff --git a/quartz/bootstrap-worker.mjs b/quartz/bootstrap-worker.mjs
index b08689c..c4c4949 100644
--- a/quartz/bootstrap-worker.mjs
+++ b/quartz/bootstrap-worker.mjs
@@ -1,7 +1,8 @@
 #!/usr/bin/env node
 import workerpool from "workerpool"
 const cacheFile = "./.quartz-cache/transpiled-worker.mjs"
-const { parseFiles } = await import(cacheFile)
+const { parseMarkdown, processHtml } = await import(cacheFile)
 workerpool.worker({
-  parseFiles,
+  parseMarkdown,
+  processHtml,
 })
diff --git a/quartz/plugins/vfile.ts b/quartz/plugins/vfile.ts
index 5be2105..8c5cf6a 100644
--- a/quartz/plugins/vfile.ts
+++ b/quartz/plugins/vfile.ts
@@ -1,11 +1,13 @@
-import { Node, Parent } from "hast"
+import { Root as HtmlRoot } from "hast"
+import { Root as MdRoot } from "mdast"
 import { Data, VFile } from "vfile"
 
 export type QuartzPluginData = Data
-export type ProcessedContent = [Node, VFile]
+export type MarkdownContent = [MdRoot, VFile]
+export type ProcessedContent = [HtmlRoot, VFile]
 
 export function defaultProcessedContent(vfileData: Partial<QuartzPluginData>): ProcessedContent {
-  const root: Parent = { type: "root", children: [] }
+  const root: HtmlRoot = { type: "root", children: [] }
   const vfile = new VFile("")
   vfile.data = vfileData
   return [root, vfile]
diff --git a/quartz/processors/parse.ts b/quartz/processors/parse.ts
index 2bd530c..479313f 100644
--- a/quartz/processors/parse.ts
+++ b/quartz/processors/parse.ts
@@ -4,18 +4,20 @@
 import { Processor, unified } from "unified"
 import { Root as MDRoot } from "remark-parse/lib"
 import { Root as HTMLRoot } from "hast"
-import { ProcessedContent } from "../plugins/vfile"
+import { MarkdownContent, ProcessedContent } from "../plugins/vfile"
 import { PerfTimer } from "../util/perf"
 import { read } from "to-vfile"
-import { FilePath, QUARTZ, slugifyFilePath } from "../util/path"
+import { FilePath, FullSlug, QUARTZ, slugifyFilePath } from "../util/path"
 import path from "path"
 import workerpool, { Promise as WorkerPromise } from "workerpool"
 import { QuartzLogger } from "../util/log"
 import { trace } from "../util/trace"
 import { BuildCtx } from "../util/ctx"
 
-export type QuartzProcessor = Processor<MDRoot, MDRoot, HTMLRoot>
-export function createProcessor(ctx: BuildCtx): QuartzProcessor {
+export type QuartzMdProcessor = Processor<MDRoot, MDRoot, MDRoot>
+export type QuartzHtmlProcessor = Processor<undefined, MDRoot, HTMLRoot>
+
+export function createMdProcessor(ctx: BuildCtx): QuartzMdProcessor {
   const transformers = ctx.cfg.plugins.transformers
 
   return (
@@ -24,14 +26,20 @@
       .use(remarkParse)
       // MD AST -> MD AST transforms
       .use(
-        transformers
-          .filter((p) => p.markdownPlugins)
-          .flatMap((plugin) => plugin.markdownPlugins!(ctx)),
-      )
+        transformers.flatMap((plugin) => plugin.markdownPlugins?.(ctx) ?? []),
+      ) as unknown as QuartzMdProcessor
+    //  ^ cast required: the typing of `use` cannot infer the resulting processor type from a dynamically built plugin list
+  )
+}
+
+export function createHtmlProcessor(ctx: BuildCtx): QuartzHtmlProcessor {
+  const transformers = ctx.cfg.plugins.transformers
+  return (
+    unified()
       // MD AST -> HTML AST
       .use(remarkRehype, { allowDangerousHtml: true })
       // HTML AST -> HTML AST transforms
-      .use(transformers.filter((p) => p.htmlPlugins).flatMap((plugin) => plugin.htmlPlugins!(ctx)))
+      .use(transformers.flatMap((plugin) => plugin.htmlPlugins?.(ctx) ?? []))
   )
 }
 
@@ -75,8 +83,8 @@
 
 export function createFileParser(ctx: BuildCtx, fps: FilePath[]) {
   const { argv, cfg } = ctx
-  return async (processor: QuartzProcessor) => {
-    const res: ProcessedContent[] = []
+  return async (processor: QuartzMdProcessor) => {
+    const res: MarkdownContent[] = []
     for (const fp of fps) {
       try {
         const perf = new PerfTimer()
@@ -100,10 +108,32 @@
         res.push([newAst, file])
 
         if (argv.verbose) {
-          console.log(`[process] ${fp} -> ${file.data.slug} (${perf.timeSince()})`)
+          console.log(`[markdown] ${fp} -> ${file.data.slug} (${perf.timeSince()})`)
         }
       } catch (err) {
-        trace(`\nFailed to process \`${fp}\``, err as Error)
+        trace(`\nFailed to process markdown \`${fp}\``, err as Error)
+      }
+    }
+
+    return res
+  }
+}
+
+export function createMarkdownParser(ctx: BuildCtx, mdContent: MarkdownContent[]) {
+  return async (processor: QuartzHtmlProcessor) => {
+    const res: ProcessedContent[] = []
+    for (const [ast, file] of mdContent) {
+      try {
+        const perf = new PerfTimer()
+
+        const newAst = await processor.run(ast as MDRoot, file)
+        res.push([newAst, file])
+
+        if (ctx.argv.verbose) {
+          console.log(`[html] ${file.data.slug} (${perf.timeSince()})`)
+        }
+      } catch (err) {
+        trace(`\nFailed to process html \`${file.data.filePath}\``, err as Error)
       }
     }
 
@@ -113,6 +143,7 @@
 
 const clamp = (num: number, min: number, max: number) =>
   Math.min(Math.max(Math.round(num), min), max)
+
 export async function parseMarkdown(ctx: BuildCtx, fps: FilePath[]): Promise<ProcessedContent[]> {
   const { argv } = ctx
   const perf = new PerfTimer()
@@ -126,9 +157,8 @@
   log.start(`Parsing input files using ${concurrency} threads`)
   if (concurrency === 1) {
     try {
-      const processor = createProcessor(ctx)
-      const parse = createFileParser(ctx, fps)
-      res = await parse(processor)
+      const mdRes = await createFileParser(ctx, fps)(createMdProcessor(ctx))
+      res = await createMarkdownParser(ctx, mdRes)(createHtmlProcessor(ctx))
     } catch (error) {
       log.end()
       throw error
@@ -140,17 +170,27 @@
       maxWorkers: concurrency,
       workerType: "thread",
     })
-
-    const childPromises: WorkerPromise<ProcessedContent[]>[] = []
-    for (const chunk of chunks(fps, CHUNK_SIZE)) {
-      childPromises.push(pool.exec("parseFiles", [ctx.buildId, argv, chunk, ctx.allSlugs]))
+    const errorHandler = (err: any) => {
+      console.error(`${err}`.replace(/^error:\s*/i, ""))
+      process.exit(1)
     }
 
-    const results: ProcessedContent[][] = await WorkerPromise.all(childPromises).catch((err) => {
-      const errString = err.toString().slice("Error:".length)
-      console.error(errString)
-      process.exit(1)
-    })
+    const mdPromises: WorkerPromise<[MarkdownContent[], FullSlug[]]>[] = []
+    for (const chunk of chunks(fps, CHUNK_SIZE)) {
+      mdPromises.push(pool.exec("parseMarkdown", [ctx.buildId, argv, chunk]))
+    }
+    const mdResults: [MarkdownContent[], FullSlug[]][] =
+      await WorkerPromise.all(mdPromises).catch(errorHandler)
+
+    const childPromises: WorkerPromise<ProcessedContent[]>[] = []
+    for (const [_, extraSlugs] of mdResults) {
+      ctx.allSlugs.push(...extraSlugs)
+    }
+    for (const [mdChunk, _] of mdResults) {
+      childPromises.push(pool.exec("processHtml", [ctx.buildId, argv, mdChunk, ctx.allSlugs]))
+    }
+    const results: ProcessedContent[][] = await WorkerPromise.all(childPromises).catch(errorHandler)
+
     res = results.flat()
     await pool.terminate()
   }
diff --git a/quartz/worker.ts b/quartz/worker.ts
index a209df9..c9cd980 100644
--- a/quartz/worker.ts
+++ b/quartz/worker.ts
@@ -3,23 +3,46 @@
 import cfg from "../quartz.config"
 import { Argv, BuildCtx } from "./util/ctx"
 import { FilePath, FullSlug } from "./util/path"
-import { createFileParser, createProcessor } from "./processors/parse"
+import {
+  createFileParser,
+  createHtmlProcessor,
+  createMarkdownParser,
+  createMdProcessor,
+} from "./processors/parse"
 import { options } from "./util/sourcemap"
+import { MarkdownContent, ProcessedContent } from "./plugins/vfile"
 
 // only called from worker thread
-export async function parseFiles(
+export async function parseMarkdown(
   buildId: string,
   argv: Argv,
   fps: FilePath[],
-  allSlugs: FullSlug[],
-) {
+): Promise<[MarkdownContent[], FullSlug[]]> {
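+  // HACK: the markdown stage starts from an empty `allSlugs`.
+  // We assume markdown plugins may append slugs to it but never read from it;
+  // whatever they append is returned to the caller alongside the parsed content.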
+  const allSlugs: FullSlug[] = []
   const ctx: BuildCtx = {
     buildId,
     cfg,
     argv,
     allSlugs,
   }
-  const processor = createProcessor(ctx)
-  const parse = createFileParser(ctx, fps)
-  return parse(processor)
+  return [await createFileParser(ctx, fps)(createMdProcessor(ctx)), allSlugs]
+}
+
+// only called from worker thread
+export function processHtml(
+  buildId: string,
+  argv: Argv,
+  mds: MarkdownContent[],
+  allSlugs: FullSlug[],
+): Promise<ProcessedContent[]> {
+  const ctx: BuildCtx = {
+    buildId,
+    cfg,
+    argv,
+    allSlugs,
+  }
+  return createMarkdownParser(ctx, mds)(createHtmlProcessor(ctx))
 }

--
Gitblit v1.10.0