perf(core): optimize broken links checker (#9778)

2025-07-19 01:28:38 +02:00 · 2024-01-24 12:14:26 +01:00 · 2024-01-24 12:14:26 +01:00 · c827b6de2d
commit c827b6de2d
parent 8597e5dcdb
2 changed files with 174 additions and 66 deletions
--- a/packages/docusaurus/src/server/tests/brokenLinks.test.ts
+++ b/packages/docusaurus/src/server/tests/brokenLinks.test.ts
@ -6,6 +6,7 @@
 */

 import {jest} from '@jest/globals';
+import reactRouterConfig from 'react-router-config';
 import {handleBrokenLinks} from '../brokenLinks';
 import type {RouteConfig} from '@docusaurus/types';

@ -718,4 +719,60 @@ describe('handleBrokenLinks', () => {
      "
    `);
  });
+
+  it('is performant and minimize calls to matchRoutes', async () => {
+    const matchRoutesMock = jest.spyOn(reactRouterConfig, 'matchRoutes');
+
+    const scale = 100;
+
+    const routes: SimpleRoute[] = [
+      ...Array.from<SimpleRoute>({length: scale}).map((_, i) => ({
+        path: `/page${i}`,
+      })),
+      ...Array.from<SimpleRoute>({length: scale}).fill({
+        path: '/pageDynamic/:subpath1',
+      }),
+    ];
+
+    const collectedLinks: Params['collectedLinks'] = Object.fromEntries(
+      Array.from<SimpleRoute>({length: scale}).map((_, i) => [
+        `/page${i}`,
+        {
+          links: [
+            ...Array.from<SimpleRoute>({length: scale}).flatMap((_2, j) => [
+              `/page${j}`,
+              `/page${j}?age=42`,
+              `/page${j}#anchor${j}`,
+              `/page${j}?age=42#anchor${j}`,
+              `/pageDynamic/subPath${j}`,
+              `/pageDynamic/subPath${j}?age=42`,
+              // `/pageDynamic/subPath${j}#anchor${j}`,
+              // `/pageDynamic/subPath${j}?age=42#anchor${j}`,
+            ]),
+          ],
+          anchors: Array.from<SimpleRoute>({length: scale}).map(
+            (_2, j) => `anchor${j}`,
+          ),
+        },
+      ]),
+    );
+
+    // console.time('testBrokenLinks');
+    await testBrokenLinks({
+      routes,
+      collectedLinks,
+    });
+    // console.timeEnd('testBrokenLinks');
+
+    // Idiomatic code calling matchRoutes multiple times is not performant
+    // We try to minimize the calls to this expensive function
+    // Otherwise large sites will have super long execution times
+    // See https://github.com/facebook/docusaurus/issues/9754
+    // See https://twitter.com/sebastienlorber/status/1749392773415858587
+    // We expect no more matchRoutes calls than number of dynamic route links
+    expect(matchRoutesMock).toHaveBeenCalledTimes(scale);
+    // We expect matchRoutes to be called with a reduced number of routes
+    expect(routes).toHaveLength(scale * 2);
+    expect(matchRoutesMock.mock.calls[0]![0]).toHaveLength(scale);
+  });
 });
--- a/packages/docusaurus/src/server/brokenLinks.ts
+++ b/packages/docusaurus/src/server/brokenLinks.ts
@ -7,11 +7,18 @@

 import _ from 'lodash';
 import logger from '@docusaurus/logger';
-import {matchRoutes} from 'react-router-config';
+import {matchRoutes as reactRouterMatchRoutes} from 'react-router-config';
 import {parseURLPath, serializeURLPath, type URLPath} from '@docusaurus/utils';
 import {getAllFinalRoutes} from './utils';
 import type {RouteConfig, ReportingSeverity} from '@docusaurus/types';

+function matchRoutes(routeConfig: RouteConfig[], pathname: string) {
+  // @ts-expect-error: React router types RouteConfig with an actual React
+  // component, but we load route components with string paths.
+  // We don't actually access component here, so it's fine.
+  return reactRouterMatchRoutes(routeConfig, pathname);
+}
+
 type BrokenLink = {
  link: string;
  resolvedLink: string;
@ -26,88 +33,121 @@ type CollectedLinks = {
  [pathname: string]: {links: string[]; anchors: string[]};
 };

-function getBrokenLinksForPage({
+// We use efficient data structures for performance reasons
+// See https://github.com/facebook/docusaurus/issues/9754
+type CollectedLinksNormalized = Map<
+  string,
+  {links: Set<string>; anchors: Set<string>}
+>;
+
+type BrokenLinksHelper = {
+  collectedLinks: CollectedLinksNormalized;
+  isPathBrokenLink: (linkPath: URLPath) => boolean;
+  isAnchorBrokenLink: (linkPath: URLPath) => boolean;
+};
+
+function createBrokenLinksHelper({
  collectedLinks,
-  pagePath,
-  pageLinks,
  routes,
 }: {
-  collectedLinks: CollectedLinks;
-  pagePath: string;
-  pageLinks: string[];
-  pageAnchors: string[];
+  collectedLinks: CollectedLinksNormalized;
  routes: RouteConfig[];
-}): BrokenLink[] {
-  const allCollectedPaths = new Set(Object.keys(collectedLinks));
+}): BrokenLinksHelper {
+  const validPathnames = new Set(collectedLinks.keys());
+
+  // Matching against the route array can be expensive
+  // If the route is already in the valid pathnames,
+  // we can avoid matching against it as an optimization
+  const remainingRoutes = routes.filter(
+    (route) => !validPathnames.has(route.path),
+  );
+
+  function isPathnameMatchingAnyRoute(pathname: string): boolean {
+    if (matchRoutes(remainingRoutes, pathname).length > 0) {
+      // IMPORTANT: this is an optimization here
+      // See https://github.com/facebook/docusaurus/issues/9754
+      // Large Docusaurus sites have many routes!
+      // We try to minimize calls to a possibly expensive matchRoutes function
+      validPathnames.add(pathname);
+      return true;
+    }
+
+    return false;
+  }

  function isPathBrokenLink(linkPath: URLPath) {
    const pathnames = [linkPath.pathname, decodeURI(linkPath.pathname)];
-    const matchedRoutes = pathnames
-      // @ts-expect-error: React router types RouteConfig with an actual React
-      // component, but we load route components with string paths.
-      // We don't actually access component here, so it's fine.
-      .map((l) => matchRoutes(routes, l))
-      .flat();
-    // The link path is broken if:
-    // - it doesn't match any route
-    // - it doesn't match any collected path
-    return (
-      matchedRoutes.length === 0 &&
-      !pathnames.some((p) => allCollectedPaths.has(p))
-    );
+    if (pathnames.some((p) => validPathnames.has(p))) {
+      return false;
+    }
+    if (pathnames.some(isPathnameMatchingAnyRoute)) {
+      return false;
+    }
+    return true;
  }

  function isAnchorBrokenLink(linkPath: URLPath) {
    const {pathname, hash} = linkPath;
-
    // Link has no hash: it can't be a broken anchor link
    if (hash === undefined) {
      return false;
    }
-
    // Link has empty hash ("#", "/page#"...): we do not report it as broken
    // Empty hashes are used for various weird reasons, by us and other users...
    // See for example: https://github.com/facebook/docusaurus/pull/6003
    if (hash === '') {
      return false;
    }
-
    const targetPage =
-      collectedLinks[pathname] || collectedLinks[decodeURI(pathname)];
-
+      collectedLinks.get(pathname) || collectedLinks.get(decodeURI(pathname));
    // link with anchor to a page that does not exist (or did not collect any
    // link/anchor) is considered as a broken anchor
    if (!targetPage) {
      return true;
    }
-
-    // it's a broken anchor if the target page exists
-    // but the anchor does not exist on that page
-    const hashes = [hash, decodeURIComponent(hash)];
-    return !targetPage.anchors.some((anchor) => hashes.includes(anchor));
+    // it's a not broken anchor if the anchor exists on the target page
+    if (
+      targetPage.anchors.has(hash) ||
+      targetPage.anchors.has(decodeURIComponent(hash))
+    ) {
+      return false;
+    }
+    return true;
  }

-  const brokenLinks = pageLinks.flatMap((link) => {
+  return {
+    collectedLinks,
+    isPathBrokenLink,
+    isAnchorBrokenLink,
+  };
+}
+
+function getBrokenLinksForPage({
+  pagePath,
+  helper,
+}: {
+  pagePath: string;
+  helper: BrokenLinksHelper;
+}): BrokenLink[] {
+  const pageData = helper.collectedLinks.get(pagePath)!;
+
+  const brokenLinks: BrokenLink[] = [];
+
+  pageData.links.forEach((link) => {
    const linkPath = parseURLPath(link, pagePath);
-    if (isPathBrokenLink(linkPath)) {
-      return [
-        {
-          link,
-          resolvedLink: serializeURLPath(linkPath),
-          anchor: false,
-        },
-      ];
+    if (helper.isPathBrokenLink(linkPath)) {
+      brokenLinks.push({
+        link,
+        resolvedLink: serializeURLPath(linkPath),
+        anchor: false,
+      });
+    } else if (helper.isAnchorBrokenLink(linkPath)) {
+      brokenLinks.push({
+        link,
+        resolvedLink: serializeURLPath(linkPath),
+        anchor: true,
+      });
    }
-    if (isAnchorBrokenLink(linkPath)) {
-      return [
-        {
-          link,
-          resolvedLink: serializeURLPath(linkPath),
-          anchor: true,
-        },
-      ];
-    }
-    return [];
  });

  return brokenLinks;
@ -128,19 +168,22 @@ function getBrokenLinks({
  collectedLinks,
  routes,
 }: {
-  collectedLinks: CollectedLinks;
+  collectedLinks: CollectedLinksNormalized;
  routes: RouteConfig[];
 }): BrokenLinksMap {
  const filteredRoutes = filterIntermediateRoutes(routes);

-  return _.mapValues(collectedLinks, (pageCollectedData, pagePath) => {
+  const helper = createBrokenLinksHelper({
+    collectedLinks,
+    routes: filteredRoutes,
+  });
+
+  const result: BrokenLinksMap = {};
+  collectedLinks.forEach((_unused, pagePath) => {
    try {
-      return getBrokenLinksForPage({
-        collectedLinks,
-        pageLinks: pageCollectedData.links,
-        pageAnchors: pageCollectedData.anchors,
+      result[pagePath] = getBrokenLinksForPage({
        pagePath,
-        routes: filteredRoutes,
+        helper,
      });
    } catch (e) {
      throw new Error(`Unable to get broken links for page ${pagePath}.`, {
@ -148,6 +191,7 @@ function getBrokenLinks({
      });
    }
  });
+  return result;
 }

 function brokenLinkMessage(brokenLink: BrokenLink): string {
@ -303,15 +347,22 @@ function reportBrokenLinks({
 // JS users might call "collectLink(undefined)" for example
 // TS users might call "collectAnchor('#hash')" with/without #
 // We clean/normalize the collected data to avoid obscure errors being thrown
+// We also use optimized data structures for a faster algorithm
 function normalizeCollectedLinks(
  collectedLinks: CollectedLinks,
-): CollectedLinks {
-  return _.mapValues(collectedLinks, (pageCollectedData) => ({
-    links: pageCollectedData.links.filter(_.isString),
-    anchors: pageCollectedData.anchors
-      .filter(_.isString)
-      .map((anchor) => (anchor.startsWith('#') ? anchor.slice(1) : anchor)),
-  }));
+): CollectedLinksNormalized {
+  const result: CollectedLinksNormalized = new Map();
+  Object.entries(collectedLinks).forEach(([pathname, pageCollectedData]) => {
+    result.set(pathname, {
+      links: new Set(pageCollectedData.links.filter(_.isString)),
+      anchors: new Set(
+        pageCollectedData.anchors
+          .filter(_.isString)
+          .map((anchor) => (anchor.startsWith('#') ? anchor.slice(1) : anchor)),
+      ),
+    });
+  });
+  return result;
 }

 export async function handleBrokenLinks({