mirror of
https://github.com/facebook/docusaurus.git
synced 2025-04-29 18:27:56 +02:00
perf(core): optimize broken links checker (#9778)
This commit is contained in:
parent
8597e5dcdb
commit
c827b6de2d
2 changed files with 174 additions and 66 deletions
|
@ -6,6 +6,7 @@
|
|||
*/
|
||||
|
||||
import {jest} from '@jest/globals';
|
||||
import reactRouterConfig from 'react-router-config';
|
||||
import {handleBrokenLinks} from '../brokenLinks';
|
||||
import type {RouteConfig} from '@docusaurus/types';
|
||||
|
||||
|
@ -718,4 +719,60 @@ describe('handleBrokenLinks', () => {
|
|||
"
|
||||
`);
|
||||
});
|
||||
|
||||
it('is performant and minimize calls to matchRoutes', async () => {
  // Spy on the real react-router-config matcher so its invocations can be
  // counted. The spy is restored in `finally` so it never leaks into other
  // tests, even when an assertion below throws.
  const matchRoutesMock = jest.spyOn(reactRouterConfig, 'matchRoutes');

  try {
    const scale = 100;

    // `scale` static routes (/page0, /page1, ...) followed by `scale` entries
    // of the same dynamic route pattern.
    const routes: SimpleRoute[] = [
      ...Array.from({length: scale}, (_, i) => ({
        path: `/page${i}`,
      })),
      ...Array.from<SimpleRoute>({length: scale}).fill({
        path: '/pageDynamic/:subpath1',
      }),
    ];

    // Every static page links to every static page (with and without query
    // string / anchor) and to `scale` distinct dynamic pathnames.
    const collectedLinks: Params['collectedLinks'] = Object.fromEntries(
      Array.from({length: scale}, (_, i) => [
        `/page${i}`,
        {
          links: Array.from({length: scale}, (_2, j) => [
            `/page${j}`,
            `/page${j}?age=42`,
            `/page${j}#anchor${j}`,
            `/page${j}?age=42#anchor${j}`,
            `/pageDynamic/subPath${j}`,
            `/pageDynamic/subPath${j}?age=42`,
            // `/pageDynamic/subPath${j}#anchor${j}`,
            // `/pageDynamic/subPath${j}?age=42#anchor${j}`,
          ]).flat(),
          anchors: Array.from({length: scale}, (_2, j) => `anchor${j}`),
        },
      ]),
    );

    // console.time('testBrokenLinks');
    await testBrokenLinks({
      routes,
      collectedLinks,
    });
    // console.timeEnd('testBrokenLinks');

    // Idiomatic code calling matchRoutes multiple times is not performant
    // We try to minimize the calls to this expensive function
    // Otherwise large sites will have super long execution times
    // See https://github.com/facebook/docusaurus/issues/9754
    // See https://twitter.com/sebastienlorber/status/1749392773415858587
    // We expect no more matchRoutes calls than the number of distinct
    // dynamic route pathnames that are linked to
    expect(matchRoutesMock).toHaveBeenCalledTimes(scale);
    // We expect matchRoutes to be called with a reduced number of routes:
    // only the dynamic half of the route list should be passed to it
    expect(routes).toHaveLength(scale * 2);
    expect(matchRoutesMock.mock.calls[0]![0]).toHaveLength(scale);
  } finally {
    matchRoutesMock.mockRestore();
  }
});
|
||||
});
|
||||
|
|
|
@ -7,11 +7,18 @@
|
|||
|
||||
import _ from 'lodash';
|
||||
import logger from '@docusaurus/logger';
|
||||
import {matchRoutes} from 'react-router-config';
|
||||
import {matchRoutes as reactRouterMatchRoutes} from 'react-router-config';
|
||||
import {parseURLPath, serializeURLPath, type URLPath} from '@docusaurus/utils';
|
||||
import {getAllFinalRoutes} from './utils';
|
||||
import type {RouteConfig, ReportingSeverity} from '@docusaurus/types';
|
||||
|
||||
/**
 * Local wrapper around react-router-config's `matchRoutes` (imported here as
 * `reactRouterMatchRoutes`) that accepts Docusaurus `RouteConfig` objects.
 * It forwards both arguments unchanged and returns the library's result, so
 * the type mismatch only has to be suppressed in this one place.
 *
 * @param routeConfig routes to match against
 * @param pathname pathname to test
 * @returns whatever `reactRouterMatchRoutes` returns for these arguments
 */
function matchRoutes(routeConfig: RouteConfig[], pathname: string) {
|
||||
// @ts-expect-error: React router types RouteConfig with an actual React
|
||||
// component, but we load route components with string paths.
|
||||
// We don't actually access component here, so it's fine.
|
||||
return reactRouterMatchRoutes(routeConfig, pathname);
|
||||
}
|
||||
|
||||
type BrokenLink = {
|
||||
link: string;
|
||||
resolvedLink: string;
|
||||
|
@ -26,88 +33,121 @@ type CollectedLinks = {
|
|||
[pathname: string]: {links: string[]; anchors: string[]};
|
||||
};
|
||||
|
||||
function getBrokenLinksForPage({
|
||||
// We use efficient data structures for performance reasons
|
||||
// See https://github.com/facebook/docusaurus/issues/9754
|
||||
/**
 * Collected links keyed by page pathname. Each page's links and anchors are
 * stored as `Set`s (rather than arrays) and the pages themselves in a `Map`
 * (rather than a plain object).
 */
type CollectedLinksNormalized = Map<
|
||||
string,
|
||||
{links: Set<string>; anchors: Set<string>}
|
||||
>;
|
||||
|
||||
/**
 * Bundle of the normalized collected links together with the two predicates
 * used to classify a parsed link as broken.
 */
type BrokenLinksHelper = {
|
||||
// Normalized links/anchors for every collected page, keyed by pathname
collectedLinks: CollectedLinksNormalized;
|
||||
// Returns true when the link's pathname does not resolve to any page/route
isPathBrokenLink: (linkPath: URLPath) => boolean;
|
||||
// Returns true when the link carries a non-empty hash that cannot be found
// among the anchors of the target page (or the target page is unknown)
isAnchorBrokenLink: (linkPath: URLPath) => boolean;
|
||||
};
|
||||
|
||||
function createBrokenLinksHelper({
|
||||
collectedLinks,
|
||||
pagePath,
|
||||
pageLinks,
|
||||
routes,
|
||||
}: {
|
||||
collectedLinks: CollectedLinks;
|
||||
pagePath: string;
|
||||
pageLinks: string[];
|
||||
pageAnchors: string[];
|
||||
collectedLinks: CollectedLinksNormalized;
|
||||
routes: RouteConfig[];
|
||||
}): BrokenLink[] {
|
||||
const allCollectedPaths = new Set(Object.keys(collectedLinks));
|
||||
}): BrokenLinksHelper {
|
||||
const validPathnames = new Set(collectedLinks.keys());
|
||||
|
||||
// Matching against the route array can be expensive
|
||||
// If the route is already in the valid pathnames,
|
||||
// we can avoid matching against it as an optimization
|
||||
const remainingRoutes = routes.filter(
|
||||
(route) => !validPathnames.has(route.path),
|
||||
);
|
||||
|
||||
function isPathnameMatchingAnyRoute(pathname: string): boolean {
|
||||
if (matchRoutes(remainingRoutes, pathname).length > 0) {
|
||||
// IMPORTANT: this is an optimization here
|
||||
// See https://github.com/facebook/docusaurus/issues/9754
|
||||
// Large Docusaurus sites have many routes!
|
||||
// We try to minimize calls to a possibly expensive matchRoutes function
|
||||
validPathnames.add(pathname);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
function isPathBrokenLink(linkPath: URLPath) {
|
||||
const pathnames = [linkPath.pathname, decodeURI(linkPath.pathname)];
|
||||
const matchedRoutes = pathnames
|
||||
// @ts-expect-error: React router types RouteConfig with an actual React
|
||||
// component, but we load route components with string paths.
|
||||
// We don't actually access component here, so it's fine.
|
||||
.map((l) => matchRoutes(routes, l))
|
||||
.flat();
|
||||
// The link path is broken if:
|
||||
// - it doesn't match any route
|
||||
// - it doesn't match any collected path
|
||||
return (
|
||||
matchedRoutes.length === 0 &&
|
||||
!pathnames.some((p) => allCollectedPaths.has(p))
|
||||
);
|
||||
if (pathnames.some((p) => validPathnames.has(p))) {
|
||||
return false;
|
||||
}
|
||||
if (pathnames.some(isPathnameMatchingAnyRoute)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
function isAnchorBrokenLink(linkPath: URLPath) {
|
||||
const {pathname, hash} = linkPath;
|
||||
|
||||
// Link has no hash: it can't be a broken anchor link
|
||||
if (hash === undefined) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Link has empty hash ("#", "/page#"...): we do not report it as broken
|
||||
// Empty hashes are used for various weird reasons, by us and other users...
|
||||
// See for example: https://github.com/facebook/docusaurus/pull/6003
|
||||
if (hash === '') {
|
||||
return false;
|
||||
}
|
||||
|
||||
const targetPage =
|
||||
collectedLinks[pathname] || collectedLinks[decodeURI(pathname)];
|
||||
|
||||
collectedLinks.get(pathname) || collectedLinks.get(decodeURI(pathname));
|
||||
// link with anchor to a page that does not exist (or did not collect any
|
||||
// link/anchor) is considered as a broken anchor
|
||||
if (!targetPage) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// it's a broken anchor if the target page exists
|
||||
// but the anchor does not exist on that page
|
||||
const hashes = [hash, decodeURIComponent(hash)];
|
||||
return !targetPage.anchors.some((anchor) => hashes.includes(anchor));
|
||||
// it's not a broken anchor if the anchor exists on the target page
|
||||
if (
|
||||
targetPage.anchors.has(hash) ||
|
||||
targetPage.anchors.has(decodeURIComponent(hash))
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
const brokenLinks = pageLinks.flatMap((link) => {
|
||||
return {
|
||||
collectedLinks,
|
||||
isPathBrokenLink,
|
||||
isAnchorBrokenLink,
|
||||
};
|
||||
}
|
||||
|
||||
function getBrokenLinksForPage({
|
||||
pagePath,
|
||||
helper,
|
||||
}: {
|
||||
pagePath: string;
|
||||
helper: BrokenLinksHelper;
|
||||
}): BrokenLink[] {
|
||||
const pageData = helper.collectedLinks.get(pagePath)!;
|
||||
|
||||
const brokenLinks: BrokenLink[] = [];
|
||||
|
||||
pageData.links.forEach((link) => {
|
||||
const linkPath = parseURLPath(link, pagePath);
|
||||
if (isPathBrokenLink(linkPath)) {
|
||||
return [
|
||||
{
|
||||
link,
|
||||
resolvedLink: serializeURLPath(linkPath),
|
||||
anchor: false,
|
||||
},
|
||||
];
|
||||
if (helper.isPathBrokenLink(linkPath)) {
|
||||
brokenLinks.push({
|
||||
link,
|
||||
resolvedLink: serializeURLPath(linkPath),
|
||||
anchor: false,
|
||||
});
|
||||
} else if (helper.isAnchorBrokenLink(linkPath)) {
|
||||
brokenLinks.push({
|
||||
link,
|
||||
resolvedLink: serializeURLPath(linkPath),
|
||||
anchor: true,
|
||||
});
|
||||
}
|
||||
if (isAnchorBrokenLink(linkPath)) {
|
||||
return [
|
||||
{
|
||||
link,
|
||||
resolvedLink: serializeURLPath(linkPath),
|
||||
anchor: true,
|
||||
},
|
||||
];
|
||||
}
|
||||
return [];
|
||||
});
|
||||
|
||||
return brokenLinks;
|
||||
|
@ -128,19 +168,22 @@ function getBrokenLinks({
|
|||
collectedLinks,
|
||||
routes,
|
||||
}: {
|
||||
collectedLinks: CollectedLinks;
|
||||
collectedLinks: CollectedLinksNormalized;
|
||||
routes: RouteConfig[];
|
||||
}): BrokenLinksMap {
|
||||
const filteredRoutes = filterIntermediateRoutes(routes);
|
||||
|
||||
return _.mapValues(collectedLinks, (pageCollectedData, pagePath) => {
|
||||
const helper = createBrokenLinksHelper({
|
||||
collectedLinks,
|
||||
routes: filteredRoutes,
|
||||
});
|
||||
|
||||
const result: BrokenLinksMap = {};
|
||||
collectedLinks.forEach((_unused, pagePath) => {
|
||||
try {
|
||||
return getBrokenLinksForPage({
|
||||
collectedLinks,
|
||||
pageLinks: pageCollectedData.links,
|
||||
pageAnchors: pageCollectedData.anchors,
|
||||
result[pagePath] = getBrokenLinksForPage({
|
||||
pagePath,
|
||||
routes: filteredRoutes,
|
||||
helper,
|
||||
});
|
||||
} catch (e) {
|
||||
throw new Error(`Unable to get broken links for page ${pagePath}.`, {
|
||||
|
@ -148,6 +191,7 @@ function getBrokenLinks({
|
|||
});
|
||||
}
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
function brokenLinkMessage(brokenLink: BrokenLink): string {
|
||||
|
@ -303,15 +347,22 @@ function reportBrokenLinks({
|
|||
// JS users might call "collectLink(undefined)" for example
|
||||
// TS users might call "collectAnchor('#hash')" with/without #
|
||||
// We clean/normalize the collected data to avoid obscure errors being thrown
|
||||
// We also use optimized data structures for a faster algorithm
|
||||
function normalizeCollectedLinks(
|
||||
collectedLinks: CollectedLinks,
|
||||
): CollectedLinks {
|
||||
return _.mapValues(collectedLinks, (pageCollectedData) => ({
|
||||
links: pageCollectedData.links.filter(_.isString),
|
||||
anchors: pageCollectedData.anchors
|
||||
.filter(_.isString)
|
||||
.map((anchor) => (anchor.startsWith('#') ? anchor.slice(1) : anchor)),
|
||||
}));
|
||||
): CollectedLinksNormalized {
|
||||
const result: CollectedLinksNormalized = new Map();
|
||||
Object.entries(collectedLinks).forEach(([pathname, pageCollectedData]) => {
|
||||
result.set(pathname, {
|
||||
links: new Set(pageCollectedData.links.filter(_.isString)),
|
||||
anchors: new Set(
|
||||
pageCollectedData.anchors
|
||||
.filter(_.isString)
|
||||
.map((anchor) => (anchor.startsWith('#') ? anchor.slice(1) : anchor)),
|
||||
),
|
||||
});
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
export async function handleBrokenLinks({
|
||||
|
|
Loading…
Add table
Reference in a new issue