mirror of
https://github.com/facebook/docusaurus.git
synced 2025-04-30 02:37:59 +02:00
perf(core): optimize broken links checker (#9778)
This commit is contained in:
parent
8597e5dcdb
commit
c827b6de2d
2 changed files with 174 additions and 66 deletions
|
@ -6,6 +6,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import {jest} from '@jest/globals';
|
import {jest} from '@jest/globals';
|
||||||
|
import reactRouterConfig from 'react-router-config';
|
||||||
import {handleBrokenLinks} from '../brokenLinks';
|
import {handleBrokenLinks} from '../brokenLinks';
|
||||||
import type {RouteConfig} from '@docusaurus/types';
|
import type {RouteConfig} from '@docusaurus/types';
|
||||||
|
|
||||||
|
@ -718,4 +719,60 @@ describe('handleBrokenLinks', () => {
|
||||||
"
|
"
|
||||||
`);
|
`);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('is performant and minimize calls to matchRoutes', async () => {
|
||||||
|
const matchRoutesMock = jest.spyOn(reactRouterConfig, 'matchRoutes');
|
||||||
|
|
||||||
|
const scale = 100;
|
||||||
|
|
||||||
|
const routes: SimpleRoute[] = [
|
||||||
|
...Array.from<SimpleRoute>({length: scale}).map((_, i) => ({
|
||||||
|
path: `/page${i}`,
|
||||||
|
})),
|
||||||
|
...Array.from<SimpleRoute>({length: scale}).fill({
|
||||||
|
path: '/pageDynamic/:subpath1',
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const collectedLinks: Params['collectedLinks'] = Object.fromEntries(
|
||||||
|
Array.from<SimpleRoute>({length: scale}).map((_, i) => [
|
||||||
|
`/page${i}`,
|
||||||
|
{
|
||||||
|
links: [
|
||||||
|
...Array.from<SimpleRoute>({length: scale}).flatMap((_2, j) => [
|
||||||
|
`/page${j}`,
|
||||||
|
`/page${j}?age=42`,
|
||||||
|
`/page${j}#anchor${j}`,
|
||||||
|
`/page${j}?age=42#anchor${j}`,
|
||||||
|
`/pageDynamic/subPath${j}`,
|
||||||
|
`/pageDynamic/subPath${j}?age=42`,
|
||||||
|
// `/pageDynamic/subPath${j}#anchor${j}`,
|
||||||
|
// `/pageDynamic/subPath${j}?age=42#anchor${j}`,
|
||||||
|
]),
|
||||||
|
],
|
||||||
|
anchors: Array.from<SimpleRoute>({length: scale}).map(
|
||||||
|
(_2, j) => `anchor${j}`,
|
||||||
|
),
|
||||||
|
},
|
||||||
|
]),
|
||||||
|
);
|
||||||
|
|
||||||
|
// console.time('testBrokenLinks');
|
||||||
|
await testBrokenLinks({
|
||||||
|
routes,
|
||||||
|
collectedLinks,
|
||||||
|
});
|
||||||
|
// console.timeEnd('testBrokenLinks');
|
||||||
|
|
||||||
|
// Idiomatic code calling matchRoutes multiple times is not performant
|
||||||
|
// We try to minimize the calls to this expensive function
|
||||||
|
// Otherwise large sites will have super long execution times
|
||||||
|
// See https://github.com/facebook/docusaurus/issues/9754
|
||||||
|
// See https://twitter.com/sebastienlorber/status/1749392773415858587
|
||||||
|
// We expect no more matchRoutes calls than the number of distinct dynamic route link pathnames
|
||||||
|
expect(matchRoutesMock).toHaveBeenCalledTimes(scale);
|
||||||
|
// We expect matchRoutes to be called with a reduced number of routes
|
||||||
|
expect(routes).toHaveLength(scale * 2);
|
||||||
|
expect(matchRoutesMock.mock.calls[0]![0]).toHaveLength(scale);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
|
@ -7,11 +7,18 @@
|
||||||
|
|
||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
import logger from '@docusaurus/logger';
|
import logger from '@docusaurus/logger';
|
||||||
import {matchRoutes} from 'react-router-config';
|
import {matchRoutes as reactRouterMatchRoutes} from 'react-router-config';
|
||||||
import {parseURLPath, serializeURLPath, type URLPath} from '@docusaurus/utils';
|
import {parseURLPath, serializeURLPath, type URLPath} from '@docusaurus/utils';
|
||||||
import {getAllFinalRoutes} from './utils';
|
import {getAllFinalRoutes} from './utils';
|
||||||
import type {RouteConfig, ReportingSeverity} from '@docusaurus/types';
|
import type {RouteConfig, ReportingSeverity} from '@docusaurus/types';
|
||||||
|
|
||||||
|
function matchRoutes(routeConfig: RouteConfig[], pathname: string) {
|
||||||
|
// @ts-expect-error: React router types RouteConfig with an actual React
|
||||||
|
// component, but we load route components with string paths.
|
||||||
|
// We don't actually access component here, so it's fine.
|
||||||
|
return reactRouterMatchRoutes(routeConfig, pathname);
|
||||||
|
}
|
||||||
|
|
||||||
type BrokenLink = {
|
type BrokenLink = {
|
||||||
link: string;
|
link: string;
|
||||||
resolvedLink: string;
|
resolvedLink: string;
|
||||||
|
@ -26,88 +33,121 @@ type CollectedLinks = {
|
||||||
[pathname: string]: {links: string[]; anchors: string[]};
|
[pathname: string]: {links: string[]; anchors: string[]};
|
||||||
};
|
};
|
||||||
|
|
||||||
function getBrokenLinksForPage({
|
// We use efficient data structures for performance reasons
|
||||||
|
// See https://github.com/facebook/docusaurus/issues/9754
|
||||||
|
type CollectedLinksNormalized = Map<
|
||||||
|
string,
|
||||||
|
{links: Set<string>; anchors: Set<string>}
|
||||||
|
>;
|
||||||
|
|
||||||
|
type BrokenLinksHelper = {
|
||||||
|
collectedLinks: CollectedLinksNormalized;
|
||||||
|
isPathBrokenLink: (linkPath: URLPath) => boolean;
|
||||||
|
isAnchorBrokenLink: (linkPath: URLPath) => boolean;
|
||||||
|
};
|
||||||
|
|
||||||
|
function createBrokenLinksHelper({
|
||||||
collectedLinks,
|
collectedLinks,
|
||||||
pagePath,
|
|
||||||
pageLinks,
|
|
||||||
routes,
|
routes,
|
||||||
}: {
|
}: {
|
||||||
collectedLinks: CollectedLinks;
|
collectedLinks: CollectedLinksNormalized;
|
||||||
pagePath: string;
|
|
||||||
pageLinks: string[];
|
|
||||||
pageAnchors: string[];
|
|
||||||
routes: RouteConfig[];
|
routes: RouteConfig[];
|
||||||
}): BrokenLink[] {
|
}): BrokenLinksHelper {
|
||||||
const allCollectedPaths = new Set(Object.keys(collectedLinks));
|
const validPathnames = new Set(collectedLinks.keys());
|
||||||
|
|
||||||
|
// Matching against the route array can be expensive
|
||||||
|
// If the route is already in the valid pathnames,
|
||||||
|
// we can avoid matching against it as an optimization
|
||||||
|
const remainingRoutes = routes.filter(
|
||||||
|
(route) => !validPathnames.has(route.path),
|
||||||
|
);
|
||||||
|
|
||||||
|
function isPathnameMatchingAnyRoute(pathname: string): boolean {
|
||||||
|
if (matchRoutes(remainingRoutes, pathname).length > 0) {
|
||||||
|
// IMPORTANT: this is an optimization here
|
||||||
|
// See https://github.com/facebook/docusaurus/issues/9754
|
||||||
|
// Large Docusaurus sites have many routes!
|
||||||
|
// We try to minimize calls to a possibly expensive matchRoutes function
|
||||||
|
validPathnames.add(pathname);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
function isPathBrokenLink(linkPath: URLPath) {
|
function isPathBrokenLink(linkPath: URLPath) {
|
||||||
const pathnames = [linkPath.pathname, decodeURI(linkPath.pathname)];
|
const pathnames = [linkPath.pathname, decodeURI(linkPath.pathname)];
|
||||||
const matchedRoutes = pathnames
|
if (pathnames.some((p) => validPathnames.has(p))) {
|
||||||
// @ts-expect-error: React router types RouteConfig with an actual React
|
return false;
|
||||||
// component, but we load route components with string paths.
|
}
|
||||||
// We don't actually access component here, so it's fine.
|
if (pathnames.some(isPathnameMatchingAnyRoute)) {
|
||||||
.map((l) => matchRoutes(routes, l))
|
return false;
|
||||||
.flat();
|
}
|
||||||
// The link path is broken if:
|
return true;
|
||||||
// - it doesn't match any route
|
|
||||||
// - it doesn't match any collected path
|
|
||||||
return (
|
|
||||||
matchedRoutes.length === 0 &&
|
|
||||||
!pathnames.some((p) => allCollectedPaths.has(p))
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function isAnchorBrokenLink(linkPath: URLPath) {
|
function isAnchorBrokenLink(linkPath: URLPath) {
|
||||||
const {pathname, hash} = linkPath;
|
const {pathname, hash} = linkPath;
|
||||||
|
|
||||||
// Link has no hash: it can't be a broken anchor link
|
// Link has no hash: it can't be a broken anchor link
|
||||||
if (hash === undefined) {
|
if (hash === undefined) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Link has empty hash ("#", "/page#"...): we do not report it as broken
|
// Link has empty hash ("#", "/page#"...): we do not report it as broken
|
||||||
// Empty hashes are used for various weird reasons, by us and other users...
|
// Empty hashes are used for various weird reasons, by us and other users...
|
||||||
// See for example: https://github.com/facebook/docusaurus/pull/6003
|
// See for example: https://github.com/facebook/docusaurus/pull/6003
|
||||||
if (hash === '') {
|
if (hash === '') {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const targetPage =
|
const targetPage =
|
||||||
collectedLinks[pathname] || collectedLinks[decodeURI(pathname)];
|
collectedLinks.get(pathname) || collectedLinks.get(decodeURI(pathname));
|
||||||
|
|
||||||
// link with anchor to a page that does not exist (or did not collect any
|
// link with anchor to a page that does not exist (or did not collect any
|
||||||
// link/anchor) is considered as a broken anchor
|
// link/anchor) is considered as a broken anchor
|
||||||
if (!targetPage) {
|
if (!targetPage) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
// it's not a broken anchor if the anchor exists on the target page
|
||||||
// it's a broken anchor if the target page exists
|
if (
|
||||||
// but the anchor does not exist on that page
|
targetPage.anchors.has(hash) ||
|
||||||
const hashes = [hash, decodeURIComponent(hash)];
|
targetPage.anchors.has(decodeURIComponent(hash))
|
||||||
return !targetPage.anchors.some((anchor) => hashes.includes(anchor));
|
) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
const brokenLinks = pageLinks.flatMap((link) => {
|
return {
|
||||||
|
collectedLinks,
|
||||||
|
isPathBrokenLink,
|
||||||
|
isAnchorBrokenLink,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function getBrokenLinksForPage({
|
||||||
|
pagePath,
|
||||||
|
helper,
|
||||||
|
}: {
|
||||||
|
pagePath: string;
|
||||||
|
helper: BrokenLinksHelper;
|
||||||
|
}): BrokenLink[] {
|
||||||
|
const pageData = helper.collectedLinks.get(pagePath)!;
|
||||||
|
|
||||||
|
const brokenLinks: BrokenLink[] = [];
|
||||||
|
|
||||||
|
pageData.links.forEach((link) => {
|
||||||
const linkPath = parseURLPath(link, pagePath);
|
const linkPath = parseURLPath(link, pagePath);
|
||||||
if (isPathBrokenLink(linkPath)) {
|
if (helper.isPathBrokenLink(linkPath)) {
|
||||||
return [
|
brokenLinks.push({
|
||||||
{
|
link,
|
||||||
link,
|
resolvedLink: serializeURLPath(linkPath),
|
||||||
resolvedLink: serializeURLPath(linkPath),
|
anchor: false,
|
||||||
anchor: false,
|
});
|
||||||
},
|
} else if (helper.isAnchorBrokenLink(linkPath)) {
|
||||||
];
|
brokenLinks.push({
|
||||||
|
link,
|
||||||
|
resolvedLink: serializeURLPath(linkPath),
|
||||||
|
anchor: true,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
if (isAnchorBrokenLink(linkPath)) {
|
|
||||||
return [
|
|
||||||
{
|
|
||||||
link,
|
|
||||||
resolvedLink: serializeURLPath(linkPath),
|
|
||||||
anchor: true,
|
|
||||||
},
|
|
||||||
];
|
|
||||||
}
|
|
||||||
return [];
|
|
||||||
});
|
});
|
||||||
|
|
||||||
return brokenLinks;
|
return brokenLinks;
|
||||||
|
@ -128,19 +168,22 @@ function getBrokenLinks({
|
||||||
collectedLinks,
|
collectedLinks,
|
||||||
routes,
|
routes,
|
||||||
}: {
|
}: {
|
||||||
collectedLinks: CollectedLinks;
|
collectedLinks: CollectedLinksNormalized;
|
||||||
routes: RouteConfig[];
|
routes: RouteConfig[];
|
||||||
}): BrokenLinksMap {
|
}): BrokenLinksMap {
|
||||||
const filteredRoutes = filterIntermediateRoutes(routes);
|
const filteredRoutes = filterIntermediateRoutes(routes);
|
||||||
|
|
||||||
return _.mapValues(collectedLinks, (pageCollectedData, pagePath) => {
|
const helper = createBrokenLinksHelper({
|
||||||
|
collectedLinks,
|
||||||
|
routes: filteredRoutes,
|
||||||
|
});
|
||||||
|
|
||||||
|
const result: BrokenLinksMap = {};
|
||||||
|
collectedLinks.forEach((_unused, pagePath) => {
|
||||||
try {
|
try {
|
||||||
return getBrokenLinksForPage({
|
result[pagePath] = getBrokenLinksForPage({
|
||||||
collectedLinks,
|
|
||||||
pageLinks: pageCollectedData.links,
|
|
||||||
pageAnchors: pageCollectedData.anchors,
|
|
||||||
pagePath,
|
pagePath,
|
||||||
routes: filteredRoutes,
|
helper,
|
||||||
});
|
});
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
throw new Error(`Unable to get broken links for page ${pagePath}.`, {
|
throw new Error(`Unable to get broken links for page ${pagePath}.`, {
|
||||||
|
@ -148,6 +191,7 @@ function getBrokenLinks({
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
function brokenLinkMessage(brokenLink: BrokenLink): string {
|
function brokenLinkMessage(brokenLink: BrokenLink): string {
|
||||||
|
@ -303,15 +347,22 @@ function reportBrokenLinks({
|
||||||
// JS users might call "collectLink(undefined)" for example
|
// JS users might call "collectLink(undefined)" for example
|
||||||
// TS users might call "collectAnchor('#hash')" with/without #
|
// TS users might call "collectAnchor('#hash')" with/without #
|
||||||
// We clean/normalize the collected data to avoid obscure errors being thrown
|
// We clean/normalize the collected data to avoid obscure errors being thrown
|
||||||
|
// We also use optimized data structures for a faster algorithm
|
||||||
function normalizeCollectedLinks(
|
function normalizeCollectedLinks(
|
||||||
collectedLinks: CollectedLinks,
|
collectedLinks: CollectedLinks,
|
||||||
): CollectedLinks {
|
): CollectedLinksNormalized {
|
||||||
return _.mapValues(collectedLinks, (pageCollectedData) => ({
|
const result: CollectedLinksNormalized = new Map();
|
||||||
links: pageCollectedData.links.filter(_.isString),
|
Object.entries(collectedLinks).forEach(([pathname, pageCollectedData]) => {
|
||||||
anchors: pageCollectedData.anchors
|
result.set(pathname, {
|
||||||
.filter(_.isString)
|
links: new Set(pageCollectedData.links.filter(_.isString)),
|
||||||
.map((anchor) => (anchor.startsWith('#') ? anchor.slice(1) : anchor)),
|
anchors: new Set(
|
||||||
}));
|
pageCollectedData.anchors
|
||||||
|
.filter(_.isString)
|
||||||
|
.map((anchor) => (anchor.startsWith('#') ? anchor.slice(1) : anchor)),
|
||||||
|
),
|
||||||
|
});
|
||||||
|
});
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function handleBrokenLinks({
|
export async function handleBrokenLinks({
|
||||||
|
|
Loading…
Add table
Reference in a new issue