mirror of
https://github.com/facebook/docusaurus.git
synced 2025-08-06 02:08:55 +02:00
fix(sitemap): filter all routes with robots meta containing noindex (#7964)
This commit is contained in:
parent
39883e70de
commit
a335a69982
4 changed files with 90 additions and 13 deletions
|
@ -158,7 +158,10 @@ describe('createSitemap', () => {
|
||||||
meta: {
|
meta: {
|
||||||
// @ts-expect-error: bad lib def
|
// @ts-expect-error: bad lib def
|
||||||
toComponent: () => [
|
toComponent: () => [
|
||||||
React.createElement('meta', {name: 'robots', content: 'noindex'}),
|
React.createElement('meta', {
|
||||||
|
name: 'robots',
|
||||||
|
content: 'NoFolloW, NoiNDeX',
|
||||||
|
}),
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
|
@ -13,6 +13,40 @@ import type {DocusaurusConfig} from '@docusaurus/types';
|
||||||
import type {HelmetServerState} from 'react-helmet-async';
|
import type {HelmetServerState} from 'react-helmet-async';
|
||||||
import type {PluginOptions} from './options';
|
import type {PluginOptions} from './options';
|
||||||
|
|
||||||
|
/**
 * Tells whether the page rendered at `route` declares a robots meta tag whose
 * content includes the `noindex` directive.
 *
 * Both the meta `name` and the directive list are compared
 * case-insensitively, and `noindex` may appear alongside other directives
 * (e.g. `"NoFollow, NoIndex"`).
 *
 * @param head Helmet server state collected per route location.
 * @param route The route path to look up in `head`.
 * @returns `true` when a matching robots/noindex meta tag is found; `false`
 * otherwise, including when no head data was recorded for `route`.
 */
function isNoIndexMetaRoute({
  head,
  route,
}: {
  head: {[location: string]: HelmetServerState};
  route: string;
}): boolean {
  // The lib typing of toComponent() is wrong, hence the double cast:
  // https://github.com/staylor/react-helmet-async/pull/167
  const metaTags = head[route]?.meta.toComponent() as unknown as
    | ReactElement<{name?: string; content?: string}>[]
    | undefined;

  // No head entry for this route: nothing can mark it as noindex.
  // Return a real boolean instead of leaking `undefined` to callers.
  if (!metaTags) {
    return false;
  }

  return metaTags.some(({props: {name, content}}) => {
    // A tag missing either attribute cannot be a robots directive
    if (!name || !content) {
      return false;
    }
    return (
      // meta name is not case-sensitive
      name.toLowerCase() === 'robots' &&
      // Robots directives are not case-sensitive
      content.toLowerCase().includes('noindex')
    );
  });
}
|
||||||
|
|
||||||
export default async function createSitemap(
|
export default async function createSitemap(
|
||||||
siteConfig: DocusaurusConfig,
|
siteConfig: DocusaurusConfig,
|
||||||
routesPaths: string[],
|
routesPaths: string[],
|
||||||
|
@ -27,18 +61,15 @@ export default async function createSitemap(
|
||||||
|
|
||||||
const ignoreMatcher = createMatcher(ignorePatterns);
|
const ignoreMatcher = createMatcher(ignorePatterns);
|
||||||
|
|
||||||
const includedRoutes = routesPaths.filter((route) => {
|
function isRouteExcluded(route: string) {
|
||||||
if (route.endsWith('404.html') || ignoreMatcher(route)) {
|
return (
|
||||||
return false;
|
route.endsWith('404.html') ||
|
||||||
}
|
ignoreMatcher(route) ||
|
||||||
// https://github.com/staylor/react-helmet-async/pull/167
|
isNoIndexMetaRoute({head, route})
|
||||||
const meta = head[route]?.meta.toComponent() as unknown as
|
|
||||||
| ReactElement<{name?: string; content?: string}>[]
|
|
||||||
| undefined;
|
|
||||||
return !meta?.some(
|
|
||||||
(tag) => tag.props.name === 'robots' && tag.props.content === 'noindex',
|
|
||||||
);
|
);
|
||||||
});
|
}
|
||||||
|
|
||||||
|
const includedRoutes = routesPaths.filter((route) => !isRouteExcluded(route));
|
||||||
|
|
||||||
if (includedRoutes.length === 0) {
|
if (includedRoutes.length === 0) {
|
||||||
return null;
|
return null;
|
||||||
|
|
|
@ -124,7 +124,11 @@ Read more about the robots file in [the Google documentation](https://developers
|
||||||
|
|
||||||
:::caution
|
:::caution
|
||||||
|
|
||||||
**Important**: the `robots.txt` file does **not** prevent HTML pages from being indexed. Use `<meta name="robots" content="noindex">` as [page metadata](#single-page-metadata) to prevent it from appearing in search results entirely.
|
**Important**: the `robots.txt` file does **not** prevent HTML pages from being indexed.
|
||||||
|
|
||||||
|
To prevent your whole Docusaurus site from being indexed, use the [`noIndex`](./api/docusaurus.config.js.md#noIndex) site config. Some [hosting providers](./deployment.mdx) may also let you configure an `X-Robots-Tag: noindex` HTTP header (GitHub Pages does not support this).
|
||||||
|
|
||||||
|
To prevent a single page from being indexed, use `<meta name="robots" content="noindex">` as [page metadata](#single-page-metadata). Read more about the [robots meta tag](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag).
|
||||||
|
|
||||||
:::
|
:::
|
||||||
|
|
||||||
|
@ -132,6 +136,20 @@ Read more about the robots file in [the Google documentation](https://developers
|
||||||
|
|
||||||
Docusaurus provides the [`@docusaurus/plugin-sitemap`](./api/plugins/plugin-sitemap.md) plugin, which is shipped with `preset-classic` by default. It autogenerates a `sitemap.xml` file which will be available at `https://example.com/[baseUrl]/sitemap.xml` after the production build. This sitemap metadata helps search engine crawlers crawl your site more accurately.
|
Docusaurus provides the [`@docusaurus/plugin-sitemap`](./api/plugins/plugin-sitemap.md) plugin, which is shipped with `preset-classic` by default. It autogenerates a `sitemap.xml` file which will be available at `https://example.com/[baseUrl]/sitemap.xml` after the production build. This sitemap metadata helps search engine crawlers crawl your site more accurately.
|
||||||
|
|
||||||
|
:::tip
|
||||||
|
|
||||||
|
The sitemap plugin automatically filters out pages containing a `noindex` [robots meta directive](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag).
|
||||||
|
|
||||||
|
For example, [`/examples/noIndex`](/examples/noIndex) is not included in the [Docusaurus sitemap.xml file](pathname:///sitemap.xml) because it contains the following [page metadata](#single-page-metadata):
|
||||||
|
|
||||||
|
```html
|
||||||
|
<head>
|
||||||
|
<meta name="robots" content="noindex, nofollow" />
|
||||||
|
</head>
|
||||||
|
```
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
## Human readable links {#human-readable-links}
|
## Human readable links {#human-readable-links}
|
||||||
|
|
||||||
Docusaurus uses your file names as links, but you can always change that using slugs, see this [tutorial](./guides/docs/docs-introduction.md#document-id) for more details.
|
Docusaurus uses your file names as links, but you can always change that using slugs, see this [tutorial](./guides/docs/docs-introduction.md#document-id) for more details.
|
||||||
|
|
25
website/src/pages/examples/noIndex.md
Normal file
25
website/src/pages/examples/noIndex.md
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
# No Index Page example
|
||||||
|
|
||||||
|
<head>
|
||||||
|
<meta name="robots" content="nOiNdeX, NoFolLoW" />
|
||||||
|
</head>
|
||||||
|
|
||||||
|
This page will not be indexed by search engines because it contains the following [page metadata](/docs/seo#single-page-metadata) markup:
|
||||||
|
|
||||||
|
```html
|
||||||
|
<head>
|
||||||
|
<meta name="robots" content="noindex, nofollow" />
|
||||||
|
</head>
|
||||||
|
```
|
||||||
|
|
||||||
|
:::tip
|
||||||
|
|
||||||
|
The sitemap plugin filters out pages containing a `noindex` content value. This page doesn't appear in the Docusaurus [sitemap.xml](pathname:///sitemap.xml) file.
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
|
:::note
|
||||||
|
|
||||||
|
Robots directives are [case-insensitive](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag#directives).
|
||||||
|
|
||||||
|
:::
|
Loading…
Add table
Add a link
Reference in a new issue