diff --git a/packages/docusaurus-plugin-sitemap/src/__tests__/createSitemap.test.ts b/packages/docusaurus-plugin-sitemap/src/__tests__/createSitemap.test.ts index 9d42f781ba..8a4fa2ac55 100644 --- a/packages/docusaurus-plugin-sitemap/src/__tests__/createSitemap.test.ts +++ b/packages/docusaurus-plugin-sitemap/src/__tests__/createSitemap.test.ts @@ -158,7 +158,10 @@ describe('createSitemap', () => { meta: { // @ts-expect-error: bad lib def toComponent: () => [ - React.createElement('meta', {name: 'robots', content: 'noindex'}), + React.createElement('meta', { + name: 'robots', + content: 'NoFolloW, NoiNDeX', + }), ], }, }, diff --git a/packages/docusaurus-plugin-sitemap/src/createSitemap.ts b/packages/docusaurus-plugin-sitemap/src/createSitemap.ts index 536f2ecfc6..2c91667397 100644 --- a/packages/docusaurus-plugin-sitemap/src/createSitemap.ts +++ b/packages/docusaurus-plugin-sitemap/src/createSitemap.ts @@ -13,6 +13,40 @@ import type {DocusaurusConfig} from '@docusaurus/types'; import type {HelmetServerState} from 'react-helmet-async'; import type {PluginOptions} from './options'; +function isNoIndexMetaRoute({ + head, + route, +}: { + head: {[location: string]: HelmetServerState}; + route: string; +}) { + const isNoIndexMetaTag = ({ + name, + content, + }: { + name?: string; + content?: string; + }): boolean => { + if (!name || !content) { + return false; + } + return ( + // meta name is not case-sensitive + name.toLowerCase() === 'robots' && + // Robots directives are not case-sensitive + content.toLowerCase().includes('noindex') + ); + }; + + // https://github.com/staylor/react-helmet-async/pull/167 + const meta = head[route]?.meta.toComponent() as unknown as + | ReactElement<{name?: string; content?: string}>[] + | undefined; + return meta?.some((tag) => + isNoIndexMetaTag({name: tag.props.name, content: tag.props.content}), + ); +} + export default async function createSitemap( siteConfig: DocusaurusConfig, routesPaths: string[], @@ -27,18 +61,15 @@ export 
default async function createSitemap( const ignoreMatcher = createMatcher(ignorePatterns); - const includedRoutes = routesPaths.filter((route) => { - if (route.endsWith('404.html') || ignoreMatcher(route)) { - return false; - } - // https://github.com/staylor/react-helmet-async/pull/167 - const meta = head[route]?.meta.toComponent() as unknown as - | ReactElement<{name?: string; content?: string}>[] - | undefined; - return !meta?.some( - (tag) => tag.props.name === 'robots' && tag.props.content === 'noindex', + function isRouteExcluded(route: string) { + return ( + route.endsWith('404.html') || + ignoreMatcher(route) || + isNoIndexMetaRoute({head, route}) ); - }); + } + + const includedRoutes = routesPaths.filter((route) => !isRouteExcluded(route)); if (includedRoutes.length === 0) { return null; diff --git a/website/docs/seo.md b/website/docs/seo.md index 578bb7761c..f252889cdf 100644 --- a/website/docs/seo.md +++ b/website/docs/seo.md @@ -124,7 +124,11 @@ Read more about the robots file in [the Google documentation](https://developers :::caution -**Important**: the `robots.txt` file does **not** prevent HTML pages from being indexed. Use `<meta name="robots" content="noindex">` as [page metadata](#single-page-metadata) to prevent it from appearing in search results entirely. +**Important**: the `robots.txt` file does **not** prevent HTML pages from being indexed. + +To prevent your whole Docusaurus site from being indexed, use the [`noIndex`](./api/docusaurus.config.js.md#noIndex) site config. Some [hosting providers](./deployment.mdx) may also let you configure a `X-Robots-Tag: noindex` HTTP header (GitHub Pages does not support this). + +To prevent a single page from being indexed, use `<meta name="robots" content="noindex">` as [page metadata](#single-page-metadata). Read more about the [robots meta tag](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag). 
::: @@ -132,6 +136,20 @@ Read more about the robots file in [the Google documentation](https://developers Docusaurus provides the [`@docusaurus/plugin-sitemap`](./api/plugins/plugin-sitemap.md) plugin, which is shipped with `preset-classic` by default. It autogenerates a `sitemap.xml` file which will be available at `https://example.com/[baseUrl]/sitemap.xml` after the production build. This sitemap metadata helps search engine crawlers crawl your site more accurately. +:::tip + +The sitemap plugin automatically filters pages containing a `noindex` [robots meta directive](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag). + +For example, [`/examples/noIndex`](/examples/noIndex) is not included in the [Docusaurus sitemap.xml file](pathname:///sitemap.xml) because it contains the following [page metadata](#single-page-metadata): + +```html +<head> +  <meta name="robots" content="noindex" /> +</head> +``` + +::: + ## Human readable links {#human-readable-links} Docusaurus uses your file names as links, but you can always change that using slugs, see this [tutorial](./guides/docs/docs-introduction.md#document-id) for more details. diff --git a/website/src/pages/examples/noIndex.md b/website/src/pages/examples/noIndex.md new file mode 100644 index 0000000000..b4df30e2f8 --- /dev/null +++ b/website/src/pages/examples/noIndex.md @@ -0,0 +1,25 @@ +# No Index Page example + +<head> +  <meta name="robots" content="noindex" /> +</head> + +This page will not be indexed by search engines because it contains the following [page metadata](/docs/seo#single-page-metadata) markup: + +```html +<head> +  <meta name="robots" content="noindex" /> +</head> +``` + +:::tip + +The sitemap plugin filters pages containing a `noindex` content value. This page doesn't appear in the Docusaurus [sitemap.xml](pathname:///sitemap.xml) file. + +::: + +:::note + +Robots directives are [case-insensitive](https://developers.google.com/search/docs/advanced/robots/robots_meta_tag#directives). + +:::