feat(v2): various markdown string parsing improvements/fixes (#4590)

* extract createExcerpt code in separate file + add bad test
* almost working markdown parsing refactor
* complete parseMarkdownString refactor
* fix tests
* fix blog test issue
* fix docusaurus utils imports
2025-07-16 08:15:55 +02:00 · 2021-04-09 17:09:33 +02:00 · 2021-04-09 17:09:33 +02:00 · 4efe6824b3
commit 4efe6824b3
parent b743edf5fb
15 changed files with 895 additions and 563 deletions
--- a/packages/docusaurus-utils/src/tests/snapshots/parseMarkdown.test.ts.snap
+++ b/packages/docusaurus-utils/src/tests/snapshots/parseMarkdown.test.ts.snap
@@ -1,148 +0,0 @@
-// Jest Snapshot v1, https://goo.gl/fbAQLP
-
-exports[`load utils: parseMarkdown parseMarkdownString should delete only first heading 1`] = `
-Object {
-  "content": "
-test test test test test test
-test test test # test bar
-# test
-### test",
-  "excerpt": "",
-  "frontMatter": Object {
-    "title": "test",
-  },
-  "hasFrontMatter": false,
-}
-`;
-
-exports[`load utils: parseMarkdown parseMarkdownString should ignore heading if its not a first text 1`] = `
-Object {
-  "content": "foo
-# test",
-  "excerpt": "foo",
-  "frontMatter": Object {},
-  "hasFrontMatter": false,
-}
-`;
-
-exports[`load utils: parseMarkdown parseMarkdownString should parse first heading as title 1`] = `
-Object {
-  "content": "",
-  "excerpt": "",
-  "frontMatter": Object {
-    "title": "test",
-  },
-  "hasFrontMatter": false,
-}
-`;
-
-exports[`load utils: parseMarkdown parseMarkdownString should preserve front-matter title and warn about duplication 1`] = `
-Object {
-  "content": "# test",
-  "excerpt": "test",
-  "frontMatter": Object {
-    "title": "title",
-  },
-  "hasFrontMatter": true,
-}
-`;
-
-exports[`load utils: parseMarkdown parseMarkdownString should read front matter 1`] = `
-Object {
-  "content": "",
-  "excerpt": undefined,
-  "frontMatter": Object {
-    "title": "test",
-  },
-  "hasFrontMatter": true,
-}
-`;
-
-exports[`load utils: parseMarkdown readFrontMatter should delete only first heading 1`] = `
-Object {
-  "content": "test test test # test bar
-# test
-### test",
-  "excerpt": "",
-  "frontMatter": Object {
-    "title": "test",
-  },
-  "hasFrontMatter": false,
-}
-`;
-
-exports[`load utils: parseMarkdown readFrontMatter should ignore heading if its not a first text 1`] = `
-Object {
-  "content": "foo
-# test",
-  "excerpt": "",
-  "frontMatter": Object {},
-  "hasFrontMatter": false,
-}
-`;
-
-exports[`load utils: parseMarkdown readFrontMatter should not warn about duplicated title 1`] = `
-Object {
-  "content": "# test",
-  "excerpt": "",
-  "frontMatter": Object {
-    "title": "title",
-  },
-  "hasFrontMatter": true,
-}
-`;
-
-exports[`load utils: parseMarkdown readFrontMatter should parse first heading as title 1`] = `
-Object {
-  "content": "",
-  "excerpt": "",
-  "frontMatter": Object {
-    "title": "test",
-  },
-  "hasFrontMatter": false,
-}
-`;
-
-exports[`load utils: parseMarkdown readFrontMatter should parse first heading as title and keep it in content 1`] = `
-Object {
-  "content": "# test",
-  "excerpt": "",
-  "frontMatter": Object {
-    "title": "test",
-  },
-  "hasFrontMatter": false,
-}
-`;
-
-exports[`load utils: parseMarkdown readFrontMatter should parse front-matter and ignore h2 1`] = `
-Object {
-  "content": "## test",
-  "excerpt": "",
-  "frontMatter": Object {
-    "title": "title",
-  },
-  "hasFrontMatter": true,
-}
-`;
-
-exports[`load utils: parseMarkdown readFrontMatter should preserve front-matter title and warn about duplication 1`] = `
-Object {
-  "content": "# test",
-  "excerpt": "",
-  "frontMatter": Object {
-    "title": "title",
-  },
-  "hasFrontMatter": true,
-}
-`;
-
-exports[`load utils: parseMarkdown readFrontMatter should read front matter 1`] = `
-Object {
-  "content": "",
-  "excerpt": "",
-  "frontMatter": Object {
-    "title": "test",
-  },
-  "hasFrontMatter": true,
-}
-`;
--- a/packages/docusaurus-utils/src/tests/index.test.ts
+++ b/packages/docusaurus-utils/src/tests/index.test.ts
@@ -18,7 +18,6 @@ import {
  posixPath,
  objectWithKeySorted,
  aliasedSitePath,
-  createExcerpt,
  isValidPathname,
  addTrailingSlash,
  removeTrailingSlash,
@@ -372,81 +371,6 @@ describe('load utils', () => {
    );
  });

-  test('createExcerpt', () => {
-    const asserts = [
-      // Regular content
-      {
-        input: `
-          Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum ex urna, molestie et sagittis ut, varius ac justo.
-
-          Nunc porttitor libero nec vulputate venenatis. Nam nec rhoncus mauris. Morbi tempus est et nibh maximus, tempus venenatis arcu lobortis.
-        `,
-        output:
-          'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum ex urna, molestie et sagittis ut, varius ac justo.',
-      },
-      // Content with imports/exports declarations and Markdown markup, as well as Emoji
-      {
-        input: `
-          import Component from '@site/src/components/Component';
-          import Component from '@site/src/components/Component'
-          import './styles.css';
-
-          export function ItemCol(props) { return <Item {...props} className={'col col--6 margin-bottom--lg'}/> }
-
-          export function ItemCol(props) { return <Item {...props} className={'col col--6 margin-bottom--lg'}/> };
-
-          Lorem **ipsum** dolor sit \`amet\`[^1], consectetur _adipiscing_ elit. [**Vestibulum**](https://wiktionary.org/wiki/vestibulum) ex urna[^bignote], ~molestie~ et sagittis ut, varius ac justo :wink:.
-
-          Nunc porttitor libero nec vulputate venenatis. Nam nec rhoncus mauris. Morbi tempus est et nibh maximus, tempus venenatis arcu lobortis.
-        `,
-        output:
-          'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum ex urna, molestie et sagittis ut, varius ac justo.',
-      },
-      // Content beginning with admonitions
-      {
-        input: `
-          import Component from '@site/src/components/Component'
-
-          :::caution
-
-          Lorem ipsum dolor sit amet, consectetur adipiscing elit.
-
-          :::
-
-          Nunc porttitor libero nec vulputate venenatis. Nam nec rhoncus mauris. Morbi tempus est et nibh maximus, tempus venenatis arcu lobortis.
-        `,
-        output: 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
-      },
-      // Content beginning with heading
-      {
-        input: `
-          ## Lorem ipsum dolor sit amet
-
-          Nunc porttitor libero nec vulputate venenatis. Nam nec rhoncus mauris. Morbi tempus est et nibh maximus, tempus venenatis arcu lobortis.
-        `,
-        output: 'Lorem ipsum dolor sit amet',
-      },
-      // Content beginning with blockquote
-      {
-        input: `
-          > Lorem ipsum dolor sit amet
-        `,
-        output: 'Lorem ipsum dolor sit amet',
-      },
-      // Content beginning with image (eg. blog post)
-      {
-        input: `
-          ![Lorem ipsum](/img/lorem-ipsum.svg)
-        `,
-        output: 'Lorem ipsum',
-      },
-    ];
-
-    asserts.forEach((testCase) => {
-      expect(createExcerpt(testCase.input)).toEqual(testCase.output);
-    });
-  });
-
  test('isValidPathname', () => {
    expect(isValidPathname('/')).toBe(true);
    expect(isValidPathname('/hey')).toBe(true);
--- a/packages/docusaurus-utils/src/tests/markdownParser.test.ts
+++ b/packages/docusaurus-utils/src/tests/markdownParser.test.ts
@@ -0,0 +1,568 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+import {
+  createExcerpt,
+  parseMarkdownContentTitle,
+  parseMarkdownString,
+} from '../markdownParser';
+import dedent from 'dedent';
+
+describe('createExcerpt', () => {
+  test('should create excerpt for text-only content', () => {
+    expect(
+      createExcerpt(dedent`
+          Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum ex urna, molestie et sagittis ut, varius ac justo.
+
+          Nunc porttitor libero nec vulputate venenatis. Nam nec rhoncus mauris. Morbi tempus est et nibh maximus, tempus venenatis arcu lobortis.
+        `),
+    ).toEqual(
+      'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum ex urna, molestie et sagittis ut, varius ac justo.',
+    );
+  });
+
+  test('should create excerpt for regular content with regular title', () => {
+    expect(
+      createExcerpt(dedent`
+
+          # Markdown Regular Title
+
+          Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum ex urna, molestie et sagittis ut, varius ac justo.
+
+          Nunc porttitor libero nec vulputate venenatis. Nam nec rhoncus mauris. Morbi tempus est et nibh maximus, tempus venenatis arcu lobortis.
+        `),
+    ).toEqual(
+      // h1 title is skipped on purpose, because we don't want the page to have SEO metadatas title === description
+      'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum ex urna, molestie et sagittis ut, varius ac justo.',
+    );
+  });
+
+  test('should create excerpt for regular content with alternate title', () => {
+    expect(
+      createExcerpt(dedent`
+
+          Markdown Alternate Title
+          ================
+
+          Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum ex urna, molestie et sagittis ut, varius ac justo.
+
+          Nunc porttitor libero nec vulputate venenatis. Nam nec rhoncus mauris. Morbi tempus est et nibh maximus, tempus venenatis arcu lobortis.
+        `),
+    ).toEqual(
+      // h1 title is skipped on purpose, because we don't want the page to have SEO metadatas title === description
+      'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum ex urna, molestie et sagittis ut, varius ac justo.',
+    );
+  });
+
+  test('should create excerpt for content with h2 heading', () => {
+    expect(
+      createExcerpt(dedent`
+          ## Lorem ipsum dolor sit amet
+
+          Nunc porttitor libero nec vulputate venenatis. Nam nec rhoncus mauris. Morbi tempus est et nibh maximus, tempus venenatis arcu lobortis.
+        `),
+    ).toEqual('Lorem ipsum dolor sit amet');
+  });
+
+  test('should create excerpt for content beginning with blockquote', () => {
+    expect(
+      createExcerpt(dedent`
+          > Lorem ipsum dolor sit amet
+
+          Nunc porttitor libero nec vulputate venenatis. Nam nec rhoncus mauris. Morbi tempus est et nibh maximus, tempus venenatis arcu lobortis.
+        `),
+    ).toEqual('Lorem ipsum dolor sit amet');
+  });
+
+  test('should create excerpt for content beginning with image (eg. blog post)', () => {
+    expect(
+      createExcerpt(dedent`
+          ![Lorem ipsum](/img/lorem-ipsum.svg)
+        `),
+    ).toEqual('Lorem ipsum');
+  });
+
+  test('should create excerpt for content beginning with admonitions', () => {
+    expect(
+      createExcerpt(dedent`
+          import Component from '@site/src/components/Component'
+
+          :::caution
+
+          Lorem ipsum dolor sit amet, consectetur adipiscing elit.
+
+          :::
+
+          Nunc porttitor libero nec vulputate venenatis. Nam nec rhoncus mauris. Morbi tempus est et nibh maximus, tempus venenatis arcu lobortis.
+        `),
+    ).toEqual('Lorem ipsum dolor sit amet, consectetur adipiscing elit.');
+  });
+
+  test('should create excerpt for content with imports/exports declarations and Markdown markup, as well as Emoji', () => {
+    expect(
+      createExcerpt(dedent`
+          import Component from '@site/src/components/Component';
+          import Component from '@site/src/components/Component'
+          import './styles.css';
+
+          export function ItemCol(props) { return <Item {...props} className={'col col--6 margin-bottom--lg'}/> }
+
+          export function ItemCol(props) { return <Item {...props} className={'col col--6 margin-bottom--lg'}/> };
+
+          Lorem **ipsum** dolor sit \`amet\`[^1], consectetur _adipiscing_ elit. [**Vestibulum**](https://wiktionary.org/wiki/vestibulum) ex urna[^bignote], ~molestie~ et sagittis ut, varius ac justo :wink:.
+
+          Nunc porttitor libero nec vulputate venenatis. Nam nec rhoncus mauris. Morbi tempus est et nibh maximus, tempus venenatis arcu lobortis.
+        `),
+    ).toEqual(
+      'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vestibulum ex urna, molestie et sagittis ut, varius ac justo.',
+    );
+  });
+});
+
+describe('parseMarkdownContentTitle', () => {
+  test('Should parse markdown h1 title at the top', () => {
+    const markdown = dedent`
+
+          # Markdown Title
+
+          Lorem Ipsum
+
+        `;
+    expect(parseMarkdownContentTitle(markdown)).toEqual({
+      content: 'Lorem Ipsum',
+      contentTitle: 'Markdown Title',
+    });
+  });
+
+  test('Should parse markdown h1 title at the top (atx style with closing #)', () => {
+    const markdown = dedent`
+
+          # Markdown Title #
+
+          Lorem Ipsum
+
+        `;
+    expect(parseMarkdownContentTitle(markdown)).toEqual({
+      content: 'Lorem Ipsum',
+      contentTitle: 'Markdown Title',
+    });
+  });
+
+  test('Should parse markdown h1 alternate title', () => {
+    const markdown = dedent`
+
+          Markdown Title
+          ================
+
+          Lorem Ipsum
+
+        `;
+    expect(parseMarkdownContentTitle(markdown)).toEqual({
+      content: 'Lorem Ipsum',
+      contentTitle: 'Markdown Title',
+    });
+  });
+
+  test('Should parse title-only', () => {
+    const markdown = '# Document With Only A Title ';
+    expect(parseMarkdownContentTitle(markdown)).toEqual({
+      content: '',
+      contentTitle: 'Document With Only A Title',
+    });
+  });
+
+  test('Should parse markdown h1 title at the top but keep it in content', () => {
+    const markdown = dedent`
+
+          # Markdown Title
+
+          Lorem Ipsum
+
+        `;
+    expect(
+      parseMarkdownContentTitle(markdown, {keepContentTitle: true}),
+    ).toEqual({
+      content: markdown.trim(),
+      contentTitle: 'Markdown Title',
+    });
+  });
+
+  test('Should not parse markdown h1 title in the middle of a doc', () => {
+    const markdown = dedent`
+
+          Lorem Ipsum
+
+          # Markdown Title
+
+          Lorem Ipsum
+
+        `;
+    expect(parseMarkdownContentTitle(markdown)).toEqual({
+      content: markdown,
+      contentTitle: undefined,
+    });
+  });
+
+  test('Should not parse markdown h1 alternate title in the middle of the doc', () => {
+    const markdown = dedent`
+
+          Lorem Ipsum
+
+          Markdown Title
+          ================
+
+          Lorem Ipsum
+
+        `;
+    expect(parseMarkdownContentTitle(markdown)).toEqual({
+      content: markdown,
+      contentTitle: undefined,
+    });
+  });
+});
+
+describe('parseMarkdownString', () => {
+  const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
+  beforeEach(() => {
+    warn.mockReset();
+  });
+
+  function expectDuplicateTitleWarning() {
+    expect(warn).toBeCalledWith(
+      expect.stringMatching(/Duplicate title found in this file/),
+    );
+  }
+  function expectNoWarning() {
+    expect(warn).not.toBeCalled();
+  }
+
+  test('parse markdown with frontmatter', () => {
+    expect(
+      parseMarkdownString(dedent`
+        ---
+        title: Frontmatter title
+        ---
+
+        Some text
+        `),
+    ).toMatchInlineSnapshot(`
+      Object {
+        "content": "Some text",
+        "contentTitle": undefined,
+        "excerpt": "Some text",
+        "frontMatter": Object {
+          "title": "Frontmatter title",
+        },
+      }
+    `);
+    expectNoWarning();
+  });
+
+  test('should parse first heading as contentTitle', () => {
+    expect(
+      parseMarkdownString(dedent`
+        # Markdown Title
+
+        Some text
+        `),
+    ).toMatchInlineSnapshot(`
+      Object {
+        "content": "Some text",
+        "contentTitle": "Markdown Title",
+        "excerpt": "Some text",
+        "frontMatter": Object {},
+      }
+    `);
+    expectNoWarning();
+  });
+
+  test('should warn about duplicate titles (frontmatter + markdown)', () => {
+    expect(
+      parseMarkdownString(dedent`
+        ---
+        title: Frontmatter title
+        ---
+
+        # Markdown Title
+
+        Some text
+        `),
+    ).toMatchInlineSnapshot(`
+      Object {
+        "content": "Some text",
+        "contentTitle": "Markdown Title",
+        "excerpt": "Some text",
+        "frontMatter": Object {
+          "title": "Frontmatter title",
+        },
+      }
+    `);
+    expectDuplicateTitleWarning();
+  });
+
+  test('should warn about duplicate titles (frontmatter + markdown alternate)', () => {
+    expect(
+      parseMarkdownString(dedent`
+        ---
+        title: Frontmatter title
+        ---
+
+        Markdown Title alternate
+        ================
+
+        Some text
+        `),
+    ).toMatchInlineSnapshot(`
+      Object {
+        "content": "Some text",
+        "contentTitle": "Markdown Title alternate",
+        "excerpt": "Some text",
+        "frontMatter": Object {
+          "title": "Frontmatter title",
+        },
+      }
+    `);
+    expectDuplicateTitleWarning();
+  });
+
+  test('should not warn for duplicate title if keepContentTitle=true', () => {
+    expect(
+      parseMarkdownString(
+        dedent`
+        ---
+        title: Frontmatter title
+        ---
+
+        # Markdown Title
+
+        Some text
+        `,
+        {keepContentTitle: true},
+      ),
+    ).toMatchInlineSnapshot(`
+      Object {
+        "content": "# Markdown Title
+
+      Some text",
+        "contentTitle": "Markdown Title",
+        "excerpt": "Some text",
+        "frontMatter": Object {
+          "title": "Frontmatter title",
+        },
+      }
+    `);
+    expectNoWarning();
+  });
+
+  test('should not warn for duplicate title if markdown title is not at the top', () => {
+    expect(
+      parseMarkdownString(dedent`
+        ---
+        title: Frontmatter title
+        ---
+
+        foo
+
+        # Markdown Title
+        `),
+    ).toMatchInlineSnapshot(`
+      Object {
+        "content": "foo
+
+      # Markdown Title",
+        "contentTitle": undefined,
+        "excerpt": "foo",
+        "frontMatter": Object {
+          "title": "Frontmatter title",
+        },
+      }
+    `);
+    expectNoWarning();
+  });
+
+  test('should parse markdown title and keep it in content', () => {
+    expect(
+      parseMarkdownString(
+        dedent`
+          # Markdown Title
+          `,
+        {keepContentTitle: true},
+      ),
+    ).toMatchInlineSnapshot(`
+      Object {
+        "content": "# Markdown Title",
+        "contentTitle": "Markdown Title",
+        "excerpt": undefined,
+        "frontMatter": Object {},
+      }
+    `);
+    expectNoWarning();
+  });
+
+  test('should delete only first heading', () => {
+    expect(
+      parseMarkdownString(dedent`
+        # Markdown Title
+
+        test test test # test bar
+
+        # Markdown Title 2
+
+        ### Markdown Title h3
+        `),
+    ).toMatchInlineSnapshot(`
+      Object {
+        "content": "test test test # test bar
+
+      # Markdown Title 2
+
+      ### Markdown Title h3",
+        "contentTitle": "Markdown Title",
+        "excerpt": "test test test # test bar",
+        "frontMatter": Object {},
+      }
+    `);
+    expectNoWarning();
+  });
+
+  test('should parse front-matter and ignore h2', () => {
+    expect(
+      parseMarkdownString(
+        dedent`
+          ---
+          title: Frontmatter title
+          ---
+          ## test
+          `,
+      ),
+    ).toMatchInlineSnapshot(`
+      Object {
+        "content": "## test",
+        "contentTitle": undefined,
+        "excerpt": "test",
+        "frontMatter": Object {
+          "title": "Frontmatter title",
+        },
+      }
+    `);
+    expectNoWarning();
+  });
+
+  test('should read front matter only', () => {
+    expect(
+      parseMarkdownString(dedent`
+        ---
+        title: test
+        ---
+        `),
+    ).toMatchInlineSnapshot(`
+      Object {
+        "content": "",
+        "contentTitle": undefined,
+        "excerpt": undefined,
+        "frontMatter": Object {
+          "title": "test",
+        },
+      }
+    `);
+    expectNoWarning();
+  });
+
+  test('should parse title only', () => {
+    expect(parseMarkdownString('# test')).toMatchInlineSnapshot(`
+      Object {
+        "content": "",
+        "contentTitle": "test",
+        "excerpt": undefined,
+        "frontMatter": Object {},
+      }
+    `);
+    expectNoWarning();
+  });
+
+  test('should parse title only alternate', () => {
+    expect(
+      parseMarkdownString(dedent`
+        test
+        ===
+        `),
+    ).toMatchInlineSnapshot(`
+      Object {
+        "content": "",
+        "contentTitle": "test",
+        "excerpt": undefined,
+        "frontMatter": Object {},
+      }
+    `);
+    expectNoWarning();
+  });
+
+  test('should warn about duplicate titles', () => {
+    expect(
+      parseMarkdownString(dedent`
+        ---
+        title: Frontmatter title
+        ---
+        # test
+        `),
+    ).toMatchInlineSnapshot(`
+      Object {
+        "content": "",
+        "contentTitle": "test",
+        "excerpt": undefined,
+        "frontMatter": Object {
+          "title": "Frontmatter title",
+        },
+      }
+    `);
+    expectDuplicateTitleWarning();
+  });
+
+  test('should ignore markdown title if its not a first text', () => {
+    expect(
+      parseMarkdownString(dedent`
+        foo
+        # test
+        `),
+    ).toMatchInlineSnapshot(`
+      Object {
+        "content": "foo
+      # test",
+        "contentTitle": undefined,
+        "excerpt": "foo",
+        "frontMatter": Object {},
+      }
+    `);
+    expectNoWarning();
+  });
+
+  test('should delete only first heading', () => {
+    expect(
+      parseMarkdownString(dedent`
+        # test
+
+        test test test test test test
+        test test test # test bar
+        # test2
+        ### test
+        test3
+        `),
+    ).toMatchInlineSnapshot(`
+      Object {
+        "content": "test test test test test test
+      test test test # test bar
+      # test2
+      ### test
+      test3",
+        "contentTitle": "test",
+        "excerpt": "test test test test test test",
+        "frontMatter": Object {},
+      }
+    `);
+    expectNoWarning();
+  });
+});
--- a/packages/docusaurus-utils/src/tests/parseMarkdown.test.ts
+++ b/packages/docusaurus-utils/src/tests/parseMarkdown.test.ts
@@ -1,177 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-import {parseMarkdownString, readFrontMatter} from '../index';
-import dedent from 'dedent';
-
-describe('load utils: parseMarkdown', () => {
-  describe('readFrontMatter', () => {
-    test('should read front matter', () => {
-      const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
-      expect(
-        readFrontMatter(dedent`
-        ---
-        title: test
-        ---
-        `),
-      ).toMatchSnapshot();
-      expect(warn).not.toBeCalled();
-    });
-    test('should parse first heading as title', () => {
-      const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
-      expect(
-        readFrontMatter(dedent`
-        # test
-        `),
-      ).toMatchSnapshot();
-      expect(warn).not.toBeCalled();
-    });
-    test('should preserve front-matter title and warn about duplication', () => {
-      const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
-      expect(
-        readFrontMatter(dedent`
-        ---
-        title: title
-        ---
-        # test
-        `),
-      ).toMatchSnapshot();
-      expect(warn).toBeCalledWith('Duplicate title detected in `this` file');
-      warn.mockReset();
-    });
-    test('should ignore heading if its not a first text', () => {
-      const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
-      expect(
-        readFrontMatter(dedent`
-        foo
-        # test
-        `),
-      ).toMatchSnapshot();
-      expect(warn).not.toBeCalled();
-    });
-    test('should parse first heading as title and keep it in content', () => {
-      const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
-      expect(
-        readFrontMatter(
-          dedent`
-          # test
-          `,
-          undefined,
-          {},
-          false,
-        ),
-      ).toMatchSnapshot();
-      expect(warn).not.toBeCalled();
-    });
-    test('should delete only first heading', () => {
-      const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
-      expect(
-        readFrontMatter(dedent`
-        # test
-        test test test # test bar
-        # test
-        ### test
-        `),
-      ).toMatchSnapshot();
-      expect(warn).not.toBeCalled();
-    });
-    test('should parse front-matter and ignore h2', () => {
-      const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
-      expect(
-        readFrontMatter(
-          dedent`
-          ---
-          title: title
-          ---
-          ## test
-          `,
-          undefined,
-          {},
-          false,
-        ),
-      ).toMatchSnapshot();
-      expect(warn).not.toBeCalled();
-    });
-    test('should not warn about duplicated title', () => {
-      const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
-      expect(
-        readFrontMatter(
-          dedent`
-          ---
-          title: title
-          ---
-          # test
-          `,
-          undefined,
-          {},
-          false,
-        ),
-      ).toMatchSnapshot();
-      expect(warn).not.toBeCalled();
-    });
-  });
-
-  describe('parseMarkdownString', () => {
-    test('should read front matter', () => {
-      const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
-      expect(
-        parseMarkdownString(dedent`
-        ---
-        title: test
-        ---
-        `),
-      ).toMatchSnapshot();
-      expect(warn).not.toBeCalled();
-    });
-    test('should parse first heading as title', () => {
-      const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
-      expect(
-        parseMarkdownString(dedent`
-        # test
-        `),
-      ).toMatchSnapshot();
-      expect(warn).not.toBeCalled();
-    });
-    test('should preserve front-matter title and warn about duplication', () => {
-      const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
-      expect(
-        parseMarkdownString(dedent`
-        ---
-        title: title
-        ---
-        # test
-        `),
-      ).toMatchSnapshot();
-      expect(warn).toBeCalledWith('Duplicate title detected in `this` file');
-      warn.mockReset();
-    });
-    test('should ignore heading if its not a first text', () => {
-      const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
-      expect(
-        parseMarkdownString(dedent`
-        foo
-        # test
-        `),
-      ).toMatchSnapshot();
-      expect(warn).not.toBeCalled();
-    });
-    test('should delete only first heading', () => {
-      const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
-      expect(
-        parseMarkdownString(dedent`
-        # test
-
-        test test test test test test
-        test test test # test bar
-        # test
-        ### test
-        `),
-      ).toMatchSnapshot();
-      expect(warn).not.toBeCalled();
-    });
-  });
-});
--- a/packages/docusaurus-utils/src/index.ts
+++ b/packages/docusaurus-utils/src/index.ts
@@ -7,7 +7,6 @@

 import chalk from 'chalk';
 import path from 'path';
-import matter from 'gray-matter';
 import {createHash} from 'crypto';
 import {camelCase, kebabCase, mapValues} from 'lodash';
 import escapeStringRegexp from 'escape-string-regexp';
@@ -23,6 +22,8 @@ import {
 import resolvePathnameUnsafe from 'resolve-pathname';

 export * from './codeTranslationsUtils';
+export * from './markdownParser';
+export * from './markdownLinks';

 const fileHash = new Map();
 export async function generate(
@@ -206,135 +207,6 @@ export function getSubFolder(file: string, refDir: string): string | null {
  return match && match[1];
 }

-export function createExcerpt(fileString: string): string | undefined {
-  const fileLines = fileString.trimLeft().split('\n');
-
-  /* eslint-disable no-continue */
-  // eslint-disable-next-line no-restricted-syntax
-  for (const fileLine of fileLines) {
-    // Skip empty line.
-    if (!fileLine.trim()) {
-      continue;
-    }
-
-    // Skip import/export declaration.
-    if (/^\s*?import\s.*(from.*)?;?|export\s.*{.*};?/.test(fileLine)) {
-      continue;
-    }
-
-    const cleanedLine = fileLine
-      // Remove HTML tags.
-      .replace(/<[^>]*>/g, '')
-      // Remove ATX-style headers.
-      .replace(/^\#{1,6}\s*([^#]*)\s*(\#{1,6})?/gm, '$1')
-      // Remove emphasis and strikethroughs.
-      .replace(/([\*_~]{1,3})(\S.*?\S{0,1})\1/g, '$2')
-      // Remove images.
-      .replace(/\!\[(.*?)\][\[\(].*?[\]\)]/g, '$1')
-      // Remove footnotes.
-      .replace(/\[\^.+?\](\: .*?$)?/g, '')
-      // Remove inline links.
-      .replace(/\[(.*?)\][\[\(].*?[\]\)]/g, '$1')
-      // Remove inline code.
-      .replace(/`(.+?)`/g, '$1')
-      // Remove blockquotes.
-      .replace(/^\s{0,3}>\s?/g, '')
-      // Remove admonition definition.
-      .replace(/(:{3}.*)/, '')
-      // Remove Emoji names within colons include preceding whitespace.
-      .replace(/\s?(:(::|[^:\n])+:)/g, '')
-      .trim();
-
-    if (cleanedLine) {
-      return cleanedLine;
-    }
-  }
-
-  return undefined;
-}
-
-type ParsedMarkdown = {
-  // Returned by gray-matter
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  frontMatter: Record<string, any>;
-  content: string;
-  excerpt: string | undefined;
-  hasFrontMatter: boolean;
-};
-
-export function readFrontMatter(
-  markdownString: string,
-  source?: string,
-  options: Record<string, unknown> = {},
-  removeTitleHeading = true,
-): ParsedMarkdown {
-  try {
-    const result = matter(markdownString, options);
-    result.data = result.data || {};
-    result.content = result.content.trim();
-
-    const hasFrontMatter = Object.keys(result.data).length > 0;
-
-    const heading = /^# (.*)[\n\r]?/gi.exec(result.content);
-    if (heading) {
-      if (result.data.title) {
-        if (removeTitleHeading) {
-          console.warn(
-            `Duplicate title detected in \`${source || 'this'}\` file`,
-          );
-        }
-      } else {
-        result.data.title = heading[1].trim();
-        if (removeTitleHeading) {
-          result.content = result.content.replace(heading[0], '');
-          if (result.excerpt) {
-            result.excerpt = result.excerpt.replace(heading[1], '');
-          }
-        }
-      }
-    }
-
-    return {
-      frontMatter: result.data,
-      content: result.content,
-      excerpt: result.excerpt,
-      hasFrontMatter,
-    };
-  } catch (e) {
-    throw new Error(`Error while parsing markdown front matter.
-This can happen if you use special characters like : in frontmatter values (try using "" around that value)
-${e.message}`);
-  }
-}
-
-export function parseMarkdownString(
-  markdownString: string,
-  source?: string,
-): ParsedMarkdown {
-  return readFrontMatter(markdownString, source, {
-    excerpt: (file: matter.GrayMatterFile<string>): void => {
-      // Hacky way of stripping out import statements from the excerpt
-      // TODO: Find a better way to do so, possibly by compiling the Markdown content,
-      // stripping out HTML tags and obtaining the first line.
-      file.excerpt = createExcerpt(file.content);
-    },
-  });
-}
-
-export async function parseMarkdownFile(
-  source: string,
-): Promise<ParsedMarkdown> {
-  const markdownString = await fs.readFile(source, 'utf-8');
-  try {
-    return parseMarkdownString(markdownString, source);
-  } catch (e) {
-    throw new Error(
-      `Error while parsing markdown file ${source}
-${e.message}`,
-    );
-  }
-}
-
 export function normalizeUrl(rawUrls: string[]): string {
  const urls = rawUrls;
  const resultArray = [];
--- a/packages/docusaurus-utils/src/markdownParser.ts
+++ b/packages/docusaurus-utils/src/markdownParser.ts
@ -0,0 +1,185 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+import chalk from 'chalk';
+import fs from 'fs-extra';
+import matter from 'gray-matter';
+
+/**
+ * Builds a plain-text excerpt from Markdown/MDX content.
+ *
+ * The content is scanned line by line and the first line that is still
+ * non-empty once Markdown/HTML syntax has been stripped from it is returned.
+ *
+ * Hacky way of stripping out import statements from the excerpt.
+ * TODO: Find a better way to do so, possibly by compiling the Markdown content,
+ * stripping out HTML tags and obtaining the first line.
+ *
+ * @param fileString Markdown content to extract the excerpt from.
+ * @returns The first meaningful line as plain text, or `undefined` when no
+ * line survives the cleanup.
+ */
+export function createExcerpt(fileString: string): string | undefined {
+  const fileLines = fileString
+    .trimLeft()
+    // Remove Markdown alternate title
+    .replace(/^[^\n]*\n[=]+/g, '')
+    .split('\n');
+
+  /* eslint-disable no-continue */
+  // eslint-disable-next-line no-restricted-syntax
+  for (const fileLine of fileLines) {
+    // Skip empty line.
+    if (!fileLine.trim()) {
+      continue;
+    }
+
+    // Skip import/export declaration.
+    // The alternation is wrapped in a group so that `^` anchors BOTH branches:
+    // without the group, only `import` was anchored and any prose line that
+    // merely contained "export ... {...}" somewhere was skipped as well.
+    if (/^\s*?(?:import\s.*(from.*)?;?|export\s.*{.*};?)/.test(fileLine)) {
+      continue;
+    }
+
+    // The order of the replacements below matters: each one works on the
+    // output of the previous one.
+    const cleanedLine = fileLine
+      // Remove HTML tags.
+      .replace(/<[^>]*>/g, '')
+      // Remove Title headers
+      .replace(/^\#\s*([^#]*)\s*\#?/gm, '')
+      // Remove Markdown + ATX-style headers
+      .replace(/^\#{1,6}\s*([^#]*)\s*(\#{1,6})?/gm, '$1')
+      // Remove emphasis and strikethroughs.
+      .replace(/([\*_~]{1,3})(\S.*?\S{0,1})\1/g, '$2')
+      // Remove images.
+      .replace(/\!\[(.*?)\][\[\(].*?[\]\)]/g, '$1')
+      // Remove footnotes.
+      .replace(/\[\^.+?\](\: .*?$)?/g, '')
+      // Remove inline links.
+      .replace(/\[(.*?)\][\[\(].*?[\]\)]/g, '$1')
+      // Remove inline code.
+      .replace(/`(.+?)`/g, '$1')
+      // Remove blockquotes.
+      .replace(/^\s{0,3}>\s?/g, '')
+      // Remove admonition definition.
+      .replace(/(:{3}.*)/, '')
+      // Remove Emoji names within colons include preceding whitespace.
+      .replace(/\s?(:(::|[^:\n])+:)/g, '')
+      .trim();
+
+    // A line made only of syntax (e.g. a lone HTML tag) cleans down to an
+    // empty string; keep looking in that case.
+    if (cleanedLine) {
+      return cleanedLine;
+    }
+  }
+
+  return undefined;
+}
+
+/**
+ * Splits raw Markdown file content into its front matter and its body by
+ * delegating to `matter` (imported from gray-matter).
+ *
+ * @param markdownFileContent Full text of the Markdown file.
+ * @returns `frontMatter` (defaults to `{}` when nullish) and the trimmed
+ * `content` (defaults to `''` when nullish).
+ */
+export function parseFrontMatter(
+  markdownFileContent: string,
+): {
+  frontMatter: Record<string, unknown>;
+  content: string;
+} {
+  const parsed = matter(markdownFileContent);
+
+  // Fall back to empty values when the parser yields nothing.
+  const frontMatter = parsed.data ?? {};
+  const content = parsed.content?.trim() ?? '';
+
+  return {frontMatter, content};
+}
+
+/**
+ * Looks for a title at the very beginning of the (trimmed) Markdown content,
+ * written either as `# Title` or as a line underlined with `=` characters.
+ *
+ * @param contentUntrimmed Markdown content; it is trimmed before matching.
+ * @param options `keepContentTitle` (default `false`): when `true`, the title
+ * markup is left in the returned content instead of being removed.
+ * @returns The trimmed content (with or without the title markup) and the
+ * trimmed title text, or `contentTitle: undefined` when no title is found.
+ */
+export function parseMarkdownContentTitle(
+  contentUntrimmed: string,
+  options?: {keepContentTitle?: boolean},
+): {content: string; contentTitle: string | undefined} {
+  const content = contentUntrimmed.trim();
+
+  // The regex literals stay inline: each call gets a fresh regex object, so
+  // the `g` flag never carries a `lastIndex` over from a previous call.
+  const hashTitleMatch = /^(?<pattern>#\s*(?<title>[^#\n]*)+\s*#*[\s\r]*?\n*?)/g.exec(
+    content,
+  );
+  const underlinedTitleMatch = /^(?<pattern>\s*(?<title>[^\n]*)\s*\n[=]+)/g.exec(
+    content,
+  );
+
+  // The `# Title` form takes precedence whenever it matches at all.
+  const groups = (hashTitleMatch ?? underlinedTitleMatch)?.groups;
+  const pattern = groups?.pattern;
+  const title = groups?.title;
+
+  // No match, or a match with an empty title: nothing to extract.
+  if (!pattern || !title) {
+    return {content, contentTitle: undefined};
+  }
+
+  const shouldKeepTitle = options?.keepContentTitle ?? false;
+  const remainingContent = shouldKeepTitle
+    ? content
+    : content.replace(pattern, '');
+
+  return {
+    content: remainingContent.trim(),
+    contentTitle: title.trim(),
+  };
+}
+
+/**
+ * Result shape returned by `parseMarkdownString` (and, wrapped in a promise,
+ * by `parseMarkdownFile`).
+ */
+type ParsedMarkdown = {
+  // Front matter data as produced by `parseFrontMatter`.
+  frontMatter: Record<string, unknown>;
+  // Content as returned by `parseMarkdownContentTitle`.
+  content: string;
+  // Title found at the top of the content, or `undefined` when there is none.
+  contentTitle: string | undefined;
+  // Excerpt computed by `createExcerpt`, or `undefined` when none was found.
+  excerpt: string | undefined;
+};
+
+/**
+ * Parses the text of a Markdown file: extracts the front matter, then the
+ * content title, then an excerpt of the remaining content.
+ *
+ * A warning is printed when both a front matter `title` and a content title
+ * exist, unless `keepContentTitle` is set or the environment variable
+ * `DOCUSAURUS_NO_DUPLICATE_TITLE_WARNING` equals `'false'`.
+ *
+ * @param markdownFileContent Full text of the Markdown file.
+ * @param options `source` is only used in the warning message;
+ * `keepContentTitle` (default `false`) is forwarded to
+ * `parseMarkdownContentTitle`.
+ * @returns The front matter, content, content title and excerpt.
+ * @throws Rethrows any error raised while parsing, after logging a hint.
+ */
+export function parseMarkdownString(
+  markdownFileContent: string,
+  options?: {
+    source?: string;
+    keepContentTitle?: boolean;
+  },
+): ParsedMarkdown {
+  try {
+    const keepContentTitle = options?.keepContentTitle ?? false;
+
+    const frontMatterResult = parseFrontMatter(markdownFileContent);
+    const {frontMatter} = frontMatterResult;
+
+    const titleResult = parseMarkdownContentTitle(frontMatterResult.content, {
+      keepContentTitle,
+    });
+    const {content, contentTitle} = titleResult;
+
+    const excerpt = createExcerpt(content);
+
+    // TODO not sure this is a good place for this warning
+    const hasDuplicateTitle =
+      Boolean(frontMatter.title) && Boolean(contentTitle) && !keepContentTitle;
+    const warningEnabled =
+      process.env.DOCUSAURUS_NO_DUPLICATE_TITLE_WARNING !== 'false';
+
+    if (hasDuplicateTitle && warningEnabled) {
+      console.warn(
+        chalk.yellow(`Duplicate title found in ${options?.source ?? 'this'} file.
+Use either a frontmatter title or a markdown title, not both.
+If this is annoying you, use env DOCUSAURUS_NO_DUPLICATE_TITLE_WARNING=false`),
+      );
+    }
+
+    return {frontMatter, content, contentTitle, excerpt};
+  } catch (e) {
+    // Log a hint for the most common cause, then let the caller handle the
+    // original error.
+    console.error(
+      chalk.red(`Error while parsing markdown front matter.
+This can happen if you use special characters like : in frontmatter values (try using "" around that value)`),
+    );
+    throw e;
+  }
+}
+
+/**
+ * Reads a Markdown file as UTF-8 and parses it with `parseMarkdownString`,
+ * passing the file path along as the `source` option.
+ *
+ * @param source Path of the Markdown file to read.
+ * @returns The parsed Markdown.
+ * @throws An `Error` naming the file when parsing fails. Errors raised by the
+ * file read itself are not wrapped.
+ */
+export async function parseMarkdownFile(
+  source: string,
+): Promise<ParsedMarkdown> {
+  const fileContent = await fs.readFile(source, 'utf-8');
+
+  let parsed: ParsedMarkdown;
+  try {
+    parsed = parseMarkdownString(fileContent, {source});
+  } catch (e) {
+    // Prefix the parser's message with the offending file path.
+    const message = `Error while parsing markdown file ${source}
+${e.message}`;
+    throw new Error(message);
+  }
+  return parsed;
+}