feat(v2): various markdown string parsing improvements/fixes (#4590)

* extract createExcerpt code in separate file + add bad test * almost working markdown parsing refactor * complete parseMarkdownString refactor * fix tests * fix blog test issue * fix docusaurus utils imports
2025-07-27 13:38:33 +02:00 · 2021-04-09 17:09:33 +02:00 · 2021-04-09 17:09:33 +02:00 · 4efe6824b3
commit 4efe6824b3
parent b743edf5fb
15 changed files with 895 additions and 563 deletions
--- a/packages/docusaurus-utils/src/index.ts
+++ b/packages/docusaurus-utils/src/index.ts
@ -7,7 +7,6 @@

 import chalk from 'chalk';
 import path from 'path';
-import matter from 'gray-matter';
 import {createHash} from 'crypto';
 import {camelCase, kebabCase, mapValues} from 'lodash';
 import escapeStringRegexp from 'escape-string-regexp';
@ -23,6 +22,8 @@ import {
 import resolvePathnameUnsafe from 'resolve-pathname';

 export * from './codeTranslationsUtils';
+export * from './markdownParser';
+export * from './markdownLinks';

 const fileHash = new Map();
 export async function generate(
@ -206,135 +207,6 @@ export function getSubFolder(file: string, refDir: string): string | null {
  return match && match[1];
 }

-export function createExcerpt(fileString: string): string | undefined {
-  const fileLines = fileString.trimLeft().split('\n');
-
-  /* eslint-disable no-continue */
-  // eslint-disable-next-line no-restricted-syntax
-  for (const fileLine of fileLines) {
-    // Skip empty line.
-    if (!fileLine.trim()) {
-      continue;
-    }
-
-    // Skip import/export declaration.
-    if (/^\s*?import\s.*(from.*)?;?|export\s.*{.*};?/.test(fileLine)) {
-      continue;
-    }
-
-    const cleanedLine = fileLine
-      // Remove HTML tags.
-      .replace(/<[^>]*>/g, '')
-      // Remove ATX-style headers.
-      .replace(/^\#{1,6}\s*([^#]*)\s*(\#{1,6})?/gm, '$1')
-      // Remove emphasis and strikethroughs.
-      .replace(/([\*_~]{1,3})(\S.*?\S{0,1})\1/g, '$2')
-      // Remove images.
-      .replace(/\!\[(.*?)\][\[\(].*?[\]\)]/g, '$1')
-      // Remove footnotes.
-      .replace(/\[\^.+?\](\: .*?$)?/g, '')
-      // Remove inline links.
-      .replace(/\[(.*?)\][\[\(].*?[\]\)]/g, '$1')
-      // Remove inline code.
-      .replace(/`(.+?)`/g, '$1')
-      // Remove blockquotes.
-      .replace(/^\s{0,3}>\s?/g, '')
-      // Remove admonition definition.
-      .replace(/(:{3}.*)/, '')
-      // Remove Emoji names within colons include preceding whitespace.
-      .replace(/\s?(:(::|[^:\n])+:)/g, '')
-      .trim();
-
-    if (cleanedLine) {
-      return cleanedLine;
-    }
-  }
-
-  return undefined;
-}
-
-type ParsedMarkdown = {
-  // Returned by gray-matter
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  frontMatter: Record<string, any>;
-  content: string;
-  excerpt: string | undefined;
-  hasFrontMatter: boolean;
-};
-
-export function readFrontMatter(
-  markdownString: string,
-  source?: string,
-  options: Record<string, unknown> = {},
-  removeTitleHeading = true,
-): ParsedMarkdown {
-  try {
-    const result = matter(markdownString, options);
-    result.data = result.data || {};
-    result.content = result.content.trim();
-
-    const hasFrontMatter = Object.keys(result.data).length > 0;
-
-    const heading = /^# (.*)[\n\r]?/gi.exec(result.content);
-    if (heading) {
-      if (result.data.title) {
-        if (removeTitleHeading) {
-          console.warn(
-            `Duplicate title detected in \`${source || 'this'}\` file`,
-          );
-        }
-      } else {
-        result.data.title = heading[1].trim();
-        if (removeTitleHeading) {
-          result.content = result.content.replace(heading[0], '');
-          if (result.excerpt) {
-            result.excerpt = result.excerpt.replace(heading[1], '');
-          }
-        }
-      }
-    }
-
-    return {
-      frontMatter: result.data,
-      content: result.content,
-      excerpt: result.excerpt,
-      hasFrontMatter,
-    };
-  } catch (e) {
-    throw new Error(`Error while parsing markdown front matter.
-This can happen if you use special characters like : in frontmatter values (try using "" around that value)
-${e.message}`);
-  }
-}
-
-export function parseMarkdownString(
-  markdownString: string,
-  source?: string,
-): ParsedMarkdown {
-  return readFrontMatter(markdownString, source, {
-    excerpt: (file: matter.GrayMatterFile<string>): void => {
-      // Hacky way of stripping out import statements from the excerpt
-      // TODO: Find a better way to do so, possibly by compiling the Markdown content,
-      // stripping out HTML tags and obtaining the first line.
-      file.excerpt = createExcerpt(file.content);
-    },
-  });
-}
-
-export async function parseMarkdownFile(
-  source: string,
-): Promise<ParsedMarkdown> {
-  const markdownString = await fs.readFile(source, 'utf-8');
-  try {
-    return parseMarkdownString(markdownString, source);
-  } catch (e) {
-    throw new Error(
-      `Error while parsing markdown file ${source}
-${e.message}`,
-    );
-  }
-}
-
 export function normalizeUrl(rawUrls: string[]): string {
  const urls = rawUrls;
  const resultArray = [];