Fix escape codeblock (#2230)

* updated package-lock * added fix and test for escape html in code block * fixed markdown preview render bug * updated comment in escape function * improved escape function * Delete package-lock.json
2026-01-27 23:57:17 +00:00 · 2018-08-09 15:08:52 +07:00
parent 7330cdaf1c
commit f0df787bbe
5 changed files with 96 additions and 53 deletions
--- a/browser/lib/markdown-it-sanitize-html.js
+++ b/browser/lib/markdown-it-sanitize-html.js
@@ -1,6 +1,7 @@
 'use strict'

 import sanitizeHtml from 'sanitize-html'
+import { escapeHtmlCharacters } from './utils'

 module.exports = function sanitizePlugin (md, options) {
  options = options || {}
@@ -8,16 +9,26 @@ module.exports = function sanitizePlugin (md, options) {
  md.core.ruler.after('linkify', 'sanitize_inline', state => {
    for (let tokenIdx = 0; tokenIdx < state.tokens.length; tokenIdx++) {
      if (state.tokens[tokenIdx].type === 'html_block') {
-        state.tokens[tokenIdx].content = sanitizeHtml(state.tokens[tokenIdx].content, options)
+        state.tokens[tokenIdx].content = sanitizeHtml(
+          state.tokens[tokenIdx].content,
+          options
+        )
      }
      if (state.tokens[tokenIdx].type === 'fence') {
-        state.tokens[tokenIdx].content = state.tokens[tokenIdx].content.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;')
+        // escapeHtmlCharacters has better performance
+        state.tokens[tokenIdx].content = escapeHtmlCharacters(
+          state.tokens[tokenIdx].content,
+          { skipSingleQuote: true }
+        )
      }
      if (state.tokens[tokenIdx].type === 'inline') {
        const inlineTokens = state.tokens[tokenIdx].children
        for (let childIdx = 0; childIdx < inlineTokens.length; childIdx++) {
          if (inlineTokens[childIdx].type === 'html_inline') {
-            inlineTokens[childIdx].content = sanitizeHtml(inlineTokens[childIdx].content, options)
+            inlineTokens[childIdx].content = sanitizeHtml(
+              inlineTokens[childIdx].content,
+              options
+            )
          }
        }
      }
--- a/browser/lib/utils.js
+++ b/browser/lib/utils.js
@@ -6,8 +6,12 @@ export function lastFindInArray (array, callback) {
  }
 }

-export function escapeHtmlCharacters (html, opt = { detectCodeBlock: false }) {
+export function escapeHtmlCharacters (
+  html,
+  opt = { detectCodeBlock: false, skipSingleQuote: false }
+) {
  const matchHtmlRegExp = /["'&<>]/g
+  const matchCodeBlockRegExp = /```/g
  const escapes = ['&quot;', '&amp;', '&#39;', '&lt;', '&gt;']
  let match = null
  const replaceAt = (str, index, replace) =>
@@ -15,11 +19,18 @@ export function escapeHtmlCharacters (html, opt = { detectCodeBlock: false }) {
    replace +
    str.substr(index + replace.length - (replace.length - 1))

-  // detecting code block
-  while ((match = matchHtmlRegExp.exec(html)) != null) {
+  while ((match = matchHtmlRegExp.exec(html)) !== null) {
    const current = { char: match[0], index: match.index }
+    const codeBlockIndexs = []
+    let openCodeBlock = null
+    // if the detectCodeBlock option is activated then this function should skip
+    // characters that needed to be escape but located in code block
    if (opt.detectCodeBlock) {
-      // position of the nearest line start
+      // The first type of code block is lines that start with 4 spaces
+      // Here we check for the \n character located before the character that
+      // needed to be escape. It means we check for the begining of the line that
+      // contain that character, then we check if there are 4 spaces next to the
+      // \n character (the line start with 4 spaces)
      let previousLineEnd = current.index - 1
      while (html[previousLineEnd] !== '\n' && previousLineEnd !== -1) {
        previousLineEnd--
@@ -31,16 +42,54 @@ export function escapeHtmlCharacters (html, opt = { detectCodeBlock: false }) {
        html[previousLineEnd + 3] === ' ' &&
        html[previousLineEnd + 4] === ' '
      ) {
-        // so skip it
+        // skip the current character
+        continue
+      }
+      // The second type of code block is lines that wrapped in ```
+      // We will get the position of each ```
+      // then push it into an array
+      // then the array returned will be like this:
+      // [startCodeblock, endCodeBlock, startCodeBlock, endCodeBlock]
+      while ((openCodeBlock = matchCodeBlockRegExp.exec(html)) !== null) {
+        codeBlockIndexs.push(openCodeBlock.index)
+      }
+      let shouldSkipChar = false
+      // we loop through the array of positions
+      // we skip 2 element as the i index position is the position of ``` that
+      // open the codeblock and the i + 1 is the position of the ``` that close
+      // the code block
+      for (let i = 0; i < codeBlockIndexs.length; i += 2) {
+        // the i index position is the position of the ``` that open code block
+        // so we have to + 2 as that position is the position of the first ` in the ````
+        // but we need to make sure that the position current character is larger
+        // that the last ` in the ``` that open the code block so we have to take
+        // the position of the first ` and + 2
+        // the i + 1 index position is the closing ``` so the char must less than it
+        if (
+          current.index > codeBlockIndexs[i] + 2 &&
+          current.index < codeBlockIndexs[i + 1]
+        ) {
+          // skip it
+          shouldSkipChar = true
+          break
+        }
+      }
+      if (shouldSkipChar) {
+        // skip the current character
        continue
      }
    }
    // otherwise, escape it !!!
    if (current.char === '&') {
+      // when escaping character & we have to be becareful as the & could be a part
+      // of an escaped character like &quot; will be came &amp;quot;
      let nextStr = ''
      let nextIndex = current.index
      let escapedStr = false
-      // maximum length of an escape string is 5. For example ('&quot;')
+      // maximum length of an escaped string is 5. For example ('&quot;')
+      // we take the next 5 character of the next string if it is one of the string:
+      // ['&quot;', '&amp;', '&#39;', '&lt;', '&gt;'] then we will not escape the & character
+      // as it is a part of the escaped string and should not be escaped
      while (nextStr.length <= 5) {
        nextStr += html[nextIndex]
        nextIndex++
@@ -55,7 +104,7 @@ export function escapeHtmlCharacters (html, opt = { detectCodeBlock: false }) {
      }
    } else if (current.char === '"') {
      html = replaceAt(html, current.index, '&quot;')
-    } else if (current.char === "'") {
+    } else if (current.char === "'" && !opt.skipSingleQuote) {
      html = replaceAt(html, current.index, '&#39;')
    } else if (current.char === '<') {
      html = replaceAt(html, current.index, '&lt;')