off-by-one tokenizing error in codemirror/lang-sql

blh · August 21, 2021, 5:26am

I’m noticing that this commit introduces an off-by-one tokenizing error, at minimum when reading keywords.

The following code causes the first character of a keyword to be skipped. For example, for “CREATE”, String.fromCharCode(next) gives C, but readWord returns REATE, because input.advance() is called between destructuring next from input and calling readWord.

// tokensFor
let {next} = input
input.advance()

    } else if (isAlpha(next)) {
      let word = readWord(input) // 'REATE'
      input.acceptToken(d.words[word.toLowerCase()] ?? Identifier)
    }

// Accumulates characters starting at the stream's current `input.next`,
// for as long as that character is `Ch.Underscore` or passes `isAlpha`,
// advancing the stream past each one. Returns the collected string, which
// is empty when the current character does not qualify.
function readWord(input: InputStream) {
  let word = ""
  while (input.next == Ch.Underscore || isAlpha(input.next)) {
    word += String.fromCharCode(input.next)
    input.advance()
  }
  return word
}

Previously, we had

let pos = token.start, next = input.get(pos++)

    } else if (isAlpha(next)) {
      pos = readWord(input, pos) // skip to after end of 'CREATE'
      token.accept(d.words[input.read(token.start, pos).toLowerCase()] ?? Identifier, pos) // get word between token.start and pos which is 'CREATE'
    }

// Scans forward from `pos`, reading characters with `input.get`, and
// returns the first position whose character is neither `Ch.Underscore`
// nor accepted by `isAlpha`. Returns `pos` unchanged when the character
// at `pos` already fails that test.
function readWord(input: Input, pos: number) {
  let end = pos
  let ch = input.get(end)
  while (ch == Ch.Underscore || isAlpha(ch)) {
    end++
    ch = input.get(end)
  }
  return end
}

marijn · August 21, 2021, 6:06am

Thanks for spotting that. This patch should help.