The interview question of the week for this week on Cassidoo's weekly newsletter is,
Write a function that takes in a string of valid JSON and converts it to an object (or whatever your chosen language uses, dicts, maps, etc). Example input:
fakeParseJSON('{ "data": { "fish": "cake", "array": [1,2,3], "children": [ { "something": "else" }, { "candy": "cane" }, { "sponge": "bob" } ] } } ')
At one point, I was tempted to just to write:
const fakeParseJSON = JSON.parse;
But, I thought, I've written quite a few articles about AST:
- Creating custom JavaScript syntax with Babel
- Step-by-step guide for writing a custom babel transformation
- Manipulating AST with JavaScript
which covers the overview of the compiler pipeline, as well as how to manipulate AST, but I haven't covered much on how to implement a parser.
That's because, implementing a JavaScript compiler in an article is a task too daunting for me.
Well, fret not. JSON is also a language. It has its own grammar, which you can refer from the specifications. The knowledge and technique you need to write a JSON parser is transferrable to writing a JS parser.
So, let's start writing a JSON parser!
Understand the grammar
If you look at the specification page, there's 2 diagrams:
The syntax diagram (or railroad diagram) on the left,
Image source: https://www.json.org/img/object.png
The McKeeman Form, a variant of Backus-Naur Form (BNF), on the right
json
element
value
object
array
string
number
"true"
"false"
"null"
object
'{' ws '}'
'{' members '}'
Both diagrams are equivalent.
One is visual and one is text based. The text based grammar syntax, Backus-Naur Form, is usually fed to another parser that parse this grammar and generate a parser for it. Speaking of parser-ception!
In this article, we will focus on the railroad diagram, because it is visual and seemed to be more friendly to me.
Lets' look at the first railroad diagram:
Image source: https://www.json.org/img/object.png
So this is the grammar for "object" in JSON.
We start from the left, following the arrow, and then we end at the right.
The circles, eg {
, ,
, :
, }
, are the characters, and the boxes eg: whitespace
, string
, and value
is a placeholder for another grammar. So to parse the "whitespace", we will need to look at the grammar for "whitepsace".
So, starting from the left, for an object, the first character has to be an open curly bracket, {
. and then we have 2 options from here:
whitespace
→}
→ end, orwhitespace
→string
→whitespace
→:
→value
→}
→ end
Of course, when you reach "value", you can choose to go to:
- →
}
→ end, or - →
,
→whitespace
→ ... → value
and you can keep looping, until you decide to go to:
- →
}
→ end.
So, I guess we are now acquainted with the railroad diagram, let's carry on to the next section.
Implementing the parser
Let's start with the following structure:
function fakeParseJSON(str) {
let i = 0;
// TODO
}
We initialise i
as the index for the current character, we will end as soon as i
reaches the end of the str
.
Let's implement the grammar for the "object":
function fakeParseJSON(str) {
let i = 0;
function parseObject() {
if (str[i] === '{') {
i++;
skipWhitespace();
// if it is not '}',
// we take the path of string -> whitespace -> ':' -> value -> ...
while (str[i] !== '}') {
const key = parseString();
skipWhitespace();
eatColon();
const value = parseValue();
}
}
}
}
In the parseObject
, we will call parse of other grammars, like "string" and "whitespace", when we implement them, everything will work 🤞.
One thing that I forgot to add is the comma, ,
. The ,
only appears before we start the second loop of whitespace
→ string
→ whitespace
→ :
→ ...
Based on that, we add the following lines:
...
function fakeParseJSON(str) {
let i = 0;
const value = parseValue();
expectEndOfInput();
return value;
function parseObject() {
if (str[i] === "{") {
i++;
skipWhitespace();
const result = {};
let initial = true;
// if it is not '}',
// we take the path of string -> whitespace -> ':' -> value -> ...
while (i < str.length && str[i] !== "}") {
if (!initial) {
eatComma();
skipWhitespace();
}
const key = parseString();
if (key === undefined) {
expectObjectKey();
}
skipWhitespace();
eatColon();
const value = parseValue();
result[key] = value;
initial = false;
}
expectNotEndOfInput("}");
// move to the next character of '}'
i++;
return result;
}
}
function parseArray() {
if (str[i] === "[") {
i++;
skipWhitespace();
const result = [];
let initial = true;
while (i < str.length && str[i] !== "]") {
if (!initial) {
eatComma();
}
const value = parseValue();
result.push(value);
initial = false;
}
expectNotEndOfInput("]");
// move to the next character of ']'
i++;
return result;
}
}
function parseValue() {
skipWhitespace();
const value =
parseString() ??
parseNumber() ??
parseObject() ??
parseArray() ??
parseKeyword("true", true) ??
parseKeyword("false", false) ??
parseKeyword("null", null);
skipWhitespace();
return value;
}
function parseKeyword(name, value) {
if (str.slice(i, i + name.length) === name) {
i += name.length;
return value;
}
}
function skipWhitespace() {
while (
str[i] === " " ||
str[i] === "\n" ||
str[i] === "\t" ||
str[i] === "\r"
) {
i++;
}
}
function parseString() {
if (str[i] === '"') {
i++;
let result = "";
while (i < str.length && str[i] !== '"') {
if (str[i] === "\\") {
const char = str[i + 1];
if (
char === '"' ||
char === "\\" ||
char === "/" ||
char === "b" ||
char === "f" ||
char === "n" ||
char === "r" ||
char === "t"
) {
result += char;
i++;
} else if (char === "u") {
if (
isHexadecimal(str[i + 2]) &&
isHexadecimal(str[i + 3]) &&
isHexadecimal(str[i + 4]) &&
isHexadecimal(str[i + 5])
) {
result += String.fromCharCode(
parseInt(str.slice(i + 2, i + 6), 16)
);
i += 5;
} else {
i += 2;
expectEscapeUnicode(result);
}
} else {
expectEscapeCharacter(result);
}
} else {
result += str[i];
}
i++;
}
expectNotEndOfInput('"');
i++;
return result;
}
}
function isHexadecimal(char) {
return (
(char >= "0" && char <= "9") ||
(char.toLowerCase() >= "a" && char.toLowerCase() <= "f")
);
}
function parseNumber() {
let start = i;
if (str[i] === "-") {
i++;
expectDigit(str.slice(start, i));
}
if (str[i] === "0") {
i++;
} else if (str[i] >= "1" && str[i] <= "9") {
i++;
while (str[i] >= "0" && str[i] <= "9") {
i++;
}
}
if (str[i] === ".") {
i++;
expectDigit(str.slice(start, i));
while (str[i] >= "0" && str[i] <= "9") {
i++;
}
}
if (str[i] === "e" || str[i] === "E") {
i++;
if (str[i] === "-" || str[i] === "+") {
i++;
}
expectDigit(str.slice(start, i));
while (str[i] >= "0" && str[i] <= "9") {
i++;
}
}
if (i > start) {
return Number(str.slice(start, i));
}
}
function eatComma() {
expectCharacter(",");
i++;
}
function eatColon() {
expectCharacter(":");
i++;
}
// error handling
function expectNotEndOfInput(expected) {
if (i === str.length) {
printCodeSnippet(`Expecting a \`${expected}\` here`);
throw new Error("JSON_ERROR_0001 Unexpected End of Input");
}
}
function expectEndOfInput() {
if (i < str.length) {
printCodeSnippet("Expecting to end here");
throw new Error("JSON_ERROR_0002 Expected End of Input");
}
}
function expectObjectKey() {
printCodeSnippet(`Expecting object key here
For example:
{ "foo": "bar" }
^^^^^`);
throw new Error("JSON_ERROR_0003 Expecting JSON Key");
}
function expectCharacter(expected) {
if (str[i] !== expected) {
printCodeSnippet(`Expecting a \`${expected}\` here`);
throw new Error("JSON_ERROR_0004 Unexpected token");
}
}
function expectDigit(numSoFar) {
if (!(str[i] >= "0" && str[i] <= "9")) {
printCodeSnippet(`JSON_ERROR_0005 Expecting a digit here
For example:
${numSoFar}5
${" ".repeat(numSoFar.length)}^`);
throw new Error("JSON_ERROR_0006 Expecting a digit");
}
}
function expectEscapeCharacter(strSoFar) {
printCodeSnippet(`JSON_ERROR_0007 Expecting escape character
For example:
"${strSoFar}\\n"
${" ".repeat(strSoFar.length + 1)}^^
List of escape characters are: \\", \\\\, \\/, \\b, \\f, \\n, \\r, \\t, \\u`);
throw new Error("JSON_ERROR_0008 Expecting an escape character");
}
function expectEscapeUnicode(strSoFar) {
printCodeSnippet(`Expect escape unicode
For example:
"${strSoFar}\\u0123
${" ".repeat(strSoFar.length + 1)}^^^^^^`);
throw new Error("JSON_ERROR_0009 Expecting an escape unicode");
}
function printCodeSnippet(message) {
const from = Math.max(0, i - 10);
const trimmed = from > 0;
const padding = (trimmed ? 4 : 0) + (i - from);
const snippet = [
(trimmed ? "... " : "") + str.slice(from, i + 1),
" ".repeat(padding) + "^",
" ".repeat(padding) + message
].join("\n");
console.log(snippet);
}
}
...
Summary
To implement a parser, you need to start with the grammar.
You can formalise the grammar with the railroad diagrams or the Backus-Naur Form. Designing the grammar is the hardest step.
Once you've settled with the grammar, you can start implementing the parser based on it.
Error handling is important, what's more important is to have meaningful error messages, so that the user knows how to fix it.
Now you know how a simple parser is implemented, it's time to set eyes on a more complex one:
版权声明
本博客所有的转载文章,作者皆保留版权。转载必须包含本声明,保持本文完整,并以超链接形式注明作者陈立豪和本文原始地址:https://lihautan.com/json-parser-with-javascript/
(完)