tokenizer.js

The tokenizer splits the source code into tokens, preparing the input for the parser stage that follows.

Demo 1

Generate an AST (abstract syntax tree) from <h1 id="title"><span>hello</span>world</h1>.

Analysis

Before <h1 id="title"><span>hello</span>world</h1> can be turned into an AST, it must first be broken down into individual { type: ..., value: ... } structures. Each of these structures is called a token, and producing them is called tokenization. The technique used here is a finite state machine: every state is a function that consumes one character and returns the next state function (a short trace follows the token list below).

The tokens generated for <h1 id="title"><span>hello</span>world</h1>:

[
  { type: 'LeftParentheses', value: '<' },
  { type: 'JSXIdentifier', value: 'h1' },
  { type: 'AttributeKey', value: 'id' },
  { type: 'AttributeStringValue', value: 'title' },
  { type: 'RightParentheses', value: '>' },
  { type: 'LeftParentheses', value: '<' },
  { type: 'JSXIdentifier', value: 'span' },
  { type: 'RightParentheses', value: '>' },
  { type: 'JSXText', value: 'hello' },
  { type: 'LeftParentheses', value: '<' },
  { type: 'BackSlash', value: '/' },
  { type: 'JSXIdentifier', value: 'span' },
  { type: 'RightParentheses', value: '>' },
  { type: 'JSXText', value: 'world' },
  { type: 'LeftParentheses', value: '<' },
  { type: 'BackSlash', value: '/' },
  { type: 'JSXIdentifier', value: 'h1' },
  { type: 'RightParentheses', value: '>' }
]
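
To make the state-machine idea concrete, here is a trace of how the states advance over the first few characters of the input (the state names match the functions in the implementation below):

char  state before           action                          state after
'<'   start                  emit LeftParentheses '<'        foundLeftParentheses
'h'   foundLeftParentheses   begin a JSXIdentifier token     jsxIdentifier
'1'   jsxIdentifier          append; value is now 'h1'       jsxIdentifier
' '   jsxIdentifier          emit JSXIdentifier 'h1'         attribute
'i'   attribute              begin an AttributeKey token     attributeKey
'd'   attributeKey           append; value is now 'id'       attributeKey
'='   attributeKey           emit AttributeKey 'id'          attributeValue
'"'   attributeValue         begin AttributeStringValue      attributeStringValue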

Implementation

const tokenTypes = require("./tokenTypes");

const tokens = []; // tokens collected so far
let currentToken = { type: "", value: "" }; // the token currently being built
const SpaceReg = /\s/; // whitespace
const LettersReg = /[a-zA-Z0-9]/; // letters and digits

function tokenizer(input) {
  // Reset module-level state so the tokenizer can be called more than once.
  tokens.length = 0;
  currentToken = { type: "", value: "" };
  let state = start; // each state is a function: (char) => next state
  for (let char of input) {
    if (state) {
      state = state(char);
    }
  }
  eof(); // flush any token still being built when the input ends
  return tokens;
}

function start(char) {
  if (char === "<") {
    let token = {
      type: tokenTypes.LeftParentheses,
      value: "<",
    };
    emit(token);
    return foundLeftParentheses;
  }
  throw new Error("The first character must be <");
}

function emit(token) {
  // Push a finished token and start a fresh buffer.
  currentToken = { type: "", value: "" };
  tokens.push(token);
}

function foundLeftParentheses(char) {
  if (LettersReg.test(char)) {
    // Start of a tag name such as h1 or span.
    currentToken.type = tokenTypes.JSXIdentifier;
    currentToken.value += char;
    return jsxIdentifier;
  } else if (char === "/") {
    // Closing tag such as </span>. The "/" is a forward slash,
    // although the token type is named BackSlash in tokenTypes.
    let token = {
      type: tokenTypes.BackSlash,
      value: "/",
    };
    emit(token);
    return foundLeftParentheses;
  }
  throw new TypeError("Syntax error");
}
function jsxIdentifier(char) {
  if (LettersReg.test(char)) {
    currentToken.value += char;
    return jsxIdentifier;
  } else if (SpaceReg.test(char)) {
    // Whitespace ends the tag name; attributes may follow.
    emit(currentToken);
    return attribute;
  } else if (char === ">") {
    // ">" ends both the tag name and the opening tag.
    emit(currentToken);
    let token = {
      type: tokenTypes.RightParentheses,
      value: ">",
    };
    emit(token);
    return foundRightParentheses;
  }
  // Any other character: hand off to eof, which flushes what is left.
  return eof;
}

function attribute(char) {
  if (LettersReg.test(char)) {
    currentToken.type = tokenTypes.AttributeKey;
    currentToken.value += char;
    return attributeKey;
  }
  throw new TypeError("Syntax error");
}

function foundRightParentheses(char) {
  if (char === "<") {
    let token = {
      type: tokenTypes.LeftParentheses,
      value: "<",
    };
    emit(token);
    return foundLeftParentheses;
  } else {
    // Anything other than "<" after a ">" is text content.
    currentToken.value += char;
    currentToken.type = tokenTypes.JSXText;
    return jsxText;
  }
}

function eof() {
  // Flush the token still being built, if any, when the input ends.
  if (currentToken.value.length) {
    emit(currentToken);
  }
}

function attributeKey(char) {
  if (LettersReg.test(char)) {
    currentToken.value += char;
    return attributeKey;
  } else if (char === "=") {
    emit(currentToken);
    return attributeValue;
  }
  throw new TypeError("Syntax error");
}

function attributeValue(char) {
  if (char === '"') {
    // Attribute values must be double-quoted strings.
    currentToken.type = tokenTypes.AttributeStringValue;
    currentToken.value = "";
    return attributeStringValue;
  }
  throw new Error("Syntax error");
}

function attributeStringValue(char) {
  if (LettersReg.test(char)) {
    currentToken.value += char;
    return attributeStringValue;
  } else if (char === '"') {
    // The closing quote ends the attribute value.
    emit(currentToken);
    return tryLeaveAttribute;
  }
  throw new Error("Syntax error");
}

function tryLeaveAttribute(char) {
  if (SpaceReg.test(char)) {
    // Whitespace: another attribute may follow.
    return attribute;
  } else if (char === ">") {
    let token = {
      type: tokenTypes.RightParentheses,
      value: ">",
    };
    emit(token);
    return foundRightParentheses;
  }
  throw new TypeError("Syntax error");
}

function jsxText(char) {
  if (char === "<") {
    // "<" ends the text node and starts the next tag.
    emit(currentToken);
    let token = {
      type: tokenTypes.LeftParentheses,
      value: "<",
    };
    emit(token);
    return foundLeftParentheses;
  } else {
    currentToken.value += char;
    return jsxText;
  }
}

module.exports = tokenizer;
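
The implementation above requires a ./tokenTypes module that is not shown in this section. Its exact contents are an assumption here; all the tokenizer needs is an object exposing the seven token-type names as string constants, matching the type strings in the token list above. A minimal sketch:

// tokenTypes.js (minimal sketch; the export style is an assumption)
exports.LeftParentheses = "LeftParentheses";
exports.RightParentheses = "RightParentheses";
exports.JSXIdentifier = "JSXIdentifier";
exports.AttributeKey = "AttributeKey";
exports.AttributeStringValue = "AttributeStringValue";
exports.BackSlash = "BackSlash";
exports.JSXText = "JSXText";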

Testing

const tokenizer = require("./tokenizer");

const sourceCode = '<h1 id="title"><span>hello</span>world</h1>';
const tokens = tokenizer(sourceCode);
console.log(tokens);
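
Running this script (for example with node) should print the token array shown in the Analysis section above.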