
Lexical Analysis

Lexical analysis applies a tokenization algorithm: it takes source code as input and produces a stream of source-code tokens as output, which is why the lexer is also called a token generator. The source code → tokens conversion is implemented with a finite state machine: from the current state, consuming one or more characters moves the machine to the next state.
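Every state in the implementation below is a function that accepts a single character and returns the next state function; the driver feeds characters one at a time and follows whatever state comes back. Here is a minimal sketch of that pattern (the names expectLeftAngle and readTagName are illustrative only, not part of the tokenizer below):

// Each state takes one character and returns the next state function.
function expectLeftAngle(char) {
  if (char === "<") return readTagName; // saw "<", now expect a tag name
  throw new Error("Unexpected character: " + char);
}

function readTagName(char) {
  if (/[a-zA-Z0-9]/.test(char)) return readTagName; // keep consuming the name
  if (char === ">") return expectLeftAngle; // tag closed, start over
  throw new Error("Unexpected character: " + char);
}

// Drive the machine: feed characters one by one, following the returned states.
let state = expectLeftAngle;
for (const char of "<h1>") {
  state = state(char);
}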

Implementation


tokenTypes.js
// Token type constants shared by the tokenizer.
exports.AttributeKey = "AttributeKey";
exports.JSXIdentifier = "JSXIdentifier";
exports.LeftParentheses = "LeftParentheses";
exports.AttributeStringValue = "AttributeStringValue";
exports.RightParentheses = "RightParentheses";
exports.JSXText = "JSXText";
exports.BackSlash = "BackSlash"; // the "/" in a closing tag, e.g. </h1>

tokenizer.js
const tokenTypes = require("./tokenTypes");

const LettersReg = /[a-zA-Z0-9]/;

// Token currently being built, and the list of finished tokens.
let currentToken = { type: "", value: "" };
const tokens = [];

// Initial state: a JSX fragment must begin with "<".
function start(char) {
  if (char === "<") {
    emit({ type: tokenTypes.LeftParentheses, value: "<" });
    return foundLeftParentheses;
  }
  throw new Error("The first character must be <");
}

// Push a finished token and reset the working token.
function emit(token) {
  currentToken = { type: "", value: "" };
  tokens.push(token);
}

// Flush whatever is left in the working token at the end of input.
function eof() {
  if (currentToken.value.length) {
    emit(currentToken);
  }
}

// After "<": either a tag name starts, or "/" marks a closing tag.
function foundLeftParentheses(char) {
  if (LettersReg.test(char)) {
    currentToken.type = tokenTypes.JSXIdentifier;
    currentToken.value += char;
    return jsxIdentifier;
  } else if (char === "/") {
    emit({ type: tokenTypes.BackSlash, value: "/" });
    return foundLeftParentheses;
  }
  throw new TypeError("Error");
}

// Reading a tag name: letters extend it, a space starts the attribute
// section, and ">" closes the opening tag.
function jsxIdentifier(char) {
  if (LettersReg.test(char)) {
    currentToken.value += char;
    return jsxIdentifier;
  } else if (char === " ") {
    emit(currentToken);
    return attribute;
  } else if (char === ">") {
    emit(currentToken);
    emit({ type: tokenTypes.RightParentheses, value: ">" });
    return foundRightParentheses;
  }
  return eof;
}

// Expecting an attribute name after a space inside an opening tag.
function attribute(char) {
  if (LettersReg.test(char)) {
    currentToken.type = tokenTypes.AttributeKey;
    currentToken.value += char;
    return attributeKey;
  }
  throw new TypeError("Error");
}

// Reading an attribute name; "=" finishes it and switches to the value.
function attributeKey(char) {
  if (LettersReg.test(char)) {
    currentToken.value += char;
    return attributeKey;
  } else if (char === "=") {
    emit(currentToken);
    return attributeValue;
  }
  throw new TypeError("Error");
}

// An attribute value must start with a double quote.
function attributeValue(char) {
  if (char === '"') {
    currentToken.type = tokenTypes.AttributeStringValue;
    currentToken.value = "";
    return attributeStringValue;
  }
  throw new TypeError("Error");
}

// Reading the quoted string; the closing quote finishes the value.
function attributeStringValue(char) {
  if (LettersReg.test(char)) {
    currentToken.value += char;
    return attributeStringValue;
  } else if (char === '"') {
    emit(currentToken);
    return tryLeaveAttribute;
  }
  throw new TypeError("Error");
}

// After a closed value: a space starts another attribute, ">" ends the tag.
function tryLeaveAttribute(char) {
  if (char === " ") {
    return attribute;
  } else if (char === ">") {
    emit({ type: tokenTypes.RightParentheses, value: ">" });
    return foundRightParentheses;
  }
  throw new TypeError("Error");
}

// After ">": either another tag starts, or text content begins.
function foundRightParentheses(char) {
  if (char === "<") {
    emit({ type: tokenTypes.LeftParentheses, value: "<" });
    return foundLeftParentheses;
  } else {
    currentToken.type = tokenTypes.JSXText;
    currentToken.value += char;
    return jsxText;
  }
}

// Reading text content until the next "<".
function jsxText(char) {
  if (char === "<") {
    emit(currentToken);
    emit({ type: tokenTypes.LeftParentheses, value: "<" });
    return foundLeftParentheses;
  } else {
    currentToken.value += char;
    return jsxText;
  }
}

// Drive the state machine: feed characters one by one, then flush any
// remaining token at the end of input.
function tokenizer(input) {
  let state = start;
  for (let char of input) {
    if (state) {
      state = state(char);
    }
  }
  eof();
  return tokens;
}

module.exports = {
  tokenizer,
};

Test


const { tokenizer } = require("./tokenizer");

let sourceCode = '<h1 id="title"><span>hello</span>world</h1>';
console.log(tokenizer(sourceCode));
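
Running the test should log a token stream roughly like the one below (obtained by tracing the input string through the states above; the exact formatting depends on console.log):

[
  { type: 'LeftParentheses', value: '<' },
  { type: 'JSXIdentifier', value: 'h1' },
  { type: 'AttributeKey', value: 'id' },
  { type: 'AttributeStringValue', value: 'title' },
  { type: 'RightParentheses', value: '>' },
  { type: 'LeftParentheses', value: '<' },
  { type: 'JSXIdentifier', value: 'span' },
  { type: 'RightParentheses', value: '>' },
  { type: 'JSXText', value: 'hello' },
  { type: 'LeftParentheses', value: '<' },
  { type: 'BackSlash', value: '/' },
  { type: 'JSXIdentifier', value: 'span' },
  { type: 'RightParentheses', value: '>' },
  { type: 'JSXText', value: 'world' },
  { type: 'LeftParentheses', value: '<' },
  { type: 'BackSlash', value: '/' },
  { type: 'JSXIdentifier', value: 'h1' },
  { type: 'RightParentheses', value: '>' }
]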