一个简单的 HTML 词法分析器
将一段 HTML 进行扫描:在读取字符流的过程中逐个字符进行分析、对词进行分类,把每个词添加到 token 数组中并标识其身份;同时对每条语句进行分析并抽象化,最终生成一个抽象树结构的元素对象。
将一段html进行扫描,在扫描读取字符流过程中会逐个字的分析,进行词的分类。把每个词添加token数组中,进行身份标识。并且对每个语句进行分析,然后把它抽象化。变成一个抽象化树结构的元素对象。
目录
举例:
HTML代码
<div class="wrapper" >
<!--注释-->
<div class="head">
hello world
你好世界
</div>
<div class="main">
<div class="content">
<input type="text" />
</div>
</div>
</div>
<div class="footer">
footer
</div>
会通过上面这段html,分析收集生成一个 tokens数组,还会生成一个root抽象化的对象
tokens:属性名、属性值、标签名、左尖括号、右尖括号、注释、文本
node:Array<{type:string,attributes:Array<{name:string,value:any}>,children?:any[]}>
Tokens数组
[
{
"text": "<",
"type": "startLess",
"line": 1,
"start": 0,
"end": 1
},
{
"text": "div",
"type": "nodeName",
"line": 1,
"start": 1,
"end": 4
},
{
"text": "class",
"type": "attributeName",
"line": 1,
"start": 5,
"end": 10
},
{
"text": "=",
"type": "attrubuteEqual",
"line": 1,
"start": 10,
"end": 11
},
{
"text": "\"wrapper\"",
"type": "attributeValue",
"line": 1,
"start": 11,
"end": 20
},
{
"text": ">",
"type": "startGreater",
"line": 1,
"start": 21,
"end": 22
},
{
"text": "<!--注释-->",
"type": "comment",
"line": 2,
"start": 21,
"end": 30
},
{
"text": "<",
"type": "startLess",
"line": 3,
"start": 20,
"end": 21
},
{
"text": "div",
"type": "nodeName",
"line": 3,
"start": 21,
"end": 24
},
{
"text": "class",
"type": "attributeName",
"line": 3,
"start": 25,
"end": 30
},
{
"text": "=",
"type": "attrubuteEqual",
"line": 3,
"start": 30,
"end": 31
},
{
"text": "\"head\"",
"type": "attributeValue",
"line": 3,
"start": 31,
"end": 37
},
{
"text": ">",
"type": "startGreater",
"line": 3,
"start": 37,
"end": 38
},
{
"text": "hello world",
"type": "text",
"line": 4,
"start": 24,
"end": 35
},
{
"text": "你好世界",
"type": "text",
"line": 5,
"start": 24,
"end": 28
},
{
"text": "</",
"type": "endLess",
"line": 6,
"start": 20,
"end": 22
},
{
"text": "div",
"type": "nodeName",
"line": 6,
"start": 22,
"end": 25
},
{
"text": ">",
"type": "endGreater",
"line": 6,
"start": 25,
"end": 26
},
{
"text": "<",
"type": "startLess",
"line": 7,
"start": 20,
"end": 21
},
{
"text": "div",
"type": "nodeName",
"line": 7,
"start": 21,
"end": 24
},
{
"text": "class",
"type": "attributeName",
"line": 7,
"start": 25,
"end": 30
},
{
"text": "=",
"type": "attrubuteEqual",
"line": 7,
"start": 30,
"end": 31
},
{
"text": "\"main\"",
"type": "attributeValue",
"line": 7,
"start": 31,
"end": 37
},
{
"text": ">",
"type": "startGreater",
"line": 7,
"start": 37,
"end": 38
},
{
"text": "<",
"type": "startLess",
"line": 8,
"start": 24,
"end": 25
},
{
"text": "div",
"type": "nodeName",
"line": 8,
"start": 25,
"end": 28
},
{
"text": "class",
"type": "attributeName",
"line": 8,
"start": 29,
"end": 34
},
{
"text": "=",
"type": "attrubuteEqual",
"line": 8,
"start": 34,
"end": 35
},
{
"text": "\"content\"",
"type": "attributeValue",
"line": 8,
"start": 35,
"end": 44
},
{
"text": ">",
"type": "startGreater",
"line": 8,
"start": 44,
"end": 45
},
{
"text": "<",
"type": "startLess",
"line": 9,
"start": 28,
"end": 29
},
{
"text": "input",
"type": "nodeName",
"line": 9,
"start": 29,
"end": 34
},
{
"text": "type",
"type": "attributeName",
"line": 9,
"start": 35,
"end": 39
},
{
"text": "=",
"type": "attrubuteEqual",
"line": 9,
"start": 39,
"end": 40
},
{
"text": "\"text\"",
"type": "attributeValue",
"line": 9,
"start": 40,
"end": 46
},
{
"text": "/>",
"type": "startGreater",
"line": 9,
"start": 47,
"end": 49
},
{
"text": "</",
"type": "endLess",
"line": 10,
"start": 24,
"end": 26
},
{
"text": "div",
"type": "nodeName",
"line": 10,
"start": 26,
"end": 29
},
{
"text": ">",
"type": "endGreater",
"line": 10,
"start": 29,
"end": 30
},
{
"text": "</",
"type": "endLess",
"line": 11,
"start": 20,
"end": 22
},
{
"text": "div",
"type": "nodeName",
"line": 11,
"start": 22,
"end": 25
},
{
"text": ">",
"type": "endGreater",
"line": 11,
"start": 25,
"end": 26
},
{
"text": "</",
"type": "endLess",
"line": 12,
"start": 16,
"end": 18
},
{
"text": "div",
"type": "nodeName",
"line": 12,
"start": 18,
"end": 21
},
{
"text": ">",
"type": "endGreater",
"line": 12,
"start": 21,
"end": 22
},
{
"text": "<",
"type": "startLess",
"line": 13,
"start": 16,
"end": 17
},
{
"text": "div",
"type": "nodeName",
"line": 13,
"start": 17,
"end": 20
},
{
"text": "class",
"type": "attributeName",
"line": 13,
"start": 21,
"end": 26
},
{
"text": "=",
"type": "attrubuteEqual",
"line": 13,
"start": 26,
"end": 27
},
{
"text": "\"footer\"",
"type": "attributeValue",
"line": 13,
"start": 27,
"end": 35
},
{
"text": ">",
"type": "startGreater",
"line": 13,
"start": 35,
"end": 36
},
{
"text": "footer",
"type": "text",
"line": 14,
"start": 20,
"end": 26
},
{
"text": "</",
"type": "endLess",
"line": 15,
"start": 16,
"end": 18
},
{
"text": "div",
"type": "nodeName",
"line": 15,
"start": 18,
"end": 21
},
{
"text": ">",
"type": "endGreater",
"line": 15,
"start": 21,
"end": 22
}
]
node对象
{
"type": "root",
"children": [
{
"type": "node",
"name": "div",
"attributes": [
{
"name": "class",
"value": "wrapper"
}
],
"children": [
{
"type": "comment",
"text": "注释"
},
{
"type": "node",
"name": "div",
"attributes": [
{
"name": "class",
"value": "head"
}
],
"children": [
{
"type": "text",
"text": "hello world"
},
{
"type": "text",
"text": "你好世界"
}
]
},
{
"type": "node",
"name": "div",
"attributes": [
{
"name": "class",
"value": "main"
}
],
"children": [
{
"type": "node",
"name": "div",
"attributes": [
{
"name": "class",
"value": "content"
}
],
"children": [
{
"type": "node",
"name": "input",
"attributes": [
{
"name": "type",
"value": "text"
}
]
}
]
}
]
}
]
},
{
"type": "node",
"name": "div",
"attributes": [
{
"name": "class",
"value": "footer"
}
],
"children": [
{
"type": "text",
"text": "footer"
}
]
}
]
}
tokens生成html 高亮
根据tokens生成html:
// Scratch element reused for every escape call (browser-only).
let _d=document.createElement('div')
// Escape text for safe embedding in HTML by round-tripping it through the
// DOM: innerText assignment encodes, innerHTML read returns the entities.
function escapeHTML(text){
_d.innerText=text
return _d.innerHTML
}
/**
 * Render lexer tokens as syntax-highlighted HTML: one styled <span> per
 * token, with a line-number gutter span emitted at the start of each new
 * source line.
 * @param {Array<{text:string,type:string,line:number,start:number,end:number}>} tokens lexer output
 * @param {Object<string,string>} colors inline CSS text keyed by token type
 * @returns {string} a <pre> block containing the highlighted markup
 */
function tokenToHTMLHighlight(tokens,colors){
  let lineSet=new Set(),start=0,html=''
  tokens.forEach(token=>{
    // First token seen on this line: break and emit the gutter number.
    if(!lineSet.has(token.line)){
      lineSet.add(token.line)
      start=0
      if(html!==''){
        html+='\r'
      }
      html+='<span style="color:#dfdede;display:inline-block;width:40px;text-align:right;">'+token.line+'</span>'
    }
    // Re-create the whitespace between the previous token and this one.
    for(let i=start;i<token.start;i++){
      html+=' '
    }
    html+='<span style="'+colors[token.type]+'">'+escapeHTML(token.text)+'</span>'
    start=token.end
    // Fix: dropped `lastLine=token.line` — it assigned an implicit global
    // that nothing ever read.
  })
  return '<pre>'+html+'</pre>'
}
完整代码例子
var addExample = createModuleExample('vue3')
addExample("html lexer parser", function () {
let { toRaw, ref, unref, provide, inject, getCurrentInstance, reactive, shallowReactive, computed, watchEffect, watch, onBeforeMount, onMounted, onBeforeUpdated, onUpdated, onBeforeUnmount, onUnmounted, toRef, toRefs } = Vue;
var html = `<div class="wrapper" >
<!--注释-->
<div class="head">
hello world
你好世界
</div>
<div class="main">
<div class="content">
<input type="text" />
</div>
</div>
</div>
<div class="footer">
footer
</div>
`
/**
* @引用
https://cdn.jsdelivr.net/npm/codemirror@5.62.3/src/util/StringStream.js
*/
/**
 * Adapted from CodeMirror's StringStream: a cursor over one line of text
 * with convenience predicates for lexing.
 * https://cdn.jsdelivr.net/npm/codemirror@5.62.3/src/util/StringStream.js
 */
class StringStream {
  constructor(input, line) {
    this.line = line      // 1-based line number this stream represents
    this.string = input   // the line's text
    this.start = 0        // start of the token currently being read
    this.pos = 0          // read cursor
    this.lineStart = 0
  }
  /** True once the cursor has consumed the entire line. */
  eol() { return this.pos >= this.string.length }
  /** True while the cursor still sits at the start of the line. */
  sol() { return this.pos == this.lineStart }
  /** Look at the next character without consuming it. */
  peek() {
    const ch = this.string.charAt(this.pos)
    return ch === '' ? undefined : ch
  }
  /** Consume and return the next character (undefined at end of line). */
  next() {
    if (this.pos >= this.string.length) return
    const ch = this.string.charAt(this.pos)
    this.pos++
    return ch
  }
  /**
   * Consume the next character only when it matches.
   * @param {string|RegExp|Function} match literal char, regex, or predicate
   * @returns {string|undefined} the consumed character, if any
   */
  eat(match) {
    const ch = this.string.charAt(this.pos)
    let hit
    if (typeof match === "string") {
      hit = ch === match
    } else {
      hit = ch && (match.test ? match.test(ch) : match(ch))
    }
    if (hit) {
      this.pos++
      return ch
    }
  }
  /** Keep eating matching characters; true if anything was consumed. */
  eatWhile(match) {
    const from = this.pos
    while (this.eat(match)) { }
    return this.pos > from
  }
  /** Consume a run of whitespace; true if anything was consumed. */
  eatSpace() {
    const from = this.pos
    while (/[\s\u00a0]/.test(this.string.charAt(this.pos))) this.pos++
    return this.pos > from
  }
  /**
   * Try to match a string or regex at the cursor.
   * Strings: returns true on a match (consuming unless consume === false).
   * Regexes: returns the match array when anchored at the cursor, null when
   * the regex only matches further along the line; consumes on success
   * unless consume === false.
   */
  match(pattern, consume, caseInsensitive) {
    if (typeof pattern === "string") {
      const fold = s => (caseInsensitive ? s.toLowerCase() : s)
      const ahead = this.string.substr(this.pos, pattern.length)
      if (fold(ahead) !== fold(pattern)) return
      if (consume !== false) this.pos += pattern.length
      return true
    }
    const found = this.string.slice(this.pos).match(pattern)
    if (found && found.index > 0) return null
    if (found && consume !== false) this.pos += found[0].length
    return found
  }
  /** The text consumed since `start`. */
  current() { return this.string.slice(this.start, this.pos) }
  /** Jump to the end of the line. */
  skipToEnd() { this.pos = this.string.length }
  /** Jump forward to the next occurrence of ch; true when found. */
  skipTo(ch) {
    const found = this.string.indexOf(ch, this.pos)
    if (found > -1) {
      this.pos = found
      return true
    }
  }
  /** Move the cursor back n characters. */
  backUp(n) { this.pos -= n }
}
// Streaming tokenizer: classifies the characters fed to it by Parser and,
// as a side effect, builds an abstract node tree (this.root) mirroring the
// HTML structure. It is a two-state machine: this.nextToken points at
// either tokenBase (between tags) or a bound tokenElement (inside a tag).
class Tokenizer {
constructor(parser) {
this.parser = parser
this.nextToken = this.tokenBase
// Root of the abstract node tree built while tokenizing.
this.root = {
type: "root",
children: []
}
// Stack of currently-open elements; the top is the insertion parent.
this.stack = []
}
// NOTE(review): despite the name, this returns the node tree, not the
// token list — the tokens are collected on the Parser instance.
getTokens() {
return this.root
}
// Innermost open element, or undefined at the top level.
get currentParent(){
return this.stack[this.stack.length-1]
}
pushStack(el) {
this.stack.push(el)
}
popStack() {
this.stack.pop()
}
// Build a comment node for the tree.
createComment(text) {
return {
type: "comment",
text: text
}
}
// Build an element node; its children array is created lazily by
// addChildren, so leaf elements have no empty children property.
createNode(name) {
return {
type: "node",
name: name,
attributes: []
}
}
// Build a text node for the tree.
createTextNode(text) {
return {
type: "text",
text: text
}
}
// Append node under the innermost open element, or under root.
addChildren(node) {
if(this.currentParent){
if(!this.currentParent.children){
this.currentParent.children=[]
}
this.currentParent.children.push(node)
}else{
this.root.children.push(node)
}
return node
}
// Entry point called by Parser.readToken: skip whitespace, then delegate
// to whichever state function nextToken currently points at.
token(stream) {
if (stream.eatSpace()) {
return
}
return this.nextToken(stream)
}
// Base state: handles '<' and plain text between tags.
tokenBase(stream) {
let ch = stream.next()
// A '<' marker has three cases: 1: <div>  2: <!---->  3: </div>
if (ch == '<') {
// Case 2: a comment — consume it whole and record its inner text.
let m;
if (m = stream.match(/^!--([\s\S]+?)-->/, true)) {
this.addChildren(this.createComment(m[1]))
return 'comment'
} else {
// Closing tag ("</") or opening tag ("<")?
let isEnd = !!stream.eat('/')
// Switch to the element state for the tag name / attributes.
this.nextToken = this.tokenElement.bind(this, isEnd,null)
return isEnd?'endLess':'startLess'
}
}else{
// Plain text: everything up to the next '<'.
if(stream.eatWhile(/[^<]/)){
this.addChildren(this.createTextNode(stream.current()))
return 'text'
}
}
}
/**
 * Element state: parses the tag name, attributes and the closing '>'.
 * @param {boolean} isEnd whether we are inside a closing tag ("</...")
 * @param {any} currentNode the element node being built; null until the
 *        tag name has been read (rebinding below fills it in)
 */
tokenElement(isEnd,currentNode, stream) {
var ch = stream.next(),
// Set when the tag is self-closing ("/>").
autoClosed=false;
// Tag terminator: '>' or '/>' returns us to the base state.
if (ch == '>' || (autoClosed=(ch == '/' && stream.eat('>')))) {
this.nextToken = this.tokenBase
if(autoClosed||isEnd){
this.popStack()
}
// Distinguish the '>' of a closing tag from an opening tag's.
if(isEnd){
return 'endGreater'
}else{
return 'startGreater'
}
} else if (ch === '=') {
// NOTE(review): 'attrubuteEqual' is misspelled, but the highlight color
// map keys off the same spelling, so it is kept as-is.
return 'attrubuteEqual'
} else if (ch == '"' || ch == "'") {
// Quoted attribute value: read until the matching unescaped quote.
let next, escaped = false,quote=ch,value='';
while ((next = stream.next()) != null) {
// An escaped quote does not terminate the value.
if (!escaped && next == quote) {
break
}
value+=next
escaped = !escaped && next == '\\' // track backslash escapes
}
// Attach the value to the most recently collected attribute name.
currentNode.attributes[currentNode.attributes.length-1].value=value
return 'attributeValue'
} else {
// Collect a run of name characters: tag name or attribute name.
if (stream.eatWhile(/[^\s=>]/)) {
// Inside an opening tag whose name is known: this is an attribute.
if (!isEnd && currentNode) {
let attr = { name: stream.current()}
currentNode.attributes.push(attr)
return 'attributeName'
}
// Opening tag name: create the node, open it on the stack, and
// rebind the element state so later calls see it as currentNode.
if (!isEnd) {
let node = this.createNode(stream.current())
this.addChildren(node)
this.pushStack(node)
this.nextToken=this.tokenElement.bind(this,isEnd,node)
return 'nodeName'
}
// Closing tag name: nothing to build, just classify it.
if (isEnd) {
return 'nodeName'
}
}
}
}
}
/** A single lexical token with its classification and source position. */
class Token {
  /**
   * @param {string} text  raw source text of the token
   * @param {string} type  token category (nodeName, attributeName, ...)
   * @param {number} line  1-based line number the token appears on
   * @param {number} start column where the token begins
   * @param {number} end   column just past the token's last character
   */
  constructor(text, type, line, start, end) {
    Object.assign(this, { text, type, line, start, end })
  }
}
/** Drives the Tokenizer over the input, one line at a time. */
class Parser {
  /** @param {string} input raw HTML source */
  constructor(input) {
    this.lines = input.split(/\r\n?|\n|\u2028|\u2029/)
    this.tokenizer = new Tokenizer(this)
    this.tokens = [] // collected Token instances, in source order
    this.curLine = 0
  }
  /** Convenience entry point: parse input and return the finished Parser. */
  static parse(input) {
    return new Parser(input).read()
  }
  /** Scan every line, pushing one Token per recognized lexeme. */
  read() {
    for (; this.curLine < this.lines.length; this.curLine++) {
      const stream = new StringStream(this.lines[this.curLine], this.curLine + 1)
      stream.start = stream.pos = 0
      // Pull tokens until the line is exhausted.
      while (!stream.eol()) {
        const type = this.readToken(stream)
        if (type) {
          this.tokens.push(new Token(stream.current(), type, stream.line, stream.start, stream.pos))
        }
        stream.start = stream.pos
      }
    }
    return this
  }
  /**
   * Ask the tokenizer for the next token type. If the stream makes no
   * progress after a handful of attempts, the input cannot be parsed here.
   */
  readToken(stream) {
    for (let attempt = 0; attempt < 5; attempt++) {
      const style = this.tokenizer.token(stream)
      if (stream.pos > stream.start) {
        return style
      }
    }
    throw new Error('行号:' + stream.line + ',解析有问题')
  }
}
let parser = Parser.parse(html)
console.log('tokens', JSON.stringify(parser.tokens))
console.log('nodes', JSON.stringify(parser.tokenizer.root,null,2))
// Scratch element reused for every escape call (browser-only).
let _d=document.createElement('div')
// Escape text for safe embedding in HTML by round-tripping it through the
// DOM: innerText assignment encodes, innerHTML read returns the entities.
function escapeHTML(text){
_d.innerText=text
return _d.innerHTML
}
/**
 * Render lexer tokens as syntax-highlighted HTML: one styled <span> per
 * token, with a line-number gutter span emitted at the start of each new
 * source line.
 * @param {Array<{text:string,type:string,line:number,start:number,end:number}>} tokens lexer output
 * @param {Object<string,string>} colors inline CSS text keyed by token type
 * @returns {string} a <pre> block containing the highlighted markup
 */
function tokenToHTMLHighlight(tokens,colors){
  let lineSet=new Set(),start=0,html=''
  tokens.forEach(token=>{
    // First token seen on this line: break and emit the gutter number.
    if(!lineSet.has(token.line)){
      lineSet.add(token.line)
      start=0
      if(html!==''){
        html+='\r'
      }
      html+='<span style="color:#dfdede;display:inline-block;width:40px;text-align:right;">'+token.line+'</span>'
    }
    // Re-create the whitespace between the previous token and this one.
    for(let i=start;i<token.start;i++){
      html+=' '
    }
    html+='<span style="'+colors[token.type]+'">'+escapeHTML(token.text)+'</span>'
    start=token.end
    // Fix: dropped `lastLine=token.line` — it assigned an implicit global
    // that nothing ever read.
  })
  return '<pre>'+html+'</pre>'
}
return {
template: `<div><div ref="main" v-html="html"></div></div>`,
setup(props, ctx) {
let container = ref();
onMounted(() => {
})
return {
html:tokenToHTMLHighlight(parser.tokens,{
nodeName:'color:#01aca7',
attributeValue:'color:#cd6d02',
attributeName:'color:#1428d1',
attrubuteEqual:'color:red',
startLess:'color:#a00298',
startGreater:'color:#a00298',
endLess:'color:#a00298',
endGreater:'color:#a00298',
comment:'color:green',
text:'color:#898484'
}),
main: container
}
}
}
})
更多推荐
已为社区贡献2条内容
所有评论(0)