一个简单的 HTML 词法分析器
将一段 HTML 进行扫描:在读取字符流的过程中逐个字符进行分析、对词进行分类,把每个词添加到 token 数组中并标识其身份;同时对每条语句进行分析并抽象化,最终生成一个抽象树结构的元素对象。
将一段html进行扫描,在扫描读取字符流过程中会逐个字的分析,进行词的分类。把每个词添加token数组中,进行身份标识。并且对每个语句进行分析,然后把它抽象化。变成一个抽象化树结构的元素对象。
目录
举例:
HTML代码
<div class="wrapper" >
<!--注释-->
<div class="head">
hello world
你好世界
</div>
<div class="main">
<div class="content">
<input type="text" />
</div>
</div>
</div>
<div class="footer">
footer
</div>
会通过上面这段html,分析收集生成一个 tokens数组,还会生成一个root抽象化的对象
tokens:属性名、属性值、标签名、左尖括号、右尖括号、注释、文本
node:Array<{type:string,attributes:Array<{name:string,value:any}>,children?:any[]}>
Tokens数组
[
{
"text": "<",
"type": "startLess",
"line": 1,
"start": 0,
"end": 1
},
{
"text": "div",
"type": "nodeName",
"line": 1,
"start": 1,
"end": 4
},
{
"text": "class",
"type": "attributeName",
"line": 1,
"start": 5,
"end": 10
},
{
"text": "=",
"type": "attrubuteEqual",
"line": 1,
"start": 10,
"end": 11
},
{
"text": "\"wrapper\"",
"type": "attributeValue",
"line": 1,
"start": 11,
"end": 20
},
{
"text": ">",
"type": "startGreater",
"line": 1,
"start": 21,
"end": 22
},
{
"text": "<!--注释-->",
"type": "comment",
"line": 2,
"start": 21,
"end": 30
},
{
"text": "<",
"type": "startLess",
"line": 3,
"start": 20,
"end": 21
},
{
"text": "div",
"type": "nodeName",
"line": 3,
"start": 21,
"end": 24
},
{
"text": "class",
"type": "attributeName",
"line": 3,
"start": 25,
"end": 30
},
{
"text": "=",
"type": "attrubuteEqual",
"line": 3,
"start": 30,
"end": 31
},
{
"text": "\"head\"",
"type": "attributeValue",
"line": 3,
"start": 31,
"end": 37
},
{
"text": ">",
"type": "startGreater",
"line": 3,
"start": 37,
"end": 38
},
{
"text": "hello world",
"type": "text",
"line": 4,
"start": 24,
"end": 35
},
{
"text": "你好世界",
"type": "text",
"line": 5,
"start": 24,
"end": 28
},
{
"text": "</",
"type": "endLess",
"line": 6,
"start": 20,
"end": 22
},
{
"text": "div",
"type": "nodeName",
"line": 6,
"start": 22,
"end": 25
},
{
"text": ">",
"type": "endGreater",
"line": 6,
"start": 25,
"end": 26
},
{
"text": "<",
"type": "startLess",
"line": 7,
"start": 20,
"end": 21
},
{
"text": "div",
"type": "nodeName",
"line": 7,
"start": 21,
"end": 24
},
{
"text": "class",
"type": "attributeName",
"line": 7,
"start": 25,
"end": 30
},
{
"text": "=",
"type": "attrubuteEqual",
"line": 7,
"start": 30,
"end": 31
},
{
"text": "\"main\"",
"type": "attributeValue",
"line": 7,
"start": 31,
"end": 37
},
{
"text": ">",
"type": "startGreater",
"line": 7,
"start": 37,
"end": 38
},
{
"text": "<",
"type": "startLess",
"line": 8,
"start": 24,
"end": 25
},
{
"text": "div",
"type": "nodeName",
"line": 8,
"start": 25,
"end": 28
},
{
"text": "class",
"type": "attributeName",
"line": 8,
"start": 29,
"end": 34
},
{
"text": "=",
"type": "attrubuteEqual",
"line": 8,
"start": 34,
"end": 35
},
{
"text": "\"content\"",
"type": "attributeValue",
"line": 8,
"start": 35,
"end": 44
},
{
"text": ">",
"type": "startGreater",
"line": 8,
"start": 44,
"end": 45
},
{
"text": "<",
"type": "startLess",
"line": 9,
"start": 28,
"end": 29
},
{
"text": "input",
"type": "nodeName",
"line": 9,
"start": 29,
"end": 34
},
{
"text": "type",
"type": "attributeName",
"line": 9,
"start": 35,
"end": 39
},
{
"text": "=",
"type": "attrubuteEqual",
"line": 9,
"start": 39,
"end": 40
},
{
"text": "\"text\"",
"type": "attributeValue",
"line": 9,
"start": 40,
"end": 46
},
{
"text": "/>",
"type": "startGreater",
"line": 9,
"start": 47,
"end": 49
},
{
"text": "</",
"type": "endLess",
"line": 10,
"start": 24,
"end": 26
},
{
"text": "div",
"type": "nodeName",
"line": 10,
"start": 26,
"end": 29
},
{
"text": ">",
"type": "endGreater",
"line": 10,
"start": 29,
"end": 30
},
{
"text": "</",
"type": "endLess",
"line": 11,
"start": 20,
"end": 22
},
{
"text": "div",
"type": "nodeName",
"line": 11,
"start": 22,
"end": 25
},
{
"text": ">",
"type": "endGreater",
"line": 11,
"start": 25,
"end": 26
},
{
"text": "</",
"type": "endLess",
"line": 12,
"start": 16,
"end": 18
},
{
"text": "div",
"type": "nodeName",
"line": 12,
"start": 18,
"end": 21
},
{
"text": ">",
"type": "endGreater",
"line": 12,
"start": 21,
"end": 22
},
{
"text": "<",
"type": "startLess",
"line": 13,
"start": 16,
"end": 17
},
{
"text": "div",
"type": "nodeName",
"line": 13,
"start": 17,
"end": 20
},
{
"text": "class",
"type": "attributeName",
"line": 13,
"start": 21,
"end": 26
},
{
"text": "=",
"type": "attrubuteEqual",
"line": 13,
"start": 26,
"end": 27
},
{
"text": "\"footer\"",
"type": "attributeValue",
"line": 13,
"start": 27,
"end": 35
},
{
"text": ">",
"type": "startGreater",
"line": 13,
"start": 35,
"end": 36
},
{
"text": "footer",
"type": "text",
"line": 14,
"start": 20,
"end": 26
},
{
"text": "</",
"type": "endLess",
"line": 15,
"start": 16,
"end": 18
},
{
"text": "div",
"type": "nodeName",
"line": 15,
"start": 18,
"end": 21
},
{
"text": ">",
"type": "endGreater",
"line": 15,
"start": 21,
"end": 22
}
]
node对象
{
"type": "root",
"children": [
{
"type": "node",
"name": "div",
"attributes": [
{
"name": "class",
"value": "wrapper"
}
],
"children": [
{
"type": "comment",
"text": "注释"
},
{
"type": "node",
"name": "div",
"attributes": [
{
"name": "class",
"value": "head"
}
],
"children": [
{
"type": "text",
"text": "hello world"
},
{
"type": "text",
"text": "你好世界"
}
]
},
{
"type": "node",
"name": "div",
"attributes": [
{
"name": "class",
"value": "main"
}
],
"children": [
{
"type": "node",
"name": "div",
"attributes": [
{
"name": "class",
"value": "content"
}
],
"children": [
{
"type": "node",
"name": "input",
"attributes": [
{
"name": "type",
"value": "text"
}
]
}
]
}
]
}
]
},
{
"type": "node",
"name": "div",
"attributes": [
{
"name": "class",
"value": "footer"
}
],
"children": [
{
"type": "text",
"text": "footer"
}
]
}
]
}
tokens生成html 高亮
根据tokens生成html:
// Scratch element reused for every escape call (browser-only).
let _d=document.createElement('div')
// Escape text for safe embedding in HTML by round-tripping it through the
// DOM: innerText assignment encodes, innerHTML read returns the entities.
function escapeHTML(text){
_d.innerText=text
return _d.innerHTML
}
/**
 * Render lexer tokens as syntax-highlighted HTML: one styled <span> per
 * token, with a line-number gutter span emitted at the start of each new
 * source line.
 * @param {Array<{text:string,type:string,line:number,start:number,end:number}>} tokens lexer output
 * @param {Object<string,string>} colors inline CSS text keyed by token type
 * @returns {string} a <pre> block containing the highlighted markup
 */
function tokenToHTMLHighlight(tokens,colors){
  let lineSet=new Set(),start=0,html=''
  tokens.forEach(token=>{
    // First token seen on this line: break and emit the gutter number.
    if(!lineSet.has(token.line)){
      lineSet.add(token.line)
      start=0
      if(html!==''){
        html+='\r'
      }
      html+='<span style="color:#dfdede;display:inline-block;width:40px;text-align:right;">'+token.line+'</span>'
    }
    // Re-create the whitespace between the previous token and this one.
    for(let i=start;i<token.start;i++){
      html+=' '
    }
    html+='<span style="'+colors[token.type]+'">'+escapeHTML(token.text)+'</span>'
    start=token.end
    // Fix: dropped `lastLine=token.line` — it assigned an implicit global
    // that nothing ever read.
  })
  return '<pre>'+html+'</pre>'
}
完整代码例子
var addExample = createModuleExample('vue3')
addExample("html lexer parser", function () {
let { toRaw, ref, unref, provide, inject, getCurrentInstance, reactive, shallowReactive, computed, watchEffect, watch, onBeforeMount, onMounted, onBeforeUpdated, onUpdated, onBeforeUnmount, onUnmounted, toRef, toRefs } = Vue;
var html = `<div class="wrapper" >
<!--注释-->
<div class="head">
hello world
你好世界
</div>
<div class="main">
<div class="content">
<input type="text" />
</div>
</div>
</div>
<div class="footer">
footer
</div>
`
/**
* @引用
https://cdn.jsdelivr.net/npm/codemirror@5.62.3/src/util/StringStream.js
*/
/**
 * Adapted from CodeMirror's StringStream: a cursor over one line of text
 * with convenience predicates for lexing.
 * https://cdn.jsdelivr.net/npm/codemirror@5.62.3/src/util/StringStream.js
 */
class StringStream {
  constructor(input, line) {
    this.line = line      // 1-based line number this stream represents
    this.string = input   // the line's text
    this.start = 0        // start of the token currently being read
    this.pos = 0          // read cursor
    this.lineStart = 0
  }
  /** True once the cursor has consumed the entire line. */
  eol() { return this.pos >= this.string.length }
  /** True while the cursor still sits at the start of the line. */
  sol() { return this.pos == this.lineStart }
  /** Look at the next character without consuming it. */
  peek() {
    const ch = this.string.charAt(this.pos)
    return ch === '' ? undefined : ch
  }
  /** Consume and return the next character (undefined at end of line). */
  next() {
    if (this.pos >= this.string.length) return
    const ch = this.string.charAt(this.pos)
    this.pos++
    return ch
  }
  /**
   * Consume the next character only when it matches.
   * @param {string|RegExp|Function} match literal char, regex, or predicate
   * @returns {string|undefined} the consumed character, if any
   */
  eat(match) {
    const ch = this.string.charAt(this.pos)
    let hit
    if (typeof match === "string") {
      hit = ch === match
    } else {
      hit = ch && (match.test ? match.test(ch) : match(ch))
    }
    if (hit) {
      this.pos++
      return ch
    }
  }
  /** Keep eating matching characters; true if anything was consumed. */
  eatWhile(match) {
    const from = this.pos
    while (this.eat(match)) { }
    return this.pos > from
  }
  /** Consume a run of whitespace; true if anything was consumed. */
  eatSpace() {
    const from = this.pos
    while (/[\s\u00a0]/.test(this.string.charAt(this.pos))) this.pos++
    return this.pos > from
  }
  /**
   * Try to match a string or regex at the cursor.
   * Strings: returns true on a match (consuming unless consume === false).
   * Regexes: returns the match array when anchored at the cursor, null when
   * the regex only matches further along the line; consumes on success
   * unless consume === false.
   */
  match(pattern, consume, caseInsensitive) {
    if (typeof pattern === "string") {
      const fold = s => (caseInsensitive ? s.toLowerCase() : s)
      const ahead = this.string.substr(this.pos, pattern.length)
      if (fold(ahead) !== fold(pattern)) return
      if (consume !== false) this.pos += pattern.length
      return true
    }
    const found = this.string.slice(this.pos).match(pattern)
    if (found && found.index > 0) return null
    if (found && consume !== false) this.pos += found[0].length
    return found
  }
  /** The text consumed since `start`. */
  current() { return this.string.slice(this.start, this.pos) }
  /** Jump to the end of the line. */
  skipToEnd() { this.pos = this.string.length }
  /** Jump forward to the next occurrence of ch; true when found. */
  skipTo(ch) {
    const found = this.string.indexOf(ch, this.pos)
    if (found > -1) {
      this.pos = found
      return true
    }
  }
  /** Move the cursor back n characters. */
  backUp(n) { this.pos -= n }
}
// Streaming tokenizer: classifies the characters fed to it by Parser and,
// as a side effect, builds an abstract node tree (this.root) mirroring the
// HTML structure. It is a two-state machine: this.nextToken points at
// either tokenBase (between tags) or a bound tokenElement (inside a tag).
class Tokenizer {
constructor(parser) {
this.parser = parser
this.nextToken = this.tokenBase
// Root of the abstract node tree built while tokenizing.
this.root = {
type: "root",
children: []
}
// Stack of currently-open elements; the top is the insertion parent.
this.stack = []
}
// NOTE(review): despite the name, this returns the node tree, not the
// token list — the tokens are collected on the Parser instance.
getTokens() {
return this.root
}
// Innermost open element, or undefined at the top level.
get currentParent(){
return this.stack[this.stack.length-1]
}
pushStack(el) {
this.stack.push(el)
}
popStack() {
this.stack.pop()
}
// Build a comment node for the tree.
createComment(text) {
return {
type: "comment",
text: text
}
}
// Build an element node; its children array is created lazily by
// addChildren, so leaf elements have no empty children property.
createNode(name) {
return {
type: "node",
name: name,
attributes: []
}
}
// Build a text node for the tree.
createTextNode(text) {
return {
type: "text",
text: text
}
}
// Append node under the innermost open element, or under root.
addChildren(node) {
if(this.currentParent){
if(!this.currentParent.children){
this.currentParent.children=[]
}
this.currentParent.children.push(node)
}else{
this.root.children.push(node)
}
return node
}
// Entry point called by Parser.readToken: skip whitespace, then delegate
// to whichever state function nextToken currently points at.
token(stream) {
if (stream.eatSpace()) {
return
}
return this.nextToken(stream)
}
// Base state: handles '<' and plain text between tags.
tokenBase(stream) {
let ch = stream.next()
// A '<' marker has three cases: 1: <div>  2: <!---->  3: </div>
if (ch == '<') {
// Case 2: a comment — consume it whole and record its inner text.
let m;
if (m = stream.match(/^!--([\s\S]+?)-->/, true)) {
this.addChildren(this.createComment(m[1]))
return 'comment'
} else {
// Closing tag ("</") or opening tag ("<")?
let isEnd = !!stream.eat('/')
// Switch to the element state for the tag name / attributes.
this.nextToken = this.tokenElement.bind(this, isEnd,null)
return isEnd?'endLess':'startLess'
}
}else{
// Plain text: everything up to the next '<'.
if(stream.eatWhile(/[^<]/)){
this.addChildren(this.createTextNode(stream.current()))
return 'text'
}
}
}
/**
 * Element state: parses the tag name, attributes and the closing '>'.
 * @param {boolean} isEnd whether we are inside a closing tag ("</...")
 * @param {any} currentNode the element node being built; null until the
 *        tag name has been read (rebinding below fills it in)
 */
tokenElement(isEnd,currentNode, stream) {
var ch = stream.next(),
// Set when the tag is self-closing ("/>").
autoClosed=false;
// Tag terminator: '>' or '/>' returns us to the base state.
if (ch == '>' || (autoClosed=(ch == '/' && stream.eat('>')))) {
this.nextToken = this.tokenBase
if(autoClosed||isEnd){
this.popStack()
}
// Distinguish the '>' of a closing tag from an opening tag's.
if(isEnd){
return 'endGreater'
}else{
return 'startGreater'
}
} else if (ch === '=') {
// NOTE(review): 'attrubuteEqual' is misspelled, but the highlight color
// map keys off the same spelling, so it is kept as-is.
return 'attrubuteEqual'
} else if (ch == '"' || ch == "'") {
// Quoted attribute value: read until the matching unescaped quote.
let next, escaped = false,quote=ch,value='';
while ((next = stream.next()) != null) {
// An escaped quote does not terminate the value.
if (!escaped && next == quote) {
break
}
value+=next
escaped = !escaped && next == '\\' // track backslash escapes
}
// Attach the value to the most recently collected attribute name.
currentNode.attributes[currentNode.attributes.length-1].value=value
return 'attributeValue'
} else {
// Collect a run of name characters: tag name or attribute name.
if (stream.eatWhile(/[^\s=>]/)) {
// Inside an opening tag whose name is known: this is an attribute.
if (!isEnd && currentNode) {
let attr = { name: stream.current()}
currentNode.attributes.push(attr)
return 'attributeName'
}
// Opening tag name: create the node, open it on the stack, and
// rebind the element state so later calls see it as currentNode.
if (!isEnd) {
let node = this.createNode(stream.current())
this.addChildren(node)
this.pushStack(node)
this.nextToken=this.tokenElement.bind(this,isEnd,node)
return 'nodeName'
}
// Closing tag name: nothing to build, just classify it.
if (isEnd) {
return 'nodeName'
}
}
}
}
}
/** A single lexical token with its classification and source position. */
class Token {
  /**
   * @param {string} text  raw source text of the token
   * @param {string} type  token category (nodeName, attributeName, ...)
   * @param {number} line  1-based line number the token appears on
   * @param {number} start column where the token begins
   * @param {number} end   column just past the token's last character
   */
  constructor(text, type, line, start, end) {
    Object.assign(this, { text, type, line, start, end })
  }
}
/** Drives the Tokenizer over the input, one line at a time. */
class Parser {
  /** @param {string} input raw HTML source */
  constructor(input) {
    this.lines = input.split(/\r\n?|\n|\u2028|\u2029/)
    this.tokenizer = new Tokenizer(this)
    this.tokens = [] // collected Token instances, in source order
    this.curLine = 0
  }
  /** Convenience entry point: parse input and return the finished Parser. */
  static parse(input) {
    return new Parser(input).read()
  }
  /** Scan every line, pushing one Token per recognized lexeme. */
  read() {
    for (; this.curLine < this.lines.length; this.curLine++) {
      const stream = new StringStream(this.lines[this.curLine], this.curLine + 1)
      stream.start = stream.pos = 0
      // Pull tokens until the line is exhausted.
      while (!stream.eol()) {
        const type = this.readToken(stream)
        if (type) {
          this.tokens.push(new Token(stream.current(), type, stream.line, stream.start, stream.pos))
        }
        stream.start = stream.pos
      }
    }
    return this
  }
  /**
   * Ask the tokenizer for the next token type. If the stream makes no
   * progress after a handful of attempts, the input cannot be parsed here.
   */
  readToken(stream) {
    for (let attempt = 0; attempt < 5; attempt++) {
      const style = this.tokenizer.token(stream)
      if (stream.pos > stream.start) {
        return style
      }
    }
    throw new Error('行号:' + stream.line + ',解析有问题')
  }
}
let parser = Parser.parse(html)
console.log('tokens', JSON.stringify(parser.tokens))
console.log('nodes', JSON.stringify(parser.tokenizer.root,null,2))
// Scratch element reused for every escape call (browser-only).
let _d=document.createElement('div')
// Escape text for safe embedding in HTML by round-tripping it through the
// DOM: innerText assignment encodes, innerHTML read returns the entities.
function escapeHTML(text){
_d.innerText=text
return _d.innerHTML
}
/**
 * Render lexer tokens as syntax-highlighted HTML: one styled <span> per
 * token, with a line-number gutter span emitted at the start of each new
 * source line.
 * @param {Array<{text:string,type:string,line:number,start:number,end:number}>} tokens lexer output
 * @param {Object<string,string>} colors inline CSS text keyed by token type
 * @returns {string} a <pre> block containing the highlighted markup
 */
function tokenToHTMLHighlight(tokens,colors){
  let lineSet=new Set(),start=0,html=''
  tokens.forEach(token=>{
    // First token seen on this line: break and emit the gutter number.
    if(!lineSet.has(token.line)){
      lineSet.add(token.line)
      start=0
      if(html!==''){
        html+='\r'
      }
      html+='<span style="color:#dfdede;display:inline-block;width:40px;text-align:right;">'+token.line+'</span>'
    }
    // Re-create the whitespace between the previous token and this one.
    for(let i=start;i<token.start;i++){
      html+=' '
    }
    html+='<span style="'+colors[token.type]+'">'+escapeHTML(token.text)+'</span>'
    start=token.end
    // Fix: dropped `lastLine=token.line` — it assigned an implicit global
    // that nothing ever read.
  })
  return '<pre>'+html+'</pre>'
}
return {
template: `<div><div ref="main" v-html="html"></div></div>`,
setup(props, ctx) {
let container = ref();
onMounted(() => {
})
return {
html:tokenToHTMLHighlight(parser.tokens,{
nodeName:'color:#01aca7',
attributeValue:'color:#cd6d02',
attributeName:'color:#1428d1',
attrubuteEqual:'color:red',
startLess:'color:#a00298',
startGreater:'color:#a00298',
endLess:'color:#a00298',
endGreater:'color:#a00298',
comment:'color:green',
text:'color:#898484'
}),
main: container
}
}
}
})
更多推荐
已为社区贡献2条内容
所有评论(0)