编译原理--词法分析器(python语言实现)
词法分析器最近在学习编译原理。由于实验要求有词法分析器,这里我就先记录一下词法分析器实现过程以及具体思路。目标语言此处我选择的目标语言是c语言的子集来进行词法分析。实现语言此处我选用的语言是python,主要还是考虑到python的数据结构比较强大而且包容性强。并且我pyqt用的比较熟练,很容易设计出GUI界面。...
词法分析器
最近在学习编译原理。由于实验要求有词法分析器,这里我就先记录一下词法分析器实现过程以及具体思路。
目标语言
此处我选择的目标语言是c语言的子集来进行词法分析。
实现语言
此处我选用的语言是python,主要还是考虑到python的数据结构比较强大而且包容性强。并且我pyqt用的比较熟练,很容易设计出GUI界面。关于pyqt的相关内容网上资料比较少对初学者不是很友好,我下面会出一些关于pyqt的教程,还望持续关注!
词法分析器主要工作
- 从源程序文件中读入字符
- 统计行数和列数用于进行错误定位
- 识别出单词并用(内码,属性)二元式表示
- 识别出错误记录报告错误但不会终止扫描
- 填写标识符表
设计思路
词法分析不必要设计成单独的一遍,我认为词法分析器应该设计成一个子程序,每当语法分析需要一个单词符号时,那么此时向词法分析器传递一个输入串,词法分析器便要能分析出这个输入串中的单词。
设计流程图
算法思路
对于单词的分析,关键在于第一个字符的性质:第一个字符决定了后续的单词分析流程。如果第一个字符是数字,那么接下来就要判断这个单词是否为常量;此后读到的字符如果是字母,除了e或E(指数标记)之外,都可以直接判定该单词非法(error)。因此可以把这部分逻辑单独分离成一个函数,这里我取名为IsDigits()函数。标识符的判定以及算术或逻辑运算符的判定也可以按照同样的思路分离出相应的函数。
函数表
源程序代码
一些初始设定
self.reserveWord = ["auto", "break", "case", "char", "const", "continue",
"default", "do", "double", "else", "enum", "extern",
"float", "for", "goto", "if", "int", "long",
"register", "return", "short", "signed", "sizeof", "static",
"struct", "switch", "typedef", "union", "unsigned", "void",
"volatile", "while"] # c++中的关键字
self.operatorOrDelimiter = ["+", "-", "*", "/", "<", "<=", ">", ">=", "=", "==",
"!=", ";", "(", ")", "^", ",", "\"", "\'", "#", "&",
"&&", "|", "||", "%", "~", "<<", ">>", "[", "]", "{",
"}", "\\", ".", ":", "!"] # c++中的一些符号
self.Delimiter = [";", "(", ")", ",", "#", "[", "]", "{", "}", "\\"]
self.RelationOperation = ["<", "<=", ">", ">=", "=", "==", "!=", "^", "&", "&&", "|", "||", "<<", ">>", "!"]
self.Operator = ["+", "-", "*", "/", "%", "~", "+=", "*=", "/=", "-=",
"++", "--"]
处理开头以数字开头的字符串
这里分为四种情况
部分代码
# 判断数字的代码
def IsDigits(self, inString, pos):
    """Collect a numeric literal from the start of ``inString`` into ``self.token``.

    Digits, at most one decimal point (only before an exponent) and at most
    one exponent marker (``e`` or ``E``) are appended to ``self.token``.
    Scanning stops at the first other character.

    :param inString: text whose first character starts the number
    :param pos: position counter on entry; it is advanced once for every
        character examined, including the character that stopped the scan
    :return: ``(flag, pos)``; ``flag`` is True when the literal was ended by
        an operator/delimiter, a space, a newline or the end of the text
    """
    flag = False
    for i in inString:
        pos += 1
        if i.isdigit():
            self.token += str(i)
            flag = True
        elif i == '.' and i not in self.token and 'e' not in self.token and 'E' not in self.token:
            self.token += str(i)
        # The original test was ``i == 'e' or i == 'E' and ...``; ``and`` binds
        # tighter than ``or``, so every further 'e' was accepted.  Only one
        # exponent marker, in either case, is legal.
        elif i in ('e', 'E') and 'e' not in self.token and 'E' not in self.token:
            self.token += str(i)
        else:
            # Anything else ends the literal; it is valid only if the
            # terminating character is a known symbol or whitespace.
            flag = i in self.operatorOrDelimiter or i == ' ' or i == '\n'
            break
    return flag, pos
对应的scan()函数中处理数字开头的代码
if inString[0].isdigit():
judge, index = self.IsDigits(inString, 0)
if judge:
"""if '.' in self.token: # 此处是对常量的转化过程此处写成注释
print("--------")
print(float(self.token))
print("--------")
elif 'e' not in self.token and 'E' not in self.token:
print("--------")
print(int(self.token))
print("--------")
else:
num1 = 0
num2 = 0
if 'E' in self.token:
l = self.token.split('E')
if '.' in l[0]:
num1 = float(l[0])
else:
num1 = int(l[0])
num2 = int(l[1])
elif 'e' in self.token:
l = self.token.split('e')
if '.' in l[0]:
num1 = float(l[0])
else:
num1 = int(l[0])
num2 = int(l[1])
for i in range(0, num2):
num1 *= 10
print("--------")
print(num1)
print("--------")"""
self.result.append([self.token, "常数", (row, col)])
else:
self.result.append([self.token, "ERROR", (row, col)])
if index - 1 < len(inString) and index - 1 > 0:
if index == len(inString):
print(inString[len(inString) - 1])
self.scan(inString[len(inString) - 1], row, col)
else:
self.scan(inString[index - 1:], row, col)
处理以字母开头的字符串
此处以字母开头的字符串可能出现的情况为:
- 标识符
- 关键字
- 非法字符串
对于这里的判断要注意几种情况:
- 类似i++这种一个标识符后面跟着算术或逻辑运算符
- i;后面跟着终结符这种比较好判断
- 单独的一个标识符或关键字 如 int这种。
部分代码
def isReserve(self, target):
    """Return True when ``target`` is one of the reserved words in ``self.reserveWord``."""
    return target in self.reserveWord
def isMark(self, inString, pos):
    """Collect an identifier/keyword from the start of ``inString`` into ``self.token``.

    Letters, digits and underscores are appended to ``self.token``; the scan
    stops at the first other character.

    :param inString: text whose first character starts the word
    :param pos: position counter on entry; it is advanced once for every
        character examined, including the character that stopped the scan
    :return: ``(flag, pos)``; ``flag`` is False when the text is empty or the
        word was cut short by a character that is not a known
        operator/delimiter
    """
    flag = False
    for i in inString:
        pos += 1
        if i.isalpha() or i.isdigit() or i == '_':
            self.token += str(i)
            flag = True
        elif i in self.operatorOrDelimiter:
            # An operator or delimiter ends the word normally.
            flag = True
            break
        else:
            # An illegal character also ends the word.  Without this break the
            # loop kept collecting, so "a@b" produced the identifier "ab".
            flag = False
            break
    return flag, pos
对应scan()中以字母开头部分代码
elif inString[0].isalpha():
judge, index = self.isMark(inString, 0)
if self.isReserve(self.token):
self.result.append([self.token, "关键字", (row, col)])
else:
self.result.append([self.token, "标识符", (row, col)])
if index <= len(inString) and not inString[index - 1].isalpha():
self.scan(inString[index - 1:], row, col)
elif inString[0] == '\'':
judge, index = self.isChar(inString, 0)
if judge:
self.result.append([self.token, "字符常量", (row, col)])
if index < len(inString) and index - 1 >= 0:
self.scan(inString[index - 1:], row, col)
elif inString[0] == '\"':
judge, index = self.isString(inString, 0)
index = index + 1 # 最后一个”不能算
if judge:
self.result.append([self.token, "字符串常量", (row, col)])
if index < len(inString) and index - 1 >= 0:
self.scan(inString[index - 1:], row, col)
elif inString[0] in self.Operator or inString[0] in self.RelationOperation:
index = self.IsOperator(inString, 0)
if self.token in self.Operator:
self.result.append([self.token, "算术运算符", (row, col)])
elif self.token in self.RelationOperation:
self.result.append([self.token, "关系运算符", (row, col)])
else:
self.result.append([self.token, "ERROR", (row, col)])
if index <= len(inString) and index - 1 >= 0:
self.scan(inString[index - 1:], row, col)
处理一些算术/逻辑运算符的情况
这里主要是考虑如下一些情况
- 运算符在开头这里也是有可能的比如 ++i;这在c以及c++中均为合法语句,词法分析器应该能够识别出这一个串中的++、i单词
- 运算符在内部,这也是有可能的比如a+b,词法分析器也要识别出a、+、b这几个单词
部分代码
def IsOperator(self, inString, pos):
    """Append the leading run of operator/delimiter characters to ``self.token``.

    :param inString: text whose first character starts the operator
    :param pos: position counter on entry
    :return: ``pos`` unchanged when ``inString`` is a single character;
        otherwise ``pos`` plus the number of characters examined, including
        the character that stopped the run
    """
    if len(inString) == 1:
        self.token += str(inString[0])
        return pos
    examined = 0
    for examined, symbol in enumerate(inString, start=1):
        if symbol not in self.operatorOrDelimiter:
            break
        self.token += str(symbol)
    return pos + examined
scan()函数部分代码
elif inString[0] in self.Operator or inString[0] in self.RelationOperation:
index = self.IsOperator(inString, 0)
if self.token in self.Operator:
self.result.append([self.token, "算术运算符", (row, col)])
elif self.token in self.RelationOperation:
self.result.append([self.token, "关系运算符", (row, col)])
else:
self.result.append([self.token, "ERROR", (row, col)])
if index <= len(inString) and index - 1 >= 0:
self.scan(inString[index - 1:], row, col)
至此一些关键处理代码已经实现,并经过测试后是可以使用的,上面代码部分只是我的思路,我使用了递归调用,这会有一个缺点就是无法更好的得出当前处理的位置,仍需改进。
运行测试图片
更新内容
我加入了GUI界面并对其进行了代码优化,优化的代码在上面已经进行了更新。下面贴出整体GUI部分代码
# -*- coding: utf-8 -*-
# Form implementation generated from reading ui file 'analysis.ui'
#
# Created by: PyQt5 UI code generator 5.15.4
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again. Do not edit this file unless you know what you are doing.
from PyQt5 import QtCore, QtGui, QtWidgets
# Widget layout of the lexer's main window: setupUi builds the widgets,
# retranslateUi sets their user-visible texts.
class Ui_MainWindow(object):
# Create every child widget with fixed geometry and attach it to MainWindow.
def setupUi(self, MainWindow):
MainWindow.setObjectName("MainWindow")
MainWindow.resize(961, 816)
# Central container that parents the editors and the button panel.
self.centralwidget = QtWidgets.QWidget(MainWindow)
self.centralwidget.setObjectName("centralwidget")
# Text editor named "code" in the upper part of the window.
self.code = QtWidgets.QTextEdit(self.centralwidget)
self.code.setGeometry(QtCore.QRect(10, 10, 781, 391))
self.code.setObjectName("code")
# Text editor named "result" placed below the "code" editor.
self.result = QtWidgets.QTextEdit(self.centralwidget)
self.result.setGeometry(QtCore.QRect(0, 410, 951, 361))
self.result.setObjectName("result")
# Panel at the right edge holding a grid layout with the two push buttons.
self.gridLayoutWidget = QtWidgets.QWidget(self.centralwidget)
self.gridLayoutWidget.setGeometry(QtCore.QRect(800, 50, 161, 231))
self.gridLayoutWidget.setObjectName("gridLayoutWidget")
self.gridLayout = QtWidgets.QGridLayout(self.gridLayoutWidget)
self.gridLayout.setContentsMargins(0, 0, 0, 0)
self.gridLayout.setObjectName("gridLayout")
# First button, grid row 0.
self.pushButton = QtWidgets.QPushButton(self.gridLayoutWidget)
self.pushButton.setObjectName("pushButton")
self.gridLayout.addWidget(self.pushButton, 0, 0, 1, 1)
# Second button, grid row 1.
self.pushButton_2 = QtWidgets.QPushButton(self.gridLayoutWidget)
self.pushButton_2.setObjectName("pushButton_2")
self.gridLayout.addWidget(self.pushButton_2, 1, 0, 1, 1)
MainWindow.setCentralWidget(self.centralwidget)
# Menu bar and status bar of the main window.
self.menubar = QtWidgets.QMenuBar(MainWindow)
self.menubar.setGeometry(QtCore.QRect(0, 0, 961, 26))
self.menubar.setObjectName("menubar")
MainWindow.setMenuBar(self.menubar)
self.statusbar = QtWidgets.QStatusBar(MainWindow)
self.statusbar.setObjectName("statusbar")
MainWindow.setStatusBar(self.statusbar)
self.retranslateUi(MainWindow)
# Let Qt connect signals to slots by object name (Qt's
# on_<objectName>_<signal> naming convention).
QtCore.QMetaObject.connectSlotsByName(MainWindow)
# Set the window title and the captions of both buttons.
def retranslateUi(self, MainWindow):
_translate = QtCore.QCoreApplication.translate
MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow"))
self.pushButton.setText(_translate("MainWindow", "词法分析"))
self.pushButton_2.setText(_translate("MainWindow", "重试"))
主控窗口
import sys
from PyQt5 import QtWidgets, QtCore
from analysis import Analysis
from MainWindow import Ui_MainWindow
class My_Window(QtWidgets.QMainWindow, Ui_MainWindow):
    """Main window that feeds the text of the code editor to the lexer."""

    def __init__(self):
        super(My_Window, self).__init__()
        self.setupUi(self)

    @QtCore.pyqtSlot()
    def on_pushButton_clicked(self):
        """Tokenise the editor text and show one table row per token.

        The text is split into lines and each line into space-separated
        words; every word is scanned with its 1-based line number and its
        1-based word index within the line.
        """
        deal = Analysis()
        results = []
        lines = self.code.toPlainText().split('\n')
        for row, line in enumerate(lines, start=1):
            for col, word in enumerate(line.split(' '), start=1):
                # Consecutive spaces yield empty words; scan() indexes the
                # first character of its input, so skip them.
                if word.strip():
                    deal.scan(word, row, col)
            results.extend(deal.result)
            deal.result = []
        rows = ["单词 二元序列 类 型 位置(行,列)\n (单词种别,单词属性)\n"]
        for result in results:
            rows.append('{:<10}{:<20}{:<20}{:<15}'.format(
                str(result[0]),
                '(' + str(deal.dic[result[1]]) + ',' + result[1] + ')',
                result[1],
                str(result[2])) + '\n')
        self.result.setText(''.join(rows))
if __name__ == '__main__':
    # Create the Qt application, show the main window and hand the event
    # loop's return value to the interpreter as the exit status.
    qt_app = QtWidgets.QApplication(sys.argv)
    window = My_Window()
    window.show()
    exit_code = qt_app.exec()
    sys.exit(exit_code)
给您的建议
各位看到这里应该对我的思路有了一定的认识,但我的水平比较低,而且表达能力不是很强,如果有哪些不懂的地方,欢迎与我联系。我希望各位能够亲自动手实现,其实并不是很难。我使用了递归来进行解析,其实并不是一定要使用,而且递归只会徒增程序的复杂性。我进行了许多边界测试,但仍不是很全。在我看来,在设计时没必要考虑过多的边界问题,等我们程序大致框架搭起来了以后,通过运行测试边界,然后再进行更改调试效果会更好。我进行了更新,但关于具体类的实现我还是不能贴出来!如果有帮助还请点点赞
由于我们编译原理课程实验还未验收此处我就不把完整代码放出但我把截图放在下方,希望大家可以对照参考同时也欢迎私信我交流讨论。
如果您想要完整代码,欢迎私信我并说明用途,感谢您的支持!
我又重新更新完善了一下此处把代码贴出,欢迎大家进行测试,如果有什么需要改进的地方欢迎指出~~
import sys
from PyQt5 import QtWidgets, QtCore
from analysis import Analysis
from MainWindow import Ui_MainWindow
class My_Window(QtWidgets.QMainWindow, Ui_MainWindow):
    """Main window that feeds the text of the code editor to the lexer."""

    def __init__(self):
        super(My_Window, self).__init__()
        self.setupUi(self)

    @QtCore.pyqtSlot()
    def on_pushButton_clicked(self):
        """Tokenise the editor text and show one table row per token.

        The text is split into lines and each line into space-separated
        words; every word is scanned with its 1-based line number and its
        1-based word index within the line.
        """
        deal = Analysis()
        results = []
        # The former extra ``deal.scan(string, 1, 1)`` over the whole text was
        # removed: its tokens were listed a second time, all at position (1, 1).
        lines = self.code.toPlainText().split('\n')
        for row, line in enumerate(lines, start=1):
            for col, word in enumerate(line.split(' '), start=1):
                # Consecutive spaces yield empty words; scan() indexes the
                # first character of its input, so skip them.
                if word.strip():
                    deal.scan(word, row, col)
            results.extend(deal.result)
            deal.result = []
        rows = ["单词 二元序列 类 型 位置(行,列)\n (单词种别,单词属性)\n"]
        for result in results:
            rows.append('{:<10}{:<20}{:<20}{:<15}'.format(
                str(result[0]),
                '(' + str(deal.dic[result[1]]) + ',' + result[1] + ')',
                result[1],
                str(result[2])) + '\n')
        self.result.setText(''.join(rows))

    @QtCore.pyqtSlot()
    def on_pushButton_2_clicked(self):
        """Clear both the result view and the code editor."""
        self.result.clear()
        self.code.clear()
if __name__ == '__main__':
    # Create the Qt application, show the main window and hand the event
    # loop's return value to the interpreter as the exit status.
    qt_app = QtWidgets.QApplication(sys.argv)
    window = My_Window()
    window.show()
    exit_code = qt_app.exec()
    sys.exit(exit_code)
# -*- coding: utf-8 -*-
# Form implementation generated from reading ui file 'analysis.ui'
#
# Created by: PyQt5 UI code generator 5.15.4
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again. Do not edit this file unless you know what you are doing.
from PyQt5 import QtCore, QtGui, QtWidgets
# Widget layout of the lexer's main window: setupUi builds the widgets,
# retranslateUi sets their user-visible texts.
class Ui_MainWindow(object):
# Create every child widget with fixed geometry and attach it to MainWindow.
def setupUi(self, MainWindow):
MainWindow.setObjectName("MainWindow")
MainWindow.resize(961, 816)
# Central container that parents the editors and the button panel.
self.centralwidget = QtWidgets.QWidget(MainWindow)
self.centralwidget.setObjectName("centralwidget")
# Text editor named "code" in the upper part of the window.
self.code = QtWidgets.QTextEdit(self.centralwidget)
self.code.setGeometry(QtCore.QRect(10, 10, 781, 391))
self.code.setObjectName("code")
# Text editor named "result" placed below the "code" editor.
self.result = QtWidgets.QTextEdit(self.centralwidget)
self.result.setGeometry(QtCore.QRect(0, 410, 951, 361))
self.result.setObjectName("result")
# Panel at the right edge holding a grid layout with the two push buttons.
self.gridLayoutWidget = QtWidgets.QWidget(self.centralwidget)
self.gridLayoutWidget.setGeometry(QtCore.QRect(800, 50, 161, 231))
self.gridLayoutWidget.setObjectName("gridLayoutWidget")
self.gridLayout = QtWidgets.QGridLayout(self.gridLayoutWidget)
self.gridLayout.setContentsMargins(0, 0, 0, 0)
self.gridLayout.setObjectName("gridLayout")
# First button, grid row 0.
self.pushButton = QtWidgets.QPushButton(self.gridLayoutWidget)
self.pushButton.setObjectName("pushButton")
self.gridLayout.addWidget(self.pushButton, 0, 0, 1, 1)
# Second button, grid row 1.
self.pushButton_2 = QtWidgets.QPushButton(self.gridLayoutWidget)
self.pushButton_2.setObjectName("pushButton_2")
self.gridLayout.addWidget(self.pushButton_2, 1, 0, 1, 1)
MainWindow.setCentralWidget(self.centralwidget)
# Menu bar and status bar of the main window.
self.menubar = QtWidgets.QMenuBar(MainWindow)
self.menubar.setGeometry(QtCore.QRect(0, 0, 961, 26))
self.menubar.setObjectName("menubar")
MainWindow.setMenuBar(self.menubar)
self.statusbar = QtWidgets.QStatusBar(MainWindow)
self.statusbar.setObjectName("statusbar")
MainWindow.setStatusBar(self.statusbar)
self.retranslateUi(MainWindow)
# Let Qt connect signals to slots by object name (Qt's
# on_<objectName>_<signal> naming convention).
QtCore.QMetaObject.connectSlotsByName(MainWindow)
# Set the window title and the captions of both buttons.
def retranslateUi(self, MainWindow):
_translate = QtCore.QCoreApplication.translate
MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow"))
self.pushButton.setText(_translate("MainWindow", "词法分析"))
self.pushButton_2.setText(_translate("MainWindow", "重试"))
"""
词法分析器的实现
"""
class Analysis:
    """Lexical analyser for a small subset of C.

    Call :meth:`scan` with a piece of source text.  Every token found is
    appended to ``self.result`` as ``[lexeme, category, (row, col)]``, where
    ``category`` is one of the keys of ``self.dic``.  Malformed input is
    recorded with the category ``"ERROR"`` and scanning continues.

    Every helper (``isMark``, ``IsDigits``, ``isChar``, ``isString``,
    ``IsOperator``) reads one token from the START of the text it is given,
    appends the lexeme to ``self.token`` and returns the offset just past the
    lexeme, i.e. ``pos`` plus the number of characters consumed.
    """

    def __init__(self):
        # C keywords.
        self.reserveWord = ["auto", "break", "case", "char", "const", "continue",
                            "default", "do", "double", "else", "enum", "extern",
                            "float", "for", "goto", "if", "int", "long",
                            "register", "return", "short", "signed", "sizeof", "static",
                            "struct", "switch", "typedef", "union", "unsigned", "void",
                            "volatile", "while"]
        # Every symbol the lexer knows about; also used to decide whether a
        # character may legally follow a number or an identifier.
        self.operatorOrDelimiter = ["+", "-", "*", "/", "<", "<=", ">", ">=", "=", "==",
                                    "!=", ";", "(", ")", "^", ",", "\"", "\'", "#", "&",
                                    "&&", "|", "||", "%", "~", "<<", ">>", "[", "]", "{",
                                    "}", "\\", ".", ":", "!"]
        self.Delimiter = [";", "(", ")", ",", "#", "[", "]", "{", "}", "\\"]
        self.RelationOperation = ["<", "<=", ">", ">=", "=", "==", "!=", "^", "&", "&&", "|", "||", "<<", ">>", "!"]
        self.Operator = ["+", "-", "*", "/", "%", "~", "+=", "*=", "/=", "-=",
                         "++", "--"]
        self.token = ""  # lexeme of the token currently being read
        self.result = []  # scanned tokens: [lexeme, category, (row, col)]
        # Category name -> numeric token code.
        self.dic = {'标识符': 2, '关键字': 1, "常数": 3, "算术运算符": 4, "关系运算符": 5, "字符串常量": 6,
                    "字符常量": 7, "分界符": 8, "ERROR": 9}

    def _ends_token(self, text, end):
        """Return True when the character at ``text[end]`` may follow a token.

        The end of the text, whitespace and every known symbol qualify.
        """
        if end >= len(text):
            return True
        follower = text[end]
        return follower in self.operatorOrDelimiter or follower.isspace()

    def isReserve(self, target):
        """Return True when ``target`` is a reserved word."""
        return target in self.reserveWord

    def isMark(self, inString, pos):
        """Read an identifier or keyword from the start of ``inString``.

        :param inString: text starting at the word
        :param pos: offset of ``inString[0]`` in the caller's text
        :return: ``(ok, end)``; ``ok`` is False when no word character was
            found or the word is followed by an illegal character, ``end`` is
            the offset just past the word
        """
        end = 0
        while end < len(inString) and (
                inString[end].isalpha() or inString[end].isdigit() or inString[end] == '_'):
            end += 1
        self.token += inString[:end]
        return end > 0 and self._ends_token(inString, end), pos + end

    def IsDigits(self, inString, pos):
        """Read a numeric constant from the start of ``inString``.

        Accepted shape: digits, at most one ``.`` (before the exponent) and at
        most one exponent ``e``/``E`` with an optional sign, e.g. ``12``,
        ``3.14``, ``1e5``, ``2.5E-3``.

        :param inString: text starting at the number
        :param pos: offset of ``inString[0]`` in the caller's text
        :return: ``(ok, end)``; ``ok`` is False when the exponent has no
            digits or the number is followed by an illegal character, ``end``
            is the offset just past the numeric prefix
        """
        size = len(inString)
        end = 0
        seen_dot = False
        seen_exp = False
        while end < size:
            ch = inString[end]
            if ch.isdigit():
                pass
            elif ch == '.' and not seen_dot and not seen_exp:
                seen_dot = True
            elif ch in ('e', 'E') and not seen_exp and end > 0:
                # Only ONE exponent marker is legal (the old test mixed
                # ``or``/``and`` and accepted any number of 'e').
                seen_exp = True
            elif (ch in ('+', '-') and end > 0 and inString[end - 1] in ('e', 'E')
                  and end + 1 < size and inString[end + 1].isdigit()):
                # Sign of the exponent, taken only when a digit follows.
                pass
            else:
                break
            end += 1
        lexeme = inString[:end]
        self.token += lexeme
        # A dangling exponent marker ("1e") is not a complete number.
        complete = any(c.isdigit() for c in lexeme) and lexeme[-1] not in ('e', 'E')
        return complete and self._ends_token(inString, end), pos + end

    def isChar(self, inString, pos):
        """Read a character constant from the start of ``inString``.

        Accepted shapes: a quote, one character that is neither a quote nor a
        backslash, a quote; or a quote, a backslash, one character, a quote.

        :param inString: text starting at the opening single quote
        :param pos: offset of ``inString[0]`` in the caller's text
        :return: ``(ok, end)``; on failure only the opening quote is consumed
        """
        if not inString:
            return False, pos
        if len(inString) >= 4 and inString[1] == '\\' and inString[3] == "'":
            length = 4
        elif len(inString) >= 3 and inString[1] not in ("'", '\\') and inString[2] == "'":
            length = 3
        else:
            self.token += inString[0]
            return False, pos + 1
        self.token += inString[:length]
        return True, pos + length

    def isString(self, inString, pos):
        """Read a string constant from the start of ``inString``.

        Any characters may appear between the quotes; a backslash protects the
        character after it, so an escaped quote does not end the string.

        :param inString: text starting at the opening double quote
        :param pos: offset of ``inString[0]`` in the caller's text
        :return: ``(ok, end)``; an unterminated string consumes the rest of
            the text and yields ``ok == False``
        """
        if not inString:
            return False, pos
        size = len(inString)
        end = 1
        while end < size:
            ch = inString[end]
            if ch == '\\' and end + 1 < size:
                end += 2  # skip the escaped character
                continue
            if ch == '"':
                self.token += inString[:end + 1]
                return True, pos + end + 1
            end += 1
        self.token += inString
        return False, pos + size

    def IsOperator(self, inString, pos):
        """Read the longest known operator from the start of ``inString``.

        Longest match means ``++`` is one token while ``=-`` is ``=`` followed
        by ``-``.

        :param inString: text starting at the operator
        :param pos: offset of ``inString[0]`` in the caller's text
        :return: offset just past the operator, or ``pos`` unchanged when the
            text does not start with an arithmetic or relational operator
        """
        known = self.Operator + self.RelationOperation
        longest = max(len(op) for op in known)
        for width in range(longest, 0, -1):
            candidate = inString[:width]
            if len(candidate) == width and candidate in known:
                self.token += candidate
                return pos + width
        return pos

    def scan(self, inString, row, col):
        """Split ``inString`` into tokens and append them to ``self.result``.

        The text is processed left to right in a loop; whitespace between
        tokens is skipped and an empty text produces no tokens.

        :param inString: text to analyse
        :param row: row stored with every token of this call
        :param col: column stored with every token of this call
        :return: None; the tokens are collected in ``self.result``
        """
        text = str(inString).strip()
        self.token = ""
        size = len(text)
        index = 0
        while index < size:
            ch = text[index]
            if ch.isspace():
                index += 1
                continue
            rest = text[index:]
            self.token = ""
            if ch.isdigit():
                ok, end = self.IsDigits(rest, index)
                kind = "常数" if ok else "ERROR"
            elif ch.isalpha() or ch == '_':
                _, end = self.isMark(rest, index)
                kind = "关键字" if self.isReserve(self.token) else "标识符"
            elif ch == '\'':
                ok, end = self.isChar(rest, index)
                kind = "字符常量" if ok else "ERROR"
            elif ch == '\"':
                ok, end = self.isString(rest, index)
                kind = "字符串常量" if ok else "ERROR"
            elif ch in self.Operator or ch in self.RelationOperation:
                end = self.IsOperator(rest, index)
                if self.token in self.Operator:
                    kind = "算术运算符"
                elif self.token in self.RelationOperation:
                    kind = "关系运算符"
                else:
                    self.token = ch
                    kind = "ERROR"
            elif ch in self.Delimiter or ch in self.operatorOrDelimiter:
                # Every delimiter is a token of its own, so "()" is two
                # delimiters.  Known symbols without a category of their own
                # ("." and ":") are reported here as well instead of being
                # dropped.
                self.token = ch
                end = index + 1
                kind = "分界符"
            else:
                # Unknown character: report it and keep scanning.
                self.token = ch
                end = index + 1
                kind = "ERROR"
            self.result.append([self.token, kind, (row, col)])
            # Always advance, whatever a helper reported.
            index = max(end, index + 1)
if __name__ == "__main__":
    # Command-line check: read one line, scan each space-separated word with
    # its 1-based word index as the column, then print every token.
    analysis = Analysis()
    words = input("请输入代码:").split(' ')
    for col, word in enumerate(words, start=1):
        # Consecutive spaces yield empty words; scan() indexes the first
        # character of its input, so skip them.
        if word.strip():
            analysis.scan(word, 1, col)
    for res in analysis.result:
        print(res)
更多推荐
所有评论(0)