bison flex 实现tiny语言的编译器
项目地址:tiny-compiler
完成了词法分析,语法分析,中间代码生成,虚拟机执行,没有进行类型检查、错误处理和中间代码优化。
词法分析
%{
#include <iostream>
#include "tiny-parser.tab.h"
using namespace std;
/* lexeme of identifier or reserved word */
extern FILE *fin; /* we read from this file */
extern int current_lineno ;
#undef YY_INPUT
#define YY_INPUT(buf,result,max_size) \
if ( (result = fread( (char*)buf, sizeof(char), max_size, fin)) < 0) \
YY_FATAL_ERROR( "read() in flex scanner failed");
%}
digit [0-9]
number {digit}+
letter [a-zA-Z]
identifier {letter}+
whitespace [ \t]+
%%
"if" {return IF_T;}
"then" {return THEN;}
"else" {return ELSE;}
"end" {return END;}
"repeat" {return REPEAT;}
"until" {return UNTIL;}
"read" {return READ;}
"write" {return WRITE;}
":=" {return ASSIGN;}
"=" {return EQ;}
"<" {return LT;}
"+" {return PLUS;}
"-" {return MINUS;}
"*" {return TIMES;}
"/" {return OVER;}
"(" {return LPAREN;}
")" {return RPAREN;}
";" {return SEMI;}
{number} {return NUM;}
{identifier} {return ID;}
\n {current_lineno++;}
{whitespace} {/* skip whitespace */}
\{[^\}]*\} {
if(yytext[0] == '\n') {
current_lineno++;
}
}
"$" {return EOF;}
"}" {return ERROR;}
. {return ERROR;}
%%
这里需要注意两个点,
- 定义了
current_lineno
变量,用于记录当前行号。每次碰到\n都是需要加一的 - YY_INPUT 宏,用于指定从哪里读入数据,具体的File * fin在main函数中设置。
语法分析
代码
%{
#include "TreeNode.h"
#define YYSTYPE TreeNode *
#include <string>
using namespace std;
extern int current_lineno ;
extern char *yytext;
int yylex();
int yyerror(const char *s);
static char saved_name[256]; /* for use in assignments */
static int saved_lineNo; /* ditto */
extern TreeNode * root; /* stores syntax tree for later return */
%}
%token IF_T THEN ELSE END REPEAT UNTIL READ WRITE
%token ID NUM
%token ASSIGN EQ LT PLUS MINUS TIMES OVER LPAREN RPAREN SEMI
%token ERROR
%% /* Grammar for TINY */
program : stmt_seq {root = $1; }
;
stmt_seq : stmt_seq SEMI stmt {$1->children.push_back( $3);
$$ = $1;
}
| stmt { $$=new_stmt_node();
$$->children.push_back( $1);
}
;
stmt : if_stmt { $$ = $1; }
| repeat_stmt { $$ = $1; }
| assign_stmt { $$ = $1; }
| read_stmt { $$ = $1; }
| write_stmt { $$ = $1; }
| error { $$ = NULL; }
;
if_stmt : IF_T exp THEN stmt_seq END
{
$$ = new_if_stmt(current_lineno, $2, $4);
}
| IF_T exp THEN stmt_seq ELSE stmt_seq END
{ $$ = new_if_stmt(current_lineno, $2, $4, $6);
}
;
repeat_stmt : REPEAT stmt_seq UNTIL exp
{ $$ = new_repeat_stmt(current_lineno, $2, $4);
}
;
assign_stmt : ID { $1=new_id(current_lineno, yytext);
}
ASSIGN exp
{
$$ = new_assign_stmt(current_lineno, $1, $4);
}
;
read_stmt : READ ID
{
$$=new_read_stmt(current_lineno, new_id(current_lineno, yytext));
}
;
write_stmt : WRITE exp
{
$$=new_write_stmt(current_lineno, $2);
}
;
exp : simple_exp LT simple_exp
{
$$ = new_exp(current_lineno, $1,LT,$3);
}
| simple_exp EQ simple_exp
{
$$=new_exp(current_lineno, $1,EQ,$3);
}
| simple_exp { $$ =$1; }
;
simple_exp : simple_exp PLUS term
{
$$=new_simple_exp(current_lineno, $1, PLUS, $3);
}
| simple_exp MINUS term
{
$$=new_simple_exp(current_lineno, $1, MINUS, $3);
}
| term { $$ = $1; }
;
term : term TIMES factor
{
$$=new_term(current_lineno, $1, TIMES, $3);
}
| term OVER factor
{
$$=new_term(current_lineno, $1, OVER, $3);
}
| factor { $$ = $1; }
;
factor : LPAREN exp RPAREN
{ $$ = $2; }
| NUM
{
$$=new_num(current_lineno, atoi(yytext));
}
| ID {
$$=new_id(current_lineno, yytext);
}
| error { $$ = NULL; }
;
%%
int yyerror(const char *s) {
fprintf(stderr, "Error: %s at line %d near %s\n", s, current_lineno, yytext);
return 0;
}
tiny语言的表达式还挺难写的
- 这里需要先定义token,生成tiny-parser.tab.h文件,然后让tiny-lexer.l包含,这样可以解决重定义token的问题。
- 对于每一个表达式的值,如果没有指定的话,就是int型,指定就是你自己定义的类型
- 一定要是指针类型,因为局部变量在过了这个函数之后,他就会自动销毁
- 不要用c++的智能指针,bison是c语言写的,听说,不支持那么高级的语法
- 示例,就是这个宏
#define YYSTYPE TreeNode *
- 注意这里是**$4不是$3**,前面的SDT也算是一个$,这个bug特别难找,莫名其妙的空指针错误
assign_stmt : ID { $1=new_id(current_lineno, yytext);
}
ASSIGN exp
{
$$ = new_assign_stmt(current_lineno, $1, $4);
}
;
- 要声明yylex()函数,这个是从词法解析里定义的入口函数。
接口
#ifndef TREE_NODE_H
#define TREE_NODE_H
#include <string.h>
#include <vector>
#include "tiny-parser.tab.h"
using namespace std;
extern int lineno ;
enum NodeKind{StmtK=1,ExpK} ;
enum StmtKind{Stmt_K=3,IfK,RepeatK,AssignK,ReadK,WriteK} ;
enum ExpKind {OpK=9,ConstK,IdK} ;
/* ExpType is used for type checking */
enum ExpType{Void_type=9,Integer,Boolean} ;
#define Token_Kind int
class TreeNode{
private:
void show_info(int depth,bool is_last_child);
public:
NodeKind kind;
int lineno;
union { StmtKind stmt; ExpKind exp;} production_kind;
union { Token_Kind op;
int val;
char *name; } attr;
ExpType type; /* for type checking of exps */
vector<TreeNode *> children;
void copy_name(const char *name){
this->attr.name=new char(strlen(name));
strcpy(this->attr.name,name);
};
void show_all_info();
TreeNode(int line){
this->lineno = line;
}
template<typename... Args>
TreeNode(int line,TreeNode * first_child, Args... args){
this->lineno = line;
this->children.push_back(first_child);
(this->children.push_back(args),...);
};
~TreeNode(){
if(this->production_kind.exp==IdK){
delete this->attr.name;
}
}
virtual void generate_code()=0;
};
TreeNode * new_stmt_node();
TreeNode * new_if_stmt(int line,TreeNode * cond, TreeNode * then_stmt, TreeNode * else_stmt);
TreeNode * new_if_stmt(int line,TreeNode * cond, TreeNode * then_stmt);
TreeNode * new_repeat_stmt(int line, TreeNode * body,TreeNode * cond);
TreeNode * new_assign_stmt(int line,TreeNode * var, TreeNode * exp);
TreeNode * new_read_stmt(int line,TreeNode * var);
TreeNode * new_write_stmt(int line,TreeNode * exp);
TreeNode * new_exp(int line,TreeNode * left, Token_Kind op, TreeNode * right);
TreeNode * new_exp(int line,TreeNode * simple_exp);
TreeNode* new_simple_exp(int line,TreeNode * left, Token_Kind op, TreeNode * right);
TreeNode * new_term(int line,TreeNode * left, Token_Kind op, TreeNode * right);
TreeNode * new_term(int line,TreeNode * factor);
TreeNode * new_num(int line,int val);
TreeNode * new_id(int line,const char *name);
#endif
就是使用TreeNode*节点,记录当前的需要的信息,比方说 表达式的符号,变量的id,当前节点的行号,每次进行到规约的时候,就执行这个,把节点增加到根节点里去。
接口实现
#ifndef TREE_NODE_EXTERN_H
#define TREE_NODE_EXTERN_H
#include <iostream>
#include "TreeNode.h"
using namespace std;
extern FILE *fout;
class StmtNode : public TreeNode {
public:
StmtNode(int line):TreeNode(line){
};
void generate_code() override{
for(auto &child:this->children){
child->generate_code();
}
}
};
class IfNode : public TreeNode {
public:
static int if_count ;
IfNode(int line, TreeNode* condition, TreeNode* true_stmt, TreeNode* false_stmt):TreeNode(line){};
IfNode(int line, TreeNode* condition, TreeNode* true_stmt):TreeNode(line,condition,true_stmt){};
void generate_code() override{
if_count++;
this->children[0]->generate_code();
if(this->children.size()==3){
//if else
fprintf(fout,"if_else_%d:\n",if_count);
this->children[1]->generate_code();
fprintf(fout,"jmp if_end_%d\n",if_count);
fprintf(fout,"label if_else_%d\n",if_count);
this->children[2]->generate_code();
fprintf(fout,"label if_end_%d\n",if_count);
}else{
//if
fprintf(fout,"if_end_%d:\n",if_count);
this->children[1]->generate_code();
fprintf(fout,"label if_end_%d\n",if_count);
}
fflush(fout);
}
};
int IfNode::if_count = 0;
class RepeatNode : public TreeNode {
public:
static int repeat_count ;
RepeatNode(int line, TreeNode* condition, TreeNode* stmt):TreeNode(line,condition,stmt){};
void generate_code() override{
repeat_count++;
fprintf(fout,"label repeat_%d\n",repeat_count);
this->children[0]->generate_code();
this->children[1]->generate_code();
fprintf(fout,"repeat_%d\n",repeat_count);
fflush(fout);
}
};
int RepeatNode::repeat_count = 0;
class AssignNode : public TreeNode {
public:
AssignNode(int line, TreeNode* left, TreeNode* right):TreeNode(line,left,right){};
void generate_code() override{
this->children[1]->generate_code();
fprintf(fout,"pop %s\n",this->children[0]->attr.name);
fflush(fout);
}
};
class ReadNode : public TreeNode {
public:
ReadNode(int line, TreeNode* left):TreeNode(line,left){};
void generate_code() override{
fprintf(fout,"read %s\n",children[0]->attr.name);
fflush(fout);
}
};
class WriteNode : public TreeNode {
public:
WriteNode(int line, TreeNode* right):TreeNode(line,right){};
void generate_code() override{
fprintf(fout,"write %s\n",children[0]->attr.name);
}
};
class ExpNode : public TreeNode {
public:
ExpNode(int line, TreeNode* left, TreeNode* right):TreeNode(line,left,right){};
ExpNode(int line, TreeNode* simple_exp):TreeNode(line,simple_exp){};
ExpNode(int line):TreeNode(line){};
void generate_code() override{
if(this->children.size()==1){
this->children[0]->generate_code();
}
else if(this->children.size()==2){
//this->show_all_info();
//cout<<endl;
this->children[0]->generate_code();
this->children[1]->generate_code();
fprintf(fout,"sub\n");
switch (this->attr.op)
{
case EQ:
fprintf(fout,"jnz ");
break;
case LT:
fprintf(fout,"jeg ");
break;
default:
cerr<<"wrong operator in exp node"<<endl;
break;
}
}
fflush(fout);
}
};
class SimpleExpNode : public TreeNode {
public:
SimpleExpNode(int line, TreeNode* left, TreeNode* right,Token_Kind op):TreeNode(line,left,right){
this->attr.op = op;
this->kind = ExpK;
this->type = Integer;
this->production_kind.exp=OpK;
};
SimpleExpNode(int line, TreeNode* factor):TreeNode(line,factor){
this->kind = ExpK;
this->type = Integer;
this->production_kind.exp=OpK;
};
void generate_code() override{
if(this->children.size()==1){
this->children[0]->generate_code();
}else{
switch (this->attr.op)
{
case PLUS:
this->children[0]->generate_code();
this->children[1]->generate_code();
fprintf(fout,"add\n");
break;
case MINUS:
this->children[0]->generate_code();
this->children[1]->generate_code();
fprintf(fout,"sub\n");
break;
default:
cerr<<"wrong operator in simple exp node"<<endl;
break;
}
}
fflush(fout);
}
};
class TermNode : public TreeNode {
public:
TermNode(int line, TreeNode* left, TreeNode* right):TreeNode(line,left,right){};
TermNode(int line, TreeNode* factor):TreeNode(line,factor){};
void generate_code() override{
if(this->children.size()==0){
this->children[0]->generate_code();
}else{
this->children[0]->generate_code();
this->children[1]->generate_code();
switch (this->attr.op)
{
case TIMES:
fprintf(fout,"mul\n");
break;
case OVER:
fprintf(fout,"div\n");
break;
default:
cerr<<"wrong operator in term node"<<endl;
break;
}
}
fflush(fout);
}
};
class IdNode : public TreeNode {
public:
IdNode(int line,const char *name):TreeNode(line){
this->kind = ExpK;
this->type = Integer;
this->attr.name = new char[strlen(name)+1];
strcpy(this->attr.name,name);
this->production_kind.exp=IdK;
}
void generate_code() override{
fprintf(fout,"push %s\n",this->attr.name);
fflush(fout);
}
};
class NumNode : public TreeNode {
public:
NumNode(int line,int value):TreeNode(line){
this->kind = ExpK;
this->type = Integer;
this->attr.val = value;
this->production_kind.exp=ConstK;
}
void generate_code() override{
fprintf(fout,"push $%d\n",this->attr.val);
fflush(fout);
}
};
#endif
对于每一个不同类型的节点,都有子类和他对应,生成不同的汇编代码。
本来应该是像后面的那样,更加简洁一点的,但我懒得改了,哈哈,先做个垃圾出来再说。
抽象语法树的生成
program : stmt_seq {root = $1; }
这里执行成功了之后,就可以对root调用get_info的接口了,输出所有节点的信息
这是最后的结果,这个就是树的前序遍历,没啥好说的,最后一行可以不用| 的。
void TreeNode::show_info(int depth,bool is_last_child){
// 输出当前节点的信息
for (int i = 0; i < depth; i++) {
if (i == depth-1) {
if (is_last_child) {
cout << "└── ";
} else {
cout << "├── ";
}
} else {
cout << "│ ";
}
}
cout<<"line: "<<this->lineno<<", type: "<<getExpTypeName(type);
cout<<", ";
if(this->kind==StmtK){
cout<<"stmt: "<<getStmtKindName(production_kind.stmt)<<endl;
}else{
cout<<"exp: "<<getExpKindName(production_kind.exp);
switch (production_kind.exp)
{
case OpK:
cout<<", op: "<<Token_Kind_to_string(attr.op)<<endl;
break;
case ConstK:
cout<<", val: "<<attr.val<<endl;
break;
case IdK:
cout<<", name: "<<attr.name<<endl;
break;
default:
cout<<endl;
break;
}
}
// 递归输出子节点
for (size_t i = 0; i < children.size(); i++) {
if (children[i] != nullptr) {
children[i]->show_info(depth + 1, (i == (children.size() - 1)));
} else {
if (i == children.size() - 1) {
for (int j = 0; j < depth + 1; j++) {
if (j == depth) {
cout << "└── ";
} else {
cout << "│ ";
}
}
} else {
for (int j = 0; j < depth + 1; j++) {
if (j == depth) {
cout << "├── ";
} else {
cout << "│ ";
}
}
}
}
}
};
虚拟机的实现
见VM文件夹的虚拟机指令,模拟的是一个栈型计算机
这里用label代替的行号,因为我懒得回填行号了,大不了先遍历一遍,记录所有的label
#include <iostream>
#include <fstream>
#include <stack>
#include <string>
#include <unordered_map>
#include <vector>
#include <functional>
using namespace std;
class VirtualMachine {
public:
VirtualMachine(const string& filename) {
// 从文件中读取指令
ifstream file(filename);
if (file.is_open()) {
string line;
while (getline(file, line)) {
instructions.push_back(line);
}
file.close();
} else {
cout << "Unable to open file" << endl;
exit(1);
}
// 初始化指令映射表
instructionMap.emplace("halt", [this]() { this->halt(); });
instructionMap.emplace("add", [this]() { this->add(); });
instructionMap.emplace("sub", [this]() { this->sub(); });
instructionMap.emplace("mul", [this]() { this->mul(); });
instructionMap.emplace("div", [this]() { this->div(); });
instructionMapTwo.emplace("read", [this](string & tokens) { this->read(tokens); });
instructionMapTwo.emplace("write", [this](string& tokens) { this->write(tokens); });
instructionMapTwo.emplace("push", [this](string& tokens) { this->push(tokens); });
instructionMapTwo.emplace("pop", [this](string& tokens) { this->pop(tokens); });
instructionMapTwo.emplace("label", [this](string& tokens) { this->label(tokens); });
instructionMapTwo.emplace("jmp", [this](string& tokens) { this->jmp(tokens); });
instructionMapTwo.emplace("jnz", [this](string &tokens) { this->jnz(tokens); });
instructionMapTwo.emplace("jeg", [this](string &tokens) { this->jeg(tokens); });
instructionMapTwo.emplace("jel", [this](string &tokens) { this->jel(tokens); });
}
void run() {
// 初始化label
for(int i=0;i<instructions.size();i++){
if(instructions[i].find("label")!=string::npos){
labels[instructions[i].substr(6)]=i;
}
}
// 运行虚拟机
while (pc < instructions.size()) {
execute(instructions[pc]);
pc++;
}
}
private:
stack<int> stack_;
unordered_map<string, function<void()>> instructionMap;
unordered_map<string,function<void(string &)>> instructionMapTwo;
unordered_map<string, int> labels;
unordered_map<string,int> variables;
vector<string> instructions;
int pc = 0; // 程序计数器
void halt() {
// 停止运行
exit(0);
}
void read(string & tokens) {
// 从输入流中读取一个值并存入寄存器
cout << "Enter a value for variable " << tokens << ": ";
int value;
cin >> value;
variables[tokens] = value;
}
void write(string &tokens) {
// 将寄存器的值输出到输出流
cout <<tokens << " = " << variables[tokens] << endl;
}
void push(string& tokens) {
// 将寄存器或常数压入栈顶
if (tokens.front() == '$') {
int reg = stoi(tokens.substr(1));
stack_.push(reg);
} else if(variables.count(tokens)==1){
stack_.push(variables[tokens]);
}else{
cerr<<"Undefined variable: "<<tokens<<endl;
}
}
void pop(string& tokens) {
// 将栈顶的值弹出并存入寄存器
int reg = stack_.top();
variables[tokens] = reg;
stack_.pop();
}
void label(string& tokens) {
// 啥也不用作,前面记录过了
}
void jmp(string &tokens) {
// 跳转到标签
if(labels.count(tokens)==0){
cerr << "Undefined label: " << tokens << endl;
exit(1);
}
pc = labels[tokens];
}
void jnz(string &tokens) {
// 如果栈顶的值为0,则跳转到标签
if (stack_.top() != 0) {
if(labels.count(tokens)==0){
cerr << "Undefined label: " << tokens << endl;
exit(1);
}
pc = labels[tokens];
}
stack_.pop();
}
void jeg(string &tokens){
// 如果栈顶的值大于等于0,则跳转到标签
if (stack_.top() >= 0) {
if(labels.count(tokens)==0){
cerr << "Undefined label: " << tokens << endl;
}
pc=labels[tokens];
}
stack_.pop();
}
void jel(string &tokens){
// 如果栈顶的值小于等于0,则跳转到标签
if (stack_.top() <= 0) {
if(labels.count(tokens)==0){
cerr << "Undefined label: " << tokens << endl;
}
pc=labels[tokens];
}
stack_.pop();
}
void add() {
// 栈顶两个元素相加
int a = stack_.top();
stack_.pop();
int b = stack_.top();
stack_.pop();
stack_.push(b + a);
}
void sub() {
// 栈顶两个元素相减
int a = stack_.top();
stack_.pop();
int b = stack_.top();
stack_.pop();
stack_.push(b - a);
}
void mul() {
// 栈顶两个元素相乘
int a = stack_.top();
stack_.pop();
int b = stack_.top();
stack_.pop();
stack_.push(a * b);
}
void div() {
// 栈顶两个元素相除
int a = stack_.top();
stack_.pop();
int b = stack_.top();
stack_.pop();
stack_.push(b / a);
}
void execute(string& tokens) {
if(tokens.front()=='#'){
return ; // 注释
}
// 执行指令
if (instructionMap.count(tokens)) {
instructionMap[tokens]();
} else {
auto posi= tokens.find(' ');
if(posi != string::npos){
string instr= tokens.substr(0,posi);
string arg= tokens.substr(posi+1);
if(instructionMapTwo.count(instr)){
instructionMapTwo[instr](arg);
}else{
cout << "Invalid instruction: " << tokens << endl;
}
}else{
cout << "Invalid instruction: " << tokens << endl;
}
}
}
};
int main(int argc, char* argv[]) {
if (argc != 2) {
cout << "Usage: " << argv[0] << " <input_file>" << endl;
return 1;
}
VirtualMachine vm(argv[1]);
vm.run();
return 0;
}
这里写的非常好的地方在于,他把指令放到了一个map里,不用if判断了,耦合很低。
感想
- 最难的还是flex和bison的使用,资料和demo太少了,yyerror yylval yytexts啥的宏都要自己查官网文档。
- 虚拟机的实现很快,寄存器型的会难一些,需要把if while 这些翻译的逻辑搞懂。