编译器构造/案例研究 1B
这是一个正在进行中的案例研究,现在已链接到主页面(完成后)。User:matt73 正在研究这个。 |
本案例研究的目的是提供一个使用 Lex 和 Yacc 在 C 中编写的编译器/解释器前端的示例。使用解释器是因为它允许在最少的额外工作(在前端构建之后)的情况下创建工作程序。通过用编译器后端替换最后一阶段,可以将此代码开发成编译器。
代码按突出显示创建编译器过程的顺序显示,因此同一个文件将在开发过程中多次显示。
本案例研究开发了一个针对 Very Tiny Basic 的解释器,该解释器在本书的案例研究 1部分中进行了说明。
以下步骤将用于完成解释器
- 词法分析
- 语法分析
- 词法分析与语义
- 抽象语法树
- 生成抽象语法树
- 解释
为了简洁和简单起见,许多有用的编译器/解释器的重要功能已被省略,包括
- 处理错误
- 优化
此外,某些过程不适用于 Very Tiny Basic 这样简单的语言,例如名称表不适用,因为只使用单个字符来命名变量。
要求
lex 文件的第一稿通过在关联的 C 操作中返回特定值来识别不同的标记。对于关键字和运算符来说这很简单,但是标识符、值和注释则更复杂。
VTB.l - 版本 1
%{
%}
DIGIT [0-9]
LETTER [A-Z]
%%
"LET" {return LET;}
"IF" {return IF;}
"THEN" {return THEN;}
"PRINT" {return PRINT;}
"REM"[^\n]* {return REM;}
"GOTO" {return GOTO;}
"GOSUB" {return GOSUB;}
"RETURN" {return RETURN;}
"STOP" {return STOP;}
"FOR" {return FOR;}
"TO" {return TO;}
"STEP" {return STEP;}
"NEXT" {return NEXT;}
\"[^\"]\" {return STRING;}
"=" {return EQUAL;}
"<>" {return NEQUAL;}
%%
可以使用以下方法之一处理此文件
lex VTB.l flex VTB.l
您可能想知道所有这些返回的值从何而来。它们将在 Yacc 语法文件处理时创建。
与案例研究 1中的Very Tiny Basic - 规范相比,存在一些差异,例如 MULT 和 DIV 代替 MULDIV。这是因为我们需要区分这两个值。LineNumber 和 WholeNumber 在词法上是相同的,因此目前无法分离。定义标记的使用和类别留到下一阶段。
VTB.y - 版本 1
%{
int yylex(void);
void yyerror(char* str)
{
/*This should handle errors!*/
}
%}
%union {
int number;
char *string;
char var;
}
%token <number> NUMBER
%token <string> STRING
%token <var> VAR_NUM
%token <var> VAR_STR
%token LET IF THEN PRINT REM GOTO GOSUB RETURN STOP
FOR EQUAL TO STEP NEXT SEPARATE
NEQUAL LT GT LTEQ GTEQ
ADD SUB MUL DIV
LPARAN RPARAN
%start lines
%%
lines: line lines
| /*Nothing*/
;
line: NUMBER statement
;
statement: LET variable EQUAL exp
| IF test THEN NUMBER
| PRINT printItem printList
| REM
| GOTO NUMBER
| GOSUB NUMBER
| RETURN
| STOP
| FOR variable EQUAL exp TO exp STEP exp
| FOR variable EQUAL exp TO exp
| NEXT variable
;
variable: VAR_NUM
| VAR_STR
;
printList: SEPARATE printItem
| /* Nothing */
;
printItem: exp
/* | STRING */
;
test: exp comparison exp
;
comparison: EQUAL
| NEQUAL
| LT
| GT
| LTEQ
| GTEQ
;
exp: addsub secondary secList
| secondary secList
;
secList: addsub secondary
| /* Nothing */
;
addsub : ADD
| SUB
;
secondary : primary priList
;
priList : muldiv primary
| /* Nothing */
;
muldiv : MUL
| DIV
;
primary : LPARAN exp RPARAN
| variable
| NUMBER
| STRING
;
在此版本的词法分析器中,包含了由 yacc/bison 生成的头文件。头文件定义了返回值和用于存储标记语义值的联合。这是根据语法文件的%token 声明和%union 部分创建的。词法分析器现在从某些类型的标记中提取值,并将这些值存储在yylval 联合中。
VTB.l - 版本 2
%{
#include "y.tab.h"
#include <string.h>
#include <malloc.h>
%}
DIGIT [0-9]
LETTER [A-Z]
%%
"LET" {return LET;}
"IF" {return IF;}
"THEN" {return THEN;}
"PRINT" {return PRINT;}
"REM"[^\n]* {return REM;}
"GOTO" {return GOTO;}
"GOSUB" {return GOSUB;}
"RETURN" {return RETURN;}
"STOP" {return STOP;}
"FOR" {return FOR;}
"TO" {return TO;}
"STEP" {return STEP;}
"NEXT" {return NEXT;}
\"[^\"]\" {yylval.string=malloc(strlen(yytext)+1);
strcpy(yylval.string,yytext); return STRING;}
"=" {return EQUAL;}
"<>" {return NEQUAL;}
"<" {return LT;}
">" {return GT;}
"<=" {return LTEQ;}
">=" {return GTEQ;}
{LETTER} {yylval.var=yytext[0]; return VAR_NUM;}
{LETTER}$ {yylval.var=yytext[0]; return VAR_STR;}
{DIGIT}+ {yylval.number=atoi(yytext); return NUMBER;}
"," |
";" {return SEPARATE;}
"+" {return ADD;}
"-" {return SUB;}
"*" {return MUL;}
"/" {return DIV;}
"(" {return LPARAN;}
")" {return RPARAN;}
. {/*This is an error!*/}
%%
抽象语法树 是使用数据结构在内存中创建的代码的中间表示。语言的语法结构(已定义并已作为 YACC 语法文件写下)被转换为树结构。使用 YACC & C,这意味着大多数语法规则和标记都变成了节点。注释是一个没有放入树中的示例。
在结构中,操作数的组合是清晰的,因此不需要在树中存在括号之类的标记。对于结束代码块的标记也是如此。这是可能的,因为语法文件中的规则可以使用这些标记以正确的形状创建树。
抽象语法树中组合的说明
(1+3)*4 1+3*4 * + / \ / \ 4 + 1 * / \ / \ 1 3 3 4
在此解释器中,可以通过将主/次表达式结构折叠来丢弃它们。这会增加代码的复杂性,因此目前没有实现。Rem 语句也被保留(没有注释文本),因为 VTB 的定义意味着它们是 Goto 的有效目标。实际上,跳转到不存在的行是未定义的,因此此解释器将发出错误。
对于树结构,使用指针和动态分配是必须的,否则它将是无限大的,因此会使用太多资源(仔细想想)。为了保持代码整洁,
/*
* Abstract Syntax Trees for Very Tiny Basic
*/
typedef struct statement_ *statement;
typedef struct exp_ *exp;
typedef struct variable_ *variable;
typedef exp printItem; /*Not required since it is always an expression.*/
typedef struct printList_ *printList;
typedef struct test_ *test;
typedef enum comparison_ comparison;
typedef struct secondary_ *secondary;
typedef struct secList_ *secList;
typedef struct primary_ *primary;
typedef struct priList_ *priList;
typedef enum muldiv_ muldiv;
typedef enum addsub_ addsub;
enum comparison_ {equal, nequal, lt, gt, lteq, gteq};
enum muldiv_ {mul,divd};
enum addsub_ {add, sub};
struct statement_ {
enum{ let, ifThen, print, rem, goto_, gosub, return_, stop, for_step, for_, next} type;
union {
struct {variable var; exp e;} let_val;
struct {test t; int target;} ifThen_val;
struct {printItem item; printList lst;} print_val;
/* REM has no value */
int goto_val;
int gosub_val;
/* RETURN has no value */
/* STOP has no value */
struct {variable var; exp assign; exp to; exp step;} for_val; /* Use for both.*/
variable next_val;
} u;
};
struct exp_ {
int fHasAddSub; /* Flag */
addsub as;
secondary sec;
secList list;
};
struct variable_ {
enum { stringV, integerV} type;
char v;
};
struct printList_ {
printItem item;
printList next;
};
struct test_ {
exp a, b;
comparison c;
};
struct secondary_ {
primary p;
priList list;
};
struct secList_ {
addsub as;
secondary s;
secList next;
};
struct primary_ {
enum {bracket, var, integerP, stringP} type;
union {
exp bracket_val;
variable var_val;
int integer_val;
char *string_val;
} u;
};
struct priList_ {
muldiv md;
primary p;
priList next;
};
statement LetStatement(variable var_, exp e_);
statement IfThenStatement(test t_, int target_);
statement PrintStatement(printItem item_, printList lst_);
statement RemStatement();
statement GotoStatement(int line);
statement GosubStatement(int line);
statement ReturnStatement();
statement StopStatement();
statement ForStatement(variable var_, exp assign_, exp to_, exp step_); /* Step can be NULL */
statement NextStatement(variable var);
exp AddSubExp(addsub as_, secondary sec_, secList list_);
exp PlainExp(secondary sec_, secList list_);
variable StringVariable(char v_);
variable IntegerVariable(char v_);
printItem NewPrintItem(exp e_);
printList NewPrintList(printItem item_, printList next_);
test NewTest(exp a_, exp b_, comparison c_);
secondary NewSecondary(primary p_, priList list_);
secList NewSecList(addsub as_, secondary s_, secList next_);
primary BracketPrimary(exp e_);
primary VarPrimary (variable v_);
primary IntegerPrimary(int n_);
primary StringPrimary(char *str_);
priList NewPriList(muldiv md_, primary p_, priList next_);
absyn.c 的目的是为抽象语法树中使用到的所有结构提供初始化例程。此代码不太有趣,并且非常重复。
/*
* Abstract Syntax Trees for Very Tiny Basic
*/
#include <malloc.h>
#include "absyn.h"
#define safe_malloc malloc
statement LetStatement(variable var_, exp e_) {
statement ret = safe_malloc(sizeof(*ret));
ret->type = let;
ret->u.let_val.var = var_;
ret->u.let_val.e = e_;
return ret;
}
statement IfThenStatement(test t_, int target_) {
statement ret = safe_malloc(sizeof(*ret));
ret->type = ifThen;
ret->u.ifThen_val.t = t_;
ret->u.ifThen_val.target = target_;
return ret;
}
statement PrintStatement(printItem item_, printList lst_) {
statement ret = safe_malloc(sizeof(*ret));
ret->type = print;
ret->u.print_val.item = item_;
ret->u.print_val.lst = lst_;
return ret;
}
statement RemStatement() {
statement ret = safe_malloc(sizeof(*ret));
ret->type = rem;
return ret;
}
statement GotoStatement(int line) {
statement ret = safe_malloc(sizeof(*ret));
ret->type = goto_;
ret->u.goto_val = line;
return ret;
}
statement GosubStatement(int line) {
statement ret = safe_malloc(sizeof(*ret));
ret->type = gosub;
ret->u.gosub_val = line;
return ret;
}
statement ReturnStatement() {
statement ret = safe_malloc(sizeof(*ret));
ret->type = return_;
return ret;
}
statement StopStatement(){
statement ret = safe_malloc(sizeof(*ret));
ret->type = stop;
return ret;
}
/* Step can be NULL */
statement ForStatement(variable var_, exp assign_, exp to_, exp step_) {
statement ret = safe_malloc(sizeof(*ret));
if(step_ == NULL) {
ret->type = for_;
} else {
ret->type = for_step;
}
ret->u.for_val.var = var_;
ret->u.for_val.assign = assign_;
ret->u.for_val.to = to_;
ret->u.for_val.step = step_;
return ret;
}
statement NextStatement(variable var) {
statement ret = safe_malloc(sizeof(*ret));
ret->type = next;
ret->u.next_val = var;
return ret;
}
exp AddSubExp(addsub as_, secondary sec_, secList list_) {
exp ret = safe_malloc(sizeof(*ret));
ret->fHasAddSub = 1;
ret->as = as_;
ret->sec = sec_;
ret->list = list_;
return ret;
}
exp PlainExp(secondary sec_, secList list_) {
exp ret = safe_malloc(sizeof(*ret));
ret->fHasAddSub = 0;
ret->sec = sec_;
ret->list = list_;
return ret;
}
variable StringVariable(char v_) {
variable ret = safe_malloc(sizeof(*ret));
ret->type = stringV;
ret->v = v_;
return ret;
}
variable IntegerVariable(char v_) {
variable ret = safe_malloc(sizeof(*ret));
ret->type = integerV;
ret->v = v_;
return ret;
}
printItem NewPrintItem(exp e_) { return e_;}
printList NewPrintList(printItem item_, printList next_) {
printList ret = safe_malloc(sizeof(*ret));
ret->item = item_;
ret->next = next_;
return ret;
}
test NewTest(exp a_, exp b_, comparison c_) {
test ret = safe_malloc(sizeof(*ret));
ret->a = a_;
ret->b = b_;
ret->c = c_;
return ret;
}
secondary NewSecondary(primary p_, priList list_) {
secondary ret = safe_malloc(sizeof(*ret));
ret->p = p_;
ret->list = list_;
return ret;
}
secList NewSecList(addsub as_, secondary s_, secList next_) {
secList ret = safe_malloc(sizeof(*ret));
ret->as = as_;
ret->s = s_;
ret->next = next_;
return ret;
}
primary BracketPrimary(exp e_) {
primary ret = safe_malloc(sizeof(*ret));
ret->type = bracket;
ret->u.bracket_val = e_;
return ret;
}
primary VarPrimary (variable v_) {
primary ret = safe_malloc(sizeof(*ret));
ret->type = var;
ret->u.var_val = v_;
return ret;
}
primary IntegerPrimary(int n_) {
primary ret = safe_malloc(sizeof(*ret));
ret->type = integerP;
ret->u.integer_val = n_;
return ret;
}
primary StringPrimary(char *str_) {
primary ret = safe_malloc(sizeof(*ret));
ret->type = stringP;
ret->u.string_val = str_;
return ret;
}
priList NewPriList(muldiv md_, primary p_, priList next_) {
priList ret = safe_malloc(sizeof(*ret));
return ret;
}
由于我们使用的是标准 *nix 工具,而 *nix 上常用的构建系统是 make,因此为解释器编写 Makefile 非常有用。请记住,大多数编译器/解释器非常大,需要比本示例更高级的构建系统。它们可能需要 CVS、autoconf 和分布在不同目录中的许多 makefile。由于这个只有一个使用五个文件,所以它非常简单。
DEBUG_CFLAGS := -Wall -Wno-unknown-pragmas -Wno-format -g -DDEBUG -O0
DEBUG_LDFLAGS := -g
RELEASE_CFLAGS := -O3
RELEASE_LDFLAGS :=
#Comment out following line if you need optimization ;-)
DEBUG := YES
#Comment out following line if you're not using Windows/Dos
POSTFIX := .exe
ifeq (YES, ${DEBUG})
CFLAGS := ${DEBUG_CFLAGS}
LDFLAGS := ${DEBUG_LDFLAGS}
else
CFLAGS := ${RELEASE_CFLAGS}
LDFLAGS := ${RELEASE_LDFLAGS}
endif
.PHONY : all clean
all : vtbi${POSTFIX}
vtbi${POSTFIX} : vtbi.o y.tab.o lex.yy.o absyn.o
gcc ${LDFLAGS} ${CFLAGS} -o vtbi vtbi.o y.tab.o lex.yy.o absyn.o
clean :
-rm *.o y.tab.h y.tab.c lex.yy.c *.exe
y.tab.c : VTB.y
bison -yd VTB.y -r state
y.tab.h : y.tab.c
echo "y.tab.h created by Bison as well as y.tab.c"
lex.yy.c : VTB.l
flex VTB.l
%.o : %.c
gcc -c ${CFLAGS} $< -o $@
VTB.y - 版本 2 词法分析器现在为标记提供值,并且抽象语法树结构已编写完成。接下来,更新语法文件以从规则和语义值中构建树。所有树节点类型都被添加到联合声明中。必须为规则指定类型并返回正确的类型。
%{
#include <stdio.h>
#include "absyn.h"
int yylex(void);
void yyerror(char* str)
{
/*This should handle errors!*/
}
void addLine(int line, statement stm);
%}
%union {
/* Token Types */
int number;
char *string;
char var;
/* Abstract Syntax Tree Types */
statement stm;
exp exp;
variable variable;
printList printList;
test test;
comparison comp;
secondary sec;
secList secList;
primary pri;
priList priList;
muldiv muldiv;
addsub addsub;
}
%token <number> NUMBER
%token <string> STRING
%token <var> VAR_NUM
%token <var> VAR_STR
%token LET IF THEN PRINT REM GOTO GOSUB RETURN STOP
FOR EQUAL TO STEP NEXT SEPARATE
NEQUAL LT GT LTEQ GTEQ
ADD SUB MUL DIV
LPARAN RPARAN
%start lines
%type <stm> statement
%type <exp> exp
%type <exp> printItem
%type <variable> variable
%type <printList> printList
%type <test> test
%type <comp> comparison
%type <sec> secondary
%type <secList> secList
%type <pri> primary
%type <priList> priList
%type <muldiv> muldiv
%type <addsub> addsub
%%
lines: line lines
| /*Nothing*/
;
line: NUMBER statement {addLine($1, $2);}
;
statement: LET variable EQUAL exp {$$=LetStatement($2,$4);}
| IF test THEN NUMBER {$$=IfThenStatement($2, $4);}
| PRINT printItem printList {$$=PrintStatement($2, $3);}
| REM {$$=RemStatement();}
| GOTO NUMBER {$$=GotoStatement($2);}
| GOSUB NUMBER {$$=GosubStatement($2);}
| RETURN {$$=ReturnStatement();}
| STOP {$$=StopStatement();}
| FOR variable EQUAL exp TO exp STEP exp
{$$=ForStatement($2, $4, $6, $8);}
| FOR variable EQUAL exp TO exp
{$$=ForStatement($2, $4, $6, NULL);}
| NEXT variable {$$=NextStatement($2);}
;
variable: VAR_NUM {$$=IntegerVariable($1);}
| VAR_STR {$$=StringVariable($1);}
;
printList: SEPARATE printItem printList {$$=NewPrintList($2,$3);}
| /* Nothing */ {$$=NULL;}
;
printItem: exp {$$=NewPrintItem($1);}
/* | STRING */
;
test: exp comparison exp {$$=NewTest($1, $3, $2);}
;
comparison: EQUAL {$$=equal;}
| NEQUAL {$$=nequal;}
| LT {$$=lt;}
| GT {$$=gt;}
| LTEQ {$$=lteq;}
| GTEQ {$$=gteq;}
;
exp: addsub secondary secList {$$=AddSubExp($1, $2, $3);}
| secondary secList {$$=PlainExp($1, $2);}
;
secList: addsub secondary secList {$$=NewSecList($1, $2, $3);}
| /* Nothing */ {$$=NULL;}
;
addsub : ADD {$$=add;}
| SUB {$$=sub;}
;
secondary : primary priList {$$=NewSecondary($1, $2);}
;
priList : muldiv primary priList {$$=NewPriList($1, $2, $3);}
| /* Nothing */ {$$=NULL;}
;
muldiv : MUL {$$=mul;}
| DIV {$$=divd;}
;
primary : LPARAN exp RPARAN {$$=BracketPrimary($2);}
| variable {$$=VarPrimary($1);}
| NUMBER {$$=IntegerPrimary($1);}
| STRING {$$=StringPrimary($1);}
;
