编译器构造/案例研究 1B
这是一个正在进行中的案例研究,现在已链接到主页面(完成后)。User:matt73 正在研究这个。 |
本案例研究的目的是提供一个使用 Lex 和 Yacc 在 C 中编写的编译器/解释器前端的示例。使用解释器是因为它允许在最少的额外工作(在前端构建之后)的情况下创建工作程序。通过用编译器后端替换最后一阶段,可以将此代码开发成编译器。
代码按突出显示创建编译器过程的顺序显示,因此同一个文件将在开发过程中多次显示。
本案例研究开发了一个针对 Very Tiny Basic 的解释器,该解释器在本书的案例研究 1部分中进行了说明。
以下步骤将用于完成解释器
- 词法分析
- 语法分析
- 词法分析与语义
- 抽象语法树
- 生成抽象语法树
- 解释
为了简洁和简单起见,许多有用的编译器/解释器的重要功能已被省略,包括
- 处理错误
- 优化
此外,某些过程不适用于 Very Tiny Basic 这样简单的语言,例如名称表不适用,因为只使用单个字符来命名变量。
要求
lex 文件的第一稿通过在关联的 C 操作中返回特定值来识别不同的标记。对于关键字和运算符来说这很简单,但是标识符、值和注释则更复杂。
VTB.l - 版本 1
%{ %} DIGIT [0-9] LETTER [A-Z] %% "LET" {return LET;} "IF" {return IF;} "THEN" {return THEN;} "PRINT" {return PRINT;} "REM"[^\n]* {return REM;} "GOTO" {return GOTO;} "GOSUB" {return GOSUB;} "RETURN" {return RETURN;} "STOP" {return STOP;} "FOR" {return FOR;} "TO" {return TO;} "STEP" {return STEP;} "NEXT" {return NEXT;} \"[^\"]\" {return STRING;} "=" {return EQUAL;} "<>" {return NEQUAL;} %%
可以使用以下方法之一处理此文件
lex VTB.l flex VTB.l
您可能想知道所有这些返回的值从何而来。它们将在 Yacc 语法文件处理时创建。
与案例研究 1中的Very Tiny Basic - 规范相比,存在一些差异,例如 MULT 和 DIV 代替 MULDIV。这是因为我们需要区分这两个值。LineNumber 和 WholeNumber 在词法上是相同的,因此目前无法分离。定义标记的使用和类别留到下一阶段。
VTB.y - 版本 1
%{ int yylex(void); void yyerror(char* str) { /*This should handle errors!*/ } %} %union { int number; char *string; char var; } %token <number> NUMBER %token <string> STRING %token <var> VAR_NUM %token <var> VAR_STR %token LET IF THEN PRINT REM GOTO GOSUB RETURN STOP FOR EQUAL TO STEP NEXT SEPARATE NEQUAL LT GT LTEQ GTEQ ADD SUB MUL DIV LPARAN RPARAN %start lines %% lines: line lines | /*Nothing*/ ; line: NUMBER statement ; statement: LET variable EQUAL exp | IF test THEN NUMBER | PRINT printItem printList | REM | GOTO NUMBER | GOSUB NUMBER | RETURN | STOP | FOR variable EQUAL exp TO exp STEP exp | FOR variable EQUAL exp TO exp | NEXT variable ; variable: VAR_NUM | VAR_STR ; printList: SEPARATE printItem | /* Nothing */ ; printItem: exp /* | STRING */ ; test: exp comparison exp ; comparison: EQUAL | NEQUAL | LT | GT | LTEQ | GTEQ ; exp: addsub secondary secList | secondary secList ; secList: addsub secondary | /* Nothing */ ; addsub : ADD | SUB ; secondary : primary priList ; priList : muldiv primary | /* Nothing */ ; muldiv : MUL | DIV ; primary : LPARAN exp RPARAN | variable | NUMBER | STRING ;
在此版本的词法分析器中,包含了由 yacc/bison 生成的头文件。头文件定义了返回值和用于存储标记语义值的联合。这是根据语法文件的%token 声明和%union 部分创建的。词法分析器现在从某些类型的标记中提取值,并将这些值存储在yylval 联合中。
VTB.l - 版本 2
%{ #include "y.tab.h" #include <string.h> #include <malloc.h> %} DIGIT [0-9] LETTER [A-Z] %% "LET" {return LET;} "IF" {return IF;} "THEN" {return THEN;} "PRINT" {return PRINT;} "REM"[^\n]* {return REM;} "GOTO" {return GOTO;} "GOSUB" {return GOSUB;} "RETURN" {return RETURN;} "STOP" {return STOP;} "FOR" {return FOR;} "TO" {return TO;} "STEP" {return STEP;} "NEXT" {return NEXT;} \"[^\"]\" {yylval.string=malloc(strlen(yytext)+1); strcpy(yylval.string,yytext); return STRING;} "=" {return EQUAL;} "<>" {return NEQUAL;} "<" {return LT;} ">" {return GT;} "<=" {return LTEQ;} ">=" {return GTEQ;} {LETTER} {yylval.var=yytext[0]; return VAR_NUM;} {LETTER}$ {yylval.var=yytext[0]; return VAR_STR;} {DIGIT}+ {yylval.number=atoi(yytext); return NUMBER;} "," | ";" {return SEPARATE;} "+" {return ADD;} "-" {return SUB;} "*" {return MUL;} "/" {return DIV;} "(" {return LPARAN;} ")" {return RPARAN;} . {/*This is an error!*/} %%
抽象语法树 是使用数据结构在内存中创建的代码的中间表示。语言的语法结构(已定义并已作为 YACC 语法文件写下)被转换为树结构。使用 YACC & C,这意味着大多数语法规则和标记都变成了节点。注释是一个没有放入树中的示例。
在结构中,操作数的组合是清晰的,因此不需要在树中存在括号之类的标记。对于结束代码块的标记也是如此。这是可能的,因为语法文件中的规则可以使用这些标记以正确的形状创建树。
抽象语法树中组合的说明
(1+3)*4 1+3*4 * + / \ / \ 4 + 1 * / \ / \ 1 3 3 4
在此解释器中,可以通过将主/次表达式结构折叠来丢弃它们。这会增加代码的复杂性,因此目前没有实现。Rem 语句也被保留(没有注释文本),因为 VTB 的定义意味着它们是 Goto 的有效目标。实际上,跳转到不存在的行是未定义的,因此此解释器将发出错误。
对于树结构,使用指针和动态分配是必须的,否则它将是无限大的,因此会使用太多资源(仔细想想)。为了保持代码整洁,
/* * Abstract Syntax Trees for Very Tiny Basic */ typedef struct statement_ *statement; typedef struct exp_ *exp; typedef struct variable_ *variable; typedef exp printItem; /*Not required since it is always an expression.*/ typedef struct printList_ *printList; typedef struct test_ *test; typedef enum comparison_ comparison; typedef struct secondary_ *secondary; typedef struct secList_ *secList; typedef struct primary_ *primary; typedef struct priList_ *priList; typedef enum muldiv_ muldiv; typedef enum addsub_ addsub; enum comparison_ {equal, nequal, lt, gt, lteq, gteq}; enum muldiv_ {mul,divd}; enum addsub_ {add, sub}; struct statement_ { enum{ let, ifThen, print, rem, goto_, gosub, return_, stop, for_step, for_, next} type; union { struct {variable var; exp e;} let_val; struct {test t; int target;} ifThen_val; struct {printItem item; printList lst;} print_val; /* REM has no value */ int goto_val; int gosub_val; /* RETURN has no value */ /* STOP has no value */ struct {variable var; exp assign; exp to; exp step;} for_val; /* Use for both.*/ variable next_val; } u; }; struct exp_ { int fHasAddSub; /* Flag */ addsub as; secondary sec; secList list; }; struct variable_ { enum { stringV, integerV} type; char v; }; struct printList_ { printItem item; printList next; }; struct test_ { exp a, b; comparison c; }; struct secondary_ { primary p; priList list; }; struct secList_ { addsub as; secondary s; secList next; }; struct primary_ { enum {bracket, var, integerP, stringP} type; union { exp bracket_val; variable var_val; int integer_val; char *string_val; } u; }; struct priList_ { muldiv md; primary p; priList next; }; statement LetStatement(variable var_, exp e_); statement IfThenStatement(test t_, int target_); statement PrintStatement(printItem item_, printList lst_); statement RemStatement(); statement GotoStatement(int line); statement GosubStatement(int line); statement ReturnStatement(); statement StopStatement(); statement ForStatement(variable var_, exp assign_, exp to_, exp step_); /* Step can be NULL */ statement NextStatement(variable var); exp AddSubExp(addsub as_, secondary sec_, secList list_); exp PlainExp(secondary sec_, secList list_); variable StringVariable(char v_); variable IntegerVariable(char v_); printItem NewPrintItem(exp e_); printList NewPrintList(printItem item_, printList next_); test NewTest(exp a_, exp b_, comparison c_); secondary NewSecondary(primary p_, priList list_); secList NewSecList(addsub as_, secondary s_, secList next_); primary BracketPrimary(exp e_); primary VarPrimary (variable v_); primary IntegerPrimary(int n_); primary StringPrimary(char *str_); priList NewPriList(muldiv md_, primary p_, priList next_);
absyn.c 的目的是为抽象语法树中使用到的所有结构提供初始化例程。此代码不太有趣,并且非常重复。
/* * Abstract Syntax Trees for Very Tiny Basic */ #include <malloc.h> #include "absyn.h" #define safe_malloc malloc statement LetStatement(variable var_, exp e_) { statement ret = safe_malloc(sizeof(*ret)); ret->type = let; ret->u.let_val.var = var_; ret->u.let_val.e = e_; return ret; } statement IfThenStatement(test t_, int target_) { statement ret = safe_malloc(sizeof(*ret)); ret->type = ifThen; ret->u.ifThen_val.t = t_; ret->u.ifThen_val.target = target_; return ret; } statement PrintStatement(printItem item_, printList lst_) { statement ret = safe_malloc(sizeof(*ret)); ret->type = print; ret->u.print_val.item = item_; ret->u.print_val.lst = lst_; return ret; } statement RemStatement() { statement ret = safe_malloc(sizeof(*ret)); ret->type = rem; return ret; } statement GotoStatement(int line) { statement ret = safe_malloc(sizeof(*ret)); ret->type = goto_; ret->u.goto_val = line; return ret; } statement GosubStatement(int line) { statement ret = safe_malloc(sizeof(*ret)); ret->type = gosub; ret->u.gosub_val = line; return ret; } statement ReturnStatement() { statement ret = safe_malloc(sizeof(*ret)); ret->type = return_; return ret; } statement StopStatement(){ statement ret = safe_malloc(sizeof(*ret)); ret->type = stop; return ret; } /* Step can be NULL */ statement ForStatement(variable var_, exp assign_, exp to_, exp step_) { statement ret = safe_malloc(sizeof(*ret)); if(step_ == NULL) { ret->type = for_; } else { ret->type = for_step; } ret->u.for_val.var = var_; ret->u.for_val.assign = assign_; ret->u.for_val.to = to_; ret->u.for_val.step = step_; return ret; } statement NextStatement(variable var) { statement ret = safe_malloc(sizeof(*ret)); ret->type = next; ret->u.next_val = var; return ret; } exp AddSubExp(addsub as_, secondary sec_, secList list_) { exp ret = safe_malloc(sizeof(*ret)); ret->fHasAddSub = 1; ret->as = as_; ret->sec = sec_; ret->list = list_; return ret; } exp PlainExp(secondary sec_, secList list_) { exp ret = safe_malloc(sizeof(*ret)); ret->fHasAddSub = 0; ret->sec = sec_; ret->list = list_; return ret; } variable StringVariable(char v_) { variable ret = safe_malloc(sizeof(*ret)); ret->type = stringV; ret->v = v_; return ret; } variable IntegerVariable(char v_) { variable ret = safe_malloc(sizeof(*ret)); ret->type = integerV; ret->v = v_; return ret; } printItem NewPrintItem(exp e_) { return e_;} printList NewPrintList(printItem item_, printList next_) { printList ret = safe_malloc(sizeof(*ret)); ret->item = item_; ret->next = next_; return ret; } test NewTest(exp a_, exp b_, comparison c_) { test ret = safe_malloc(sizeof(*ret)); ret->a = a_; ret->b = b_; ret->c = c_; return ret; } secondary NewSecondary(primary p_, priList list_) { secondary ret = safe_malloc(sizeof(*ret)); ret->p = p_; ret->list = list_; return ret; } secList NewSecList(addsub as_, secondary s_, secList next_) { secList ret = safe_malloc(sizeof(*ret)); ret->as = as_; ret->s = s_; ret->next = next_; return ret; } primary BracketPrimary(exp e_) { primary ret = safe_malloc(sizeof(*ret)); ret->type = bracket; ret->u.bracket_val = e_; return ret; } primary VarPrimary (variable v_) { primary ret = safe_malloc(sizeof(*ret)); ret->type = var; ret->u.var_val = v_; return ret; } primary IntegerPrimary(int n_) { primary ret = safe_malloc(sizeof(*ret)); ret->type = integerP; ret->u.integer_val = n_; return ret; } primary StringPrimary(char *str_) { primary ret = safe_malloc(sizeof(*ret)); ret->type = stringP; ret->u.string_val = str_; return ret; } priList NewPriList(muldiv md_, primary p_, priList next_) { priList ret = safe_malloc(sizeof(*ret)); return ret; }
由于我们使用的是标准 *nix 工具,而 *nix 上常用的构建系统是 make,因此为解释器编写 Makefile 非常有用。请记住,大多数编译器/解释器非常大,需要比本示例更高级的构建系统。它们可能需要 CVS、autoconf 和分布在不同目录中的许多 makefile。由于这个只有一个使用五个文件,所以它非常简单。
DEBUG_CFLAGS := -Wall -Wno-unknown-pragmas -Wno-format -g -DDEBUG -O0 DEBUG_LDFLAGS := -g RELEASE_CFLAGS := -O3 RELEASE_LDFLAGS := #Comment out following line if you need optimization ;-) DEBUG := YES #Comment out following line if you're not using Windows/Dos POSTFIX := .exe ifeq (YES, ${DEBUG}) CFLAGS := ${DEBUG_CFLAGS} LDFLAGS := ${DEBUG_LDFLAGS} else CFLAGS := ${RELEASE_CFLAGS} LDFLAGS := ${RELEASE_LDFLAGS} endif .PHONY : all clean all : vtbi${POSTFIX} vtbi${POSTFIX} : vtbi.o y.tab.o lex.yy.o absyn.o gcc ${LDFLAGS} ${CFLAGS} -o vtbi vtbi.o y.tab.o lex.yy.o absyn.o clean : -rm *.o y.tab.h y.tab.c lex.yy.c *.exe y.tab.c : VTB.y bison -yd VTB.y -r state y.tab.h : y.tab.c echo "y.tab.h created by Bison as well as y.tab.c" lex.yy.c : VTB.l flex VTB.l %.o : %.c gcc -c ${CFLAGS} $< -o $@
VTB.y - 版本 2 词法分析器现在为标记提供值,并且抽象语法树结构已编写完成。接下来,更新语法文件以从规则和语义值中构建树。所有树节点类型都被添加到联合声明中。必须为规则指定类型并返回正确的类型。
%{ #include <stdio.h> #include "absyn.h" int yylex(void); void yyerror(char* str) { /*This should handle errors!*/ } void addLine(int line, statement stm); %} %union { /* Token Types */ int number; char *string; char var; /* Abstract Syntax Tree Types */ statement stm; exp exp; variable variable; printList printList; test test; comparison comp; secondary sec; secList secList; primary pri; priList priList; muldiv muldiv; addsub addsub; } %token <number> NUMBER %token <string> STRING %token <var> VAR_NUM %token <var> VAR_STR %token LET IF THEN PRINT REM GOTO GOSUB RETURN STOP FOR EQUAL TO STEP NEXT SEPARATE NEQUAL LT GT LTEQ GTEQ ADD SUB MUL DIV LPARAN RPARAN %start lines %type <stm> statement %type <exp> exp %type <exp> printItem %type <variable> variable %type <printList> printList %type <test> test %type <comp> comparison %type <sec> secondary %type <secList> secList %type <pri> primary %type <priList> priList %type <muldiv> muldiv %type <addsub> addsub %% lines: line lines | /*Nothing*/ ; line: NUMBER statement {addLine($1, $2);} ; statement: LET variable EQUAL exp {$$=LetStatement($2,$4);} | IF test THEN NUMBER {$$=IfThenStatement($2, $4);} | PRINT printItem printList {$$=PrintStatement($2, $3);} | REM {$$=RemStatement();} | GOTO NUMBER {$$=GotoStatement($2);} | GOSUB NUMBER {$$=GosubStatement($2);} | RETURN {$$=ReturnStatement();} | STOP {$$=StopStatement();} | FOR variable EQUAL exp TO exp STEP exp {$$=ForStatement($2, $4, $6, $8);} | FOR variable EQUAL exp TO exp {$$=ForStatement($2, $4, $6, NULL);} | NEXT variable {$$=NextStatement($2);} ; variable: VAR_NUM {$$=IntegerVariable($1);} | VAR_STR {$$=StringVariable($1);} ; printList: SEPARATE printItem printList {$$=NewPrintList($2,$3);} | /* Nothing */ {$$=NULL;} ; printItem: exp {$$=NewPrintItem($1);} /* | STRING */ ; test: exp comparison exp {$$=NewTest($1, $3, $2);} ; comparison: EQUAL {$$=equal;} | NEQUAL {$$=nequal;} | LT {$$=lt;} | GT {$$=gt;} | LTEQ {$$=lteq;} | GTEQ {$$=gteq;} ; exp: addsub secondary secList {$$=AddSubExp($1, $2, $3);} | secondary secList {$$=PlainExp($1, $2);} ; secList: addsub secondary secList {$$=NewSecList($1, $2, $3);} | /* Nothing */ {$$=NULL;} ; addsub : ADD {$$=add;} | SUB {$$=sub;} ; secondary : primary priList {$$=NewSecondary($1, $2);} ; priList : muldiv primary priList {$$=NewPriList($1, $2, $3);} | /* Nothing */ {$$=NULL;} ; muldiv : MUL {$$=mul;} | DIV {$$=divd;} ; primary : LPARAN exp RPARAN {$$=BracketPrimary($2);} | variable {$$=VarPrimary($1);} | NUMBER {$$=IntegerPrimary($1);} | STRING {$$=StringPrimary($1);} ;