ROSE 编译器框架/算术强度测量工具
一个用于帮助测量循环算术强度(FLOPS/内存)的工具。它通过以下方式实现:
- 静态估计用户指定循环的每次迭代中的浮点运算和加载/存储字节数
- 用语句修改循环,以捕获循环迭代次数并计算 FLOPS 和内存占用(加载/存储字节)
- 用户随后运行修改后的代码以生成最终报告。
快速信息
- 工具位置:https://github.com/rose-compiler/rose-develop/tree/master/projects/ArithmeticMeasureTool
- 测试:在相应的构建树中键入 "make check"
建议从 rose-develop 仓库获取工具以获得最新更新。
第一步是像往常一样下载并安装 rose
然后
- cd rose-build-tree/projects/ArithmeticMeasureTool
- make && make install
一个名为 measureTool 的可执行文件将被安装在 ROSE_INSTALLATION_PATH/bin 目录中
现在准备您的环境以便可以调用该工具
# set.rose file, source it to set up environment variables ROSE_INS=/home/liao6/workspace/masterDevClean/install export ROSE_INS PATH=$ROSE_INS/bin:$PATH export PATH LD_LIBRARY_PATH=$ROSE_INS/lib:$LD_LIBRARY_PATH export LD_LIBRARY_PATH
命令行选项列表
- -help:打印帮助信息
- -debug:启用调试模式,生成显示进度和内部结果的屏幕输出
- -annot your_annotation_file:接受用户指定的函数副作用标注,补充编译器分析
- -static-counting-only:一种特殊的执行模式,其中工具扫描所有循环体并将计数结果写入报告文件
- -report-file your_report_file.txt:指定您自己的报告文件名,否则将使用默认文件 ai_tool_report.txt。
- -use-algorithm-v2:在静态计数模式中使用第二个版本的算法,自下而上合成遍历以计算 FLOPS,仍在开发中
编译器分析无法确定所有函数的副作用。这可能是由于无法访问库源代码,或者源代码中指针的使用过于复杂。为了解决这个问题,该工具通过 -annot 选项接受函数副作用标注文件。
标注文件格式
operator abs(int val) { modify none; read{val}; alias none; } operator max(double val1, double val2) { modify none; read{val1, val2}; alias none; }
示例命令行
- measureTool -c -annot /path/to/functionSideEffect.annot your_input.c
静态计数模式(-static-counting-only)是一种特殊的模式:该工具仅查找所有循环并计算循环体的 FLOPS。报告的数字仅针对单次迭代。
加载/存储字节以两种方式表示
- 表达式格式:例如 3*sizeof(float) + 5*sizeof(double)
- 最终求值的整数值:52
结果写入文本报告文件。
./measureTool -c -static-counting-only -annot ../../../sourcetree/projects/ArithmeticMeasureTool/src/functionSideEffect.annot -I. ../../../sourcetree/projects/ArithmeticMeasureTool/test/jacobi.c
生成的报告摘录。注意,第 129 行的循环有两个加法 FP 操作和两个乘法操作。它加载 0 字节并存储一个双精度元素(通常为 8 字节)。因此,最终算术强度 (AI) 为 4/8 = 0.5 ops/byte
生成的报告文件内容:ai_tool_report.txt
----------Floating Point Operation Counts--------------------- SgForStatement@ /home/liao6/workspace/ExReDi/ai_tool/sourcetree/projects/ArithmeticMeasureTool/test/jacobi.c:129:10 fp_plus:2 fp_minus:0 fp_multiply:2 fp_divide:0 fp_total:4 ----------Memory Operation Counts--------------------- Loads: NULL Loads int: 0 Stores:1 * sizeof(double ) Store int: 8 ----------Arithmetic Intensity--------------------- AI=0.5
目前对 AI 特殊取值的约定如下
- 如果 AI 未初始化,则将其设置为 -1.0
- 如果除以零字节,则 AI 将设置为 9999.9
在此模式下,翻译器可以通过将结果与输入代码中的 pragma 指示的结果进行比较来验证工具生成的结果。
用户提供的 pragma 采用以下形式
#pragma aitool fp_plus(10) fp_minus(10) fp_multiply(10) fp_divide (10) fp_total(40) for () ... void error_check ( ) { int i,j; double xx,yy,temp,error; dx = 2.0 / (n-1); dy = 2.0 / (m-1); error = 0.0 ; #pragma aitool fp_plus(3) fp_minus(3) fp_multiply(6) for (i=0;i<n;i++) for (j=0;j<m;j++) { xx = -1.0 + dx * (i-1); yy = -1.0 + dy * (j-1); temp = u[i][j] - (1.0-xx*xx)*(1.0-yy*yy); error = error + temp*temp; } error = sqrt(error)/(n*m); printf("Solution Error :%E \n",error); }
fp_total 是必需的,而其他类型的 FP 操作的子句是可选的。
分析并插桩模式是该工具的默认模式。
该工具目前与用户添加的代码修改协同工作,使用以下步骤
- 使用特定变量名声明四个全局计数器,这些计数器稍后会被工具识别
- 在您想要统计浮点运算(FLOPS)和加载/存储字节的循环之前添加 chiterations = ..
- 打印结果:printf ("chflops =%lu chloads =%lu chstores=%lu\n", chflops, chloads, chstores);
1 #include <stdio.h> 2 #define SIZE 10 3 4 // Instrumentation 1: add a few global variables 5 unsigned long int chiterations = 0; 6 unsigned long int chloads = 0; 7 unsigned long int chstores = 0; 8 unsigned long int chflops = 0; 9 10 double ref[2] = {9.2, 5.4}; 11 double coarse[SIZE][SIZE][SIZE]; 12 int main() 13 { 14 double refScale = 1.0 / (ref[0] * ref[1]); 15 int iboxlo1 = 0, iboxlo0 = 0, iboxhi1 = SIZE-1, iboxhi0 = SIZE-1; 16 int var; 17 int ic1=0, ic0=0; 18 int ip0 = ic0 * ref[0]; 19 int ip1 = ic1 * ref[1]; 20 double coarseSum = 0.0; 21 int ii1, ii0; 22 23 for (var =0; var < SIZE ; var++) 24 { 25 //Instrumentation 2: pass in loop iteration for the loop to be counted 26 chiterations = (1 + iboxhi1 - iboxlo1) * (1 + iboxhi0 - iboxlo0); 27 for (ic1 = iboxlo1; ic1< iboxhi1 +1; ic1++) 28 for (ic0 = iboxlo0; ic0< iboxhi0 +1; ic0++) 29 { 30 int ibreflo1 = 0, ibreflo0 = 0, ibrefhi1 = SIZE-1, ibrefhi0 = SIZE-1; 31 //Instrumentation 3: pass in loop iteration for the loop to be counted 32 chiterations = (1 + ibrefhi1 - ibreflo1) * (1 + ibrefhi0 - ibreflo0); 33 for (ii1 = ibreflo1; ii1< ibrefhi1 +1; ii1++) 34 for (ii0 = ibreflo0; ii0< ibrefhi0 +1; ii0++) 35 { 36 coarseSum = coarseSum + coarse[ii1][ii0][ii1] +(ip0 + ii0) + (ip1 + ii1) + var; 37 } 38 coarse[ic0][ic1][var] = coarseSum * refScale; 39 } 40 } 41 //Instrumentation 4: print out results 42 printf ("chflops =%lu chloads =%lu chstores=%lu\n", chflops, chloads, chstores); 43 return 0; 44 }
./measureTool -c -annot ../../../sourcetree/projects/ArithmeticMeasureTool/src/functionSideEffect.annot nestedloops.c
该工具将
- 计算指定循环的 FLOPS 和加载存储字节
- 添加计数器累加语句,为不同的循环使用不同的计数器
1 #include <stdio.h> 2 #define SIZE 10 3 // Instrumentation 1: add a few global variables 4 unsigned long chiterations = 0; 5 unsigned long chloads = 0; 6 unsigned long chstores = 0; 7 unsigned long chflops = 0; 8 double ref[2] = {(9.2), (5.4)}; 9 double coarse[10][10][10]; 10 11 int main() 12 { 13 double refScale = 1.0 / (ref[0] * ref[1]); 14 int iboxlo1 = 0; 15 int iboxlo0 = 0; 16 int iboxhi1 = 10 - 1; 17 int iboxhi0 = 10 - 1; 18 int var; 19 int ic1 = 0; 20 int ic0 = 0; 21 int ip0 = (ic0 * ref[0]); 22 int ip1 = (ic1 * ref[1]); 23 double coarseSum = 0.0; 24 int ii1; 25 int ii0; 26 unsigned long chiterations_1; 27 unsigned long chiterations_2; 28 for (var = 0; var < 10; var++) { 29 //Instrumentation 2: pass in loop iteration for the loop to be counted 30 chiterations_2 = (1 + iboxhi1 - iboxlo1) * (1 + iboxhi0 - iboxlo0); 31 for (ic1 = iboxlo1; ic1 < iboxhi1 + 1; ic1++) { 32 for (ic0 = iboxlo0; ic0 < iboxhi0 + 1; ic0++) { 33 int ibreflo1 = 0; 34 int ibreflo0 = 0; 35 int ibrefhi1 = 10 - 1; 36 int ibrefhi0 = 10 - 1; 37 //Instrumentation 3: pass in loop iteration for the loop to be counted 38 chiterations_1 = (1 + ibrefhi1 - ibreflo1) * (1 + ibrefhi0 - ibreflo0); 39 for (ii1 = ibreflo1; ii1 < ibrefhi1 + 1; ii1++) { 40 for (ii0 = ibreflo0; ii0 < ibrefhi0 + 1; ii0++) { 41 coarseSum = coarseSum + coarse[ii1][ii0][ii1] + (ip0 + ii0) + (ip1 + ii1) + var; 42 } 43 } 44 /* aitool generated Loads counting statement ... */ 45 chloads = chloads + chiterations_1 * (1 * sizeof(double )); 46 /* aitool generated FLOPS counting statement ... */ 47 chflops = chflops + chiterations_1 * 4; 48 coarse[ic0][ic1][var] = coarseSum * refScale; 49 } 50 } 51 /* aitool generated Stores counting statement ... */ 52 chstores = chstores + chiterations_2 * (1 * sizeof(double )); 53 /* aitool generated FLOPS counting statement ... 
*/ 54 chflops = chflops + chiterations_2 * 1; 55 } 56 //Instrumentation 4: pass in loop iteration for the loop to be counted 57 printf("chflops =%lu chloads =%lu chstores=%lu\n",chflops,chloads,chstores); 58 return 0; 59 }
gcc -O3 rose_nestedloops.c -o nestedloops.out
./nestedloops.out
结果看起来像
chflops =401000 chloads =800000 chstores=8000
该工具目前不支持带有函数调用的Fortran循环
- ROSE的Fortran过程/例程表示不够准确(缺少参数类型信息),无法与为匹配C/C++函数而设计的函数副作用标注挂钩。
执行模式变量 running_mode
- e_analysis_and_instrument
- e_static_counting
class FPCounters: public AstAttribute {}; 用于存储分析结果
void CountFPOperations() 来自src/ai_measurement.cpp
Rose_STL_Container<SgNode*> nodeList = NodeQuery::querySubTree(input, V_SgBinaryOp); for (Rose_STL_Container<SgNode *>::iterator i = nodeList.begin(); i != nodeList.end(); i++) { fp_operation_kind_enum op_kind = e_unknown; // bool isFPType = false; // check operation type SgBinaryOp* bop= isSgBinaryOp(*i); switch (bop->variantT()) { case V_SgAddOp: case V_SgPlusAssignOp: op_kind = e_plus; break; case V_SgSubtractOp: case V_SgMinusAssignOp: op_kind = e_minus; break; case V_SgMultiplyOp: case V_SgMultAssignOp: op_kind = e_multiply; break; case V_SgDivideOp: case V_SgDivAssignOp: op_kind = e_divide; break; default: break; } //end switch ... }
主要函数定义在ai_measurement.cpp中
- std::pair <SgExpression*, SgExpression*> CountLoadStoreBytes (SgLocatedNode* input, bool includeScalars /* = true */, bool includeIntType /* = true */)
- SgExpression* calculateBytes (std::set<SgInitializedName*>& name_set, SgStatement* lbody, bool isRead)
返回用于计算值的表达式,而不是实际值,因为sizeof(type)是机器相关的。
配置
- 默认情况下:仅计算数组引用。标量被忽略。
算法
- 调用副作用分析以查找读/写变量,某些引用可能会同时触发读和写访问。如果分析成功,则继续。否则会发出警告。
- 对同一个数组/标量变量的访问被归类为一个读访问(或写访问):例如array[i][j]、array[i][j+1]、array[i][j-1]等被计为单个访问
- 根据类型对访问进行分组:相同类型的访问->增加相同的计数器以缩短表达式的长度
- 迭代结果以生成类似2*sizeof(float) + 5* sizeof(double)的表达式
- 作为一种近似,这里使用不考虑函数调用的简单分析。
// Obtain per-iteration load/store bytes calculation expressions // excluding scalar types to match the manual version //CountLoadStoreBytes (SgLocatedNode* input, bool includeScalars = true, bool includeIntType = true); std::pair <SgExpression*, SgExpression*> load_store_count_pair = CountLoadStoreBytes (loop_body, false, true); // chstores=chstores+chiterations*8 if (load_store_count_pair.second!= NULL) { SgExprStatement* store_byte_stmt = buildCounterAccumulationStmt("chstores", new_iter_var_name, load_store_count_pair.second, scope); insertStatementAfter (loop, store_byte_stmt); attachComment(store_byte_stmt," aitool generated Stores counting statement ..."); } // handle loads stmt 2nd so it can be inserted as the first after the loop // build chloads=chloads+chiterations*2*8 if (load_store_count_pair.first != NULL) { SgExprStatement* load_byte_stmt = buildCounterAccumulationStmt("chloads", new_iter_var_name, load_store_count_pair.first, scope); insertStatementAfter (loop, load_byte_stmt); attachComment(load_byte_stmt," aitool generated Loads counting statement ..."); }
科学应用通常具有嵌套循环。简单的插桩会造成两个问题
- 对嵌套循环体进行双重计数
- chiterations= .. 语句用于所有级别的循环。内部循环的chiterations将覆盖用于指示外部循环的chiterations。
解决方案
- 翻译器使用自下而上的遍历顺序:首先处理内部循环,然后处理外部循环。
- 为了避免在嵌套循环内对FP操作进行双重计数:所有已访问的FP操作表达式都被存储到一个查找表中。后面的计数将检查操作是否已被计入。如果是,则跳过。
- 为了避免在计算外部循环体时对嵌套循环中使用的变量进行双重计数:这与FP运算表达式的处理略有不同。在这里,我们找到在内部循环中已计数的所有变量,并在对外部循环进行计数时将其排除在外。注意:这些变量被完全排除,而不是仅标记对某个变量的一次引用并在稍后只排除这一次引用。
- 注意:静态计数模式不会进行这种排除,因为冗余执行的假设不再是问题。如果存在嵌套循环,我们仍然会分别为内部循环和外部循环计算循环体的FLOPS。
- 将chiterations=改写为chiterations_loopId= .. ,以便每个循环都有自己的迭代次数变量。
// global chiterations is changed to two local variables: each for one loop unsigned long chiterations_1; unsigned long chiterations_2; for (var = 0; var < 10; var++) { //Instrumentation 2: pass in loop iteration for the loop to be counted chiterations_2 = ((1 + iboxhi1 - iboxlo1) * (1 + iboxhi0 - iboxlo0) * 1); for (ic1 = iboxlo1; ic1 < iboxhi1 + 1; ic1++) { for (ic0 = iboxlo0; ic0 < iboxhi0 + 1; ic0++) { int ibreflo1 = 0; int ibreflo0 = 0; int ibrefhi1 = 10 - 1; int ibrefhi0 = 10 - 1; //Instrumentation 3: pass in loop iteration for the loop to be counted chiterations_1 = ((1 + ibrefhi1 - ibreflo1) * (1 + ibrefhi0 - ibreflo0) * 1); for (ii1 = ibreflo1; ii1 < ibrefhi1 + 1; ii1++) { for (ii0 = ibreflo0; ii0 < ibrefhi0 + 1; ii0++) { coarseSum = coarseSum + coarse[ii1][ii0][ii1] + (ip0 + ii0) + (ip1 + ii1) + var; } } /* aitool generated Loads counting statement ... */ chloads = chloads + chiterations_1 * (1 * sizeof(double )); /* aitool generated FLOPS counting statement ... */ chflops = chflops + chiterations_1 * 4; coarse[ic0][ic1][var] = coarseSum * refScale; } } /* aitool generated Stores counting statement ... */ chstores = chstores + chiterations_2 * (1 * sizeof(double )); /* aitool generated FLOPS counting statement ... */ chflops = chflops + chiterations_2 * 1; }
运行所有内置测试
- make check
仅运行静态分析的测试
- make check-static
手动测试
- [liao6@tux322:~/workspace/ExReDi/ai_tool.git/translator]m && ./measureTool -c -annot ./src/functionSideEffect.annot -I. ./test/jacobi-v3.c