| | import os |
| | |
| | |
| | |
| | import json |
| | import time |
| | import csv |
| | import pathlib |
| | import difflib |
| | import re |
| | from bleu import _bleu |
| | from fuzzywuzzy import fuzz |
| | import random |
| | import numpy as np |
| | from transformers import RobertaTokenizer |
| | |
| |
|
# Directory that contains this script; every other path is built from it.
folder = str(pathlib.Path(__file__).parent.resolve())
# Dataset root; get_target_clf_list reads comback_isa_type.csv from here.
isa_type_dir = "/".join((folder, "..", "..", "..", "Dataset"))
# Per-compiler sub-directories with one <target>.jsonl file per target.
src_dir = "/".join((isa_type_dir, "Code_Generation"))
# Output directory for result.csv and the BLEU scratch files.
dst_dir = "/".join((folder, "Result"))
| |
|
# Three independent, initially empty module-level lists.
train_lis, valid_lis, test_lis = [], [], []
| |
|
target_clf = {}


def get_target_clf_list(csv_path=None):
    """Fill the module-level ``target_clf`` mapping from the ISA-type CSV.

    Each CSV row is read as ``(column 0, column 1, column 2)``. Rows whose
    column 1 is ``arc``, ``riscv`` or ``nvptx`` (case-insensitive) are left
    out; every other row appends column 1 to the list stored under the key
    ``"<column 0> <column 2>"``. ``Calculate_Forkflow`` looks these keys up
    as ``"<compiler type> <ISA type>"`` and uses the values as target names.

    Args:
        csv_path: CSV file to read. Defaults to
            ``isa_type_dir + "/comback_isa_type.csv"``.

    Returns:
        None. ``target_clf`` is rebuilt in place, so calling this function
        more than once does not duplicate entries.
    """
    if csv_path is None:
        csv_path = isa_type_dir + "/comback_isa_type.csv"

    excluded_targets = ("arc", "riscv", "nvptx")

    # Start from a clean mapping so repeated calls stay idempotent.
    target_clf.clear()
    # newline="" is what the csv module expects for files it parses itself.
    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        for row in csv.reader(f):
            # csv.reader yields [] for blank lines; skip those and any row
            # that lacks the three columns used below.
            if len(row) < 3:
                continue
            if row[1].lower() in excluded_targets:
                continue
            target_clf.setdefault(row[0] + " " + row[2], []).append(row[1])
| |
|
| |
|
# Tokens after which a statement line is considered finished.
_STATEMENT_ENDERS = (";", ":", "{", "}")


def _join_tokens_as_statements(tokens, target_name):
    """Concatenate *tokens* into text with one statement per line.

    *target_name* is removed from every token, tokens are joined without a
    separator, and a newline is inserted after each statement-ending token.

    Returns:
        ``(text, statement_count)`` where ``statement_count`` is the number
        of statement-ending tokens seen.
    """
    pieces = []
    statement_count = 0
    for token in tokens:
        pieces.append(token.replace(target_name, ""))
        if token in _STATEMENT_ENDERS:
            pieces.append("\n")
            statement_count += 1
    return "".join(pieces), statement_count


def Calculate_Statements_Ratio(Src_List, Fork_Lis, src_name, fork_name):
    """Return the share of source statements that appear unchanged in the fork.

    Both token lists are turned into statement-per-line text (with their own
    target name stripped) and compared with ``difflib.Differ``. Lines that
    the diff reports as common to both sides, and that are not blank, count
    as "same".

    Args:
        Src_List: Token list of the source function.
        Fork_Lis: Token list of the fork function (may be empty).
        src_name: Target name removed from every source token.
        fork_name: Target name removed from every fork token.

    Returns:
        ``same_statements / source_statement_count`` rounded to 2 decimals,
        or ``0.0`` when the source contains no statement-ending token.
        Tokens after the last statement-ending token form a diff line of
        their own but are not part of the denominator.
    """
    src_code, cnt_stmt = _join_tokens_as_statements(Src_List, src_name)
    # The fork list is walked from its first token; the previous version kept
    # the source loop's index and therefore skipped the start of the fork.
    fork_code, _ = _join_tokens_as_statements(Fork_Lis, fork_name)

    if cnt_stmt == 0:
        # Nothing to compare against; avoid dividing by zero.
        return 0.0

    code_same = 0
    differ = difflib.Differ()
    for diff_line in differ.compare(src_code.splitlines(), fork_code.splitlines()):
        # "-" / "+" mark lines unique to one side, "?" marks hint lines.
        if diff_line[0] in ("-", "+", "?"):
            continue
        if diff_line.strip() == "":
            continue
        code_same += 1
    return round(float(code_same) / cnt_stmt, 2)
| |
|
| |
|
| |
|
# Held-out test target for each compiler / ISA type, spelled the way the
# dataset files are named (lower case under GCC, upper case under LLVM).
_TEST_TARGET_NAMES = {
    "GCC": {"CPU": "riscv", "GPU": "nvptx", "MPU": "arc"},
    "LLVM": {"CPU": "RISCV", "GPU": "NVPTX", "MPU": "ARC"},
}

# Fork candidates that are skipped for a given (compiler, ISA type) pair.
_SKIPPED_FORK_TARGETS = {("LLVM", "CPU"): ("RI5CY",)}


def _load_jsonl(path):
    """Return the decoded JSON object of every non-blank line in *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _rounded_mean(total, count):
    """Return ``total / count`` rounded to 2 places, or 0.0 if *count* is 0."""
    if count == 0:
        return 0.0
    return round(float(total) / count, 2)


def Calculate_Forkflow():
    """Run experiment 1: score every existing target as a fork source.

    For each compiler type (GCC, LLVM) and ISA type (GPU, MPU, CPU) the
    held-out test target (nvptx / arc / riscv) is loaded from
    ``src_dir/<compiler>/<target>.jsonl``. Every other target registered for
    that pair in ``target_clf`` is then compared against it function by
    function (functions are matched on their ``Func`` name with the target
    name removed; a missing function is compared as empty code).

    Per fork target one row is appended to ``dst_dir/result.csv`` holding the
    mean BLEU-4 (computed by ``_bleu`` from two scratch files), the exact
    match percentage, the mean ``fuzz.ratio`` score and the mean statement
    ratio in percent. Per compiler/ISA pair the average and maximum
    ``fuzz.ratio`` and BLEU-4 values are printed.

    Returns:
        None.
    """
    get_target_clf_list()
    print("############## Exp 1: Calculate Fork-Flow ################\n")

    # The scratch files and result.csv all live in dst_dir.
    os.makedirs(dst_dir, exist_ok=True)
    gold_path = dst_dir + "/test.gold"
    output_path = dst_dir + "/test.output"

    for comp_type in ["GCC", "LLVM"]:
        for isa_type in ["GPU", "MPU", "CPU"]:
            max_ed = 0
            avg_ed = 0
            max_bleu4 = 0
            avg_bleu4 = 0
            avg_cnt = 0

            test_name = _TEST_TARGET_NAMES[comp_type][isa_type]
            skipped = _SKIPPED_FORK_TARGETS.get((comp_type, isa_type), ())
            # A pair without registered targets simply yields no rows.
            target_lis = target_clf.get(comp_type + " " + isa_type, [])

            # (function name without target name, token list) per test
            # function, in file order; duplicates are kept on purpose.
            test_funcs = [
                (dic["Func"].replace(test_name, ""), dic["ground_truth"])
                for dic in _load_jsonl(
                    src_dir + "/" + comp_type + "/" + test_name + ".jsonl"
                )
            ]
            cnt = len(test_funcs)

            for tar in target_lis:
                if tar in skipped:
                    continue

                edit_dis = 0.0
                EM = []
                bleu4 = 0.0
                stmt_mod = 0.0
                fork_target_dic = {
                    dic["Func"].replace(tar, ""): dic["ground_truth"]
                    for dic in _load_jsonl(
                        src_dir + "/" + comp_type + "/" + tar + ".jsonl"
                    )
                }

                for func, src_tokens in test_funcs:
                    # A function the fork target lacks is scored as empty code.
                    fork_tokens = fork_target_dic.get(func, [])
                    src_code = " ".join(src_tokens).replace(test_name, "")
                    fork_code = " ".join(fork_tokens).replace(tar, "")
                    stmt_mod += Calculate_Statements_Ratio(
                        src_tokens, fork_tokens, test_name, tar
                    )

                    # _bleu reads its inputs from files, so dump both sides.
                    with open(output_path, 'w') as f, open(gold_path, 'w') as f1:
                        f.write(fork_code + '\n')
                        f1.write(src_code + '\n')

                    EM.append(fork_code == src_code)
                    similarity = fuzz.ratio(fork_code, src_code)
                    edit_dis += similarity
                    avg_ed += similarity
                    avg_cnt += 1

                    # Empty fork code contributes a BLEU-4 of 0.
                    if fork_code.strip() != "":
                        tmp_bleu4 = _bleu(gold_path, output_path)
                        bleu4 += tmp_bleu4
                        avg_bleu4 += tmp_bleu4

                tar_bleu4 = _rounded_mean(bleu4, cnt)
                tar_ed = _rounded_mean(edit_dis, cnt)
                tar_em = round(np.mean(EM) * 100, 2) if cnt else 0.0
                tar_stmt = _rounded_mean(float(stmt_mod) * 100, cnt)

                with open(dst_dir + '/result.csv', 'a', newline='') as file:
                    writer = csv.writer(file)
                    writer.writerow([
                        comp_type,
                        isa_type,
                        tar,
                        str(tar_bleu4),
                        str(tar_em),
                        str(tar_ed),
                        str(tar_stmt),
                    ])

                if tar_bleu4 > max_bleu4:
                    max_bleu4 = tar_bleu4
                if tar_ed > max_ed:
                    max_ed = tar_ed

            print(comp_type + " " + isa_type)
            print("Avg ED: " + str(_rounded_mean(avg_ed, avg_cnt)))
            print("Max ED: " + str(max_ed))
            print("Avg BLEU4: " + str(_rounded_mean(avg_bleu4, avg_cnt)))
            print("Max BLEU4: " + str(max_bleu4))
            print("\n\n")
| |
|
| |
|
| |
|
| |
|
| |
|
if __name__ == "__main__":
    # result.csv is created below, so the output directory must exist first.
    os.makedirs(dst_dir, exist_ok=True)

    # Start a fresh result file containing only the header row.
    # NOTE(review): the "Edit Didtance" spelling is kept unchanged because
    # existing consumers of result.csv may look the column up by this name.
    with open(dst_dir + '/result.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Compiler Type", "ISA Type", "Target", "BLEU4", "Exact Match", "Edit Didtance", "Stmt_Ratio"])

    # Run only after the header file is closed: Calculate_Forkflow re-opens
    # result.csv in append mode for every row it writes.
    Calculate_Forkflow()
| |
|
| |
|
| |
|
| |
|