Replace pointer to pointer in LLVM IR

I am trying to make a front end emit LLVM IR that is compatible with an FPGA backend (my end goal).
The problem is that the FPGA toolchain can't handle pointers to pointers due to memory allocation issues, yet my front end produces them in the IR, so they need to be replaced. How could I do this?
Below you can find a specific example: I would need to replace all i8** variables/pointers with single pointers or something else the backend accepts. %buffer_table would in most applications be a multidimensional array. The IR is obtained from TF XLA by setting the dump_ir environment flag. Please let me know if you need further information.
; Function Attrs: nounwind
define void @_Z3topv(i8* %retval, i8* noalias %run_options, i8** noalias %buffer_table, i64* noalias %prof_counters) #0 {
entry:
%0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
%1 = load i8*, i8** %0, align 8, !invariant.load !0, !dereferenceable !1, !align !1
%arg0.1 = bitcast i8* %1 to i32*
%2 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
%3 = load i8*, i8** %2, align 8, !invariant.load !0, !dereferenceable !1, !align !1
%arg1.2 = bitcast i8* %3 to i32*
%4 = getelementptr inbounds i8*, i8** %buffer_table, i64 0
%5 = load i8*, i8** %4, align 8, !invariant.load !0, !dereferenceable !1, !align !1
%multiply.5 = bitcast i8* %5 to i32*
%6 = load i32, i32* %arg0.1, align 4, !invariant.load !0, !noalias !2
%7 = load i32, i32* %arg1.2, align 4, !invariant.load !0, !noalias !2
%8 = mul i32 %6, %7
store i32 %8, i32* %multiply.5, align 4, !alias.scope !2
ret void
}
attributes #0 = { nounwind uwtable "denormal-fp-math"="preserve-sign" "no-frame-pointer-elim"="true" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "fpga.demangled.name"="top" "fpga.top.func"="top" "less-precise-fpmad"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!0 = !{}
!1 = !{i64 4}
!2 = !{!3}
!3 = !{!"buffer: {index:0, offset:0, size:4}", !4}
!4 = !{!"XLA global AA domain"}
I am quite new to LLVM and I need this for my master's thesis. Any help would be highly appreciated!
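Since every use of %buffer_table above is a getelementptr with a constant index followed by a single load, one possible rewrite is to promote each buffer_table slot to its own pointer argument, so the i8** disappears from the signature. Below is a minimal C++ sketch of the first half of that rewrite (my own sketch, not an existing XLA or FPGA-toolchain pass; the helper name findSlotLoads is made up):

// Collect, for each constant buffer_table index, the load that reads the slot.
// Only the simple buffer_table[i] pattern seen in the IR above is handled.
#include "llvm/IR/Argument.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include <cstdint>
#include <map>

using namespace llvm;

static std::map<uint64_t, LoadInst *> findSlotLoads(Argument *BufferTable) {
  std::map<uint64_t, LoadInst *> Slots;
  for (User *U : BufferTable->users()) {
    auto *GEP = dyn_cast<GetElementPtrInst>(U);
    if (!GEP || GEP->getNumIndices() != 1)
      continue; // not the simple GEP-with-one-index pattern
    auto *Idx = dyn_cast<ConstantInt>(GEP->getOperand(1));
    if (!Idx)
      continue; // a dynamic index would need a different strategy
    for (User *GU : GEP->users())
      if (auto *LD = dyn_cast<LoadInst>(GU))
        Slots[Idx->getZExtValue()] = LD; // here: 0 -> %5, 1 -> %1, 2 -> %3
  }
  return Slots;
}

A full pass would then create a replacement function with one i8* parameter per collected index (e.g. via CloneFunctionInto from llvm/Transforms/Utils/Cloning.h), replace each load with the matching new argument, and erase the now-dead GEPs and loads.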

Related

ThreadSanitizer (TSan) instrumentation using LLVM opt and TSan passes

My goal is to instrument my initial IR with proper calls to TSan runtime library functions using LLVM opt tool and TSan passes. In other words, I want to end up with similar TSan instrumentation as when using clang -fsanitize=thread -S but by directly using opt and TSan passes instead.
As far as I know, LLVM has two passes for TSan instrumentation: tsan-module (a module pass) and tsan (a function pass). Both passes are available by default in opt, i.e. they are included in the opt -print-passes report.
I chose tiny_race.c as my sample program, where the main thread and the thread it spawns (Thread1) form a data race while accessing a global variable Global.
Here are the two steps I take to instrument the code my way:
Generating the initial LLVM IR for tiny_race.c:
clang -S -emit-llvm tiny_race.c -o tiny_race.ll
Using LLVM opt to instrument tiny_race.ll with the two TSan passes:
opt -passes='tsan-module,tsan' tiny_race.ll -S -o myInstrumented.ll
The above pass pipeline executes fine, but the resulting myInstrumented.ll lacks some TSan instrumentation. More specifically:
Thread1 (the child thread) is left completely uninstrumented.
main only has the @__tsan_func_entry and @__tsan_func_exit calls, and its accesses to Global are not instrumented.
Could anyone please explain why my approach produces a partially-instrumented output? Any suggestion is greatly appreciated.
To better display the difference between the IR resulting from my approach and the expected one, below you can find the definitions of main and Thread1 in each of them.
Here is myInstrumented.ll:
; Function Attrs: noinline nounwind optnone uwtable
define dso_local ptr @Thread1(ptr noundef %x) #0 {
entry:
%x.addr = alloca ptr, align 8
store ptr %x, ptr %x.addr, align 8
store i32 42, ptr @Global, align 4
%0 = load ptr, ptr %x.addr, align 8
ret ptr %0
}
; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @main() #0 {
entry:
%0 = call ptr @llvm.returnaddress(i32 0)
call void @__tsan_func_entry(ptr %0) ; ***** TSAN INSTRUMENTATION *****
%retval = alloca i32, align 4
%t = alloca i64, align 8
store i32 0, ptr %retval, align 4
%call = call i32 @pthread_create(ptr noundef %t, ptr noundef null, ptr noundef @Thread1, ptr noundef null) #4
store i32 43, ptr @Global, align 4
%1 = load i64, ptr %t, align 8
%call1 = call i32 @pthread_join(i64 noundef %1, ptr noundef null)
%2 = load i32, ptr @Global, align 4
call void @__tsan_func_exit() ; ***** TSAN INSTRUMENTATION *****
ret i32 %2
}
And here is the resulting IR when using clang -fsanitize=thread -S -emit-llvm tiny_race.c, which is my expected result:
; Function Attrs: noinline nounwind optnone sanitize_thread uwtable
define dso_local ptr @Thread1(ptr noundef %x) #0 {
entry:
%0 = call ptr @llvm.returnaddress(i32 0)
call void @__tsan_func_entry(ptr %0) ; ***** TSAN INSTRUMENTATION *****
%x.addr = alloca ptr, align 8
store ptr %x, ptr %x.addr, align 8
call void @__tsan_write4(ptr @Global) ; ***** TSAN INSTRUMENTATION *****
store i32 42, ptr @Global, align 4
%1 = load ptr, ptr %x.addr, align 8
call void @__tsan_func_exit() ; ***** TSAN INSTRUMENTATION *****
ret ptr %1
}
; Function Attrs: noinline nounwind optnone sanitize_thread uwtable
define dso_local i32 @main() #0 {
entry:
%0 = call ptr @llvm.returnaddress(i32 0)
call void @__tsan_func_entry(ptr %0) ; ***** TSAN INSTRUMENTATION *****
%retval = alloca i32, align 4
%t = alloca i64, align 8
store i32 0, ptr %retval, align 4
%call = call i32 @pthread_create(ptr noundef %t, ptr noundef null, ptr noundef @Thread1, ptr noundef null) #4
call void @__tsan_write4(ptr @Global) ; ***** TSAN INSTRUMENTATION *****
store i32 43, ptr @Global, align 4
call void @__tsan_read8(ptr %t) ; ***** TSAN INSTRUMENTATION *****
%1 = load i64, ptr %t, align 8
%call1 = call i32 @pthread_join(i64 noundef %1, ptr noundef null)
call void @__tsan_read4(ptr @Global) ; ***** TSAN INSTRUMENTATION *****
%2 = load i32, ptr @Global, align 4
call void @__tsan_func_exit() ; ***** TSAN INSTRUMENTATION *****
ret i32 %2
}
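Note the attribute difference between the two outputs: the expected functions carry sanitize_thread in their function attributes, while plain clang -S -emit-llvm output does not, and the tsan function pass only instruments memory accesses in functions that have that attribute. The sketch below (a minimal standalone helper of my own, not an official LLVM tool) adds the attribute to every defined function before the IR is fed to opt; equivalently, sanitize_thread can be added by hand to the attributes #0 line of tiny_race.ll.

// Load tiny_race.ll, mark every defined function with sanitize_thread (the
// attribute clang -fsanitize=thread sets), and print the result so it can be
// piped into opt -passes='tsan-module,tsan'.
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

using namespace llvm;

int main() {
  LLVMContext Ctx;
  SMDiagnostic Err;
  std::unique_ptr<Module> M = parseIRFile("tiny_race.ll", Err, Ctx);
  if (!M) {
    Err.print("add-sanitize-thread", errs());
    return 1;
  }
  for (Function &F : *M)
    if (!F.isDeclaration())
      F.addFnAttr(Attribute::SanitizeThread);
  M->print(outs(), nullptr);
  return 0;
}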

LLVM global constructor is not called for Atmel processors

I have compiled a C++ program and downloaded it to an Arduino Uno to blink an LED. The code works fine.
However, when I convert it to .ll, then from .ll to an object file, then to hex, and upload that, the code stops working: no LED blinks on the Arduino.
If I address the ports directly:
typedef unsigned char uint8_t;
typedef uint8_t * volatile port_type;
const port_type portB = (port_type) 0x25;
const port_type ddrB = (port_type) 0x24;
it works fine, but if I initialize the port addresses via a global constructor, it does not work:
int getPortB() {return 0x25;}
int getDdrB() {return 0x24;}
const port_type portB = (port_type) getPortB();
const port_type ddrB = (port_type) getDdrB();
This is because the global constructor is not called at all. If I call it from the main function via
call addrspace(1) void @global_var_init()
it will work.
I use the following commands to compile and download the .ll file to the Arduino Uno:
llvm-as-9 blink1.ll -o blink1.bc
llc-9 -filetype=obj blink1.bc
avr-g++ -mmcu=atmega328p blink1.o -o blink1
avr-objcopy -O ihex -R .eeprom blink1 blink1.hex
avrdude -F -V -c arduino -p ATMEGA328P -P /dev/ttyUSB0 -b 115200 -U flash:w:blink1.hex
blink1.ll
; ModuleID = 'blink1.cpp'
source_filename = "blink1.cpp"
target datalayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8"
target triple = "avr"
@portB = dso_local global i8* null, align 1
@ddrB = dso_local global i8* null, align 1
@llvm.global_ctors = appending global [1 x { i32, void () addrspace(1)*, i8* }] [{ i32, void () addrspace(1)*, i8* } { i32 65535, void () addrspace(1)* @global_var_init, i8* null }]
; Function Attrs: noinline
define internal void @global_var_init() addrspace(1) {
%1 = inttoptr i16 37 to i8*
store volatile i8* %1, i8** @portB, align 1
%2 = inttoptr i16 36 to i8*
store volatile i8* %2, i8** @ddrB, align 1
ret void
}
; Function Attrs: noinline nounwind optnone
define dso_local void @delay_500ms() addrspace(1) {
call addrspace(0) void asm sideeffect "ldi r19, 150 \0A\09ldi r20, 128 \0A\09ldi r23, 41 \0A\09L1: \0A\09dec r20 \0A\09brne L1 \0A\09dec r19 \0A\09brne L1 \0A\09dec r23 \0A\09brne L1 \0A\09", ""() #3, !srcloc !2
ret void
}
; Function Attrs: noinline norecurse nounwind optnone
define dso_local i16 @main() addrspace(1) {
; call addrspace(1) void @global_var_init()
%1 = alloca i16, align 1
store i16 0, i16* %1, align 1
%2 = load volatile i8*, i8** @ddrB, align 1
store i8 32, i8* %2, align 1
br label %3
3: ; preds = %0, %3
%4 = load volatile i8*, i8** @portB, align 1
store i8 32, i8* %4, align 1
call addrspace(1) void @delay_500ms()
%5 = load volatile i8*, i8** @portB, align 1
store i8 0, i8* %5, align 1
call addrspace(1) void @delay_500ms()
br label %3
}
!0 = !{i32 1, !"wchar_size", i32 2}
!1 = !{!"clang version 9.0.1-+20210314105943+c1a0a213378a-1~exp1~20210314220516.107 "}
!2 = !{i32 1296, i32 1313, i32 1338, i32 1362, i32 1377, i32 1397, i32 1416, i32 1436, i32 1455, i32 1475, i32 1494}
Is this an LLVM bug, or am I making a mistake?

error: expected end of struct constant

I'm trying to generate an array of pointers to structs, but I get this error:
llc: llvm_test.ll:7:64: error: expected end of struct constant
This is the full code:
@gc.info.type1 = global {i32, i32} {i32 1, i32 2}
@gc.info.types = global [1 x i8* ] { {i32, i32}* @gc.info.type1* }
The error is on the last line.
Try the following:
@gc.info.type1 = global {i32, i32} {i32 1, i32 2}
@gc.info.types = global [1 x {i32, i32}* ] [ {i32, i32}* @gc.info.type1 ]
Here is how my solution ended up, using bitcast (thanks to the #llvm IRC channel):
@gc.info.type2 = global {i32, i32, i32} {i32 1, i32 2, i32 3}
@gc.info.type2_ptr = global i8* bitcast ({i32, i32, i32}* @gc.info.type2 to i8*)
@gc.info.types = global [2 x i8** ] [ i8** @gc.info.type1_ptr, i8** @gc.info.type2_ptr ]
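For anyone generating this from the C++ API instead of textual IR, the same bitcast trick looks roughly like the following. This is a minimal sketch using the typed-pointer-era API (Type::getInt8PtrTy and ConstantExpr::getBitCast predate opaque pointers), mirroring only the @gc.info.type2 half of the solution above:

// Build @gc.info.type2 and a [1 x i8*] table whose single entry is the
// struct global bitcast to i8*, then print the module as textual IR.
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("gcinfo", Ctx);

  Type *I32 = Type::getInt32Ty(Ctx);
  StructType *TripleTy = StructType::get(Ctx, {I32, I32, I32});
  Constant *Fields[] = {ConstantInt::get(I32, 1), ConstantInt::get(I32, 2),
                        ConstantInt::get(I32, 3)};
  auto *Type2 = new GlobalVariable(M, TripleTy, /*isConstant=*/false,
                                   GlobalValue::ExternalLinkage,
                                   ConstantStruct::get(TripleTy, Fields),
                                   "gc.info.type2");

  Type *I8Ptr = Type::getInt8PtrTy(Ctx);
  Constant *AsI8Ptr = ConstantExpr::getBitCast(Type2, I8Ptr); // the bitcast
  ArrayType *TableTy = ArrayType::get(I8Ptr, 1);
  new GlobalVariable(M, TableTy, /*isConstant=*/false,
                     GlobalValue::ExternalLinkage,
                     ConstantArray::get(TableTy, {AsI8Ptr}), "gc.info.types");

  M.print(outs(), nullptr);
  return 0;
}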

What are glue and chain dependencies in an LLVM DAG?

I'm somewhat new to LLVM and compilers.
I've decided to generate a DAG using the following command
llc -view-sched-dags hello_world.ll
I got a really big graph with different dependency types. The book "Getting Started with LLVM Core Libraries" explains that:
Black arrows mean data flow dependency
Red arrows mean glue dependency
Blue dashed arrows mean chain dependency
I clearly remember discussing data-flow dependencies in my compiler class at school, but I don't remember the other two. Can someone explain the meaning of the other dependencies? Any help is appreciated.
hello_world.cpp
#include <stdio.h>
#include <assert.h>
int sum(int a, int b) {
  return a + b;
}

int main(int argc, char** argv) {
  printf("Hello World! %d\n", sum(argc, 1));
  return 0;
}
hello_world.ll
; ModuleID = 'hello_world.cpp'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@.str = private unnamed_addr constant [17 x i8] c"Hello World! %d\0A\00", align 1
; Function Attrs: nounwind uwtable
define i32 @_Z3sumii(i32 %a, i32 %b) #0 {
entry:
%a.addr = alloca i32, align 4
%b.addr = alloca i32, align 4
store i32 %a, i32* %a.addr, align 4
store i32 %b, i32* %b.addr, align 4
%0 = load i32* %a.addr, align 4
%1 = load i32* %b.addr, align 4
%add = add nsw i32 %0, %1
ret i32 %add
}
; Function Attrs: uwtable
define i32 @main(i32 %argc, i8** %argv) #1 {
entry:
%retval = alloca i32, align 4
%argc.addr = alloca i32, align 4
%argv.addr = alloca i8**, align 8
store i32 0, i32* %retval
store i32 %argc, i32* %argc.addr, align 4
store i8** %argv, i8*** %argv.addr, align 8
%0 = load i32* %argc.addr, align 4
%call = call i32 @_Z3sumii(i32 %0, i32 1)
%call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @.str, i32 0, i32 0), i32 %call)
ret i32 0
}
declare i32 @printf(i8*, ...) #2
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = metadata !{metadata !"clang version 3.5.0 "}
[DAG images: hello_world.main.jpg and hello_world.sum.jpg]
Chain dependencies prevent nodes with side effects (including memory operations and explicit register operations) from being scheduled out of order relative to each other.
Glue prevents the two nodes from being broken up during scheduling. It's actually more subtle than that [1], but most of the time you don't need to worry about it. (If you're implementing your own backend that requires two instructions to be adjacent to each other, you really want to be using a pseudoinstruction instead, and expand that after scheduling happens.)
[1]: See http://lists.llvm.org/pipermail/llvm-dev/2014-June/074046.html for example

Summation of floats in matrix multiplication in OpenCL slows down kernel time

I have two OpenCL kernels used for matrix multiplication. Both work fine except for some rounding errors.
The first one is straightforward from the literature. It uses three row-major float arrays for C = A*B. The given matrices are square, to make sure there is no error from wrongly set dimensions.
Kernel 1 - Matrix Multiplication with floats
kernel void matrixmult(
    global float* a,
    global float* b,
    global float* c,
    const unsigned int rows, const unsigned int cols)
{
  const unsigned int i = get_global_id(0);
  const unsigned int j = get_global_id(1);
  if ((i >= cols) || (j >= rows)) return;
  float sum = 0.0f;
  for (int k = 0; k < cols; k++) {
    sum += a[j*cols + k]*b[k*cols+i];
  }
  c[j*cols + i] = sum;
}
The second OpenCL kernel makes use of float4 arrays and calls the built-in dot product. The matrix B is given as the transpose of the original matrix. float4 is JavaCL's interpretation of the float array, so I simply create the same array on the host for both of my kernels.
Kernel 2 - Matrix Multiplication with float4
kernel void matrixmult4(
    global const float4* a,
    global const float4* bTransposed,
    global float* c,
    const unsigned int n)
{
  const int rowsOut = get_global_size(0);
  const int colsOut = get_global_size(1);
  const unsigned int row = get_global_id(0);
  const unsigned int col = get_global_id(1);
  if ((col > colsOut) || (row > rowsOut)) return;
  const int indexA = row*n/4;
  const int indexB = col*n/4;
  float sum = 0.0f;
  for (int k = 0; k < n/4; k++) {
    sum += dot(a[indexA+k], bTransposed[indexB+k]);
  }
  c[row*colsOut + col] = sum;
}
My problem: the first kernel runs five times faster (~70 ms on a GTX 460) than the second kernel (~350 ms). The main time consumption arises from the summation line
sum += dot(a[indexA+k], bTransposed[indexB+k]);
If I use = instead of +=, the second kernel runs in the same time as the first, but of course the matrix multiplication result is then wrong.
Is there a need for synchronized addition of sum? It is used within the same kernel instance, and nowhere else.
UPDATE: This is the resulting SPIR(?) code (Intel HD 5100).
Snippet from the kernel with sum += ...
%32 = load i32* %indexA, align 4, !tbaa !12
%33 = load i32* %k, align 4, !tbaa !12
%34 = add nsw i32 %32, %33
%35 = sext i32 %34 to i64
%36 = load <4 x float> addrspace(1)** %1, align 8, !tbaa !9
%37 = getelementptr inbounds <4 x float> addrspace(1)* %36, i64 %35
%38 = load <4 x float> addrspace(1)* %37, align 16, !tbaa !10
%39 = load i32* %indexB, align 4, !tbaa !12
%40 = load i32* %k, align 4, !tbaa !12
%41 = add nsw i32 %39, %40
%42 = sext i32 %41 to i64
%43 = load <4 x float> addrspace(1)** %2, align 8, !tbaa !9
%44 = getelementptr inbounds <4 x float> addrspace(1)* %43, i64 %42
%45 = load <4 x float> addrspace(1)* %44, align 16, !tbaa !10
%46 = call float @_Z3dotDv4_fS_(<4 x float> %38, <4 x float> %45)
%47 = load float* %sum, align 4, !tbaa !13
%48 = fadd float %47, %46
store float %48, float* %sum, align 4, !tbaa !13
br label %49
The SPIR code for the simplified sum = version only leaves out the lines starting with %47 and %48.
Does fadd lead to such an overhead?
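One possible explanation (an assumption on my side, not something the SPIR dump proves): sum += makes every fadd depend on the previous iteration's result, a loop-carried dependency chain, while the sum = variant makes the iterations independent (and mostly dead, so the driver's optimizer may discard them, which would make the timing comparison misleading). The usual fix is several independent accumulators. A host-side C++ sketch of the idea (illustrative names; the same restructuring applies inside the kernel):

#include <cstddef>
#include <vector>

// Serial version: one accumulator, so every += must wait for the previous
// fadd to finish (a single loop-carried dependency chain).
float dot_serial(const std::vector<float> &a, const std::vector<float> &b) {
  float sum = 0.0f;
  for (std::size_t k = 0; k < a.size(); ++k)
    sum += a[k] * b[k];
  return sum;
}

// Four independent partial sums: the adds in one iteration no longer depend
// on each other, so the hardware can overlap them. Rounding may differ
// slightly, as with any reassociation of float additions.
float dot_partial(const std::vector<float> &a, const std::vector<float> &b) {
  float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
  std::size_t k = 0;
  for (; k + 4 <= a.size(); k += 4) {
    s0 += a[k] * b[k];
    s1 += a[k + 1] * b[k + 1];
    s2 += a[k + 2] * b[k + 2];
    s3 += a[k + 3] * b[k + 3];
  }
  float sum = (s0 + s1) + (s2 + s3);
  for (; k < a.size(); ++k) // handle any leftover elements
    sum += a[k] * b[k];
  return sum;
}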

Resources