ThreadSanitizer (TSan) instrumentation using LLVM opt and TSan passes - llvm-ir

My goal is to instrument my initial IR with proper calls to TSan runtime library functions using LLVM opt tool and TSan passes. In other words, I want to end up with similar TSan instrumentation as when using clang -fsanitize=thread -S but by directly using opt and TSan passes instead.
As far as I know, LLVM has two passes for TSan instrumentation: tsan-module (a module pass) and tsan (a function pass). Both passes are available by default in opt, i.e. are included in opt -print-passes report.
I chose tiny_race.c as my sample program, in which the main thread and the thread it spawns (Thread1) form a data race while accessing the global variable Global.
Here are the two steps I take to instrument the code my way:
Generating the initial LLVM IR for tiny_race.c:
clang -S -emit-llvm tiny_race.c -o tiny_race.ll
Using LLVM opt to instrument tiny_race.ll with the two TSan passes:
opt -passes='tsan-module,tsan' tiny_race.ll -S -o myInstrumented.ll
The above pass pipeline executes fine but the resulting myInstrumented.ll lacks some TSan instrumentations. More specifically:
Thread1 (child thread) is left completely un-instrumented.
main thread only has #__tsan_func_entry and #__tsan_func_exit instrumentations and its accesses to Global are not instrumented.
Could anyone please explain why my approach produces a partially-instrumented output? Any suggestion is greatly appreciated.
To better show the difference between the IR produced by my approach and the expected IR, below are the definitions of main and Thread1 from each of them.
Here is myInstrumented.ll:
; Function Attrs: noinline nounwind optnone uwtable
define dso_local ptr #Thread1(ptr noundef %x) #0 {
entry:
%x.addr = alloca ptr, align 8
store ptr %x, ptr %x.addr, align 8
store i32 42, ptr #Global, align 4
%0 = load ptr, ptr %x.addr, align 8
ret ptr %0
}
; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 #main() #0 {
entry:
%0 = call ptr #llvm.returnaddress(i32 0)
call void #__tsan_func_entry(ptr %0) *****TSAN INSTRUMENTATION*****
%retval = alloca i32, align 4
%t = alloca i64, align 8
store i32 0, ptr %retval, align 4
%call = call i32 #pthread_create(ptr noundef %t, ptr noundef null, ptr noundef #Thread1, ptr noundef null) #4
store i32 43, ptr #Global, align 4
%1 = load i64, ptr %t, align 8
%call1 = call i32 #pthread_join(i64 noundef %1, ptr noundef null)
%2 = load i32, ptr #Global, align 4
call void #__tsan_func_exit() *****TSAN INSTRUMENTATION*****
ret i32 %2
}
And here is the resulting IR when using clang -fsanitize=thread -S -emit-llvm tiny_race.c which is my expected result:
; Function Attrs: noinline nounwind optnone sanitize_thread uwtable
define dso_local ptr #Thread1(ptr noundef %x) #0 {
entry:
%0 = call ptr #llvm.returnaddress(i32 0)
call void #__tsan_func_entry(ptr %0) *****TSAN INSTRUMENTATION*****
%x.addr = alloca ptr, align 8
store ptr %x, ptr %x.addr, align 8
call void #__tsan_write4(ptr #Global) *****TSAN INSTRUMENTATION*****
store i32 42, ptr #Global, align 4
%1 = load ptr, ptr %x.addr, align 8
call void #__tsan_func_exit() *****TSAN INSTRUMENTATION*****
ret ptr %1
}
; Function Attrs: noinline nounwind optnone sanitize_thread uwtable
define dso_local i32 #main() #0 {
entry:
%0 = call ptr #llvm.returnaddress(i32 0)
call void #__tsan_func_entry(ptr %0) *****TSAN INSTRUMENTATION*****
%retval = alloca i32, align 4
%t = alloca i64, align 8
store i32 0, ptr %retval, align 4
%call = call i32 #pthread_create(ptr noundef %t, ptr noundef null, ptr noundef #Thread1, ptr noundef null) #4
call void #__tsan_write4(ptr #Global) *****TSAN INSTRUMENTATION*****
store i32 43, ptr #Global, align 4
call void #__tsan_read8(ptr %t) *****TSAN INSTRUMENTATION*****
%1 = load i64, ptr %t, align 8
%call1 = call i32 #pthread_join(i64 noundef %1, ptr noundef null)
call void #__tsan_read4(ptr #Global) *****TSAN INSTRUMENTATION*****
%2 = load i32, ptr #Global, align 4
call void #__tsan_func_exit() *****TSAN INSTRUMENTATION*****
ret i32 %2
}

Related

Array of pointers, VirtualAlloc and RtlMoveMemory. MASM, some kind of problem

Does anybody know how to fix the addElement function so that the new record ends up as another element in the array? The idea is a dynamic array, where arrayPtr is a pointer to the first element; new elements can then be added dynamically and kept track of by increasing the arrayPtr value. So in fact I think it would end up being an array of pointers to DbRecord structs in memory, allocated by VirtualAlloc and copied by RtlMoveMemory. I am kind of hung up on the RtlMoveMemory line. I feel like my line of thinking is correct.
.386
.model flat, stdcall
option casemap :none
include windows.inc
include user32.inc
include kernel32.inc
addElement PROTO: ptr DbRecord
.data?
DbRecord struct
Id dd ?
WordOne db 32 dup(?) ; db is define byte, set value of byte
WordTwo db 32 dup(?)
WordThree db 32 dup(?)
Year dd ?
DbRecord ends
arrayPtr dd ? ; pointer in memory to start of array
newElementPointer DbRecord <>
hStdOut dd ?
bytesWritten dd ?
.data
arrayCount dd 0
hello db 'Hello World!', 0
.code
main proc
LOCAL DbRecord01:DbRecord
mov [DbRecord01.Id], 1;
; any other way than one character at a time?
mov byte ptr [DbRecord01.WordOne], 'D'
mov byte ptr [DbRecord01.WordOne + 1], 'o'
mov byte ptr [DbRecord01.WordOne + 2], 'g'
mov byte ptr [DbRecord01.WordOne + 3], 0
mov byte ptr [DbRecord01.WordTwo], 'C'
mov byte ptr [DbRecord01.WordTwo + 1], 'a'
mov byte ptr [DbRecord01.WordTwo + 2], 't'
mov byte ptr [DbRecord01.WordTwo + 3], 0
mov byte ptr [DbRecord01.WordThree], 'E'
mov byte ptr [DbRecord01.WordThree + 1], 'y'
mov byte ptr [DbRecord01.WordThree + 2], 'e'
mov byte ptr [DbRecord01.WordThree + 3], 0
mov [DbRecord01.Year], 2022;
invoke GetStdHandle, STD_OUTPUT_HANDLE
mov [hStdOut], eax
invoke WriteConsole, hStdOut, offset hello, sizeof hello, offset bytesWritten, NULL
invoke addElement, addr DbRecord01
ret
main endp
; addElement - allocate a new DbRecord-sized block and copy the caller's
; record into it.
;   In:  DbRecordPointer = address of the DbRecord to copy
;   Out: eax = address of the newly allocated copy, or 0 (NULL) if
;        VirtualAlloc failed
;   stdcall; follows the usual Win32 convention that eax/ecx/edx are scratch.
addElement proc DbRecordPointer: ptr DbRecord
    invoke VirtualAlloc, NULL, sizeof DbRecord, MEM_COMMIT, PAGE_READWRITE
    test eax, eax                   ; VirtualAlloc returns NULL on failure
    jz allocFailed                  ; nothing to copy into; return NULL
    ; eax already IS the destination address, so it is passed as-is.
    ; The original "DbRecord ptr [eax]" named the memory *contents* at eax
    ; (a struct-sized operand), not the pointer RtlMoveMemory expects.
    push eax                        ; RtlMoveMemory has no return value; keep the address
    invoke RtlMoveMemory, eax, DbRecordPointer, sizeof DbRecord
    pop eax                         ; eax = address of the new copy
allocFailed:
    ret
addElement endp
end main
EDIT/Update:
So yes part of the answer is just passing in eax.
I am here now
How do I get the value of eax (the memory location returned by VirtualAlloc, into which the data was copied) into the array at arrayPtr, i.e. at arrayPtr + count * sizeof DbRecord?
.386
.model flat, stdcall
option casemap :none
include windows.inc
include user32.inc
include kernel32.inc
addElement PROTO: ptr DbRecord
.data?
DbRecord struct
Id dd ?
WordOne db 32 dup(?) ; db is define byte, set value of byte
WordTwo db 32 dup(?)
WordThree db 32 dup(?)
Year dd ?
DbRecord ends
arrayPtr dword ? ; pointer in memory to start of array
; newElementPointer DbRecord <>
hStdOut dd ?
bytesWritten dd ?
.data
arrayCount dd 0
hello db 'Hello World!', 0
.code
main proc
LOCAL DbRecord01:DbRecord
mov [DbRecord01.Id], 1;
; any other way than one character at a time?
mov byte ptr [DbRecord01.WordOne], 'D'
mov byte ptr [DbRecord01.WordOne + 1], 'o'
mov byte ptr [DbRecord01.WordOne + 2], 'g'
mov byte ptr [DbRecord01.WordOne + 3], 0
mov byte ptr [DbRecord01.WordTwo], 'C'
mov byte ptr [DbRecord01.WordTwo + 1], 'a'
mov byte ptr [DbRecord01.WordTwo + 2], 't'
mov byte ptr [DbRecord01.WordTwo + 3], 0
mov byte ptr [DbRecord01.WordThree], 'E'
mov byte ptr [DbRecord01.WordThree + 1], 'y'
mov byte ptr [DbRecord01.WordThree + 2], 'e'
mov byte ptr [DbRecord01.WordThree + 3], 0
mov [DbRecord01.Year], 2022;
invoke GetStdHandle, STD_OUTPUT_HANDLE
mov [hStdOut], eax
invoke WriteConsole, hStdOut, offset hello, sizeof hello, offset bytesWritten, NULL
invoke addElement, addr DbRecord01
ret
main endp
; addElement - allocate a new DbRecord-sized block, copy the caller's record
; into it, and bump arrayCount.
;   In:  DbRecordPointer = address of the DbRecord to copy
;   Out: eax = address of the newly allocated copy, or 0 (NULL) if
;        VirtualAlloc failed (arrayCount is left unchanged in that case)
;   edx is preserved via "uses edx".
addElement proc uses edx DbRecordPointer: ptr DbRecord
    Local newElementPointer: Dword
    invoke VirtualAlloc, NULL, sizeof DbRecord, MEM_COMMIT, PAGE_READWRITE
    test eax, eax                   ; VirtualAlloc returns NULL on failure
    jz allocFailed                  ; do not copy to address 0 or count a record that does not exist
    mov newElementPointer, eax      ; the API call below may overwrite eax
    invoke RtlMoveMemory, eax, DbRecordPointer, sizeof DbRecord
    inc arrayCount                  ; one more record has been allocated
    ; NOTE(review): the pointer is not yet recorded in any table. The
    ; commented-out "mov dword ptr [arrayPtr+arrayCount], eax" cannot work:
    ; x86 cannot use two memory operands as an address. Once arrayPtr holds
    ; the base of an allocated table of dwords, the slot would be reached via
    ; registers, e.g. mov ecx, arrayPtr / mov edx, index / mov [ecx+edx*4], eax
    ; - TODO confirm the intended table layout and capacity.
    mov eax, newElementPointer      ; return the address of the new copy
allocFailed:
    ret
addElement endp
end main

Replace pointer to pointer LLVM

I am trying to make a front-end output providing LLVM IR compatible with LLVM IR for FPGAs (backend goal).
So the problem is that FPGAs can't handle pointer to pointers due to memory allocation issues. However, my front end provides me with a pointer to pointer in LLVM IR. Thus they need to be replaced. How could I do this?
Below you can find a specific example, so I would need to replace all i8** variables/pointers with at least single pointers or others. %buffer_table would be in most applications a multidimensional array. The IR is obtained out of TF XLA by setting the dump_ir environment flag. Please let me know, if you need further information.
; Function Attrs: nounwind
define void #_Z3topv(i8* %retval, i8* noalias %run_options, i8** noalias %buffer_table, i64* noalias %prof_counters) #0 {
entry:
%0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
%1 = load i8*, i8** %0, align 8, !invariant.load !0, !dereferenceable !1, !align !1
%arg0.1 = bitcast i8* %1 to i32*
%2 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
%3 = load i8*, i8** %2, align 8, !invariant.load !0, !dereferenceable !1, !align !1
%arg1.2 = bitcast i8* %3 to i32*
%4 = getelementptr inbounds i8*, i8** %buffer_table, i64 0
%5 = load i8*, i8** %4, align 8, !invariant.load !0, !dereferenceable !1, !align !1
%multiply.5 = bitcast i8* %5 to i32*
%6 = load i32, i32* %arg0.1, align 4, !invariant.load !0, !noalias !2
%7 = load i32, i32* %arg1.2, align 4, !invariant.load !0, !noalias !2
%8 = mul i32 %6, %7
store i32 %8, i32* %multiply.5, align 4, !alias.scope !2
ret void
}
attributes #0 = { nounwind uwtable "denormal-fp-math"="preserve-sign" "no-frame-pointer-elim"="true" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "fpga.demangled.name"="top" "fpga.top.func"="top" "less-precise-fpmad"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false"
!0 = !{}
!1 = !{i64 4}
!2 = !{!3}
!3 = !{!"buffer: {index:0, offset:0, size:4}", !4}
!4 = !{!"XLA global AA domain"}
I am quite new to LLVM and I need it for my master thesis. Any help would be highly appreciated!

LLVM global constructor is not called for ATmel processors

I have compiled a cpp code and downloaded it to Arduino Uno for blinking an LED. The code works fine.
However, when I convert it to .ll and from .ll to an object file then hex and upload, the code stops working. No LED blinks by the Arduino.
If I address the ports directly:
typedef unsigned char uint8_t;
typedef uint8_t * volatile port_type;
const port_type portB = (port_type) 0x25;
const port_type ddrB = (port_type) 0x24;
it will work fine but if I initialize port addressed via global constructor, it does not work:
int getPortB() {return 0x25;}
int getDdrB() {return 0x24;}
const port_type portB = (port_type) getPortB();
const port_type ddrB = (port_type) getDdrB();
This is because the global constructor is not called at all. If I call it from the main function via
call addrspace(1) void #global_var_init()
it will work.
I use the following commands to compile and download the ll file to the Arduino uno:
llvm-as-9 blink1.ll -o blink1.bc
llc-9 -filetype=obj blink1.bc
avr-g++ -mmcu=atmega328p blink1.o -o blink1
avr-objcopy -O ihex -R .eeprom blink1 blink1.hex
avrdude -F -V -c arduino -p ATMEGA328P -P /dev/ttyUSB0 -b 115200 -U flash:w:blink1.hex
blink1.ll
; ModuleID = 'blink1.cpp'
source_filename = "blink1.cpp"
target datalayout = "e-P1-p:16:8-i8:8-i16:8-i32:8-i64:8-f32:8-f64:8-n8-a:8"
target triple = "avr"
#portB = dso_local global i8* null, align 1
#ddrB = dso_local global i8* null, align 1
#llvm.global_ctors = appending global [1 x { i32, void () addrspace(1)*, i8* }] [{ i32, void () addrspace(1)*, i8* } { i32 65535, void () addrspace(1)* #global_var_init, i8* null }]
; Function Attrs: noinline
define internal void #global_var_init() addrspace(1) {
%1 = inttoptr i16 37 to i8*
store volatile i8* %1, i8** #portB, align 1
%2 = inttoptr i16 36 to i8*
store volatile i8* %2, i8** #ddrB, align 1
ret void
}
; Function Attrs: noinline nounwind optnone
define dso_local void #delay_500ms() addrspace(1) {
call addrspace(0) void asm sideeffect "ldi r19, 150 \0A\09ldi r20, 128 \0A\09ldi r23, 41 \0A\09L1: \0A\09dec r20 \0A\09brne L1 \0A\09dec r19 \0A\09brne L1 \0A\09dec r23 \0A\09brne L1 \0A\09", ""() #3, !srcloc !2
ret void
}
; Function Attrs: noinline norecurse nounwind optnone
define dso_local i16 #main() addrspace(1) {
; call addrspace(1) void #global_var_init()
%1 = alloca i16, align 1
store i16 0, i16* %1, align 1
%2 = load volatile i8*, i8** #ddrB, align 1
store i8 32, i8* %2, align 1
br label %3
3: ; preds = %0, %3
%4 = load volatile i8*, i8** #portB, align 1
store i8 32, i8* %4, align 1
call addrspace(1) void #delay_500ms()
%5 = load volatile i8*, i8** #portB, align 1
store i8 0, i8* %5, align 1
call addrspace(1) void #delay_500ms()
br label %3
}
!0 = !{i32 1, !"wchar_size", i32 2}
!1 = !{!"clang version 9.0.1-+20210314105943+c1a0a213378a-1~exp1~20210314220516.107 "}
!2 = !{i32 1296, i32 1313, i32 1338, i32 1362, i32 1377, i32 1397, i32 1416, i32 1436, i32 1455, i32 1475, i32 1494}
Is this an LLVM bug, or am I making a mistake?

Returning a value pointed to by a pointer in x86 NASM

I'm trying to write a function in x86 NASM assembly that takes a pointer to a structure (structure contains pointer to a buffer) and 2 ints (x,y) which then computes the address of the byte containing (x,y) and returns the value in this address. (The buffer contains a bmp file) I have this function written in C and it works fine.
C function
int loadByte(imgInfo* pImg, int x, int y)
{
unsigned char *pPix = pImg->pImg + (((pImg->width + 31) >> 5) << 2) * y + (x >> 3);
return *pPix;
}
x86 function
;-----------------------------------------------------------------------
; int load_byte(imgInfo *pInfo, int x, int y)
; Syntax: NASM, 32-bit x86, cdecl (arguments on the stack, result in eax)
; In:     [ebp+8]  = pInfo
;         [ebp+12] = x
;         [ebp+16] = y
; Out:    eax = the byte containing pixel (x, y), zero-extended (0..255)
; Clobb:  ecx, edx, flags
; Mirrors the C reference:
;   pPix = pImg->pImg + (((pImg->width + 31) >> 5) << 2) * y + (x >> 3);
;   return *pPix;
;-----------------------------------------------------------------------
IMGINFO_WIDTH   equ 0                       ; offset of imgInfo.width
IMGINFO_PIMG    equ 8                       ; offset of imgInfo.pImg (after width, height)

load_byte:
        push    ebp                         ; prologue
        mov     ebp, esp

        mov     ecx, [ebp + 8]              ; ecx = pInfo
        mov     eax, [ecx + IMGINFO_WIDTH]  ; eax = width
        add     eax, 31                     ; eax = width + 31
        sar     eax, 5                      ; eax = (width + 31) >> 5
        sal     eax, 2                      ; eax = row stride in bytes
        imul    eax, [ebp + 16]             ; eax = stride * y (edx left untouched)
        mov     edx, [ebp + 12]             ; edx = x
        sar     edx, 3                      ; edx = x >> 3 (byte column)
        add     eax, edx                    ; eax = byte offset into the buffer
        add     eax, [ecx + IMGINFO_PIMG]   ; eax = pPix (buffer base + offset)

        ; *pPix is an unsigned char, so exactly one byte must be read and
        ; zero-extended. The original "mov eax, [eax]" loaded four bytes, so
        ; the neighbouring bytes leaked into the result (four 0xFF bytes
        ; read back as -1).
        movzx   eax, byte [eax]

        pop     ebp                         ; epilogue
        ret
I tried checking whether the address computed in both functions is the same: I changed the C function to return pPix and commented out the line mov eax, [eax] in the x86 version, and to my surprise both functions returned the same number. In the unchanged form (as in the code above), however, the x86 function always returns -1 for some reason. Is return *pPix not equivalent to mov eax, [eax]? What is wrong with my reasoning?
imgInfo struct
typedef struct
{
int width, height;
unsigned char* pImg; //buffer
int cX, cY;
int col;
} imgInfo;
load_byte C declaration
extern int load_byte(imgInfo* pInfo, int x, int y);

What are glue and chain dependencies in an LLVM DAG?

I'm somewhat new to LLVM and compilers.
I've decided to generate a DAG using the following command
llc -view-sched-dags hello_world.ll
I got a really big graph with different dependency types. "Getting Started with LLVM Core Libraries" book explained that:
Black arrows mean data flow dependency
Red arrows mean glue dependency
Blue dashed arrows mean chain dependency
I clearly remember talking about data flow dependency in my compiler class at school. But I don't remember talking about the other two. Can someone explain the meaning of the other dependencies? Any help is appreciated.
hello_world.cpp
#include <stdio.h>
#include <assert.h>
int sum(int a, int b) {
return a + b;
}
int main(int argc, char** argv) {
printf("Hello World! %d\n", sum(argc, 1));
return 0;
}
hello_world.ll
; ModuleID = 'hello_world.cpp'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
#.str = private unnamed_addr constant [17 x i8] c"Hello World! %d\0A\00", align 1
; Function Attrs: nounwind uwtable
define i32 #_Z3sumii(i32 %a, i32 %b) #0 {
entry:
%a.addr = alloca i32, align 4
%b.addr = alloca i32, align 4
store i32 %a, i32* %a.addr, align 4
store i32 %b, i32* %b.addr, align 4
%0 = load i32* %a.addr, align 4
%1 = load i32* %b.addr, align 4
%add = add nsw i32 %0, %1
ret i32 %add
}
; Function Attrs: uwtable
define i32 #main(i32 %argc, i8** %argv) #1 {
entry:
%retval = alloca i32, align 4
%argc.addr = alloca i32, align 4
%argv.addr = alloca i8**, align 8
store i32 0, i32* %retval
store i32 %argc, i32* %argc.addr, align 4
store i8** %argv, i8*** %argv.addr, align 8
%0 = load i32* %argc.addr, align 4
%call = call i32 #_Z3sumii(i32 %0, i32 1)
%call1 = call i32 (i8*, ...)* #printf(i8* getelementptr inbounds ([17 x i8]* #.str, i32 0, i32 0), i32 %call)
ret i32 0
}
declare i32 #printf(i8*, ...) #2
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = metadata !{metadata !"clang version 3.5.0 "}
hello_world.main.jpg
hello_world.sum.jpg
Chain dependencies prevent nodes with side effects (including memory operations and explicit register operations) from being scheduled out of order relative to each other.
Glue prevents the two nodes from being broken up during scheduling. It's actually more subtle than that [1], but most of the time you don't need to worry about it. (If you're implementing your own backend that requires two instructions to be adjacent to each other, you really want to be using a pseudoinstruction instead, and expand that after scheduling happens.)
[1]: See http://lists.llvm.org/pipermail/llvm-dev/2014-June/074046.html for example

Resources