Can the name of a running CUDA kernel be obtained by its threads? - reflection

Suppose some kernel (a __global__ function named foo) is running on a CUDA device. And suppose that kernel calls a __device__ function bar which is sometimes called from other kernels, i.e. the code of bar does not know at compile-time whether the kernel is foo or something else.
Can a thread running foo, within bar, obtain either the name "foo", the signature, or some other identifier of the kernel, preferable a human-readable one?
If necessary, assume the code has been compiled with any of --debug, --device-debug and/or --lineinfo.

The kernel can read the special register %gridid. %gridid is unique per launch. If performance then a simple kernel prolog can have one thread from each kernel launch output the gridid global function map using func and %gridid. Alternatively, the CUPTI SDK Activity API can be used to collect this information. The CUpti_ActivityKernel2 event contains per launch meta-data including the gridId and CUfunction name.
Here is an example reading %gridid.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
static __device__ __inline__ uint64_t __gridid()
{
uint64_t gridid;
asm volatile("mov.u64 %0, %%gridid;" : "=l"(gridid));
return gridid;
}
__device__ void devPrintName()
{
static const char* name = __func__;
printf("%llu %s\n", __gridid(), name);
}
__global__ void globPrintName()
{
static const char* name = __func__;
printf("%llu %s\n", __gridid(), name);
devPrintName();
}
int main()
{
for (int i = 0; i < 4; ++i)
{
globPrintName<<<1,1,0>>>();
cudaDeviceReset();
}
return 0;
}
This sample outputs
1 globPrintName
1 devPrintName
2 globPrintName
2 devPrintName
3 globPrintName
3 devPrintName
4 globPrintName
4 devPrintName

Related

How to define and execute an array of functions on Sycl+openCL+DPCPP

In my program, I defined an array of functions
#include <CL/sycl.hpp>
#include <iostream>
#include <tbb/tbb.h>
#include <tbb/parallel_for.h>
#include <vector>
#include <string>
#include <queue>
#include<tbb/blocked_range.h>
#include <tbb/global_control.h>
#include <chrono>
using namespace tbb;
template<class Tin, class Tout, class Function>
class Map {
private:
Function fun;
public:
Map() {}
Map(Function f):fun(f) {}
std::vector<Tout> operator()(bool use_tbb, std::vector<Tin>& v) {
std::vector<Tout> r(v.size());
if(use_tbb){
// Start measuring time
auto begin = std::chrono::high_resolution_clock::now();
tbb::parallel_for(tbb::blocked_range<Tin>(0, v.size()),
[&](tbb::blocked_range<Tin> t) {
for (int index = t.begin(); index < t.end(); ++index){
r[index] = fun(v[index]);
}
});
// Stop measuring time and calculate the elapsed time
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Time measured: %.3f seconds.\n", elapsed.count() * 1e-9);
return r;
} else {
sycl::queue gpuQueue{sycl::gpu_selector()};
sycl::range<1> n_item{v.size()};
sycl::buffer<Tin, 1> in_buffer(&v[0], n_item);
sycl::buffer<Tout, 1> out_buffer(&r[0], n_item);
gpuQueue.submit([&](sycl::handler& h){
//local copy of fun
auto f = fun;
sycl::accessor in_accessor(in_buffer, h, sycl::read_only);
sycl::accessor out_accessor(out_buffer, h, sycl::write_only);
h.parallel_for(n_item, [=](sycl::id<1> index) {
out_accessor[index] = f(in_accessor[index]);
});
}).wait();
}
return r;
}
};
template<class Tin, class Tout, class Function>
Map<Tin, Tout, Function> make_map(Function f) { return Map<Tin, Tout, Function>(f);}
typedef int(*func)(int x);
//define different functions
auto function = [](int x){ return x; };
auto functionTimesTwo = [](int x){ return (x*2); };
auto functionDivideByTwo = [](int x){ return (x/2); };
auto lambdaFunction = [](int x){return (++x);};
int main(int argc, char *argv[]) {
std::vector<int> v = {1,2,3,4,5,6,7,8,9};
//auto f = [](int x){return (++x);};
//Array of functions
func functions[] =
{
function,
functionTimesTwo,
functionDivideByTwo,
lambdaFunction
};
for(int i = 0; i< sizeof(functions); i++){
auto m1 = make_map<int, int>(functions[i]);
//auto m1 = make_map<int, int>(f);
std::vector<int> r = m1(true, v);
//print the result
for(auto &e:r) {
std::cout << e << " ";
}
}
return 0;
}
instead of each time defining a function, I am interested in defining an array of functions and then execute it in my program. But in the part of SYCL for executing on GPU, I have an error and I do not know how to fix it.
The ERROR:
SYCL kernel cannot call through a function pointer
In particular, SYCL device code, as defined by this specification, does not support virtual function calls, function pointers in general, exceptions, runtime type information or the full set of C++ libraries that may depend on these features or on features of a particular host compiler. Nevertheless, these basic restrictions can be relieved by some specific Khronos or vendor extensions.
As per the sycl 2020 specification, No function pointers are allowed to be called in a SYCL kernel or any functions called by the kernel.
Please refer https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#introduction

How does one use qsort to sort a char containing pathnames/files based on their bytes?

I basically wrote a code in which I take two command line arguments one being the type of file that I want to search in my directory and they other being the amount I want(which is not implemented yet, but I can fix that)
The code is like so:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#define sizeFileName 500
#define filesMax 5000
int cmpfunc( const void *a, const void *b) {
return *(char*)a + *(char*)b;
}
int main( int argc, char ** argv) {
FILE * fp = popen( "find . -type f", "r");
char * type = argv[1];
char * extension = ".";
char* tExtension;
tExtension = malloc(strlen(type)+1+4);
strcpy(tExtension, extension);
strcat(tExtension, type);
// printf("%s\n",tExtension);
int amount = atoi(argv[2]);
//printf("%d\n",amount);
char buff[sizeFileName];
int nFiles = 0;
char * files[filesMax];
while(fgets(buff,sizeFileName,fp)) {
int leng = strlen(buff) - 1;
if (strncmp(buff + leng - 4, tExtension, 4) == 0){
files[nFiles] = strndup(buff,leng);
//printf("\t%s\n", files[nFiles]);
nFiles ++;
}
}
fclose(fp);
printf("Found %d files\n", nFiles);
long long totalBytes = 0;
struct stat st;
// sorting based on byte size from greatest to least
qsort(files, (size_t) strlen(files), (size_t) sizeof(char), cmpfunc);
for(int i = 0;i< nFiles; i ++) {
if(0!= stat(files[i],&st)){
perror("stat failed:");
exit(-1);
}
totalBytes += st.st_size;
printf("%s : %ld\n",files[i],st.st_size);
}
printf("Total size: %lld\n", totalBytes);
// clean up
for(int i = 0; i < nFiles ; i ++ ) {
free(files[i]);
}
return 0;
}
So far I have every section set up properly, upon running the code say $./find ini 5, it would print out all the ini files followed by their byte size(it's currently ignore the 5). However, for the qsort(), I'm not exactly sure how I would sort the contents of char * files as while it holds the pathnames, I had to use stat to get the byte sizes, how would I print out a sorted version of my print statements featuring the first statement being the most bytes and finishes at the least bytes?
If we suppose your input is valid, your question could be simplified with:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define filesMax 5000
int cmpfunc(const void const *a, const void *b) { return *(char *)a + *(char *)b; }
int main(void) {
int nFiles = 4;
char *files[filesMax] = {"amazing", "hello", "this is a file", "I'm a bad file"};
qsort(files, strlen(files), sizeof(char), cmpfunc);
for (int i = 0; i < nFiles;; i++) {
printf("%s\n", files[i]);
}
}
If you compile with warning that give you:
source_file.c:11:23: warning: incompatible pointer types passing 'char *[5000]' to parameter of type 'const char *' [-Wincompatible-pointer-types]
qsort(files, strlen(files), sizeof(char), cmpfunc);
^~~~~
qsort() expect the size of your array (or in your case a subsize) and it's also expect the size of one element of your array. In both you wrongly give it to it. Also, your compare function doesn't compare anything, you are currently adding the first bytes of both pointer of char, that doesn't make a lot of sense.
To fix your code you must write:
qsort(files, nFiles, sizeof *files, &cmpfunc);
and also fix your compare function:
int cmpfunc_aux(char * const *a, char * const *b) { return strcmp(*a, *b); }
int cmpfunc(void const *a, void const *b) { return cmpfunc_aux(a, b); }
also size should be of type size_t:
size_t nFiles = 0;
Don't forget that all informations about how to use a function are write in their doc.
how would I print out a sorted version of my print statements featuring the first statement being the most bytes and finishes at the least bytes?
Your code don't show any clue that your are trying to do that, you are currently storing name file and only that. How do you expect sort your file with an information you didn't acquired ?
However, that simple create a struct that contain both file name and size, acquire information needed to sort it and sort it:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <inttypes.h>
struct file {
off_t size;
char *name;
};
int cmpfunc_aux(struct file const *a, struct file const *b) {
if (a->size > b->size) {
return -1;
} else if (a->size < b->size) {
return 1;
} else {
return 0;
}
}
int cmpfunc(void const *a, void const *b) { return cmpfunc_aux(a, b); }
#define filesMax 5000
int main(void) {
size_t nFiles = 4;
struct file files[filesMax] = {{42, "amazing"},
{21, "hello"},
{168, "this is a file"},
{84, "I'm a bad file"}};
qsort(files, nFiles, sizeof *files, &cmpfunc);
for (size_t i = 0; i < nFiles; i++) {
printf("%s, %" PRId64 "\n", files[i].name, (intmax_t)files[i].size);
}
}
The function cmpfunc() provided adds the first character of each string, and that's not a proper comparison function (it should give a opposite sign value when you switch the parameters, e.g. if "a" and "b" are the strings to compare, it adds the first two characters of both strings, giving 97+98 == 195, which is positive on unsigned chars, then calling with "b" and "a" should give a negative number (and it again gives you 98 + 97 == 195), more on, it always gives the same result ---even with signed chars--- so it cannot be used as a sorting comparator)
As you are comparing strings, why not to use the standard library function strcmp(3) which is a valid comparison function? It gives a negative number if first string is less lexicographically than the second, 0 if both are equal, and positive if first is greater lexicographically than the second.
if your function has to check (and sort) by the lenght of the filenames, then you can define it as:
int cmpfunc(char *a, char *b) /* yes, you can define parameters as char * */
{
return strlen(a) - strlen(b);
}
or, first based on file length, then lexicographically:
int cmpfunc(char *a, char *b)
{
int la = strlen(a), lb = strlen(b);
if (la != lb) return la - lb;
/* la == lb, so we must check lexicographycally */
return strcmp(a, b);
}
Now, to continue helping you, I need to know why do you need to sort anything, as you say that you want to search a directory for a file, where does the sorting take place in the problem?

Where should I define a C function that will be called in C kernel code when using PYOPENCL

Since Kernel Code in PyOpenCl needs to be written only in C, I have written few functions that need to be called inside the Kernel code in PyOpenCL.Where should I store these functions? how to pass a global variable to that function.
In PyOpenCl my kernel code looks like this:
program = cl.Program(context, """
__kernel void Kernel_OVERLAP_BETWEEN_N_IP_GPU(__constant int *FBNs_array,__local int *Binary_IP, __local int *cc,__global const int *olp)
{
function1(int *x, int *y,__global const int *olp);
}
""").build()
Where should I write and store the function1 function. should I define it in kernel itself, or in some other file and provide a path. If i need to define it at some other place and provide a path, please provide me some details , I am completely new to C.
Thanks
Like in C, before the kernel.
program = cl.Program(context, """
void function1(int *x, int *y)
{
//function1 code
}
__kernel void kernel_name()
{
function1(int *x, int *y);
}""").build()
program = cl.Program(context, """
void function1(int x, int *y,__global const int *cc)
{
x=10;
}
__kernel void kernel_name(__global const int *cc)
{
int x=1;
int y[1]={10};
function1(x,y,cc); //now x=10
}""").build()

why fgets() not working here?

In the below code scanf() is working for getting the name from the user but fgets() is not working pls someone help me to understand why it's not working
#include <stdio.h>
#include <stdlib.h>
typedef struct university{
int roll_no;
char name[16];
}uni;
int main()
{
uni *ptr[5],soome;char i,j=0;
for(i=0;i<5;i++)
{
ptr[i]=(uni*)calloc(1,20);
if(ptr[i]==NULL)
{
printf("memory allocation failure");
}
printf("enter the roll no and name \n");
printf("ur going to enter at the address%u \n",ptr[i]);
scanf("%d",&ptr[i]->roll_no);
//scanf("%s",&ptr[i]->name);
fgets(&ptr[i]->name,16,stdin);
}
while(*(ptr+j))
{
printf("%d %s\n",ptr[j]->roll_no,ptr[j]->name);
j++;
}
return 0;
}
First of all, fgets(char *s, int n, FILE *stream) takes three argument: a pointer s to the beginning of a character array, a count n, and an input stream.
In the original application you used the address operator & to get the pointer not to the first element of the name[16] array, but to something else (to use the address operator, you should have referenced the first char in the array: name[0]).
You use a lot of magic numbers in your application (e.g. 20 as the size of the uni struct). In my sample I'm using sizeof as much as possible.
Given that you use calloc, I've used the fact that the first parameter is the number of elements of size equal to the second parameter to preallocate all the five uni struct at once.
Final result is:
#include <stdio.h>
#include <stdlib.h>
#define NUM_ITEMS (5)
#define NAME_LENGTH (16)
typedef struct university{
int roll_no;
char name[NAME_LENGTH];
} uni;
int main()
{
uni *ptr;
int i;
ptr = (uni*)calloc(NUM_ITEMS, sizeof(uni));
if(NULL == ptr) {
printf("memory allocation failure");
return -1;
}
for(i=0; i<NUM_ITEMS; i++) {
printf("enter the roll no and name \n");
printf("You're going to enter at the address: 0x%X \n",(unsigned int)&ptr[i]);
scanf("%d",&ptr[i].roll_no);
fgets(ptr[i].name, NAME_LENGTH, stdin);
}
for(i=0; i<NUM_ITEMS; i++) {
printf("%d - %s",ptr[i].roll_no,ptr[i].name);
}
free(ptr);
return 0;
}
Note: I've added a call to free(ptr); to free the memory allocated by calloc at the end of the application and a different return code if it's not possible to allocate the memory.

gcc: /home/jamie/aws/btree_int.c|28|error: request for member ‘btree_start’ in something not a structure or union|

This code:
#include <stdlib.h>
#include <stdio.h>
int j_btree_create (int fn_initial_nodes);
typedef struct {
int depth;
int value;
void *item;
void *left_pointer;
void *right_pointer;
} j_btree_node_int;
typedef struct {
int nodes;
int available_nodes;
int btree_extension;
} j_btree_descriptor_int;
int j_btree_create (int fn_initial_nodes) {
int *free_btree_node;
int loop_counter;
j_btree_descriptor_int *btree_start;
btree_start = (j_btree_descriptor_int *) malloc (((sizeof(j_btree_node_int) + sizeof(free_btree_node)) * fn_initial_nodes) + sizeof(j_btree_descriptor_int));
printf ("btree_start: " . btree_start);
/* *btree_start.nodes = fn_initial_nodes;
*btree_start.available_nodes = fn_initial_nodes;
*btree_start.extension = NULL; */
for (loop_counter = 0; loop_counter < fn_initial_nodes; loop_counter++) {
printf ("loop_test:" . loop_counter);
}
}
Produces this error:
/home/jamie/aws/btree_int.c||In function ‘j_btree_create’:|
/home/jamie/aws/btree_int.c|28|error: request for member ‘btree_start’ in something not a structure or union|
/home/jamie/aws/btree_int.c|33|error: request for member ‘loop_counter’ in something not a structure or union|
||=== Build finished: 2 errors, 0 warnings ===|
When compiled with CodeBlocks. I have not managed to find an exact answer to my problem (I have looked), does anyone know roughly what I am doing wrong? Probably more than one thing given I am fairly new to C.
printf ("btree_start: " . btree_start);
This is not how the things are done in c. There's no . concatenation operator and you do not concatenate strings (pointers to characters) and pointers to structures. If you want to print out the pointer, it's
printf("btree_start: %p\n",btree_start);
For the loop counter it's
printf("loop_test: %d",loop_counter);

Resources