What might cause OpenCL to crash on cl::Program.build?

This program crashes when I call cl::Program::build(), but I don't know why. It crashes on the last line of this block of code:
#define __NO_STD_VECTOR
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <iostream>
#include <fstream>
#include <string>
#include <CL/cl.h>
using namespace std;
using namespace cl;
int _tmain(int argc, _TCHAR* argv[])
{
    int tmpSize = 1024;
    float **my2D = new float*[tmpSize];
    for(int i = 0; i < tmpSize; i++)
    {
        my2D[i] = new float[tmpSize];
        for(int i2 = 0; i2 < tmpSize; i2++)
        {
            my2D[i][i2] = 5;
        }
    }
    cl::vector<Platform> platforms;
    Platform::get(&platforms);
    cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[1]()), 0};
    Context context(CL_DEVICE_TYPE_ALL, cps);
    cl::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
    CommandQueue queue = CommandQueue(context, devices[0], 0);
    int W = tmpSize; //i.width();
    int H = tmpSize; //i.height();
    Buffer d_ip = Buffer(context, CL_MEM_READ_ONLY, W*H*sizeof(float));
    Buffer d_op = Buffer(context, CL_MEM_WRITE_ONLY, W*H*sizeof(float));
    queue.enqueueWriteBuffer(d_ip, CL_TRUE, 0, W*H*sizeof(float), my2D);
    std::ifstream sourceFileName("c:\\users\\me\\desktop\\img_rotate_kernel.cl");
    std::string sourceFile(istreambuf_iterator<char>(sourceFileName), (istreambuf_iterator<char>()));
    Program::Sources rotn_source(1, std::make_pair(sourceFile.c_str(), sourceFile.length() + 1));
    Program rotn_program(context, rotn_source);
    rotn_program.build(devices); // <----- CRASHES HERE
}
using this kernel:
__kernel void img_rotate(__global float* dest_data, __global float* src_data, int W, int H, float sinTheta, float cosTheta)
    const int ix = get_global_id(0);
    const int iy = get_global_id(1);
    float x0 = W/2.0f;
    float y0 = W/2.0f;
    float xOff = ix - x0;
    float yOff = iy - y0;
    int xpos = (int)(xOff*cosTheta + yOff*sinTheta + x0);
    int ypos = (int)(yOff*cosTheta - yOff*sinTheta + y0);
    if(((int)xpos >= 0) && ((int)xpos < W) && ((int)ypos >= 0) && ((int)ypos < H))
    {
        dest_data[iy*W+ix] = src_data[ypos*W+xpos];
    }
}
Here is the exception dialog I get when it crashes:

From the OpenCL C++ wrapper spec:
cl::Program::Program returns a valid program object and err is set to CL_SUCCESS if the program object is
created successfully. Otherwise, it returns one of the following error values returned in err [...]
Your program object was likely not created properly. Change your program construction call to use the err parameter, following this signature:
cl::Program::Program(const Context& context, const Sources& sources, cl_int * err = NULL)
And make sure err == CL_SUCCESS before doing anything else with your program object.
Most OpenCL calls allow you to pass a pointer to an error parameter. You should really do so and check it after your calls (at least in debug builds I guess) to reduce future headaches.
OK, so I modified your source code a little. Here it is; I'll explain my changes right after.
#define __NO_STD_VECTOR
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <iostream>
#include <fstream>
#include <string>
#include <CL/cl.h>
#define ARRAY_SIZE 128
using namespace std;
using namespace cl;
int main(int, char**)
{
    int err;
    float my2D[ARRAY_SIZE * ARRAY_SIZE] = { 0 };
    for(int i = 0; i < ARRAY_SIZE * ARRAY_SIZE; i++)
    {
        my2D[i] = 5;
    }
    cl::vector<Platform> platforms;
    err = Platform::get(&platforms);
    if(err != CL_SUCCESS) {
        std::cout << "Platform::get failed - " << err << std::endl;
        std::cin.get();
    }
    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0]()), 0 };
    Context context(CL_DEVICE_TYPE_ALL, cps, nullptr, nullptr, &err);
    if(err != CL_SUCCESS) {
        std::cout << "Context::Context failed - " << err << std::endl;
        std::cin.get();
    }
    cl::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(&err);
    if(err != CL_SUCCESS) {
        std::cout << "Context::getInfo failed - " << err << std::endl;
        std::cin.get();
    }
    CommandQueue queue = CommandQueue(context, devices[0], 0, &err);
    if(err != CL_SUCCESS) {
        std::cout << "CommandQueue::CommandQueue failed - " << err << std::endl;
        std::cin.get();
    }
    int W = ARRAY_SIZE; //i.width();
    int H = ARRAY_SIZE; //i.height();
    Buffer d_ip = Buffer(context, CL_MEM_READ_ONLY, W*H*sizeof(float), nullptr, &err);
    if(err != CL_SUCCESS) {
        std::cout << "Buffer::Buffer 1 failed - " << err << std::endl;
        std::cin.get();
    }
    Buffer d_op = Buffer(context, CL_MEM_WRITE_ONLY, W*H*sizeof(float), nullptr, &err);
    if(err != CL_SUCCESS) {
        std::cout << "Buffer::Buffer 2 failed - " << err << std::endl;
        std::cin.get();
    }
    err = queue.enqueueWriteBuffer(d_ip, CL_TRUE, 0, W*H*sizeof(float), &my2D[0]);
    if(err != CL_SUCCESS) {
        std::cout << "Queue::enqueueWriteBuffer failed - " << err << std::endl;
        std::cin.get();
    }
    std::ifstream sourceFileName("so_question.cl");
    std::string sourceFile(std::istreambuf_iterator<char>(sourceFileName), (std::istreambuf_iterator<char>()));
    Program::Sources rotn_source(1, std::make_pair(sourceFile.c_str(), sourceFile.length() + 1));
    Program rotn_program(context, rotn_source, &err);
    if(err != CL_SUCCESS) {
        std::cout << "Program::Program failed - " << err << std::endl;
        std::cin.get();
    }
    err = rotn_program.build(devices);
    if(err != CL_SUCCESS) {
        std::cout << "Program::build failed - " << err << std::endl;
        std::cin.get();
    }
}
You'll notice I added a lot more error checks. This allowed me to find out that the call to Context::Context actually did fail in your initial program. The issue was likely that platforms[1] didn't exist (there was only one element in the vector), so I switched it to platforms[0].
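As a general guard, check how many platforms came back before indexing into the vector; a minimal sketch (same headers and usings as above):

cl::vector<Platform> platforms;
cl_int err = Platform::get(&platforms);
if(err != CL_SUCCESS || platforms.size() == 0) {
    // Bail out: indexing platforms[0] (or any other slot) is unsafe here.
    std::cout << "No OpenCL platform available - " << err << std::endl;
    return 1;
}
cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0]()), 0 };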
Once that was fixed, I was getting an access violation on the queue.enqueueWriteBuffer() call. The issue was that your 2-dimensional array was actually an array of heap-allocated arrays. That's a problem because OpenCL expects to read the data from contiguous memory, which is not the case when allocating with new in a loop like you did. There was no guarantee that your row arrays were next to each other in memory.
To solve this point, I allocated a one dimensional array on the stack (see the loop at the beginning). The call then becomes
queue.enqueueWriteBuffer(d_ip, CL_TRUE, 0, W*H*sizeof(float), &my2D[0]);
However, you probably won't be able to do so with a 1024 x 1024 array of float, because you'll bust stack space. If you need an array that big, you probably want to new a single one-dimensional array large enough to contain your data and perform the index arithmetic yourself. This ensures you get your entire storage space as one contiguous chunk.
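For example, a minimal sketch of that approach (same W and H as above):

// One contiguous heap allocation for a W x H grid; do the 2D indexing by hand.
float *my2D = new float[W * H];
for(int y = 0; y < H; y++)
    for(int x = 0; x < W; x++)
        my2D[y * W + x] = 5.0f; // element (x, y) lives at offset y*W + x
queue.enqueueWriteBuffer(d_ip, CL_TRUE, 0, W*H*sizeof(float), my2D);
delete [] my2D;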
The code now errors with CL_BUILD_PROGRAM_FAILURE on the err = rotn_program.build() call, which means there's probably an error in your CL program code. Since this is an entirely different issue, I'll let you figure this one out.
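When you hit CL_BUILD_PROGRAM_FAILURE, the compiler's build log usually names the offending line. A sketch of retrieving it through the C++ wrapper:

err = rotn_program.build(devices);
if(err != CL_SUCCESS) {
    // The build log is per-device; ask for the device you tried to build for.
    std::string log = rotn_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
    std::cout << "Build log:" << std::endl << log << std::endl;
}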

Related

OpenCL: Basic example not working. clSetKernelArg -38 Error

I am attempting a very simple OpenCL example. I have developed the code below. It compiles a simple kernel, and then I create a simple float* buffer and wrap it in a cl::Buffer. However, when I attempt to call the kernel.setArg() function, it crashes with error -38. This error states that my cl::Buffer is invalid. I have no idea why this is happening:
#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 200
#include <CL/cl2.hpp>
#define MULTI_LINE_STRING(ARG) #ARG
namespace op
{
    const char *resizeAndMergeKernel = MULTI_LINE_STRING(
        __kernel void testKernel(__global float* image)
        {
        }
    );
}
void testCL(){
    cl::Device device;
    cl::Context context;
    cl::CommandQueue queue;
    int deviceId = 0;
    // Load Device
    std::vector<cl::Platform> platforms;
    std::vector<cl::Device> devices;
    std::string deviceName;
    cl_uint i, type;
    cl::Platform::get(&platforms);
    type = platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
    if( type == CL_SUCCESS)
    {
        // Get only the relevant device
        cl::Context allContext(devices);
        std::vector<cl::Device> gpuDevices;
        gpuDevices = allContext.getInfo<CL_CONTEXT_DEVICES>();
        bool deviceFound = false;
        for(int i=0; i<gpuDevices.size(); i++){
            if(i == deviceId){
                device = gpuDevices[i];
                context = cl::Context(device);
                queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
                deviceFound = true;
                cout << "Made new GPU Instance: " << deviceId << endl;
                break;
            }
        }
        if(!deviceFound)
        {
            throw std::runtime_error("Error: Invalid GPU ID");
        }
    }
    // Create Kernel
    cl::Program program = cl::Program(context, op::resizeAndMergeKernel, true);
    cl::Kernel kernel = cl::Kernel(program, "testKernel");
    // Simple Buffer
    cl_int err;
    float* test = new float[3*224*224];
    cl::Buffer x = cl::Buffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * 3 * 224 * 224, (void*)test, &err);
    cout << err << endl;
    kernel.setArg(0,x); // CRASHES WITH cl::Error -38
}
As you can see, the last line, kernel.setArg(0,x), crashes with error -38.
It's not a "crash"; it's an error code. OpenCL error -38 is CL_INVALID_MEM_OBJECT. It means the cl_mem object is not valid. That is because you are passing the cl::Buffer wrapper object to setArg, when you need to pass the cl_mem handle that represents the buffer. The cl::Buffer operator() method returns that handle, so use kernel.setArg(0,x()). Note the () is the added part (yes, it's subtle).
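A minimal sketch of the fix in the question's code (the error check is added for illustration):

// Pass the underlying cl_mem handle, not the C++ wrapper object.
cl_int status = kernel.setArg(0, x()); // x() unwraps the cl_mem inside the cl::Buffer
if(status != CL_SUCCESS){
    cout << "setArg failed: " << status << endl;
}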

Create an OpenCL Context using all CPU and GPU Devices

I am using OpenCL 1.1. I want to run my code on all of my GPUs and all of my CPUs. Since synchronization across two different contexts is hard to do, I wanted to create a single context which contains all the CPU and GPU devices. So, first I get all the platforms, then the devices belonging to each platform, separate the CPU and GPU devices, and store them in separate vectors. Then, to make the context, I build a vector of all the CPU and GPU devices and call clCreateContext. It works fine, but afterwards, when I want to create command queues for each device separately, it always gives me:
OpenCL call fails with error -34.
The code is as follows:
cl_int error = CL_SUCCESS;
cl_uint num_platforms;
clGetPlatformIDs(0, nullptr, &num_platforms);
if (num_platforms == 0){
    std::cout << "Cannot find any platform.\n";
    return;
}
platform.resize(num_platforms);
error = clGetPlatformIDs(num_platforms, platform.data(), nullptr);
checkError(error);
for (cl_uint i = 0; i < num_platforms; i++){
    std::string platform_name;
    size_t platform_name_len;
    clGetPlatformInfo(platform[i], CL_PLATFORM_NAME, 0, nullptr, &platform_name_len);
    platform_name.resize(platform_name_len);
    clGetPlatformInfo(platform[i], CL_PLATFORM_NAME, platform_name_len, const_cast<char*>(platform_name.data()), nullptr);
    std::cout << "[" << i << "]\t" << platform_name << std::endl;
    std::vector<cl_device_id> devices(0);
    cl_uint num_cpus = 0, num_gpus = 0;
    error = clGetDeviceIDs(platform[i], CL_DEVICE_TYPE_CPU, 0, nullptr, &num_cpus);
    error = clGetDeviceIDs(platform[i], CL_DEVICE_TYPE_GPU, 0, nullptr, &num_gpus);
    devices.resize(num_cpus);
    std::cout << "\tCPUS: \n";
    error = clGetDeviceIDs(platform[i], CL_DEVICE_TYPE_CPU, num_cpus, devices.data(), nullptr);
    for (cl_uint d = 0; d < num_cpus; d++){
        std::string device_name;
        size_t device_name_len;
        clGetDeviceInfo(devices[d], CL_DEVICE_NAME, 0, nullptr, &device_name_len);
        device_name.resize(device_name_len);
        clGetDeviceInfo(devices[d], CL_DEVICE_NAME, device_name_len, const_cast<char*>(device_name.data()), nullptr);
        std::cout << "\t\t[" << d << "]\t" << device_name << std::endl;
        cpu_devices.push_back(devices[d]);
    }
    std::cout << "\tGPUS: \n";
    devices.resize(num_gpus);
    error = clGetDeviceIDs(platform[i], CL_DEVICE_TYPE_GPU, num_gpus, devices.data(), nullptr);
    for (cl_uint d = 0; d < num_gpus; d++){
        std::string device_name;
        size_t device_name_len;
        clGetDeviceInfo(devices[d], CL_DEVICE_NAME, 0, nullptr, &device_name_len);
        device_name.resize(device_name_len);
        clGetDeviceInfo(devices[d], CL_DEVICE_NAME, device_name_len, const_cast<char*>(device_name.data()), nullptr);
        std::cout << "\t\t[" << d << "]\t" << device_name << std::endl;
        gpu_devices.push_back(devices[d]);
    }
}
std::vector<cl_device_id> devices;
for (size_t i = 0; i < cpu_devices.size(); i++)
    devices.push_back(cpu_devices[i]);
for (size_t i = 0; i < gpu_devices.size(); i++)
    devices.push_back(gpu_devices[i]);
ctx = clCreateContext(NULL, static_cast<cl_uint>(devices.size()), devices.data(), nullptr, nullptr, nullptr);
cpu_devices_queue.resize(cpu_devices.size());
for (size_t i = 0; i < cpu_devices.size(); i++){
    cpu_devices_queue[i] = clCreateCommandQueue(ctx, cpu_devices[i], 0, &error);
    checkError(error);
}
gpu_devices_queue.resize(gpu_devices.size());
for (size_t i = 0; i < gpu_devices.size(); i++){
    gpu_devices_queue[i] = clCreateCommandQueue(ctx, gpu_devices[i], 0, &error);
    checkError(error);
}
An OpenCL context can only encapsulate devices from a single platform, and cannot be created using devices from two or more different platforms.
You are not actually checking whether your call to clCreateContext succeeds. If you checked the return value or the error code, you would likely see that it was in fact failing. This is why when you later use that context in your call to clCreateCommandQueue, you receive error -34 (CL_INVALID_CONTEXT).
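A sketch of both points together: create one context per platform (grouping that platform's devices), and always pass an error pointer to clCreateContext. This reuses the question's platform vector and checkError helper; contexts is a name introduced for illustration:

// One context per platform; a context cannot span platforms.
std::vector<cl_context> contexts;
for (cl_uint i = 0; i < num_platforms; i++){
    cl_uint n = 0;
    clGetDeviceIDs(platform[i], CL_DEVICE_TYPE_ALL, 0, nullptr, &n);
    if (n == 0) continue;
    std::vector<cl_device_id> devs(n);
    clGetDeviceIDs(platform[i], CL_DEVICE_TYPE_ALL, n, devs.data(), nullptr);
    cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform[i], 0 };
    cl_int error = CL_SUCCESS;
    cl_context c = clCreateContext(props, n, devs.data(), nullptr, nullptr, &error);
    checkError(error); // never use the context before this passes
    contexts.push_back(c);
}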

openCL trouble saving compiled binaries for CPU and GPU simultaneously

So I'm writing an OpenCL program that runs on both CPU + GPU and am currently trying to save/cache the binaries after creating my program with clCreateProgramWithSource(). I create my clContext and clProgram with CL_DEVICE_TYPE_ALL and build the source with those specifications.
I then take the binaries and store them to disk (with one binary file per device) so that on subsequent starts my program automatically calls clCreateProgramWithBinary.
The problem is that if I save to disk the binaries that were created with the setting CL_DEVICE_TYPE_ALL, the binary for the CPU gets corrupted and clCreateProgramWithBinary throws an error.
In order to get all the binary files saved to disk properly, I've had to edit my code to first run using CL_DEVICE_TYPE_CPU and save the CPU binary on its own, then edit my code again to run using CL_DEVICE_TYPE_GPU, save the GPU binaries, and then finally switch back to CL_DEVICE_TYPE_ALL. If I do this, clCreateProgramWithBinary is able to load the binary for each device type correctly and execute my program.
So is this just a quirk of OpenCL, that I can't build binaries for GPUs and CPUs together? Or am I just doing this incorrectly?
I'm basing my code on the implementation of binary saving found here: https://code.google.com/p/opencl-book-samples/source/browse/trunk/src/Chapter_6/HelloBinaryWorld/HelloBinaryWorld.cpp?r=42 with modifications in place to handle multiple devices.
Here are some portions of my code below:
/*----Initial setup of platform, context and devices---*/
cl_int err;
cl_uint deviceCount;
cl_device_id *devices;
cl_platform_id platform;
cl_context context;
cl_program program;
err = clGetPlatformIDs(1, &platform, NULL);
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
devices = new cl_device_id[deviceCount];
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
context = clCreateContext(NULL, deviceCount, devices, NULL, NULL, &err);

/*---Build Program---*/
int numFiles = 2;
const char *sourceFiles[] =
{
    "File1.cl",
    "File2.cl",
};
char *sourceStrings[numFiles];
for(int i = 0; i < numFiles; i++)
{
    sourceStrings[i] = ReadFile(sourceFiles[i]);
}

/*---Create the compute program from the source buffer---*/
program = clCreateProgramWithSource(context, numFiles, (const char **)sourceStrings, NULL, &err);

/*---Build the program executable---*/
err = clBuildProgram(program, deviceCount, devices, NULL, NULL, NULL);

/*----Save binary to disk---*/
//Determine the size of each program binary
size_t *programBinarySizes = new size_t[deviceCount];
err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * deviceCount, programBinarySizes, NULL);
if(err != CL_SUCCESS)
{
    delete [] devices;
    delete [] programBinarySizes;
    return false;
}
unsigned char **programBinaries = new unsigned char*[deviceCount];
for(cl_uint i = 0; i < deviceCount; i++)
{
    programBinaries[i] = new unsigned char[programBinarySizes[i]];
}
//Get all of the program binaries
err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char *) * deviceCount, programBinaries, NULL);
if (err != CL_SUCCESS)
{
    delete [] devices;
    delete [] programBinarySizes;
    for (cl_uint i = 0; i < deviceCount; i++)
    {
        delete [] programBinaries[i];
    }
    delete [] programBinaries;
}
//Store the binaries
for(cl_uint i = 0; i < deviceCount; i++)
{
    // Store the binary for all devices
    std::string currFile = binaryFile + to_string(i) + ".txt";
    FILE *fp = fopen(currFile.c_str(), "wb");
    fwrite(programBinaries[i], 1, programBinarySizes[i], fp);
    fclose(fp);
}
// Cleanup
delete [] programBinarySizes;
for (cl_uint i = 0; i < deviceCount; i++)
{
    delete [] programBinaries[i];
}
delete [] programBinaries;
And then on the next go-around my code will call this function to create the program from the binaries:
unsigned char **programBinaries = new unsigned char *[deviceCount];
size_t sizes[deviceCount];
for(int i = 0; i < deviceCount; i++)
{
    string currFile = binaryFile + to_string(i) + ".txt";
    FILE *fp = fopen(currFile.c_str(), "rb");
    if(!fp) return NULL;
    size_t binarySize;
    fseek(fp, 0, SEEK_END);
    binarySize = ftell(fp);
    sizes[i] = binarySize;
    rewind(fp);
    programBinaries[i] = new unsigned char[binarySize];
    fread(programBinaries[i], 1, binarySize, fp);
    fclose(fp);
}
cl_int errNum = 0;
cl_program program;
cl_int binaryStatus;
program = clCreateProgramWithBinary(context,
                                    deviceCount,
                                    devices,
                                    sizes,
                                    (const unsigned char **)programBinaries,
                                    &binaryStatus,
                                    &errNum);
delete [] programBinaries;
errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
I have a Retina MacBook Pro, which has three devices on its single Apple platform. I ran your code on it and encountered the same problem. I do not actually know the solution, but I can give you some hints for debugging.
First, do not use ftell to compute the size of a regular file (see the reason here).
I modified your snippet as follows:
#include <sys/stat.h>
#include <fcntl.h> // for open() and O_RDONLY

unsigned char **programBinaries = new unsigned char *[deviceCount];
size_t sizes[deviceCount];
int fd;
struct stat st;
for(cl_uint i = 0; i < deviceCount; i++)
{
    string currFile = binaryFile + to_string(i) + ".txt";
    fd = open(currFile.c_str(), O_RDONLY);
    if (fd == -1) {
        return -1;
    }
    if ((fstat(fd, &st) != 0) || (!S_ISREG(st.st_mode))) {
        return -2;
    }
    size_t binarySize;
    FILE *fp = fdopen(fd, "rb");
    if (fseeko(fp, 0, SEEK_END) != 0) {
        return -3;
    }
    binarySize = ftello(fp);
    cout << "device " << i << ": " << binarySize << endl;
    sizes[i] = binarySize;
    rewind(fp);
    programBinaries[i] = new unsigned char[binarySize];
    fread(programBinaries[i], 1, binarySize, fp);
    fclose(fp); // also closes fd, which fdopen() took over
}
On my system, however, I got the same result as with your original code.
Second, according to the spec of clCreateProgramWithBinary:
cl_program clCreateProgramWithBinary(cl_context context,
                                     cl_uint num_devices,
                                     const cl_device_id *device_list,
                                     const size_t *lengths,
                                     const unsigned char **binaries,
                                     cl_int *binary_status,
                                     cl_int *errcode_ret)
binary_status: Returns whether the program binary for each device specified in device_list was loaded successfully or not. It is an array of num_devices entries and returns CL_SUCCESS in binary_status[i] if binary was successfully loaded for device specified by device_list[i]; otherwise returns CL_INVALID_VALUE if lengths[i] is zero or if binaries[i] is a NULL value or CL_INVALID_BINARY in binary_status[i] if program binary is not a valid binary for the specified device. If binary_status is NULL, it is ignored.
If you modify your code like this:
cl_int binaryStatus[deviceCount];
program = clCreateProgramWithBinary(context,
                                    deviceCount,
                                    devices,
                                    sizes,
                                    (const unsigned char **)programBinaries,
                                    binaryStatus,
                                    &errNum);
for (cl_uint i = 0; i < deviceCount; ++i)
{
    cout << "device: " << i << ": " << binaryStatus[i] << endl;
}
Normally, you will get the following results:
device: 0: 0
device: 1: -42
The first line means that the first program binary (for the CPU) was loaded successfully. The -42 in the second line corresponds to CL_INVALID_BINARY, which means the second binary failed to load.
I also tried to retrieve the build options from the program, but got nothing.
//set device_id to 0, 1, 3...
cl_uint device_id = 0;
cl_build_status status;
// Determine the reason for the error
char buildOptions[16384];
char buildLog[16384];
clGetProgramBuildInfo(program, devices[device_id], CL_PROGRAM_BUILD_STATUS,
                      sizeof(cl_build_status), &status, NULL);
std::cout << "status: " << status << endl;
clGetProgramBuildInfo(program, devices[device_id], CL_PROGRAM_BUILD_OPTIONS,
                      sizeof(buildOptions), buildOptions, NULL);
std::cout << "build options: " << endl;
std::cout << buildOptions;
clGetProgramBuildInfo(program, devices[device_id], CL_PROGRAM_BUILD_LOG,
                      sizeof(buildLog), buildLog, NULL);
std::cout << "build log: " << endl;
std::cout << buildLog;
I guess it is a bug in the OpenCL driver. I hope the above is helpful.
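One more hint you could try (an assumption of mine, not something I verified on the rMBP): the binaries returned by CL_PROGRAM_BINARIES are ordered to match the device list the program itself reports, which you can query with CL_PROGRAM_DEVICES. Saving the files keyed by that reported order rules out a device-order mismatch between the run that saves and the run that loads:

// Query the device order the program reports, then save binary i
// under a name derived from reportedDevices[i] rather than a loop index.
cl_uint numDevices = 0;
clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(numDevices), &numDevices, NULL);
cl_device_id *reportedDevices = new cl_device_id[numDevices];
clGetProgramInfo(program, CL_PROGRAM_DEVICES, sizeof(cl_device_id) * numDevices, reportedDevices, NULL);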

MPI hangs on MPI_Send for large messages

There is a simple C++ / MPI (MPICH2) program which sends an array of type double. If the size of the array is more than 9000, the program hangs during the call to MPI_Send. If the array is smaller than 9000 (8000, for example), the program works fine. The source code is below:
main.cpp
#include <mpi.h>
#include <iostream>
#include <cstdlib>
using namespace std;

Cube** cubes;
int cubesLen;

double* InitVector(int N) {
    double* x = new double[N];
    for (int i = 0; i < N; i++) {
        x[i] = i + 1;
    }
    return x;
}

void CreateCubes() {
    cubes = new Cube*[12];
    cubesLen = 12;
    for (int i = 0; i < 12; i++) {
        cubes[i] = new Cube(9000);
    }
}

void SendSimpleData(int size, int rank) {
    Cube* cube = cubes[0];
    int nodeDest = rank + 1;
    if (nodeDest > size - 1) {
        nodeDest = 1;
    }
    double* coefImOut = (double *) malloc(sizeof (double)*cube->coefficentsImLength);
    cout << "Before send" << endl;
    int count = cube->coefficentsImLength;
    MPI_Send(coefImOut, count, MPI_DOUBLE, nodeDest, 0, MPI_COMM_WORLD);
    cout << "After send" << endl;
    free(coefImOut);
    MPI_Status status;
    double *coefIm = (double *) malloc(sizeof(double)*count);
    int nodeFrom = rank - 1;
    if (nodeFrom < 1) {
        nodeFrom = size - 1;
    }
    MPI_Recv(coefIm, count, MPI_DOUBLE, nodeFrom, 0, MPI_COMM_WORLD, &status);
    free(coefIm);
}

int main(int argc, char *argv[]) {
    int size, rank;
    const int root = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    CreateCubes();
    if (rank != root) {
        SendSimpleData(size, rank);
    }
    MPI_Finalize();
    return 0;
}
class Cube
class Cube {
public:
    Cube(int size);
    Cube(const Cube& orig);
    virtual ~Cube();
    int Id() { return id; }
    void Id(int id) { this->id = id; }
    int coefficentsImLength;
    double* coefficentsIm;
private:
    int id;
};

Cube::Cube(int size) {
    this->coefficentsImLength = size;
    coefficentsIm = new double[size];
    for (int i = 0; i < size; i++) {
        coefficentsIm[i] = 1;
    }
}

Cube::Cube(const Cube& orig) {
}

Cube::~Cube() {
    delete[] coefficentsIm;
}
The program runs on 4 processes:
mpiexec -n 4 ./myApp1
Any ideas?
The details of the Cube class aren't relevant here: consider a simpler version
#include <mpi.h>
#include <iostream>
#include <cstdlib>
using namespace std;

int main(int argc, char *argv[]) {
    int size, rank;
    const int root = 0;
    int datasize = atoi(argv[1]);
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank != root) {
        int nodeDest = rank + 1;
        if (nodeDest > size - 1) {
            nodeDest = 1;
        }
        int nodeFrom = rank - 1;
        if (nodeFrom < 1) {
            nodeFrom = size - 1;
        }
        MPI_Status status;
        int *data = new int[datasize];
        for (int i=0; i<datasize; i++)
            data[i] = rank;
        cout << "Before send" << endl;
        MPI_Send(data, datasize, MPI_INT, nodeDest, 0, MPI_COMM_WORLD);
        cout << "After send" << endl;
        MPI_Recv(data, datasize, MPI_INT, nodeFrom, 0, MPI_COMM_WORLD, &status);
        delete [] data;
    }
    MPI_Finalize();
    return 0;
}
where running gives
$ mpirun -np 4 ./send 1
Before send
After send
Before send
After send
Before send
After send
$ mpirun -np 4 ./send 65000
Before send
Before send
Before send
If in DDT you looked at the message queue window, you'd see everyone is sending, and no one is receiving, and you have a classic deadlock.
MPI_Send's semantics, weirdly, aren't well defined, but it is allowed to block until "the receive has been posted". MPI_Ssend is clearer in this regard; it will always block until the receive has been posted. Details about the different send modes can be seen here.
The reason it worked for smaller messages is an accident of the implementation; for "small enough" messages (for your case, it looks to be <64kB), your MPI_Send implementation uses an "eager send" protocol and doesn't block on the receive; for larger messages, where it isn't necessarily safe just to keep buffered copies of the message kicking around in memory, the Send waits for the matching receive (which it is always allowed to do anyway).
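Incidentally, a quick way to test whether a program is relying on eager buffering (a standard debugging trick, not something from the code above) is to swap MPI_Send for MPI_Ssend, which always blocks until the matching receive is posted:

// Same signature as MPI_Send; if the program deadlocks with MPI_Ssend,
// it was depending on the eager-send protocol for correctness.
MPI_Ssend(data, datasize, MPI_INT, nodeDest, 0, MPI_COMM_WORLD);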
There are a few things you could do to avoid this; all you have to do is make sure not everyone is calling a blocking MPI_Send at the same time. You could (say) have even ranks send first and then receive, and odd ranks receive first and then send. You could use nonblocking communications (Isend/Irecv/Waitall; a sketch follows the output below). But the simplest solution in this case is to use MPI_Sendrecv, which is a blocking (send + receive) pair rather than a blocking send followed by a blocking receive. The send and the receive execute concurrently, and the function blocks until both are complete. So this works:
#include <mpi.h>
#include <iostream>
#include <cstdlib>
using namespace std;

int main(int argc, char *argv[]) {
    int size, rank;
    const int root = 0;
    int datasize = atoi(argv[1]);
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank != root) {
        int nodeDest = rank + 1;
        if (nodeDest > size - 1) {
            nodeDest = 1;
        }
        int nodeFrom = rank - 1;
        if (nodeFrom < 1) {
            nodeFrom = size - 1;
        }
        MPI_Status status;
        int *outdata = new int[datasize];
        int *indata = new int[datasize];
        for (int i=0; i<datasize; i++)
            outdata[i] = rank;
        cout << "Before sendrecv" << endl;
        MPI_Sendrecv(outdata, datasize, MPI_INT, nodeDest, 0,
                     indata, datasize, MPI_INT, nodeFrom, 0, MPI_COMM_WORLD, &status);
        cout << "After sendrecv" << endl;
        delete [] outdata;
        delete [] indata;
    }
    MPI_Finalize();
    return 0;
}
Running gives
$ mpirun -np 4 ./send 65000
Before sendrecv
Before sendrecv
Before sendrecv
After sendrecv
After sendrecv
After sendrecv
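For completeness, here's a minimal sketch of the nonblocking variant mentioned above (my own illustration of the same ring pattern, using the outdata/indata buffers from the Sendrecv version):

// Post the receive and the send, then wait on both;
// neither call blocks, so the ranks cannot deadlock each other.
MPI_Request reqs[2];
MPI_Irecv(indata, datasize, MPI_INT, nodeFrom, 0, MPI_COMM_WORLD, &reqs[0]);
MPI_Isend(outdata, datasize, MPI_INT, nodeDest, 0, MPI_COMM_WORLD, &reqs[1]);
MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);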

Issue porting Decryption from Windows CryptoAPI to linux libmcrypt

I am trying to port my program from Windows to Linux. The Windows program uses the Windows CryptoAPI, and the Linux version uses libmcrypt.
Here is the Windows code:
#include <windows.h>
#include <iostream>
#include <iomanip>
#include <vector>
#include <string>
#include <exception>
using namespace std;

class CryptError : public exception {
public:
    CryptError(){}
};

#define CHECK_RET(x) if(x == FALSE) {retval = GetLastError(); throw CryptError();};

LONG Decrypt(const string &key, std::vector<BYTE> &data){
    LONG retval = 0;
    try{
        HCRYPTPROV hCrypt;
        HCRYPTHASH hHash;
        HCRYPTKEY hKey;
        CHECK_RET(CryptAcquireContext(&hCrypt, NULL, NULL, PROV_RSA_FULL, 0));
        CHECK_RET(CryptCreateHash(hCrypt, CALG_MD5, 0, 0, &hHash));
        CHECK_RET(CryptHashData(hHash, reinterpret_cast<const BYTE *>(key.c_str()), key.size(), 0));
        CHECK_RET(CryptDeriveKey(hCrypt, CALG_RC2, hHash, MAKELPARAM(CRYPT_EXPORTABLE, 80), &hKey));
        BYTE tempVal[200];
        DWORD len = 200;
        CryptGetKeyParam(hKey, KP_EFFECTIVE_KEYLEN, tempVal, &len, 0);
        len = 200;
        CryptGetKeyParam(hKey, KP_MODE, tempVal, &len, 0);
        len = 200;
        CryptExportKey(hKey, NULL, PLAINTEXTKEYBLOB, 0, tempVal, &len);
        len = 200;
        CryptGetKeyParam(hKey, KP_IV, tempVal, &len, 0);
        DWORD count = data.size();
        CHECK_RET(CryptDecrypt(hKey, 0, TRUE, 0, &(data[0]), &count));
        data.resize(count);
    }catch(CryptError &e){
    }
    return retval;
}

int main(void){
    BYTE data[9] = {0xdc,0x3d,0x96,0x23,0x29,0xdd,0x1b,0x2f, 0};
    vector<BYTE> vData(data, data + 8);
    Decrypt("PNEMAIL", vData);
    cerr << "vData: ";
    int len = vData.size();
    for(int i = 0; i < len; i++){
        if(i > 0)
            cerr << ',';
        cerr << hex << setw(2) << setfill('0') << (int)(vData[i]);
    }
    cerr << endl;
    return 0;
}
When the program is run, it returns:
vData: 42,46,30,41,43,34,31
The quick-and-dirty Linux version looks like this:
#include <mcrypt.h>
#include <iostream>
#include <iomanip>
#include <string>
#include <openssl/md5.h>
#include <stdint.h>
#include <stdexcept>
#include <vector>
#include <valarray>
#include <memory.h>
using namespace std;

class MCrypt{
private:
    MCRYPT mcrypt;
public:
    MCrypt(char *algorithm, char* algorithm_directory, char *mode, char* mode_directory){
        mcrypt = mcrypt_module_open(algorithm, algorithm_directory, mode, mode_directory);
        if(mcrypt == MCRYPT_FAILED)
            throw runtime_error("MCrypt init failed");
    }
    int init(void *key, int lenofkey, void *IV){
        return mcrypt_generic_init(mcrypt, key, lenofkey, IV);
    }
    int enc_get_iv_size(){
        return mcrypt_enc_get_iv_size(mcrypt);
    }
    int deinit(){
        return mcrypt_generic_deinit(mcrypt);
    }
    int decrypt(void *data, int len){
        return mdecrypt_generic(mcrypt, data, len);
    }
    ~MCrypt(){
        deinit();
        mcrypt_module_close(mcrypt);
    }
};

#ifdef DEBUG
void inline printArrayFunc(const char *start, const uint8_t *data, int len){
    // DEBUG: print value of $key1
    cerr << start;
    for(int i = 0; i < len; i++){
        if(i > 0)
            cerr << ',';
        cerr << hex << setw(2) << setfill('0') << (int)(data[i]);
    }
    cerr << endl;
}
#define printArray(start, data, len) printArrayFunc(start, data, len)
#else
#define printArray(start, data, len)
#endif

int main(void){
    uint8_t data[8] = {0xdc,0x3d,0x96,0x23,0x29,0xdd,0x1b,0x2f};
    const char *sKey1 = "PNEMAIL";
    const int key1Len = 7;
    uint8_t *dataPtr = &(data[0]);
    uint8_t key1[17];
    key1[16] = 0;
    // Hash sKey1
    MD5(reinterpret_cast<const unsigned char *>(sKey1), key1Len, key1);
    MCrypt mcrypt(MCRYPT_RC2, NULL, MCRYPT_CBC, NULL);
    vector<uint8_t> iv(mcrypt.enc_get_iv_size(), 0);
    // Use the first 80 bits of key1
    mcrypt.init(key1, 10, &(iv[0]));
    mcrypt.decrypt(dataPtr, 8);
    printArray("vData: ", dataPtr, 8);
    return 0;
}
When the program is run, it returns:
vData: 4d,3d,82,71,88,d2,d5,4b
I've checked that both programs are using the same data.
CryptDeriveKey creates the key 07,f1,e2,ea,d4,c8,79,74,03,a6 (according to CryptExportKey), the same as the first 10 bytes of the MD5 hash generated on Linux (which I shorten to match the requested 80-bit key).
Neither is using a salt on the algorithm (or at least neither reports one).
Both are using an 8-byte IV of {0,0,0,0,0,0,0,0}.
Both are using the RC2 algorithm.
Both are using CBC mode.
I cannot figure out why they are returning different data. Any assistance would be greatly appreciated.
