Ascon-128 in arduino

Ascon-128 in arduino - arduino

I have tried implementing the Ascon-128 encryption algorithm in Arduino https://ascon.iaik.tugraz.at.
The compiler doesn't find any mistakes, but when I run the code the monitor doesn't return anything. Not even the Serial.println(ciphertext[0]); gets printed.
Do you see any mistakes in my code?
Thank you
#include <stdio.h>
typedef unsigned long long bit64;
bit64 constants[16] = {0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87, 0x78, 0x69
,0x5a, 0x4b, 0x3c, 0x2d, 0x1e, 0x0f };
bit64 state[5] = {0}, t[5] = { 0 };
void setup() {
Serial.begin(9600);
}
void loop() {
// put your main code here, to run repeatedly:
main();
delay(500);
}
bit64 rotate(bit64 x, int l){
bit64 temp;
temp = (x >> l) ^ (x << (64 - l));
return temp;
}
void print_state(bit64 state[5]){
for (int i = 0; i < 5; i++) printf("%llx\n", state[i]);
}
void sbox (bit64 x[5]) {
x[0] ^= x[4]; x[4] ^= x[3]; x[2] ^= x[1];
t[0] = x[0]; t[1] = x[1]; t[2] = x[2]; t[3] = x[3]; t[4] = x[4];
t[0] = ~t[0]; t[1] = ~t[1]; t[2] = ~t[2]; t[3] = ~t[3]; t[4] = ~t[4];
t[0] &= x[1]; t[1] &= x[2]; t[2] &= x[3]; t[3] &= x[4]; t[4] &= x[0];
x[0] ^= t[1]; x[1] ^= t[2]; x[2] ^= t[3]; x[3] ^= t[4]; x[4] ^= t[0];
x[1] ^= x[0]; x[0] ^= x[4]; x[3] ^= x[2]; x[2] = ~x[2];
}
void linear(bit64 state[5]) {
bit64 temp0, temp1;
temp0 = rotate(state[0], 19);
temp1 = rotate(state[0], 28);
state[0] ^= temp0 ^temp1;
temp0 = rotate(state[1], 61);
temp1 = rotate(state[1], 39);
state[1] ^= temp0 ^temp1;
temp0 = rotate(state[2], 1);
temp1 = rotate(state[2], 6);
state[2] ^= temp0 ^temp1;
temp0 = rotate(state[3], 10);
temp1 = rotate(state[3], 17);
state[3] ^= temp0 ^temp1;
temp0 = rotate(state[4], 7);
temp1 = rotate(state[4], 41);
state[4] ^= temp0 ^temp1;
}
void add_constant(bit64 state[5], int i, int a) {
state[2] = state[2] ^ constants[12- a + i];
}
void p(bit64 state[5], int a) {
for (int i = 0; i < a; i++){
add_constant(state, i, a);
sbox(state);
linear(state);
}
}
void initialization(bit64 state[5], bit64 key[2]) {
p(state, 12);
state[3] ^= key[0];
state[4] ^= key[1];
}
void encrypt(bit64 state[5], int length, bit64 plaintext[], bit64 ciphertext[]) {
ciphertext[0] = plaintext[0] ^ state[0];
for (int i = 1; i < length; i++){
p(state, 6);
ciphertext[i] = plaintext[i] ^ state[0];
state[0] = ciphertext[i];
}
}
void finalization(bit64 state[5], bit64 key[2]){
state[3] ^= key[0];
state[4] ^= key[1];
p(state, 12);
// bijgevoegd
state[3] ^= key[0];
state[4] ^= key[1];
}
int main() {
bit64 nonce[2] = { 2000 };
bit64 key[2] = { 3000 };
bit64 IV = 0x80400c060000000;
bit64 to_encode = 0x82187;
bit64 plaintext [] = { 0x1234567890abcdef, to_encode }, ciphertext[10] = { 0 };
state[0] = IV;
state[1] = key[0];
state[2] = key[1];
state[3] = nonce[0];
state[4] = key[1];
initialization(state, key);
print_state(state);
encrypt(state, 2, plaintext, ciphertext);
Serial.println(ciphertext[0]);
Serial.println(ciphertext[1]);
finalization(state, key);
Serial.println(state[3]);
Serial.println(state[4]);
return 0;
}

Related

Trying to add functions on ESP32CAM CameraWebServer Example Code

I am trying to control ESP32CAM's I/O pins and also getting view from camera.
For this purpose, I tried to edit CameraWebServer example like this:
#include "esp_camera.h"
#include <WiFi.h>
//
// WARNING!!! PSRAM IC required for UXGA resolution and high JPEG quality
// Ensure ESP32 Wrover Module or other board with PSRAM is selected
// Partial images will be transmitted if image exceeds buffer size
//
// Select camera model
//#define CAMERA_MODEL_WROVER_KIT // Has PSRAM
//#define CAMERA_MODEL_ESP_EYE // Has PSRAM
//#define CAMERA_MODEL_M5STACK_PSRAM // Has PSRAM
//#define CAMERA_MODEL_M5STACK_V2_PSRAM // M5Camera version B Has PSRAM
//#define CAMERA_MODEL_M5STACK_WIDE // Has PSRAM
//#define CAMERA_MODEL_M5STACK_ESP32CAM // No PSRAM
#define CAMERA_MODEL_AI_THINKER // Has PSRAM
//#define CAMERA_MODEL_TTGO_T_JOURNAL // No PSRAM
#include "camera_pins.h"
WiFiServer espServer(81);
String request;
const char* ssid = "VODAFONE_9D53";
const char* password = "fc1f1fff";
void startCameraServer();
void setup() {
Serial.begin(115200);
Serial.setDebugOutput(true);
Serial.println();
pinMode(12, OUTPUT);
pinMode(13, OUTPUT);
digitalWrite(4, LOW);
camera_config_t config;
config.ledc_channel = LEDC_CHANNEL_0;
config.ledc_timer = LEDC_TIMER_0;
config.pin_d0 = Y2_GPIO_NUM;
config.pin_d1 = Y3_GPIO_NUM;
config.pin_d2 = Y4_GPIO_NUM;
config.pin_d3 = Y5_GPIO_NUM;
config.pin_d4 = Y6_GPIO_NUM;
config.pin_d5 = Y7_GPIO_NUM;
config.pin_d6 = Y8_GPIO_NUM;
config.pin_d7 = Y9_GPIO_NUM;
config.pin_xclk = XCLK_GPIO_NUM;
config.pin_pclk = PCLK_GPIO_NUM;
config.pin_vsync = VSYNC_GPIO_NUM;
config.pin_href = HREF_GPIO_NUM;
config.pin_sscb_sda = SIOD_GPIO_NUM;
config.pin_sscb_scl = SIOC_GPIO_NUM;
config.pin_pwdn = PWDN_GPIO_NUM;
config.pin_reset = RESET_GPIO_NUM;
config.xclk_freq_hz = 20000000;
config.pixel_format = PIXFORMAT_JPEG;
// if PSRAM IC present, init with UXGA resolution and higher JPEG quality
// for larger pre-allocated frame buffer.
if(psramFound()){
config.frame_size = FRAMESIZE_UXGA;
config.jpeg_quality = 10;
config.fb_count = 2;
} else {
config.frame_size = FRAMESIZE_SVGA;
config.jpeg_quality = 12;
config.fb_count = 1;
}
#if defined(CAMERA_MODEL_ESP_EYE)
pinMode(13, INPUT_PULLUP);
pinMode(14, INPUT_PULLUP);
#endif
// camera init
esp_err_t err = esp_camera_init(&config);
if (err != ESP_OK) {
Serial.printf("Camera init failed with error 0x%x", err);
return;
}
sensor_t * s = esp_camera_sensor_get();
// initial sensors are flipped vertically and colors are a bit saturated
if (s->id.PID == OV3660_PID) {
s->set_vflip(s, 1); // flip it back
s->set_brightness(s, 1); // up the brightness just a bit
s->set_saturation(s, -2); // lower the saturation
}
// drop down frame size for higher initial frame rate
s->set_framesize(s, FRAMESIZE_QVGA);
#if defined(CAMERA_MODEL_M5STACK_WIDE) || defined(CAMERA_MODEL_M5STACK_ESP32CAM)
s->set_vflip(s, 1);
s->set_hmirror(s, 1);
#endif
WiFi.begin(ssid, password);
while (WiFi.status() != WL_CONNECTED) {
delay(500);
Serial.print(".");
}
Serial.println("");
Serial.println("WiFi connected");
startCameraServer();
Serial.print("Camera Ready! Use 'http://");
Serial.print(WiFi.localIP());
Serial.println("' to connect");
delay(2000);
espServer.begin();
}
void loop() {
WiFiClient client = espServer.available(); /* Check if a client is available */
if(!client)
{
return;
}
Serial.println("New Client!!!");
boolean currentLineIsBlank = true;
while (client.connected())
{
if (client.available())
{
char c = client.read();
request += c;
Serial.write(c);
if (c == '\n' && currentLineIsBlank)
{
if (request.indexOf("/GPIO12ON") != -1)
{
Serial.println("GPIO12 LED is ON");
digitalWrite(12, HIGH);
Serial.printf("12 HIGH");
}
if (request.indexOf("/GPIO12OFF") != -1)
{
Serial.println("GPIO12 LED is OFF");
digitalWrite(12, LOW);
Serial.printf("12 LOW");
}
if (request.indexOf("/GPIO13ON") != -1)
{
Serial.println("GPIO13 LED is ON");
digitalWrite(13, HIGH);
Serial.printf("13 HIGH");
}
if (request.indexOf("/GPIO13OFF") != -1)
{
Serial.println("GPIO13 LED is OFF");
digitalWrite(13, LOW);
Serial.printf("13 LOW");
}
client.println("HTTP/1.1 200 OK");
client.println("Content-Type: text/html");
client.println("Connection: close");
client.println(); // IMPORTANT
break;
}
if(c == '\n')
{
currentLineIsBlank = true;
}
else if(c != '\r')
{
currentLineIsBlank = false;
}
//client.print("\n");
}
}
delay(1);
request = "";
//client.flush();
client.stop();
Serial.println("Client disconnected");
Serial.print("\n");
}
I did 81 port because i want to use camera and I/O control on ngrok. (I can only open 1 port on ngrok, stream URL is already on 81 port so i tried to move I/O control part to 81 port)
I can control I/O pins but i cant use camera on xxx.xxx.x.xx:81/stream URL. Can you help me ?

I got some help and found this.
First of all, function is need to be defined in the app_httpd.cpp like this:
static esp_err_t gpio12On_handler(httpd_req_t *req){
Serial.println("ON, 12.port HIGH");
digitalWrite(12, HIGH);
return httpd_resp_send(req, NULL, 0);
}
And then in the startCameraServer() function you need to declare the URI like this:
httpd_uri_t gpio12On_uri = {
.uri = "/gpio12On",
.method = HTTP_GET,
.handler = gpio12On_handler,
.user_ctx = NULL
};
Finally, you can add the function to server with this code:
httpd_register_uri_handler(camera_httpd, &ledOn_uri);
Note:
httpd_register_uri_handler comand needs to come after the
httpd_start command
If you want to have this URI on the port 80 you need to use
stream_httpd
But if you want to use port 81 then you need to use camera_httpd
Full code:
#include "esp_http_server.h"
#include "esp_timer.h"
#include "esp_camera.h"
#include "img_converters.h"
#include "camera_index.h"
#include "Arduino.h"
#include "fb_gfx.h"
#include "fd_forward.h"
#include "fr_forward.h"
#define ENROLL_CONFIRM_TIMES 5
#define FACE_ID_SAVE_NUMBER 7
#define FACE_COLOR_WHITE 0x00FFFFFF
#define FACE_COLOR_BLACK 0x00000000
#define FACE_COLOR_RED 0x000000FF
#define FACE_COLOR_GREEN 0x0000FF00
#define FACE_COLOR_BLUE 0x00FF0000
#define FACE_COLOR_YELLOW (FACE_COLOR_RED | FACE_COLOR_GREEN)
#define FACE_COLOR_CYAN (FACE_COLOR_BLUE | FACE_COLOR_GREEN)
#define FACE_COLOR_PURPLE (FACE_COLOR_BLUE | FACE_COLOR_RED)
typedef struct {
size_t size; //number of values used for filtering
size_t index; //current value index
size_t count; //value count
int sum;
int * values; //array to be filled with values
} ra_filter_t;
typedef struct {
httpd_req_t *req;
size_t len;
} jpg_chunking_t;
#define PART_BOUNDARY "123456789000000000000987654321"
static const char* _STREAM_CONTENT_TYPE = "multipart/x-mixed-replace;boundary=" PART_BOUNDARY;
static const char* _STREAM_BOUNDARY = "\r\n--" PART_BOUNDARY "\r\n";
static const char* _STREAM_PART = "Content-Type: image/jpeg\r\nContent-Length: %u\r\n\r\n";
static ra_filter_t ra_filter;
httpd_handle_t stream_httpd = NULL;
httpd_handle_t camera_httpd = NULL;
static mtmn_config_t mtmn_config = {0};
static int8_t detection_enabled = 0;
static int8_t recognition_enabled = 0;
static int8_t is_enrolling = 0;
static face_id_list id_list = {0};
static ra_filter_t * ra_filter_init(ra_filter_t * filter, size_t sample_size){
memset(filter, 0, sizeof(ra_filter_t));
filter->values = (int *)malloc(sample_size * sizeof(int));
if(!filter->values){
return NULL;
}
memset(filter->values, 0, sample_size * sizeof(int));
filter->size = sample_size;
return filter;
}
static int ra_filter_run(ra_filter_t * filter, int value){
if(!filter->values){
return value;
}
filter->sum -= filter->values[filter->index];
filter->values[filter->index] = value;
filter->sum += filter->values[filter->index];
filter->index++;
filter->index = filter->index % filter->size;
if (filter->count < filter->size) {
filter->count++;
}
return filter->sum / filter->count;
}
static void rgb_print(dl_matrix3du_t *image_matrix, uint32_t color, const char * str){
fb_data_t fb;
fb.width = image_matrix->w;
fb.height = image_matrix->h;
fb.data = image_matrix->item;
fb.bytes_per_pixel = 3;
fb.format = FB_BGR888;
fb_gfx_print(&fb, (fb.width - (strlen(str) * 14)) / 2, 10, color, str);
}
static int rgb_printf(dl_matrix3du_t *image_matrix, uint32_t color, const char *format, ...){
char loc_buf[64];
char * temp = loc_buf;
int len;
va_list arg;
va_list copy;
va_start(arg, format);
va_copy(copy, arg);
len = vsnprintf(loc_buf, sizeof(loc_buf), format, arg);
va_end(copy);
if(len >= sizeof(loc_buf)){
temp = (char*)malloc(len+1);
if(temp == NULL) {
return 0;
}
}
vsnprintf(temp, len+1, format, arg);
va_end(arg);
rgb_print(image_matrix, color, temp);
if(len > 64){
free(temp);
}
return len;
}
static void draw_face_boxes(dl_matrix3du_t *image_matrix, box_array_t *boxes, int face_id){
int x, y, w, h, i;
uint32_t color = FACE_COLOR_YELLOW;
if(face_id < 0){
color = FACE_COLOR_RED;
} else if(face_id > 0){
color = FACE_COLOR_GREEN;
}
fb_data_t fb;
fb.width = image_matrix->w;
fb.height = image_matrix->h;
fb.data = image_matrix->item;
fb.bytes_per_pixel = 3;
fb.format = FB_BGR888;
for (i = 0; i < boxes->len; i++){
// rectangle box
x = (int)boxes->box[i].box_p[0];
y = (int)boxes->box[i].box_p[1];
w = (int)boxes->box[i].box_p[2] - x + 1;
h = (int)boxes->box[i].box_p[3] - y + 1;
fb_gfx_drawFastHLine(&fb, x, y, w, color);
fb_gfx_drawFastHLine(&fb, x, y+h-1, w, color);
fb_gfx_drawFastVLine(&fb, x, y, h, color);
fb_gfx_drawFastVLine(&fb, x+w-1, y, h, color);
#if 0
// landmark
int x0, y0, j;
for (j = 0; j < 10; j+=2) {
x0 = (int)boxes->landmark[i].landmark_p[j];
y0 = (int)boxes->landmark[i].landmark_p[j+1];
fb_gfx_fillRect(&fb, x0, y0, 3, 3, color);
}
#endif
}
}
static int run_face_recognition(dl_matrix3du_t *image_matrix, box_array_t *net_boxes){
dl_matrix3du_t *aligned_face = NULL;
int matched_id = 0;
aligned_face = dl_matrix3du_alloc(1, FACE_WIDTH, FACE_HEIGHT, 3);
if(!aligned_face){
Serial.println("Could not allocate face recognition buffer");
return matched_id;
}
if (align_face(net_boxes, image_matrix, aligned_face) == ESP_OK){
if (is_enrolling == 1){
int8_t left_sample_face = enroll_face(&id_list, aligned_face);
if(left_sample_face == (ENROLL_CONFIRM_TIMES - 1)){
Serial.printf("Enrolling Face ID: %d\n", id_list.tail);
}
Serial.printf("Enrolling Face ID: %d sample %d\n", id_list.tail, ENROLL_CONFIRM_TIMES - left_sample_face);
rgb_printf(image_matrix, FACE_COLOR_CYAN, "ID[%u] Sample[%u]", id_list.tail, ENROLL_CONFIRM_TIMES - left_sample_face);
if (left_sample_face == 0){
is_enrolling = 0;
Serial.printf("Enrolled Face ID: %d\n", id_list.tail);
}
} else {
matched_id = recognize_face(&id_list, aligned_face);
if (matched_id >= 0) {
Serial.printf("Match Face ID: %u\n", matched_id);
rgb_printf(image_matrix, FACE_COLOR_GREEN, "Hello Subject %u", matched_id);
} else {
Serial.println("No Match Found");
rgb_print(image_matrix, FACE_COLOR_RED, "Intruder Alert!");
matched_id = -1;
}
}
} else {
Serial.println("Face Not Aligned");
//rgb_print(image_matrix, FACE_COLOR_YELLOW, "Human Detected");
}
dl_matrix3du_free(aligned_face);
return matched_id;
}
static size_t jpg_encode_stream(void * arg, size_t index, const void* data, size_t len){
jpg_chunking_t *j = (jpg_chunking_t *)arg;
if(!index){
j->len = 0;
}
if(httpd_resp_send_chunk(j->req, (const char *)data, len) != ESP_OK){
return 0;
}
j->len += len;
return len;
}
static esp_err_t capture_handler(httpd_req_t *req){
camera_fb_t * fb = NULL;
esp_err_t res = ESP_OK;
int64_t fr_start = esp_timer_get_time();
fb = esp_camera_fb_get();
if (!fb) {
Serial.println("Camera capture failed");
httpd_resp_send_500(req);
return ESP_FAIL;
}
httpd_resp_set_type(req, "image/jpeg");
httpd_resp_set_hdr(req, "Content-Disposition", "inline; filename=capture.jpg");
httpd_resp_set_hdr(req, "Access-Control-Allow-Origin", "*");
size_t out_len, out_width, out_height;
uint8_t * out_buf;
bool s;
bool detected = false;
int face_id = 0;
if(!detection_enabled || fb->width > 400){
size_t fb_len = 0;
if(fb->format == PIXFORMAT_JPEG){
fb_len = fb->len;
res = httpd_resp_send(req, (const char *)fb->buf, fb->len);
} else {
jpg_chunking_t jchunk = {req, 0};
res = frame2jpg_cb(fb, 80, jpg_encode_stream, &jchunk)?ESP_OK:ESP_FAIL;
httpd_resp_send_chunk(req, NULL, 0);
fb_len = jchunk.len;
}
esp_camera_fb_return(fb);
int64_t fr_end = esp_timer_get_time();
Serial.printf("JPG: %uB %ums\n", (uint32_t)(fb_len), (uint32_t)((fr_end - fr_start)/1000));
return res;
}
dl_matrix3du_t *image_matrix = dl_matrix3du_alloc(1, fb->width, fb->height, 3);
if (!image_matrix) {
esp_camera_fb_return(fb);
Serial.println("dl_matrix3du_alloc failed");
httpd_resp_send_500(req);
return ESP_FAIL;
}
out_buf = image_matrix->item;
out_len = fb->width * fb->height * 3;
out_width = fb->width;
out_height = fb->height;
s = fmt2rgb888(fb->buf, fb->len, fb->format, out_buf);
esp_camera_fb_return(fb);
if(!s){
dl_matrix3du_free(image_matrix);
Serial.println("to rgb888 failed");
httpd_resp_send_500(req);
return ESP_FAIL;
}
box_array_t *net_boxes = face_detect(image_matrix, &mtmn_config);
if (net_boxes){
detected = true;
if(recognition_enabled){
face_id = run_face_recognition(image_matrix, net_boxes);
}
draw_face_boxes(image_matrix, net_boxes, face_id);
free(net_boxes->score);
free(net_boxes->box);
free(net_boxes->landmark);
free(net_boxes);
}
jpg_chunking_t jchunk = {req, 0};
s = fmt2jpg_cb(out_buf, out_len, out_width, out_height, PIXFORMAT_RGB888, 90, jpg_encode_stream, &jchunk);
dl_matrix3du_free(image_matrix);
if(!s){
Serial.println("JPEG compression failed");
return ESP_FAIL;
}
int64_t fr_end = esp_timer_get_time();
Serial.printf("FACE: %uB %ums %s%d\n", (uint32_t)(jchunk.len), (uint32_t)((fr_end - fr_start)/1000), detected?"DETECTED ":"", face_id);
return res;
}
static esp_err_t stream_handler(httpd_req_t *req){
camera_fb_t * fb = NULL;
esp_err_t res = ESP_OK;
size_t _jpg_buf_len = 0;
uint8_t * _jpg_buf = NULL;
char * part_buf[64];
dl_matrix3du_t *image_matrix = NULL;
bool detected = false;
int face_id = 0;
int64_t fr_start = 0;
int64_t fr_ready = 0;
int64_t fr_face = 0;
int64_t fr_recognize = 0;
int64_t fr_encode = 0;
static int64_t last_frame = 0;
if(!last_frame) {
last_frame = esp_timer_get_time();
}
res = httpd_resp_set_type(req, _STREAM_CONTENT_TYPE);
if(res != ESP_OK){
return res;
}
httpd_resp_set_hdr(req, "Access-Control-Allow-Origin", "*");
while(true){
detected = false;
face_id = 0;
fb = esp_camera_fb_get();
if (!fb) {
Serial.println("Camera capture failed");
res = ESP_FAIL;
} else {
fr_start = esp_timer_get_time();
fr_ready = fr_start;
fr_face = fr_start;
fr_encode = fr_start;
fr_recognize = fr_start;
if(!detection_enabled || fb->width > 400){
if(fb->format != PIXFORMAT_JPEG){
bool jpeg_converted = frame2jpg(fb, 80, &_jpg_buf, &_jpg_buf_len);
esp_camera_fb_return(fb);
fb = NULL;
if(!jpeg_converted){
Serial.println("JPEG compression failed");
res = ESP_FAIL;
}
} else {
_jpg_buf_len = fb->len;
_jpg_buf = fb->buf;
}
} else {
image_matrix = dl_matrix3du_alloc(1, fb->width, fb->height, 3);
if (!image_matrix) {
Serial.println("dl_matrix3du_alloc failed");
res = ESP_FAIL;
} else {
if(!fmt2rgb888(fb->buf, fb->len, fb->format, image_matrix->item)){
Serial.println("fmt2rgb888 failed");
res = ESP_FAIL;
} else {
fr_ready = esp_timer_get_time();
box_array_t *net_boxes = NULL;
if(detection_enabled){
net_boxes = face_detect(image_matrix, &mtmn_config);
}
fr_face = esp_timer_get_time();
fr_recognize = fr_face;
if (net_boxes || fb->format != PIXFORMAT_JPEG){
if(net_boxes){
detected = true;
if(recognition_enabled){
face_id = run_face_recognition(image_matrix, net_boxes);
}
fr_recognize = esp_timer_get_time();
draw_face_boxes(image_matrix, net_boxes, face_id);
free(net_boxes->score);
free(net_boxes->box);
free(net_boxes->landmark);
free(net_boxes);
}
if(!fmt2jpg(image_matrix->item, fb->width*fb->height*3, fb->width, fb->height, PIXFORMAT_RGB888, 90, &_jpg_buf, &_jpg_buf_len)){
Serial.println("fmt2jpg failed");
res = ESP_FAIL;
}
esp_camera_fb_return(fb);
fb = NULL;
} else {
_jpg_buf = fb->buf;
_jpg_buf_len = fb->len;
}
fr_encode = esp_timer_get_time();
}
dl_matrix3du_free(image_matrix);
}
}
}
if(res == ESP_OK){
res = httpd_resp_send_chunk(req, _STREAM_BOUNDARY, strlen(_STREAM_BOUNDARY));
}
if(res == ESP_OK){
size_t hlen = snprintf((char *)part_buf, 64, _STREAM_PART, _jpg_buf_len);
res = httpd_resp_send_chunk(req, (const char *)part_buf, hlen);
}
if(res == ESP_OK){
res = httpd_resp_send_chunk(req, (const char *)_jpg_buf, _jpg_buf_len);
}
if(fb){
esp_camera_fb_return(fb);
fb = NULL;
_jpg_buf = NULL;
} else if(_jpg_buf){
free(_jpg_buf);
_jpg_buf = NULL;
}
if(res != ESP_OK){
break;
}
int64_t fr_end = esp_timer_get_time();
int64_t ready_time = (fr_ready - fr_start)/1000;
int64_t face_time = (fr_face - fr_ready)/1000;
int64_t recognize_time = (fr_recognize - fr_face)/1000;
int64_t encode_time = (fr_encode - fr_recognize)/1000;
int64_t process_time = (fr_encode - fr_start)/1000;
int64_t frame_time = fr_end - last_frame;
last_frame = fr_end;
frame_time /= 1000;
uint32_t avg_frame_time = ra_filter_run(&ra_filter, frame_time);
Serial.printf("MJPG: %uB %ums (%.1ffps), AVG: %ums (%.1ffps), %u+%u+%u+%u=%u %s%d\n",
(uint32_t)(_jpg_buf_len),
(uint32_t)frame_time, 1000.0 / (uint32_t)frame_time,
avg_frame_time, 1000.0 / avg_frame_time,
(uint32_t)ready_time, (uint32_t)face_time, (uint32_t)recognize_time, (uint32_t)encode_time, (uint32_t)process_time,
(detected)?"DETECTED ":"", face_id
);
}
last_frame = 0;
return res;
}
static esp_err_t cmd_handler(httpd_req_t *req){
char* buf;
size_t buf_len;
char variable[32] = {0,};
char value[32] = {0,};
buf_len = httpd_req_get_url_query_len(req) + 1;
if (buf_len > 1) {
buf = (char*)malloc(buf_len);
if(!buf){
httpd_resp_send_500(req);
return ESP_FAIL;
}
if (httpd_req_get_url_query_str(req, buf, buf_len) == ESP_OK) {
if (httpd_query_key_value(buf, "var", variable, sizeof(variable)) == ESP_OK &&
httpd_query_key_value(buf, "val", value, sizeof(value)) == ESP_OK) {
} else {
free(buf);
httpd_resp_send_404(req);
return ESP_FAIL;
}
} else {
free(buf);
httpd_resp_send_404(req);
return ESP_FAIL;
}
free(buf);
} else {
httpd_resp_send_404(req);
return ESP_FAIL;
}
int val = atoi(value);
sensor_t * s = esp_camera_sensor_get();
int res = 0;
if(!strcmp(variable, "framesize")) {
if(s->pixformat == PIXFORMAT_JPEG) res = s->set_framesize(s, (framesize_t)val);
}
else if(!strcmp(variable, "quality")) res = s->set_quality(s, val);
else if(!strcmp(variable, "contrast")) res = s->set_contrast(s, val);
else if(!strcmp(variable, "brightness")) res = s->set_brightness(s, val);
else if(!strcmp(variable, "saturation")) res = s->set_saturation(s, val);
else if(!strcmp(variable, "gainceiling")) res = s->set_gainceiling(s, (gainceiling_t)val);
else if(!strcmp(variable, "colorbar")) res = s->set_colorbar(s, val);
else if(!strcmp(variable, "awb")) res = s->set_whitebal(s, val);
else if(!strcmp(variable, "agc")) res = s->set_gain_ctrl(s, val);
else if(!strcmp(variable, "aec")) res = s->set_exposure_ctrl(s, val);
else if(!strcmp(variable, "hmirror")) res = s->set_hmirror(s, val);
else if(!strcmp(variable, "vflip")) res = s->set_vflip(s, val);
else if(!strcmp(variable, "awb_gain")) res = s->set_awb_gain(s, val);
else if(!strcmp(variable, "agc_gain")) res = s->set_agc_gain(s, val);
else if(!strcmp(variable, "aec_value")) res = s->set_aec_value(s, val);
else if(!strcmp(variable, "aec2")) res = s->set_aec2(s, val);
else if(!strcmp(variable, "dcw")) res = s->set_dcw(s, val);
else if(!strcmp(variable, "bpc")) res = s->set_bpc(s, val);
else if(!strcmp(variable, "wpc")) res = s->set_wpc(s, val);
else if(!strcmp(variable, "raw_gma")) res = s->set_raw_gma(s, val);
else if(!strcmp(variable, "lenc")) res = s->set_lenc(s, val);
else if(!strcmp(variable, "special_effect")) res = s->set_special_effect(s, val);
else if(!strcmp(variable, "wb_mode")) res = s->set_wb_mode(s, val);
else if(!strcmp(variable, "ae_level")) res = s->set_ae_level(s, val);
else if(!strcmp(variable, "face_detect")) {
detection_enabled = val;
if(!detection_enabled) {
recognition_enabled = 0;
}
}
else if(!strcmp(variable, "face_enroll")) is_enrolling = val;
else if(!strcmp(variable, "face_recognize")) {
recognition_enabled = val;
if(recognition_enabled){
detection_enabled = val;
}
}
else {
res = -1;
}
if(res){
return httpd_resp_send_500(req);
}
httpd_resp_set_hdr(req, "Access-Control-Allow-Origin", "*");
return httpd_resp_send(req, NULL, 0);
}
static esp_err_t status_handler(httpd_req_t *req){
static char json_response[1024];
sensor_t * s = esp_camera_sensor_get();
char * p = json_response;
*p++ = '{';
p+=sprintf(p, "\"framesize\":%u,", s->status.framesize);
p+=sprintf(p, "\"quality\":%u,", s->status.quality);
p+=sprintf(p, "\"brightness\":%d,", s->status.brightness);
p+=sprintf(p, "\"contrast\":%d,", s->status.contrast);
p+=sprintf(p, "\"saturation\":%d,", s->status.saturation);
p+=sprintf(p, "\"sharpness\":%d,", s->status.sharpness);
p+=sprintf(p, "\"special_effect\":%u,", s->status.special_effect);
p+=sprintf(p, "\"wb_mode\":%u,", s->status.wb_mode);
p+=sprintf(p, "\"awb\":%u,", s->status.awb);
p+=sprintf(p, "\"awb_gain\":%u,", s->status.awb_gain);
p+=sprintf(p, "\"aec\":%u,", s->status.aec);
p+=sprintf(p, "\"aec2\":%u,", s->status.aec2);
p+=sprintf(p, "\"ae_level\":%d,", s->status.ae_level);
p+=sprintf(p, "\"aec_value\":%u,", s->status.aec_value);
p+=sprintf(p, "\"agc\":%u,", s->status.agc);
p+=sprintf(p, "\"agc_gain\":%u,", s->status.agc_gain);
p+=sprintf(p, "\"gainceiling\":%u,", s->status.gainceiling);
p+=sprintf(p, "\"bpc\":%u,", s->status.bpc);
p+=sprintf(p, "\"wpc\":%u,", s->status.wpc);
p+=sprintf(p, "\"raw_gma\":%u,", s->status.raw_gma);
p+=sprintf(p, "\"lenc\":%u,", s->status.lenc);
p+=sprintf(p, "\"vflip\":%u,", s->status.vflip);
p+=sprintf(p, "\"hmirror\":%u,", s->status.hmirror);
p+=sprintf(p, "\"dcw\":%u,", s->status.dcw);
p+=sprintf(p, "\"colorbar\":%u,", s->status.colorbar);
p+=sprintf(p, "\"face_detect\":%u,", detection_enabled);
p+=sprintf(p, "\"face_enroll\":%u,", is_enrolling);
p+=sprintf(p, "\"face_recognize\":%u", recognition_enabled);
*p++ = '}';
*p++ = 0;
httpd_resp_set_type(req, "application/json");
httpd_resp_set_hdr(req, "Access-Control-Allow-Origin", "*");
return httpd_resp_send(req, json_response, strlen(json_response));
}
static esp_err_t index_handler(httpd_req_t *req){
httpd_resp_set_type(req, "text/html");
httpd_resp_set_hdr(req, "Content-Encoding", "gzip");
sensor_t * s = esp_camera_sensor_get();
if (s->id.PID == OV3660_PID) {
return httpd_resp_send(req, (const char *)index_ov3660_html_gz, index_ov3660_html_gz_len);
}
return httpd_resp_send(req, (const char *)index_ov2640_html_gz, index_ov2640_html_gz_len);
}
static esp_err_t gpio12On_handler(httpd_req_t *req){
Serial.println("ON, 12.port HIGH");
digitalWrite(12, HIGH);
return httpd_resp_send(req, NULL, 0);
}
static esp_err_t gpio12Off_handler(httpd_req_t *req){
Serial.println("OFF, 12.port LOW");
digitalWrite(12, LOW);
return httpd_resp_send(req, NULL, 0);
}
static esp_err_t gpio13On_handler(httpd_req_t *req){
Serial.println("ON, 13.port HIGH");
digitalWrite(13, HIGH);
return httpd_resp_send(req, NULL, 0);
}
static esp_err_t gpio13Off_handler(httpd_req_t *req){
Serial.println("OF, 13.port LOW");
digitalWrite(13, LOW);
return httpd_resp_send(req, NULL, 0);
}
void startCameraServer(){
httpd_config_t config = HTTPD_DEFAULT_CONFIG();
httpd_uri_t index_uri = {
.uri = "/",
.method = HTTP_GET,
.handler = index_handler,
.user_ctx = NULL
};
httpd_uri_t status_uri = {
.uri = "/status",
.method = HTTP_GET,
.handler = status_handler,
.user_ctx = NULL
};
httpd_uri_t cmd_uri = {
.uri = "/control",
.method = HTTP_GET,
.handler = cmd_handler,
.user_ctx = NULL
};
httpd_uri_t capture_uri = {
.uri = "/capture",
.method = HTTP_GET,
.handler = capture_handler,
.user_ctx = NULL
};
httpd_uri_t stream_uri = {
.uri = "/stream",
.method = HTTP_GET,
.handler = stream_handler,
.user_ctx = NULL
};
httpd_uri_t gpio12On_uri = {
.uri = "/gpio12On",
.method = HTTP_GET,
.handler = gpio12On_handler,
.user_ctx = NULL
};
httpd_uri_t gpio12Off_uri = {
.uri = "/gpio12Off",
.method = HTTP_GET,
.handler = gpio12Off_handler,
.user_ctx = NULL
};
httpd_uri_t gpio13On_uri = {
.uri = "/gpio13On",
.method = HTTP_GET,
.handler = gpio13On_handler,
.user_ctx = NULL
};
httpd_uri_t gpio13Off_uri = {
.uri = "/gpio13Off",
.method = HTTP_GET,
.handler = gpio13Off_handler,
.user_ctx = NULL
};
ra_filter_init(&ra_filter, 20);
mtmn_config.type = FAST;
mtmn_config.min_face = 80;
mtmn_config.pyramid = 0.707;
mtmn_config.pyramid_times = 4;
mtmn_config.p_threshold.score = 0.6;
mtmn_config.p_threshold.nms = 0.7;
mtmn_config.p_threshold.candidate_number = 20;
mtmn_config.r_threshold.score = 0.7;
mtmn_config.r_threshold.nms = 0.7;
mtmn_config.r_threshold.candidate_number = 10;
mtmn_config.o_threshold.score = 0.7;
mtmn_config.o_threshold.nms = 0.7;
mtmn_config.o_threshold.candidate_number = 1;
face_id_init(&id_list, FACE_ID_SAVE_NUMBER, ENROLL_CONFIRM_TIMES);
Serial.printf("Starting web server on port: '%d'\n", config.server_port);
if (httpd_start(&camera_httpd, &config) == ESP_OK) {
httpd_register_uri_handler(camera_httpd, &index_uri);
httpd_register_uri_handler(camera_httpd, &cmd_uri);
httpd_register_uri_handler(camera_httpd, &status_uri);
httpd_register_uri_handler(camera_httpd, &capture_uri);
}
config.server_port += 1;
config.ctrl_port += 1;
Serial.printf("Starting stream server on port: '%d'\n", config.server_port);
if (httpd_start(&stream_httpd, &config) == ESP_OK) {
httpd_register_uri_handler(stream_httpd, &stream_uri);
httpd_register_uri_handler(stream_httpd, &gpio12On_uri);
httpd_register_uri_handler(stream_httpd, &gpio12Off_uri);
httpd_register_uri_handler(stream_httpd, &gpio13On_uri);
httpd_register_uri_handler(stream_httpd, &gpio13Off_uri);
}
}

OpenCL generate SHA-256 hash

I need help with OpenCL.
The task is as follows:
There is an input parameter of type string. It is necessary to generate a SHA-256 hash using the resources of the video card.
It is necessary to create a cycle to select a hash. Each time add some postfix to the original string.
Result*Hash should start with 5 zeros "00000 ...".
For example, the entrance. parameter: "strela".
SHA-256: "7d7ceecdee08ea1c0ac46b27657a79395af36526b3214b59a92f8351ccf8f762"
Next, you need to add a postfix. For example, "strela1"
Here the hash will be: a2afd15651f44f19f3e4e216bf3ead22d5f5937e9f9dc250382ff1f764ba219f
then continue to add the postfix until the resulting hash begins to start with "00000.."
It is necessary to use all the cores of the video card, i.e. use parallelization. Each core will use its postfix.
As soon as some kernel computes the hash we need, interrupt all calculations on the cores and display the hash we need.
Source:
main.cpp
#define _CRT_SECURE_NO_WARNINGS
#include "sha256.h"
#include <stdio.h>
#include < string.h >
void crypt_and_print(char input[])
{
char result[65];
char diff[65] = "00000";
char *istr;
char buffer2[20];
int temp;
char str2[20];
for (int i = 0; i < 1; i++)
{
char string[] = "1qqq";
sprintf(buffer2, "%d", i);
temp = 8 - strlen(buffer2);
str2[0] = '\0';
while (strlen(str2) != temp)
strcat(str2, "0");
strcat(str2, buffer2);
strcat(string, str2);
sha256_crypt(string, result);
istr = strstr(result, diff);
if (istr != NULL) {
printf(istr);
break;
}
}
}
int main()
{
char result[65];
sha256_init(2048);
crypt_and_print((char*)"");
}
sha256.c
#define _CRT_SECURE_NO_WARNINGS
#include "sha256.h"
static cl_platform_id platform_id = NULL;
static cl_device_id device_id = NULL;
static cl_uint ret_num_devices;
static cl_uint ret_num_platforms;
static cl_context context;
static cl_int ret;
static char* source_str;
static size_t source_size;
static cl_program program;
static cl_kernel kernel;
static cl_command_queue command_queue;
static cl_mem pinned_saved_keys, pinned_partial_hashes, buffer_out, buffer_keys, data_info;
static cl_uint *partial_hashes;
static cl_uint *res_hashes;
static char *saved_plain;
static unsigned int datai[3];
static int have_full_hashes;
static size_t kpc = 4;
static size_t global_work_size=3;
static size_t local_work_size=1;
static size_t string_len;
void load_source();
void createDevice();
void createkernel();
void create_clobj();
void crypt_all();
void sha256_init(size_t user_kpc)
{
kpc = user_kpc;
load_source();
createDevice();
createkernel();
create_clobj();
}
void sha256_crypt(char input[], char* output)
{
int i;
string_len = strlen(input);
global_work_size = 3;
datai[0] = SHA256_PLAINTEXT_LENGTH;
datai[1] = global_work_size;
datai[2] = string_len;
memcpy(saved_plain, input, string_len+1);
crypt_all();
for(i=0; i<SHA256_RESULT_SIZE; i++)
{
sprintf(output+i*8,"%08x", partial_hashes[i]);
}
printf("'%s':\n%s\n", input, output);
}
void crypt_all()
{
//printf("%s\n",saved_plain);
ret = clEnqueueWriteBuffer(command_queue, data_info, CL_TRUE, 0, sizeof(unsigned int) * 3, datai, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, buffer_keys, CL_TRUE, 0, SHA256_PLAINTEXT_LENGTH * kpc, saved_plain, 0, NULL, NULL);
// printf("%s\n",buffer_keys);
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
ret = clFinish(command_queue);
// read back partial hashes
ret = clEnqueueReadBuffer(command_queue, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE, partial_hashes, 0, NULL, NULL);
have_full_hashes = 0;
}
void load_source()
{
FILE *fp;
fp = fopen("/sha256.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
}
void create_clobj(){
pinned_saved_keys = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, (SHA256_PLAINTEXT_LENGTH)*kpc, NULL, &ret);
saved_plain = (char*)clEnqueueMapBuffer(command_queue, pinned_saved_keys, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, (SHA256_PLAINTEXT_LENGTH)*kpc, 0, NULL, NULL, &ret);
memset(saved_plain, 0, SHA256_PLAINTEXT_LENGTH * kpc);
res_hashes = (cl_uint *)malloc(sizeof(cl_uint) * SHA256_RESULT_SIZE);
memset(res_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE);
pinned_partial_hashes = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * SHA256_RESULT_SIZE, NULL, &ret);
partial_hashes = (cl_uint *) clEnqueueMapBuffer(command_queue, pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE, 0, NULL, NULL, &ret);
memset(partial_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE);
buffer_keys = clCreateBuffer(context, CL_MEM_READ_ONLY, (SHA256_PLAINTEXT_LENGTH) * kpc, NULL, &ret);
buffer_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * SHA256_RESULT_SIZE, NULL, &ret);
data_info = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int) * 3, NULL, &ret);
clSetKernelArg(kernel, 0, sizeof(data_info), (void *) &data_info);
clSetKernelArg(kernel, 1, sizeof(buffer_keys), (void *) &buffer_keys);
clSetKernelArg(kernel, 2, sizeof(buffer_out), (void *) &buffer_out);
}
void createDevice()
{
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
}
void createkernel()
{
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
kernel = clCreateKernel(program, "sha256_crypt_kernel", &ret);
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
}
sha256.cl
#ifndef uint32_t
#define uint32_t unsigned int
#endif
#define H0 0x6a09e667
#define H1 0xbb67ae85
#define H2 0x3c6ef372
#define H3 0xa54ff53a
#define H4 0x510e527f
#define H5 0x9b05688c
#define H6 0x1f83d9ab
#define H7 0x5be0cd19
uint rotr(uint x, int n) {
if (n < 32) return (x >> n) | (x << (32 - n));
return x;
}
uint ch(uint x, uint y, uint z) {
return (x & y) ^ (~x & z);
}
uint maj(uint x, uint y, uint z) {
return (x & y) ^ (x & z) ^ (y & z);
}
uint sigma0(uint x) {
return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22);
}
uint sigma1(uint x) {
return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25);
}
uint gamma0(uint x) {
return rotr(x, 7) ^ rotr(x, 18) ^ (x >> 3);
}
uint gamma1(uint x) {
return rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10);
}
__kernel void sha256_crypt_kernel(__global uint *data_info,__global char *plain_key, __global uint *digest){
int t, gid, msg_pad;
int stop, mmod;
uint i, ulen, item, total;
uint W[80], temp, A,B,C,D,E,F,G,H,T1,T2;
uint num_keys = data_info[1];
int current_pad;
//printf(get_global_id(0));
uint K[64]={
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
msg_pad=0;
ulen = data_info[2];
total = ulen%64>=56?2:1 + ulen/64;
//printf("ulen: %u total:%u\n", ulen, total);
digest[0] = H0;
digest[1] = H1;
digest[2] = H2;
digest[3] = H3;
digest[4] = H4;
digest[5] = H5;
digest[6] = H6;
digest[7] = H7;
for(item=0; item<total; item++)
{
A = digest[0];
B = digest[1];
C = digest[2];
D = digest[3];
E = digest[4];
F = digest[5];
G = digest[6];
H = digest[7];
#pragma unroll
for (t = 0; t < 80; t++){
W[t] = 0x00000000;
}
msg_pad=item*64;
if(ulen > msg_pad)
{
current_pad = (ulen-msg_pad)>64?64:(ulen-msg_pad);
}
else
{
current_pad =-1;
}
// printf("current_pad: %d\n",current_pad);
if(current_pad>0)
{
i=current_pad;
stop = i/4;
// printf("i:%d, stop: %d msg_pad:%d\n",i,stop, msg_pad);
for (t = 0 ; t < stop+get_global_id(0) ; t++){
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
W[t] |= (uchar) plain_key[msg_pad + t * 4 + 3];
// printf("W[%u]: %u\n",t,W[t]);
}
mmod = i % 4;
if ( mmod == 3){
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 2]) << 8;
W[t] |= ((uchar) 0x80) ;
} else if (mmod == 2) {
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) plain_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= 0x8000 ;
} else if (mmod == 1) {
W[t] = ((uchar) plain_key[msg_pad + t * 4]) << 24;
W[t] |= 0x800000 ;
} else /*if (mmod == 0)*/ {
W[t] = 0x80000000 ;
}
if (current_pad<56)
{
W[15] = ulen*8 ;
// printf("ulen avlue 2 :w[15] :%u\n", W[15]);
}
}
else if(current_pad <0)
{
if( ulen%64==0)
W[0]=0x80000000;
W[15]=ulen*8;
//printf("ulen avlue 3 :w[15] :%u\n", W[15]);
}
for (t = 0; t < 64; t++) {
if (t >= 16)
W[t] = gamma1(W[t - 2]) + W[t - 7] + gamma0(W[t - 15]) + W[t - 16];
T1 = H + sigma1(E) + ch(E, F, G) + K[t] + W[t];
T2 = sigma0(A) + maj(A, B, C);
H = G; G = F; F = E; E = D + T1; D = C; C = B; B = A; A = T1 + T2;
}
digest[0] += A;
digest[1] += B;
digest[2] += C;
digest[3] += D;
digest[4] += E;
digest[5] += F;
digest[6] += G;
digest[7] += H;
}
printf("hi");
}
How can i use here paralelism (all GPU cores) to calculate needed hash code?
Is it real to do task like this using OPENCL ?

How do I send/read data from device in OpenCL?

How do I just create some data on device and then send/read it on the host?
I tried following but does not seem to work.
#include <stdio.h>
#include <stdlib.h>
#include "vectors.h"
#include "sphere.h"
#include "shading.h"
#include "ray.h"
#include "stdbool.h"
#include <CL/cl.h>
#if defined __APPLE__
#include <OpenGL/gl.h>
#include <OpenGL/glu.h>
#include <GLUT/glut.h>
#elif defined (WIN32)
#include <GL/freeglut.h>
#else
#include <GL/gl.h>
#include <GL/glu.h>
#include <GL/freeglut_std.h>
#endif
#include <time.h>
VECTOR3D light;
SPHERE sphere[NSPHERES];
static PIXEL pixel;
VIEWPORT viewport;
VECTOR3D view_point;
VEC_BASIS camera_frame;
cl_double focal_distance;
double color;
//double kd_red, kd_green, kd_blue;
//double ks_red, ks_green, ks_blue;
double red, green, blue;
double light_intensity, ambi_light_intensity;
double theta, reflected_theta;
int bShadow = 0;
int direction[NSPHERES];
int intersection_object = -1; // none
double current_lambda = 0x7fefffffffffffff; // maximum positive double
double current_reflected_lambda = 0x7fefffffffffffff; // maximum positive double
// window identifier:
static int win;
void Timer (int obsolete) {
glutPostRedisplay();
glutTimerFunc(10, Timer, 0);
}
// opencl stuff
typedef struct cl_struct {
cl_platform_id platform_id;
cl_device_id device_id;
cl_context context;
cl_command_queue queue;
} cl_struct;
#define MAX_SOURCE_SIZE (0x100000)
void disp2(void) {
int i,j;
PIXEL* pCurrentPixel;
PIXEL* pPixels;
int VPWIDTH = viewport.xvmax - viewport.xvmin;
int VPHEIGHT = viewport.yvmax - viewport.yvmin;
pPixels = (PIXEL*)(viewport.pPixels);
//clear all pixels:
glClear(GL_COLOR_BUFFER_BIT);
// For all pixels:
for (i=0; i<VPWIDTH; i++) {
for (j=0; j<VPHEIGHT; j++) {
pCurrentPixel = (PIXEL*)(pPixels + VPWIDTH*i + j);
//set color for the current pixel:
glColor3f(pCurrentPixel->rgb[0] , pCurrentPixel->rgb[1], pCurrentPixel->rgb[2]);
// draw pixel
glBegin(GL_POINTS);
glVertex2i(i, j);
glEnd();
} // j
} //i
//glFlush();
glutSwapBuffers();
}
void init(void) {
direction[0] = 1;
direction[1] = 0;
direction[2] = 1;
pixel.i = 0;
pixel.j = 0;
// set scene:
// 1. define viewport
viewport.xvmin = -VIEWPLANE;
viewport.yvmin = -VIEWPLANE;
viewport.xvmax = VIEWPLANE;
viewport.yvmax = VIEWPLANE;
// 2. allocate enough space for pixels in viewport
viewport.pPixels = (PIXEL *) malloc(sizeof(PIXEL) * (viewport.xvmax - viewport.xvmin) * (viewport.yvmax- viewport.yvmin));
// 3. set camera:
camera_frame.u.x = 1.0;
camera_frame.u.y = 0.0;
camera_frame.u.z = 0.0;
camera_frame.v.x = 0.0;
camera_frame.v.y = 1.0;
camera_frame.v.z = 0.0;
camera_frame.n.x = 0.0;
camera_frame.n.y = 0.0;
camera_frame.n.z = 1.0;
view_point.x = (viewport.xvmax - viewport.xvmin) / 2.0 ;
view_point.y = (viewport.yvmax - viewport.yvmin) / 2.0 ;
view_point.z = 0.0;
// 4. set light:
light.x = view_point.x - 1300;
light.y = view_point.y + 1300 ;
light.z = view_point.z - 300;
ambi_light_intensity = 1.0;
light_intensity = 1.0;
focal_distance = FOCALDIST;
// 5. put spheres behind the viewport:
sphere[0].radius = RADIUS/1.5;
sphere[0].center.x = view_point.x - (RADIUS+30);
sphere[0].center.y = view_point.y ;
sphere[0].center.z = view_point.z - focal_distance - (2*RADIUS+20);
// the first sphere is blue:
set_rgb_array(sphere[0].kd_rgb, 0.0, 0.0, 0.8);
set_rgb_array(sphere[0].ks_rgb, 1.0, 1.0, 1.0);
set_rgb_array(sphere[0].ka_rgb, 0.0, 0.0, 0.2);
sphere[0].shininess = 100.0;
sphere[0].mirror = false;
sphere[1].radius = RADIUS/1.2;
sphere[1].center.x = view_point.x + 0;
sphere[1].center.y = view_point.y + 50;
sphere[1].center.z = view_point.z - focal_distance - (3*RADIUS+20);
// the second sphere is green:
set_rgb_array(sphere[1].kd_rgb, 0.0, 0.5, 0.0);
set_rgb_array(sphere[1].ks_rgb, 1.0, 1.0, 1.0);
set_rgb_array(sphere[1].ka_rgb, 0.0, 0.2, 0.0);
sphere[1].shininess = 10.0;
sphere[1].mirror = false;
sphere[2].radius = RADIUS;
sphere[2].center.x = view_point.x + (2*RADIUS+30);
sphere[2].center.y = view_point.y + 100;
sphere[2].center.z = view_point.z - focal_distance - (4*RADIUS+20);
// the third sphere is red:
set_rgb_array(sphere[2].kd_rgb, 1.0, 0.0, 0.0);
set_rgb_array(sphere[2].ks_rgb, 1.0, 1.0, 1.0);
set_rgb_array(sphere[2].ka_rgb, 0.2, 0.0, 0.0);
sphere[2].shininess = 100.0;
sphere[2].mirror = false;
sphere[3].radius = 1*RADIUS;
sphere[3].center.x = view_point.x ;
sphere[3].center.y = view_point.y - 100*RADIUS-130;
sphere[3].center.z = view_point.z - focal_distance - (4*RADIUS+20);
// the third sphere is red:
set_rgb_array(sphere[3].kd_rgb, 0.5, 0.5, 0.5);
set_rgb_array(sphere[3].ks_rgb, 1.0, 1.0, 1.0);
set_rgb_array(sphere[3].ka_rgb, 0.5, 0.5, 0.5);
sphere[3].shininess = 100.0;
sphere[3].mirror = false;
// set clearing (background) color to white:
glClearColor(0.0, 0.0, 0.0, 0.0);
// specify that ortgogonal 2D projection is to be used to
// map context of 2D world coordinats to the screen. We use the
// world-coordinate rectangle of the same aspect ratio as the display window
// so ther is no distortion:
glMatrixMode(GL_PROJECTION);
gluOrtho2D(0.0, WINDOW, 0.0, WINDOW);
}
int main(int argc, const char * argv[]) {
clock_t startCPU, endCPU, startGPU, endGPU;
// init glut:
glutInit (&argc, argv);
// specify the display mode to be RGB and single buffering:
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
// specify the initial window position:
glutInitWindowPosition(100, 100);
// specify the initial window size:
glutInitWindowSize(WINDOW,WINDOW);
// create the window and set title:
win = glutCreateWindow("Basic Ray Tracer by Pa3cio, UL FRI");
init();
// Create the two input vectors
int i, j, k, l;
int VPWIDTH = viewport.xvmax - viewport.xvmin;
int VPHEIGHT = viewport.yvmax - viewport.yvmin;
// PIXEL* pixels = (PIXEL*) malloc(sizeof(PIXEL) * VPWIDTH * VPHEIGHT);
PIXEL* pPixelsFromGPU = (PIXEL*) malloc(sizeof(PIXEL) * VPWIDTH * VPHEIGHT);
PIXEL* pCurrentPixel;
PIXEL* pPixels;
RAY ray, shadow_ray;
SPHERE_INTERSECTION intersection, current_intersection, shadow_ray_intersection;
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("/home/rokj/sula/vpsa/seminarska/kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR,
VPWIDTH * VPHEIGHT * sizeof(PIXEL), NULL, &ret);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "compute_ray", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&output);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 1\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 1, sizeof(VECTOR3D), &view_point);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 2\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 2, sizeof(VECTOR3D), &camera_frame.n);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 3\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 3, sizeof(VECTOR3D), &camera_frame.u);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 4\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 4, sizeof(VECTOR3D), &camera_frame.v);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 5\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 5, sizeof(cl_int), &viewport.xvmin);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 6\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 6, sizeof(cl_int), &viewport.yvmin);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 7\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 7, sizeof(cl_double), &focal_distance);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 7\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 8, sizeof(cl_int), &VPWIDTH);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 9\n", stderr);
exit(1);
}
ret = clSetKernelArg(kernel, 9, sizeof(cl_int), &VPHEIGHT);
if (ret != CL_SUCCESS) {
fputs("error setting CL kernel arg 10\n", stderr);
exit(1);
}
ret = clFinish(command_queue);
// Execute the OpenCL kernel on the list
size_t global_item_size = VPWIDTH * VPHEIGHT; // Process the entire lists
size_t local_item_size = 1024; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
ret = clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0,
VPWIDTH * VPHEIGHT * sizeof(PIXEL), pPixelsFromGPU, 0, NULL, NULL);
// Display the result to the screen
//for(i = 0; i < LIST_SIZE; i++)
// printf("%d + %d = %d\n", A[i], B[i], C[i]);
// Clean up
// ret = clFlush(command_queue);
// ret = clFinish(command_queue);
// ret = clReleaseKernel(kernel);
// ret = clReleaseProgram(program);
// ret = clReleaseMemObject(b_mem_obj);
// ret = clReleaseCommandQueue(command_queue);
// ret = clReleaseContext(context);
pPixels = (PIXEL*) (pPixelsFromGPU);
// For all pixels:
for (i=0; i<VPWIDTH; i++) {
for (j=0; j<VPHEIGHT; j++) {
//pCurrentPixel = (PIXEL*)(pPixels + VPWIDTH*i + j);
// here I try to get pixel set on GPU, but it does not work
pCurrentPixel = &pPixels[i*VPWIDTH+j];
} //j
} //i
viewport.pPixels = (PIXEL*) (pPixelsFromGPU);
// register callback function to display graphics:
glutDisplayFunc(disp2);
// call Timer():
Timer(0);
// enter tha main loop and process events:
glutMainLoop();
free(pPixelsFromGPU);
return 0;
}
Definitions on host:
#include <CL/cl.h>
#ifndef DEFS_H
#define DEFS_H
#define _BLINNPHONG
//#define _LAMBERT
//#define _NOSHADING
#define NSPHERES 4
#define VIEWPLANE 500
#define WINDOW VIEWPLANE*2
#define FOCALDIST 1000
#define RADIUS 200
//#define _ANIMATE
// typedef enum {false=0, true=1} BOOL;
// typedef enum {down=0, up=1} DIRECTION;
#define CRED 0
#define CGREEN 1
#define CBLUE 2
#define true 1
#define false 0
/* --------------- VECTORS -------------------- */
typedef struct Vector3D{
cl_double x;
cl_double y;
cl_double z;
cl_double dummy1;
} VECTOR3D;
typedef struct ray {
VECTOR3D origin;
VECTOR3D direction;
} RAY;
/* ------------------- PIXEL ------------------ */
typedef struct pixel {
RAY ray;
cl_double rgb[4];
cl_int i;
cl_int j;
cl_int dummy1;
cl_int dummy2;
cl_int dummy3;
cl_int dummy4;
cl_int dummy5;
cl_int dummy6;
} PIXEL;
/* ----------------- VIEWPORT ----------------- */
typedef struct vp {
cl_int xvmin;
cl_int yvmin;
cl_int xvmax;
cl_int yvmax;
PIXEL* pPixels;
} VIEWPORT;
/* ---------------- SPHERE -------------------- */
typedef struct sp_intersection {
cl_double lambda_in;
cl_double lambda_out;
VECTOR3D normal;
VECTOR3D point;
cl_int valid;
} SPHERE_INTERSECTION;
typedef struct sph {
VECTOR3D center;
cl_double radius;
cl_double kd_rgb[3];
cl_double ks_rgb[3];
cl_double ka_rgb[3];
cl_double shininess;
cl_int mirror;
}SPHERE;
/* ------------------- RAY --------------------- */
/* --------------- VECTOR BASIS ---------------- */
typedef struct vb {
VECTOR3D u;
VECTOR3D v;
VECTOR3D n;
} VEC_BASIS;
#endif
In kernel:
typedef struct pixel {
RAY ray;
double rgb[4];
int i;
int j;
int dummy1;
int dummy2;
int dummy3;
int dummy4;
int dummy5;
int dummy6;
} PIXEL;
And kernel:
typedef struct Vector3D {
double x;
double y;
double z;
double dummy1;
} VECTOR3D;
typedef struct ray {
VECTOR3D origin;
VECTOR3D direction;
} RAY;
typedef struct pixel {
RAY ray;
double rgb[4];
int i;
int j;
int dummy1;
int dummy2;
int dummy3;
int dummy4;
int dummy5;
int dummy6;
} PIXEL;
void vec_sub(VECTOR3D *v1, VECTOR3D *v2, VECTOR3D *v3) {
v1->x = v2->x - v3->x;
v1->y = v2->y - v3->y;
v1->z = v2->z - v3->z;
}
void vec_add(VECTOR3D *v1, VECTOR3D *v2, VECTOR3D *v3) {
v1->x = v2->x + v3->x;
v1->y = v2->y + v3->y;
v1->z = v2->z + v3->z;
}
void vec_scale(double scale, VECTOR3D *v1, VECTOR3D *v2) {
v1->x = scale * v2->x;
v1->y = scale * v2->y;
v1->z = scale * v2->z;
}
double dotproduct(VECTOR3D *v1, VECTOR3D *v2) {
return v1->x * v2->x + v1->y * v2->y + v1->z * v2->z;
}
void normalize_vector(VECTOR3D *v) {
double magnitude;
// 1. calculate the magnitude (length):
magnitude = sqrt( dotproduct(v, v) );
// 2. normalize the vector:
v->x = v->x / magnitude;
v->y = v->y / magnitude;
v->z = v->z / magnitude;
}
__kernel void compute_ray(
write_only global PIXEL *output,
VECTOR3D view_point,
VECTOR3D camera_frame_n,
VECTOR3D camera_frame_u,
VECTOR3D camera_frame_v,
const int viewport_xvmin,
const int viewport_yvmin,
const double distance,
const int w, const int h
)
{
float u, v;
VECTOR3D v1, v2, v3, v4, dir;
RAY ray;
PIXEL pixel;
int gi = get_global_id(0);
int i = gi / w;
int j = gi % w;
u = (float)(viewport_xvmin) + (float)(i) + 0.5f;
v = (float)(viewport_yvmin) + (float)(j) + 0.5f;
vec_scale(-distance, &v1, &camera_frame_n);
vec_scale(u, &v2, &camera_frame_u);
vec_scale(v, &v3, &camera_frame_v);
ray.origin.x = 22;
ray.origin.y = 22;
ray.origin.z = 22;
vec_add(&v4, &v1, &v2);
vec_add(&dir, &v4, &v3);
normalize_vector(&dir);
ray.direction.x = 11;
ray.direction.y = 11;
ray.direction.z = 11;
pixel.ray = ray;
pixel.i = 33;
pixel.j = 33;
output[i*w*j] = pixel;
}
I intentionally set i, j, origin and direction structures to fixed number so I could see if numbers are set.
Then I try to get pixel set on GPU in line
pCurrentPixel = &pPixels[i*VPWIDTH+j];
but pCurrentPixel->i for example is 0 instead of 33.
Code compiles with following commands:
gcc -c main.c -o main.o
gcc -c shading.c -o shading.o
gcc -c sphere.c -o sphere.o
gcc -c ray.c -o ray.o
gcc -c vectors.c -o vectors.o
gcc -I/usr/include -L/usr/lib/x86_64-linux-gnu main.o shading.o sphere.o ray.o vectors.o -lGL -lglut -lGLU -lX11 -lm -lrt -lOpenCL -o main

Besides setting the right VPWIDTH and VPHEIGHT output[i*w*j] = pixel; had to be changed to output[i*w+j] = pixel; in C code.

clEnqueueNDRangeKernel throws CL_OUT_OF_RESOURCES

I have a kernel running very well on Intel HD graphics card. But, when I want to run the kernel on my GeForce 960 it gives the CL_OUT_OF_RESOURCES error.
I have tried for different local sizes and made sure to not go beyond the array indices, but still have no clue why this error is happening. Do you know why my code runs fine on Intel and doesn't work on NVIDIA?
One weird thing that is happening in my code is that I have a 13 itrations of similar operations. For performance purposes, I have repeated the same operations for 13 times and avoided writing a loop just to save some additional operations that loops have. The code works on NVIDIA when I reach to the 11th operation. But, when I include the 12th operation in the code it gives the above error and the 11th and 12th operations are similar! Any ideas why such thing is happening?
Here is the kernel:
float2 projectCube(float3 axis, float3 vertex){
float voxelSize = 0.5f;
float2 projection = (float2)(0.0f, 0.0f);
float temp;
//1
temp = axis.x;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//2
temp = axis.x + axis.y;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//3
temp = axis.y;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//4
temp = axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//5
temp = axis.x + axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//6
temp = axis.y + axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//7
temp = axis.x + axis.y + axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
float product = dot(axis, vertex);
projection.x = voxelSize * projection.x + product;
projection.y = voxelSize * projection.y + product;
return projection;
}
float2 projectTriangle(float3 axis, float3 v0, float3 v1, float3 v2){
float2 projection;
projection.x = dot(axis, v0);
projection.y = projection.x;
float temp = dot(axis, v1);
if(projection.x > temp){
projection.x = temp;
}
else if(projection.y < temp){
projection.y = temp;
}
temp = dot(axis, v2);
if (projection.x > temp){
projection.x = temp;
}
else if (projection.y < temp){
projection.y = temp;
}
return projection;
}
float tester(float3 axis, float3 voxel, float3 v0, float3 v1, float3 v2){
float2 voxelProjection = projectCube(axis, voxel);
float2 faceProjection = projectTriangle(axis, v0, v1, v2);
float minProjection = fmin(voxelProjection.x, faceProjection.x);
float maxProjection = fmax(voxelProjection.y, faceProjection.y);
float testResult = maxProjection - minProjection - voxelProjection.y + voxelProjection.x
- faceProjection.y + faceProjection.x;
return testResult;
}
__kernel void voxelizer(size_t global_size,
float h_voxelSize,
__global float* h_minBoundsGrid,
__global int *h_dimGrid,
__global float* coords,
__global int* density)
{
//printf("local size is: %d\n", get_num_groups(0));
int i = get_global_id(0) * 9;
if (i <= global_size * 9){
float voxelSize = h_voxelSize;
float3 minBoundsGrid;
minBoundsGrid.x = h_minBoundsGrid[0];
minBoundsGrid.y = h_minBoundsGrid[1];
minBoundsGrid.z = h_minBoundsGrid[2];
int3 dimGrid;
dimGrid.x = h_dimGrid[0];
dimGrid.y = h_dimGrid[1];
dimGrid.z = h_dimGrid[2];
if ( i %9 == 0){
/*Triangle vertices*/
float3 v0;
v0 = (float3)(coords[i], coords[i + 1], coords[i + 2]);
float3 v1;
v1 = (float3)(coords[i + 3], coords[i + 4], coords[i + 5]);
float3 v2;
v2 = (float3)(coords[i + 6], coords[i + 7], coords[i + 8]);
//printf("i = %d. v0: %f, %f, %f\n", i, v0.x, v0.y, v0.z);
//printf("i = %d. v1: %f, %f, %f\n", i, v1.x, v1.y, v1.z);
//printf("i = %d. v2: %f, %f, %f\n", i, v2.x, v2.y, v2.z);
/*Normal vectors of the each voxel*/
float3 e0;
e0 = (float3)(0.5f, 0.0f, 0.0f);
float3 e1;
e1 = (float3)(0.0f, 0.5f, 0.0f);
float3 e2;
e2 = (float3)(0.0f, 0.0f, 0.5f);
/*Edges of a traingle*/
float3 f0;
f0 = v1 - v0;
float3 f1;
f1 = v2 - v1;
float3 f2;
f2 = v0 - v2;
float3 minLocalGrid;
minLocalGrid.x = fmin(v0.x, fmin(v1.x, v2.x));
minLocalGrid.y = fmin(v0.y, fmin(v1.y, v2.y));
minLocalGrid.z = fmin(v0.z, fmin(v1.z, v2.z));
minLocalGrid.x = voxelSize * floor(minLocalGrid.x / voxelSize);
minLocalGrid.y = voxelSize * floor(minLocalGrid.y / voxelSize);
minLocalGrid.z = voxelSize * floor(minLocalGrid.z / voxelSize);
//printf("i = %d. minLocalGrid = %f, %f, %f.\n", i, minLocalGrid.x, minLocalGrid.y, minLocalGrid.z);
float3 maxLocalGrid;
maxLocalGrid.x = fmax(v0.x, fmax(v1.x, v2.x));
maxLocalGrid.y = fmax(v0.y, fmax(v1.y, v2.y));
maxLocalGrid.z = fmax(v0.z, fmax(v1.z, v2.z));
maxLocalGrid.x = voxelSize * ceil(maxLocalGrid.x / voxelSize);
maxLocalGrid.y = voxelSize * ceil(maxLocalGrid.y / voxelSize);
maxLocalGrid.z = voxelSize * ceil(maxLocalGrid.z / voxelSize);
if (maxLocalGrid.x == minLocalGrid.x){ maxLocalGrid.x += voxelSize; }
if (maxLocalGrid.y == minLocalGrid.y){ maxLocalGrid.y += voxelSize; }
if (maxLocalGrid.z == minLocalGrid.z){ maxLocalGrid.z += voxelSize; }
//printf("i = %d. maxLocalGrid = %f, %f, %f.\n", i, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);
//printf("i = %d\n v0 = %f, %f, %f\n v1 = %f, %f, %f\n v2 = %f, %f, %f\n minLocalGrid = %f, %f, %f\n===============\n",
// i, v0.x, v0.y, v0.z, v1.x, v1.y, v1.z, v2.x, v2.y, v2.z, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);
float j = minLocalGrid.z;
while(j < maxLocalGrid.z){
float k = minLocalGrid.y;
while(k < maxLocalGrid.y){
float l = minLocalGrid.x;
while (l < maxLocalGrid.x){
float3 firstVertexOfVoxel = (float3)(l, k, j);
//printf("l,k,j: %f, %f, %f\n", l, k, j);
float3 globalCoordOffset = (firstVertexOfVoxel - minBoundsGrid) / voxelSize;
int3 globalDimOffset = convert_int3_rtz(globalCoordOffset);
//printf("i = %d. globalCoordOffset: %f, %f, %f\n", i, globalCoordOffset.x, globalCoordOffset.y, globalCoordOffset.z);
//printf("i = %d. globalDimOffset: %d, %d, %d\n", i, globalDimOffset.x, globalDimOffset.y, globalDimOffset.z);
int voxelIndexGlobalGrid = globalDimOffset.x + dimGrid.x * (globalDimOffset.y +
dimGrid.y * globalDimOffset.z);
//printf("i = %d. voxelIndexGlobalGrid = %d\n", i, voxelIndexGlobalGrid);
if (density[voxelIndexGlobalGrid] != 1){
/*The famous 13-axes test*/
float3 axis;
float testResult = 0;
int overlapCount = 0;
//1
testResult = tester(e0, firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//2
testResult = tester(e1, firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//3
testResult = tester(e2, firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//4
//axis = ;
testResult = tester(cross(-f2, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//5
/*axis = cross(e0, f0);*/
testResult = tester(cross(e0, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//6
//axis = cross(e0, f0);
testResult = tester(cross(e0, f1), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//7
//axis = cross(e0, f0);
testResult = tester(cross(e0, f2), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//8
//axis = cross(e1, f0);
testResult = tester(cross(e1, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//9
//axis = cross(e1, f1);
testResult = tester(cross(e1, f1), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//10
//axis = cross(e1, f2);
testResult = tester(cross(e1, f2), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//11
//axis = cross(e2, f0);
testResult = tester(cross(e2, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//12
//axis = cross(e2, f1);
testResult = tester(cross(e2, f1), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//13
//axis = cross(e2, f2);
testResult = tester(cross(e2, f2), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
if (overlapCount == 13){
density[voxelIndexGlobalGrid] = 1;
}
}
l = l + voxelSize;
}// while for l
k = k + voxelSize;
}// while for k
j = j + voxelSize;
}//while for j
//printf("Here are the max of the %d-th face: %f, %f, %f\n", i / 9, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);
//printf("Here are the coordinates of the %d-th face: %f, %f, %f\n", i / 9, e1.x, e1.y, e1.z);
//printf("Here are the coordinates of the %d-th face: %f, %f, %f\n", i / 9, e2.x, e2.y, e2.z);
//printf("\n==================KERNEL COMPUTED==================\n");
//barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
And this is the c-code:
#define DEVICE_SELECTOR 1 //0 for Intel and 1 for Nvidia in my computer
#define _CRT_SECURE_NO_WARNINGS
#define KERNEL_FILE "..\\voxelizerKernel.cl"
#define WORK_DIM 1
#define VOXEL_SIZE 0.5f
#define HALF_VOXEL_SIZE VOXEL_SIZE/2.0f;
//C header files
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <algorithm>
//OpenCL header files
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
cl_device_id create_device() {
cl_platform_id *platform;
cl_device_id dev;
cl_uint num_platform;
int err;
/* Identify a platform */
err = clGetPlatformIDs(0, NULL, &num_platform);
if (err < 0) {
printf("Error code: %d. Couldn't identify a platform\n", err);
exit(1);
}
platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platform);
clGetPlatformIDs(num_platform, platform, NULL);
/* Access a device */
err = clGetDeviceIDs(platform[DEVICE_SELECTOR], CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if (err < 0) {
printf("Error code: %d. Couldn't access any devices\n", err);
exit(1);
}
return dev;
}
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if (program_handle == NULL) {
printf("Couldn't find the program file\n");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create the program\n", err);
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*)malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
void print_device_info(cl_device_id dev){
cl_ulong glob_mem_size, local_mem_size;
cl_uint clock_freq, num_core, work_item_dim, time_res;
size_t local_size, work_item_size[3];
char dev_vendor[40], dev_name[400], driver_version[40], device_version[40];
clGetDeviceInfo(dev, CL_DEVICE_VENDOR, sizeof(dev_vendor), &dev_vendor, NULL);
clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(dev_name), &dev_name, NULL);
clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(glob_mem_size), &glob_mem_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(driver_version), &driver_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(device_version), &device_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_freq), &clock_freq, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_core), &num_core, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_size), &work_item_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(work_item_dim), &work_item_dim, NULL);
clGetDeviceInfo(dev, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(time_res), &time_res, NULL);
printf("==========================================================\n");
printf("Device Sepc without consideration of kernels:\n");
printf("CL_DEVICE_VENDOR: %s\n", dev_vendor);
printf("CL_DEVICE_NAME: %s\n", dev_name);
printf("CL_DEVICE_GLOBAL_MEM_SIZE: %I64u GB\n", glob_mem_size / 1073741824);
printf("CL_DEVICE_LOCAL_MEM_SIZE: %I64u KB\n", local_mem_size / 1024);
printf("CL_DRIVER_VERSION: %s\n", driver_version);
printf("CL_DEVICE_VERSION: %s\n", device_version);
printf("CL_DEVICE_MAX_CLOCK_FREQUENCY: %I32u MHz\n", clock_freq);
printf("CL_DEVICE_MAX_COMPUTE_UNITS: %I32u\n", num_core);
printf("CL_DEVICE_MAX_WORK_GROUP_SIZE %u\n", local_size);
printf("CL_DEVICE_MAX_WORK_ITEM_SIZES: {%I32u, %I32u, %I32u}\n", work_item_size[0], work_item_size[1], work_item_size[2]);
printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: %I32u\n", work_item_dim);
printf("CL_DEVICE_PROFILING_TIMER_RESOLUTION: %I32u ns\n", time_res);
printf("==========================================================\n");
}
int main()
{
/*OpenCL variables*/
cl_int i, j, err, num_groups;
size_t local_size, max_local_size, global_size, processed_global_size;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_device_id device;
cl_kernel voxelization_kernel, reduction_kernel, reduction_complete_kernel;
cl_mem coords_buffer, density_buffer, dimGrid_buffer, h_minBoundsGrid_buffer, fullVxelsCount_buffer, group_sums_buffer;
void *density_mapped_memory;
cl_event prof_event;
cl_ulong time_start, time_end, total_time;
float h_voxelSize = VOXEL_SIZE;
float fullVxelsCount = 0;
/*Read mesh data*/
float coords[54] =
{ 0.300500,
1.300000,
0.000500,
1.200500,
1.600000,
0.000500,
1.600500,
0.600000,
0.000500,
0.300500,
1.300000,
0.000500,
0.500500,
1.900000,
0.000500,
1.200500,
1.600000,
0.000500,
0.300500,
1.300000,
0.000500,
1.600500,
0.600000,
0.000500,
0.100500,
0.700000,
0.000500,
0.100500,
0.700000,
0.000500,
1.600500,
0.600000,
0.000500,
0.000500,
0.200000,
0.000500,
0.000500,
0.200000,
0.000500,
1.600500,
0.600000,
0.000500,
1.600500,
0.100000,
0.000500,
1.200500,
1.600000,
0.000500,
1.600500,
1.300000,
0.000500,
1.600500,
0.600000,
0.000500 };
/*Get the voxel count*/
float boundsGrid[6] = {0,2,0,2,0,0.5};
int dimGrid[3] = {
(boundsGrid[1] - boundsGrid[0]) / VOXEL_SIZE,
(boundsGrid[3] - boundsGrid[2]) / VOXEL_SIZE,
(boundsGrid[5] - boundsGrid[4]) / VOXEL_SIZE
};
if (dimGrid[0] == 0) dimGrid[0] = 1;
if (dimGrid[1] == 0) dimGrid[1] = 1;
if (dimGrid[2] == 0) dimGrid[2] = 1;
float h_minBoundsGrid[3];
h_minBoundsGrid[0] = boundsGrid[0];
h_minBoundsGrid[1] = boundsGrid[2];
h_minBoundsGrid[2] = boundsGrid[4];
int voxelCounts = dimGrid[0] * dimGrid[1] * dimGrid[2];
/*Prepare kernel output : build an array for storing voxles' density info*/
int *density = (int*)malloc(sizeof(int)*voxelCounts);
for (int i = 0; i < voxelCounts; i++){
density[i] = 0;
}
/*OpenCL essentials*/
device = create_device();
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, NULL);
//print_device_info(device);
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a context\n", err);
exit(1);
}
program = build_program(context, device, KERNEL_FILE);
queue = clCreateCommandQueue(context, device,
CL_QUEUE_PROFILING_ENABLE, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a command queue\n", err);
exit(1);
};
voxelization_kernel = clCreateKernel(program, "voxelizer", &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a kernel\n", err);
exit(1);
};
int numberOfFaces = 6;
global_size = numberOfFaces;
local_size = max_local_size;
if (global_size % local_size != 0){
processed_global_size = (global_size / local_size + 1) * local_size;
//int padding = processed_global_size - global_size;
//int *working_data = (int*)malloc((voxelCounts + padding)*sizeof(int));
//memcpy(working_data, density, voxelCounts);
//memset(working_data + voxelCounts, 0.0, padding);
}
else{
processed_global_size = global_size;
}
/* Create host-device data exchange interface*/
dimGrid_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float)* 3, dimGrid, &err);
h_minBoundsGrid_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float)* 3, h_minBoundsGrid, &err);
coords_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float) * 54, coords, &err);
density_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(int) * voxelCounts, density, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a buffer\n", err);
exit(1);
};
err = clSetKernelArg(voxelization_kernel, 0, sizeof(global_size), &global_size);
err |= clSetKernelArg(voxelization_kernel, 1, sizeof(h_voxelSize), &h_voxelSize);
err |= clSetKernelArg(voxelization_kernel, 2, sizeof(cl_mem), &h_minBoundsGrid_buffer);
err |= clSetKernelArg(voxelization_kernel, 3, sizeof(cl_mem), &dimGrid_buffer);
err |= clSetKernelArg(voxelization_kernel, 4, sizeof(cl_mem), &coords_buffer);
err |= clSetKernelArg(voxelization_kernel, 5, sizeof(cl_mem), &density_buffer);
if (err < 0) {
printf("Error code: %d. Couldn't create an argument for voxelization_kernel\n", err);
exit(1);
}
/* Do the voxelization magic */
err = clEnqueueNDRangeKernel(queue, voxelization_kernel, 1, NULL, &processed_global_size,
&local_size, 0, NULL, &prof_event);
if (err < 0) {
printf("Error code: %d. Couldn't enqueue the voxelization_kernel\n", err);
exit(1);
}
/* Read the results */
density_mapped_memory = clEnqueueMapBuffer(queue, density_buffer, CL_TRUE,
CL_MAP_READ, 0, sizeof(density), 0, NULL, NULL, &err);
if (err < 0) {
printf("Error code : %d. Couldn't map the buffer to host memory\n", err);
exit(1);
}
memcpy(density, density_mapped_memory, sizeof(density)* voxelCounts);
err = clEnqueueUnmapMemObject(queue, density_buffer, density_mapped_memory,
0, NULL, NULL);
if (err < 0) {
printf("Error code: %d. Couldn't unmap the density_buffer\n", err);
exit(1);
}
for (int i = 0; i < voxelCounts; i++){
printf("%d\n", density[i]);
}
/*Clean up*/
clReleaseKernel(voxelization_kernel);
clReleaseMemObject(dimGrid_buffer);
clReleaseMemObject(h_minBoundsGrid_buffer);
clReleaseMemObject(coords_buffer);
clReleaseMemObject(density_buffer);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}

OpenCL Error Computing Matrix Multiplication during Runtime

I have been debugging for the past few days and cannot get this OpenCL matrix multiplication kernel to run. Whenever I run the program, the output from the GPU results in large negative numbers similar to -198746573.0000. I was wondering if someone with HPC experience could point out an error in my code or if it is an error with the driver.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#define widthA 2
#define heightA 2
#define widthB heightA
#define heightB 2
#define widthC widthA
#define heightC heightB
#ifdef __APPLE__
#include < OpenCL/opencl.h >
#else
#include <opencl.h>
#endif
#define MEM_SIZE (128)
#define MAX_SOURCE_SIZE (0x100000)
int main()
{
float * A = (float *)malloc(sizeof(float)*widthA*heightA);
float * B = (float *)malloc(sizeof(float)*widthB*heightB);
float * C = (float *)malloc(sizeof(float)*widthC*heightC);
float * Res = (float *)malloc(sizeof(float)*widthC*heightC);
float * D= (float *)malloc(sizeof(float)*widthC*heightC);
float ref[widthC][heightC];
int i, j, k;
FILE * fp1 = fopen("matAdata.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightA; j++) {
float p=(rand()%100)/7.0;
//*(A+i*heightA+j)=rand()%100 + p;
*(A+i*heightA+j)=4.0;
fprintf(fp1, "%f ",*(A+i*heightA+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
fp1 = fopen("matBdata.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
for(i = 0;i < widthB; i++)
{
for(j=0; j < heightB; j++) {
float p=(rand()%100)/7.0;
//*((B+i*heightB+j))=rand()%100 + p;
*((B+i*heightB+j))=4.0;
fprintf(fp1, "%f ",*(B+i*heightA+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem memobjA = NULL;
cl_mem memobjB = NULL;
cl_mem memobjC = NULL;
cl_mem rowA = NULL;
cl_mem colC = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id[10];
cl_platform_id platform = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
cl_event GPUDone[0];
//char string[MEM_SIZE];
FILE *fp;
char fileName[] = "matrixMultiplication.cl";
char *source_str;
size_t source_size;
int row = widthA;
int col = heightC;
/* Load the source code containing the kernel*/
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
/* Get Platform and Device Info */
ret = clGetPlatformIDs(10, platform_id, &ret_num_platforms);
char cBuffer[1024];
cl_uint c;
for(c = 0; c < ret_num_platforms; c++)
{
clGetPlatformInfo(platform_id[c], CL_PLATFORM_NAME, 1024, &cBuffer, NULL);
if (strstr(cBuffer, "NVIDIA") != NULL)
{
platform = platform_id[c];
break;
}
}
printf("Found Platform %s\n", cBuffer);
ret = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
printf("Found %d devices.\n", ret_num_devices);
/* Create OpenCL context */
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
/* Create Command Queue */
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
/* Create Memory Buffer */
memobjA = clCreateBuffer(context, CL_MEM_READ_ONLY, widthA * heightA * sizeof(float), NULL, &ret);
memobjB = clCreateBuffer(context, CL_MEM_READ_ONLY, widthB * heightB * sizeof(float), NULL, &ret);
memobjC = clCreateBuffer(context, CL_MEM_READ_WRITE, widthC * heightC * sizeof(float), NULL, &ret);
rowA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret);
colC = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue,memobjA, CL_TRUE, 0,
widthA * heightA * sizeof(float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, memobjB, CL_TRUE, 0,
widthB * heightB * sizeof(float), B, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, rowA, CL_TRUE, 0, sizeof(int), &row, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, colC, CL_TRUE, 0, sizeof(int), &col, 0, NULL, NULL);
/* Create Kernel Program from the source */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
(const size_t *)&source_size, &ret);
/* Build Kernel Program */
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
/* Create OpenCL Kernel */
kernel = clCreateKernel(program, "matrixMultiplication", &ret);
/* Set OpenCL Kernel Arguments */
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjA);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjB);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjC);
ret = clSetKernelArg(kernel, 3, sizeof(int), (void *)&row);
ret = clSetKernelArg(kernel, 4, sizeof(int), (void *)&col);
/* Execute OpenCL Kernel */
//ret = clEnqueueTask(command_queue, kernel, 0, NULL,NULL);
size_t globalThreads[2] = {widthA, heightB};
size_t localThreads[2] = {16,16};
clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL);
//clFlush(command_queue);
//clFinish(command_queue);
/* Copy results from the memory buffer */
ret = clEnqueueReadBuffer(command_queue, memobjC, CL_TRUE, 0,
widthA * heightC * sizeof(float), Res, 0, NULL, &GPUDone[0]);
printf("Buffer Read ended with %d.\n", ret);
clWaitForEvents(1, GPUDone);
fp1 = fopen("matGPURes.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matAdata.\n");
exit(1);
}
printf("\nResult\n");
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightC; j++)
{
fprintf(fp1, "%f ",*(Res+i*heightC+j));
ref[i][j] = *(Res+i*heightC+j);
printf("GPU Output: %f\n", *(Res+i*heightC+j));
}
fprintf(fp1, "\n");
}
fclose(fp1);
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(memobjA);
ret = clReleaseMemObject(memobjB);
ret = clReleaseMemObject(memobjC);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
ret = clReleaseEvent(GPUDone[0]);
free(source_str);
float sum=0.0;
for(i = 0;i < widthA; i++)
{
for(j = 0; j < heightC; j++)
{
sum = 0;
for(k = 0; k < widthB; k++)
{
sum += A[i*col+k] * B[k*row+j];
printf("Multiplying A: %f, B: %f\n", A[i*col+k], B[k*row+j]);
}
D[i*heightC+j] = sum;
}
}
fp1 = fopen("matNormalMultiplicationRes.txt", "w");
if (!fp1) {
fprintf(stderr, "Failed to open matNormalMultiplicationRes.txt\n");
exit(1);
}
for(i = 0; i<widthA; i++)
{
for(j = 0; j<heightA; j++)
{
if (ref[i][j] != D[i*heightA+j])
{
printf("Calculation error[ CPU: %f, GPU: %f ]\n", D[i*heightA+j], ref[i][j]);
}
}
}
printf("\nResult\n");
for(i = 0;i < widthA; i++)
{
for(j=0;j < heightC; j++)
{
fprintf(fp1, "%f ",*(D+i*heightC+j));
}
fprintf(fp1, "\n");
}
free(A);
free(B);
free(C);
free(D);
free(Res);
return 0;
}
Here is the kernel
#define BLOCK_SIZE 16
__kernel
void matrixMultiplication(__global float* A, __global float* B, __global float* C, int wA, int wB )
{
//int i = get_global_id(0);
//int j = get_global_id(1);
float Csub = 0.0f;
int bx = get_group_id(0);
int by = get_group_id(1);
int tx = get_local_id(0);
int ty = get_local_id(1);
int aBegin = wA * BLOCK_SIZE * by;
int aEnd = aBegin + wA - 1;
int aStep = BLOCK_SIZE;
int bBegin = BLOCK_SIZE * bx;
int bStep = BLOCK_SIZE * wB;
for (int a = aBegin, b=bBegin;
a <= aEnd;
a += aStep, b+=bStep)
{
__local float As[BLOCK_SIZE][BLOCK_SIZE];
__local float Bs[BLOCK_SIZE][BLOCK_SIZE];
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
barrier(CLK_LOCAL_MEM_FENCE);
for( int k = 0; k < BLOCK_SIZE; ++k)
Csub += As[ty][k] * Bs[k][tx];
barrier(CLK_LOCAL_MEM_FENCE);
}
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
/*
float value=0;
for ( int k = 0; k < widthA; k++)
{
value = value + A[k + j * widthA] * B[k*widthB + i];
}
C[i + widthA * j] = value;
*/
}
I have double checked over and over again but simply cannot find any errors. I want to make sure its not a code error before I conclude its a driver issue.
Thanks!

Do you really need a complex kernel like that ? if you really want to do simple matrix multiplication
you can write a simple kernel like this, which is easy to debug.
__kernel void matrixMultiplication (__global float* A,
__global float* B,
__global float* C,
int widthA, int widthB )
{
//y direction
int row = get_global_id(1);
int col = get_global_id(0);
float cSum = 0.0f;
//calculate the result
for (int i=0; i<widthA; i++)
{
cSum += A[row*widthA+ i] * B[i*widthB+col];
}
C[row*widthB+col] = cSum;
}

Case is probably closed already, but for the sake of google-comers:
Shouldnt shared memory be explicitly declared on host and passed as kernel argument to the source? __local keyword is not the one you are looking for in this case.
See post on How to declare local memory in OpenCL? for the detailed explanation.

Check the functionality of your host. Here a few things to get you started ...
1) You don't need to create a buffer and enqueue it for a scalar constant Int like row and col. Just set it as a kernel arg.
2) Wait for the clEnqueueNDRangeKernel with an event. You want to be sure the calc has completed.
3) Add a printf statement in the kernel to print selected values to see that the input and output values are what you expect.
try
if ( get_local_id(0) % 8 == 0)
{
printf some useful value of a,b,c
}
3) Try the host code with a dumb kernel that copies an input array to an output array. That will confirm it you have the handling of buffer creation and the enqeue read/write code correct!

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Ascon-128 in arduino - arduino

Related

Trying to add functions on ESP32CAM CameraWebServer Example Code

OpenCL generate SHA-256 hash

How do I send/read data from device in OpenCL?

clEnqueueNDRangeKernel throws CL_OUT_OF_RESOURCES

OpenCL Error Computing Matrix Multiplication during Runtime

Categories

Resources