Could anyone point out why this code can cause dead-lock?
It is a single producer, multiple consumer problem. The producer have 8 buffers. Here it has 4 consumers. Each consumer will have two buffers. When a buffer is filled, it flags it to be ready to consume and switch to the second buffer. The consumer then can process this buffer. After it done, it return the buffer to the producer.
Buffer 0-1 for consumer 0
Buffer 2-3 for consumer 1
Buffer 4-5 for consumer 2
Buffer 6-7 for consumer 3
The program once a while reaches to a dead lock state.
The understanding is that, since the flag can be only in one state, either 0 or 1, so at least either consumer or producer can proceed. It one proceed, it eventually will unlock the dead lock.
#include <iostream>
#include <thread>
#include <mutex>
using namespace std;
const int BUFFERSIZE = 100;
const int row_size = 10000;
class sharedBuffer
{
public:
int B[8][BUFFERSIZE];
volatile int B_STATUS[8];
volatile int B_SIZE[8];
sharedBuffer()
{
for (int i=0;i<8;i++)
{
B_STATUS[i] = 0;
B_SIZE[i] = 0;
for (int j=0;j<BUFFERSIZE;j++)
{
B[i][j] = 0;
}
}
}
};
class producer
{
public:
sharedBuffer * buffer;
int data[row_size];
producer(sharedBuffer * b)
{
this->buffer = b;
for (int i=0;i<row_size;i++)
{
data[i] = i+1;
}
}
void produce()
{
int consumer_id;
for(int i=0;i<row_size;i++)
{
consumer_id = data[i] % 4;
while(true)
{
if (buffer->B_STATUS[2*consumer_id] ==1 && buffer->B_STATUS[2*consumer_id + 1] == 1)
continue;
if (buffer->B_STATUS[2*consumer_id] ==0 )
{
buffer->B[2*consumer_id][buffer->B_SIZE[2*consumer_id]++] = data[i];
if(buffer->B_SIZE[2*consumer_id] == BUFFERSIZE || i==row_size -1)
{
buffer->B_STATUS[2*consumer_id] =1;
}
break;
}
else if (buffer->B_STATUS[2*consumer_id+1] ==0 )
{
buffer->B[2*consumer_id+1][buffer->B_SIZE[2*consumer_id+1]++] = data[i];
if(buffer->B_SIZE[2*consumer_id+1] == BUFFERSIZE || i==row_size -1)
{
buffer->B_STATUS[2*consumer_id+1] =1;
}
break;
}
}
}
//some buffer is not full, still need set the flag to 1
for (int i=0;i<8;i++)
{
if (buffer->B_STATUS[i] ==0 && buffer->B_SIZE[i] >0 )
buffer->B_STATUS[i] = 1;
}
cout<<"Done produce, wait the data to be consumed\n";
while(true)
{
if (buffer->B_STATUS[0] == 0 && buffer->B_SIZE[0] == 0
&& buffer->B_STATUS[1] == 0 && buffer->B_SIZE[1] == 0
&& buffer->B_STATUS[2] == 0 && buffer->B_SIZE[2] == 0
&& buffer->B_STATUS[3] == 0 && buffer->B_SIZE[3] == 0
&& buffer->B_STATUS[4] == 0 && buffer->B_SIZE[4] == 0
&& buffer->B_STATUS[5] == 0 && buffer->B_SIZE[5] == 0
&& buffer->B_STATUS[6] == 0 && buffer->B_SIZE[6] == 0
&& buffer->B_STATUS[7] == 0 && buffer->B_SIZE[7] == 0 )
{
for (int i=0;i<8;i++)
buffer->B_STATUS[i] = 2;
break;
}
}
};
};
class consumer
{
public:
sharedBuffer * buffer;
int sum;
int index;
consumer(int id, sharedBuffer * buf){this->index = id;this->sum = 0;this->buffer = buf;};
void consume()
{
while(true)
{
if (buffer->B_STATUS[2*index] ==0 && buffer->B_STATUS[2*index+1] ==0 )
continue;
if (buffer->B_STATUS[2*index] ==2 && buffer->B_STATUS[2*index+1] ==2 )
break;
if (buffer->B_STATUS[2*index] == 1)
{
for (int i=0;i<buffer->B_SIZE[2*index];i++)
{
sum+=buffer->B[2*index][i];
}
buffer->B_STATUS[2*index]=0;
buffer->B_SIZE[2*index] =0;
}
if (buffer->B_STATUS[2*index+1] == 1)
{
for (int i=0;i<buffer->B_SIZE[2*index+1];i++)
{
sum+=buffer->B[2*index+1][i];
}
buffer->B_STATUS[2*index+1]=0;
buffer->B_SIZE[2*index+1] =0;
}
}
printf("Sum of consumer %d = %d \n",index,sum);
};
};
int main()
{
sharedBuffer b;
producer p(&b);
consumer c1(0,&b),c2(1,&b),c3(2,&b),c4(3,&b);
thread p_t(&producer::produce,p);
thread c1_t(&consumer::consume,c1);
thread c2_t(&consumer::consume,c2);
thread c3_t(&consumer::consume,c3);
thread c4_t(&consumer::consume,c4);
p_t.join();c1_t.join();c2_t.join();c3_t.join();c4_t.join();
}
This is flawed in many ways. The compiler can reorder your instructions, and different CPU cores may not see memory operations in the same order.
Basically your producer does this:
it writes data to the buffer
it sets the flag
Your consumer does this:
it reads the flag
if the flag is what it wants it reads data
it resets the flag
This does not work, for several reasons.
The compiler can reorder your instructions (both on the consumer and producer side) to do things in a different order. For example, on the producer side, it could store all your computations in registers, and then write the status flag to memory first, and the data later. The consumer would then get stale data.
Even in absence of that, there is no guarantee that different writes to memory are seen in the same order by different CPU cores (e.g. if they have separate caches, and your flag and data are on different cache lines).
This can cause all sorts of trouble - data corruption, deadlocks, segfaults, depending on what exactly your code does. I haven't analyzed your code sufficiently to tell you exactly why this causes a deadlock, but I'm not surprised at all.
Note that the 'volatile' keyword is completely useless for this type of synchronization. 'volatile' is only useful for signal handling (unix signals), not for multithreaded code.
The correct way to do this is to use proper synchronization (for example mutexes) or atomic operations (e.g. std::atomic). They have various different guarantees that make sure that the issues above don't happen.
Mutexes are generally easier to use if speed is not of the highest importance. Atomic operations can get you a little more control but they are very tricky to use.
I would recommend that you do this with mutexes, then profile the program, and then only go to atomic operations if it's insufficiently fast.
valgrind is a great tool which is useful to debug multithreaded programs (it'll point out unsynchronized memory access and the like).
thanks for the helpful comments.
I thought if make sure all the flags/status value are read from memory, not from registers/cache, the deadlock should not happen no matter how compiler reorganize the instructions. The volatile keyword should enforce this. Looks like my understanding is wrong.
Another baffling thing is that, I thought the value of status variable should only be one of (0,1,2), but once a while, I saw the value like 5384. Somehow the data got corrupted.
Related
Today when I am solving Fibonacci arrays, I meet with a very strange thing. Recursion only takes 16ms, but iteration takes 80ms. I have tried to optimize my iteration (such as I use a vector container to fulfill my stack) but iteration is still much slower than recursion. It doesn't make sense because recursion still builds a stack at OS level, which is more time-consuming than iteration.
Here is my iteration code:
class Solution {
public:
int fib(int n) {
std::stack<int, std::vector<int>> st;
st.push(n);
int result = 0;
int temp = 0;
while(!st.empty()) {
temp = st.top(); st.pop();
if(temp == 1) result++;
else if(temp == 0) continue;
else {
st.push(temp - 1);
st.push(temp - 2);
}
}
return result;
}
};
Here is my recursion code
class Solution {
public:
int fib(int n) {
if(n == 0) return 0;
if(n == 1) return 1;
else return fib(n - 1) + fib(n - 2);
}
};
Well, I have searched for the reason. According to Is recursion ever faster than looping?, recursion is more time-consuming than iteration in an imperative language. But C++ is one of the imperative languages, it is not convincing.
I think I find the reason. You can help me check if there is any incorrect in my analysis?
The reason why recursion is faster than iteration is that if you use an STL container as a stack, it would be allocated in heap space.
When the PC pointer wants to access the stack, cache missing might happen, which is greatly expensive as for a small scale problem.
However, as for the Fibonacci solution, the code length is not very long. So the PC pointer can easily jump to the function's beginning. If you use a static int array, the result is satisfying.
Here is the code:
class Solution {
public:
int fib(int n) {
int arr[1000];
arr[0] = n;
int s = 1;
int result = 0;
int temp;
while (s) {
temp = arr[s-1];
s--;
switch (temp) {
case 1:
result++;
break;
case 0:
continue;
break;
default:
arr[s++] = temp - 1;
arr[s++] = temp - 2;
}
}
return result;
}
};
I'm struggling with an issue where an ESP32 is running as a AP with AsyncTCP connecting multiple ESP32 clients. The AP receives some JSON data and replies with some JSON data. Without the handleData() function, the code runs 100% fine with no issues. Heap is static when no clients connect and issues only occur when clients start connecting.
Can anyone see anything with my code that could be causing heap corruption or other memory weirdness?
static void handleData(void* arg, AsyncClient* client, void *data, size_t len) {
int i = 0, j = 0;
char clientData[CLIENT_DATA_MAX];
char packetData[len];
char *packetBuf;
packetBuf = (char *)data;
clientData[0] = '\0';
for (i=0;i <= len;i++) {
packetData[j] = packetBuf[i]; //packetBuf[i];
if ((packetData[j] == '\n') || (i == len)) {
packetData[j] = '\0';
if ((j > 0) && (packetData[0] != '\n') && (packetData[0] != '\r')) {
// See sensorData() below...
parseData.function(packetData, clientData);
if (clientData != NULL) {
// TCP reply to client
if (client->space() > 32 && client->canSend()) {
client->write(clientData);
}
}
}
j = 0;
} else
j++;
}
}
void sensorData(void *data, void *retData) {
StaticJsonDocument<CLIENT_DATA_MAX> fields;
StaticJsonDocument<CLIENT_DATA_MAX> output;
char sensor[15] = "\0";
char MAC[18] = "\0";
char value[20] = "\0";
bool sendOK = false;
memcpy((char *)retData, "\0", 1);
DeserializationError error = deserializeJson(fields, (char *)data, CLIENT_DATA_MAX);
if (error) {
DEBUG_PRINTLN(F("deserializeJson() failed"));
return;
}
if (fields["type"])
strcpy(sensor, fields["type"]);
switch (sensor[0]) {
case 'C':
if (fields["value"])
strcpy(value, fields["value"]);
sendOK = true;
break;
case 'T': //DEBUG_PRINT(F("Temp "));
setExtTempSensor(fields["value"]);
sendOK = true;
break;
case 'N':
output["IT"] = intTempC; //Internal temp
output["B1"] = battLevels[0];
serializeJson(output, (char *)retData, CLIENT_DATA_MAX-1);
break;
}
if (sendOK) {
output["Resp"] = "Ok";
serializeJson(output, (char *)retData, CLIENT_DATA_MAX-1);
}
strcat((char *)retData, "\n");
}
static void handleNewClient(void* arg, AsyncClient* client) {
client->setRxTimeout(1000);
client->setAckTimeout(500);
client->onData(&handleData, NULL);
client->onError(&handleError, NULL);
client->onDisconnect(&handleDisconnect, NULL);
client->onTimeout(&handleTimeOut, NULL);
}
void startServer() {
server = new AsyncServer(WIFI_SERVER_PORT);
server->onClient(&handleNewClient, &server)
}
Using AsyncTCP on the ESP32 was having multiple issues. Heap issues, socket issues, assert issues, ACK timeouts, connection timeouts, etc. Swapping to AsyncUDP using the exact same code as shown above with romkey's changes, resolved all of my issues. (Just using romkey's fixes did not fix the errors I was having with AsyncTCP.) I don't believe the issue is with AsyncTCP but with ESP32 libraries.
Either you should declare packetData to be of length len + 1 or your for loop should iterate until i < len. Because the index starts at 0, packetData[len] is actually byte len + 1, so you'll overwrite something random when you store something in packetData[len] if the array is only len chars long.That something random may be the pointer stored in packetBuf, which could easily cause heap corruption.
You should always use strncpy() and never strcpy(). Likewise use strncat() rather than strcat(). Don't depend on having done the math correctly or on sizes not changing as your code evolves. strncpy() and strncat() will guard against overflows. You'll need to pass a length into sensorData() to do that, but sensorData() shouldn't be making assumptions about the available length of retData.
Your test
if (clientData != NULL) {
will never fail because clientData is the address of array and cannot change. I'm not sure what you're trying to test for here but this if will always succeed.
You can just write:
char sensor[15] = "";
you don't need to explicitly assign a string with a null byte in it.
And
memcpy((char *)retData, "\0", 1);
is equivalent to
((char *)retData)[0] = '\0';
What's the point of declaring retData to be void * in the arguments to sensorData()? Your code starts out with it being a char* before calling sensorData() and uses it as a char* inside sensorData(). void * is meant to be an escape hatch for passing around pointers without worrying about their type. You don't need that here and end up needing to extra casts back to char* because of it. Just declare the argument to be char* and don't worry about casting it again.
You didn't share the code that calls handleData() so there may well be issues outside of these functions.
I've started learning C for Arduino for about 2 weeks. I have the following code and I don't understand how data is retrieved from function ReadLine. Also I don't understand how variable BufferCount affects the program and why it is used. I do know that it holds the number of digits the year have but that's about all I know about this variable.
From what I've learned so far a function is composed of:
function type specifier
function name
function arguments.
What I see in this program makes me think that the function can also return values using the argument part. I always thought that a function can only return a value that is the same type (int, boolean ...) as the type specifier.
void setup() {
Serial.begin(9600);
}
void loop() {
if (Serial.avaible() > 0) {
int bufferCount;
int year;
char myData[20];
bufferCount = ReadLine (myData);
year = atoi(myData); //convert string to int
Serial.print("Year: ");
Serial.print(year);
if (IsLeapYear(year)) {
Serial.print(" is ");
} else {
Serial.print(" is not ");
}
Serial.println("a leap year");
}
}
int IsLeapYear(int yr) {
if (yr % 4 == 0 && yr % 100 != 0 || yr % 400 == 0) {
return 1; //it's a leap year
} else {
return 0;
}
}
int ReadLine (char str[]) {
char c;
int index = 0;
while (true) {
if (Serial.available() > 0) {
c = Serial.read();
if (c != '\n') {
str[index++] = c;
} else {
str[index] = '\0'; //null termination character
break;
}
}
}
return index;
}
The fundamental concept you are missing is pointers. In the case of a function like isLeapYear there, you'd be right about that parameter. It is just a copy of the data from whatever variable was passed in when the function gets called.
But with ReadLine things are different. ReadLine is getting a pointer to a char array. A pointer is a special kind of variable that holds the memory address of another variable. And it is true that in this case you are getting a local copy of the pointer, but it still points to the same location in memory. And during the function, data is copied not into the variable str, but to the memory location it points to. Since that is a memory location that belongs to a variable in the scope of the calling function, that actual variable's value will be changed. You've written over it in memory.
Backgroup: rank 0 send message to rank 1, after rank 1 completes its work it returns messages to rank 0
actually I run a thread for sending message and the other one for receiving in rank 0 like this:
int tag = 1;
void* thread_send(void* argc)
{
...;
while(1)
{
if(tag == 1)
{
MPI_Send(...,1,TAG_SEND,...);//send something to slave
tag = 0;
}
}
...
}
void* thread_receive(void* argc)
{
while(1)
{
MPI_Recv(...,0,TAG_RECV,...); //ready for receiving from slave
tag = 1;
}
}
in rank 1 I run a thread like this:
void* slave(void* argc)
{
...;
while(1)
{
MPI_Probe(0,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
switch(status.MPI_TAG){
case TAG_SEND:
MPI_Recv(..,0,TAG_SEND,..);
break;
}
MPI_Send(...,0,MPI_RECV,...); //notify rank 0 slave has done his work
}
}
then I got an error like this:
[comp01-mpi.gpu01.cis.k.hosei.ac.jp][[54135,1],0]
[btl_tcp_endpoint.c:486:mca_btl_tcp_endpoint_recv_connect_ack]
received unexpected process identifier [[16641,0],301989888]
In fact there are several interfaces for one machine, I know it might to be a problem, so I assign the parameter
--mca btl_tcp_if_include eth0 --mca oob_tcp_if_include eth0
to avoid network traffic.
Have I done something wrong? I will appreciate any suggestion you give me, thanks.
Thanks to #HristoIliev, I checked the Open MPI like this:
MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provide_level);
if(provide_level < MPI_THREAD_MULTIPLE){
printf("Error: the MPI library doesn't provide the required thread level\n");
MPI_Abort(MPI_COMM_WORLD,0);
}
and I got the error:
Error: the MPI library doesn't provide the required thread level
that means I CAN NOT use multiple threads, so what else can I do?
Now I am using the non-blocing sends(Isend) and receives(Irecv), the code is like this:
send thread:
int tag = 1;
void* thread_send(void* argc)
{
...;
while(1)
{
while(1)
{
MPI_Irecv(&tag,MPI_INT,1,MSG_TAG,MPI_COMM_WORLD,&request);
if(tag == 1) break;
printf("tag is %d\n",tag);
MPI_Wait(&request,&status);
}
MPI_Send(...,1,MSG_SEND,...);//send something to slave
tag = 0;
}
...
}
receive thread:
void* slave(void* argc)
{
...;
while(1)
{
MPI_Probe(0,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
switch(status.MPI_TAG){
case TAG_SEND:
MPI_Recv(..,0,MSG_Send,..);
break;
}
int tag = 1;
MPI_Isend(&tag,1,MPI_INT,0,MSG_TAG,MPI_COMM_WORLD,&request); //notify rank 0 slave has done his work
MPI_Wait(&request,&status);
printf("slave is idle now \n");
}
}
and it printed like this:
tag is 0
slave is idle now
and hang here
I have solved the problem by changing the Irecv() funciton's location, like following:
send thread:
int tag = 1;
void* thread_send(void* argc)
{
...;
while(1)
{
while(1)
{
if(tag == 1) break;
printf("tag is %d\n",tag);
MPI_Irecv(&tag,MPI_INT,1,MSG_TAG,MPI_COMM_WORLD,&request);
MPI_Wait(&request,&status);
}
MPI_Send(...,1,MSG_SEND,...);//send something to slave
tag = 0;
}
...
}.
In conclusion, to send and receive messages at the same time, you can use multiple thread if your MPI supports multiple-thread mode, you can check it when you init your MPI program like this:
MPI_Init_thread(&argc,&argv,MPI_THREAD_MULTIPLE,&provide_level);
if(provide_level < MPI_THREAD_MULTIPLE){
printf("Error: the MPI library doesn't provide the required thread level\n");
MPI_Abort(MPI_COMM_WORLD,0);
}
Or if your MPI doesn't support multiple thread mode, you may use non-blocking communication.
I am trying to create a simple program to test semaphores. I am forking the process and tormenting the value of variable c in the critical section of each process, but the value of c I get is still 1 not 2. Even with the mmap() uncommented. Can anyone please explain to me what I am doing wrong? Any help would be appreciated. I am a total newbie in this. Thank you very much for your time.
int main()
{
int c = 0;
sem_t mutex;
sem_t mutex1;
// sem_t *mutex = (sem_t*)mmap(NULL, sizeof(sem_t*), PROT_READ|PROT_WRITE,MAP_SHARED|MAP_ANONYMOUS,-1, 0);
sem_init(&mutex, 0, 1);
sem_init(&mutex1, 0, 1);
pid_t i;
int id = fork();
if(id == -1) {}
else if(id == 0)
{
sem_wait (&mutex);
c++;
sem_post (&mutex);
}
else
{
sem_wait (&mutex);
c++;
sem_post (&mutex);
}
cout<<c<<endl;
//system("pause");
return 0;
}
I tried it another way by making the pshared argument 1, but it still does not work.
I have also tried it sem_op but it still does not work.
int main()
{
int c = 0;
int sid =semget(1105,2, 0666 | IPC_CREAT);
pid_t i;
int id = fork();
if(id == -1)
{
}
else if(id == 0)
{
struct sembuf sb;
sb.sem_num = 0;
sb.sem_op = -1;
sb.sem_flg = 0;
if((semop(sid, &sb, 1)) == -1)
cout<<"error"<<endl;
c++;
sb.sem_num = 0;
sb.sem_op = -1;
sb.sem_flg = 0;
if((semop(sid, &sb, 1)) == -1)
cout<<"error"<<endl;
}
else if(id == 1)
{
struct sembuf sb;
if((semop(sid, &sb, 1)) == -1)
cout<<"error"<<endl;
c++;
sb.sem_num = 0;
sb.sem_op = -1;
sb.sem_flg = 0;
if((semop(sid, &sb, 1)) == -1)
cout<<"error"<<endl;
}
cout<<c<<endl;
return 0;
}
If you use fork() you have to share the semaphores between the forked processes. See sem_init() manual for more details.
Alternatively you can use a named semaphore, see sem_open() for details, and
also a good article on the subject.
Your primary misstep is that the variable c is not itself shared — each process operates on its own copy of the variable. You want something like this:
int *c;
c = mmap(NULL, sizeof(*c), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
*c = 0;
// ... later ...
++*c;
Additionally, with respect to your sem_init() example, you should:
Allocate shared memory of the correct size: sizeof(sem_t) and not sizeof(sem_t*)
Set the pshared flag during sem_init()
You likely don't need conditional logic differentiating parent from child after the fork(). After all, you want them to do the same thing.
(Separately, please do not name a POSIX semaphore "mutex." That name will mislead hurried, POSIXly-minded folk who will think you are referring to a different kind of synchronization primitive.)
With respect to your semget() example, you appear to be waiting on the semaphore twice (sb.sem_op = -1) in the child process. The post-fork() check for the parent is incorrect — you check if the returned PID is 1 (which it will never be on a typical UNIX system) rather than if the returned PID is > 0. (Again, however, you likely don't need to have parent and child do different things here.)