I'm trying to parse a hex number byte by byte, and concatenate to a string the representation of each byte, in the order they're stored in memory. (for a little test on endianness, but that's not important I guess).
Here is the code (please ignore the glaring unit-test issues with it :D; also, some of the code might look weird since initially the display_bytes method took in a char* not an int8_t*, but I thought using an int8_t might make it more obvious to me, what the issue is)
TEST_CLASS(My001littlebigendian)
{
public:
TEST_METHOD(TestMethod1)
{
int i = 0x12345678;
display_bytes((int8_t*)&i, sizeof(i));
}
void display_bytes(int8_t* b, int length)
{
std::stringstream ss;
for (int i = 0; i < length; ++i)
{
int8_t signedCharRepresentation = *(b + i); //signed char has 1 byte
int8_t signed8ByteInt = (int8_t)signedCharRepresentation; //this is not ok
int32_t signed32ByteInt = (int32_t)signedCharRepresentation; //this is ok. why?
//ss << std::hex << signed8ByteInt; //this is not ok. why?
ss << std::hex << signed32ByteInt; //this is ok
}
std::string stringRepresentation = ss.str();
if (stringRepresentation.compare("78563412") == 0)
{
Assert::IsTrue(true, L"machine is little-endian");
}
else if(stringRepresentation.compare("01234567") == 0)
{
Assert::IsTrue(true, L"machine is big-endian");
}
else
{
Assert::IsTrue(true, L"machine is other-endian");
}
}
};
Now, what I don't understand (as hopefull the comments make clear) is why does this only work when I cast each byte to a 4 byte int, and not an 1 byte int. Since I am working with chunks of 1 byte. Intuitively it would make me think doing it like this should cause some sort of overflow? But it seems not.
I've not dug deeper into why this is the issue yet, since I was hoping to not need to. And maybe if someone with more knowledge in this area can give me nudge in the right direction, or maybe even an outright answer if I'm missing something very obvious. (which I do feel I might be, since I'm not used to working at this low level).
Related
I have a question that I found many threads in, but none did explicitly answer my question.
I am trying to have a multidimensional array inside the kernel of the GPU using thrust. Flattening would be difficult, as all the dimensions are non-homogeneous and I go up to 4D. Now I know I cannot have device_vectors of device_vectors, for whichever underlying reason (explanation would be welcome), so I tried going the way over raw-pointers.
My reasoning is, a raw pointer points onto memory on the GPU, why else would I be able to access it from within the kernel. So I should technically be able to have a device_vector, which holds raw pointers, all pointers that should be accessible from within the GPU. This way I constructed the following code:
thrust::device_vector<Vector3r*> d_fluidmodelParticlePositions(nModels);
thrust::device_vector<unsigned int***> d_allFluidNeighborParticles(nModels);
thrust::device_vector<unsigned int**> d_nFluidNeighborsCrossFluids(nModels);
for(unsigned int fluidModelIndex = 0; fluidModelIndex < nModels; fluidModelIndex++)
{
FluidModel *model = sim->getFluidModelFromPointSet(fluidModelIndex);
const unsigned int numParticles = model->numActiveParticles();
thrust::device_vector<Vector3r> d_neighborPositions(model->getPositions().begin(), model->getPositions().end());
d_fluidmodelParticlePositions[fluidModelIndex] = CudaHelper::GetPointer(d_neighborPositions);
thrust::device_vector<unsigned int**> d_fluidNeighborIndexes(nModels);
thrust::device_vector<unsigned int*> d_nNeighborsFluid(nModels);
for(unsigned int pid = 0; pid < nModels; pid++)
{
FluidModel *fm_neighbor = sim->getFluidModelFromPointSet(pid);
thrust::device_vector<unsigned int> d_nNeighbors(numParticles);
thrust::device_vector<unsigned int*> d_neighborIndexesArray(numParticles);
for(unsigned int i = 0; i < numParticles; i++)
{
const unsigned int nNeighbors = sim->numberOfNeighbors(fluidModelIndex, pid, i);
d_nNeighbors[i] = nNeighbors;
thrust::device_vector<unsigned int> d_neighborIndexes(nNeighbors);
for(unsigned int j = 0; j < nNeighbors; j++)
{
d_neighborIndexes[j] = sim->getNeighbor(fluidModelIndex, pid, i, j);
}
d_neighborIndexesArray[i] = CudaHelper::GetPointer(d_neighborIndexes);
}
d_fluidNeighborIndexes[pid] = CudaHelper::GetPointer(d_neighborIndexesArray);
d_nNeighborsFluid[pid] = CudaHelper::GetPointer(d_nNeighbors);
}
d_allFluidNeighborParticles[fluidModelIndex] = CudaHelper::GetPointer(d_fluidNeighborIndexes);
d_nFluidNeighborsCrossFluids[fluidModelIndex] = CudaHelper::GetPointer(d_nNeighborsFluid);
}
Now the compiler won't complain, but accessing for example d_nFluidNeighborsCrossFluids from within the kernel will work, but return wrong values. I access it like this (again, from within a kernel):
d_nFluidNeighborsCrossFluids[iterator1][iterator2][iterator3];
// Note: out of bounds indexing guaranteed to not happen, indexing is definitely right
The question is, why does it return wrong values? The logic behind it should work in my opinion, since my indexing is correct and the pointers should be valid addresses from within the kernel.
Thank you already for your time and have a great day.
EDIT:
Here is a minimal reproducable example. For some reason the values appear right despite of having the same structure as my code, but cuda-memcheck reveals some errors. Uncommenting the two commented lines leads me to my main problem I am trying to solve. What does the cuda-memcheck here tell me?
/* Part of this example has been taken from code of Robert Crovella
in a comment below */
#include <thrust/device_vector.h>
#include <stdio.h>
template<typename T>
static T* GetPointer(thrust::device_vector<T> &vector)
{
return thrust::raw_pointer_cast(vector.data());
}
__global__
void k(unsigned int ***nFluidNeighborsCrossFluids, unsigned int ****allFluidNeighborParticles){
const unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i > 49)
return;
printf("i: %d nNeighbors: %d\n", i, nFluidNeighborsCrossFluids[0][0][i]);
//for(int j = 0; j < nFluidNeighborsCrossFluids[0][0][i]; j++)
// printf("i: %d j: %d neighbors: %d\n", i, j, allFluidNeighborParticles[0][0][i][j]);
}
int main(){
const unsigned int nModels = 2;
const int numParticles = 50;
thrust::device_vector<unsigned int**> d_nFluidNeighborsCrossFluids(nModels);
thrust::device_vector<unsigned int***> d_allFluidNeighborParticles(nModels);
for(unsigned int fluidModelIndex = 0; fluidModelIndex < nModels; fluidModelIndex++)
{
thrust::device_vector<unsigned int*> d_nNeighborsFluid(nModels);
thrust::device_vector<unsigned int**> d_fluidNeighborIndexes(nModels);
for(unsigned int pid = 0; pid < nModels; pid++)
{
thrust::device_vector<unsigned int> d_nNeighbors(numParticles);
thrust::device_vector<unsigned int*> d_neighborIndexesArray(numParticles);
for(unsigned int i = 0; i < numParticles; i++)
{
const unsigned int nNeighbors = i;
d_nNeighbors[i] = nNeighbors;
thrust::device_vector<unsigned int> d_neighborIndexes(nNeighbors);
for(unsigned int j = 0; j < nNeighbors; j++)
{
d_neighborIndexes[j] = i + j;
}
d_neighborIndexesArray[i] = GetPointer(d_neighborIndexes);
}
d_nNeighborsFluid[pid] = GetPointer(d_nNeighbors);
d_fluidNeighborIndexes[pid] = GetPointer(d_neighborIndexesArray);
}
d_nFluidNeighborsCrossFluids[fluidModelIndex] = GetPointer(d_nNeighborsFluid);
d_allFluidNeighborParticles[fluidModelIndex] = GetPointer(d_fluidNeighborIndexes);
}
k<<<256, 256>>>(GetPointer(d_nFluidNeighborsCrossFluids), GetPointer(d_allFluidNeighborParticles));
if (cudaGetLastError() != cudaSuccess)
printf("Sync kernel error: %s\n", cudaGetErrorString(cudaGetLastError()));
cudaDeviceSynchronize();
}
A device_vector is a class definition. That class has various methods and operators associated with it. The thing that allows you to do this:
d_nFluidNeighborsCrossFluids[...]...;
is a square-bracket operator. That operator is a host operator (only). It is not usable in device code. Issues like this give rise to the general statements that "thrust::device_vector is not usable in device code." The device_vector object itself is generally not usable. However the data it contains is usable in device code, if you attempt to access it via a raw pointer.
Here is an example of a thrust device vector that contains an array of pointers to the data contained in other device vectors. That data is usable in device code, as long as you don't attempt to make use of the thrust::device_vector object itself:
$ cat t1509.cu
#include <thrust/device_vector.h>
#include <stdio.h>
template <typename T>
__global__ void k(T **data){
printf("the first element of vector 1 is: %d\n", (int)(data[0][0]));
printf("the first element of vector 2 is: %d\n", (int)(data[1][0]));
printf("the first element of vector 3 is: %d\n", (int)(data[2][0]));
}
int main(){
thrust::device_vector<int> vector_1(1,1);
thrust::device_vector<int> vector_2(1,2);
thrust::device_vector<int> vector_3(1,3);
thrust::device_vector<int *> pointer_vector(3);
pointer_vector[0] = thrust::raw_pointer_cast(vector_1.data());
pointer_vector[1] = thrust::raw_pointer_cast(vector_2.data());
pointer_vector[2] = thrust::raw_pointer_cast(vector_3.data());
k<<<1,1>>>(thrust::raw_pointer_cast(pointer_vector.data()));
cudaDeviceSynchronize();
}
$ nvcc -o t1509 t1509.cu
$ cuda-memcheck ./t1509
========= CUDA-MEMCHECK
the first element of vector 1 is: 1
the first element of vector 2 is: 2
the first element of vector 3 is: 3
========= ERROR SUMMARY: 0 errors
$
EDIT: In the mcve you have now posted, you point out that an ordinary run of the code appears to give correct results, but when you use cuda-memcheck, errors are reported. You have a general design problem that will cause this.
In C++, when an object is defined within a curly-braces region:
{
{
Object A;
// object A is in-scope here
}
// object A is out-of-scope here
}
// object A is out of scope here
k<<<...>>>(anything that points to something in object A); // is illegal
and you exit that region, the object defined within the region is now out of scope. For objects with constructors/destructors, this usually means the destructor of the object will be called when it goes out-of-scope. For a thrust::device_vector (or std::vector) this will deallocate any underlying storage associated with that vector. That does not necessarily "erase" any data, but attempts to use that data are illegal and would be considered UB (undefined behavior) in C++.
When you establish pointers to such data inside an in-scope region, and then go out-of-scope, those pointers no longer point to anything that would be legal to access, so attempts to dereference the pointer would be illegal/UB. Your code is doing this. Yes, it does appear to give the correct answer, because nothing is actually erased on deallocation, but the code design is illegal, and cuda-memcheck will highlight that.
I suppose one fix would be to pull all this stuff out of the inner curly-braces, and put it at main scope, just like the d_nFluidNeighborsCrossFluids device_vector is. But you might also want to rethink your general data organization strategy and flatten your data.
You should really provide a minimal, complete, verifiable/reproducible example; yours is neither minimal, nor complete, nor verifiable.
I will, however, answer your side-question:
I know I cannot have device_vectors of device_vectors, for whichever underlying reason (explanation would be welcome)
While a device_vector regards a bunch of data on the GPU, it's a host-side data structure - otherwise you would not have been able to use it in host-side code. On the host side, what it holds should be something like: The capacity, the size in elements, the device-side pointer to the actual data, and maybe more information. This is similar to how an std::vector variable may refer to data that's on the heap, but if you create the variable locally the fields I mentioned above will exist on the stack.
Now, those fields of the device vector that are located in host memory are not generally accessible from the device-side. In device-side code you would typically use the raw pointer to the device-side data the device_vector manages.
Also, note that if you have a thrust::device_vector<T> v, each use of operator[] means a bunch of separate CUDA calls to copy data to or from the device (unless there's some caching going on under the hoold). So you really want to avoid using square-brackets with this structure.
Finally, remember that pointer-chasing can be a performance killer, especially on a GPU. You might want to consider massaging your data structure somewhat in order to make it amenable to flattening.
I'm trying to concatenate the current DateTime to my devices Mac Address in the following format: aa:bb:cc:dd:ee:ffYYmmDDhhMMss so I can hash it and send it to a web service every time I collect new data (so I'll have to hash it in every loop)
I managed to concatenate the two values (mac address + datetime) and converted it to char array
addressDateTime.toCharArray(thisThing, 28);
However, I'm kind of lost as to how to continue.
I've also tried to read the resulting char* with this cycle but I'm not understanding why it doesn't work:
void loop() {
while (!timeClient.update()) {
timeClient.forceUpdate();
}
String addressDateTime = getPayload(); //this gets the *aa:bb:cc:dd:ee:ffYYmmDDhhMMss* string
char* hashThis;
addressDateTime.toCharArray(hashThis, 28);
for (int i = 0; i < sizeof(hashThis); i++) {
char str[3];
sprintf(str, "%02x", hashThis[i]);
Serial.print(str);
}
delay(5000);
}
Am I converting the String to char* correctly?
How should I go about Hashing the char*?
Or can I Hash the String without converting it to to char*?
Update:
My code's looking like this atm
while (!timeClient.update()) {
timeClient.forceUpdate();
}
String addressDateTime = getPayload();
char hashThis[30];
addressDateTime.toCharArray(hashThis, 30);
for (int i = 0; i < sizeof(hashThis); i++) {
Serial.printf("%02x", hashThis[i]);
}
delay(5000);
}
So I managed to convert the String to Char* Except that the output is looking like this 33433a37313a42463a31443a34323a463431393035303531343038323700 instead of (for example) aa:bb:cc:dd:ee:ff190505141037
After figuring out why my char* array outputs like that I still have to hash it.
Thanks for helping me get this far, I still have ways to go
You're not allocating space to store the C string that you're getting from addressDateTime.
hashThis is a char* which is a pointer to a character. It hasn't been set to anything so it's just... random. Which will almost certainly make your program crash or at least misbehave badly.
Given your code, the quickest fix is to change
char* hashThis;
to
char hasThis[30];
addressDateTime.toCharArray(hashThis, 30);
I changed 28 to 30 because aa:bb:cc:dd:ee:ffYYmmDDhhMMss is actually 29 characters long and also requires an extra byte for the C string null terminator character. I'm not 100% sure if the toCharArray() method sets the null terminator; if it doesn't, you'd need to add
hasThis[29] = '\0';
You can avoid that by just using the String c_str() method, which returns a char* to the internal buffer that String uses to hold the string.
In that case you could rewrite
char* hashThis;
addressDateTime.toCharArray(hashThis, 28);
as
char* hashThis = addressDateTime.c_str();
By the way, you can also just do
Serial.printf("%02x", hashThis[i]);
and dispense with the snprintf(). Kudos on getting the right buffer size there, though!
UPDATE
In your updated question, you said that you're expecting to see output that looks like:
aa:bb:cc:dd:ee:ff190505141037
instead of:
33433a37313a42463a31443a34323a463431393035303531343038323700
Your code is
for (int i = 0; i < sizeof(hashThis); i++) {
Serial.printf("%02x", hashThis[i]);
}
You're writing each character as a two digit hexadecimal number, so you're going to see the number in hexadecimal that represents the character, not the character itself. If you want to see the characters, do:
for (int i = 0; i < strlen(hashThis); i++) {
Serial.printf("%c", hashThis[i]);
}
or (better)
for (int i = 0; i < strlen(hashThis); i++) {
Serial.print(hashThis[i]);
}
or (best)
Serial.println(hashThis);
Note that I changed your sizeof to a strlen. If for some reason you put a shorter string in hashThis, strlen will do the right thing whereas sizeof will always return the length that hashThis was declared with rather than the length of the string in it.
I am a college student so I'm still learning a lot. I ran into something interesting while making a project. I have this segment of code that works when it isn't placed in a for loop, but doesn't work when it is. I just want to understand why. Here is my code:
void setup() {
Serial.begin(9600);
int a[8];
for(int i=0;i<8;i++) {
a[i]=pow(2,i);
}
for(int i=0;i<8;i++) {
Serial.print(a[i]);
}
}
void loop() {
}
Here is the same code written without the first for loop (where the data gets written into the array):
void setup() {
Serial.begin(9600);
int a[8];
a[0]=pow(2,0);
a[1]=pow(2,1);
a[2]=pow(2,2);
a[3]=pow(2,3);
a[4]=pow(2,4);
a[5]=pow(2,5);
a[6]=pow(2,6);
a[7]=pow(2,7);
for(int i=0;i<8;i++) {
Serial.print(a[i]);
}
}
void loop() {
}
The first code outputs:
1
2
3
7
15
31
63
127
While the second code outputs:
1
2
4
8
16
32
64
128
Does anybody know? I really want to know why.
You are experiencing floating point round off. 2^4 will actually give you a value closer to 15.9999 and when this is assigned to an int, it truncates the decimal to 15. I would suggest doing bit shift operations when using powers of 2, such that:
for(int i=0;i<8;i++)
{
a[i]=(1 << i);
}
If you want to read up on bit shifting, look here.
If you want to know more about the floating point round off, look here.
Additionally if you wanted just a quick fix to your code closer to what you have, I believe this will also work:
for(int i=0;i<8;i++)
{
a[i]= (int) round( pow(2, i) );
}
This will round the floating result properly before casting it to an int.
I wrote a function to convert a hexa string representation (like x00) of some binary data to the data itself.
How to improve this code?
QByteArray restoreData(const QByteArray &data, const QString prepender = "x")
{
QByteArray restoredData = data;
return QByteArray::fromHex(restoredData.replace(prepender, ""));
}
How to improve this code?
Benchmark before optimizing this. Do not do premature optimization.
Beyond the main point: Why would you like to optimize it?
1) If you are really that concerned about performance where this negligible code from performance point of view matters, you would not use Qt in the first place because Qt is inherently slow compared to a well-optimized framework.
2) If you are not that concerned about performance, then you should keep the readability and maintenance in mind as leading principle, in which case your code is fine.
You have not shown any real world example either why exactly you want to optimize. This feels like an academic question without much pratical use to me. It would be interesting to know more about the motivation.
That being said, several improvement items, which are also optimization, could be done in your code, but then again: it is not done for optimization, but more like logical reasons.
1) Prepender is bad name; it is usually called "prefix" in the English language.
2) You wish to use QChar as opposed to QString for a character.
3) Similarly, for the replacement, you wish to use '' rather than the string'ish "" formula.
4) I would pass classes like that with reference as opposed to value semantics even if it is CoW (implicitly shared).
5) I would not even use an argument here for the prefix since it is always the same, so it does not really fit the definition of variable.
6) It is needless to create an interim variable explicitly.
7) Make the function inline.
Therefore, you would be writing something like this:
QByteArray restoreData(QByteArray data)
{
return QByteArray::fromHex(data.replace('x', ''));
}
Your code has a performance problem because of replace(). Replace itself is not very fast, and creating intermediate QByteArray object slows the code down even more. If you are really concerned about performance, you can copy QByteArray::fromHex implementation from Qt sources and modify it for your needs. Luckily, its implementation is quite self-contained. I only changed / 2 to / 3 and added --i line to skip "x" characters.
QByteArray myFromHex(const QByteArray &hexEncoded)
{
QByteArray res((hexEncoded.size() + 1)/ 3, Qt::Uninitialized);
uchar *result = (uchar *)res.data() + res.size();
bool odd_digit = true;
for (int i = hexEncoded.size() - 1; i >= 0; --i) {
int ch = hexEncoded.at(i);
int tmp;
if (ch >= '0' && ch <= '9')
tmp = ch - '0';
else if (ch >= 'a' && ch <= 'f')
tmp = ch - 'a' + 10;
else if (ch >= 'A' && ch <= 'F')
tmp = ch - 'A' + 10;
else
continue;
if (odd_digit) {
--result;
*result = tmp;
odd_digit = false;
} else {
*result |= tmp << 4;
odd_digit = true;
--i;
}
}
res.remove(0, result - (const uchar *)res.constData());
return res;
}
Test:
qDebug() << QByteArray::fromHex("54455354"); // => "TEST"
qDebug() << myFromHex("x54x45x53x54"); // => "TEST"
This code can behave unexpectedly when hexEncoded is malformed (.e.g. "x54x45x5" will be converted to "TU"). You can fix this somehow if it's a problem.
OK, say I have a boolean array called bits, and an int called cursor
I know I can access individual bits by using bits[cursor], and that I can use bit logic to get larger datatypes from bits, for example:
short result = (bits[cursor] << 3) |
(bits[cursor+1] << 2) |
(bits[cursor+2] << 1) |
bits[cursor+3];
This is going to result in lines and lines of code when reading larger types like int32 and int64 though.
Is it possible to do a cast of some kind and achieve the same result? I'm not concerned about safety at all in this context (these functions will be wrapped into a class that handles that)
Say I wanted to get an uint64_t out of bits, starting at an arbitrary address specified by cursor, when cursor isn't necessarily a multiple of 64; is this possible by a cast? I thought this
uint64_t result = (uint64_t *)(bits + cursor)[0];
Would work, but it doesn't want to compile.
Sorry I know this is a dumb question, I'm quite inexperienced with pointer math. I'm not looking just for a short solution, I'm also looking for a breakdown of the syntax if anyone would be kind enough.
Thanks!
You could try something like this and cast the result to your target data size.
uint64_t bitsToUint64(bool *bits, unsigned int bitCount)
{
uint64_t result = 0;
uint64_t tempBits = 0;
if(bitCount > 0 && bitCount <= 64)
{
for(unsigned int i = 0, j = bitCount - 1; i < bitCount; i++, j--)
{
tempBits = (bits[i])?1:0;
result |= (tempBits << j);
}
}
return result;
}