How to convince nvcc to use 128-bit wide loads? - vector

I have a kernel that needs to apply a stencil operation on an array and store the result on another array. The stencil could be expressed in a function as:
float stencil(const float* data)
{
return *(data-1) + *(data+1);
}
I want every thread to produce 4 contiguous values of the output array by loading 6 contiguous values of the input array. By doing so I would be able to use the float4 type for loading and storing in chunks of 128 bytes. This is my program (you can download and compile it, but please consider the kernel in first place):
#include<iostream>
#include<cstdlib>
#include<thrust/host_vector.h>
#include<thrust/device_vector.h>
__global__ void kernel(const float* input, float* output, int size)
{
int i = 4*(blockDim.x*blockIdx.x + threadIdx.x);
float values[6];
float res[4];
// Load values
values[0] = *(input+i-1);
*reinterpret_cast<float4*>(values+1) = *reinterpret_cast<const float4*>(input+i);
values[5] = *(input+i+4);
// Compute result
res[0] = values[0]+values[2];
res[1] = values[1]+values[3];
res[2] = values[2]+values[4];
res[3] = values[3]+values[5];
// Store result
*reinterpret_cast<float4*>(output+i) = *reinterpret_cast<const float4*>(res);
}
int main()
{
// Parameters
const int nBlocks = 8;
const int nThreads = 128;
const int nValues = 4 * nThreads * nBlocks;
// Allocate host and device memory
thrust::host_vector<float> input_host(nValues+64);
thrust::device_vector<float> input(nValues+64), output(nValues);
// Generate random input
srand48(42);
thrust::generate(input_host.begin(), input_host.end(), []{ return drand48()+1.; });
input = input_host;
// Run kernel
kernel<<<nBlocks, nThreads>>>(thrust::raw_pointer_cast(input.data()+32), thrust::raw_pointer_cast(output.data()), nValues);
// Check output
for (int i = 0; i < nValues; ++i)
{
float ref = input_host[31+i] + input_host[33+i];
if (ref != output[i])
{
std::cout << "Error at " << i << " : " << ref << " " << output[i] << "\n";
std::cout << "Abort with errors\n";
std::exit(1);
}
}
std::cout << "Success\n";
}
The program works perfectly.
I would expect the compiler to generate one LD.E.128 instruction for the central part of the local array values, and the registers for this central part to be contiguous (e.g. R4, R5, R6, R7); to have two LD.E instructions for both ends of values; to have one ST.E.128 for the output array.
What happens in reality is the following:
code for sm_21
Function : _Z6kernelPKfPfi
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ NOP; /* 0x4000000000001de4 */
/*0010*/ MOV32I R3, 0x4; /* 0x180000001000dde2 */
/*0018*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0020*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0028*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */
/*0030*/ SHL R6, R0, 0x2; /* 0x6000c00008019c03 */
/*0038*/ IMAD R10.CC, R6, R3, c[0x0][0x20]; /* 0x2007800080629ca3 */
/*0040*/ IMAD.HI.X R11, R6, R3, c[0x0][0x24]; /* 0x208680009062dce3 */
/*0048*/ IMAD R2.CC, R6, R3, c[0x0][0x28]; /* 0x20078000a0609ca3 */
/*0050*/ LD.E R4, [R10+0xc]; /* 0x8400000030a11c85 */
/*0058*/ IMAD.HI.X R3, R6, R3, c[0x0][0x2c]; /* 0x20868000b060dce3 */
/*0060*/ LD.E R7, [R10+0x4]; /* 0x8400000010a1dc85 */
/*0068*/ LD.E R9, [R10+-0x4]; /* 0x87fffffff0a25c85 */
/*0070*/ LD.E R5, [R10+0x8]; /* 0x8400000020a15c85 */
/*0078*/ LD.E R0, [R10+0x10]; /* 0x8400000040a01c85 */
/*0080*/ LD.E R8, [R10]; /* 0x8400000000a21c85 */
/*0088*/ FADD R6, R7, R4; /* 0x5000000010719c00 */
/*0090*/ FADD R4, R9, R7; /* 0x500000001c911c00 */
/*0098*/ FADD R7, R5, R0; /* 0x500000000051dc00 */
/*00a0*/ FADD R5, R8, R5; /* 0x5000000014815c00 */
/*00a8*/ ST.E.128 [R2], R4; /* 0x9400000000211cc5 */
/*00b0*/ EXIT; /* 0x8000000000001de7 */
................................
All loads are 32-bit wide (LD.E). On the other side, there is just one store instruction ST.E.128, as expected.
I don't show the whole code here again, but I did a test where the stencil does not need a value to the left, but only one to the right (e.g. *data + *(data+1)), in which case my values array contains just 5 values and the float4 load operation modifies the first 4 values of the array (I still have one extra load for the last value). In that case the compiler uses LD.E.128.
My question is why doesn't the compiler understand that it can use the 128-bit wide read if the target register is not the first one in the local array. After all the local array values is just a programming way to say that I need 6 floats to be stored in the registers. There is no such a thing like an array in the resulting ptx or SASS code. I thought I gave the compiler enough hints for it to understand LD.E.128 was the right instruction here.
Second question: how can I make it use the 128-wide load here without having to manually write low-level code? (However if a couple of asm instructions help I'm open to receive suggestions.)
Side note: the decision of using 32-bit load for reading the input and 128-bit store for writing the input is taken while producing ptx code. ptx code already shows this pattern of multiple small loads and a single large store.
I am using CUDA 7.5 under linux.
Based on the suggestions given in the comments, I did some experiments.
Declaring either input or output as __restrict__ (or both) solves the problem: the compiler generated a LD.E.128 and two LD.E, which is what I wanted to achieve, when generating code for the architecture sm_35. Strangely enough, when generating for sm_21 it still prduces six LD.E, but it produces one ST.E.128. It sounds like a compiler bug to me, because the instruction LD.E.128 should be perfectly usable in the older architecture as it is in the newest.
The code presented above uses the 128-bit loads just with the small change of using the __restrict__ keyword as suggested by njuffa and works. I did also follow the suggestion of m.s. I reproduced the same results shown in the pastebin snippet (one LD.E.128 + one LD.E.64). But at runtime it crashes with the following error:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): an illegal memory access was encountered
I'm pretty sure the misalignment is the cause of this problem.
Update: after using cuda-memcheck I'm sure the problem is misalignment:
========= Invalid __global__ read of size 16
========= at 0x00000060 in kernel(float const *, float*, int)
========= by thread (4,0,0) in block (7,0,0)
========= Address 0xb043638bc is misaligned

The problem is that the nvcc compiler is unable to resolve the base address for the vector load in your kernel. This can be a bug or is just an inadequacy.
I modified your code a little bit:
__global__ void kernel2(const float* input, float* output, int size)
{
int i = (blockDim.x*blockIdx.x + threadIdx.x);
float values[6];
float res[4];
// Load values
values[0] = *(input+(i*4)-1);
float4 test =*(reinterpret_cast<const float4*>(input)+i);
values[5] = *(input+(i*4)+4);
values[1] = test.x;
values[2] = test.y;
values[3] = test.z;
values[4] = test.w;
// Compute result
res[0] = values[0]+values[2];
res[1] = values[1]+values[3];
res[2] = values[2]+values[4];
res[3] = values[3]+values[5];
// Store result
*(reinterpret_cast<float4*>(output)+i) = *reinterpret_cast<const float4*>(res);
}
The kernel code compiled to ptx:
.visible .entry _Z7kernel2PKfPfi(
.param .u64 _Z7kernel2PKfPfi_param_0,
.param .u64 _Z7kernel2PKfPfi_param_1,
.param .u32 _Z7kernel2PKfPfi_param_2
)
{
.reg .f32 %f<15>;
.reg .b32 %r<7>;
.reg .b64 %rd<10>;
ld.param.u64 %rd1, [_Z7kernel2PKfPfi_param_0];
ld.param.u64 %rd2, [_Z7kernel2PKfPfi_param_1];
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
shl.b32 %r5, %r4, 2;
add.s32 %r6, %r5, -1;
mul.wide.s32 %rd3, %r6, 4;
cvta.to.global.u64 %rd4, %rd1;
add.s64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5];
mul.wide.s32 %rd6, %r4, 16;
add.s64 %rd7, %rd4, %rd6;
ld.global.v4.f32 {%f2, %f3, %f4, %f5}, [%rd7];
ld.global.f32 %f10, [%rd5+20];
cvta.to.global.u64 %rd8, %rd2;
add.s64 %rd9, %rd8, %rd6;
add.f32 %f11, %f3, %f5;
add.f32 %f12, %f2, %f4;
add.f32 %f13, %f4, %f10;
add.f32 %f14, %f1, %f3;
st.global.v4.f32 [%rd9], {%f14, %f12, %f11, %f13};
ret;
}
You can see nicely how the addresses for the load are computed (%rd6 and %rd8).
While compiling your kernel to ptx results in:
.visible .entry _Z6kernelPKfPfi(
.param .u64 _Z6kernelPKfPfi_param_0,
.param .u64 _Z6kernelPKfPfi_param_1,
.param .u32 _Z6kernelPKfPfi_param_2
)
{
.reg .f32 %f<11>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [_Z6kernelPKfPfi_param_0];
ld.param.u64 %rd2, [_Z6kernelPKfPfi_param_1];
cvta.to.global.u64 %rd3, %rd2;
cvta.to.global.u64 %rd4, %rd1;
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
shl.b32 %r5, %r4, 2;
mul.wide.s32 %rd5, %r5, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f1, [%rd6+-4];
ld.global.f32 %f2, [%rd6];
ld.global.f32 %f3, [%rd6+12];
ld.global.f32 %f4, [%rd6+4];
ld.global.f32 %f5, [%rd6+8];
ld.global.f32 %f6, [%rd6+16];
add.s64 %rd7, %rd3, %rd5;
add.f32 %f7, %f5, %f6;
add.f32 %f8, %f4, %f3;
add.f32 %f9, %f2, %f5;
add.f32 %f10, %f1, %f4;
st.global.v4.f32 [%rd7], {%f10, %f9, %f8, %f7};
ret;
}
where the compiler only generates code to compute one address (%rd6) and uses static offsets. At this point the compiler failed to emit a vector load. Why? I honestly don't know, maybe two optimizations interfere here.
In SASS you see for kernel2:
.section .text._Z7kernel2PKfPfi,"ax",#progbits
.sectioninfo #"SHI_REGISTERS=18"
.align 64
.global _Z7kernel2PKfPfi
.type _Z7kernel2PKfPfi,#function
.size _Z7kernel2PKfPfi,(.L_39 - _Z7kernel2PKfPfi)
.other _Z7kernel2PKfPfi,#"STO_CUDA_ENTRY STV_DEFAULT"
_Z7kernel2PKfPfi:
.text._Z7kernel2PKfPfi:
/*0008*/ MOV R1, c[0x0][0x44];
/*0010*/ S2R R0, SR_CTAID.X;
/*0018*/ MOV R4, c[0x0][0x140];
/*0020*/ S2R R3, SR_TID.X;
/*0028*/ MOV R5, c[0x0][0x144];
/*0030*/ IMAD R3, R0, c[0x0][0x28], R3;
/*0038*/ MOV32I R8, 0x10;
/*0048*/ IMAD R16.CC, R3, 0x10, R4;
/*0050*/ ISCADD R0, R3, -0x1, 0x2;
/*0058*/ IMAD.HI.X R17, R3, 0x10, R5;
/*0060*/ IMAD R14.CC, R0, 0x4, R4;
/*0068*/ IMAD.HI.X R15, R0, 0x4, R5;
/*0070*/ LD.E.128 R4, [R16];
/*0078*/ LD.E R2, [R14];
/*0088*/ IMAD R12.CC, R3, R8, c[0x0][0x148];
/*0090*/ LD.E R0, [R14+0x14];
/*0098*/ IMAD.HI.X R13, R3, R8, c[0x0][0x14c];
/*00a0*/ FADD R9, R4, R6;
/*00a8*/ FADD R10, R5, R7;
/*00b0*/ FADD R8, R2, R5;
/*00b8*/ FADD R11, R6, R0;
/*00c8*/ ST.E.128 [R12], R8;
/*00d0*/ EXIT;
.L_1:
/*00d8*/ BRA `(.L_1);
.L_39:
Here you have your LD.E.128.
Compiled with nvcc release 7.5, V7.5.17.

Related

Bootloader Jump Function. How to Jump to the right Address?

I am trying to create a bootloader that jumps to my application code on a MKE02Z32VFM4 (KEO2 Series from Freescale). I am working with the Keil IDE 5 and the Armv6 Compiler v6.16.
After Issuing the Jump Instruction to the application start address, the code Jumps to "a" reset handler. And when the instruction to jump to __main is reached, it jumps to the main of the bootloader. The Flash Memory is defined by the linker file as followed:
#define m_interrupts_start 0x00000000
#define m_interrupts_size 0x00000200
#define m_flash_config_start 0x00000400
#define m_flash_config_size 0x00000010
#define bootloader_start 0x00000410
#define bootloader_size 0x00000800 //2kb size 0x410+0x800=0xC10 ==> 256 byte aligned => 0xE00
#define ota_part_0_start 0x00000E00 //Vector Table interrupt must be 256 byte aligned
#define ota_part_0_size 0x00003800 //14KB (14336 Byte) 0xE00+0x3800 => 0x4600
#define ota_part_1_start 0x00004600
#define ota_part_1_size 0x00003800 //14KB (14336 Byte) 0x4600+0x3800 = 0x7E00 || flash_end == 0x0000 7FFF => 0x100(256) byte frei
#define m_data_start 0x1FFFFC00 //ram start
#define m_data_size 0x00001000 //4kb
The application linker file (scatter file) is working with these defines:
#define m_interrupts_start 0x00000E00 //Address of the application reset handler
#define m_interrupts_size 0x00000200
#define m_flash_config_start 0x00001000 //some config bytes, defined by manufacturer
#define m_flash_config_size 0x00000010
#define m_text_start 0x00001010 // start address of application code
#define m_text_size 0x000035F0
#define m_data_start 0x1FFFFC00 //ram start
#define m_data_size 0x00001000 //4kb
The reset handler is written in assembler, i tried to comment the instructions:
Reset_Handler:
cpsid i /* Mask interrupts */
.equ VTOR, 0xE000ED08 //.equ is like #define in C. VTOR = predefined ARMv6 label. 0xE000ED08 VectorTableOffsetRegister.
ldr r0, =VTOR // load word from memory. load value from word at VTOR address to r0. R0 now contains the offset for the vector table.
ldr r1, =__Vectors // load word from memory. load value of word at __Vectors address to r1. --> the first word at __Vectors is the initial stack pointer
str r1, [r0] //store Register to memory. content of r1 is stored to memory adress in r0(==VTOR) --> initial stack pointer is stored to the first word of the Vector table
ldr r2, [r1] //load word from memory. r2 is set to the value of the word in memory at address in r1. --> r2 is set to the address of the initial stack pointer
msr msp, r2 //move to special register. move value of r2 to special register msp (main stack pointer) --> main stack pointer is set to the valjue of the initial stack pointer
ldr r0,=SystemInit //set register 0 to address of SystemInit function. (
blx r0 // branch with link ( to address of r0)
cpsie i /* Unmask interrupts */
ldr r0,=__main
bx r0
.pool
.size Reset_Handler, . - Reset_Handler
The bootloader code is as followed:
Address in this first test is the value 0x00000E00 (start of user app)
__attribute__( ( naked, noreturn ) ) void BootJumpASM( uint32_t SP, uint32_t RH )
{
__asm("MSR MSP,r0");
__asm("BX r1");
}
static void BootJump( uint32_t *Address )
{
if( CONTROL_nPRIV_Msk & __get_CONTROL( ) ) //THIS is from the arm doku, but it is always false in our implementation and skipped.
{ /* not in privileged mode */
EnablePrivilegedMode( ) ;
}
NVIC->ICER[0] = 0xFFFFFFFF ;
NVIC->ICPR[0] = 0xFFFFFFFF ;
SysTick->CTRL = 0 ;
SCB->ICSR |= SCB_ICSR_PENDSTCLR_Msk ;
if( CONTROL_SPSEL_Msk & __get_CONTROL( ) ) //THIS is from the arm doku, but it is always false in our implementation and skipped. (only 1 stack pointer used)
{ /* MSP is not active */
__set_MSP( __get_PSP( ) ) ;
__set_CONTROL( __get_CONTROL( ) & ~CONTROL_SPSEL_Msk ) ;
}
SCB->VTOR = ( uint32_t )Address ; //Setting the Vector Table Offset Register to the start of the user app.
BootJumpASM( Address[ 0 ], Address[ 1 ] ) ; //This function is taken from the Arm Documentation
}
After
SCB->VTOR = (uint32_t)Address; // Set VTOR to 0xE00
The VTOR register IS updated to 0xE00. However after executing the function:
__attribute__( ( naked, noreturn ) ) void BootJumpASM( uint32_t SP, uint32_t RH )
{
__asm("MSR MSP,r0");
__asm("BX r1"); //<-- This is the Point where VTOR changes it value to 0x00 again
}
VTOR is 0x00 again and im in the resethandler. This resethandler connects to the bootloader main. So i assume im in the reset handler at 0x00 and not the one at 0xE00. I checked the flash memory and am positive that a Vector Table is located at 0x000 AND 0xE00. I am positive that the firmware of the application is also at the right place in the flash.
I am assuming that I either:
Defined the Memory space wrong.
The BootJumpASM function jumps to a illegal location and the MCU restarts over at 0x00 with a reset VTOR Register.
I am not sure, why the BootJumpASM function uses r0 and r1 and what it does with the arguments of the function. I am simply new at assembler and all the specific compiler attributes. The function like described above is directly copied from:
https://developer.arm.com/documentation/ka002218/latest
And while i do not understand how the compiler manages to put the Function arguments to register r0 and r1 I am sure that the mistake is at my side and not in the official arm docs.
Can someone explain to me, why after the second instruction of the "BootJumpASM" function "VTOR" is reset to 0x00?
and why the resethandler ,the debugger is in right after, connects to the bootloader main and not the application main. And how do i manage to jump to the right location in memory.
Thanks for your time. I hope this explanation is not too confusing.
The problem was not the jump instruction, but the Debugger of the Keil IDE. I set up the debug environment according to arm and Keil documentation but after the jump out of the code environment of the bootloader into the application memory area, the Debugger triggered a reset. (Bootloader is a seperate Keil project.)
Starting the debugger within the application project, no such reset is triggered after the jump instruction and following the dissasembly view the bootloader executes as expected and the jump instruction works.
Thanks to all for taking time to try and find the error with me.

HardFault_Handler on STM32F407ZET6

I am currently facing some problems with STM32F4, the process "hangs" and I am not able to understand at what point it "locked". When this happened, I collected the following values for the following variables (I created the variable stepError to "translate" the CFSR variable):
void prvGetRegistersFromStack (uint32_t * pulFaultStackAddress)
{
volatile uint32_t CFSRValue = SCB-> CFSR;
volatile uint32_t HFSRValue = SCB-> HFSR;
char stepError [1024] = "";
if ((HFSRValue & (1 << 30)) = 0) {
CFSRValue >> = 16;
if ((CFSRValue & (1 << 9)) = 0) strcpy (stepError, "Divide by zero");
if ((CFSRValue & (1 << 8))! = 0) strcpy (stepError, "Unaligned access");
if ((CFSRValue & (1 << 3)) = 0) strcpy (stepError, "No UsageFault coprocessor");
if ((CFSRValue & (1 << 2)) = 0) strcpy (stepError, "Invalid PC load UsageFault");
if ((CFSRValue & (1 << 1))! = 0) strcpy (stepError, "Invalid state");
if ((CFSRValue & (1 << 0))! = 0) strcpy (stepError, "Undefined instruction");
}
/ * These are volatile to try and prevent the compiler / linker optimizing them
away the variables never actually get used. If the debugger will not show the
values of the variables, make them global my moving their declaration outside
of this function. * /
volatile uint32_t r0;
volatile uint32_t r1;
volatile uint32_t r2;
volatile uint32_t r3;
volatile uint32_t r12;
volatile uint32_t lr; / * Link register. * /
volatile uint32_t pc; / * Program counter. * /
volatile uint32_t psr; / * Program status register. * /
r0 = pulFaultStackAddress [0];
r1 = pulFaultStackAddress [1];
r2 = pulFaultStackAddress [2];
r3 = pulFaultStackAddress [3];
r12 = pulFaultStackAddress [4];
lr = pulFaultStackAddress [5]; // Bit (2 or 3) = 0 determines MSP (Main Stack Pointer); 1 = PSP (Process Stack Pointer)
pc = pulFaultStackAddress [6]; // Variable that contains the address where the error occurred. To check where it was, search the Disassembly on the screen Debug the address
psr = pulFaultStackAddress [7];
/ * When the following line is hit, the variables contain the register values. * /
// Joseph Yiu:
/ *
1) Look at LR value when the core enter hardfault, if bit 2 is 0, then read the value of MSP. Otherwise, read the value of PSP.
2) Based on the MSP / PSP value, you should be able to locate the start of stack frame, stacked PC is in address SP + 24.
3) Generate a disassembled listing of the program you run, and try to locate the stack PC address in the disassembled program list.
* /
GPIO_WriteLed (0,1);
for (int i = 0; i <= 10; i ++)
{
PWM_Change_DutyCycle (i, 0);
}
for (;;);
}
HFSRValue 1073741824 CFSRValue 0 StepError 0x2001fbb0 ""
r0 0 r1 0 r2 0 r3 11
r12 536890019 lr 134334773 pc 0x0801bab0 psr 3221225472
But I can not know from these values where the error occurred, whether it was caused by usb, serial, encoder or ADC converter and etc. How to implement void HardFault_Handler (void) so I can recognize where the error occurs?
Edit:From what I understand the disassembly shows the hardfault and not where the code was before hardfault.
You can find the address of the instruction/function that caused the fault from the exception stack frame:
In the example you gave this seems to be already passed to the prvGetRegistersFromStack function you've posted as the pulFaultStackAddress parameter. As you're interested in finding out which part of your code caused the HardFault, this can be found in the PC and LR that have been stacked - in your example those are taken from pulFaultStackAddress[6] and pulFaultStackAddress[5] respectively.
PC should contain the Program Counter, which is the instruction that was being executed as the fault occurred. LR should contain the Link Register value, which is the return address or in other words - address of the calling subroutine/function.
You've posted that those values are: pc 0x0801bab0 and lr 134334773 (0x801C935 in hexadecimal). Both values are valid addresses within internal flash for STM32F407ZE so we may assume they are valid. All that's left is to translate the memory addresses back to lines within your source code. Two examples of how to do so:
Using your IDE
Most IDEs nowadays have a "disassembly" view. Commonly used Eclipse-based ones (eg. SW4STM32 or TrueSTUDIO for STM32) have it under Window->Show View->Other->Debug->Disassembly. IAR also has one. Once open, paste the memory address (e.g. 0x0801bab0 which was the PC value) into the box during debugging and press Enter. That should show you the corresponding disassembly, interleaved with the source code lines. That should give you an idea of where the HardFault occurred.
Another approach is to..
Using your toolchain
Toolchains also have command line tools allowing you do the same thing as the option above. To give an example I'm going to assume you're using arm-none-eabi. There you can use the addr2line to translate the memory address back to source line code:
arm-none-eabi-addr2line.exe -e [your executable].elf -i 0x0801bab0
where [your executable] is the path to the ELF file you've loaded onto the MCU. The -i switch attempts to unwind inlined functions which sometimes helps to better see where the call originated from.
Whichever approach you choose from, you can do the same for both PC value (address of where the fault happened) and LR (caller).

Serial Communication Between Arduino and EPOS: CRC Calculation Problems

I am trying to interface with an EPOS2 motor controller over RS232 Serial with an Arduino Duemilanove (because it's what I had lying around). I got it to work for the most part - I can send and recieve data when I manually calculate the CRC checksum - but I'm trying to dynamically control the velocity of the motor which requires changing data, and therefore, changing checksum. The documentation for calculating the checksum is here, on page 24:
http://www.maxonmotorusa.com/medias/sys_master/8806425067550/EPOS2-Communication-Guide-En.pdf
I copied the code directly out of this documentation, and integrated it into my code, and it does not calculate the checksum correctly. Below is a shortened version of my full sketch (tested, yielding 0x527C). The weirdest part is that it calculates a different value in my full sketch than in the one below, but both are wrong. Is there something obvious that I'm missing?
byte comms[6] = { 0x10, 0x01, 0x03, 0x20, 0x01, 0x02 }; // CRC should be 0xA888
void setup() {
Serial.begin(115200);
}
void loop() {
calcCRC(comms, 6, true);
while(1);
}
word calcCRC(byte *comms, int commsSize, boolean talkative) {
int warraySize = commsSize / 2 + commsSize % 2;
word warray[warraySize];
warray[0] = comms[0] << 8 | comms[1];
Serial.println(warray[0], HEX);
for (int i = 1; i <= warraySize - 1; i++) {
warray[i] = comms[i * 2 + 1] << 8 | comms[i * 2];
Serial.println(warray[i], HEX);
}
word* warrayP = warray;
word shifter, c;
word carry;
word CRC = 0;
//Calculate pDataArray Word by Word
while (commsSize--)
{
shifter = 0x8000;
c = *warrayP ++;
do {
carry = CRC & 0x8000;
CRC <<= 1;
if (c & shifter) CRC++;
if (carry) CRC ^= 0x1021;
shifter >>= 1;
} while (shifter);
}
if (talkative) {
Serial.print("the CRC for this data is ");
Serial.println(CRC, HEX);
}
return CRC;
}
I used the link below to calculate the checksum that works for this data:
https://www.ghsi.de/CRC/index.php?Polynom=10001000000100001&Message=1001+2003+0201
Thanks so much!!
Where to begin.
First off, you are using commsSize-- for your loop, which will go through six times when you have only three words in the warray. So you are doing an out-of-bounds access of warray, and will necessarily get a random result (or crash).
Second, the build of your first word is backwards from your other builds. Your online CRC suffers the same problem, so you apparently don't even have a reliable test case.
Third (not an issue for the test case), if you have an odd number of bytes of input, you are doing an out-of-bounds access of comms to fill out the last word. And you are running the CRC bits too many times, unless the specification directs some sort of padding in that case. (Your documentation link is broken so I can't see what's supposed to happen.) Even then, you are using random data for the padding instead of zeros.
The whole word conversion thing is a waste of time anyway. You can just do it a byte at a time, given the proper ordering of the bytes. That also avoids the odd-number-of-bytes problem. This will produce the 0xa888 from the input you gave the online CRC calculator (which are your bytes in a messed up order, but exactly as you gave them to the calculator):
unsigned char dat[6] = { 0x10, 0x01, 0x20, 0x03, 0x02, 0x01 };
unsigned crc1021(unsigned char *dat, int len) {
unsigned crc = 0;
while (len) {
crc ^= *dat++ << 8;
for (int k = 0; k < 8; k++)
crc = crc & 0x8000 ? (crc << 1) ^ 0x1021 : crc << 1;
len--;
}
return crc & 0xffff;
}

pointers in c translated to assembly

the code below as I understand it says store the pointer in %rsi in %eax if thats correct then the second line says add the pointer in %eax to the pointer in %rdi ?
very confused. I know assembly doesn't have pointers I am just speaking as translating assembly to c. I must write the assembly code into c code, and these two lines are killing me. Can I have clarification?
movl (%rsi), %eax
addl %eax, (%rdi)
Since you seem to be using using AT&T syntax, the parentheses dereference the value in %rsi. The C equivalent for these expressions would be:
/* Expression 1 */
unsigned int* p = some_address;
unsigned int i = *p; /* *p dereferences the address in p */
/* Expression 2 */
unsigned int* p = some_address;
unsigned int i = 8;
i += *p /* Increase i by the value pointed to by p */

Recover a GZIP file of which first 361 bytes are truncated

I have a gzip file of size 325 MB. I just figured it that it is truncated by 361 bytes from the beginning.
Please advise how can I recover the compressed files from it.
You need to find the next deflate block boundary. Such a boundary can occur at any bit location. You will need to attempt decompression starting at every bit until you get successful decoding for at least a few deflate blocks.
You can use zlib's inflatePrime() to feed less than a byte to inflate(). You can use inflateSetDictionary() to provide a faux 32K dictionary to precede the data being inflated, in order to avoid distance-too-far-back errors.
Once you find a block boundary, you have solved half the problem. The next half is to find where in the deflate stream there is no longer a dependence on the unknown uncompressed data derived from that missing 361 bytes of compressed data. It is possible for such a dependency to very long lasting. For example, if the word " the " appears in that missing section, then it can be referred to after that as a missing string. However, you don't know that it is " the ". All you know is that there is a reference to a five-byte string in the missing data. Then where that five-byte string is copied to can itself be referenced by a later match. This could, in principle, propagate through the entire 325 MB, making the whole thing completely unrecoverable.
However that is unlikely. It is more likely that at some point the propagation of strings from the first 361 bytes stops. From there on, you can recover the uncompressed data.
In order to tell whether you are still seeing propagation or not, do the decompression twice. Once with an initial faux dictionary of all 0's, and once with an initial faux dictionary of all 1's. Where the decompressed data is the same for both decompressions, you have successfully recovered that data.
Then you will need to go up to the next level of structure in that data, and see if you can somehow make use of what you have recovered.
Good luck. And don't cut off the first 361 bytes next time.
Below is example code that does what is described above.
/* salvage -- recover data from a corrupted deflate stream
* Copyright (C) 2015 Mark Adler
* Version 1.0 28 June 2015 Mark Adler
*/
/*
This software is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Mark Adler
madler#alumni.caltech.edu
*/
/* Attempt to recover deflate data from a corrupted stream. The corrupted data
is read on stdin, and any reliably decompressed data is written to stdout. A
deflate stream is deemed to have been found successfully if there are eight
or fewer bytes of compressed data unused when done. This can be changed
with the MAXLEFT macro below, or the conditional that currently uses
MAXLEFT. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include "zlib.h"
/* Get the size of an allocated piece of memory (usable size -- not necessarily
the requested size). */
#if defined(__APPLE__) && defined(__MACH__)
# include <malloc/malloc.h>
# define memsize(p) malloc_size(p)
#elif defined (__linux__)
# include <malloc.h>
# define memsize(p) malloc_usable_size(p)
#elif defined (_WIN32)
# include <malloc.h>
# define memsize(p) _msize(p)
#else
# error You need to find an allocated memory size function
#endif
#define local static
/* Load an entire file into a memory buffer. load() returns 0 on success, in
which case it puts all of the file data in *dat[0..*len - 1]. That is,
unless *len is zero, in which case *dat is NULL. *data is allocated memory
which should be freed when done with it. load() returns zero on success,
with *data == NULL and *len == 0. The error values are -1 for read error or
1 for out of memory. To guard against bogging down the system with
extremely large allocations, if limit is not zero then load() will return an
out of memory error if the input is larger than limit. */
local int load(FILE *in, unsigned char **data, size_t *len, size_t limit)
{
size_t size = 1048576, have = 0, was;
unsigned char *buf = NULL, *mem;
*data = NULL;
*len = 0;
if (limit == 0)
limit--;
if (size >= limit)
size = limit - 1;
do {
/* if we already saturated the size_t type or reached the limit, then
out of memory */
if (size == limit) {
free(buf);
return 1;
}
/* double size, saturating to the maximum size_t value */
was = size;
size <<= 1;
if (size < was || size > limit)
size = limit;
/* reallocate buf to the new size */
mem = realloc(buf, size);
if (mem == NULL) {
free(buf);
return 1;
}
buf = mem;
/* read as much as is available into the newly allocated space */
have += fread(buf + have, 1, size - have, in);
/* if we filled the space, make more space and try again until we don't
fill the space, indicating end of file */
} while (have == size);
/* if there was an error reading, discard the data and return an error */
if (ferror(in)) {
free(buf);
return -1;
}
/* if a zero-length file is read, return NULL for the data pointer */
if (have == 0) {
free(buf);
return 0;
}
/* resize the buffer to be just big enough to hold the data */
mem = realloc(buf, have);
if (mem != NULL)
buf = mem;
/* return the data */
*data = buf;
*len = have;
return 0;
}
#define DICTSIZE 32768
#if UINT_MAX <= 0xffff
# define BUFSIZE 32768
#else
# define BUFSIZE 1048576
#endif
/* Inflate the provided buffer starting at a specified bit offset. Use an
already-initialized inflate stream structure for rapid repeated attempts.
The structure needs to have been initialized using inflateInit2(strm, -15).
Inflation begins at data[off], starting at bit bit in that byte, going from
that bit to the more significant bits in that byte, and then on to the next
byte. bit must be in the range 0..7. bit == 0 uses the entire byte at
data[off]. bit == 7 uses only the most significant bit of the byte at
data[off]. Before inflation, the dictionary is initialized to
dict[0..DICTSIZE-1] so that references before the start of the uncompressed
data do not stop inflation. Inflation continues as long as possible, until
either an error is encountered, the end of the deflate stream is reached, or
data[len-1] is processed. On entry *recoup is a pointer to allocated memory
or NULL, and on return *recoup points to allocated memory with the
decompressed data. *got is set to the number of bytes of decompressed data
returned at *recoup.
inflate_at() returns Z_DATA_ERROR if an error was detected in the alleged
deflate data, Z_STREAM_END if the end of a valid deflate stream was reached,
or Z_OK if the end of the provided compressed data was reached without
encountering an erorr or the end of the stream. */
local int inflate_at(z_stream *strm, unsigned char *data, size_t len,
size_t off, int bit, size_t *unused, unsigned char *dict,
unsigned char **recoup, size_t *got)
{
int ret;
size_t left, size;
/* check input */
assert(data != NULL && off < len && bit >= 0 && bit <= 7);
assert(dict != NULL && recoup != NULL);
/* set up inflate engine, feeding first few bits if necessary */
ret = inflateReset(strm);
assert(ret == Z_OK);
ret = inflateSetDictionary(strm, dict, DICTSIZE);
assert(ret == Z_OK);
if (bit) {
ret = inflatePrime(strm, 8 - bit, data[off] >> bit);
assert(ret == Z_OK);
off++;
}
/* inflate as much as possible */
strm->next_in = data + off;
left = len - off;
*got = 0;
do {
strm->avail_in = left > UINT_MAX ? UINT_MAX : left;
left -= strm->avail_in;
do {
/* assure at least BUFSIZE available in recoup */
size = memsize(*recoup);
if (*got + BUFSIZE > size) {
size = size ? size << 1 : BUFSIZE;
assert(size != 0);
*recoup = reallocf(*recoup, size);
assert(*recoup != NULL);
}
/* inflate into recoup */
strm->next_out = *recoup + *got;
strm->avail_out = BUFSIZE;
ret = inflate(strm, Z_NO_FLUSH);
assert(ret != Z_STREAM_ERROR && ret != Z_MEM_ERROR);
/* set the number of compressed bytes unused so far, in case we
return */
if (unused != NULL)
*unused = left + strm->avail_in;
/* update the number of uncompressed bytes generated */
*got += BUFSIZE - strm->avail_out;
/* if we cannot continue to decompress, then return the reason */
if (ret == Z_DATA_ERROR || ret == Z_STREAM_END)
return ret;
/* continue with provided input data until all output generated */
} while (strm->avail_out == 0);
assert(strm->avail_in == 0);
/* provide more input data, if any */
} while (left);
/* ran through all compressed data with no errors or end of stream */
return Z_OK;
}
/* The criteria for success is the completion of inflate with no more than this
many bytes unused. (8 is the length of a gzip trailer.) */
#define MAXLEFT 8
/* Read a corrupted (or not) deflate stream from stdin and write the reliably
recovered data to stdout. */
int main(void)
{
int ret, bit;
unsigned char *data = NULL, *recoup = NULL, *comp = NULL;
size_t len, off, unused, got;
z_stream strm;
unsigned char dict[DICTSIZE] = {0};
/* read input into memory */
ret = load(stdin, &data, &len, 0);
if (ret < 0)
fprintf(stderr, "file error reading input\n");
if (ret > 0)
fprintf(stderr, "ran out of memory reading input\n");
assert(ret == 0);
fprintf(stderr, "read %lu bytes\n", len);
/* initialize inflate structure */
strm.zalloc = Z_NULL;
strm.zfree = Z_NULL;
strm.opaque = Z_NULL;
strm.next_in = Z_NULL;
strm.avail_in = 0;
ret = inflateInit2(&strm, -15);
assert(ret == Z_OK);
/* scan for an acceptable starting point for inflate */
for (off = 0; off < len; off++)
for (bit = 0; bit < 8; bit++) {
ret = inflate_at(&strm, data, len, off, bit, &unused, dict,
&recoup, &got);
if ((ret == Z_STREAM_END || ret == Z_OK) && unused <= MAXLEFT)
goto done;
}
done:
/* if met the criteria, show result and write out reliable data */
if (bit != 8 && (ret == Z_STREAM_END || ret == Z_OK)) {
fprintf(stderr,
"decoded %lu bytes (%lu unused) at offset %lu, bit %d\n",
len - off - unused, unused, off, bit);
/* decompress again with a different dictionary to detect unreliable
data */
memset(dict, 1, DICTSIZE);
inflate_at(&strm, data, len, off, bit, NULL, dict, &comp, &got);
{
unsigned char *p, *q;
/* search backwards from the end for the first unreliable byte */
p = recoup + got;
q = comp + got;
while (q > comp)
if (*--p != *--q) {
p++;
q++;
break;
}
/* write out the reliable data */
fwrite(q, 1, got - (q - comp), stdout);
fprintf(stderr,
"%lu bytes of reliable uncompressed data recovered\n",
got - (q - comp));
fprintf(stderr,
"(out of %lu total uncompressed bytes recovered)\n", got);
}
}
/* otherwise declare failure */
else
fprintf(stderr, "no deflate stream found that met criteria\n");
/* clean up */
free(comp);
free(recoup);
inflateEnd(&strm);
free(data);
return 0;
}

Resources