I have this compute shader that traverses a binary tree. It used to work fine with the separately installed DirectX SDK (June), with compiler #43.
The compilers #46 and #47 (from Windows SDKs 8.0 and 8.1, respectively) however seem to omit two really crucial lines of code, that have the shader running in circles, checking the same tree nodes over and over again until Windows restarts the graphics driver (Verified by looking at disassembly).
Here's a minimal code sample exhibiting this behavior:
#define LEFT_PROCESSED 1
#define RIGHT_PROCESSED 2
struct Node
{
float4 min;
float4 max;
int left;
int right;
int parent;
int flags;
};
RWStructuredBuffer<Node> tree: register(u0);
bool TreeSearch()
{
Node node = tree[0];
int nodeId = 0;
int statusStack[40];
int stackSize = 0;
statusStack[0] = 0;
while (true)
{
if (!(statusStack[stackSize] & LEFT_PROCESSED))
{
statusStack[stackSize] |= LEFT_PROCESSED;
++stackSize;
statusStack[stackSize] = 0;
nodeId = node.left;
node = tree[nodeId];
continue;
}
if (!(statusStack[stackSize] & RIGHT_PROCESSED))
{
statusStack[stackSize] |= RIGHT_PROCESSED; // this line
++stackSize;
statusStack[stackSize] = 0; // and this line
nodeId = node.right;
node = tree[nodeId];
continue;
}
if (node.parent != -1)
{
--stackSize;
nodeId = node.parent;
node = tree[nodeId];
}
else
return false;
}
return false;
}
[numthreads(32, 1, 1)]
void CSSearch(uint2 dispatchThreadId: SV_DispatchThreadID)
{
TreeSearch();
}
And the corresponding assembly:
cs_5_0
dcl_globalFlags refactoringAllowed
dcl_uav_structured u0, 48
dcl_temps 3
dcl_indexableTemp x0[40], 4
dcl_thread_group 32, 1, 1
ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r0.xyz, l(0), l(32), u0.xyzx
mov x0[0].x, l(0)
mov r1.xyz, r0.yzxy
mov r0.w, l(0)
loop
mov r1.w, x0[r0.w + 0].x
and r2.x, r1.w, l(1)
if_z r2.x
or r1.w, r1.w, l(1) // here's the first one in the LEFT branch
mov x0[r0.w + 0].x, r1.w //
iadd r1.w, r0.w, l(1)
mov x0[r1.w + 0].x, l(0) // and the second one
ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r2.xyz, r1.z, l(32), u0.yzxx
mov r1.xyz, r2.xyzx
mov r0.w, r1.w
continue
endif
mov r1.w, x0[r0.w + 0].x // why is there nothing in the RIGHT branch?
and r1.w, r1.w, l(2)
if_z r1.w
iadd r1.w, r0.w, l(1)
ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r2.xyz, r1.x, l(32), u0.yzxx
mov r1.xyz, r2.xyzx
mov r0.w, r1.w
continue
endif
ine r1.w, r1.y, l(-1)
if_nz r1.w
iadd r0.w, r0.w, l(-1)
ld_structured_indexable(structured_buffer, stride=48)(mixed,mixed,mixed,mixed) r1.xyz, r1.y, l(32), u0.yzxx
else
break
endif
endloop
ret
When I omit the first continue, it generates the code for those two lines, but then its also broken.
Any idea how to get the newer compiler to generate that code?
Please note: I'm not familiar with GPU programming, and am not sure whether this is the compiler's fault or the code's fault. The following is just a workaround.
You can mimic the continue behavior by using an explicit variable, in hope that the compiler does not get in the way:
bool TreeSearch()
{
Node node = tree[0];
int nodeId = 0;
int statusStack[40];
int stackSize = 0;
statusStack[0] = 0;
while (stackSize < 10) // Changed to make it compile.
{
int shouldContinue = 1;
if (!(statusStack[stackSize] & LEFT_PROCESSED))
{
statusStack[stackSize] |= LEFT_PROCESSED;
++stackSize;
statusStack[stackSize] = 0;
nodeId = node.left;
node = tree[nodeId];
shouldContinue = 0;
}
if (shouldContinue &&
!(statusStack[stackSize] & RIGHT_PROCESSED))
{
statusStack[stackSize] |= RIGHT_PROCESSED; // this line
++stackSize;
statusStack[stackSize] = 0; // and this line
nodeId = node.right;
node = tree[nodeId];
shouldContinue = 0;
}
if (shouldContinue)
{
if (node.parent != -1)
{
--stackSize;
nodeId = node.parent;
node = tree[nodeId];
}
else
return false;
}
}
return false;
}
The disassembly output doesn't seem to lack any operation that is missing from the original snippet. This may have a overhead, though.
Link: http://shader-playground.timjones.io/6abdc64cdf98e1840a3b38c629b4e217
Related
Using Mplab ide 5.10 and xc8 compiler for the pic18f4550 I am unable to get the code to get into the interrupt function the goal is to get J to count up in the background until something trigger it to output a value in the lcd. Currently only the lcd display the first message and using ICD 3 the value of J does not change and does not look like the program runs the interrupt function at all
#define _XTAL_FREQ 48000000
#include <xc.h>
#include <stdio.h>
#include <stdlib.h>
#include "lcd.h"
unsigned char j, output = 0, i, outchar;
char buffer[2] = " ";
char Message[ ] = "Hands Position ";
void interrupt timer0_isr();
void lcd_write_cmd(unsigned char cmd);
void lcd_write_data(unsigned char data);
void lcd_strobe(void); // Generate the E pulse
void lcd_init(void);
void interrupt timer0_ISR() // Timer0 Interrupt Service Routine (ISR)
{
if (INTCONbits.TMR0IF) // TMR0IF:- Timer0 Overflow Interrupt Flag Bit
{
TMR0H = 0x48; // Timer0 start value = 0x48E5 for 1 second
TMR0L = 0xE5;
PORTCbits.RC1 = !PORTCbits.RC1; /* external timing check - toggle every 1ms */
if (j <= 4) { //limit up to 7
j++; // Increase count by 1
PORTB = j; // Output to Demultiplexer
} else {
j = 0; // Reset count aftwr it hit 7
PORTB = j; // Output to Demultiplexer
}
INTCONbits.TMR0IF = 0; // Reset TMR0IF at interrupt end
}
}
void main(void) // Main Function
{
ADCON1 = 0x0F;
CMCON = 0x07;
RCONbits.IPEN = 1; // Bit7 Interrupt Priority Enable Bit
INTCONbits.GIEH = 1; // Bit7 Global Interrupt Enable bit
INTCONbits.GIEL = 0; /* turn on low & high interrupts */
T0CONbits.TMR0ON = 1; // Turn on timer
T0CON = 0b00000111; // bit7:0 Stop Timer0
// bit6:0 Timer0 as 16 bit timer
// bit5:0 Clock source is internal
// bit4:0 Increment on lo to hi transition on TOCKI pin
// bit3:0 Prescaler output is assigned to Timer0
// bit2-bit0:111 1:256 prescaler
INTCON2 = 0b10000100; // bit7 :PORTB Pull-Up Enable bit
// 1 All PORTB pull-ups are disabled
// bit2 :TMR0 Overflow Int Priority Bit
// 1 High Priority
TMR0H = 0x48; // Initialising TMR0H
TMR0L = 0xE5; // Initialising TMR0L for 1 second interrupt
INTCONbits.TMR0IE = 1; // bit5 TMR0 Overflow Int Enable bit
INTCONbits.TMR0IF = 0; // bit2 TMR0 Overflow Int Flag bit
// 0 TMR0 register did not overflow
TRISC = 0; /* all outputs */
TRISAbits.TRISA5 = 1; // RA5 is the check for signal from input Multiplexer.
TRISAbits.TRISA0 = 0; // RA0, RA1 & RA2 output to arduino
TRISAbits.TRISA1 = 0;
TRISAbits.TRISA2 = 0;
TRISD = 0x00; // PortD connects to Demultiplexer
TRISB = 0;
lcd_init(); // LCD init
lcd_write_cmd(0x80); // Cursor set at line 1 positon 1
for (i = 0; i < 16; i++) {
outchar = Message[i]; // Store Message in outchar
lcd_write_data(outchar); // Display Message
}
__delay_ms(100);
PORTD = 0x00; // Clear PortD
PORTB = 0;
j = 0; // Start count from 0
while (1) // Main Process
{
if (PORTAbits.RA5 == 1) { // If RA3 detect a signal
switch (j) { // Switch case to determine hand position & output to RA0, RA1 & RA2 to transmit to arduino
case(0):
output = 10;
PORTAbits.RA0 = 0;
PORTAbits.RA1 = 0;
PORTAbits.RA2 = 0;
break;
case(1):
output = 20;
PORTAbits.RA0 = 1;
PORTAbits.RA1 = 0;
PORTAbits.RA2 = 0;
break;
case(2):
output = 30;
PORTAbits.RA0 = 0;
PORTAbits.RA1 = 1;
PORTAbits.RA2 = 0;
break;
case(3):
output = 40;
PORTAbits.RA0 = 1;
PORTAbits.RA1 = 1;
PORTAbits.RA2 = 0;
break;
case(4):
output = 50;
PORTAbits.RA0 = 0;
PORTAbits.RA1 = 0;
PORTAbits.RA2 = 1;
break;
}
lcd_write_cmd(0xC0); // Cursor set at line 2 positon 1
sprintf(buffer, "%d", output); // Convert numbers to character
for (i = 0; i < 2; i++)
lcd_write_data(buffer[i]); // Display Hand Position
}
}
}
Status bit_flags_set_flag(BIT_FLAGS hBit_flags, int flag_position) {
Bit_Flags* temp = (Bit_Flags*)hBit_flags;
int* nums;
int i;
int old_size;
if (temp->size < flag_position) {
nums = malloc(sizeof(int)*flag_position+1);
if (nums == NULL) {
return FAILURE;
}
for (i = 0; i < temp->size; i++) {
nums[i] = temp->data[i];
}
free(temp->data);
temp->data = nums;
old_size = temp->size;
temp->size = flag_position + 1;
for (i = old_size; i < temp->size; i++) {
temp->data[i] = 0;
}
}
temp->data[flag_position / 32] |= 1 << flag_position % 32;
return SUCCESS;
}
according to the debugger the error is from the free(temp->data) part. however. I only run into the error the second time I go through the function. any ideas what is happening here.
am getting a heap corruption error on visual studio.
I am writing on some assumptions like you are assuming int size is 32 bits and you are trying to set the bit at flag_position in the bitset and you are using 1 int for 1 bit for setting and unsetting bits
Few comments now
temp->data[flag_position / 32] |= 1 << flag_position % 32; now this doesn't make any sense, this line role is to set bit at flag_position, this should be temp->data[flag_position] = 1; instead because if you see your code your are using ints for each bit.
Also this line temp->size = flag_position + 1; is also incorrect , this should be temp->size = flag_position;
I am also working on the bootloader.
I had the problem in the following:
Once the cmd 'B' is received, later, 'F' is received, then I would start to call block load.
static void start_block_flash_load(uint16_t size, uint32_t *addr) {
uint16_t data_word;
uint8_t sreg = SREG;
uint16_t temp;
int i;
uint8_t my_size;
fprintf(lcdout, "B");
cli();
// Disable interrupts
(*addr) <<= 1;
if (size <= SPM_PAGESIZE) {
boot_page_erase(*addr);
boot_spm_busy_wait();
fprintf(lcdout, "%"PRIu16, size);
uint16_t i;
//store all values. PROBLEM here!!!
my_size = 208;
uint8_t buf[SPM_PAGESIZE] = { 0 };
for (i = 0; i < my_size; i++) {
//for (i=0; i<size; i++){
buf[i] = uart_getc();
// lcd_clear();
// lcd_setCursor(0, 2);
// fprintf(lcdout, "%3d", i);
// _delay_ms(500);
}
for (i = 0; i < my_size; i += 2) { //if size is odd, then use do-while
uint16_t w = buf[i];
w += buf[i + 1] << 8; //first one is low byte, second is high???
boot_page_fill((*addr)+i, w);
}
boot_page_write(*addr);
boot_spm_busy_wait();
(*addr) >>= 1;
uart_putc('\r');
} else
uart_putc('?');
boot_rww_enable ();
SREG = sreg;
}
I can see on the lcd that the size of the block is 256. However, when entering the loop to collect data, it will get stuck.
I tested with my_size and I found that only if my_size=208 the program will run further.
The strange thing is that if I put some statements inside the loop, e.g.
lcd_clear();
lcd_setCursor(0, 2);
then 'i' which I printed out on lcd will not go up to 140 something. I put different statements, the 'i' will give different value. That is very strange, since the uart_getc() will not lose data.
What I expect is that the loop will go up to 256. I cannot figure out what happened there.
Please help if you have any idea.
Thanks
I'm trying to get started on a project using a PIC18F24J10. Attempting to use SDCC for this. At this point I've reduced my code to simply trying to nail down what is happening, as I've been seeing bizarre behavior for a while now and can't really proceed until I figure out what is going on. Not sure if this is my only problem at this point, but I have no idea why this is happening. Timer interrupt fires off, coupled with a #defined for loop causes LEDs on PORTC to blink maybe twice a second. If I just do a PORTC=0xFF and PORTC=0 this works fine. But it gets weird when I try to go much beyond that.
...
#define _RC0 31
#define _RC1 32
#define _RC2 33
#define _RC3 34
#define _RC4 35
#define _RC5 36
#define _RC6 37
#define _RC7 38
#define HI 1
#define LO 0
void why(unsigned char p, int z)
{
if(z == HI)
{
if(p == _RC0) PORTCbits.RC0 = 1;
else if(p == _RC1) PORTCbits.RC1 = 1;
else if(p == _RC2) PORTCbits.RC2 = 1;
else if(p == _RC3) PORTCbits.RC3 = 1;
else if(p == _RC4) PORTCbits.RC4 = 1;
else if(p == _RC5) PORTCbits.RC5 = 1;
else if(p == _RC6) PORTCbits.RC6 = 1;
else if(p == _RC7) PORTCbits.RC7 = 1;
}
else if(z == LO)
{
PORTC = 0b11110000;
}
else
{
PORTC = 0b10101010;
}
}
void timer_isr (void) __interrupt(1) __using (1)
{
TMR0H=0;
TMR0L=0;
why(_RC0, LO);
why(_RC1, LO);
why(_RC2, LO);
WAIT_CYCLES(5000);
why(_RC0, HI);
why(_RC1, HI);
why(_RC2, HI);
WAIT_CYCLES(5000);
}
void main(void)
{
WDTCONbits.SWDTE = 0;
WDTCONbits.SWDTEN = 0;
TRISC = 0b00000000;
PORTC=0b00000000;
INTCONbits.GIE = 1;
INTCONbits.PEIE = 1;
INTCONbits.TMR0IF = 0;
INTCONbits.TMR0IE = 1;
T0CONbits.T08BIT = 0;
T0CONbits.T0CS = 0;
T0CONbits.PSA = 1;
TMR0H = 0;
TMR0L = 0;
T0CONbits.TMR0ON = 1;
while(1)
{
}
}
In the code above, four of the LEDs should blink, while the other four stay on. Instead, the LEDs stay on in the 10101010 pattern that happens in the "else" block, which should happen when "why" is called with any value other than HI or LO. I never call it with anything else, so why does it ever reach that?
UPDATE - Further reduction, no more interrupts or unspecified includes/defines. This is now the entirety of the program, and I am still seeing the same behavior. Changed the pattern from 10101010 to 10101011 so that I could verify the chip is actually being programmed with the new code, and it appears to be.
#include "pic16/pic18f24j10.h"
#define WAIT_CYCLES(A) for(__delay_cycle = 0;__delay_cycle < A;__delay_cycle++)
int __delay_cycle;
#define HI 1
#define LO 0
void why(int z)
{
if(z == HI)
{
PORTC = 0b11111111;
}
else if(z == LO)
{
PORTC = 0b11110000;
}
else
{
PORTC = 0b10101011;
}
}
void main(void)
{
TRISC = 0b00000000;
PORTC=0b00000000;
while(1)
{
why(LO);
WAIT_CYCLES(5000);
why(HI);
WAIT_CYCLES(5000);
}
}
Once the interrupt is asserted it is never cleared. That results in timer_isr() being called repeatedly. No other code is ever reached. The TMR0IF bit must be cleared in software by the Interrupt Service Routine.
Keep in mind you not only need to spend less time in the ISR than the period of the timer - it’s a good practice to spend the minimum amount of time necessary.
Remove the delays and simply toggle a flag or increment a register. In your main while (1) loop monitor the flag or counter and make your calls to why() from there.
i have this string:
12 4 the quick 99 -1 fox dog \
what i want in my program:
myArray[] = {12, 4, 99, -1};
how i do a multiple number scanning?
See my answer to your other question here. It's a relatively simple matter to replace the strtok section to recognize non-numeric words and neither increment the count (in the first pass) nor load them into the array (in the second pass).
The code has changed as follows:
Using an input file of:
12 3 45 6 7 8
3 5 6 7
7 0 -1 4 5
12 4 the quick 99 -1 fox dog \
it produces output along the lines of:
0x8e42170, size = 6:
12 3 45 6 7 8
0x8e421d0, size = 4:
3 5 6 7
0x8e421e0, size = 5:
7 0 -1 4 5
0x8e42278, size = 4:
12 4 99 -1
Here's the code that produced that output:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
// This is the linked list of integer arrays.
typedef struct _tIntArray {
int size;
int *array;
struct _tIntArray *next;
} tIntArray;
static tIntArray *first = NULL;
static tIntArray *last = NULL;
// Check that argument is numeric, optional minus sign followed by
// zero or more digits (you may want one or more).
static int isAllNumeric (char *word) {
char *s = word;
if (*s == '-')
s++;
for (; *s != '\0'; s++)
if ((*s < '0') || (*s > '9'))
return 0;
return 1;
}
// Add a line of integers as a node.
static int addNode (char *str) {
tIntArray *curr; // pointers for new integer array.
char *word; // word within string.
char *tmpStr; // temp copy of buffer.
int fldCnt; // field count for line.
int i;
// Count number of fields.
if ((tmpStr = strdup (str)) == NULL) {
printf ("Cannot allocate duplicate string (%d).\n", errno);
return 1;
}
fldCnt = 0;
for (word = strtok (tmpStr, " "); word; word = strtok (NULL, " "))
if (isAllNumeric (word))
fldCnt++;
free (tmpStr);
// Create new linked list node.
if ((curr = malloc (sizeof (tIntArray))) == NULL) {
printf ("Cannot allocate integer array node (%d).\n", errno);
return 1;
}
curr->size = fldCnt;
if ((curr->array = malloc (fldCnt * sizeof (int))) == NULL) {
printf ("Cannot allocate integer array (%d).\n", errno);
free (curr);
return 1;
}
curr->next = NULL;
for (i = 0, word = strtok (str, " "); word; word = strtok (NULL, " "))
if (isAllNumeric (word))
curr->array[i++] = atoi (word);
if (last == NULL)
first = last = curr;
else {
last->next = curr;
last = curr;
}
return 0;
}
int main(void) {
int lineSz; // current line size.
char *buff; // buffer to hold line.
FILE *fin; // input file handle.
long offset; // offset for re-allocating line buffer.
tIntArray *curr; // pointers for new integer array.
int i;
// Open file.
if ((fin = fopen ("qq.in", "r")) == NULL) {
printf ("Cannot open qq.in, errno = %d\n", errno);
return 1;
}
// Allocate initial line.
lineSz = 2;
if ((buff = malloc (lineSz+1)) == NULL) {
printf ("Cannot allocate initial memory, errno = %d.\n", errno);
return 1;
}
// Loop forever.
while (1) {
// Save offset in case we need to re-read.
offset = ftell (fin);
// Get line, exit if end of file.
if (fgets (buff, lineSz, fin) == NULL)
break;
// If no newline, assume buffer wasn't big enough.
if (buff[strlen(buff)-1] != '\n') {
// Get bigger buffer and seek back to line start and retry.
free (buff);
lineSz += 3;
if ((buff = malloc (lineSz+1)) == NULL) {
printf ("Cannot allocate extra memory, errno = %d.\n", errno);
return 1;
}
if (fseek (fin, offset, SEEK_SET) != 0) {
printf ("Cannot seek, errno = %d.\n", errno);
return 1;
}
continue;
}
// Remove newline and process.
buff[strlen(buff)-1] = '\0';
if (addNode (buff) != 0)
return 1;
}
// Dump table for debugging.
for (curr = first; curr != NULL; curr = curr->next) {
printf ("%p, size = %d:\n ", curr, curr->size);
for (i = 0; i < curr->size; i++)
printf (" %d", curr->array[i]);
printf ("\n");
}
// Free resources and exit.
free (buff);
fclose (fin);
return 0;
}