Looking for source code of __builtin_avr_delay_cycles called by _delay_ms in avr-gcc

Looking for source code of __builtin_avr_delay_cycles called by _delay_ms in avr-gcc - gcc

I was investigating the delay_ms function of avr-gcc. In delay.h I found its definition:
void _delay_ms(double __ms)
{
double __tmp ;
#if __HAS_DELAY_CYCLES && defined(__OPTIMIZE__) && \
!defined(__DELAY_BACKWARD_COMPATIBLE__) && \
__STDC_HOSTED__
uint32_t __ticks_dc;
extern void __builtin_avr_delay_cycles(unsigned long);
__tmp = ((F_CPU) / 1e3) * __ms;
#if defined(__DELAY_ROUND_DOWN__)
__ticks_dc = (uint32_t)fabs(__tmp);
#elif defined(__DELAY_ROUND_CLOSEST__)
__ticks_dc = (uint32_t)(fabs(__tmp)+0.5);
#else
//round up by default
__ticks_dc = (uint32_t)(ceil(fabs(__tmp)));
#endif
__builtin_avr_delay_cycles(__ticks_dc);
#else
...
}
I am interested in how the __builtin_avr_delay_cycles function looks like internally and where it is defined? Where can I find the source?

As said in my comment to this very question on electronics.SE:
Compiler builtins are kinda funky to find, always, because they are not just C functions, but things that get inserted while parsing/compiling the code (at various levels of abstraction from the textual representation of the code itself. compiler theory stuff). What you're looking for is the function avr_expand_builtin in the GCC source tree. There's a case AVR_BUILTIN_DELAY_CYCLES in there. Look for what happens there.
Which is:
/* Implement `TARGET_EXPAND_BUILTIN'. */
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient
(and in mode MODE if that's convenient).
SUBTARGET may be used as the target for computing one of EXP's operands.
IGNORE is nonzero if the value is to be ignored. */
static rtx
avr_expand_builtin (tree exp, rtx target,
rtx subtarget ATTRIBUTE_UNUSED,
machine_mode mode ATTRIBUTE_UNUSED,
int ignore)
{
tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
const char *bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
unsigned int id = DECL_FUNCTION_CODE (fndecl);
const struct avr_builtin_description *d = &avr_bdesc[id];
tree arg0;
rtx op0;
gcc_assert (id < AVR_BUILTIN_COUNT);
switch (id)
{
case AVR_BUILTIN_NOP:
emit_insn (gen_nopv (GEN_INT (1)));
return 0;
case AVR_BUILTIN_DELAY_CYCLES:
{
arg0 = CALL_EXPR_ARG (exp, 0);
op0 = expand_expr (arg0, NULL_RTX, VOIDmode, EXPAND_NORMAL);
if (!CONST_INT_P (op0))
error ("%s expects a compile time integer constant", bname);
else
avr_expand_delay_cycles (op0);
return NULL_RTX;
}
…
thus, the function you're looking for is avr_expand_delay_cycles in the same file:
static void
avr_expand_delay_cycles (rtx operands0)
{
unsigned HOST_WIDE_INT cycles = UINTVAL (operands0) & GET_MODE_MASK (SImode);
unsigned HOST_WIDE_INT cycles_used;
unsigned HOST_WIDE_INT loop_count;
if (IN_RANGE (cycles, 83886082, 0xFFFFFFFF))
{
loop_count = ((cycles - 9) / 6) + 1;
cycles_used = ((loop_count - 1) * 6) + 9;
emit_insn (gen_delay_cycles_4 (gen_int_mode (loop_count, SImode),
avr_mem_clobber()));
cycles -= cycles_used;
}
if (IN_RANGE (cycles, 262145, 83886081))
{
loop_count = ((cycles - 7) / 5) + 1;
if (loop_count > 0xFFFFFF)
loop_count = 0xFFFFFF;
cycles_used = ((loop_count - 1) * 5) + 7;
emit_insn (gen_delay_cycles_3 (gen_int_mode (loop_count, SImode),
avr_mem_clobber()));
cycles -= cycles_used;
}
if (IN_RANGE (cycles, 768, 262144))
{
loop_count = ((cycles - 5) / 4) + 1;
if (loop_count > 0xFFFF)
loop_count = 0xFFFF;
cycles_used = ((loop_count - 1) * 4) + 5;
emit_insn (gen_delay_cycles_2 (gen_int_mode (loop_count, HImode),
avr_mem_clobber()));
cycles -= cycles_used;
}
if (IN_RANGE (cycles, 6, 767))
{
loop_count = cycles / 3;
if (loop_count > 255)
loop_count = 255;
cycles_used = loop_count * 3;
emit_insn (gen_delay_cycles_1 (gen_int_mode (loop_count, QImode),
avr_mem_clobber()));
cycles -= cycles_used;
}
while (cycles >= 2)
{
emit_insn (gen_nopv (GEN_INT (2)));
cycles -= 2;
}
if (cycles == 1)
{
emit_insn (gen_nopv (GEN_INT (1)));
cycles--;
}
}
Of biggest interest here is that this modifies a node in the Abstract Syntax Tree, and emits instructions there.

Related

When declaring a static array as "private" before a parallel loop is perfectly equivalent to declaring the array inside the loop?

I've encountered a situation where the code generates different results in the case of having arrays defined inside the loop on index i (case #1) and in the case of declaring them outside the loop on the i index and using the clause private (case #2).
Case #2 generates the same results of the code running on CPU only.
Case #1
#pragma acc parallel loop
for (j = jbeg; j <= jend; j++){
#pragma acc loop
for (i = ibeg; i <= iend; i++){
double Rc[NFLX][NFLX];
double eta[NFLX], um[NFLX], dv[NFLX];
double lambda[NFLX], alambda[NFLX];
double fL[NFLX], fR[NFLX];
.
.
.
}
}}
Case #2
#pragma acc parallel loop
for (j = jbeg; j <= jend; j++){
double Rc[NFLX][NFLX];
double eta[NFLX], um[NFLX], dv[NFLX];
double lambda[NFLX], alambda[NFLX];
double fL[NFLX], fR[NFLX];
#pragma acc loop private(Rc[:NFLX][:NFLX], eta[:NFLX], \
um[:NFLX], lambda[:NFLX], alambda[:NFLX], \
dv[:NFLX], fL[:NFLX], fR[:NFLX])
for (i = ibeg; i <= iend; i++){
.
.
.
}
}}
I have the following values:
NFLX = 8;
jbeg = 3, jend = 258;
ibeg = 3, iend = 1026;
In which cases the two techniques are equivalent and when it is better to choose one over the other?
This is what I see with -Minfo=accel:
case #1:
71, Local memory used for Rc,dv,fR,um,lambda,alambda,fL,eta
case #2:
71, Local memory used for Rc,dv,fR,lambda,alambda,fL,eta
CUDA shared memory used for Rc,eta
Local memory used for um
CUDA shared memory used for um,lambda,alambda,dv,fL,fR
function:
/* ********************************************************************* */
void Roe_Solver (Data *d, timeStep *Dts, Grid *grid, RBox *box)
/*
* Solve the Riemann problem between L/R states using a
* Rusanov-Lax Friedrichs flux.
*********************************************************************** */
{
int i, j, k;
int ibeg = *(box->nbeg)-1, iend = *(box->nend);
int jbeg = *(box->tbeg), jend = *(box->tend);
int kbeg = *(box->bbeg), kend = *(box->bend);
int VXn = VX1, VXt = VX2, VXb = VX3;
int MXn = MX1, MXt = MX2, MXb = MX3;
int ni, nj;
double gmm = GAMMA_EOS;
double gmm1 = gmm - 1.0;
double gmm1_inv = 1.0/gmm1;
double delta = 1.e-7;
double delta_inv = 1.0/delta;
ARRAY_OFFSET (grid, ni, nj);
INDEX_CYCLE (grid->dir, VXn, VXt, VXb);
INDEX_CYCLE (grid->dir, MXn, MXt, MXb);
#pragma acc parallel loop collapse(2) present(d, Dts, grid)
for (k = kbeg; k <= kend; k++){
for (j = jbeg; j <= jend; j++){
long int offset = ni*(j + nj*k);
double * __restrict__ cmax = &Dts->cmax [offset];
double * __restrict__ SL = &d->sweep.SL [offset];
double * __restrict__ SR = &d->sweep.SR [offset];
double um[NFLX];
double fL[NFLX], fR[NFLX];
#pragma acc loop private(um[:NFLX], fL[:NFLX], fR[:NFLX])
for (i = ibeg; i <= iend; i++){
int nv;
double scrh, vel2;
double a2, a, h;
double alambda, lambda, eta;
double s, c, hl, hr;
double bmin, bmax, scrh1;
double pL, pR;
double * __restrict__ vL = d->sweep.vL [offset + i];
double * __restrict__ vR = d->sweep.vR [offset + i];
double * __restrict__ uL = d->sweep.uL [offset + i];
double * __restrict__ uR = d->sweep.uR [offset + i];
double * __restrict__ flux = d->sweep.flux[offset + i];
double a2L = SoundSpeed2 (vL);
double a2R = SoundSpeed2 (vR);
PrimToCons (vL, uL);
PrimToCons (vR, uR);
Flux (vL, uL, fL, grid->dir);
Flux (vR, uR, fR, grid->dir);
pL = vL[PRS];
pR = vR[PRS];
s = sqrt(vR[RHO]/vL[RHO]);
um[RHO] = vL[RHO]*s;
s = 1.0/(1.0 + s);
c = 1.0 - s;
um[VX1] = s*vL[VX1] + c*vR[VX1];
um[VX2] = s*vL[VX2] + c*vR[VX2];
um[VX3] = s*vL[VX3] + c*vR[VX3];
vel2 = um[VX1]*um[VX1] + um[VX2]*um[VX2] + um[VX3]*um[VX3];
hl = 0.5*(vL[VX1]*vL[VX1] + vL[VX2]*vL[VX2] + vL[VX3]*vL[VX3]);
hl += a2L*gmm1_inv;
hr = 0.5*(vR[VX1]*vR[VX1] + vR[VX2]*vR[VX2] + vR[VX3]*vR[VX3]);
hr += a2R*gmm1_inv;
h = s*hl + c*hr;
/* ----------------------------------------------------
1. the following should be equivalent to
scrh = dv[VX1]*dv[VX1] + dv[VX2]*dv[VX2] + dv[VX3]*dv[VX3];
a2 = s*a2L + c*a2R + 0.5*gmm1*s*c*scrh;
and therefore always positive.
---------------------------------------------------- */
a2 = gmm1*(h - 0.5*vel2);
a = sqrt(a2);
/* ----------------------------------------------------------------
2. define non-zero components of conservative eigenvectors Rc,
eigenvalues (lambda) and wave strenght eta = L.du
---------------------------------------------------------------- */
#pragma acc loop seq
NFLX_LOOP(nv) flux[nv] = 0.5*(fL[nv] + fR[nv]);
/* ---- (u - c_s) ---- */
SL[i] = um[VXn] - a;
/* ---- (u + c_s) ---- */
SR[i] = um[VXn] + a;
/* ---- get max eigenvalue ---- */
cmax[i] = fabs(um[VXn]) + a;
NFLX_LOOP(nv) flux[nv] = 0.5*(fL[nv] + fR[nv]) - 0.5*cmax[i]*(uR[nv] - uL[nv]);
#if DIMENSIONS > 1
/* ---------------------------------------------
3. use the HLL flux function if the interface
lies within a strong shock.
The effect of this switch is visible
in the Mach reflection test.
--------------------------------------------- */
scrh = fabs(vL[PRS] - vR[PRS]);
scrh /= MIN(vL[PRS],vR[PRS]);
if (scrh > 0.5 && (vR[VXn] < vL[VXn])){ /* -- tunable parameter -- */
bmin = MIN(0.0, SL[i]);
bmax = MAX(0.0, SR[i]);
scrh1 = 1.0/(bmax - bmin);
#pragma acc loop seq
for (nv = 0; nv < NFLX; nv++){
flux[nv] = bmin*bmax*(uR[nv] - uL[nv])
+ bmax*fL[nv] - bmin*fR[nv];
flux[nv] *= scrh1;
}
}
#endif /* DIMENSIONS > 1 */
} /* End loop on i */
}} /* End loop on j,k */
}

Technically they are equivalent, but in practice different. What's happening is that the compiler will hoist the declaration of these arrays outside of the loops. This is standard practice for the compiler and happens before the OpenACC directives are applied. What should happen is that then these arrays are implicitly privatized within the scoping unit they are declared. However the compiler doesn't currently track this so the arrays are implicitly copied into the compute region as shared arrays. If you add the flag "-Minfo=accel", you'll see the compiler feedback messages indicating the implicit copies.
I have an open issue report requesting this support, TPR #31360, however it's been a challenge to implement so not in a released compiler as of yet. Hence until/if we can fix the behavior, you'll need to manually hoist the declaration of these arrays and then add them to a "private" clause.

CUDA string search in large file, wrong result

I am working on simple naive string search in CUDA.
I am new in CUDA. It works fine fol smaller files ( aprox. ~1MB ). After I make these files bigger ( ctrl+a ctrl+c several times in notepad++ ), my program's results are higher ( about +1% ) than a
grep -o text file_name | wc -l
It is very simple function, so I don't know what could cause this. I need it to work with larger files ( ~500MB ).
Kernel code ( gpuCount is a __device__ int global variable ):
__global__ void stringSearchGpu(char *data, int dataLength, char *input, int inputLength){
int id = blockDim.x*blockIdx.x + threadIdx.x;
if (id < dataLength)
{
int fMatch = 1;
for (int j = 0; j < inputLength; j++)
{
if (data[id + j] != input[j]) fMatch = 0;
}
if (fMatch)
{
atomicAdd(&gpuCount, 1);
}
}
}
This is calling the kernel in main function:
int blocks = 1, threads = fileSize;
if (fileSize > 1024)
{
blocks = (fileSize / 1024) + 1;
threads = 1024;
}
clock_t cpu_start = clock();
// kernel call
stringSearchGpu<<<blocks, threads>>>(cudaBuffer, strlen(buffer), cudaInput, strlen(input));
cudaDeviceSynchronize();
After this I just copy the result to Host and print it.
Can anyone please help me with this?

First of all, you should always check return values of CUDA functions to check for errors. Best way to do so would be the following:
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
Wrap your CUDA calls, such as:
gpuErrchk(cudaDeviceSynchronize());
Second, your kernel accesses out of bounds memory. Suppose, dataLength=100, inputLength=7 and id=98. In your kernel code:
if (id < dataLength) // 98 is less than 100, so condition true
{
int fMatch = 1;
for (int j = 0; j < inputLength; j++) // j runs from [0 - 6]
{
// if j>1 then id+j>=100, which is out of bounds, illegal operation
if (data[id + j] != input[j]) fMatch = 0;
}
Change the condition to something like:
if (id < dataLength - inputLength)

Is it possible to do interactive user input and output simulation in VHDL or Verilog?

For example, I would like to run a simulation for an interactive game like: https://github.com/fabioperez/space-invaders-vhdl without an FPGA, such that:
signals are set by keyboard keys
outputs can be displayed on a window
http://www.nand2tetris.org/ does this, but is uses a simplified custom educational language for it.
VHDL's textio's read(input and write(output get somewhat close, but not quite:
read(input waits for a newline, we'd want something that can detect is a keyboard key is pressed or not
write(output: would need some way to flush data to ensure that the renderer that will emulate, say, a display gets it
we need some way to throttle simulation speed
Of course, I don't need to do everything in VHDL: I just need a minimal way to communicate with VHDL synchronously with other programs, and then I can do the e.g. display with SDL in C.
Also asked at: https://github.com/tgingold/ghdl/issues/92

Verilator
Verilator is a perfect solution for this application.
It exposes the Verilog simulation loop to C++ (and transpiles the Verilog to C++), allowing you to set inputs, and get outputs from C++.
See the CONNECTING TO C++ example from the docs: http://www.veripool.org/projects/verilator/wiki/Manual-verilator
So you can just plug that into SDL / ncurses / etc. without any IPC.
For a simulator independent solution, it might be worth looking into the foreign language APIs of VHDL (VHPI) / Verilog (DPI) as mentioned in this comment, but there are few examples of how to use those, and you'll have to worry about IPC.
Minimal runnable example:
A related project that implements nand2tetris in Verilator + SDL can be found at: https://hackaday.io/project/160865-nand2tetris-in-verilog-part3-verilator-and-sdl2
Install dependencies on Ubuntu 22.04:
sudo apt install libsdl2-dev verilator
Makefile
.POSIX:
.PHONY: all clean run
RUN ?= move
OUT_EXT ?= .out
VERILATOR_DIR = ./obj_dir/
all: $(VERILATOR_DIR)Vmove display$(OUT_EXT)
$(VERILATOR_DIR)Vmove: move.v move.cpp fps.hpp
verilator -Wall --cc move.v --exe move.cpp
make -C obj_dir -f Vmove.mk Vmove CXXFLAGS='--std=c++11 -Wall' LIBS='-lSDL2'
display$(OUT_EXT): display.cpp
g++ -o '$#' '$<' -lm -lSDL2
clean:
rm -rf obj_dir *'$(OUT_EXT)'
run: all
'$(VERILATOR_DIR)V$(RUN)'
move.v
module move(
input wire clock,
input wire reset,
input wire up,
input wire down,
input wire left,
input wire right,
output reg [1:0] x,
output reg [1:0] y
);
always # (posedge clock) begin
if (reset == 1'b1) begin
x <= 0;
y <= 0;
end
else begin
if (up == 1'b1) begin
y <= y - 1;
end
if (down == 1'b1) begin
y <= y + 1;
end
if (left == 1'b1) begin
x <= x - 1;
end
if (right == 1'b1) begin
x <= x + 1;
end
end
end
endmodule
move.cpp
const char *help = "asdw: move | q: quit";
#include <cmath>
#include <cstdlib>
#include <time.h>
#include <SDL2/SDL.h>
#include "Vmove.h"
#include "verilated.h"
#include "fps.hpp"
#define WINDOW_WIDTH 512
#define RECTS_PER_WINDOW (4)
#define RECT_WIDTH (WINDOW_WIDTH / RECTS_PER_WINDOW)
#define FASTEST_TICK_PERIOD_S (1.0 / 4.0)
int main(int argc, char **argv) {
SDL_Event event;
SDL_Renderer *renderer;
SDL_Window *window;
double current_time_s, last_tick_time_s;
unsigned int current_time, last_time;
const Uint8 *keystate;
Verilated::commandArgs(argc, argv);
Vmove *top = new Vmove;
SDL_Init(SDL_INIT_TIMER | SDL_INIT_VIDEO);
SDL_CreateWindowAndRenderer(WINDOW_WIDTH, WINDOW_WIDTH, 0, &window, &renderer);
SDL_SetWindowTitle(window, help);
fps_init();
top->clock = 0;
top->eval();
top->reset = 1;
top->clock = 1;
top->eval();
while (1) {
current_time = SDL_GetTicks();
current_time_s = current_time / 1000.0;
/* Deal with keyboard input. */
while (SDL_PollEvent(&event) == 1) {
if (event.type == SDL_QUIT) {
goto quit;
} else if (event.type == SDL_KEYDOWN) {
switch(event.key.keysym.sym) {
case SDLK_q:
goto quit;
default:
break;
}
}
}
keystate = SDL_GetKeyboardState(NULL);
if (keystate[SDL_SCANCODE_ESCAPE]) {
top->reset = 1;
}
if (keystate[SDL_SCANCODE_A]) {
top->left = 1;
}
if (keystate[SDL_SCANCODE_D]) {
top->right = 1;
}
if (keystate[SDL_SCANCODE_W]) {
top->up = 1;
}
if (keystate[SDL_SCANCODE_S]) {
top->down = 1;
}
if (current_time != last_time) {
if (current_time_s - last_tick_time_s > FASTEST_TICK_PERIOD_S) {
/* Draw world. */
SDL_SetRenderDrawColor(renderer, 0, 0, 0, 0);
SDL_RenderClear(renderer);
{
SDL_Rect rect;
rect.w = RECT_WIDTH;
rect.h = RECT_WIDTH;
rect.x = top->x * RECT_WIDTH;
rect.y = top->y * RECT_WIDTH;
SDL_SetRenderDrawColor(renderer, 255, 0, 0, 255);
SDL_RenderFillRect(renderer, &rect);
}
SDL_RenderPresent(renderer);
top->clock = 0;
top->eval();
top->clock = 1;
top->eval();
top->up = 0;
top->down = 0;
top->left = 0;
top->right = 0;
top->reset = 0;
/* Update time tracking. */
last_tick_time_s = current_time_s;
fps_update_and_print();
}
}
last_time = current_time;
}
quit:
top->final();
delete top;
SDL_DestroyRenderer(renderer);
SDL_DestroyWindow(window);
SDL_Quit();
return EXIT_SUCCESS;
}
display.cpp
/*
Test a simple virtual SDL display, without user input.
*/
#include <cstdlib>
#include <cmath>
#include <iostream>
#include <SDL2/SDL.h>
#define WINDOW_WIDTH 600
#define WINDOW_HEIGHT (WINDOW_WIDTH)
#define N_PIXELS_WIDTH 10
#define N_PIXELS_HEIGHT (N_PIXELS_WIDTH)
#define N_PIXELS (N_PIXELS_WIDTH * N_PIXELS_HEIGHT)
#define PIXEL_WIDTH (WINDOW_WIDTH / N_PIXELS_WIDTH)
#define PIXEL_HEIGHT (WINDOW_HEIGHT / N_PIXELS_HEIGHT)
#define MAX_COLOR 255
#define PI2 (2*(acos(-1.0)))
#define FREQ (0.05)
int main(int argc, char **argv, char **env) {
SDL_Event event;
SDL_Rect rect;
SDL_Renderer *renderer;
SDL_Window *window;
const unsigned int max_color_half = MAX_COLOR / 2;
int quit;
double current_time_s;
size_t cur, i , j;
unsigned int
bs[N_PIXELS],
current_time,
gs[N_PIXELS],
last_time,
rs[N_PIXELS],
val
;
quit = 0;
SDL_Init(SDL_INIT_TIMER | SDL_INIT_VIDEO);
SDL_CreateWindowAndRenderer(WINDOW_WIDTH, WINDOW_WIDTH, 0, &window, &renderer);
rect.w = PIXEL_WIDTH;
rect.h = PIXEL_HEIGHT;
last_time = SDL_GetTicks();
while (!quit) {
while (SDL_PollEvent(&event) == 1) {
if (event.type == SDL_QUIT) {
quit = 1;
}
}
current_time = SDL_GetTicks();
if (current_time != last_time) {
for (i = 0; i < N_PIXELS_WIDTH; ++i) {
for (j = 0; j < N_PIXELS_WIDTH; ++j) {
cur = j * N_PIXELS_WIDTH + i;
val = (1 + i) * (1 + j) * PI2 * FREQ * current_time / 1000.0;
rs[cur] = max_color_half * (1.0 + std::sin(1 * val));
gs[cur] = max_color_half * (1.0 + std::sin(2 * val));
bs[cur] = max_color_half * (1.0 + std::sin(3 * val));
}
}
}
for (i = 0; i < N_PIXELS_WIDTH; ++i) {
for (j = 0; j < N_PIXELS_WIDTH; ++j) {
cur = j *N_PIXELS_WIDTH + i;
SDL_SetRenderDrawColor(renderer, rs[cur], gs[cur], bs[cur], 255);
rect.x = i * PIXEL_WIDTH;
rect.y = j * PIXEL_HEIGHT;
SDL_RenderFillRect(renderer, &rect);
}
}
SDL_RenderPresent(renderer);
}
SDL_DestroyRenderer(renderer);
SDL_DestroyWindow(window);
SDL_Quit();
return EXIT_SUCCESS;
}
GitHub upstream.

Connectal connects software running on actual CPUs to RTL (BSV, which can link to VHDL and Verilog) on FPGAs or simulators. BSV is free for academic and research use and for open source projects. In any case, Connectal is open source and the software to simulator connection uses SystemVerilog DPI, which you could use in your project without using BSV.
Connectal has one example that displays the output from the FGPA/simulator on a display. It uses Qt to display on the computer monitor when simulating. From an FPGA it displays directly on an HDMI display.
CPUs simulated in Verilog or VHDL tend to be too slow for interactive use, but I have connected a CPU simulated with qemu to devices or accelerators in verilator or on FPGA. Performance of qemu is quite good. I think it would work for your purposes.
I added a plugin FgpaOps API so that the simulator or FPGA could handle CPU load/store instructions:
struct FpgaOps {
uint64_t (*read)(hwaddr addr);
void (*write)(hwaddr addr, uint64_t value);
void (*close)(void);
void *(*alloc_mem)(size_t size);
};
In my case, I used connectal to implement the FpgaOps plugin. This code is under hw/riscv but is not specific to riscv, so it could be used with any processor architecture supported by qemu.

No need for anything too clever or customised to interact with your Sim providing you're willing to simulate a real hardware interface like a UART.
The Verilog simuation of my TTL CPU includes a (reasonably) accurate model of a UM245R UART and the UART supports interactive IO via the verilog file interface.
One file for input and the other for output; bidirectional.
I use this for interacting with the simualated hardware so that I can develop the software and test it via automated test without having to mess with the hardware.
I even have CHIP8 games running on the simulated hardware and the CHIP8 GUI is drawn by sending control codes from the UART back to a graphics terminal.
The UART is here ...
https://github.com/Johnlon/spam-1/blob/master/verilog/uart/um245r.v
At some point I'll do a write up on it.

OCR algorithm (GOCR) to 32F429IDISCOVERY board

I'm trying to implement an OCR algorithm (GOCR algorithm specifically) to 32F429IDISCOVERY board and I'm still getting nothing back...
I'm recording a image from OV7670 camera in RGB565 format to SDRAM of the board that is then converted to greyscale and passed to the algorithm itself.
From this and other forums I got the impression that GOCR is very good algorithm and it seemed to be working very well on PC but I just cant get it to work on the board.
Does anyone have some experience with implementing OCR or GOCR? I am not sure where the problem is because it beaves in a very wierd way. The code stops in different part of the algorithm almost every time...
Calling the OCR algorithm:
void ocr_algorithm(char *output_str) {
job_t job1, *job; /* fixme, dont want global variables for lib */
job=OCR_JOB=&job1;
int linecounter;
const char *line;
uint8_t r,g,b;
uint32_t n,i,buffer;
char *p_pic;
uint32_t *image = (uint32_t*) SDRAM_START_ADR;
setvbuf(stdout, (char *) NULL, _IONBF, 0); /* not buffered */
job_init(job); /* init cfg and db */
job_init_image(job); /* single image */
p_pic = malloc(IMG_ROWS*IMG_COLUMNS);
// Converting RGB565 to grayscale
i=0;
for (n = 0; n < IMG_ROWS*IMG_COLUMNS; n++) {
if (n % 2 == 0){
buffer = image[i] & 0xFFFF;
}
else{
buffer = (image[i] >> 16) & 0xFFFF;
i++;
}
r = (uint8_t) ((buffer >> 11) & 0x1F);
g = (uint8_t) ((buffer >> 5) & 0x3F);
b = (uint8_t) (buffer & 0x1F);
// RGB888
r = ((r * 527) + 23) >> 6;
g = ((g * 259) + 33) >> 6;
b = ((b * 527) + 23) >> 6;
// Greyscale
p_pic[n] = 0.299*r + 0.587*g + 0.114*b;
}
//read_picture;
job->src.p.p = p_pic;
job->src.p.x = IMG_ROWS;
job->src.p.y = IMG_COLUMNS;
job->src.p.bpp = 1;
/* call main loop */
pgm2asc(job);
//print output
strcpy(output_str, "");
linecounter = 0;
line = getTextLine(&(job->res.linelist), linecounter++);
while (line) {
strcat(output_str, line);
strcat(output_str, "\n");
line = getTextLine(&(job->res.linelist), linecounter++);
}
free_textlines(&(job->res.linelist));
job_free_image(job);
free(p_pic);
}

Monochrome Bitmap SetPixel/GetPixel problems... Win32 C Code

This is some of my bitmask code (monochrome bitmaps). There is no problem with the Bitmask_Create() function. I have tested it with opening, loading and saving windows monochrome bitmaps, and it works great. However, the GetPixel and SetPixel functions I've made don't seem to work right. In some instances they seem to work fine depending on the bitmap dimensions.
If anyone could help, I would appreciate it. It's driving me insane.
Thanks.
typedef struct _GL_BITMASK GL_BITMASK;
struct _GL_BITMASK {
int nWidth; // Width in pixels
int nHeight; // Height in pixels
int nPitch; // Width of scanline in bytes (may have extra padding to align to DWORD)
BYTE *pData; // Pointer to the first byte of the first scanline (top down)
};
int BitMask_GetPixel(GL_BITMASK *pBitMask, int x, int y)
{
INT nElement = ((y * pBitMask->nPitch) + (x / 8));
PBYTE pElement = pBitMask->pData + nElement;
BYTE bMask = 1 << (7 - (x % 8));
return *pElement & bMask;
}
void BitMask_SetPixel(GL_BITMASK *pBitMask, int x, int y, int nPixelColor)
{
INT nElement = x / 8;
INT nScanLineOffset = y * pBitMask->nPitch;
PBYTE pElement = pBitMask->pData + nScanLineOffset + nElement;
BYTE bMask = 1 << (7 - (x % 8));
if(*pElement & bMask)
{
if(!nPixelColor) return;
else *pElement ^= bMask;
}
else
{
if(nPixelColor) return;
else *pElement |= bMask;
}
}
GL_BITMASK *BitMask_Create(INT nWidth, INT nHeight)
{
GL_BITMASK *pBitMask;
int nPitch;
nPitch = ((nWidth / 8) + 3) & ~3;
pBitMask = (GL_BITMASK *)GlobalAlloc(GMEM_FIXED, (nPitch * nHeight) + sizeof(GL_BITMASK));
if(!pBitMask)
return (GL_BITMASK *)NULL;
pBitMask->nPitch = nPitch;
pBitMask->nWidth = nWidth;
pBitMask->nHeight = nHeight;
pBitMask->pData = (PBYTE)pBitMask + sizeof(GL_BITMASK);
return pBitMask;
}

I think your formula for calculating pitch is just a little bit off. It works when the width is a multiple of 8, but not otherwise. Try:
nPitch = ((nWidth + 31) / 8) & ~3;

I figured it out... I had the two tests backwards for nPixelColor in SetPixel()
if(*pElement & bMask)
{
if(nPixelColor) return; // this was !nPixelColor
else *pElement ^= bMask;
}
else
{
if(!nPixelColor) return; // this was nPixelColor
else *pElement |= bMask;
}

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

Looking for source code of __builtin_avr_delay_cycles called by _delay_ms in avr-gcc - gcc

Related

When declaring a static array as "private" before a parallel loop is perfectly equivalent to declaring the array inside the loop?

CUDA string search in large file, wrong result

Is it possible to do interactive user input and output simulation in VHDL or Verilog?

OCR algorithm (GOCR) to 32F429IDISCOVERY board

Monochrome Bitmap SetPixel/GetPixel problems... Win32 C Code

Categories

Resources