When I call VirtualAlloc with MEM_RESERVE | MEM_COMMIT under Windows, the committed pages are not backed by physical pages immediately; instead the pages are first subtracted from the paging file and then mapped on demand as soon as a page is accessed for the first time (I've measured delays of >= 1,000 clock cycles for this mapping).
But what about Linux? With overcommitting enabled it seems obvious to me that the physical pages get assigned immediately. But what happens when I switch off overcommitting and call mmap()? Will the system just subtract the necessary space from the paging file/partition, to guarantee backing store in case immediate physical assignment would fail? I.e. are the pages then allocated on demand like under Windows?
Ok, so I wrote a little program that scans freshly mmap()-allocated pages
// Measures the average time per page of touching a freshly allocated 8 GiB
// region twice. The first pass includes whatever work the OS does on first
// access to each page; the second pass touches pages that were already
// accessed once.
#if defined(_MSC_VER)
    #include <Windows.h>
#elif defined(__unix__)
    #include <sys/mman.h>
#endif
#include <iostream>
#include <chrono>
#include <atomic>
#include <cstdint>
#include <cstdlib>

using namespace std;
using namespace chrono;

int main()
{
    // N_PAGES assumes 4 KiB pages (SIZE >> 12), matching the 0x1000 stride
    // used by scan() below.
    static size_t const SIZE = (size_t)8 << 30, N_PAGES = SIZE >> 12;
#if defined(_MSC_VER)
    void *p = VirtualAlloc( nullptr, SIZE, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE );
    if( !p ) // VirtualAlloc reports failure by returning NULL
        return EXIT_FAILURE;
#elif defined(__unix__)
    // Anonymous mappings have no backing file; pass -1 as the descriptor,
    // which is what portable code is expected to do with MAP_ANONYMOUS.
    void *p = mmap( nullptr, SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0 );
    if( p == MAP_FAILED ) // mmap reports failure with MAP_FAILED, not NULL
        return EXIT_FAILURE;
#endif
    // Writes one byte into every page of the region and returns the average
    // time per page in nanoseconds. The relaxed atomic store keeps the
    // compiler from eliding the otherwise dead writes.
    auto scan = [&]()
    {
        auto start = high_resolution_clock::now();
        for( atomic<char> *pc = (atomic<char> *)p, *pcEnd = pc + SIZE; pc < pcEnd; pc->store( 0, memory_order_relaxed ), pc += 0x1000 );
        return (double)(int64_t)duration_cast<nanoseconds>( high_resolution_clock::now() - start ).count() / N_PAGES;
    };
    cout << scan() << endl; // first touch of every page
    cout << scan() << endl; // pages already touched once
}
The first scan() takes about 1,800 ns per page on my Linux Ryzen 7 1800X PC; the second takes about 60 ns. My Ubuntu machine has overcommitting enabled, so this would surely apply with overcommitting disabled as well. So Linux behaves the same as Windows here. My Windows PC with a Ryzen Threadripper 3990X is significantly faster here; the first scan() takes about 700 ns per page.
I am trying to create a shared memory segment that will hold an initial set of data, but will need to be extended at some later time when another block of data becomes available. I've read several posts here and tried a few things, but I seem to be missing some magic and could use some advice.
Here is a test program that illustrates the problem I'm encountering:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <errno.h>
#include <signal.h>
#include <sys/mman.h>
#include <fcntl.h>
/* Test program: maps the first 64 KiB of a file at addr, stores a value,
 * grows the file to 256 KiB, maps the added part, stores a second value and
 * prints both.
 * NOTE(review): the call that assigns addr and the definition of path are
 * not visible in this excerpt; return values of open/ftruncate/mmap are not
 * checked here. */
int main(int argc, char **argv)
void *addr, *resaddr;
int *iarray;
int fd;
/* reserve a large address space - note that this doesn't ALLOCATE
* anything. It simply tells the OS to reserve this much space
* for us to eventually/potentially use. For this test case, we
* will ask for 1GByte of space. We include the MAP_NORESERVE
* flag to indicate that we don't want anything allocated, but
* people report that this flag is ignored as PROT_NONE is
* apparently sufficient for that purpose. */
fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0660);
fprintf(stderr, "GOT ADDRESS %p\n", addr);
iarray = (int*)addr;
/* now get resources for a piece of that file */
ftruncate(fd, 1U << 16);
/* map file bytes [0, 64 KiB) at addr; MAP_FIXED replaces whatever was
 * mapped there before */
resaddr = mmap(addr, 1U << 16, PROT_WRITE | PROT_READ, MAP_FIXED | MAP_SHARED, fd, 0);
fprintf(stderr, "RESERVED ADDRESS %p\n", resaddr);
/* put something into it */
iarray[1024] = 1;
fprintf(stderr, "iaddr[1024]: %d\n", iarray[1024]);
/* increase the size */
ftruncate(fd, 1U << 18);
/* NOTE: this maps file bytes [64 KiB, 256 KiB) at addr again rather than at
 * addr + 64 KiB. Because of MAP_FIXED it replaces the mapping created above,
 * so iarray[1024] now refers to a location inside the newly added (zeroed)
 * part of the file instead of the value stored earlier. The last argument is
 * the offset in the file, not in memory. */
resaddr = mmap(addr, (1U << 18) - (1U << 16), PROT_WRITE | PROT_READ, MAP_FIXED | MAP_SHARED, fd, 1U << 16);
fprintf(stderr, "EXTENDED RESERVED ADDRESS %p\n", resaddr);
/* add something to that region */
iarray[31000] = 2;
fprintf(stderr, "\tiarray[1024]: %d\n\tiarray[31000]: %d\n", iarray[1024], iarray[31000]);
Running this yields the following result:
GOT ADDRESS 0x111396000
iaddr[1024]: 1
iarray[1024]: 0
iarray[31000]: 2
Things work as I expected, with the sole caveat being the loss of the initial stored data (apparently reset to zero). Does anyone have a suggestion of what I should do differently to have that initial stored data retained?
In my understanding, the third invocation of mmap() is incorrect.
resaddr = mmap(addr, (1U << 18) - (1U << 16), PROT_WRITE | PROT_READ, MAP_FIXED | MAP_SHARED, fd, 1U << 16);
should be replaced with
resaddr = mmap(addr + (1U << 16), (1U << 18) - (1U << 16), PROT_WRITE | PROT_READ, MAP_FIXED | MAP_SHARED, fd, 1U << 16);
(the offset argument is the offset in the file, not in memory)
While using cudaMallocManaged() to allocate an array of structs with arrays inside, I'm getting the error "out of memory" even though I have enough free memory. Here's some code that replicates my problem:
#include <iostream>
#include <cuda.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
if (code != cudaSuccess)
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
#define N 100000
#define ARR_SZ 100
struct Struct
float* arr;
int main()
Struct* struct_arr;
gpuErrchk( cudaMallocManaged((void**)&struct_arr, sizeof(Struct)*N) );
for(int i = 0; i < N; ++i)
gpuErrchk( cudaMallocManaged((void**)&(struct_arr[i].arr), sizeof(float)*ARR_SZ) ); //out of memory...
for(int i = 0; i < N; ++i)
/*float* f;
gpuErrchk( cudaMallocManaged((void**)&f, sizeof(float)*N*ARR_SZ) ); //this works ok
return 0;
There doesn't seem to be a problem when I call cudaMallocManaged() once to allocate a single chunk of memory, as I'm showing in the last piece of commented code.
I have a GeForce GTX 1070 Ti, and I'm using Windows 10. A friend tried to compile the same code in a PC with Linux and it worked correctly, while it had the same issue in another PC with Windows 10. WDDM TDR is deactivated.
Any help would be appreciated. Thanks.
There is an allocation granularity.
This means that if you ask for 1 byte, or 400 bytes, what is actually used up is something like 65536 bytes (I originally estimated 4096 bytes; see the measurement below). So a bunch of very small allocations will actually use up memory at a much faster rate than what you would predict based on the requested allocation size. The solution is to not make very small allocations, but instead to allocate in larger chunks.
An alternative strategy here would also be to flatten your allocation, and carve out pieces from it for each of your arrays:
#include <iostream>
#include <cstdio>
#include <cstdlib>

// Wraps a CUDA runtime call and reports any error together with the source
// location of the call.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Prints the error string for a failed CUDA call to stderr and, when abort is
// true, terminates the process using the error code as exit status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

#define N 100000     // number of structs
#define ARR_SZ 100   // floats per struct

struct Struct
{
    float* arr;
};

// Allocates all per-struct arrays as ONE managed allocation and hands each
// struct a pointer to its own ARR_SZ-float slice, instead of performing N
// separate small allocations.
int main()
{
    Struct* struct_arr;
    float* f;

    gpuErrchk( cudaMallocManaged((void**)&struct_arr, sizeof(Struct)*N) );
    gpuErrchk( cudaMallocManaged((void**)&f, sizeof(float)*N*ARR_SZ) );

    // Slice i starts ARR_SZ*i floats into the flat buffer.
    for(int i = 0; i < N; ++i)
        struct_arr[i].arr = f+i*ARR_SZ;

    // The slices are not separate allocations: release the two buffers only.
    gpuErrchk( cudaFree(struct_arr) );
    gpuErrchk( cudaFree(f) );

    return 0;
}
ARR_SZ divisible by 4 means the various created pointers can also be up-cast to larger vector types e.g. float2 or float4, if your use had any intention of doing that.
A possible reason the original code works on linux is because managed memory on linux, in a proper setup, can oversubscribe the GPU physical memory. The result is the actual allocation limit is much higher than what the GPU on-board memory would suggest. It might also be that the linux case has a bit more free memory, or perhaps the allocation granularity on linux is different (smaller).
Based on a question in the comments, I decided to estimate the allocation granularity, using this code:
#include <iostream>
#include <cstdio>
#include <cstdlib>

// Wraps a CUDA runtime call and reports any error together with the source
// location of the call.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Prints the error string for a failed CUDA call to stderr and, when abort is
// true, terminates the process using the error code as exit status.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

#define N 100000     // number of small allocations
#define ARR_SZ 100   // floats per allocation

struct Struct
{
    float* arr;
};

// Prints the free/total device memory reported by cudaMemGetInfo before and
// after N small managed allocations, so the memory consumed per allocation
// can be computed from the difference of the two "Free" values.
int main()
{
    Struct* struct_arr;
    //float* f;

    gpuErrchk(cudaMallocManaged((void**)& struct_arr, sizeof(Struct) * N));
#if 0
    // Disabled: the flattened single-allocation variant (needs `f` above).
    gpuErrchk(cudaMallocManaged((void**)& f, sizeof(float) * N * ARR_SZ));
    for (int i = 0; i < N; ++i)
        struct_arr[i].arr = f + i * ARR_SZ;
#else
    size_t fre, tot;
    gpuErrchk(cudaMemGetInfo(&fre, &tot));
    std::cout << "Free: " << fre << " total: " << tot << std::endl;
    for (int i = 0; i < N; ++i)
        gpuErrchk(cudaMallocManaged((void**) & (struct_arr[i].arr), sizeof(float) * ARR_SZ));
    gpuErrchk(cudaMemGetInfo(&fre, &tot));
    std::cout << "Free: " << fre << " total: " << tot << std::endl;
    // Release every small allocation again.
    for (int i = 0; i < N; ++i)
        gpuErrchk(cudaFree(struct_arr[i].arr));
#endif
    gpuErrchk(cudaFree(struct_arr));
    return 0;
}
When I compile a debug project with that code, and run that on a windows 10 desktop with RTX 2070 GPU (8GB memory, same as GTX 1070 Ti) I get the following output:
Microsoft Windows [Version 10.0.17763.973]
(c) 2018 Microsoft Corporation. All rights reserved.
C:\Users\Robert Crovella>cd C:\Users\Robert Crovella\source\repos\test12\x64\Debug
C:\Users\Robert Crovella\source\repos\test12\x64\Debug>test12
Free: 7069866393 total: 8589934592
Free: 516266393 total: 8589934592
C:\Users\Robert Crovella\source\repos\test12\x64\Debug>test12
Free: 7069866393 total: 8589934592
Free: 516266393 total: 8589934592
C:\Users\Robert Crovella\source\repos\test12\x64\Debug>
Note that on my machine there is only 0.5GB of reported free memory left after the 100,000 allocations. So if for any reason your 8GB GPU starts out with less free memory (entirely possible) you may run into an out-of-memory error, even though I did not.
The calculation of the allocation granularity is as follows:
(7069866393 - 516266393) / 100000 = 65536 bytes per allocation(!)
So my previous estimate of 4096 bytes per allocation was way off, by at least 1 order of magnitude, on my machine/test setup.
The allocation granularity may vary based on:
windows or linux
x86 or Power9
managed vs ordinary cudaMalloc
possibly other factors (e.g. CUDA version)
so my advice to future readers would be not to assume that it is always 65536 bytes per allocation, minimum.
Suppose we have;
struct collapsed {
char **seq;
int num;
__device__ *collapsed xdev;
collapsed *x_dev
cudaGetSymbolAddress((void **)&x_dev, xdev);
cudaMemcpyToSymbol(x_dev, x, sizeof(collapsed)*size); //x already defined collapsed * , this line gives ERROR
Why do you think I am getting the error "invalid device symbol" at the last line?
The first problem here is that x_dev isn't a device symbol. It might contain an address in device memory, but that address cannot be passed to cudaMemcpyToSymbol. The call should just be:
cudaMemcpyToSymbol(xdev, ......);
Which brings up the second problem. Doing this:
cudaMemcpyToSymbol(xdev, x, sizeof(collapsed)*size);
would be illegal. xdev is a pointer, so the only valid value you can copy to xdev is a device address. If x is the address of a struct collapsed in device memory, then the only valid version of this memory transfer operation is
cudaMemcpyToSymbol(xdev, &x, sizeof(collapsed *));
i.e. x must previously have been set to the address of memory allocated on the device, something like
collapsed *x;
cudaMalloc((void **)&x, sizeof(collapsed)*size);
cudaMemcpy(x, host_src, sizeof(collapsed)*size, cudaMemcpyHostToDevice);
As promised, here is a complete working example. First the code:
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>
struct collapsed {
char **seq;
int num;
__device__ collapsed xdev;
// Fills each sequence referenced by the device symbol xdev with one ASCII
// character: thread t writes item_sz copies of the character 0x30 + t into
// xdev.seq[t].
// Only threadIdx.x is used for indexing, so the kernel is meant to be
// launched as a single 1-D block with at least xdev.num threads; threads with
// threadIdx.x >= xdev.num do nothing.
__global__
void kernel(const size_t item_sz)
{
    if (threadIdx.x < xdev.num) {
        char *p = xdev.seq[threadIdx.x];
        char val = 0x30 + threadIdx.x;
        for(size_t i=0; i<item_sz; i++) {
            p[i] = val;
        }
    }
}
// Wraps a CUDA runtime call and reports any error together with the source
// location of the call.
#define gpuQ(ans) { gpu_assert((ans), __FILE__, __LINE__); }

// Writes the CUDA error string plus file and line to std::cerr when code is
// not cudaSuccess. As written here it only reports; it does not terminate.
void gpu_assert(cudaError_t code, const char *file, const int line)
{
    if (code != cudaSuccess) {
        std::cerr << "gpu_assert: " << cudaGetErrorString(code) << " "
                  << file << " " << line << std::endl;
    }
}
// Builds a `collapsed` structure whose pointers all refer to device memory,
// copies it into the __device__ symbol xdev, runs the kernel that fills every
// sequence, and prints the sequences after copying them back to the host.
int main(void)
{
    const int nitems = 32;
    const size_t item_sz = 16;
    const size_t buf_sz = size_t(nitems) * item_sz;

    // Gpu memory for sequences (one allocation shared by all sequences),
    // pre-filled with 'z' (0x7a) so untouched bytes are recognisable
    char *_buf;
    gpuQ( cudaMalloc((void **)&_buf, buf_sz) );
    gpuQ( cudaMemset(_buf, 0x7a, buf_sz) );

    // Host array for holding sequence device pointers
    char **seq = new char*[nitems];
    size_t offset = 0;
    for(int i=0; i<nitems; i++, offset += item_sz) {
        seq[i] = _buf + offset;
    }

    // Device array holding sequence pointers
    char **_seq;
    size_t seq_sz = sizeof(char*) * size_t(nitems);
    gpuQ( cudaMalloc((void **)&_seq, seq_sz) );
    gpuQ( cudaMemcpy(_seq, seq, seq_sz, cudaMemcpyHostToDevice) );

    // Host copy of the xdev structure to copy to the device
    collapsed xdev_host;
    xdev_host.num = nitems;
    xdev_host.seq = _seq;

    // Copy to device symbol
    gpuQ( cudaMemcpyToSymbol(xdev, &xdev_host, sizeof(collapsed)) );

    // Run Kernel: one block, one thread per sequence
    kernel<<<1, nitems>>>(item_sz);
    gpuQ( cudaGetLastError() );        // launch-configuration errors
    gpuQ( cudaDeviceSynchronize() );   // errors raised while the kernel ran

    // Copy back buffer
    char *buf = new char[buf_sz];
    gpuQ( cudaMemcpy(buf, _buf, buf_sz, cudaMemcpyDeviceToHost) );

    // Print out seq values
    // Each string should be ASCII starting from '0' (0x30)
    char *seq_vals = buf;
    for(int i=0; i<nitems; i++, seq_vals += item_sz) {
        std::string s;
        s.append(seq_vals, item_sz);
        std::cout << s << std::endl;
    }

    // Release host and device resources
    delete[] buf;
    delete[] seq;
    gpuQ( cudaFree(_seq) );
    gpuQ( cudaFree(_buf) );

    return 0;
}
and here it is compiled and run:
$ /usr/local/cuda/bin/nvcc -arch=sm_12 -Xptxas=-v -g -G -o erogol erogol.cu
./erogol.cu(19): Warning: Cannot tell what pointer points to, assuming global memory space
ptxas info : 8 bytes gmem, 4 bytes cmem[14]
ptxas info : Compiling entry function '_Z6kernelm' for 'sm_12'
ptxas info : Used 5 registers, 20 bytes smem, 4 bytes cmem[1]
$ /usr/local/cuda/bin/cuda-memcheck ./erogol
========= ERROR SUMMARY: 0 errors
Some notes:
To simplify things a bit, I have only used a single memory allocation _buf to hold all of the string data. Each value of seq is set to a different address within _buf. This is functionally equivalent to running a separate cudaMalloc call for each pointer, but much faster.
The key concept is to assemble a copy of the structure you wish to access on the device in host memory, then copy that to the device. All of the pointers in my xdev_host are device pointers. The CUDA API doesn't have any sort of deep copy or automatic pointer translation facility, so it is the programmer's responsibility to make sure this is correct.
Each thread in the kernel just fills its sequence with a different ASCII character. Note that I have declared my xdev as a structure, rather than pointer to structure and copy values rather than a reference to the __device__ symbol (again to simplify things slightly). But otherwise the sequence of operations is what you would need to make your design pattern work.
Because I only have access to a compute 1.x device, the compiler issues a warning. On compute 2.x and 3.x devices this won't happen because of the improved memory model in those devices. The warning is normal and can be safely ignored.
Because each sequence is just written into a different part of _buf, I can transfer all the sequences back to the host with a single cudaMemcpy call.
I just bought a nifty MBA 13" Core i7. I'm told the CPU speed varies automatically, and pretty wildly, too. I'd really like to be able to monitor this with a simple app.
Are there any Cocoa or C calls to find the current clock speed, without actually affecting it?
Edit: I'm OK with answers using Terminal calls, as well as programmatic.
Try this tool called "Intel Power Gadget". It displays IA frequency and IA power in real time.
You can query the CPU speed easily via sysctl, either by command line:
sysctl hw.cpufrequency
Or via C:
#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>
/* Prints the value returned by the CTL_HW / HW_CPU_FREQ sysctl.
 * Exits with status 1 (after reporting via perror) if the query fails, so an
 * uninitialised value is never printed. */
int main() {
    int mib[2];
    unsigned int freq;
    size_t len;

    mib[0] = CTL_HW;
    mib[1] = HW_CPU_FREQ;
    len = sizeof(freq);
    if (sysctl(mib, 2, &freq, &len, NULL, 0) == -1) {
        perror("sysctl");
        return 1;
    }

    printf("%u\n", freq);
    return 0;
}
Since it's an Intel processor, you could always use RDTSC. That's an assembler instruction that returns the current cycle counter — a 64bit counter that increments every cycle. It'd be a little approximate but e.g.
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
/* Returns the 64-bit value read by the x86 RDTSC instruction.
 * RDTSC delivers the low 32 bits in EAX and the high 32 bits in EDX; the two
 * halves are combined here. */
uint64_t rdtsc(void)
{
    uint32_t lo, hi;
    __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
    return ((uint64_t)hi << 32) | lo;
}
/* Estimates the counter rate by reading the cycle counter, sleeping for one
 * second, and reading it again. The result is approximate because sleep()
 * may return slightly late. */
int main(int argc, const char * argv[])
{
    uint64_t startCount = rdtsc();
    sleep(1);   /* the difference only means "per second" across a 1 s wait */
    uint64_t endCount = rdtsc();

    /* cast so the argument matches %llu regardless of how uint64_t is defined */
    printf("Clocks per second: %llu", (unsigned long long)(endCount - startCount));
    return 0;
}
Output: 'Clocks per second: 2002120630' on my 2 GHz MacBook Pro.
There is a kernel extension written by "flAked" which logs the cpu p-state to the kernel log.
maybe you could contact him for the code.
This seems to work correctly on OSX.
However, it doesn't work on Linux, where sysctl is deprecated and KERN_CLOCKRATE is undefined.
#include <sys/sysctl.h>
#include <sys/time.h>
int mib[2];
size_t len;
mib[0] = CTL_KERN;
struct clockinfo clockinfo;
len = sizeof(clockinfo);
int result = sysctl(mib, 2, &clockinfo, &len, NULL, 0);
assert(result != -1);
log_trace("clockinfo.hz: %d\n", clockinfo.hz);
log_trace("clockinfo.tick: %d\n", clockinfo.tick);
Need help. I'm stuck on a problem when running C++ code on Windows with Visual Studio.
When I run that code in a Linux environment, there is no restriction on the memory I am able to allocate dynamically (up to the size available in RAM).
But with the VS compiler, it does not let me create an array beyond a limited size.
I've tried the /F option and 20-25 Google links on increasing the memory size, but they don't seem to help much.
I am currently able to allocate only around 100 MB out of the 3 GB available.
If there is a solution for this in Windows and not in Visual Studio's compiler, I will be glad to hear that as I have a CUDA TeslaC2070 card which is proving to be pretty useless on Windows as I wanted to run my CUDA/C++ code on Windows environment.
Here's my code. It fails with a stack overflow exception when LENGTH > 128 (LENGTH is the number of images; they are 640x480 PNGs of less than 0.5 MB each. I've also estimated the memory required by counting the data structures and types used by OpenCV and by me, and it is still much less than 2 GB). The same happens with dynamic allocation. I've already maximized the heap and stack sizes.
#include "stdafx.h"
#include <cv.h>
#include <cxcore.h>
#include <highgui.h>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define LENGTH 100
#define SIZE1 640
#define SIZE2 480
#include <iostream>
using namespace std;
// Replaces every element x of img1_d[0..N) with 255.0 - x (despite its name,
// the kernel does not square anything).
// Expects a 1-D launch with at least N threads in total. Threads whose global
// index is >= N do nothing, so the grid may be rounded up to a whole number
// of blocks.
__global__ void square_array(double *img1_d, long N)
{
    long idx = (long)blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N)
        img1_d[idx] = 255.0 - img1_d[idx];
}
int _tmain(int argc, _TCHAR* argv[])
IplImage *img1[LENGTH];
// Open the file.
for(int i=0;i<LENGTH;i++)
{ img1[i] = cvLoadImage("abstract3.jpg");}
CvMat *mat1[LENGTH];
for(int i=0;i<LENGTH;i++)
mat1[i] = cvCreateMat(img1[i]->height,img1[i]->width,CV_32FC3 );
cvConvert( img1[i], mat1[i] );
double a[LENGTH][2*SIZE1][SIZE2][3];
for(int m=0;m<LENGTH;m++)
for(int i=0;i<SIZE1;i++)
for(int j=0;j<SIZE2;j++)
CvScalar scal = cvGet2D( mat1[m],j,i);
a[m][i][j][0] = scal.val[0];
a[m][i][j][1] = scal.val[1];
a[m][i][j][2] = scal.val[2];
a[m][i+SIZE1][j][0] = scal.val[0];
a[m][i+SIZE1][j][1] = scal.val[1];
a[m][i+SIZE1][j][2] = scal.val[2];
} }
double *a_d;
cudaMalloc((void **) &a_d, N*sizeof(double));
cudaMemcpy(a_d, a, N*sizeof(double), cudaMemcpyHostToDevice);
int block_size = 370;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
square_array <<< n_blocks, block_size >>> (a_d, N);
cudaMemcpy(a, a_d, N*sizeof(double), cudaMemcpyDeviceToHost);
//cuda end
char name[]= "Image: 00000";
int x=0,y=0;
for(int m=0;m<LENGTH;m++)
for (int i = 0; i < img1[m]->width*img1[m]->height*3; i+=3)
img1[m]->imageData[i]= a[m][x][y][0];
img1[m]->imageData[i+1]= a[m][x][y][1];
img1[m]->imageData[i+2]= a[m][x][y][2];
case '9': switch(name[10])
case '9':
case '9': name[11]='0';name[10]='0';name[9]='0';name[8]++;
default : name[11]='0';
default : name[11]='0'; name[10]++;break;
default : name[11]++;break;
// Display the image.
cvNamedWindow(name, CV_WINDOW_AUTOSIZE);
//cvSaveImage(name ,img1);
// Wait for the user to press a key in the GUI window.
// Free the resources.
return 0;
The problem is that you are allocating a huge multidimensional array on the stack in your main function (double a[..][..][..]). Do not allocate this much memory on the stack. Use malloc/new to allocate on the heap.