kfree_skb() unexpected behavior - linux-kernel

I have some trouble using kfree_skb. The below three lines of code behave in a strange way,
printk(KERN_ALERT"1 - SKB user: %d", atomic_read(&skb->users));
kfree_skb(skb);
printk(KERN_ALERT"2 - SKB user: %d", atomic_read(&skb->users));
I expect that the second printk causes a kernel panic since I freed the skb, but it does not. The output of these lines are as follows;
1 - SKB user: 1
2 - SKB user: 2
What is the point that I missed?
The second output was incorrect. It is as follows:
2 - SKB user: 1

When you free the skb, there is a high chance that it gets allocated in another thread.
Just think about this:
Thread 1:
printk(KERN_ALERT"1 - SKB user: %d", atomic_read(&skb->users));
kfree_skb
-------> scheduled out
Thread 2:
alloc_skb()
//inc user count (whatever the kernel call)
-------> Scheduled out
Then now back in Thread 1,
printk(KERN_ALERT"2 - SKB user: %d", atomic_read(&skb->users));
Now there is a very high likelihood that you can get user count 2.
Since skbs are allocated from the slab cache, it is highly likely that the skb that was just freed is the first one to be handed out again.

First let's have a quick look at the kfree_skb() function. The code was changed in v4.13, but it still behaves the same:
Before v4.13 (when the question has been published):
/*
 * Drop one reference to @skb and free it when no other reference remains.
 * A NULL @skb is ignored.
 *
 * When users == 1 the count is only read (followed by a read barrier) and
 * is never decremented before the skb is freed. Otherwise the count is
 * decremented and the function returns unless atomic_dec_and_test()
 * reports true.
 */
void kfree_skb(struct sk_buff *skb)
{
if (unlikely(!skb))
return;
if (likely(atomic_read(&skb->users) == 1))
smp_rmb();
else if (likely(!atomic_dec_and_test(&skb->users)))
return;
trace_kfree_skb(skb, __builtin_return_address(0));
__kfree_skb(skb);
}
After v4.13:
/*
 * Free @skb when skb_unref() returns true; do nothing when it returns
 * false (NULL @skb, or other references remain).
 */
void kfree_skb(struct sk_buff *skb)
{
if (!skb_unref(skb))
return;
trace_kfree_skb(skb, __builtin_return_address(0));
__kfree_skb(skb);
}
/*
 * Drop one reference to @skb.
 *
 * Returns true when the caller should go on to free the skb, false when
 * @skb is NULL or the decrement did not release the last reference.
 */
static inline bool skb_unref(struct sk_buff *skb)
{
if (unlikely(!skb))
return false;
/* users == 1: the count is read but not decremented, so it stays at 1. */
if (likely(refcount_read(&skb->users) == 1))
smp_rmb();
else if (likely(!refcount_dec_and_test(&skb->users)))
return false;
return true;
}
In both cases, I've noticed that when you call kfree_skb(), the skb does get freed (__kfree_skb() is called). However, I've also noticed in both cases that if skb->users was 1 before calling kfree_skb(), it would stay 1 even after calling kfree_skb().
kfree_skb() does free the skb when the skb should be freed, but it won't protect you from double freeing the skb. It seems it has been made on purpose in order to avoid the use of unnecessary atomic operations (you can find this comment somewhere in the skbuff.h code):
/*
* If users == 1, we are the only owner and can avoid redundant atomic changes.
*/
Nevertheless, you can see that in __dev_kfree_skb_irq() the behavior is just as you expected it to be (if skb->users was 1 before calling __dev_kfree_skb_irq(), skb->users is set to 0 as expected):
/*
 * Drop one reference to @skb and, if no other reference remains, record
 * @reason and push the skb onto this CPU's softnet_data.completion_queue,
 * then raise NET_TX_SOFTIRQ. The skb is not freed in this function itself.
 * A NULL @skb is ignored.
 */
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
unsigned long flags;
if (unlikely(!skb))
return;
if (likely(refcount_read(&skb->users) == 1)) {
smp_rmb();
/* Unlike kfree_skb(), the sole-owner path explicitly sets the count to 0. */
refcount_set(&skb->users, 0);
} else if (likely(!refcount_dec_and_test(&skb->users))) {
return;
}
get_kfree_skb_cb(skb)->reason = reason;
/* Interrupts are disabled while the per-CPU list head is read and replaced. */
local_irq_save(flags);
skb->next = __this_cpu_read(softnet_data.completion_queue);
__this_cpu_write(softnet_data.completion_queue, skb);
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
}

Related

Copy structure with included user pointers from user space to kernel space (copy_from_user)

I want to transfer a transaction structure, which contains an user space pointer to an array, to kernel by using copy_from_user.
The goal is, to get access to the array elements in kernel space.
User space side:
I allocate an array of _sg_param structures in user space. Now i put the address of this array in a transaction structure (line (*)).
Then i transfer the transaction structure to the kernel via ioctl().
Kernel space side:
On executing this ioctl, the complete transaction structure is copied to kernel space (line (**)). Now kernel space is allocated for holding the array (line (***)). Then I try to copy the array from user space to the newly allocated kernel space (line (****)), and here my problems start:
The kernel is corrupted during execution of this copy. dmesg shows following output:
[ 54.443106] Unhandled fault: page domain fault (0x01b) at 0xb6f09738
[ 54.448067] pgd = ee5ec000
[ 54.449465] [b6f09738] *pgd=2e9d7831, *pte=2d56875f, *ppte=2d568c7f
[ 54.454411] Internal error: : 1b [#1] PREEMPT SMP ARM
Any ideas ???
Following an simplified extract of my code:
// structure declaration
/* One element of the array that struct transaction's pbuf_list points to. */
typedef struct _sg_param {
void *seg_buf; /* presumably the segment's data buffer -- confirm against the driver */
int seg_len; /* presumably the length of seg_buf -- confirm */
int received; /* meaning is not visible in this extract */
} sg_param_t;
/*
 * Request handed to the driver through ioctl(). The "..." lines stand for
 * members omitted from this simplified extract.
 */
struct transaction {
...
int num_of_elements; /* number of entries in pbuf_list */
sg_param_t *pbuf_list; // Array of sg_param structure
...
} trans;
// user space side:
/*
 * User-space side: allocate NR_OF_STRUCTS entries, store the array's
 * (user-space) address and its element count in trans, then pass the
 * address of trans to the driver with ioctl(MY_CMD).
 */
if ((pParam = (sg_param_t *) malloc(NR_OF_STRUCTS * sizeof(sg_param_t))) == NULL) {
return -ENOMEM;
}
else {
trans.num_of_elements = NR_OF_STRUCTS;
trans.pbuf_list = pParam; // (*)
}
rc = ioctl(dev->fd, MY_CMD, &trans);
/* pParam is not released on this error path within the extract shown. */
if (rc < 0) {
return rc;
}
// kernel space side
/*
 * ioctl handler (simplified extract; the function's closing brace is not
 * part of it). For MY_CMD it copies the caller's struct transaction into
 * the kernel-side trans and then tries to duplicate the array that
 * pbuf_list refers to.
 */
static long ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
arg_ptr = (void __user *)arg;
// Perform the specified command
switch (cmd) {
case MY_CMD:
{
struct transaction *__user user_trans;
user_trans = (struct transaction *__user)arg_ptr;
/* After this copy, trans.pbuf_list holds the user-space address of the array. */
if (copy_from_user(&trans, arg_ptr, sizeof(trans)) != 0) { // (**)
k_err("Unable to copy transfer info from userspace for "
"AXIDMA_DMA_START_DMA.\n");
return -EFAULT;
}
/* size is derived from the caller-supplied count; the count is not range-checked here. */
int size = trans.num_of_elements * sizeof(sg_param_t);
if (trans.pbuf_list != NULL) {
// Allocate kernel memory for buf_list
/* This assignment replaces the user-space address stored in trans.pbuf_list. */
trans.pbuf_list = (sg_param_t *) kmalloc(size, GFP_KERNEL); // (***)
if (trans.pbuf_list == NULL) {
k_err("Unable to allocate array for buffers.\n");
return -ENOMEM;
}
// Now copy pbuf_list from user space to kernel space
/*
 * The source address is obtained by dereferencing user_trans, which is
 * a __user pointer, rather than from the kernel copy made at (**).
 */
if (copy_from_user(trans.pbuf_list, user_trans->pbuf_list, size) != 0) { // (****)
kfree(trans.pbuf_list);
return -EFAULT;
}
}
break;
}
}
You're directly accessing userspace data (user_trans->pbuf_list). You should use the one that you've already copied to kernel (trans.pbuf_list).
Code for this would normally be something like:
sg_param_t *local_copy = kmalloc(size, ...);
// TODO check it succeeded
if (copy_from_user(local_copy, trans.pbuf_list, size) ...)
trans.pbuf_list = local_copy;
// use trans.pbuf_list
Note that you also need to check trans.num_of_elements to be valid (0 would make kmalloc return ZERO_SIZE_PTR, and too big value might be a way for DoS).

Non-blockings reads/writes to stdin/stdout in C on Linux or Mac

I have two programs communicating via named pipes (on a Mac), but the buffer size of named pipes is too small. Program 1 writes 50K bytes to pipe 1 before reading pipe 2. Named pipes are 8K (on my system) so program 1 blocks until the data is consumed. Program 2 reads 20K bytes from pipe 1 and then writes 20K bytes to pipe2. Pipe2 can't hold 20K so program 2 now blocks. It will only be released when program 1 does its reads. But program 1 is blocked waiting for program 2. deadlock
I thought I could fix the problem by creating a gasket program that reads stdin non-blocking and writes stdout non-blocking, temporarily storing the data in a large buffer. I tested the program using cat data | ./gasket 0 | ./gasket 1 > out, expecting out to be a copy of data. However, while the first invocation of gasket works as expected, the read in the second program returns 0 before all the data is consumed and never returns anything other than 0 in follow on calls.
I tried the code below both on a MAC and Linux. Both behave the same. I've added logging so that I can see that the fread from the second invocation of gasket starts getting no data even though it has not read all the data written by the first invocation.
#include <stdio.h>
#include <fcntl.h>
#include <time.h>
#include <stdlib.h>
#include <unistd.h>
/* Capacity of the staging buffer between stdin and stdout. */
#define BUFFER_SIZE 100000
char buffer[BUFFER_SIZE];
/* Number of bytes currently held in buffer[0..elements-1]. */
int elements=0;
/*
 * Gasket: switches stdin and stdout to O_NONBLOCK and then loops forever,
 * reading up to 1024 bytes into the staging buffer and trying to write the
 * whole buffer out, logging each step to the file named by argv[1].
 */
int main(int argc, char **argv)
{
int total_read=0, total_write=0;
/* argv[1] and the fopen() result are used without being checked. */
FILE *logfile=fopen(argv[1],"w");
int flags = fcntl(fileno(stdin), F_GETFL, 0);
fcntl(fileno(stdin), F_SETFL, flags | O_NONBLOCK);
flags = fcntl(fileno(stdout), F_GETFL, 0);
fcntl(fileno(stdout), F_SETFL, flags | O_NONBLOCK);
/* The loop has no exit condition; it keeps polling even when nothing can be read or written. */
while (1) {
int num_read=0;
if (elements < (BUFFER_SIZE-1024)) { // space in buffer
/* A return of 0 is not examined further: feof()/ferror() are never consulted. */
num_read = fread(&buffer[elements], sizeof(char), 1024, stdin);
elements += num_read;
total_read += num_read;
fprintf(logfile,"read %d (%d) elements \n",num_read, total_read); fflush(logfile);
}
if (elements > 0) { // something in buffer that we can write
/* num_written is the count fwrite() reports; the result of fflush(stdout) is ignored. */
int num_written = fwrite(&buffer[0],sizeof(char),elements, stdout); fflush(stdout);
total_write += num_written;
fprintf(logfile,"wrote %d (%d) elements \n",num_written, total_write); fflush(logfile);
if (num_written > 0) { // copy data to top of buffer
/* Shift the unwritten tail down to index 0. */
for (int i=0; i<(elements-num_written); i++) {
buffer[i] = buffer[i+num_written];
}
elements -= num_written;
}
}
}
}
I guess I could make the gasket multi-threaded and use blocking reads in one thread and blocking writes in the other, but I would like to understand why non-blocking IO seems to break for me.
Thanks!
My general solution to any IPC project is to make the client and server non-blocking I/O. To do so requires queuing data both on writing and reading, to handle cases where the OS can't read/write, or can only read/write a portion of your message.
The code below will probably seem like EXTREME overkill, but if you get it working, you can use it the rest of your career, whether for named pipes, sockets, network, you name it.
In pseudo-code:
// One message (or the unsent remainder of one) waiting to be written.
typedef struct {
const char* pcData, * pcToFree; // pcData may no longer point to malloc'd region
int iToSend; // bytes still to be sent, starting at pcData
} DataToSend_T;
queue of DataToSend_T qdts; // pending messages; sent from the head, in order
// Caller will use malloc() to allocate storage, and create the message in
// that buffer. MyWrite() will free it now, or WritableCB() will free it
// later. Either way, the app must NOT free it, and must not even refer to
// it again.
// Send a malloc'd message now if possible, otherwise queue it (or its unsent
// tail) behind whatever is already pending. Takes ownership of pcData.
MyWrite( const char* pcData, int iToSend ) {
iSent = 0;
// Normally the OS will tell select() if the socket is writable, but if were hugely
// compute-bound, then it won't have a chance to. So let's call WritableCB() to
// send anything in our queue that is now sendable. We have to send the data in
// order, of course, so can't send the new data until the entire queue is done.
WritableCB();
if ( qdts has no entries ) {
iSent = write( pcData, iToSend );
// TODO: check error
// Did we send it all? We're done.
if ( iSent == iToSend ) {
free( pcData );
return;
}
}
// OK, either 1) we had stuff queued already meaning we can't send, or 2)
// we tried to send but couldn't send it all.
// Queue entry fields: first unsent byte, original pointer to free later, bytes left.
// iSent is still 0 here when the queue was not empty, so the whole message is queued.
add to queue qdts the DataToSend ( pcData + iSent, pcData, iToSend - iSent );
}
// Flush as much of the pending queue as the descriptor will take right now.
// Entries are sent strictly in order. A fully sent entry is freed and popped.
// On a partial write the head entry is advanced past the bytes already sent
// and the function returns, leaving the rest for the next writable event.
WritableCB() {
    while ( qdts has entries ) {
        DataToSend_T* pdts = qdts head;
        int iSent = write( pdts->pcData, pdts->iToSend );
        // TODO: check error
        if ( iSent == pdts->iToSend ) {
            // pcToFree is the original malloc'd pointer; pcData may have moved past it.
            free( pdts->pcToFree );
            pop the front node off qdts
        }
        else {
            pdts->pcData += iSent;
            pdts->iToSend -= iSent;
            return;
        }
    }
}
// Off-subject but I like a TINY buffer as an original value, that will always
// exercise the "buffer growth" code for almost all usage, so we're sure it works.
// If the initial buffer size is like 1M, and almost never grows, then the grow code
// may be buggy and we won't know until there's a crash years later.
int iBufSize = 1, iEnd = 0; // iEnd is the first byte NOT in a message
char* pcBuf = malloc( iBufSize );
// Read everything currently available into pcBuf (growing it as needed), then
// process every complete message in the buffer and keep any trailing partial
// message at the front of the buffer for the next call.
ReadableCB() {
    // Keep reading the socket until there's no more data. Grow buffer if necessary.
    while (1) {
        // Remember how much room this read() was offered BEFORE iEnd moves, so
        // the short-read test below compares against the right amount.
        int iSpace = iBufSize - iEnd;
        int iRead = read( pcBuf + iEnd, iSpace );
        // TODO: check error (iRead < 0) and end-of-stream (iRead == 0)
        if ( iRead <= 0 )
            break; // nothing was stored; don't let a negative value move iEnd backwards
        iEnd += iRead;
        // If we read less than we had space for, then read returned because this is
        // all the available data, not because the buffer was too small.
        if ( iRead < iSpace )
            break;
        // Otherwise, double the buffer and try reading some more.
        // Go through a temporary so the old buffer isn't lost if realloc() fails.
        char* pcBigger = realloc( pcBuf, iBufSize * 2 );
        if ( pcBigger == NULL )
            break; // keep what we have; process it below
        pcBuf = pcBigger;
        iBufSize *= 2;
    }
    int iStart = 0;
    while (1) {
        if ( pcBuf[ iStart ] until iEnd-1 is less than a message ) {
            // If our partial message isn't at the front of the buffer move it there.
            if ( iStart ) {
                memmove( pcBuf, pcBuf + iStart, iEnd - iStart );
                iEnd -= iStart;
            }
            return;
        }
        // process a message, and advance iStart by the size of that message.
    }
}
// Event loop: waits in select() and dispatches to ReadableCB() / WritableCB()
// according to what select() reports for the file handle.
main() {
// Do your initial processing, and call MyWrite() to send and/or queue data.
while (1) {
select() // see man page
if ( the file handle is readable )
ReadableCB();
if ( the file handle is writable )
WritableCB();
if ( the file handle is in error )
// handle it;
if ( application is finished )
exit( EXIT_SUCCESS );
}
}

in which condition will queue_work() not wake up the work queue in linux?

i am learning linux kernel, and i meet a problem.
in linux kernel, i use "mod_delayed_work(bdi_wq, &wb->dwork, 0)" to queue a work_struct to a work queue, i assume the work function of the queued work_struct will soon be executed. but work function is not executed until 300 seconds later.
and i find a watchdog thread happens meanwhile.
is this a normal case? or is it the watchdog thread that makes the work queue sleep although there is a work (my queued work_struct) pending here?
added:
the followings are my condition. i use the linux kernel 4.9.13 codes and do not change them except for adding some printk logs.
i have five disks, and use five shells to copy 4GB files from disks to disks concurrently. this problem happens while i am doing sync. one of the shells is like:
#!/bin/bash
# Stress loop: copy a 4GB file from disk1 to disk2, sync, delete the copy,
# sync again; repeated 9999 times.
for ((i=0; i<9999; i++))
do
cp disk1/4GB.tar disk2/4GB-chen.tar
sync
rm disk2/4GB-chen.tar
sync
done
i do a sync after each copy is done. after the shells run for some times, i find that the sync command will be blocked for a long time(longer than 2 minutes). i find sync will call a system call, the code is as follows:
/*
 * sync(2) entry point: kicks the flusher threads, then walks all super
 * blocks (inodes first, then the filesystems without and with waiting) and
 * all block devices (write, then wait). Always returns 0.
 */
SYSCALL_DEFINE0(sync)
{
int nowait = 0, wait = 1;
wakeup_flusher_threads(0, WB_REASON_SYNC);
iterate_supers(sync_inodes_one_sb, NULL);
/* Two passes over the filesystems: first with nowait (0), then with wait (1). */
iterate_supers(sync_fs_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &wait);
iterate_bdevs(fdatawrite_one_bdev, NULL);
iterate_bdevs(fdatawait_one_bdev, NULL);
if (unlikely(laptop_mode))
laptop_sync_completion();
return 0;
}
in iterate_supers(sync_inodes_one_sb, NULL), kernel will call sync_inodes_one_sb for each disk'super block. sync_inodes_one_sb will eventually call sync_inodes_sb, the code is:
/*
 * Queue WB_SYNC_ALL writeback for every inode of @sb and wait for it.
 *
 * The work item carries a pointer to the on-stack completion "done";
 * bdi_split_work_to_wbs() hands copies of it to the bdi's writeback
 * structures, and this function then blocks until that completion is
 * signalled before finally waiting on the super block's inodes.
 * Returns immediately for super blocks backed by noop_backing_dev_info.
 */
void sync_inodes_sb(struct super_block *sb)
{
	DEFINE_WB_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.sb = sb,
		.sync_mode = WB_SYNC_ALL,
		.nr_pages = LONG_MAX,
		.range_cyclic = 0,
		.done = &done,
		.reason = WB_REASON_SYNC,
		.for_sync = 1,
	};
	struct backing_dev_info *bdi = sb->s_bdi;

	/*
	 * Can't skip on !bdi_has_dirty() because we should wait for !dirty
	 * inodes under writeback and I_DIRTY_TIME inodes ignored by
	 * bdi_has_dirty() need to be written out too.
	 */
	if (bdi == &noop_backing_dev_info)
		return;
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	bdi_split_work_to_wbs(bdi, &work, false); /* split work to wbs */
	/* Wait on the completion the queued work items point at through work.done. */
	wb_wait_for_completion(bdi, &done);

	wait_sb_inodes(sb);
}
and in bdi_split_work_to_wbs(bdi, &work, false)(in fs/fs-writeback.c), queue the write back works to the work queue:
/*
 * Hand a copy of @base_work to each bdi_writeback on @bdi's wb_list.
 *
 * Each copy gets its own nr_pages share from wb_split_bdi_pages(). A wb is
 * skipped when it has no dirty IO (subject to the sync_mode / b_dirty_time
 * test below) or, with @skip_if_busy, when writeback is already in progress
 * on it. If the per-wb allocation fails, an on-stack work item is queued
 * and waited for before the iteration restarts.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
struct wb_writeback_work *base_work,
bool skip_if_busy)
{
struct bdi_writeback *last_wb = NULL;
struct bdi_writeback *wb = list_entry(&bdi->wb_list,
struct bdi_writeback, bdi_node);
might_sleep();
restart:
rcu_read_lock();
list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
struct wb_writeback_work fallback_work;
struct wb_writeback_work *work;
long nr_pages;
/* Drop the reference taken on the previous fallback iteration, if any. */
if (last_wb) {
wb_put(last_wb);
last_wb = NULL;
}
/* SYNC_ALL writes out I_DIRTY_TIME too */
if (!wb_has_dirty_io(wb) &&
(base_work->sync_mode == WB_SYNC_NONE ||
list_empty(&wb->b_dirty_time)))
continue;
if (skip_if_busy && writeback_in_progress(wb))
continue;
nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
/* GFP_ATOMIC: the allocation happens between rcu_read_lock() and rcu_read_unlock(). */
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (work) {
*work = *base_work;
work->nr_pages = nr_pages;
work->auto_free = 1;
wb_queue_work(wb, work); /*** here to queue write back work ***/
continue;
}
/* alloc failed, execute synchronously using on-stack fallback */
work = &fallback_work;
*work = *base_work;
work->nr_pages = nr_pages;
work->auto_free = 0;
work->done = &fallback_work_done;
wb_queue_work(wb, work);
/*
* Pin #wb so that it stays on #bdi->wb_list. This allows
* continuing iteration from #wb after dropping and
* regrabbing rcu read lock.
*/
wb_get(wb);
last_wb = wb;
rcu_read_unlock();
wb_wait_for_completion(bdi, &fallback_work_done);
goto restart;
}
rcu_read_unlock();
if (last_wb)
wb_put(last_wb);
}
use wb_queue_work(wb, work) to queue a work to work structure, in fs/fs-writeback.c wb_queue_work is:
/*
 * Queue @work on @wb and schedule wb->dwork on bdi_wq with a delay of 0.
 *
 * If @work has a completion attached, its counter is incremented first.
 * When @wb is no longer registered (WB_registered clear) the work is not
 * queued; it is passed to finish_writeback_work() instead.
 */
static void wb_queue_work(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
trace_writeback_queue(wb, work);
if (work->done)
atomic_inc(&work->done->cnt);
/* work_lock covers both the registration test and the list insertion. */
spin_lock_bh(&wb->work_lock);
if (test_bit(WB_registered, &wb->state)) {
list_add_tail(&work->list, &wb->work_list);
mod_delayed_work(bdi_wq, &wb->dwork, 0); /*** queue work to work queue ***/
} else
finish_writeback_work(wb, work);
spin_unlock_bh(&wb->work_lock);
}
here the mod_delayed_work(bdi_wq, &wb->dwork, 0) will actually queue the wb->dwork to the bdi_wq work queue, the work function of wb->dwork is wb_workfn()(in fs/fs-writeback.c), i add some printks when prepare to queue the work and in the work function, i find the printk logs in the work function are not printed out until approximately 300 seconds later some times(most of the times, they will be printed less than 1 seconds after the work has been queued to the work queue). and the bdi_wq work queue will be blocked until 300 seconds later when the work function begin to be executed.

How to CancelSynchronousIo() on WaitForSingleObject() waiting on stdin?

On Windows 10, I'm waiting for input from the console using
WaitForSingleObject( GetStdHandle(STD_INPUT_HANDLE), ... )
and to cancel this waiting using CancelSynchronousIo().
But the cancellation does nothing (returns 0 and GetLastError() is ERROR_NOT_FOUND).
Any idea what I could be doing wrong?
Should I be able to cancel this waiting for new input on stdin?
(I actually want to do this with any HANDLE whose GetFileType() is FILE_TYPE_CHAR, not only stdin, but stdin is certainly the most important use case and the simplest to test with).
Related discussions I've found:
Synchronous ReadFile() on stdin cannot be unblocked by CancelSynchronousIo()
win32: how stop ReadFile (stdin|pipe)
But unfortunately they only discuss ReadFile(), not WaitForSingleObject(). I've also tried WaitForMultipleObjects() (with just a single object in the array), same problem.
(Background: I'm trying to improve input handling in the GHC Haskell compiler runtime.)
CancelSynchronousIo cancel I/O operations that are issued by the specified thread. more concrete it cancel IRP packets which associated with specified thread via call IoCancelIrp. if use undocumented NtCancelSynchronousIoFile (CancelSynchronousIo internally call it with IoRequestToCancel = 0) we can be more selective - cancel only i/o request which used specified IoRequestToCancel (system check that Irp->UserIosb == IoRequestToCancel and cancel only this requests)
but WaitForSingleObject this is not I/O request. this call not create any IRP which can be canceled. so - no way do this.
however, if you use WaitForSingleObjectEx with bAlertable set to TRUE, you can break the wait by queuing an APC to the thread with QueueUserAPC. also, if you use NtWaitForSingleObject instead of WaitForSingleObjectEx, you can alert the thread with the undocumented call NtAlertThread; in this case NtWaitForSingleObject returns STATUS_ALERTED. (note that WaitForSingleObjectEx, which internally calls NtWaitForSingleObject, specially checks for STATUS_ALERTED and on that status runs NtWaitForSingleObject again - as a result we cannot break WaitForSingleObjectEx by calling NtAlertThread, but NtWaitForSingleObject will be broken.)
so if you need break waiting for std input - create additional thread, which must call not CancelSynchronousIo (this senseless) but QueueUserAPC or NtAlertThread (only if you use NtWaitForSingleObject for wait). and input thread must wait in alertable state. so demo code can look like:
extern "C" NTSYSCALLAPI NTSTATUS NTAPI NtAlertThread(HANDLE ThreadHandle);
// APC routine queued by BreakWaitThread through QueueUserAPC; it only logs
// the parameter it was queued with.
VOID NTAPI OnApc(ULONG_PTR Parameter)
{
DbgPrint("OnApc(%p)\n", Parameter);
}
// Helper thread: asks through a message box how the waiting thread should be
// interrupted, then either queues OnApc to it (Yes) or alerts it with
// NtAlertThread (No). Cancel and any other message-box result are only
// logged. hThread is the handle of the waiting thread; it is closed here in
// every case. Always returns 0.
DWORD CALLBACK BreakWaitThread(HANDLE hThread)
{
    LONG status = MessageBoxW(0, L"Use Apc(yes) or Alert(No) ?", L"BreakWaitThread",
        MB_ICONQUESTION|MB_YESNOCANCEL|MB_DEFBUTTON3);

    if (status == IDYES)
    {
        if (!QueueUserAPC(OnApc, hThread, 0))
        {
            DbgPrint("QueueUserAPC=%u\n", GetLastError());
        }
    }
    else if (status == IDNO)
    {
        // A negative NTSTATUS is reported as a failure.
        status = NtAlertThread(hThread);
        if (status < 0)
        {
            DbgPrint("AlertThread=%x\n", status);
        }
    }
    else if (status == IDCANCEL)
    {
        DbgPrint("canceled\n");
    }
    else
    {
        DbgPrint("MessageBox=%x\n", status);
    }

    CloseHandle(hThread);
    return 0;
}
// Waits alertably on hStdIn and logs every key event read from the console.
// Returns when '.' (VK_OEM_PERIOD) is seen, when the wait ends with
// STATUS_USER_APC or STATUS_ALERTED, or on WAIT_FAILED / any other status.
void ConsoleLoop(HANDLE hStdIn)
{
ULONG NumberOfEvents, NumberOfEventsRead, n;
INPUT_RECORD buf[8], *p;
for (;;)
{
// Second argument TRUE makes the wait alertable; the timeout argument is 0
// (presumably meaning "no timeout" -- confirm against the native API docs).
switch (ZwWaitForSingleObject(hStdIn, TRUE, 0))
//switch (WaitForSingleObjectEx(hStdIn, INFINITE, TRUE))
{
case WAIT_OBJECT_0:
// Drain the input queue in batches of at most RTL_NUMBER_OF(buf) records.
while (GetNumberOfConsoleInputEvents(hStdIn, &NumberOfEvents) && NumberOfEvents)
{
do
{
NumberOfEventsRead = min(RTL_NUMBER_OF(buf), NumberOfEvents);
if (ReadConsoleInput(hStdIn, buf, NumberOfEventsRead, &NumberOfEventsRead) && NumberOfEventsRead)
{
n = NumberOfEventsRead;
p = buf;
do
{
if (p->EventType == KEY_EVENT)
{
DbgPrint("%u(%u) %C %x %x %x\n",
p->Event.KeyEvent.bKeyDown,
p->Event.KeyEvent.wRepeatCount,
p->Event.KeyEvent.uChar.UnicodeChar,
p->Event.KeyEvent.wVirtualKeyCode,
p->Event.KeyEvent.wVirtualScanCode,
p->Event.KeyEvent.dwControlKeyState);
if (VK_OEM_PERIOD == p->Event.KeyEvent.wVirtualKeyCode)
{
return ;//if user type '.' return for demo
}
}
} while (p++, --n);
}
else
{
// Read failed or returned nothing: discard pending input and leave the batch loop.
FlushConsoleInputBuffer(hStdIn);
break;
}
// Continue until every event counted by GetNumberOfConsoleInputEvents has been consumed.
} while (NumberOfEvents -= NumberOfEventsRead);
}
continue;
case STATUS_USER_APC:
DbgPrint("\nUSER_APC\n");
return;
case STATUS_ALERTED:
DbgPrint("\nALERTED\n");
return;
case WAIT_FAILED :
DbgPrint("\nWAIT_FAILED=%u\n", GetLastError());
return;
default:
__debugbreak();
return;
}
}
}
void SimpleDemo()
{
if (HANDLE hCurrentThread = OpenThread(THREAD_ALERT|THREAD_SET_CONTEXT , FALSE, GetCurrentThreadId()))
{
ULONG dwThreadId;
HANDLE hThread = CreateThread(0, 0, BreakWaitThread, hCurrentThread, 0, &dwThreadId);
if (hThread)
{
ConsoleLoop(GetStdHandle(STD_INPUT_HANDLE));
PostThreadMessage(dwThreadId, WM_QUIT, 0, 0);
WaitForSingleObject(hThread, INFINITE);
CloseHandle(hThread);
}
else
{
CloseHandle(hCurrentThread);
}
}
}
Console I/O is difficult to use asynchronously, it is simply not designed for it. See IO Completion Ports (IOCP) and Asynchronous I/O through STDIN, STDOUT and STDERR for some possible workarounds.
If that is not an option for you, then you will have to either:
use WaitForSingleObject() in a loop with a short timeout. Create a flag variable that your loop can look at on each iteration to break the loop if the flag is set.
use WaitForMultipleObjects(), giving it 2 HANDLEs to wait on - one for the console (or whatever), and one for an event object from CreateEvent(). Then you can signal the event with SetEvent() when you want to break the wait. The return value of WaitForMultipleObjects() will tell you which HANDLE was signaled.

Using IRPs for I/O on device object returned by IoGetDeviceObjectPointer()

Can one use IoCallDriver() with an IRP created by IoBuildAsynchronousFsdRequest() on a device object returned by IoGetDeviceObjectPointer()? What I have currently fails with blue screen (BSOD) 0x7E (unhandled exception), which when caught shows an Access Violation (0xc0000005). Same code worked when the device was stacked (using the device object returned by IoAttachDeviceToDeviceStack()).
So what I have is about the following:
/*
 * Look up the device by name, build an asynchronous read IRP for it and
 * send it down. Every failure path returns -1 without releasing anything
 * obtained by the earlier steps.
 */
status = IoGetDeviceObjectPointer(&device_name, FILE_ALL_ACCESS, &FileObject, &windows_device);
if (!NT_SUCCESS(status)) {
return -1;
}
/* FileObject is obtained above but is not referenced again in this fragment. */
offset.QuadPart = 0;
newIrp = IoBuildAsynchronousFsdRequest(io, windows_device, buffer, 4096, &offset, &io_stat);
if (newIrp == NULL) {
return -1;
}
IoSetCompletionRoutine(newIrp, DrbdIoCompletion, bio, TRUE, TRUE, TRUE);
status = ObReferenceObjectByPointer(newIrp->Tail.Overlay.Thread, THREAD_ALL_ACCESS, NULL, KernelMode);
if (!NT_SUCCESS(status)) {
return -1;
}
/*
 * The IRP was built for windows_device, while the call is issued on
 * bio->bi_bdev->windows_device; whether these are the same object is not
 * visible here.
 */
status = IoCallDriver(bio->bi_bdev->windows_device, newIrp);
if (!NT_SUCCESS(status)) {
return -1;
}
return 0;
device_name is \Device\HarddiskVolume7 which exists according to WinObj.exe .
buffer has enough space and is read/writable. offset and io_stat are on stack (also tried with heap, didn't help). When catching the exception (SEH exception) it doesn't blue screen but shows an access violation as reason for the exception. io is IRP_MJ_READ.
Do I miss something obvious? Is it in general better to use IRPs than the ZwCreateFile / ZwReadFile / ZwWriteFile API (which would be an option, but isn't that slower?)? I also tried a ZwCreateFile to have an extra reference, but this also didn't help.
Thanks for any insights.
you make at least 2 critical errors in this code.
can I ask - from which file you try read (or write) data ? from
FileObject you say ? but how file system driver, which will handle
this request know this ? you not pass any file object to newIrp.
look for IoBuildAsynchronousFsdRequest - it have no file object
parameter (and impossible get file object from device object - only
vice versa - because on device can be multiple files open). so it
and can not be filled by this api in newIrp. you must setup it
yourself:
PIO_STACK_LOCATION irpSp = IoGetNextIrpStackLocation( newIrp );
irpSp->FileObject = FileObject;
I guess bug was exactly when file system try access FileObject
from irp which is 0 in your case. also read docs for
IRP_MJ_READ - IrpSp->FileObject -
Pointer to the file object that is associated with DeviceObject
you pass I guess local variables io_stat (and offset) to
IoBuildAsynchronousFsdRequest. as result io_stat must be valid
until newIrp is completed - I/O subsystem write final result to it
when operation completed. but you not wait in function until request
will be completed (in case STATUS_PENDING returned) but just exit
from function. as result later I/O subsystem, if operation completed
asynchronous, write data to arbitrary address &io_stat (it became
arbitrary just after you exit from function). so you need or check
for STATUS_PENDING returned and wait in this case (have actually
synchronous io request). but more logical use
IoBuildSynchronousFsdRequest in this case. or allocate io_stat
not from stack, but say in your object which correspond to file. in
this case you can not have more than single io request with this
object at time. or if you want exactly asynchronous I/O - you can do
next trick - newIrp->UserIosb = &newIrp->IoStatus. as result you
iosb always will be valid for newIrp. and actual operation status
you check/use in DrbdIoCompletion
also can you explain (not for me - for self) next code line ?:
status = ObReferenceObjectByPointer(newIrp->Tail.Overlay.Thread, THREAD_ALL_ACCESS, NULL, KernelMode);
who and where dereference thread and what sense in this ?
Can one use ...
we can use all, but with condition - we understand what we doing and deep understand system internally.
Is it in general better to use IRPs than the ZwCreateFile / ZwReadFile
/ ZwWriteFile API
for performance - yes, better. but this require more code and more complex code compare api calls. and require more knowledge. also if you know that previous mode is kernel mode - you can use NtCreateFile, NtWriteFile, NtReadFile - this of course will be bit slow (need every time reference file object by handle) but more faster compare Zw version
Just wanted to add that the ObReferenceObjectByPointer is needed
because the IRP references the current thread which may exit before
the request is completed. It is dereferenced in the Completion
Routine. Also as a hint the completion routine must return
STATUS_MORE_PROCESSING_REQUIRED if it frees the IRP (took me several
days to figure that out).
here you make again several mistakes. how i understand you in completion routine do next:
IoFreeIrp(Irp);
return StopCompletion;
but call simply call IoFreeIrp here is error - resource leak. i advice you check (DbgPrint) Irp->MdlAddress at this point. if you read data from file system object and request completed asynchronous - file system always allocate Mdl for access user buffer in arbitrary context. now question - who free this Mdl ? IoFreeIrp - simply free Irp memory - nothing more. you do this yourself ? doubt. but Irp is complex object, which internally hold many resources. as result need not only free it memory but call "destructor" for it. this "destructor" is IofCompleteRequest. when you return StopCompletion (=STATUS_MORE_PROCESSING_REQUIRED) you break this destructor at very begin. but you must latter again call IofCompleteRequest for continue Irp (and it resources) correct destroy.
about referencing Tail.Overlay.Thread - what you doing - have no sense:
It is dereferenced in the Completion Routine.
but IofCompleteRequest access Tail.Overlay.Thread after it
call your completion routine (and if you not return
StopCompletion). as result your reference/dereference thread lost
sense - because you deference it too early, before system
actually access it.
also if you return StopCompletion and not more call
IofCompleteRequest for this Irp - system not access
Tail.Overlay.Thread at all. and you not need reference it in this
case.
and exist else one reason, why reference thread is senseless. system
access Tail.Overlay.Thread only for insert Apc to him - for call
final part (IopCompleteRequest) of Irp destruction in original
thread context. really this need only for user mode Irp's requests,
where buffers and iosb located in user mode and valid only in
context of process (original thread ). but if thread is terminated -
call of KeInsertQueueApc fail - system not let insert apc to
died thread. as result IopCompleteRequest will be not called and
resources not freed.
so you or dereference Tail.Overlay.Thread too early or you not need do this at all. and reference for died thread anyway not help. in all case what you doing is error.
you can try do next here:
PETHREAD Thread = Irp->Tail.Overlay.Thread;
IofCompleteRequest(Irp, IO_NO_INCREMENT);// here Thread will be referenced
ObfDereferenceObject(Thread);
return StopCompletion;
A second call to IofCompleteRequest causes the I/O manager to resume calling the IRP's completion. here io manager and access Tail.Overlay.Thread insert Apc to him. and finally you call ObfDereferenceObject(Thread); already after system access it and return StopCompletion for break first call to IofCompleteRequest. look like correct but.. if thread already terminated, how i explain in 3 this will be error, because KeInsertQueueApc fail. for extended test - call IofCallDriver from separate thread and just exit from it. and in completion run next code:
// Completion-routine test body. If the IRP's originating thread is
// terminating, try to queue a kernel APC to it and log when that fails.
// Then complete the IRP again, drop the thread reference, and free the MDL
// and the IRP by hand when Irp->MdlAddress is unchanged afterwards.
PETHREAD Thread = Irp->Tail.Overlay.Thread;
if (PsIsThreadTerminating(Thread))
{
DbgPrint("ThreadTerminating\n");
if (PKAPC Apc = (PKAPC)ExAllocatePool(NonPagedPool, sizeof(KAPC)))
{
KeInitializeApc(Apc, Thread, 0, KernelRoutine, 0, 0, KernelMode, 0);
if (!KeInsertQueueApc(Apc, 0, 0, IO_NO_INCREMENT))
{
// Insertion failed: KernelRoutine will not run, so the APC is freed here.
DbgPrint("!KeInsertQueueApc\n");
ExFreePool(Apc);
}
}
}
// Remember the MDL pointer so it can be compared after IofCompleteRequest().
PMDL MdlAddress = Irp->MdlAddress;
IofCompleteRequest(Irp, IO_NO_INCREMENT);
ObfDereferenceObject(Thread);
if (MdlAddress == Irp->MdlAddress)
{
// IopCompleteRequest not called due KeInsertQueueApc fail
DbgPrint("!!!!!!!!!!!\n");
IoFreeMdl(MdlAddress);
IoFreeIrp(Irp);
}
return StopCompletion;
//---------------
// Kernel routine of the probe APC: logs that it ran and releases the KAPC
// block. Only the APC pointer is used; the remaining arguments are ignored.
VOID KernelRoutine (
    PKAPC Apc,
    PKNORMAL_ROUTINE * /*NormalRoutine*/,
    PVOID * /*NormalContext*/,
    PVOID * /*SystemArgument1*/,
    PVOID * /*SystemArgument2*/)
{
    DbgPrint("KernelRoutine(%p)\n", Apc);

    // The APC object is owned by this routine once it has been delivered.
    ExFreePool(Apc);
}
and you should get the following debug output:
ThreadTerminating
!KeInsertQueueApc
!!!!!!!!!!!
and KernelRoutine will not be called (and neither will IopCompleteRequest) - there is no print from it.
So what is the correct solution? This is of course not documented anywhere; it is based on a deep understanding of the internals. You do not need to reference the original thread. You need to do the following:
// Point the IRP at the thread that is running this completion routine instead
// of the thread that originally issued the request.
Irp->Tail.Overlay.Thread = KeGetCurrentThread();
// Let the I/O manager carry on with completion of the IRP.
return ContinueCompletion;
You can safely change Tail.Overlay.Thread if you have no pointers that are valid only in the original process context. This is true for kernel-mode requests: all your buffers are in kernel mode and are valid in any context. And of course you should not break the IRP's destruction but let it continue, so that the MDL and all other IRP resources are freed correctly; finally, the system calls IoFreeIrp for you.
And again about the iosb pointer. As I said, passing the address of a local variable is an error if you exit from the function before the IRP is completed (and this iosb is accessed). If you break the IRP's destruction, the iosb will of course not be accessed, but in that case it is much better to pass a null pointer as the iosb. (If you later change something and the iosb pointer does get accessed, it will be the worst kind of error - arbitrary memory corruption with unpredictable effects - and investigating such a crash will be very, very hard.) But if you have a completion routine, you do not need a separate iosb at all: you have the IRP in the completion routine and can directly access its internal iosb - why would you need another one? So the best solution is to do the following:
// Use the IRP's own IoStatus block as the user iosb, so no separate
// IO_STATUS_BLOCK has to outlive the request.
Irp->UserIosb = &Irp->IoStatus;
A full, correct example of how to read a file asynchronously:
// Completion routine for the read issued by DoTest.
//   Irp - the completed request; its IoStatus holds the result.
//   bio - context set by DoTest; the reference taken before the request was
//         sent is released here.
// Returns ContinueCompletion so the I/O manager keeps completing the IRP.
NTSTATUS DemoCompletion (PDEVICE_OBJECT /*DeviceObject*/, PIRP Irp, BIO* bio)
{
    DbgPrint("DemoCompletion(p=%x mdl=%p)\n", Irp->PendingReturned, Irp->MdlAddress);

    // Report the outcome stored in the IRP's own status block.
    const NTSTATUS CompletionStatus = Irp->IoStatus.Status;
    const ULONG_PTR BytesTransferred = Irp->IoStatus.Information;
    bio->CheckResult(CompletionStatus, BytesTransferred);

    // Drop the reference that was added for the in-flight request.
    bio->Release();

    // Retarget the IRP at the thread running this routine before the I/O
    // manager continues with completion.
    Irp->Tail.Overlay.Thread = KeGetCurrentThread();

    return ContinueCompletion;
}
// Issues one asynchronous PAGE_SIZE read at offset 0 of
// \Device\HarddiskVolume2 and prints the status returned by the driver call.
// The incoming value of buf is not used; the parameter is overwritten with
// the buffer allocated by the BIO context. The result of the read itself is
// reported by DemoCompletion.
VOID DoTest (PVOID buf)
{
    UNICODE_STRING ObjectName = RTL_CONSTANT_STRING(L"\\Device\\HarddiskVolume2");
    OBJECT_ATTRIBUTES oa = { sizeof(oa), 0, &ObjectName, OBJ_CASE_INSENSITIVE };
    PFILE_OBJECT FileObject;

    NTSTATUS status = GetDeviceObjectPointer(&oa, &FileObject);

    if (status >= 0)
    {
        // Every failure from here on is an allocation failure.
        status = STATUS_INSUFFICIENT_RESOURCES;

        BIO* bio = new BIO(FileObject);

        if (bio)
        {
            buf = bio->AllocBuffer(PAGE_SIZE);

            if (buf)
            {
                LARGE_INTEGER StartOffset = {};
                PDEVICE_OBJECT TargetDevice = IoGetRelatedDeviceObject(FileObject);

                PIRP Irp = IoBuildAsynchronousFsdRequest(IRP_MJ_READ, TargetDevice, buf, PAGE_SIZE, &StartOffset, 0);

                if (Irp)
                {
                    // The IRP carries its own status block; no thread is
                    // recorded here (DemoCompletion assigns one).
                    Irp->Tail.Overlay.Thread = 0;
                    Irp->UserIosb = &Irp->IoStatus;

                    // Extra reference for the request in flight; released in
                    // DemoCompletion.
                    bio->AddRef();

                    PIO_STACK_LOCATION NextSp = IoGetNextIrpStackLocation(Irp);
                    NextSp->FileObject = FileObject;
                    NextSp->Context = bio;
                    NextSp->CompletionRoutine = (PIO_COMPLETION_ROUTINE)DemoCompletion;
                    NextSp->Control = SL_INVOKE_ON_CANCEL|SL_INVOKE_ON_ERROR|SL_INVOKE_ON_SUCCESS;

                    status = IofCallDriver(TargetDevice, Irp);
                }
            }

            // Drop the creation reference.
            bio->Release();
        }

        // The BIO holds its own reference on the file object.
        ObfDereferenceObject(FileObject);
    }

    DbgPrint("DoTest=%x\n", status);
}
struct BIO
{
PVOID Buffer;
PFILE_OBJECT FileObject;
LONG dwRef;
void AddRef()
{
InterlockedIncrement(&dwRef);
}
void Release()
{
if (!InterlockedDecrement(&dwRef))
{
delete this;
}
}
void* operator new(size_t cb)
{
return ExAllocatePool(PagedPool, cb);
}
void operator delete(void* p)
{
ExFreePool(p);
}
BIO(PFILE_OBJECT FileObject) : FileObject(FileObject), Buffer(0), dwRef(1)
{
DbgPrint("%s<%p>(%p)\n", __FUNCTION__, this, FileObject);
ObfReferenceObject(FileObject);
}
~BIO()
{
if (Buffer)
{
ExFreePool(Buffer);
}
ObfDereferenceObject(FileObject);
DbgPrint("%s<%p>(%p)\n", __FUNCTION__, this, FileObject);
}
PVOID AllocBuffer(ULONG NumberOfBytes)
{
return Buffer = ExAllocatePool(PagedPool, NumberOfBytes);
}
void CheckResult(NTSTATUS status, ULONG_PTR Information)
{
DbgPrint("CheckResult:status = %x, info = %p\n", status, Information);
if (0 <= status)
{
if (ULONG_PTR cb = min(16, Information))
{
char buf[64], *sz = buf;
PBYTE pb = (PBYTE)Buffer;
do sz += sprintf(sz, "%02x ", *pb++); while (--cb); sz[-1]= '\n';
DbgPrint(buf);
}
}
}
};
// Opens the object named by poa for FILE_READ_DATA with
// FILE_NO_INTERMEDIATE_BUFFERING and returns a referenced file object.
//   poa        - object attributes naming the object to open
//   FileObject - receives the referenced PFILE_OBJECT on success; the caller
//                releases it with ObfDereferenceObject
// Returns the status of IoCreateFile on open failure, otherwise the status of
// ObReferenceObjectByHandle. The temporary handle is always closed.
NTSTATUS GetDeviceObjectPointer(POBJECT_ATTRIBUTES poa, PFILE_OBJECT *FileObject )
{
    IO_STATUS_BLOCK OpenIosb;
    HANDLE Handle;

    NTSTATUS status = IoCreateFile(&Handle, FILE_READ_DATA, poa, &OpenIosb, 0, 0,
        FILE_SHARE_VALID_FLAGS, FILE_OPEN, FILE_NO_INTERMEDIATE_BUFFERING, 0, 0, CreateFileTypeNone, 0, 0);

    if (status < 0)
    {
        return status;
    }

    // Convert the handle into an object reference, then drop the handle.
    status = ObReferenceObjectByHandle(Handle, 0, *IoFileObjectType, KernelMode, (void**)FileObject, 0);
    NtClose(Handle);

    return status;
}
and output:
BIO::BIO<FFFFC000024D4870>(FFFFE00001BAAB70)
DoTest=103
DemoCompletion(p=1 mdl=FFFFE0000200EE70)
CheckResult:status = 0, info = 0000000000001000
eb 52 90 4e 54 46 53 20 20 20 20 00 02 08 00 00
BIO::~BIO<FFFFC000024D4870>(FFFFE00001BAAB70)
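The bytes eb 52 90 4e 54 46 53 (a jump instruction followed by the ASCII text "NTFS", i.e. the start of an NTFS boot sector) were read correctly.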

Resources