My linked list has this memory layout:
[dummy] -> (prev, ?DUMMY?, next) <-> (prev, A, next) <-> (prev, B, next)
            ^                                                      ^
            +------------------------------------------------------+
The dummy node is lazily initialized upon insertion, with its prev and next pointers pointing to itself. After insertion, its prev and next pointers act as the tail and head pointers, respectively.
use std::marker::PhantomData;
use self::node::NodePtr;
mod node;
#[derive(Debug)]
pub struct LinkedList<T> {
dummy: Option<NodePtr<T>>,
len: usize,
_phantom: PhantomData<T>,
}
impl<T> Default for LinkedList<T> {
fn default() -> Self {
Self {
dummy: None,
len: 0,
_phantom: PhantomData,
}
}
}
impl<T> LinkedList<T> {
pub fn new() -> Self {
Default::default()
}
pub(crate) fn inner(&mut self) -> NodePtr<T> {
*self.dummy.get_or_insert(NodePtr::dummy())
}
pub fn len(&self) -> usize {
self.len
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn push_front(&mut self, elem: T) {
let dummy = self.inner();
let head = dummy.next();
let new_head = NodePtr::alloc(dummy, elem, head);
head.set_prev(new_head);
dummy.set_next(new_head);
self.len += 1;
}
pub fn pop_front(&mut self) -> Option<T> {
let dummy = self.inner();
unsafe { dummy.next().dealloc() }.map(|(_, elem, new_head)| {
dummy.set_next(new_head);
new_head.set_prev(dummy);
self.len -= 1;
elem
})
}
}
impl<T> Drop for LinkedList<T> {
fn drop(&mut self) {
while self.pop_front().is_some() {}
let dummy = self.dummy.take().unwrap();
unsafe {
let _ = Box::from_raw(dummy.as_ptr());
}
}
}
node.rs
use std::{
fmt::Debug,
mem::MaybeUninit,
ops::Not,
ptr::{self, NonNull},
};
#[derive(Debug)]
pub struct Node<T> {
prev: NodePtr<T>,
next: NodePtr<T>,
elem: MaybeUninit<T>,
}
pub struct NodePtr<T> {
ptr: NonNull<Node<T>>,
}
impl<T> Debug for NodePtr<T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("NodePtr").field("ptr", &self.ptr).finish()
}
}
impl<T> Clone for NodePtr<T> {
fn clone(&self) -> Self {
Self { ptr: self.ptr }
}
}
impl<T> Copy for NodePtr<T> {}
impl<T> PartialEq for NodePtr<T> {
fn eq(&self, other: &Self) -> bool {
ptr::eq(self.as_ptr(), other.as_ptr())
}
}
impl<T> Eq for NodePtr<T> {}
impl<T> NodePtr<T> {
pub unsafe fn dangling() -> Self {
Self {
ptr: NonNull::dangling(),
}
}
pub unsafe fn raw_alloc(prev: Self, elem: MaybeUninit<T>, next: Self) -> Self {
let ptr = Box::into_raw(Box::new(Node { prev, next, elem }));
let ptr = NonNull::new_unchecked(ptr);
Self { ptr }
}
pub fn alloc(prev: Self, elem: T, next: Self) -> Self {
unsafe { Self::raw_alloc(prev, MaybeUninit::new(elem), next) }
}
pub fn dummy() -> Self {
unsafe {
let dangling = Self::dangling();
let dummy = Self::raw_alloc(dangling, MaybeUninit::uninit(), dangling);
dummy.set_prev(dummy);
dummy.set_next(dummy);
dummy
}
}
pub fn prev(self) -> Self {
unsafe { (*self.as_ptr()).prev }
}
pub fn set_prev(self, ptr: Self) {
unsafe {
(*self.as_ptr()).prev = ptr;
}
}
pub fn next(self) -> Self {
unsafe { (*self.as_ptr()).next }
}
pub fn set_next(self, ptr: Self) {
unsafe {
(*self.as_ptr()).next = ptr;
}
}
pub fn is_dummy(self) -> bool {
self.prev() == self.next() && self.prev() == self
}
pub fn as_ptr(self) -> *mut Node<T> {
self.ptr.as_ptr()
}
pub unsafe fn as_ref<'a>(self) -> &'a Node<T> {
self.ptr.as_ref()
}
pub unsafe fn as_mut<'a>(mut self) -> &'a mut Node<T> {
self.ptr.as_mut()
}
pub fn get_raw(self) -> Option<NonNull<T>> {
self.is_dummy()
.not()
.then(|| unsafe { NonNull::new_unchecked((*self.as_ptr()).elem.as_mut_ptr()) })
}
pub unsafe fn get<'a>(self) -> Option<&'a T> {
self.get_raw().map(|ptr| ptr.as_ref())
}
pub unsafe fn get_mut<'a>(self) -> Option<&'a mut T> {
self.get_raw().map(|mut ptr| ptr.as_mut())
}
pub unsafe fn dealloc(self) -> Option<(Self, T, Self)> {
self.is_dummy().not().then(|| {
// println!("Deallocating...");
let Node { prev, next, elem } = *Box::from_raw(self.as_ptr());
(prev, elem.assume_init(), next)
})
}
}
Running the test cases with miri:
#[cfg(test)]
mod test {
use super::{node::NodePtr, LinkedList};
#[test]
fn test_node() {
let dummy = NodePtr::dummy();
let node = NodePtr::alloc(dummy, 100, dummy);
dummy.set_next(node);
dummy.set_prev(node);
let (_, elem, _) = unsafe { node.dealloc() }.unwrap();
unsafe {
Box::from_raw(dummy.as_ptr());
}
assert_eq!(elem, 100);
}
#[test]
fn test_basic_front() {
let mut list = LinkedList::new();
// Try to break an empty list
assert_eq!(list.len(), 0);
assert_eq!(list.pop_front(), None);
assert_eq!(list.len(), 0);
// Try to break a one item list
list.push_front(10);
assert_eq!(list.len(), 1);
assert_eq!(list.pop_front(), Some(10));
assert_eq!(list.len(), 0);
assert_eq!(list.pop_front(), None);
assert_eq!(list.len(), 0);
// Mess around
list.push_front(10);
assert_eq!(list.len(), 1);
list.push_front(20);
assert_eq!(list.len(), 2);
list.push_front(30);
assert_eq!(list.len(), 3);
assert_eq!(list.pop_front(), Some(30));
assert_eq!(list.len(), 2);
list.push_front(40);
assert_eq!(list.len(), 3);
assert_eq!(list.pop_front(), Some(40));
assert_eq!(list.len(), 2);
assert_eq!(list.pop_front(), Some(20));
assert_eq!(list.len(), 1);
assert_eq!(list.pop_front(), Some(10));
assert_eq!(list.len(), 0);
assert_eq!(list.pop_front(), None);
assert_eq!(list.len(), 0);
assert_eq!(list.pop_front(), None);
assert_eq!(list.len(), 0);
}
}
miri output:
The following memory was leaked: alloc86121 (Rust heap, size: 24, align: 8) {
0x00 │ ╾0x1f2da8[a86121]<205762> (8 ptr bytes)╼ ╾0x1f2da8[a86121]<205762> (8 ptr bytes)╼ │ ╾──────╼╾──────╼
0x10 │ __ __ __ __ __ __ __ __ │ ░░░░░░░░
}
alloc86346 (Rust heap, size: 24, align: 8) {
0x00 │ ╾0x1f3ef0[a86346]<206225> (8 ptr bytes)╼ ╾0x1f3ef0[a86346]<206225> (8 ptr bytes)╼ │ ╾──────╼╾──────╼
0x10 │ __ __ __ __ __ __ __ __ │ ░░░░░░░░
}
alloc86565 (Rust heap, size: 24, align: 8) {
0x00 │ ╾0x1f4ff8[a86565]<206712> (8 ptr bytes)╼ ╾0x1f4ff8[a86565]<206712> (8 ptr bytes)╼ │ ╾──────╼╾──────╼
0x10 │ __ __ __ __ __ __ __ __ │ ░░░░░░░░
}
alloc86723 (Rust heap, size: 24, align: 8) {
0x00 │ ╾0x1f5c58[a86723]<207063> (8 ptr bytes)╼ ╾0x1f5c58[a86723]<207063> (8 ptr bytes)╼ │ ╾──────╼╾──────╼
0x10 │ __ __ __ __ __ __ __ __ │ ░░░░░░░░
}
alloc86946 (Rust heap, size: 24, align: 8) {
0x00 │ ╾0x1f6da8[a86946]<207524> (8 ptr bytes)╼ ╾0x1f6da8[a86946]<207524> (8 ptr bytes)╼ │ ╾──────╼╾──────╼
0x10 │ __ __ __ __ __ __ __ __ │ ░░░░░░░░
}
alloc87169 (Rust heap, size: 24, align: 8) {
0x00 │ ╾0x1f7f60[a87169]<207985> (8 ptr bytes)╼ ╾0x1f7f60[a87169]<207985> (8 ptr bytes)╼ │ ╾──────╼╾──────╼
0x10 │ __ __ __ __ __ __ __ __ │ ░░░░░░░░
}
alloc87393 (Rust heap, size: 24, align: 8) {
0x00 │ ╾0x1f9100[a87393]<208448> (8 ptr bytes)╼ ╾0x1f9100[a87393]<208448> (8 ptr bytes)╼ │ ╾──────╼╾──────╼
0x10 │ __ __ __ __ __ __ __ __ │ ░░░░░░░░
}
alloc87599 (Rust heap, size: 24, align: 8) {
0x00 │ ╾0x1fa1b0[a87599]<208910> (8 ptr bytes)╼ ╾0x1fa1b0[a87599]<208910> (8 ptr bytes)╼ │ ╾──────╼╾──────╼
0x10 │ __ __ __ __ __ __ __ __ │ ░░░░░░░░
}
alloc87823 (Rust heap, size: 24, align: 8) {
0x00 │ ╾0x1fb2f8[a87823]<209373> (8 ptr bytes)╼ ╾0x1fb2f8[a87823]<209373> (8 ptr bytes)╼ │ ╾──────╼╾──────╼
0x10 │ __ __ __ __ __ __ __ __ │ ░░░░░░░░
}
alloc88030 (Rust heap, size: 24, align: 8) {
0x00 │ ╾0x1fc3a8[a88030]<209837> (8 ptr bytes)╼ ╾0x1fc3a8[a88030]<209837> (8 ptr bytes)╼ │ ╾──────╼╾──────╼
0x10 │ __ __ __ __ __ __ __ __ │ ░░░░░░░░
}
alloc88237 (Rust heap, size: 24, align: 8) {
0x00 │ ╾0x1fd3a8[a88237]<210301> (8 ptr bytes)╼ ╾0x1fd3a8[a88237]<210301> (8 ptr bytes)╼ │ ╾──────╼╾──────╼
0x10 │ __ __ __ __ __ __ __ __ │ ░░░░░░░░
}
alloc88454 (Rust heap, size: 24, align: 8) {
0x00 │ ╾0x1fe4e8[a88454]<210788> (8 ptr bytes)╼ ╾0x1fe4e8[a88454]<210788> (8 ptr bytes)╼ │ ╾──────╼╾──────╼
0x10 │ __ __ __ __ __ __ __ __ │ ░░░░░░░░
}
alloc88613 (Rust heap, size: 24, align: 8) {
0x00 │ ╾0x1ff160[a88613]<211141> (8 ptr bytes)╼ ╾0x1ff160[a88613]<211141> (8 ptr bytes)╼ │ ╾──────╼╾──────╼
0x10 │ __ __ __ __ __ __ __ __ │ ░░░░░░░░
}
alloc88773 (Rust heap, size: 24, align: 8) {
0x00 │ ╾0x1ffe18[a88773]<211496> (8 ptr bytes)╼ ╾0x1ffe18[a88773]<211496> (8 ptr bytes)╼ │ ╾──────╼╾──────╼
0x10 │ __ __ __ __ __ __ __ __ │ ░░░░░░░░
}
I get 14 memory leaks, but running the test_node case alone doesn't leak any memory. With some further debugging, I can confirm that NodePtr::dealloc runs properly ("Deallocating" was printed 5 times, as expected; note that this method doesn't deallocate the dummy node, since I deallocate the dummy node manually in the Drop impl). The logic itself seems to work correctly. Could this be some kind of subtle undefined behavior?
Here is your memory leak:
pub(crate) fn inner(&mut self) -> NodePtr<T> {
*self.dummy.get_or_insert(NodePtr::dummy())
}
NodePtr::dummy() allocates a node, returns the raw pointer, and never destroys it. The problem is that, as written, the argument to get_or_insert is evaluated unconditionally, so dummy() runs (and allocates) on every call to inner(), even when self.dummy is already Some. Only the very first allocation is actually stored and later freed; the rest leak.
This is the fix:
pub(crate) fn inner(&mut self) -> NodePtr<T> {
*self.dummy.get_or_insert_with(|| NodePtr::dummy())
}
How I found it
When looking at the output of RUSTFLAGS=-Zsanitizer=address cargo +nightly test, I realized that the leaked memory is always allocated in NodePtr::raw_alloc():
Direct leak of 24 byte(s) in 1 object(s) allocated from:
#0 0x55c80616ed8e in malloc /rustc/llvm/src/llvm-project/compiler-rt/lib/asan/asan_malloc_linux.cpp:69:3
#1 0x55c8061a27f9 in alloc::alloc::alloc::ha05f706f7dab9e22 /rustc/06754d8852bea286a3a76d373ccd17e66afb5a8b/library/alloc/src/alloc.rs:171:73
#2 0x55c8061a27f9 in alloc::alloc::Global::alloc_impl::h5cfa6b00beb3203a /rustc/06754d8852bea286a3a76d373ccd17e66afb5a8b/library/alloc/src/alloc.rs:171:73
#3 0x55c8061a1e3d in _$LT$alloc..alloc..Global$u20$as$u20$core..alloc..Allocator$GT$::allocate::h6ea44ef9df335707 /rustc/06754d8852bea286a3a76d373ccd17e66afb5a8b/library/alloc/src/alloc.rs:320:11
#4 0x55c8061a1e3d in alloc::alloc::exchange_malloc::h91c76b5bf53597fd /rustc/06754d8852bea286a3a76d373ccd17e66afb5a8b/library/alloc/src/alloc.rs:320:11
#5 0x55c80619d43f in alloc::boxed::Box$LT$T$GT$::new::hba6cd0555301746a /rustc/06754d8852bea286a3a76d373ccd17e66afb5a8b/library/alloc/src/boxed.rs:215:9
#6 0x55c80619d43f in rust_tmp::node::NodePtr$LT$T$GT$::raw_alloc::hc476ac99ddaa5b56 /home/martin/work/rust-tmp/src/node.rs:49:33
#7 0x55c80619d72b in rust_tmp::node::NodePtr$LT$T$GT$::dummy::h8b4ac6a33651e5fd /home/martin/work/rust-tmp/src/node.rs:62:25
#8 0x55c806199a04 in rust_tmp::test::test_basic_front::hbb4f6a4888143b15 /home/martin/work/rust-tmp/src/lib.rs:104:20
#9 0x55c80619fbe9 in rust_tmp::test::test_basic_front::_$u7b$$u7b$closure$u7d$$u7d$::h970eafca0aaa30ba /home/martin/work/rust-tmp/src/lib.rs:93:5
#10 0x55c8061d94b2 in core::ops::function::FnOnce::call_once::ha04ab83b16462938 /rustc/06754d8852bea286a3a76d373ccd17e66afb5a8b/library/core/src/ops/function.rs:248:5
#11 0x55c8061d94b2 in test::__rust_begin_short_backtrace::h994ad9d2e6435f98 /rustc/06754d8852bea286a3a76d373ccd17e66afb5a8b/library/test/src/lib.rs:572:5
Then, I reduced the usage down to a minimum. I realized it already happens with a single push_front():
fn main() {
let mut list = LinkedList::new();
list.push_front(10);
dbg!(list.pop_front());
dbg!(list.pop_front());
}
I then added println()s to all allocations and deallocations:
pub unsafe fn raw_alloc(prev: Self, elem: MaybeUninit<T>, next: Self) -> Self {
let ptr = Box::into_raw(Box::new(Node { prev, next, elem }));
println!("raw_alloc: {:p}", ptr);
let ptr = NonNull::new_unchecked(ptr);
Self { ptr }
}
// ...
pub unsafe fn dealloc(self) -> Option<(Self, T, Self)> {
self.is_dummy().not().then(|| {
let ptr = self.as_ptr();
println!("dealloc: {:p}", ptr);
let Node { prev, next, elem } = *Box::from_raw(ptr);
(prev, elem.assume_init(), next)
})
}
impl<T> Drop for LinkedList<T> {
fn drop(&mut self) {
while self.pop_front().is_some() {}
let dummy = self.dummy.take().unwrap();
unsafe {
let ptr = dummy.as_ptr();
println!("drop: {:p}", ptr);
let _ = Box::from_raw(ptr);
}
}
}
I saw the following output when running the main() shown earlier:
raw_alloc: 0x603000000070
raw_alloc: 0x6030000000a0
raw_alloc: 0x6030000000d0
dealloc: 0x6030000000a0
[src/main.rs:78] list.pop_front() = Some(
10,
)
raw_alloc: 0x603000000100
[src/main.rs:79] list.pop_front() = None
raw_alloc: 0x603000000130
drop: 0x603000000070
The allocations 0x603000000070 and 0x6030000000a0 seem fine; those are the dummy node and the first element node. The respective deallocations also match: dealloc: 0x6030000000a0 and drop: 0x603000000070. But there are three more allocations that shouldn't be there: 0x6030000000d0, 0x603000000100 and 0x603000000130.
What I did next was to set a breakpoint in raw_alloc() using VS Code and the amazing rust-analyzer plugin. I ran in debug, hit the first raw_alloc() for the dummy element, then the second for the first node element, and then the third. I looked at the call stack of the third and saw:
rust_tmp::node::NodePtr<T>::raw_alloc
rust_tmp::node::NodePtr<T>::dummy
rust_tmp::LinkedList<T>::inner
rust_tmp::LinkedList<T>::pop_front
rust_tmp::main
I then thought, 'why does dummy() get called again in inner()?', and that's when I noticed the get_or_insert().
I hope this analysis helps to improve your own debugging skills :)
Related
I'm trying to get the quota state of a volume in Windows using the Win32 API through the IDiskQuotaControl interface.
The problem I ran into seems to be related to the initialization. Please see the code below.
//go:build windows && amd64
package main
import (
"flag"
"fmt"
"runtime"
"sync"
"syscall"
"unsafe"
"golang.org/x/sys/windows"
)
// START OF basic stuff
const COINIT_APARTMENTTHREADED = 0x2
const CLSCTX_INPROC_SERVER = 0x1
const CLSCTX_INPROC_HANDLER = 0x2
const CLSCTX_LOCAL_SERVER = 0x4
var (
modole32 = syscall.NewLazyDLL("ole32.dll")
procCoInitializeEx = modole32.NewProc("CoInitializeEx")
procCoUninitialize = modole32.NewProc("CoUninitialize")
procCoCreateInstance = modole32.NewProc("CoCreateInstance")
)
func CoInitializeEx(pvReserved uintptr, dwCoInit uint32) (r1, r2 uintptr, lastErr error) {
r1, r2, lastErr = procCoInitializeEx.Call(
uintptr(pvReserved),
uintptr(dwCoInit),
)
return
}
func CoUninitialize() (r1, r2 uintptr, lastErr error) {
r1, r2, lastErr = procCoUninitialize.Call()
return
}
func CoCreateInstance(rclsid *GUID, pUnkOuter *byte, dwClsContext uint32, riid *GUID, ppv *uintptr) (r1, r2 uintptr, lastErr error) {
r1, r2, lastErr = procCoCreateInstance.Call(
uintptr(unsafe.Pointer(rclsid)),
uintptr(unsafe.Pointer(pUnkOuter)),
uintptr(dwClsContext),
uintptr(unsafe.Pointer(riid)),
uintptr(unsafe.Pointer(ppv)),
)
return
}
type GUID struct {
Data1 uint32
Data2 uint16
Data3 uint16
Data4 [8]byte
}
type IDiskQuotaControl struct {
lpVtbl *IDiskQuotaControlVtbl
}
type IDiskQuotaControlVtbl struct {
QueryInterface uintptr
AddRef uintptr
Release uintptr
Initialize uintptr
SetQuotaState uintptr
GetQuotaState uintptr
SetQuotaLogFlags uintptr
GetQuotaLogFlags uintptr
SetDefaultQuotaThreshold uintptr
GetDefaultQuotaThreshold uintptr
GetDefaultQuotaThresholdText uintptr
SetDefaultQuotaLimit uintptr
GetDefaultQuotaLimit uintptr
GetDefaultQuotaLimitText uintptr
AddUserSid uintptr
AddUserName uintptr
DeleteUser uintptr
FindUserSid uintptr
FindUserName uintptr
CreateEnumUsers uintptr
CreateUserBatch uintptr
InvalidateSidNameCache uintptr
GiveUserNameResolutionPriority uintptr
ShutdownNameResolution uintptr
}
func (x *IDiskQuotaControl) AddRef() (r1, r2 uintptr, lastErr error) {
r1, r2, lastErr = syscall.SyscallN(
x.lpVtbl.AddRef,
uintptr(unsafe.Pointer(x)),
)
return
}
func (x *IDiskQuotaControl) Release() (r1, r2 uintptr, lastErr error) {
r1, r2, lastErr = syscall.SyscallN(
x.lpVtbl.Release,
uintptr(unsafe.Pointer(x)),
)
return
}
func (x *IDiskQuotaControl) Initialize(pszPath *uint16, bReadWrite int32) (r1, r2 uintptr, lastErr error) {
r1, r2, lastErr = syscall.SyscallN(
x.lpVtbl.Initialize,
uintptr(unsafe.Pointer(x)),
uintptr(unsafe.Pointer(pszPath)),
uintptr(bReadWrite),
)
return
}
func (x *IDiskQuotaControl) GetQuotaState(pdwState *uint32) (r1, r2 uintptr, lastErr error) {
r1, r2, lastErr = syscall.SyscallN(
x.lpVtbl.GetQuotaState,
uintptr(unsafe.Pointer(x)),
uintptr(unsafe.Pointer(pdwState)),
)
return
}
func (x *IDiskQuotaControl) GetDefaultQuotaLimit(pllLimit *int64) (r1, r2 uintptr, lastErr error) {
r1, r2, lastErr = syscall.SyscallN(
x.lpVtbl.GetDefaultQuotaLimit,
uintptr(unsafe.Pointer(x)),
uintptr(unsafe.Pointer(pllLimit)),
)
return
}
var CLSID_DiskQuotaControl = &GUID{0x7988b571, 0xec89, 0x11cf, [8]byte{0x9c, 0x0, 0x0, 0xaa, 0x0, 0xa1, 0x4f, 0x56}}
var IID_IDiskQuotaControl = &GUID{0x7988b572, 0xec89, 0x11cf, [8]byte{0x9c, 0x0, 0x0, 0xaa, 0x0, 0xa1, 0x4f, 0x56}}
// END OF basic stuff
func getVolumeQuota(wg *sync.WaitGroup, volume string) {
defer wg.Done()
runtime.LockOSThread()
defer runtime.UnlockOSThread()
// Init COM
r1, r2, lastErr := CoInitializeEx(
0, // must be NULL
COINIT_APARTMENTTHREADED,
)
defer CoUninitialize()
fmt.Println("CoInitializeEx", r1, r2, lastErr) // Print results for debug
// Create a COM instance
var ppv uintptr
r1, r2, lastErr = CoCreateInstance(
CLSID_DiskQuotaControl,
nil,
CLSCTX_INPROC_SERVER,
IID_IDiskQuotaControl,
&ppv,
)
fmt.Println("CoCreateInstance", r1, r2, lastErr)
diskQuotaControl := (*IDiskQuotaControl)(unsafe.Pointer(ppv))
defer diskQuotaControl.Release()
pszPath, err := windows.UTF16PtrFromString(volume)
if err != nil {
panic(err)
}
// Initialize seems to fail even though the return indicates success
r1, r2, lastErr = diskQuotaControl.Initialize(
pszPath,
0, // false => read only
)
fmt.Println("Initialize", r1, r2, lastErr)
var pdwState uint32
r1, r2, lastErr = diskQuotaControl.GetQuotaState(
&pdwState,
)
fmt.Println("GetQuotaState", r1, r2, lastErr)
fmt.Println(pdwState)
var pllLimit int64
r1, r2, lastErr = diskQuotaControl.GetDefaultQuotaLimit(
&pllLimit,
)
fmt.Println("GetDefaultQuotaLimit", r1, r2, lastErr)
fmt.Println(pllLimit)
}
func main() {
volume := flag.String("v", `C:\`, "volume")
flag.Parse()
fmt.Println("Volume", *volume)
var wg sync.WaitGroup
wg.Add(1)
go getVolumeQuota(&wg, *volume)
wg.Wait()
}
The reason I think the problem is in Initialize is that whatever input I pass (using the -v flag), I always get a success return.
Results:
Volume C:\
CoInitializeEx 0 0 The operation completed successfully.
CoCreateInstance 0 0 The operation completed successfully.
Initialize 0 9298000 The operation completed successfully.
GetQuotaState 2147942403 9298000 The system cannot find the path specified.
0
GetDefaultQuotaLimit 2147942421 9298000 The operation completed successfully.
0
Volume \\?\C:\
CoInitializeEx 0 0 The operation completed successfully.
CoCreateInstance 0 0 The operation completed successfully.
Initialize 0 9103168 The operation completed successfully.
GetQuotaState 2147942403 9103168 The system cannot find the path specified.
0
GetDefaultQuotaLimit 2147942421 9103168 The operation completed successfully.
0
Volume Invalid:Volume://+
CoInitializeEx 0 0 The operation completed successfully.
CoCreateInstance 0 0 The operation completed successfully.
Initialize 0 8709600 The operation completed successfully.
GetQuotaState 2147942403 8709600 The system cannot find the path specified.
0
GetDefaultQuotaLimit 2147942421 8709600 The operation completed successfully.
0
The error in GetDefaultQuotaLimit (2147942421 = 0x80070015) translates to "This object has not been initialized"
The IDiskQuotaControl interface documentation states:
The IDiskQuotaControl interface inherits from the IUnknown interface.
This is wrong! IDiskQuotaControl derives from IConnectionPointContainer, not directly from IUnknown. You can see that in dskquota.h:
DECLARE_INTERFACE_IID_(IDiskQuotaControl, IConnectionPointContainer, "7988B572-EC89-11cf-9C00-00AA00A14F56")
So your vtable definition is wrong: you must add two methods between your Release and Initialize methods, as sketched below.
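Here is a sketch of the corrected start of the vtable; the two missing entries are the IConnectionPointContainer methods (EnumConnectionPoints and FindConnectionPoint, in that order), and the remaining fields stay exactly as in the question:
type IDiskQuotaControlVtbl struct {
	// IUnknown
	QueryInterface uintptr
	AddRef         uintptr
	Release        uintptr
	// IConnectionPointContainer (these two slots were missing)
	EnumConnectionPoints uintptr
	FindConnectionPoint  uintptr
	// IDiskQuotaControl
	Initialize    uintptr
	SetQuotaState uintptr
	GetQuotaState uintptr
	// ... rest unchanged, in the same order as before
}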
I'm an experienced C++ programmer, used to low-level optimization, and I'm trying to get performance out of Go.
So far, I'm interested in GFlop/s.
I wrote the following go code:
package main
import (
"fmt"
"time"
"runtime"
"sync"
)
func expm1(x float64) float64 {
return ((((((((((((((15.0 + x) * x + 210.0) * x + 2730.0) * x + 32760.0) * x + 360360.0) * x + 3603600.0) * x + 32432400.0) * x + 259459200.0) * x + 1816214400.0) * x + 10897286400.0) * x + 54486432000.0) * x + 217945728000.0) *
x + 653837184000.0) * x + 1307674368000.0) * x * 7.6471637318198164759011319857881e-13;
}
func twelve(x float64) float64 {
return expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1(x))))))))))));
}
func populate(data []float64, N int) {
CPUCOUNT := runtime.NumCPU();
var wg sync.WaitGroup
var slice = N / CPUCOUNT;
wg.Add(CPUCOUNT)
defer wg.Wait()
for i := 0; i < CPUCOUNT; i++ {
go func(ii int) {
for j := ii * slice; j < ii * slice + slice; j += 1 {
data[j] = 0.1;
}
defer wg.Done();
}(i);
}
}
func apply(data []float64, N int) {
CPUCOUNT := runtime.NumCPU();
var wg sync.WaitGroup
var slice = N / CPUCOUNT;
wg.Add(CPUCOUNT)
defer wg.Wait()
for i := 0; i < CPUCOUNT; i++ {
go func(ii int) {
for j := ii * slice; j < ii * slice + slice; j += 8 {
data[j] = twelve(data[j]);
data[j+1] = twelve(data[j+1]);
data[j+2] = twelve(data[j+2]);
data[j+3] = twelve(data[j+3]);
data[j+4] = twelve(data[j+4]);
data[j+5] = twelve(data[j+5]);
data[j+6] = twelve(data[j+6]);
data[j+7] = twelve(data[j+7]);
}
defer wg.Done();
}(i);
}
}
func Run(data []float64, N int) {
populate(data, N);
start:= time.Now();
apply(data, N);
stop:= time.Now();
elapsed:=stop.Sub(start);
seconds := float64(elapsed.Milliseconds()) / 1000.0;
Gflop := float64(N) * 12.0 * 15.0E-9;
fmt.Printf("%f\n", Gflop / seconds);
}
func main() {
CPUCOUNT := runtime.NumCPU();
fmt.Printf("num procs : %d\n", CPUCOUNT);
N := 1024*1024*32 * CPUCOUNT;
data:= make([]float64, N);
for i := 0; i < 100; i++ {
Run(data, N);
}
}
which is an attempt to translate my C++ benchmark, which yields 80% of peak flops.
The C++ version yields 95 GFlop/s, whereas the Go version yields 6 GFlop/s (counting an FMA as 1).
Here is a piece of the Go assembly (gccgo -O3 -mfma -mavx2):
vfmadd132sd %xmm1, %xmm15, %xmm0
.loc 1 12 50
vfmadd132sd %xmm1, %xmm14, %xmm0
.loc 1 12 64
vfmadd132sd %xmm1, %xmm13, %xmm0
.loc 1 12 79
vfmadd132sd %xmm1, %xmm12, %xmm0
.loc 1 12 95
vfmadd132sd %xmm1, %xmm11, %xmm0
.loc 1 12 112
vfmadd132sd %xmm1, %xmm10, %xmm0
And what I get from my C++ code (g++ -fopenmp -mfma -mavx2 -O3):
vfmadd213pd .LC3(%rip), %ymm12, %ymm5
vfmadd213pd .LC3(%rip), %ymm11, %ymm4
vfmadd213pd .LC3(%rip), %ymm10, %ymm3
vfmadd213pd .LC3(%rip), %ymm9, %ymm2
vfmadd213pd .LC3(%rip), %ymm8, %ymm1
vfmadd213pd .LC3(%rip), %ymm15, %ymm0
vfmadd213pd .LC4(%rip), %ymm15, %ymm0
vfmadd213pd .LC4(%rip), %ymm14, %ymm7
vfmadd213pd .LC4(%rip), %ymm13, %ymm6
vfmadd213pd .LC4(%rip), %ymm12, %ymm5
vfmadd213pd .LC4(%rip), %ymm11, %ymm4
I therefore have a few questions, the most important of which is:
Do I express parallelism the right way?
And if not, how should I do that?
For additional performance improvements, I'd need to know what's wrong with the following items:
Why do I see only vfmadd132sd instructions in the assembly, instead of vfmadd132pd?
How can I properly align memory allocations?
How can I remove debug info from the generated executable?
Do I pass the right options to gccgo?
Do I use the right compiler?
Do I express parallelism the right way?
No. You might be thrashing the CPU cache. (But this is hard to tell without knowing details about your system; I guess it's not NUMA?) Anyway, technically your code is concurrent, not parallel.
Why do I see only vfmadd132sd instructions in the assembly, instead of vfmadd132pd?
Because the compiler put it there. Is this a compiler question or a programming question?
How can I properly align memory allocations?
That depends on your definition of "properly". Struct field and slice alignments are not ad hoc controllable, but you can reorder struct fields (you did not use any structs at all, so I do not know what you are asking here); see the sketch below.
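For what it's worth, a minimal sketch of what field reordering buys you (the struct names are made up; sizes are for 64-bit targets): reordering does not change any single field's alignment, but it can eliminate padding.
package main

import (
	"fmt"
	"unsafe"
)

// Badly ordered fields: bool, int64, bool forces 7 bytes of padding after `a`
// plus 7 bytes of trailing padding, 24 bytes total on amd64.
type padded struct {
	a bool
	b int64
	c bool
}

// Same fields, largest first: only 6 bytes of trailing padding, 16 bytes total.
type packed struct {
	b int64
	a bool
	c bool
}

func main() {
	fmt.Println(unsafe.Sizeof(padded{}), unsafe.Sizeof(packed{})) // 24 16
}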
How can I remove debug info from the generated executable?
Consult the documentation of gcc.
Do I pass the right options to gccgo?
I do not know.
Do I use the right compiler?
What makes a compiler "right"?
I'm benchmarking unmarshaling from string to int and uint with this code:
package main
import (
"strconv"
"testing"
)
func BenchmarkUnmarshalInt(b *testing.B) {
for i := 0; i < b.N; i++ {
UnmarshalInt("123456")
}
}
func BenchmarkUnmarshalUint(b *testing.B) {
for i := 0; i < b.N; i++ {
UnmarshalUint("123456")
}
}
func UnmarshalInt(v string) int {
i, _ := strconv.Atoi(v)
return i
}
func UnmarshalUint(v string) uint {
i, _ := strconv.ParseUint(v, 10, 64)
return uint(i)
}
Result:
Running tool: C:\Go\bin\go.exe test -benchmem -run=^$ myBench/main -bench .
goos: windows
goarch: amd64
pkg: myBench/main
BenchmarkUnmarshalInt-8 99994166 11.7 ns/op 0 B/op 0 allocs/op
BenchmarkUnmarshalUint-8 54550413 21.0 ns/op 0 B/op 0 allocs/op
Is it possible that the second (uint) is almost twice as slow as the first (int)?
Yes, it's possible. strconv.Atoi has a fast path when the input string length is less than 19 (or 10 if int is 32 bit). This allows it to be a lot faster because it doesn't need to check for overflow.
If you change your test number to "1234567890123456789" (assuming 64 bit int), then your int benchmark is slightly slower than the uint benchmark because the fast path can't be used. On my machine, it takes 37.6 ns/op for the signed version vs 31.5 ns/op for the unsigned version.
Here's the modified benchmark code (note I added a variable that sums up the parsed results, just in case the compiler got clever and optimized it away).
package main
import (
"fmt"
"strconv"
"testing"
)
const X = "1234567890123456789"
func BenchmarkUnmarshalInt(b *testing.B) {
var T int
for i := 0; i < b.N; i++ {
T += UnmarshalInt(X)
}
fmt.Println(T)
}
func BenchmarkUnmarshalUint(b *testing.B) {
var T uint
for i := 0; i < b.N; i++ {
T += UnmarshalUint(X)
}
fmt.Println(T)
}
func UnmarshalInt(v string) int {
i, _ := strconv.Atoi(v)
return i
}
func UnmarshalUint(v string) uint {
i, _ := strconv.ParseUint(v, 10, 64)
return uint(i)
}
For reference, the code for strconv.Atoi in the standard library is currently as follows:
func Atoi(s string) (int, error) {
const fnAtoi = "Atoi"
sLen := len(s)
if intSize == 32 && (0 < sLen && sLen < 10) ||
intSize == 64 && (0 < sLen && sLen < 19) {
// Fast path for small integers that fit int type.
s0 := s
if s[0] == '-' || s[0] == '+' {
s = s[1:]
if len(s) < 1 {
return 0, &NumError{fnAtoi, s0, ErrSyntax}
}
}
n := 0
for _, ch := range []byte(s) {
ch -= '0'
if ch > 9 {
return 0, &NumError{fnAtoi, s0, ErrSyntax}
}
n = n*10 + int(ch)
}
if s0[0] == '-' {
n = -n
}
return n, nil
}
// Slow path for invalid, big, or underscored integers.
i64, err := ParseInt(s, 10, 0)
if nerr, ok := err.(*NumError); ok {
nerr.Func = fnAtoi
}
return int(i64), err
}
I need to get the top N items from a Vec that is quite large in production. Currently I do it in this inefficient way:
let mut v = vec![6, 4, 3, 7, 2, 1, 5];
v.sort_unstable();
v = v[0..3].to_vec();
In C++, I'd use std::partial_sort, but I can't find an equivalent in the Rust docs.
Am I just overlooking it, or does it not exist (yet)?
The standard library doesn't contain this functionality, but it looks like the lazysort crate is exactly what you need:
So what's the point of lazy sorting? As per the linked blog post, they're useful when you do not need or intend to need every value; for example you may only need the first 1,000 ordered values from a larger set.
#![feature(test)]
extern crate lazysort;
extern crate rand;
extern crate test;
use std::cmp::Ordering;
trait SortLazy<T> {
fn sort_lazy<F>(&mut self, cmp: F, n: usize)
where
F: Fn(&T, &T) -> Ordering;
unsafe fn sort_lazy_fast<F>(&mut self, cmp: F, n: usize)
where
F: Fn(&T, &T) -> Ordering;
}
impl<T> SortLazy<T> for [T] {
fn sort_lazy<F>(&mut self, cmp: F, n: usize)
where
F: Fn(&T, &T) -> Ordering,
{
fn sort_lazy<F, T>(data: &mut [T], accu: &mut usize, cmp: &F, n: usize)
where
F: Fn(&T, &T) -> Ordering,
{
if !data.is_empty() && *accu < n {
let mut pivot = 1;
let mut lower = 0;
let mut upper = data.len();
while pivot < upper {
match cmp(&data[pivot], &data[lower]) {
Ordering::Less => {
data.swap(pivot, lower);
lower += 1;
pivot += 1;
}
Ordering::Greater => {
upper -= 1;
data.swap(pivot, upper);
}
Ordering::Equal => pivot += 1,
}
}
sort_lazy(&mut data[..lower], accu, cmp, n);
sort_lazy(&mut data[upper..], accu, cmp, n);
} else {
*accu += 1;
}
}
sort_lazy(self, &mut 0, &cmp, n);
}
unsafe fn sort_lazy_fast<F>(&mut self, cmp: F, n: usize)
where
F: Fn(&T, &T) -> Ordering,
{
fn sort_lazy<F, T>(data: &mut [T], accu: &mut usize, cmp: &F, n: usize)
where
F: Fn(&T, &T) -> Ordering,
{
if !data.is_empty() && *accu < n {
unsafe {
use std::mem::swap;
let mut pivot = 1;
let mut lower = 0;
let mut upper = data.len();
while pivot < upper {
match cmp(data.get_unchecked(pivot), data.get_unchecked(lower)) {
Ordering::Less => {
swap(
&mut *(data.get_unchecked_mut(pivot) as *mut T),
&mut *(data.get_unchecked_mut(lower) as *mut T),
);
lower += 1;
pivot += 1;
}
Ordering::Greater => {
upper -= 1;
swap(
&mut *(data.get_unchecked_mut(pivot) as *mut T),
&mut *(data.get_unchecked_mut(upper) as *mut T),
);
}
Ordering::Equal => pivot += 1,
}
}
sort_lazy(&mut data[..lower], accu, cmp, n);
sort_lazy(&mut data[upper..], accu, cmp, n);
}
} else {
*accu += 1;
}
}
sort_lazy(self, &mut 0, &cmp, n);
}
}
#[cfg(test)]
mod tests {
use test::Bencher;
use lazysort::Sorted;
use std::collections::BinaryHeap;
use SortLazy;
use rand::{thread_rng, Rng};
const SIZE_VEC: usize = 100_000;
const N: usize = 42;
#[bench]
fn sort(b: &mut Bencher) {
b.iter(|| {
let mut rng = thread_rng();
let mut v: Vec<i32> = std::iter::repeat_with(|| rng.gen())
.take(SIZE_VEC)
.collect();
v.sort_unstable();
})
}
#[bench]
fn lazysort(b: &mut Bencher) {
b.iter(|| {
let mut rng = thread_rng();
let v: Vec<i32> = std::iter::repeat_with(|| rng.gen())
.take(SIZE_VEC)
.collect();
let _: Vec<_> = v.iter().sorted().take(N).collect();
})
}
#[bench]
fn lazysort_in_place(b: &mut Bencher) {
b.iter(|| {
let mut rng = thread_rng();
let mut v: Vec<i32> = std::iter::repeat_with(|| rng.gen())
.take(SIZE_VEC)
.collect();
v.sort_lazy(i32::cmp, N);
})
}
#[bench]
fn lazysort_in_place_fast(b: &mut Bencher) {
b.iter(|| {
let mut rng = thread_rng();
let mut v: Vec<i32> = std::iter::repeat_with(|| rng.gen())
.take(SIZE_VEC)
.collect();
unsafe { v.sort_lazy_fast(i32::cmp, N) };
})
}
#[bench]
fn binaryheap(b: &mut Bencher) {
b.iter(|| {
let mut rng = thread_rng();
let v: Vec<i32> = std::iter::repeat_with(|| rng.gen())
.take(SIZE_VEC)
.collect();
let mut iter = v.iter();
let mut heap: BinaryHeap<_> = iter.by_ref().take(N).collect();
for i in iter {
heap.push(i);
heap.pop();
}
let _ = heap.into_sorted_vec();
})
}
}
running 5 tests
test tests::binaryheap ... bench: 3,283,938 ns/iter (+/- 413,805)
test tests::lazysort ... bench: 1,669,229 ns/iter (+/- 505,528)
test tests::lazysort_in_place ... bench: 1,781,007 ns/iter (+/- 443,472)
test tests::lazysort_in_place_fast ... bench: 1,652,103 ns/iter (+/- 691,847)
test tests::sort ... bench: 5,600,513 ns/iter (+/- 711,927)
test result: ok. 0 passed; 0 failed; 0 ignored; 5 measured; 0 filtered out
This code allows us to see that lazysort is faster than the solution with BinaryHeap. We can also see that the BinaryHeap solution gets worse as N increases.
The problem with lazysort is that it creates a second Vec<_>. A "better" solution would be to implement the partial sort in-place; I provided an example of such an implementation above.
Keep in mind that all these solutions come with overhead. When N is about SIZE_VEC / 3, the classic sort wins.
You could submit an RFC/issue to ask about adding this feature to the standard library.
There is now select_nth_unstable, the equivalent of std::nth_element. Its result can then be sorted to achieve what you want.
Example:
let mut v = vec![6, 4, 3, 7, 2, 1, 5];
let top_three = v.select_nth_unstable(3).0;
top_three.sort();
3 here is the index of the "nth" element, so we're actually picking the 4th element. That's because select_nth_unstable returns a tuple of:
a slice to the left of the nth element
a reference to the nth element
a slice to the right of the nth element
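Putting it together, a small helper for the top-N case might look like this (a sketch; the function name is mine, and it assumes n is less than the vector's length):
fn top_n(v: &mut Vec<i32>, n: usize) -> &mut [i32] {
    // Partition so the n smallest values occupy the first n slots (in arbitrary order),
    // then sort only that prefix. Panics if n >= v.len().
    let (smallest, _nth, _rest) = v.select_nth_unstable(n);
    smallest.sort_unstable();
    smallest
}

fn main() {
    let mut v = vec![6, 4, 3, 7, 2, 1, 5];
    assert_eq!(top_n(&mut v, 3), &[1, 2, 3]);
}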
I'm working on a program that allocates lots of []int values with length 4, 3, or 2,
and found that using a := []int{1, 1, 1} is a little bit faster than a := make([]int, 3); a[0] = 1; a[1] = 1; a[2] = 1.
My question: why is a := []int{1, 1, 1} faster than a := make([]int, 3); a[0] = 1; a[1] = 1; a[2] = 1?
func BenchmarkMake(b *testing.B) {
var array []int
for i := 0; i < b.N; i++ {
array = make([]int, 4)
array[0] = 1
array[1] = 1
array[2] = 1
array[3] = 1
}
}
func BenchmarkDirect(b *testing.B) {
var array []int
for i := 0; i < b.N; i++ {
array = []int{1, 1, 1, 1}
}
array[0] = 1
}
BenchmarkMake-4 50000000 34.3 ns/op
BenchmarkDirect-4 50000000 33.8 ns/op
Let's look at the benchmark output of the following code:
package main
import "testing"
func BenchmarkMake(b *testing.B) {
var array []int
for i := 0; i < b.N; i++ {
array = make([]int, 4)
array[0] = 1
array[1] = 1
array[2] = 1
array[3] = 1
}
}
func BenchmarkDirect(b *testing.B) {
var array []int
for i := 0; i < b.N; i++ {
array = []int{1, 1, 1, 1}
}
array[0] = 1
}
func BenchmarkArray(b *testing.B) {
var array [4]int
for i := 0; i < b.N; i++ {
array = [4]int{1, 1, 1, 1}
}
array[0] = 1
}
Usually the output looks like this:
$ go test -bench . -benchmem -o alloc_test -cpuprofile cpu.prof
goos: linux
goarch: amd64
pkg: test
BenchmarkMake-8 30000000 61.3 ns/op 32 B/op 1 allocs/op
BenchmarkDirect-8 20000000 60.2 ns/op 32 B/op 1 allocs/op
BenchmarkArray-8 1000000000 2.56 ns/op 0 B/op 0 allocs/op
PASS
ok test 6.003s
The difference is so small that it can be the opposite in some circumstances.
Let's look at the profiling data:
$go tool pprof -list 'Benchmark.*' cpu.prof
ROUTINE ======================== test.BenchmarkMake in /home/grzesiek/go/src/test/alloc_test.go
260ms 1.59s (flat, cum) 24.84% of Total
. . 5:func BenchmarkMake(b *testing.B) {
. . 6: var array []int
40ms 40ms 7: for i := 0; i < b.N; i++ {
50ms 1.38s 8: array = make([]int, 4)
. . 9: array[0] = 1
130ms 130ms 10: array[1] = 1
20ms 20ms 11: array[2] = 1
20ms 20ms 12: array[3] = 1
. . 13: }
. . 14:}
ROUTINE ======================== test.BenchmarkDirect in /home/grzesiek/go/src/test/alloc_test.go
90ms 1.66s (flat, cum) 25.94% of Total
. . 16:func BenchmarkDirect(b *testing.B) {
. . 17: var array []int
10ms 10ms 18: for i := 0; i < b.N; i++ {
80ms 1.65s 19: array = []int{1, 1, 1, 1}
. . 20: }
. . 21: array[0] = 1
. . 22:}
ROUTINE ======================== test.BenchmarkArray in /home/grzesiek/go/src/test/alloc_test.go
2.86s 2.86s (flat, cum) 44.69% of Total
. . 24:func BenchmarkArray(b *testing.B) {
. . 25: var array [4]int
500ms 500ms 26: for i := 0; i < b.N; i++ {
2.36s 2.36s 27: array = [4]int{1, 1, 1, 1}
. . 28: }
. . 29: array[0] = 1
. . 30:}
We can see that the assignments take some time.
To learn why, we need to look at the assembler code.
$go tool pprof -disasm 'BenchmarkMake' cpu.prof
. . 4eda93: MOVQ AX, 0(SP) ;alloc_test.go:8
30ms 30ms 4eda97: MOVQ $0x4, 0x8(SP) ;test.BenchmarkMake alloc_test.go:8
. . 4edaa0: MOVQ $0x4, 0x10(SP) ;alloc_test.go:8
10ms 1.34s 4edaa9: CALL runtime.makeslice(SB) ;test.BenchmarkMake alloc_test.go:8
. . 4edaae: MOVQ 0x18(SP), AX ;alloc_test.go:8
10ms 10ms 4edab3: MOVQ 0x20(SP), CX ;test.BenchmarkMake alloc_test.go:8
. . 4edab8: TESTQ CX, CX ;alloc_test.go:9
. . 4edabb: JBE 0x4edb0b
. . 4edabd: MOVQ $0x1, 0(AX)
130ms 130ms 4edac4: CMPQ $0x1, CX ;test.BenchmarkMake alloc_test.go:10
. . 4edac8: JBE 0x4edb04 ;alloc_test.go:10
. . 4edaca: MOVQ $0x1, 0x8(AX)
20ms 20ms 4edad2: CMPQ $0x2, CX ;test.BenchmarkMake alloc_test.go:11
. . 4edad6: JBE 0x4edafd ;alloc_test.go:11
. . 4edad8: MOVQ $0x1, 0x10(AX)
. . 4edae0: CMPQ $0x3, CX ;alloc_test.go:12
. . 4edae4: JA 0x4eda65
We can see that the time is taken by the CMPQ instruction, which compares a constant with the CX register. The CX register holds the value copied from the stack after the call to make. We can deduce that it must be the size of the slice, while AX holds the reference to the underlying array. You can also see that the first bounds check was optimized away.
Conclusions
Allocation takes the same time in both cases, but the assignments cost extra due to the slice size checks (as noticed by Terry Pang).
Using an array instead of a slice is much cheaper, as it saves the allocation.
Why is using an array so much cheaper?
In Go, an array is basically a fixed-size chunk of memory. A [1]int is basically the same thing as an int. You can find more in the Go Slices: usage and internals article.
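For the sizes mentioned in the question (4, 3, and 2 elements), a minimal sketch of the array-based approach; the values here are just placeholders:
package main

import "fmt"

func main() {
	// A fixed-size array is a plain value: no slice header and no separate backing allocation.
	a := [3]int{1, 1, 1}
	// If an API needs a []int, slicing the array gives a view over the same memory.
	s := a[:]
	fmt.Println(s, len(s), cap(s))
}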