Only using 20% of CPU after multiprocessing.pool.starmap() in python - performance

The intention of the program is to run through a directory; if a file is an Excel spreadsheet it should open it, extract and manipulate some data, then move on to the next file. As this is a laborious process I have tried to split the task across multiple threads. Even so, this only uses 20% of the total CPU capacity, and it didn't particularly speed up.
def extract_data(unique_file_names):
    """Seed the global `newarray` from the first workbook, then fan the
    full file list out to a thread pool running `process`.

    NOTE(review): the per-file work (openpyxl parsing + pandas) is
    CPU-bound, so a ThreadPool cannot run it in parallel under the GIL --
    this is the likely cause of the ~20% CPU utilisation.  A process pool
    that *returns* results (instead of mutating globals) would scale.
    NOTE(review): the first file is loaded here AND passed to pool.map,
    so it is processed twice.
    """
    global rootdir    # base directory containing the spreadsheets
    global newarray   # shared accumulator, mutated by worker threads
    global counter    # shared progress counter
    global t0         # start timestamp for progress reporting
    # "\\" separator here vs. "/" in process() -- assumes Windows rootdir;
    # TODO confirm and make consistent (os.path.join).
    string = rootdir + "\\" + str(unique_file_names[0])
    wb = load_workbook(string, read_only = True, data_only=True)
    ws = wb["Sheet1"]
    df = pd.DataFrame(ws.values)
    newarray = df.loc[4:43,:13].values
    counter = 0
    print("Starting pool")
    # 20 threads, but see the GIL note above -- effectively serial.
    pool = ThreadPool(processes=20)
    pool.map(process, unique_file_names)
    pool.close()
def process(filename):
    """Worker: load one workbook and append its data block to the shared
    global `newarray` via horizontal stacking."""
    global newarray
    global unique_file_names
    global counter
    global t0
    # NOTE(review): `counter += 1` and the np.hstack below are
    # unsynchronised mutations of shared globals from multiple threads;
    # updates can be lost or interleaved.
    counter+=1
    try:
        # "/" separator here vs. "\\" in extract_data -- inconsistent.
        file_name = rootdir + "/" + str(filename)
        wb = load_workbook(file_name, read_only = True, data_only=True)
        ws = wb["Sheet1"]
        df = pd.DataFrame(ws.values)
        # hstack copies the whole accumulated array on every call,
        # making the overall accumulation O(n^2).
        newarray = np.hstack((newarray, df.loc[4:43,4:13].values))
    except:
        # NOTE(review): bare except hides the real failure cause; catch
        # specific exceptions and log the traceback instead.
        print("Failure")
        pass
    # NOTE(review): time.clock() was removed in Python 3.8; use
    # time.perf_counter() in current code.
    print("Time %.2f, Completed %.2f %% " %((time.clock()-t0),counter*100/len(unique_file_names)))
So it roughly takes around a second and a half to process one spreadsheet, but like I said pool.map() made next to no difference. Any suggestions?
Thanks in advance

Related

multiprocessing and time.perf_counter

I'm trying to learn multiprocessing in Python and I'd like to see how long my program takes to run when using multiprocessing, but I can't understand why the time.perf_counter function prints two really small numbers (6.00004568696022e-07 and 1.200009137392044e-06) and after that the actual duration of the program in seconds (18.546351400000276). Can you explain why?
Thanks
import time
from multiprocessing import Process


def counter(n):
    """Busy-loop from 0 up to n -- pure CPU-bound work for the benchmark."""
    count = 0
    while count < n:
        count += 1


if __name__ == '__main__':
    # Start timing *inside* the __main__ guard.  At module level,
    # `start = time.perf_counter()` is re-executed whenever a child
    # process re-imports this file (the spawn start method does this on
    # Windows), which is the likely source of the extra, tiny elapsed
    # values being printed alongside the real one.
    start = time.perf_counter()
    a = Process(target=counter, args=(500000000,))
    b = Process(target=counter, args=(500000000,))
    a.start()
    b.start()
    a.join()
    b.join()
    print(time.perf_counter() - start)

PyQt Copy folder with progress and transfer rate/speed

I'd like to expand my current code with the ability to show the transfer rate/speed of the files being copied. Im working on Windows 10 with py 3.6 and Qt 5.8. Here is my code:
import os
import shutil
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QLabel, QProgressBar, QFileDialog
class FileCopyProgress(QWidget):
    """Small window with a progress bar that copies `src` to `dest`
    shortly after being shown.

    NOTE(review): relies on module globals `app` (for processEvents) and
    `BASE_DIR`, and uses `QTimer`, which is not among the visible imports
    (needs `from PyQt5.QtCore import QTimer`) -- confirm at file top.
    """
    def __init__(self, parent=None, src=None, dest=None):
        super(FileCopyProgress, self).__init__()
        self.src = src    # source directory path
        self.dest = dest  # destination directory path
        self.build_ui()

    def build_ui(self):
        # Vertical layout: source label, destination label, progress bar.
        hbox = QVBoxLayout()
        lbl_src = QLabel('Source: ' + self.src)
        lbl_dest = QLabel('Destination: ' + self.dest)
        self.pb = QProgressBar()
        self.pb.setMinimum(0)
        self.pb.setMaximum(100)
        self.pb.setValue(0)
        hbox.addWidget(lbl_src)
        hbox.addWidget(lbl_dest)
        hbox.addWidget(self.pb)
        self.setLayout(hbox)
        self.setWindowTitle('File copy')
        # Kick off the copy 2 s after the window appears.  singleShot is a
        # static method, so the QTimer instance itself is never used.
        self.auto_start_timer = QTimer()
        self.auto_start_timer.singleShot(2000, lambda: self.copyFilesWithProgress(self.src, self.dest, self.progress, self.copydone))
        self.show()

    def progress(self, done, total):
        # Percentage complete, by file count (not by bytes).
        progress = int(round((done/float(total))*100))
        try:
            self.pb.setValue(progress)
        except:
            # NOTE(review): bare except silently swallows UI errors.
            pass
        # Keep the GUI responsive during the synchronous copy loop.
        app.processEvents()

    def copydone(self):
        # Copy finished: pin the bar at 100% and close the window.
        self.pb.setValue(100)
        self.close()

    def countfiles(self, _dir):
        # Total number of files under _dir, recursively.
        files = []
        if os.path.isdir(_dir):
            for path, dirs, filenames in os.walk(_dir):
                files.extend(filenames)
        return len(files)

    def makedirs(self, dest):
        # Create dest (and parents) only if missing.
        if not os.path.exists(dest):
            os.makedirs(dest)

    #pyqtSlot()  # NOTE(review): probably a garbled `@pyqtSlot()` decorator
    def copyFilesWithProgress(self, src, dest, callback_progress, callback_copydone):
        """Recursively copy src -> dest, calling callback_progress(done,
        total) after every file and callback_copydone() at the end."""
        numFiles = self.countfiles(src)
        if numFiles > 0:
            # Mirror src under dest, stripping the BASE_DIR prefix.
            dest = os.path.join(dest, src.replace(BASE_DIR, '').replace('\\', ''))
            print(''.join(['Destination: ', dest]))
            self.makedirs(dest)
            numCopied = 0
            for path, dirs, filenames in os.walk(src):
                for directory in dirs:
                    destDir = path.replace(src,dest)
                    self.makedirs(os.path.join(destDir, directory))
                for sfile in filenames:
                    srcFile = os.path.join(path, sfile)
                    destFile = os.path.join(path.replace(src, dest), sfile)
                    shutil.copy(srcFile, destFile)
                    numCopied += 1
                    callback_progress(numCopied, numFiles)
            callback_copydone()
# Root under which copies are mirrored.  Raw strings avoid accidental
# backslash escapes in Windows paths ("C:\dev\..." only worked because
# \d and \p happen not to be escape sequences; they warn on modern Python).
BASE_DIR = r'C:\dev'
app = QApplication([])
# Keep a reference so the widget is not garbage-collected while running.
copy_window = FileCopyProgress(src=r'C:\dev\pywin32-221', dest=r'C:\dev\copied')
# Run the app
app.exec_()
This code opens a gui with a progressbar showing progress while copying files. A simple label with the current transfer rate/speed (approximate) would be rlly nice :)
Unfortunately I can't find any examples; can someone give me a hint or maybe a working example, please?
EDIT:
I did a remake and now I have transfer rate, time elapsed and time remaining. The data seems to be realistic. I have only one problem: Lets assume i have a folder/file that time remaining is 7 sec -> currently it starts with 7 sec and gets an update every 1 second. We expect that in the next step it would display 6 sec but instead it goes:
5 sec
3 sec
1.5 sec
1.4 sec
1.3 sec
1.2 sec
1.1 sec and so on
Where is the mistake?
class FileCopyProgress(QWidget):
    """Copy window with a progress bar plus transfer-rate, time-elapsed
    and time-remaining labels.

    The copy runs synchronously (started by a single-shot timer once the
    window is up); a second 1 s timer reads the byte counters maintained
    by the copy loop and refreshes the statistics labels.
    NOTE(review): depends on module globals `app` and `BASE_DIR`, and on
    QTimer/time imports at file top.
    """

    def __init__(self, parent=None, src=None, dest=None):
        super(FileCopyProgress, self).__init__()
        self.src = src
        self.dest = dest
        self.rate = "0"
        self.total_time = "0 s"
        self.time_elapsed = "0 s"
        self.time_remaining = "0 s"
        self.build_ui()

    def build_ui(self):
        """Create labels, progress bar and the two timers, then show."""
        hbox = QVBoxLayout()
        lbl_src = QLabel('Source: ' + self.src)
        lbl_dest = QLabel('Destination: ' + self.dest)
        self.pb = QProgressBar()
        self.lbl_rate = QLabel('Transfer rate: ' + self.rate)
        self.lbl_time_elapsed = QLabel('Time Elapsed: ' + self.time_elapsed)
        self.lbl_time_remaining = QLabel('Time Remaining: ' + self.time_remaining)
        self.pb.setMinimum(0)
        self.pb.setMaximum(100)
        self.pb.setValue(0)
        hbox.addWidget(lbl_src)
        hbox.addWidget(lbl_dest)
        hbox.addWidget(self.pb)
        hbox.addWidget(self.lbl_rate)
        hbox.addWidget(self.lbl_time_elapsed)
        hbox.addWidget(self.lbl_time_remaining)
        self.setLayout(hbox)
        self.setWindowTitle('File copy')
        # Start the copy shortly after the event loop shows the window.
        self.auto_start_timer = QTimer()
        self.auto_start_timer.singleShot(100, lambda: self.copy_files_with_progress(self.src, self.dest, self.progress, self.copy_done))
        # Refresh the statistics labels once per second.
        self.copy_timer = QTimer()
        self.copy_timer.timeout.connect(lambda: self.process_informations())
        self.copy_timer.start(1000)
        self.show()

    #pyqtSlot()  # NOTE(review): probably a garbled `@pyqtSlot()` decorator
    def process_informations(self):
        """Update elapsed / rate / remaining labels; runs every second."""
        # time.clock() is deprecated (removed in Python 3.8) and
        # platform-dependent; perf_counter() is the monotonic replacement.
        time_elapsed_raw = time.perf_counter() - self.start_time
        self.time_elapsed = '{:.2f} s'.format(time_elapsed_raw)
        self.lbl_time_elapsed.setText('Time Elapsed: ' + self.time_elapsed)
        # Bytes copied since the previous tick; with a 1 s timer this is
        # the current transfer rate in bytes/second.
        rate_bytes_per_sec = self._copied - self._copied_tmp
        self.rate = '{:.2f} MB/s'.format(rate_bytes_per_sec / 1024 / 1024)
        self.lbl_rate.setText('Transfer rate: ' + self.rate)
        # BUG FIX: the old formula (totalSize / copied) is a shrinking
        # *ratio* that tends to 1, not a time -- which is why the countdown
        # jumped 7 -> 5 -> 3 -> 1.5 s.  Remaining time = bytes still to
        # copy divided by the current rate.
        if rate_bytes_per_sec > 0:
            time_remaining_raw = (self._totalSize - self._copied) / float(rate_bytes_per_sec)
            if time_remaining_raw < 60.:
                self.time_remaining = '{:.2f} s'.format(time_remaining_raw)
            else:
                # Also convert to minutes -- the old code printed the raw
                # seconds value with a "min" label.
                self.time_remaining = '{:.2f} min'.format(time_remaining_raw / 60.)
            self.lbl_time_remaining.setText('Time Remaining: ' + self.time_remaining)
        self._copied_tmp = self._copied

    def progress(self):
        """Push the byte-based completion percentage to the progress bar."""
        if not self._totalSize:
            # Nothing to copy; avoid division by zero.
            return
        self._progress = (self._copied / self._totalSize) * 100
        try:
            self.pb.setValue(self._progress)
        except:
            pass
        # Keep the GUI responsive while the synchronous copy loop runs.
        app.processEvents()

    def get_total_size(self, src):
        """Total size of all files under src, in bytes."""
        return sum( os.path.getsize(os.path.join(dirpath,filename)) for dirpath, dirnames, filenames in os.walk(src) for filename in filenames ) # total size of files in bytes

    def copy_done(self):
        """Finalize the UI once the copy loop finishes."""
        self.pb.setValue(100)
        print("done")
        self.close()

    def make_dirs(self, dest):
        # Create dest (and parents) only if missing.
        if not os.path.exists(dest):
            os.makedirs(dest)

    #pyqtSlot()  # NOTE(review): probably a garbled `@pyqtSlot()` decorator
    def copy_files_with_progress(self, src, dst, callback_progress, callback_copydone, length=16*1024*1024):
        """Copy src -> dst in `length`-byte chunks, bumping self._copied
        after every chunk so the timers can derive rate and progress."""
        self._copied = 0
        self._copied_tmp = 0
        self._totalSize = self.get_total_size(src)
        print(''.join(['Pre Dst: ', dst]))
        # Mirror src under dst, stripping the BASE_DIR prefix.
        dst = os.path.join(dst, src.replace(BASE_DIR, '').replace('\\', ''))
        print(''.join(['Src: ', src]))
        print(''.join(['Dst: ', dst]))
        self.make_dirs(dst)
        self.start_time = time.perf_counter()
        for path, dirs, filenames in os.walk(src):
            for directory in dirs:
                destDir = path.replace(src, dst)
                self.make_dirs(os.path.join(destDir, directory))
            for sfile in filenames:
                srcFile = os.path.join(path, sfile)
                destFile = os.path.join(dst, sfile)
                # destFile = os.path.join(path.replace(src, dst), sfile)
                with open(srcFile, 'rb') as fsrc:
                    with open(destFile, 'wb') as fdst:
                        while 1:
                            buf = fsrc.read(length)
                            if not buf:
                                break
                            fdst.write(buf)
                            self._copied += len(buf)
                            callback_progress()
        try:
            self.copy_timer.stop()
        except:
            print('Error: could not stop QTimer')
        callback_copydone()
That's all about logic, think in the following way:
You have a TOTAL size of all files, let's say you have 100 dashes: [----- ... -----]
Now you get the starting time when you started transferring your files.
Choose some interval, let's have for example 2 secs.
So, after 2 secs see how much of the total you already transferred, in other words how much of files you already have in the new dir. Let's say you transferred 26 dashes.
The calc would be, 26 dashes/2secs = 13dashes per seconds.
Going deeper we have 26% of the content downloaded in 2 seconds since the total is 100 or 13% per second.
Even further we can calc also the prediction of time.
Total Time = 100%/13% = 7.6 secs
Time Elapsed = 2secs
Time Remaining = 7.6 - 2 = 5.6 secs
I think you got the idea...
Note: You just have to keep re-doing all these calculations every 2 seconds (if you choose 2 secs, of course) and updating the information shown to the user. If you want to be more precise, or show more frequently updated information, just reduce that interval to, say, 0.5 secs and do the calculation on top of milliseconds. It's up to you how often you update the information; this is just an overview of how to do the math.

Why does my julia code run so slowly?

# LDA (linear discriminant analysis) on the iris data, reducing to 2 dims.
# NOTE(review): written against a pre-1.0 Julia API -- readdlm, mean(x, 1),
# find and eig were moved/renamed in Julia 1.0 (DelimitedFiles,
# mean(x, dims=1), findall, eigen).  Everything runs at global scope,
# which is the main reason it benchmarks slowly (see the answer below).
redim = 2;
# Loading data
iris_data = readdlm("iris_data.csv");
iris_target = readdlm("iris_target.csv");
# Center data
iris_data = broadcast(-, iris_data, mean(iris_data, 1));
n_data, n_dim = size(iris_data);
# Within-class (Sw) and between-class (Sb) scatter accumulators.
Sw = zeros(n_dim, n_dim);
Sb = zeros(n_dim, n_dim);
C = cov(iris_data);
classes = unique(iris_target);
for i=1:length(classes)
    # Rows belonging to class i.
    index = find(x -> x==classes[i], iris_target);
    d = iris_data[index,:];
    classcov = cov(d);
    # Weight each class covariance by its relative frequency.
    Sw += length(index) / n_data .* classcov;
end
Sb = C - Sw;
# Generalized eigenproblem; keep the first `redim` eigenvectors.
evals, evecs = eig(Sw, Sb);
w = evecs[:,1:redim];
new_data = iris_data * w;
This code just does LDA (linear discriminant analysis) for the iris_data.
Reduce the dimensions of the iris_data to 2.
It takes about 4 seconds, but Python (numpy/scipy) only takes about 0.6 seconds.
Why?
This is from the first page, second paragraph of the introduction in the Julia Manual:
Because Julia’s compiler is different from the interpreters used for languages like Python or R, you may find that Julia’s performance is unintuitive at first. If you find that something is slow, we highly recommend reading through the Performance Tips section before trying anything else. Once you understand how Julia works, it’s easy to write code that’s nearly as fast as C.
Excerpt:
Avoid global variables
A global variable might have its value, and therefore its type, change at any point. This makes it difficult for the compiler to optimize code using global variables. Variables should be local, or passed as arguments to functions, whenever possible.
Any code that is performance critical or being benchmarked should be inside a function.
We find that global names are frequently constants, and declaring them as such greatly improves performance
Knowing that the script (all procedural top level code) style is so pervasive among many scientific computing users, I would recommend you to at least wrap the whole file inside a let expression for starters (let introduces a new local scope), ie:
# Same script wrapped in `let`, which introduces a local scope: every
# variable becomes local, so the compiler can infer concrete types
# (untyped globals are the dominant slowdown in the original).
let
    redim = 2
    # Loading data
    iris_data = readdlm("iris_data.csv")
    iris_target = readdlm("iris_target.csv")
    # Center data
    iris_data = broadcast(-, iris_data, mean(iris_data, 1))
    n_data, n_dim = size(iris_data)
    Sw = zeros(n_dim, n_dim)
    Sb = zeros(n_dim, n_dim)
    C = cov(iris_data)
    classes = unique(iris_target)
    for i=1:length(classes)
        index = find(x -> x==classes[i], iris_target)
        d = iris_data[index,:]
        classcov = cov(d)
        Sw += length(index) / n_data .* classcov
    end
    Sb = C - Sw
    evals, evecs = eig(Sw, Sb)
    w = evecs[:,1:redim]
    new_data = iris_data * w
end
But I would also urge you to refactor that into small functions and then compose a main function that calls the rest, something like this, notice how this refactor makes your code general and reusable (and fast):
# Refactored LDA: small functions in a module so the compiler can
# specialize on argument types -- the key performance fix vs. globals.
module LinearDiscriminantAnalysis
export load_data, center_data

"Returns data and target Matrices."
load_data(data_path, target_path) = (readdlm(data_path), readdlm(target_path))

# Center the data, accumulate within-class scatter Sw, and project onto
# the top 2 generalized eigenvectors of (Sw, Sb).
function center_data(data, target)
    data = broadcast(-, data, mean(data, 1))
    n_data, n_dim = size(data)
    Sw = zeros(n_dim, n_dim)
    Sb = zeros(n_dim, n_dim)
    C = cov(data)
    classes = unique(target)
    for i=1:length(classes)
        # Rows belonging to class i, weighted by class frequency.
        index = find(x -> x==classes[i], target)
        d = data[index,:]
        classcov = cov(d)
        Sw += length(index) / n_data .* classcov
    end
    Sb = C - Sw
    evals, evecs = eig(Sw, Sb)
    redim = 2
    w = evecs[:,1:redim]
    return data * w
end
end
using LinearDiscriminantAnalysis

# Entry point composing the refactored helpers.
function main()
    iris_data, iris_target = load_data("iris_data.csv", "iris_target.csv")
    result = center_data(iris_data, iris_target)
    #show result   # NOTE(review): likely a garbled `@show result`
end
main()
Notes:
You don't need all those semicolons.
anonymous functions are currently slow but that will change in v0.5. You can use FastAnonymous for now, if performance is critical.
In summary read carefully and take into account all the performance tips.
main is just a name, it could be anything else you like.

Reading memory and Access is Denied

I need to access all memory of a running process in my local Windows 7-64bit. I am new to winapi.
Here is my problem; Whenever I try to Open a process and reads its memory, I get Access is Denied error.
I searched and found something. It is said that If I run the main process as Administrator and use PROCESS_ALL_ACCESS on OpenProcess, I would have enough right to do it as it is said. OK, I did it. but nothing is changed. On reading memory, I still get Access is Denied.
So, I kept searching and found another thing which is enabling SeDebugPrivilege. I have also done that but nothing is changed. I still get the error.
I've read the question and its answer here:
Windows Vista/Win7 Privilege Problem: SeDebugPrivilege & OpenProcess .
But as I said, I am really new to winapi and I could not solve my problem yet. Is there anything that I need to configure in my local operating system?
Here is my Python code with pywin32;
from _ctypes import byref, sizeof, Structure
from ctypes import windll, WinError, c_buffer, c_void_p, create_string_buffer
from ctypes.wintypes import *
import win32security
import win32api
import gc
import ntsecuritycon
from struct import Struct
from win32con import PROCESS_ALL_ACCESS
from struct import calcsize
# Symbolic names for the numeric State / Protect / Type fields returned
# by VirtualQueryEx in a MEMORY_BASIC_INFORMATION struct.
MEMORY_STATES = {0x1000: "MEM_COMMIT", 0x10000: "MEM_FREE", 0x2000: "MEM_RESERVE"}
# NOTE(review): 0x40 should read "PAGE_EXECUTE_READWRITE" (missing
# underscore).  The same misspelling is relied on by the scanner's
# Protect check further down, so both must be fixed together.
MEMORY_PROTECTIONS = {0x10: "PAGE_EXECUTE", 0x20: "PAGE_EXECUTE_READ", 0x40: "PAGEEXECUTE_READWRITE",
                      0x80: "PAGE_EXECUTE_WRITECOPY", 0x01: "PAGE_NOACCESS", 0x04: "PAGE_READWRITE",
                      0x08: "PAGE_WRITECOPY"}
MEMORY_TYPES = {0x1000000: "MEM_IMAGE", 0x40000: "MEM_MAPPED", 0x20000: "MEM_PRIVATE"}
class MEMORY_BASIC_INFORMATION(Structure):
    # ctypes mirror of the Win32 MEMORY_BASIC_INFORMATION struct filled
    # in by VirtualQueryEx.
    # NOTE(review): RegionSize is SIZE_T in the Win32 API; UINT is too
    # narrow for a 64-bit target process -- confirm target bitness.
    _fields_ = [
        ("BaseAddress", c_void_p),
        ("AllocationBase", c_void_p),
        ("AllocationProtect", DWORD),
        ("RegionSize", UINT),
        ("State", DWORD),
        ("Protect", DWORD),
        ("Type", DWORD)
    ]
class SYSTEM_INFO(Structure):
    # ctypes mirror of the Win32 SYSTEM_INFO struct (GetSystemInfo),
    # used here for lpMinimum/MaximumApplicationAddress scan bounds.
    # NOTE(review): those two fields are LPVOID (pointers) in the Win32
    # API; DWORD truncates them in a 64-bit process -- confirm bitness.
    _fields_ = [("wProcessorArchitecture", WORD),
                ("wReserved", WORD),
                ("dwPageSize", DWORD),
                ("lpMinimumApplicationAddress", DWORD),
                ("lpMaximumApplicationAddress", DWORD),
                ("dwActiveProcessorMask", DWORD),
                ("dwNumberOfProcessors", DWORD),
                ("dwProcessorType", DWORD),
                ("dwAllocationGranularity", DWORD),
                ("wProcessorLevel", WORD),
                ("wProcessorRevision", WORD)]
class PyMEMORY_BASIC_INFORMATION:
    """Friendly wrapper around a raw MEMORY_BASIC_INFORMATION struct.

    Copies each struct field onto the instance, translating the numeric
    State, Protect and Type values into their symbolic names via the
    module lookup tables (unknown values pass through unchanged).
    """

    def __init__(self, MBI):
        self.MBI = MBI
        self.set_attributes()

    def set_attributes(self):
        """Mirror the struct fields onto self, decoding enum values."""
        raw = self.MBI
        self.BaseAddress = raw.BaseAddress
        self.AllocationBase = raw.AllocationBase
        self.AllocationProtect = MEMORY_PROTECTIONS.get(raw.AllocationProtect, raw.AllocationProtect)
        self.RegionSize = raw.RegionSize
        self.State = MEMORY_STATES.get(raw.State, raw.State)
        # Decoded protection name; read raw.Protect directly instead if a
        # bitwise check is ever needed.
        self.Protect = MEMORY_PROTECTIONS.get(raw.Protect, raw.Protect)
        self.Type = MEMORY_TYPES.get(raw.Type, raw.Type)
# When True, scan strides are widened to the target's natural alignment.
ASSUME_ALIGNMENT = True
class TARGET:
    """Given a ctype (initialized or not) this coordinates all the information needed to read, write and compare."""
    # NOTE(review): Python 2 code -- uses xrange, and relies on `/` being
    # integer division for self.length below.
    def __init__(self, ctype):
        self.alignment = 1
        self.ctype = ctype
        # size of target data
        self.size = sizeof(ctype)
        self.type = ctype._type_
        # get the format type needed for struct.unpack/pack.
        while hasattr(self.type, "_type_"):
            self.type = self.type._type_
        # string_buffers and char arrays have _type_ 'c'
        # but that makes it slightly slower to unpack
        # so swap is for 's'.
        if self.type == "c":
            self.type = "s"
        # calculate byte alignment. this speeds up scanning substantially
        # because we can read and compare every alignment bytes
        # instead of every single byte.
        # although if we are scanning for a string the alignment is defaulted to 1 \
        # (im not sure if this is correct).
        elif ASSUME_ALIGNMENT:
            # calc alignment
            divider = 1
            for i in xrange(4):
                divider *= 2
                if not self.size % divider:
                    self.alignment = divider
        # size of target ctype.
        self.type_size = calcsize(self.type)
        # length of target / array length.
        self.length = self.size / self.type_size
        self.value = getattr(ctype, "raw", ctype.value)
        # the format string used for struct.pack/unpack.
        self.format = str(self.length) + self.type
        # efficient packer / unpacker for our own format.
        self.packer = Struct(self.format)

    def get_packed(self):
        """Gets the byte representation of the ctype value for use with WriteProcessMemory."""
        return self.packer.pack(self.value)

    def __str__(self):
        # Truncated preview of both the ctype and its value.
        return str(self.ctype)[:10] + "..." + " <" + str(self.value)[:10] + "..." + ">"
class Memory(object):
    """Scans the target process's memory for the value described by
    `target` (a TARGET instance); matching absolute addresses accumulate
    in self.found.  The scan runs eagerly from __init__.

    NOTE(review): Python 2 code (xrange); raw kernel32 calls via ctypes.
    """
    def __init__(self, process_handle, target):
        self._process_handle = process_handle
        self._target = target
        self.found = []
        self.__scann_process()

    def __scann_process(self):
        """scans a processes pages for the target value."""
        si = SYSTEM_INFO()
        psi = byref(si)
        windll.kernel32.GetSystemInfo(psi)
        base_address = si.lpMinimumApplicationAddress
        max_address = si.lpMaximumApplicationAddress
        page_address = base_address
        # Walk regions from the minimum to the maximum application
        # address; __scan_page returns the start of the next region.
        while page_address < max_address:
            page_address = self.__scan_page(page_address)
            if len(self.found) >= 60000000:
                print("[Warning] Scan ended early because too many addresses were found to hold the target data.")
                break
            gc.collect()
        return self.found

    def __scan_page(self, page_address):
        """Scans the entire page for TARGET instance and returns the next page address and found addresses."""
        information = self.VirtualQueryEx(page_address)
        base_address = information.BaseAddress
        region_size = information.RegionSize
        next_region = base_address + region_size
        size = self._target.size
        target_value = self._target.value
        step = self._target.alignment
        unpacker = self._target.packer.unpack
        # Only scan committed, private, readable regions; note the
        # "PAGEEXECUTE_READWRITE" spelling must match the (misspelled)
        # MEMORY_PROTECTIONS table above.
        if information.Type != "MEM_PRIVATE" or \
           region_size < size or \
           information.State != "MEM_COMMIT" or \
           information.Protect not in ["PAGE_EXECUTE_READ", "PAGEEXECUTE_READWRITE", "PAGE_READWRITE"]:
            return next_region
        page_bytes = self.ReadMemory(base_address, region_size)
        # Slide a window of `size` bytes across the region in `step`
        # strides, comparing the unpacked value against the target.
        for i in xrange(0, (region_size - size), step):
            partial = page_bytes[i:i + size]
            if unpacker(partial)[0] == target_value:
                self.found.append(base_address + i)
        del page_bytes # free the buffer
        return next_region

    def ReadMemory(self, address, size):
        # Raw ReadProcessMemory into a local buffer; raises (assert) on
        # failure, which is the "Access is denied" symptom the question
        # describes.
        cbuffer = c_buffer(size)
        success = windll.kernel32.ReadProcessMemory(
            self._process_handle,
            address,
            cbuffer,
            size,
            0)
        assert success, "ReadMemory Failed with success == %s and address == %s and size == %s.\n%s" % (
            success, address, size, WinError(win32api.GetLastError()))
        return cbuffer.raw

    def VirtualQueryEx(self, address):
        # Query the region containing `address`; returns the decoded
        # wrapper.  VirtualQueryEx returns the number of bytes written,
        # which must equal sizeof(MBI) for a complete result.
        MBI = MEMORY_BASIC_INFORMATION()
        MBI_pointer = byref(MBI)
        size = sizeof(MBI)
        success = windll.kernel32.VirtualQueryEx(
            self._process_handle,
            address,
            MBI_pointer,
            size)
        assert success, "VirtualQueryEx Failed with success == %s.\n%s" % (
            success, WinError(win32api.GetLastError())[1])
        assert success == size, "VirtualQueryEx Failed because not all data was written."
        return PyMEMORY_BASIC_INFORMATION(MBI)
def AdjustPrivilege(priv):
    """Enable the named privilege (e.g. SeDebugPrivilege) on the current
    process token."""
    access = win32security.TOKEN_ADJUST_PRIVILEGES | win32security.TOKEN_QUERY
    current_process = win32api.GetCurrentProcess()
    token = win32security.OpenProcessToken(current_process, access)
    # Look up the LUID for the privilege name and switch it on.
    luid = win32security.LookupPrivilegeValue(None, priv)
    win32security.AdjustTokenPrivileges(token, 0, [(luid, win32security.SE_PRIVILEGE_ENABLED)])
    win32api.CloseHandle(token)
def OpenProcess(pid=None):
    """Open `pid` (default: the current process) with PROCESS_ALL_ACCESS,
    enabling SeDebugPrivilege first, and return the raw handle.

    The original default `pid=win32api.GetCurrentProcessId()` was
    evaluated once at import time; `None` defers the lookup to call time
    while staying call-compatible.
    """
    if pid is None:
        pid = win32api.GetCurrentProcessId()
    # ntsecuritycon.SE_DEBUG_NAME = "SeDebugPrivilege"
    AdjustPrivilege(ntsecuritycon.SE_DEBUG_NAME)
    phandle = windll.kernel32.OpenProcess(
        PROCESS_ALL_ACCESS,
        0,
        pid)
    assert phandle, "Failed to open process!\n%s" % WinError(win32api.GetLastError())[1]
    return phandle
# Attach to the target process and scan its memory for the string "1456".
PID = 22852  # hard-coded target process id -- change per run
process_handle = OpenProcess(PID)
# NOTE(review): create_string_buffer("1456") is Python 2 only; Python 3
# requires bytes, i.e. create_string_buffer(b"1456").
Memory(process_handle, TARGET(create_string_buffer("1456")))
Here is the error I always get;
AssertionError: ReadMemory Failed with success == 0 and address == 131072 and size == 4096.
[Error 5] Access is denied.
I do not know what information else about my code and my personal Windows 7 operating system, I should provide to you. If you need to know more, please ask it from me, I will provide it to solve that problem.
I guess, this is about a lack of configuration in my operating system , not about pywin32. I'll be waiting for your solutions.

Graphite / Carbon / Ceres node overlap

I'm working with Graphite monitoring using Carbon and Ceres as the storage method. I have some problems with correcting bad data. It seems that (due to various problems) I've ended up with overlapping files. That is, since Carbon / Ceres stores the data as timestamp#interval.slice, I can have two or more files with overlapping time ranges.
There are two kinds of overlaps:
File A: +------------+ orig file
File B: +-----+ subset
File C: +---------+ overlap
This is causing problems because the existing tools available (ceres-maintenance defrag and rollup) don't cope with these overlaps. Instead, they skip the directory and move on. This is a problem, obviously.
I've created a script that fixes this problem, as follows:
For subsets, just delete the subset file.
For overlaps, using the file system 'truncate' on the orig file at the point where the next file starts. While it is possible to cut off the start of the overlap file and rename it properly, I would suggest that this is fraught with danger.
I've found that it's possible to do this in two ways:
Walk the dirs and iterate over the files, fixing as you go, and find the file subsets, remove them;
Walk the dirs and fix all the problems in a dir before moving on. This is BY FAR the faster approach, since the dir walk is hugely time consuming.
Code:
#!/usr/bin/env python2.6
################################################################################
import io
import os
import time
import sys
import string
import logging
import unittest
import datetime
import random
import zmq
import json
import socket
import traceback
import signal
import select
import simplejson
import cPickle as pickle
import re
import shutil
import collections
from pymongo import Connection
from optparse import OptionParser
from pprint import pprint, pformat
################################################################################
class SliceFile(object):
    """Metadata for one Ceres slice file.

    Slice filenames look like ``<timeStart>#<freq>.slice`` (e.g.
    ``1346805360#60.slice``).  Each datapoint occupies 8 bytes, so after
    setVars() the slice covers [timeStart, timeEnd) with
    timeEnd = timeStart + numPoints * freq.
    """
    def __init__(self, fname):
        self.name = fname
        # os.path.basename handles both path separators; the previous
        # fname.split('/')[-1] broke on Windows-style paths.
        basename = os.path.basename(fname)
        fnArray = basename.split('#')
        self.timeStart = int(fnArray[0])
        self.freq = int(fnArray[1].split('.')[0])
        self.size = None       # file size in bytes (set by setVars)
        self.numPoints = None  # size // 8 (set by setVars)
        self.timeEnd = None    # first timestamp *after* this slice
        self.deleted = False

    def __repr__(self):
        out = "Name: %s, tstart=%s tEnd=%s, freq=%s, size=%s, npoints=%s." % (
            self.name, self.timeStart, self.timeEnd, self.freq, self.size, self.numPoints)
        return out

    def setVars(self):
        """Stat the file and derive numPoints / timeEnd (8 bytes/point)."""
        self.size = os.path.getsize(self.name)
        self.numPoints = int(self.size / 8)
        self.timeEnd = self.timeStart + (self.numPoints * self.freq)
################################################################################
class CeresOverlapFixup(object):
    """Walks a Ceres tree and repairs overlapping slice files: a file whose
    time range is a subset of an earlier file's is deleted, and a file that
    partially overlaps the next file is truncated where the next one starts.

    NOTE(review): Python 2 code (print statements in writeLog).
    NOTE(review): several stats are double-counted -- removeFile() and the
    fixup loop both bump self.subsets, truncateFile() and the fixup loop
    both bump self.truncated, and fixupThisDir() plus walkDirStructure()
    both bump self.dirsExamined.
    """
    def __del__(self):
        # Log shutdown time and release the logfile when the object dies.
        import datetime
        self.writeLog("Ending at %s" % (str(datetime.datetime.today())))
        self.LOGFILE.flush()
        self.LOGFILE.close()

    def __init__(self):
        self.verbose = False
        self.debug = False
        self.LOGFILE = open("ceresOverlapFixup.log", "a")
        self.badFilesList = set()   # files that failed to parse/stat
        self.truncated = 0          # count of truncations performed
        self.subsets = 0            # count of subset files removed
        self.dirsExamined = 0
        self.lastStatusTime = 0     # throttle timestamp for printStatus()

    def getOptionParser(self):
        # Factory split out (presumably so tests can substitute a parser).
        return OptionParser()

    def getOptions(self):
        """Parse -d/-v/-b command line options; --basedir is mandatory."""
        parser = self.getOptionParser()
        parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False, help="debug mode for this program, writes debug messages to logfile." )
        parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="verbose mode for this program, prints a lot to stdout." )
        parser.add_option("-b", "--basedir", action="store", type="string", dest="basedir", default=None, help="base directory location to start converting." )
        (options, args) = parser.parse_args()
        self.debug = options.debug
        self.verbose = options.verbose
        self.basedir = options.basedir
        assert self.basedir, "must provide base directory."

    # Examples:
    # ./updateOperations/1346805360#60.slice
    # ./updateOperations/1349556660#60.slice
    # ./updateOperations/1346798040#60.slice
    def getFileData(self, inFilename):
        # Parse the slice filename and stat the file (raises on bad names).
        ret = SliceFile(inFilename)
        ret.setVars()
        return ret

    def removeFile(self, inFilename):
        os.remove(inFilename)
        #self.writeLog("removing file: %s" % (inFilename))
        # NOTE(review): caller increments self.subsets again (double count).
        self.subsets += 1

    def truncateFile(self, fname, newSize):
        """Shrink fname to newSize bytes via os.ftruncate."""
        if self.verbose:
            self.writeLog("Truncating file, name=%s, newsize=%s" % (pformat(fname), pformat(newSize)))
        IFD = None
        try:
            IFD = os.open(fname, os.O_RDWR|os.O_CREAT)
            os.ftruncate(IFD, newSize)
            os.close(IFD)
            # NOTE(review): caller increments self.truncated again.
            self.truncated += 1
        except:
            self.writeLog("Exception during truncate: %s" % (traceback.format_exc()))
        # Best-effort close in case the ftruncate path raised mid-way.
        try:
            os.close(IFD)
        except:
            pass
        return

    def printStatus(self):
        # Emit a progress line at most once every 10 seconds.
        now = self.getNowTime()
        if ((now - self.lastStatusTime) > 10):
            self.writeLog("Status: time=%d, Walked %s dirs, subsetFilesRemoved=%s, truncated %s files." % (now, self.dirsExamined, self.subsets, self.truncated))
            self.lastStatusTime = now

    def fixupThisDir(self, inPath, inFiles):
        """Repair all subset/overlap problems among the slice files of one
        directory (directories without a .ceres-node marker are skipped)."""
        # self.writeLog("Fixing files in dir: %s" % (inPath))
        if not '.ceres-node' in inFiles:
            # self.writeLog("--> Not a slice directory, skipping.")
            return
        self.dirsExamined += 1
        sortedFiles = sorted(inFiles)
        # Keep only slice files (named <start>#<freq>.slice).
        sortedFiles = [x for x in sortedFiles if ((x != '.ceres-node') and (x.count('#') > 0)) ]
        lastFile = None
        fileObjList = []
        for thisFile in sortedFiles:
            wholeFilename = os.path.join(inPath, thisFile)
            try:
                curFile = self.getFileData(wholeFilename)
                fileObjList.append(curFile)
            except:
                self.badFilesList.add(wholeFilename)
                self.writeLog("ERROR: file %s, %s" % (wholeFilename, traceback.format_exc()))
        # name is timeStart, really.
        fileObjList = sorted(fileObjList, key=lambda thisObj: thisObj.name)
        # Repeatedly compare the earliest file against the rest:
        #   1) drop any later file fully contained in it (subset),
        #   2) truncate it if the next file starts inside its range,
        # then advance to the next file.
        while fileObjList:
            self.printStatus()
            changes = False
            firstFile = fileObjList[0]
            removedFiles = []
            for curFile in fileObjList[1:]:
                if (curFile.timeEnd <= firstFile.timeEnd):
                    # have subset file. elim.
                    self.removeFile(curFile.name)
                    removedFiles.append(curFile.name)
                    self.subsets += 1
                    changes = True
                    if self.verbose:
                        self.writeLog("Subset file situation. First=%s, overlap=%s" % (firstFile, curFile))
            fileObjList = [x for x in fileObjList if x.name not in removedFiles]
            if (len(fileObjList) < 2):
                break
            secondFile = fileObjList[1]
            # LT is right. FirstFile's timeEnd is always the first open time after first is done.
            # so, first starts#100, len=2, end=102, positions used=100,101. second start#102 == OK.
            if (secondFile.timeStart < firstFile.timeEnd):
                # truncate first file.
                # file_A (last): +---------+
                # file_B (curr): +----------+
                # solve by truncating previous file at startpoint of current file.
                newLenFile_A_seconds = int(secondFile.timeStart - firstFile.timeStart)
                newFile_A_datapoints = int(newLenFile_A_seconds / firstFile.freq)
                newFile_A_bytes = int(newFile_A_datapoints) * 8
                if (not newFile_A_bytes):
                    # Nothing would remain of the first file; just move on.
                    fileObjList = fileObjList[1:]
                    continue
                assert newFile_A_bytes, "Must have size. newLenFile_A_seconds=%s, newFile_A_datapoints=%s, newFile_A_bytes=%s." % (newLenFile_A_seconds, newFile_A_datapoints, newFile_A_bytes)
                self.truncateFile(firstFile.name, newFile_A_bytes)
                if self.verbose:
                    self.writeLog("Truncate situation. First=%s, overlap=%s" % (firstFile, secondFile))
                self.truncated += 1
                fileObjList = fileObjList[1:]
                changes = True
            if not changes:
                fileObjList = fileObjList[1:]

    def getNowTime(self):
        return time.time()

    def walkDirStructure(self):
        """Walk basedir recursively, fixing each slice directory found."""
        startTime = self.getNowTime()
        self.lastStatusTime = startTime
        updateStatsDict = {}
        self.okayFiles = 0
        emptyFiles = 0
        for (thisPath, theseDirs, theseFiles) in os.walk(self.basedir):
            self.printStatus()
            self.fixupThisDir(thisPath, theseFiles)
            self.dirsExamined += 1
        endTime = time.time()
        # time.sleep(11)
        self.printStatus()
        self.writeLog( "now = %s, started at %s, elapsed time = %s seconds." % (startTime, endTime, endTime - startTime))
        self.writeLog( "Done.")

    def writeLog(self, instring):
        # Mirror every message to stdout and the logfile (Python 2 prints).
        print instring
        print >> self.LOGFILE, instring
        self.LOGFILE.flush()

    def main(self):
        self.getOptions()
        self.walkDirStructure()

Resources