PyQt Copy folder with progress and transfer rate/speed - windows

I'd like to expand my current code with the ability to show the transfer rate/speed of the files being copied. Im working on Windows 10 with py 3.6 and Qt 5.8. Here is my code:
import os
import shutil
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QLabel, QProgressBar, QFileDialog
class FileCopyProgress(QWidget):
def __init__(self, parent=None, src=None, dest=None):
super(FileCopyProgress, self).__init__()
self.src = src
self.dest = dest
self.build_ui()
def build_ui(self):
hbox = QVBoxLayout()
lbl_src = QLabel('Source: ' + self.src)
lbl_dest = QLabel('Destination: ' + self.dest)
self.pb = QProgressBar()
self.pb.setMinimum(0)
self.pb.setMaximum(100)
self.pb.setValue(0)
hbox.addWidget(lbl_src)
hbox.addWidget(lbl_dest)
hbox.addWidget(self.pb)
self.setLayout(hbox)
self.setWindowTitle('File copy')
self.auto_start_timer = QTimer()
self.auto_start_timer.singleShot(2000, lambda: self.copyFilesWithProgress(self.src, self.dest, self.progress, self.copydone))
self.show()
def progress(self, done, total):
progress = int(round((done/float(total))*100))
try:
self.pb.setValue(progress)
except:
pass
app.processEvents()
def copydone(self):
self.pb.setValue(100)
self.close()
def countfiles(self, _dir):
files = []
if os.path.isdir(_dir):
for path, dirs, filenames in os.walk(_dir):
files.extend(filenames)
return len(files)
def makedirs(self, dest):
if not os.path.exists(dest):
os.makedirs(dest)
#pyqtSlot()
def copyFilesWithProgress(self, src, dest, callback_progress, callback_copydone):
numFiles = self.countfiles(src)
if numFiles > 0:
dest = os.path.join(dest, src.replace(BASE_DIR, '').replace('\\', ''))
print(''.join(['Destination: ', dest]))
self.makedirs(dest)
numCopied = 0
for path, dirs, filenames in os.walk(src):
for directory in dirs:
destDir = path.replace(src,dest)
self.makedirs(os.path.join(destDir, directory))
for sfile in filenames:
srcFile = os.path.join(path, sfile)
destFile = os.path.join(path.replace(src, dest), sfile)
shutil.copy(srcFile, destFile)
numCopied += 1
callback_progress(numCopied, numFiles)
callback_copydone()
BASE_DIR = 'C:\\dev'
app = QApplication([])
FileCopyProgress(src="C:\dev\pywin32-221", dest='C:\dev\copied')
# Run the app
app.exec_()
This code opens a gui with a progressbar showing progress while copying files. A simple label with the current transfer rate/speed (approximate) would be rlly nice :)
Unfortuanetly i can't find any examples, can someone give me a hint or maybe a working example please?
EDIT:
I did a remake and now I have transfer rate, time elapsed and time remaining. The data seems to be realistic. I have only one problem: Lets assume i have a folder/file that time remaining is 7 sec -> currently it starts with 7 sec and gets an update every 1 second. We expect that in the next step it would display 6 sec but instead it goes:
5 sec
3 sec
1.5 sec
1.4 sec
1.3 sec
1.2 sec
1.1 sec and so on
Where is the mistake?
class FileCopyProgress(QWidget):
def __init__(self, parent=None, src=None, dest=None):
super(FileCopyProgress, self).__init__()
self.src = src
self.dest = dest
self.rate = "0"
self.total_time = "0 s"
self.time_elapsed = "0 s"
self.time_remaining = "0 s"
self.build_ui()
def build_ui(self):
hbox = QVBoxLayout()
lbl_src = QLabel('Source: ' + self.src)
lbl_dest = QLabel('Destination: ' + self.dest)
self.pb = QProgressBar()
self.lbl_rate = QLabel('Transfer rate: ' + self.rate)
self.lbl_time_elapsed = QLabel('Time Elapsed: ' + self.time_elapsed)
self.lbl_time_remaining = QLabel('Time Remaining: ' + self.time_remaining)
self.pb.setMinimum(0)
self.pb.setMaximum(100)
self.pb.setValue(0)
hbox.addWidget(lbl_src)
hbox.addWidget(lbl_dest)
hbox.addWidget(self.pb)
hbox.addWidget(self.lbl_rate)
hbox.addWidget(self.lbl_time_elapsed)
hbox.addWidget(self.lbl_time_remaining)
self.setLayout(hbox)
self.setWindowTitle('File copy')
self.auto_start_timer = QTimer()
self.auto_start_timer.singleShot(100, lambda: self.copy_files_with_progress(self.src, self.dest, self.progress, self.copy_done))
self.copy_timer = QTimer()
self.copy_timer.timeout.connect(lambda: self.process_informations())
self.copy_timer.start(1000)
self.show()
#pyqtSlot()
def process_informations(self):
time_elapsed_raw = time.clock() - self.start_time
self.time_elapsed = '{:.2f} s'.format(time_elapsed_raw)
self.lbl_time_elapsed.setText('Time Elapsed: ' + self.time_elapsed)
# example - Total: 100 Bytes, bisher kopiert 12 Bytes/s
time_remaining_raw = self._totalSize/self._copied
self.time_remaining = '{:.2f} s'.format(time_remaining_raw) if time_remaining_raw < 60. else '{:.2f} min'.format(time_remaining_raw)
self.lbl_time_remaining.setText('Time Remaining: ' + self.time_remaining)
rate_raw = (self._copied - self._copied_tmp)/1024/1024
self.rate = '{:.2f} MB/s'.format(rate_raw)
self.lbl_rate.setText('Transfer rate: ' + self.rate)
self._copied_tmp = self._copied
def progress(self):
self._progress = (self._copied/self._totalSize)*100
try:
self.pb.setValue(self._progress)
except:
pass
app.processEvents()
def get_total_size(self, src):
return sum( os.path.getsize(os.path.join(dirpath,filename)) for dirpath, dirnames, filenames in os.walk(src) for filename in filenames ) # total size of files in bytes
def copy_done(self):
self.pb.setValue(100)
print("done")
self.close()
def make_dirs(self, dest):
if not os.path.exists(dest):
os.makedirs(dest)
#pyqtSlot()
def copy_files_with_progress(self, src, dst, callback_progress, callback_copydone, length=16*1024*1024):
self._copied = 0
self._copied_tmp = 0
self._totalSize = self.get_total_size(src)
print(''.join(['Pre Dst: ', dst]))
dst = os.path.join(dst, src.replace(BASE_DIR, '').replace('\\', ''))
print(''.join(['Src: ', src]))
print(''.join(['Dst: ', dst]))
self.make_dirs(dst)
self.start_time = time.clock()
for path, dirs, filenames in os.walk(src):
for directory in dirs:
destDir = path.replace(src, dst)
self.make_dirs(os.path.join(destDir, directory))
for sfile in filenames:
srcFile = os.path.join(path, sfile)
destFile = os.path.join(dst, sfile)
# destFile = os.path.join(path.replace(src, dst), sfile)
with open(srcFile, 'rb') as fsrc:
with open(destFile, 'wb') as fdst:
while 1:
buf = fsrc.read(length)
if not buf:
break
fdst.write(buf)
self._copied += len(buf)
callback_progress()
try:
self.copy_timer.stop()
except:
print('Error: could not stop QTimer')
callback_copydone()

That's all about logic, think in the following way:
You have a TOTAL size of all files, let's say you have 100 dashes: [----- ... -----]
Now you get the starting time when you started transferring your files.
Choose some interval, let's have for example 2 secs.
So, after 2 secs see how much of the total you already transferred, in other words how much of files you already have in the new dir. Let's say you transferred 26 dashes.
The calc would be, 26 dashes/2secs = 13dashes per seconds.
Going deeper we have 26% of the content downloaded in 2 seconds since the total is 100 or 13% per second.
Even further we can calc also the prediction of time.
Total Time = 100%/13% = 7.6 secs
Time Elapsed = 2secs
Time Remaining = 7.6 - 2 = 5.6 secs
I think you got the idea...
Note: You just gotta keep verifying and remaking all these calcs in each 2 seconds, if you choose 2 secs of course, and updating the information to the user. If you wanna be even more precise or show more frequently updated information to the user just reduce that interval to let's say 0.5 secs and make the calc on top of milliseconds. It's up to you how often you update the information, that's just an overview of how to do the math. ")

Related

Rename images same as video

I saved one frame from each video. I want to rename the frame as the video, but I'm not sure how.
import cv2
import os
def video2frames(video_path, output_path,f_num):
if not os.path.exists(output_path):
os.makedirs(output_path)
cap = cv2.VideoCapture(video_path)
index = 0
while cap.isOpened():
Ret, Mat = cap.read()
if Ret:
index += 1
if index != f_num:
continue
cv2.imwrite(output_path + '/' + str(index) + '.png', Mat)
else:
break
cap.release()
return
def multiple_video2frames( video_path, output_path, f_num ):
list_videos = os.listdir(video_path)
for video in list_videos:
video_base = os.path.basename(video)
input_file = video_path + '/' + video
out_path = output_path + '/' + video_base
video2frames(input_file, out_path, f_num)
return
# run all
video_path = 'videos' # all videos
output_path = 'final' # location on ur pc
multiple_video2frames( video_path, output_path, 300 )
Searched: How can I extract and save image frames from a large number of videos all at once using OpenCV Python?

Fine tuning Bert for NER attempt on Mac OS

I'm using a MacBook Air/OS Monterey 12.5 (There are updates available; Ventura 13.1
Python version 3.10.8 and also tried using 3.11
Pylance has pointed that all the imports I was trying to execute were not being resolved so I changed the VS Code interpreter to Python 3.10.
Anyways, here's the code:
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
from transformers import BertTokenizerFast
from transformers import BertForTokenClassification
from torch.utils.data import Dataset, DataLoader
df = pd.read_csv('ner.csv')
labels = [i.split() for i in df['labels'].values.tolist()]
unique_labels = set()
for lb in labels:
[unique_labels.add(i) for i in lb if i not in unique_labels]
# print(unique_labels)
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
# print(labels_to_ids)
text = df['text'].values.tolist()
example = text[36]
#print(example)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
text_tokenized = tokenizer(example, padding='max_length', max_length=512, truncation=True, return_tensors='pt')
'''
print(text_tokenized)
print(tokenizer.decode(text_tokenized.input_ids[0]))
'''
def align_label_example(tokenized_input, labels):
word_ids = tokenized_input.word_ids()
previous_word_idx = None
label_ids = []
for word_idx in word_ids:
if word_idx is None:
label_ids.append(-100)
elif word_idx != previous_word_idx:
try:
label_ids.append(labels_to_ids[labels[word_idx]])
except:
label_ids.append(-100)
else:
label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
previous_word_idx = word_idx
return label_ids;
label = labels[36]
label_all_tokens = False
new_label = align_label_example(text_tokenized, label)
'''
print(new_label)
print(tokenizer.convert_ids_to_tokens(text_tokenized['input_ids'][0]))
'''
def align_label(texts, labels):
tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)
word_ids = tokenized_inputs.word_ids()
previous_word_idx = None
label_ids = []
for word_idx in word_ids:
if word_idx is None:
label_ids.append(-100)
elif word_idx != previous_word_idx:
try:
label_ids.append(labels_to_ids[labels[word_idx]])
except:
label_ids.append(-100)
else:
try:
label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
except:
label_ids.append(-100)
previous_word_idx = word_idx
return label_ids
class DataSequence(torch.utils.data.Dataset):
def __init__(self, df):
lb = [i.split() for i in df['labels'].values.tolist()]
txt = df['text'].values.tolist()
self.texts = [tokenizer(str(i),
padding='max_length', max_length=512, truncation=True, return_tensors='pt') for i in txt]
self.labels = [align_label(i,j) for i,j in zip(txt, lb)]
def __len__(self):
return len(self.labels)
def get_batch_labels(self, idx):
return torch.LongTensor(self.labels[idx])
def __getitem__(self, idx):
batch_data = self.get_batch_data(idx)
batch_labels = self.get_batch_labels(idx)
return batch_data, batch_labels
df = df[0:1000]
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
[int(.8 * len(df)), int(.9 * len(df))])
class BertModel(torch.nn.Module):
def __init__(self):
super(BertModel, self).__init__()
self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))
def forward(self, input_id, mask, label):
output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
return output
def train_loop(model, df_train, df_val):
train_dataset = DataSequence(df_train)
val_dataset = DataSequence(df_val)
train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
if use_cuda:
model = model.cuda()
best_acc = 0
best_loss = 1000
for epoch_num in range(EPOCHS):
total_acc_train = 0
total_loss_train = 0
model.train()
for train_data, train_label in tqdm(train_dataloader):
train_label = train_label.to(device)
mask = train_data['attention_mask'].squeeze(1).to(device)
input_id = train_data['input_ids'].squeeze(1).to(device)
optimizer.zero_grad()
loss, logits = model(input_id, mask, train_label)
for i in range(logits.shape[0]):
logits_clean = logits[i][train_label[i] != -100]
label_clean = train_label[i][train_label[i] != -100]
predictions = logits_clean.argmax(dim=1)
acc = (predictions == label_clean).float().mean()
total_acc_train += acc
total_loss_train += loss.item()
loss.backward()
optimizer.step()
model.eval()
total_acc_val = 0
total_loss_val = 0
for val_data, val_label in val_dataloader:
val_label = val_label.to(device)
mask = val_data['attention_mask'].squeeze(1).to(device)
input_id = val_data['input_ids'].squeeze(1).to(device)
loss, logits = model(input_id, mask, val_label)
for i in range(logits.shape[0]):
logits_clean = logits[i][val_label[i] != -100]
label_clean = val_label[i][val_label[i] != -100]
predictions = logits_clean.argmax(dim=1)
acc = (predictions == label_clean).float().mean()
total_acc_val += acc
total_loss_val += loss.item()
val_accuracy = total_acc_val / len(df_val)
val_loss = total_loss_val / len(df_val)
print(
f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')
LEARNING_RATE = 5e-3
EPOCHS = 5
BATCH_SIZE = 2
model = BertModel()
train_loop(model, df_train, df_val)
And the debugger says:
Exception has occurred: RuntimeError (note: full exception trace is shown but execution is paused at: <module>)
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
File "/Users/filipedonatti/Projects/pyCodes/second_try.py", line 141, in train_loop
for train_data, train_label in tqdm(train_dataloader):
File "/Users/filipedonatti/Projects/pyCodes/second_try.py", line 197, in <module>
train_loop(model, df_train, df_val)
File "<string>", line 1, in <module> (Current frame)
By the way,
Despite using Mac, I have downloaded Anaconda-Navigator, however I've been trying and executing this code on VS Code. I've downloaded numpy, torch, datasets and other libraries through Brew with the pip3 command.
I'm at a loss, I can run the code on a google collab notebook or Jupiter notebook, and I know training models and such in my humble Mac would not be advised, but I am just exercising this so I can train and use the model in a much more powerful machine.
Please help me with this issue, I've been trying to find a solution for days.
Peace and happy holidays.
I've tried solving the issue by writing:
if __name__ == '__main__':
freeze_support()
I've tried using this:
import parallelTestModule
extractor = parallelTestModule.ParallelExtractor()
extractor.runInParallel(numProcesses=2, numThreads=4)
So...
It turns out the correct way to solve this is to implement a function to train the loop as such:
def run():
model = BertModel()
torch.multiprocessing.freeze_support()
print('loop')
train_loop(model, df_train, df_val)
if __name__ == '__main__':
run()
Redefining that train_loop line in the end. Issue solved. For more see this link: https://github.com/pytorch/pytorch/issues/5858

What's wrong with Windows Task Sheduler + pytesseract + multiprocessing?

Have the python code with pytesseract & multiprocessing. When I start the code manually from PyCharm it works fine with any number of threads. When I start the code with Win Task Sheduler with 'threads=1' it works fine.
However if I start the code with Win Task Sheduler with 'threads=2' or more than 2, it finishes without processing the images and without any errors.
I've got log messages like this. Script starts but does nothing and there is no any errors in Win logs
2020-05-24 13:09:31,834;START
2020-05-24 13:09:31,834;threads: 2
2020-05-24 13:10:31,832;START
2020-05-24 13:10:31,832;threads: 2
2020-05-24 13:11:31,851;START
2020-05-24 13:11:31,851;threads: 2
Code
from PIL import Image
import pytesseract
from pytesseract import Output
import datetime
from glob import glob
import os
import multiprocessing as multiprocessing
import cv2
import logging
def loggerinit(name, filename, overwrite):
logger = logging.getLogger(name)
logger.setLevel(logging.INFO)
# create the logging file handler
fh = logging.FileHandler(filename, encoding = 'UTF-8')
formatter = logging.Formatter('%(asctime)s;%(message)s')
fh.setFormatter(formatter)
# add handler to logger object
logger.addHandler(fh)
return logger
def getfiles(dirname, mask):
return glob(os.path.join(dirname, mask))
def tess_file(fname):
img = cv2.imread(fname)
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
im_for_T = Image.fromarray(img)
pytesseract.pytesseract.tesseract_cmd = 'C://Tesseract-OCR//tesseract.exe'
TESSDATA_PREFIX = 'C:/Program Files/Tesseract-OCR/tessdata'
try:
os.environ['OMP_THREAD_LIMIT'] = '1'
tess_data = pytesseract.image_to_osd(im_for_T, output_type=Output.DICT)
return fname, tess_data
except:
return fname, None
if __name__ == '__main__':
logger = loggerinit('tess', 'tess.log', False)
files = getfiles('Croped', '*.jpg')
t1 = datetime.datetime.now()
logger.info('START')
threads = 2
logger.info('threads: ' + str(threads))
p = multiprocessing.Pool(threads)
results = p.map(tess_file,files)
e = []
for r in results:
if type(r) == type(None):
e.append('OCR error: ' + r)
else:
print(r[0],". rotate: ",r[1]['rotate'])
p.close()
p.join()
t2 = datetime.datetime.now()
delta = (t2 - t1).total_seconds()
print('Total time: ', delta)
print('Files: ', len(files))
logger.info('Files: ' + str(len(files)))
logger.info('Stop.' + 'Total time: ' + str(delta))
# Print error if exist
for ee in e:
print(ee)
Whats wrong? How can I fix this issue?

Only using 20% of CPU after multiprocessing.pool.starmap() in python

The intention of the program is to run through a directory, if a file is an excel spreadsheet it should open it, extract and manipulate some data then move onto the next file. As this is a laborious process I have tried to split the task across multiple threads. Even after this is only using 20% of the total CPU capacity, and it didn't particular speed up.
def extract_data(unique_file_names):
global rootdir
global newarray
global counter
global t0
string = rootdir + "\\" + str(unique_file_names[0])
wb = load_workbook(string, read_only = True, data_only=True)
ws = wb["Sheet1"]
df = pd.DataFrame(ws.values)
newarray = df.loc[4:43,:13].values
counter = 0
print("Starting pool")
pool = ThreadPool(processes=20)
pool.map(process, unique_file_names)
pool.close()
def process(filename):
global newarray
global unique_file_names
global counter
global t0
counter+=1
try:
file_name = rootdir + "/" + str(filename)
wb = load_workbook(file_name, read_only = True, data_only=True)
ws = wb["Sheet1"]
df = pd.DataFrame(ws.values)
newarray = np.hstack((newarray, df.loc[4:43,4:13].values))
except:
print("Failure")
pass
print("Time %.2f, Completed %.2f %% " %((time.clock()-t0),counter*100/len(unique_file_names)))
So it roughly takes around a second and a half to process one spreadsheet, but like I said pool.map() made next to no difference. Any suggestions?
Thanks in advance

Graphite / Carbon / Ceres node overlap

I'm working with Graphite monitoring using Carbon and Ceres as the storage method. I have some problems with correcting bad data. It seems that (due to various problems) I've ended up with overlapping files. That is, since Carbon / Ceres stores the data as timestamp#interval.slice, I can have two or more files with overlapping time ranges.
There are two kinds of overlaps:
File A: +------------+ orig file
File B: +-----+ subset
File C: +---------+ overlap
This is causing problems because the existing tools available (ceres-maintenance defrag and rollup) don't cope with these overlaps. Instead, they skip the directory and move on. This is a problem, obviously.
I've created a script that fixes this problem, as follows:
For subsets, just delete the subset file.
For overlaps, using the file system 'truncate' on the orig file at the point where the next file starts. While it is possible to cut off the start of the overlap file and rename it properly, I would suggest that this is fraught with danger.
I've found that it's possible to do this in two ways:
Walk the dirs and iterate over the files, fixing as you go, and find the file subsets, remove them;
Walk the dirs and fix all the problems in a dir before moving on. This is BY FAR the faster approach, since the dir walk is hugely time consuming.
Code:
#!/usr/bin/env python2.6
################################################################################
import io
import os
import time
import sys
import string
import logging
import unittest
import datetime
import random
import zmq
import json
import socket
import traceback
import signal
import select
import simplejson
import cPickle as pickle
import re
import shutil
import collections
from pymongo import Connection
from optparse import OptionParser
from pprint import pprint, pformat
################################################################################
class SliceFile(object):
def __init__(self, fname):
self.name = fname
basename = fname.split('/')[-1]
fnArray = basename.split('#')
self.timeStart = int(fnArray[0])
self.freq = int(fnArray[1].split('.')[0])
self.size = None
self.numPoints = None
self.timeEnd = None
self.deleted = False
def __repr__(self):
out = "Name: %s, tstart=%s tEnd=%s, freq=%s, size=%s, npoints=%s." % (
self.name, self.timeStart, self.timeEnd, self.freq, self.size, self.numPoints)
return out
def setVars(self):
self.size = os.path.getsize(self.name)
self.numPoints = int(self.size / 8)
self.timeEnd = self.timeStart + (self.numPoints * self.freq)
################################################################################
class CeresOverlapFixup(object):
def __del__(self):
import datetime
self.writeLog("Ending at %s" % (str(datetime.datetime.today())))
self.LOGFILE.flush()
self.LOGFILE.close()
def __init__(self):
self.verbose = False
self.debug = False
self.LOGFILE = open("ceresOverlapFixup.log", "a")
self.badFilesList = set()
self.truncated = 0
self.subsets = 0
self.dirsExamined = 0
self.lastStatusTime = 0
def getOptionParser(self):
return OptionParser()
def getOptions(self):
parser = self.getOptionParser()
parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False, help="debug mode for this program, writes debug messages to logfile." )
parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="verbose mode for this program, prints a lot to stdout." )
parser.add_option("-b", "--basedir", action="store", type="string", dest="basedir", default=None, help="base directory location to start converting." )
(options, args) = parser.parse_args()
self.debug = options.debug
self.verbose = options.verbose
self.basedir = options.basedir
assert self.basedir, "must provide base directory."
# Examples:
# ./updateOperations/1346805360#60.slice
# ./updateOperations/1349556660#60.slice
# ./updateOperations/1346798040#60.slice
def getFileData(self, inFilename):
ret = SliceFile(inFilename)
ret.setVars()
return ret
def removeFile(self, inFilename):
os.remove(inFilename)
#self.writeLog("removing file: %s" % (inFilename))
self.subsets += 1
def truncateFile(self, fname, newSize):
if self.verbose:
self.writeLog("Truncating file, name=%s, newsize=%s" % (pformat(fname), pformat(newSize)))
IFD = None
try:
IFD = os.open(fname, os.O_RDWR|os.O_CREAT)
os.ftruncate(IFD, newSize)
os.close(IFD)
self.truncated += 1
except:
self.writeLog("Exception during truncate: %s" % (traceback.format_exc()))
try:
os.close(IFD)
except:
pass
return
def printStatus(self):
now = self.getNowTime()
if ((now - self.lastStatusTime) > 10):
self.writeLog("Status: time=%d, Walked %s dirs, subsetFilesRemoved=%s, truncated %s files." % (now, self.dirsExamined, self.subsets, self.truncated))
self.lastStatusTime = now
def fixupThisDir(self, inPath, inFiles):
# self.writeLog("Fixing files in dir: %s" % (inPath))
if not '.ceres-node' in inFiles:
# self.writeLog("--> Not a slice directory, skipping.")
return
self.dirsExamined += 1
sortedFiles = sorted(inFiles)
sortedFiles = [x for x in sortedFiles if ((x != '.ceres-node') and (x.count('#') > 0)) ]
lastFile = None
fileObjList = []
for thisFile in sortedFiles:
wholeFilename = os.path.join(inPath, thisFile)
try:
curFile = self.getFileData(wholeFilename)
fileObjList.append(curFile)
except:
self.badFilesList.add(wholeFilename)
self.writeLog("ERROR: file %s, %s" % (wholeFilename, traceback.format_exc()))
# name is timeStart, really.
fileObjList = sorted(fileObjList, key=lambda thisObj: thisObj.name)
while fileObjList:
self.printStatus()
changes = False
firstFile = fileObjList[0]
removedFiles = []
for curFile in fileObjList[1:]:
if (curFile.timeEnd <= firstFile.timeEnd):
# have subset file. elim.
self.removeFile(curFile.name)
removedFiles.append(curFile.name)
self.subsets += 1
changes = True
if self.verbose:
self.writeLog("Subset file situation. First=%s, overlap=%s" % (firstFile, curFile))
fileObjList = [x for x in fileObjList if x.name not in removedFiles]
if (len(fileObjList) < 2):
break
secondFile = fileObjList[1]
# LT is right. FirstFile's timeEnd is always the first open time after first is done.
# so, first starts#100, len=2, end=102, positions used=100,101. second start#102 == OK.
if (secondFile.timeStart < firstFile.timeEnd):
# truncate first file.
# file_A (last): +---------+
# file_B (curr): +----------+
# solve by truncating previous file at startpoint of current file.
newLenFile_A_seconds = int(secondFile.timeStart - firstFile.timeStart)
newFile_A_datapoints = int(newLenFile_A_seconds / firstFile.freq)
newFile_A_bytes = int(newFile_A_datapoints) * 8
if (not newFile_A_bytes):
fileObjList = fileObjList[1:]
continue
assert newFile_A_bytes, "Must have size. newLenFile_A_seconds=%s, newFile_A_datapoints=%s, newFile_A_bytes=%s." % (newLenFile_A_seconds, newFile_A_datapoints, newFile_A_bytes)
self.truncateFile(firstFile.name, newFile_A_bytes)
if self.verbose:
self.writeLog("Truncate situation. First=%s, overlap=%s" % (firstFile, secondFile))
self.truncated += 1
fileObjList = fileObjList[1:]
changes = True
if not changes:
fileObjList = fileObjList[1:]
def getNowTime(self):
return time.time()
def walkDirStructure(self):
startTime = self.getNowTime()
self.lastStatusTime = startTime
updateStatsDict = {}
self.okayFiles = 0
emptyFiles = 0
for (thisPath, theseDirs, theseFiles) in os.walk(self.basedir):
self.printStatus()
self.fixupThisDir(thisPath, theseFiles)
self.dirsExamined += 1
endTime = time.time()
# time.sleep(11)
self.printStatus()
self.writeLog( "now = %s, started at %s, elapsed time = %s seconds." % (startTime, endTime, endTime - startTime))
self.writeLog( "Done.")
def writeLog(self, instring):
print instring
print >> self.LOGFILE, instring
self.LOGFILE.flush()
def main(self):
self.getOptions()
self.walkDirStructure()

Resources