NLTK similarity performance issue?

NLTK has a nice word-to-word similarity function (wup_similarity) which measures similarity by how close the two terms are to their common hypernym. Although that similarity function is not applicable when the two terms differ in POS tag, it is still great.
However, I found that it is very slow: about 10x slower than plain term matching. Is there any way to make the NLTK similarity function faster?
I have tested with this code below:
from nltk import stem, RegexpStemmer
from nltk.corpus import wordnet, stopwords
from nltk.tag import pos_tag
import time

file1 = open('./tester.csv', 'r')

def similarityCal(word1, word2):
    synset1 = wordnet.synsets(word1)
    synset2 = wordnet.synsets(word2)
    if len(synset1) != 0 and len(synset2) != 0:
        wordFromList1 = synset1[0]
        wordFromList2 = synset2[0]
        return wordFromList1.wup_similarity(wordFromList2)
    else:
        return 0
start_time = time.time()
file1lines = file1.readlines()
stopwords = stopwords.words('english')

previousLine = ""
currentLine = ""
cntOri = 0
cntExp = 0

for line1 in file1lines:
    currentLine = line1.lower().strip()
    if previousLine == "":
        previousLine = currentLine
        continue
    else:
        for tag1 in pos_tag(currentLine.split(" ")):
            tmpStr1 = tag1[0]
            if tmpStr1 not in stopwords and len(tmpStr1) > 1:
                if tmpStr1 in previousLine:
                    print("termMatching word", tmpStr1)
                    cntOri = cntOri + 1
                for tag2 in pos_tag(previousLine.split(" ")):
                    tmpStr2 = tag2[0]
                    if (tag1[1].startswith("NN") and tag2[1].startswith("NN")) or (tag1[1].startswith("VB") and tag2[1].startswith("VB")):
                        value = similarityCal(tmpStr1, tmpStr2)
                        if type(value) is float and value > 0.8:
                            print(tmpStr1, " similar to ", tmpStr2, " ", value)
                            cntExp = cntExp + 1
    previousLine = currentLine

end_time = time.time()
print("time taken : ", end_time - start_time, " // ", cntOri, " | ", cntExp)
file1.close()
I simply commented out the similarity call to compare the performance.
And I have used samples from this site:
https://www.briandunning.com/sample-data/
Any ideas?
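Not part of the original question, but one common speedup is worth sketching: the same word pairs recur across lines, so memoizing the similarity lookups avoids recomputing them. A minimal sketch using functools.lru_cache:

from functools import lru_cache
from nltk.corpus import wordnet

@lru_cache(maxsize=None)  # word pairs repeat across lines, so cache the results
def similarityCal_cached(word1, word2):
    synset1 = wordnet.synsets(word1)
    synset2 = wordnet.synsets(word2)
    if synset1 and synset2:
        return synset1[0].wup_similarity(synset2[0])
    return 0

Hoisting pos_tag(previousLine.split(" ")) out of the inner loop (in the code above it is recomputed for every word of the current line) is another easy win.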

Related

Fine tuning Bert for NER attempt on Mac OS

I'm using a MacBook Air with macOS Monterey 12.5 (there are updates available, up to Ventura 13.1).
Python version 3.10.8, and I also tried 3.11.
Pylance pointed out that none of the imports I was trying to use were being resolved, so I changed the VS Code interpreter to Python 3.10.
Anyways, here's the code:
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
from transformers import BertTokenizerFast
from transformers import BertForTokenClassification
from torch.utils.data import Dataset, DataLoader

df = pd.read_csv('ner.csv')

labels = [i.split() for i in df['labels'].values.tolist()]
unique_labels = set()
for lb in labels:
    unique_labels.update(lb)
# print(unique_labels)

labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
# print(labels_to_ids)

text = df['text'].values.tolist()
example = text[36]
#print(example)

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
text_tokenized = tokenizer(example, padding='max_length', max_length=512, truncation=True, return_tensors='pt')
'''
print(text_tokenized)
print(tokenizer.decode(text_tokenized.input_ids[0]))
'''
def align_label_example(tokenized_input, labels):
    word_ids = tokenized_input.word_ids()
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
        previous_word_idx = word_idx
    return label_ids
label = labels[36]
label_all_tokens = False
new_label = align_label_example(text_tokenized, label)
'''
print(new_label)
print(tokenizer.convert_ids_to_tokens(text_tokenized['input_ids'][0]))
'''
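For context (my illustration, not part of the original post): word_ids() maps each token position back to the index of the word it came from, which is what the alignment above keys on. A minimal sketch, assuming the same tokenizer:

from transformers import BertTokenizerFast

tok = BertTokenizerFast.from_pretrained('bert-base-uncased')
enc = tok("fine tuning BERT", padding='max_length', max_length=8, truncation=True)
# Special tokens and padding map to None; subword pieces share their word's index.
print(enc.word_ids())  # e.g. [None, 0, 1, 2, None, None, None, None]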
def align_label(texts, labels):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx
    return label_ids
class DataSequence(torch.utils.data.Dataset):
    def __init__(self, df):
        lb = [i.split() for i in df['labels'].values.tolist()]
        txt = df['text'].values.tolist()
        self.texts = [tokenizer(str(i),
                                padding='max_length', max_length=512, truncation=True, return_tensors='pt') for i in txt]
        self.labels = [align_label(i, j) for i, j in zip(txt, lb)]

    def __len__(self):
        return len(self.labels)

    def get_batch_data(self, idx):
        # this method was missing from the pasted code but is called by __getitem__
        return self.texts[idx]

    def get_batch_labels(self, idx):
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)
        return batch_data, batch_labels
df = df[0:1000]
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                                     [int(.8 * len(df)), int(.9 * len(df))])
class BertModel(torch.nn.Module):
    def __init__(self):
        super(BertModel, self).__init__()
        # note: the tokenizer above is bert-base-uncased while this model is bert-base-cased
        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):
        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
        return output
def train_loop(model, df_train, df_val):
    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')

    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):
        total_acc_train = 0
        total_loss_train = 0
        model.train()
        for train_data, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)
            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            for i in range(logits.shape[0]):
                logits_clean = logits[i][train_label[i] != -100]
                label_clean = train_label[i][train_label[i] != -100]
                predictions = logits_clean.argmax(dim=1)
                acc = (predictions == label_clean).float().mean()
                total_acc_train += acc

            total_loss_train += loss.item()
            loss.backward()
            optimizer.step()

        model.eval()
        total_acc_val = 0
        total_loss_val = 0
        for val_data, val_label in val_dataloader:
            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)
            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):
                logits_clean = logits[i][val_label[i] != -100]
                label_clean = val_label[i][val_label[i] != -100]
                predictions = logits_clean.argmax(dim=1)
                acc = (predictions == label_clean).float().mean()
                total_acc_val += acc

            total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')
LEARNING_RATE = 5e-3
EPOCHS = 5
BATCH_SIZE = 2
model = BertModel()
train_loop(model, df_train, df_val)
And the debugger says:
Exception has occurred: RuntimeError (note: full exception trace is shown but execution is paused at: <module>)
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
    freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
File "/Users/filipedonatti/Projects/pyCodes/second_try.py", line 141, in train_loop
for train_data, train_label in tqdm(train_dataloader):
File "/Users/filipedonatti/Projects/pyCodes/second_try.py", line 197, in <module>
train_loop(model, df_train, df_val)
File "<string>", line 1, in <module> (Current frame)
By the way: despite using a Mac, I have downloaded Anaconda Navigator; however, I've been writing and executing this code in VS Code. I've downloaded numpy, torch, datasets and other libraries through Brew, with the pip3 command.
I'm at a loss. I can run the code in a Google Colab or Jupyter notebook, and I know training models on my humble Mac is not advisable, but I am just doing this as an exercise so I can train and use the model on a much more powerful machine.
Please help me with this issue; I've been trying to find a solution for days.
Peace and happy holidays.
I've tried solving the issue by writing:

if __name__ == '__main__':
    freeze_support()

I've tried using this:

import parallelTestModule

extractor = parallelTestModule.ParallelExtractor()
extractor.runInParallel(numProcesses=2, numThreads=4)
So...
It turns out the correct way to solve this is to wrap the training call in a function, as such:

def run():
    model = BertModel()
    torch.multiprocessing.freeze_support()
    print('loop')
    train_loop(model, df_train, df_val)

if __name__ == '__main__':
    run()

replacing the bare train_loop call at the end of the script with this guard. Issue solved. For more, see this link: https://github.com/pytorch/pytorch/issues/5858
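For anyone wondering why the guard fixes it (my sketch, not from the original answer): macOS starts multiprocessing workers with "spawn", so each DataLoader worker re-imports the main script; any top-level code that itself creates workers then recurses, which is exactly the bootstrapping error above. With the guard, the re-import is harmless:

import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.arange(10, dtype=torch.float32).unsqueeze(1))

def main():
    # num_workers > 0 spawns child processes that re-import this module
    loader = DataLoader(dataset, batch_size=2, num_workers=2)
    for (batch,) in loader:
        print(batch.squeeze(1).tolist())

if __name__ == '__main__':
    main()  # workers re-import the module but skip this call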

ZPL print Italic Bold and Underline

I need to print ZPL with dynamic content.
For example: "I need your help. I am dynamic content, please help me."
Is it possible to print words like these in bold, italic or underline when the content is dynamic?
ZPL code please.
If you want fake bold you can print the same text twice, offset by one dot:

^FO340,128^FDbold^FS
^FO339,128^FDbold^FS
Another option is using external fonts for underline, italic and bold:
http://labelary.com/docs.html
There is no simple way to bold or italicize text within ZPL. The fonts the printer has are very basic and can't be changed like that.
Complex font settings (italic, bold, serif) are actually sent as compressed images to ZPL printers (you can check this with ZebraDesigner).
The format is called Z64, which is based on LZ77.
These two pages contain interesting Java code for writing a converter:
http://www.jcgonzalez.com/img-to-zpl-online
https://gist.github.com/trevarj/1255e5cbc08fb3f79c3f255e25989a18
...still, I'm not sure whether the CRC part of the conversion will remain the same in the future, as this is probably vendor-dependent.
Here is a Python port of the first script :
import cv2
import base64
import matplotlib.pyplot as plt
import io
import numpy

blacklimit = int(50 * 768 / 100)
compress = False
total = 0
width_byte = 0
mapCode = dict()
LOCAL_PATH = "C://DEV//printer//zebra_debug.txt"

'''
class StringBuilder(object):
    def __init__(self):
        self._stringio = io.StringIO()

    def __str__(self):
        return self._stringio.getvalue()

    def getvalue(self):
        return self._stringio.getvalue()

    def append(self, *objects, sep=' ', end=''):
        print(*objects, sep=sep, end=end, file=self._stringio)
'''
def init_map_code():
    # ZPL hex-compression repeat counts: 1..19 map to 'G'..'Y', 20,40,...,400 map to 'g'..'z'
    global mapCode
    for i in range(1, 20):
        mapCode[i] = chr(ord('G') + i - 1)
    for i in range(20, 401, 20):
        mapCode[i] = chr(ord('g') + i // 20 - 1)
def numberToBase(n, b):
    if n == 0:
        return [0]
    digits = []
    while n:
        digits.append(int(n % b))
        n //= b
    return digits[::-1]
def four_byte_binary(binary_str):
    # convert an 8-bit binary string into two uppercase hex characters
    decimal = int(binary_str, 2)
    if decimal > 15:
        return hex(decimal).upper()[2:]
    else:
        return "0" + hex(decimal).upper()[2:]
def createBody(img):
    global blacklimit
    global width_byte
    global total

    height, width, colmap = img.shape
    print(height, width, colmap)

    index = 0
    aux_binary_char = ['0', '0', '0', '0', '0', '0', '0', '0']
    sb = []

    if width % 8 > 0:
        width_byte = int((width / 8) + 1)
    else:
        width_byte = int(width / 8)
    total = width_byte * height

    for h in range(0, height):
        for w in range(0, width):
            color = img[h, w]
            blue = color[0] & 0xFF
            green = color[1] & 0xFF
            red = color[2] & 0xFF

            # a pixel is printed ('1') when its summed RGB is at or below the black threshold
            auxchar = '1'
            total_color = red + green + blue
            if total_color > blacklimit:
                auxchar = '0'
            aux_binary_char[index] = auxchar
            index = index + 1

            if index == 8 or w == (width - 1):
                sb.append(four_byte_binary("".join(aux_binary_char)))
                aux_binary_char = ['0', '0', '0', '0', '0', '0', '0', '0']
                index = 0
        sb.append("\n")

    return ''.join(sb)
def encode_hex_ascii(code):
    global width_byte
    global mapCode

    max_linea = width_byte * 2
    sb_code = []
    sb_linea = []
    previous_line = 1
    counter = 1
    aux = code[0]
    first_char = False

    for i in range(1, len(code)):
        if first_char:
            aux = code[i]
            first_char = False
            continue
        if code[i] == "\n":
            if counter >= max_linea and aux == '0':
                sb_linea.append(",")
            elif counter >= max_linea and aux == 'F':
                sb_linea.append("!")
            elif counter > 20:
                multi20 = int(counter / 20) * 20
                resto20 = counter % 20
                sb_linea.append(mapCode[multi20])
                if resto20 != 0:
                    sb_linea.append(mapCode[resto20] + aux)
                else:
                    sb_linea.append(aux)
            else:
                sb_linea.append(mapCode[counter] + aux)
            counter = 1
            first_char = True
            if ''.join(sb_linea) == previous_line:
                sb_code.append(":")
            else:
                sb_code.append(''.join(sb_linea))
            previous_line = ''.join(sb_linea)
            sb_linea = []
            continue
        if aux == code[i]:
            counter = counter + 1
        else:
            if counter > 20:
                multi20 = int(counter / 20) * 20
                resto20 = counter % 20
                sb_linea.append(mapCode[multi20])
                if resto20 != 0:
                    sb_linea.append(mapCode[resto20] + aux)
                else:
                    sb_linea.append(aux)
            else:
                sb_linea.append(mapCode[counter] + aux)
            counter = 1
            aux = code[i]
    return ''.join(sb_code)
def head_doc():
    global total
    global width_byte
    return "^XA " + "^FO0,0^GFA," + str(int(total)) + "," + str(int(total)) + "," + str(int(width_byte)) + ", "

def foot_doc():
    return "^FS" + "^XZ"

def process(img):
    global compress
    init_map_code()
    cuerpo = createBody(img)
    print("CUERPO\n")
    print(cuerpo)
    print("\n")
    if compress:
        cuerpo = encode_hex_ascii(cuerpo)
        print("COMPRESS\n")
        print(cuerpo)
        print("\n")
    return head_doc() + cuerpo + foot_doc()
img = cv2.imread("C:\\Users\\ftheeten\\Pictures\\out.jpg", cv2.IMREAD_COLOR)
compress = True
blacklimit = int(50 * 768 / 100)  # was "blacklimit ==", a typo that made this line a no-op comparison
test = process(img)
file = open(LOCAL_PATH, 'w')
file.write(test)
file.close()
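Not part of the original answer, but once you have the ZPL string, most Zebra printers will also accept it over the raw TCP printing port (9100 by convention). A minimal sketch, with a hypothetical printer address:

import socket

def send_zpl(zpl, host, port=9100):
    # raw TCP printing: write the ZPL bytes and close the connection
    with socket.create_connection((host, port), timeout=5) as s:
        s.sendall(zpl.encode('ascii'))

# send_zpl(test, '192.168.1.50')  # hypothetical printer address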

R: tm Textmining package: Doc-Level metadata generation is slow

I have a list of documents to process, and for each record I want to attach some metadata to the document "member" inside the "corpus" data structure that tm, the R package, generates (from reading in text files).
This for-loop works, but it is very slow; performance seems to degrade as a function f ~ 1/n_docs.
for (i in seq(from = 1, to = length(corpus), by = 1)) {
    if (opts$options$verbose == TRUE || i %% 50 == 0) {
        print(paste(i, " ", substr(corpus[[i]], 1, 140), sep = " "))
    }
    DublinCore(corpus[[i]], "title") = csv[[i, 10]]
    DublinCore(corpus[[i]], "Publisher") = csv[[i, 16]]  # institutions
}
This may be doing something to the corpus variable, but I don't know what.
But when I put the same logic inside tm_map() (similar to lapply()), it runs much faster, though the changes are not made persistent:
i = 0
corpus = tm_map(corpus, function(x) {
    i <<- i + 1
    if (opts$options$verbose == TRUE) {
        print(paste(i, " ", substr(x, 1, 140), sep = " "))
    }
    meta(x, tag = "Heading") = csv[[i, 10]]
    meta(x, tag = "publisher") = csv[[i, 16]]
})
The corpus variable has empty metadata fields after exiting the tm_map() call, when it should be filled. I have a few other things to do with the collection.
The R documentation for the meta() function says this:
Examples:
data("crude")
meta(crude[[1]])
DublinCore(crude[[1]])
meta(crude[[1]], tag = "Topics")
meta(crude[[1]], tag = "Comment") <- "A short comment."
meta(crude[[1]], tag = "Topics") <- NULL
DublinCore(crude[[1]], tag = "creator") <- "Ano Nymous"
DublinCore(crude[[1]], tag = "Format") <- "XML"
DublinCore(crude[[1]])
meta(crude[[1]])
meta(crude)
meta(crude, type = "corpus")
meta(crude, "labels") <- 21:40
meta(crude)
I tried many of these calls (with my variable corpus instead of crude), but they do not seem to work.
Someone else seems to have had the same problem with a similar data set (forum post from 2009, no response).
Here's a bit of benchmarking...
With the for loop:

expr.for <- function() {
    for (i in seq(from = 1, to = length(corpus), by = 1)) {
        DublinCore(corpus[[i]], "title") = LETTERS[round(runif(26))]
        DublinCore(corpus[[i]], "Publisher") = LETTERS[round(runif(26))]
    }
}
microbenchmark(expr.for())
# Unit: milliseconds
#         expr      min       lq   median       uq      max
# 1 expr.for() 21.50504 22.40111 23.56246 23.90446 70.12398
With tm_map:

corpus <- crude
expr.map <- function() {
    tm_map(corpus, function(x) {
        meta(x, "title") = LETTERS[round(runif(26))]
        meta(x, "Publisher") = LETTERS[round(runif(26))]
        x
    })
}
microbenchmark(expr.map())
# Unit: milliseconds
#         expr      min       lq   median       uq      max
# 1 expr.map() 5.575842 5.700616 5.796284 5.886589 8.753482
So the tm_map version, as you noticed, is about 4 times faster.
In your question you say that the changes in the tm_map version are not persistent; that is because you don't return x at the end of your anonymous function. In the end it should be:

    meta(x, tag = "Heading") = csv[[i, 10]]
    meta(x, tag = "publisher") = csv[[i, 16]]
    x

Graphite / Carbon / Ceres node overlap

I'm working with Graphite monitoring using Carbon and Ceres as the storage method. I have some problems with correcting bad data. It seems that (due to various problems) I've ended up with overlapping files. That is, since Carbon / Ceres stores the data as timestamp#interval.slice, I can have two or more files with overlapping time ranges.
There are two kinds of overlaps:
File A: +------------+                orig file
File B:     +-----+                   subset
File C:          +---------+          overlap
This is causing problems because the existing tools available (ceres-maintenance defrag and rollup) don't cope with these overlaps. Instead, they skip the directory and move on. This is a problem, obviously.
I've created a script that fixes this problem, as follows:
For subsets, just delete the subset file.
For overlaps, use the filesystem 'truncate' on the original file at the point where the next file starts. While it would be possible to cut off the start of the overlapping file and rename it properly, I'd suggest that this is fraught with danger.
I've found that it's possible to do this in two ways:
Walk the dirs and iterate over the files, fixing subsets and overlaps as you go; or
Walk the dirs and fix all the problems in a dir before moving on. The latter is BY FAR the faster approach, since the dir walk is hugely time consuming.
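Before the full script, here is a sketch of the range arithmetic the fixes rely on (mine, not from the original post; it mirrors what SliceFile.setVars() does below, assuming 8-byte datapoints):

import os

def slice_range(path):
    # Ceres slice filenames look like <startTimestamp>#<intervalSeconds>.slice
    name = os.path.basename(path)
    start_str, rest = name.split('#')
    start, freq = int(start_str), int(rest.split('.')[0])
    points = os.path.getsize(path) // 8  # each datapoint is 8 bytes
    return start, start + points * freq  # covered range is [start, end)

Given two slices A and B sorted by start time, B is a subset if B's end <= A's end, and an overlap if B's start < A's end.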
Code:
#!/usr/bin/env python2.6
################################################################################
import io
import os
import time
import sys
import string
import logging
import unittest
import datetime
import random
import zmq
import json
import socket
import traceback
import signal
import select
import simplejson
import cPickle as pickle
import re
import shutil
import collections
from pymongo import Connection
from optparse import OptionParser
from pprint import pprint, pformat
################################################################################
class SliceFile(object):
    def __init__(self, fname):
        self.name = fname
        basename = fname.split('/')[-1]
        fnArray = basename.split('#')
        self.timeStart = int(fnArray[0])
        self.freq = int(fnArray[1].split('.')[0])
        self.size = None
        self.numPoints = None
        self.timeEnd = None
        self.deleted = False

    def __repr__(self):
        out = "Name: %s, tstart=%s tEnd=%s, freq=%s, size=%s, npoints=%s." % (
            self.name, self.timeStart, self.timeEnd, self.freq, self.size, self.numPoints)
        return out

    def setVars(self):
        self.size = os.path.getsize(self.name)
        self.numPoints = int(self.size / 8)
        self.timeEnd = self.timeStart + (self.numPoints * self.freq)
################################################################################
class CeresOverlapFixup(object):
    def __del__(self):
        import datetime
        self.writeLog("Ending at %s" % (str(datetime.datetime.today())))
        self.LOGFILE.flush()
        self.LOGFILE.close()

    def __init__(self):
        self.verbose = False
        self.debug = False
        self.LOGFILE = open("ceresOverlapFixup.log", "a")
        self.badFilesList = set()
        self.truncated = 0
        self.subsets = 0
        self.dirsExamined = 0
        self.lastStatusTime = 0
    def getOptionParser(self):
        return OptionParser()

    def getOptions(self):
        parser = self.getOptionParser()
        parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False, help="debug mode for this program, writes debug messages to logfile.")
        parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="verbose mode for this program, prints a lot to stdout.")
        parser.add_option("-b", "--basedir", action="store", type="string", dest="basedir", default=None, help="base directory location to start converting.")
        (options, args) = parser.parse_args()
        self.debug = options.debug
        self.verbose = options.verbose
        self.basedir = options.basedir
        assert self.basedir, "must provide base directory."

    # Examples:
    # ./updateOperations/1346805360#60.slice
    # ./updateOperations/1349556660#60.slice
    # ./updateOperations/1346798040#60.slice
    def getFileData(self, inFilename):
        ret = SliceFile(inFilename)
        ret.setVars()
        return ret

    def removeFile(self, inFilename):
        os.remove(inFilename)
        #self.writeLog("removing file: %s" % (inFilename))
        self.subsets += 1

    def truncateFile(self, fname, newSize):
        if self.verbose:
            self.writeLog("Truncating file, name=%s, newsize=%s" % (pformat(fname), pformat(newSize)))
        IFD = None
        try:
            IFD = os.open(fname, os.O_RDWR | os.O_CREAT)
            os.ftruncate(IFD, newSize)
            os.close(IFD)
            self.truncated += 1
        except:
            self.writeLog("Exception during truncate: %s" % (traceback.format_exc()))
        try:
            os.close(IFD)
        except:
            pass
        return

    def printStatus(self):
        now = self.getNowTime()
        if (now - self.lastStatusTime) > 10:
            self.writeLog("Status: time=%d, Walked %s dirs, subsetFilesRemoved=%s, truncated %s files." % (now, self.dirsExamined, self.subsets, self.truncated))
            self.lastStatusTime = now
    def fixupThisDir(self, inPath, inFiles):
        # self.writeLog("Fixing files in dir: %s" % (inPath))
        if not '.ceres-node' in inFiles:
            # self.writeLog("--> Not a slice directory, skipping.")
            return
        self.dirsExamined += 1

        sortedFiles = sorted(inFiles)
        sortedFiles = [x for x in sortedFiles if ((x != '.ceres-node') and (x.count('#') > 0))]
        lastFile = None
        fileObjList = []

        for thisFile in sortedFiles:
            wholeFilename = os.path.join(inPath, thisFile)
            try:
                curFile = self.getFileData(wholeFilename)
                fileObjList.append(curFile)
            except:
                self.badFilesList.add(wholeFilename)
                self.writeLog("ERROR: file %s, %s" % (wholeFilename, traceback.format_exc()))

        # name is timeStart, really.
        fileObjList = sorted(fileObjList, key=lambda thisObj: thisObj.name)

        while fileObjList:
            self.printStatus()
            changes = False
            firstFile = fileObjList[0]
            removedFiles = []
            for curFile in fileObjList[1:]:
                if curFile.timeEnd <= firstFile.timeEnd:
                    # have subset file. elim.
                    self.removeFile(curFile.name)
                    removedFiles.append(curFile.name)
                    self.subsets += 1
                    changes = True
                    if self.verbose:
                        self.writeLog("Subset file situation. First=%s, overlap=%s" % (firstFile, curFile))
            fileObjList = [x for x in fileObjList if x.name not in removedFiles]
            if len(fileObjList) < 2:
                break
            secondFile = fileObjList[1]
            # LT is right. FirstFile's timeEnd is always the first open time after first is done.
            # so, first starts#100, len=2, end=102, positions used=100,101. second start#102 == OK.
            if secondFile.timeStart < firstFile.timeEnd:
                # truncate first file.
                # file_A (last): +---------+
                # file_B (curr):      +----------+
                # solve by truncating previous file at startpoint of current file.
                newLenFile_A_seconds = int(secondFile.timeStart - firstFile.timeStart)
                newFile_A_datapoints = int(newLenFile_A_seconds / firstFile.freq)
                newFile_A_bytes = int(newFile_A_datapoints) * 8
                if not newFile_A_bytes:
                    fileObjList = fileObjList[1:]
                    continue
                assert newFile_A_bytes, "Must have size. newLenFile_A_seconds=%s, newFile_A_datapoints=%s, newFile_A_bytes=%s." % (newLenFile_A_seconds, newFile_A_datapoints, newFile_A_bytes)
                self.truncateFile(firstFile.name, newFile_A_bytes)
                if self.verbose:
                    self.writeLog("Truncate situation. First=%s, overlap=%s" % (firstFile, secondFile))
                self.truncated += 1
                fileObjList = fileObjList[1:]
                changes = True
            if not changes:
                fileObjList = fileObjList[1:]
    def getNowTime(self):
        return time.time()

    def walkDirStructure(self):
        startTime = self.getNowTime()
        self.lastStatusTime = startTime
        updateStatsDict = {}
        self.okayFiles = 0
        emptyFiles = 0
        for (thisPath, theseDirs, theseFiles) in os.walk(self.basedir):
            self.printStatus()
            self.fixupThisDir(thisPath, theseFiles)
            self.dirsExamined += 1
        endTime = time.time()
        # time.sleep(11)
        self.printStatus()
        self.writeLog("now = %s, started at %s, elapsed time = %s seconds." % (startTime, endTime, endTime - startTime))
        self.writeLog("Done.")

    def writeLog(self, instring):
        print instring
        print >> self.LOGFILE, instring
        self.LOGFILE.flush()

    def main(self):
        self.getOptions()
        self.walkDirStructure()

# Entry point (the pasted script ended without one; this is the assumed invocation):
if __name__ == '__main__':
    CeresOverlapFixup().main()

What's the best way to extract Excel cell comments using Perl or Ruby?

I've been parsing Excel documents in Perl successfully with Spreadsheet::ParseExcel (as recommended in What's the best way to parse Excel file in Perl?), but I can't figure out how to extract cell comments.
Any ideas? A solution in Perl or Ruby would be ideal.
The Python xlrd library will parse cell comments (if you turn on xlrd.sheet.OBJ_MSO_DEBUG, you'll see them), but it doesn't expose them from the API. You could either parse the dump or hack on it a bit so you can get to them programmatically. Here's a start (tested extremely minimally):
diff --git a/xlrd/sheet.py b/xlrd/sheet.py
--- a/xlrd/sheet.py
+++ b/xlrd/sheet.py
@@ -206,6 +206,7 @@
         self._dimncols = 0
         self._cell_values = []
         self._cell_types = []
+        self._cell_notes = []
         self._cell_xf_indexes = []
         self._need_fix_ragged_rows = 0
         self.defcolwidth = None
@@ -252,6 +253,7 @@
         return Cell(
             self._cell_types[rowx][colx],
             self._cell_values[rowx][colx],
+            self._cell_notes[rowx][colx],
             xfx,
             )
@@ -422,12 +424,14 @@
         if self.formatting_info:
             self._cell_xf_indexes[nrx].extend(aa('h', [-1]) * nextra)
         self._cell_values[nrx].extend([''] * nextra)
+        self._cell_notes[nrx].extend([None] * nextra)
         if nc > self.ncols:
             self.ncols = nc
             self._need_fix_ragged_rows = 1
         if nr > self.nrows:
             scta = self._cell_types.append
             scva = self._cell_values.append
+            scna = self._cell_notes.append
             scxa = self._cell_xf_indexes.append
             fmt_info = self.formatting_info
             xce = XL_CELL_EMPTY
@@ -436,6 +440,7 @@
             for _unused in xrange(self.nrows, nr):
                 scta([xce] * nc)
                 scva([''] * nc)
+                scna([None] * nc)
                 if fmt_info:
                     scxa([-1] * nc)
         else:
@@ -443,6 +448,7 @@
             for _unused in xrange(self.nrows, nr):
                 scta(aa('B', [xce]) * nc)
                 scva([''] * nc)
+                scna([None] * nc)
                 if fmt_info:
                     scxa(aa('h', [-1]) * nc)
         self.nrows = nr
@@ -454,6 +460,7 @@
         aa = array_array
         s_cell_types = self._cell_types
         s_cell_values = self._cell_values
+        s_cell_notes = self._cell_notes
         s_cell_xf_indexes = self._cell_xf_indexes
         s_dont_use_array = self.dont_use_array
         s_fmt_info = self.formatting_info
@@ -465,6 +472,7 @@
             nextra = ncols - rlen
             if nextra > 0:
                 s_cell_values[rowx][rlen:] = [''] * nextra
+                s_cell_notes[rowx][rlen:] = [None] * nextra
                 if s_dont_use_array:
                     trow[rlen:] = [xce] * nextra
                     if s_fmt_info:
@@ -600,6 +608,7 @@
         bk_get_record_parts = bk.get_record_parts
         bv = self.biff_version
         fmt_info = self.formatting_info
+        txos = {}
         eof_found = 0
         while 1:
             # if DEBUG: print "SHEET.READ: about to read from position %d" % bk._position
@@ -877,13 +886,23 @@
                 break
             elif rc == XL_OBJ:
                 # handle SHEET-level objects; note there's a separate Book.handle_obj
-                self.handle_obj(data)
+                obj = self.handle_obj(data)
+                if obj:
+                    obj_id = obj.id
+                else:
+                    obj_id = None
             elif rc == XL_MSO_DRAWING:
                 self.handle_msodrawingetc(rc, data_len, data)
             elif rc == XL_TXO:
-                self.handle_txo(data)
+                txo = self.handle_txo(data)
+                if txo and obj_id:
+                    txos[obj_id] = txo
+                    obj_id = None
            elif rc == XL_NOTE:
-                self.handle_note(data)
+                note = self.handle_note(data)
+                txo = txos.get(note.object_id)
+                if txo:
+                    self._cell_notes[note.rowx][note.colx] = txo.text
            elif rc == XL_FEAT11:
                self.handle_feat11(data)
            elif rc in bofcodes: ##### EMBEDDED BOF #####
@@ -1387,19 +1406,16 @@
     def handle_obj(self, data):
-        if not OBJ_MSO_DEBUG:
-            return
-        DEBUG = 1
         if self.biff_version < 80:
             return
         o = MSObj()
         data_len = len(data)
         pos = 0
-        if DEBUG:
+        if OBJ_MSO_DEBUG:
             fprintf(self.logfile, "... OBJ record ...\n")
         while pos < data_len:
             ft, cb = unpack('<HH', data[pos:pos+4])
-            if DEBUG:
+            if OBJ_MSO_DEBUG:
                 hex_char_dump(data, pos, cb, base=0, fout=self.logfile)
             if ft == 0x15: # ftCmo ... s/b first
                 assert pos == 0
@@ -1430,16 +1446,14 @@
         else:
             # didn't break out of while loop
             assert pos == data_len
-        if DEBUG:
+        if OBJ_MSO_DEBUG:
             o.dump(self.logfile, header="=== MSOBj ===", footer= " ")
+        return o

     def handle_note(self, data):
-        if not OBJ_MSO_DEBUG:
-            return
-        DEBUG = 1
         if self.biff_version < 80:
             return
-        if DEBUG:
+        if OBJ_MSO_DEBUG:
             fprintf(self.logfile, '... NOTE record ...\n')
             hex_char_dump(data, 0, len(data), base=0, fout=self.logfile)
         o = MSNote()
@@ -1453,13 +1467,11 @@
         o.original_author, endpos = unpack_unicode_update_pos(data, 8, lenlen=2)
         assert endpos == data_len - 1
         o.last_byte = data[-1]
-        if DEBUG:
+        if OBJ_MSO_DEBUG:
             o.dump(self.logfile, header="=== MSNote ===", footer= " ")
+        return o

     def handle_txo(self, data):
-        if not OBJ_MSO_DEBUG:
-            return
-        DEBUG = 1
         if self.biff_version < 80:
             return
         o = MSTxo()
@@ -1477,8 +1489,9 @@
             rc3, data3_len, data3 = self.book.get_record_parts()
             assert rc3 == XL_CONTINUE
         # ignore the formatting runs for the moment
-        if DEBUG:
+        if OBJ_MSO_DEBUG:
             o.dump(self.logfile, header="=== MSTxo ===", footer= " ")
+        return o

     def handle_feat11(self, data):
         if not OBJ_MSO_DEBUG:
@@ -1638,11 +1651,12 @@
 class Cell(BaseObject):

-    __slots__ = ['ctype', 'value', 'xf_index']
+    __slots__ = ['ctype', 'value', 'note', 'xf_index']

-    def __init__(self, ctype, value, xf_index=None):
+    def __init__(self, ctype, value, note=None, xf_index=None):
         self.ctype = ctype
         self.value = value
+        self.note = note
         self.xf_index = xf_index

     def __repr__(self):
Then you could write something like:
import xlrd

xlrd.sheet.OBJ_MSO_DEBUG = True
xls = xlrd.open_workbook('foo.xls')
for sheet in xls.sheets():
    print 'sheet %s (%d x %d)' % (sheet.name, sheet.nrows, sheet.ncols)
    for rownum in xrange(sheet.nrows):
        for cell in sheet.row(rownum):
            print cell, cell.note
One option is to use Ruby's win32ole library.
The following (somewhat verbose) example connects to an open Excel worksheet and gets the comment text from cell B2.
require 'win32ole'
xl = WIN32OLE.connect('Excel.Application')
ws = xl.ActiveSheet
cell = ws.Range('B2')
comment = cell.Comment
text = comment.Text
More info and examples of using Ruby's win32ole library to automate Excel can be found here:
http://rubyonwindows.blogspot.com/search/label/excel
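Not from the original answers, but the same COM automation works from Python via pywin32, if that is closer to your stack. A minimal sketch (assumes Windows, pywin32 installed, and the workbook open in Excel):

import win32com.client

xl = win32com.client.GetActiveObject('Excel.Application')  # attach to the running Excel instance
ws = xl.ActiveSheet
comment = ws.Range('B2').Comment  # None when the cell has no comment
if comment is not None:
    print(comment.Text())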
