python, import csv file, read columns and rows, remove blank spaces, convert strings to real numbers - whitespace

I'm having trouble importing a CSV file into Python and having it separate the information. I also want to remove all the blank spaces and convert the numbers (which are strings right now) into integers. Here is what I have so far. These lines work, but they do not remove the blank spaces or convert the strings to integers.
filename = 'myfile.csv'
f = open(filename, 'r')
read = f.readlines()
print(read)
for i in range(len(read)):
    read[i] = read[i].split(',')
print(read)
header = read[0]
print(header)
info = {}
cntr = 0
for name in header:
    info[name] = [line[cntr] for line in read]
    cntr += 1
print(info)
I searched through past examples on this forum, and this is what I tried in order to remove the blank spaces, but now I'm lost:
import csv

aList = []
with open('myfile.csv', 'r') as f:
    reader = csv.reader(f, skipinitialspace=True, delimiter=',', quoting=csv.QUOTE_NONE)
    for row in reader:
        aList.append(row)
print(aList)
info = {}
cntr = 0
for i in aList:
    info[aList] = [line[cntr] for line in reader]
    cntr += 1
print(info)
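For what it's worth, the dictionary-building loop above has two separate problems: it uses the whole list aList as the dictionary key (lists are unhashable), and it iterates over reader after the with block has ended and the reader is exhausted. A minimal fix, reusing the names from the snippet and mirroring the logic of the first attempt:

info = {}
for cntr, name in enumerate(aList[0]):
    # build one column per first-row entry from the rows already collected in aList
    info[name] = [row[cntr] for row in aList]
print(info)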

#sample input
#1 23,456,789
#11 2,11 3,114
import csv

aList = []
with open('myfile.csv', 'r') as f:
    reader = csv.reader(f, skipinitialspace=True, delimiter=',', quoting=csv.QUOTE_NONE)
    for row in reader:
        aList.append(row)
print(aList)
print([list(map(int, [j.replace(" ", "") for j in i])) for i in aList])
#[[123, 456, 789], [112, 113, 114]]
Explanation: taking the last line and breaking it into parts,
#[i for i in aList] gives [["1 23","456","789"], ["11 2","11 3","114"]]
#[j.replace(" ","") for j in i] gives ["123","456","789"] for each row i
#[list(map(int, [j.replace(" ","") for j in i])) for i in aList]
#maps every string in the list to int and gives [[123, 456, 789], [112, 113, 114]]
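Putting it all together, here is a minimal end-to-end sketch of the whole task (assuming, as in the sample input, that every cell is numeric and there is no header row; with a header row you would key the columns by name instead):

import csv

with open('myfile.csv', 'r') as f:
    reader = csv.reader(f, skipinitialspace=True)
    # strip embedded spaces and convert every cell to int
    rows = [[int(cell.replace(" ", "")) for cell in row] for row in reader]

# the sample input has no header row, so key the columns by index
info = {i: [row[i] for row in rows] for i in range(len(rows[0]))}
print(info)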

Related

How to handle nested double quotes in NiFi?

We have a CSV file with a column containing nested double quotes.
For example: 1,John,26,"how are you "Jim"".
In this example we have 4 columns: id, name, age and message.
Here the message column contains nested double quotes, which causes a parsing issue in the ConvertRecord NiFi processor ("could not parse incoming data" error). Is there any way we can escape the nested double quotes and read the data properly?
We are using the same quote and escape properties in both the CSVReader and CSVRecordSetWriter controller services (configuration screenshot omitted).
We had the exact same issue, and as @daggett highlighted: how could you detect which quote ends the field? We even spoke with Cloudera, and it all boils down to the fact that the data does not conform to the CSV standard.
So we wrote a small Python script that is called from an ExecuteScript processor. It can escape almost all of the special characters, except when both a double quote and the delimiter are part of the data, e.g. "field_1","field_2 this is very invalid", data","field_3".
Give it a go and please comment if it works, so that we can fold the logic into a custom processor!
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import StreamCallback
from org.apache.nifi.processors.script import ExecuteScript
from org.python.core.util.FileUtil import wrap
from io import StringIO
import re

# Define a subclass of StreamCallback for use in session.write()
class PyStreamCallback(StreamCallback):
    def __init__(self):
        pass

    def process(self, inputStream, outputStream):
        with wrap(inputStream) as f:
            lines = f.readlines()
        outer_new_value_list = []
        is_header_row = True
        for row in lines:
            if is_header_row:
                is_header_row = False
                outer_new_value_list.append(row)
                continue
            char_list = list(row.strip())
            for position, char in enumerate(char_list):
                if (position + 1) == len(char_list):
                    continue
                if position == 0:
                    continue
                else:
                    if char == '"':
                        if char_list[position - 1] == ',' or char_list[position + 1] == ',':
                            # this double quote is the quote character at the start or end of a field
                            continue
                        if char_list[position - 1] != ',' and char_list[position + 1] != ',':
                            # this double quote is in between and is not a quote character; add an escape character to it
                            replace_char = '\\' + char
                            char_list[position] = replace_char
                    if char == ',':
                        # Int values are not in double quotes, so check whether the previous and next chars are ints
                        previous_char_type = ''
                        next_char_type = ''
                        try:
                            previous_char = char_list[position - 1]
                            if isinstance(int(previous_char), int):
                                previous_char_type = 'Int'
                        except:
                            pass
                        try:
                            next_char = char_list[position + 1]
                            if isinstance(int(next_char), int):
                                next_char_type = 'Int'
                        except:
                            pass
                        if previous_char_type == 'Int' or next_char_type == 'Int':
                            print('No need to replace this instance of comma')
                            continue
                        if char_list[position - 1] == '"' or char_list[position + 1] == '"':
                            # delimiting comma
                            continue
                        if char_list[position - 1] != '"' and char_list[position + 1] != '"':
                            # not a delimiting comma; it sits inside a field, so add an escape character to it
                            replace_char = '\\' + char
                            char_list[position] = replace_char
                    if char == '\\':
                        # strip stray backslashes from the data
                        replace_char = ''
                        char_list[position] = replace_char
            new_data_line = ''.join([str(elem) for elem in char_list])
            outer_new_value_list.append(new_data_line + '\r\n')
        with wrap(outputStream, 'w') as filehandle:
            filehandle.writelines("%s" % line for line in outer_new_value_list)
# end class

flowFile = session.get()
if flowFile != None:
    flowFile = session.write(flowFile, PyStreamCallback())
    session.transfer(flowFile, ExecuteScript.REL_SUCCESS)
# implicit return at the end
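If you want to experiment with the escaping rule outside NiFi first, the quote-handling part reduces to a few lines of plain Python. This is only a sketch of that one rule (the escape_nested_quotes name is mine, not part of the script above):

def escape_nested_quotes(row):
    # Escape any double quote that is not adjacent to a comma, i.e. one that
    # is not acting as a field-opening or field-closing quote character.
    chars = list(row)
    for i in range(1, len(chars) - 1):
        if chars[i] == '"' and chars[i - 1] != ',' and chars[i + 1] != ',':
            chars[i] = '\\"'
    return ''.join(chars)

print(escape_nested_quotes('1,John,26,"how are you "Jim""'))
# 1,John,26,"how are you \"Jim\""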

Discord.py Embed text file, get more results

I'd like my Discord bot to send a message embed whose text comes from another file. I tried it this way and it doesn't work:
@bot.command()
async def gdc(ctx):
    """Wins GDC"""
    index1 = 0
    file = open("/home/plo/rkr/res_wins2", "r")
    for line in file.readlines():
        line = line.strip()
        index1 += 1
        if index1 == 4: break
        message = line
    embed = discord.Embed()
    embed.description = message
    embed.title = title
    embed.colour = 0xF1C40F
    await ctx.send(embed=embed)
However, it seems only one result comes out... Here is a part of my txt file:
Roi mouton: 9
tomate: 8
The_Portos: 8
You're overwriting the value of line on every iteration of the for loop, so only the last line read survives; you'll have to collect the lines in a list instead:
index1 = 0
lines = []
with open("/home/plo/rkr/res_wins2", "r") as file:  # use this to open and close the file
    for line in file.readlines():
        line = line.strip()
        lines.append(line)
        index1 += 1
        if index1 == 4: break
embed = discord.Embed()
embed.description = '\n'.join(lines)
embed.title = title
embed.colour = 0xF1C40F
await ctx.send(embed=embed)
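A slightly more compact variant of the same fix, using itertools.islice to take the first four lines. This is only a sketch; like the code in the question, it assumes bot and title are defined elsewhere:

from itertools import islice

@bot.command()
async def gdc(ctx):
    """Wins GDC"""
    # read only the first four lines of the results file
    with open("/home/plo/rkr/res_wins2", "r") as file:
        lines = [line.strip() for line in islice(file, 4)]
    embed = discord.Embed()
    embed.description = "\n".join(lines)
    embed.title = title  # assumed to be defined elsewhere, as in the question
    embed.colour = 0xF1C40F
    await ctx.send(embed=embed)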

How does this loop work?

I have this code and I need an explanation for it, especially the part
set files = "$files test$k.ppm"
What does it do?
set files = ""
set k = 100
while ($k < 210)
    set files = "$files test$k.ppm"
    @ k = $k + 10
end
https://explainshell.com/
set is for setting variables.
This line sets the variable files to a string whose value is the current contents of files followed by " test{counter}.ppm":
files = "$files test$k.ppm"
At the beginning, files is an empty string...
#1st iteration (IN: files = "", k = 100)
files = "" + test100.ppm
#OUT: files = test100.ppm
#2nd iteration (IN: files = test100.ppm, k = 110)
files = test100.ppm test110.ppm
#OUT: files = test100.ppm test110.ppm
#The last iteration gives something like
test100.ppm test110.ppm {...} test200.ppm
Nothing more to it.
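For comparison, the same accumulation written as a small Python sketch:

files = ""
k = 100
while k < 210:
    files = files + " test" + str(k) + ".ppm"  # append the next filename
    k += 10
print(files.strip())  # test100.ppm test110.ppm ... test200.ppm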

Bracket finding algorithm in Lua?

I'm making a JSON parser and I am looking for an algorithm that can find all of the matching brackets ([]) and braces ({}) and put them into a table with the positions of the pair.
Examples of returned values:
table[x][firstPos][secondPos] = type
table[x] = {firstPos, secondPos, bracketType}
EDIT: Let parse() be the function that returns the bracket pairs. Let table be the value returned by the parse() function. Let codeString be the string containing the brackets that I want to detect. Let firstPos be the position of the first bracket in the Nth pair of brackets. Let secondPos be the position of the second bracket in the Nth pair of brackets. Let bracketType be the type of the bracket pair ("bracket" or "brace").
Example:
If you called:
table = parse(codeString)
table[N][firstPos][secondPos] would be equal to type.
Well, in plain Lua you could do something like this, which also takes nested brackets into account:
function bm(s)
    local res = {}
    if not s:match('%[') then
        return s
    end
    for k in s:gmatch('%b[]') do
        res[#res+1] = bm(k:sub(2,-2))
    end
    return res
end
Of course you can generalize this easily enough to braces, parentheses, whatever (do keep in mind the necessary escaping of [] in patterns, except after the %b pattern).
If you're not restricted to plain Lua, you could use LPeg for more flexibility.
If you are not looking for the contents of the brackets but for their locations, the recursive approach is harder to implement, since you would have to keep track of where you are. Easier is to just walk through the string and match brackets as you go:
function bm(s,i)
    local res = {}
    res.par = res -- root
    local lev = 0
    for loc = 1, #s do
        if s:sub(loc,loc) == '[' then
            lev = lev + 1
            local t = {par = res, start = loc, lev = lev} -- keep track of the parent
            res[#res+1] = t -- add to the parent
            res = t -- make this the current working table
            print('[', lev, loc)
        elseif s:sub(loc,loc) == ']' then
            lev = lev - 1
            if lev < 0 then error('too many ]') end -- more closing than opening
            print(']', lev, loc)
            res.stop = loc -- save bracket closing position
            res = res.par -- revert to the parent
        end
    end
    return res
end
Now that you have all matched brackets, you can loop through the table, extracting all locations.
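If Lua is not a hard requirement, the same location-tracking idea is often easiest with an explicit stack; here is a short Python sketch of that approach (the function name and error messages are mine):

def match_brackets(s, open_char="[", close_char="]"):
    # Returns a list of (first_pos, second_pos) pairs, 1-indexed like Lua.
    stack, pairs = [], []
    for pos, ch in enumerate(s, start=1):
        if ch == open_char:
            stack.append(pos)          # remember where this bracket opened
        elif ch == close_char:
            if not stack:
                raise ValueError("too many closing brackets at position %d" % pos)
            pairs.append((stack.pop(), pos))
    if stack:
        raise ValueError("unclosed bracket at position %d" % stack[-1])
    return pairs

print(match_brackets("[[[[[]]]]]"))
# [(5, 6), (4, 7), (3, 8), (2, 9), (1, 10)]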
I figured out my own algorithm.
function string:findAll(query)
    local firstSub = 1
    local lastSub = #query
    local result = {}
    while lastSub <= #self do
        if self:sub(firstSub, lastSub) == query then
            result[#result + 1] = firstSub
        end
        firstSub = firstSub + 1
        lastSub = lastSub + 1
    end
    return result
end

function string:findPair(openPos, openChar, closeChar)
    local counter = 1
    local closePos = openPos
    while closePos <= #self do
        closePos = closePos + 1
        if self:sub(closePos, closePos) == openChar then
            counter = counter + 1
        elseif self:sub(closePos, closePos) == closeChar then
            counter = counter - 1
        end
        if counter == 0 then
            return closePos
        end
    end
    return -1
end

function string:findBrackets(bracketType)
    local openBracket = ""
    local closeBracket = ""
    local result = {}
    if bracketType == "[]" then
        openBracket = "["
        closeBracket = "]"
    elseif bracketType == "{}" then
        openBracket = "{"
        closeBracket = "}"
    elseif bracketType == "()" then
        openBracket = "("
        closeBracket = ")"
    elseif bracketType == "<>" then
        openBracket = "<"
        closeBracket = ">"
    else
        error("IllegalArgumentException: Invalid or unrecognized bracket type "..bracketType.."\nFunction: findBrackets()")
    end
    local openBrackets = self:findAll(openBracket)
    if not openBrackets[1] then
        return {}
    end
    for i, j in pairs(openBrackets) do
        result[#result + 1] = {j, self:findPair(j, openBracket, closeBracket)}
    end
    return result
end
Called on a sample string of nested brackets, it will output:
5 14
6 13
7 12
8 11
9 10

How to read a large matrix from a CSV efficiently in Octave

There are many reports of slow performance of Octave's dlmread. I was hoping this had been fixed in 3.2.4, but when I tried to load a CSV file of ca. 8 * 4 million values (32 million in total), it also took a very, very long time. I searched the web but could not find a workaround for this. Does anybody know a good one?
I experienced the same problem and had R handy, so my solution was to use "read.csv" in R, and then use the R package "R.matlab" to write a ".mat" file, and then load that in Octave.
"read.csv" can be pretty slow too, but this worked very well in my case.
The reason is that Octave has a bug whereby adding data to a very large matrix takes more time than adding the same amount of data to a small matrix.
Below is my attempt. I chose to save the data every 50000 lines, so that I could already take a look at it in the meantime instead of being forced to wait until the end. It is slower for small files, but much faster for large ones.
function alldata = load_data(filename)
    fid = fopen(filename,'r');
    s = 0;
    data = [];
    alldata = [];
    save "temp.mat" alldata;
    if fid == -1
        disp("Couldn't find file mydata");
    else
        while (~feof(fid))
            line = fgetl(fid);
            [t1,t2,t3,t4,d] = sscanf(line,'%i:%i:%i:%i %f', "C"); # reading time as hh:mm:ss:ms and data as float
            s++;
            t = (t1 * 3600000 + t2 * 60000 + t3 * 1000 + t4);
            data = [data; t, d];
            if (mod(s,10000) == 0)
                disp(s);
                fflush(stdout);
            end
            if (mod(s,50000) == 0)
                load "temp.mat";
                alldata = [alldata; data];
                data = [];
                save "temp.mat" alldata;
                disp("data saved");
                fflush(stdout);
            end
        end
        disp(s);
        load "temp.mat";
        alldata = [alldata; data];
        save "temp.mat" alldata;
        disp("data saved");
        fflush(stdout);
    end
    fclose(fid);
endfunction
Here is a workaround that I am using.
I did not find that sscanf would parse input lines as indicated above, and I didn't use the temp file.
My .csv file has a large number of rows. It begins with an 18-line header, followed by a data block in which each row has 135 columns. Each row also begins with a dd/mm/yyyy hh:mm field. The following code has been tested; it also catches bad lines and reports where they are using try/catch.
My .csv file came from a customer who dumped his PARCView load into an Excel file.
function [tags,descr,alldata] = fbcsvread(filename)
    fid = fopen(filename,'r');
    s = 0;
    data = [];
    alldata = zeros(1,135);
    if fid == -1
        printf("Couldn't find file %s\n",filename);
    else
        linecount = 1;
        while (~feof(fid))
            line = fgetl(fid);
            data2 = zeros(1,135);
            if linecount == 1
                tags = strsplit(line,",");
            elseif linecount == 2
                descr = strsplit(line,",");
            elseif linecount >= 19
                data = strsplit(line,",");
                datetime = strsplit(char(data(1))," ");
                modyyr = strsplit(char(datetime(1)),"/");
                hrmin = strsplit(char(datetime(2)),":");
                year1 = sscanf(char(modyyr(3)),"%d","C");
                day1 = sscanf(char(modyyr(2)),"%d","C");
                month1 = sscanf(char(modyyr(1)),"%d","C");
                hour1 = sscanf(char(hrmin(1)),"%d","C");
                minute1 = sscanf(char(hrmin(2)),"%d","C");
                realtime = datenum(year1,month1,day1,hour1,minute1);
                data2(1) = realtime;
                for location = 2:134
                    try
                        data2(location) = sscanf(char(data(location)),"%f","C");
                    catch
                        printf("Error at %s %s\n",char(datetime(1)),char(datetime(2)));
                        fflush(stdout);
                    end_try_catch
                endfor
                alldata(linecount-18,:) = data2;
                if mod(linecount,50) == 0
                    printf(".");
                    fflush(stdout);
                endif
            endif
            linecount = linecount + 1;
        endwhile
        fclose(fid);
    endif
endfunction
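In the same spirit as the R.matlab workaround above, if you have Python handy you can do the CSV parsing there and hand Octave a .mat file. A sketch, assuming a purely numeric CSV; the file names are placeholders, and pandas/scipy are extra dependencies:

import pandas as pd
from scipy.io import savemat

# read the large CSV in one pass, then write it out in a format Octave loads quickly
df = pd.read_csv("myfile.csv", header=None)
savemat("myfile.mat", {"alldata": df.to_numpy()})
# in Octave:  load "myfile.mat"   # provides the matrix 'alldata'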
