scrapy response.xpath() cause memory leaking

scrapy response.xpath() cause memory leaking - xpath

I found that the response.xpath() method leaks memory while I was using Scrapy to write a spider. Here is the code:
def extract_data(self, response):
    # Reduced reproduction of the reported memory growth: the XPath-based
    # extraction is commented out below and the method returns a YPItem filled
    # with fixed placeholder values instead. Un-commenting the block restores
    # the real extraction from `response`.
    #
    # response: the response object whose .xpath() is queried by the
    #           commented-out code; the live code does not touch it.
    # Returns a YPItem with the six odds fields set.
    aomen_host_water = None
    aomen_pankou = None
    aomen_guest_water = None
    sb_host_water = None
    sb_pankou = None
    sb_guest_water = None
    # --- real extraction (disabled for the memory test) ---------------------
    # all_trs = response.xpath('//div[@id="webmain"]/table[@id="odds"]/tr')
    # for tr in all_trs:
    #     # cname (company name)
    #     cname = tr.xpath('td[1]/text()').extract()
    #     if len(cname) == 0:
    #         continue
    #     # remove extra space and other stuff
    #     cname = cname[0].split(' ')[0]
    #     if cname == u'澳彩':
    #         aomen_host_water = tr.xpath('td[9]/text()').extract()
    #         if len(aomen_host_water) != 0:
    #             aomen_pankou = tr.xpath('td[10]/text()').extract()
    #             aomen_guest_water = tr.xpath('td[11]/text()').extract()
    #         else:
    #             aomen_host_water = tr.xpath('td[6]/text()').extract()
    #             aomen_pankou = tr.xpath('td[7]/text()').extract()
    #             aomen_guest_water = tr.xpath('td[8]/text()').extract()
    #     elif cname == u'ＳＢ':
    #         sb_host_water = tr.xpath('td[9]/text()').extract()
    #         if len(sb_host_water) != 0:
    #             sb_pankou = tr.xpath('td[10]/text()').extract()
    #             sb_guest_water = tr.xpath('td[11]/text()').extract()
    #         else:
    #             sb_host_water = tr.xpath('td[6]/text()').extract()
    #             sb_pankou = tr.xpath('td[7]/text()').extract()
    #             sb_guest_water = tr.xpath('td[8]/text()').extract()
    # if (aomen_host_water is None) or (aomen_pankou is None) or (aomen_guest_water is None) or \
    #         (sb_host_water is None) or (sb_pankou is None) or (sb_guest_water is None):
    #     return None
    # if (len(aomen_host_water) == 0) or (len(aomen_pankou) == 0) or (len(aomen_guest_water) == 0) or \
    #         (len(sb_host_water) == 0) or (len(sb_pankou) == 0) or (len(sb_guest_water) == 0):
    #     return None
    # item = YPItem()
    # item['aomen_host_water'] = float(aomen_host_water[0])
    # item['aomen_pankou'] = aomen_pankou[0].encode('utf-8') # float(pankou.pankou2num(aomen_pankou[0]))
    # item['aomen_guest_water'] = float(aomen_guest_water[0])
    # item['sb_host_water'] = float(sb_host_water[0])
    # item['sb_pankou'] = sb_pankou[0].encode('utf-8') # float(pankou.pankou2num(sb_pankou[0]))
    # item['sb_guest_water'] = float(sb_guest_water[0])
    # --- placeholder item used while the extraction above is disabled -------
    item = YPItem()
    item['aomen_host_water'] = 1.0
    item['aomen_pankou'] = '111' # float(pankou.pankou2num(aomen_pankou[0]))
    item['aomen_guest_water'] = 1.0
    item['sb_host_water'] = 1.0
    item['sb_pankou'] = '111' # float(pankou.pankou2num(sb_pankou[0]))
    item['sb_guest_water'] = 1.0
    return item
Here I commented out the useful statements and used fake data; the spider used about 45 MB of memory. When I uncommented those lines, the spider used 100+ MB and the memory usage rose continuously. Has anybody met this kind of problem before?

You might decrease the memory usage by switching to extract_first() instead of extract() which would create unnecessary lists.
I would also upgrade scrapy and lxml to the latest versions:
pip install --upgrade scrapy
pip install --upgrade lxml

Related

Ruby Git Diff Line Information Parser

How can I parse the output of a git diff and get line information (i.e. which lines has been added/modified)?
I would like something similar to
raw = `git diff`
parsed = Git.Diff.parse(raw)
parsed.each do |file|
file.each do |line|
puts "#{file.name} - #{line.number} - #{line.type}"
end
end
Edit:
Sample output
[
{
"file": "path/to/file1",
"lines": [
{ number: "1", type: "modified"},
{ number: "4", type: "deleted"},
{ number: "9", type: "added"}
]
},
{
"file": "path/to/file2",
"lines": [
{ number: "4", type: "modified"},
{ number: "5", type: "added"}
]
}
]

What you need is to correctly group the output in file chunks and keep what is needed.
Getting the diff
You can get it by simply running a
`git diff`
What lines are needed?
lines starting with 'diff --git' from where you can get the file's name
lines starting with '+ ' that are the added ones
lines starting with '- ' that are the removed ones
How to group them?
For these things Enumerable#slice_before comes to mind.
Putting it together
I ended up with this prototype:
# Counts added and removed lines per file in the working-tree `git diff`.
# Result: output = { "path/to/file" => { added: n, removed: n }, ... }
raw_data = `git diff`.split("\n")

# Keep what is needed: the per-file header plus every added/removed line.
# A changed line is "+" or "-" followed directly by the content (no space is
# required), so match on the first character and skip the "+++ b/..." and
# "--- a/..." file-header lines. Uses core String#start_with? (the
# `starts_with?` spelling exists only with ActiveSupport loaded).
# Known limitation: a changed line whose own content begins with "-- " or
# "++ " is indistinguishable from a file header here and is not counted.
clean_data = raw_data.select do |li|
  li.start_with?('diff --git') ||
    (li.start_with?('-') && !li.start_with?('--- ')) ||
    (li.start_with?('+') && !li.start_with?('+++ '))
end

# Group them by file
# [[file_1, line1, line2, line3], [file_2, line1]]
file_data = clean_data.slice_before { |li| li.start_with?('diff --git') }

# This is the output format
output = Hash.new { |h, k| h[k] = { added: 0, removed: 0 } }

# Populate the output
file_data.each_with_object(output) do |f_data, memo|
  file, *file_info = f_data
  # "diff --git a/<path> b/<path>" -> "<path>"
  file = file.split(' b/').first.gsub('diff --git a/', '')
  file_info.each do |f_info|
    memo[file][f_info[0] == '+' ? :added : :removed] += 1
  end
end
Output example
{
"file_1" => { added: 1, removed: 12 },
"file_2" => { added: 0, removed: 1 }
}
I am sure it can get better :-)

Here is what I ended up with
# Parses the text of a `git diff` into Hunk objects, one per "diff --git"
# file section, each holding SourceLine records for its added and deleted
# lines (context lines are only used to keep the line counters in step).
class Parser
  # "diff --git a/<original path> b/<renamed path>" opens a file section.
  FILE_HEADER = %r{^diff --git a/(.*?) b/(.*?)$}
  # Range header, e.g. "@@ -19,6 +19,7 @@": captures the first old and new
  # line numbers of the range that follows.
  RANGE_HEADER = /^@@ -(\d+)(?:,\d+)? \+(\d+)(?:,\d+)? @@/

  # @param text [String] raw diff output
  # @return [Array<Hunk>] one entry per file section, in input order
  # @raise [RuntimeError] when a "diff --git" header is the last line
  def parse(text)
    encoded_text =
      if text.encoding.name != "UTF-8"
        # Re-read the bytes as binary and replace anything that has no UTF-8
        # representation. Options are keywords: Ruby 3 no longer accepts a
        # braced hash in their place.
        text.encode("UTF-8", "binary", invalid: :replace, undef: :replace)
      else
        text
      end

    hunks = []
    hunk = nil
    added_line_number = nil
    deleted_line_number = nil
    lines = encoded_text.strip.split("\n")

    lines.each_with_index do |line, index|
      if (m = FILE_HEADER.match(line))
        raise "Diff formatting error, 'diff --git' is the last line" if index + 1 >= lines.length

        # New file section: forget the previous section's counters so its
        # metadata lines ("index", "---", "+++") are not taken for content.
        added_line_number = nil
        deleted_line_number = nil
        hunk = Hunk.new(m[1], m[2])
        hunk.type = hunk_type(lines[index + 1], m[1], m[2])
        hunks.push(hunk)
      elsif line.start_with?('Binary files ')
        hunk.is_binary = true if hunk
      elsif (m = RANGE_HEADER.match(line))
        deleted_line_number = Integer(m[1])
        added_line_number = Integer(m[2])
      elsif added_line_number
        # Inside a range: classify by the one-character prefix.
        if line.start_with?('+')
          hunk.lines.push SourceLine.new(added_line_number, SourceLine::Type::Added, line[1..-1])
          added_line_number += 1
        elsif line.start_with?('-')
          hunk.lines.push SourceLine.new(deleted_line_number, SourceLine::Type::Deleted, line[1..-1])
          deleted_line_number += 1
        elsif line.start_with?('\\')
          # "\ No newline at end of file" is a marker, not a source line, so
          # it must not advance either counter.
        else
          # Unmodified line: present on both sides.
          added_line_number += 1
          deleted_line_number += 1
        end
      end
    end

    hunks
  end

  private

  # Classifies a file section from the line that follows its header.
  # @param line [String] the line right after "diff --git ..."
  # @param original [String] path on the "a/" side
  # @param renamed [String] path on the "b/" side
  # @return [String] one of the Hunk::Type constants
  def hunk_type(line, original, renamed)
    case line
    when /^new file /
      Hunk::Type::Added
    when /^deleted file /
      Hunk::Type::Deleted
    else
      original == renamed ? Hunk::Type::Modified : Hunk::Type::Renamed
    end
  end
end
# String labels for the kinds of change a diff entry can have.
module Type
  Added, Deleted, Modified, Renamed = %w[added deleted modified renamed]
end
# One file's entry in a parsed diff: its path before and after the change,
# the kind of change, a binary flag, and the collected line records.
class Hunk
  # String labels for the kind of change applied to the file.
  module Type
    Added, Deleted, Modified, Renamed = %w[added deleted modified renamed]
  end

  attr_accessor :original_path, :renamed_path, :type, :lines, :is_binary
  alias is_binary? is_binary

  # Starts out non-binary with no lines; `type` stays nil until assigned.
  def initialize(original_path, renamed_path)
    @original_path = original_path
    @renamed_path = renamed_path
    @lines = []
    @is_binary = false
  end
end
# A single changed line of a diff: its line number, whether it was added or
# deleted, and its text.
class SourceLine
  # String labels for the two kinds of changed line.
  module Type
    Added, Deleted = %w[added deleted]
  end

  attr_accessor :number, :type, :text

  def initialize(number, type, text)
    @number = number
    @type = type
    @text = text
  end
end

You can try out https://github.com/bguban/git_modified_lines gem. It returns only modified lines but probably it will be useful

How to run this ruby file that needs an argument?

I just wanted this function to run and get the output, but I can't seem to make it run.
I installed Interactive Ruby for this.
This is the code:
class Float
  # Formats the number in scientific notation, e.g. 1.0.to_sn => "1.000000E+00".
  def to_sn
    format('%E', self)
  end

  # Builds a float from a scientific-notation string such as "1.5E-7".
  # Converts directly with Kernel#Float: going through "%f" first (as this
  # method used to) rounds to six decimal places, which turns small values
  # like 1.5E-7 into 0.0.
  # Raises ArgumentError when +str+ is not a valid number.
  def self.from_sn(str)
    Float(str)
  end
end
# Converts an ASCII STL stream to binary STL.
# +source+ must be positioned just after the first ("solid ...") line;
# +out_file+ must be opened in binary mode and be seekable.
def stl_ascii_to_binary(source, out_file)
  out_file.write("\0" * 80) # 80 byte header - ignored
  out_file.write("FFFF")    # 4 byte triangle count placeholder - filled later
  tri_count = 0
  while (line = source.gets)
    next if line =~ /^\s*$/ || line.include?('endsolid') # ignore whitespace

    normal = line.sub(/facet normal/, '').split(' ').map { |num| Float.from_sn num }
    tri_count += 1
    source.gets # 'outer loop'
    vertices = Array.new(3) do
      source.gets.sub(/vertex/, '').split(' ').map { |num| Float.from_sn num }
    end
    source.gets # 'endloop'
    source.gets # 'endfacet'
    # "e" = little-endian 32-bit float, independent of the host byte order.
    ([normal] + vertices).each { |vector| out_file.write(vector.pack('e3')) }
    out_file.write("\0\0") # 2 byte attribute field, left zero
  end
  # Go back and overwrite the placeholder with the real count.
  out_file.seek(80, IO::SEEK_SET)
  out_file.write([tri_count].pack('V'))
end

# Converts a binary STL stream (opened in binary mode) to ASCII STL.
def stl_binary_to_ascii(source, out_file)
  out_file.write("solid \n")
  source.seek(80, IO::SEEK_SET) # skip the header
  tri_count = source.read(4).unpack('V')[0]
  tri_count.times do
    normal, *vertices = Array.new(4) { source.read(12).unpack('e3') }
    source.seek(2, IO::SEEK_CUR) # skip the attribute field
    out_file.write(" facet normal #{normal[0].to_sn} #{normal[1].to_sn} #{normal[2].to_sn}\n")
    out_file.write(" outer loop\n")
    vertices.each do |vertex|
      out_file.write(" vertex #{vertex[0].to_sn} #{vertex[1].to_sn} #{vertex[2].to_sn}\n")
    end
    out_file.write(" endloop\n")
    out_file.write(" endfacet\n")
  end
  out_file.write("endsolid \n")
end

# Pass in filename as only argument
if ARGV.size != 1
  puts "Usage: ./converSTL.rb [stl filename]"
  exit
end

begin
  # Binary mode throughout for binary STL data: in text mode Windows
  # translates line endings and corrupts it. The block forms close both
  # files even when the conversion raises.
  File.open(ARGV[0], "rb") do |original|
    # Read first line - check binary or ASCII
    first_line = original.gets
    raise "#{ARGV[0]} is empty" if first_line.nil?

    if first_line.include? "solid"
      out_filename = ARGV[0].sub(/\.stl/i, '-binary.stl')
      puts "#{ARGV[0]} is in ASCII format, converting to BINARY: #{out_filename}"
      File.open(out_filename, "wb") { |out_file| stl_ascii_to_binary(original, out_file) }
    else
      out_filename = ARGV[0].sub(/\.stl/i, '-ascii.stl')
      puts "#{ARGV[0]} is in BINARY format, converting to ASCII: #{out_filename}"
      File.open(out_filename, "w") { |out_file| stl_binary_to_ascii(original, out_file) }
    end
  end
rescue => error
  puts "Error: #{error}"
end
And everytime I try to run it, I get this error:

It's a script that can be run from cmd with
ruby convertST1.rb file.stl

By re-opening script, processing speed decreases

This is the code I have written.
It parses an xml and turns on lights when a certain condition is true.
The problem I have is, if I restart the script it takes longer to read out or process the xml. If I restart again, it takes even longer. So at some point it takes 10 or more seconds till one cycle is through.
def core():
    # Polls the device's XML page in an endless loop, reads the two sensor
    # values <t0> and <t1>, and prints each one scaled by 100 and as a binary
    # string left-padded with zeros to at least 8 digits.
    import urllib  # import urllib.request
    from xml.dom import minidom

    tags = ('t0', 't1')  # a third sensor, 't2', is currently disabled
    x = 0
    while True:
        ### XML extraction ###
        xml = urllib.urlopen("http://192.168.60.242/xml")  # xml = urllib.request.urlopen("http://192.168.60.242/xml")
        xml_string = xml.read()
        xml.close()
        # Cut the XML payload out of the surrounding HTML page.
        # Because of an incompatibility with Python 3.5, a change has to be made here.
        re_string = xml_string[130:4000].replace('</TEXTAREA></FORM></BODY></HTML>', '')

        # parsing
        xmldoc = minidom.parseString(re_string)
        node_lists = [xmldoc.getElementsByTagName(tag) for tag in tags]
        elements = [nodes[0] for nodes in node_lists]
        texts = [element.childNodes[0].data for element in elements]
        readings = [float(text) for text in texts]

        # data preparation
        for reading in readings:
            print (reading*100.000000000001)

        ### int to bin ###
        # Strip the "0b" prefix, then pad on the left with zeros up to 8 digits.
        padded = [bin(int(reading*100.000000000001))[2:].zfill(8) for reading in readings]

        x = x+1
        print (">>>", x ,"<<<")
        for bits in padded:
            print (bits)
#############################
# import RPi.GPIO as GPIO
# GPIO.setmode(GPIO.BCM)
# GPIO.setup(4,GPIO.OUT)
# GPIO.setup(5,GPIO.OUT)
# GPIO.setup(6,GPIO.OUT)
# GPIO.setup(12,GPIO.OUT)
# GPIO.setup(13,GPIO.OUT)
# GPIO.setup(16,GPIO.OUT)
# GPIO.setup(17,GPIO.OUT)
# GPIO.setup(18,GPIO.OUT)
# GPIO.setup(19,GPIO.OUT)
# GPIO.setup(20,GPIO.OUT)
# GPIO.setup(21,GPIO.OUT)
# GPIO.setup(22,GPIO.OUT)
# GPIO.setup(23,GPIO.OUT)
# GPIO.setup(24,GPIO.OUT)
# GPIO.setup(25,GPIO.OUT)
#############################
#Sensor0
# AgCh3A=(Sensor0_compl[0:1])
# if AgCh3A=="0":
# GPIO.output(4,True)
# else:
# GPIO.output(4,False)
# AgPro2=(Sensor0_compl[1:2])
# if AgPro2=="0":
# GPIO.output(5,True)
# else:
# GPIO.output(5,False)
# CharRo440=(Sensor0_compl[2:3])
# if CharRo440=="0":
# GPIO.output(6,True)
# else:
# GPIO.output(6,False)
# AgInnoC=(Sensor0_compl[3:4])
# if AgInnoC=="0":
# GPIO.output(12,True)
# else:
# GPIO.output(12,False)
# AgInnoB=(Sensor0_compl[4:5])
# if AgInnoB=="0":
# GPIO.output(13,True)
# else:
# GPIO.output(13,False)
# ZK700=(Sensor0_compl[5:6])
# if ZK700=="0":
# GPIO.output(16,True)
# else:
# GPIO.output(16,False)
# AgF3000=(Sensor0_compl[6:7])
# if AgF3000=="0":
# GPIO.output(17,True)
# else:
# GPIO.output(17,False)
# ZK1200=(Sensor0_compl[7:8])
# if ZK1200=="0":
# GPIO.output(18,True)
# else:
# GPIO.output(18,False)
#Sensor1
# AgProV3=(Sensor1_compl[3:4])
# if AgProV3=="0":
# GPIO.output(19,True)
# else:
# GPIO.output(19,False)
# MakWPG1=(Sensor1_compl[4:5])
# if MakWPG1=="0":
# GPIO.output(20,True)
# else:
# GPIO.output(20,False)
# AgExcell2eC=(Sensor1_compl[5:6])
# if AgExcell2eC=="0":
# GPIO.output(21,True)
# else:
# GPIO.output(21,False)
# AgCh2eC=(Sensor1_compl[6:7])
# if AgCh2eC=="0":
# GPIO.output(22,True)
# else:
# GPIO.output(22,False)
# AgCh3B=(Sensor1_compl[7:8])
# if AgCh3B=="0":
# GPIO.output(23,True)
# else:
# GPIO.output(23,False)
#################################
# MM5=(Sensor1_compl[5:6])
# if MM5=="0":
# GPIO.output(24,True)
# else:
# GPIO.output(24,False)
# MM6=(Sensor1_compl[6:7])
# if MM6=="0":
# GPIO.output(25,True)
# else:
# GPIO.output(25,False)
# MM7=(Sensor2_compl[0:1])
# if MM7=="0":
# GPIO.output(1,True)
# else:
# GPIO.output(1,False)
#Sensor2
# MMM0=int(Sensor2[2:3])
# if M0==1:
# GPIO.output(1,True)
# else:
# GPIO.output(1,False)
# MMM1=int(Sensor2[3:4])
# if M0==1:
# GPIO.output(1,True)
# else:
# GPIO.output(1,False)
# MMM2=int(Sensor2[4:5])
# if M0==1:
# GPIO.output(1,True)
# else:
# GPIO.output(1,False)
# MMM3=int(Sensor2[5:6])
# if M0==1:
# GPIO.output(1,True)
# else:
# GPIO.output(1,False)
# MMM4=int(Sensor2[6:7])
# if M0==1:
# GPIO.output(1,True)
# else:
# GPIO.output(1,False)
# MMM5=int(Sensor2[7:8])
# if M0==1:
# GPIO.output(1,True)
# else:
# GPIO.output(1,False)
# MMM6=int(Sensor2[8:9])
# if M0==1:
# GPIO.output(1,True)
# else:
# GPIO.output(1,False)
# MMM7=int(Sensor2[9:10])
# if M0==1:
# GPIO.output(1,True)
# else:
# GPIO.output(1,False)
#import time
#time.sleep(1)
#GPIO.cleanup()
def main(retry_delay=0.01):
    """Run core() forever, restarting it after a failure.

    core() only returns control by raising, so every pass through this loop
    is a restart after an error.

    retry_delay: seconds to wait before restarting core().

    Only ``Exception`` is caught: a bare ``except`` also swallows
    KeyboardInterrupt and SystemExit, which makes the script impossible to
    stop with Ctrl+C. The failure is printed instead of being dropped, and
    the delay is always applied (a ``continue`` used to skip it, so errors
    caused a busy loop).
    """
    import time
    while True:
        try:
            core()
        except Exception as error:
            print ("core() failed: %r" % (error,))
        time.sleep(retry_delay)


# Start polling only when run as a script, not when imported.
if __name__ == "__main__":
    main()
Does somebody have an idea where my issue originates from?
Thank you

I suspect that the script isn't closing when you think it is. The obvious way to check for this is to see if pythonw.exe is still running in task manager.
To be really, really sure, you could create a file in the 'continue' block of your try/except:
# Raw string: in a normal literal the "\f" in this Windows-style path is a
# form-feed escape, so the file would be created under an unintended name.
with open(r'filepath\file.txt', 'w') as myfile:
    pass
Then when you think you've killed the script, delete this file and see if it comes back. You'd have to increase your timeout to 1s or so to avoid flooding your system with file-creation requests as well.
Two things to try first, though:
Get rid of the while True at the top of core(), it seems unnecessary
Reduce the frequency of your calling loop, to, say once or twice a second

how to separate this text into a hash ruby

sorry my bad english, im new
i have this document.txt
paul gordon,jin kazama,1277,1268,21-12,21-19
yoshimistu,the rock,2020,2092,21-9,21-23,25-27
... lot more
I mean, how do I split each line on the comma separator and put it into a hash like this?
result = {
line_num: { name1: "paula wood", name2: "sarah carnley", m1: 1277, m2: 1268, sc1: 21, sc2: 12, sc3: 21, sc4: 19 }
}
i try to code like this
im using text2re for regex here
# Reads doc.txt and, for every line that matches, stores the first and the
# third captured word (concatenated) under a running counter, printing the
# whole hash after each match.
doc = File.read("doc.txt")
lines = doc.split("\n")
counts = 0
example = {}

word = '((?:[a-z][a-z]+))' # a run of two or more letters
any_char = '(.)'           # any single character
# word, char, word, char, word, char, word - matched case-insensitively
pattern = Regexp.new([word, any_char, word, any_char, word, any_char, word].join, Regexp::IGNORECASE)

lines.each do |line|
  found = pattern.match(line)
  next unless found

  counts += 1
  example[counts] = found[1] + found[3] # capture 2 is the character between them
  puts example
end
# (/[a-z].?/)
# (/[a-z].?/)
but the output does not match my expectation
1=>"", 2=>"indahdelika", 3=>"masam",
..more

Your data is comma-separated, so use the CSV class instead of trying to roll your own parser. There are dragons waiting for you if you try to split simply using commas.
I'd use:
require 'csv'

data = "paul gordon,jin kazama,1277,1268,21-12,21-19
yoshimistu,the rock,2020,2092,21-9,21-23,25-27
"

# Keys of each row hash, in column order; every "a-b" score column yields two.
labels = %i[name1 name2 m1 m2 sc1 sc2 sc3 sc4]

# Row index => row hash. Only the first six columns are used; the first four
# are taken as-is and the two score columns are split on "-".
hash = CSV.parse(data).map.with_index do |fields, index|
  *plain, first_pair, second_pair = fields.first(6)
  values = plain + first_pair.split('-').first(2) + second_pair.split('-').first(2)
  [index, labels.zip(values).to_h]
end.to_h
Which results in:
hash
# => {0=>
# {:name1=>"paul gordon",
# :name2=>"jin kazama",
# :m1=>"1277",
# :m2=>"1268",
# :sc1=>"21",
# :sc2=>"12",
# :sc3=>"21",
# :sc4=>"19"},
# 1=>
# {:name1=>"yoshimistu",
# :name2=>"the rock",
# :m1=>"2020",
# :m2=>"2092",
# :sc1=>"21",
# :sc2=>"9",
# :sc3=>"21",
# :sc4=>"23"}}
Since you're reading from a file, modify the above a bit using the "Reading from a file a line at a time" example in the documentation.
If the numerics need to be integers, tweak the hash definition to:
hash[i] = {
name1: name1,
name2: name2,
m1: m1.to_i,
m2: m2.to_i,
sc1: sc1.to_i,
sc2: sc2.to_i,
sc3: sc3.to_i,
sc4: sc4.to_i,
}
Which results in:
# => {0=>
# {:name1=>"paul gordon",
# :name2=>"jin kazama",
# :m1=>1277,
# :m2=>1268,
# :sc1=>21,
# :sc2=>12,
# :sc3=>21,
# :sc4=>19},
# 1=>
# {:name1=>"yoshimistu",
# :name2=>"the rock",
# :m1=>2020,
# :m2=>2092,
# :sc1=>21,
# :sc2=>9,
# :sc3=>21,
# :sc4=>23}}
# :sc4=>"23"}}

This is another way you could do it. I have made no assumptions about the number of items per line which are to be the values of :namex, :scx or :mx, or the order of those items.
Code
# Maps each line of +str+ to its parsed hash, keyed by zero-based line index.
def hashify(str)
  str.lines.map.with_index { |line, index| [index, inner_hash(line)] }.to_h
end
# Turns one comma-separated line into a hash, classifying each field by its
# content rather than its position:
#   contains a letter -> :name1, :name2, ...  (kept as a string)
#   contains "-"      -> two consecutive :scN keys (both sides as integers)
#   anything else     -> :m1, :m2, ...        (as an integer)
def inner_hash(s)
  n = m = sc = 0
  # chomp first: lines arrive with their line ending attached, which would
  # otherwise become part of a name in the last field.
  s.chomp.split(',').each_with_object({}) do |f, g|
    case f
    when /[a-zA-Z]/
      g[:"name#{n += 1}"] = f
    when /-/
      left, right = f.split('-').map(&:to_i)
      g[:"sc#{sc += 1}"] = left
      g[:"sc#{sc += 1}"] = right
    else
      g[:"m#{m += 1}"] = f.to_i
    end
  end
end
Example
str = "paul gordon,jin kazama,1277,1268,21-12,21-19
yoshimistu,the rock,2020,2092,21-9,21-23,25-27"
hashify(str)
#=> {0=>{:name1=>"paul gordon", :name2=>"jin kazama",
# :m1=>1277, :m2=>1268,
# :sc1=>21, :sc2=>12, :sc3=>21, :sc4=>19},
# 1=>{:name1=>"yoshimistu", :name2=>"the rock",
# :m1=>2020, :m2=>2092,
# :sc1=>21, :sc2=>9, :sc3=>21, :sc4=>23, :sc5=>25, :sc6=>27}
# }

How to write code ruby to collect data while run loop condition

I am quite new to Ruby and I need your help.
Now I want to write Ruby code that collects some data while looping.
I have two code files for this work.
My objective is to collect the sum of the scores from text that is split out of an input file.
-first, run test_dialog.rb
-Second, change input file for this format
from
AA:0.88:320:800|BB:0.82:1040:1330|CC:0.77:1330:1700 enquire-privilege_card
to
AA 0.88
BB 0.82
CC 0.77
-Then check each separated text against the dialog condition. If it appears in the dialog, store its score, until the end of the text (AA --> BB --> CC)
-Finally get average score.
I have a problem separating the text and using a loop to collect the scores at the same time.
Please help.
Best regard.
PS.
score will return if match with dialog
The score of input line 1 should be (0.88+0.82+0.77)/3 [match condition 1].
if no match, no score return.
Input data
AA:0.88:320:800|BB:0.82:1040:1330|CC:0.77:1330:1700 enquire-privilege_card
BB:0.88:320:800|EE:0.82:1040:1330|FF:0.77:1330:1700 enquire-privilege_card
EE:0.88:320:800|QQ:0.82:1040:1330|AA:0.77:1330:1700|RR:0.77:1330:1700|TT:0.77:1330:1700 enquire-privilege_card
test_dialog.rb
#!/usr/bin/env ruby
# encoding: UTF-8
#
# Input file:
# hyp(with confidence score), ref_tag
#
# Output:
# hyp, ref_tag, hyp_tag, result
#
require_relative 'dialog'
require_relative 'version'
# Runs every line of the given files through get_response_text at each
# confidence threshold, prints one result row per line to STDOUT, and prints
# per-file "correct accept" / "correct reject" percentages to STDERR.
if ARGV.empty?
  puts 'Usage: ruby test_dialog.rb FILENAME [FILENAME2...]'
  exit(1)
end

# counter[filename][:accept | :reject][:all | threshold] => count
counter = Hash.new { |h, k| h[k] = Hash.new { |h2, k2| h2[k2] = Hash.new { |h3, k3| h3[k3] = 0 } } }
thresholds = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

puts %w(hyp ref_tag hyp_tag result).join("\t")
ARGV.each do |fname|
  # File.foreach closes the file when done and, unlike Kernel#open, never
  # treats an argument starting with "|" as a command to run.
  File.foreach(fname, encoding: 'UTF-8') do |line|
    hyp, ref_tag = line.strip.split(/\t/)
    key = ref_tag == "(reject)" ? :reject : :accept
    counter[fname][key][:all] += 1
    thresholds.each do |threshold|
      hyp_all = get_response_text(hyp, threshold)
      hyp_tag = if hyp_all == :reject
                  "(reject)"
                else
                  hyp_all.split(/,/)[1]
                end
      result = ref_tag == hyp_tag
      counter[fname][key][threshold] += 1 if result
      # One output row per input line: print it for the first threshold only.
      next unless threshold == 0.0

      words = hyp.split('|').map { |t| t.split(':')[0] }.join(' ')
      puts [words, ref_tag, hyp_tag, result].join("\t")
    end
  end
end

STDERR.puts ["Filename", "Result"].concat(thresholds).join("\t")
counter.each do |fname, c|
  # delete returns nil when a file had no accept (or no reject) lines;
  # to_i turns that into 0 so the row reads "N/A" instead of raising.
  ca_all = c[:accept].delete(:all).to_i
  cr_all = c[:reject].delete(:all).to_i
  ca = thresholds.map { |t| ca_all.zero? ? "N/A" : format('%4.1f', c[:accept][t].to_f / ca_all * 100) }
  cr = thresholds.map { |t| cr_all.zero? ? "N/A" : format('%4.1f', c[:reject][t].to_f / cr_all * 100) }
  STDERR.puts [fname, "Correct Accept"].concat(ca).join("\t")
  STDERR.puts [fname, "Correct Reject"].concat(cr).join("\t")
end
dialog.rb
# -*- coding: utf-8 -*-
#
# text : AA:0.88:320:800|BB:0.82:1040:1330|CC:0.77:1330:1700|DD:0.71:1700:2010|EE:1.00:2070:2390|FF:0.56:320:800|GG:0.12:1330:1700
#
def get_response_text text, threshold, dsr_session_id=nil
# ...
#p "result text >> " + text
# Promotion => detail => rate
# Promotion IR/IDD => high priority than enquire-promotion
# Rate IR/IDD => high priority than enquire-rate
# Problem IR/IDD => high priority than enquire-service_problem
# Internet IR/IDD => high priority than enquire-internet
# Cancel Net => enquire-internet NOT cancel-service
# Lost-Stolen => +Broken
memu = ""
intent = ""
prompt = ""
intent_th = ""
intent_id = ""
# strInput = text.gsub(/\s/,'')
strInput = text.split('|').map{|t| t.split(':')[0]}.join('')
puts ("****strINPUT*****")
puts strInput
scores = text.split('|').map{|t| t.split(':')[1].to_f}
puts ("****SCORE*****")
puts scores
avg_score = scores.inject(0){|a,x| a+=x} / scores.size
puts ("****AVG-Score*****")
puts avg_score
if avg_score < threshold
return :reject
end
# List of Country
country_fname = File.dirname(__FILE__)+"/country_list.txt"
country_list = open(country_fname, "r:UTF-8").readlines.map{|line| line.chomp}
contry_reg = Regexp.union(country_list)
# List of Mobile Type
mobile_fname = File.dirname(__FILE__)+"/mobile_list.txt"
mobile_list = open(mobile_fname, "r:UTF-8").readlines.map{|line| line.chomp}
mobile_reg = Regexp.union(mobile_list)
# List of Carrier
carrier_fname = File.dirname(__FILE__)+"/carrier_list.txt"
carrier_list = open(carrier_fname, "r:UTF-8").readlines.map{|line| line.chomp}
carrier_reg = Regexp.union(carrier_list)
if (strInput =~ /AA|BB/ and strInput =~ /CC/)
intent = "enquire-payment_method"
elsif (strInput =~ /EE/) and ("#{$'}" =~ /QQ|RR/)
intent = "enquire-balance_amount"
elsif (strInput =~ /AA|EE/i) and (strInput =~ /TT/i)
intent = "enquire-balance_unit"
elsif (strInput =~ /DD|BB|/i) and (strInput =~ /FF|AA/i)
intent = "service-balance_amount"
end

Parse as follows:
str = 'AA:0.88:320:800|BB:0.82:1040:1330|CC:0.77:1330:1700 enquire-privilege_card'
# Split on ":" and "|", walk the pieces four at a time (code, score, start,
# end) and keep only the first two of each group.
str.split(/[:|]/).each_slice(4).flat_map { |group| group.first(2) }.join(' ')
# => "AA 0.88 BB 0.82 CC 0.77"

Develop Reference

ruby bash windows laravel spring algorithm oracle macos go visual-studio

scrapy response.xpath() cause memory leaking - xpath

You might decrease the memory usage by switching to extract_first() instead of extract() which would create unnecessary lists. I would also upgrade scrapy and lxml to the latest versions: pip install --upgrade scrapy pip install --upgrade lxml

Related

Ruby Git Diff Line Information Parser

How to run this ruby file that needs an argument?

By re-opening script, processing speed decreases

how to separate this text into a hash ruby

How to write code ruby to collect data while run loop condition

Categories

Resources