Limit the number of threads in an iteration ruby - ruby

When I have my code like this, I get "can't create thread, resource temporarily unavailable". There are over 24k files in the directory to process.
# Starts one thread per entry of `frames`. Each thread runs the image through
# an ImageProcessing::MiniMagick pipeline with the options -fuzz 30% and
# -transparent #ff00fe, then copies the result into "transparent/".
# NOTE(review): Thread.new is called once per element with no upper bound, so
# the number of threads requested grows with the size of `frames`.
frames.each do |image|
Thread.new do
pipeline = ImageProcessing::MiniMagick.
source(File.open("original/#{image}"))
.append("-fuzz", "30%")
.append("-transparent", "#ff00fe")
result = pipeline.call
puts result.path
# The third "_"-separated part of the file name is reused in the output name.
file_parts = image.split("_")
frame_number = file_parts[2]
FileUtils.cp(result.path, "transparent/image_transparent_#{frame_number}")
puts "Done with #{image}!"
# Progress line: entries in "transparent" versus entries in "original".
puts "#{Dir.children("transparent").count.to_s} / #{Dir.children("original").count.to_s}"
puts "\n"
end
# NOTE(review): assuming `frames` is an Array, `each` returns `frames` itself,
# so the trailing `.each` below receives the image names, not the Thread
# objects created above. TODO confirm; collecting the threads needs `map`.
end.each{ |thread| thread.join }
So, I tried the first 1001 files by calling the index 0-1000, and did it this way:
# Same per-image pipeline as the first attempt, restricted to the first 1001
# entries of `frames`. `index` is yielded but not used in the body.
frames[0..1000].each_with_index do |image, index|
thread = Thread.new do
pipeline = ImageProcessing::MiniMagick.
source(File.open("original/#{image}"))
.append("-fuzz", "30%")
.append("-transparent", "#ff00fe")
result = pipeline.call
puts result.path
# The third "_"-separated part of the file name is reused in the output name.
file_parts = image.split("_")
frame_number = file_parts[2]
FileUtils.cp(result.path, "transparent/image_transparent_#{frame_number}")
puts "Done with #{image}!"
puts "#{Dir.children("transparent").count.to_s} / #{Dir.children("original").count.to_s}"
puts "\n"
end
# NOTE(review): joining inside the loop body waits for this thread to finish
# before the next iteration creates another one, so only one worker thread
# exists at any time.
thread.join
end
And while this is processing, the speed seems to be about the same as if it was on a single thread when I'm watching it in the Terminal.
But I want the code to be able to limit to whatever the OS will allow before it disallows, so that it can parse through them all faster.
Or at least:
Find the maximum threads allowed
Get original directory's count, divided by the number of threads allowed.
Run this each in batches of that division.

Related

Download files asynchronously

I was trying to make a script that downloads all images or videos from a thread in my favourite imageboard: 2ch.hk
I was successful until I wanted to download these files asynchronously (for example, to improve performance)
Here is the code http://ideone.com/k2l4Hm
file = http.get(source).body
# Downloads every file linked from one imageboard thread page, either
# sequentially or (when `multithreading` is true) with one thread per file.
require 'net/http'
multithreading = false
# A single connection object (`http`) is opened here and used for all requests below.
Net::HTTP.start("2ch.hk", :use_ssl => true) do |http|
thread = http.get("/b/res/133467978.html").body
sources = []
# Collect the href of every "desktop" anchor, rewritten to a path under /b.
thread.scan(/<a class="desktop" target="_blank" href=".+">.+<\/a>/).each do |a|
source = "/b#{/<a class="desktop" target="_blank" href="\.\.(.+)">.+<\/a>/.match(a).to_a[1]}"
sources << source
end
i = 0
start = Time.now
if multithreading
threads = []
sources.each do |source|
# The current counter is handed to the thread as `j`; it is only used by the
# commented-out file-writing code below.
threads << Thread.new(i) do |j|
# NOTE(review): every thread calls `get` on the one `http` object yielded by
# Net::HTTP.start above, i.e. all threads share a single connection.
file = http.get(source).body #breaks everything
# type = /.+\.(.+)/.match(source)[1]
# open("#{j}.#{type}","wb") { |new_file|
# new_file.write(file)
# }
end
i += 1
end
threads.each do |thr|
thr.join
end
# until downloade=sources.size
#
# end
else
# Sequential path: fetch, derive the extension from the URL, write "<i>.<ext>".
sources.each do |source|
file = http.get(source).body
type = /.+\.(.+)/.match(source)[1]
open("#{i}.#{type}","wb") { |new_file|
new_file.write(file)
}
i += 1
# Running percentage of processed sources.
print "#{(((i).to_f / sources.size) * 100).round(2)}% "
end
puts
end
puts "Done. #{i} files were downloaded. It took #{Time.now - start} seconds"
end
I suppose that this line crashes everything.
file = http.get(source).body
Or maybe that's the problem.
threads.each do |thr|
thr.join
end
Error messages are always different, from Bad File Descriptor and IO errors to "You may have encountered a bug in the Ruby interpreter or extension libraries."
If you want to try and run my code, please substitute a link to thread in 4th line with a new thread (from 2ch.hk/b), because the one in my code may be deleted by the time you run my code
Version of ruby: 2.3.1, OS Xubuntu 16.10
You'll probably have much better performance using a ruby http lib that supports parallel requests:
https://github.com/typhoeus/typhoeus
e.g.
# Queue ten requests on a single Hydra and then run the whole batch.
hydra = Typhoeus::Hydra.new
10.times do
  request = Typhoeus::Request.new("www.example.com", followlocation: true)
  hydra.queue(request)
end
hydra.run
The problem with my code is that I can't make multiple requests on a Net::HTTP instance at the same time.
The solution is to open an HTTP connection for each thread.

Add multithreads/concurency in script

I created a script which checks healthcheck and ports status from a .json file populated with microservices.
So for every microservice from the .json file, the script outputs the HTTP status, the healthcheck body and a few other small details. I want to add multithreading here so that all the output is returned at once. Please see the script below:
#!/usr/bin/env ruby
... get the environment argument part...
# Reads the service definitions from ./services.json and starts one thread per
# microservice; each thread prints the port, the nodes found by `knife search`
# and the health-check response of every node.
# NOTE(review): `env` is not defined in this snippet; presumably it is set by
# the elided argument handling above. Confirm.
file = File.read('./services.json')
data_hash = JSON.parse(file)
threads = []
service = data_hash.keys
service.each do |microservice|
threads << Thread.new do
begin
# NOTE(review): every thread writes with puts as soon as it has a value, so
# lines belonging to different microservices can appear interleaved.
puts "Microservice: #{microservice}"
port = data_hash["#{microservice}"]['port']
puts "Port: #{port}"
# Shell out to knife; the whitespace-separated output is used as the node list.
nodes = "knife search 'chef_environment:#{env} AND recipe:#{microservice}' -i"
node = %x[ #{nodes} ].split
node.each do |n|
puts "Node: #{n}"
uri = URI("http://#{n}:#{port}/healthcheck?count=10")
# NOTE(review): get_response and get are two separate calls against the same uri.
res = Net::HTTP.get_response(uri)
status = Net::HTTP.get(uri)
puts res.code
puts status
puts res.message
end
# A read timeout is reported and ends the work of this thread's block.
rescue Net::ReadTimeout
puts "ReadTimeout Error"
next
end
end
end
# Wait for every worker thread before the script ends.
threads.each do |thread|
thread.join
end
However, this way the script first prints the puts "Microservice: #{microservice}" and puts "Port: #{port}" lines, then the nodes, and only after that the status output.
How can I return all the data for each loop together?
Instead of puts write output to a variable (hash).
If you want to wait for all threads to finish their job before showing the output, use the ThreadsWait class.
# NOTE: on Ruby 2.7 and later thwait is no longer shipped with Ruby and has to
# be installed as the `thwait` gem.
require 'thwait'

# Collects the health-check report of every microservice in its own thread and
# prints the reports only after all threads have finished, so the lines that
# belong to one microservice stay together.
# NOTE(review): `env` is not defined in this snippet; presumably it is set by
# the elided argument handling. Confirm.
file = File.read('./services.json')
data_hash = JSON.parse(file)
h = {}
threads = []
service = data_hash.keys
service.each do |microservice|
  threads << Thread.new do
    thread_id = Thread.current.object_id.to_s(36)
    # Every fragment ends with "\n"; without separators the report would be
    # one unreadable run of text.
    h[thread_id] = "Microservice: #{microservice}\n"
    begin
      port = data_hash[microservice]['port']
      h[thread_id] << "Port: #{port}\n"
      # Shell out to knife; the whitespace-separated output is the node list.
      nodes = "knife search 'chef_environment:#{env} AND recipe:#{microservice}' -i"
      node = %x[ #{nodes} ].split
      node.each do |n|
        h[thread_id] << "Node: #{n}\n"
        uri = URI("http://#{n}:#{port}/healthcheck?count=10")
        res = Net::HTTP.get_response(uri)
        # The response object already carries the body, so a second request
        # via Net::HTTP.get is not needed.
        h[thread_id] << "#{res.code}\n#{res.body}\n#{res.message}\n"
      end
    rescue Net::ReadTimeout
      h[thread_id] << "ReadTimeout Error\n"
    end
  end
end
threads.each do |thread|
  thread.join
end
# wait until all threads finish their job
ThreadsWait.all_waits(*threads)
# Print one complete report per microservice.
h.each_value { |report| puts report }
[edit]
ThreadsWait.all_waits(*threads) is redundant in the above code and can be omitted, since the line threads.each do |thread| thread.join end does exactly the same thing.
Instead of outputting the data as you get it using puts, you can collect it all in a string and then puts it once at the end. Strings can take the << operator (implemented as a method in Ruby), so you can just initialize the string, add to it, and then output it at the end, like this:
# Build the whole report in one mutable string with <<, then print it once.
report = +''
report << 'first thing' << 'second thing'
puts report
You could even save them all up together and print them all after all were finished if you want.

Ruby understanding multithreading

I'm trying to multithread a loop in Ruby following this example: http://t-a-w.blogspot.com/2010/05/very-simple-parallelization-with-ruby.html.
I copied that code and wrote this:
module Enumerable
  # Runs the given block and reports a StandardError on STDERR instead of
  # letting it propagate, so one failing item does not end its worker thread.
  # Only StandardError is rescued; non-standard exceptions (e.g. SystemExit,
  # NoMemoryError) are deliberately not swallowed.
  def ignore_exception
    yield
  rescue StandardError => e
    STDERR.puts e.message
  end

  # Yields every element of the receiver to the block, using n worker threads.
  #
  # A block may publish a result by assigning Thread.current[:out]. After all
  # workers have finished, every published result is printed with puts, in the
  # order of the input elements. Results are collected per item: a worker that
  # handles several items would otherwise keep only the value of its last one,
  # and the earlier outputs would be lost.
  #
  # @param n [Integer] number of worker threads, must be positive
  # @return [Array<Thread>] the worker threads, all already joined
  # @raise [ArgumentError] if n is not positive (no worker would ever run)
  def in_parallel(n)
    raise ArgumentError, "number of threads must be positive (got #{n.inspect})" unless n > 0

    t_queue = Queue.new
    results = []
    results_lock = Mutex.new

    threads = (1..n).map do
      Thread.new do
        # Jobs are wrapped in an array so that nil/false elements stay truthy;
        # a bare nil is the stop signal.
        while (job = t_queue.deq)
          item, index = job
          # Clear the slot so a value left by the previous item is not
          # attributed to this one.
          Thread.current[:out] = nil
          ignore_exception { yield(item) }
          out = Thread.current[:out]
          results_lock.synchronize { results[index] = out } unless out.nil?
        end
      end
    end

    each_with_index { |x, index| t_queue << [x, index] }
    # One stop signal per worker.
    n.times { t_queue << nil }
    threads.each(&:join)

    results.each { |out| puts out unless out.nil? }
    threads
  end
end
# Fetch the data of every id using 10 worker threads. The fetched value is
# stored in the thread-local :out slot, which in_parallel prints after joining.
# NOTE(review): `ids`, `loc` and `open_conn` are defined outside this snippet.
ids.in_parallel(10){ |id|
conn = open_conn(loc)
out = conn.getData(id)
Thread.current[:out] = out
}
The way I understand it, it will dequeue 10 items at a time, process the block in the loop per id, join the 10 threads at the end, and repeat until finished. After running this code I sometimes get different results, especially if the size of my ids is less than 10, and I am confused about why this is occurring. Half the time it will not output anything for up to half the ids, even though I can check on the server side that output for those ids exists. For example, if the correct output is "Got id 1" and "Got id 2", it will only print out {"Got id 1"} or {"Got id 2"} or {"Got id 1", "Got id 2"}. My question is: is my understanding of this code correct?
The issue in my code was the open_conn() function call, which was not thread safe. I fixed the issue by synchronizing around getting the connection handle:
# open_conn is not thread safe, so obtaining the connection handle is
# serialised behind a mutex; fetching the data happens outside the lock.
conn_mutex = Mutex.new
ids.in_parallel(10) do |id|
  conn = conn_mutex.synchronize { open_conn(loc) }
  Thread.current[:out] = conn.getData(id)
end
Also should use http://peach.rubyforge.org/ for the loop parallelization by using:
ids.peach(10){ |id| ... }

Ruby multithreading questions

I've started looking into multi-threading in Ruby.
So basically, I want to create a few threads, and have them all execute, but not display any of the output until the thread has successfully completed.
Example:
#!/usr/bin/env ruby
# Prints a greeting, sleeps for five seconds, then prints the follow-up line.
announce = lambda do |greeting, follow_up|
  puts greeting
  sleep(5)
  puts follow_up
end

# Two worker threads run the sequence concurrently.
t1 = Thread.new { announce.call("Hello_1", "Hello_1 after 5 seconds of sleep") }
t2 = Thread.new { announce.call("Hello_2", "Hello_2 after 5 seconds of sleep") }
[t1, t2].each(&:join)

# The main thread runs the same sequence only after both workers are done.
announce.call("Hello_3", "Hello_3 after 5 seconds of sleep")
The first Hello_1 / Hello_2 execute immediately. I wouldn't want any of the output to show until the thread has successfully completed.
Because puts prints to a single output stream (stdout), you can't use it directly if you want to capture the output of each thread.
You will have to use a separate buffered stream for each thread, write to that in each thread, and then dump them to stdout when the thread terminates to see the output.
Here is an example of a thread:
# The thread writes into its own StringIO and prints the collected text with
# a single puts at the end.
t = Thread.new do
  buffer = StringIO.new
  buffer.write("mary")
  2.times { buffer.puts("fred") }
  puts buffer.string
end
You will have to pass io to every method in the thread.
Alternatively, have a look at this for creating a module that redirects stdout for a thread.
In that case, wrap the code of each thread that you start with:
Thread.start do
  # capture the STDOUT by storing a StringIO in the thread space
  Thread.current[:stdout] = StringIO.new
  # Do your stuff.. print using puts
  # NOTE(review): this puts ends up in the StringIO only when puts has been
  # redirected per thread (the module mentioned in the text); confirm that it
  # is loaded, otherwise the line goes straight to the real stdout.
  puts 'redirected to StringIO'
  # print everything before we exit
  # STDOUT is the real output stream; the constant STDIO does not exist in
  # Ruby and raised NameError here.
  STDOUT.puts Thread.current[:stdout].string
end.join
You can share a buffer but you should 'synchronize' access to it:
# Both threads append to one shared string; every append is wrapped in
# lock.synchronize so that only one thread modifies the buffer at a time.
buffer = ""
lock = Mutex.new
t1 = Thread.new {
lock.synchronize{buffer << "Hello_1\n"}
# The lock is not held while sleeping, only around each append.
sleep(5)
lock.synchronize{buffer << "Hello_1 after 5 seconds of sleep\n"}
}
t2 = Thread.new {
lock.synchronize{buffer << "Hello_2\n"}
sleep(5)
lock.synchronize{buffer << "Hello_2 after 5 seconds of sleep\n"}
}
# Nothing is printed until both threads have finished.
t1.join
t2.join
puts buffer

Ruby Net::FTP Timeout Threads

I was trying to speed up multiple FTP downloads by using threaded FTP connections. My problem is that I always have threads hang. I am looking for a clean way of either telling FTP it needs to retry the ftp transaction, or at least knowing when the FTP connection is hanging.
In the code below I am threading 5/6 separate FTP connections where each thread has a list of files it is expected to download. When the script completes, a few of the threads hang and cannot be joined. I am using the variable @last_updated to record the time of the last download attempt. Once more than 20 seconds have passed since @last_updated, the remaining threads are killed. Is there a better way?
# Downloads one photo per id over FTP, splitting `ids` into slices that are
# each handled by their own thread with their own FTP connection.
# NOTE(review): `ids`, `progress`, `total`, @remote_site, @remote_user,
# @remote_password and @photos_path are not defined in this snippet;
# presumably they are set up earlier in the file. Confirm.
threads = []
max_thread_pool = 5
running_threads = 0
Thread.abort_on_exception = true
existing_file_count = 0
files_downloaded = 0
errors = []
missing_on_the_server = []
@last_updated = Time.now
if ids.length > 0
  # Integer division yields 0 for fewer ids than max_thread_pool, and
  # each_slice(0) raises ArgumentError, so use at least one id per slice.
  slice_size = [ids.length / max_thread_pool, 1].max
  ids.each_slice(slice_size) do |id_set|
    threads << Thread.new(id_set) do |t_id_set|
      running_threads += 1
      thread_num = running_threads
      thread_num.freeze
      puts "making thread # #{thread_num}"
      begin
        ftp = Net::FTP.open(@remote_site)
        ftp.login(@remote_user, @remote_password)
        ftp.binary = true
        #ftp.debug_mode = true
        ftp.passive = false
      rescue
        raise "Could not establish FTP connection"
      end
      t_id_set.each do |id|
        # Refreshed for every id; the watchdog loop at the bottom reads it.
        @last_updated = Time.now
        rmls_path = "/_Photos/0#{id[0,2]}00000/#{id[2,1]}0000/#{id[3,1]}000/#{id}-1.jpg"
        local_path = "#{@photos_path}/01/#{id}-1.jpg"
        progress += 1
        unless File.exist?(local_path)
          begin
            ftp.getbinaryfile(rmls_path, local_path)
            puts "ftp reponse: #{ftp.last_response}"
            # find the percentage of progress just for fun
            files_downloaded += 1
            p = sprintf("%.2f", ((progress.to_f / total) * 100))
            puts "Thread # #{thread_num} > %#{p} > #{progress}/#{total} > Got file: #{local_path}"
          rescue
            errors << "#{thread_num} unable to get file > ftp response: #{ftp.last_response}"
            puts errors.last
            if ftp.last_response_code.to_i == 550
              # Add the missing file to the missing list
              missing_on_the_server << errors.last.match(/\d{5,}-\d{1,2}\.jpg/)[0]
            end
          end
        else
          puts "found file: #{local_path}"
          existing_file_count += 1
        end
      end
      puts "closing FTP connection #{thread_num}"
      ftp.close
    end # close thread
  end
end
# Keep polling every 3 seconds while @last_updated was refreshed within the
# last 20 seconds; once it is older than that, stop waiting.
while Time.now < @last_updated + 20 do
  sleep 3
end
# threads are hanging so joining the threads does not work.
threads.each { |t| t.kill }
The trick for me that worked was to use ruby's Timeout.timeout to ensure the FTP connection was not hanging.
# Bounds a single download to 10 seconds: when the block has not finished by
# then, Timeout.timeout raises Timeout::Error, which is handled below.
begin
Timeout.timeout(10) do
ftp.getbinaryfile(rmls_path, local_path)
end
# ...
# A timed-out download is recorded in `errors` and echoed.
rescue Timeout::Error
errors << "#{thread_num}> File download timed out for: #{rmls_path}"
puts errors.last
# Any other StandardError is recorded together with the last FTP response.
rescue
errors << "unable to get file > ftp reponse: #{ftp.last_response}"
# ...
end
Hanging FTP downloads were causing my threads to appear to hang. Now that the threads are no longer hanging, I can use the more proper way of dealing with threads:
threads.each { |t| t.join }
rather than the ugly:
# Keep polling every 3 seconds while @last_updated was refreshed within the
# last 20 seconds; once it is older than that, stop waiting.
while Time.now < @last_updated + 20 do
  sleep 3
end
# threads are hanging so joining the threads does not work.
threads.each { |t| t.kill }

Resources