Zlib::BufError when using progressbar/ruby-progressbar gem - ruby

I use the following Ruby snippet to download a 8.9MB file.
require 'open-uri'
require 'net/http'
require 'uri'
def http_download_no_progress_bar(uri, filename)
uri.open(read_timeout: 500) do |file|
open filename, 'w' do |io|
file.each_line do |line|
io.write line
end
end
end
end
I want to add the progressbar gem to visualize the download process:
require 'open-uri'
require 'progressbar'
require 'net/http'
require 'uri'
def http_download_with_progressbar(uri, filename)
progressbar = nil
uri.open(
read_timeout: 500,
content_length_proc: lambda { |total|
if total && 0 < total.to_i
progressbar = ProgressBar.new("...", total)
progressbar.file_transfer_mode
end
},
progress_proc: lambda { |step|
progressbar.set step if progressbar
}
) do |file|
open filename, 'w' do |io|
file.each_line do |line|
io.write line
end
end
end
end
However, it now fails with the following error:
/home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/net/http/response.rb:357:in `finish':
buffer error (Zlib::BufError)oooooo | 8.0MB 8.6MB/s ETA: 0:00:00
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/net/http/response.rb:357:in `finish'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/net/http/response.rb:262:in `ensure in inflater'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/net/http/response.rb:262:in `inflater'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/net/http/response.rb:274:in `read_body_0'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/net/http/response.rb:201:in `read_body'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/open-uri.rb:328:in `block (2 levels) in open_http'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/net/http.rb:1415:in `block (2 levels) in transport_request'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/net/http/response.rb:162:in `reading_body'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/net/http.rb:1414:in `block in transport_request'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/net/http.rb:1405:in `catch'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/net/http.rb:1405:in `transport_request'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/net/http.rb:1378:in `request'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/open-uri.rb:319:in `block in open_http'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/net/http.rb:853:in `start'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/open-uri.rb:313:in `open_http'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/open-uri.rb:724:in `buffer_open'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/open-uri.rb:210:in `block in open_loop'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/open-uri.rb:208:in `catch'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/open-uri.rb:208:in `open_loop'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/open-uri.rb:149:in `open_uri'
from /home/user/.rvm/rubies/ruby-2.1.1/lib/ruby/2.1.0/open-uri.rb:704:in `open'
Meanwhile I also tried the ruby-progressbar gem:
require 'open-uri'
require 'ruby-progressbar'
require 'net/http'
require 'uri'
def http_download_with_ruby_progressbar(uri, filename)
progressbar = nil
uri.open(
read_timeout: 500,
content_length_proc: lambda { |total|
if total && 0 < total.to_i
progressbar = ProgressBar.create(title: filename, total: total)
end
},
progress_proc: lambda { |step|
progressbar.progress = step if progressbar
}
) do |file|
open filename, 'w' do |io|
file.each_line do |line|
io.write line
end
end
end
end
It fails with the same error. Here is the associated issue for the problem.

The problem is the file you are trying to download as every method works with this file: https://androidnetworktester.googlecode.com/files/1mb.txt.
The problem is that your file is larger than it says it is. The content_length_proc says that it is 8549968 bytes (8.15MB) whereas it is 101187668 bytes (96.5MB) (check with ls after downloading the file). Now I have an alternative that does not crash and gives you a progressbar:
def http_download_with_words(uri, filename)
bytes_total = nil
uri.open(
read_timeout: 500,
:content_length_proc => lambda{|content_length|
bytes_total = content_length},
:progress_proc => lambda{|bytes_transferred|
if bytes_total
# Print progress
print("\r#{bytes_transferred}/#{bytes_total}")
else
# We don’t know how much we get, so just print number
# of transferred bytes
print("\r#{bytes_transferred} (total size unknown)")
end
}
) do |file|
open filename, 'w' do |io|
file.each_line do |line|
io.write line
end
end
end
end
http_download_with_words(URI( 'http://data.wien.gv.at/daten/geo?service=WFS&request=GetFeature&version=1.1.0&typeName=ogdwien%3aBAUMOGD&srsName=EPSG:4326' ), 'temp.txt')
which is pretty self-explanatory, (seen here.)
Now the part I haven't been able to figure out is how exactly the progressbar gem is interfering with the ZLib. Most things seem to work fine inside the procs (e.g. having them print random stuff) so I assume both of these progressbars do something odd on completion that somehow messes with the transfer. I'd be very interested if anyone can figure out why that is?

In my testing when this occurred it was due to the raise in #set. As for why it results in an error in Zlib, that's not clear. Perhaps some strange exception handling in there. In my case I did "progbar.set(count) rescue nil" to get rid of the issue.

Related

Reading files in a zip archive, without unzipping the archive

I have a directory with 100+ zip files and I need to read the files inside the zip files to do some data processing, without unzipping the archive.
Is there a Ruby library to read the contents of files in zip archives, without unzipping the file?
Using rubyzip gives an error:
require 'zip'
Zip::File.open('my_zip.zip') do |zip_file|
# Handle entries one by one
zip_file.each do |entry|
# Extract to file/directory/symlink
puts "Extracting #{entry.name}"
entry.extract('here')
# Read into memory
content = entry.get_input_stream.read
end
end
Gives this error:
test.rb:12:in `block (2 levels) in <main>': undefined method `read' for Zip::NullInputStream:Module (NoMethodError)
from .gem/ruby/gems/rubyzip-1.1.6/lib/zip/entry_set.rb:42:in `call'
from .gem/ruby/gems/rubyzip-1.1.6/lib/zip/entry_set.rb:42:in `block in each'
from .gem/ruby/gems/rubyzip-1.1.6/lib/zip/entry_set.rb:41:in `each'
from .gem/ruby/gems/rubyzip-1.1.6/lib/zip/entry_set.rb:41:in `each'
from .gem/ruby/gems/rubyzip-1.1.6/lib/zip/central_directory.rb:182:in `each'
from test.rb:6:in `block in <main>'
from .gem/ruby/gems/rubyzip-1.1.6/lib/zip/file.rb:99:in `open'
from test.rb:4:in `<main>'
The Zip::NullInputStream is returned if the entry is a directory and not a file, could that be the case?
Here's a more robust variation of the code:
#!/usr/bin/env ruby
require 'rubygems'
require 'zip'
Zip::File.open('my_zip.zip') do |zip_file|
# Handle entries one by one
zip_file.each do |entry|
if entry.directory?
puts "#{entry.name} is a folder!"
elsif entry.symlink?
puts "#{entry.name} is a symlink!"
elsif entry.file?
puts "#{entry.name} is a regular file!"
# Read into memory
entry.get_input_stream { |io| content = io.read }
# Output
puts content
else
puts "#{entry.name} is something unknown, oops!"
end
end
end
I came across the same issue and checking for if entry.file?, before entry.get_input_stream.read, resolved the issue.
require 'zip'
Zip::File.open('my_zip.zip') do |zip_file|
# Handle entries one by one
zip_file.each do |entry|
# Extract to file/directory/symlink
puts "Extracting #{entry.name}"
entry.extract('here')
# Read into memory
if entry.file?
content = entry.get_input_stream.read
end
end
end

Undefined method each in Ruby Regexp task

I have series of zip files under #workingdir, and am trying to unzip the files that match #Regexp, and print the lines from them.
require 'zip/zip'
#workingdir = '/my/dir/structure/*.zip'
#Regexp = '/yup:maybe.*nope/i'
Dir.glob(#workingdir) do |zips|
Zip::ZipFile.open(zips) do |file|
file.each do |search|
tempFile = file.read(search)
tempFile.each do |line|
if (line =~ #Regexp ) then
p line
end
end
end
end
end
Below is the error message from IRB:
NoMethodError: undefined method `each' for #<String:0x0000000168bf40>
from (irb):70:in `block (3 levels) in irb_binding'
from /var/lib/gems/1.9.1/gems/rubyzip2-2.0.2/lib/zip/zip.rb:1122:in `each'
from /var/lib/gems/1.9.1/gems/rubyzip2-2.0.2/lib/zip/zip.rb:1122:in `each'
from /var/lib/gems/1.9.1/gems/rubyzip2-2.0.2/lib/zip/zip.rb:1265:in `each'
from (irb):68:in `block (2 levels) in irb_binding'
from /var/lib/gems/1.9.1/gems/rubyzip2-2.0.2/lib/zip/zip.rb:1381:in `open'
from (irb):67:in `block in irb_binding'
from (irb):66:in `glob'
from (irb):66
from /usr/bin/irb:12:in `<main>'
I tried tempFile.grep, and received the same error, except that grep was an undefined method. I believe I need to define a class.
Turns out my code had two problems. 1) My regular expression was being processed as a string (I should not have used the quotes). 2) Seeing as it runs fine otherwise on Ruby 1.8.7, I suspect the is a difference in how 1.8.7 and 1.9.1 process the 'each' method. If anyone has additional insights, I'm more than happy to hear them. The code below works fine on 1.8.7:
require 'zip/zip'
#workingdir = '/my/dir/structure/*.zip'
#Regexp = /regexp/i
Dir.glob(#workingdir) do |zips|
Zip::ZipFile.open(zips) do |file|
file.each do |search|
tempFile = file.read(search)
tempFile.each do |line|
if (line =~ #Regexp) then
puts zips + ': ' + line.chomp
end
end
end
end
end
Thanks again everyone!

`block in non_options': file not found: (ArgumentError)

I'm trying to open browser url based on argument passed to script. Hence I wrote following ruby code:
require 'selenium-webdriver'
require 'test/unit'
class TestTitle < Test::Unit::TestCase
def setup
$driver = Selenium::WebDriver.for :firefox
if ARGV[0] == 'google'
$driver.get 'http://www.google.com'
elsif ARGV[0] == 'twitter'
$driver.get 'http://www.twitter.com'
end
end
def test_title
puts $driver.title
end
def teardown
$driver.quit
end
end
When I passed argument: ruby test.rb 'google', it results into following error:
c:/Ruby193/lib/ruby/1.9.1/test/unit.rb:167:in `block in non_options': file not found: google (ArgumentError)
from c:/Ruby193/lib/ruby/1.9.1/test/unit.rb:146:in `map!'
from c:/Ruby193/lib/ruby/1.9.1/test/unit.rb:146:in `non_options'
from c:/Ruby193/lib/ruby/1.9.1/test/unit.rb:207:in `non_options'
from c:/Ruby193/lib/ruby/1.9.1/test/unit.rb:52:in `process_args'
from c:/Ruby193/lib/ruby/1.9.1/minitest/unit.rb:891:in `_run'
from c:/Ruby193/lib/ruby/1.9.1/minitest/unit.rb:884:in `run'
from c:/Ruby193/lib/ruby/1.9.1/test/unit.rb:21:in `run'
from c:/Ruby193/lib/ruby/1.9.1/test/unit.rb:326:in `block (2 levels) in autorun'
from c:/Ruby193/lib/ruby/1.9.1/test/unit.rb:27:in `run_once'
from c:/Ruby193/lib/ruby/1.9.1/test/unit.rb:325:in `block in autorun'
Please help me understand what I'm doing wrong.
It appears that test-unit (as of 1.9.1) grabs command line options in its GlobOptions module. You are using ARGV[0] to pass browser name, but it thinks you're passing a file name. A workaround is to capture the value of ARGV[0] and then clear it before your test case runs:
browser = ARGV[0]
ARGV[0] = nil

How to scrape data from list of URLs and save data to CSV with nokogiri

I have a file called bontyurls.csv that looks like this:
http://bontrager.com/model/11383
http://bontrager.com/model/01740
http://bontrager.com/model/09595
I want my script to read that file and then spit out a file like this: bonty_test_urls_results.csv
url,model_names
http://bontrager.com/model/11383,"Road TLR Conversion Kit"
http://bontrager.com/model/01740,"404 File Not Found"
http://bontrager.com/model/09595,"RXL Road"
Here's what I've got so far:
# based on code from here: http://www.andrewsturges.com/2011/09/how-to-harvest-web-data-using-ruby-and.html
require 'nokogiri'
require 'open-uri'
require 'csv'
#urls = Array.new
#model_names = Array.new
urls = CSV.read("bontyurls.csv")
(0..urls.length - 1).each do |index|
puts urls[index][0]
doc = Nokogiri::HTML(open(urls[index][0]))
doc.xpath('//h1').each do |model_name|
#model_name << model_name.content
end
end
# write results to file
CSV.open("bonty_test_urls_results.csv", "wb") do |row|
row << ["url", "model_names"]
(0..#urls.length - 1).each do |index|
row << [
#urls[index],
#model_names[index]]
end
end
That code isn't working. I'm getting this error:
$ ruby bonty_test_urls.rb
http://bontrager.com/model/00310
bonty_test_urls.rb:15:in `block (2 levels) in <main>': undefined method `<<' for nil:NilClass (NoMethodError)
from /home/simon/.rvm/gems/ruby-1.9.3-p194/gems/nokogiri-1.5.5/lib/nokogiri/xml/node_set.rb:239:in `block in each'
from /home/simon/.rvm/gems/ruby-1.9.3-p194/gems/nokogiri-1.5.5/lib/nokogiri/xml/node_set.rb:238:in `upto'
from /home/simon/.rvm/gems/ruby-1.9.3-p194/gems/nokogiri-1.5.5/lib/nokogiri/xml/node_set.rb:238:in `each'
from bonty_test_urls.rb:14:in `block in <main>'
from bonty_test_urls.rb:11:in `each'
from bonty_test_urls.rb:11:in `<main>'
Here is some code that returns the model_name at least. I'm just having trouble getting it to work in the larger script:
require 'open-uri'
require 'nokogiri'
doc = Nokogiri::HTML(open("http://bontrager.com/model/09124"))
doc.xpath('//h1').each do |node|
puts node.text
end
Also, I haven't figured out how to handle the URLs that return a 404.
This is how I'd do it:
require 'csv'
require 'nokogiri'
require 'open-uri'
CSV_OPTIONS = {
:write_headers => true,
:headers => %w[url model_names]
}
CSV.open('bonty_test_urls_results.csv', 'wb', CSV_OPTIONS) do |csv|
csv_doc = File.foreach('bontyurls.csv') do |url|
url.chomp!
begin
doc = Nokogiri.HTML(open(url))
h1 = doc.at('h1').text.strip
h1 = doc.at('title').text.strip.sub(/^Bontrager: /i, '') if (h1.empty?)
csv << [url, h1]
rescue OpenURI::HTTPError => e
csv << [url, e.message]
end
end
end
Which generates a CSV file like:
url,model_names
http://bontrager.com/model/11383,Road TLR Conversion Kit (Model #11383)
http://bontrager.com/model/01740,404 File Not Found
http://bontrager.com/model/09595,RXL Road (Model #09595)
You declare #model_names, but try to push in to #model_name, which is why it's nil.

Ruby Net::HTTP time out

I'm trying to write my first Ruby program, but have a problem. The code has to download 32 MP3 files over HTTP. It actually downloads a few, then times-out.
I tried setting a timeout period, but it makes no difference. Running the code under Windows, Cygwin and Mac OS X has the same result.
This is the code:
require 'rubygems'
require 'open-uri'
require 'nokogiri'
require 'set'
require 'net/http'
require 'uri'
puts "\n Up and running!\n\n"
links_set = {}
pages = ['http://www.vimeo.com/siai/videos/sort:oldest',
'http://www.vimeo.com/siai/videos/page:2/sort:oldest',
'http://www.vimeo.com/siai/videos/page:3/sort:oldest']
pages.each do |page|
doc = Nokogiri::HTML(open(page))
doc.search('//*[#href]').each do |m|
video_id = m[:href]
if video_id.match(/^\/(\d+)$/i)
links_set[video_id[/\d+/]] = m.children[0].to_s.split(" at ")[0].split(" -- ")[0]
end
end
end
links = links_set.to_a
p links
cookie = ''
file_name = ''
open("http://www.tubeminator.com") {|f|
cookie = f.meta['set-cookie'].split(';')[0]
}
links.each do |link|
open("http://www.tubeminator.com/ajax.php?function=downloadvideo&url=http%3A%2F%2Fwww.vimeo.com%2F" + link[0],
"Cookie" => cookie) {|f|
puts f.read
}
open("http://www.tubeminator.com/ajax.php?function=convertvideo&start=0&duration=1120&size=0&format=mp3&vq=high&aq=high",
"Cookie" => cookie) {|f|
file_name = f.read
}
puts file_name
Net::HTTP.start("www.tubeminator.com") { |http|
#http.read_timeout = 3600 # 1 hour
resp = http.get("/download-video-" + file_name)
open(link[1] + ".mp3", "wb") { |file|
file.write(resp.body)
}
}
end
puts "\n Yay!!"
And this is the exception:
/Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/net/protocol.rb:140:in `rescue in rbuf_fill': Timeout::Error (Timeout::Error)
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/net/protocol.rb:134:in `rbuf_fill'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/net/protocol.rb:116:in `readuntil'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/net/protocol.rb:126:in `readline'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/net/http.rb:2138:in `read_status_line'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/net/http.rb:2127:in `read_new'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/net/http.rb:1120:in `transport_request'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/net/http.rb:1106:in `request'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/open-uri.rb:312:in `block in open_http'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/net/http.rb:564:in `start'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/open-uri.rb:306:in `open_http'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/open-uri.rb:767:in `buffer_open'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/open-uri.rb:203:in `block in open_loop'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/open-uri.rb:201:in `catch'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/open-uri.rb:201:in `open_loop'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/open-uri.rb:146:in `open_uri'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/open-uri.rb:669:in `open'
from /Users/test/.rvm/rubies/ruby-1.9.2-preview1/lib/ruby/1.9.1/open-uri.rb:33:in `open'
from test.rb:38:in `block in <main>'
from test.rb:37:in `each'
from test.rb:37:in `<main>'
I'd also appreciate your comments on the rest of the code.
For Ruby 1.8 I used this to solve my time-out issues. Extending the Net::HTTP class in my code and re-initialized with default parameters including an initialization of my own read_timeout should keep things sane I think.
require 'net/http'
# Lengthen timeout in Net::HTTP
module Net
class HTTP
alias old_initialize initialize
def initialize(*args)
old_initialize(*args)
#read_timeout = 5*60 # 5 minutes
end
end
end
Your timeout isn't in the code you set the timeout for. It's here, where you use open-uri:
open("http://www.tubeminator.com/ajax.php?function=downloadvideo&url=http%3A%2F%2Fwww.vimeo.com%2F" + link[0],
You can set a read timeout for open-uri like so:
#!/usr/bin/ruby1.9
require 'open-uri'
open('http://stackoverflow.com', 'r', :read_timeout=>0.01) do |http|
http.read
end
# => /usr/lib/ruby/1.9.0/net/protocol.rb:135:in `sysread': \
# => execution expired (Timeout::Error)
# => ...
# => from /tmp/foo.rb:5:in `<main>'
:read_timeout is new for Ruby 1.9 (it's not in Ruby 1.8). 0 or nil means "no timeout."

Resources