How can I fix this error? Net::ReadTimeout (Net::ReadTimeout) - ruby

Actually I realize that it works well for some categories, like
step %{I go to "https://newyork.craigslist.org/search/spa?s=#{emails}"}
but not for others, like
# step %{I go to "https://newyork.craigslist.org/search/fbh?s=#{emails}"}
My function was working well for a few days, then suddenly it started giving out this error: Net::ReadTimeout (Net::ReadTimeout) right when i = 120.
Is there anything I can do to fix this?
Given(/^I go to "([^"]*)"?/) do |url|
visit(url)
end
Given("I save all emails") do
emails = 0
i = 119
until emails >= 500
until i == 120
fetch_emails(i, emails)
i += 1
end
click_next_button
emails += 120
puts emails
i = 1
puts i
end
end
def fetch_emails(i, emails)
find(:xpath, "(//a[#class='result-title hdrlnk'])[#{i}]").click
if Capybara.has_xpath?("//button[#class='reply-button js-only']")
find(:xpath, "//button[#class='reply-button js-only']").click
sleep(1)
if Capybara.has_xpath?("//p[#class='reply-email-address']")
# puts find(:xpath, "//p[#class='reply-email-address']//a").text
open('RESULTS.csv', 'a') do |f|
f << find(:xpath, "//p[#class='reply-email-address']//a").text + "\n"
end
end
end
# step %{I go to "https://newyork.craigslist.org/search/fbh?s=#{emails}"}
step %{I go to "https://newyork.craigslist.org/search/rfh?s=#{emails}"}
# step %{I go to "https://newyork.craigslist.org/search/lab?s=#{emails}"}
# step %{I go to "https://newyork.craigslist.org/search/spa?s=#{emails}"}
# step %{I go to "https://newyork.craigslist.org/search/trd?s=#{emails}"}
end
def click_next_button
first(".next").click
sleep(2)
end

If your chrome is upgrade to latest versions then use below capabilities
capabilities = Selenium::WebDriver::Remote::Capabilities.chrome(
chromeOptions: {
args: %w[
headless disable-gpu no-sandbox
--window-size=1980,1080 --enable-features=NetworkService,NetworkServiceInProcess
]
}
)
Capybara::Selenium::Driver.new app, browser: :chrome, desired_capabilities: capabilities

Related

Is there any validation check for already registered email id in ruby programming

# load in the webdriver gem to interect with selenium
require 'selenium-webdriver'
#setup chrome plugin
driver = Selenium::WebDriver::Chrome.driver_path='C:\Users\vidhi\Desktop\Ruby Folder\chromedriver.exe'
#this line will start the browser
driver = Selenium::WebDriver.for :chrome
wait = Selenium::WebDriver::Wait.new(:timeout => 20)
def document_initialised(driver)
driver.execute_script('return initialised')
end
#Navigate to URl
driver.get "http://automationpractice.com/index.php?controller=authentication&back=my-account#account-creation"
#Maximize the window
driver.manage.window.maximize
sleep 6
driver.action.key_down(:enter).perform
sleep 5
driver.find_element(:id,"email_create").send_keys "demouser099#gmail.com"
sleep 5
driver.action.key_down(:enter).perform
driver.find_element(:id,"SubmitCreate").click
sleep 2
driver.action.key_down(:enter).perform
#### Check that the radio button exists
puts "Test Passed: Radio button found" if wait.until {
driver.find_element(:id,"uniform-id_gender2").displayed?
}
#Change the state of the Radio Buttons
cb1 = wait.until {
element1 = driver.find_element(:id,"uniform-id_gender1")
element1 if element1.displayed?
}
cb1.click if cb1.selected? == false
cb3 = wait.until {
element2 = driver.find_element(:id,"uniform-id_gender2")
element2 if element2.displayed?
}
cb3.click if cb3.selected? == false
sleep 4
driver.find_element(:id,"customer_firstname").send_keys "demo"
sleep 3
driver.find_element(:id,"customer_lastname").send_keys "user"
sleep 5
password=driver.find_element(:id,"passwd").send_keys "demo#123"
sleep 4
if driver.find_element(:id,"uniform-days").displayed?
puts "Days dropdown is displayed.."
select = driver.find_element(:id,"uniform-days")
alloptions = select.find_elements(:tag_name,"option")
puts alloptions.size
alloptions.each do |option|
puts "Value is.."+option.attribute("value")
if option.attribute("value")=="20"
option.click
puts "Value has been selected.."
sleep 5
break
end
end
end
month_dropdown = driver.find_element(:id,"months")
months=Selenium::WebDriver::Support::Select.new(month_dropdown)
months.select_by(:text,"April ")
puts driver.find_element(:id,"months").text.include?("April")
years_dropdown = driver.find_element(:id,"years")
years=Selenium::WebDriver::Support::Select.new(years_dropdown)
years.select_by(:index,28)
sleep 3
driver.find_element(:id,"address1").send_keys "45 calony A"
sleep 4
if driver.find_element(:id,"uniform-id_state").displayed?
puts "State dropdown is displayed.."
select = driver.find_element(:id,"uniform-id_state")
alloptions = select.find_elements(:tag_name,"option")
puts alloptions.size
alloptions.each do |option|
puts "Text is.."+option.attribute("text")
if option.attribute("text")=="California"
option.click
puts "text has been selected.."
sleep 5
break
end
end
end
#Enter city
driver.find_element(:id,"city").send_keys "Los Angeles"
sleep 4
driver.find_element(:id,"postcode").send_keys "23654"
Country_dropdown =driver.find_element(:id,"id_country")
country=Selenium::WebDriver::Support::Select.new(Country_dropdown)
country.select_by(:index,1)
sleep 5
#Input Mobile Number
driver.find_element(:id,"phone_mobile").send_keys "985256238"
sleep 5
#Click on Submit button
driver.find_element(:id,"submitAccount").click
sleep 5
I have write script for registration form I have run my automation script then on first time it is successfully pass but when run twice with same email id then it is showing validation message as email already exists try with new one..so for this validation what is the assertion in ruby?
Here is the link for registration form-http://automationpractice.com/index.php?controller=authentication&back=my-account#account-creation
General side notes:
I see that you have a method defined that you don't use.
def document_initialised(driver)
driver.execute_script('return initialised')
end
Also there seems to be some un-nneeded variable assignment and not-needed driver.action.key_down(:enter).perform
It is possible to create a class to keep your code more structured ;)
To your question:
I would create an if to check if the error is displayed on the page or not after you did you submit with driver.find_element(:id,"SubmitCreate").click. You can see an example in about it in #register_email in my example.
If I create some class from it and do some other removal of code I think is obsolete, it looks like this (but it has not been tested by me).
# load in the webdriver gem to interect with selenium
require 'selenium-webdriver'
class RegisterUser
def run
return unless register_email
do_the_rest
end
private
def driver
return #driver if #driver.present?
Selenium::WebDriver::Chrome.driver_path='C:\Users\vidhi\Desktop\Ruby Folder\chromedriver.exe'
#driver = Selenium::WebDriver.for(:chrome)
#driver.get "http://automationpractice.com/index.php?controller=authentication&back=my-account#account-creation"
sleep 6
#driver.manage.window.maximize
#driver
end
def wait
Selenium::WebDriver::Wait.new(:timeout => 20)
end
def register_email
driver.find_element(:id,"email_create").send_keys "demouser099#gmail.com"
driver.find_element(:id,"SubmitCreate").click
sleep 5
return true unless driver.find_element(:id,"create_account_error").displayed?
# An account using this email address has already been registered.
# Please enter a valid password or request a new one.
puts driver.find_element(:id,"create_account_error").text
false
end
def do_the_rest
# Here you can put the rest of the code, or clean up even more and split into multiple methods
end
end
RegisterUser.new.run

How to take screenshot in selenium webdriver with ruby with date and time included in screenshot name?

I am trying to get a screenshot at every step with the current date and time, but I am getting the error
Error: test_login(Login_page): Argument Error: wrong number of arguments (1 for 0)
The code is
def setup
#driver = Selenium::WebDriver.for :chrome
#driver.manage.window.maximize
#driver.navigate.to "https://www.findmedecor.com"
wait = Selenium::WebDriver::Wait.new(:timeout => 10)
screenshot()
end
def test_login
#driver.find_element(:class,'open-overlay').click
screenshot(DateTime.now)
wait = Selenium::WebDriver::Wait.new(:timeout => 10)
login_email = wait.until {
element = #driver.find_element(:name, "login_email")
element if element.displayed?
}
login_email.send_keys("suwarna.wade#rohagroup.com")
puts "Test Passed: login pop up found" if login_email.displayed?
screenshot(DateTime.now)
#driver.find_element(:id,'pass').send_keys('123456')
#driver.find_element(:id,'btn_login').click
puts "Logged in successfully"
puts "Time of test = ", DateTime.now
screenshot(DateTime.now)
end
$i = DateTime.now
def screenshot
#driver.save_screenshot("screenshot #{'$i'}.png")
$i= +1
end
end
The problem is that Time.now returns a format like '2016-09-28 04:45:40 +0000' which is not a valid filename on Windows. You can just reformat the date/time to something valid like
Time.now.strftime('%Y-%m-%d_%H.%M.%S')
which outputs 2016-09-27_23.33.59 and then put that in your filename.
http://ruby-doc.org/core-2.2.0/Time.html#method-i-strftime

/settings/ads/ Keeps popping up while scraping Google

I have a program that scrapes Google, it's an open source vulnerability scraper that uses mechanize to search Google. It uses a random search query provided in a text file to decide what to search for.
I'll post the main file and a link to the git due to the size of the program.
Anyways, I have this program that is used to scrape for sites, however, while it is scraping every now and then it comes across a 'URL' (I say that lightly) that looks like this:
[17:05:02 INFO]I'll run in default mode!
[17:05:02 INFO]I'm searching for possible SQL vulnerable sites, using search query inurl:/main.php?f1=
[17:05:04 SUCCESS]Site found: http://forix.autosport.com/main.php?l=0&c=1
[17:05:05 SUCCESS]Site found: https://zweeler.com/formula1/FantasyFormula12016/main.php?ref=103
[17:05:06 SUCCESS]Site found: https://en.zweeler.com/formula1/FantasyFormula1YearGame2015/main.php
[17:05:07 SUCCESS]Site found: http://modelcargo.com/main.php?mod=sambachoose&dep=samba
[17:05:08 SUCCESS]Site found: http://www.ukdirt.co.uk/main.php?P=rules&f=8
[17:05:09 SUCCESS]Site found: http://www.ukdirt.co.uk/main.php?P=tracks&g=2&d=2&m=0
[17:05:11 SUCCESS]Site found: http://zoohoo.sk/redir.php?q=v%FDsledok&url=http%3A%2F%2Flivescore.sk%2Fmain.php%3Flang%3Dsk
[17:05:12 SUCCESS]Site found: http://www.chemical-plus.com/main.php?f1=pearl_pigment.htm
[17:05:13 SUCCESS]Site found: http://www.fantasyf1.co/main.php
[17:05:14 SUCCESS]Site found: http://www.escritores.cl/base.php?f1=escritores/main.php
[17:05:15 SUCCESS]Site found: /settings/ads/preferences?hl=en #<= Right here
When this shows up, it completely crashes the program. I've tried doing the following:
next if urls == '/settings/ads/preferences?hl=en'
next if urls =~ /preferences?hl=en/
next if urls.split('/')[2] == 'ads/preferences?hl=en'
However, it keeps popping up. Also I should mention, the last 5 characters depend on your locations, so far I've seen:
hl=en
hl=ru
hl=ia
Does anybody have any idea what this is, I've done some research and literally can't find anything on it. Any help with this would be fantastic.
Main source:
#!/usr/local/env ruby
require 'rubygems'
require 'bundler/setup'
require 'mechanize'
require 'nokogiri'
require 'rest-client'
require 'timeout'
require 'uri'
require 'fileutils'
require 'colored'
require 'yaml'
require 'date'
require 'optparse'
require 'tempfile'
require 'socket'
require 'net/http'
require_relative 'lib/modules/format.rb'
require_relative 'lib/modules/credits.rb'
require_relative 'lib/modules/legal.rb'
require_relative 'lib/modules/spider.rb'
require_relative 'lib/modules/copy.rb'
require_relative 'lib/modules/site_info.rb'
include Format
include Credits
include Legal
include Whitewidow
include Copy
include SiteInfo
PATH = Dir.pwd
VERSION = Whitewidow.version
SEARCH = File.readlines("#{PATH}/lib/search_query.txt").sample
info = YAML.load_file("#{PATH}/lib/rand-agents.yaml")
#user_agent = info['user_agents'][info.keys.sample]
OPTIONS = {}
def usage_page
Format.usage("You can run me with the following flags: #{File.basename(__FILE__)} -[d|e|h] -[f] <path/to/file/if/any>")
exit
end
def examples_page
Format.usage('This is my examples page, I\'ll show you a few examples of how to get me to do what you want.')
Format.usage('Running me with a file: whitewidow.rb -f <path/to/file> keep the file inside of one of my directories.')
Format.usage('Running me default, if you don\'t want to use a file, because you don\'t think I can handle it, or for whatever reason, you can run me default by passing the Default flag: whitewidow.rb -d this will allow me to scrape Google for some SQL vuln sites, no guarentees though!')
Format.usage('Running me with my Help flag will show you all options an explanation of what they do and how to use them')
Format.usage('Running me without a flag will show you the usage page. Not descriptive at all but gets the point across')
end
OptionParser.new do |opt|
opt.on('-f FILE', '--file FILE', 'Pass a file name to me, remember to drop the first slash. /tmp/txt.txt <= INCORRECT tmp/text.txt <= CORRECT') { |o| OPTIONS[:file] = o }
opt.on('-d', '--default', 'Run me in default mode, this will allow me to scrape Google using my built in search queries.') { |o| OPTIONS[:default] = o }
opt.on('-e', '--example', 'Shows my example page, gives you some pointers on how this works.') { |o| OPTIONS[:example] = o }
end.parse!
def page(site)
Nokogiri::HTML(RestClient.get(site))
end
def parse(site, tag, i)
parsing = page(site)
parsing.css(tag)[i].to_s
end
def format_file
Format.info('Writing to temporary file..')
if File.exists?(OPTIONS[:file])
file = Tempfile.new('file')
IO.read(OPTIONS[:file]).each_line do |s|
File.open(file, 'a+') { |format| format.puts(s) unless s.chomp.empty? }
end
IO.read(file).each_line do |file|
File.open("#{PATH}/tmp/#sites.txt", 'a+') { |line| line.puts(file) }
end
file.unlink
Format.info("File: #{OPTIONS[:file]}, has been formatted and saved as #sites.txt in the tmp directory.")
else
puts <<-_END_
Hey now my friend, I know you're eager, I am also, but that file #{OPTIONS[:file]}
either doesn't exist, or it's not in the directory you say it's in..
I'm gonna need you to go find that file, move it to the correct directory and then
run me again.
Don't worry I'll wait!
_END_
.yellow.bold
end
end
def get_urls
Format.info("I'll run in default mode!")
Format.info("I'm searching for possible SQL vulnerable sites, using search query #{SEARCH}")
agent = Mechanize.new
agent.user_agent = #user_agent
page = agent.get('http://www.google.com/')
google_form = page.form('f')
google_form.q = "#{SEARCH}"
url = agent.submit(google_form, google_form.buttons.first)
url.links.each do |link|
if link.href.to_s =~ /url.q/
str = link.href.to_s
str_list = str.split(%r{=|&})
urls = str_list[1]
next if urls.split('/')[2].start_with? 'stackoverflow.com', 'github.com', 'www.sa-k.net', 'yoursearch.me', 'search1.speedbit.com', 'duckfm.net', 'search.clearch.org', 'webcache.googleusercontent.com'
next if urls == '/settings/ads/preferences?hl=en' #<= ADD HERE REMEMBER A COMMA =>
urls_to_log = URI.decode(urls)
Format.success("Site found: #{urls_to_log}")
sleep(1)
sql_syntax = ["'", "`", "--", ";"].each do |sql|
File.open("#{PATH}/tmp/SQL_sites_to_check.txt", 'a+') { |s| s.puts("#{urls_to_log}#{sql}") }
end
end
end
Format.info("I've dumped possible vulnerable sites into #{PATH}/tmp/SQL_sites_to_check.txt")
end
def vulnerability_check
case
when OPTIONS[:default]
file_to_read = "tmp/SQL_sites_to_check.txt"
when OPTIONS[:file]
Format.info("Let's check out this file real quick like..")
file_to_read = "tmp/#sites.txt"
end
Format.info('Forcing encoding to UTF-8') unless OPTIONS[:file]
IO.read("#{PATH}/#{file_to_read}").each_line do |vuln|
begin
Format.info("Parsing page for SQL syntax error: #{vuln.chomp}")
Timeout::timeout(10) do
vulns = vuln.encode(Encoding.find('UTF-8'), {invalid: :replace, undef: :replace, replace: ''})
begin
if parse("#{vulns.chomp}'", 'html', 0)[/You have an error in your SQL syntax/]
Format.site_found(vulns.chomp)
File.open("#{PATH}/tmp/SQL_VULN.txt", "a+") { |s| s.puts(vulns) }
sleep(1)
else
Format.warning("URL: #{vulns.chomp} is not vulnerable, dumped to non_exploitable.txt")
File.open("#{PATH}/log/non_exploitable.txt", "a+") { |s| s.puts(vulns) }
sleep(1)
end
rescue Timeout::Error, OpenSSL::SSL::SSLError
Format.warning("URL: #{vulns.chomp} failed to load dumped to non_exploitable.txt")
File.open("#{PATH}/log/non_exploitable.txt", "a+") { |s| s.puts(vulns) }
next
sleep(1)
end
end
rescue RestClient::ResourceNotFound, RestClient::InternalServerError, RestClient::RequestTimeout, RestClient::Gone, RestClient::SSLCertificateNotVerified, RestClient::Forbidden, OpenSSL::SSL::SSLError, Errno::ECONNREFUSED, URI::InvalidURIError, Errno::ECONNRESET, Timeout::Error, OpenSSL::SSL::SSLError, Zlib::GzipFile::Error, RestClient::MultipleChoices, RestClient::Unauthorized, SocketError, RestClient::BadRequest, RestClient::ServerBrokeConnection, RestClient::MaxRedirectsReached => e
Format.err("URL: #{vuln.chomp} failed due to an error while connecting, URL dumped to non_exploitable.txt")
File.open("#{PATH}/log/non_exploitable.txt", "a+") { |s| s.puts(vuln) }
next
end
end
end
case
when OPTIONS[:default]
begin
Whitewidow.spider
sleep(1)
Credits.credits
sleep(1)
Legal.legal
get_urls
vulnerability_check unless File.size("#{PATH}/tmp/SQL_sites_to_check.txt") == 0
Format.warn("No sites found for search query: #{SEARCH}. Logging into error_log.LOG. Create a issue regarding this.") if File.size("#{PATH}/tmp/SQL_sites_to_check.txt") == 0
File.open("#{PATH}/log/error_log.LOG", 'a+') { |s| s.puts("No sites found with search query #{SEARCH}") } if File.size("#{PATH}/tmp/SQL_sites_to_check.txt") == 0
File.truncate("#{PATH}/tmp/SQL_sites_to_check.txt", 0)
Format.info("I'm truncating SQL_sites_to_check file back to #{File.size("#{PATH}/tmp/SQL_sites_to_check.txt")}")
Copy.file("#{PATH}/tmp/SQL_VULN.txt", "#{PATH}/log/SQL_VULN.LOG")
File.truncate("#{PATH}/tmp/SQL_VULN.txt", 0)
Format.info("I've run all my tests and queries, and logged all important information into #{PATH}/log/SQL_VULN.LOG")
rescue Mechanize::ResponseCodeError, RestClient::ServiceUnavailable, OpenSSL::SSL::SSLError, RestClient::BadGateway => e
d = DateTime.now
Format.fatal("Well this is pretty crappy.. I seem to have encountered a #{e} error. I'm gonna take the safe road and quit scanning before I break something. You can either try again, or manually delete the URL that caused the error.")
File.open("#{PATH}/log/error_log.LOG", 'a+'){ |error| error.puts("[#{d.month}-#{d.day}-#{d.year} :: #{Time.now.strftime("%T")}]#{e}") }
Format.info("I'll log the error inside of #{PATH}/log/error_log.LOG for further analysis.")
end
when OPTIONS[:file]
begin
Whitewidow.spider
sleep(1)
Credits.credits
sleep(1)
Legal.legal
Format.info('Formatting file')
format_file
vulnerability_check
File.truncate("#{PATH}/tmp/SQL_sites_to_check.txt", 0)
Format.info("I'm truncating SQL_sites_to_check file back to #{File.size("#{PATH}/tmp/SQL_sites_to_check.txt")}")
Copy.file("#{PATH}/tmp/SQL_VULN.txt", "#{PATH}/log/SQL_VULN.LOG")
File.truncate("#{PATH}/tmp/SQL_VULN.txt", 0)
Format.info("I've run all my tests and queries, and logged all important information into #{PATH}/log/SQL_VULN.LOG") unless File.size("#{PATH}/log/SQL_VULN.LOG") == 0
rescue Mechanize::ResponseCodeError, RestClient::ServiceUnavailable, OpenSSL::SSL::SSLError, RestClient::BadGateway => e
d = DateTime.now
Format.fatal("Well this is pretty crappy.. I seem to have encountered a #{e} error. I'm gonna take the safe road and quit scanning before I break something. You can either try again, or manually delete the URL that caused the error.")
File.open("#{PATH}/log/error_log.LOG", 'a+'){ |error| error.puts("[#{d.month}-#{d.day}-#{d.year} :: #{Time.now.strftime("%T")}]#{e}") }
Format.info("I'll log the error inside of #{PATH}/log/error_log.LOG for further analysis.")
end
when OPTIONS[:example]
examples_page
else
Format.warning('You failed to pass me a flag!')
usage_page
end
IS there anything within this code, that would cause this to randomly popup? It only happens with random search queries.
Link to GitHub
UPDATE:
Ive discovered that Googles advertisement services link has the same extension in its URL as the one giving me problems.. However this doesn't explain why I'm getting this link, and why I can't seem to skip over it.
urls = "settings/ads/preferences?hl=ru"
if urls =~ /settings\/ads\/preferences\?hl=[a-z]{2}/
p "I'm skipped"
end
=> "I'm skipped"

Ruby Mechanize Stops Working while in Each Do Loop

I am using a mechanize Ruby script to loop through about 1,000 records in a tab delimited file. Everything works as expected until i reach about 300 records.
Once I get to about 300 records, my script keeps calling rescue on every attempt and eventually stops working. I thought it was because I had not properly set max_history, but that doesn't seem to be making a difference.
Here is the error message that I start getting:
getaddrinfo: nodename nor servname provided, or not known
Any ideas on what I might be doing wrong here?
require 'mechanize'
result_counter = 0
used_file = File.open(ARGV[0])
total_rows = used_file.readlines.size
mechanize = Mechanize.new { |agent|
agent.open_timeout = 10
agent.read_timeout = 10
agent.max_history = 0
}
File.open(ARGV[0]).each do |line|
item = line.split("\t").map {|item| item.strip}
website = item[16]
name = item[11]
if website
begin
tries ||= 3
page = mechanize.get(website)
primary1 = page.link_with(text: 'text')
secondary1 = page.link_with(text: 'other_text')
contains_primary = true
contains_secondary = true
unless contains_primary || contains_secondary
1.times do |count|
result_counter+=1
STDERR.puts "Generate (#{result_counter}/#{total_rows}) #{name} - No"
end
end
for i in [primary1]
if i
page_to_visit = i.click
page_found = page_to_visit.uri
1.times do |count|
result_counter+=1
STDERR.puts "Generate (#{result_counter}/#{total_rows}) #{name}"
end
break
end
end
rescue Timeout::Error
STDERR.puts "Generate (#{result_counter}/#{total_rows}) #{name} - Timeout"
rescue => e
STDERR.puts e.message
STDERR.puts "Generate (#{result_counter}/#{total_rows}) #{name} - Rescue"
end
end
end
You get this error because you don't close the connection after you used it.
This should fix your problem:
mechanize = Mechanize.new { |agent|
agent.open_timeout = 10
agent.read_timeout = 10
agent.max_history = 0
agent.keep_alive = false
}

mechanize html scraping problem

so i am trying to extract the email of my website using ruby mechanize and hpricot.
what i a trying to do its loop on all the page of my administration side and parse the pages with hpricot.so far so good. Then I get:
Exception `Net::HTTPBadResponse' at /usr/lib/ruby/1.8/net/http.rb:2022 - wrong status line: *SOME HTML CODE HERE*
when it parse a bunch of page , its starts with a timeout and then print the html code of the page.
cant understand why? how can i debug that?
its seems like mechanize can get more than 10 page on a row ?? is it possible??
thanks
require 'logger'
require 'rubygems'
require 'mechanize'
require 'hpricot'
require 'open-uri'
class Harvester
def initialize(page)
#page=page
#agent = WWW::Mechanize.new{|a| a.log = Logger.new("logs.log") }
#agent.keep_alive=false
#agent.read_timeout=15
end
def login
f = #agent.get( "http://****.com/admin/index.asp") .forms.first
f.set_fields(:username => "user", :password =>"pass")
f.submit
end
def harvest(s)
pageNumber=1
##agent.read_timeout =
s.upto(#page) do |pagenb|
puts "*************************** page= #{pagenb}/#{#page}***************************************"
begin
#time=Time.now
#search=#agent.get( "http://****.com/admin/members.asp?action=search&term=&state_id=&r=500&p=#{page}")
extract(pagenb)
rescue => e
puts "unknown #{e.to_s}"
#puts "url:http://****.com/admin/members.asp?action=search&term=&state_id=&r=500&p=#{page}"
#sleep(2)
extract(pagenb)
rescue Net::HTTPBadResponse => e
puts "net exception"+ e.to_s
rescue WWW::Mechanize::ResponseCodeError => ex
puts "mechanize error: "+ex.response_code
rescue Timeout::Error => e
puts "timeout: "+e.to_s
end
end
end
def extract(page)
#puts search.body
search=#agent.get( "http://***.com/admin/members.asp?action=search&term=&state_id=&r=500&p=#{page}")
doc = Hpricot(search.body)
#remove titles
#~ doc.search("/html/body/div/table[2]/tr/td[2]/table[3]/tr[1]").remove
(doc/"/html/body/div/table[2]/tr/td[2]/table[3]//tr").each do |tr|
#delete the phone number from the html
temp = tr.search("/td[2]").inner_html
index = temp.index('<')
email = temp[0..index-1]
puts email
f=File.open("./emails", 'a')
f.puts(email)
f.close
end
end
end
puts "starting extacting emails ... "
start =ARGV[0].to_i
h=Harvester.new(186)
h.login
h.harvest(start)
Mechanize puts full content of a page into history, this may cause problems when browsing through many pages. To limit the size of history, try
#mech = WWW::Mechanize.new do |agent|
agent.history.max_size = 1
end

Resources