Ruby - Getting page content even if it doesn't exist

I am trying to put together a series of custom 404 pages, but the following code exits the program with an error:
require 'uri'

def open(url)
  page_content = Net::HTTP.get(URI.parse(url))
  puts page_content.content
end

open('http://somesite.com/1ygjah1761')
How can I get the page content from a website, regardless of whether the response is a 404 or not?

You need to rescue from the error:
require 'net/http'
require 'uri'

def open(url)
  page_content = ""
  begin
    page_content = Net::HTTP.get(URI.parse(url))
    puts page_content
  rescue Net::HTTPNotFound
    puts "THIS IS 404" + page_content
  end
end
You can find more information on something like this here: http://tammersaleh.com/posts/rescuing-net-http-exceptions/

Net::HTTP.get returns the page content directly as a string, so there is no need to call .content on the results:
page_content = Net::HTTP.get(URI.parse(url))
puts page_content
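If you also want to know whether the page was a 404 while still reading its body, Net::HTTP.get_response returns the full response object rather than just the body string. A minimal sketch along those lines (fetch_page is just an illustrative name, and the URL is the placeholder from the question):
require 'net/http'
require 'uri'

def fetch_page(url)
  response = Net::HTTP.get_response(URI.parse(url))
  puts "Status: #{response.code}"  # "404" for a missing page
  puts response.body               # the body is returned either way
end

fetch_page('http://somesite.com/1ygjah1761')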

Related

Getting all unique URLs using Nokogiri

I've been working for a while trying to use the .uniq method to generate a unique list of URLs from a website (within the /informatics path). No matter what I try, I get a method error when generating the list. I'm sure it's a syntax issue, and I was hoping someone could point me in the right direction.
Once I have the list I'm going to need to store it in a database via ActiveRecord, but I need the unique list before I start to wrap my head around that.
require 'nokogiri'
require 'open-uri'
require 'active_record'

ARGV[0] = "https://www.nku.edu/academics/informatics.html"

ARGV.each do |arg|
  open(arg) do |f|
    # Display connection data
    puts "#" * 25 + "\nConnection: '#{arg}'\n" + "#" * 25
    [:base_uri, :meta, :status, :charset, :content_encoding,
     :content_type, :last_modified].each do |method|
      puts "#{method}: #{f.send(method)}" if f.respond_to? method
    end

    # Display the href links
    base_url = /^(.*\.nku\.edu)\//.match(f.base_uri.to_s)[1]
    puts "base_url: #{base_url}"

    Nokogiri::HTML(f).css('a').each do |anchor|
      href = anchor['href']
      # Make unique
      if href =~ /.*informatics/
        puts href
        # store stuff to ActiveRecord
      end
    end
  end
end
Replace the Nokogiri::HTML part to select only the anchors whose href matches /.*informatics/; since select returns an array, you can then call uniq on it:
require 'nokogiri'
require 'open-uri'
require 'active_record'

ARGV[0] = 'https://www.nku.edu/academics/informatics.html'

ARGV.each do |arg|
  open(arg) do |f|
    puts "#{'#' * 25}\nConnection: '#{arg}'\n#{'#' * 25}"

    %i[base_uri meta status charset content_encoding content_type last_modified].each do |method|
      puts "#{method}: #{f.send(method)}" if f.respond_to? method
    end

    puts "base_url: #{/^(.*\.nku\.edu)\//.match(f.base_uri.to_s)[1]}"

    anchors = Nokogiri::HTML(f).css('a').select { |anchor| anchor['href'] =~ /.*informatics/ }
    puts anchors.map { |anchor| anchor['href'] }.uniq
  end
end
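If you prefer to work with the href strings directly, the same filtering (inside the same open(arg) block) can be written with map and grep; this is just a variation on the answer above, not part of it:
hrefs = Nokogiri::HTML(f).css('a').map { |anchor| anchor['href'] }.compact
puts hrefs.grep(/informatics/).uniq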

syntax error, unexpected end-of-input, expecting keyword_end

I'm trying to write a simple program to parse JSON from the results of an API call. I'm very new to Ruby and just can't figure this one out.
Here's all the code:
require "rubygems"
require "json"
require "net/http"
require "uri"
uri = URI.parse("http://api.chartbeat.com/live/recent/v3/?apikey=eaafffb9a735796b6edd50fd31eaab69&host=enactus.org")
http = Net::HTTP.new(uri.host, uri.port)
request = Net::HTTP::Get.new(uri.request_uri)
response = http.request(request)
if response.code == "200"
result = JSON.parse(response.body)
result.each do |doc|
puts doc["id"] #reference properties like this
puts doc # this is the result in object form
puts ""
puts ""
end
else
puts "ERROR!!!"
end
Here's the output of running the program (chartbeat.rb):
chartbeat.rb:14: syntax error, unexpected end-of-input, expecting keyword_end
The program comes verbatim from here with the url replaced: https://gist.github.com/timsavery/1657351
It doesn't look like what you're doing takes advantage of any of Net::HTTP's power, so I'd probably do it like this:
require "rubygems"
require "json"
require "open-uri"
response = open("http://api.chartbeat.com/live/recent/v3/?apikey=eaafffb9a735796b6edd50fd31eaab69&host=enactus.org").read
result = JSON.parse(response)
result.each do |doc|
puts doc["id"] #reference properties like this
puts doc # this is the result in object form
puts ""
puts ""
end
OpenURI is the basis of a lot of code that hits URLs, and is a great starting place.
If you want to trap exceptions raised, use something like:
begin
  response = open("http://api.chartbeat.com/live/recent/v3/?apikey=eaafffb9a735796b6edd50fd31eaab69&host=enactus.org").read
rescue Exception => e
  puts e.message
  exit
end
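Rescuing Exception is broader than you usually want; for HTTP error statuses specifically, open-uri raises OpenURI::HTTPError, so the rescue could be narrowed to something like this sketch:
require 'open-uri'

begin
  response = open("http://api.chartbeat.com/live/recent/v3/?apikey=eaafffb9a735796b6edd50fd31eaab69&host=enactus.org").read
rescue OpenURI::HTTPError => e
  puts e.message # e.g. "404 Not Found"
  exit
end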
The original example could even be reduced to:
require "rubygems"
require "json"
require "open-uri"
JSON[
open("http://api.chartbeat.com/live/recent/v3/?apikey=eaafffb9a735796b6edd50fd31eaab69&host=enactus.org").read
].each do |doc|
puts doc["id"] #reference properties like this
puts doc # this is the result in object form
puts ""
puts ""
end
But that might be too drastic.
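As an aside, JSON[] delegates to JSON.parse when it is given a String (and to JSON.generate otherwise), so the short and long versions parse the response identically. A quick check:
require 'json'
JSON['{"id": 42}'] # => {"id"=>42}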

Using Mechanize to validate a set of URLs

I'm trying to validate an array of URLs using Mechanize. I am getting a 404 for one of the URLs, and it ends my loop instead of going through the rescue. I want the loop to continue even if it hits a 404. Am I doing something wrong with the begin/rescue syntax? I'm just displaying the results in the terminal for the time being.
a.get(url) do |page|
  begin
    puts url
    puts page.title
  rescue Mechanize::ResponseCodeError, Net::HTTPNotFound
    puts "404!- #{url}"
    next
  end
end
You need your begin/rescue/end around a.get, since the exception is raised by the get call itself, before the block is ever entered:
begin
  a.get(url) do |page|
    puts url
    puts page.title
  end
rescue Mechanize::ResponseCodeError, Net::HTTPNotFound
  puts "404!- #{url}"
  next
end
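For the whole array, the begin/rescue goes inside the loop so that one bad URL doesn't stop the rest. A rough sketch, assuming urls is your array and a is the Mechanize agent:
urls.each do |url|
  begin
    page = a.get(url)
    puts url
    puts page.title
  rescue Mechanize::ResponseCodeError => e
    # response_code is the status as a string, e.g. "404"
    puts "404!- #{url}" if e.response_code == '404'
  end
end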

Anemone: print links on first page

I wanted to see what I was doing wrong here. I need to print the links on the parent page, even if they are for another domain, and then stop.
require 'anemone'

url = ARGV[0]

Anemone.crawl(url, :depth_limit => 1) do |anemone|
  anemone.on_every_page do |page|
    page.links.each do |link|
      puts link
    end
  end
end
What am I not doing right?
Edit: it outputs nothing.
This worked for me:
require 'anemone'
require 'optparse'

file = ARGV[0]

File.open(file).each do |url|
  url = URI.parse(URI.encode(url.strip))
  Anemone.crawl(url, :discard_page_bodies => true) do |anemone|
    anemone.on_every_page do |page|
      links = page.doc.xpath("//a/@href")
      if links != nil
        links.each do |link|
          puts link.to_s
        end
      end
    end
  end
end
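Note that this version reads the URLs from a file rather than taking one URL as an argument; you would run it as, for example (the file names here are just illustrative): ruby crawl_links.rb urls.txt, where urls.txt lists one URL per line.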

How do I get the destination URL of a shortened URL using Ruby?

How do I take this URL http://t.co/yjgxz5Y and get the destination URL which is http://nickstraffictricks.com/4856_how-to-rank-1-in-google/
require 'net/http'
require 'uri'
Net::HTTP.get_response(URI.parse('http://t.co/yjgxz5Y'))['location']
# => "http://nickstraffictricks.com/4856_how-to-rank-1-in-google/"
I've used open-uri for this, because it's nice and simple. It will retrieve the page, but will also follow multiple redirects:
require 'open-uri'

final_uri = ''
open('http://t.co/yjgxz5Y') do |h|
  final_uri = h.base_uri
end
final_uri # => #<URI::HTTP:0x00000100851050 URL:http://nickstraffictricks.com/4856_how-to-rank-1-in-google/>
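base_uri is a URI object, so call to_s on it if you need the plain string:
final_uri.to_s # => "http://nickstraffictricks.com/4856_how-to-rank-1-in-google/"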
The docs show a nice example for using the lower-level Net::HTTP to handle redirects.
require 'net/http'
require 'uri'

def fetch(uri_str, limit = 10)
  # You should choose a better exception.
  raise ArgumentError, 'HTTP redirect too deep' if limit == 0

  response = Net::HTTP.get_response(URI.parse(uri_str))
  case response
  when Net::HTTPSuccess     then response
  when Net::HTTPRedirection then fetch(response['location'], limit - 1)
  else
    response.error!
  end
end

puts fetch('http://www.ruby-lang.org')
Of course this all breaks down if the page isn't using an HTTP redirect. A lot of sites use meta-redirects, which you have to handle by retrieving the URL from the meta tag, but that's a different question.
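For completeness, a meta refresh can be read from the page with Nokogiri. A rough sketch, not part of the original answer (the URL is just a placeholder):
require 'nokogiri'
require 'open-uri'

html = Nokogiri::HTML(open('http://example.com/short'))
meta = html.at('meta[http-equiv="refresh"]')
if meta
  # content looks like "0; url=http://example.com/destination"
  puts meta['content'][/url=(.*)/i, 1]
end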
For resolving redirects you should use a HEAD request to avoid downloading the whole response body (imagine resolving a URL to an audio or video file).
Working example using the Faraday gem:
require 'faraday'
require 'faraday_middleware'

def resolve_redirects(url)
  response = fetch_response(url, method: :head)
  if response
    return response.to_hash[:url].to_s
  else
    return nil
  end
end

def fetch_response(url, method: :get)
  conn = Faraday.new do |b|
    b.use FaradayMiddleware::FollowRedirects
    b.adapter :net_http
  end
  return conn.send method, url
rescue Faraday::Error, Faraday::Error::ConnectionFailed => e
  return nil
end
puts resolve_redirects("http://cre.fm/feed/m4a") # http://feeds.feedburner.com/cre-podcast
You would have to follow the redirect. I think this would help:
http://shadow-file.blogspot.com/2009/03/handling-http-redirection-in-ruby.html
