Nokogiri scraper not sourcing properly - ruby

I'm trying to build a scraper for a web page for data transfer, and hitting a sourcing issue.
require 'nokogiri'
require 'byebug'
require 'httparty'
require "open-uri"
def scraper
url = "https://page text replaced for privacy"
unparsed_page = HTTParty.get(url, verify: false)
parsed_page = Nokogiri::HTML(unparsed_page, nil, Encoding::UTF_8.to_s)
items = Array.new
products = parsed_page.css("div.product-column > div.row")
products.each do |product|
product = {
byebug
summary: product.css("div.summary> a").value
# image: product.css("div.photo>a.img-responsive").attribute("src").value,
byebug
}
items << product
puts "Added #{product[:summary]}"
# puts "Added #{product[:image]}"
end
end
scraper
When I run byebug on the first line of the .each loop I get this error:
scraper.rb:18: syntax error, unexpected '\n', expecting =>
scraper.rb:21: syntax error, unexpected '}', expecting end
}
scraper.rb:26: syntax error, unexpected end, expecting end-of-input
I think I'm struggling to grab it properly. Any ideas?

Related

Getting all unique URL's using nokogiri

I've been working for a while to try to use the .uniq method to generate a unique list of URL's from a website (within the /informatics path). No matter what I try I get a method error when trying to generate the list. I'm sure it's a syntax issue, and I was hoping someone could point me in the right direction.
Once I get the list I'm going to need to store these to a database via ActiveRecord, but I need the unique list before I get start to wrap my head around that.
require 'nokogiri'
require 'open-uri'
require 'active_record'
ARGV[0]="https://www.nku.edu/academics/informatics.html"
ARGV.each do |arg|
open(arg) do |f|
# Display connection data
puts "#"*25 + "\nConnection: '#{arg}'\n" + "#"*25
[:base_uri, :meta, :status, :charset, :content_encoding,
:content_type, :last_modified].each do |method|
puts "#{method.to_s}: #{f.send(method)}" if f.respond_to? method
end
# Display the href links
base_url = /^(.*\.nku\.edu)\//.match(f.base_uri.to_s)[1]
puts "base_url: #{base_url}"
Nokogiri::HTML(f).css('a').each do |anchor|
href = anchor['href']
# Make Unique
if href =~ /.*informatics/
puts href
#store stuff to active record
end
end
end
end
Replace the Nokogiri::HTML part to select only those href attributes that matches with /*.informatics/ and then you can use uniq, as it's already an array:
require 'nokogiri'
require 'open-uri'
require 'active_record'
ARGV[0] = 'https://www.nku.edu/academics/informatics.html'
ARGV.each do |arg|
open(arg) do |f|
puts "#{'#' * 25} \nConnection: '#{arg}'\n #{'#' * 25}"
%i[base_uri meta status charset content_encoding, content_type last_modified].each do |method|
puts "#{method.to_s}: #{f.send(method)}" if f.respond_to? method
end
puts "base_url: #{/^(.*\.nku\.edu)\//.match(f.base_uri.to_s)[1]}"
anchors = Nokogiri::HTML(f).css('a').select { |anchor| anchor['href'] =~ /.*informatics/ }
puts anchors.map { |anchor| anchor['href'] }.uniq
end
end
See output.

Ruby - Getting page content even if it doesn't exist

I am trying to put together a series of custom 404 pages.
require 'uri'
def open(url)
page_content = Net::HTTP.get(URI.parse(url))
puts page_content.content
end
open('http://somesite.com/1ygjah1761')
the following code exits the program with an error. How can I get the page content from a website, regardless of it being 404 or not.
You need to rescue from the error
def open(url)
require 'net/http'
page_content = ""
begin
page_content = Net::HTTP.get(URI.parse(url))
puts page_content
rescue Net::HTTPNotFound
puts "THIS IS 404" + page_content
end
end
You can find more information on something like this here: http://tammersaleh.com/posts/rescuing-net-http-exceptions/
Net::HTTP.get returns the page content directly as a string, so there is no need to call .content on the results:
page_content = Net::HTTP.get(URI.parse(url))
puts page_content

Sinatra: Why is this a syntax error? [duplicate]

This question already has answers here:
An irritating Issue about ruby hashes
(3 answers)
Closed 7 years ago.
I am creating a CRUD API using Ruby, Sinatra, and MongoDB. I keep getting the following error message and I can't figure out what I've done wrong:
SyntaxError: /yasi.rb:6: syntax error, unexpected =>, expecting '}' Yasi.connect {:server => "localhost", :db => "yasi"} ^ /yasi.rb:6: syntax error, unexpected ',', expecting '}' Yasi.connect {:server => "localhost", :db => "yasi"} ^
Here is what my code looks like:
require 'rubygems'
require 'sinatra'
require 'lib/yasi'
before do
Yasi.connect {:server => "localhost", :db => "yasi"}
end
get "/" do
#yasis = Yasi.find :all
erb :index
end
get "/new" do
erb :new
end
get "/delete/:id" do
Yasi.delete(params[:id])
redirect "/"
end
post "/" do
params.reject! {|k,v| k == "submit"}
Yasi.save(params)
redirect "/"
end
Here is the lib/yasi file:
require 'rubygems'
require 'mongo'
require 'sinatra'
module Yasi
class << self
def connect(config)
#db = Mongo::Connection.new(config[:server],config[:port] || 27017).db(config[:db])
end
def find(search)
if search == :all
#return all
yasi = #db.collection("yasis").find.to_a
return nil_or_array(yasi)
else
return find_with_criteria(search)
end
end
def save(yasi)
stringify_keys(yasi)
#handle author first
if yasi["author"]
stringify_keys(yasi["author"])
author = #db.collection("authors").find_one(yasi["author"])
unless author
author = #db.collection("authors").save(yasi["author"])
end
yasi["author"] = author
end
#db.collection("yasis").save(yasi)
end
def delete(id)
victim = #db.collection("yasis").find_one(Mongo::ObjectID.from_stringid))
#db.collection("yasis").remove(victim) if victim
end
private
def find_with_criteria(search)
stringify_keys(search)
if search["author"]
author = #db.collection("authors").find_one stringify_keys(search["author"])
if author
search[:author] = author
yasi = #db.collection("yasis").find(search).to_a
return nil_or_array yasi
else
nil
end
else
yasi = #db.collection("yasis").find(search).to_a
return nil_or_array(yasi)
end
end
def stringify_keys(hash)
hash.each_key do |key|
hash[key.to_s] = hash.delete(key)
end
hash
end
def nil_or_array(result)
if result.size == 0
return nil
else
return result
end
end
end
end
Couple things. Make sure you're accessing yasi in the correct path. Also, pass a variable referencing your hash as config to Yasi.connect like below:
require './lib/yasi'
before do
config = { :server => "localhost", :db => "yasi" }
Yasi.connect config
end
That should handle the error you've posted about, but I would also double check your indentation in the yasi file to make it easier to troubleshoot.

How do you open StringIO in Ruby?

I have a Sinatra application with the following main.rb:
require 'bundler'
Bundler.require
get '/' do
##p = Pry.new
haml :index
end
post '/' do
code = params[:code]
$stdout = StringIO.new
##p.eval(code)
output = $stdout.string
$stdout = STDOUT
output_arr = []
output.each_line('\n') { |line| output_arr << line }
output_arr[1]
binding.pry
end
When I hit the binding.pry at the bottom to see if output contains any output, it seems like the IO stream is not closed, as I can't get anything to show up in the console.
However if I try to call open on StringIO.new, I receive an NoMethodError - private method 'open' called.
I am requiring 'stringio' in a config.ru file, and I've also tried requiring it in the main.rb file:
config.ru:
require 'stringio'
require './main'
run Sinatra::Application
I'm not sure if this is related but something interesting that I've noticed is that, in irb, if I require 'pry' before requiring stringio, then it returns false, otherwise it returns true.
This makes me wonder if Sinatra is including Pry from my Gemfile before loading the config.ru. Could that be the problem? Not sure how to solve this.

syntax error, unexpected end-of-input, expecting keyword_end

I'm trying to write a simple program to parse JSON from the results of an API call. Very new to ruby and just can't figure this one out.
Here's all the code:
require "rubygems"
require "json"
require "net/http"
require "uri"
uri = URI.parse("http://api.chartbeat.com/live/recent/v3/?apikey=eaafffb9a735796b6edd50fd31eaab69&host=enactus.org")
http = Net::HTTP.new(uri.host, uri.port)
request = Net::HTTP::Get.new(uri.request_uri)
response = http.request(request)
if response.code == "200"
result = JSON.parse(response.body)
result.each do |doc|
puts doc["id"] #reference properties like this
puts doc # this is the result in object form
puts ""
puts ""
end
else
puts "ERROR!!!"
end
Here's the output of running the program (chartbeat.rb):
chartbeat.rb:14: syntax error, unexpected end-of-input, expecting keyword_end
The program comes verbatim from here with the url replaced: https://gist.github.com/timsavery/1657351
It doesn't look like what you're doing is taking advantage of any of Net::HTTPs power, so I'd probably do it like this:
require "rubygems"
require "json"
require "open-uri"
response = open("http://api.chartbeat.com/live/recent/v3/?apikey=eaafffb9a735796b6edd50fd31eaab69&host=enactus.org").read
result = JSON.parse(response)
result.each do |doc|
puts doc["id"] #reference properties like this
puts doc # this is the result in object form
puts ""
puts ""
end
OpenURI is the basis of a lot of code that hits URLs, and is a great starting place.
If you want to trap exceptions raised, use something like:
begin
response = open("http://api.chartbeat.com/live/recent/v3/?apikey=eaafffb9a735796b6edd50fd31eaab69&host=enactus.org").read
rescue Exception => e
puts e.message
exit
end
It could even be reduced to:
require "rubygems"
require "json"
require "open-uri"
JSON[
open("http://api.chartbeat.com/live/recent/v3/?apikey=eaafffb9a735796b6edd50fd31eaab69&host=enactus.org").read
].each do |doc|
puts doc["id"] #reference properties like this
puts doc # this is the result in object form
puts ""
puts ""
end
But that might be too drastic.

Resources