post '/uploadZIP' do
  zipFile = params['zip'][:filename]
  if zipFile.end_with? '.zip'
    File.open(params['zip'][:filename], 'w') do |f|
      f.write(params['zip'][:tempfile].read)
    end
    extract_zip(zipFile, '..\VotersBest\public')
    $websites = get_websites()
    redirect to('/loginTA')
  end
end
# STORE WEBSITES IN ARRAY
def get_websites()
  $directory = "public"
  if Dir.exist? $directory
    siteArray = Array.new
    # go to pwd/dir
    Dir.chdir($directory) { siteArray = Dir.glob("*").select { |x| Dir.exist? x } }
    # go back to parent dir
    siteArray
  end
end
get '/websites' do
  @instance_websites = $websites
  @website_url = $directory + "/" + @instance_websites[0] + "/index.html"
  erb :websites
end
It says "Sinatra doesn't know this ditty", and I know that's because I don't have a GET route for it, but is there any way to get around that? I have websites in different folders and I don't want to write a GET route for each folder path.
The line
@website_url = $directory + "/" + @instance_websites[0] + "/index.html"
gets the error. It says to try
get '/public/student1/index.html' do
  "Hello World"
end
but there could be times when the folder isn't called "student".
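One way around writing a route per folder is a single wildcard route. This is a hedged sketch of mine, not from the original post; it assumes the extracted sites end up under Sinatra's public folder and that each one has an index.html:

get '/websites/*' do
  # params['splat'] holds whatever matched the *, e.g. ["student1"]
  folder = params['splat'].first
  # NOTE: real code should sanitize `folder` to prevent path traversal.
  send_file File.join(settings.public_folder, folder, 'index.html')
end

With this in place, /websites/student1 works no matter what the folder is named, because the folder name comes from the request path rather than from a hard-coded route.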
I'm new to Ruby and am using Nokogiri to parse html webpages. An error is thrown in a function when it gets to the line:
currentPage = Nokogiri::HTML(open(url))
I have verified the inputs of the function; url is a string with a web address. The line I previously mentioned works exactly as intended when used outside of the function, but not inside it. When it gets to that line inside the function, the following error is thrown:
WebCrawler.rb:25:in `explore': undefined method `+@' for #<Nokogiri::HTML::Document:0x007f97ea0cdf30> (NoMethodError)
from WebCrawler.rb:43:in `<main>'
The function containing the problematic line is pasted below.
def explore(url)
  if CRAWLED_PAGES_COUNTER > CRAWLED_PAGES_LIMIT
    return
  end
  CRAWLED_PAGES_COUNTER++
  currentPage = Nokogiri::HTML(open(url))
  links = currentPage.xpath('//@href').map(&:value)
  eval_page(currentPage)
  links.each do |link|
    puts link
    explore(link)
  end
end
Here is the full program (It's not much longer):
require 'nokogiri'
require 'open-uri'

# Crawler Params
START_URL = "https://en.wikipedia.org"
CRAWLED_PAGES_COUNTER = 0
CRAWLED_PAGES_LIMIT = 5

# Crawler Functions
def explore(url)
  if CRAWLED_PAGES_COUNTER > CRAWLED_PAGES_LIMIT
    return
  end
  CRAWLED_PAGES_COUNTER++
  currentPage = Nokogiri::HTML(open(url))
  links = currentPage.xpath('//@href').map(&:value)
  eval_page(currentPage)
  links.each do |link|
    puts link
    explore(link)
  end
end

def eval_page(page)
  puts page.title
end

# Start Crawling
explore(START_URL)
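The corrected program below makes two changes. As an aside on the first one (my note, not spelled out in the original answer): Ruby has no ++ operator, so the trailing + is read as a binary operator that continues onto the next line, and the two statements collapse into one expression:

# What the parser actually sees for the two adjacent lines
#   CRAWLED_PAGES_COUNTER++
#   currentPage = Nokogiri::HTML(open(url))
# is effectively:
#   CRAWLED_PAGES_COUNTER + +(currentPage = Nokogiri::HTML(open(url)))
# The unary +@ method is then called on the Nokogiri::HTML::Document returned
# by the assignment, which does not define it, hence the NoMethodError.
counter = 0
counter += 1   # the idiomatic Ruby increment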
require 'nokogiri'
require 'open-uri'

# Crawler Params
$START_URL = "https://en.wikipedia.org"
$CRAWLED_PAGES_COUNTER = 0
$CRAWLED_PAGES_LIMIT = 5

# Crawler Functions
def explore(url)
  if $CRAWLED_PAGES_COUNTER > $CRAWLED_PAGES_LIMIT
    return
  end
  $CRAWLED_PAGES_COUNTER += 1
  currentPage = Nokogiri::HTML(open(url))
  links = currentPage.xpath('//@href').map(&:value)
  eval_page(currentPage)
  links.each do |link|
    puts link
    explore(link)
  end
end

def eval_page(page)
  puts page.title
end

# Start Crawling
explore($START_URL)
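The second change, switching the constants to $ globals, is also necessary (again my note, not from the original answer): assigning to a constant inside a method is a SyntaxError in Ruby, so even CRAWLED_PAGES_COUNTER += 1 would not parse inside explore.

# def explore(url)
#   CRAWLED_PAGES_COUNTER += 1   # => SyntaxError: dynamic constant assignment
# end

$crawled_pages_counter = 0
def bump_counter
  $crawled_pages_counter += 1    # globals can be reassigned inside methods
end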
Just to give you something to build from, this is a simple spider that only harvests and visits links. Modifying it to do other things would be easy.
require 'nokogiri'
require 'open-uri'
require 'set'

BASE_URL = 'http://example.com'
URL_FORMAT = '%s://%s:%s'
SLEEP_TIME = 30 # in seconds

urls = [BASE_URL]
last_host = BASE_URL
visited_urls = Set.new
visited_hosts = Set.new

until urls.empty?
  this_uri = URI.join(last_host, urls.shift)
  next if visited_urls.include?(this_uri)

  puts "Scanning: #{this_uri}"

  doc = Nokogiri::HTML(this_uri.open)
  visited_urls << this_uri

  if visited_hosts.include?(this_uri.host)
    puts "Sleeping #{SLEEP_TIME} seconds to reduce server load..."
    sleep SLEEP_TIME
  end
  visited_hosts << this_uri.host

  urls += doc.search('[href]').map { |node|
    node['href']
  }.select { |url|
    extension = File.extname(URI.parse(url).path)
    extension[/\.html?$/] || extension.empty?
  }

  last_host = URL_FORMAT % [:scheme, :host, :port].map { |s| this_uri.send(s) }
  puts "#{urls.size} URLs remain."
end
It:
Works on http://example.com. That site is designed and designated for experimenting.
Checks to see if a page was visited previously and won't scan it again. It's a naive check and will be fooled by URLs containing queries or queries that are not in a consistent order (a sketch of a stricter check follows after this list).
Checks to see if a site was previously visited and automatically throttles the page retrieval if so. It could be fooled by aliases.
Checks to see if a page ends with ".htm", ".html" or has no extension. Anything else is ignored.
The actual code to write an industrial-strength spider is much more involved. Robots.txt files need to be honored, figuring out how to deal with pages that redirect to other pages either via HTTP timeouts or JavaScript redirects is a fun task, and dealing with malformed pages is a challenge...
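Regarding the naive visited-page check called out above, one way to tighten it is to normalize each URI before storing or comparing it. This is a hedged sketch of mine, not part of the original answer:

require 'uri'

# Hypothetical helper: canonicalize a URI so that query order and a missing
# path don't make the same page look like two different URLs.
def normalize(uri)
  uri = uri.dup
  uri.query = URI.encode_www_form(URI.decode_www_form(uri.query).sort) if uri.query
  uri.path = '/' if uri.path.empty?
  uri
end

# In the loop above this would become, for example:
#   next if visited_urls.include?(normalize(this_uri))
#   visited_urls << normalize(this_uri)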
Thanks for clicking on this post. I have a dashboard powered by Dashing on my RPi 3, where I have added a widget for my IP camera (D-link DSC 4201).
Camera-widget Link
Here is the Ruby job that fetches the snapshot.jpeg from my camera:
require 'net/http'

@cameraDelay = 1 # Needed for image sync.
@fetchNewImageEvery = '3s'
@camera1Host = "192.168.0.20" ## CHANGE
@camera1Port = "80" ## CHANGE
@camera1Username = 'admin' ## CHANGE
@camera1Password = '*****'
@camera1URL = "/dms?nowprofileid=1&"
@newFile1 = "assets/images/cameras/snapshot1_new.jpg"
@oldFile1 = "assets/images/cameras/snapshot1_old.jpg"

def fetch_image(host, old_file, new_file, cam_user, cam_pass, cam_url)
  `rm #{old_file}`
  `mv #{new_file} #{old_file}`
  Net::HTTP.start(host) do |http|
    req = Net::HTTP::Get.new(cam_url)
    if cam_user != "None" ## if username for any particular camera is set to 'None' then assume auth not required.
      req.basic_auth cam_user, cam_pass
    end
    response = http.request(req)
    open(new_file, "wb") do |file|
      file.write(response.body)
    end
  end
  new_file
end

def make_web_friendly(file)
  "/" + File.basename(File.dirname(file)) + "/" + File.basename(file)
end

SCHEDULER.every @fetchNewImageEvery do
  new_file1 = fetch_image(@camera1Host, @oldFile1, @newFile1, @camera1Username, @camera1Password, @camera1URL)
  if not File.exists?(@newFile1)
    warn "Failed to Get Camera Image"
  end
  send_event('camera1', image: make_web_friendly(@oldFile1))
  sleep(@cameraDelay)
  send_event('camera1', image: make_web_friendly(new_file1))
end
At the moment the job only displays the first two images (@oldFile1 and @newFile1) and then gets stuck in a loop where only those first two captured images are shown on my Dashing dashboard.
I looked in /assets and I can see my two snapshot .jpg files being refreshed in real time, just as the job is supposed to do, but the Dashing dashboard doesn't display them.
So why doesn't the dashboard pick up the refreshed image?
I just ran into this issue today and ended up rewriting things a bit. I'm pretty sure this is a browser-caching issue, since the file name stays the same. Appending the datetime to the file name and pushing that seemed to work.
require 'net/http'
require 'open-uri'

@url = 'http://172.1.1.16/image.jpg'

SCHEDULER.every '4s', :first_in => 0 do |job|
  `find '/dashing/f12dash/assets/images/cameras/' -type f -mmin +1 -print0 | xargs -0 rm -f`
  @currentTime = Time.now.strftime("%Y-%m-%d_%H-%M-%S")
  @newFile1 = "assets/images/cameras/snapshot-" + @currentTime + "_new.jpeg"
  open(@url, :http_basic_authentication => ['root', 'CamPW']) do |f|
    open(@newFile1, 'wb') do |file|
      file.puts f.read
    end
  end
  send_event('camera1', image: ("/" + File.basename(File.dirname(@newFile1)) + "/" + File.basename(@newFile1)))
end
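A note on the design choice, as an addition of mine rather than part of the original answer: the new file name works because the image URL changes on every update, so the browser cannot reuse its cached copy. If you would rather keep a fixed file name, a cache-busting query string achieves the same effect, assuming the widget uses the image URL it is given verbatim:

# Hypothetical variant: keep overwriting the same snapshot file, but append a
# timestamp query parameter so each update looks like a new URL to the browser.
send_event('camera1', image: "/cameras/snapshot1_new.jpg?t=#{Time.now.to_i}")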
I'm doing a scraper to download all the issues of The Exile available at http://exile.ru/archive/list.php?IBLOCK_ID=35&PARAMS=ISSUE.
So far, my code is like this:
require 'rubygems'
require 'open-uri'

DATA_DIR = "exile"
Dir.mkdir(DATA_DIR) unless File.exists?(DATA_DIR)

BASE_exile_URL = "http://exile.ru/docs/pdf/issues/exile"

for number in 120..290
  numero = BASE_exile_URL + number.to_s + ".pdf"
  puts "Downloading issue #{number}"
  open(numero) { |f|
    File.open("#{DATA_DIR}/#{number}.pdf", 'w') do |file|
      file.puts f.read
    end
  }
end

puts "done"
The thing is, a lot of the issue links are down, and the code creates a PDF for every issue, so for missing issues it leaves behind an empty PDF. How can I change the code so that it only creates and copies a file if the link exists?
require 'open-uri'

DATA_DIR = "exile"
Dir.mkdir(DATA_DIR) unless File.exists?(DATA_DIR)

url_template = "http://exile.ru/docs/pdf/issues/exile%d.pdf"
filename_template = "#{DATA_DIR}/%d.pdf"

(120..290).each do |number|
  pdf_url = url_template % number
  print "Downloading issue #{number}"

  # Opening the URL downloads the remote file.
  open(pdf_url) do |pdf_in|
    if pdf_in.read(4) == '%PDF'
      pdf_in.rewind
      File.open(filename_template % number, 'w') do |pdf_out|
        pdf_out.write(pdf_in.read)
      end
      print " OK\n"
    else
      print " #{pdf_url} is not a PDF\n"
    end
  end
end

puts "done"
open(url) downloads the file and provides a handle to a local temp file. A PDF starts with '%PDF'. After reading the first 4 characters, if the file is a PDF, the file pointer has to be put back to the beginning to capture the whole file when writing a local copy.
You can use this code to check whether the PDF exists:
require 'net/http'

def exist_the_pdf?(url_pdf)
  url = URI.parse(url_pdf)
  Net::HTTP.start(url.host, url.port) do |http|
    # Issue a HEAD request; the block's value (a boolean) is returned by start.
    http.request_head(url.path)['content-type'] == 'application/pdf'
  end
end
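For example, the check could be dropped into the download loop like this. A sketch of mine, assuming the method returns the boolean as above and that missing issues are not served with a PDF content type:

require 'open-uri'

DATA_DIR = "exile"
Dir.mkdir(DATA_DIR) unless File.exists?(DATA_DIR)

(120..290).each do |number|
  pdf_url = "http://exile.ru/docs/pdf/issues/exile#{number}.pdf"
  next unless exist_the_pdf?(pdf_url)   # skip issues whose link is dead
  File.open("#{DATA_DIR}/#{number}.pdf", 'w') do |file|
    file.write(open(pdf_url).read)
  end
end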
Try this:
require 'rubygems'
require 'open-uri'

DATA_DIR = "exile"
Dir.mkdir(DATA_DIR) unless File.exists?(DATA_DIR)

BASE_exile_URL = "http://exile.ru/docs/pdf/issues/exile"

for number in 120..290
  numero = BASE_exile_URL + number.to_s + ".pdf"
  open(numero) { |f|
    content = f.read
    if content.include? "Link is missing"
      puts "Issue #{number} doesnt exists"
    else
      puts "Issue #{number} exists"
      File.open("./#{number}.pdf", 'w') do |file|
        file.write(content)
      end
    end
  }
end

puts "done"
The main thing I added is a check for the string "Link is missing". I wanted to do it using HTTP status codes, but the server always gives a 200 back, which is not best practice.
The thing to note is that with my code you always download the whole page just to look for that string, but I don't have another idea to fix that at the moment.
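A hedged alternative I'd add here, not part of the original answer: open-uri exposes the response's Content-Type on the handle it yields, so the check can look at that header instead of scanning the page text. This still downloads the response, and it assumes the missing-issue page is served as HTML rather than as a PDF:

# Inside the same `for number` loop as above:
open(numero) { |f|
  if f.content_type == 'application/pdf'   # OpenURI::Meta#content_type
    File.open("./#{number}.pdf", 'w') do |file|
      file.write(f.read)
    end
  else
    puts "Issue #{number} looks like an HTML error page, skipping"
  end
}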
I have this simple script to show all files in a folder. It works in the console but gives a different result in Sinatra (with path and extension). Why is this, and how can I best present these basenames (without path and extension) in a ul list, as links that open each file in the browser using Sinatra?
The goal is to present a clickable list of pages to open if no filename is given. I already have the routine to show the files.
console:
require 'find'

def get_files path
  dir_array = Array.new
  Find.find(path) do |f|
    dir_array << f if !File.directory?(f) # add only non-directories
  end
  return dir_array
end

for filename in get_files 'c:/sinatra_wiki/views'
  basename = File.basename(filename, ".*")
  puts basename
end
=> index
index2
Sinatra:
require 'find'

def get_files path
  dir_array = Array.new
  Find.find(path) do |f|
    dir_array << f if !File.directory?(f) # add only non-directories
  end
  return dir_array
end

get '/' do
  for filename in get_files 'c:/sinatra_wiki/views'
    basename = File.basename(filename, ".*")
    puts basename
  end
end
=> c:/sinatra_wiki/views/index.htmlc:/sinatra_wiki/views/index2.erb
In your Sinatra implementation, the result you see in the browser is not the output of the puts basename statement in the get block; it's the return value of the block, i.e. the file list from get_files. Try puts "<p>#{basename}</p>" instead of puts basename in the get block and see for yourself.
Some changes:
The get_files method: Instead of sending the entire file path, send only the file name
dir_array << File.basename(f, ".*")
Add a view in case you need clarity:
get '/' do
  @arr = get_files(the_path)
  erb :index
end
Elsewhere, in the app/views folder, in an index.erb file:
<h2>Page list</h2>
<ul>
  <% @arr.each do |page| %>
    <li><%= page %></li>
  <% end %>
</ul>
This is to list out the file names in a similar way to that of the console output.
TL;DR: Put the looping part in the view!
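To make each list entry a clickable link, here is a possible end-to-end sketch of mine, with a hypothetical views path; it is not part of the original answer. The index view would emit <li><a href="/<%= page %>"><%= page %></a></li> for each basename, and a catch-all route renders the chosen page:

require 'sinatra'
require 'find'

VIEWS_PATH = 'c:/sinatra_wiki/views'   # hypothetical location of the pages

get '/' do
  @arr = []
  Find.find(VIEWS_PATH) do |f|
    @arr << File.basename(f, ".*") unless File.directory?(f)
  end
  erb :index
end

get '/:page' do
  # Assumes every listed basename corresponds to an .erb view of the same name.
  # Real code should whitelist params['page'] before rendering it.
  erb params['page'].to_sym
end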
I have the code:
require 'open-uri'

print "Enter a URL: "
add = gets
added = add.sub!(/http:\/\//, "")
puts "Info from: #{add}"

open("#{add}") do |f|
  img = f.read.scan(/<img/)
  img = img.length
  puts "\t#{img} images"
  f.close
end

open("#{add}") do |f|
  links = f.read.scan(/<a/)
  links = links.length
  puts "\t#{links} links"
  f.close
end

open("#{add}") do |f|
  div = f.read.scan(/<div/)
  div = div.length
  puts "\t#{div} div tags"
  f.close
end
(Yes I know it isn't good code, don't comment about it please)
When I run it and enter, say, the following URL:
http://stackoverflow.com
I get the following error:
/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:32:in `initialize': No such file or directory - http (Errno::ENOENT)
from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:32:in `open_uri_original_open'
from /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/open-uri.rb:32:in `open'
Why does this error come up and how can I fix it?
The String.sub! method replaces the string in place, so add.sub!(/http:\/\//, "") changes the value of add in addition to setting added.
To use the open(name) method with URIs, the value of name must start with a URI scheme, like http://.
If you want to set added without modifying add, do it like this:
added = add.sub(/http:\/\//, "")
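Putting that together, a minimal corrected version of the top of the script might look like this. It is a sketch under the assumption that you still want both the full URL (for open-uri) and a scheme-less copy (for display):

require 'open-uri'

print "Enter a URL: "
add = gets.chomp                    # also strip the trailing newline from gets
added = add.sub(/http:\/\//, "")    # non-destructive: add keeps its http:// scheme
puts "Info from: #{added}"

open(add) do |f|                    # open-uri needs the scheme to treat this as a URL
  html = f.read
  puts "\t#{html.scan(/<img/).length} images"
  puts "\t#{html.scan(/<a/).length} links"
  puts "\t#{html.scan(/<div/).length} div tags"
end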