EventMachine and the Twitter streaming API - Ruby

I am running an EventMachine process that consumes the Twitter streaming API. I always run into an issue when content arrives on the stream infrequently.
Here is a minimal version of the script:
require 'rubygems'
require 'eventmachine'
require 'em-http'
require 'json'

usage = "#{$0} <user> <password> <track>"
abort usage unless user = ARGV.shift
abort usage unless password = ARGV.shift
abort usage unless keywords = ARGV.shift

def startIt(user, password, keywords)
  EventMachine.run do
    http = EventMachine::HttpRequest.new("https://stream.twitter.com/1/statuses/filter.json", :port => 443).post(
      :head      => { 'Authorization' => [user, password] },
      :body      => { "track" => keywords },
      :keepalive => true,
      :timeout   => -1)
    buffer = ""
    http.stream do |chunk|
      buffer += chunk
      # The stream is newline-delimited JSON: cut complete lines off the buffer.
      while line = buffer.slice!(/.+\r?\n/)
        if line.length > 5 # skip keep-alive newlines
          tweet = JSON.parse(line)
          puts Time.new.to_s + "#{tweet['user']['screen_name']}: #{tweet['text']}"
        end
      end
    end
    http.errback {
      puts Time.new.to_s + "Error: "
      puts http.error
    }
  end
rescue => error
  puts "error rescue " + error.to_s
end

while true
  startIt user, password, keywords
end
If I search for a popular keyword like "iphone", everything works well.
If I search for a less frequently used keyword, my stream keeps getting closed very rapidly, around 20 seconds after the last message.
Note that http.error is always empty, so it's very hard to understand why the stream is closed...
On the other hand, the nearly identical PHP version does not get closed, so it's probably an issue with EventMachine/em-http, but I don't understand which one...

You should add settings to prevent your connection from timing out.
Try this:
http = EventMachine::HttpRequest.new(
  "https://stream.twitter.com/1/statuses/filter.json",
  :connection_timeout => 0,
  :inactivity_timeout => 0
).post(
  :head => { 'Authorization' => [user, password] },
  :body => { 'track' => keywords }
)
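If the stream still drops after quiet periods, another option is to reconnect from inside the reactor instead of wrapping EventMachine.run in a while true loop. Here is a rough, untested sketch of that idea (the handle_chunk helper is hypothetical and stands in for the buffer-parsing code in the question):

def start_stream(user, password, keywords)
  http = EventMachine::HttpRequest.new(
    "https://stream.twitter.com/1/statuses/filter.json",
    :inactivity_timeout => 0
  ).post(
    :head => { 'Authorization' => [user, password] },
    :body => { 'track' => keywords }
  )
  # handle_chunk would do the same line buffering as the original script.
  http.stream { |chunk| handle_chunk(chunk) }
  # Whether the request errors out or simply finishes, wait 5 seconds and
  # retry without tearing down the reactor.
  http.errback  { EM.add_timer(5) { start_stream(user, password, keywords) } }
  http.callback { EM.add_timer(5) { start_stream(user, password, keywords) } }
end

EventMachine.run { start_stream(user, password, keywords) }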
Good luck,
Christian

Related

How to properly implement Net::SSH port forwards

I have been trying to get port forwarding to work correctly with Net::SSH. From what I understand I need to fork out the Net::SSH session if I want to be able to use it from the same Ruby program so that the event handling loop can actually process packets being sent through the connection. However, this results in the ugliness you can see in the following:
#!/usr/bin/env ruby -w
require 'net/ssh'
require 'httparty'
require 'socket'
require 'logger'

include Process

log = Logger.new(STDOUT)
log.level = Logger::DEBUG

local_port = 2006
child_socket, parent_socket = Socket.pair(:UNIX, :DGRAM, 0)
maxlen = 1000
hostname = "www.example.com"

pid = fork do
  parent_socket.close
  Net::SSH.start("hostname", "username") do |session|
    session.logger = log
    session.logger.sev_threshold = Logger::Severity::DEBUG
    session.forward.local(local_port, hostname, 80)
    child_socket.send("ready", 0)
    pidi = fork do
      msg = child_socket.recv(maxlen)
      puts "Message from parent was: #{msg}"
      exit
    end
    session.loop do
      status = waitpid(pidi, Process::WNOHANG)
      puts "Status: #{status.inspect}"
      status.nil?
    end
  end
end

child_socket.close
puts "Message from child: #{parent_socket.recv(maxlen)}"
resp = HTTParty.post("http://localhost:#{local_port}/", :headers => { "Host" => hostname })
# The write cannot be the last statement, otherwise the child pid could end up
# not receiving it.
parent_socket.write("done")
puts resp.inspect
Can anybody show me a more elegant/better working solution to this?
I spent a lot of time trying to figure out how to correctly implement port forwarding, then I took inspiration from the net/ssh/gateway library. I needed a robust solution that works after various possible connection errors. This is what I'm using now; hope it helps:
require 'net/ssh'

ssh_options = ['host', 'login', :password => 'password']
tunnel_port = 2222

begin
  run_tunnel_thread = true
  tunnel_mutex = Mutex.new

  ssh = Net::SSH.start *ssh_options
  tunnel_thread = Thread.new do
    begin
      while run_tunnel_thread do
        tunnel_mutex.synchronize { ssh.process 0.01 }
        Thread.pass
      end
    rescue => exc
      puts "tunnel thread error: #{exc.message}"
    end
  end

  tunnel_mutex.synchronize do
    ssh.forward.local tunnel_port, 'tunnel_host', 22
  end

  begin
    ssh_tunnel = Net::SSH.start 'localhost', 'tunnel_login', :password => 'tunnel_password', :port => tunnel_port
    puts ssh_tunnel.exec! 'date'
  rescue => exc
    puts "tunnel connection error: #{exc.message}"
  ensure
    ssh_tunnel.close if ssh_tunnel
  end

  tunnel_mutex.synchronize do
    ssh.forward.cancel_local tunnel_port
  end
rescue => exc
  puts "tunnel error: #{exc.message}"
ensure
  run_tunnel_thread = false
  tunnel_thread.join if tunnel_thread
  ssh.close if ssh
end
That's just how SSH in general is. If you're offended by how ugly it looks, you should probably wrap up that functionality into a port forwarding class of some sort so that the exposed part is a lot more succinct. An interface like this, perhaps:
forwarder = PortForwarder.new(8080, 'remote.host', 80)
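A bare-bones, untested sketch of that interface, reusing the fork-and-socketpair pattern from the question (the SSH host and user are extra constructor arguments here, beyond the three in the suggested call):

require 'net/ssh'
require 'socket'

class PortForwarder
  MAXLEN = 1000

  def initialize(local_port, remote_host, remote_port, ssh_host, ssh_user)
    @child_socket, @parent_socket = Socket.pair(:UNIX, :DGRAM, 0)
    @pid = fork do
      @parent_socket.close
      Net::SSH.start(ssh_host, ssh_user) do |session|
        session.forward.local(local_port, remote_host, remote_port)
        @child_socket.send("ready", 0)
        # Serve forwarded connections until the parent writes anything back.
        session.loop { @child_socket.read_nonblock(MAXLEN).nil? rescue true }
      end
    end
    @child_socket.close
    @parent_socket.recv(MAXLEN) # block until the child reports the tunnel is up
  end

  def close
    @parent_socket.write("done")
    Process.wait(@pid)
  end
end

forwarder = PortForwarder.new(8080, 'remote.host', 80, 'ssh-host', 'username')
# ... issue requests against localhost:8080 here ...
forwarder.close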
So I have found a slightly better implementation. It only requires a single fork but still uses a socket for the communication. It uses IO#read_nonblock to check whether a message is ready. If there isn't one, the method raises an exception, in which case the block keeps returning true and the SSH session keeps serving requests. Once the parent is done with the connection, it sends a message, which makes child_socket.read_nonblock(maxlen).nil? return false, making the loop exit and therefore shutting down the SSH connection.
I feel a little better about this, so between that and #tadman's suggestion to wrap it in a port forwarding class I think it's about as good as it'll get. However, any further suggestions for improving this are most welcome.
#!/usr/bin/env ruby -w
require 'net/ssh'
require 'httparty'
require 'socket'
require 'logger'

log = Logger.new(STDOUT)
log.level = Logger::DEBUG

local_port = 2006
child_socket, parent_socket = Socket.pair(:UNIX, :DGRAM, 0)
maxlen = 1000
hostname = "www.example.com"

pid = fork do
  parent_socket.close
  Net::SSH.start("ssh-tunnel-hostname", "username") do |session|
    session.logger = log
    session.logger.sev_threshold = Logger::Severity::DEBUG
    session.forward.local(local_port, hostname, 80)
    child_socket.send("ready", 0)
    session.loop { child_socket.read_nonblock(maxlen).nil? rescue true }
  end
end

child_socket.close
puts "Message from child: #{parent_socket.recv(maxlen)}"
resp = HTTParty.post("http://localhost:#{local_port}/", :headers => { "Host" => hostname })
# The write cannot be the last statement, otherwise the child pid could end up
# not receiving it.
parent_socket.write("done")
puts resp.inspect

Understanding Ruby code?

I was wondering if anyone could help me understand the Ruby code below? I'm pretty new to Ruby programming and am having trouble understanding what each part does.
When I run this with my Twitter username and password as parameters, I get a stream of Twitter feed samples. What do I need to do with this code to display only the hashtags?
I'm trying to gather the hashtags every 30 seconds, then sort them from least to most occurrences.
Not looking for solutions, but for ideas. Thanks!
require 'eventmachine'
require 'em-http'
require 'json'

usage = "#{$0} <user> <password>"
abort usage unless user = ARGV.shift
abort usage unless password = ARGV.shift

url = 'https://stream.twitter.com/1/statuses/sample.json'

def handle_tweet(tweet)
  return unless tweet['text']
  puts "#{tweet['user']['screen_name']}: #{tweet['text']}"
end

EventMachine.run do
  http = EventMachine::HttpRequest.new(url).get :head => { 'Authorization' => [user, password] }
  buffer = ""
  http.stream do |chunk|
    buffer += chunk
    while line = buffer.slice!(/.+\r?\n/)
      handle_tweet JSON.parse(line)
    end
  end
end
puts "#{tweet['user']['screen_name']}: #{tweet['text']}"
That line shows you a user name followed by the content of the tweet.
Let's take a step back for a sec.
Hash tags appear inside the tweet's content--this means they're inside tweet['text']. A hash tag always takes the form of a # followed by a bunch of non-space characters. That's really easy to grab with a regex. Ruby's core API facilitates that via String#scan. Example:
"twitter is short #foo yawn #bar".scan(/\#\w+/) # => ["#foo", "#bar"]
What you want is something like this:
def handle_tweet(tweet)
  return unless tweet['text']
  # puts "#{tweet['user']['screen_name']}: #{tweet['text']}" # OLD
  puts tweet['text'].scan(/\#\w+/).to_s
end
tweet['text'].scan(/#\w+/) is an array of strings. You can do whatever you want with that array. Supposing you're new to Ruby and want to print the hash tags to the console, here's a brief note about printing arrays with puts:
puts array # => "#foo\n#bar"
puts array.to_s # => '["#foo", "#bar"]'
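Since the question also asks about collecting tags every 30 seconds: one idea (just a sketch of the approach, since you asked for ideas rather than solutions) is to tally the tags into a hash inside the stream callback, and let an EventMachine::PeriodicTimer flush and sort the tally inside the same reactor:

# Assumes this runs inside the EventMachine.run block from the script above.
counts = Hash.new(0)

http.stream do |chunk|
  buffer += chunk
  while line = buffer.slice!(/.+\r?\n/)
    tweet = JSON.parse(line)
    next unless tweet['text']
    tweet['text'].scan(/#\w+/) { |tag| counts[tag] += 1 }
  end
end

# Every 30 seconds, print the tags sorted from least to most frequent, then reset.
EventMachine::PeriodicTimer.new(30) do
  counts.sort_by { |tag, n| n }.each { |tag, n| puts "#{n}\t#{tag}" }
  counts.clear
end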
Here's a commented version of what the source is doing:

# Load libraries
require 'eventmachine'
require 'em-http'
require 'json'

# Looks like this section assumes you're calling this from the command line.
usage = "#{$0} <user> <password>"        # $0 returns the name of the program
abort usage unless user = ARGV.shift     # take the first argument passed when the program was called
abort usage unless password = ARGV.shift

# The URL
url = 'https://stream.twitter.com/1/statuses/sample.json'

# Method which, when called later, prints out the tweets
def handle_tweet(tweet)
  return unless tweet['text']                               # ensure the tweet object has a 'text' property
  puts "#{tweet['user']['screen_name']}: #{tweet['text']}"  # write the result
end

# Create an HTTP request obj to the URL above with user authorization
EventMachine.run do
  http = EventMachine::HttpRequest.new(url).get :head => { 'Authorization' => [user, password] }

  # Initiate an empty string for the buffer
  buffer = ""

  # Read the stream by line
  http.stream do |chunk|
    buffer += chunk
    while line = buffer.slice!(/.+\r?\n/)  # cut each line at newline
      handle_tweet JSON.parse(line)        # send each tweet object to the handle_tweet method
    end
  end
end
If you just want the hashtags, you'll want to rewrite handle_tweet to something like this:
def handle_tweet(tweet)
  return unless tweet['text']
  tweet['text'].scan(/#\w+/) do |tag|
    puts tag
  end
end

Typhoeus Hydra runs out of memory

I wrote a script that checks URLs from a file (using the Ruby gem Typhoeus). I don't know why, but when I run my code the memory usage grows. The script usually crashes after 10000 URLs.
Is there any solution for it? Thanks in advance for your help.
My code:
require 'rubygems'
require 'typhoeus'
require 'logger'

def run file
  log = Logger.new('log')
  hydra = Typhoeus::Hydra.new(:max_concurrency => 30)
  hydra.disable_memoization
  File.open(file).each do |url|
    begin
      request = Typhoeus::Request.new(url.strip, :method => :get, :follow_location => true)
      request.on_complete do |resp|
        check_website(url, resp.body)
      end
      puts "queuing #{url}"
      hydra.queue(request)
      request.destroy
    rescue Exception => e
      log.error e
    end
  end
  hydra.run
end
One approach might be to adapt your file processing - instead of reading a line from the file and immediately creating the request object, try processing them in batches (say 5000 at a time) and throttle your request rate / memory consumption.
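A minimal sketch of that batching idea, using each_slice to run one Hydra per batch (check_website is the callback from the question; the batch size is arbitrary):

require 'typhoeus'

BATCH_SIZE = 5000

def run_in_batches(file)
  File.foreach(file).each_slice(BATCH_SIZE) do |batch|
    hydra = Typhoeus::Hydra.new(:max_concurrency => 30)
    hydra.disable_memoization
    batch.each do |url|
      request = Typhoeus::Request.new(url.strip, :method => :get, :follow_location => true)
      request.on_complete { |resp| check_website(url, resp.body) }
      hydra.queue(request)
    end
    # Block until this batch completes before building the next one,
    # so at most BATCH_SIZE requests are alive at any time.
    hydra.run
  end
end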
I've made an improvement to my code; as you suggested, I'm feeding the URLs to hydra in batches.
It works with normal memory usage, but I don't know why it just stops getting new URLs after about 1000. This is very strange: no errors, the script is still running, but it doesn't send or receive new requests. My code:
def run file, concurrency
  log = Logger.new('log')
  log.info '*** Hydra started ***'

  queue = []
  File.open(file).each do |uri|
    queue << uri
    if queue.size == concurrency * 5
      hydra = Typhoeus::Hydra.new(:max_concurrency => concurrency)
      hydra.disable_memoization
      queue.each do |url|
        request = Typhoeus::Request.new(url.strip, :method => :get, :follow_location => true, :max_redirections => 2, :timeout => 5000)
        request.on_complete do |resp|
          check_website(url, resp.body)
          puts "#{url} code: #{resp.code} curl_msg #{resp.curl_error_message}"
        end
        puts "queuing #{url}"
        hydra.queue(request)
      end
      puts 'hydra run'
      hydra.run
      queue = []
    end
  end
  # NOTE: any URLs still in `queue` here (fewer than concurrency * 5 of them)
  # are never queued, so the tail of the file is silently skipped.

  log.info '*** Hydra finished work ***'
end

When using eventmachine with sinatra, how can I close one http connection without closing them all?

I have the following code, which queries the Twitter streaming API for a certain string. When I open two tabs with different queries, they both work. However, when I stop one of the tabs, the other tab stops receiving any data, and the server stops processing the stream. How can I fix this?
require 'sinatra'
require 'eventmachine'
require 'em-http'
require 'json'
require 'sinatra/streaming'

enable :logging, :dump_errors, :raise_errors

get '/test/:query' do
  q = params[:query]
  url = "https://stream.twitter.com/1/statuses/filter.json?track=#{q}"
  stream(:keep_open) do |out|
    http = EM::HttpRequest.new(url)
    s = http.get :head => { 'Authorization' => ['USERNAME', 'PASSWORD'] }
    out.callback do
      puts "callback"
      http.conn.close_connection
    end
    out.errback do
      puts "errback"
      http.conn.close_connection
    end
    buffer = ""
    s.stream do |chunk|
      puts "what"
      buffer += chunk
      while line = buffer.slice!(/.+\r?\n/)
        tweet = JSON.parse(line)
        unless tweet.length == 0 or tweet['user'].nil? or out.closed?
          out << "<p><b>#{tweet['user']['screen_name']}</b>: #{tweet['text']}</p>"
        end
      end
    end
  end
end
The problem is that Twitter closes one connection once you open a second one. Try running curl https://USER:PASSWORD@stream.twitter.com/1/statuses/filter.json?track=bar in two terminals at the same time.
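One way around that limit would be to open a single upstream connection per query and fan each tweet out to every connected client, closing the upstream stream only when the last client goes away. Here's a rough, untested sketch of the idea (it pushes raw JSON lines and skips error handling):

subscribers = Hash.new { |hash, query| hash[query] = [] }
streams     = {}

get '/test/:query' do |q|
  stream(:keep_open) do |out|
    subscribers[q] << out
    # Open the upstream connection only for the first subscriber to this query.
    streams[q] ||= EM::HttpRequest.new(
      "https://stream.twitter.com/1/statuses/filter.json?track=#{q}"
    ).get(:head => { 'Authorization' => ['USERNAME', 'PASSWORD'] }).tap do |s|
      buffer = ""
      s.stream do |chunk|
        buffer << chunk
        while line = buffer.slice!(/.+\r?\n/)
          subscribers[q].each { |client| client << line unless client.closed? }
        end
      end
    end
    out.callback do
      subscribers[q].delete(out)
      # Close the upstream connection once nobody is listening anymore.
      streams.delete(q).close if subscribers[q].empty?
    end
  end
end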
Also, while trying to figure out what is wrong, I did a rather small refactoring to improve readability:
require 'sinatra'
require 'sinatra/streaming'
require 'eventmachine'
require 'em-http'
require 'json'

enable :logging, :dump_errors, :raise_errors

template(:tweet) { "<p><b><%= @tweet['user']['screen_name'] %></b>: <%= @tweet['text'] %></p>" }

get '/test/:query' do |q|
  stream(:keep_open) do |out|
    http = EM::HttpRequest.new("https://stream.twitter.com/1/statuses/filter.json?track=#{q}")
    EM.next_tick do
      s = http.get :head => { 'Authorization' => ENV.values_at('USERNAME', 'PASSWORD') }
      s.callback { out.close }
      out.callback { s.close }
      s.errback { out.close }
      out.errback { s.close }
      buffer = ""
      s.stream do |chunk|
        buffer << chunk
        while line = buffer.slice!(/.+\r?\n/)
          break if out.closed?
          @tweet = JSON.parse(line)
          out << erb(:tweet) if @tweet.length > 0 and @tweet['user']
        end
      end
    end
  end
end

Ruby AMQP under Thin HTTP server

I'm running a simple Thin server that publishes some messages to different queues. The code looks like:
require "rubygems"
require "thin"
require "amqp"
require 'msgpack'
app = Proc.new do |env|
params = Rack::Request.new(env).params
command = params['command'].strip rescue "no command"
number = params['number'].strip rescue "no number"
p command
p number
AMQP.start do
if command =~ /\A(create|c|r|register)\z/i
MQ.queue("create").publish(number)
elsif m = (/\A(Answer|a)\s?(\d+|\d+-\d+)\z/i.match(command))
MQ.queue("answers").publish({:number => number,:answer => "answer" }.to_msgpack )
end
end
[200, {'Content-Type' => "text/plain"} , command ]
end
Rack::Handler::Thin.run(app, :Port => 4001)
Now when I run the server and request something like http://0.0.0.0:4001/command=r&number=123123123
I always get duplicate output, something like:
"no command"
"no number"
"no command"
"no number"
The first thing is: why am I getting what look like duplicate requests? Does it have something to do with the browser? When I use curl I don't see the same behavior. And the second thing: why can't I get the params?
Any tips about the best implementation for such a server would be highly appreciated.
Thanks in advance.
The second request comes from the browser looking for the favicon.ico. You can inspect the requests by adding the following code in your handler:
params = Rack::Request.new(env).params
p env # add this line to see the request in your console window
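If the extra request bothers you, you can also short-circuit it before doing any work. A small sketch (the 404 body is arbitrary):

app = Proc.new do |env|
  # Ignore the browser's automatic favicon request.
  if env['PATH_INFO'] == '/favicon.ico'
    [404, { 'Content-Type' => 'text/plain' }, ['not found']]
  else
    # ... handle the real request as before ...
  end
end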
Alternatively you could use Sinatra:
require "rubygems"
require "amqp"
require "msgpack"
require "sinatra"
get '/:command/:number' do
command = params['command'].strip rescue "no command"
number = params['number'].strip rescue "no number"
p command
p number
AMQP.start do
if command =~ /\A(create|c|r|register)\z/i
MQ.queue("create").publish(number)
elsif m = (/\A(Answer|a)\s?(\d+|\d+-\d+)\z/i.match(command))
MQ.queue("answers").publish({:number => number,:answer => "answer" }.to_msgpack )
nd
end
return command
end
and then run ruby the_server.rb at the command line to start the HTTP server.
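Assuming Sinatra's default port of 4567, a request like curl http://localhost:4567/r/123123123 should then print the command and number and publish to the "create" queue.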
