Scrapy image problems

I'm trying to build a spider to grab images. I've got the spider running; it just doesn't download any images, and it doesn't error out either.
Spider:
from urlparse import urljoin
from scrapy.selector import XmlXPathSelector
from scrapy.spider import BaseSpider
from nasa.items import NasaItem

class NasaImagesSpider(BaseSpider):
    name = "nasa.gov"
    start_urls = ('http://www.nasa.gov/multimedia/imagegallery/iotdxml.xml',)

    def parse(self, response):
        xxs = XmlXPathSelector(response)
        item = NasaItem()
        baseLink = xxs.select('//link/text()').extract()[0]
        imageLink = xxs.select('//tn/text()').extract()
        imgList = []
        for img in imageLink:
            imgList.append(urljoin(baseLink, img))
        item['image_urls'] = imgList
        return item
It runs through the page and captures the URLs correctly. I pass the item down the pipeline, but no pictures get downloaded.
The settings file:
BOT_NAME = 'nasa.gov'
BOT_VERSION = '1.0'
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
IMAGE_STORE = '/home/usr1/Scrapy/spiders/nasa/images'
LOG_LEVEL = "DEBUG"
SPIDER_MODULES = ['nasa.spiders']
NEWSPIDER_MODULE = 'nasa.spiders'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
and the items file:
from scrapy.item import Item, Field

class NasaItem(Item):
    image_urls = Field()
    images = Field()
and the output log:
2012-11-12 07:47:28-0500 [scrapy] INFO: Scrapy 0.14.4 started (bot: nasa)
2012-11-12 07:47:29-0500 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, MemoryUsage, SpiderState
2012-11-12 07:47:29-0500 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, RedirectMiddleware, CookiesMiddleware, HttpCompressionMiddleware, ChunkedTransferMiddleware, DownloaderStats
2012-11-12 07:47:29-0500 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2012-11-12 07:47:29-0500 [scrapy] DEBUG: Enabled item pipelines:
2012-11-12 07:47:29-0500 [nasa.gov] INFO: Spider opened
2012-11-12 07:47:29-0500 [nasa.gov] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2012-11-12 07:47:29-0500 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023
2012-11-12 07:47:29-0500 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080
2012-11-12 07:47:29-0500 [nasa.gov] DEBUG: Crawled (200) <GET http://www.nasa.gov/multimedia/imagegallery/iotdxml.xml> (referer: None)
2012-11-12 07:47:29-0500 [nasa.gov] DEBUG: Scraped from <200 http://www.nasa.gov/multimedia/imagegallery/iotdxml.xml>
#removed output of every jpg link
2012-11-12 07:47:29-0500 [nasa.gov] INFO: Closing spider (finished)
2012-11-12 07:47:29-0500 [nasa.gov] INFO: Dumping spider stats:
{'downloader/request_bytes': 227,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 2526,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2012, 11, 12, 12, 47, 29, 802477),
'item_scraped_count': 1,
'scheduler/memory_enqueued': 1,
'start_time': datetime.datetime(2012, 11, 12, 12, 47, 29, 682005)}
2012-11-12 07:47:29-0500 [nasa.gov] INFO: Spider closed (finished)
2012-11-12 07:47:29-0500 [scrapy] INFO: Dumping global stats:
{'memusage/max': 104132608, 'memusage/startup': 104132608}
I'm stuck. Any suggestions as to what I'm doing wrong?
[EDITED] Added output log, changed settings bot name.

# pipeline file
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class PaulsmithPipeline(ImagesPipeline):
    def process_item(self, item, spider):
        return item

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item["image_paths"] = image_paths
        return item
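One thing worth checking against the settings above: Scrapy's images pipeline reads its storage directory from a setting named IMAGES_STORE (plural), and without it the pipeline does not activate, which would explain the empty "Enabled item pipelines:" line in the log. A minimal settings sketch, reusing the path from the question, would be:
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
# the pipeline looks for IMAGES_STORE; with IMAGE_STORE it never activates
IMAGES_STORE = '/home/usr1/Scrapy/spiders/nasa/images'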

Related

Gatling 3.4 bug with timeout after silent check

I am seeing very weird behavior from Gatling with a WebSocket silent check:
The .await(600 seconds)(check) fails with a timeout after a few milliseconds.
First, let me explain my situation. Gatling's WebSocket support does not let me handle ping requests coming from the server, so I have to cheat and invent a fancy protocol. The code with comments is below:
the client may send an INITIAL event or a NON-INITIAL event (one that is not first in a sequence).
the minimum interval between INITIAL events is 3 seconds
each request initiated by the client results in 2 responses: "calculation started" and "calculation result"
a ping request from the server may come at any time; even while we pause between client events, we may still receive a ping request
when we receive a ping request, we must respond.
the usual sequence of events does not depend on the ping, so if a ping arrived, we must wait for one more message
exec(session => dump(session, s"The spin action: event=$eventType oneRound=$oneRound" )).
exec(_.remove(ATTR_PING_SEQ_ID))
  .doIfOrElse("CLIENT_INITIAL_EVENT".equals(eventType)) {
    exec(session => dump(session, s"Sending CLIENT_INITIAL_EVENT and expect 2 or 3 responses. 3 responses mean that one of them is a ping. wait for each response for ${Config.waitForResponseSec} seconds" )).
    exec(
      clientActionBuilder
        // first response: "started calculation"
        .await(Config.waitForResponseSec seconds)(check1)
        // second response: "calculated result"
        .await(Config.waitForResponseSec seconds)(check2)
        // wait for the delay between client initial events.
        // We cannot just wait because PING may come within this time and we must handle it!
        // Most probably the ping will not come, so we ignore the timeout and make the check silent
        .await(Config.minTimeBetweenClientInitialEventsMillis milliseconds)(check3.silent)
    )
    .exec(session => dump(session, s"SPIN 2 or 3 responses got" ))
    // we waited for 3 messages, so if the ping came, we just send pong and do not wait for anything else
    .doIf(session => Utils.getStringSessionAttribute(session, ATTR_PING_SEQ_ID, "0") != "0"){
      exec(session => dump(session, s"Sending PONG for CLIENT_INITIAL_EVENT" )).
      exec(pongBuilder)
    }
  } {
    exec(session => dump(session, s"Sending NON-INITIAL_CLIENT_EVENT and expect 2 responses" )).
    exec(
      clientActionBuilder
        // first response: "started calculation"
        .await(Config.waitForResponseSec seconds)(check4)
        // second response: "calculation result"
        .await(Config.waitForResponseSec seconds)(check5)
    )
    .exec(session => dump(session, s"NON-INITIAL_CLIENT_EVENT 2 responses got" ))
    // we waited for 2 messages. If ping came, then it came instead of a
    // "started calculation" or "calculation result", so we have to wait for one more message
    .doIf(session => Utils.getStringSessionAttribute(session, ATTR_PING_SEQ_ID, "0") != "0"){
      exec(session => dump(session, s"Sending PONG for NON-INITIAL_CLIENT_EVENT" )).
      exec(pongBuilder.await(Config.waitForResponseSec seconds)(check6))
      .exec(session => dump(session, s"NON-INITIAL_CLIENT_EVENT PONG response got" ))
    }
  }
  .exitHereIfFailed
  .exec(_.remove(ATTR_PING_SEQ_ID))
The check for each message is the same; it is cloned only because I want to see in the logs which concrete check timed out:
val checkX = ws.checkTextMessage("myCheckX")
.matching(jsonPath(matchingCondition).exists)
jsonPath("$.body.data.nextActions[0]").optional.saveAs(ATTR_NEXT_ACTION),
).check(regex("\"cId\":(.*?),(\"name\":\"Ping\")").optional.saveAs(ATTR_PING_SEQ_ID))
The actual messages are very simple:
val clientActionBuilder = ws("requestClientAction").sendText(
"""{
| "header":
| {
| "name": "Action",
| "cId": ${cId},
| "dType": 2
| },
| "body":
| {
| "type": "#TYPE#",
| "seqId":${seqId},
| "data":{
| }
| }
|}
""".stripMargin.replaceAll("[\\s\n\r]", "").replace("#TYPE#", eventType)
)
val pongBuilder = ws("requestPong").sendText(
"""{
| "header":
| {
| "name": "Ping",
| "cId": ${pingSeqId},
| "dType": 1
| },
| "body": {}
|}
""".stripMargin.replaceAll("[\\s\n\r]", "")
)
The client actions are sent in a loop until the timeout:
asLongAs(
  session =>
    !timeoutIsOver(startTime, testDurationMillis)
) {
  exec(doClientAction())
}
The logic works as expected until the ping request comes from the server. After that, the ws await timeout breaks. Here is what I see in the logs:
DUMP---> The client action: event=INITIAL_EVENT oneRound=false
DUMP---> Sending CLIENT_INITIAL_EVENT and expect 2 or 3 responses. wait for each response for 600 seconds
20:33:18.772 [INFO ] i.g.h.a.w.WsSendTextFrame - Sending text frame {"header":{"name":"Action","cId":103,"dType":2},"body":{}} with websocket 'gatling.http.webSocket': Scenario 'doUntilTimeout', UserId #1
20:33:18.773 [DEBUG] i.g.h.a.w.f.WsIdleState - Send text frame requestClientAction {"header":{"name":"Action","cId":103,"dType":2},"body":{}}
20:33:18.773 [DEBUG] i.g.h.c.i.WebSocketHandler - ctx.write msg=TextWebSocketFrame(data: UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeHeapByteBuf(ridx: 0, widx: 164, cap: 512))
20:33:18.773 [TRACE] i.n.h.c.h.w.WebSocket08FrameEncoder - Encoding WebSocket Frame opCode=1 length=164
20:33:18.773 [DEBUG] i.g.h.a.w.f.WsIdleState - Trigger check after sending text frame
20:33:18.787 [TRACE] i.n.h.c.h.w.WebSocket08FrameDecoder - Decoding WebSocket Frame opCode=1
20:33:18.787 [TRACE] i.n.h.c.h.w.WebSocket08FrameDecoder - Decoding WebSocket Frame length=80
20:33:18.787 [DEBUG] i.g.h.c.i.WebSocketHandler - Read msg=TextWebSocketFrame(data: PooledUnsafeDirectByteBuf(ridx: 0, widx: 80, cap: 80))
20:33:18.788 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Received matching message {"header":{"cId":103,"name":"ClientAction","code":1,"dType":2}}
20:33:18.789 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Current check success
20:33:18.789 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Perform next check sequence
20:33:19.233 [TRACE] i.n.h.c.h.w.WebSocket08FrameDecoder - Decoding WebSocket Frame opCode=1
20:33:19.233 [TRACE] i.n.h.c.h.w.WebSocket08FrameDecoder - Decoding WebSocket Frame length=1480
20:33:19.233 [DEBUG] i.g.h.c.i.WebSocketHandler - Read msg=TextWebSocketFrame(data: PooledUnsafeDirectByteBuf(ridx: 0, widx: 1480, cap: 1480))
20:33:19.235 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Received matching message {"header":{"cId":37,"name":"ClientEvent","dType":2,"dId":1270},"body":{...}}
20:33:19.237 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Current check success
20:33:19.238 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Perform next check sequence
20:33:20.871 [TRACE] i.n.h.c.h.w.WebSocket08FrameDecoder - Decoding WebSocket Frame opCode=1
20:33:20.871 [TRACE] i.n.h.c.h.w.WebSocket08FrameDecoder - Decoding WebSocket Frame length=65
20:33:20.871 [DEBUG] i.g.h.c.i.WebSocketHandler - Read msg=TextWebSocketFrame(data: PooledUnsafeDirectByteBuf(ridx: 0, widx: 65, cap: 65))
20:33:20.872 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Received matching message {"header":{"cId":38,"name":"Ping","dType":2}}
20:33:20.872 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Current check success
20:33:20.872 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Check sequences completed successfully
DUMP---> 2 or 3 responses got
DUMP---> Sending PONG for CLIENT_INITIAL_EVENT
20:33:20.873 [INFO ] i.g.h.a.w.WsSendTextFrame - Sending text frame {"header":{"name":"Ping","cId":38,"dType":1},"body":{}} with websocket 'gatling.http.webSocket': Scenario 'doUntilTimeout', UserId #1
20:33:20.873 [DEBUG] i.g.h.a.w.f.WsIdleState - Send text frame requestPong {"header":{"name":"Ping","cId":38,"dType":1},"body":{}}
20:33:20.873 [DEBUG] i.g.h.c.i.WebSocketHandler - ctx.write msg=TextWebSocketFrame(data: UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeHeapByteBuf(ridx: 0, widx: 65, cap: 256))
20:33:20.873 [TRACE] i.n.h.c.h.w.WebSocket08FrameEncoder - Encoding WebSocket Frame opCode=1 length=65
....
20:33:20.876 [INFO ] i.g.h.a.w.WsSendTextFrame - Sending text frame {"header":{"name":"ClientAction","cId":104,"dType":2},"body":{"type":"NON-INITIAL_CLIENT_EVENT","seqId":304,"data":{...}}} with websocket 'gatling.http.webSocket': Scenario 'doUntilTimeout', UserId #1
20:33:20.876 [DEBUG] i.g.h.a.w.f.WsIdleState - Send text frame requestClientAction {"header":{"name":"CLientAction","cId":104,"dType":2},"body":{"type":"NON-INITIAL_CLIENT_EVENT","seqId":304,"data":{...}}}
20:33:20.876 [DEBUG] i.g.h.c.i.WebSocketHandler - ctx.write msg=TextWebSocketFrame(data: UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeHeapByteBuf(ridx: 0, widx: 167, cap: 512))
20:33:20.876 [TRACE] i.n.h.c.h.w.WebSocket08FrameEncoder - Encoding WebSocket Frame opCode=1 length=167
20:33:20.877 [DEBUG] i.g.h.a.w.f.WsIdleState - Trigger check after sending text frame
20:33:20.897 [TRACE] i.n.h.c.h.w.WebSocket08FrameDecoder - Decoding WebSocket Frame opCode=1
20:33:20.897 [TRACE] i.n.h.c.h.w.WebSocket08FrameDecoder - Decoding WebSocket Frame length=81
20:33:20.897 [DEBUG] i.g.h.c.i.WebSocketHandler - Read msg=TextWebSocketFrame(data: PooledUnsafeDirectByteBuf(ridx: 0, widx: 81, cap: 81))
20:33:20.898 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Received matching message {"header":{"cId":104,"name":"ClientAction","code":1,"dType":2}}
20:33:20.899 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Current check success
20:33:20.899 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Perform next check sequence
....
20:33:21.535 [TRACE] i.n.h.c.h.w.WebSocket08FrameDecoder - Decoding WebSocket Frame opCode=1
20:33:21.535 [TRACE] i.n.h.c.h.w.WebSocket08FrameDecoder - Decoding WebSocket Frame length=2275
20:33:21.535 [DEBUG] i.g.h.c.i.WebSocketHandler - Read msg=TextWebSocketFrame(data: PooledUnsafeDirectByteBuf(ridx: 0, widx: 2275, cap: 2275))
20:33:21.537 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Received matching message {"header":{"cId":39,"name":"ClientEvent","dType":2},"body":{...}}
20:33:21.540 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Current check success
20:33:21.540 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Check sequences completed successfully
DUMP---> NON-INITIAL_CLIENT_EVENT 2 responses got
....
DUMP---> Sending CLIENT_INITIAL_EVENT and expect 2 or 3 responses. wait for each response for 600 seconds
20:33:21.542 [INFO ] i.g.h.a.w.WsSendTextFrame - Sending text frame {"header":{"name":"ClientAction","cId":105,"dType":2},"body":{...}}} with websocket 'gatling.http.webSocket': Scenario 'doUntilTimeout', UserId #1
20:33:21.542 [DEBUG] i.g.h.a.w.f.WsIdleState - Send text frame requestClientAction {"header":{"name":"ClientAction","cId":105,"dType":2},"body":{...}}}
20:33:21.542 [DEBUG] i.g.h.c.i.WebSocketHandler - ctx.write msg=TextWebSocketFrame(data: UnpooledByteBufAllocator$InstrumentedUnpooledUnsafeHeapByteBuf(ridx: 0, widx: 164, cap: 512))
20:33:21.542 [TRACE] i.n.h.c.h.w.WebSocket08FrameEncoder - Encoding WebSocket Frame opCode=1 length=164
20:33:21.543 [DEBUG] i.g.h.a.w.f.WsIdleState - Trigger check after sending text frame
20:33:21.558 [TRACE] i.n.h.c.h.w.WebSocket08FrameDecoder - Decoding WebSocket Frame opCode=1
20:33:21.559 [TRACE] i.n.h.c.h.w.WebSocket08FrameDecoder - Decoding WebSocket Frame length=81
20:33:21.559 [DEBUG] i.g.h.c.i.WebSocketHandler - Read msg=TextWebSocketFrame(data: PooledUnsafeDirectByteBuf(ridx: 0, widx: 81, cap: 81))
20:33:21.560 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Received matching message {"header":,"cId":105,"name":"ClientAction","code":1,"dType":2}}
20:33:21.560 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Current check success
20:33:21.561 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Perform next check sequence
20:33:21.742 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Check timeout
20:33:21.743 [DEBUG] i.g.h.a.w.f.WsPerformingCheckState - Check timeout, failing it and performing next action
DUMP---> 2 or 3 responses got
20:33:21.744 [DEBUG] i.g.c.a.Exit - End user #1
20:33:21.748 [DEBUG] i.g.c.c.i.Injector - End user #doUntilTimeout
20:33:21.748 [INFO ] i.g.c.c.i.Injector - All users of scenario doUntilTimeout are stopped
20:33:21.749 [INFO ] i.g.c.c.i.Injector - Stopping
20:33:21.749 [INFO ] i.g.c.c.Controller - Injector has stopped, initiating graceful stop
I received the first message from the web socket at 20:33:21.560.
Then the second "await" started. It should time out after 600 seconds,
but in fact I see the timeout right away, at 20:33:21.743.
It looks like a bug in Gatling, as if the timeout property were reset to zero.
Thanks in advance!
Andrei

Simulate Ajax Call with Scrapy

I'm new to scraping with Scrapy and, unfortunately, I can't access the data through a request meant to simulate an AJAX call.
I've read other topics, but they didn't help me resolve my issue.
The website I would like to crawl is auchan.fr; it has a dynamic search box driven by Algolia.
Here is my spider for a "nutella" query (hence the POST request):
class AjaxspiderSpider(scrapy.Spider):
    name = "ajaxspider"
    allowed_domains = ["auchandirect.fr/recherche"]
    #start_urls = ['https://www.auchandirect.fr/recherche/']

    def start_requests(self):
        full_url = "/1/indexes/articles_article_11228/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.20.4&x-algolia-application-id=TN96V7LRXC&x-algolia-api-key=46a121512cba9c452df318ffca231225"
        yield FormRequest('https://tn96v7lrxc-dsn.algolia.net' + full_url, callback=self.parse, formdata={"params": "query=nutella&facets=%5B%22loopr_shelf%22%5D&hitsPerPage=50"})

    def parse(self, response):
        with open('data_content', 'w') as file:
            file.write(response.content)
And here is the log I got:
2017-02-03 15:14:34 [scrapy.utils.log] INFO: Scrapy 1.3.0 started (bot: ajax)
2017-02-03 15:14:34 [scrapy.utils.log] INFO: Overridden settings: {'SPIDER_MODULES': ['ajax.spiders'], 'NEWSPIDER_MODULE': 'ajax.spiders', 'BOT_NAME': 'ajax'}
2017-02-03 15:14:34 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats',
'scrapy.extensions.corestats.CoreStats']
2017-02-03 15:14:34 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2017-02-03 15:14:34 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2017-02-03 15:14:34 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2017-02-03 15:14:34 [scrapy.core.engine] INFO: Spider opened
2017-02-03 15:14:34 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2017-02-03 15:14:34 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2017-02-03 15:14:35 [scrapy.core.engine] DEBUG: Crawled (400) <POST https://tn96v7lrxc-dsn.algolia.net/1/indexes/articles_article_11228/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.20.4&x-algolia-application-id=TN96V7LRXC&x-algolia-api-key=46a121512cba9c452df318ffca231225> (referer: None)
2017-02-03 15:14:35 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <400 https://tn96v7lrxc-dsn.algolia.net/1/indexes/articles_article_11228/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.20.4&x-algolia-application-id=TN96V7LRXC&x-algolia-api-key=46a121512cba9c452df318ffca231225>: HTTP status code is not handled or not allowed
2017-02-03 15:14:35 [scrapy.core.engine] INFO: Closing spider (finished)
2017-02-03 15:14:35 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 545,
'downloader/request_count': 1,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 338,
'downloader/response_count': 1,
'downloader/response_status_count/400': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 2, 3, 14, 14, 35, 216807),
'log_count/DEBUG': 2,
'log_count/INFO': 8,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2017, 2, 3, 14, 14, 34, 977436)}
2017-02-03 15:14:35 [scrapy.core.engine] INFO: Spider closed (finished)
Thank you for any piece of information.
This is not an Ajax-related question but a site-specific one: you are just passing the search-parameters string the wrong way, as formdata, while it should be passed as the raw body of the POST request, so it should be like this:
yield Request('https://tn96v7lrxc-dsn.algolia.net' + full_url,
              callback=self.parse, method='POST',
              body='{"params":"query=nutella&facets=%5B%22loopr_shelf%22%5D&hitsPerPage=50"}')
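Put back into the spider from the question, that fix would look roughly like the sketch below. This is only a sketch: the index name, application id, and API key are simply carried over from the question, and writing the response body to a file is kept from the original parse callback.
import scrapy
from scrapy import Request

class AjaxspiderSpider(scrapy.Spider):
    name = "ajaxspider"

    def start_requests(self):
        full_url = ("/1/indexes/articles_article_11228/query"
                    "?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.20.4"
                    "&x-algolia-application-id=TN96V7LRXC"
                    "&x-algolia-api-key=46a121512cba9c452df318ffca231225")
        # Algolia expects the search parameters as the raw JSON body of the POST,
        # not as form-encoded data
        yield Request('https://tn96v7lrxc-dsn.algolia.net' + full_url,
                      method='POST',
                      body='{"params":"query=nutella&facets=%5B%22loopr_shelf%22%5D&hitsPerPage=50"}',
                      callback=self.parse)

    def parse(self, response):
        # response.body holds the raw JSON returned by Algolia
        with open('data_content', 'wb') as f:
            f.write(response.body)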

How can I find the words I need with XPath?

I'm using Scrapy to crawl a website, but I don't know how to parse it and find the words I want.
The following is from the website; I want to find "hello I'm here".
This is my xpath code:
//div[@class='sort_left']/p/strong/a/href/text()
Html part:
<div class="sort hottest_dishes1">
<ul class="sort_title">
<li class="current">按默认排序</li>
<li class="">按人气排序</li>
</ul>
<ol class="sort_content">
<li class="show">
<div class="sort_yi">
<div class="sort_left">
<p class="li_title">
<strong class="span_left ">
hello I'm here<span class="restaurant_list_hot"></span>
<span> (川菜) </span>
</strong>
<span class="span_d_right3" title="馋嘴牛蛙特价只要9.9元,每单限点1份">馋嘴牛蛙特价9块9</span>
</p>
<p class="consume">
<strong>人均消费:</strong>
<b><span>¥70</span>元</b>
看网友点评
</p>
<p class="sign">
<strong>招牌菜:</strong>
<span>水煮鲶鱼 馋嘴牛蛙 酸梅汤 钵钵鸡 香辣土豆丝 毛血旺 香口猪手 ……</span>
</p>
</div>
<div class="sort_right">
看菜谱
</div>
<div class="sort_all" >
<strong>送达时间:</strong><span>60分钟</span>
</div>
</div>
Using response.css in the shell gives the right result, but in the Scrapy spider it returns nothing. Did I write the code wrong?
The following is my code:
def parse_torrent(self, response):
    torrent = TorrentItem()
    torrent['url'] = response.url
    torrent['name'] = response.xpath("//div[@class='sort_left']/p/strong/a[1]").extract()[1]
    torrent['description'] = response.xpath("//div[@id='list_content']/div/div/ol/li/div/div/p/strong[1]/following-sibling::span[1]").extract()
    torrent['size'] = response.xpath("//div[@id='list_content']/div/div/ol/li/div/div/p/span[1]").extract()
    return torrent
I personally find css selectors much easier than using xpath for locating content. For the response object that you get on crawling the given document, why don't you try response.css('p[class="li_title"] a::text')[0].extract().
(I tested it and it works in scrapy shell. The output: u"hello I'm here")
This can be an example of what you need to do:
def parse_torrent(self, response):
    print response.xpath('//div[@class="sort_left"]/p/strong/a/text()').extract()[0]
output:
2014-12-19 10:58:28+0100 [scrapy] INFO: Scrapy 0.24.4 started (bot: skema_crawler)
2014-12-19 10:58:28+0100 [scrapy] INFO: Optional features available: ssl, http11
2014-12-19 10:58:28+0100 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'skema_crawler.spiders', 'SPIDER_MODULES': ['skema_crawler.spiders'], 'BOT_NAME': 'skema_crawler'}
2014-12-19 10:58:28+0100 [scrapy] INFO: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2014-12-19 10:58:29+0100 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2014-12-19 10:58:29+0100 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2014-12-19 10:58:29+0100 [scrapy] INFO: Enabled item pipelines:
2014-12-19 10:58:29+0100 [linkedin] INFO: Spider opened
2014-12-19 10:58:29+0100 [linkedin] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2014-12-19 10:58:29+0100 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2014-12-19 10:58:29+0100 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080
2014-12-19 10:58:29+0100 [linkedin] DEBUG: Crawled (200) <GET file:///C:/1.html> (referer: None)
hello I'm here
2014-12-19 10:58:29+0100 [linkedin] INFO: Closing spider (finished)
2014-12-19 10:58:29+0100 [linkedin] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 232,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 1599,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2014, 12, 19, 9, 58, 29, 241000),
'log_count/DEBUG': 3,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2014, 12, 19, 9, 58, 29, 213000)}
2014-12-19 10:58:29+0100 [linkedin] INFO: Spider closed (finished)
You can see that "hello I'm here" appeared.
You are referring to
response.xpath("//div[@class='sort_left']/p/strong/a[1]").extract()[1]
You need to add text() to your XPath, and since your <a> has a <span> inside, you need to get element [0], not [1]. So you need to change it to
response.xpath("//div[@class='sort_left']/p/strong/a/text()").extract()[0]
I can't see a <div> in your HTML excerpt which has an attribute with the value 'list_content' – so the [@id='list_content'] predicate filters out everything, whatever the rest of your XPath expression is. The result of the expression evaluation is an empty sequence.
After the question edit:
There is no <href> element in your HTML, so the .../a/href subexpression selects nothing.
href is an attribute of <a> – use .../a/@href instead to process the href attribute's contents.
However, if you still want to find the 'hello I'm here' text, you need to reach the <a> element's contents – use .../a/text().
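Pulling those corrections together, a sketch of the callback could look like this (TorrentItem and the field names are taken from the question; like the answers above, it assumes the real page has an <a> inside the <strong>):
def parse_torrent(self, response):
    torrent = TorrentItem()
    torrent['url'] = response.url
    # text() selects the link's text content, and the wanted string is the
    # first match, so index [0] rather than [1]
    torrent['name'] = response.xpath(
        "//div[@class='sort_left']/p/strong/a/text()").extract()[0]
    # an attribute is addressed with @ (e.g. @href), not as a child element
    print response.xpath("//div[@class='sort_left']/p/strong/a/@href").extract()
    return torrent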

Setup a graylog2 server with elasticsearch in a vagrant machine

I'm trying to install a graylog2 server on my local dev machine and am encountering problems with the elasticsearch setup.
My elasticsearch is installed as a service on a vagrant machine running on my dev machine, so elasticsearch isn't listening on 127.0.0.1 but on 192.168.50.4 (the IP of the vagrant machine). I have port 9200 forwarded from the vagrant machine, but the graylog2 server seems unable to find it and stops running with:
ERROR: Could not successfully connect to ElasticSearch. Check that
your cluster state is not RED and that ElasticSearch is running
properly.
Forwarding port 9300 from the vagrant machine as well changed the error to:
Caused by: org.elasticsearch.common.netty.channel.ChannelException:
Failed to bind to: 0.0.0.0/0.0.0.0:9350
I tried this setting in the graylog conf file:
elasticsearch_network_host =192.168.50.4
but that only changed the error to an exception about failing to bind:
Caused by: org.elasticsearch.common.netty.channel.ChannelException:
Failed to bind to: /192.168.50.4:9350 at
org.elasticsearch.common.netty.bootstrap.ServerBootstrap.bind(ServerBootstrap.java:272)
That didn't help either.
I'd be glad for any direction on what I'm doing wrong (with the elasticsearch configuration, vagrant, or graylog2).
Thanks!
Update: following the advice in the answer below, I changed the following config:
elasticsearch_discovery_zen_ping_multicast_enabled = false
elasticsearch_discovery_zen_ping_unicast_hosts = 192.168.50.4:9300
I now get this error:
2014-06-16 23:04:34,946 WARN : org.elasticsearch.transport.netty - [graylog2-server] Message not fully read (response) for [6] handler org.elasticsearch.discovery.zen.ping.unicast.UnicastZenPing$4#67bd250a, error [true], resetting
2014-06-16 23:04:36,451 WARN : org.elasticsearch.discovery.zen.ping.unicast - [graylog2-server] failed to send ping to [[#zen_unicast_1#][inet[/192.168.50.4:9300]]]
org.elasticsearch.transport.RemoteTransportException: Failed to deserialize exception response from stream
Caused by: org.elasticsearch.transport.TransportSerializationException: Failed to deserialize exception response from stream
at org.elasticsearch.transport.netty.MessageChannelHandler.handlerResponseError(MessageChannelHandler.java:169)
at org.elasticsearch.transport.netty.MessageChannelHandler.messageReceived(MessageChannelHandler.java:123)
at org.elasticsearch.common.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
at org.elasticsearch.common.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.elasticsearch.common.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
at org.elasticsearch.common.netty.channel.Channels.fireMessageReceived(Channels.java:296)
at org.elasticsearch.common.netty.handler.codec.frame.FrameDecoder.unfoldAndFireMessageReceived(FrameDecoder.java:462)
at org.elasticsearch.common.netty.handler.codec.frame.FrameDecoder.callDecode(FrameDecoder.java:443)
at org.elasticsearch.common.netty.handler.codec.frame.FrameDecoder.messageReceived(FrameDecoder.java:310)
at org.elasticsearch.common.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
at org.elasticsearch.common.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.elasticsearch.common.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:559)
at org.elasticsearch.common.netty.channel.Channels.fireMessageReceived(Channels.java:268)
at org.elasticsearch.common.netty.channel.Channels.fireMessageReceived(Channels.java:255)
at org.elasticsearch.common.netty.channel.socket.nio.NioWorker.read(NioWorker.java:88)
at org.elasticsearch.common.netty.channel.socket.nio.AbstractNioWorker.process(AbstractNioWorker.java:108)
at org.elasticsearch.common.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:318)
at org.elasticsearch.common.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:89)
at org.elasticsearch.common.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)
at org.elasticsearch.common.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
at org.elasticsearch.common.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:744)
Caused by: java.io.InvalidClassException: failed to read class descriptor
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1603)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1517)
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1622)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1517)
It looks like graylog2 still fails to connect to elasticsearch correctly.
Details (update): graylog2-server 0.20.2, elasticsearch 1.1.0 (I think) – I can replace it if that's the problem. Java: OpenJDK 64-Bit, version "1.7.0_55".
More updates (thanks @sheena): after downgrading elasticsearch to 0.90.10 we got some progress, but it is still not working:
Here is the current log:
2014-06-17 13:27:16,394 INFO : org.graylog2.Main - Graylog2 0.20.2 starting up. (JRE: Oracle Corporation 1.7.0_55 on Linux 3.13.0-29-generic)
2014-06-17 13:27:16,475 INFO : org.graylog2.plugin.system.NodeId - Node ID: e7245f12-2e8b-4803-9e88-7529169b5a91
2014-06-17 13:27:16,670 INFO : org.graylog2.buffers.ProcessBuffer - Initialized ProcessBuffer with ring size <1024> and wait strategy <BlockingWaitStrategy>.
2014-06-17 13:27:16,692 INFO : org.graylog2.buffers.OutputBuffer - Initialized OutputBuffer with ring size <1024> and wait strategy <BlockingWaitStrategy>.
2014-06-17 13:27:16,964 DEBUG: com.ning.http.client.providers.netty.NettyAsyncHttpProvider - Number of application's worker threads is 8
2014-06-17 13:27:17,272 INFO : org.elasticsearch.node - [graylog2-server] version[0.90.10], pid[24419], build[0a5781f/2014-01-10T10:18:37Z]
2014-06-17 13:27:17,273 INFO : org.elasticsearch.node - [graylog2-server] initializing ...
2014-06-17 13:27:17,273 DEBUG: org.elasticsearch.node - [graylog2-server] using home [/home/alon/Downloads/graylog2-server-0.20.2], config [/home/alon/Downloads/graylog2-server-0.20.2/config], data [[/home/alon/Downloads/graylog2-server-0.20.2/data]], logs [/home/alon/Downloads/graylog2-server-0.20.2/logs], work [/home/alon/Downloads/graylog2-server-0.20.2/work], plugins [/home/alon/Downloads/graylog2-server-0.20.2/plugins]
2014-06-17 13:27:17,281 INFO : org.elasticsearch.plugins - [graylog2-server] loaded [], sites []
2014-06-17 13:27:17,320 DEBUG: org.elasticsearch.common.compress.lzf - using [UnsafeChunkDecoder] decoder
2014-06-17 13:27:18,655 DEBUG: org.elasticsearch.threadpool - [graylog2-server] creating thread_pool [generic], type [cached], keep_alive [30s]
2014-06-17 13:27:18,740 DEBUG: org.elasticsearch.threadpool - [graylog2-server] creating thread_pool [index], type [fixed], size [4], queue_size [200]
2014-06-17 13:27:18,744 DEBUG: org.elasticsearch.threadpool - [graylog2-server] creating thread_pool [bulk], type [fixed], size [4], queue_size [50]
2014-06-17 13:27:18,745 DEBUG: org.elasticsearch.threadpool - [graylog2-server] creating thread_pool [get], type [fixed], size [4], queue_size [1k]
2014-06-17 13:27:18,745 DEBUG: org.elasticsearch.threadpool - [graylog2-server] creating thread_pool [search], type [fixed], size [12], queue_size [1k]
2014-06-17 13:27:18,745 DEBUG: org.elasticsearch.threadpool - [graylog2-server] creating thread_pool [suggest], type [fixed], size [4], queue_size [1k]
2014-06-17 13:27:18,745 DEBUG: org.elasticsearch.threadpool - [graylog2-server] creating thread_pool [percolate], type [fixed], size [4], queue_size [1k]
2014-06-17 13:27:18,746 DEBUG: org.elasticsearch.threadpool - [graylog2-server] creating thread_pool [management], type [scaling], min [1], size [5], keep_alive [5m]
2014-06-17 13:27:18,747 DEBUG: org.elasticsearch.threadpool - [graylog2-server] creating thread_pool [flush], type [scaling], min [1], size [2], keep_alive [5m]
2014-06-17 13:27:18,747 DEBUG: org.elasticsearch.threadpool - [graylog2-server] creating thread_pool [merge], type [scaling], min [1], size [2], keep_alive [5m]
2014-06-17 13:27:18,747 DEBUG: org.elasticsearch.threadpool - [graylog2-server] creating thread_pool [refresh], type [scaling], min [1], size [2], keep_alive [5m]
2014-06-17 13:27:18,748 DEBUG: org.elasticsearch.threadpool - [graylog2-server] creating thread_pool [warmer], type [scaling], min [1], size [2], keep_alive [5m]
2014-06-17 13:27:18,748 DEBUG: org.elasticsearch.threadpool - [graylog2-server] creating thread_pool [snapshot], type [scaling], min [1], size [2], keep_alive [5m]
2014-06-17 13:27:18,748 DEBUG: org.elasticsearch.threadpool - [graylog2-server] creating thread_pool [optimize], type [fixed], size [1], queue_size [null]
2014-06-17 13:27:18,768 DEBUG: org.elasticsearch.transport.netty - [graylog2-server] using worker_count[8], port[9350], bind_host[null], publish_host[null], compress[false], connect_timeout[30s], connections_per_node[2/3/6/1/1], receive_predictor[512kb->512kb]
2014-06-17 13:27:18,784 DEBUG: org.elasticsearch.discovery.zen.ping.unicast - [graylog2-server] using initial hosts [192.168.50.4:9300], with concurrent_connects [10]
2014-06-17 13:27:18,787 DEBUG: org.elasticsearch.discovery.zen - [graylog2-server] using ping.timeout [3s], master_election.filter_client [true], master_election.filter_data [false]
2014-06-17 13:27:18,788 DEBUG: org.elasticsearch.discovery.zen.elect - [graylog2-server] using minimum_master_nodes [-1]
2014-06-17 13:27:18,790 DEBUG: org.elasticsearch.discovery.zen.fd - [graylog2-server] [master] uses ping_interval [1s], ping_timeout [30s], ping_retries [3]
2014-06-17 13:27:18,801 DEBUG: org.elasticsearch.discovery.zen.fd - [graylog2-server] [node ] uses ping_interval [1s], ping_timeout [30s], ping_retries [3]
2014-06-17 13:27:18,845 DEBUG: org.elasticsearch.monitor.jvm - [graylog2-server] enabled [true], last_gc_enabled [false], interval [1s], gc_threshold [{old=GcThreshold{name='old', warnThreshold=10000, infoThreshold=5000, debugThreshold=2000}, default=GcThreshold{name='default', warnThreshold=10000, infoThreshold=5000, debugThreshold=2000}, young=GcThreshold{name='young', warnThreshold=1000, infoThreshold=700, debugThreshold=400}}]
2014-06-17 13:27:18,846 DEBUG: org.elasticsearch.monitor.os - [graylog2-server] Using probe [org.elasticsearch.monitor.os.JmxOsProbe#7b01e044] with refresh_interval [1s]
2014-06-17 13:27:18,849 DEBUG: org.elasticsearch.monitor.process - [graylog2-server] Using probe [org.elasticsearch.monitor.process.JmxProcessProbe#3103c203] with refresh_interval [1s]
2014-06-17 13:27:18,854 DEBUG: org.elasticsearch.monitor.jvm - [graylog2-server] Using refresh_interval [1s]
2014-06-17 13:27:18,854 DEBUG: org.elasticsearch.monitor.network - [graylog2-server] Using probe [org.elasticsearch.monitor.network.JmxNetworkProbe#1cc7580f] with refresh_interval [5s]
2014-06-17 13:27:18,857 DEBUG: org.elasticsearch.monitor.network - [graylog2-server] net_info
host [stox-alonisser]
vboxnet0 display_name [vboxnet0]
address [/fe80:0:0:0:800:27ff:fe00:0%4] [/192.168.50.1]
mtu [1500] multicast [true] ptp [false] loopback [false] up [true] virtual [false]
wlan0 display_name [wlan0]
address [/fe80:0:0:0:e8b:fdff:fe62:dc9d%3] [/192.168.20.107]
mtu [1500] multicast [true] ptp [false] loopback [false] up [true] virtual [false]
lo display_name [lo]
address [/0:0:0:0:0:0:0:1%1] [/127.0.0.1]
mtu [65536] multicast [false] ptp [false] loopback [true] up [true] virtual [false]
2014-06-17 13:27:18,858 DEBUG: org.elasticsearch.monitor.fs - [graylog2-server] Using probe [org.elasticsearch.monitor.fs.JmxFsProbe#2c8807d7] with refresh_interval [1s]
2014-06-17 13:27:19,196 DEBUG: org.elasticsearch.indices.store - [graylog2-server] using indices.store.throttle.type [MERGE], with index.store.throttle.max_bytes_per_sec [20mb]
2014-06-17 13:27:19,204 DEBUG: org.elasticsearch.cache.memory - [graylog2-server] using bytebuffer cache with small_buffer_size [1kb], large_buffer_size [1mb], small_cache_size [10mb], large_cache_size [500mb], direct [true]
2014-06-17 13:27:19,220 DEBUG: org.elasticsearch.script - [graylog2-server] using script cache with max_size [500], expire [null]
2014-06-17 13:27:19,234 DEBUG: org.elasticsearch.cluster.routing.allocation.decider - [graylog2-server] using node_concurrent_recoveries [2], node_initial_primaries_recoveries [4]
2014-06-17 13:27:19,235 DEBUG: org.elasticsearch.cluster.routing.allocation.decider - [graylog2-server] using [cluster.routing.allocation.allow_rebalance] with [indices_all_active]
2014-06-17 13:27:19,236 DEBUG: org.elasticsearch.cluster.routing.allocation.decider - [graylog2-server] using [cluster_concurrent_rebalance] with [2]
2014-06-17 13:27:19,243 DEBUG: org.elasticsearch.gateway.local - [graylog2-server] using initial_shards [quorum], list_timeout [30s]
2014-06-17 13:27:19,424 DEBUG: org.elasticsearch.indices.recovery - [graylog2-server] using max_bytes_per_sec[20mb], concurrent_streams [3], file_chunk_size [512kb], translog_size [512kb], translog_ops [1000], and compress [true]
2014-06-17 13:27:19,486 DEBUG: org.elasticsearch.indices.memory - [graylog2-server] using index_buffer_size [265.4mb], with min_shard_index_buffer_size [4mb], max_shard_index_buffer_size [512mb], shard_inactive_time [30m]
2014-06-17 13:27:19,487 DEBUG: org.elasticsearch.indices.cache.filter - [graylog2-server] using [node] weighted filter cache with size [20%], actual_size [530.8mb], expire [null], clean_interval [1m]
2014-06-17 13:27:19,489 DEBUG: org.elasticsearch.indices.fielddata.cache - [graylog2-server] using size [-1] [-1b], expire [null]
2014-06-17 13:27:19,507 DEBUG: org.elasticsearch.gateway.local.state.meta - [graylog2-server] using gateway.local.auto_import_dangled [YES], with gateway.local.dangling_timeout [2h]
2014-06-17 13:27:19,511 DEBUG: org.elasticsearch.bulk.udp - [graylog2-server] using enabled [false], host [null], port [9700-9800], bulk_actions [1000], bulk_size [5mb], flush_interval [5s], concurrent_requests [4]
2014-06-17 13:27:19,514 DEBUG: org.elasticsearch.cluster.routing.allocation.decider - [graylog2-server] using node_concurrent_recoveries [2], node_initial_primaries_recoveries [4]
2014-06-17 13:27:19,514 DEBUG: org.elasticsearch.cluster.routing.allocation.decider - [graylog2-server] using [cluster.routing.allocation.allow_rebalance] with [indices_all_active]
2014-06-17 13:27:19,515 DEBUG: org.elasticsearch.cluster.routing.allocation.decider - [graylog2-server] using [cluster_concurrent_rebalance] with [2]
2014-06-17 13:27:19,516 DEBUG: org.elasticsearch.cluster.routing.allocation.decider - [graylog2-server] using node_concurrent_recoveries [2], node_initial_primaries_recoveries [4]
2014-06-17 13:27:19,516 DEBUG: org.elasticsearch.cluster.routing.allocation.decider - [graylog2-server] using [cluster.routing.allocation.allow_rebalance] with [indices_all_active]
2014-06-17 13:27:19,516 DEBUG: org.elasticsearch.cluster.routing.allocation.decider - [graylog2-server] using [cluster_concurrent_rebalance] with [2]
2014-06-17 13:27:19,528 INFO : org.elasticsearch.node - [graylog2-server] initialized
2014-06-17 13:27:19,529 INFO : org.elasticsearch.node - [graylog2-server] starting ...
2014-06-17 13:27:19,552 DEBUG: org.elasticsearch.netty.channel.socket.nio.SelectorUtil - Using select timeout of 500
2014-06-17 13:27:19,552 DEBUG: org.elasticsearch.netty.channel.socket.nio.SelectorUtil - Epoll-bug workaround enabled = false
2014-06-17 13:27:19,618 DEBUG: org.elasticsearch.transport.netty - [graylog2-server] Bound to address [/0:0:0:0:0:0:0:0:9350]
2014-06-17 13:27:19,622 INFO : org.elasticsearch.transport - [graylog2-server] bound_address {inet[/0:0:0:0:0:0:0:0:9350]}, publish_address {inet[/192.168.20.107:9350]}
2014-06-17 13:27:19,658 DEBUG: org.elasticsearch.transport.netty - [graylog2-server] connected to node [[#zen_unicast_1#][inet[/192.168.50.4:9300]]]
2014-06-17 13:27:22,628 WARN : org.elasticsearch.discovery - [graylog2-server] waited for 3s and no initial state was set by the discovery
2014-06-17 13:27:22,628 INFO : org.elasticsearch.discovery - [graylog2-server] graylog2/vWsYLp5JQoOJMva0FZgRsA
2014-06-17 13:27:22,629 DEBUG: org.elasticsearch.gateway - [graylog2-server] can't wait on start for (possibly) reading state from gateway, will do it asynchronously
2014-06-17 13:27:22,629 INFO : org.elasticsearch.node - [graylog2-server] started
2014-06-17 13:27:22,642 DEBUG: org.elasticsearch.transport.netty - [graylog2-server] disconnected from [[#zen_unicast_1#][inet[/192.168.50.4:9300]]]
2014-06-17 13:27:22,644 DEBUG: org.elasticsearch.discovery.zen - [graylog2-server] filtered ping responses: (filter_client[true], filter_data[false])
--> target [[Crimson Daffodil][vPHcWzoCQteDG19hofaayA][inet[/10.0.2.15:9300]]], master [[Crimson Daffodil][vPHcWzoCQteDG19hofaayA][inet[/10.0.2.15:9300]]]
2014-06-17 13:27:27,634 ERROR: org.graylog2.Main -
elasticsearch_network_host is not what you think. It is about the elasticsearch client within graylog, not the elasticsearch server you want to connect to. So graylog is trying to listen on 192.168.50.4, which isn't a valid IP address on the graylog system (your dev machine).
You most likely want to set these variables in graylog2 config:
elasticsearch_discovery_zen_ping_multicast_enabled = false
elasticsearch_discovery_zen_ping_unicast_hosts = 192.168.50.4:9300
Here is where I got stuck, but that was because I had elasticsearch 1.0 installed when I needed 0.90. I'll know more once my puppet/vagrant stack finishes re-provisioning. =)
EDIT: Mine is working now.
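For reference, the relevant part of graylog2.conf on the dev machine would then look roughly like the sketch below. The IP is the vagrant machine's address from the question; the cluster-name line is an assumption and has to match whatever cluster.name the elasticsearch server inside the VM uses. Note also that graylog2-server 0.20.2 embeds a 0.90.x elasticsearch client (the log above shows version[0.90.10]), so the server in the VM needs to be a matching 0.90.x release, as the answer points out.
# graylog2.conf (dev machine) – sketch, not a verified working config
# join the elasticsearch node inside the vagrant VM via unicast discovery
elasticsearch_discovery_zen_ping_multicast_enabled = false
elasticsearch_discovery_zen_ping_unicast_hosts = 192.168.50.4:9300
# assumption: must match cluster.name in the VM's elasticsearch.yml
elasticsearch_cluster_name = graylog2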

Motorola Razr V3i modem detection error with Kannel

Please help me: I am getting this error when running the bearerbox.
[root@localhost sbin]# ./bearerbox -v 1 /usr/local/smskannel.conf
2012-04-30 11:56:28 [13417] [0] INFO: Debug_lvl = 1, log_file = <none>, log_lvl = 0
2012-04-30 11:56:28 [13417] [0] WARNING: DLR: using default 'internal' for storage type.
2012-04-30 11:56:28 [13417] [0] INFO: DLR using storage type: internal
2012-04-30 11:56:28 [13417] [0] INFO: HTTP: Opening server at port 13003.
2012-04-30 11:56:28 [13417] [0] INFO: BOXC: 'smsbox-max-pending' not set, using default (100).
2012-04-30 11:56:28 [13417] [0] INFO: Set SMS resend frequency to 60 seconds.
2012-04-30 11:56:28 [13417] [0] INFO: SMS resend retry set to unlimited.
2012-04-30 11:56:28 [13417] [0] INFO: DLR rerouting for smsc id <FAKE> disabled.
2012-04-30 11:56:28 [13417] [0] INFO: DLR rerouting for smsc id <(null)> disabled.
2012-04-30 11:56:28 [13417] [0] INFO: AT2[/dev/ttyACM0]: configuration doesn't show modemtype. will autodetect
2012-04-30 11:56:28 [13417] [0] INFO: ----------------------------------------
2012-04-30 11:56:28 [13417] [0] INFO: Kannel bearerbox II version 1.4.3 starting
2012-04-30 11:56:28 [13417] [7] INFO: AT2[/dev/ttyACM0]: opening device
2012-04-30 11:56:28 [13417] [0] INFO: MAIN: Start-up done, entering mainloop
2012-04-30 11:56:31 [13417] [7] INFO: AT2[/dev/ttyACM0]: speed set to 115200
2012-04-30 11:56:33 [13417] [7] INFO: AT2[/dev/ttyACM0]: Closing device
2012-04-30 11:56:33 [13417] [7] INFO: AT2[/dev/ttyACM0]: detect speed is 115200
2012-04-30 11:56:33 [13417] [7] INFO: AT2[/dev/ttyACM0]: opening device
2012-04-30 11:56:34 [13417] [7] INFO: AT2[/dev/ttyACM0]: speed set to 115200
2012-04-30 11:56:36 [13417] [7] PANIC: AT2[/dev/ttyACM0]: Cannot detect modem and generic not found
2012-04-30 11:56:36 [13417] [7] PANIC: ./bearerbox(gw_panic+0xc2) [0x80cc0e2]
2012-04-30 11:56:36 [13417] [7] PANIC: ./bearerbox [0x806ca62]
2012-04-30 11:56:36 [13417] [7] PANIC: ./bearerbox [0x806d5f1]
2012-04-30 11:56:36 [13417] [7] PANIC: ./bearerbox [0x80c2971]
2012-04-30 11:56:36 [13417] [7] PANIC: /lib/libpthread.so.0 [0xb9649b]
2012-04-30 11:56:36 [13417] [7] PANIC: /lib/libc.so.6(clone+0x5e) [0xaed42e]
I am using a Motorola Razr V3i connected via USB cable on Red Hat. The device is detected in /dev as ttyACM0.
This is my smskannel.conf:
group = core
admin-port = 13003
smsbox-port = 13004
admin-password = bar
#status-password = foo
#admin-deny-ip = ""
#admin-allow-ip = ""
#log-file = "/tmp/kannel.log"
#log-level = 0
box-deny-ip = "*.*.*.*"
box-allow-ip = "127.0.0.1"
#unified-prefix = "+923,0092,0;+,00"
#access-log = "/tmp/access.log"
#store-file = "kannel.store"
#ssl-server-cert-file = "cert.pem"
#ssl-server-key-file = "key.pem"
#ssl-certkey-file = "mycertandprivkeyfile.pem"
#---------------------------------------------
# SMSC CONNECTIONS
group = smsc
smsc = fake
smsc-id = FAKE
port = 10000
connect-allow-ip = 127.0.0.1
# this is for the Motorola Razr V3i
group = smsc
smsc = at
modemtype = auto
device=/dev/ttyACM0
my-number = 00923478847037
sms-center= 00923455000010
connect-allow-ip = 127.0.0.1
log-level = 0
#-----------------Modem Group------------
group = modems
id = Motorola
name = "Motorola"
init-string = "AT+C=1"
need-sleep = true
enable-mms = true
speed = 115200
message-storage = "SM"
#---------------------------------------------
# SMSBOX SETUP
#
group = smsbox
bearerbox-host = 127.0.0.1
sendsms-port = 13013
global-sender = 13013
#sendsms-chars = "0123456789 +-"
#log-file = "/tmp/smsbox.log"
#log-level = 0
#access-log = "/tmp/access.log"
#---------------------------------------------
# SEND-SMS USERS
group = sendsms-user
username = tester
password = foobar
#user-deny-ip = ""
#user-allow-ip = ""
#---------------------------------------------
# SERVICES
group = sms-service
keyword = nop
text = "You asked nothing and I did it!"
group = sms-service
keyword = default
text = "No service specified"
Please help me
Try this init-string:
init-string = "AT+CMEE=2;+CNMI=3,1,0,0,0".
It worked for my Motorola Razr2 v8: I am able to send and receive messages. This guy suggested the solution I provided and it worked for his Razr v3.
For a complete list of AT commands and explanations, I found this reference very useful.
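Applied to the configuration from the question, the relevant groups would look something like the sketch below. This is a sketch only: the device path and phone numbers are carried over from the question, and pointing modemtype at the id of the modems group (instead of auto) is, to my understanding, how Kannel is told which modem definition to use when autodetection fails.
# sketch – adapted from the question's smskannel.conf
group = smsc
smsc = at
modemtype = motorola_razr
device = /dev/ttyACM0
my-number = 00923478847037
sms-center = 00923455000010

group = modems
id = motorola_razr
name = "Motorola Razr V3i"
# init-string suggested in the answer above
init-string = "AT+CMEE=2;+CNMI=3,1,0,0,0"
speed = 115200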
