I am trying to download images via Scrapy. Here are my different files:
items.py
class DmozItem(Item):
    """Scraped item carrying a title and the image-related fields."""

    title = Field()
    # URLs of the images to be downloaded.
    image_urls = Field()
    # Presumably filled in by the images pipeline after download.
    images = Field()
settings.py
# Scrapy project settings for the "tutorial" bot.
BOT_NAME = "tutorial"

# Package searched for existing spiders, and package where new ones are created.
SPIDER_MODULES = ["tutorial.spiders"]
NEWSPIDER_MODULE = "tutorial.spiders"

# Item pipelines to run; only the built-in images pipeline is listed.
ITEM_PIPELINES = [
    "scrapy.contrib.pipeline.images.ImagesPipeline",
]

# Absolute path on the local filesystem.
IMAGES = "/home/mayank/Desktop/sc/tutorial/tutorial"
spider
class DmozSpider(BaseSpider):
    """Spider that starts at the Wikipedia article on Pune and collects image links."""

    name = "wikipedia"
    allowed_domains = ["wikipedia.org"]
    start_urls = [
        "http://en.wikipedia.org/wiki/Pune"
    ]

    def parse(self, response):
        """Build one ``DmozItem`` per ``<a class="image">`` anchor in ``response``."""
        hxs = HtmlXPathSelector(response)
        items = []
        # XPath attribute tests are written with "@".
        images = hxs.select('//a[@class="image"]')
        for image in images:
            item = DmozItem()
            # First href of the anchor, prefixed with the site root.
            link = image.select('@href').extract()[0]
            link = 'http://en.wikipedia.com' + link
            item['image_urls'] = link
            items.append(item)
In spite of all these settings, my pipeline is not getting activated. Please help; I am new to this framework.
First, in settings.py: rename the IMAGES setting to IMAGES_STORE.
Second, in the spider: you should return an item so that ImagesPipeline can download those images.
item = DmozItem()
image_urls = hxs.select('//img/#src').extract()
item['image_urls'] = ["http:" + x for x in image_urls]
return item
Related
I have model:
class Ingredient(models.Model):
    """An ingredient with a measurement unit, a rate and optional densities."""

    # Stored values for the supported measurement units.
    KILOGRAM = 'kg'
    LITER = 'ltr'
    PIECES = 'pcs'
    # (stored value, human-readable label) pairs offered for ``munit``.
    MUNITS_CHOICES = (
        (KILOGRAM, 'Kilogram'),
        (LITER, 'Liter'),
        (PIECES, 'Pieces'),
    )

    name = models.CharField(max_length=200, unique=True, null=False)
    slug = models.SlugField(unique=True)
    munit = models.CharField(
        max_length=10,
        choices=MUNITS_CHOICES,
        default=KILOGRAM,
    )
    rate = models.DecimalField(
        max_digits=19,
        decimal_places=2,
        validators=[MinValueValidator(0)],
        default=0,
    )
    typeofingredient = models.ForeignKey(
        TypeOfIngredient,
        related_name='typeof_ingredient',
        null=True,
        blank=True,
        on_delete=models.PROTECT,
    )

    # Optional, non-negative conversion factors between the units.
    density_kg_per_lt = models.DecimalField(
        max_digits=19,
        decimal_places=2,
        verbose_name='Density (kg/lt)',
        null=True,
        blank=True,
        validators=[MinValueValidator(0)],
    )
    density_pcs_per_kg = models.DecimalField(
        max_digits=19,
        decimal_places=2,
        verbose_name='Density (pcs/kg)',
        null=True,
        blank=True,
        validators=[MinValueValidator(0)],
    )
    density_pcs_per_lt = models.DecimalField(
        max_digits=19,
        decimal_places=2,
        verbose_name='Density (pcs/lt)',
        null=True,
        blank=True,
        validators=[MinValueValidator(0)],
    )

    # ``updated`` is refreshed on every save; ``timestamp`` is set once on creation.
    updated = models.DateTimeField(auto_now=True, auto_now_add=False)
    timestamp = models.DateTimeField(auto_now=False, auto_now_add=True)
When I call the API, I also want to get the field types, such as char, decimal, datetime, etc.
Is something like the API result below possible? Because I am using ReactJS as the frontend, I have to tell each input what kind of field it can accept; it also helps with sorting by text or number.
{
"id": {value: 1,type: number},
"name": {value: "adark",type: charfield},
"rate": {value: "12.00",type: decimal},
"updated": {value: "2017-07-14T10:51:47.847171Z",type: datetime},
.......so on
}
The Corresponding Serializer would be as follows:
class IngredientSerializer(serializers.ModelSerializer):
    """Serialize an ``Ingredient`` as ``{"value": ..., "type": ...}`` per field.

    ``type`` is the Django internal field type name reported by the model
    field (for example ``CharField`` or ``DecimalField``).
    """

    name = serializers.SerializerMethodField()
    rate = serializers.SerializerMethodField()
    updated = serializers.SerializerMethodField()

    class Meta:
        model = Ingredient
        fields = ('name', 'rate', 'updated')

    @staticmethod
    def _describe(obj, field_name):
        """Return the value of ``field_name`` on ``obj`` together with its field type."""
        # The type lives on the model field definition, not on the stored
        # value, so it has to be looked up through the model's _meta API.
        return {
            'value': getattr(obj, field_name),
            'type': obj._meta.get_field(field_name).get_internal_type(),
        }

    # Method fields must return plain serializable data, not a Response.
    def get_name(self, obj):
        """Return the value/type pair for ``name``."""
        return self._describe(obj, 'name')

    def get_rate(self, obj):
        """Return the value/type pair for ``rate``."""
        return self._describe(obj, 'rate')

    def get_updated(self, obj):
        """Return the value/type pair for ``updated``."""
        return self._describe(obj, 'updated')
I have been fighting this problem for a few days. I need to send an image through AJAX as JSON. I encode it as base64 using FileReader.readAsDataURL and send that, but PIL won't open it. Here is my code:
script.js:
// Read the file selected in #photo and pass its data URL to callBack.
function imageloader(callBack) {
    var input = $('#photo');
    var file = input.prop('files')[0];
    var reader = new FileReader();
    // reader.result holds the data URL once loading has finished.
    reader.onload = function () {
        callBack(reader.result);
    };
    reader.readAsDataURL(file);
}

// On submit, serialise the form fields and the image as JSON and send both.
$('#edit_form').submit(function (eve) {
    eve.preventDefault();
    var form = $.toJSON($(this).serializeArray());
    imageloader(function (image) {
        var image_data = $.toJSON(image);
        sender(form, image_data);
    });
});
view.py:
def post(self, request):
    """Decode the JSON-encoded form fields and image posted by the client."""
    form = request.POST.get('form')
    image = request.POST.get('image')
    new_image = json.loads(image)
    data = json.loads(form)
    # The form arrives as a list of {"name": ..., "value": ...} entries;
    # flatten it into a single name -> value mapping.
    new_data = {entry['name']: entry['value'] for entry in data}
    new_data['photo'] = resize_picture(new_image)
    ...
def resize_picture(file):
    """Decode the base64 part of a data URL and thumbnail it to ``IMAGE_SIZE``.

    ``file`` is the data-URL string; the segment following its first comma
    is treated as the base64 payload.
    """
    payload = file.split(',')[1]
    buffer = BytesIO(base64.b64decode(payload))
    buffer.seek(0)
    raw = buffer.read()
    img = Image.open(raw)
    img.thumbnail(IMAGE_SIZE, Image.ANTIALIAS)
    return img
Every time I try it I get this error from PIL trying to open the file:
file() argument 1 must be encoded string without NULL bytes, not str
Did I miss something?
The filereader.readAsDataURL function produces a data URL which is a unicode string of the form "data:[image type];[encoding],[THEENCODEDSTUFF....]".
To process it in python and possibly assign it to an image field, cherry pick from my code snippet below (the uploaded data is on the variable url_data and the imagefield on the django model object is obj.avatar_image):
# Split the data URL into its MIME type, encoding name and payload.
img_dict = re.match(
    "data:(?P<type>.*?);(?P<encoding>.*?),(?P<data>.*)",
    url_data,
).groupdict()
# Decode the payload with the codec named in the URL itself.
blob = img_dict['data'].decode(img_dict['encoding'], 'strict')
image = Image.open(StringIO(blob))
image = image.resize((75, 75), Image.ANTIALIAS)
png_buffer = StringIO()
try:
    image.save(png_buffer, format='png')
    # NOTE: ``filename`` must already be defined by the surrounding code.
    filename = os.path.splitext(filename)[0] + '.png'
    obj.avatar_image.save(filename, ContentFile(png_buffer.getvalue()))
finally:
    # Release the in-memory buffer whether or not saving succeeded.
    png_buffer.close()
I am getting Distribution.png (a box with a question mark) instead of the wanted image while using a Fluid template in TYPO3 v6.0.
This is my root template:
# Global rendering configuration: caching switched off, HTML5 doctype.
config.no_cache = 1
config.doctype = html5
# Default page object (typeNum 0) whose content is rendered by a FLUIDTEMPLATE.
page = PAGE
page.typeNum = 0
page.10 = FLUIDTEMPLATE
page.10 {
format = html
file = fileadmin/templates/layouts/main.html
partialRootPath = fileadmin/templates/partials/
layoutRootPath = fileadmin/templates/layouts/
variables {
content < styles.content.get
content.select.where = colPos = 0
content_left < styles.content.get
content_left.select.where = colPos = 1
content_right < styles.content.get
content_right.select.where = colPos = 2
}
}
# Pick the template file with a CASE object keyed on the backend layout.
page.10.file.stdWrap.cObject = CASE
page.10.file.stdWrap.cObject {
key.data = levelfield:-1, backend_layout_next_level, slide
key.override.field = backend_layout
default = TEXT
default.value = fileadmin/templates/threeColumn.html
2 = TEXT
2.value = fileadmin/templates/threeColumn.html
3 = TEXT
3.value = fileadmin/templates/fullSize.html
}
In version 4.5 everything works fine.
Any help would be appreciated.
I've been at this for 12 hours and I'm hoping someone can give me a leg up.
Here is my code; all I want is to get the anchor text and URL of every link on a page as it crawls along.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.utils.url import urljoin_rfc
from scrapy.utils.response import get_base_url
from urlparse import urljoin
#from scrapy.item import Item
from tutorial.items import DmozItem
class HopitaloneSpider(CrawlSpider):
    """Crawl domain.co.uk and record link text and link targets from each page."""

    name = 'dmoz'
    allowed_domains = ['domain.co.uk']
    start_urls = [
        'http://www.domain.co.uk'
    ]
    rules = (
        # Follow every URL ending in word characters and hand each response
        # to parse_item.
        Rule(SgmlLinkExtractor(allow=(r'\w+$', )), callback='parse_item', follow=True),
    )
    user_agent = 'Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))'

    def parse_item(self, response):
        """Return one ``DmozItem`` per ``<html>`` node found in ``response``."""
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        for site in sites:
            item = DmozItem()
            # Both paths are relative to the selected node; "@href" is the
            # XPath form for the href attribute.
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            items.append(item)
        return items
What am I doing wrong? My eyes hurt now.
response.body should be what you want
def parse_item(self, response):
    """Sketch: read the raw page content from ``response.body``."""
    body = response.body
    # Placeholder: build the item from ``body`` here.
    item = ...
To get all links on a single page:
def parse_item(self, response):
    """Return one ``DmozItem`` for every ``<a>`` element in ``response``.

    Each item stores the anchor's text nodes under ``title`` and its
    ``href`` attribute values under ``link``.
    """
    hxs = HtmlXPathSelector(response)
    items = []
    links = hxs.select("//a")
    for link in links:
        item = DmozItem()
        # Query relative to the current anchor (the loop variable).
        item['title'] = link.select('text()').extract()
        item['link'] = link.select('@href').extract()
        items.append(item)
    return items
I am inserting a new record through a model form whose model is a child model containing a foreign key. When I submit the form, it gives an error saying the value should be an instance of the foreign key's model.
Here is my model
class MMEditidState(models.Model):
    """State record stored in the ``mm_editid_state`` table."""

    state_id = models.IntegerField(primary_key=True)
    # Link to the MMDremelDump row, stored in the ``state_dremelid`` column.
    state_dremelid = models.ForeignKey(MMDremelDump, db_column='state_dremelid')
    assignee = models.CharField(max_length=50)
    state = models.CharField(max_length=50)
    role = models.CharField(max_length=50)
    date = models.DateTimeField()

    class Meta:
        db_table = u'mm_editid_state'

    def __unicode__(self):
        """Return the related dump and the assignee, separated by a space."""
        return u'%s %s' % (self.state_dremelid, self.assignee)
class MMEditidErrors(models.Model):
    """Error record attached to an ``MMEditidState`` row (``mm_editid_errors`` table)."""

    error_id = models.IntegerField(primary_key=True)
    # Parent state, stored in the ``error_stateid`` column.
    error_stateid = models.ForeignKey(MMEditidState, db_column='error_stateid')
    feature_type = models.CharField(max_length=20)
    error_type = models.CharField(max_length=20)
    error_nature = models.CharField(max_length=50, null=True)
    error_details = models.CharField(max_length=50)
    error_subtype = models.CharField(max_length=200)
    date = models.DateTimeField()

    class Meta:
        db_table = u'mm_editid_errors'

    # The model declares ``error_stateid``; there is no ``error_dremelid``
    # attribute, so the string representations use the declared field.
    def __str__(self):
        """Return the string form of the parent state."""
        return "%s" % (self.error_stateid)

    def __unicode__(self):
        """Return the unicode form of the parent state."""
        return u'%s' % (self.error_stateid)
Here is my View
def qcthisedit(request, get_id):
    """Show the error form for ``get_id`` and save it on a valid POST.

    A valid submission redirects to the dremel list; otherwise the
    ``qcthisedit.html`` template is rendered.
    """
    if request.method == "POST":
        form = forms.MMEditidErrorForm(get_id, request.POST)
        if form.is_valid():
            form.save()
            return http.HttpResponseRedirect('/mmqc/dremel_list/')
    else:
        form = forms.MMEditidErrorForm(get_id)
    # locals() exposes ``request``, ``get_id`` and ``form`` to the template,
    # so the local names here are part of the template contract.
    return shortcuts.render_to_response(
        'qcthisedit.html',
        locals(),
        context_instance=context.RequestContext(request),
    )
Here is my form
class MMEditidErrorForm(forms.ModelForm):
    """Model form for ``MMEditidErrors`` with the parent state id pre-filled."""

    def __init__(self, get_id, *args, **kwargs):
        """Create the form and seed ``error_stateid`` from the state ``get_id``.

        ``get_id`` is the primary key of the ``MMEditidState`` row; the
        remaining arguments are passed through to ``ModelForm``.
        """
        super(MMEditidErrorForm, self).__init__(*args, **kwargs)
        dremel = MMEditidState.objects.filter(pk=get_id).values('state_id')
        dremelid = int(dremel[0]['state_id'])
        # Replace the generated field with a read-only integer input showing
        # the parent state id.
        self.fields['error_stateid'] = forms.IntegerField(
            initial=dremelid,
            widget=forms.TextInput(attrs={'readonly': 'readonly'}),
        )

    # Restrict the free-text model columns to the predefined choice lists.
    feature_type = forms.TypedChoiceField(choices=formfields.FeatureType)
    error_type = forms.TypedChoiceField(choices=formfields.ErrorType)
    error_nature = forms.TypedChoiceField(choices=formfields.ErrorNature)
    error_details = forms.TypedChoiceField(choices=formfields.ErrorDetails)
    error_subtype = forms.TypedChoiceField(choices=formfields.ErrorSubType)

    class Meta:
        model = models.MMEditidErrors
        exclude = ('error_id', 'date')
When I submit the form I am getting the error
Cannot assign "1": "MMEditidErrors.error_stateid" must be a "MMEditidState" instance.
So I have added line
get_id = MMEditidState.objects.get(pk = get_id)
Now I am getting the below mentioned error
int() argument must be a string or a number, not 'MMEditidState'
in form = forms.MMEditidErrorForm(get_id, request.POST)
Can someone help on this
Thanks
Vikram
I solved this problem by simply using custom forms instead of model forms, and handled storing the data in the database myself in views.py.