I'm building a Flask app with an sqlite3 database and two routes, add and save.
I have a problem with validators: some of them work and some don't.
validators.DataRequired() works.
URLField() works.
But validators.Length(min=1, max=15) does not work at all.
from flask_wtf import FlaskForm  # I also tried with Form
from wtforms import BooleanField, StringField, IntegerField, validators, SubmitField
from wtforms.fields.html5 import URLField

class AddRecValidators(FlaskForm):  # <-- I also tried with Form
    title = StringField('Title:', [validators.DataRequired(), validators.Length(min=1, max=35, message="Title too long max 35 characters")])
    authors = StringField('Authors:', [validators.Length(min=1, max=100)])
    published_date = IntegerField('Published date:', [validators.Length(min=1, max=4)])
    isbn_or_identifier = StringField('ISBN:', [validators.Length(min=1, max=15)])
    page_count = IntegerField('Page count:', [validators.Length(min=1, max=10000)])
    language = StringField('Language:', [validators.Length(min=1, max=3)])
    image_links = URLField('Image links:')
    submit = SubmitField(label='Add to library')
It looks like you're using the wrong validators for the type of input you're validating.
validators.Length() is for strings, see here.
For the integer fields, try using NumberRange instead:
from flask_wtf import FlaskForm
from wtforms import BooleanField, StringField, IntegerField, validators, SubmitField
from wtforms.fields.html5 import URLField

class AddRecValidators(FlaskForm):
    title = StringField('Title:', [validators.DataRequired(), validators.Length(min=1, max=35, message="Title too long max 35 characters")])
    authors = StringField('Authors:', [validators.Length(min=1, max=100)])
    published_date = IntegerField('Published date:', [validators.NumberRange(min=1, max=4)])  # <-- note change to NumberRange
    isbn_or_identifier = StringField('ISBN:', [validators.Length(min=1, max=15)])
    page_count = IntegerField('Page count:', [validators.NumberRange(min=1, max=10000)])  # <-- note change to NumberRange
    language = StringField('Language:', [validators.Length(min=1, max=3)])
    image_links = URLField('Image links:')
    submit = SubmitField(label='Add to library')
Also, here are the docs for flask-wtforms validators.
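For completeness, here is a minimal sketch of how the form might be wired into one of the add/save routes (the route name, template, and secret key are assumptions, not part of the original question); any failing validator, including Length and NumberRange, shows up in form.errors:

from flask import Flask, redirect, render_template, url_for

app = Flask(__name__)
app.config['SECRET_KEY'] = 'change-me'  # assumed value; Flask-WTF needs a secret key for CSRF

@app.route('/add', methods=['GET', 'POST'])
def add():
    form = AddRecValidators()
    if form.validate_on_submit():  # runs every field's validators on POST
        # form.title.data, form.page_count.data, ... hold the validated values
        return redirect(url_for('add'))
    # on GET, or when any validator fails, re-render with form.errors populated
    return render_template('add.html', form=form)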
The cronjob:
*/30 * * * * /home/ubuntu/web-coin-crawler/venv/bin/python3 /home/ubuntu/web-coin-crawler/webcoincrawler/cron.crontab
cron.py defines a crontab() function that crawls two sites and saves the result to the Django ORM via save():
import json
import collections
import crawl_coinmarketcal as coinmarketcal
import crawl_coinscalendar as coinscalendar
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
import django
import datetime
django.setup()
from crawled_data.models import BlogData

def preprocessingDict(dic: dict):
    coin_dict = collections.defaultdict(dict)
    for key, value in dic.items():
        if value['symbol'] in coin_dict[value['date']]:
            coin_dict[value['date']][value['symbol']].append([key, value['title'], value['name']])
        else:
            coin_dict[value['date']][value['symbol']] = [[key, value['title'], value['name']]]
    return coin_dict

def crontab():
    result = dict()
    urls = coinmarketcal.get_urls()
    for url in urls:
        coinmarketcal.do_crawl(url, result)
    urls = coinscalendar.get_urls()
    for url in urls:
        coinscalendar.do_crawl(url, result)
    BlogData(title="COIN_DATA", content=json.dumps(preprocessingDict(result.copy()))).save()
I checked that the cron job runs every 30 minutes.
However, no data was stored through the Django ORM.
I solved it by changing the crontab entry to run the Python file itself:
cron.crontab -> cron
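For reference, the corrected entry would look something like the line below, assuming the module is cron.py and that it runs crontab() when executed directly (both details are inferred, not stated above):

*/30 * * * * /home/ubuntu/web-coin-crawler/venv/bin/python3 /home/ubuntu/web-coin-crawler/webcoincrawler/cron.py

with, at the bottom of cron.py, something like:

if __name__ == "__main__":
    # run the crawl when the file is invoked directly by cron
    crontab()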
I am trying to deploy a model to AKS. I am using the AML SDK to register the model in the AML workspace. I am using the PipelineModel module to save the model, and I am trying to load it with PipelineModel.load. My entry script looks like this:
import os
import json
import pandas as pd
from azureml.core.model import Model
from pyspark.ml import PipelineModel
from mmlspark import ComputeModelStatistics

def init():
    import mmlspark  # this is needed to load mmlspark libraries
    import logging
    # extract and load model
    global model, model_path
    model_path = Model.get_model_path("{model_name}")
    print(model_path)
    print(os.stat(model_path))
    print(os.path.exists(model_path))
    # model_path = os.path.join(os.getenv("AZUREML_MODEL_DIR"), "{model_name}")
    logging.basicConfig(level=logging.DEBUG)
    # print(model_path)
    # with ZipFile(model_path, 'r') as f:
    #     f.extractall('model')
    model = PipelineModel.load(model_path)
    # model = PipelineModel.read().load(model_path)

def run(input_json):
    try:
        output_df = model.transform(pd.read_json(input_json))
        evaluator = ComputeModelStatistics().setScoredLabelsCol("prediction").setLabelCol("label").setEvaluationMetric("AUC")
        result = evaluator.transform(predictions)
        auc = result.select("AUC").collect()[0][0]
        result = auc
    except Exception as e:
        result = str(e)
    return json.dumps({{"result": result}})
It gives an error like the one below:
org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/var/azureml-app/azureml-models/lightgbm.model/2/lightgbm.model/metadata
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
os.path.exists returns True for the path fetched from Model.get_model_path.
Am I missing something here?
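For what it's worth, the error complains about a missing metadata subdirectory rather than about model_path itself; a saved Spark PipelineModel is a directory containing metadata/ and stages/, and if those sit one level deeper than the path passed to PipelineModel.load, it fails with exactly this kind of error. A minimal diagnostic sketch (inspection only, not a fix from the original thread):

import os

def _log_model_tree(model_path):
    # Print every directory and file visible under the registered model path,
    # so you can see whether metadata/ and stages/ are directly under it
    # or nested inside another folder.
    for root, dirs, files in os.walk(model_path):
        print(root, dirs, files)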
I'm trying to download random public domain images from the Metropolitan Museum collection using their API (more info here: https://metmuseum.github.io/) and Python; unfortunately the images I get are empty. Here is a minimal example:
import urllib
from urllib2 import urlopen
import json
from random import randint

url = "https://collectionapi.metmuseum.org/public/collection/v1/objects"
objectID_list = json.loads(urlopen(url).read())['objectIDs']
objectID = objectID_list[randint(0, len(objectID_list) - 1)]
url_request = url + "/" + str(objectID)
fetched_data = json.loads(urlopen(url_request).read())

if fetched_data['isPublicDomain']:
    name = str(fetched_data['title'])
    ID = str(fetched_data['objectID'])
    url_image = str(fetched_data['primaryImage'])
    urllib.urlretrieve(url_image, 'path/' + name + '_' + ID + '.jpg')
If I print url_image and copy/paste it into a browser I get to the desired image, but the code retrieves a file of about 1 KB that can't be opened as an image.
Any idea what I'm doing wrong?
Your way of downloading is correct; however, it seems the domain validates request headers to prevent scraping (probably unintended, since they provide an API to pull images).
One way to solve this is to change your headers to something realistic, or to use fake_useragent together with requests.
import requests
from fake_useragent import UserAgent

def save_image(link, file_path):
    ua = UserAgent(verify_ssl=False)
    headers = {"User-Agent": ua.random}
    r = requests.get(link, stream=True, headers=headers)
    if r.status_code == 200:
        with open(file_path, 'wb') as f:
            f.write(r.content)
    else:
        raise Exception("Error code {}.".format(r.status_code))
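Plugged into the question's snippet, the call would look something like this (keeping the original 'path/' prefix and filename scheme; whether the object title makes a safe filename is a separate concern):

if fetched_data['isPublicDomain']:
    name = str(fetched_data['title'])
    ID = str(fetched_data['objectID'])
    url_image = str(fetched_data['primaryImage'])
    # download via requests with a realistic User-Agent instead of urllib.urlretrieve
    save_image(url_image, 'path/' + name + '_' + ID + '.jpg')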
What changes should I make to this code so that I don't have to save the image to disk in step [A] and then read it back from disk in step [B], as shown in the code? Can anyone help me with changes to the code or some tips?
import io
import os
import six
from google.cloud import vision
from google.cloud import translate
from google.cloud.vision import types
import json
from wand.image import Image

client = vision.ImageAnnotatorClient()

sample_pdf = Image(filename='CMB72_CMB0720160.pdf[0]', resolution=500)
blank = Image(filename='Untitled.png')
all_ = sample_pdf.clone()
polling_ = sample_pdf.clone()
voters = sample_pdf.clone()
all_.crop(3000, 2800, 3800, 3860)
polling_.crop(870, 4330, 2900, 4500)
voters.crop(1300, 4980, 2000, 5250)
blank.composite(all_, left=0, top=0)
blank.composite(voters, left=0, top=1100)
blank.composite(polling_, left=0, top=1420)

blank.save('CMB72_CMB0720122.jpg')            # <-- [A] save to disk
file_name = 'CMB72_CMB0720122.jpg'            # <-+
with io.open(file_name, 'rb') as image_file:  #   |- [B] read back from disk
    content = image_file.read()               # <-+

image = types.Image(content=content)
image_context = vision.types.ImageContext(language_hints=['hi'])
response = client.document_text_detection(image=image)
texts = response.text_annotations

file = open('jin.txt', 'w+', encoding='utf-8')
file.write(texts[0].description)
file.close()
Use the wand.image.Image.make_blob method.
content = blank.make_blob('JPEG')
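In the question's script, that means the disk round-trip in steps [A] and [B] collapses into a single in-memory call; a minimal sketch of the affected part (everything else stays as it is):

# build the composite image in memory, no blank.save()/io.open() round-trip
content = blank.make_blob('JPEG')  # replaces steps [A] and [B]
image = types.Image(content=content)
image_context = vision.types.ImageContext(language_hints=['hi'])
response = client.document_text_detection(image=image)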
I am learning Scrapy. Now I am just trying to scrape items, and when I call the spider:
planefinder]# scrapy crawl planefinder -o /User/spider/planefinder/pf.csv -t csv
it shows technical information and no scraped content (Crawled 0 pages ... etc.), and it returns an empty csv file.
The problem is that when I test the xpath in the scrapy shell it works:
>>> from scrapy.selector import Selector
>>> sel = Selector(response)
>>> flights = sel.xpath("//div[@class='col-md-12'][1]/div/div/table//tr")
>>> items = []
>>> for flt in flights:
...     item = flt.xpath("td[1]/a/@href").extract_first()
...     items.append(item)
...
>>> items
The following is my planeFinder.py code:
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector, HtmlXPathSelector
from planefinder.items import arr_flt_Item, dep_flt_Item

class planefinder(CrawlSpider):
    name = 'planefinder'
    host = 'https://planefinder.net'
    start_url = ['https://planefinder.net/data/airport/PEK/']

    def parse(self, response):
        arr_flights = response.xpath("//div[@class='col-md-12'][1]/div/div/table//tr")
        dep_flights = response.xpath("//div[@class='col-md-12'][2]/div/div/table//tr")
        for flight in arr_flights:
            arr_item = arr_flt_Item()
            arr_flt_url = flight.xpath('td[1]/a/@href').extract_first()
            arr_item['arr_flt_No'] = flight.xpath('td[1]/a/text()').extract_first()
            arr_item['STA'] = flight.xpath('td[2]/text()').extract_first()
            arr_item['From'] = flight.xpath('td[3]/a/text()').extract_first()
            arr_item['ETA'] = flight.xpath('td[4]/text()').extract_first()
            yield arr_item
Before reaching for CrawlSpider, please check the docs for Spiders; some of the issues I found were:
Instead of host, use allowed_domains
Instead of start_url, use start_urls
It seems that the page needs to have some cookies set, or maybe it's using some kind of basic anti-bot protection, and you need to land somewhere else first.
Try this (I've also changed a few things a bit):
# -*- coding: utf-8 -*-
from scrapy import Field, Item, Request
from scrapy.spiders import CrawlSpider, Spider

class ArrivalFlightItem(Item):
    arr_flt_no = Field()
    arr_sta = Field()
    arr_from = Field()
    arr_eta = Field()

class PlaneFinder(Spider):
    name = 'planefinder'
    allowed_domains = ['planefinder.net']
    start_urls = ['https://planefinder.net/data/airports']

    def parse(self, response):
        yield Request('https://planefinder.net/data/airport/PEK', callback=self.parse_flight)

    def parse_flight(self, response):
        flights_xpath = ('//*[contains(@class, "departure-board") and '
                         './preceding-sibling::h2[contains(., "Arrivals")]]'
                         '//tr[not(./th) and not(./td[@class="spacer"])]')
        for flight in response.xpath(flights_xpath):
            arrival = ArrivalFlightItem()
            arr_flt_url = flight.xpath('td[1]/a/@href').extract_first()
            arrival['arr_flt_no'] = flight.xpath('td[1]/a/text()').extract_first()
            arrival['arr_sta'] = flight.xpath('td[2]/text()').extract_first()
            arrival['arr_from'] = flight.xpath('td[3]/a/text()').extract_first()
            arrival['arr_eta'] = flight.xpath('td[4]/text()').extract_first()
            yield arrival
The problem here is not understanding correctly which "Spider" to use, as Scrapy offers several specialized ones.
The main one, and the one you should be using here, is the plain Spider rather than CrawlSpider, because CrawlSpider is meant for deeper, rule-driven crawls across forums, blogs, etc.
Just change the type of spider to:
from scrapy import Spider
class PlaneFinder(Spider):
    ...
Also check the value of ROBOTSTXT_OBEY in your settings.py file. By default it's set to True (but not when you run the shell). Set it to False if you want to disobey the robots.txt file.
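That is a one-line change in the project's settings.py:

# settings.py
# Let the spider fetch pages even when robots.txt would disallow them.
ROBOTSTXT_OBEY = False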