Download images from a list of URLs

I have a list of URLs in a text file. I want the images to be downloaded to a particular folder. How can I do it? Is there any add-on available in Chrome, or any other program, to download images from a list of URLs?

Create a folder on your machine.
Place your text file of image URLs in the folder.
cd to that folder.
Use wget -i images.txt
You will find all your downloaded files in the folder.
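If wget is not an option, here is a minimal Python sketch of the same idea (the file name images.txt and the naming scheme for the output files are assumptions, not part of the answer above):

import os
import requests

with open('images.txt') as f:
    urls = [line.strip() for line in f if line.strip()]

for url in urls:
    # name each file after the last path component of its URL
    # (collisions are possible; adjust the naming to your needs)
    filename = os.path.basename(url.split('?')[0]) or 'image'
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    with open(filename, 'wb') as out:
        out.write(r.content)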

On Windows 10/11 this is fairly trivial using
for /F "eol=;" %f in (filelist.txt) do curl -O %f
Note that including eol=; lets us mask individual exclusions by adding ; at the start of any lines in filelist.txt that we do not want this time. If using the above in a batch file GetFileList.cmd, double the percent signs (use %%f instead of %f).
Windows 7 has an FTP command, but that can often throw up a firewall dialog requiring a user authorization response.
If you are currently running Windows 7 and want to download a list of URLs without downloading wget.exe or another dependency like curl.exe (which would be simplest, as in the first command), the shortest compatible way is a PowerShell command (not my favourite for speed, but if needs must).
The file with URLs is filelist.txt, and IWR is the PowerShell near-equivalent of wget.
The first command, which sets the security protocol, ensures we are using the modern TLS 1.2 protocol.
-OutF ... Split-Path ... means the filenames will be the same as the remote filenames, saved in the CWD (current working directory); for scripting you can cd /d folder first if necessary.
PS> [Net.ServicePointManager]::SecurityProtocol = "Tls12" ; GC filelist.txt | % {IWR $_ -OutF $(Split-Path $_ -Leaf)}
To run it from CMD, use a slightly different set of quotes around 'Tls12':
PowerShell -C "& {[Net.ServicePointManager]::SecurityProtocol = 'Tls12' ; GC filelist.txt | % {IWR $_ -OutF $(Split-Path $_ -Leaf)}}"

This still needs to be made into a function with error handling, but I use it repeatedly to download images for image classification projects:
import pandas as pd
import requests

urls = pd.read_csv('cat_urls.csv')  # load the URL list as a DataFrame
rows = []
for index, i in urls.iterrows():
    rows.append(i[-1])              # keep the last column of each row (the URL)

counter = 0
for i in rows:
    file_name = 'cat' + str(counter) + '.jpg'
    print(file_name)
    response = requests.get(i)
    file = open(file_name, "wb")
    file.write(response.content)
    file.close()
    counter += 1
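A minimal sketch of how the snippet above could be wrapped into a function with basic error handling. The function name download_from_csv, the timeout, and the column choice are my assumptions, carried over from the snippet rather than taken from the original answer:

import pandas as pd
import requests

def download_from_csv(csv_path, prefix='cat'):
    """Download every URL in the last column of csv_path to numbered JPEG files."""
    urls = pd.read_csv(csv_path)
    errors = 0
    for counter, (_, row) in enumerate(urls.iterrows()):
        url = row.iloc[-1]
        file_name = prefix + str(counter) + '.jpg'
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print('Skipping', url, '->', e)
            errors += 1
            continue
        with open(file_name, 'wb') as f:
            f.write(response.content)
    return errors

# Example: download_from_csv('cat_urls.csv')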

import os
import time
import sys
import urllib
from progressbar import ProgressBar

def get_raw_html(url):
    version = (3, 0)
    curr_version = sys.version_info
    if curr_version >= version:  # if the current version of Python is 3.0 or above
        import urllib.request    # urllib library for extracting web pages
        try:
            headers = {}
            headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
            request = urllib.request.Request(url, headers=headers)
            resp = urllib.request.urlopen(request)
            respData = str(resp.read())
            return respData
        except Exception as e:
            print(str(e))
    else:  # if the current version of Python is 2.x
        import urllib2
        import ssl
        try:
            headers = {}
            headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
            request = urllib2.Request(url, headers=headers)
            try:
                response = urllib2.urlopen(request)
            except urllib2.URLError:  # handle SSL certificate failures
                context = ssl._create_unverified_context()
                response = urllib2.urlopen(request, context=context)
            raw_html = response.read()
            return raw_html
        except Exception:
            return "Page Not found"

def next_link(s):
    # NOTE: this parsing depends on the markup of Google's image-results page
    # ('rg_di', 'rg_meta', '"ou"'), which changes over time.
    start_line = s.find('rg_di')
    if start_line == -1:  # if no links are found, return a sentinel value
        end_quote = 0
        link = "no_links"
        return link, end_quote
    else:
        start_line = s.find('"class="rg_meta"')
        start_content = s.find('"ou"', start_line + 1)
        end_content = s.find(',"ow"', start_content + 1)
        content_raw = str(s[start_content + 6:end_content - 1])
        return content_raw, end_content

def all_links(page):
    links = []
    while True:
        link, end_content = next_link(page)
        if link == "no_links":
            break
        else:
            links.append(link)  # append every extracted link to the list named 'links'
            # time.sleep(0.1)   # a timer could be used to slow down the image requests
            page = page[end_content:]
    return links

def download_images(links, search_keyword):
    import urllib.request  # needed for urlretrieve and the error classes below (Python 3)
    import urllib.error
    choice = input("Do you want to save the links? [y]/[n]: ")
    if choice == 'y' or choice == 'Y':
        # write all the links into a text file
        f = open('links.txt', 'a')  # open the text file called links.txt
        for link in links:
            f.write(str(link))
            f.write("\n")
        f.close()  # close the file
    num = input("Enter number of images to download (max 100): ")
    counter = 1
    errors = 0
    search_keyword = search_keyword.replace("%20", "_")
    directory = search_keyword + '/'
    if not os.path.isdir(directory):
        os.makedirs(directory)
    pbar = ProgressBar()
    for link in pbar(links):
        if counter <= int(num):
            file_extension = link.split(".")[-1]
            filename = directory + str(counter) + "." + file_extension
            # print("Downloading image: " + str(counter) + '/' + str(num))
            try:
                urllib.request.urlretrieve(link, filename)
            # HTTPError and URLError are subclasses of IOError, so catch them first
            except urllib.error.HTTPError:
                errors += 1
                # print("\nHTTPError on image " + str(counter))
            except urllib.error.URLError:
                errors += 1
                # print("\nURLError on image " + str(counter))
            except IOError:
                errors += 1
                # print("\nIOError on image " + str(counter))
            counter += 1
    return errors

def search():
    version = (3, 0)
    curr_version = sys.version_info
    if curr_version >= version:  # if the current version of Python is 3.0 or above
        import urllib.request    # urllib library for extracting web pages
    else:
        import urllib2           # if the current version of Python is 2.x
    search_keyword = input("Enter the search query: ")
    # download the image links
    links = []
    search_keyword = search_keyword.replace(" ", "%20")
    url = 'https://www.google.com/search?q=' + search_keyword + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
    raw_html = get_raw_html(url)
    links = links + all_links(raw_html)
    print("Total Image Links = " + str(len(links)))
    print("\n")
    errors = download_images(links, search_keyword)
    print("Download Complete.\n" + str(errors) + " errors while downloading.")

search()

In this Python project I run a search on unsplash.com, which gives me a list of URLs, and then I save a user-defined number of them to a pre-defined folder. Check it out.
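Since the project itself is only linked, here is a rough sketch of the idea rather than the project's actual code. The Unsplash API endpoint, parameters, and JSON field names are written from memory of the public API docs and should be verified; YOUR_ACCESS_KEY, the function name, and the file naming are placeholders:

import os
import requests

def save_unsplash_images(query, count, folder, access_key):
    """Search Unsplash and save `count` result images into `folder`."""
    os.makedirs(folder, exist_ok=True)
    # search endpoint as (assumed to be) documented by Unsplash; check the current docs
    resp = requests.get(
        'https://api.unsplash.com/search/photos',
        params={'query': query, 'per_page': count, 'client_id': access_key},
        timeout=30,
    )
    resp.raise_for_status()
    for i, photo in enumerate(resp.json().get('results', [])[:count]):
        image_url = photo['urls']['regular']   # assumed field name
        image = requests.get(image_url, timeout=30)
        image.raise_for_status()
        with open(os.path.join(folder, '%s_%d.jpg' % (query, i)), 'wb') as f:
            f.write(image.content)

# Example: save_unsplash_images('cats', 10, 'downloads', 'YOUR_ACCESS_KEY')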

On Windows, install wget from https://sourceforge.net/projects/gnuwin32/files/wget/1.11.4-1/
and add C:\Program Files (x86)\GnuWin32\bin to your environment PATH.
Create a folder containing a text file with all the image URLs you want to download.
In the location bar at the top of File Explorer, type cmd.
When the command prompt opens, enter the following:
wget -i images.txt --no-check-certificate

Related

Is there any way to download .tar files from web to local folder fast?

I am trying to download a batch of files from the following website to a local folder, but the download is very slow. The average size of each file is in the range of 30 to 60 MB. Is there a better way to improve this code so that the files download faster?
import requests
from os import mkdir, chdir, getcwd
from os.path import isdir
from bs4 import BeautifulSoup

url = "https://opendata.dwd.de/climate_environment/CDC/grids_germany/hourly/radolan/historical/asc/"
years = [str(year) for year in range(2005, 2021)]
links = [url + i + "/" for i in years]
Create an empty list to store the list of lists:
t_links = []

def get_tarlinks():
    for i in links:
        # create a response object
        r = requests.get(i)
        # create a BeautifulSoup object
        soup = BeautifulSoup(r.content, 'html5lib')
        # find all links on the page
        a_links = soup.find_all('a')
        # keep only the links ending with .tar
        tar_links = [i + link['href'] for link in a_links if link['href'].endswith('.tar')]
        t_links.append(tar_links)
    return t_links

t_links = get_tarlinks()
src_path = "D:/Sandeep/Thesis/Data/"
Download the files to the local machine:
for i in t_links:
    for j in i:
        year, filename = j.split('/')[10:]
        r = requests.get(j, allow_redirects=True)
        if isdir(src_path + year) == False:
            mkdir(src_path + year)
            chdir(src_path + year)
            open(filename, "wb").write(r.content)
        else:
            open(filename, "wb").write(r.content)
Note: Please check for the indentation when you copy this code to your IDE. Thanks!
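No answer was included here, but one standard way to speed up a batch like this is to stream each file to disk and download several files in parallel with a thread pool. This is only a sketch, not from the original thread; the target folder, worker count, and function names are assumptions:

import os
from concurrent.futures import ThreadPoolExecutor

import requests

src_path = "D:/Sandeep/Thesis/Data/"   # assumed target folder, as in the question

def download_one(url):
    year, filename = url.rstrip('/').split('/')[-2:]
    target_dir = os.path.join(src_path, year)
    os.makedirs(target_dir, exist_ok=True)
    # stream=True avoids holding an entire 30-60 MB file in memory
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(os.path.join(target_dir, filename), 'wb') as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    return filename

def download_all(url_lists, workers=8):
    flat = [u for sub in url_lists for u in sub]
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for name in pool.map(download_one, flat):
            print('done:', name)

# Example: download_all(t_links)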

How to hide the console window when I run tesseract with pytesseract with CREATE_NO_WINDOW

I am using tesseract to perform OCR on screengrabs. I have an app with a tkinter window that uses self.after in the initialization of my class to perform constant image scrapes and update label values, etc., in the tkinter window. I have searched for multiple days and can't find any specific examples of how to use CREATE_NO_WINDOW with Python 3.6 on Windows when calling tesseract with pytesseract.
This is related to this question:
How can I hide the console window when I run tesseract with pytesser
I have only been programming in Python for 2 weeks and don't understand how to perform the steps in the above question. I opened the pytesseract.py file, reviewed it, and found the proc = subprocess.Popen(command, stderr=subprocess.PIPE) line, but when I tried editing it I got a bunch of errors that I couldn't figure out.
#!/usr/bin/env python
'''
Python-tesseract. For more information: https://github.com/madmaze/pytesseract
'''

try:
    import Image
except ImportError:
    from PIL import Image
import os
import sys
import subprocess
import tempfile
import shlex

# CHANGE THIS IF TESSERACT IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY
tesseract_cmd = 'tesseract'

__all__ = ['image_to_string']

def run_tesseract(input_filename, output_filename_base, lang=None, boxes=False,
                  config=None):
    '''
    runs the command:
        `tesseract_cmd` `input_filename` `output_filename_base`
    returns the exit status of tesseract, as well as tesseract's stderr output
    '''
    command = [tesseract_cmd, input_filename, output_filename_base]
    if lang is not None:
        command += ['-l', lang]
    if boxes:
        command += ['batch.nochop', 'makebox']
    if config:
        command += shlex.split(config)
    proc = subprocess.Popen(command, stderr=subprocess.PIPE)
    status = proc.wait()
    error_string = proc.stderr.read()
    proc.stderr.close()
    return status, error_string

def cleanup(filename):
    ''' tries to remove the given filename. Ignores non-existent files '''
    try:
        os.remove(filename)
    except OSError:
        pass

def get_errors(error_string):
    '''
    returns all lines in the error_string that start with the string "error"
    '''
    error_string = error_string.decode('utf-8')
    lines = error_string.splitlines()
    error_lines = tuple(line for line in lines if line.find(u'Error') >= 0)
    if len(error_lines) > 0:
        return u'\n'.join(error_lines)
    else:
        return error_string.strip()

def tempnam():
    ''' returns a temporary file-name '''
    tmpfile = tempfile.NamedTemporaryFile(prefix="tess_")
    return tmpfile.name

class TesseractError(Exception):
    def __init__(self, status, message):
        self.status = status
        self.message = message
        self.args = (status, message)

def image_to_string(image, lang=None, boxes=False, config=None):
    '''
    Runs tesseract on the specified image. First, the image is written to disk,
    and then the tesseract command is run on the image. Tesseract's result is
    read, and the temporary files are erased.
    Also supports boxes and config:
        if boxes=True
            "batch.nochop makebox" gets added to the tesseract call
        if config is set, the config gets appended to the command.
            ex: config="-psm 6"
    '''
    if len(image.split()) == 4:
        # In case we have 4 channels, lets discard the Alpha.
        # Kind of a hack, should fix in the future some time.
        r, g, b, a = image.split()
        image = Image.merge("RGB", (r, g, b))
    input_file_name = '%s.bmp' % tempnam()
    output_file_name_base = tempnam()
    if not boxes:
        output_file_name = '%s.txt' % output_file_name_base
    else:
        output_file_name = '%s.box' % output_file_name_base
    try:
        image.save(input_file_name)
        status, error_string = run_tesseract(input_file_name,
                                             output_file_name_base,
                                             lang=lang,
                                             boxes=boxes,
                                             config=config)
        if status:
            errors = get_errors(error_string)
            raise TesseractError(status, errors)
        f = open(output_file_name, 'rb')
        try:
            return f.read().decode('utf-8').strip()
        finally:
            f.close()
    finally:
        cleanup(input_file_name)
        cleanup(output_file_name)

def main():
    if len(sys.argv) == 2:
        filename = sys.argv[1]
        try:
            image = Image.open(filename)
            if len(image.split()) == 4:
                # In case we have 4 channels, lets discard the Alpha.
                # Kind of a hack, should fix in the future some time.
                r, g, b, a = image.split()
                image = Image.merge("RGB", (r, g, b))
        except IOError:
            sys.stderr.write('ERROR: Could not open file "%s"\n' % filename)
            exit(1)
        print(image_to_string(image))
    elif len(sys.argv) == 4 and sys.argv[1] == '-l':
        lang = sys.argv[2]
        filename = sys.argv[3]
        try:
            image = Image.open(filename)
        except IOError:
            sys.stderr.write('ERROR: Could not open file "%s"\n' % filename)
            exit(1)
        print(image_to_string(image, lang=lang))
    else:
        sys.stderr.write('Usage: python pytesseract.py [-l lang] input_file\n')
        exit(2)

if __name__ == '__main__':
    main()
The code I am leveraging is similar to the example in the linked question:
def get_string(img_path):
    # Read image with opencv
    img = cv2.imread(img_path)
    # Convert to gray
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Apply dilation and erosion to remove some noise
    kernel = np.ones((1, 1), np.uint8)
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)
    # Write image after removed noise
    cv2.imwrite(src_path + "removed_noise.png", img)
    # Apply threshold to get image with only black and white
    # Write the image after apply opencv to do some ...
    cv2.imwrite(src_path + "thres.png", img)
    # Recognize text with tesseract for python
    result = pytesseract.image_to_string(Image.open(src_path + "thres.png"))
    return result
When it gets to the following line, there is a flash of a black console window for less than a second and then it closes when it runs the command.
result = pytesseract.image_to_string(Image.open(src_path + "thres.png"))
Here is the picture of the console window (screenshot omitted; it shows a briefly flashing "Program Files (x86)_Tesseract" console window).
Here is what is suggested in the other question:
"You're currently working in IDLE, in which case I don't think it really matters if a console window pops up. If you're planning to develop a GUI app with this library, then you'll need to modify the subprocess.Popen call in pytesser.py to hide the console. I'd first try the CREATE_NO_WINDOW process creation flag." – eryksun
I would greatly appreciate any help for how to modify the subprocess.Popen call in the pytesseract.py library file using CREATE_NO_WINDOW. I am also not sure of the difference between pytesseract.py and pytesser.py library files. I would leave a comment on the other question to ask for clarification but I can't until I have more reputation on this site.
I did more research and decided to learn more about subprocess.Popen:
Documentation for subprocess
I also referenced the following articles:
using python subprocess.popen..can't prevent exe stopped working prompt
I changed the original line of code in pytesseract.py:
proc = subprocess.Popen(command, stderr=subprocess.PIPE)
to the following:
proc = subprocess.Popen(command, stderr=subprocess.PIPE, creationflags = CREATE_NO_WINDOW)
I ran the code and got the following error:
Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\Steve\AppData\Local\Programs\Python\Python36-32\lib\tkinter\__init__.py", line 1699, in __call__
    return self.func(*args)
  File "C:\Users\Steve\Documents\Stocks\QuickOrder\QuickOrderGUI.py", line 403, in gather_data
    update_cash_button()
  File "C:\Users\Steve\Documents\Stocks\QuickOrder\QuickOrderGUI.py", line 208, in update_cash_button
    currentCash = get_string(src_path + "cash.png")
  File "C:\Users\Steve\Documents\Stocks\QuickOrder\QuickOrderGUI.py", line 150, in get_string
    result = pytesseract.image_to_string(Image.open(src_path + "thres.png"))
  File "C:\Users\Steve\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pytesseract\pytesseract.py", line 125, in image_to_string
    config=config)
  File "C:\Users\Steve\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pytesseract\pytesseract.py", line 49, in run_tesseract
    proc = subprocess.Popen(command, stderr=subprocess.PIPE, creationflags = CREATE_NO_WINDOW)
NameError: name 'CREATE_NO_WINDOW' is not defined
I then defined the CREATE_NO_WINDOW variable:
#Assignment of the value of CREATE_NO_WINDOW
CREATE_NO_WINDOW = 0x08000000
I got the value of 0x08000000 from the above linked article. After adding the definition I ran the application and I didn't get any more console window popups.
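For reference, newer Python versions expose this flag directly: since Python 3.7, subprocess defines CREATE_NO_WINDOW (the same 0x08000000 value), so the hand-defined constant is only needed on 3.6 and earlier. A small sketch of the relevant pattern follows; the run_quiet wrapper name and the non-Windows guard are my additions, not part of pytesseract:

import subprocess
import sys

# On Python 3.7+ the flag is available as subprocess.CREATE_NO_WINDOW;
# on 3.6 and earlier, fall back to defining it manually as in the answer above.
CREATE_NO_WINDOW = getattr(subprocess, 'CREATE_NO_WINDOW', 0x08000000)

def run_quiet(command):
    """Run a command without flashing a console window on Windows."""
    flags = CREATE_NO_WINDOW if sys.platform == 'win32' else 0
    proc = subprocess.Popen(command, stderr=subprocess.PIPE, creationflags=flags)
    status = proc.wait()
    error_string = proc.stderr.read()
    proc.stderr.close()
    return status, error_string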

How to read the latest image in a folder using python?

I have to read the latest image in a folder using Python. How can I do this?
Another similar way, with some pragmatic (non-foolproof) image validation added:
import os

def get_latest_image(dirpath, valid_extensions=('jpg', 'jpeg', 'png')):
    """
    Get the latest image file in the given directory
    """
    # get filepaths of all files and dirs in the given dir
    valid_files = [os.path.join(dirpath, filename) for filename in os.listdir(dirpath)]
    # filter out directories, no-extension, and wrong extension files
    valid_files = [f for f in valid_files if '.' in f and
                   f.rsplit('.', 1)[-1] in valid_extensions and os.path.isfile(f)]
    if not valid_files:
        raise ValueError("No valid images in %s" % dirpath)
    return max(valid_files, key=os.path.getmtime)
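Example usage of the function above (the directory path is only an illustration):

latest = get_latest_image('/path/to/your/interesting/directory')
print(latest)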
Walk over the filenames, get their modification time and keep track of the latest modification time you found:
import os
import glob

ts = 0
found = None
for file_name in glob.glob('/path/to/your/interesting/directory/*'):
    fts = os.path.getmtime(file_name)
    if fts > ts:
        ts = fts
        found = file_name
print(found)

'File path' use causing program exit in Python 3

I have downloaded a set of HTML files and saved the paths I saved them to in a .txt file, with each path on a new line. I want to look at the first file in the list and then iterate through the whole list, opening the files and extracting data before going on to the next file.
My code works fine with a single path put in directly (for the first file) as:
path = r'C:\path\to\file.html'
and works if I iterate through the text file using:
file_list_fp = r'C:\path\to\file_with_pathlist.txt'
with open(file_list_fp, 'r') as file_list:
    for filepath in file_list:
        pathend = filepath.find('\n')
        path = filepath[:pathend]
        q = open(path, 'r').read()
but it fails when I try getting a single path using either:
with open(file_list_fp, 'r') as file_list:
    path_n = file_list.readline()
    end = path_n.find('\n')
    path_bad1 = path_n[:end]
or:
with open(file_list_fp, 'r') as file_list:
    path_bad2 = file_list.readline().split('\n')[0]
With these two my code exits just after that point, and I can't figure out why. Any pointers very welcome. (I'm using Python 3.3.1 on Windows.)

CVSNT: from a tag to a list of file names and revisions

I have a project with sources under the control of CVSNT.
I need a list of source file names and revisions belonging to a certain tag.
For example:
the tag MYTAG is:
myproject/main.cpp 1.5.2.3
myproject/myclass.h 1.5.2.1
I know that with cvs log -rMYTAG > log.txt I get all the information I need in log.txt, and I can then filter it to build my list, but is there any utility that already does what I need?
Here's a Python script that does this:
import sys, os, os.path
import re, string

def runCvs(args):
    f_in, f_out, f_err = os.popen3('cvs ' + string.join(args))
    out = f_out.read()
    err = f_err.read()
    f_out.close()
    f_err.close()
    code = f_in.close()
    if not code: code = 0
    return code, out, err

class RevDumper:
    def parseFile(self, rex, filelog):
        m = rex.search(filelog)
        if m:
            print '%s\t%s' % (m.group(1), m.group(2))

    def filterOutput(self, logoutput, repoprefix):
        rex = re.compile('^={77}$', re.MULTILINE)
        files = rex.split(logoutput)
        rex = re.compile('RCS file: %s(.*),v[^=]+selected revisions: [^0][^=]+revision ([0-9\.]+)' % repoprefix, re.MULTILINE)
        for file in files:
            self.parseFile(rex, file)

    def getInfo(self, tag, module, repoprefix):
        args = ['-Q', '-z9', 'rlog', '-S', '-N', '-r'+tag, module]  # remove the -S if you're using an older version of CVS
        code, out, err = runCvs(args)
        if code == 0:
            self.filterOutput(out, repoprefix)
        else:
            sys.stderr.write('CVS returned %d\n%s\n' % (code, err))

if len(sys.argv) > 2:
    tag = sys.argv[1]
    module = sys.argv[2]
    if len(sys.argv) > 3:
        repoprefix = sys.argv[3]
    else:
        repoprefix = ''
    RevDumper().getInfo(tag, module, repoprefix)
else:
    sys.stderr.write('Syntax: %s TAG MODULE [REPOPREFIX]' % os.path.basename(sys.argv[0]))
Note that you either have to have a CVSROOT environment variable set or run this from inside a working copy checked out from the repository you want to query.
Also, the file names displayed are based on the "RCS File" property of the rlog output, i.e. they still contain the repository prefix. If you want to filter that out you can specify a third argument, e.g. when your CVSROOT is something like sspi:server:/cvsrepo then you would call this like:
ListCvsTagRevisions.py MyTag MyModule /cvsrepo/
Hope this helps.
Note: If you need a script that lists the revisions currently in your working copy, see the edit history of this answer.
