Is there any way to download .tar files from web to local folder fast? - download

I am trying to download a batch of files from the following website to my local folder. But the download is very slow. Average size of each file is in the range of 30 - 60 MB. Is there a better way to improve this code so that I can download them fast?
import requests
from os import mkdir
from os.path import isdir
from bs4 import BeautifulSoup
from os import chdir, getcwd
url = ""
years = [str(year) for year in range(2005,2021)]
links = [url + i + "/" for i in years]
Create an empty list to store the list of lists:
t_links = []
def get_tarlinks():
for i in links:
#create response object
r = requests.get(i)
#create beautiful object
soup = BeautifulSoup(r.content, 'html5lib')
#find all links on webpage
a_links = soup.find_all('a')
#filter the links ending with .tar
tar_links = [i + link['href'] for link in a_links if link['href'].endswith('.tar')]
return t_links
t_links = get_tarlinks()
src_path = "D:/Sandeep/Thesis/Data/"
Download files to local machine:
for i in t_links:
for j in i:
year,filename = j.split('/')[10:]
r = requests.get(j, allow_redirects=True)
if isdir(src_path+year) == False:
open(filename, "wb").write(r.content)
Note: Please check for the indentation when you copy this code to your IDE. Thanks!


How do I create a prefetch dataset from a folder of images?

I am trying to input a dataset from Kaggle into this notebook from the Tensorflow docs in order to train a CycleGAN model. My current approach is to download the folders into my notebook and loop through the paths of each image and use cv2.imread(path) to add the uint8 image data to a list. But this doesn't work and I know my current approach is wrong because the code provided by google requires a Prefetch dataset.
Here's my current code (excluding the opencv part)
import os
# specify the img directory path
art_path = "/content/abstract-art-gallery/Abstract_gallery/Abstract_gallery/"
land_path = "/content/landscape-pictures/"
def grab_path(folder, i_count=100):
    """Return the paths of up to ``i_count`` image files found in ``folder``.

    Args:
        folder: Directory to scan (not recursive). A trailing path separator
            is accepted but not required.
        i_count: Maximum number of paths to return; values <= 0 yield ``[]``.

    Returns:
        A list of paths (``folder`` joined with each file name), ordered by
        file name, for entries whose name ends with one of the image suffixes.

    Raises:
        OSError: If ``folder`` cannot be listed (propagated from
            ``os.listdir``).
    """
    # Suffix tuple kept exactly as in the original; matching is done on the
    # lower-cased name so that e.g. "PHOTO.JPG" is also picked up.
    image_suffixes = ('.jpg', '.png', 'jpeg')

    # List the directory once. The original indexed os.listdir(folder)[0] on
    # every iteration, so it only ever looked at the first entry.
    names = sorted(os.listdir(folder))
    res = [
        os.path.join(folder, name)
        for name in names
        if name.lower().endswith(image_suffixes)
    ]
    return res[:max(i_count, 0)]
art_path, land_path = grab_path(art_path), grab_path(land_path)
The error in the code comes here:
train_horses = train_horses.cache().map(
preprocess_image_train, num_parallel_calls=AUTOTUNE).shuffle(
Is there a simpler approach to this problem?
import pathlib
import tensorflow as tf
import numpy as np
def read_image(path):
image_string =
image = DataUtils.decode_image(image_string,(image_size))
return image
paths = np.array([x for x in pathlib.Path(IMAGE_PATHS_DIR).rglob('*.jpg')])
dataset =
dataset =
dataset = dataset.shuffle(2048)
dataset = dataset.prefetch(AUTOTUNE)

Downloaded images from the Metropolitan Museum collection are empty

I'm trying to download random public domain images from the Metropolitan Museum collection using their API and Python. Unfortunately, the images I get are empty. Here is a minimal example:
import urllib
from urllib2 import urlopen
import json
from random import randint
url = ""
objectID_list = json.loads(urlopen(url).read())['objectIDs']
objectID = objectID_list[randint(0,len(objectID_list)-1)]
url_request = url+"/"+str(objectID)
fetched_data = json.loads(urlopen(url_request).read())
if fetched_data['isPublicDomain']:
name = str(fetched_data['title'])
ID = str(fetched_data['objectID'])
url_image = str(fetched_data['primaryImage'])
urllib.urlretrieve(url_image, 'path/'+name+'_'+ID+'.jpg')
If I print url_image and copy/paste it into a browser, I get the desired image, but the code retrieves a file that weighs about 1 KB and can't be opened.
Any idea what I'm doing wrong ?
Your way of downloading is correct; however, it seems the domain validates request headers to prevent scraping (probably unintentionally, since they provide an API to pull images).
One way of solving this problem is by changing your headers to something realistic, or utilizing fake_useragent and requests.
import requests
from fake_useragent import UserAgent
def save_image(link, file_path):
ua = UserAgent(verify_ssl=False)
headers = {"User-Agent": ua.random}
r = requests.get(link, stream=True, headers=headers)
if r.status_code == 200:
with open(file_path, 'wb') as f:
raise Exception("Error code {}.".format(r.status_code))

how to read image from wand.image.Image without saving it to drive

What changes should I make to this code so that I don't have to save the image to disk in step [A] and then read it back from disk in step [B], as shown in the code? Can anyone help me with the code changes or some tips?
import io
import os
import six
from import vision
from import translate
from import types
import json
from wand.image import Image
client = vision.ImageAnnotatorClient()
sample_pdf = Image(filename='CMB72_CMB0720160.pdf[0]', resolution=500)
blank = Image(filename='Untitled.png')
all_ = sample_pdf.clone()
polling_ = sample_pdf.clone()
voters = sample_pdf.clone()
file_name = 'CMB72_CMB0720122.jpg'-------------|
with,'rb') as image_file:----|>[B]
content =|
image = types.Image(content= content)
image_context = vision.types.ImageContext(
response = client.document_text_detection(image=image)
texts = response.text_annotations
file = open('jin.txt','w+',encoding='utf-8')
Use the wand.image.Image.make_blob method.
content = blank.make_blob('JPEG')

Download Images from list of urls

I have a list of URLs in a text file. I want the images to be downloaded to a particular folder. How can I do this? Are there any add-ons available in Chrome, or any other program, to download images from a list of URLs?
Create a folder in your machine.
Place your text file of images URL in the folder.
cd to that folder.
Use wget -i images.txt
You will find all your downloaded files in the folder.
On Windows 10/11 this is fairly trivial using
for /F "eol=;" %f in (filelist.txt) do curl -O %f
Note the inclusion of eol=; allows us to mask individual exclusions by adding ; at the start of those lines in filelist.txt that we do not want this time. If using above in a batch file GetFileList.cmd then double those %%'s
Windows 7 has a FTP command, but that can often throw up a firewall dialog requiring a User Authorization response.
Currently running Windows 7 and wanting to download a list of URLs without downloading any wget.exe or other dependency like curl.exe (which would be simplest as the first command) the shortest compatible way is a power-shell command (not my favorite for speed, but if needs must.)
The file with URLs is filelist.txt and IWR is the PS near equivalent of wget.
The Security Protocol first command ensures we are using modern TLS1.2 protocol
-OutF ... split-path ... means the filenames will be the same as remote filenames but in CWD (current working directory), for scripting you can cd /d folder if necessary.
PS> [Net.ServicePointManager]::SecurityProtocol = "Tls12" ; GC filelist.txt | % {IWR $_ -OutF $(Split-Path $_ -Leaf)}
To run as a CMD use a slightly different set of quotes around 'Tls12'
PowerShell -C "& {[Net.ServicePointManager]::SecurityProtocol = 'Tls12' ; GC filelist.txt | % {IWR $_ -OutF $(Split-Path $_ -Leaf)}}"
This needs to be made into a function with error handling but it repeatedly downloads images for image classification projects
import requests
urls = pd.read_csv('cat_urls.csv') #save the url list as a dataframe
rows = []
for index, i in urls.iterrows():
counter = 0
for i in rows:
file_name = 'cat' + str(counter) + '.jpg'
response = requests.get(i)
file = open(file_name, "wb")
counter += 1
import os
import time
import sys
import urllib
from progressbar import ProgressBar
def get_raw_html(url):
version = (3,0)
curr_version = sys.version_info
if curr_version >= version: #If the Current Version of Python is 3.0 or above
import urllib.request #urllib library for Extracting web pages
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
request = urllib.request.Request(url, headers = headers)
resp = urllib.request.urlopen(request)
respData = str(
return respData
except Exception as e:
else: #If the Current Version of Python is 2.x
import urllib2
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
request = urllib2.Request(url, headers = headers)
response = urllib2.urlopen(request)
except URLError: # Handling SSL certificate failed
context = ssl._create_unverified_context()
response = urlopen(req,context=context)
#response = urllib2.urlopen(req)
raw_html =
return raw_html
return"Page Not found"
def next_link(s):
    """Extract the next link from the page text ``s``.

    The function looks for the ``rg_di`` marker, then for the ``"ou"`` key
    that follows the ``"class="rg_meta"`` marker, and returns the text
    between that key's opening quote and the following ``,"ow"`` key.

    Args:
        s: Page text to scan.

    Returns:
        A ``(link, end_index)`` tuple, where ``end_index`` is the offset of
        the ``,"ow"`` marker in ``s`` (callers slice the page from there to
        continue scanning). When no link can be extracted the sentinel
        ``("no_links", 0)`` is returned.
    """
    no_link = ("no_links", 0)

    if s.find('rg_di') == -1:  # If no links are found then give the sentinel
        return no_link

    start_line = s.find('"class="rg_meta"')
    start_content = s.find('"ou"', start_line + 1)
    if start_content == -1:
        # Without this guard the slice below would start from a bogus offset.
        return no_link

    end_content = s.find(',"ow"', start_content + 1)
    if end_content == -1:
        return no_link

    # +6 skips the six characters of `"ou":"`; -1 drops the closing quote.
    content_raw = str(s[start_content + 6:end_content - 1])
    return content_raw, end_content
def all_links(page):
    """Collect every link that ``next_link`` can extract from ``page``.

    Args:
        page: Page text to scan.

    Returns:
        A list of the extracted links, in the order they appear. The
        ``"no_links"`` sentinel returned by ``next_link`` ends the scan and
        is not included in the result.
    """
    links = []
    while True:
        link, end_content = next_link(page)
        if link == "no_links":
            # Sentinel: nothing more to extract, so stop scanning.
            break
        links.append(link)  # Append all the links in the list named 'links'
        # time.sleep(0.1)  # Timer could be used to slow down the request for image downloads
        # Continue scanning after the link that was just consumed.
        page = page[end_content:]
    return links
def download_images(links, search_keyword):
choice = input("Do you want to save the links? [y]/[n]: ")
if choice=='y' or choice=='Y':
#write all the links into a test file.
f = open('links.txt', 'a') #Open the text file called links.txt
for link in links:
f.close() #Close the file
num = input("Enter number of images to download (max 100): ")
counter = 1
search_keyword = search_keyword.replace("%20","_")
directory = search_keyword+'/'
if not os.path.isdir(directory):
pbar = ProgressBar()
for link in pbar(links):
if counter<=int(num):
file_extension = link.split(".")[-1]
filename = directory + str(counter) + "."+ file_extension
#print ("Downloading image: " + str(counter)+'/'+str(num))
urllib.request.urlretrieve(link, filename)
except IOError:
#print ("\nIOError on Image" + str(counter))
except urllib.error.HTTPError as e:
#print ("\nHTTPError on Image"+ str(counter))
except urllib.error.URLError as e:
#print ("\nURLError on Image" + str(counter))
return errors
def search():
version = (3,0)
curr_version = sys.version_info
if curr_version >= version: #If the Current Version of Python is 3.0 or above
import urllib.request #urllib library for Extracting web pages
import urllib2 #If current version of python is 2.x
search_keyword = input("Enter the search query: ")
#Download Image Links
links = []
search_keyword = search_keyword.replace(" ","%20")
url = '' + search_keyword+ '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
raw_html = (get_raw_html(url))
links = links + (all_links(raw_html))
print ("Total Image Links = "+str(len(links)))
print ("\n")
errors = download_images(links, search_keyword)
print ("Download Complete.\n"+ str(errors) +" errors while downloading.")
In this Python project I run an image search, which returns a list of URLs; I then save a number of them (chosen by the user) to a predefined folder. Check it out.
On Windows, install wget -
and add C:\Program Files (x86)\GnuWin32\bin to your environment path.
create a folder with a txt file of all the images you want to download.
in the location bar at the top of the file explorer type cmd
When the command prompt opens enter the following.
wget -i images.txt --no-check-certificate

How to read the latest image in a folder using python?

I have to read the latest image in a folder using python. How can I do this?
Another similar way, with some pragmatic (non-foolproof) image validation added:
import os
def get_latest_image(dirpath, valid_extensions=('jpg','jpeg','png')):
    """Get the latest image file in the given directory.

    Args:
        dirpath: Directory to scan (not recursive).
        valid_extensions: File extensions, without the leading dot, that
            count as images. Matching is case-insensitive.

    Returns:
        The path (``dirpath`` joined with the file name) of the matching
        regular file with the most recent modification time.

    Raises:
        ValueError: If the directory contains no file with a valid extension.
        OSError: If ``dirpath`` cannot be listed (propagated from
            ``os.listdir``).
    """
    # Accept a single extension passed as a bare string.
    if isinstance(valid_extensions, str):
        valid_extensions = (valid_extensions,)
    allowed = {ext.lower() for ext in valid_extensions}

    # get filepaths of all files and dirs in the given dir
    candidates = [os.path.join(dirpath, filename) for filename in os.listdir(dirpath)]
    # filter out directories, no-extension, and wrong extension files
    valid_files = [
        f for f in candidates
        if '.' in f
        and f.rsplit('.', 1)[-1].lower() in allowed
        and os.path.isfile(f)
    ]

    if not valid_files:
        raise ValueError("No valid images in %s" % dirpath)

    return max(valid_files, key=os.path.getmtime)
Walk over the filenames, get their modification time and keep track of the latest modification time you found:
import os
import glob
# Track the newest modification time seen so far (ts) and the file it
# belongs to (found). found stays None when the glob matches nothing.
ts = 0
found = None
for file_name in glob.glob('/path/to/your/interesting/directory/*'):
    fts = os.path.getmtime(file_name)
    # Keep this entry only when it is strictly newer than the current best.
    if fts > ts:
        ts = fts
        found = file_name
