How to speed up resample with for loop?

I want to get the H1 RSI value for each M15 candle, and this is how I currently do it. However, with more than 500,000 rows it is very time consuming. Is there a better way? Note that resampling at every row is required to get the correct result.
import talib
import pandas as pd
import numpy as np

def Data(df):
    df['RSI1'] = talib.RSI(df['close'], timeperiod=13)
    df['RSI2'] = talib.RSI(df['close'], timeperiod=21)
    return df

# len(df) > 555555
df = pd.read_csv('m15_candle.csv')
for i in range(0, len(df)):
    t = df.at[i, 'time']
    if t.hour == 0 and t.minute == 0:
        df = df[i:]
        break
df = df.set_index('time')

ohlc = {
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last'
}

rsi1 = [0] * len(df)
rsi2 = [0] * len(df)
for i in range(100000, len(df)):
    h1 = Data(df[:i].resample("1h", offset=0).apply(ohlc).dropna())
    rsi1[i] = h1.iloc[-1]['RSI1']
    rsi2[i] = h1.iloc[-1]['RSI2']
df['RSI1_h1'] = rsi1
df['RSI2_h1'] = rsi2
df = df.reset_index()
df.to_csv("data.csv", index=False)
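One possible direction (not from the original post, and it changes the semantics slightly): resample to H1 once, compute the RSI once, and map the result back to the M15 index instead of re-resampling the whole history on every row. The sketch below takes, for each M15 row, the RSI of the last completed hourly candle (via shift(1) plus a forward-fill reindex); the original loop also folds the partially formed current hour into the last candle, so values will differ for rows inside an open hour.
# Hedged sketch: one resample and one RSI pass, then align back to the M15 index.
# Assumes df still has its DatetimeIndex ('time') and reuses the ohlc dict above.
h1 = df.resample("1h").apply(ohlc).dropna()
h1 = Data(h1)  # adds RSI1 / RSI2 on the hourly frame
df["RSI1_h1"] = h1["RSI1"].shift(1).reindex(df.index, method="ffill")
df["RSI2_h1"] = h1["RSI2"].shift(1).reindex(df.index, method="ffill")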

Related

applyInPandas() aggregation runs slowly on big delta table

I'm trying to create a gold table notebook in Databricks; however, it would take 9 days to fully reprocess the historical data (43 GB, 35k parquet files). I tried scaling up the cluster, but throughput doesn't go above 5,000 records/second. The bottleneck seems to be the applyInPandas() function. I'm wondering if I could replace pandas with anything else to make the gold notebook execute faster.
The silver table has 60 columns (read_id, reader_id, tracker_timestamp, event_type, ebook_id, page_id, agent_ip, agent_device_type, ...). Each row of data represents a read event of an ebook, e.g. 'page turn', 'click on image', 'click on link', ... All of the events that occurred in a single session have the same read_id. In the gold table I'm trying to group those events into sessions and calculate the number of times each event has occurred in the single session. So instead of 100+ rows of data for a read session in the silver table, I would end up with just a single aggregated row in the gold table.
Input is the silver delta table:
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
from pyspark.sql.functions import pandas_udf

input = (spark
    .readStream
    .format("delta")
    .option("withEventTimeOrder", "true")
    .option("maxFilesPerTrigger", 100)
    .load(f"path_to_silver_bucket")
)
I use the withWatermark and session_window functions to ensure I end up grouping all of the events from a single read session (a read session automatically ends 30 minutes after the last reader activity).
group = input.withWatermark("tracker_timestamp", "10 minutes").groupBy("read_id", F.session_window(input.tracker_timestamp, "30 minutes"))
In the next step I use the applyInPandas function like so:
sessions = group.applyInPandas(processing_function, schema=processing_function_output_schema)
Definition of the processing_function used in applyInPandas:
def processing_function(df):
    surf_time_ms = df.query('event_type == "surf"')['duration'].sum()
    immerse_time_ms = df.query('event_type == "immersion"')['duration'].sum()
    min_timestamp = df['tracker_timestamp'].min()
    max_timestamp = df['tracker_timestamp'].max()
    shares = len(df.query('event_type == "share"'))
    leads = len(df.query('event_type == "lead_store"'))
    is_read = len(df.query('event_type == "surf"')) > 0
    distinct_pages = df['page_id'].nunique()
    data = {
        "read_id": df['read_id'].values[0],
        "surf_time_ms": surf_time_ms,
        "immerse_time_ms": immerse_time_ms,
        "min_timestamp": min_timestamp,
        "max_timestamp": max_timestamp,
        "shares": shares,
        "leads": leads,
        "is_read": is_read,
        "number_of_events": len(df),
        "distinct_pages": distinct_pages
    }
    for field in not_calculated_string_fields:
        data[field] = df[field].values[0]
    new_df = pd.DataFrame(data=data, index=['read_id'])
    for x in all_events:
        new_df[f"count_{x}"] = df.query(f"type == '{x}'").count()
    for x in duration_events:
        duration = df.query(f"event_type == '{x}'")['duration']
        duration_sum = duration.sum()
        new_df[f"duration_{x}_ms"] = duration_sum
        if duration_sum > 0:
            new_df[f"mean_duration_{x}_ms"] = duration.mean()
        else:
            new_df[f"mean_duration_{x}_ms"] = 0
    return new_df
And finally, I'm writing the calculated row to the gold table like so:
for_partitioning = (sessions
    .withColumn("tenant", F.col("story_tenant"))
    .withColumn("year", F.year(F.col("min_timestamp")))
    .withColumn("month", F.month(F.col("min_timestamp"))))
checkpoint_path = "checkpoint-path"
gold_path = f"gold-bucket"
(for_partitioning
    .writeStream
    .format('delta')
    .partitionBy('year', 'month', 'tenant')
    .option("mergeSchema", "true")
    .option("checkpointLocation", checkpoint_path)
    .outputMode("append")
    .start(gold_path))
Can anybody think of a more efficient way to do a UDF in PySpark than applyInPandas for the above example? I simply cannot afford to wait 9 days to reprocess 43GB of data...
I've tried playing around with different input and output options (e.g. .option("maxFilesPerTrigger", 100)) but the real problem seems to be applyInPandas.
You could rewrite your processing_function into native Spark if you really wanted.
"read_id": df['read_id'].values[0]
F.first('read_id').alias('read_id')
"surf_time_ms": df.query('event_type == "surf"')['duration'].sum()
F.sum(F.when(F.col('event_type') == 'surf', F.col('duration'))).alias('surf_time_ms')
"immerse_time_ms": df.query('event_type == "immersion"')['duration'].sum()
F.sum(F.when(F.col('event_type') == 'immersion', F.col('duration'))).alias('immerse_time_ms')
"min_timestamp": df['tracker_timestamp'].min()
F.min('tracker_timestamp').alias('min_timestamp')
"max_timestamp": df['tracker_timestamp'].max()
F.max('tracker_timestamp').alias('max_timestamp')
"shares": len(df.query('event_type == "share"'))
F.count(F.when(F.col('event_type') == 'share', F.lit(1))).alias('shares')
"leads": len(df.query('event_type == "lead_store"'))
F.count(F.when(F.col('event_type') == 'lead_store', F.lit(1))).alias('leads')
"is_read": len(df.query('event_type == "surf"')) > 0
(F.count(F.when(F.col('event_type') == 'surf', F.lit(1))) > 0).alias('is_read')
"number_of_events": len(df)
F.count(F.lit(1)).alias('number_of_events')
"distinct_pages": df['page_id'].nunique()
F.countDistinct('page_id').alias('distinct_pages')
for field in not_calculated_string_fields:
    data[field] = df[field].values[0]
*[F.first(field).alias(field) for field in not_calculated_string_fields]
for x in all_events:
    new_df[f"count_{x}"] = df.query(f"type == '{x}'").count()
The above can probably be skipped? As far as my tests go, new columns get NaN values, because .count() returns a Series object instead of one simple value.
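(Side note, my addition: DataFrame.count() returns a per-column Series, which is why those columns come out as NaN. A scalar row count would be, for example, the following, assuming the type in the query was meant to be event_type:)
len(df.query(f"event_type == '{x}'"))           # scalar number of matching rows
# or: df.query(f"event_type == '{x}'").shape[0]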
for x in duration_events:
    duration = df.query(f"event_type == '{x}'")['duration']
    duration_sum = duration.sum()
    new_df[f"duration_{x}_ms"] = duration_sum
    if duration_sum > 0:
        new_df[f"mean_duration_{x}_ms"] = duration.mean()
    else:
        new_df[f"mean_duration_{x}_ms"] = 0
*[F.sum(F.when(F.col('event_type') == x, F.col('duration'))).alias(f"duration_{x}_ms") for x in duration_events]
*[F.mean(F.when(F.col('event_type') == x, F.col('duration'))).alias(f"mean_duration_{x}_ms") for x in duration_events]
So, instead of
def processing_function(df):
    ...
    ...
sessions = group.applyInPandas(processing_function, schema=processing_function_output_schema)
you could use efficient native Spark:
sessions = group.agg(
    F.first('read_id').alias('read_id'),
    F.sum(F.when(F.col('event_type') == 'surf', F.col('duration'))).alias('surf_time_ms'),
    F.sum(F.when(F.col('event_type') == 'immersion', F.col('duration'))).alias('immerse_time_ms'),
    F.min('tracker_timestamp').alias('min_timestamp'),
    F.max('tracker_timestamp').alias('max_timestamp'),
    F.count(F.when(F.col('event_type') == 'share', F.lit(1))).alias('shares'),
    F.count(F.when(F.col('event_type') == 'lead_store', F.lit(1))).alias('leads'),
    (F.count(F.when(F.col('event_type') == 'surf', F.lit(1))) > 0).alias('is_read'),
    F.count(F.lit(1)).alias('number_of_events'),
    F.countDistinct('page_id').alias('distinct_pages'),
    *[F.first(field).alias(field) for field in not_calculated_string_fields],
    # skipped count_{x}
    *[F.sum(F.when(F.col('event_type') == x, F.col('duration'))).alias(f"duration_{x}_ms") for x in duration_events],
    *[F.mean(F.when(F.col('event_type') == x, F.col('duration'))).alias(f"mean_duration_{x}_ms") for x in duration_events],
)
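If the skipped count_{x} columns are still needed, they translate the same way as the duration columns. This extra line (my addition, assuming the type in the original pandas query was meant to be event_type) would go inside the agg(...) call in place of the comment:
*[F.count(F.when(F.col('event_type') == x, F.lit(1))).alias(f"count_{x}") for x in all_events],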

Understanding the distance metric in company name matching using KNN

I am trying to understand the following code, which I found for matching a messy list of company names against a clean list of company names. My question is what the 'Ratio' metric is calculated from. It appears that the ratio comes from scorer = fuzz.token_sort_ratio, which I understand is part of the fuzzywuzzy package and is therefore a Levenshtein-based calculation, correct? I'm trying to understand why the author uses this as the scorer rather than the distance output from KNN. When I try changing the metric inside NearestNeighbors, it doesn't appear to change the results. Does the metric in NearestNeighbors matter, then?
Original article:
https://audhiaprilliant.medium.com/fuzzy-string-matching-optimization-using-tf-idf-and-knn-b07fce69b58f
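For reference (my illustration, not from the article): fuzz.token_sort_ratio tokenizes both strings, sorts the tokens, and then computes a Levenshtein-based similarity scaled to 0-100, so word order is ignored.
from fuzzywuzzy import fuzz
fuzz.token_sort_ratio("Acme Corp Ltd", "Ltd Acme Corp")   # 100: same tokens, different order
fuzz.ratio("Acme Corp Ltd", "Ltd Acme Corp")              # lower, order-sensitive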
def build_vectorizer(
    clean: pd.Series,
    analyzer: str = 'char',
    ngram_range: Tuple[int, int] = (1, 4),
    n_neighbors: int = 1,
    **kwargs
) -> Tuple:
    # Create vectorizer
    vectorizer = TfidfVectorizer(analyzer = analyzer, ngram_range = ngram_range, **kwargs)
    X = vectorizer.fit_transform(clean.values.astype('U'))
    # Fit nearest neighbors corpus
    nbrs = NearestNeighbors(n_neighbors = n_neighbors, metric = 'cosine').fit(X)
    return vectorizer, nbrs

# String matching - KNN
def tfidf_nn(
    messy,
    clean,
    n_neighbors = 1,
    **kwargs
):
    # Fit clean data and transform messy data
    vectorizer, nbrs = build_vectorizer(clean, n_neighbors = n_neighbors, **kwargs)
    input_vec = vectorizer.transform(messy)
    # Determine best possible matches
    distances, indices = nbrs.kneighbors(input_vec, n_neighbors = n_neighbors)
    nearest_values = np.array(clean)[indices]
    return nearest_values, distances

# String matching - match fuzzy
def find_matches_fuzzy(
    row,
    match_candidates,
    limit = 5
):
    row_matches = process.extract(
        row, dict(enumerate(match_candidates)),
        scorer = fuzz.token_sort_ratio,
        limit = limit
    )
    result = [(row, match[0], match[1]) for match in row_matches]
    return result

# String matching - TF-IDF
def fuzzy_nn_match(
    messy,
    clean,
    column,
    col,
    n_neighbors = 100,
    limit = 5, **kwargs):
    nearest_values, _ = tfidf_nn(messy, clean, n_neighbors, **kwargs)
    results = [find_matches_fuzzy(row, nearest_values[i], limit) for i, row in enumerate(messy)]
    df = pd.DataFrame(itertools.chain.from_iterable(results),
                      columns = [column, col, 'Ratio']
                      )
    return df

# String matching - Fuzzy
def fuzzy_tf_idf(
    df: pd.DataFrame,
    column: str,
    clean: pd.Series,
    mapping_df: pd.DataFrame,
    col: str,
    analyzer: str = 'char',
    ngram_range: Tuple[int, int] = (1, 3)
) -> pd.Series:
    # Create vectorizer
    clean = clean.drop_duplicates().reset_index(drop = True)
    messy_prep = df[column].drop_duplicates().dropna().reset_index(drop = True).astype(str)
    messy = messy_prep.apply(preprocess_string)
    result = fuzzy_nn_match(messy = messy, clean = clean, column = column, col = col, n_neighbors = 1)
    # Map value from messy to clean
    return result
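One possible explanation for the metric appearing not to matter (my reading, not part of the original code): TfidfVectorizer L2-normalizes each row by default, and for unit vectors Euclidean distance is a monotonic function of cosine distance, so the single nearest neighbour is the same either way:
||u - v||^2 = ||u||^2 + ||v||^2 - 2 u.v = 2 (1 - cos(u, v))   when ||u|| = ||v|| = 1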

How to add symbols to multiple stock data

I have scraped data; below is my code. Now I want to add a column of symbols to the respective company data. Please guide me on how the symbol can be added to each respective firm's data.
Code below:
from time import sleep
import pandas as pd
import os
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
browser = webdriver.Chrome(ChromeDriverManager().install())
symbols =['FATIMA',
'SSGC',
'FCCL',
'ISL',
'KEL',
'NCL',
'DGKC',
'SNGP',
'NML',
'ENGRO',
'HUMNL',
'CHCC',
'ATRL',
'HUBC',
'ASTL',
'PIBTL',
'OGDC',
'EFERT',
'FFC',
'NCPL',
'KTML',
'PSO',
'LUCK',
'SEARL',
'KOHC',
'ABOT',
'AICL',
'HASCOL',
'PTC',
'KAPCO',
'PIOC',
'POL',
'SHEL',
'GHGL',
'HCAR',
'DCR',
'BWCL',
'MTL',
'GLAXO',
'PKGS',
'SHFA','MARI',
'ICI',
'ACPL',
'PSMC',
'SPWL',
'THALL',
'BNWM',
'EFUG',
'GADT',
'AABS']
company = 1
for ThisSymbol in symbols:
    # Get first symbol from the above python list
    company = 2
    # In the URL, make symbol as variable
    url = 'http://www.scstrade.com/stockscreening/SS_CompanySnapShotYF.aspx?symbol=' + ThisSymbol
    browser.get(url)
    sleep(2)
    # The below command will get all the contents from the url
    html = browser.execute_script("return document.documentElement.outerHTML")
    # So we will supply the contents to beautiful soup and we tell to consider this text as a html, with the following command
    soup = BeautifulSoup(html, "html.parser")
    for rn in range(0, 9):
        plist = []
        r = soup.find_all('tr')[rn]
        # Condition: if first row, then th, otherwise td
        if (rn == 0):
            celltag = 'th'
        else:
            celltag = 'td'
        # Now use the celltag instead of using fixed td or th
        col = r.find_all(celltag)
        print()
        if col[i] == 0:
            print("")
        else:
            for i in range(0, 4):
                cell = col[i].text
                clean = cell.replace('\xa0 ', '')
                clean = clean.replace(' ', '')
                plist.append(clean)
        # If first row, create df, otherwise add to it
        if (rn == 0):
            df = pd.DataFrame(plist)
        else:
            df2 = pd.DataFrame(plist)
            colname = 'y' + str(2019 - rn)
            df[colname] = df2
    if (company == 1):
        dft = df.T
        # Get header Column
        head = dft.iloc[0]
        # Exclude first row from the data
        dft = dft[1:]
        dft.columns = head
        dft = dft.reset_index()
        # Assign Headers
        dft = dft.drop(['index'], axis='columns')
    else:
        dft2 = df.T
        # Get header Column
        head = dft2.iloc[0]
        # Exclude first row from the data
        dft2 = dft2[1:]
        dft2.columns = head
        dft2 = dft2.reset_index()
        # Assign Headers
        dft2 = dft2.drop(['index'], axis='columns')
        dft['Symbol'] = ThisSymbol
        dft = dft.append(dft2, sort=['Year', 'Symbol'])
    company = company + 1
dft
My output looks like this; I want to have a Symbol column for each respective firm's data. I have added the Symbol column with
dft['Symbol'] = ThisSymbol
but it adds just the first company from the list to all companies' data.
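A hedged sketch of one way to fix this (untested, reusing dft and dft2 from the code above): set the Symbol column on each per-company frame before it is combined, so every block keeps its own ticker.
if company == 1:
    ...                                  # transpose / header handling as above
    dft['Symbol'] = ThisSymbol           # tag the first company's rows
else:
    ...                                  # transpose / header handling as above
    dft2['Symbol'] = ThisSymbol          # tag this company's rows before appending
    dft = pd.concat([dft, dft2], ignore_index=True)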

OCR only image on one side

I was wondering if there's a way to OCR only the document on the right (ignoring the left) without having to split the images in Photoshop or any other image editor.
The problem is that sometimes there is text on the images, and it pollutes my results, as I only need to read the right-hand side.
## PREPROCESSING (load and read images to OCR and transform them into a DataFrame)
import pytesseract as tess
from tesserocr import PyTessBaseAPI, RIL
import os
from PIL import Image
import pandas as pd
import re
import tesserocr

path = "/Users/oliviervandhuynslager/PycharmProjects/Design Centre/assets/img/"  ## path to directory (folder) where the images are located
count = 0
fileName = []  # create empty list that will contain the original filenames
fullText = []  # create empty list to store the OCR results per file

for imageName in os.listdir(path):
    count = count + 1
    fileName.append(imageName)
    # fileName.sort()  # generate list from texts.

with PyTessBaseAPI(lang='eng') as api:
    for imageName in os.listdir(path):
        inputPath = os.path.join(path, imageName)
        api.SetImageFile(inputPath)
        text = api.GetUTF8Text()
        print(api.AllWordConfidences())
        fullText.append(text)

d = {"FILENAME": fileName, "OCR": fullText}
df = pd.DataFrame(d)

## Generate empty lists
search_material = []
search_product = []
search_manufacturer = []
search_designer = []
search_description = []
search_dimensions = []
search_packing = []
search_price = []
search_delivery = []
## -_-_-_-_-_-_-_-_-_-_-_-_-_-
count_material = 0
count_product = 0
count_maufacturer = 0
count_designer = 0
count_description = 0
count_dimension = 0
count_packing = 0
count_price = 0

## search for PRODUCT (NAME/TITLE)
for values in df["OCR"]:
    try:
        search_product.append((re.search(r'Product[\s\S]+', values).group()).split("\n")[0].split(":")[1])
        count_product = count_product + 1
    except:
        search_product.append("")
df["PRODUCT"] = search_product

## search for MANUFACTURER
for values in df["OCR"]:
    try:
        search_manufacturer.append((re.search(r'Manufacturer[\S\s]+', values).group()).split("\n")[0].split(":")[1])
        count_maufacturer = count_maufacturer + 1
    except:
        search_manufacturer.append("")
df["MANUFACTURER"] = search_manufacturer

## search for DESIGNER
for values in df["OCR"]:
    try:
        search_designer.append((re.search(r'Designer[\S\s]+', values).group()).split("\n")[0].lstrip().split(":")[1])
        count_designer = count_designer + 1
    except:
        search_designer.append("")
df["DESIGNER"] = search_designer

## search for MATERIALS
for values in df["OCR"]:
    try:
        search_material.append((re.search(r'Material[\S\s]+', values).group()).split("\n")[0].lstrip().split(":")[1])
        count_material = count_material + 1
    except:
        search_material.append("")
df["MATERIAL"] = search_material

# search for DESCRIPTION:
for values in df["OCR"]:
    try:
        search_description.append((re.search(r'Description[\S\s]+', values).group()).split(":")[1])
        count_description = count_description + 1
    except:
        search_description.append("")
df["DESCRIPTION"] = search_description

# search for DIMENSIONS
for values in df["OCR"]:
    try:
        search_dimensions.append((re.search(r'Dimensions[\S\s]+', values).group()).split("\n")[0].split(":")[1])
        count_dimension = count_dimension + 1
    except:
        search_dimensions.append("")
df["DIMENSIONS"] = search_dimensions

# search for PACKING
for values in df["OCR"]:
    try:
        search_packing.append((re.search(r'Packing[\S\s]+', values).group()).split('\n\n')[0].split(":")[1])
        count_packing = count_packing + 1
    except:
        search_packing.append("")
df["PACKING"] = search_packing

# search for PRICE
for values in df["OCR"]:
    try:
        search_price.append((re.search(r'Price[\S\s]+', values).group()).split("\n")[0].split(":")[1])
        count_price = count_price + 1
    except:
        search_price.append("")
df["PRICE"] = search_price

# search for DELIVERY DAYS
for values in df["OCR"]:
    try:
        search_delivery.append((re.search(r'Delivery[\S\s]+', values).group()).split("\n\n")[0].split(":")[1])
        count_delivery = count_delivery + 1
    except:
        search_delivery.append("")
df["DELIVERY"] = search_delivery

df.drop(columns="OCR", inplace=True)
print(df)
If the layout of text in your image is fixed, then you can simply read the full image but pass only half of that image array to Tesseract.
import cv2

img = cv2.imread(inputPath)
_, width, _ = img.shape
half = width // 2
cut = img[:, half:, :]  # keep only the right half of the image
temp_path = r'path/where/you/want/your/cropped/image/to/be/saved'
cv2.imwrite(temp_path, cut)
api.SetImageFile(temp_path)  # OCR the cropped image, not the original
text = api.GetUTF8Text()
print(api.AllWordConfidences())
fullText.append(text)
os.remove(temp_path)  # remove the cut image from the directory
Alternate approach
You can pass the image array cut to Tesseract directly instead of saving it and then removing it. In that case, remember to convert the array cut to RGB format, since OpenCV reads images in BGR format by default.
rgb_arr = cv2.cvtColor(cut, cv2.COLOR_BGR2RGB)
All of this can be done with PIL as well. In PIL you can use crop() to extract the required part of the image. PIL also reads images in RGB format by default, so they can be passed directly to Tesseract if you are following the alternate approach mentioned above.
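For example, a minimal PIL sketch (variable names follow the loop in the question and are otherwise assumptions): crop() takes a (left, upper, right, lower) box, and the cropped image can be handed to the API with SetImage instead of SetImageFile.
from PIL import Image

img = Image.open(inputPath)
w, h = img.size
right_half = img.crop((w // 2, 0, w, h))   # (left, upper, right, lower): keep the right half
api.SetImage(right_half)
text = api.GetUTF8Text()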
You can call the api.SetRectangle method, passing the coordinates of the right half, before recognition.
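A minimal sketch of that approach (my example, reusing the PyTessBaseAPI loop from the question): SetRectangle(left, top, width, height) restricts recognition to that region of the image that has already been set.
from PIL import Image
from tesserocr import PyTessBaseAPI

with PyTessBaseAPI(lang='eng') as api:
    img = Image.open(inputPath)
    api.SetImage(img)
    w, h = img.size
    api.SetRectangle(w // 2, 0, w - w // 2, h)   # left, top, width, height: the right half
    text = api.GetUTF8Text()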

PyTorch custom dataset dataloader returns strings (of keys) not tensors

I am trying to load my own dataset, and I use a custom DataLoader that reads in images and labels and converts them to PyTorch tensors. However, when the DataLoader is iterated, it returns the strings "image" (as x) and "labels" (as y) rather than the actual values or tensors.
print(self.train_loader)  # shows a Tensor object
tic = time.time()
with tqdm(total=self.num_train) as pbar:
    for i, (x, y) in enumerate(self.train_loader):  # x and y are returned as strings (this is where it fails)
        if self.use_gpu:
            x, y = x.cuda(), y.cuda()
        x, y = Variable(x), Variable(y)
This is how dataloader.py looks like:
from __future__ import print_function, division #ds
import numpy as np
from utils import plot_images
import os #ds
import pandas as pd #ds
from skimage import io, transform #ds
import torch
from torchvision import datasets
from torch.utils.data import Dataset, DataLoader #ds
from torchvision import transforms
from torchvision import utils #ds
from torch.utils.data.sampler import SubsetRandomSampler
class CDataset(Dataset):

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.frame)

    def __getitem__(self, idx):
        img_name = os.path.join(self.root_dir,
                                self.frame.iloc[idx, 0] + '.jpg')
        image = io.imread(img_name)
        # image = image.transpose((2, 0, 1))
        labels = np.array(self.frame.iloc[idx, 1])  # .as_matrix() #ds
        # landmarks = landmarks.astype('float').reshape(-1, 2)
        # print(image.shape)
        # print(img_name, labels)
        sample = {'image': image, 'labels': labels}
        if self.transform:
            sample = self.transform(sample)
        return sample
class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        image, labels = sample['image'], sample['labels']
        # print(image)
        # print(labels)
        # swap color axis because
        # numpy image: H x W x C
        # torch image: C X H X W
        image = image.transpose((2, 0, 1))
        # print(image.shape)
        # print((torch.from_numpy(image)))
        # print((torch.from_numpy(labels)))
        return {'image': torch.from_numpy(image),
                'labels': torch.from_numpy(labels)}
def get_train_valid_loader(data_dir,
                           batch_size,
                           random_seed,
                           #valid_size=0.1, #ds
                           #shuffle=True,
                           show_sample=False,
                           num_workers=4,
                           pin_memory=False):
    """
    Utility function for loading and returning train and valid
    multi-process iterators over the MNIST dataset. A sample
    9x9 grid of the images can be optionally displayed.
    If using CUDA, num_workers should be set to 1 and pin_memory to True.
    Args
    ----
    - data_dir: path directory to the dataset.
    - batch_size: how many samples per batch to load.
    - random_seed: fix seed for reproducibility.
    - #ds valid_size: percentage split of the training set used for
      the validation set. Should be a float in the range [0, 1].
      In the paper, this number is set to 0.1.
    - shuffle: whether to shuffle the train/validation indices.
    - show_sample: plot 9x9 sample grid of the dataset.
    - num_workers: number of subprocesses to use when loading the dataset.
    - pin_memory: whether to copy tensors into CUDA pinned memory. Set it to
      True if using GPU.
    Returns
    -------
    - train_loader: training set iterator.
    - valid_loader: validation set iterator.
    """
    #ds
    #error_msg = "[!] valid_size should be in the range [0, 1]."
    #assert ((valid_size >= 0) and (valid_size <= 1)), error_msg
    #ds

    # define transforms
    #normalize = transforms.Normalize((0.1307,), (0.3081,))
    trans = transforms.Compose([
        ToTensor(), #normalize,
    ])

    # load train dataset
    #train_dataset = datasets.MNIST(
    #    data_dir, train=True, download=True, transform=trans
    #)
    train_dataset = CDataset(csv_file='/home/Desktop/6June17/util/train.csv',
                             root_dir='/home/caffe/data/images/', transform=trans)

    # load validation dataset
    #valid_dataset = datasets.MNIST( #ds
    #    data_dir, train=True, download=True, transform=trans #ds
    #)
    valid_dataset = CDataset(csv_file='/home/Desktop/6June17/util/eval.csv',
                             root_dir='/home/caffe/data/images/', transform=trans)

    num_train = len(train_dataset)
    train_indices = list(range(num_train))
    #ds split = int(np.floor(valid_size * num_train))
    num_valid = len(valid_dataset) #ds
    valid_indices = list(range(num_valid)) #ds

    #if shuffle:
    #    np.random.seed(random_seed)
    #    np.random.shuffle(indices)

    #ds train_idx, valid_idx = indices[split:], indices[:split]
    train_idx = train_indices #ds
    valid_idx = valid_indices #ds

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler,
        num_workers=num_workers, pin_memory=pin_memory,
    )
    print(train_loader)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler,
        num_workers=num_workers, pin_memory=pin_memory,
    )

    # visualize some images
    if show_sample:
        sample_loader = torch.utils.data.DataLoader(
            dataset, batch_size=9, #shuffle=shuffle,
            num_workers=num_workers, pin_memory=pin_memory
        )
        data_iter = iter(sample_loader)
        images, labels = data_iter.next()
        X = images.numpy()
        X = np.transpose(X, [0, 2, 3, 1])
        plot_images(X, labels)

    return (train_loader, valid_loader)
def get_test_loader(data_dir,
                    batch_size,
                    num_workers=4,
                    pin_memory=False):
    """
    Utility function for loading and returning a multi-process
    test iterator over the MNIST dataset.
    If using CUDA, num_workers should be set to 1 and pin_memory to True.
    Args
    ----
    - data_dir: path directory to the dataset.
    - batch_size: how many samples per batch to load.
    - num_workers: number of subprocesses to use when loading the dataset.
    - pin_memory: whether to copy tensors into CUDA pinned memory. Set it to
      True if using GPU.
    Returns
    -------
    - data_loader: test set iterator.
    """
    # define transforms
    #normalize = transforms.Normalize((0.1307,), (0.3081,))
    trans = transforms.Compose([
        ToTensor(), #normalize,
    ])

    # load dataset
    #dataset = datasets.MNIST(
    #    data_dir, train=False, download=True, transform=trans
    #)
    test_dataset = CDataset(csv_file='/home/Desktop/6June17/util/test.csv',
                            root_dir='/home/caffe/data/images/', transform=trans)

    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False,
        num_workers=num_workers, pin_memory=pin_memory,
    )

    return test_loader
#for i_batch, sample_batched in enumerate(dataloader):
# print(i_batch, sample_batched['image'].size(),
# sample_batched['landmarks'].size())
# # observe 4th batch and stop.
# if i_batch == 3:
# plt.figure()
# show_landmarks_batch(sample_batched)
# plt.axis('off')
# plt.ioff()
# plt.show()
# break
A minimal working sample would be difficult to post here, but basically I am trying to modify this project, http://torch.ch/blog/2015/09/21/rmva.html, which works smoothly with MNIST. I am just trying to run it with my own dataset, using the custom dataloader.py shown above.
It instantiates a Dataloader like this:
in trainer.py:
if config.is_train:
    self.train_loader = data_loader[0]
    self.valid_loader = data_loader[1]
    self.num_train = len(self.train_loader.sampler.indices)
    self.num_valid = len(self.valid_loader.sampler.indices)
-> run from main.py:
if config.is_train:
    data_loader = get_train_valid_loader(
        config.data_dir, config.batch_size,
        config.random_seed, #config.valid_size,
        #config.shuffle,
        config.show_sample, **kwargs
    )
You are not properly using Python's enumerate(): (x, y) are currently assigned the two keys of your batch dictionary, i.e. the strings "image" and "labels". This should solve your problem:
for i, batch in enumerate(self.train_loader):
    x, y = batch["image"], batch["labels"]
    # ...
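Alternatively (my suggestion, not part of the fix above), you could make CDataset.__getitem__ return a tuple instead of a dict, so the original for i, (x, y) in enumerate(...) loop works unchanged:
def __getitem__(self, idx):
    img_name = os.path.join(self.root_dir, self.frame.iloc[idx, 0] + '.jpg')
    image = io.imread(img_name)
    labels = np.array(self.frame.iloc[idx, 1])
    sample = {'image': image, 'labels': labels}
    if self.transform:
        sample = self.transform(sample)          # ToTensor still receives the dict
    return sample['image'], sample['labels']     # unpackable as (x, y)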
