I am using a UDF to add an hour to a given date and using it as a filter condition in my Hive query, but it gives me the error below.
FAILED: ParseException line 8:4 cannot recognize input near 'TRANSFORM' '(' '"201606161340"' in expression specification
My Hive query is:
add file adddate.py;
select
dt,param2
from
my_table
where
dt>=TRANSFORM("201606161340") USING adddate.py and parm1="465";
My Python code is:
from datetime import datetime, timedelta
import sys

def add_date(current_date):
    date_object = datetime.strptime(current_date, '%Y%m%d%H%M')
    # print(date_object)
    one_hour = date_object + timedelta(hours=1)
    return one_hour.strftime('%Y%m%d%H%M')

# add_date("201606152350")
while True:
    line = sys.stdin.readline()
    if not line:
        break
    line = line.strip("\n ")  # str.strip(); the string-module version is Python 2 only
    print(add_date(line))
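As a quick sanity check outside Hive, the helper can be exercised directly (a standalone sketch duplicating add_date from above):

from datetime import datetime, timedelta

def add_date(current_date):
    date_object = datetime.strptime(current_date, '%Y%m%d%H%M')
    return (date_object + timedelta(hours=1)).strftime('%Y%m%d%H%M')

print(add_date("201606161340"))  # prints 201606161440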
Related
Hi, I'm having an issue with transferring data from one database to another. I created a list using a field in a table in an MSSQL db, used that list to query an Oracle db table (using the initial list in the WHERE statement to filter results), and then loaded the query results back into the MSSQL db.
The program runs for the first few iterations but then errors out with the following error:
Traceback (most recent call last):
File "C:/Users/1/PycharmProjects/DataExtracts/BuyerGroup.py", line 67, in
insertIntoMSDatabase(idString)
File "C:/Users/1/PycharmProjects/DataExtracts/BuyerGroup.py", line 48, in insertIntoMSDatabase
mycursor.executemany(sql, val)
pyodbc.ProgrammingError: The second parameter to executemany must not be empty.
I can't seem to find any guidance online to troubleshoot this error message. I feel it may be a simple solution, but I just can't get there...
# import libraries
import cx_Oracle
import pyodbc
import logging
import time
import re
import math
import numpy as np
logging.basicConfig(level=logging.DEBUG)
conn = pyodbc.connect('''Driver={SQL Server Native Client 11.0};
Server='servername';
Database='dbname';
Trusted_connection=yes;''')
b = conn.cursor()
dsn_tns = cx_Oracle.makedsn('Hostname', 'port', service_name='name')
conn1 = cx_Oracle.connect(user=r'uid', password='pwd', dsn=dsn_tns)
c = conn1.cursor()
beginTime = time.time()
bind = (b.execute('''select distinct field1
from [server].[db].[dbo].[table]'''))
print('MSQL table(s) queried, List Generated')
# formats ids for sql string
def surroundWithQuotes(id):
return "'" + re.sub(",|\s$", "", str(id)) + "'"
def insertIntoMSDatabase(idString):
    osql = '''SELECT distinct field1, field2
              FROM Database.Table
              WHERE field2 is not null and field3 IN ({})'''.format(idString)
    c.execute(osql)
    claimsdata = c.fetchall()
    print('Oracle table(s) queried, Data Pulled')
    mycursor = conn.cursor()
    sql = '''INSERT INTO [dbo].[tablename]
             (
             [fields1]
             ,[field2]
             )
             VALUES (?,?)'''
    val = claimsdata
    mycursor.executemany(sql, val)
    conn.commit()
ids = []
formattedIdStrings = []
# adds all the ids found in bind to an iterable array
for row in bind:
    ids.append(row[0])

# splits the ids[] array into multiple arrays < 1000 in length
batchedIds = np.array_split(ids, math.ceil(len(ids) / 1000))

# formats the value inside each batchedId to be a string
for batchedId in batchedIds:
    formattedIdStrings.append(",".join(map(surroundWithQuotes, batchedId)))

# runs insert into MS database for each batch of IDs
for idString in formattedIdStrings:
    insertIntoMSDatabase(idString)
print("MSQL table loaded, Data inserted into destination")
endTime = time.time()
print("Program Time Elapsed: ",endTime-beginTime)
conn.close()
conn1.close()
mycursor.executemany(sql, val)
pyodbc.ProgrammingError: The second parameter to executemany must not be empty.
Before calling .executemany() you need to verify that val is not an empty list (as would be the case if .fetchall() is called on a SELECT statement that returns no rows), e.g.,
if val:
    mycursor.executemany(sql, val)
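In the script above, that guard belongs inside insertIntoMSDatabase(); here is a hedged rewrite of that function with the check in place (same placeholder table and column names as in the question):

def insertIntoMSDatabase(idString):
    osql = '''SELECT distinct field1, field2
              FROM Database.Table
              WHERE field2 is not null and field3 IN ({})'''.format(idString)
    c.execute(osql)
    claimsdata = c.fetchall()
    mycursor = conn.cursor()
    sql = '''INSERT INTO [dbo].[tablename]
             ([fields1], [field2])
             VALUES (?,?)'''
    if claimsdata:  # executemany rejects an empty parameter list
        mycursor.executemany(sql, claimsdata)
        conn.commit()
    else:
        logging.debug('No Oracle rows for this id batch; skipping insert')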
I am trying to load a CSV file into an Oracle database and am facing this error:
cx_Oracle.DatabaseError: ORA-01036: illegal variable name/number
Can you please help me understand the root cause and a possible fix for this issue?
def main():
    ConStr = 'UserName/PWD@END_POINT'
    con = cx_Oracle.connect(ConStr)
    cur = con.cursor()
    with open('Persons.csv', 'r') as file:
        read_csv = csv.reader(file, delimiter='|')
        sql = "insert into Persons (PERSONID,LASTNAME,FIRSTNAME,ADDRESS,CITY) values (:1,:2,:3,:4,:5)"
        for lines in read_csv:
            print(lines)
            cur.executemany(sql, lines)
    cur.close()
    con.commit()
    con.close()
My CSV file looks like this:
PERSONID|LASTNAME|FIRSTNAME|ADDRESS|CITY
001|abc|def|ghi|jkl
002|opq|rst|uvw|xyz
From the Oracle documentation:
import cx_Oracle
import csv
. . .
# Predefine the memory areas to match the table definition
cursor.setinputsizes(None, 25)
# Adjust the batch size to meet your memory and performance requirements
batch_size = 10000
with open('testsp.csv', 'r') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    sql = "insert into test (id,name) values (:1, :2)"
    data = []
    for line in csv_reader:
        data.append((line[0], line[1]))
        if len(data) % batch_size == 0:
            cursor.executemany(sql, data)
            data = []
    if data:
        cursor.executemany(sql, data)
    con.commit()
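Applied to the Persons.csv example from the question, the key change is to hand executemany() a list of row tuples rather than a single row: calling executemany(sql, lines) inside the loop binds each string of one row against all five placeholders, which is a plausible trigger for ORA-01036. A hedged sketch reusing the names from the question (and skipping the header row):

import csv
import cx_Oracle

con = cx_Oracle.connect('UserName/PWD@END_POINT')
cur = con.cursor()
sql = "insert into Persons (PERSONID,LASTNAME,FIRSTNAME,ADDRESS,CITY) values (:1,:2,:3,:4,:5)"
with open('Persons.csv', 'r') as f:
    reader = csv.reader(f, delimiter='|')
    next(reader)                      # skip the PERSONID|LASTNAME|... header line
    rows = [tuple(line) for line in reader]
if rows:                              # avoid executemany with an empty list
    cur.executemany(sql, rows)        # one call with a list of row tuples
    con.commit()
cur.close()
con.close()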
I'm using the wrapped version of Smith-Waterman from skbio (0.5.4), but I get an unexpected error:
_, score, _ = local_pairwise_align_ssw(protein_list[idx1], protein_list[idx2], substitution_matrix = blosum62)
File "/anaconda3/lib/python3.6/site-packages/skbio/alignment/_pairwise.py", line 732, in local_pairwise_align_ssw
validate=False)
File "/anaconda3/lib/python3.6/site-packages/skbio/alignment /_tabular_msa.py", line 785, in __init__
reset_index=minter is None and index is None)
File "/anaconda3/lib/python3.6/site-packages/skbio/alignment /_tabular_msa.py", line 1956, in extend
self._assert_valid_sequences(sequences)
File "/anaconda3/lib/python3.6/site-packages/skbio/alignment /_tabular_msa.py", line 2035, in _assert_valid_sequences
% (length, expected_length))
ValueError: Each sequence's length must match the number of positions in the MSA: 232 != 231
The weird thing is that sometimes the error appears with protein pair 0-10, and other times with 0-116, so I don't believe it's an error with the protein format.
I have a similar problem; however, I was able to narrow the error down to the optimized SSW version, so it is not an error in the sequence formatting.
import warnings
from skbio.sequence import Protein
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message="...")
    from Bio.Align import substitution_matrices
    from skbio.alignment import local_pairwise_align_ssw
    from skbio.alignment import local_pairwise_align
peptide1 = Protein("CGAGDNQAGTALIF")
peptide2 = Protein("CAGEEGGGADGLTF")
gap_open_penalty = 10
gap_extend_penalty = 10
substitution_matrix = substitution_matrices.load("BLOSUM45")
## works correctly
rv = local_pairwise_align_ssw(
    sequence1=peptide1,
    sequence2=peptide2,
    gap_open_penalty=1,
    gap_extend_penalty=1,
    substitution_matrix=substitution_matrix,
)
print(rv)

## but if I swap peptide1 and peptide2, the ValueError occurs
rv = local_pairwise_align_ssw(
    sequence1=peptide2,
    sequence2=peptide1,
    gap_open_penalty=1,
    gap_extend_penalty=1,
    substitution_matrix=substitution_matrix,
)
print(rv)

## if I do the same with local_pairwise_align, it works!
rv = local_pairwise_align(
    seq1=peptide2,
    seq2=peptide1,
    gap_open_penalty=1,
    gap_extend_penalty=1,
    substitution_matrix=substitution_matrix,
)
print(rv)
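If you just need results while the underlying SSW issue is open, one workaround consistent with the observation above is to fall back to the pure-Python aligner whenever the SSW call raises the length ValueError; safe_local_align below is a hypothetical helper, not part of skbio:

from skbio.alignment import local_pairwise_align, local_pairwise_align_ssw

# hypothetical helper: try the fast SSW aligner, fall back on ValueError
def safe_local_align(seq1, seq2, substitution_matrix,
                     gap_open_penalty=1, gap_extend_penalty=1):
    try:
        return local_pairwise_align_ssw(
            sequence1=seq1, sequence2=seq2,
            gap_open_penalty=gap_open_penalty,
            gap_extend_penalty=gap_extend_penalty,
            substitution_matrix=substitution_matrix)
    except ValueError:
        # observed only with the SSW version; the pure-Python path succeeds
        return local_pairwise_align(
            seq1=seq1, seq2=seq2,
            gap_open_penalty=gap_open_penalty,
            gap_extend_penalty=gap_extend_penalty,
            substitution_matrix=substitution_matrix)

print(safe_local_align(peptide2, peptide1, substitution_matrix))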
The following script is used to lemmatize a given input column of text:
%%time
import pandas as pd
from gensim.utils import lemmatize
from gensim.parsing.preprocessing import STOPWORDS
STOPWORDS = list(STOPWORDS)
data = pd.read_csv('https://pastebin.com/raw/0SEv1RMf')
def lemmatization(s):
    result = []
    # lowercase, tokenize, remove stopwords, len>3, lemmatize
    for token in lemmatize(s, stopwords=STOPWORDS, min_length=3):
        result.append(token.decode('utf-8').split('/')[0])
    # print(len(result)) <- This didn't work.
    return result
X_train = data.apply(lambda r: lemmatization(r['text']), axis=1)
print(X_train)
Question:
How can I print the progress of the lemmatization?
You could pass a variable into the lemmatization function to keep track of the number of times it has been called, and then print it every 1000 iterations or so. I have wrapped the counter in a list below so the function can update the int in place (a bare int argument would just be rebound locally).
%%time
import pandas as pd
from gensim.utils import lemmatize
from gensim.parsing.preprocessing import STOPWORDS
STOPWORDS = list(STOPWORDS)
data = pd.read_csv('https://pastebin.com/raw/0SEv1RMf')
iteration_count = [0]
def lemmatization(s, iteration_count):
    result = []
    # lowercase, tokenize, remove stopwords, len>3, lemmatize
    for token in lemmatize(s, stopwords=STOPWORDS, min_length=3):
        result.append(token.decode('utf-8').split('/')[0])
    iteration_count[0] += 1
    if iteration_count[0] % 1000 == 0:
        print(iteration_count[0])
    return result
X_train = data.apply(lambda r: lemmatization(r['text'], iteration_count), axis=1)
print(X_train)
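Alternatively, if installing an extra package is an option, tqdm integrates with pandas and draws a progress bar without modifying the function at all. A minimal sketch, assuming tqdm is installed and reusing the original single-argument lemmatization from the question:

from tqdm import tqdm

tqdm.pandas()  # registers .progress_apply on pandas objects
X_train = data.progress_apply(lambda r: lemmatization(r['text']), axis=1)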
I needed stable index sorting for DataFrames when I ran into this problem:
In cases where a DataFrame becomes a Series (when only a single column matches the selection), the kind argument raises an error. See the example:
import pandas as pd
df_a = pd.Series(range(10))
df_b = pd.Series(range(100, 110))
df = pd.concat([df_a, df_b])
df.sort_index(kind='mergesort')
with the following error:
----> 6 df.sort_index(kind='mergesort')
TypeError: sort_index() got an unexpected keyword argument 'kind'
With DataFrames (more than one column selected), mergesort works OK.
EDIT:
When selecting a single column from a DataFrame for example:
import pandas as pd
import numpy as np
df_a = pd.DataFrame(np.array(range(25)).reshape(5,5))
df_b = pd.DataFrame(np.array(range(100, 125)).reshape(5,5))
df = pd.concat([df_a, df_b])
the following returns an error:
df[0].sort_index(kind='mergesort')
...since the selection is cast to a pandas Series, and as pointed out, the pandas.Series.sort_index documentation contains a bug.
However,
df[[0]].sort_index(kind='mergesort')
works alright, since its type continues to be a DataFrame.
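A minimal sketch of that workaround, assuming the df from the EDIT above: sort through the single-column DataFrame, then pull the column back out if a Series is needed.

stable_sorted = df[[0]].sort_index(kind='mergesort')[0]  # DataFrame accepts kind=
print(type(stable_sorted))  # back to a pandas Series, stably sorted by index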
pandas.Series.sort_index() has no kind parameter.
Here is the definition of this function for pandas 0.18.1 (file ./pandas/core/series.py):
# line 1729
@Appender(generic._shared_docs['sort_index'] % _shared_doc_kwargs)
def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
               sort_remaining=True):
    axis = self._get_axis_number(axis)
    index = self.index
    if level is not None:
        new_index, indexer = index.sortlevel(level, ascending=ascending,
                                             sort_remaining=sort_remaining)
    elif isinstance(index, MultiIndex):
        from pandas.core.groupby import _lexsort_indexer
        indexer = _lexsort_indexer(index.labels, orders=ascending)
        indexer = com._ensure_platform_int(indexer)
        new_index = index.take(indexer)
    else:
        new_index, indexer = index.sort_values(return_indexer=True,
                                               ascending=ascending)
    new_values = self._values.take(indexer)
    result = self._constructor(new_values, index=new_index)
    if inplace:
        self._update_inplace(result)
    else:
        return result.__finalize__(self)
file ./pandas/core/generic.py, line 39:
_shared_doc_kwargs = dict(axes='keywords for axes', klass='NDFrame',
                          axes_single_arg='int or labels for object',
                          args_transpose='axes to permute (int or label for'
                                         ' object)')
So most probably it's a bug in the pandas documentation...
Your df is a Series, not a DataFrame.
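For anyone puzzled by the distinction, a quick illustrative check shows why the snippet from the question ends up in Series.sort_index:

import pandas as pd

# concat of two Series yields a Series, so the Series version of sort_index
# (which lacks the kind parameter in this pandas version) is what gets called
df = pd.concat([pd.Series(range(10)), pd.Series(range(100, 110))])
print(type(df))  # <class 'pandas.core.series.Series'>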