Wordcount Nonetype error pyspark- - hadoop

I am trying to do some text analysis:
def cleaning_text(sentence):
sentence=sentence.lower()
sentence=re.sub('\'','',sentence.strip())
sentence=re.sub('^\d+\/\d+|\s\d+\/\d+|\d+\-\d+\-\d+|\d+\-\w+\-\d+\s\d+\:\d+|\d+\-\w+\-\d+|\d+\/\d+\/\d+\s\d+\:\d+',' ',sentence.strip())# dates removed
sentence=re.sub(r'(.)(\/)(.)',r'\1\3',sentence.strip())
sentence=re.sub("(.*?\//)|(.*?\\\\)|(.*?\\\)|(.*?\/)",' ',sentence.strip())
sentence=re.sub('^\d+','',sentence.strip())
sentence = re.sub('[%s]' % re.escape(string.punctuation),'',sentence.strip())
cleaned=' '.join([w for w in sentence.split() if not len(w)<2 and w not in ('no', 'sc','ln') ])
cleaned=cleaned.strip()
if(len(cleaned)<=1):
return "NA"
else:
return cleaned
org_val=udf(cleaning_text,StringType())
df_new =df.withColumn("cleaned_short_desc", org_val(df["symptom_short_description_"]))
df_new =df_new.withColumn("cleaned_long_desc", org_val(df_new["long_description"]))
longWordsDF = (df_new.select(explode(split('cleaned_long_desc',' ')).alias('word'))
longWordsDF.count()
I get the following error.
File "<stdin>", line 2, in cleaning_text
AttributeError: 'NoneType' object has no attribute 'lower'
I want to perform word counts but any kind of aggregation function is giving me an error.
I tried following things:
sentence=sentence.encode("ascii", "ignore")
Added this statement in the cleaning_text function
df.dropna()
Its still giving the same issue, I do not know how to resolve this issue.

It looks like you have null values in some columns. Add an if at the beginning of cleaning_text function and the error will disappear:
if sentence is None:
return "NA"

Related

Unexpected index error using choice module

I've been trying to use args in a function so that it can accept an unspecified amount of parameters but during testing I seem to be lumped with this error. Which seems to be produced by the choice module itself not the file i'm writing.
File "C:\ProgramFiles\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.1520.0_x64__qbz5n2kfra8p0\lib\random.py", line 378, in choice
return seq[self._randbelow(len(seq))]
IndexError: list index out of range
Here is the code i'm trying to run:
def key_word_choice(key, *otherkeys):
key_choice = []
for items in sauces:
if key and otherkeys in items:
key_choice.append(items[0])
print("You should choose:", choice(key_choice))
key_word_choice('hot', 'garlic', 'buffalo')

argument must be a string or unicode object: got tuple instead

The following below is the actual code:
I get the following error: TypeError: list indices must be
integers or slices, not str and TypeError: argument 1 must be a
string or unicode object: got list instead
def search(gss=None, description=None, postcode=None):
if request.method == "POST":
postcodevar = request.form['postcodevar']
print("Asssigning SQl statement to a postgreSQL variable")
postgreSQL_select_Query_2 = [("""SELECT DISTINCT gss,description,
postcode FROM postcodelatlng pt, gss.shapes g
WHERE st_contains(geom,pos) AND %(postcode)s~* '(\y(?:[A-PR-
UWYZ](?:[A-HK-]?\d{1,2}|d[A-HJKMNP-Y]|[A-HK-Y]\d[ABEHMNPRV-
Y]))(?:\s*\d[ABD-HJLNP-UW-]{2})\y)'
LIMIT 50""", {'postcode': postcodevar})]
cursor.execute(postgreSQL_select_Query_2, postcodevar)
print("Selecting rows from mobile table using
cursor.fetchall")
records = cursor.fetchall()
return render_template('idm_prototype.html', data=records)
File "/Users/fernandocamutari/PycharmProjects/DeploymentPrototype/main.py", line 69, in search
WHERE st_contains(geom,pos)
AND %(postcode)s~* '(\y(?:[A-PR-UWYZ](?:[A-HK-]?\d{1,2}|d[A-HJKMNP-Y]|[A-HK-Y]\d[ABEHMNPRV-Y]))(?:\s*\d[ABD-HJLNP-UW-]{2})\y)'
LIMIT 50""", {'postcode': postcodevar})]
cursor.execute(postgreSQL_select_Query_2)
print("Selecting rows from mobile table using cursor.fetchall")
records = cursor.fetchall()
return render_template('idm_prototype.html', data=records)
TypeError: argument 1 must be a string or unicode object: got list instead
This is the Copy/Paste friendly version of the traceback.
Traceback (most recent call last):
File "/Users/fernandocamutari/PycharmProjects/DeploymentPrototype/venv/lib/python3.8/site-packages/flask/app.py", line 2088, in call
return self.wsgi_app(environ, start_response)
File "/Users/fernandocamutari/PycharmProjects/DeploymentPrototype/venv/lib/python3.8/site-packages/flask/app.py", line 2073, in wsgi_app
response = self.handle_exception(e)
File "/Users/fernandocamutari/PycharmProjects/DeploymentPrototype/venv/lib/python3.8/site-packages/flask/app.py", line 2070, in wsgi_app
response = self.full_dispatch_request()
File "/Users/fernandocamutari/PycharmProjects/DeploymentPrototype/venv/lib/python3.8/site-packages/flask/app.py", line 1515, in full_dispatch_request
rv = self.handle_user_exception(e)
File "/Users/fernandocamutari/PycharmProjects/DeploymentPrototype/venv/lib/python3.8/site-packages/flask/app.py", line 1513, in full_dispatch_request
rv = self.dispatch_request()
File "/Users/fernandocamutari/PycharmProjects/DeploymentPrototype/venv/lib/python3.8/site-packages/flask/app.py", line 1499, in dispatch_request
return self.ensure_sync(self.view_functions[rule.endpoint])(**req.view_args)
File "/Users/fernandocamutari/PycharmProjects/DeploymentPrototype/main.py", line 54, in search
postgreSQL_select_Query_2 = {"""SELECT DISTINCT gss, description,
TypeError: unhashable type: 'dict'

Get metadata for a column with google Sheets API

I have a Google spreadsheet that I am connecting to and interacting with using the google-python-api-client package. Following this description on metadata search, and the links in it for the request body, I have written a function to get metadata for a range:
def get_metadata_by_range(range_: Union[dict, str]) -> dict:
if isinstance(range_, str):
print("String range: ", range_)
request_body = {"dataFilters": \
{"a1Range": range_}}
elif isinstance(range_, dict):
print("Dict range: ", range_)
request_body = {"dataFilters": \
[{"gridRange": range_}]}
else:
return None
request = service.spreadsheets().developerMetadata().\
search(spreadsheetId=SPREADSHEET_ID, body=request_body)
return request.execute()
Calling this with a range, either A1 notation or a gridRange will cause an error to occur though. For example, calling it with this line get_metadata_by_range("Metadata!A:A") will cause the following traceback.
String range: Metadata!A:A
Traceback (most recent call last):
File "oqc_server/fab/gapc.py", line 82, in <module>
get_metadata_by_range("Metadata!A:A")
File "oqc_server/fab/gapc.py", line 69, in get_metadata_by_range
return request.execute()
File "/media/kajsa/Storage/Projects/oqc_server/venv/lib/python3.7/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
return wrapped(*args, **kwargs)
File "/media/kajsa/Storage/Projects/oqc_server/venv/lib/python3.7/site-packages/googleapiclient/http.py", line 856, in execute
raise HttpError(resp, content, uri=self.uri)
googleapiclient.errors.HttpError: <HttpError 500 when requesting https://sheets.googleapis.com/v4/spreadsheets/1RhheCsI3kHrm8yK2Yio2kAOU4VOzYdz-eK0vjiMY7co/developerMetadata:search?alt=json returned "Internal error encountered."
Any ideas on what is causing this and how to solve it?
You want to search and retrieve the developer metadata from the range using the method of Method: spreadsheets.developerMetadata.search of Sheets API.
You want to achieve this using google-api-python-client with python.
You have already been able to get and put values for Spreadsheet with Sheets API.
If my understanding is correct, how about this answer? Please think of this as just one of several possible answers.
Modification points:
When you want to search the developer metadata with the range, please set the gridrange to dataFilters[].developerMetadataLookup.metadataLocation.dimensionRange.
When the range is set to dataFilters[].a1Range and dataFilters[].gridRange, I could confirm that the same error occurs.
Sample script:
The sample script for retrieving the developer metadata from the range is as follows. Before you use this, please set the variables of spreadsheet_id and sheet_id.
service = build('sheets', 'v4', credentials=creds)
spreadsheet_id = '###' # Please set the Spreadsheet ID.
sheet_id = ### # Please set the sheet ID.
search_developer_metadata_request_body = {
"dataFilters": [
{
"developerMetadataLookup": {
"metadataLocation": {
"dimensionRange": {
"sheetId": sheet_id,
"dimension": "COLUMNS",
"startIndex": 0,
"endIndex": 1
}
}
}
}
]
}
request = service.spreadsheets().developerMetadata().search(
spreadsheetId=spreadsheet_id, body=search_developer_metadata_request_body)
response = request.execute()
print(response)
Above script retrieves the developer metadata from the column "A" of sheet_id.
Note:
Please modify above script for your actual script.
In the current stage, the Developer Metadata can be added to the Spreadsheet, each sheet in the Spreadsheet and row and column. Please be careful this. Ref
References:
Method: spreadsheets.developerMetadata.search
Adding Developer Metadata- DeveloperMetadataLookup
If I misunderstood your question and this was not the direction you want, I apologize.
There is a bug with developerMetadata related to a1Range objects being passed as filters.
Edit
I've checked the bug again and a fix has been implemented.

Python ode first order, how to solve this using Sympy

When I try to solve this first ode by using Sympy as it shows below:
import sympy
y = sympy.Function('y')
t = sympy.Symbol('t')
ode = sympy.Eq(y(t).diff(t),(1/y(t))*sympy.sin(t))
sol = sympy.dsolve(ode,y(t))
csol=sol.subs([(t,0),(y(0),-4)]) # the I.C. is y(0) = 1
ode_sol= sol.subs([(csol.rhs,csol.lhs)])
print(sympy.pprint(ode_sol))
It gives me this error:
Traceback (most recent call last):
File "C:/Users/Mohammed Alotaibi/AppData/Local/Programs/Python/Python35/ODE2.py", line 26, in <module>
csol=sol.subs([(t,0),(y(0),-4)]) # the I.C. is y(0) = 1
AttributeError: 'list' object has no attribute 'subs'
Your problem is that this ODE does not have a unique solution. Thus it returns a list of solution, which you can find out from the error message and by printing sol.
Do the evaluation in a loop,
for psol in sol:
csol = psol.subs([(t,0),(y(0),-4)]);
ode_sol = psol.subs([(csol.rhs,csol.lhs)]);
print(sympy.pprint(ode_sol))
to find the next error, that substituting does not solve for the constant. What works is to define C1=sympy.Symbol("C1") and using
ode_sol= psol.subs([(C1, sympy.solve(csol)[0])]);
but this still feels hacky. Or better to avoid error messages for the unsolvability of the second case:
C1=sympy.Symbol("C1");
for psol in sol:
csol = psol.subs([(t,0),(y(0),-4)]);
for cc1 in sympy.solve(csol):
ode_sol= psol.subs([(C1, cc1)]);
print(sympy.pprint(ode_sol))

unexpected - TypeError: object of type 'NoneType' has no len()

I know what this error means, but I see it in a very unexpected place:
if rate and len(rate) == 1:
TypeError: object of type 'NoneType' has no len()
Context is:
rate = re_search_once("(\d+)", ratetxt)
if rate and len(rate) == 1:
where re_search_once is just a syntactic sugar for regexp extraction.
Anyway, whatever "rate" is, how comes that Python tries to evaluate the second part of "and" for NoneType? (in this case it's just None, a sole value of NoneType)
I can't check the real value of "rate", just noticed it in the logs.
using python 2.6.5 on Ubuntu 10.04.4 LTS
Updated, contents of
def re_search_once(pattern, string, flags=0):
if string is None:
return None
mo = re.compile(pattern).search(string, flags)
if mo:
gs = mo.groups()
if len(gs) == 1:
return gs[0]
else:
return gs
else:
return None

Resources