I wrote a sample program to fetch email messages from an Outlook account using IMAP. The account has 20 folders. Messages are retrieved from all of them except these folders (contact, calendar, task), where no data is returned and the server responds with an error. How can I fix this error?
import imaplib
import pprint
import email
import base64
import json
import re
import os
import fileinput
imap_host = 'outlook.office365.com'
imap_user = 'XXXXXXXXXXX'
imap_pass = 'XXXXXXXXXXXXX'
count = 0
file_path = 'geek.txt'
# connect to host using SSL
imap = imaplib.IMAP4_SSL(imap_host,993)
# login to server
l = imap.login(imap_user, imap_pass)
# Get Flags,mailbox_name,delimiter using regex
list_response_pattern = re.compile(r'\((?P<flags>.*?)\) "(?P<delimiter>.*)" (?P<name>.*)')
# Get List of Sync folders
list_data = imap.list()
# Check Local Storage is empty Sync All Folders Details.
if os.stat(file_path).st_size == 0:
global day
# Iterate folders in Sync folder
for i in list_data[1]:
# Get Folder name
sample = re.findall('"\/"(.*)',i.decode("utf-8"))
# Get Message_ids
data = imap.select(sample[0].lstrip())
search_resp, search_data = imap.search( None, "ALL" )
match = list_response_pattern.match(i.decode("utf-8"))
flags, delimiter, mailbox_name = match.groups()
mailbox_name = mailbox_name.strip('"')
except Exception as e:
# Get Current Status of Folder
current_status = imap.status(
# Get message using UID and Message_id
msg_ids = search_data[ 0 ].split()
print("total count: ",len(msg_ids))
for i in msg_ids:
print("Message Ids: ", i)
count = count + 1
fetch_resp, fetch_UID = imap.fetch( i, 'UID' )
print("Fetch UID: ", fetch_UID)
day = bytes(str(fetch_UID[0].split()[2]).split("'")[1].split(')')[0],'utf-8')
print("ID: ",day)
fetch_resp, fetch_mdg = imap.uid('fetch', day, '(RFC822)')
email_msg = fetch_mdg[0][1]
if email_msg and isinstance(email_msg, str):
email_msg = email.message_from_string(email_msg)
except :
email_msg = None
elif email_msg and isinstance(email_msg, bytes):
email_msg = email.message_from_bytes(email_msg)
email_msg = None
print("Count: ",count)
print("UID: ",day)
# Store Folder details in File
status_details = current_status[1][0].decode("utf-8")
status_details = status_details.split('(')[1].split(')')[0].split(' ')
if len(msg_ids) == 0:
json1 = json.dumps({'total_count':int(status_details[1]),'UID':0,'UIDNext':int(status_details[5]),'UIDValidity':int(status_details[7]), 'Folder name':mailbox_name})
json1 = json.dumps({'total_count':int(status_details[1]),'UID':int(day),'UIDNext':int(status_details[5]),'UIDValidity':int(status_details[7]), 'Folder name':mailbox_name})
file = open(file_path,'a')
Message Ids: b'3'
Fetch UID: [b'3 (UID 11)']
ID: b'11'
[(b'3 (RFC822 {757}', b'MIME-Version: 1.0\r\nContent-Type: text/plain; charset="us-ascii"\r\nFrom: Microsoft Exchange Server\r\nTo: "\r\nSubject: Retrieval using the IMAP4 protocol failed for the following message:\r\n 11\r\nContent-Transfer-Encoding: quoted-printable\r\n\r\nThe server couldn\'t retrieve the following message:\r\n\r\nSubject: "Test email Sync 3"\r\nFrom: "Imap Testing" ("/O=3DEXCHANGELABS/OU=3DEXCHANGE ADMINISTRATIVE GROUP=\r\n (FYDIBOHF23SPDLT)/CN=3DRECIPIENTS/CN=3DEBF2483D9A0145A59A48B829B12A45E4-MA=\r\nILBOX1")\r\nSent date: 5/6/2020 2:02:59 AM\r\n\r\nThe message hasn\'t been deleted. You might be able to view it using either =\r\nOutlook or Outlook Web App. You can also contact the sender to find out wha=\r\nt the message says.=\r\n'), b' UID 11 FLAGS (\\Seen))']
Server Error
Subject: Retrieval using the IMAP4 protocol failed for the following message:
Content-Transfer-Encoding: quoted-printable
The server couldn't retrieve the following message:
Subject: "Testing"
Sent date: 5/6/2020 2:01:54 AM
The message hasn't been deleted. You might be able to view it using either =
Outlook or Outlook Web App. You can also contact the sender to find out wha=
t the message says.=
I have around 20 folders. I iterate over them one by one, get the current status of each folder and store it in a sample file; that part works successfully. But when I try to print the email messages of some folders (contact, calendar, task), I get the response shown above.
I'm trying to upload videos to youtube using Django and MSSQL, I want to store the user data to DB so that I can log in from multiple accounts and post videos.
The official documentation provided by youtube implements a file system and after login, all the user data gets saved there, I don't want to store any data in a file as saving files to DB would be a huge risk and not a good practice. So how can I bypass this step and save data directly to DB and retrieve it when I want to post videos to a specific account?
In short, I want to replace the pickle file implementation with storing it in the database.
Here's my code
def youtubeAuthenticate():
api_service_name = "youtube"
api_version = "v3"
client_secrets_file = "client_secrets.json"
creds = None
# the file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first time
if os.path.exists("token.pickle"):
with open("token.pickle", "rb") as token:
creds = pickle.load(token)
# if there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
flow = InstalledAppFlow.from_client_secrets_file(client_secrets_file, SCOPES)
creds = flow.run_local_server(port=0)
# save the credentials for the next run
with open("token.pickle", "wb") as token:
pickle.dump(creds, token)
return build(api_service_name, api_version, credentials=creds)
def postVideoYT(request):
youtube = youtubeAuthenticate()
initialize_upload(youtube, request.data)
except HttpError as e:
print("An HTTP error %d occurred:\n%s" % (e.resp.status, e.content))
return Response("Hello")
def initialize_upload(youtube, options):
print('options', options)
print("title", options['title'])
# tags = None
# if options.keywords:
# tags = options.keywords.split(",")
# # Call the API's videos.insert method to create and upload the video.
insert_request = youtube.videos().insert(
media_body=MediaFileUpload(options['file'], chunksize=-1, resumable=True)
path = pathlib.Path(options['file'])
ext = path.suffix
getSize = os.path.getsize(options['file'])
# This method implements an exponential backoff strategy to resume a
# failed upload.
def resumable_upload(insert_request, ext, getSize):
response = None
error = None
retry = 0
while response is None:
print("Uploading file...")
status, response = insert_request.next_chunk()
if response is not None:
respData = response
if 'id' in response:
print("Video id '%s' was successfully uploaded." % response['id'])
exit("The upload failed with an unexpected response: %s" % response)
except HttpError as e:
if e.resp.status in RETRIABLE_STATUS_CODES:
error = "A retriable HTTP error %d occurred:\n%s" % (e.resp.status,
error = "A retriable error occurred: %s" % e
if error is not None:
retry += 1
if retry > MAX_RETRIES:
exit("No longer attempting to retry.")
max_sleep = 2 ** retry
sleep_seconds = random.random() * max_sleep
print("Sleeping %f seconds and then retrying..." % sleep_seconds)
I'm trying to create a cog for my Discord bot that scrapes Indeed and returns info on job postings (position, company, location, etc). My bot is hosted on Heroku, which is where the issues start. I've tested my web scraper by itself and when implemented as a cog for my Discord bot locally. It works both times. However, when I tried to deploy it on Heroku, the cog stopped working.
I read that this was because cloud-hosting services have blacklists or something for web scraping apps and functions. So I tried to use rq as suggested in this post:
I did all the steps, added an additional worker, a worker.py file, and installed the Redis To Go addon. However, when I try to use the following, I receive nothing back:
url = get_url(job_title, location)
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
# soup.find() returns None
I'm sure I just implemented something wrong, but can someone help me please? The full code is below:
import discord
from discord.ext import commands
import random
import requests
import time
from bs4 import BeautifulSoup
from rq import Queue
from worker import conn
ret = []
def get_url(position, location):
    '''Generate the Indeed search url from position and location.

    Both values are URL-encoded for use in a query string: spaces become
    "+" (as before), and characters that would otherwise corrupt the query
    (such as "+", "&" or "#") are percent-escaped, so a search for
    "c++ developer" is no longer read by the server as "c   developer".

    Args:
        position: Free-text job title to search for.
        location: Free-text location to search in.

    Returns:
        The search url as a string.
    '''
    # Local import keeps this function self-contained; the file's import
    # section does not bring in urllib.parse.
    from urllib.parse import quote_plus

    template = 'https://www.indeed.com/jobs?q={}&l={}'
    return template.format(quote_plus(position), quote_plus(location))
def get_jobs(job_title, location):
'''Max returned number of jobs is 15 per page.'''
global ret
url = get_url(job_title, location)
response = requests.get(url)
print(f"Responses: {response}")
### This returns <Response [200]>
soup = BeautifulSoup(response.text, "html.parser")
job_names = []
for job_name in soup.find_all("h2", class_="jobTitle"):
### Each one just returns an empty list []
companies = []
for company in soup.find_all("span", class_="companyName"):
locations = []
for location in soup.find_all("div", class_="companyLocation"):
salaries = []
for salary in soup.find_all("div", class_="attribute_snippet"):
if salary.get_text().startswith("$"):
links = []
for link in soup.find_all("a", class_=lambda value: value and value.startswith("tapItem fs-unmask result"), href=True):
link = link["href"]
link = "https://indeed.com" + link
ret = [job_names, companies, locations, salaries, links]
### This returns [[], [], [], [], []]
class JobScraper(commands.Cog):
def __init__(self, client): # References whatever is passed through the client from discord
self.client = client
self.q = Queue(connection=conn)
#commands.command(aliases=["job", "find_job", "find_jobs", "get_job", "get_jobs"])
async def jobs(self, ctx, *, query):
'''Scrapes Indeed.com for jobs and returns them.
The input format should be "eve jobs [job title], [job location], [num returned]
e.g. eve jobs ai researcher, san francisco, 3'''
key_terms = query.split(",")
key_terms = [term.strip() for term in key_terms]
if len(key_terms) == 3:
num_jobs = int(key_terms[2])
num_jobs = 15
# ret = get_jobs(key_terms[0], key_terms[1])
job = self.q.enqueue(get_jobs, key_terms[0], key_terms[1])
await ctx.send("Here is what I found:")
for i in range(num_jobs):
await ctx.send("```" +
f"\nTitle: {ret[0][i]}" +
f"\nCompany: {ret[1][i]}" +
f"\nLocation: {ret[2][i]}" +
f"\nSalary: {ret[3][i]}" +
f"\nLink: {ret[4][i]}" +
def setup(client):
I am trying to integrate QnAmaker knowledge base with Azure Bot Service.
I am unable to find knowledge base id on QnAMaker portal.
How to find the kbid in QnAPortal?
The Knowledge Base Id can be located in Settings under “Deployment details” in your knowledge base. It is the guid that is nestled between “knowledgebases” and “generateAnswer” in the POST (see image below).
Hope of help!
Hey, you can also use Python to get this; take a look at the following code.
That is, if you want to write a program that gets the KB ids dynamically.
import http.client, os, urllib.parse, json, time, sys
# Represents the various elements used to create HTTP request path for QnA Maker
# Replace this with a valid subscription key.
# User host = '<your-resource-name>.cognitiveservices.azure.com'
host = '<your-resource-name>.cognitiveservices.azure.com'
subscription_key = '<QnA-Key>'
get_kb_method = '/qnamaker/v4.0/knowledgebases/'
headers = {
'Ocp-Apim-Subscription-Key': subscription_key,
'Content-Type': 'application/json'
conn = http.client.HTTPSConnection(host)
conn.request ("GET", get_kb_method, None, headers)
response = conn.getresponse()
data = response.read().decode("UTF-8")
result = None
if len(data) > 0:
result = json.loads(data)
#print(json.dumps(result, sort_keys=True, indent=2))
# Note status code 204 means success.
KB_id = result["knowledgebases"][0]["id"]
except :
print ("Unexpected error:", sys.exc_info()[0])
print ("Unexpected error:", sys.exc_info()[1])
The files in Google domain that I administer have gotten into a bad state; there are thousands of files residing in the root directory. I want to identify these files and move them to a folder underneath "My Drive".
When I use the API to list the parents for one of these orphaned files, the result is an empty array. To determine if a file is orphaned, I can iterate over all the files in my domain, and request the list of parents for each. If the list is empty, I know that the file is orphaned.
But this is hideously slow.
Is there any way to use the Drive API to search for files that have no parents?
The "parents" field for the q parameter doesn't seem to be useful for this, as it's only possible to specify that the parents list contains some ID.
I'm trying to find a quick way to locate items that are truly at the root of the document hierarchy. That is, they are siblings of "My Drive", not children of "My Drive".
In Java:
List<File> result = new ArrayList<File>();
Files.List request = drive.files().list();
request.setQ("'root'" + " in parents");
FileList files = null;
files = request.execute();
for (com.google.api.services.drive.model.File element : files.getItems()) {
'root' is the parent folder, if the file or folder is in the root
Brute, but simple and it works..
do {
try {
FileList files = request.execute();
for (File f : files.getItems()) {
if (f.getParents().size() == 0) {
System.out.println("Orphan found:\t" + f.getTitle());
} catch (IOException e) {
System.out.println("An error occurred: " + e);
} while (request.getPageToken() != null
&& request.getPageToken().length() > 0);
The documentation recommends following query: is:unorganized owner:me.
The premise is:
List all files.
If a file has no 'parents' field, it means it's an orphan file.
So, the script deletes them.
Before you start, you need:
To create an OAuth id
Then you need to add the permission '../auth/drive' to your OAuth id and validate your app with Google, so that you have delete permissions.
Ready for copy paste demo
from __future__ import print_function
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/drive']
def callback(request_id, response, exception):
    """Report a failure for one request of a batch.

    Passed as the ``callback`` argument when a batch HTTP request is
    created elsewhere in this script.

    Args:
        request_id: Identifier of the request; not used here.
        response: Response for the request; not used here.
        exception: The error raised for the request, or a falsy value
            when there was none.
    """
    # Only failures are reported; a request without an exception is silent.
    if exception:
        print("Exception:", exception)
def main():
Shows basic usage of the Drive v3 API to delete orphan files.
""" --- CHECK CREDENTIALS --- """
creds = None
# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists('token.pickle'):
with open('token.pickle', 'rb') as token:
creds = pickle.load(token)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
flow = InstalledAppFlow.from_client_secrets_file(
'credentials.json', SCOPES)
creds = flow.run_local_server(port=0)
# Save the credentials for the next run
with open('token.pickle', 'wb') as token:
pickle.dump(creds, token)
""" --- OPEN CONNECTION --- """
service = build('drive', 'v3', credentials=creds)
page_token = ""
files = None
orphans = []
page_size = 100
batch_counter = 0
while (True):
# List
r = service.files().list(pageToken=page_token,
fields="nextPageToken, files"
page_token = r.get('nextPageToken')
files = r.get('files', [])
# Filter orphans
# NOTE: (If the file has no 'parents' field, it means it's orphan)
for file in files:
if file['parents']:
print("File with a parent found.")
except Exception as e:
print("Orphan file found.")
# Exit condition
if page_token is None:
batch_size = min(len(orphans), 100)
while(len(orphans) > 0):
batch = service.new_batch_http_request(callback=callback)
for i in range(batch_size):
print("File with id {0} queued for deletion.".format(orphans[0]))
del orphans[0]
batch_counter += 1
print("BATCH {0} DELETED - {1} FILES DELETED".format(batch_counter,
if __name__ == '__main__':
This method won't delete files in the root directory, as they have the 'root' value for the field 'parents'. If not all your orphan files are listed, it means they are being automatically deleted by google. This process might take up to 24h.
Adreian Lopez, thanks for your script. It really saved me a lot of manual work. Below are the steps that I followed to implement your script:
Created a folder c:\temp\pythonscript\ folder
Created OAuth 2.0 Client ID using https://console.cloud.google.com/apis/credentials and downloaded the credentials file to c:\temp\pythonscript\ folder.
Renamed the above client_secret_#######-#############.apps.googleusercontent.com.json as credentials.json
Copied Adreian Lopez's Python script and saved it as c:\temp\pythonscript\deleteGoogleDriveOrphanFiles.py
Go to "Microsoft Store" on Windows 10 and install Python 3.8
Open the Command Prompt and enter: cd c:\temp\pythonscript\
run pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib
run python deleteGoogleDriveOrphanFiles.py and follow the steps on the screen to create c:\temp\pythonscript\token.pickle file and start deleting the orphan files. This step can take quite a while.
Verify the https://one.google.com/u/1/storage
Rerun step 8 again as necessary.
Try to use this in your query:
'root' in parents
I've created an application, which is used to loop through the emails in an inbox and find all the undeliverable, mailbox full or delayed emails and generate a report.
The usual routine is to loop through all the emails in the inbox (up to a specified date).
If an email is undeliverable use regex to find the email. This works 95% of the time as this information is contained in the body of the Undelivered message (ReportItem).
So, my problem is I have a few emails which are returning blank emails to the report making it nigh on impossible to clean them or easily report that we have a problem with someone's email.
I have found that the information in the Internet Headers has who the mail was intended for, but cannot find anything on if it is possible to use an interop or some other object to obtain this information.
If anyone else has come across this problem and knows of a workaround, I would be very grateful.
I was looking to automate an outlook mail box to move all undelivered emails and store the email address of the recipient of the undeliverable message in a list, so that I can later check if an entry of the list is present in an excel column and then remove it from the excel. I hope this helps !
I've found a Python solution for this problem. A python library that is used to connect to the outlook is win32com, so first we import all libraries that we will need:
import win32com.client
import re
import datetime as dt
from tqdm import tqdm
import time
import extract_msg
This is a good way to connect to a specific outlook account, if you have :
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
accounts= win32com.client.Dispatch("Outlook.Application").Session.Accounts
Then create a loop that iterates through the whole outlook and gets to the specified mail account:
for account in accounts:
inbox = outlook.Folders(account.DeliveryStore.DisplayName)
if account.DeliveryStore.DisplayName == 'place_your_account_name_here':
for folder in inbox.Folders:
Find the folder in outlook you wish to check by folder name,
so if you would want to iterate through Inbox, type "Inbox" instead of "Folder_name"
if folder.__str__() == "Folder_name":
messages = folder.Items
messages.Sort('[ReceivedTime]', True)
if folder.Folders.Item('Undeliverable'):
undeliverable = folder.Folders.Item('Undeliverable')
list_of_undelivered_email_addresses = my_super_function(messages,undeliverable)
After we have reached the mail items and declared the undeliverable subfolder as "undeliverable", we specify the time period for which we want to do the below function:
def my_super_function(messages,undeliverable):
list_of_undelivered_email_addresses = []
last_n_days = dt.datetime.now() - dt.timedelta(days = 25)
messages = messages.Restrict("[ReceivedTime] >= '" +last_n_days.strftime('%m/%d/%Y %H:%M %p')+"'")
rl= list()
I have found that the most popular types of undeliverable emails present some sort of an error, and below the error is the original version of the email I have sent. Most of them (with very few exceptions) have a line that says:
To: "Some_email_address" ....
This is why I used this regular expression to read the whole line after my pattern (which is "To: "")
pattern = re.compile('To: ".*\n?',re.MULTILINE)
for counter, message in enumerate(messages):
It is very important that you save the email somewhere on your PC, because otherwise, as soon as you read its body, the email gets encrypted.
f = r'specify_the_absolute_path_where_you_want_it_saved'
msg = extract_msg.Message(f)
Search the saved msg body for the keyword Undeliverable:
if msg.body.find("undeliverable")!= -1 or msg.body.find("Undeliverable")!= -1 or msg.subject.find("Undeliverable")!= -1 or msg.subject.find("undeliverable")!= -1 or msg.body.find("wasn't found at")!= -1:
Save the actual email to a list, so you can move it to the undeliverables subfolder later
m = re.search(pattern, msg.body)
m = m[0]
mail_final = m.split('"')[1]
list_of_undelivered_email_addresses=list(filter(None, list_of_undelivered_email_addresses))
print('this email is not an undeliverable one')
Move all mails in the list to the undeliverables folder:
if len(rl) ==0:
for m in tqdm(rl):
return list_of_undelivered_email_addresses
Here is the full code:
import win32com.client
import re
import datetime as dt
from tqdm import tqdm #tqdm gives you the progress bar
import time
import extract_msg
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
accounts= win32com.client.Dispatch("Outlook.Application").Session.Accounts
def my_super_function(messages,undeliverable):
list_of_undelivered_email_addresses = []
last_n_days = dt.datetime.now() - dt.timedelta(days = 25)
messages = messages.Restrict("[ReceivedTime] >= '" +last_n_days.strftime('%m/%d/%Y %H:%M %p')+"'")
rl= list()
pattern = re.compile('To: ".*\n?',re.MULTILINE)
for counter, message in enumerate(messages):
f = r'some_absolute_path'
msg = extract_msg.Message(f)
if msg.body.find("undeliverable")!= -1 or msg.body.find("Undeliverable")!= -1 or msg.subject.find("Undeliverable")!= -1 or msg.subject.find("undeliverable")!= -1 or msg.body.find("wasn't found at")!= -1:
m = re.search(pattern, msg.body)
m = m[0]
mail_final = m.split('"')[1]
list_of_undelivered_email_addresses=list(filter(None, list_of_undelivered_email_addresses))
if len(rl) ==0:
for m in tqdm(rl):
return list_of_undelivered_email_addresses
for account in accounts:
inbox = outlook.Folders(account.DeliveryStore.DisplayName)
if account.DeliveryStore.DisplayName == 'desired_email_address':
for folder in inbox.Folders:
if folder.__str__() == "Inbox":
messages = folder.Items
messages.Sort('[ReceivedTime]', True)
if folder.Folders.Item('Undeliverable'):
undeliverable = folder.Folders.Item('Undeliverable')
list_of_undelivered_email_addresses = my_super_function(messages,undeliverable)
It looks like what I want isn't part of the ReportItem properties.
The possible options are Extended MAPI, CDO or Redemption.