Defining URL list for crawler, syntax issues

I'm currently running the following code:
import requests
from bs4 import BeautifulSoup
from urlparse import urljoin
def hltv_match_list(max_offset):
offset = 0
while offset < max_offset:
url = '' + str(offset)
base = ""
soup = BeautifulSoup(requests.get("").content, 'html.parser')
cont ="div.covMainBoxContent a[href*=matchid=]")
href = urljoin(base, (a["href"] for a in cont))
# print([urljoin(base, a["href"]) for a in cont])
offset += 50
def get_hltv_match_data(matchid_url):
source_code = requests.get(matchid_url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for teamid in soup.findAll("div.covSmallHeadline a[href*=teamid=]"):
print teamid.string
File "C:/Users/mdupo/PycharmProjects/HLTVCrawler/", line 12, in hltv_match_list
href = urljoin(base, (a["href"] for a in cont))
File "C:\Python27\lib\", line 261, in urljoin
urlparse(url, bscheme, allow_fragments)
File "C:\Python27\lib\", line 143, in urlparse
tuple = urlsplit(url, scheme, allow_fragments)
File "C:\Python27\lib\", line 182, in urlsplit
i = url.find(':')
AttributeError: 'generator' object has no attribute 'find'
Process finished with exit code 1
I think I'm having trouble with the href = urljoin(base, (a["href"] for a in cont)) part, as I'm trying to create a URL list I can feed into get_hltv_match_data to then capture various items within that page. Am I going about this wrong?

You need to join each href as per your commented code:
urls = [urljoin(base,a["href"]) for a in cont]
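You are trying to join the base URL to a generator, i.e. (a["href"] for a in cont), which makes no sense: urljoin expects a single URL string, which is why the traceback ends in 'generator' object has no attribute 'find'.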
You should also be passing the url to requests or you are going to be requesting the same page over and over.
soup = BeautifulSoup(requests.get(url).content, 'html.parser')


Extracting web address and using for loop

I am trying to extract websites of the members from So, I wrote a code to visit the member page one by one and extract the web addresses. I am using BeautifulSoup Library to extract.
However, my problem is not with the use of BeautifulSoup but with the for loop. The above link has 15 members per page. When I run the code for one page, it returns the web address of only one member. I have imported all the required libraries.
url = input('Enter URL:')
#position = int(input('Enter position:'))-1
html = urlopen(url).read()
lst1 = list()
lst = list()
lst2 = list()
lst3 = list()
lst4 = list()
soup = BeautifulSoup(html,"html.parser")
url1 = ""
conn = sqlite3.connect('list.sqlite')
cur = conn.cursor()
cur.execute('''CREATE TABLE IF NOT EXISTS weblinks (URL TEXT UNIQUE)''')
cur.execute('''CREATE TABLE IF NOT EXISTS weblinks1 (website TEXT UNIQUE)''')
cur.execute('''CREATE TABLE IF NOT EXISTS weblinks2 (website1 TEXT UNIQUE)''')
for link in soup.find_all('a'):
for links in lst:
if 'members' in links:
url2 = urllib.parse.urljoin(url1, links)
url3 = lst2[4:18]
for x in url3:
html1 = urlopen(x).read()
soup1 = BeautifulSoup(html1,"html.parser")
for url4 in soup1.find_all('a'):
url4 = url4.get('href')
if url4 not in lst3:
url6 = lst3.pop(108)
The desired output is url6, produced in the last for loop, but it prints only one web address.
Please advise what I am missing here.

I want to download after image crawling for multiple pages

I want to download images after crawling multiple pages. However, not all of the images are downloaded, because they are overwritten inside the for loop.
Below is my code. What is wrong?
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as rq
for page in range(2,4):
baseUrl = ''
plusUrl = baseUrl + str(page)
html = urlopen(plusUrl).read()
soup = BeautifulSoup(html, 'html.parser')
img = soup.find_all(class_='card-img-top')
listimg = []
for i in img:
n = 1
for index, img_link in enumerate(listimg):
img_data = rq.get(img_link).content
with open('./onepiece/' + str(index+1) + '.png', 'wb+') as f:
n += 1
Another way is to download all the pictures.
from simplified_scrapy import Spider, SimplifiedDoc, utils, SimplifiedMain
class ImageSpider(Spider):
name = 'onepiecetreasurecruise'
start_urls = ['']
# refresh_urls = True
concurrencyPer1s = 0.5 # set download speed
imgPath = 'images/'
def __init__(self):
Spider.__init__(self, # necessary
utils.createDir(self.imgPath) # create image dir
def afterResponse(self, response, url, error=None, extra=None):
try: # save images
flag = utils.saveResponseAsFile(response, self.imgPath, 'image')
if flag: return None
except Exception as err:
return Spider.afterResponse(self, response, url, error, extra)
def extract(self, url, html, models, modelNames):
doc = SimplifiedDoc(html)
# image urls
urls = doc.body.getElements('p', value='card-text').a
if (urls):
for u in urls:
u['header']={'Referer': url['url']}
# next page urls
u = doc.body.getElementByText('Suivant',tag='a')
if (u):
u['href'] = utils.absoluteUrl(url.url,u.href)
return True
SimplifiedMain.startThread(ImageSpider()) # start download
I fixed the indents in your code. This works for me. It downloads 30 images.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as rq
listimg = [] # all images
for page in range(2,4):
baseUrl = ''
plusUrl = baseUrl + str(page)
html = urlopen(plusUrl).read()
soup = BeautifulSoup(html, 'html.parser')
img = soup.find_all(class_='card-img-top')
for i in img:
n = 1
for index, img_link in enumerate(listimg):
img_data = rq.get(img_link).content
with open('./onepiece/' + str(index+1) + '.png', 'wb+') as f:
n += 1

how to add symbols to the multiple stock data

# I have scraped the data with the code below. Now I want to add a column of symbols to the respective company data; please guide me on how the symbol can be added to each firm's data.
#code below
from time import sleep
import pandas as pd
import os
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from import ChromeDriverManager
browser = webdriver.Chrome(ChromeDriverManager().install())
symbols =['FATIMA',
company = 1
for ThisSymbol in symbols :
# Get first symbol from the above python list
company = 2
# In the URL, make symbol as variable
url = '' + ThisSymbol
# The below command will get all the contents from the url
html = browser.execute_script("return document.documentElement.outerHTML")
# So we will supply the contents to beautiful soup and we tell to consider this text as a html, with the following command
soup = BeautifulSoup (html, "html.parser")
for rn in range(0,9) :
plist = []
r = soup.find_all('tr')[rn]
# Condition: if first row, then th, otherwise td
if (rn==0) :
celltag = 'th'
else :
celltag = 'td'
# Now use the celltag instead of using fixed td or th
col = r.find_all(celltag)
if col[i] == 0:
print ("")
for i in range(0,4) :
cell = col[i].text
clean = cell.replace('\xa0 ', '')
clean = clean.replace (' ', '')
# If first row, create df, otherwise add to it
if (rn == 0) :
df = pd.DataFrame(plist)
else :
df2 = pd.DataFrame(plist)
colname = 'y' + str(2019-rn)
df[colname] = df2
if (company == 1):
dft = df.T
# Get header Column
head = dft.iloc[0]
# Exclude first row from the data
dft = dft[1:]
dft.columns = head
dft = dft.reset_index()
# Assign Headers
dft = dft.drop(['index'], axis = 'columns')
dft2 = df.T
# Get header Column
head = dft2.iloc[0]
# Exclude first row from the data
dft2 = dft2[1:]
dft2.columns = head
dft2 = dft2.reset_index()
# Assign Headers
dft2 = dft2.drop(['index'], axis = 'columns')
dft['Symbol'] = ThisSymbol
dft = dft.append(dft2, sort=['Year','Symbol'])
company = company +1
My output looks like this. I want to have a Symbol column for each respective firm's data.
For the symbol, I have added
dft['Symbol'] = ThisSymbol
but it adds just the first company from the list to every company's data.
enter image description here

xpath could not recognize predicate for a tag

I am trying to use Scrapy's XPath selectors to scrape a page, but it seems they cannot match the tag with predicates when I use a for loop:
# This package will contain the spiders of your Scrapy project
from cunyfirst.items import CunyfirstSectionItem
import scrapy
import json
class CunyfristsectionSpider(scrapy.Spider):
name = "cunyfirst-section-spider"
start_urls = ["file:///Users/haowang/Desktop/section.htm"]
def parse(self, response):
url = response.url
yield scrapy.Request(url, self.parse_page)
def parse_page(self, response):
n = -1
for section in response.xpath("//a[contains(#name,'MTG_CLASS_NBR')]"):
print(response.xpath("//a[#name ='MTG_CLASSNAME$10']/text()"))
n += 1
class_num = section.xpath('text()').extract_first()
# print(class_num)
classname = "MTG_CLASSNAME$" + str(n)
date = "MTG_DAYTIME$" + str(n)
instr = "MTG_INSTR$" + str(n)
class_name = response.xpath("//a[#name = classname]/text()")
I am looking for <a> tags whose name is "MTG_CLASSNAME$" + str(n), with n being 0, 1, 2, ..., and I am getting empty output from my XPath query. I am not sure why.
I am basically trying to scrape course and their info from
with filter applied: Kingsborough CC, fall 18, BIO
Well... I've visited the website you put in the question description, I used element inspection and searched for "MTG_CLASSNAME" and I got 0 matches...
So I will give you some tools:
In your Scrapy settings, set:
LOG_FILE = "log.txt"
Then print the response body (response.body) at the appropriate place (at the top of the parse_page function in this case) and search for it in log.txt.
Check there if there is what you are looking for.
If there is, use this (
or similar ) to check your xpath statement.
In addition, replace for section in response.xpath("//a[contains(#name,'MTG_CLASS_NBR')]"):
with for section in response.xpath("//a[contains(#name,'MTG_CLASS_NBR')]").extract(): — this will raise an error when you get the data that you are looking for.

How to automatically turn BibTex citation into something parseable by Zotero?

I have a citation system which publishes users notes to a wiki (Researchr). Programmatically, I have access to the full BibTeX record of each entry, and I also display this on the individual pages (for example - click on BibTeX). This is in the interest of making it easy for users of other citation manager to automatically import the citation of a paper that interests them. I would also like other citation managers, especially Zotero, to be able to automatically detect and import a citation.
Zotero lists a number of ways of exposing metadata that it will understand, including meta tags with RDF, COiNS, Dublin Core and unAPI. Is there a Ruby library for converting BibTeX to any of these standards automatically - or a Javascript library? I could probably create something, but if something existed, it would be far more robust (BibTeX has so many publication types and fields etc).
There's a BibTeX2RDF convertor available here, might be what you're after.
unAPI is not a data standard - it's a way to serve data (to Zotero and other programs). Zotero imports Bibtex, so serving Bibtex via unAPI works just fine. Inspire is an example of a site that does that:
By now one can simply import bibtex files of type .bib directly in Zotero. However, I noticed my bibtex files were often less complete than Zotero (in particular they often missed a DOI), and I did not find an "auto-complete" function (based on the data in the bibtex entries) in Zotero.
So I import the .bib file with Zotero, to ensure the entries are all in there. Then I run a Python script that gets all the missing DOIs it can find for the entries in that .bib file and exports them to a space-separated .txt file:
# pip install habanero
from habanero import Crossref
import re
def titletodoi(keyword):
    """Look up the DOI of the publication whose title matches *keyword*.

    Only the first search hit is inspected. It is accepted when its title
    equals *keyword* after both are lower-cased and stripped of every
    non-word character (spaces, punctuation, ...).

    :param keyword: publication title to search for
    :returns: the DOI string of the first hit, or None when there is no hit
        or its title differs from *keyword*
    """
    cr = Crossref()
    # NOTE(review): the original query call was lost from this line;
    # `works(query=...)` is assumed because the code below reads
    # result["message"]["items"] -- confirm against the habanero docs.
    result = cr.works(query=keyword)
    items = result["message"]["items"]
    if not items:
        # No search hit at all: nothing to compare against.
        return None
    first = items[0]
    # The title is concatenated piece by piece (presumably a list of strings).
    candidate = "".join(first.get("title", []))
    # r"\W" also removes spaces, so one substitution normalises both sides.
    wanted = re.sub(r"\W", "", keyword.lower())
    found = re.sub(r"\W", "", candidate.lower())
    if wanted == found:
        return first["DOI"]
    return None
def get_dois(titles):
    """Resolve each title to a DOI and collect the ones that were found.

    Every lookup result is printed, including misses, so that titles without
    a DOI can be spotted in the output.

    :param titles: iterable of publication titles
    :returns: list of DOI strings in input order; titles for which
        ``titletodoi`` returned None are skipped
    """
    dois = []
    for title in titles:
        doi = titletodoi(title)
        print(f"doi={doi}, title={title}")
        if doi is not None:
            # Keep only successful lookups; the result is later joined with a
            # separator string, which would fail on None.
            dois.append(doi)
    return dois
def read_titles_from_file(filepath):
    """Read publication titles from the text file at *filepath*.

    :param filepath: path of the text file to read
    :returns: the result of ``splits_lines`` applied to the file's lines
    """
    with open(filepath) as f:
        # NOTE(review): the right-hand side of this assignment was missing;
        # reading all lines is assumed -- confirm. Line endings are kept.
        lines = f.readlines()
    return splits_lines(lines)
def splits_lines(lines):
    """Split every line on ";" and return all pieces as one flat list.

    :param lines: iterable of strings
    :returns: list of the ";"-separated pieces of each line, in order; a line
        without ";" contributes itself unchanged
    """
    split_lines = []
    for line in lines:
        # extend() flattens the per-line pieces into the single result list.
        split_lines.extend(line.split(";"))
    return split_lines
def write_dois_to_file(dois, filename, separation_char):
    """Write every DOI to *filename*, each followed by *separation_char*.

    An existing file is overwritten. The separator is written after every
    entry, including the last one.

    :param dois: iterable of DOI strings
    :param filename: path of the output file
    :param separation_char: string appended after each DOI (e.g. " " or "\\n")
    """
    # The context manager closes (and flushes) the file even when a write
    # fails; the handle was previously never closed.
    with open(filename, "w") as textfile:
        for doi in dois:
            textfile.write(doi + separation_char)
filepath = "list_of_titles.txt"
titles = read_titles_from_file(filepath)
dois = get_dois(titles)
write_dois_to_file(dois, "dois_space.txt", " ")
write_dois_to_file(dois, "dois_per_line.txt", "\n")
The DOIs from the .txt file are fed into Zotero's magic wand. Next, I (manually) remove the duplicates by keeping the most recently added entry (because that one comes from the magic wand and has the most data).
After that, I run another script to update all the reference IDs in my .tex and .bib files to those generated by Zotero:
# Importing library
import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import *
import os, fnmatch
import Levenshtein as lev
# Let's define a function to customize our entries.
# It takes a record and return this record.
def customizations(record):
    """Use some functions delivered by the library.

    Runs the record through a fixed chain of helper functions, each of which
    receives the record and returns the (possibly modified) record.

    :param record: a record
    :returns: -- customized record
    """
    # NOTE: presumably all helpers below, including `type`, are the ones
    # brought in by `from bibtexparser.customization import *`, so `type` is
    # not the builtin here -- verify against the installed library version.
    record = type(record)
    record = author(record)
    record = editor(record)
    record = journal(record)
    record = keyword(record)
    record = link(record)
    record = page_double_hyphen(record)
    record = doi(record)
    return record
def get_references(filepath):
    """Load the BibTeX file at *filepath* and return the parsed database.

    The parser is configured to run ``customizations`` on every record.

    :param filepath: path of the .bib file
    :returns: the object returned by ``bibtexparser.load``
    """
    bib_parser = BibTexParser()
    bib_parser.customization = customizations
    with open(filepath) as handle:
        return bibtexparser.load(handle, parser=bib_parser)
def get_reference_mapping(main_filepath, sub_filepath):
found_sub = []
found_main = []
main_into_sub = []
main_references = get_references(main_filepath)
sub_references = get_references(sub_filepath)
for main_entry in main_references.entries:
for sub_entry in sub_references.entries:
# Match the reference IDs when the titles are more than 85% similar
lev_ratio = lev.ratio(
if lev_ratio > 0.85:
if main_entry["ID"] != sub_entry["ID"]:
print(f'replace: {sub_entry["ID"]} with: {main_entry["ID"]}')
main_into_sub.append([main_entry, sub_entry])
# Keep track of which entries have been found
return (
def remove_curly_braces(string):
    """Return *string* with every "{" and "}" removed.

    :param string: text that may contain curly braces
    :returns: the text without any curly braces
    """
    # Both brace characters must be removed; the closing brace was previously
    # left in place because "{" was replaced twice.
    return string.replace("{", "").replace("}", "")
def replace_references(main_into_sub, directory):
for pair in main_into_sub:
main = pair[0]["ID"]
sub = pair[1]["ID"]
print(f"replace: {sub} with: {main}")
# findReplace(latex_root_dir, sub, main, "*.tex")
# findReplace(latex_root_dir, sub, main, "*.bib")
def findReplace(directory, find, replace, filePattern):
    """Replace *find* with *replace* in every matching file under *directory*.

    The directory tree is walked recursively. Each file whose name matches
    the glob *filePattern* is read completely, edited in memory and written
    back in place.

    :param directory: root directory of the search
    :param find: substring to look for
    :param replace: replacement text
    :param filePattern: fnmatch-style file name pattern, e.g. "*.tex"
    """
    for path, dirs, files in os.walk(os.path.abspath(directory)):
        for filename in fnmatch.filter(files, filePattern):
            filepath = os.path.join(path, filename)
            with open(filepath) as f:
                s = f.read()
            updated = s.replace(find, replace)
            if updated == s:
                # Nothing to substitute: leave the file untouched.
                continue
            with open(filepath, "w") as f:
                f.write(updated)
def list_missing(main_references, sub_references):
    """Print every sub reference whose ID does not occur in the main references.

    :param main_references: iterable of entries, each with an "ID" key
    :param sub_references: iterable of entries, each with an "ID" key
    """
    # Collect the main IDs once instead of rebuilding the list for every
    # sub entry.
    main_ids = {entry["ID"] for entry in main_references}
    for sub in sub_references:
        if sub["ID"] not in main_ids:
            print(f'the following reference has a changed title:{sub["ID"]}')
latex_root_dir = "some_path/"
main_filepath = f"{latex_root_dir}latex/Literature_study/zotero.bib"
sub_filepath = f"{latex_root_dir}latex/Literature_study/references.bib"
) = get_reference_mapping(main_filepath, sub_filepath)
replace_references(main_into_sub, latex_root_dir)
list_missing(main_references, sub_references)
# For references whose Levenshtein ratio is below 0.85 you can specify a manual swap:
manual_swap = [] # main into sub
# manual_swap.append(["cantley_impact_2021","cantley2021impact"])
# manual_swap.append(["widemann_envision_2021","widemann2020envision"])
for pair in manual_swap:
main = pair[0]
sub = pair[1]
print(f"replace: {sub} with: {main}")
# findReplace(latex_root_dir, sub, main, "*.tex")
# findReplace(latex_root_dir, sub, main, "*.bib")
