Converge on Best Combination of Elements - algorithm

You have $10,000 to invest in stocks. You are given a list of 200 stocks and are told to select 8 of them to buy, and to indicate how many shares of each to buy. You cannot spend more than $2,500 on any single stock, and each stock has its own price ranging from $100 to $1,000. You can only buy whole shares, not fractions. Each stock also has a value attached to it indicating how profitable it is; this is an arbitrary number from 0-100 that serves as a simple rating system.
The end goal is to list the optimal selection of 8 stocks and the best quantity of each to buy without going over the $2,500 per-stock limit.
• I'm not asking for investment advice; I chose stocks because they act as a good metaphor for the actual problem I'm trying to solve.
• Seems like what I'm looking at is a more complex version of the 0/1 Knapsack problem: https://en.wikipedia.org/wiki/Knapsack_problem.
• No, this isn't homework.

Here is lightly tested code that solves your problem exactly, in time polynomial in the amount of money available, the number of stocks you have, and the maximum number of shares you can buy of one stock.
#!/usr/bin/env python3
from collections import namedtuple

Stock = namedtuple('Stock', ['id', 'price', 'profit'])

def optimize(stocks, money=10000, max_stocks=8, max_per_stock=2500):
    Investment = namedtuple('Investment', ['profit', 'stock', 'quantity', 'previous_investment'])
    # One (old, new) pair of DP tables per stock slot; keys are money still available.
    investment_transitions = []
    last_investments = {money: Investment(0, None, None, None)}
    for _ in range(max_stocks):
        next_investments = {}
        investment_transitions.append([last_investments, next_investments])
        last_investments = next_investments

    def prioritize(stock):
        # This puts the best profit/price, as a ratio, first.
        return (-stock.profit / stock.price, stock.price, stock.id)

    for stock in sorted(stocks, key=prioritize):
        # We reverse transitions so we have not yet added the stock to the
        # old investments when we add it to the new investments.
        for old_t, new_t in reversed(investment_transitions):
            for avail, invest in old_t.items():
                for i in range(int(min(avail, max_per_stock) / stock.price)):
                    quantity = i + 1
                    new_avail = avail - quantity * stock.price
                    new_profit = invest.profit + quantity * stock.profit
                    if new_avail not in new_t or new_t[new_avail].profit < new_profit:
                        new_t[new_avail] = Investment(new_profit, stock, quantity, invest)

    best_investment = investment_transitions[0][0][money]
    for transition in investment_transitions:
        for invest in transition[1].values():
            if best_investment.profit < invest.profit:
                best_investment = invest

    # Walk the chain of previous_investment links to recover the purchases.
    purchase = {}
    while best_investment.stock is not None:
        purchase[best_investment.stock] = best_investment.quantity
        best_investment = best_investment.previous_investment
    return purchase

optimize([Stock('A', 100, 10), Stock('B', 1040, 160)])
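A quick check of that sample call (my own hand-trace, so treat the expected numbers as an assumption rather than documented output): the $2,500 per-stock cap allows at most 25 shares of A at $100 and 2 shares of B at $1,040, and buying both maxima fits within $10,000.
purchase = optimize([Stock('A', 100, 10), Stock('B', 1040, 160)])
for stock, quantity in sorted(purchase.items()):
    print(stock.id, quantity)   # expected: A 25, B 2 (total profit 25*10 + 2*160 = 570)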
And here it is with the tiny optimization of deleting a partial investment once we can see that continuing to add stocks to it cannot improve on the best investment found so far. This will probably run orders of magnitude faster than the previous code with your data.
#!/usr/bin/env python3
from collections import namedtuple

Stock = namedtuple('Stock', ['id', 'price', 'profit'])

def optimize(stocks, money=10000, max_stocks=8, max_per_stock=2500):
    Investment = namedtuple('Investment', ['profit', 'stock', 'quantity', 'previous_investment'])
    investment_transitions = []
    last_investments = {money: Investment(0, None, None, None)}
    for _ in range(max_stocks):
        next_investments = {}
        investment_transitions.append([last_investments, next_investments])
        last_investments = next_investments

    def prioritize(stock):
        # This puts the best profit/price, as a ratio, first.
        return (-stock.profit / stock.price, stock.price, stock.id)

    best_investment = investment_transitions[0][0][money]
    for stock in sorted(stocks, key=prioritize):
        profit_ratio = stock.profit / stock.price
        # We reverse transitions so we have not yet added the stock to the
        # old investments when we add it to the new investments.
        for old_t, new_t in reversed(investment_transitions):
            # Copy the items so we can delete from old_t while iterating.
            for avail, invest in list(old_t.items()):
                if avail * profit_ratio + invest.profit <= best_investment.profit:
                    # We cannot possibly improve with this or any other stock.
                    del old_t[avail]
                    continue
                for i in range(int(min(avail, max_per_stock) / stock.price)):
                    quantity = i + 1
                    new_avail = avail - quantity * stock.price
                    new_profit = invest.profit + quantity * stock.profit
                    if new_avail not in new_t or new_t[new_avail].profit < new_profit:
                        new_invest = Investment(new_profit, stock, quantity, invest)
                        new_t[new_avail] = new_invest
                        if best_investment.profit < new_invest.profit:
                            best_investment = new_invest

    purchase = {}
    while best_investment.stock is not None:
        purchase[best_investment.stock] = best_investment.quantity
        best_investment = best_investment.previous_investment
    return purchase
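To see why the pruning is safe: stocks are visited in decreasing profit/price order, so avail * profit_ratio is the most profit the remaining money could possibly earn from this or any later stock. A tiny illustration with invented numbers:
# Invented numbers, not from the question's data.
incumbent_profit = 570   # best complete investment found so far
partial_profit = 300     # profit accumulated by some partial investment
avail = 2000             # money that partial investment still has available
profit_ratio = 0.10      # current stock's profit per dollar; later stocks are no better
# No extension can add more than 2000 * 0.10 = 200 profit, and 300 + 200 < 570,
# so this partial investment can never win and is safely deleted.
assert partial_profit + avail * profit_ratio <= incumbent_profit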

Related

Quantlib Zero Coupon Inflation Swap Ignores CPI Values

I am trying to price a zero-coupon USD CPI inflation swap in QuantLib and Python. My discount curve and the NPV of the fixed leg look good, but I'm a few percentage points off compared to BBG SWPM on the NPV of the inflation leg.
One thing I've noticed is that changing the values of the CPI has no effect on the price of the swap, so I think I'm setting the CPI wrong, and as a result the base index of the swap is wrong. Can anyone see what I'm doing wrong here?
For reference, I've backed this out from looking at the C++ unit tests. If there's a complete Python example I would be interested to see it, but I couldn't find one on my own.
import QuantLib as quantlib
import pandas as pd

start_date = quantlib.Date.from_date(pd.Timestamp(2022, 9, 6))
calc_date = quantlib.Date.from_date(pd.Timestamp(2022, 9, 6))
end_date = quantlib.Date.from_date(pd.Timestamp(2024, 9, 6))
swap_type = quantlib.ZeroCouponInflationSwap.Receiver
calendar = quantlib.TARGET()
day_count_convention = quantlib.ActualActual()
contract_observation_lag = quantlib.Period(3, quantlib.Months)
business_day_convention = quantlib.ModifiedFollowing
nominal = 10e6
fixed_rate = 0.05
cpi_json = '{"columns":[1],"index":[1640908800000,1643587200000,1646006400000,1648684800000,1651276800000,1653955200000,1656547200000,1659225600000],"data":[[277.948],[278.802],[283.716],[287.504],[289.109],[292.296],[296.311],[296.276]]}'
cpi_prints = pd.read_json(cpi_json, orient='split')
# Pretty-printed CPI:
#                   1
# 2021-12-31  277.948
# 2022-01-31  278.802
# 2022-02-28  283.716
# 2022-03-31  287.504
# 2022-04-30  289.109
# 2022-05-31  292.296
# 2022-06-30  296.311
# 2022-07-31  296.276
zero_coupon_observations = pd.DataFrame(index=[0],
                                        data={'1Y': 2.73620,
                                              '2Y': 2.975,
                                              '3Y': 2.967,
                                              '4Y': 2.917,
                                              '5Y': 2.8484})
inflation_yield_term_structure = quantlib.RelinkableZeroInflationTermStructureHandle()
inflation_index = quantlib.USCPI(True, inflation_yield_term_structure)
for date, value in cpi_prints.itertuples():
    # Setting the CPI as fixings, but no matter what I put here the NPV comes out the same.
    # Looks like the base index for the swap is not being set through the CPI prints I put here.
    inflation_index.addFixing(quantlib.Date.from_date(date), value)
inflation_rate_helpers = []
nominal_term_structure = quantlib.YieldTermStructureHandle(
    quantlib.FlatForward(calc_date,
                         0.00,  # Changing this seems to have no effect
                         quantlib.ActualActual()))
for tenor in zero_coupon_observations.columns:
    maturity = calendar.advance(calc_date, quantlib.Period(tenor))
    quote = quantlib.QuoteHandle(quantlib.SimpleQuote(zero_coupon_observations.at[0, tenor] / 100.0))
    helper = quantlib.ZeroCouponInflationSwapHelper(quote,
                                                    contract_observation_lag,
                                                    maturity,
                                                    calendar,
                                                    business_day_convention,
                                                    day_count_convention,
                                                    inflation_index,
                                                    nominal_term_structure)
    inflation_rate_helpers.append(helper)
# Not sure how to choose this number; just taking the 1Y tenor on the calc date?
# I'm pricing a 2Y swap, and will want to price it off its start date as well.
base_zero_rate = zero_coupon_observations.at[0, '1Y'] / 100
inflation_curve = quantlib.PiecewiseZeroInflation(calc_date,
                                                  calendar,
                                                  day_count_convention,
                                                  contract_observation_lag,
                                                  quantlib.Monthly,
                                                  inflation_index.interpolated(),
                                                  base_zero_rate,
                                                  inflation_rate_helpers,
                                                  1.0e-12,
                                                  quantlib.Linear())
inflation_yield_term_structure.linkTo(inflation_curve)
swap = quantlib.ZeroCouponInflationSwap(swap_type,
                                        nominal,
                                        start_date,
                                        end_date,
                                        calendar,
                                        business_day_convention,
                                        day_count_convention,
                                        fixed_rate,
                                        inflation_index,
                                        contract_observation_lag)
# Leaving off the construction of the discount curve for brevity.
# NPV of the fixed leg checks out.
discount_curve = ...
swap_engine = quantlib.DiscountingSwapEngine(discount_curve)
swap.setPricingEngine(swap_engine)
print(swap.NPV())

How to improve Ruby structure for Shopify Script Performance

I'm using a Ruby script in the Shopify Script Editor to manage Gift With Purchase (GWP) promotions as a security measure.
The script currently:
Checks whether the customer is logged in as a professional or is unlogged
Checks whether a minimum amount has been spent in the cart
Ensures that only one "Gift" product has been added to the cart
Removes a "Gift" product if the checkout doesn't have a "Discount Code" or doesn't meet the minimum set in the GWP_SETTINGS = [] object
The problem is that it's generating a lot of production errors like "Your script exceeded the time limit." and "Your script exceeded the cpu limit."
Current usage is CPU: 5% | Memory: 8%, and it increases dizzyingly every time we add a new GWP promotion to the array.
Is there a better way to structure this logic so that it takes less time and memory to process the entire order plus the GWP validation?
Here is the "Line Items" structure:
cart = Input.cart
PRO_TAG = 'professional-tag'
has_pro_tag = cart.customer && cart.customer.tags.include?(PRO_TAG)

GWP_SETTINGS = [
  gwp_1 = {
    "variant_id" => 98989898989898,
    "discount_code" => "DISCOUNT_CODE_1",
    "minimum_requirement" => Money.new(cents: 50 * 100),
    "user_type" => "consumer"
  },
  gwp_2 = {
    "variant_id" => 97979797979797,
    "discount_code" => "DISCOUNT_CODE_1",
    "minimum_requirement" => Money.new(cents: 50 * 100),
    "user_type" => "consumer"
  },
  gwp_3 = {
    "variant_id" => 96969696969696,
    "discount_code" => "DISCOUNT_CODE_1",
    "minimum_requirement" => Money.new(cents: 50 * 100),
    "user_type" => "consumer"
  }
]

def remove_GWP(cart, variant_id)
  cart.line_items.each do |item|
    next if item.variant.id != variant_id
    index = cart.line_items.find_index(item)
    cart.line_items.delete_at(index)
  end
end

def ensure_only_one_GWP_is_added(cart, variant_id)
  cart.line_items.each do |item|
    next if item.variant.id != variant_id
    item.instance_variable_set(:@quantity, 1)
  end
end

GWP_SETTINGS.each do |gwp_item_settings|
  customer_has_discount = cart.discount_code && cart.discount_code.code == gwp_item_settings["discount_code"]
  customer_has_minimum = cart.subtotal_price >= gwp_item_settings["minimum_requirement"]
  gwp_is_for_professional = gwp_item_settings["user_type"] == "professional-tag"

  # UNLOGGED
  if customer_has_discount && customer_has_minimum
    ensure_only_one_GWP_is_added(cart, gwp_item_settings["variant_id"])
  else
    remove_GWP(cart, gwp_item_settings["variant_id"])
  end

  # PRO
  if gwp_is_for_professional && has_pro_tag
    if customer_has_discount && customer_has_minimum
      ensure_only_one_GWP_is_added(cart, gwp_item_settings["variant_id"])
    else
      remove_GWP(cart, gwp_item_settings["variant_id"])
    end
  end
end

Output.cart = cart
You only have 3 settings, but a customer (an order) could have 100+ line items. You know there is only ever 1 customer, 1 order and, for you, 3 GWP settings to use.
Your business logic would be smarter if you looped through the line items only once. Then you are at "this is as fast as I can go" in terms of your algorithm; you cannot go faster than that.
Things like "does this customer have an X or Y?" you do once, not 3 times per line item!
As you check each line item, you can apply your special logic for anything that might affect that line item; see the sketch below.
Basically, this is basic algorithmics. You are doing the most work possible, repetitively, for no reason, and Shopify is puking because of it.
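To make that concrete, here is a minimal single-pass sketch. It's written in Python purely to illustrate the structure (Shopify Scripts are Ruby-only, so this is not drop-in code), and the cart, settings and tag names mirror the question:
# Illustrative Python sketch of the single-pass structure; the Shopify
# objects (cart, line_items, Money) are stand-ins, not a real API.
def apply_gwp(cart, gwp_settings, pro_tag):
    # Per-order facts: compute once, not once per line item per setting.
    has_pro_tag = cart.customer and pro_tag in cart.customer.tags
    code = cart.discount_code and cart.discount_code.code
    subtotal = cart.subtotal_price

    # Index the settings by variant id so each line item costs one lookup.
    by_variant = {s["variant_id"]: s for s in gwp_settings}

    for item in list(cart.line_items):        # copy so removal is safe mid-loop
        setting = by_variant.get(item.variant.id)
        if setting is None:
            continue                          # not a gift variant, nothing to do
        eligible = (code == setting["discount_code"]
                    and subtotal >= setting["minimum_requirement"]
                    and (setting["user_type"] != "professional" or has_pro_tag))
        if eligible:
            item.quantity = 1                 # clamp the gift to one unit
        else:
            cart.line_items.remove(item)      # ineligible gift: drop it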

Problem backtesting with backtrader and btalib: the strategy doesn't act as I intended; I want it to buy when sma5 > sma14 and sell when sma5 < sma14

import btalib
import pandas as pd
import backtrader as bt
import os.path  # To manage paths
import sys  # To find out the script name
import datetime

df = pd.read_csv('C:/Users/User/Desktop/programming/dataset/coin_Bitcoin.csv', parse_dates=True, index_col='Date')
sma14 = btalib.sma(df, period=14)
sma5 = btalib.sma(df, period=5)

class TestStrategy(bt.Strategy):
    params = (
        ('exitbars', 5),
    )

    def log(self, txt, dt=None):
        # Logging function for this strategy
        dt = dt or self.datas[0].datetime.date(0)
        print('%s, %s' % (dt.isoformat(), txt))

    def __init__(self):
        # Keep a reference to the "close" line in the data[0] dataseries
        self.dataclose = self.datas[0].close
        # To keep track of pending orders
        self.order = None
        self.buyprice = None
        self.buycomm = None

    def notify_order(self, order):
        if order.status in [order.Submitted, order.Accepted]:
            # Buy/Sell order submitted/accepted to/by broker - Nothing to do
            return
        # Check if an order has been completed
        # Attention: broker could reject order if not enough cash
        if order.status in [order.Completed]:
            if order.isbuy():
                self.log(
                    'BUY EXECUTED, Price: %.2f, Cost: %.2f, Comm: %.2f' %
                    (order.executed.price,
                     order.executed.value,
                     order.executed.comm))
                self.buyprice = order.executed.price
                self.buycomm = order.executed.comm
            else:  # sell
                self.log('SELL EXECUTED, Price: %.2f, Cost: %.2f, Comm %.2f' %
                         (order.executed.price,
                          order.executed.value,
                          order.executed.comm))
            self.bar_executed = len(self)
        elif order.status in [order.Canceled, order.Margin, order.Rejected]:
            self.log('Order Canceled/Margin/Rejected')
        # Write down: no pending order
        self.order = None

    def notify_trade(self, trade):
        if not trade.isclosed:
            return
        self.log('OPERATION PROFIT, GROSS %.2f, NET %.2f' %
                 (trade.pnl, trade.pnlcomm))

    def next(self):
        #sma = btalib.sma(df, period=30)
        # Simply log the closing price of the series from the reference
        self.log('Close, %.2f' % self.dataclose[0])
        # Check if an order is pending ... if yes, we cannot send a 2nd one
        if self.order:
            return
        # Check if we are in the market
        #if not self.position:
        # Not yet ... we MIGHT BUY if ...
        if sma5[0] > sma14[0]:
            # BUY, BUY, BUY!!! (with all possible default parameters)
            self.log('BUY CREATE, %.2f' % self.dataclose[0])
            # Keep track of the created order to avoid a 2nd order
            self.order = self.buy()
        else:
            # Already in the market ... we might sell
            if sma5[0] < sma14[0]:
                # SELL, SELL, SELL!!! (with all possible default parameters)
                self.log('SELL CREATE, %.2f' % self.dataclose[0])
                self.order = self.sell()

if __name__ == '__main__':
    # Create a cerebro entity
    cerebro = bt.Cerebro()
    # Add a strategy
    cerebro.addstrategy(TestStrategy)
    modpath = os.path.dirname(os.path.abspath(sys.argv[0]))
    datapath = os.path.join(modpath, 'C:/programming/AlgoTrading/backtest/BTC-USD-YF.csv')
    data = bt.feeds.YahooFinanceCSVData(
        dataname=datapath,
        fromdate=datetime.datetime(2020, 5, 1),
        todate=datetime.datetime(2021, 6, 1),
        reverse=False)
    # Add the Data Feed to Cerebro
    cerebro.adddata(data)
    cerebro.broker.setcash(100000.0)
    # Add a FixedSize sizer according to the stake
    #cerebro.addsizer(bt.sizers.FixedSize, stake=10)
    cerebro.addsizer(bt.sizers.FixedSize)
    # Set the commission
    cerebro.broker.setcommission(commission=0.0)
    # Print out the starting conditions
    print('Starting Portfolio Value: %.2f' % cerebro.broker.getvalue())
    # Run over everything
    cerebro.run()
    # Print out the final result
    print('Final Portfolio Value: %.2f' % cerebro.broker.getvalue())
    cerebro.plot()
I've tried hard to make it buy when sma5 > sma14 and sell when sma5 < sma14, but it doesn't behave that way.
I'm using backtrader as the backtesting library and btalib for the indicators that generate the signal, via btalib.sma(df, period); cerebro is backtrader's backtesting engine.
Sometimes it buys and sells every day: buying today, selling tomorrow.
You probably have to invert the order of your df; this was my problem when computing the RSI with btalib.
Example: df = df.iloc[::-1]
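For what it's worth, the usual backtrader idiom is to compute the SMAs inside the strategy with backtrader's built-in indicators, which sidesteps the btalib/DataFrame alignment problem entirely. A minimal sketch using standard backtrader APIs, with the periods from the question:
import backtrader as bt

class SmaCross(bt.Strategy):
    params = (('fast', 5), ('slow', 14),)

    def __init__(self):
        sma_fast = bt.indicators.SimpleMovingAverage(self.data.close, period=self.p.fast)
        sma_slow = bt.indicators.SimpleMovingAverage(self.data.close, period=self.p.slow)
        # +1 when the fast SMA crosses above the slow one, -1 when it crosses below
        self.crossover = bt.indicators.CrossOver(sma_fast, sma_slow)

    def next(self):
        if not self.position and self.crossover > 0:
            self.buy()
        elif self.position and self.crossover < 0:
            self.close()
Acting only on crossings (rather than on every bar where sma5 > sma14) also stops the buy-today-sell-tomorrow churn described above.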

Technical Analysis (MACD) for crypto trading

Background:
I am writing a crypto trading bot for fun and profit.
So far, it connects to an exchange and gets streaming price data.
I am using this price to create a technical indicator (MACD).
Generally for MACD, it is recommended to use closing prices over 26, 12 and 9 days.
However, for my trading strategy, I plan to use data over 26, 12 and 9 minutes.
Question:
I am getting multiple (say 10) price ticks in a minute.
Do I simply average them and round the time up to the next minute (so they all fall in the same minute bucket)? Or is there a better way to handle this?
Many thanks!
This is how I handled it. Streaming data comes in at sub-second intervals; the code checks for a new low and high during the streaming period and builds the candle. Probably ugly since I'm not a trained developer, but it works.
Adjust "...round('20s')" and "if dur > 15:" for whatever candle period you want.
def on_message(self, msg):
    # Note: tkr, ctime, cnx, closeit, self.st, self.df and self.candle are
    # external state defined elsewhere in the class and not shown here.
    df = pd.json_normalize(msg, record_prefix=msg['type'])
    df['date'] = df['time']
    df['price'] = df['price'].astype(float)
    df['low'] = df['low'].astype(float)
    for i in range(0, len(self.df)):
        if i == (len(self.df) - 1):
            self.rounded_time = self.df['date'][i]
            self.rounded_time = pd.to_datetime(self.rounded_time).round('20s')
            self.lhigh = self.df['price'][i]
            self.lhighcandle = self.candle['high'][i]
            self.llow = self.df['price'][i]
            self.lowcandle = self.candle['low'][i]
            self.close = self.df['price'][i]
    if self.lhigh > self.lhighcandle:
        nhigh = self.lhigh
    else:
        nhigh = self.lhighcandle
    if self.llow < self.lowcandle:
        nlow = self.llow
    else:
        nlow = self.lowcandle
    newdata = pd.DataFrame.from_dict({
        'date': self.df['date'],
        'tkr': tkr,
        'open': self.df.price.iloc[0],
        'high': nhigh,
        'low': nlow,
        'close': self.close,
        'vol': self.df['last_size']})
    self.candle = self.candle.append(newdata, ignore_index=True).fillna(0)
    if ctime > self.rounded_time:
        closeit = True
        self.en = time.time()
    if closeit:
        dur = (self.en - self.st)
        if dur > 15:
            self.st = time.time()
            out = self.candle[-1:]
            out.to_sql(tkr, cnx, if_exists='append')
            dat = ['tkr', 0, 0, 100000, 0, 0]
            self.candle = pd.DataFrame([dat], columns=['tkr', 'open', 'high', 'low', 'close', 'vol'])
As far as I know, most or all technical-indicator formulas rely on same-sized bars to produce accurate and meaningful results, so you'll have to do some data transformation. Here's an example of an aggregation technique that uses quantization to get all your bars into uniform sizes; it will convert small bar sizes to larger ones, e.g. second bars to minute bars.
// C#, see Stock.Indicators for .NET (linked below) for more info
quoteHistory
    .OrderBy(x => x.Date)
    .GroupBy(x => x.Date.RoundDown(newPeriod))
    .Select(x => new Quote
    {
        Date = x.Key,
        Open = x.First().Open,
        High = x.Max(t => t.High),
        Low = x.Min(t => t.Low),
        Close = x.Last().Close,
        Volume = x.Sum(t => t.Volume)
    });
See Stock.Indicators for .NET for indicators and related tools.
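If you'd rather stay in Python, pandas can do the same quantization in a few lines. A minimal sketch, assuming (hypothetically) that your ticks sit in a DataFrame with a DatetimeIndex and 'price' and 'size' columns:
import pandas as pd

def ticks_to_bars(ticks: pd.DataFrame, freq: str = '1min') -> pd.DataFrame:
    """Aggregate raw ticks into uniform OHLCV bars."""
    bars = ticks['price'].resample(freq).ohlc()           # open/high/low/close per bucket
    bars['volume'] = ticks['size'].resample(freq).sum()   # total traded size per bucket
    return bars.dropna(subset=['open'])                   # drop buckets with no ticks
Feeding the resulting close column into a MACD over 26/12/9 bars then works the same for minute bars as it does for daily bars.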

Unable to fetch the next_maintext of 2nd page

I want to fetch all the content from the first page of an article (page 1 URL) and only the main text from the second page (page 2 URL), appending it to the main text of page 1. This is just one article; the function parse_indianexpress_archive_links() returns a list of news article URLs. I'm getting all the results from page 1, but the next_maintext column for page 2 outputs <GET http://archive.indianexpress.com/news/congress-approves-2010-budget-plan/442712/2>.
class spider_indianexpress(scrapy.Spider):
    name = 'indianexpress'
    start_urls = parse_indianexpress_archive_links()

    def parse(self, response):
        items = ScrapycrawlerItem()
        separator = ''
        #article_url = response.xpath("//link[@rel = 'canonical']/@href").extract_first()
        article_url = response.request.url
        # max(list, key=len) returns the longest string in the list
        date_updated = max(response.xpath("//div[@class = 'story-date']/text()").extract(), key=len)[-27:]
        if len(date_updated) <= 10:
            date_updated = max(response.xpath("//div[@class = 'story-date']/p/text()").extract(), key=len)[-27:]
        headline = response.xpath("(//div[@id = 'ie2013-content']/h1//text())").extract()
        headline = separator.join(headline)
        image_url = response.css("div.storybigpic.ssss img").xpath("@src").extract_first()
        maintext = response.xpath("//div[@class = 'ie2013-contentstory']//p//text()").extract()
        maintext = ' '.join(map(str, maintext))
        maintext = maintext.replace('\r', '')
        contd = response.xpath("//div[@class = 'ie2013-contentstory']/p[@align = 'right']/text()").extract_first()
        items['date_updated'] = date_updated
        items['headline'] = headline
        items['maintext'] = maintext
        items['image_url'] = image_url
        items['article_url'] = article_url
        next_page_url = response.xpath("//a[@rel='canonical']/@href").extract_first()
        if next_page_url:
            items['next_maintext'] = scrapy.Request(next_page_url, callback=self.parse_page2)
        yield items

    def parse_page2(self, response):
        next_maintext = response.xpath("//div[@class = 'ie2013-contentstory']//p//text()").extract()
        next_maintext = ' '.join(map(str, next_maintext))
        next_maintext = next_maintext.replace('\r', '')
        yield {next_maintext}
Output:
article_url,date_publish,date_updated,description,headline,image_url,maintext,next_maintext
http://archive.indianexpress.com/news/congress-approves-2010-budget-plan/442712/,,"Fri Apr 03 2009, 14:49 hrs ",,Congress approves 2010 budget plan,http://static.indianexpress.com/m-images/M_Id_69893_Obama.jpg,"The Democratic-controlled US Congress on Thursday approved budget blueprints embracing President Barack Obama's agenda but leaving many hard choices until later and a government deeply in the red. With no Republican support, the House of Representatives and Senate approved slightly different, less expensive versions of Obama's $3.55 trillion budget plan for fiscal 2010, which begins on October 1. The differences will be worked out over the next few weeks. Obama, who took office in January after eight years of the Republican Bush presidency, has said the Democrats' budget is critical to turning around the recession-hit US economy and paving the way for sweeping healthcare, climate change and education reforms he hopes to push through Congress this year. Obama, traveling in Europe, issued a statement praising the votes as ""an important step toward rebuilding our struggling economy."" Vice President Joe Biden, who serves as president of the Senate, presided over that chamber's vote. Democrats in both chambers voted down Republican alternatives that focused on slashing massive deficits with large cuts to domestic social spending but also offered hefty tax breaks for corporations and individuals. ""Democrats know that those policies are the wrong way to go,"" House Majority Leader Steny Hoyer told reporters. ""Our budget lays the groundwork for a sustained, shared and job-creating recovery."" But Republicans have argued the Democrats' budget would be a dangerous expansion of the federal government and could lead to unnecessary taxes that would only worsen the country's long-term fiscal situation. ""The Democrat plan to increase spending, to increase taxes, and increase the debt makes no difficult choices,"" said House Minority Leader John Boehner. ""It's a roadmap to disaster."" The budget measure is nonbinding but it sets guidelines for spending and tax bills Congress will consider later this year. BIPARTISANSHIP ABSENT AGAIN Obama has said he hoped to restore bipartisanship when he arrived in Washington but it was visibly absent on Thursday. ... contd.",<GET http://archive.indianexpress.com/news/congress-approves-2010-budget-plan/442712/2>
This is not how Scrapy works (I mean the next-page request); see How to fetch the Response object of a Request synchronously on Scrapy.
But in fact you don't need synchronous requests. All you need is to check for a next page and pass the current state (the item) to the callback that will process that next page. I'm using cb_kwargs (it's the recommended way now); you may need to use request.meta if you have an old version.
import scrapy

class spider_indianexpress(scrapy.Spider):
    name = 'indianexpress'
    start_urls = ['http://archive.indianexpress.com/news/congress-approves-2010-budget-plan/442712/']

    def parse(self, response):
        item = {}
        separator = ''
        #article_url = response.xpath("//link[@rel = 'canonical']/@href").extract_first()
        article_url = response.request.url
        # max(list, key=len) returns the longest string in the list
        date_updated = max(response.xpath("//div[@class = 'story-date']/text()").extract(), key=len)[-27:]
        if len(date_updated) <= 10:
            date_updated = max(response.xpath("//div[@class = 'story-date']/p/text()").extract(), key=len)[-27:]
        headline = response.xpath("(//div[@id = 'ie2013-content']/h1//text())").extract()
        headline = separator.join(headline)
        image_url = response.css("div.storybigpic.ssss img").xpath("@src").extract_first()
        maintext = response.xpath("//div[@class = 'ie2013-contentstory']//p//text()").extract()
        maintext = ' '.join(map(str, maintext))
        maintext = maintext.replace('\r', '')
        contd = response.xpath("//div[@class = 'ie2013-contentstory']/p[@align = 'right']/text()").extract_first()
        item['date_updated'] = date_updated
        item['headline'] = headline
        item['maintext'] = maintext
        item['image_url'] = image_url
        item['article_url'] = article_url
        next_page_url = response.xpath('//a[@rel="canonical"][@id="active"]/following-sibling::a[1]/@href').extract_first()
        if next_page_url:
            yield scrapy.Request(
                url=next_page_url,
                callback=self.parse_next_page,
                cb_kwargs={
                    'item': item,
                }
            )
        else:
            yield item

    def parse_next_page(self, response, item):
        next_maintext = response.xpath("//div[@class = 'ie2013-contentstory']//p//text()").extract()
        next_maintext = ' '.join(map(str, next_maintext))
        next_maintext = next_maintext.replace('\r', '')
        item["maintext"] += next_maintext
        next_page_url = response.xpath('//a[@rel="canonical"][@id="active"]/following-sibling::a[1]/@href').extract_first()
        if next_page_url:
            yield scrapy.Request(
                url=next_page_url,
                callback=self.parse_next_page,
                cb_kwargs={
                    'item': item,
                }
            )
        else:
            yield item
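If it helps, the spider above has no project-specific dependencies (the item is a plain dict), so, assuming you save it to a file of your choosing, it should run standalone with something like scrapy runspider indianexpress_spider.py -o items.csv.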
