Implementation of parallel class using CUDA C++? - parallel-processing

I've been trying to parallelize my class and its constructor using CUDA.
Below you can find both the serial version and the parallelized version of my class. The parallel version compiles correctly, and I wonder whether any improvements can be made to it.
Serial Code Ray.h:
#pragma once
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <math.h>
#include <vector>
#include <algorithm>
#include <complex>
#include "arithmatic_operations.h"
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
/// Serial representation of a single ray: up to three stored points and
/// directions (one {x, y, z} triple per slot) plus the field quantities
/// carried along with the ray.
class Ray
{
public:
    // Three slots of {x, y, z}; the constructor fills slot (no_bounces + 1), i.e. slot 0.
    std::vector<std::vector<double>> Point = { { 0,0,0 }, { 0,0,0 }, { 0,0,0 } };
    std::vector<std::vector<double>> Direction = { { 0,0,0 }, { 0,0,0 }, { 0,0,0 } };
    double no_bounces = -1;   // -1 until changed by code outside this class
    double length = -1;       // -1 until changed by code outside this class
    std::vector<double> E_thei = { 0,0,0 };
    std::vector<double> E_phii = { 0,0,0 };
    std::complex<double> Er_the_the = 0;
    std::complex<double> Er_phi_the = 0;
    std::complex<double> Er_the_phi = 0;
    std::complex<double> Er_phi_phi = 0;
    double Ai = 0;

    /// @param OO    origin; elements 0..2 are copied into Point[no_bounces + 1].
    /// @param DD    3x3 matrix: row 0 is the direction (stored divided by its
    ///              norm), row 1 is copied to E_thei, row 2 to E_phii.
    /// @param delta stored squared in Ai.
    ///
    /// The vectors are taken by const reference so that callers no longer pay
    /// for a deep copy of OO and DD on every construction.
    Ray(const std::vector<double>& OO, const std::vector<std::vector<double>>& DD, double delta)
    {
        // no_bounces is a double; convert it once instead of indexing with a double.
        const std::size_t slot = static_cast<std::size_t>(no_bounces + 1);

        this->Point[slot][0] = OO[0];
        this->Point[slot][1] = OO[1];
        this->Point[slot][2] = OO[2];

        // norm() comes from arithmatic_operations.h (presumably the vector
        // magnitude - confirm there). It is evaluated once instead of three times.
        // NOTE(review): a zero norm is not guarded against, as in the original.
        std::vector<double> first_row_DD = { DD[0][0], DD[0][1], DD[0][2] };
        const auto magnitude = norm(first_row_DD);

        this->Direction[slot][0] = DD[0][0] / magnitude;
        this->Direction[slot][1] = DD[0][1] / magnitude;
        this->Direction[slot][2] = DD[0][2] / magnitude;

        this->E_thei[0] = DD[1][0];
        this->E_thei[1] = DD[1][1];
        this->E_thei[2] = DD[1][2];

        this->E_phii[0] = DD[2][0];
        this->E_phii[1] = DD[2][1];
        this->E_phii[2] = DD[2][2];

        // A plain multiplication replaces the general-purpose pow(delta, 2).
        this->Ai = delta * delta;
    }
};
and below there is the code I parallelized:
Ray.cuh:
#include <iostream>
#include <array>
#include <fstream>
#include <string>
#include <sstream>
#include <cstddef>
#include <utility>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/copy.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/zip_function.h>
#include <thrust/execution_policy.h>
/// Device-side helper returning x * x.
/// The argument is taken by value so that const values and temporaries are
/// accepted as well as the lvalues the old `double&` signature required.
__device__ double square(double x) { return x * x; }
/// Unary device functor that squares its argument (used with thrust::transform).
struct myPow
{
    /// @param x value to square; const reference so const and temporary
    ///          elements are accepted.
    /// @return  x * x
    __device__
    double operator()(const double& x) const { return x * x; }
};
/// Device functor: maps a tuple of three doubles to the square root of the
/// sum of their squares.
struct CalculateNormValues
{
    __device__
    double operator()(const thrust::tuple<double, double, double>& t) const
    {
        const double cx = thrust::get<0>(t);
        const double cy = thrust::get<1>(t);
        const double cz = thrust::get<2>(t);
        return (double)sqrt(cx * cx + cy * cy + cz * cz);
    }
};
/// Device functor: takes (x, y, z, norm) and returns (x / norm, y / norm, z / norm).
struct Normalize
{
    __device__
    thrust::tuple<double, double, double> operator()(const thrust::tuple<double, double, double, double>& t) const
    {
        // The fourth tuple element is the divisor applied to the first three.
        const double divisor = thrust::get<3>(t);
        return thrust::make_tuple(thrust::get<0>(t) / divisor,
                                  thrust::get<1>(t) / divisor,
                                  thrust::get<2>(t) / divisor);
    }
};
/// Structure-of-arrays ray container: every quantity is stored as one
/// thrust::device_vector<double> per component, with one element per ray.
class Ray {
    static constexpr int n_dims = 3;
    static constexpr int cn_dims = 2;
    using Container = thrust::device_vector<double>;
    using Vectors = std::array<Container, n_dims>;
    using Matrices = std::array<Container, n_dims * n_dims>;
    using Complexes = std::array<Container, cn_dims>;
public:
    std::ptrdiff_t n_rays{};
    Vectors E_thei;
    Vectors E_phii;
    Matrices Point;      // only the first three of the nine containers are filled by the constructor
    Matrices Direction;  // only the first three of the nine containers are filled by the constructor
    Complexes Er_the_the;
    Complexes Er_phi_the;
    Complexes Er_the_phi;
    Complexes Er_phi_phi;
    Container Ai;
    Container no_bounces;  // left empty by the constructor
    Container length;      // left empty by the constructor
    Container normValues;  // per-ray magnitude of the un-normalized direction

    /// Takes ownership of the argument vectors: each one is moved into a
    /// member, so the caller's vectors are left in a moved-from state.
    ///
    /// @param OO_0,OO_1,OO_2     origin components, moved into Point[0..2]
    /// @param DD_00,DD_01,DD_02  direction components, moved into
    ///                           Direction[0..2] and then normalized in place
    /// @param DD_10,DD_11,DD_12  moved into E_thei[0..2]
    /// @param DD_20,DD_21,DD_22  moved into E_phii[0..2]
    /// @param delta              moved into Ai and then squared element-wise
    Ray(thrust::device_vector<double>& OO_0,
        thrust::device_vector<double>& OO_1,
        thrust::device_vector<double>& OO_2,
        thrust::device_vector<double>& DD_00,
        thrust::device_vector<double>& DD_01,
        thrust::device_vector<double>& DD_02,
        thrust::device_vector<double>& DD_10,
        thrust::device_vector<double>& DD_11,
        thrust::device_vector<double>& DD_12,
        thrust::device_vector<double>& DD_20,
        thrust::device_vector<double>& DD_21,
        thrust::device_vector<double>& DD_22,
        thrust::device_vector<double>& delta) :
        // Listed in declaration order, which is the order the members are
        // actually initialized in; n_rays reads OO_0.size() before OO_0 is moved.
        n_rays{ static_cast<std::ptrdiff_t>(OO_0.size()) },
        E_thei{ std::move(DD_10),
                std::move(DD_11),
                std::move(DD_12) },
        E_phii{ std::move(DD_20),
                std::move(DD_21),
                std::move(DD_22) },
        Point{ std::move(OO_0),
               std::move(OO_1),
               std::move(OO_2) },
        Direction{ std::move(DD_00), // normalized in the constructor body
                   std::move(DD_01),
                   std::move(DD_02) },
        Ai{ std::move(delta) }       // squared in the constructor body
    {
        // Ai <- delta^2, element-wise.
        thrust::transform(Ai.begin(), Ai.end(), Ai.begin(), myPow());

        // One norm per ray. The output range must be as long as the input
        // range; the previous resize(3) made the transform below write past
        // the end of normValues whenever there were more than three rays.
        normValues.resize(Direction[0].size());
        thrust::transform(
            thrust::make_zip_iterator(thrust::make_tuple(Direction[0].begin(), Direction[1].begin(), Direction[2].begin())),
            thrust::make_zip_iterator(thrust::make_tuple(Direction[0].end(), Direction[1].end(), Direction[2].end())),
            normValues.begin(),
            CalculateNormValues{});

        // Divide each direction component by its ray's norm, in place.
        thrust::transform(
            thrust::make_zip_iterator(
                thrust::make_tuple(Direction[0].begin(), Direction[1].begin(), Direction[2].begin(), normValues.begin())),
            thrust::make_zip_iterator(
                thrust::make_tuple(Direction[0].end(), Direction[1].end(), Direction[2].end(), normValues.end())),
            thrust::make_zip_iterator(
                thrust::make_tuple(Direction[0].begin(), Direction[1].begin(), Direction[2].begin())),
            Normalize{});
    }
};
The program compiles but I'd like to ask some questions.
When I use thrust::transform I know that the thrust library does the memory allocation and copying on the device for me. I wonder after the operation is done, does it copy back to the host? After
thrust::transform(Ai.begin(), Ai.end(), Ai.begin(), myPow());
if I write a line such as Ai[0] = 5, is that line executed on the CPU or on the GPU?
My second question is whether I can write device code for my parallelized class using a __global__ kernel and CUDA threads. If so: for example, after moving the device_vectors OO_1 and OO_2 into the member Point, suppose I want to do math on them inside a __global__ function. Since device_vector objects can only be used from host code, do I need to copy them into C arrays, allocate memory on the device and do the math there?

Related

Efficient search for an element other than the one used for ordering in std::set

Consider the following set implementation. Here I have ordered the set based on fScore parameter. What should I do If I want to search for an element of particular 'id' in 'NodeData'.
I know I can use 'find' to search for any element of 'fScore' in the set with O(logn).
Is there any efficient way to search for 'id' (less time) than a linear search (implemented below)?
#include<iostream>
#include<algorithm>
#include<iterator>
#include<set>
#include<stdlib.h>
#include<vector>
/// Node record whose ordering (operator<) is defined by fScore only.
struct NodeData {
    int id;
    int parent;
    double fScore;
    double gScore;
    double hScore;
    std::vector<double> nScores;

    /// All arguments are optional; the constructor is intentionally not
    /// explicit, so a single int converts to a NodeData with that id.
    NodeData(const int& idIn = 0,
             const int& parentIn = -1,
             const double& fIn = 1,
             const double& gIn = 1,
             const double& hIn = 1)
        : id(idIn)
        , parent(parentIn)
        , fScore(fIn)
        , gScore(gIn)
        , hScore(hIn)
    {
    }

    /// Strict ordering on fScore alone; every other field is ignored.
    bool operator<(const NodeData& rhs) const {
        return rhs.fScore > fScore;
    }
};
/// Holder for the node set; elements are ordered by NodeData::operator<.
struct test
{
    std::set<NodeData> NodeList;
};
int main()
{
test q;
for(int i=1;i<=5;i++)
{
NodeData n1 = {i,1,i,1,1};
q.NodeList.insert(n1);
}
std::set<NodeData>::iterator it;
//search for node with fScore 1 - cost O(logn)
it = q.NodeList.find(1);
if(it != q.NodeList.end()){
std::cout<<"node with fScore 1 found. id = "<<it->id<<std::endl;
}
else{
std::cout<<"node not found = "<<std::endl;
}
//searching for id=3 - Linear search - cost O(n)
int searchId = 3;
std::set<NodeData>::iterator it1 = q.NodeList.begin();
while(it1 != q.NodeList.end())
{
if(it1->id == searchId)
{
std::cout <<"found node with id = "<<it1->id<<std::endl;
}
it1++;
}
}
Does your set change often? If not, you could consider building an "index": an unordered_map<> from whatever field you need to the corresponding element of your set.
There is a cost to maintaining such an "index"; you should check whether it outweighs the benefit of the faster search.
You can't achieve this without using different/additional data structures. If you're using C++ and you're OK with using a library, you can find this functionality in the Boost multi-index containers library.
Adding to Falk's answer, here's an example of how the thing could be done with Boost.MultiIndex:
Live On Coliru
#include <iostream>
#include <algorithm>
#include <iterator>
#include <stdlib.h>
#include <vector>
#include <boost/multi_index_container.hpp>
#include <boost/multi_index/identity.hpp>
#include <boost/multi_index/member.hpp>
#include <boost/multi_index/ordered_index.hpp>
/// Node record whose ordering (operator<) is defined by fScore only.
struct NodeData {
    int id;
    int parent;
    double fScore;
    double gScore;
    double hScore;
    std::vector<double> nScores;

    /// All arguments are optional; the constructor is intentionally not
    /// explicit, so a single int converts to a NodeData with that id.
    NodeData(const int& idIn = 0,
             const int& parentIn = -1,
             const double& fIn = 1,
             const double& gIn = 1,
             const double& hIn = 1)
        : id(idIn)
        , parent(parentIn)
        , fScore(fIn)
        , gScore(gIn)
        , hScore(hIn)
    {
    }

    /// Strict ordering on fScore alone; every other field is ignored.
    bool operator<(const NodeData& rhs) const {
        return rhs.fScore > fScore;
    }
};
/// Holder for a Boost.MultiIndex container of NodeData with two unique
/// ordered indices: index 0 on the whole element (NodeData::operator<),
/// index 1 on NodeData::id.
struct test
{
    // Index 0: ordered by the element itself.
    typedef boost::multi_index::ordered_unique<
        boost::multi_index::identity<NodeData>
    > ByElement;

    // Index 1: ordered by the id member.
    typedef boost::multi_index::ordered_unique<
        boost::multi_index::member<NodeData, int, &NodeData::id>
    > ById;

    typedef boost::multi_index_container<
        NodeData,
        boost::multi_index::indexed_by<ByElement, ById>
    > NodeListType;

    NodeListType NodeList;
};
int main()
{
test q;
for(int i=1;i<=5;i++)
{
NodeData n1 = {i,1,double(i),1,1};
q.NodeList.insert(n1);
}
test::NodeListType::iterator it;
//search for node with fScore 1 - cost O(logn)
it = q.NodeList.find(1);
if(it != q.NodeList.end()){
std::cout<<"node with fScore 1 found. id = "<<it->id<<std::endl;
}
else{
std::cout<<"node not found = "<<std::endl;
}
//searching for id=3 on second index - cost O(logn)
int searchId = 3;
test::NodeListType::nth_index<1>::type::iterator it1 = q.NodeList.get<1>().find(searchId);
if(it1 != q.NodeList.get<1>().end()){
std::cout <<"found node with id = "<<it1->id<<std::endl;
}
}
If, instead of an ordered index, you use a hashed index for NodeData::id, lookup is constant (in average).

dcmtk display image Qt example

I would like to display a DICOM image in a Qt project with the same rendering that a DICOM viewer program would give.
I was able to display it, but with very poor contrast. I have heard that you need to operate on the pixels, but I'm not sure. Do you have a working example?
EDIT: I add my code in case it helps you, I commented a lot of things because I noticed the result was exactly the same
#include "mainwindow.h"
#include "ui_mainwindow.h"
#include <iostream>
#undef UNICODE
#undef _UNICODE
#include <dcmtk/config/osconfig.h>
#include <dcmtk/dcmdata/dctk.h>
#include <dcmtk/dcmimgle/dcmimage.h>
#include <QPixmap>
#include <QLabel>
#include <QImageReader>
using namespace std;
/**
 * Builds the main window and shows one monochrome DICOM image in
 * ui->graphicsView.
 *
 * Changes from the previous version:
 *  - the DicomImage lives on the stack, so it is released on every path
 *    (the old `new DicomImage` was never deleted);
 *  - a min/max VOI window is applied before rendering, so the 8-bit output
 *    spans the image's actual value range instead of looking washed out;
 *  - the QImage is given an explicit bytes-per-line and a grayscale colour
 *    table, which an Indexed8 image needs in order to map 0..255 to gray;
 *  - non-monochrome images are rejected, because the 8-bit indexed path
 *    below cannot represent them.
 *
 * @param parent parent widget, forwarded to QMainWindow.
 */
MainWindow::MainWindow(QWidget *parent) :
    QMainWindow(parent),
    ui(new Ui::MainWindow)
{
    ui->setupUi(this);

    const char *file = "/home/x4rkz/project/Laura/QTImage/IMG00000";
    DicomImage image(file);

    if (image.getStatus() != EIS_Normal)
    {
        cerr << "Error: cannot load DICOM image (" << DicomImage::getString(image.getStatus()) << ")" << endl;
        return;
    }
    if (!image.isMonochrome())
    {
        cout << "Non monochrome image" << endl;
        return;
    }

    // Stretch the stored value range over the output range before rendering.
    image.setMinMaxWindow();

    const int width = static_cast<int>(image.getWidth());
    const int height = static_cast<int>(image.getHeight());

    // Pointer to DCMTK's internal 8-bit output buffer; it is owned by `image`.
    const Uint8 *pixelData = static_cast<const Uint8 *>(image.getOutputData(8 /* bits per sample */));
    if (pixelData == NULL)
        return;

    // The buffer holds `width` bytes per row with no padding, so the stride is
    // passed explicitly; without it QImage expects 32-bit aligned scanlines.
    QImage img(pixelData, width, height, width, QImage::Format_Indexed8);
    img.setColorCount(256);
    for (int i = 0; i < 256; i++)
        img.setColor(i, qRgb(i, i, i));

    // fromImage() copies the pixels, so the pixmap does not depend on `image`
    // once this constructor returns.
    QGraphicsScene *graphic = new QGraphicsScene(this);
    graphic->addPixmap(QPixmap::fromImage(img));
    ui->graphicsView->setScene(graphic);
}
// Releases the Ui::MainWindow object allocated with `new` in the constructor.
MainWindow::~MainWindow()
{
delete ui;
}
#include "mainwindow.h"
#include <QApplication>
#include <iostream>
#undef UNICODE
#undef _UNICODE
#include <dcmtk/config/osconfig.h>
#include <dcmtk/dcmdata/dctk.h>
#include <dcmtk/dcmimgle/dcmimage.h>
#include <QPixmap>
#include <QLabel>
#include <QImageReader>
using namespace std;
int main(int argc, char *argv[])
{
QApplication a(argc, argv);
MainWindow w;
w.show();
return a.exec();
}
As you can see, the result has no contrast.
If the rendered image has such low contrast, you should try to set an appropriate VOI (Value of Interest) window, e.g. using image->setMinMaxWindow(). See the API documentation for details.

Sound cards don't show up in /proc/asound/cards

I've this driver:
#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/gpio.h>
#include <linux/gpio/consumer.h>
#include <linux/kernel.h>
#include <linux/mod_devicetable.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>
#include <sound/pcm.h>
#include <sound/soc.h>
#include <sound/soc-dai.h>
#include <sound/soc-dapm.h>
/*
 * DAPM routing table with a single entry: {"Mono out", NULL, "Mono Mixer"}.
 * NOTE(review): no DAPM widgets with these names are declared anywhere in
 * the visible source - confirm where "Mono out" and "Mono Mixer" are defined.
 */
static const struct snd_soc_dapm_route max9880_dapm_routes[] = {
{"Mono out", NULL, "Mono Mixer"}
};
/*
 * Codec driver description passed to snd_soc_register_codec() in the probe
 * function. Only the DAPM routes are filled in; every other field is left
 * at its zero default.
 */
static struct snd_soc_codec_driver soc_codec_dev_max9880 = {
.component_driver = {
.dapm_routes = max9880_dapm_routes,
.num_dapm_routes = ARRAY_SIZE(max9880_dapm_routes)
}
};
/*
 * The single DAI registered with the codec: a playback-only stream named
 * "Playback" with exactly one channel.
 * NOTE(review): .rates and .formats are not set here, and no capture stream
 * or DAI ops are provided - confirm this is intended.
 */
static struct snd_soc_dai_driver max9880_dai = {
.name = "max9880",
.playback = {
.stream_name = "Playback",
.channels_min = 1,
.channels_max = 1
}
};
/*
 * Platform probe: registers the codec description and its one DAI for the
 * probed device.
 *
 * Returns the result of snd_soc_register_codec() (0 on success).
 *
 * NOTE(review): this registers a codec only. Nothing in this file registers
 * a sound card (snd_soc_card / snd_soc_register_card), which is normally
 * done by a separate machine driver - confirm one exists, since a codec on
 * its own is not expected to appear in /proc/asound/cards.
 */
static int max9880_platform_probe(struct platform_device *pdev)
{
	int ret;

	/* Newline-terminated so the message is emitted as a complete log line. */
	snd_printk(KERN_ALERT "1. platform probe\n");

	ret = snd_soc_register_codec(&pdev->dev, &soc_codec_dev_max9880,
				     &max9880_dai, 1);
	if (ret)
		dev_err(&pdev->dev, "codec registration failed: %d\n", ret);

	return ret;
}
/*
 * Platform remove: unregisters the codec that the probe function registered
 * for this device. Always returns 0.
 */
static int max9880_platform_remove(struct platform_device *pdev)
{
	/* Newline-terminated so the message is emitted as a complete log line. */
	snd_printk(KERN_ALERT "2. platform remove\n");

	snd_soc_unregister_codec(&pdev->dev);
	return 0;
}
/*
 * Device-tree match table: matches nodes whose compatible string is
 * "max9880". The empty entry terminates the table.
 */
static const struct of_device_id max9880_device_id[] = {
{ .compatible = "max9880" },
{}
};
MODULE_DEVICE_TABLE(of, max9880_device_id);
/*
 * Platform driver definition tying the name "max9880" and the device-tree
 * match table to the probe/remove callbacks. module_platform_driver()
 * generates the module init/exit code that registers and unregisters it.
 * NOTE(review): no MODULE_LICENSE() appears in the visible source - confirm
 * it is declared elsewhere.
 */
static struct platform_driver max9880_platform_driver = {
.driver = {
.name = "max9880",
.of_match_table = of_match_ptr(max9880_device_id),
},
.probe = &max9880_platform_probe,
.remove = &max9880_platform_remove,
};
module_platform_driver(max9880_platform_driver);
and I use insmod to load the module into the kernel. This all seems to work well, and I'm also able to run rmmod without any problem. However, I don't get any entries in /proc/asound/cards, meaning that my module isn't recognized as a sound card. What am I missing?

inserting a range of struct vector into a vector of a struct member type

Is it possible to insert range of struct directly into vector of the same type (same type of a member of struct).
Let's have a struct and vectors like this:
// Example record from the question; only _type is to be copied out.
struct pnt {
char _name;
int _type;
bool _aux;
};
// Source range of records.
std::vector<pnt> pnts;
// Destination: should receive the _type value of each selected element of pnts.
std::vector<int> pntType;
The question is how to insert a range of pnts into pntType using a single line of standard C++98, such as:
void insert (iterator position, InputIterator first, InputIterator last);
or even Boost library.
Since I am using this often in different parts of my code, I am trying to avoid doing this in a loop. The last option is defining a function for that.
EDIT:
I know the insert syntax. What I cannot work out is how to insert from pnts (only the _type member of each element) into pntType.
UPDATE: There is a better way than my first suggestion (see bottom), since we're already using Boost. The problem with std::transform and std::insert_iterator is that v2 is resized several times, which is wasteful considering that we know the width of the range in advance. Using boost::transform_iterator and boost::bind, it is possible to avoid the problem like this:
#include <boost/bind.hpp>
#include <boost/iterator/transform_iterator.hpp>
#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>
// Minimal aggregate with a single int member, used as the element type of the source vector.
struct A {
int x;
};
int main() {
A arr[] = {
{ 0 }, { 1 }, { 2 }, { 3 }, { 4 }, { 5 }, { 6 }
};
std::vector<A> v1(arr, arr + 6);
std::vector<int> v2;
v2.insert(v2.begin(),
boost::make_transform_iterator(v1.begin() + 2, boost::bind(&A::x, _1)),
boost::make_transform_iterator(v1.begin() + 4, boost::bind(&A::x, _1)));
std::copy(v2.begin(), v2.end(), std::ostream_iterator<int>(std::cout, "\n"));
}
OLD SUGGESTION:
boost::bind works with data member pointers, so using C++98 and Boost, you could do something like this without changing your struct:
#include <boost/bind.hpp>
#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>
// Minimal aggregate with a single int member, used as the element type of the source vector.
struct A {
int x;
};
int main() {
A arr[] = {
{ 0 }, { 1 }, { 2 }, { 3 }, { 4 }, { 5 }, { 6 }
};
std::vector<A> v1(arr, arr + 6);
std::vector<int> v2;
// one-liner here:
std::transform(v1.begin() + 2,
v1.begin() + 4,
std::insert_iterator<std::vector<int> >(v2, v2.begin()),
boost::bind(&A::x, _1));
std::copy(v2.begin(), v2.end(), std::ostream_iterator<int>(std::cout, "\n"));
}
Using boost range:
boost::copy(pnts | transformed(std::mem_fn(&pnt::_type)), std::back_inserter(pntType));
Or even
boost::copy_range<std::vector<int>>(pnts | transformed(std::mem_fn(&pnt::_type)));
See it Live on Coliru
Note that you can use boost::bind(&pnt::_type, _1) instead of mem_fn if your compiler version requires it.
Updated To show with specific first/last iterators, and compiling in c++03 mode:
Live On Coliru
#include <boost/range/algorithm.hpp>
#include <boost/range/adaptors.hpp>
#include <boost/range/iterator_range.hpp>
#include <boost/bind.hpp>
using namespace boost::adaptors;
using namespace boost;
// Record type from the question; only _type is extracted by the code below.
struct pnt {
char _name;
int _type;
bool _aux;
};
int main() {
std::vector<pnt> pnts(6);
std::vector<int> pntType;
boost::copy(
make_iterator_range(pnts.begin(), pnts.begin()+3) | transformed(bind(&pnt::_type, _1)),
std::back_inserter(pntType));
}
Inserting one container into the other works like this:
pntType.insert(pntType.begin(),pnts.begin(),pnts.end());
To be able to insert the correct type, you should add a conversion operator to int to your struct.
// Record type with an implicit conversion to int, so that a range of pnt can
// be inserted directly into a std::vector<int>.
struct pnt {
    char _name;
    int _type;
    bool _aux;

    // Yields _type. Declared const so that const objects - and elements
    // reached through const iterators or const references - convert too.
    operator int() const {
        return _type;
    }
};

Difficulty in implementing a segment tree with lazy propagation

I am having trouble implementing a segment tree with lazy propagation. I have just read about segment trees and tried to solve a simple problem (http://www.codechef.com/problems/FLIPCOIN) with one, but I am getting a wrong answer. Please help me with the implementation. Here is my code (if you prefer ideone: http://ideone.com/SHVZ5y):
#include <iostream>
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <utility>
#include <map>
#include <vector>
#include <list>
#include <string>
#include <set>
#include <queue>
// Short-hand helper macros.
// Loop macros use __typeof__ (the spelling GCC/Clang accept in strict
// standard modes) to give the index the type of the start expression.
#define s(x) scanf("%d",&x)
#define sil(x) scanf("%llu",&x)
#define sd(x) scanf("%ld",&x)
#define FOR(i,a,b) for( __typeof__(a) i=(a); i<(b); ++i) // exclusive for
#define FORR(i,a,b) for( __typeof__(a) i=(a)-1 ; i>=(b); --i) // counts down from a-1 to b
#define REP(k,a,b) for(__typeof__(a) k=(a); k <= (b); ++k) // inclusive for
#define REPR(i,a,b) for( __typeof__(a) i=(a) ; i>=(b); --i)
#define ALL(c) (c).begin(), (c).end()
#define PB push_back
#define MP make_pair
#define SZ(x) ((int)((x).size()))
#define SRT(v) std::sort(ALL(v))
#define CTN(x) std::cout<<x<<'\n' //cout with newline
#define CTS(x) std::cout<<x<<" " //cout with space
#define CLR(x) std::memset(x,0,sizeof(x))
// x must be a real array (not a pointer). The element count is
// sizeof(x)/sizeof(x[0]); the old form passed the size in BYTES to fill_n and
// wrote past the end of any array whose elements are wider than one byte.
#define FILL(x,n) std::fill_n(x,sizeof(x)/sizeof((x)[0]),n)
// std::cout is qualified so the macro does not depend on a using-directive.
#define DBGA(x,n) {FOR(i,0,n) std::cout<<x[i]<<" "; CTN(" ");}
//#define NL printf("\n")
// Short aliases for commonly used container and integer types.
typedef std::vector<int> VI;
typedef std::vector<long long int> VL;
typedef std::vector<std::string> VS;
typedef std::map<int,int> MI;
typedef std::pair<int,int> PII;
typedef unsigned long long ull;
typedef long long ll;
using namespace std;
// Segment tree over coin positions, stored as an implicit binary heap in
// `tree`: node n has children 2*n+1 and 2*n+2.
struct node{
    int h;    // number of heads in this node's range
    int t;    // number of tails in this node's range
    int lazy; // 1 if a flip of this node's whole range is still pending on the node itself
    node()
    {
        h=0;
        t=0;
        lazy=0;
    }
}tree[300000];

// Flips every coin in node n's range [l, r]: swaps its head/tail counts and,
// for an inner node, records the flip on both children. The children's flags
// are TOGGLED, not set: two pending flips cancel each other, so assigning 1
// (as the previous code did) loses a flip whenever one was already pending.
static void apply_flip(int n,int l,int r)
{
    int tmp=tree[n].h;
    tree[n].h=tree[n].t;
    tree[n].t=tmp;
    if(r!=l)
    {
        tree[2*n+1].lazy^=1;
        tree[2*n+2].lazy^=1;
    }
}

// Applies node n's pending flip, if any, and clears the flag.
static void push_pending(int n,int l,int r)
{
    if(tree[n].lazy!=0)
    {
        apply_flip(n,l,r);
        tree[n].lazy=0;
    }
}

// Builds the tree for positions [a, b] rooted at node n; every coin starts as a tail.
void build_tree(int n,int a,int b)
{
    if(a>b)
        return;
    if(a==b)
    {
        tree[n].h=0;
        tree[n].t=1;
        return;
    }
    build_tree(2*n+1,a,(a+b)/2);
    build_tree(2*n+2,(a+b)/2+1,b);
    tree[n].h=tree[2*n+1].h+tree[2*n+2].h;
    tree[n].t=tree[2*n+1].t+tree[2*n+2].t;
}

// Returns the number of heads in [ql, qr]; node n covers [l, r].
int query(int n,int ql,int qr,int l,int r)
{
    // Resolve the pending flip first so this node's counts are current.
    push_pending(n,l,r);
    if(l>qr || r<ql)
        return 0;
    if(l>=ql && r<=qr)
        return tree[n].h;
    return query(2*n+1,ql,qr,l,(l+r)/2)+query(2*n+2,ql,qr,(l+r)/2+1,r);
}

// Flips every coin in [ul, ur]; node n covers [l, r].
void update(int n,int ul,int ur,int l,int r)
{
    // Resolve the pending flip before the range checks: the parent re-sums
    // this node's counts after the recursive calls return.
    push_pending(n,l,r);
    if(l>ur || r<ul)
        return ;
    if(l>=ul && r<=ur)
    {
        // Fully covered: flip this node now and defer the children.
        apply_flip(n,l,r);
        return;
    }
    update(2*n+1,ul,ur,l,(l+r)/2);
    update(2*n+2,ul,ur,(l+r)/2+1,r);
    tree[n].h=tree[2*n+1].h+tree[2*n+2].h;
    tree[n].t=tree[2*n+1].t+tree[2*n+2].t;
}
// Reads the number of coins and the number of commands, then processes each
// command "type l r": a non-zero type prints the number of heads in [l, r],
// type 0 flips every coin in [l, r].
int main()
{
    std::ios_base::sync_with_stdio(false);

    int coinCount;
    std::cin >> coinCount;
    build_tree(0, 0, coinCount - 1);

    int commandsLeft;
    std::cin >> commandsLeft;
    while (commandsLeft--)
    {
        int kind;
        std::cin >> kind;
        int lo, hi;
        std::cin >> lo >> hi;

        if (kind == 0)
        {
            update(0, lo, hi, 0, coinCount - 1);
        }
        else
        {
            std::cout << query(0, lo, hi, 0, coinCount - 1) << '\n';
        }
    }
}
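There was a problem with the lazy propagation part. When a flip is pushed down to a child, the child's pending flag must be toggled, because two pending flips cancel each other out. So (for both children, 2*n+1 and 2*n+2) it should be:
tree[2*n+1].lazy=1-tree[2*n+1].lazy
and not
tree[2*n+1].lazy=1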

Resources