Spark LinearRegressionSummary "normal" summary - apache-spark-mllib

According to LinearRegressionSummary (Spark 2.1.0 JavaDoc), p-values are only available for the "normal" solver.
This value is only available when using the "normal" solver.
What the hell is the "normal" solver?
I'm doing this:
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.LinearRegressionModel
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, SparkSession}
.
.
.
val (trainingData, testData): (DataFrame, DataFrame) =
com.acme.pta.accuracy.Util.splitData(output, testProportion)
.
.
.
val lr =
new org.apache.spark.ml.regression.LinearRegression()
.setSolver("normal").setMaxIter(maxIter)
val pipeline = new Pipeline()
.setStages(Array(lr))
val paramGrid = new ParamGridBuilder()
.addGrid(lr.elasticNetParam, Array(0.2, 0.4, 0.8, 0.9))
.addGrid(lr.regParam, Array(0,6, 0.3, 0.1, 0.01))
.build()
val cv = new CrossValidator()
.setEstimator(pipeline)
.setEvaluator(evaluator)
.setEstimatorParamMaps(paramGrid)
.setNumFolds(numFolds) // Use 3+ in practice
val cvModel: CrossValidatorModel = cv.fit(trainingData)
val pipelineModel: PipelineModel = cvModel.bestModel.asInstanceOf[PipelineModel]
val lrModel: LinearRegressionModel =
pipelineModel.stages(0).asInstanceOf[LinearRegressionModel]
val modelSummary = lrModel.summary
Holder.log.info("lrModel.summary: " + modelSummary)
try {
Holder.log.info("feature p values: ")
// Exception occurs on line below.
val featuresAndPValues = features.zip(lrModel.summary.pValues)
featuresAndPValues.foreach(
(featureAndPValue: (String, Double)) =>
Holder.log.info(
"feature: " + featureAndPValue._1 + ": " + featureAndPValue._2))
} catch {
case _: java.lang.UnsupportedOperationException
=> Holder.log.error("Cannot compute p-values")
}
I am still getting the UnsupportedOperationException.
The exception message is:
No p-value available for this LinearRegressionModel
Is there something else I need to be doing? I'm using
"org.apache.spark" %% "spark-mllib" % "2.1.1"
Is pValues supported in that version?

Updated
tl;dr
Solution 1
In normal LinearRegression pValues and other "normal" statistics are only present when one of the parameters elasticNetParam or regParam is zero. So you can change
.addGrid( lr.elasticNetParam, Array( 0.0 ) )
or
.addGrid( lr.regParam, Array( 0.0 ) )
Solution 2
Make custom version of LinearRegression which would explicitly use
"normal" solver for regression.
Cholesky solver for WeightedLeastSquares.
I made this class as an extension to ml.regression package.
package org.apache.spark.ml.regression
import scala.collection.mutable
import org.apache.spark.SparkException
import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.optim.WeightedLeastSquares
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.linalg.VectorImplicits._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.functions._
class CholeskyLinearRegression ( override val uid: String )
extends Regressor[ Vector, CholeskyLinearRegression, LinearRegressionModel ]
with LinearRegressionParams with DefaultParamsWritable with Logging {
import CholeskyLinearRegression._
def this() = this(Identifiable.randomUID("linReg"))
def setRegParam(value: Double): this.type = set(regParam, value)
setDefault(regParam -> 0.0)
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
setDefault(fitIntercept -> true)
def setStandardization(value: Boolean): this.type = set(standardization, value)
setDefault(standardization -> true)
def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value)
setDefault(elasticNetParam -> 0.0)
def setMaxIter(value: Int): this.type = set(maxIter, value)
setDefault(maxIter -> 100)
def setTol(value: Double): this.type = set(tol, value)
setDefault(tol -> 1E-6)
def setWeightCol(value: String): this.type = set(weightCol, value)
def setSolver(value: String): this.type = set(solver, value)
setDefault(solver -> Auto)
def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)
setDefault(aggregationDepth -> 2)
override protected def train(dataset: Dataset[_]): LinearRegressionModel = {
// Extract the number of features before deciding optimization solver.
val numFeatures = dataset.select(col($(featuresCol))).first().getAs[Vector](0).size
val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
val instances: RDD[Instance] =
dataset
.select( col( $(labelCol) ), w, col( $(featuresCol) ) )
.rdd.map {
case Row(label: Double, weight: Double, features: Vector) =>
Instance(label, weight, features)
}
// if (($(solver) == Auto &&
// numFeatures <= WeightedLeastSquares.MAX_NUM_FEATURES) || $(solver) == Normal) {
// For low dimensional data, WeightedLeastSquares is more efficient since the
// training algorithm only requires one pass through the data. (SPARK-10668)
val optimizer = new WeightedLeastSquares(
$(fitIntercept),
$(regParam),
elasticNetParam = $(elasticNetParam),
$(standardization),
true,
solverType = WeightedLeastSquares.Cholesky,
maxIter = $(maxIter),
tol = $(tol)
)
val model = optimizer.fit(instances)
val lrModel = copyValues(new LinearRegressionModel(uid, model.coefficients, model.intercept))
val (summaryModel, predictionColName) = lrModel.findSummaryModelAndPredictionCol()
val trainingSummary = new LinearRegressionTrainingSummary(
summaryModel.transform(dataset),
predictionColName,
$(labelCol),
$(featuresCol),
summaryModel,
model.diagInvAtWA.toArray,
model.objectiveHistory
)
lrModel
.setSummary( Some( trainingSummary ) )
lrModel
}
override def copy(extra: ParamMap): CholeskyLinearRegression = defaultCopy(extra)
}
object CholeskyLinearRegression
extends DefaultParamsReadable[CholeskyLinearRegression] {
override def load(path: String): CholeskyLinearRegression = super.load(path)
val MAX_FEATURES_FOR_NORMAL_SOLVER: Int = WeightedLeastSquares.MAX_NUM_FEATURES
/** String name for "auto". */
private[regression] val Auto = "auto"
/** String name for "normal". */
private[regression] val Normal = "normal"
/** String name for "l-bfgs". */
private[regression] val LBFGS = "l-bfgs"
/** Set of solvers that LinearRegression supports. */
private[regression] val supportedSolvers = Array(Auto, Normal, LBFGS)
}
All you have to do is to paste it to the separate file in the project and change LinearRegression to CholeskyLinearRegression in your code.
val lr = new CholeskyLinearRegression() // new LinearRegression()
.setSolver( "normal" )
.setMaxIter( maxIter )
It works with non-zero params and gives pValues. Tested on following params grid.
val paramGrid = new ParamGridBuilder()
.addGrid( lr.elasticNetParam, Array( 0.2, 0.4, 0.8, 0.9 ) )
.addGrid( lr.regParam, Array( 0.6, 0.3, 0.1, 0.01 ) )
.build()
Full investigation
I initially thought that the main issue is with the model being not fully preserved. Trained model is not preserved after fitting in CrossValidator. It is understandable because of memory consumption. There is an ongoing debate on how should it be resolved. Issue in JIRA.
You can see in the commented section that I tried to extract parameters from the best model in order to run it again. Then I found out that the model summary is ok, it's just for some parameters diagInvAtWa has length of 1 and basically a zero.
For ridge regression or Tikhonov regularization (elasticNet = 0) and any regParam pValues and other "normal" statistics can be computed but for Lasso method and something in between (elastic net) not. Same goes for regParam = 0: with any elasticNet pValues were computed.
Why is that
LinearRegression uses Weighted Least Square optimizer for "normal" solver with solverType = WeightedLeastSquares.Auto. This optimizer has two options for solvers: QuasiNewton or Cholesky. The former is selected only when both regParam and elasticNetParam are non-zeroes.
val solver = if (
( solverType == WeightedLeastSquares.Auto &&
elasticNetParam != 0.0 &&
regParam != 0.0 ) ||
( solverType == WeightedLeastSquares.QuasiNewton ) ) {
...
new QuasiNewtonSolver(fitIntercept, maxIter, tol, effectiveL1RegFun)
} else {
new CholeskySolver
}
So in your parameters grid the QuasiNewtonSolver will be always used because there are no combinations of regParam and elasticNetParam where one of them is zero.
We know that in order to get pValues and other "normal" statistics such as t-statistic or std. error of coefficients the diagonal of matrix (A^T * W * A)^-1 (diagInvAtWA) must not be a vector with only one zero. This condition is set in definition of pValues.
diagInvAtWA is a vector of diagonal elements of packed upper triangular matrix (solution.aaInv).
val diagInvAtWA = solution.aaInv.map { inv => ...
For Cholesky solver it is calculated but for QuasiNewton not. Second parameter for NormalEquationSolution is this matrix.
You technically could make your own version of LinearRegression with
Reproduction
In this example I used data sample_linear_regression_data.txt from here.
Full code of reproduction
import org.apache.spark._
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.evaluation.{RegressionEvaluator, BinaryClassificationEvaluator}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.{LinearRegressionModel, LinearRegression}
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.ml.param.ParamMap
object Main {
def main( args: Array[ String ] ): Unit = {
val spark =
SparkSession
.builder()
.appName( "SO" )
.master( "local[*]" )
.config( "spark.driver.host", "localhost" )
.getOrCreate()
import spark.implicits._
val data =
spark
.read
.format( "libsvm" )
.load( "./sample_linear_regression_data.txt" )
val Array( training, test ) =
data
.randomSplit( Array( 0.9, 0.1 ), seed = 12345 )
val maxIter = 10;
val lr = new LinearRegression()
.setSolver( "normal" )
.setMaxIter( maxIter )
val paramGrid = new ParamGridBuilder()
// .addGrid( lr.elasticNetParam, Array( 0.2, 0.4, 0.8, 0.9 ) )
.addGrid( lr.elasticNetParam, Array( 0.0 ) )
.addGrid( lr.regParam, Array( 0.6, 0.3, 0.1, 0.01 ) )
.build()
val pipeline = new Pipeline()
.setStages( Array( lr ) )
val cv = new CrossValidator()
.setEstimator( pipeline )
.setEvaluator( new RegressionEvaluator )
.setEstimatorParamMaps( paramGrid )
.setNumFolds( 2 ) // Use 3+ in practice
val cvModel =
cv
.fit( training )
val pipelineModel: PipelineModel =
cvModel
.bestModel
.asInstanceOf[ PipelineModel ]
val lrModel: LinearRegressionModel =
pipelineModel
.stages( 0 )
.asInstanceOf[ LinearRegressionModel ]
// Technically there is a way to use exact ParamMap
// to build a new LR but for the simplicity I'll
// get and set them explicitly
// lrModel.params.foreach( ( param ) => {
// println( param )
// } )
// val bestLr = new LinearRegression()
// .setSolver( "normal" )
// .setMaxIter( maxIter )
// .setRegParam( lrModel.getRegParam )
// .setElasticNetParam( lrModel.getElasticNetParam )
// val bestLrModel = bestLr.fit( training )
val modelSummary =
lrModel
.summary
println( "lrModel pValues: " + modelSummary.pValues.mkString( ", " ) )
spark.stop()
}
}
Original
There are three solver algorithms available:
l-bfgs - Limited-memory Broyden–Fletcher–Goldfarb–Shanno algorithm which is a limited-memory quasi-Newton optimization method.
normal - using Normal Equation as an analytical solution to the linear regression problem. It is basically a weighted least squares approach or reweighted least squares approach.
auto - solver algorithm is selected automatically. The Normal Equations solver will be used when possible, but this will automatically fall back to iterative optimization methods when needed
The coefficientStandardErrors, tValues and pValues are only available when using the "normal" solver because they are all based on diagInvAtWA - a diagonal of matrix (A^T * W * A)^-1.

Related

py_environment 'time_step' doesn't match 'time_step_spec'

I have created a custom pyenvironment via tf agents. However I can't validate the environment or take steps within it with py_policy.action
I'm confused as to what is excepted from the time_step_specs
I have tried converting to tf_py_environment via tf_py_environment.TFPyEnvironment and was successful in taking actions with tf_policy but I'm still confused as to the difference.
import abc
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.trajectories import time_step as ts
from tf_agents.policies import random_tf_policy
import tensorflow as tf
import tf_agents
class TicTacToe(py_environment.PyEnvironment):
def __init__(self,n):
super(TicTacToe,self).__init__()
self.n = n
self.winner = None
self._episode_ended = False
self.inital_state = np.zeros((n,n))
self._state = self.inital_state
self._observation_spec = array_spec.BoundedArraySpec(
shape = (n,n),dtype='int32',minimum = -1,maximum = 1,name =
'TicTacToe board state spec')
self._action_spec = array_spec.BoundedArraySpec(
shape = (),dtype = 'int32', minimum = 0,maximum = 8, name =
'TicTacToe action spec')
def observation_spec(self):
return self._observation_spec
def action_spec(self):
return self._action_spec
def _reset(self):
return ts.restart(self.inital_state)
def check_game_over(self):
for i in range(self.n):
if (sum(self._state[i,:])==self.n) or
(sum(self._state[:,i])==self.n):
self.winner = 1
return True
elif (sum(self._state[i,:])==-self.n) or
(sum(self._state[:,i])==-self.n):
self.winner = -1
return True
if (self._state.trace()==self.n) or
(self._state[::-1].trace()==self.n):
self.winner = 1
return True
elif (self._state.trace()==-self.n) or (self._state[::-1].trace()==-
self.n):
self.winner = -1
return True
if not (0 in self._state):
return True
def _step(self,action):
self._state[action//3,action%3]=1
self._episode_ended = self.check_game_over
if self._episode_ended==True:
if self.winner == 1:
reward = 1
elif self.winner == None:
reward = 0
else:
reward = -1
return ts.termination(self._state,dtype = 'int32',reward=reward)
else:
return ts.transition(self._state,dtype = 'int32',reward =
0.0,discount = 0.9)
env = TicTacToe(3)
utils.validate_py_environment(env, episodes=5)
This is the error I get:
ValueError Traceback (most recent call last)
in
----> 1 utils.validate_py_environment(env, episodes=5)
C:\Users\bzhang\AppData\Local\Continuum\anaconda3\lib\site-packages\tf_agents\environments\utils.py in validate_py_environment(environment, episodes)
58 raise ValueError(
59 'Given time_step: %r does not match expected time_step_spec: %r' %
---> 60 (time_step, time_step_spec))
61
62 action = random_policy.action(time_step).action
ValueError: Given time_step: TimeStep(step_type=array(0), reward=array(0., dtype=float32), discount=array(1., dtype=float32), observation=array([[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]])) does not match expected time_step_spec: TimeStep(step_type=ArraySpec(shape=(), dtype=dtype('int32'), name='step_type'), reward=ArraySpec(shape=(), dtype=dtype('float32'), name='reward'), discount=BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0), observation=BoundedArraySpec(shape=(3, 3), dtype=dtype('int32'), name='TicTacToe board state spec', minimum=-1, maximum=1))
Your observation does not match the spec, you need to pass dtype=np.int32 to the np array to make sure the type match.

How to split an akka ByteString at a two byte delimiter

I'm creating a WebSocket protocol of my own, and thought to have a text key/value header part, ending at two consecutive newlines, followed by a binary tail.
Turns out, splitting a ByteString in half (at the two newlines) is really tedious. There is no built-in .split method, for one. And no .indexOf for finding a binary fingerprint.
What would you use for this? Is there an easier way for me to build such a protocol?
References:
akka ByteString
Using akka-http 10.1.0-RC1, akka 2.5.8
One approach would be to first create sliding pairs from an indexedSeq of the ByteString, then split the ByteString using the identified indexes of the delimiter-pair, as in the following example:
import akka.util.ByteString
val bs = ByteString("aa\nbb\n\nxyz")
// bs: akka.util.ByteString = ByteString(97, 97, 10, 98, 98, 10, 10, 120, 121, 122)
val delimiter = 10
// Create sliding pairs from indexedSeq of the ByteString
val slidingList = bs.zipWithIndex.sliding(2).toList
// slidingList: List[scala.collection.immutable.IndexedSeq[(Byte, Int)]] = List(
// Vector((97,0), (97,1)), Vector((97,1), (10,2)), Vector((10,2), (98,3)),
// Vector((98,3), (98,4)), Vector((98,4), (10,5)), Vector((10,5), (10,6)),
// Vector((10,6), (120,7)), Vector((120,7), (121,8)), Vector((121,8), (122,9))
// )
// Get indexes of the delimiter-pair
val dIndex = slidingList.filter{
case Vector(x, y) => x._1 == delimiter && y._1 == delimiter
}.flatMap{
case Vector(x, y) => Seq(x._2, y._2)
}
// Split the ByteString list
val (bs1, bs2) = ( bs.splitAt(dIndex(0))._1, bs.splitAt(dIndex(1))._2.tail )
// bs1: akka.util.ByteString = ByteString(97, 97, 10, 98, 98)
// bs2: akka.util.ByteString = ByteString(120, 121, 122)
I came up with this. Haven't tested it in practise, yet.
#tailrec
def peelMsg(bs: ByteString, accHeaderLines: Seq[String]): Tuple2[Seq[String],ByteString] = {
val (a: ByteString, tail: ByteString) = bs.span(_ != '\n')
val b: ByteString = tail.drop(1)
if (a.isEmpty) { // end marker - empty line
Tuple2(accHeaderLines,b)
} else {
val acc: Seq[String] = accHeaderLines :+ a.utf8String // append
peelMsg(b,acc)
}
}
val (headerLines: Seq[String], value: ByteString) = peelMsg(bs,Seq.empty)
My code, for now:
// Find the index of the (first) double-newline
//
val n: Int = {
val bsLen: Int = bs.length
val tmp: Int = bs.zipWithIndex.find{
case ('\n',i) if i<bsLen-1 && bs(i+1)=='\n' => true
case _ => false
}.map(_._2).getOrElse{
throw new RuntimeException("No delimiter found")
}
tmp
}
val (bs1: ByteString, bs2: ByteString) = bs.splitAt(n) // headers, \n\n<binary>
Influenced by #leo-c's answer, but using a normal .find instead of the sliding window. Realised that since ByteString allows random access, I can combine a streaming search with that condition.

Scala/functional/without libs - check if string permutation of other

How could you check to see if one string is a permutation of another using scala/functional programming with out complex pre-built functions like sorted()?
I'm a Python dev and what I think trips me up the most is that you can't just iterate through a dictionary of character counts comparing to another dictionary of character counts, then just exit when there isn't a match, you can't just call break.
Assume this is the starting point, based on your description:
val a = "aaacddba"
val b = "aabaacdd"
def counts(s: String) = s.groupBy(identity).mapValues(_.size)
val aCounts = counts(a)
val bCounts = counts(b)
This is the simplest way:
aCounts == bCounts // true
This is precisely what you described:
def isPerm(aCounts: Map[Char,Int], bCounts: Map[Char,Int]): Boolean = {
if (aCounts.size != bCounts.size)
return false
for ((k,v) <- aCounts) {
if (bCounts.getOrElse(k, 0) != v)
return false
}
return true
}
This is your method, but more scala-ish. (It also breaks as soon as a mismatch is found, because of how foreach is implemented):
(aCounts.size == bCounts.size) &&
aCounts.forall { case (k,v) => bCounts.getOrElse(k, 0) == v }
(Also, Scala does have break.)
Also, also: you should read the answer to this question.
Another option using recursive function, which will also 'break' immediately once mismatch is detected:
import scala.annotation.tailrec
#tailrec
def isPerm1(a: String, b: String): Boolean = {
if (a.length == b.length) {
a.headOption match {
case Some(c) =>
val i = b.indexOf(c)
if (i >= 0) {
isPerm1(a.tail, b.substring(0, i) + b.substring(i + 1))
} else {
false
}
case None => true
}
} else {
false
}
}
Out of my own curiosity I also create two more versions which use char counts map for matching:
def isPerm2(a: String, b: String): Boolean = {
val cntsA = a.groupBy(identity).mapValues(_.size)
val cntsB = b.groupBy(identity).mapValues(_.size)
cntsA == cntsB
}
and
def isPerm3(a: String, b: String): Boolean = {
val cntsA = a.groupBy(identity).mapValues(_.size)
val cntsB = b.groupBy(identity).mapValues(_.size)
(cntsA == cntsB) && cntsA.forall { case (k, v) => cntsB.getOrElse(k, 0) == v }
}
and roughly compare their performance by:
def time[R](block: => R): R = {
val t0 = System.nanoTime()
val result = block // call-by-name
val t1 = System.nanoTime()
println("Elapsed time: " + (t1 - t0) + "ns")
result
}
// Match
time((1 to 10000).foreach(_ => isPerm1("apple"*100,"elppa"*100)))
time((1 to 10000).foreach(_ => isPerm2("apple"*100,"elppa"*100)))
time((1 to 10000).foreach(_ => isPerm3("apple"*100,"elppa"*100)))
// Mismatch
time((1 to 10000).foreach(_ => isPerm1("xpple"*100,"elppa"*100)))
time((1 to 10000).foreach(_ => isPerm2("xpple"*100,"elppa"*100)))
time((1 to 10000).foreach(_ => isPerm3("xpple"*100,"elppa"*100)))
and the result is:
Match cases
isPerm1 = 2337999406ns
isPerm2 = 383375133ns
isPerm3 = 382514833ns
Mismatch cases
isPerm1 = 29573489ns
isPerm2 = 381622225ns
isPerm3 = 417863227ns
As can be expected, the char counts map speeds up positive cases but can slow down negative cases (overhead on building the char counts map).

repetitive treatment on spark streaming

I'm new to Spark, I want to make a treatment on files in streaming.
I have files csv which arrive non-stop:
Example csv file:
world world
count world
world earth
count world
and I want to do two treatment on them :
the first treatment is for a result like this :
(world,2,2) // word is twice repeated for the first column and distinct (world,earth) for second therefore (2,2)
(count,2,1) // word is twice repeated for the first column and not distinct (world,world) for second therefore (2,1)
the second result
I want to get that result after each hour.in our example:
(world,1) // 1=2/2
(count,2) //2=2/1
this is my code :
val conf = new SparkConf()
.setAppName("File Count")
.setMaster("local[2]")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(10m))
val file = ssc.textFileStream("hdfs://192.168.1.31:8020/user/sparkStreaming/input")
var result = file.map(x => (x.split(" ")(0)+";"+x.split(" ")(1), 1)).reduceByKey((x,y) => x+y)
val window = result.reduceByKeyAndWindow((a:Int,b:Int) => (a + b), Seconds(60), Seconds(20))
val result1 = window.map(x => x.toString )
val result2 = result1.map(line => line.split(";")(0)+","+line.split(",")(1))
val result3 = result2.map(line => line.substring(1, line.length-1))
val result4 = result3.map(line => (line.split(",")(0),line.split(",")(1).toInt ) )
val result5 = result4.reduceByKey((x,y) => x+y )
val result6 = result3.map(line => (line.split(",")(0), 1 ))
val result7 = result6.reduceByKey((x,y) => x+y )
val result8 = result7.join(result5) // (world,2,2)
val finalResult = result8.mapValues(x => x._1.toFloat / x._2 ) // (world,1), I want this result after every one hour
ssc.start()
ssc.awaitTermination()
Thanks in Advance!!!

How can I define a method that takes an Ordered[T] Array in Scala?

I'm building some basic algorithms in Scala (following Cormen's book) to refresh my mind on the subject and I'm building the insertion sort algorithm. Doing it like this, it works correctly:
class InsertionSort extends Sort {
def sort ( items : Array[Int] ) : Unit = {
if ( items.length < 2 ) {
throw new IllegalArgumentException( "Array must be bigger than 1" )
}
1.until( items.length ).foreach( ( currentIndex ) => {
val key = items(currentIndex)
var loopIndex = currentIndex - 1
while ( loopIndex > -1 && items(loopIndex) > key ) {
items.update( loopIndex + 1, items(loopIndex) )
loopIndex -= 1
}
items.update( loopIndex + 1, key )
} )
}
}
But this is for Int only and I would like to use generics and Ordered[A] so I could sort any type that is ordered. When I change the signature to be like this:
def sort( items : Array[Ordered[_]] ) : Unit
The following spec doesn't compile:
"sort correctly with merge sort" in {
val items = Array[RichInt](5, 2, 4, 6, 1, 3)
insertionSort.sort( items )
items.toList === Array[RichInt]( 1, 2, 3, 4, 5, 6 ).toList
}
And the compiler error is:
Type mismatch, expected: Array[Ordered[_]], actual Array[RichInt]
But isn't RichInt an Ordered[RichInt]? How should I define this method signature in a way that it would accept any Ordered object?
EDIT
In case anyone is interested, the final source is available here.
Actually RichInt is not an Ordered[RichInt] but an Ordered[Int]. However scala.runtime.RichInt <: Ordered[_], but class Array is invariant in type T so Array[RichInt] is not an Array[Ordered[_]].
scala> def f[T <% Ordered[T]](arr: Array[T]) = { arr(0) < arr(1) }
f: [T](arr: Array[T])(implicit evidence$1: T => Ordered[T])Boolean
scala> f(Array(1,2,3))
res2: Boolean = true
scala>
You can do this with a context bound on the type parameter;
scala> def foo[T : Ordering](arr: Array[T]) = {
| import math.Ordering.Implicits._
| arr(0) < arr(1)
| }
foo: [T](arr: Array[T])(implicit evidence$1: Ordering[T])Boolean
Such that usage is:
scala> foo(Array(2.3, 3.4))
res1: Boolean = true
The advantage to this is that you don't need the default order of the type if you don't want it:
scala> foo(Array("z", "bc"))
res4: Boolean = false
scala> foo(Array("z", "bc"))(Ordering.by(_.length))
res3: Boolean = true

Resources