We saw in Part 4 how to build a decision tree predictor. We are now going to build a predictor for a classic machine learning data set, the Iris data set.
A decision tree classifier on the Iris data set
We add two methods to the LabeledData class: randomSplit, which splits a data set at random into subsets of given relative sizes, and evaluate, which computes the predictions of a tree on the data set and scores them against the true labels with a given metric.
/** The building blocks of a binary decision tree are leaves and nodes,
 * so we create two Scala classes, Leaf and Node, that extend a common
 * trait DecisionTree.
 */
trait DecisionTree[-A, +B] {
def predict(sample: A): B
}
/** A leaf is a simple element that provides a prediction value (or decision). */
case class Leaf[A, B](decision: B) extends DecisionTree[A, B] {
def predict(sample: A): B = decision
}
/** A node stores a test function as well as to which node or leaf to go next depending on
* the result of the test.
* We define as:
* - *right* the leaf or node to go to when the test is true,
* - *left* the leaf or node to go to when the test is false.
*/
case class Node[A, B](test: A => Boolean, left: DecisionTree[A, B], right: DecisionTree[A, B])
extends DecisionTree[A, B] {
  def predict(sample: A): B =
    if (test(sample)) right.predict(sample) else left.predict(sample)
}
/** Wraps a function that combines a vector of labels into a single decision.
 * The binary variant is called combinePair, as a method cannot share the name
 * of the value combine. */
case class LabelCombiner[B](combine: Vector[B] => B) {
  def combinePair(left: B, right: B): B = combine(Vector(left, right))
}
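To see how a prediction flows through these classes, here is a tiny hand-built tree (the threshold 0.5 and the labels are arbitrary illustrations): samples above the threshold go to the right leaf, the others to the left one.

// A depth-one tree over Double samples with String decisions.
val stump: DecisionTree[Double, String] =
  Node[Double, String](
    _ > 0.5,
    Leaf[Double, String]("low"),   // returned when the test is false
    Leaf[Double, String]("high"))  // returned when the test is true

stump.predict(0.2)  // "low"
stump.predict(0.9)  // "high"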
import scala.util.Random

/** A labeled data set. Subsets share the reference samples and labels,
 * and only store the indices of the elements they contain. */
class LabeledData[A, B] private (
  private val referenceSamples: Vector[A],
  private val referenceLabels: Vector[B],
  private val indices: Vector[Int]) {
def size: Int = indices.length
def isEmpty: Boolean = size == 0
def subset(indices: Vector[Int]): LabeledData[A, B] =
new LabeledData(referenceSamples, referenceLabels, indices)
def emptySet: LabeledData[A, B] = subset(Vector())
  def groupBy[C](f: A => C): Map[C, LabeledData[A, B]] =
    indices groupBy { idx => f(referenceSamples(idx)) } map {
      case (key, subIndices) => key -> subset(subIndices)
    }
  def partition(f: A => Boolean): (LabeledData[A, B], LabeledData[A, B]) = {
    // Default to the empty set so that a one-sided split does not throw.
    val groups = groupBy(f) withDefaultValue emptySet
    (groups(true), groups(false))
  }
def countDistinctLabels: Int =
(indices map referenceLabels).distinct.length
def mapSamples[C](f: A => C): Vector[C] =
indices map {idx => f(referenceSamples(idx))}
def combineLabels(labelCombiner: LabelCombiner[B]): B =
labelCombiner.combine(indices map referenceLabels)
def randomSplit(seed: Long)(ratios: Double*): Vector[LabeledData[A, B]] = {
val total = ratios.sum
val numberItems = ratios.toVector map {r => (r / total * size).toInt}
val rand = new Random(seed)
val zero = (Vector[Int](), rand.shuffle((0 until size).toVector))
numberItems.scanLeft(zero)((s, n) => s._2.splitAt(n)).drop(1).map(_._1).map(subset)
}
def evaluate(tree: DecisionTree[A, B], metric: (Vector[B], Vector[B]) => Double): Double = {
val predictions = mapSamples(tree.predict)
metric(predictions, indices map referenceLabels)
}
}
object LabeledData {
def apply[A, B](samples: Vector[A], labels: Vector[B]):
LabeledData[A, B] = {
require(
samples.length == labels.length,
"Samples and labels should have the same number of elements.")
new LabeledData[A, B](samples, labels, samples.indices.toVector)
}
}
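As a quick sanity check before moving to Iris, here is a toy usage of randomSplit and evaluate (the samples, labels and seed are made up): we split ten labeled numbers 80/20 and evaluate a constant predictor with an exact-match metric.

val toy = LabeledData[Double, String](
  Vector(-2.0, -1.0, 1.0, 2.0, 3.0, -3.0, 4.0, -4.0, 5.0, -5.0),
  Vector("neg", "neg", "pos", "pos", "pos", "neg", "pos", "neg", "pos", "neg"))

val Vector(toyTrain, toyTest) = toy.randomSplit(seed = 42)(80, 20)

// Fraction of predictions equal to the true labels.
def exactMatch(predictions: Vector[String], labels: Vector[String]): Double =
  (predictions zip labels).count { case (p, l) => p == l }.toDouble / labels.length

// A constant predictor that always answers "pos".
toyTest.evaluate(Leaf[Double, String]("pos"), exactMatch)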
/** We associate to each leaf or node a unique id *number*,
 * and also store the depth of the leaf or node.
 *
 * For each node, the ids of the next left and right nodes or leaves
 * are computed as if every level of the tree were full, so that the id
 * numbers enumerate all the possible branchings.
 * Starting with a root node id number of 0, we get the
 * following numbering for the first levels of the tree:
 *               0
 *        1             2
 *    3      4      5       6
 *  7   8  9  10  11  12  13  14
 */
case class Id(number: Long, depth: Int) {
require(number >= 0, "Id number should be nonnegative.")
def nextIdLeft: Id = Id(number * 2 + 1, depth + 1)
def nextIdRight: Id = Id(number * 2 + 2, depth + 1)
}
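Following this numbering, the children of the root get the numbers 1 and 2, and the left child of node number 2 gets the number 5:

val root = Id(0, 0)
root.nextIdLeft               // Id(1, 1)
root.nextIdRight              // Id(2, 1)
root.nextIdRight.nextIdLeft   // Id(5, 2)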
case class DecisionTreeBuilder[A, B](dataSet: LabeledData[A, B])
(combiner: LabelCombiner[B]) {
def build(testBuilder: (LabeledData[A, B], Id) => Either[String, A => Boolean]):
DecisionTree[A, B] = {
val rootId: Id = Id(0, 0)
buildStep(dataSet, rootId, testBuilder)
}
private def buildStep(
subSet: LabeledData[A, B],
id: Id,
testBuilder: (LabeledData[A, B], Id) => Either[String, A => Boolean]):
DecisionTree[A, B] = {
testBuilder(subSet, id) match {
case Left(_) => Leaf[A, B](subSet.combineLabels(combiner))
case Right(test) =>
val Seq(leftSubSet, rightSubSet) = nextSubsets(subSet, test)
if (leftSubSet.isEmpty || rightSubSet.isEmpty) {
println("Warning: one of right or left indices is empty.")
Leaf[A, B](subSet.combineLabels(combiner))
}
else Node(
test,
buildStep(leftSubSet, id.nextIdLeft, testBuilder),
buildStep(rightSubSet, id.nextIdRight, testBuilder)
)
}
}
private def nextSubsets(subSet: LabeledData[A, B], test: A => Boolean):
Seq[LabeledData[A, B]] = {
val groups = subSet.groupBy(test) withDefaultValue subSet.emptySet
Seq(groups(false), groups(true))
}
}
We now build a decision tree from the Iris data set. We split the data at random into a training set and a test set containing respectively 70% and 30% of the samples.
// The data set can be downloaded from the UCI Machine Learning Repository: http://archive.ics.uci.edu/ml
// (University of California, School of Information and Computer Science)
val url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/bezdekIris.data"
val data = scala.io.Source.fromURL(url).getLines.withFilter(_.nonEmpty).toVector
val splitData = (data map { _.split(',').toVector.splitAt(4) }).unzip
val (samples, labels) = (splitData._1 map { v => v map (_.toDouble) }, splitData._2 map (_.head))
val dataSet = LabeledData[Vector[Double], String](samples, labels)
val Vector(trainSet, testSet) = dataSet.randomSplit(seed=0)(70, 30)
val combiner: LabelCombiner[String] =
LabelCombiner(v => {
val groups = v groupBy identity
groups.maxBy(_._2.length)._1})
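The combiner implements a majority vote: the most frequent label among those reaching a leaf becomes the decision of that leaf. For instance, with made-up labels:

combiner.combine(Vector("setosa", "setosa", "versicolor"))  // "setosa"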
val treeBuilder = DecisionTreeBuilder(trainSet)(combiner)
def testBuilder(subSet: LabeledData[Vector[Double], String], id: Id):
Either[String, Vector[Double] => Boolean] = {
if (id.depth > 3) Left("Maximal depth reached.")
else if (subSet.size < 10) Left("Number of elements per leaf reached.")
else if (subSet.countDistinctLabels == 1) Left("All elements have the same label.")
else {
    /* The Iris data set has 4 features. The data is split one feature at a time,
     * by comparing the feature values to the threshold (3 * min + max) / 4.
     */
val i: Int = (id.number % 4).toInt
val v = subSet.mapSamples(_(i))
val m = v.foldLeft((v(0), v(0)))((acc, si) => (acc._1 min si, acc._2 max si))
if (m._1 == m._2) Left("Cannot split node.")
else Right((x: Vector[Double]) => x(i) > (3 * m._1 + m._2) / 4)
}
}
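Note how the tested feature cycles with the node id number: the root (id 0) tests feature 0, its children (ids 1 and 2) test features 1 and 2, and so on.

(0L to 6L) map (_ % 4)  // Vector(0, 1, 2, 3, 0, 1, 2)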
val tree = treeBuilder.build(testBuilder)
def accuracy[B](predictions: IndexedSeq[B], labels: IndexedSeq[B]): Double =
  (predictions zip labels).count { case (p, l) => p == l }.toDouble / labels.length
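As a quick check of the metric, with arbitrary labels where two of the four predictions match the truth:

accuracy(Vector("a", "b", "a", "b"), Vector("a", "a", "a", "a"))  // 0.5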
s"Test set accuracy : ${testSet.evaluate(tree, accuracy)}"
Test set accuracy : 0.9777777777777777
Using our simple decision tree builder, we reach an accuracy of about 97.8% on a random test split of the Iris data set.