We saw in Part 4 how to build a decision tree predictor. We are now going to build a predictor for a classic machine learning data set, the Iris data set.
A decision tree classifier on the Iris data set
We add two methods to the LabeledData class: randomSplit, which splits a data set at random into subsets of given relative sizes, and evaluate, which computes the predictions of a tree on the data set and scores them against the true labels with a given metric.
/** The building blocks of a binary decision tree are leaves and nodes,
 * so we create two Scala classes, Leaf and Node, that extend a common
 * trait DecisionTree.
 */
trait DecisionTree[-A, +B] {
def predict(sample: A): B
}
/** A leaf is a simple element that provides a prediction value (or decision). */
case class Leaf[A, B](decision: B) extends DecisionTree[A, B] {
def predict(sample: A): B = decision
}
/** A node stores a test function as well as to which node or leaf to go next depending on
* the result of the test.
* We define as:
* - *right* the leaf or node to go to when the test is true,
* - *left* the leaf or node to go to when the test is false.
*/
case class Node[A, B](test: A => Boolean, left: DecisionTree[A, B], right: DecisionTree[A, B])
extends DecisionTree[A, B] {
  def predict(sample: A): B =
    if (test(sample)) right.predict(sample) else left.predict(sample)
}
/** Wraps a function that combines a vector of labels into a single decision.
 * The binary variant is called combinePair, as a method cannot share the name
 * of the value combine. */
case class LabelCombiner[B](combine: Vector[B] => B) {
  def combinePair(left: B, right: B): B = combine(Vector(left, right))
}
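To see how a prediction flows through these classes, here is a tiny hand-built tree (the threshold 0.5 and the labels are arbitrary illustrations): samples above the threshold go to the right leaf, the others to the left one.

// A depth-one tree over Double samples with String decisions.
val stump: DecisionTree[Double, String] =
  Node[Double, String](
    _ > 0.5,
    Leaf[Double, String]("low"),   // returned when the test is false
    Leaf[Double, String]("high"))  // returned when the test is true

stump.predict(0.2)  // "low"
stump.predict(0.9)  // "high"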
import scala.util.Random

/** A labeled data set. Subsets share the reference samples and labels,
 * and only store the indices of the elements they contain. */
class LabeledData[A, B] private (
  private val referenceSamples: Vector[A],
  private val referenceLabels: Vector[B],
  private val indices: Vector[Int]) {
def size: Int = indices.length
def isEmpty: Boolean = size == 0
def subset(indices: Vector[Int]): LabeledData[A, B] =
new LabeledData(referenceSamples, referenceLabels, indices)
def emptySet: LabeledData[A, B] = subset(Vector())
  def groupBy[C](f: A => C): Map[C, LabeledData[A, B]] =
    indices groupBy { idx => f(referenceSamples(idx)) } map {
      case (key, subIndices) => key -> subset(subIndices)
    }
  def partition(f: A => Boolean): (LabeledData[A, B], LabeledData[A, B]) = {
    // Default to the empty set so that a one-sided split does not throw.
    val groups = groupBy(f) withDefaultValue emptySet
    (groups(true), groups(false))
  }
def countDistinctLabels: Int =
(indices map referenceLabels).distinct.length
def mapSamples[C](f: A => C): Vector[C] =
indices map {idx => f(referenceSamples(idx))}
def combineLabels(labelCombiner: LabelCombiner[B]): B =
labelCombiner.combine(indices map referenceLabels)
def randomSplit(seed: Long)(ratios: Double*): Vector[LabeledData[A, B]] = {
val total = ratios.sum
val numberItems = ratios.toVector map {r => (r / total * size).toInt}
val rand = new Random(seed)
val zero = (Vector[Int](), rand.shuffle((0 until size).toVector))
numberItems.scanLeft(zero)((s, n) => s._2.splitAt(n)).drop(1).map(_._1).map(subset)
}
def evaluate(tree: DecisionTree[A, B], metric: (Vector[B], Vector[B]) => Double): Double = {
val predictions = mapSamples(tree.predict)
metric(predictions, indices map referenceLabels)
}
}
object LabeledData {
def apply[A, B](samples: Vector[A], labels: Vector[B]):
LabeledData[A, B] = {
require(
samples.length == labels.length,
"Samples and labels should have the same number of elements.")
new LabeledData[A, B](samples, labels, samples.indices.toVector)
}
}
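As a quick sanity check before moving to Iris, here is a toy usage of randomSplit and evaluate (the samples, labels and seed are made up): we split ten labeled numbers 80/20 and evaluate a constant predictor with an exact-match metric.

val toy = LabeledData[Double, String](
  Vector(-2.0, -1.0, 1.0, 2.0, 3.0, -3.0, 4.0, -4.0, 5.0, -5.0),
  Vector("neg", "neg", "pos", "pos", "pos", "neg", "pos", "neg", "pos", "neg"))

val Vector(toyTrain, toyTest) = toy.randomSplit(seed = 42)(80, 20)

// Fraction of predictions equal to the true labels.
def exactMatch(predictions: Vector[String], labels: Vector[String]): Double =
  (predictions zip labels).count { case (p, l) => p == l }.toDouble / labels.length

// A constant predictor that always answers "pos".
toyTest.evaluate(Leaf[Double, String]("pos"), exactMatch)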
/** We associate to each leaf or node a unique id *number*,
 * and also store the depth of the leaf or node.
 *
 * For each node, the ids of the next left and right nodes or leaves
 * are computed as if every level of the tree were full, so that the id
 * numbers enumerate all the possible branchings.
 * Starting with a root node id number of 0, we get the
 * following numbering for the first levels of the tree:
 *               0
 *        1             2
 *    3      4      5       6
 *  7   8  9  10  11  12  13  14
 */
case class Id(number: Long, depth: Int) {
require(number >= 0, "Id number should be nonnegative.")
def nextIdLeft: Id = Id(number * 2 + 1, depth + 1)
def nextIdRight: Id = Id(number * 2 + 2, depth + 1)
}
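Following this numbering, the children of the root get the numbers 1 and 2, and the left child of node number 2 gets the number 5:

val root = Id(0, 0)
root.nextIdLeft               // Id(1, 1)
root.nextIdRight              // Id(2, 1)
root.nextIdRight.nextIdLeft   // Id(5, 2)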
case class DecisionTreeBuilder[A, B](dataSet: LabeledData[A, B])
(combiner: LabelCombiner[B]) {
def build(testBuilder: (LabeledData[A, B], Id) => Either[String, A => Boolean]):
DecisionTree[A, B] = {
val rootId: Id = Id(0, 0)
buildStep(dataSet, rootId, testBuilder)
}
private def buildStep(
subSet: LabeledData[A, B],
id: Id,
testBuilder: (LabeledData[A, B], Id) => Either[String, A => Boolean]):
DecisionTree[A, B] = {
testBuilder(subSet, id) match {
case Left(_) => Leaf[A, B](subSet.combineLabels(combiner))
case Right(test) =>
val Seq(leftSubSet, rightSubSet) = nextSubsets(subSet, test)
if (leftSubSet.isEmpty || rightSubSet.isEmpty) {
println("Warning: one of right or left indices is empty.")
Leaf[A, B](subSet.combineLabels(combiner))
}
else Node(
test,
buildStep(leftSubSet, id.nextIdLeft, testBuilder),
buildStep(rightSubSet, id.nextIdRight, testBuilder)
)
}
}
private def nextSubsets(subSet: LabeledData[A, B], test: A => Boolean):
Seq[LabeledData[A, B]] = {
val groups = subSet.groupBy(test) withDefaultValue subSet.emptySet
Seq(groups(false), groups(true))
}
}
We now build a decision tree from the Iris data set. We split the data at random into a training set and a test set containing respectively 70% and 30% of the samples.
// The data set can be downloaded from the UCI Machine Learning Repository: http://archive.ics.uci.edu/ml
// (University of California, School of Information and Computer Science)
val url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/bezdekIris.data"
val data = scala.io.Source.fromURL(url).getLines.withFilter(_.nonEmpty).toVector
val splitData = (data map { _.split(',').toVector.splitAt(4) }).unzip
val (samples, labels) = (splitData._1 map { v => v map (_.toDouble) }, splitData._2 map (_.head))
val dataSet = LabeledData[Vector[Double], String](samples, labels)
val Vector(trainSet, testSet) = dataSet.randomSplit(seed=0)(70, 30)
val combiner: LabelCombiner[String] =
LabelCombiner(v => {
val groups = v groupBy identity
groups.maxBy(_._2.length)._1})
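The combiner implements a majority vote: the most frequent label among those reaching a leaf becomes the decision of that leaf. For instance, with made-up labels:

combiner.combine(Vector("setosa", "setosa", "versicolor"))  // "setosa"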
val treeBuilder = DecisionTreeBuilder(trainSet)(combiner)
def testBuilder(subSet: LabeledData[Vector[Double], String], id: Id):
Either[String, Vector[Double] => Boolean] = {
if (id.depth > 3) Left("Maximal depth reached.")
else if (subSet.size < 10) Left("Number of elements per leaf reached.")
else if (subSet.countDistinctLabels == 1) Left("All elements have the same label.")
else {
    /* The Iris data set has 4 features. The data is split one feature at a time,
     * by comparing the feature values to the threshold (3 * min + max) / 4.
     */
val i: Int = (id.number % 4).toInt
val v = subSet.mapSamples(_(i))
val m = v.foldLeft((v(0), v(0)))((acc, si) => (acc._1 min si, acc._2 max si))
if (m._1 == m._2) Left("Cannot split node.")
else Right((x: Vector[Double]) => x(i) > (3 * m._1 + m._2) / 4)
}
}
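Note how the tested feature cycles with the node id number: the root (id 0) tests feature 0, its children (ids 1 and 2) test features 1 and 2, and so on.

(0L to 6L) map (_ % 4)  // Vector(0, 1, 2, 3, 0, 1, 2)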
val tree = treeBuilder.build(testBuilder)
def accuracy[B](predictions: IndexedSeq[B], labels: IndexedSeq[B]): Double =
  (predictions zip labels).count { case (p, l) => p == l }.toDouble / labels.length
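As a quick check of the metric, with arbitrary labels where two of the four predictions match the truth:

accuracy(Vector("a", "b", "a", "b"), Vector("a", "a", "a", "a"))  // 0.5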
s"Test set accuracy : ${testSet.evaluate(tree, accuracy)}"
Test set accuracy : 0.9777777777777777
Using our simple decision tree builder, we reach an accuracy of about 97.8% on a random test split of the Iris data set.