Skip to content

Instantly share code, notes, and snippets.

@MishaelRosenthal
Last active September 25, 2018 11:24
Show Gist options
  • Save MishaelRosenthal/05936a46905b7127ce1c to your computer and use it in GitHub Desktop.
Save MishaelRosenthal/05936a46905b7127ce1c to your computer and use it in GitHub Desktop.
Implements the Fast Correlation Based Filter algorithm for feature selection.Conference version: http://machinelearning.wustl.edu/mlpapers/paper_files/icml2003_YuL03.pdfJournal version: http://machinelearning.wustl.edu/mlpapers/paper_files/YuL04.pdf
package com.liveperson.lpbt.research.hadoop.examples
import scala.annotation.tailrec
/**
* User: mishaelr
* Date: 7/11/13
* Time: 10:33 AM
*/
object FCBF extends App{
/**
* A case class holding a list of feature observations.
*/
case class FeatureVector[+T](values: List[T])
object FeatureVector{
def of[T](t: T*) = FeatureVector(t.toList)
}
/**
* Calculates the frequencies of each of the values in the List.
* Assumption: values will be grouped according to groupBy(identity).
*/
def calcFreq[T](values: List[T]) = values.groupBy(identity).map{case (key, list) => (key, list.length.toDouble/values.length)}.values
/**
* Calculates the entropy H(X), given a list of frequencies.
* Assumption: Frequencies must sum to one.
*/
def calcEntropy(freq: Iterable[Double]) = freq.foldRight(0.0)((l, r) => r - (if(l == 0) 0 else l * math.log(l))) / math.log(2.0)
def calcFeatureEntropy[T](f: FeatureVector[T]) = calcEntropy(calcFreq(f.values))
/**
* Calculates the mutual entropy H(f1, f2) of two features.
*/
def calcMutualFeatureEntropy[T, S](f1: FeatureVector[T], f2: FeatureVector[S]) = calcEntropy(calcFreq(f1.values.zip(f2.values)))
/**
* Calculates the symmetrical uncertainty of f1 and f2.
* Symmetrical uncertainty is defined as:
* SU(X, Y) = 2 * (IG(X, Y) / (H(X) + H(Y)) = 2 * ((H(x) + H(Y) - H(X, Y)) / (H(x) + H(Y))) = 2 - 2 * H(X, Y) / (H(x) + H(Y))
* http://en.wikipedia.org/wiki/Mutual_information#Normalized_variants
*/
def symmetricalUncertainty[T, S](f1: FeatureVector[T], f2: FeatureVector[S]) = 2 - (2 * calcMutualFeatureEntropy(f1, f2) / (calcFeatureEntropy(f1) + calcFeatureEntropy(f2)))
/**
* Implements the Fast Correlation Based Filter algorithm for feature selection.
* Conference version: http://machinelearning.wustl.edu/mlpapers/paper_files/icml2003_YuL03.pdf
* Journal version: http://machinelearning.wustl.edu/mlpapers/paper_files/YuL04.pdf
* Assumption: Features and target values should have the same length.
* @param features The initial list of features.
* @param target The target feature.
* @param alpha The predetermined target-feature similarity threshold.
* @return The subset of features selected.
*/
def selectFeatures(features: List[FeatureVector[_]], target: FeatureVector[_], alpha: Double) = {
// Filter the irrelevant features corresponding to SU(feature, target) values lesser that the alpha threshold.
val sortedPairList = features.map(f => (f, symmetricalUncertainty(f, target))).filter(_._2 >= alpha)
// Order the features according to relevance i.e. in descending order of SU(feature, target).
.sortBy(_._2).reverse
val FeatureTargetSUMap = sortedPairList.toMap
// Recursively choose the predominant (top remaining) feature, filter all the redundant features
// i.e. features for which it forms an approximate Markov blanket.
@tailrec
def selectHeadFilterTailRec(features: List[FeatureVector[_]], acc: List[FeatureVector[_]]): List[FeatureVector[_]] =
features match {
case List() => features
case hd :: tl => selectHeadFilterTailRec(tl.filterNot(f => symmetricalUncertainty(f, hd) >= FeatureTargetSUMap(f)), hd :: acc)
}
selectHeadFilterTailRec(sortedPairList.map(_._1), List())
}
/**********************************************************************************************************************************************/
val f1 = FeatureVector.of(0,0,1,1)
val f2 = FeatureVector.of(0,1,0,1)
val f3 = FeatureVector.of(0,1,0,1)
val t = FeatureVector((f1.values zip f2.values).map{case (a, b) => a != b})
println("FCBF t " + selectFeatures(List(f1, f2), t, 0.1))
println("FCBF f3 " + selectFeatures(List(f1, f2), f3, 0.1))
}
@michellemay
Copy link

Am I wrong or it should be acc on line 70 ?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment