Last active
September 25, 2018 11:24
-
-
Save MishaelRosenthal/05936a46905b7127ce1c to your computer and use it in GitHub Desktop.
Implements the Fast Correlation Based Filter algorithm for feature selection.Conference version: http://machinelearning.wustl.edu/mlpapers/paper_files/icml2003_YuL03.pdfJournal version: http://machinelearning.wustl.edu/mlpapers/paper_files/YuL04.pdf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.liveperson.lpbt.research.hadoop.examples | |
import scala.annotation.tailrec | |
/** | |
* User: mishaelr | |
* Date: 7/11/13 | |
* Time: 10:33 AM | |
*/ | |
object FCBF extends App{ | |
/** | |
* A case class holding a list of feature observations. | |
*/ | |
case class FeatureVector[+T](values: List[T]) | |
object FeatureVector{ | |
def of[T](t: T*) = FeatureVector(t.toList) | |
} | |
/** | |
* Calculates the frequencies of each of the values in the List. | |
* Assumption: values will be grouped according to groupBy(identity). | |
*/ | |
def calcFreq[T](values: List[T]) = values.groupBy(identity).map{case (key, list) => (key, list.length.toDouble/values.length)}.values | |
/** | |
* Calculates the entropy H(X), given a list of frequencies. | |
* Assumption: Frequencies must sum to one. | |
*/ | |
def calcEntropy(freq: Iterable[Double]) = freq.foldRight(0.0)((l, r) => r - (if(l == 0) 0 else l * math.log(l))) / math.log(2.0) | |
def calcFeatureEntropy[T](f: FeatureVector[T]) = calcEntropy(calcFreq(f.values)) | |
/** | |
* Calculates the mutual entropy H(f1, f2) of two features. | |
*/ | |
def calcMutualFeatureEntropy[T, S](f1: FeatureVector[T], f2: FeatureVector[S]) = calcEntropy(calcFreq(f1.values.zip(f2.values))) | |
/** | |
* Calculates the symmetrical uncertainty of f1 and f2. | |
* Symmetrical uncertainty is defined as: | |
* SU(X, Y) = 2 * (IG(X, Y) / (H(X) + H(Y)) = 2 * ((H(x) + H(Y) - H(X, Y)) / (H(x) + H(Y))) = 2 - 2 * H(X, Y) / (H(x) + H(Y)) | |
* http://en.wikipedia.org/wiki/Mutual_information#Normalized_variants | |
*/ | |
def symmetricalUncertainty[T, S](f1: FeatureVector[T], f2: FeatureVector[S]) = 2 - (2 * calcMutualFeatureEntropy(f1, f2) / (calcFeatureEntropy(f1) + calcFeatureEntropy(f2))) | |
/** | |
* Implements the Fast Correlation Based Filter algorithm for feature selection. | |
* Conference version: http://machinelearning.wustl.edu/mlpapers/paper_files/icml2003_YuL03.pdf | |
* Journal version: http://machinelearning.wustl.edu/mlpapers/paper_files/YuL04.pdf | |
* Assumption: Features and target values should have the same length. | |
* @param features The initial list of features. | |
* @param target The target feature. | |
* @param alpha The predetermined target-feature similarity threshold. | |
* @return The subset of features selected. | |
*/ | |
def selectFeatures(features: List[FeatureVector[_]], target: FeatureVector[_], alpha: Double) = { | |
// Filter the irrelevant features corresponding to SU(feature, target) values lesser that the alpha threshold. | |
val sortedPairList = features.map(f => (f, symmetricalUncertainty(f, target))).filter(_._2 >= alpha) | |
// Order the features according to relevance i.e. in descending order of SU(feature, target). | |
.sortBy(_._2).reverse | |
val FeatureTargetSUMap = sortedPairList.toMap | |
// Recursively choose the predominant (top remaining) feature, filter all the redundant features | |
// i.e. features for which it forms an approximate Markov blanket. | |
@tailrec | |
def selectHeadFilterTailRec(features: List[FeatureVector[_]], acc: List[FeatureVector[_]]): List[FeatureVector[_]] = | |
features match { | |
case List() => features | |
case hd :: tl => selectHeadFilterTailRec(tl.filterNot(f => symmetricalUncertainty(f, hd) >= FeatureTargetSUMap(f)), hd :: acc) | |
} | |
selectHeadFilterTailRec(sortedPairList.map(_._1), List()) | |
} | |
/**********************************************************************************************************************************************/ | |
val f1 = FeatureVector.of(0,0,1,1) | |
val f2 = FeatureVector.of(0,1,0,1) | |
val f3 = FeatureVector.of(0,1,0,1) | |
val t = FeatureVector((f1.values zip f2.values).map{case (a, b) => a != b}) | |
println("FCBF t " + selectFeatures(List(f1, f2), t, 0.1)) | |
println("FCBF f3 " + selectFeatures(List(f1, f2), f3, 0.1)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Am I wrong or it should be
acc
on line 70 ?