Last active
February 6, 2016 15:45
-
-
Save bistaumanga/139dcd2281c61ab5bf06 to your computer and use it in GitHub Desktop.
pwlktm02
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Load the bank CSV from HDFS as an RDD with one element per line of text.
val rawData = {
  val bankCsvPath = "hdfs://hdfs-nn-host.com:9000/user/umb/bank/bank.csv"
  sc.textFile(bankCsvPath)
}
// Peek at the first five lines to check the layout.
rawData.take(5)
// Example of a data line (fields separated by ';', text fields wrapped in double quotes):
// 30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"
import scala.language.implicitConversions

/**
 * Implicit view from `String` to `Int`, delegating to `String.toInt`.
 *
 * With this in scope a `String` may be passed wherever an `Int` is expected,
 * which is how the textual CSV fields end up in the `Int` fields of a row.
 * The result type is spelled out so the conversion's signature does not
 * depend on type inference.
 *
 * @param x text to convert
 * @return the integer value of `x`
 * @throws NumberFormatException if `x` is not a valid integer literal
 */
implicit def str2int(x: String): Int = x.toInt // because i'm lazy
/**
 * One parsed data row. Fields are declared in the same order as the
 * `;`-separated columns of a line such as
 * `30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"`.
 *
 * NOTE(review): `maritial` and `camoaign` look like misspellings of "marital"
 * and "campaign"; they are kept because renaming would change the field names
 * that callers use.
 */
case class DataRow(
  age: Int,
  job: String,
  maritial: String,
  education: String,
  default_ : String, // keep the space before ':' -- `default_:` would be lexed as one identifier
  balance: Int,
  housing: String,
  loan: String,
  contact: String,
  day: Int,
  month: String,
  duration: Int,
  camoaign: Int,
  pdays: Int,
  previous: Int,
  poutcome: String,
  y: String
)
// Parsing: skip every line containing the text "default" (presumably the header
// line, which names a column "default" -- confirm against the file), split the
// remaining lines on ';' and build one DataRow per line.
// Numeric columns are converted with an explicit `.toInt`, so this step does not
// depend on an implicit String => Int conversion being in scope.
// Text columns are passed through exactly as split, i.e. still wrapped in the
// double quotes they have in the raw file.
val filtered = rawData
  .filter(line => !line.contains("default"))
  .map(_.split(";"))
  .map { arr =>
    DataRow(
      age = arr(0).toInt,
      job = arr(1),
      maritial = arr(2),
      education = arr(3),
      default_ = arr(4),
      balance = arr(5).toInt,
      housing = arr(6),
      loan = arr(7),
      contact = arr(8),
      day = arr(9).toInt,
      month = arr(10),
      duration = arr(11).toInt,
      camoaign = arr(12).toInt,
      pdays = arr(13).toInt,
      previous = arr(14).toInt,
      poutcome = arr(15),
      y = arr(16)
    )
  }
// Persist in memory: the parsed rows feed several separate counting jobs.
filtered.cache()
// Contingency counts for a chi-squared test of independence
// between job and loan outcome (`y`).

// Joint counts per (job, y) cell; kept as an RDD.
val countsByJobAndY = filtered
  .map(row => (row.job, row.y) -> 1)
  .reduceByKey((a, b) => a + b)

// Marginal counts per job and per outcome, collected to the driver as maps.
val countsByJob = filtered
  .map(row => row.job -> 1)
  .reduceByKey((a, b) => a + b)
  .collectAsMap()
val countsByY = filtered
  .map(row => row.y -> 1)
  .reduceByKey((a, b) => a + b)
  .collectAsMap()

// Total number of parsed rows.
val countTotal = filtered.count()

// Top 10 cells by raw count: first "suspects" for a strong association.
countsByJobAndY.sortBy { case (_, n) => -n }.take(10)
// observed and expected frequencies
import scala.math
// Per-cell chi-squared contribution: (observed - expected)^2 / expected, where
// expected = (count for the job) * (count for the outcome) / (total rows).
val chiSqVals = countsByJobAndY
  .map { case ((job, y), observed) =>
    // Convert to Double before multiplying: Int * Int / Long is integer
    // arithmetic, which truncates the expected frequency (down to 0 for sparse
    // cells, making the later division blow up) and can overflow Int.
    val expected = countsByJob(job).toDouble * countsByY(y) / countTotal
    ((job, y), (observed, expected))
  }
  .mapValues { case (observed, expected) => math.pow(observed - expected, 2) / expected }
// Top 10 cells by contribution to the test statistic.
chiSqVals.sortBy(-_._2).take(10)
// Per-cell contributions to the test statistic.
chiSqVals.collect()
// Degrees of freedom of the test: (number of jobs - 1) * (number of outcomes - 1).
(countsByJob.size - 1) * (countsByY.size - 1)
// **ChiSquared (0.05,13) ** = 22.362
// NOTE(review): the 13 above is hard-coded; confirm it equals the degrees of
// freedom computed on the previous line before using 22.362 as the critical value.
// Chi-squared test statistic = sum of all per-cell contributions.
// (`reduce` needs the combining function; calling it with no argument does not compile.)
chiSqVals.map(_._2).reduce(_ + _)
// If the statistic is higher than the critical value, job and loan outcome are not independent.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Read everything under the novels directory as an RDD of text lines.
val rawTexts = {
  val novelsPath = "hdfs://hdfs-nn-host.com:9000/user/umb/novels/"
  sc.textFile(novelsPath)
}
// Peek at the first five lines.
rawTexts.take(5)
// Tokenise on whitespace, strip every character that is not a letter or a space,
// lower-case the token, and keep only tokens of 5 to 10 characters.
val words = rawTexts
  .flatMap(line => line.split("\\s+"))
  .map(token => token.replaceAll("[^a-zA-Z ]", ""))
  .map(token => token.toLowerCase)
  .filter { word =>
    val len = word.length
    len >= 5 && len <= 10
  }
// Number of occurrences of each word.
val counts = words
  .map(word => (word, 1))
  .reduceByKey((a, b) => a + b)
// The ten most frequent words (sorted by descending count).
counts.sortBy { case (_, n) => -n }.take(10)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment