Last active
September 18, 2017 00:31
-
-
Save jiamingd/97cb19cb98ae3698ce66d6cb5d754a27 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.collection.mutable | |
import scala.io.Source | |
import scala.util.Try | |
/* | |
Sample csv: | |
user_id,age,favorite_color | |
0,5,cyan | |
1,5,gray | |
2,22,green | |
3,22,purple | |
... | |
Total number of users processed: | |
Mean age of all users: | |
Median age of all users: | |
Top 5 favorite colors | |
Total number of users processed with age greater than 21 | |
Mean age of all users with age greater than 21 | |
Median age of all users with age greater than 21 | |
Top 5 favorite colors of user with age greater than 21 | |
*/ | |
// Path of the input CSV. REF_DIR is not defined in this script and must be supplied by the
// enclosing session/script.
val csvFile = s"${REF_DIR}/users.csv"
// Keep a handle on the Source so the underlying file can be closed once it has been consumed.
val csvSource = Source.fromFile(csvFile)
val itr = csvSource.getLines()

// age -> (favorite color -> number of users).
// Looking up an unseen key returns an empty/zero default WITHOUT inserting it, which is why the
// inner map is explicitly stored back after each update below.
val age2Color2Count: mutable.Map[Int, mutable.Map[String, Int]] =
  mutable.Map[Int, mutable.Map[String, Int]]()
    .withDefault(_ => mutable.Map[String, Int]().withDefault(_ => 0))

try {
  // Skip the header line (id, age, color). Guarded so that an empty file does not throw.
  if (itr.hasNext) itr.next()
  itr.foreach { l =>
    val buf = l.split(",")
    // Rows that do not split into exactly three fields are ignored.
    if (buf.size == 3) {
      // parseToInt maps non-numeric input to 0, so such rows are counted under age 0.
      val age = parseToInt(buf(1))
      val clr2Cnt = age2Color2Count(age)
      clr2Cnt(buf(2)) += 1
      age2Color2Count(age) = clr2Cnt
    }
  }
} finally {
  // Release the file handle even if a row fails to process.
  csvSource.close()
}
//Now FUN starting ... ...
//Total number of users processed
// Add up the per-color counts of every age bucket. `sum` is used instead of `reduce` so that
// an input with no data rows yields 0 rather than throwing on the empty collection.
val totalUsers = age2Color2Count.values.map(_.values.sum).sum
println(s"//Total number of users processed : ${totalUsers}")
//Mean age, median age of all users
// age -> number of users of that age (color counts collapsed).
val age2Headcount = for {
  (ag, cl2ct) <- age2Color2Count
} yield {
  (ag, cl2ct.values.sum)
}
// Head-count-weighted mean of the ages. The division is done in floating point so the
// fractional part of the mean is kept (integer division would truncate it); with zero users
// the result is NaN instead of an exception.
val avgAge = age2Headcount.toSeq.map { case (ag, ct) => ag.toLong * ct }.sum.toDouble / totalUsers
println(s"Mean age : ${avgAge}")
//Median age of all users
// Ages in ascending order, paired position-wise with the head count of each age.
val (ageSeq, headCountSeq) = age2Headcount.toSeq.sortBy { case (age, _) => age }.unzip
// Walk the running totals of the head counts; the median is the first age whose running total
// exceeds half of all users. `.head` throws if there is no such age (i.e. no users at all).
val medianAge = {
  val halfway = totalUsers / 2
  val runningTotals = headCountSeq.scanLeft(0)(_ + _).tail
  runningTotals.zip(ageSeq).dropWhile { case (cumulative, _) => cumulative <= halfway }.head._2
}
println(s"Median age of all users: ${medianAge}")
//Total number of users processed with age greater than 21
// One head count per age bucket strictly above 21; the buckets are summed when printing.
val headCountByAge21 = age2Color2Count.collect {
  case (ag, clr2Cnt) if ag > 21 => clr2Cnt.values.sum
}
println(s"Total number of users processed with age greater than 21: ${headCountByAge21.sum}")
//Top 5 favorite colors
// color -> total number of users across every age: flatten all (color, count) pairs,
// group them by color and add up the counts of each group.
val clr2HeadCnt = age2Color2Count.values.flatten
  .groupBy { case (clr, _) => clr }
  .map { case (clr, pairs) => (clr, pairs.map(_._2).sum) }
println("Top 5 favorite colors (color, count)")
// Highest count first, then keep the first five entries.
clr2HeadCnt.toSeq
  .sortBy { case (_, cnt) => cnt }(Ordering[Int].reverse)
  .take(5)
  .foreach(println)
//Top 5 favorite colors of user with age greater than 21
//Discussion: for now, if several colors tie around the 5th position, which of them is kept is arbitrary
println("Top 5 favorite colors of user with age greater than 21:")
age2Color2Count
  // `filter` builds a new map. The previous `retain` removed every age <= 21 entry from
  // age2Color2Count itself, silently altering the shared data for any code that reads it later.
  .filter { case (ag, _) => ag > 21 }
  .values
  .flatten
  .groupBy(_._1)
  .map { case (c, c2cntTuples) => (c, c2cntTuples.map(_._2).sum) }
  .toSeq
  .sortWith(_._2 > _._2)
  .take(5)
  .foreach(println)
// Median age of all users with age greater than 21
// age -> head count, restricted to ages strictly above 21.
val age21HeadCount = for ((age, c2c) <- age2Color2Count if age > 21) yield {
  (age, c2c.values.sum)
}
val medianAgeAbove21 = {
  val halfOfAbove21 = age21HeadCount.values.sum / 2
  // Sort by age before accumulating: a map's iteration order is unspecified, and a running
  // total over ages in arbitrary order does not locate the median.
  val (ages, headCounts) = age21HeadCount.toSeq.sortBy(_._1).unzip
  // First age whose cumulative head count exceeds half of the over-21 users.
  // `.head` throws NoSuchElementException when nobody is older than 21.
  headCounts.scanLeft(0)(_ + _).tail.zip(ages).dropWhile(_._1 <= halfOfAbove21).head._2
}
println(s"Median age of all users with age greater than 21: $medianAgeAbove21")
//Mean age of all users with age greater than 21
// age -> head count, restricted to ages strictly above 21.
val age2Count21 = for ((age, clr2cnt) <- age2Color2Count if age > 21) yield {
  (age, clr2cnt.values.sum)
}
// (age * count, count) pairs. The map is converted to a Seq first: mapping the Map directly
// would produce a Map keyed by age * count, so two ages with the same product (e.g. 30 * 4 and
// 40 * 3) would overwrite each other and corrupt both sums.
val ageTotal2Count = age2Count21.toSeq.map { case (age, count) => (age.toLong * count, count) }
// Floating-point division keeps the fractional part of the mean; with no users above 21 the
// result is NaN rather than a division-by-zero exception.
val meanAgeAbove21 = ageTotal2Count.map(_._1).sum.toDouble / ageTotal2Count.map(_._2).sum
println(s"Mean age of all users with age greater than 21: ${meanAgeAbove21}")
/** Parses `s` as an `Int`; any input that cannot be parsed yields 0. */
def parseToInt(s: String): Int =
  Try(s.toInt) match {
    case scala.util.Success(parsed) => parsed
    case scala.util.Failure(_)      => 0
  }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment