Skip to content

Instantly share code, notes, and snippets.

@lucianogiuseppe
Created August 13, 2016 15:18
Show Gist options
  • Save lucianogiuseppe/063aff936f548fdd0faad6ef004a43e7 to your computer and use it in GitHub Desktop.
Save lucianogiuseppe/063aff936f548fdd0faad6ef004a43e7 to your computer and use it in GitHub Desktop.
Java WordCount on Spark using Dataset
//tested on spark 2.0
import org.apache.spark.sql.*;
import java.util.Arrays;
/**
 * Word-count job built on the Spark Dataset API.
 *
 * <p>Reads a text file, lower-cases every line, splits it on single spaces,
 * counts how many times each non-empty token occurs and writes the
 * {@code (word, count)} rows, sorted by descending count, as text files
 * into the given output directory.
 *
 * <p>Usage: {@code WordCount <input-file> <output-dir>}
 */
public class WordCount {

    /**
     * Program entry point.
     *
     * @param args {@code args[0]} is the path of the input text file,
     *             {@code args[1]} is the output directory passed to
     *             {@code saveAsTextFile}
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println("Please provide the full path of input file and output dir as arguments");
            // A usage error is a failure: report it with a non-zero status so
            // calling scripts/schedulers do not treat the run as successful.
            System.exit(1);
        }

        SparkSession spark = SparkSession
                .builder()
                .master("local")
                .appName("WordCount")
                .getOrCreate();

        try {
            // Each input line becomes one String element.
            Dataset<String> df = spark.read().text(args[0]).as(Encoders.STRING());

            // Tokenize: lower-case each line, split on single spaces, and drop the
            // empty tokens produced by consecutive/leading spaces.
            Dataset<String> words = df.flatMap(s -> {
                return Arrays.asList(s.toLowerCase().split(" ")).iterator();
            }, Encoders.STRING())
                    .filter(s -> !s.isEmpty())
                    .coalesce(1); //one partition (parallelism level)

            //words.printSchema(); // { value: string (nullable = true) }

            // The single column of a Dataset<String> is named "value".
            Dataset<Row> t = words.groupBy("value") //<k, iter(V)>
                    .count()
                    .toDF("word", "count");

            // Most frequent words first.
            t = t.sort(functions.desc("count"));
            t.toJavaRDD().saveAsTextFile(args[1]);
        } finally {
            // Always release the underlying SparkContext, even if the job fails.
            spark.stop();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment