Skip to content

Instantly share code, notes, and snippets.

@jogam5
Created October 22, 2021 01:29
Show Gist options
  • Save jogam5/6d3d15fc9f83e2594b913c081e4a8094 to your computer and use it in GitHub Desktop.
Save jogam5/6d3d15fc9f83e2594b913c081e4a8094 to your computer and use it in GitHub Desktop.
Grep in MapReduce
package my.midterm;
/*
'Overview' section provides a concise high level summary of Hadoop - MapReduce:
https://hadoop.apache.org/docs/r1.2.1/mapred_tutorial.html
*/
import java.io.*;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import java.util.ArrayList;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.io.Text;
public class Grep {
public static class Map extends Mapper<LongWritable, Text, Text, Text> {
/*
Rationale:
Map extends Mapper class, where the method map() (one of many maps tasks)
is called each time a line is processed. When the line contains the
string "pattern", "context.write()" is called to generate an output
similar to {"dark", "the sky is dark"}.
The final output of Map is similar to:
{"dark", "the sky is dark"}, {"dark", "dark knight"}, ...
*/
/* Supplied pattern */
String pattern ="dark";
private final Text keyOutput = new Text();
private final Text valueOutput = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException,
InterruptedException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
if (tokenizer.nextToken().contains(pattern)) {
// System.out.println(line);
keyOutput.set(pattern);
valueOutput.set(line);
context.write(keyOutput, valueOutput);
break;
}
}
}
}
public static class Reduce extends Reducer<Text, Text, Text, Text> {
/*
Rationale:
Reduce receives a lists of key:value pairs from Map (i.e.
{"dark", "the sky is dark"}, {"dark", "dark knight"}, ... ).
For each unique key, the object "Iterable<Text> values" is
looped over in order to aggregate all the values for the same key.
In this case, all the values are stored in an ArrayList that
will be reordered to output the lines in the file that contain
the word specified in "dark".
*/
private final Text result = new Text();
public void reduce(Text key, Iterable<Text> values, Context context) throws
IOException, InterruptedException {
// System.out.println("---> Enter REDUCE");
List<String> lines = new ArrayList<>();
for (Text val : values) {
// System.out.println(val);
lines.add(val.toString());
}
/* Reverse order of elements in arrayList */
Collections.reverse(lines);
result.set(lines.toString());
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "grep");
job.setJarByClass(Grep.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment