package com.srini.tikacustom;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class TikaFileInputFormat extends FileInputFormat<Text, Text> {

    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        // Hand every split to the Tika-based record reader.
        return new TikaRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // TikaRecordReader parses each file as a whole, so never split it.
        return false;
    }
}
package com.srini.tikacustom;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class TikaMapreduce extends Configured implements Tool {

    // Identity mapper: the record reader already produces
    // (file path, extracted text) pairs, so just pass them through.
    public static class TikaMapper extends Mapper<Text, Text, Text, Text> {
        @Override
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    public static void main(String[] args) throws Exception {
        int exit = ToolRunner.run(new Configuration(), new TikaMapreduce(), args);
        System.exit(exit);
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: TikaMapreduce <input path> <output path>");
            return 2;
        }
        // Use the configuration that ToolRunner already populated
        // with any generic options.
        Job job = new Job(getConf(), "TikaMapreduce");
        job.setJarByClass(getClass());
        job.setInputFormatClass(TikaFileInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setMapperClass(TikaMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(TikaOutPutFormt.class);
        // Suffix the output directory with a timestamp so reruns do not collide.
        FileOutputFormat.setOutputPath(job, new Path(args[1]
                + System.currentTimeMillis()));
        return job.waitForCompletion(true) ? 0 : 1;
    }
}
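Assuming the classes are packaged into a jar together with the Tika libraries (the jar name and paths below are hypothetical), the job is submitted along these lines: hadoop jar tikacustom.jar com.srini.tikacustom.TikaMapreduce /user/srini/input /user/srini/out. Because the driver goes through ToolRunner, generic options such as -libjars (for shipping the Tika jars with the job) can be passed before the input and output arguments.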
package com.srini.tikacustom;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TikaOutPutFormt extends FileOutputFormat<Text, Text> {

    @Override
    public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        // Write all records into a single file inside the job output directory.
        Path path = FileOutputFormat.getOutputPath(context);
        Path fullPath = new Path(path, "Srini.txt");
        FileSystem fs = path.getFileSystem(context.getConfiguration());
        FSDataOutputStream output = fs.create(fullPath, context);
        return new TikaRecordWrite(output);
    }
}
package com.srini.tikacustom;

import java.io.IOException;
import java.net.URL;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;

// Reads each file as a single record: the key is the file path,
// the value is the text Tika extracts from it.
public class TikaRecordReader extends RecordReader<Text, Text> {

    private Text key = new Text();
    private Text value = new Text();
    private FileSplit fileSplit;
    private Configuration conf;
    private boolean processed = false;

    @Override
    public void close() throws IOException {
        // Nothing to clean up; the input is opened and parsed in nextKeyValue().
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        // Whole-file reader: progress is all-or-nothing.
        return processed ? 1.0f : 0.0f;
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        this.fileSplit = (FileSplit) split;
        this.conf = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!processed) {
            Path path = fileSplit.getPath();
            key.set(path.toString());
            @SuppressWarnings("unused")
            FileSystem fs = path.getFileSystem(conf);
            @SuppressWarnings("unused")
            FSDataInputStream fin = null;
            try {
                // NOTE: java.net.URL does not understand the hdfs:// scheme by
                // default; this causes the MalformedURLException discussed in
                // the comments below.
                String con = new Tika().parseToString(new URL(path.toString()));
                // Strip punctuation, collapse whitespace, and lowercase the text.
                String string = con.replaceAll("[$%&+,:;=?#|']", " ");
                String string2 = string.replaceAll("\\s+", " ");
                String lo = string2.toLowerCase();
                value.set(lo);
            } catch (TikaException e) {
                e.printStackTrace();
            }
            processed = true;
            return true;
        } else {
            return false;
        }
    }
}
package com.srini.tikacustom;

import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class TikaRecordWrite extends RecordWriter<Text, Text> {

    private DataOutputStream out;

    public TikaRecordWrite(DataOutputStream output) {
        out = output;
        try {
            // Write a header line before any records.
            out.writeBytes("result:\r\n");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException,
            InterruptedException {
        out.close();
    }

    @Override
    public void write(Text key, Text value) throws IOException,
            InterruptedException {
        // Emit records as "key,value" lines.
        out.writeBytes(key.toString());
        out.writeBytes(",");
        out.writeBytes(value.toString());
        out.writeBytes("\r\n");
    }
}
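With the identity mapper and this writer, Srini.txt in the output directory ends up as the header line followed by one "path,extracted text" line per input file, for example (hypothetical file and content):

result:
hdfs://localhost:9000/user/srini/input/sample.pdf,this is the lowercased text tika extracted from the file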
This is very awesome!!
I don't understand: where is the reducer? I see only the mapper.
Hi sri,
I used your code to parse a PDF file, but it is giving me an error:
Error: java.net.MalformedURLException: unknown protocol: hdfs
at java.net.URL.<init>(URL.java:593)
at java.net.URL.<init>(URL.java:483)
at java.net.URL.<init>(URL.java:432)
at com.serendio.icvs.hbase.test.TikaRecordReader.nextKeyValue(TikaRecordReader.java:68)
at org.apache.hadoop.mapred.MapTask$NewTrackingRecordReader.nextKeyValue(MapTask.java:532)
at org.apache.hadoop.mapreduce.task.MapContextImpl.nextKeyValue(MapContextImpl.java:80)
at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.nextKeyValue(WrappedMapper.java:91)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:144)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:763)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:339)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:162)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1491)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:157)
Hey!! I am also getting the same error as mentioned above by sahil28. Did anyone get it to work?
Please reply!!
Not telling!!!! I got it running!!! But now I won't tell you either!!!
I am also facing the same error: java.net.MalformedURLException: unknown protocol: hdfs. Why am I getting this error? Please, anyone, help me.
There are two ways to solve the error.

Either add 'URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());' in TikaRecordReader.java (FsUrlStreamHandlerFactory is in the org.apache.hadoop.fs package):

...
FSDataInputStream fin = null;
URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());
try {
    String con = new Tika().parseToString(new URL(path.toString()));
...

or change this code in TikaRecordReader.java:

...
FSDataInputStream fin = null;
try {
    String con = new Tika().parseToString(new URL(path.toString()));
...

to

...
FSDataInputStream fin = fs.open(path);
try {
    String con = new Tika().parseToString(fin);
...

Hope it helps.
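For reference, a minimal sketch of nextKeyValue() with the second fix applied, assuming the same fields (fileSplit, conf, key, value, processed) as the TikaRecordReader above:

@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (processed) {
        return false;
    }
    Path path = fileSplit.getPath();
    key.set(path.toString());
    // FileSystem.open() understands hdfs:// and any other configured scheme,
    // unlike java.net.URL, so no stream handler factory is needed.
    FileSystem fs = path.getFileSystem(conf);
    FSDataInputStream fin = fs.open(path);
    try {
        String con = new Tika().parseToString(fin);
        // Same cleanup as the original: strip punctuation,
        // collapse whitespace, lowercase.
        value.set(con.replaceAll("[$%&+,:;=?#|']", " ")
                .replaceAll("\\s+", " ")
                .toLowerCase());
    } catch (TikaException e) {
        e.printStackTrace();
    } finally {
        fin.close();
    }
    processed = true;
    return true;
}

Compared with the first fix, this also avoids URL.setURLStreamHandlerFactory(), which may only be called once per JVM.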
Hi, I am getting an error. What jar files do I need to add for Tika? Can you share them, if any?
Hi, I am trying to use your MapReduce program to parse TIFF images. Do I need to change any input or output format?
Hi guys,
please send me feedback.