SampleJob.java
package edu.usfca.cs.mr.sample;
import java.io.IOException;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
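
/**
 * A map-only MapReduce job that writes out a random sample of its input.
 * Each input line is kept with probability "frac", passed as the first
 * command line argument.
 */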
public class SampleJob {

    public static void main(String[] args) {
        if (args.length < 3) {
            System.err.println("Usage: SampleJob <frac> <input> <output>");
            System.exit(1);
        }

        try {
            Configuration conf = new Configuration();

            /* Pass the sampling fraction to the mappers via the job
             * configuration: */
            conf.setFloat("frac", Float.parseFloat(args[0]));

            Job job = Job.getInstance(conf, "sampling job");
            job.setJarByClass(SampleJob.class);

            /* Mapper class */
            job.setMapperClass(SampleJob.SampleMapper.class);

            /* Outputs from the Mapper. */
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(NullWritable.class);

            /* Disable the reducer; this is a map-only job, so mapper output
             * is written directly to HDFS: */
            job.setNumReduceTasks(0);

            /* Job input and output paths in HDFS */
            FileInputFormat.addInputPath(job, new Path(args[1]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));

            /* Wait (block) for the job to complete... */
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    public static class SampleMapper
            extends Mapper<LongWritable, Text, Text, NullWritable> {

        private final NullWritable out = NullWritable.get();
        private final Random random = new Random();
        private float sampleFrac;

        @Override
        protected void setup(Context context) {
            Configuration conf = context.getConfiguration();

            /* Get the sample fraction. Defaults to 10% */
            sampleFrac = conf.getFloat("frac", 0.1f);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            /* nextFloat() is uniform over [0.0, 1.0), so a strict '<'
             * keeps each line with probability sampleFrac. */
            if (random.nextFloat() < sampleFrac) {
                /* Emit the line itself as the output key; no value is
                 * needed, so we write a NullWritable. */
                context.write(value, out);
            }
        }
    }
}
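
A quick way to try the job (the jar name and HDFS paths here are placeholders):

    hadoop jar SampleJob.jar edu.usfca.cs.mr.sample.SampleJob 0.1 /inputs/dataset.txt /outputs/sample

Since the reducer is disabled, the sampled lines land directly in the part-m-* files under the output directory; with a fraction of 0.1, expect roughly 10% of the input lines to appear there.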