package edu.usfca.cs.mr.sample;

import java.io.IOException;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Map-only job that emits a random sample of its input lines. The sampling
 * fraction is passed as the first command-line argument and distributed to
 * the mappers through the job Configuration.
 */
public class SampleJob {

    public static void main(String[] args) {
        if (args.length < 3) {
            System.out.println("Args: <sample_fraction> <input_dir> <output_dir>");
            System.exit(1);
        }

        try {
            Configuration conf = new Configuration();
            /* Set up our sampling fraction: */
            conf.setFloat("frac", Float.parseFloat(args[0]));

            Job job = Job.getInstance(conf, "sampling job");
            job.setJarByClass(SampleJob.class);

            /* Mapper class */
            job.setMapperClass(SampleJob.SampleMapper.class);

            /* Outputs from the Mapper. */
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(NullWritable.class);

            /* Disable the reducer: */
            job.setNumReduceTasks(0);

            /* Job input and output paths in HDFS */
            FileInputFormat.addInputPath(job, new Path(args[1]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));

            /* Wait (block) for the job to complete... */
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            System.err.println(e.getMessage());
            System.exit(1);
        }
    }

    public static class SampleMapper
    extends Mapper<LongWritable, Text, Text, NullWritable> {

        private final NullWritable out = NullWritable.get();
        private final Random random = new Random();
        private float sampleFrac;

        @Override
        protected void setup(Context context) {
            Configuration conf = context.getConfiguration();
            /* Get the sample fraction. Defaults to 10% */
            sampleFrac = conf.getFloat("frac", 0.1f);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
            /* Emit this line (as the output key) only if a random draw falls
             * within the sample fraction. */
            float r = random.nextFloat();
            if (r <= sampleFrac) {
                context.write(value, out);
            }
        }
    }
}