MapReduce经典入门小案例

/**
 * Word count driver: configures and submits the job that counts how often
 * each word occurs in the input file.
 *
 * @author fengmingyue
 */
public class WordCount {
    /**
     * Builds the job, runs it, and waits for it to finish.
     *
     * @param args command-line arguments (not used; input and output paths are fixed)
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCount.class);

        // Map side: emits (word, 1) pairs.
        job.setMapperClass(WCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, new Path("hdfs://localhost:9000/input/words.txt"));

        // Reduce side: emits (word, total).
        job.setReducerClass(WCReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/output"));

        /*
         * The combiner's output becomes the reducer's input, and a combiner is
         * optional, so adding one must never change the final result.
         * A combiner should therefore only be used when the reduce input
         * key/value types are identical to its output key/value types and the
         * operation is unaffected by partial aggregation, e.g. summing or
         * taking a maximum.
         */
        job.setCombinerClass(WCReducer.class);

        // waitForCompletion reports a failed job through its return value, not
        // an exception; surface that as a non-zero exit status so callers and
        // schedulers can detect the failure.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }
}
/**
 * Adds up all counts received for one word and writes a single
 * (word, total) pair. WordCount registers this class both as the reducer
 * and as the combiner.
 */
class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable>{
    /**
     * Writes the sum of {@code values} under {@code key}.
     *
     * @param key     the word
     * @param values  the counts collected for that word
     * @param context sink for the (word, total) pair
     */
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        LongWritable total = new LongWritable(sumOf(values));
        context.write(key, total);
    }

    /** Returns the arithmetic sum of the wrapped long values. */
    private static long sumOf(Iterable<LongWritable> counts) {
        long sum = 0;
        for (LongWritable count : counts) {
            sum = sum + count.get();
        }
        return sum;
    }
}
/**
 * Splits each input line on single spaces and emits (word, 1) for every
 * non-empty word.
 */
class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
    /** The constant count emitted for every word occurrence. */
    private static final LongWritable ONE = new LongWritable(1);
    /** Reused output key; context.write is called before the next set(). */
    private final Text word = new Text();

    /**
     * Emits one (word, 1) pair per word on the line.
     *
     * @param key     the input key (unused)
     * @param value   one line of input text
     * @param context sink for the (word, 1) pairs
     */
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        for (String token : line.split(" ")) {
            // split(" ") yields empty strings for blank lines and for leading
            // or repeated spaces; those are not words and must not be counted.
            if (token.isEmpty()) {
                continue;
            }
            word.set(token);
            context.write(word, ONE);
        }
    }
}
/**
 * 输入:
 *      hello tom
        hello tom2
        hello tom3
        hello tom4
        hello tom5
   输出:
        hello   5
        tom     1
        tom2    1
        tom3    1
        tom4    1
        tom5    1
 */

 

 

/**
 * Traffic statistics: sums upstream and downstream traffic per phone number
 * and spreads the results over several output files by number prefix.
 *
 * @author fengmingyue
 */
public class DataCount {
    /**
     * Builds the job, runs it, and waits for it to finish.
     *
     * @param args command-line arguments (not used; input and output paths are fixed)
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(DataCount.class);
        job.setMapperClass(DCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DataBean.class);
        FileInputFormat.setInputPaths(job, new Path("hdfs://localhost:9000/input/flowData.txt"));

        job.setReducerClass(DCReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DataBean.class);
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/output"));

        job.setPartitionerClass(ServiceProviderPartitioner.class);
        /*
         * Number of reducers; each reducer writes its own result file.
         * ServiceProviderPartitioner distinguishes four groups (0-3). With more
         * than four reducers the extra result files stay empty; with fewer,
         * the partitioner folds the groups onto the available reducers.
         */
        job.setNumReduceTasks(4);

        // A failed job is reported through the return value; turn it into a
        // non-zero exit status so the failure is visible to the caller.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }

    /**
     * Parses one tab-separated record and emits (phone number, traffic bean).
     * Field 1 is the phone number, field 8 the upstream and field 9 the
     * downstream traffic (zero-based indices).
     */
    public static class DCMapper extends Mapper<LongWritable, Text, Text, DataBean>{
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = line.split("\t");
            String tel = fields[1];
            long up = Long.parseLong(fields[8]);
            long down = Long.parseLong(fields[9]);
            DataBean bean = new DataBean(tel, up, down);
            context.write(new Text(tel), bean);
        }
    }

    /**
     * Sums the upstream and downstream traffic of all records that share a
     * phone number and writes one bean holding the totals.
     */
    public static class DCReducer extends Reducer<Text, DataBean, Text, DataBean>{
        protected void reduce(Text key, Iterable<DataBean> values, Context context)
                throws IOException, InterruptedException {
            long upSum = 0;
            long downSum = 0;
            for (DataBean record : values) {
                upSum += record.getUpPayLoad();
                downSum += record.getDownPayLoad();
            }
            // The phone number is already the output key, so the bean's own
            // tel field is left empty.
            DataBean totals = new DataBean("", upSum, downSum);
            context.write(key, totals);
        }
    }

    /**
     * Routes records to reducers by the first three characters of the phone
     * number: 139 -> 1, 138 -> 2, 159 -> 3, anything else -> 0.
     */
    public static class ServiceProviderPartitioner extends Partitioner<Text, DataBean>{
        /** Length of the phone-number prefix used for routing. */
        private static final int PREFIX_LENGTH = 3;
        private static final Map<String, Integer> providerMap = new HashMap<String, Integer>();
        static {
            providerMap.put("139", 1);
            providerMap.put("138", 2);
            providerMap.put("159", 3);
        }

        /**
         * @param key    the phone number
         * @param value  the traffic bean (not consulted)
         * @param number the number of reduce tasks
         * @return a partition index in the range {@code [0, number)}
         */
        public int getPartition(Text key, DataBean value, int number) {
            String telNo = key.toString();
            Integer p = null;
            // Keys shorter than the prefix cannot match any provider; without
            // this guard substring() would throw.
            if (telNo.length() >= PREFIX_LENGTH) {
                p = providerMap.get(telNo.substring(0, PREFIX_LENGTH));
            }
            if (p == null) {
                p = 0;
            }
            // Keep the result valid when the job runs with fewer reducers than
            // provider groups; with four or more reducers this is a no-op.
            return p % number;
        }
    }
}
/**
 * Traffic record for one phone number: upstream, downstream and total
 * payload. Implements Writable so it can be used as a MapReduce value.
 *
 * <p>The total is computed by the three-argument constructor only; the
 * individual setters do not recompute it.
 */
class DataBean implements Writable{
    /** Phone number; never null so that it can always be serialized. */
    private String tel = "";
    private long upPayLoad;
    private long downPayLoad;
    private long totalPayLoad;

    /** No-arg constructor; leaves tel empty and all payloads at zero. */
    public DataBean(){}

    /**
     * @param tel         the phone number (null is stored as an empty string)
     * @param upPayLoad   upstream traffic
     * @param downPayLoad downstream traffic
     */
    public DataBean(String tel, long upPayLoad, long downPayLoad) {
        this.tel = (tel == null) ? "" : tel;
        this.upPayLoad = upPayLoad;
        this.downPayLoad = downPayLoad;
        this.totalPayLoad = upPayLoad + downPayLoad;
    }

    /** Returns upstream, downstream and total payload separated by tabs. */
    @Override
    public String toString() {
        return this.upPayLoad + "\t" + this.downPayLoad + "\t" + this.totalPayLoad;
    }

    // Note: readFields must read the same types in the same order as write.
    public void write(DataOutput out) throws IOException {
        out.writeUTF(tel);
        out.writeLong(upPayLoad);
        out.writeLong(downPayLoad);
        out.writeLong(totalPayLoad);
    }

    public void readFields(DataInput in) throws IOException {
        this.tel = in.readUTF();
        this.upPayLoad = in.readLong();
        this.downPayLoad = in.readLong();
        this.totalPayLoad = in.readLong();
    }

    public String getTel() {
        return tel;
    }

    /** Sets the phone number; null is stored as an empty string so write() cannot fail. */
    public void setTel(String tel) {
        this.tel = (tel == null) ? "" : tel;
    }

    public long getUpPayLoad() {
        return upPayLoad;
    }

    public void setUpPayLoad(long upPayLoad) {
        this.upPayLoad = upPayLoad;
    }

    public long getDownPayLoad() {
        return downPayLoad;
    }

    public void setDownPayLoad(long downPayLoad) {
        this.downPayLoad = downPayLoad;
    }

    public long getTotalPayLoad() {
        return totalPayLoad;
    }

    public void setTotalPayLoad(long totalPayLoad) {
        this.totalPayLoad = totalPayLoad;
    }
}
/**
 * 输入:
 *  1363157985066     13726230503    00-FD-07-A4-72-B8:CMCC    120.196.100.82    i02.c.aliimg.com        24    27    2481    24681    200
    1363157995052     13826544101    5C-0E-8B-C7-F1-E0:CMCC    120.197.40.4            4    0    264    0    200
    1363157991076     13926435656    20-10-7A-28-CC-0A:CMCC    120.196.100.99            2    4    132    1512    200
    1363154400022     13926251106    5C-0E-8B-8B-B1-50:CMCC    120.197.40.4            4    0    240    0    200
    1363157993044     18211575961    94-71-AC-CD-E6-18:CMCC-EASY    120.196.100.99    iface.qiyi.com    视频网站    15    12    1527    2106    200
    1363157995074     84138413    5C-0E-8B-8C-E8-20:7DaysInn    120.197.40.4    122.72.52.12        20    16    4116    1432    200
    1363157993055     13560439658    C4-17-FE-BA-DE-D9:CMCC    120.196.100.99            18    15    1116    954    200
    1363157995033     15920133257    5C-0E-8B-C7-BA-20:CMCC    120.197.40.4    sug.so.360.cn    信息安全    20    20    3156    2936    200
    1363157983019    13719199419    68-A1-B7-03-07-B1:CMCC-EASY    120.196.100.82            4    0    240    0    200
    1363157984041     13660577991    5C-0E-8B-92-5C-20:CMCC-EASY    120.197.40.4    s19.cnzz.com    站点统计    24    9    6960    690    200
    1363157973098     15013685858    5C-0E-8B-C7-F7-90:CMCC    120.197.40.4    rank.ie.sogou.com    搜索引擎    28    27    3659    3538    200
    1363157986029     15989002119    E8-99-C4-4E-93-E0:CMCC-EASY    120.196.100.99    www.umeng.com    站点统计    3    3    1938    180    200
    1363157992093     13560439658    C4-17-FE-BA-DE-D9:CMCC    120.196.100.99            15    9    918    4938    200
    1363157986041     13480253104    5C-0E-8B-C7-FC-80:CMCC-EASY    120.197.40.4            3    3    180    180    200
    1363157984040     13602846565    5C-0E-8B-8B-B6-00:CMCC    120.197.40.4    2052.flash2-http.qq.com    综合门户    15    12    1938    2910    200
    1363157995093     13922314466    00-FD-07-A2-EC-BA:CMCC    120.196.100.82    img.qfc.cn        12    12    3008    3720    200
    1363157982040     13502468823    5C-0A-5B-6A-0B-D4:CMCC-EASY    120.196.100.99    y0.ifengimg.com    综合门户    57    102    7335    110349    200
    1363157986072     18320173382    84-25-DB-4F-10-1A:CMCC-EASY    120.196.100.99    input.shouji.sogou.com    搜索引擎    21    18    9531    2412    200
    1363157990043     13925057413    00-1F-64-E1-E6-9A:CMCC    120.196.100.55    t3.baidu.com    搜索引擎    69    63    11058    48243    200
    1363157988072     13760778710    00-FD-07-A4-7B-08:CMCC    120.196.100.82            2    2    120    120    200
    1363157985066     13726238888    00-FD-07-A4-72-B8:CMCC    120.196.100.82    i02.c.aliimg.com        24    27    2481    24681    200
    1363157993055     13560436666    C4-17-FE-BA-DE-D9:CMCC    120.196.100.99            18    15    1116    954    200
   输出:
   part-r-00000:
        13480253104    180    180    360
        13502468823    7335    110349    117684
        13560436666    1116    954    2070
        13560439658    2034    5892    7926
        13602846565    1938    2910    4848
        13660577991    6960    690    7650
        13719199419    240    0    240
        13726230503    2481    24681    27162
        13726238888    2481    24681    27162
        13760778710    120    120    240
        15013685858    3659    3538    7197
        18211575961    1527    2106    3633
        18320173382    9531    2412    11943
        84138413    4116    1432    5548
   part-r-00001:
        13922314466    3008    3720    6728
        13925057413    11058    48243    59301
        13926251106    240    0    240
        13926435656    132    1512    1644
   part-r-00002:
        13826544101    264    0    264
   part-r-00003:
        15920133257    3156    2936    6092
        15989002119    1938    180    2118
 */

 

 

/**
 * Step one of "sum first, then sort": totals income and expenses per account.
 *
 * @author fengmingyue
 */
public class SumStep {
    /**
     * Builds the job, runs it, and waits for it to finish.
     *
     * @param args command-line arguments (not used; input and output paths are fixed)
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SumStep.class);
        job.setMapperClass(SumMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(InfoBean.class);
        FileInputFormat.setInputPaths(job, new Path("hdfs://localhost:9000/input/trade_info.txt"));
        job.setReducerClass(SumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(InfoBean.class);
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/output"));

        // A failed job is reported through the return value; turn it into a
        // non-zero exit status so the failure is visible to the caller.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }

    /**
     * Parses one tab-separated record (account, income, expenses, ...) and
     * emits (account, bean).
     */
    public static class SumMapper extends Mapper<LongWritable, Text, Text, InfoBean>{
        // Output objects are reused across map() calls.
        private InfoBean bean = new InfoBean();
        private Text k = new Text();

        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = line.split("\t");
            String account = fields[0];
            double income = Double.parseDouble(fields[1]);
            double expenses = Double.parseDouble(fields[2]);
            k.set(account);
            bean.set(account, income, expenses);
            context.write(k, bean);
        }
    }

    /**
     * Sums income and expenses over all records of one account and writes a
     * single bean holding the totals.
     */
    public static class SumReducer extends Reducer<Text, InfoBean, Text, InfoBean>{
        // Output object reused across reduce() calls.
        private InfoBean bean = new InfoBean();

        protected void reduce(Text key, Iterable<InfoBean> v2s, Context context)
                throws IOException, InterruptedException {
            double inSum = 0;
            double outSum = 0;
            // The loop variable is deliberately not called "bean" so it does
            // not shadow the output field used below.
            for (InfoBean record : v2s) {
                inSum += record.getIncome();
                outSum += record.getExpenses();
            }
            // The account is already the output key, so the bean's own
            // account field is left empty.
            bean.set("", inSum, outSum);
            context.write(key, bean);
        }
    }
}
/**
 * Income/expense record of one account. Implements WritableComparable so it
 * can be used both as a MapReduce value and as a sortable key.
 *
 * <p>Ordering: higher income first; for equal income, lower expenses first;
 * remaining ties are ordered by account name. Two beans compare as equal only
 * when income, expenses and account all match, and {@code equals}/{@code hashCode}
 * agree with that ordering. The surplus is derived by {@link #set} and takes
 * no part in comparison.
 */
class InfoBean implements WritableComparable<InfoBean>{
    private String account;
    private double income;
    private double expenses;
    private double surplus;

    /**
     * Sets all fields at once; the surplus is computed as income minus expenses.
     *
     * @param account  the account name
     * @param income   the income amount
     * @param expenses the expense amount
     */
    public void set(String account, double income, double expenses){
        this.account = account;
        this.income = income;
        this.expenses = expenses;
        this.surplus = income - expenses;
    }

    /** Returns income, expenses and surplus separated by tabs. */
    @Override
    public String toString() {
        return this.income + "\t" + this.expenses + "\t" + this.surplus;
    }

    // Serialization: readFields must read the same types in the same order.
    public void write(DataOutput out) throws IOException {
        // writeUTF rejects null, so an unset account is written as "".
        out.writeUTF(accountOrEmpty());
        out.writeDouble(income);
        out.writeDouble(expenses);
        out.writeDouble(surplus);
    }

    public void readFields(DataInput in) throws IOException {
        this.account = in.readUTF();
        this.income = in.readDouble();
        this.expenses = in.readDouble();
        this.surplus = in.readDouble();
    }

    /**
     * Orders by income descending, then expenses ascending, then account.
     * Returns 0 only when all three are equal, so the ordering is
     * antisymmetric as the Comparable contract requires.
     */
    public int compareTo(InfoBean o) {
        // Arguments swapped: larger income sorts first.
        int byIncome = Double.compare(o.getIncome(), this.income);
        if (byIncome != 0) {
            return byIncome;
        }
        int byExpenses = Double.compare(this.expenses, o.getExpenses());
        if (byExpenses != 0) {
            return byExpenses;
        }
        return accountOrEmpty().compareTo(o.accountOrEmpty());
    }

    /** Equal exactly when {@link #compareTo} returns 0. */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof InfoBean)) {
            return false;
        }
        return compareTo((InfoBean) other) == 0;
    }

    /** Hash over the same three fields that {@link #compareTo} inspects. */
    @Override
    public int hashCode() {
        long incomeBits = Double.doubleToLongBits(income);
        long expenseBits = Double.doubleToLongBits(expenses);
        int result = accountOrEmpty().hashCode();
        result = 31 * result + (int) (incomeBits ^ (incomeBits >>> 32));
        result = 31 * result + (int) (expenseBits ^ (expenseBits >>> 32));
        return result;
    }

    /** Returns the account, or "" when none has been set. */
    private String accountOrEmpty() {
        return account == null ? "" : account;
    }

    public String getAccount() {
        return account;
    }

    public void setAccount(String account) {
        this.account = account;
    }

    public double getIncome() {
        return income;
    }

    public void setIncome(double income) {
        this.income = income;
    }

    public double getExpenses() {
        return expenses;
    }

    public void setExpenses(double expenses) {
        this.expenses = expenses;
    }

    public double getSurplus() {
        return surplus;
    }

    public void setSurplus(double surplus) {
        this.surplus = surplus;
    }
}
/**
 * 输入:
 *      zhangsan@163.com    6000    0    2014-02-20
        lisi@163.com    2000    0    2014-02-20
        lisi@163.com    0    100    2014-02-20
        zhangsan@163.com    3000    0    2014-02-20
        wangwu@126.com    9000    0    2014-02-20
        wangwu@126.com    0    200        2014-02-20
   输出:
        lisi@163.com    2000.0    100.0    1900.0
        wangwu@126.com    9000.0    200.0    8800.0
        zhangsan@163.com    9000.0    0.0    9000.0
 */
/**
 * Step two of "sum first, then sort": sorts the per-account totals by using
 * the whole InfoBean as the map output key, so the framework orders the
 * records with InfoBean.compareTo.
 */
public class SortStep {
    /**
     * Builds the job, runs it, and waits for it to finish.
     *
     * @param args command-line arguments (not used; input and output paths are fixed)
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SortStep.class);
        job.setMapperClass(SortMapper.class);
        job.setMapOutputKeyClass(InfoBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("hdfs://localhost:9000/input/trade_info2.txt"));
        job.setReducerClass(SortReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(InfoBean.class);
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/output"));

        // A failed job is reported through the return value; turn it into a
        // non-zero exit status so the failure is visible to the caller.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }

    /**
     * Parses one tab-separated record (account, income, expenses, ...) and
     * emits the bean as the key with no value.
     *
     * <p>Sorting in the map and reduce phases compares k2 only; v2 takes no
     * part in it. To sort by data that would otherwise be in v2, k2 and v2
     * have to be combined into one class that is used as k2. Grouping is
     * also done by comparing k2.
     */
    public static class SortMapper extends Mapper<LongWritable, Text, InfoBean, NullWritable>{
        // Output key reused across map() calls.
        private InfoBean bean = new InfoBean();

        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = line.split("\t");
            String account = fields[0];
            double income = Double.parseDouble(fields[1]);
            double expenses = Double.parseDouble(fields[2]);
            bean.set(account, income, expenses);
            context.write(bean, NullWritable.get());
        }
    }

    /**
     * Writes every sorted record back out as (account, bean).
     */
    public static class SortReducer extends Reducer<InfoBean, NullWritable, Text, InfoBean>{
        private Text k = new Text();

        protected void reduce(InfoBean bean, Iterable<NullWritable> v2s, Context context)
                throws IOException, InterruptedException {
            // Write once per value rather than once per group: if several
            // records ever compare as equal keys they arrive in one group, and
            // writing only once would silently drop all but one of them.
            // The framework refreshes the key object as the values are iterated.
            for (NullWritable ignored : v2s) {
                k.set(bean.getAccount());
                context.write(k, bean);
            }
        }
    }
}
/**
 * 
   输入:
        lisi@163.com    2000.0    100.0    1900.0
        wangwu@126.com    9000.0    200.0    8800.0
        zhangsan@163.com    9000.0    0.0    9000.0
   输出:
        zhangsan@163.com    9000.0    0.0    9000.0
        wangwu@126.com    9000.0    200.0    8800.0
        lisi@163.com    2000.0    100.0    1900.0
 */

 

 

/**
 * Inverted index: for every word, lists how many times it occurs in each
 * input file.
 *
 * <p>Every stage uses the same record shape, key = word and
 * value = {@code file->count}. The combiner therefore only pre-aggregates and
 * can run zero, one or several times without changing the result.
 *
 * @author fengmingyue
 */
public class InverseIndex {
    /** Separator between the file name and the count inside a value. */
    private static final String SEPARATOR = "->";

    /**
     * Builds the job, runs it, and waits for it to finish.
     *
     * @param args command-line arguments (not used; input and output paths are fixed)
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(InverseIndex.class);
        job.setMapperClass(IndexMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Passing just the directory, new Path("hdfs://localhost:9000/input2/")
        // or new Path("hdfs://localhost:9000/input2"), works as well.
        FileInputFormat.setInputPaths(job, new Path("hdfs://localhost:9000/input2/a.txt"),new Path("hdfs://localhost:9000/input2/b.txt"));
        job.setReducerClass(IndexReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/output"));
        job.setCombinerClass(IndexCombiner.class);

        // A failed job is reported through the return value; turn it into a
        // non-zero exit status so the failure is visible to the caller.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }

    /**
     * Adds up the counts per file name over a sequence of {@code file->count}
     * values, keeping the files in order of first appearance.
     *
     * @param postings values of the form {@code file->count}
     * @return file name mapped to its summed count
     * @throws IOException if a value does not contain the separator
     */
    private static java.util.Map<String, Long> sumPerFile(Iterable<Text> postings)
            throws IOException {
        java.util.Map<String, Long> totals = new java.util.LinkedHashMap<String, Long>();
        for (Text posting : postings) {
            String text = posting.toString();
            // Use the last separator so a file name containing "->" still parses.
            int cut = text.lastIndexOf(SEPARATOR);
            if (cut < 0) {
                throw new IOException("Malformed index value: " + text);
            }
            String file = text.substring(0, cut);
            long count = Long.parseLong(text.substring(cut + SEPARATOR.length()));
            Long soFar = totals.get(file);
            totals.put(file, soFar == null ? count : soFar + count);
        }
        return totals;
    }

    /**
     * Emits (word, {@code file->1}) for every non-empty word of the line,
     * where file is the name of the file the current split belongs to.
     */
    public static class IndexMapper extends Mapper<LongWritable, Text, Text, Text>{
        private Text k = new Text();
        private Text v = new Text();

        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = line.split(" ");
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            Path path = inputSplit.getPath();
            String name = path.getName();
            // The value is the same for every word on this line.
            v.set(name + SEPARATOR + "1");
            for (String f : fields) {
                // split(" ") yields empty strings for blank lines and for
                // leading or repeated spaces; those are not words.
                if (f.isEmpty()) {
                    continue;
                }
                k.set(f);
                context.write(k, v);
            }
        }
    }

    /**
     * Pre-aggregates the counts of one word per file and emits one
     * (word, {@code file->sum}) pair per file. Input and output have the same
     * shape, so applying this step any number of times is safe.
     */
    public static class IndexCombiner extends Reducer<Text, Text, Text, Text>{
        private Text v = new Text();

        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            for (java.util.Map.Entry<String, Long> entry : sumPerFile(values).entrySet()) {
                v.set(entry.getKey() + SEPARATOR + entry.getValue());
                context.write(key, v);
            }
        }
    }

    /**
     * Produces the final line for one word: all {@code file->count} entries,
     * each followed by a single space.
     */
    public static class IndexReducer extends Reducer<Text, Text, Text, Text>{
        private Text v = new Text();

        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Aggregate here as well, so the result is correct even when the
            // combiner did not run or only saw part of the values.
            StringBuilder line = new StringBuilder();
            for (java.util.Map.Entry<String, Long> entry : sumPerFile(values).entrySet()) {
                line.append(entry.getKey()).append(SEPARATOR).append(entry.getValue()).append(' ');
            }
            v.set(line.toString());
            context.write(key, v);
        }
    }
}
/**
 * 输入:
 *     a.txt:  hello tom
               hello jerry
               hello kitty
               hello world
               hello tom
      b.txt:   hello jerry
               hello tom
               hello world
   输出:
           hello    b.txt->3 a.txt->5 
           jerry    a.txt->1 b.txt->1 
           kitty    a.txt->1 
           tom      a.txt->2 b.txt->1 
           world    b.txt->1 a.txt->1 
 */ 

 

    原文作者:MapReduce
    原文地址: https://www.cnblogs.com/fengmingyue/p/6354490.html
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞