Log Analysis with MapReduce + Hive

Contents

I built this in IDEA

1. Import dependencies

2. Test the HDFS connection from IDEA

3. Clean the data

4. View the results

I built this in IDEA.

1. Import dependencies

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.lenovo</groupId>
    <artifactId>0406</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>
    <dependencies>

        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.3</version>
        </dependency>

        <!-- hadoop-client pulls in the HDFS and MapReduce client classes
             used by the MapReduce code later in this post -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.3</version>
        </dependency>

    </dependencies>


</project>

2. Test the HDFS connection from IDEA

package com.lenovo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import java.io.IOException;

public class hdfs {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // note: the property key is fs.defaultFS (capital S) and the NameNode RPC port is 9000
        conf.set("fs.defaultFS", "hdfs://192.168.171.151:9000");
        FileSystem fs = FileSystem.get(conf);
        System.out.println(fs.toString());
    }
}

If the connection succeeds, the output looks like this:

(screenshot: the FileSystem object printed to the console)
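
Printing the FileSystem object only shows that a client was created. To confirm the connection actually reaches the NameNode, a quick follow-up test is to list the root directory. A minimal sketch, reusing the same address; the class name HdfsList is just for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class HdfsList {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.171.151:9000");
        FileSystem fs = FileSystem.get(conf);
        // listStatus issues a real RPC to the NameNode, so it fails fast if the address or port is wrong
        for (FileStatus status : fs.listStatus(new Path("/"))) {
            System.out.println(status.getPath());
        }
        fs.close();
    }
}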

3. Clean the data

Data download: https://download.csdn.net/download/m0_55834564/85152206

Data before cleaning:

27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/hot_1.gif HTTP/1.1" 200 680

Data after cleaning (the date and time are converted to a compact numeric string):

20130530053820

(screenshot illustrating the date and time format conversion)

The code is as follows:


package com.lenovo.LogProject;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

// Parses a raw access-log line into IP, date/time and URI
public class LogParser {
    private String ip;
    private String dateAndTime;
    private String uri;

    public LogParser(){} // no-arg constructor
    public LogParser(String line) {
        this.ip = getIp(line);
        this.dateAndTime = getDateAndTime(line);
        this.uri = getURI(line);
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getDateAndTime() {
        return dateAndTime;
    }

    public void setDateAndTime(String dateAndTime) {
        this.dateAndTime = dateAndTime;
    }

    public String getUri() {
        return uri;
    }

    public void setUri(String uri) {
        this.uri = uri;
    }

    public static void main(String[] args) {
        // double quotes inside a String literal must be escaped
        String str = "220.181.108.165 - - [30/May/2013:17:38:23 +0800] \"GET / HTTP/1.1\" 301 -";
        // parse the sample line and print each extracted field
        LogParser log = new LogParser(str);
        System.out.println(log.getIp());
        System.out.println(log.getDateAndTime());
        System.out.println(log.getUri());
    }

    // Parse the IP address
    private String getIp(String line){
        // the IP is everything before the "- -" separator
        String[] split = line.split("- -");
        // trim surrounding whitespace
        return split[0].trim();
    }
    // Parse the date and time
    private String getDateAndTime(String line){
        // locate the timestamp between "[" and the " +0800" timezone offset
        int i = line.lastIndexOf("[");
        int il = line.lastIndexOf("+");
        String substring = line.substring(i + 1, il - 1);
        // convert the format:
        // 30/May/2013:17:38:20 matches dd/MMM/yyyy:HH:mm:ss; the month name is English, hence Locale.ENGLISH
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
        // note: lowercase "hh" is the 12-hour clock, which is why 17:38:20 appears as 053820 in the results; use "HH" for 24-hour output
        SimpleDateFormat simpleDateFormat2 = new SimpleDateFormat("yyyyMMddhhmmss");
        String format = null;
        // parse() turns the string into a Date; format() turns the Date back into a string
        try {
            Date parse = simpleDateFormat.parse(substring);

            format = simpleDateFormat2.format(parse);

        } catch (ParseException e) {
            e.printStackTrace();
        }


        return format.trim();
    }
    // Parse the URI (Uniform Resource Identifier) from the quoted request line
    private String getURI(String line){
        int i = line.indexOf("\"");
        int il = line.lastIndexOf("\"");

        String substring = line.substring(i + 1, il);

        String[] s = substring.split(" ");

        return s[1].trim();
    }

}

Output:

(screenshot: console output with the parsed IP, timestamp and URI)

LogMapper and LogReducer classes:

package com.lenovo.LogProject;
// Mapper that cleans each log line

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class LogMapper extends Mapper<LongWritable, Text, NullWritable,Text>{
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, NullWritable, Text>.Context context) throws IOException, InterruptedException {
        String s = value.toString().trim();
        LogParser logParser = new LogParser(s);
        StringBuilder stringBuilder = new StringBuilder();
        // output value format: ip,timestamp,uri  e.g. 196.168.171.151,20220413112233,xxxx
        stringBuilder.append(logParser.getIp()).append(",").append(logParser.getDateAndTime()).append(",").append(logParser.getUri());
        context.write(NullWritable.get(),new Text(stringBuilder.toString()));
    }
}
class LogReducer extends Reducer<NullWritable,Text,NullWritable,Text>{
    @Override
    protected void reduce(NullWritable key, Iterable<Text> values, Reducer<NullWritable, Text, NullWritable, Text>.Context context) throws IOException, InterruptedException {
        for (Text string:values){
            context.write(key,new Text(string));
        }
    }
}
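
A note on the design: the reducer above is an identity pass-through (the same thing the base Reducer already does), and since every record carries the same NullWritable key, all lines are funneled through a single reduce group. If per-line cleaning is all that is needed, the same result can be produced by a map-only job with no shuffle at all. A minimal sketch of that alternative driver, assuming the same input file; the class name and the output directory /0416_maponly are only illustrative (the post's actual driver class follows below):

package com.lenovo.LogProject;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class LogMapOnlyJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.171.151:9000");
        Job job = Job.getInstance(conf, "LogParserMapOnly");
        job.setJarByClass(LogMapOnlyJob.class);
        job.setMapperClass(LogMapper.class);
        // zero reduce tasks: map output goes straight to the output files, no shuffle or sort
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("/1w.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/0416_maponly"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

With zero reduce tasks the cleaned lines land in part-m-00000 files instead of part-r-00000.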

The job (driver) class:

package com.lenovo.LogProject;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;


public class LogJob extends Configured implements Tool{
	
	public static void main(String[] args) {
		try {
			new LogJob().run(null);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	@Override
	public int run(String[] arg0) throws Exception {

		// point the client at the cluster's NameNode
		Configuration configuration = new Configuration();
		configuration.set("fs.defaultFS", "hdfs://192.168.171.151:9000");
		// get a FileSystem handle so the output directory can be checked and removed
		FileSystem fileSystem = FileSystem.get(configuration);
		// Job is the core MapReduce object
		Job job = Job.getInstance(configuration, "LogParser");
		// the job jar is located via this class
		job.setJarByClass(LogJob.class);
		// mapper class
		job.setMapperClass(LogMapper.class);
		// reducer class
		job.setReducerClass(LogReducer.class);
		// mapper output key type
		job.setMapOutputKeyClass(NullWritable.class);
		// mapper output value type
		job.setMapOutputValueClass(Text.class);
		// reducer output key type
		job.setOutputKeyClass(NullWritable.class);
		// reducer output value type
		job.setOutputValueClass(Text.class);
		// input path
		FileInputFormat.addInputPath(job, new Path("/1w.txt"));
		// output path (must not already exist, so delete it if it does)
		Path path = new Path("/0416");
		if (fileSystem.exists(path)) {
			fileSystem.delete(path, true);
		}
		FileOutputFormat.setOutputPath(job, path);
		// submit the job and wait for it to finish
		boolean waitForCompletion = job.waitForCompletion(true);
		System.out.println("Result: " + waitForCompletion);
		return waitForCompletion ? 0 : 1;
	}

}
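
Since LogJob extends Configured and implements Tool, the more idiomatic entry point is ToolRunner, which also parses generic Hadoop options (-D, -fs and so on) from the command line. A minimal sketch of such a main method; the class name LogJobMain is only illustrative, and because run() above builds its own Configuration the behaviour stays the same:

package com.lenovo.LogProject;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class LogJobMain {
    public static void main(String[] args) throws Exception {
        // ToolRunner injects the Configuration into the Tool and forwards the remaining args to run()
        int exitCode = ToolRunner.run(new Configuration(), new LogJob(), args);
        System.exit(exitCode);
    }
}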

4. View the results

Upload the input file to HDFS, run the job, and then view the cleaned output:

[root@hadoop l0415]# hdfs dfs -put 1w.txt /
[root@hadoop l0415]# hdfs dfs -cat /0416/part-r-00000
8.35.201.164,20130530053821,/uc_server/data/avatar/000/03/13/51_avatar_middle.jpg
8.35.201.165,20130530053821,/uc_server/data/avatar/000/05/94/42_avatar_middle.jpg
27.19.74.143,20130530053820,/static/image/editor/editor.gif
27.19.74.143,20130530053820,/static/image/common/swfupload.swf?preventswfcaching=1369906718144
27.19.74.143,20130530053820,/static/image/common/pn.png
27.19.74.143,20130530053820,/data/cache/common_smilies_var.js?y7a
8.35.201.144,20130530053820,/uc_server/avatar.php?uid=29331&size=middle
110.52.250.126,20130530053820,/static/js/logging.js?y7a
27.19.74.143,20130530053820,/data/attachment/common/c8/common_2_verify_icon.png
110.52.250.126,20130530053820,/static/image/common/logo.png
27.19.74.143,20130530053820,/static/image/common/recommend_1.gif
110.52.250.126,20130530053820,/source/plugin/wsh_wx/img/wx_jqr.gif
110.52.250.126,20130530053820,/data/cache/style_1_forum_index.css?y7a
110.52.250.126,20130530053820,/source/plugin/wsh_wx/img/wsh_zk.css
27.19.74.143,20130530053820,/static/image/filetype/common.gif
27.19.74.143,20130530053820,/static/image/common/hot_2.gif
27.19.74.143,20130530053820,/static/image/common/hot_1.gif
110.52.250.126,20130530053820,/data/cache/style_1_widthauto.css?y7a
27.19.74.143,20130530053820,/static/image/common/faq.gif
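
The title mentions Hive: once the cleaned, comma-separated records sit in /0416, they can be queried from Hive without any further movement. A sketch in HiveQL, assuming the output stays at /0416; the table name access_log and the sample query are only illustrative:

-- external table over the cleaned MapReduce output; fields are comma-separated
CREATE EXTERNAL TABLE IF NOT EXISTS access_log (
    ip  STRING,
    ts  STRING,
    uri STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/0416';

-- example analysis: top 10 IPs by request count
SELECT ip, COUNT(*) AS pv
FROM access_log
GROUP BY ip
ORDER BY pv DESC
LIMIT 10;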

    Original author: lambda-小张
    Original article: https://blog.csdn.net/m0_55834564/article/details/124150924