目录
我用的是 IDEA 做的
1.导入依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.lenovo</groupId>
    <artifactId>0406</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.3</version>
        </dependency>
        <!-- hadoop-common alone does not provide org.apache.hadoop.mapreduce.*;
             hadoop-client pulls in the HDFS and MapReduce client jars used by
             the Mapper/Reducer/Job code later in this post -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.3</version>
        </dependency>
    </dependencies>
</project>
2.idea测试连接hdfs
package com.lenovo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import java.io.IOException;
public class hdfs {
    /**
     * Connectivity smoke test: points a Hadoop Configuration at the NameNode
     * and prints the resulting FileSystem.
     *
     * @throws IOException if the file system cannot be created/reached
     */
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // BUG FIX: the key is case-sensitive — "fs.defaultFS", not "fs.defaultFs"
        // (the misspelled key was silently ignored, so the code fell back to the
        // local file system). Also 90000 is not a valid TCP port; the NameNode
        // RPC port used throughout this post is 9000.
        conf.set("fs.defaultFS", "hdfs://192.168.171.151:9000");
        FileSystem fs = FileSystem.get(conf);
        System.out.println(fs.toString());
    }
}
连接成功如下:
3.清洗数据
数据获取:https://download.csdn.net/download/m0_55834564/85152206
清洗前数据
27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/hot_1.gif HTTP/1.1" 200 680
清洗后数据
20130530053820
日期和时间格式
代码如下:
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
//提取数据
/**
 * Extracts the client IP, timestamp and request URI from one Apache
 * access-log line, e.g.
 * {@code 27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /x HTTP/1.1" 200 680}
 */
public class LogParser {
    private String ip;
    private String dateAndTime;
    private String uri;

    /** No-arg constructor. */
    public LogParser() {}

    /**
     * Parses a raw access-log line into its three fields.
     *
     * @param line one full access-log record
     */
    public LogParser(String line) {
        this.ip = getIp(line);
        this.dateAndTime = getDateAndTime(line);
        this.uri = getURI(line);
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getDateAndTime() {
        return dateAndTime;
    }

    public void setDateAndTime(String dateAndTime) {
        this.dateAndTime = dateAndTime;
    }

    public String getUri() {
        return uri;
    }

    public void setUri(String uri) {
        this.uri = uri;
    }

    public static void main(String[] args) {
        // Double quotes inside a String literal must be escaped.
        String str = "220.181.108.165 - - [30/May/2013:17:38:23 +0800] \"GET / HTTP/1.1\" 301 -";
        LogParser log = new LogParser(str);
        System.out.println(log.getIp());
        System.out.println(log.getDateAndTime());
        System.out.println(log.getUri());
    }

    /** Everything before the first "- -" separator is the client IP. */
    private String getIp(String line) {
        String[] split = line.split("- -");
        return split[0].trim();
    }

    /**
     * Converts the bracketed timestamp ("30/May/2013:17:38:20 +0800")
     * into a compact "yyyyMMddHHmmss" string.
     */
    private String getDateAndTime(String line) {
        // Slice out the text between '[' and the " +0800" zone offset.
        int start = line.lastIndexOf("[");
        int end = line.lastIndexOf("+");
        String raw = line.substring(start + 1, end - 1);
        // The source date is English-locale: dd/MMM/yyyy:HH:mm:ss.
        SimpleDateFormat inFormat = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
        // BUG FIX: output pattern must use HH (24-hour clock), not hh (12-hour).
        // With "hh", 17:38:20 was emitted as "053820", making morning and
        // afternoon records indistinguishable.
        SimpleDateFormat outFormat = new SimpleDateFormat("yyyyMMddHHmmss");
        try {
            // parse: String -> Date; format: Date -> String.
            Date parsed = inFormat.parse(raw);
            return outFormat.format(parsed);
        } catch (ParseException e) {
            e.printStackTrace();
            // BUG FIX: previously returned format.trim() where format was still
            // null on this path, turning a parse failure into an NPE.
            return "";
        }
    }

    /** Returns the request URI (second token inside the quoted request). */
    private String getURI(String line) {
        int first = line.indexOf("\"");
        int last = line.lastIndexOf("\"");
        String request = line.substring(first + 1, last);
        String[] tokens = request.split(" ");
        return tokens[1].trim();
    }
}
运行结果:
LogMapper类和reduce类:
package com.lenovo.LogProject;
//清洗Mapper
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.codehaus.groovy.runtime.wrappers.LongWrapper;
import java.io.IOException;
public class LogMapper extends Mapper<LongWritable, Text, NullWritable, Text> {

    /**
     * Cleans one raw access-log line into "ip,dateAndTime,uri"
     * (e.g. 196.168.171.151,20220413112233,/path) and emits it under a null key.
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, NullWritable, Text>.Context context) throws IOException, InterruptedException {
        String rawLine = value.toString().trim();
        LogParser parsed = new LogParser(rawLine);
        String record = parsed.getIp() + "," + parsed.getDateAndTime() + "," + parsed.getUri();
        context.write(NullWritable.get(), new Text(record));
    }
}
/** Identity reducer: forwards every cleaned record unchanged. */
class LogReducer extends Reducer<NullWritable, Text, NullWritable, Text> {

    @Override
    protected void reduce(NullWritable key, Iterable<Text> values, Reducer<NullWritable, Text, NullWritable, Text>.Context context) throws IOException, InterruptedException {
        for (Text record : values) {
            // write() serializes the value immediately, so the iterator's re-used
            // Text can be passed straight through; the previous new Text(record)
            // copy per record was a redundant allocation.
            context.write(key, record);
        }
    }
}
job类:
package com.lenovo.LogProject;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
/** Driver that configures and submits the log-cleaning MapReduce job. */
public class LogJob extends Configured implements Tool {

    public static void main(String[] args) {
        try {
            // BUG FIX: forward the real args and propagate the job's exit code;
            // previously run(null) was called and the result discarded, so the
            // process always exited 0 even when the job failed.
            System.exit(new LogJob().run(args));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds and runs the job.
     *
     * @param arg0 unused command-line arguments
     * @return 0 if the job succeeded, 1 otherwise
     */
    @Override
    public int run(String[] arg0) throws Exception {
        Configuration configuration = new Configuration();
        // BUG FIX: the NameNode host is 192.168.171.151 everywhere else in this
        // post; "192.168.17.151" was a typo pointing at the wrong machine.
        configuration.set("fs.defaultFS", "hdfs://192.168.171.151:9000");
        FileSystem fileSystem = FileSystem.get(configuration);
        // Job is the central MapReduce handle.
        Job job = Job.getInstance(configuration, "LogParser");
        // Lets Hadoop locate the jar containing this class.
        job.setJarByClass(LogJob.class);
        job.setMapperClass(LogMapper.class);
        job.setReducerClass(LogReducer.class);
        // Mapper output types.
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Reducer (final) output types.
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        // Input path of the raw log file.
        FileInputFormat.addInputPath(job, new Path("/1w.txt"));
        // The output directory must not pre-exist; remove leftovers from
        // earlier runs before submitting.
        Path outputPath = new Path("/0416");
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);
        // Submit and block until completion.
        boolean success = job.waitForCompletion(true);
        System.out.println("结果:" + success);
        // BUG FIX: previously always returned 0 regardless of job outcome.
        return success ? 0 : 1;
    }
}
4.查看结果
上传到hdfs查看
[root@hadoop l0415]# hdfs dfs -put 1w.txt /
[root@hadoop l0415]# hdfs dfs -cat /0416/part-r-00000
8.35.201.164,20130530053821,/uc_server/data/avatar/000/03/13/51_avatar_middle.jpg
8.35.201.165,20130530053821,/uc_server/data/avatar/000/05/94/42_avatar_middle.jpg
27.19.74.143,20130530053820,/static/image/editor/editor.gif
27.19.74.143,20130530053820,/static/image/common/swfupload.swf?preventswfcaching=1369906718144
27.19.74.143,20130530053820,/static/image/common/pn.png
27.19.74.143,20130530053820,/data/cache/common_smilies_var.js?y7a
8.35.201.144,20130530053820,/uc_server/avatar.php?uid=29331&size=middle
110.52.250.126,20130530053820,/static/js/logging.js?y7a
27.19.74.143,20130530053820,/data/attachment/common/c8/common_2_verify_icon.png
110.52.250.126,20130530053820,/static/image/common/logo.png
27.19.74.143,20130530053820,/static/image/common/recommend_1.gif
110.52.250.126,20130530053820,/source/plugin/wsh_wx/img/wx_jqr.gif
110.52.250.126,20130530053820,/data/cache/style_1_forum_index.css?y7a
110.52.250.126,20130530053820,/source/plugin/wsh_wx/img/wsh_zk.css
27.19.74.143,20130530053820,/static/image/filetype/common.gif
27.19.74.143,20130530053820,/static/image/common/hot_2.gif
27.19.74.143,20130530053820,/static/image/common/hot_1.gif
110.52.250.126,20130530053820,/data/cache/style_1_widthauto.css?y7a
27.19.74.143,20130530053820,/static/image/common/faq.gif