HiveDay03

这里写目录标题

一、 后台启动HIVE的JDBC连接

0 表示标准输入
	1 表示标准输出
	2 表示标准错误输出
	nohup 表示挂起
	最后的 & 表示 后台启动
	nohup hive -service hiveserver2 > /usr/local/soft/hive-3.1.2/log/hiveserver2.log 2>&1 &

	jps 查看 Runjob 

	关闭后台的JDBC : kill -9 关闭 RunJob对应的PID

二、HIVE WORDCOUNT

CREATE TABLE learn3.wordcount(
word STRING COMMENT "单词"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

INSERT INTO TABLE learn3.wordcount (word) VALUES ("hello,word"),("hello,java"),("hive,hello");

-- 统计WORDCOUNT


SELECT
T1.col
, sum(num)
FROM (
SELECT
col2 as col,
count(*) as num
FROM (

SELECT
split(word,",")[0] as col1,
split(word,",")[1] as col2
FROM learn3.wordcount

)T GROUP BY T.col2
UNION ALL
SELECT
col1 as col,
count(*) as num
FROM (

SELECT
split(word,",")[0] as col1,
split(word,",")[1] as col2
FROM learn3.wordcount

)T GROUP BY T.col1
)T1 GROUP BY T1.col

+---------+------+
| t1.col  | _c1  |
+---------+------+
| hello   | 6    |
| hive    | 2    |
| java    | 2    |
| word    | 2    |
+---------+------+

三、WITH AS 用法

格式:
WITH table1 AS (
SELECT 查询语句1
)
,table2 AS (
SELECT 查询语句2
)
[INSERT INTO TABLE] SELECT FROM



WORDCOUNT WITH AS 用法

WITH split_res AS (
SELECT
split(word,",")[0] as col1,
split(word,",")[1] as col2
FROM learn3.wordcount
)
,col_count AS(
SELECT
col1 as col
,count(*) as num
FROM split_res
GROUP BY col1
)

,col2_count AS (
SELECT
col2 as col
,count(*) as num
FROM split_res
GROUP BY col2
)

SELECT
T.col,
sum(T.num)
FROM(
SELECT * FROM col_count
UNION ALL
SELECT * FROM col2_count
)T GROUP BY T.col

五、集合函数

COLLECT_LIST(column2)
	需要跟GROUP BY column1 配合使用 将column1中相同的组column2数据放至一个集合
COLLECT_SET()
	需要跟GROUP BY column1 配合使用 将column1中相同的组column2数据放至一个集合中 并对集合中的数据进行去重操作

需求:
	将一列中相同的内容放至一组数据中
		将 word列中所有相同的单词对应的num 放至一个数组中

CREATE TABLE learn3.wordcount2(
word STRING COMMENT "单词",
num int COMMENT "数量"
)
STORED AS TEXTFILE;

INSERT INTO TABLE learn3.wordcount2 (word,num) VALUES ("hello",1),("hello",2),("hive",3);

SELECT
word
,COLLECT_LIST(num)
FROM learn3.wordcount2
GROUP BY word;

+--------+------------+
|  word  |    _c1     |
+--------+------------+
| hello  | [1,2,1,2]  |
| hive   | [3,3]      |
+--------+------------+


SELECT
word
,COLLECT_SET(num)
FROM learn3.wordcount2
GROUP BY word;

+--------+--------+
|  word  |  _c1   |
+--------+--------+
| hello  | [1,2]  |
| hive   | [3]    |
+--------+--------+



CREATE EXTERNAL TABLE IF NOT EXISTS learn3.student20(
id STRING COMMENT "学生ID"
,name STRING COMMENT "学生姓名"
,age int COMMENT "年龄"
,gender STRING COMMENT "性别"
,subject STRING COMMENT "学科"
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
STORED AS TEXTFILE;

load data local inpath "/usr/local/soft/hive-3.1.2/data/student_20.txt" INTO TABLE learn3.student20;

将两列数据进行拼接

WITH concat_stu AS(
SELECT
CONCAT(subject,"|",gender) as subject_gender
,CONCAT_WS("|",id,name) as id_name
FROM learn3.student20
)
SELECT
subject_gender
,CONCAT_WS(",",COLLECT_LIST(id_name))
FROM concat_stu
GROUP BY subject_gender;

+-----------------+----------------------------------------------------+
| subject_gender  |                        _c1                         |
+-----------------+----------------------------------------------------+
| 文科|| 1500100001|施笑槐,1500100007|尚孤风,1500100016|潘访烟,1500100018|骆怜雪 |
| 文科|| 1500100002|吕金鹏,1500100013|逯君昊                      |
| 理科|| 1500100003|单乐蕊,1500100005|宣谷芹,1500100008|符半双,1500100012|梁易槐,1500100015|宦怀绿,1500100017|高芷天 |
| 理科|| 1500100004|葛德曜,1500100006|边昂雄,1500100009|沈德昌,1500100010|羿彦昌,1500100011|宰运华,1500100014|羿旭炎,1500100019|娄曦之,1500100020|杭振凯 |
+-----------------+----------------------------------------------------+


六、行列互换

行转列:
	将原先多行数据转成一行
	转换方式:
		通过COLLECT_SET() 或者 COLLECT_LIST()GROUP BY 进行配合使用
		将GROUP BY 分组的数据进行存放于一个集合中

列转行:
	将一列数据转换为多行数据
	转换:

案例:
	INSERT INTO TABLE learn3.wordcount (word) VALUES ("helli,word,hello,java,hello,spark");

+------------------------------------+
|           wordcount.word           |
+------------------------------------+
| hello,word                         |
| hello,java                         |
| hive,hello                         |
| hello,word                         |
| hello,java                         |
| hive,hello                         |
| helli,word,hello,java,hello,spark  |
+------------------------------------+

EXPLODE() 函数
	将集合中的一行函数转换为多行

SELECT
EXPLODE(split(word,",")) as word
FROM learn3.wordcount;

+--------+
|  word  |
+--------+
| hello  |
| word   |
| hello  |
| java   |
| hive   |
| hello  |
| hello  |
| word   |
| hello  |
| java   |
| hive   |
| hello  |
| helli  |
| word   |
| hello  |
| java   |
| hello  |
| spark  |
+--------+





CREATE TABLE learn3.movie(
movie_name STRING COMMENT "电影名"
,types STRING COMMENT "电影类型"
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

load data local inpath "/usr/local/soft/hive-3.1.2/data/movie.txt" INTO TABLE learn3.movie;types列中的电影类型进行分隔 并且于电影名进行对应

由	| 肖申克的救赎            | 犯罪/剧情        |
转换为:
	肖申克的救赎 犯罪
	肖申克的救赎 剧情

SELECT
movie_name,EXPLODE(split(types,"/")) as type
FROM learn3.movie

样例:
	SELECT name,new_num FROM later_tb LATRAL VIEW explode(weight) num AS new_num;

+-------------------+--------------+
| movie.movie_name  | movie.types  |
+-------------------+--------------+
| 肖申克的救赎            | 犯罪/剧情        |
| 霸王别姬              | 剧情/爱情/同性     |
| 阿甘正传              | 剧情/爱情        |
| 泰坦尼克号             | 剧情/爱情/灾难     |
| 这个杀手不太冷           | 剧情/动作/犯罪     |
+-------------------+--------------+

SELECT
movie_name, type
FROM learn3.movie LATERAL VIEW EXPLODE(split(types,"/")) view as type;

+-------------+-------+
| movie_name  | type  |
+-------------+-------+
| 肖申克的救赎      | 犯罪    |
| 肖申克的救赎      | 剧情    |
| 霸王别姬        | 剧情    |
| 霸王别姬        | 爱情    |
| 霸王别姬        | 同性    |
| 阿甘正传        | 剧情    |
| 阿甘正传        | 爱情    |
| 泰坦尼克号       | 剧情    |
| 泰坦尼克号       | 爱情    |
| 泰坦尼克号       | 灾难    |
| 这个杀手不太冷     | 剧情    |
| 这个杀手不太冷     | 动作    |
| 这个杀手不太冷     | 犯罪    |
+-------------+-------+

LATERAL VIEW EXPLODE(split(types,"/")) view as type;
解析:
	①通过split方法将types中的字符串切分为数组
	②通过EXPLODE方法将数组由一行数据转换为多行
	③通过LATERAL VIEW 将EXPLODE转换的结果包装成一个名为view的一个侧写表 并且列名为type
	④通过全连接将侧写表中的数据与原表 learn3.movie 中的一行数据进行全连接

七、练习:学生成绩

1、统计各性别年龄前三

CREATE EXTERNAL TABLE IF NOT EXISTS learn3.student1(
id STRING COMMENT "学生ID",
name STRING COMMENT "学生姓名",
age int COMMENT "年龄",
gender STRING COMMENT "性别",
clazz STRING COMMENT "班级"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ",";

load data local inpath "/usr/local/soft/hive-3.1.2/data/students.txt" INTO TABLE learn3.student1;


CREATE EXTERNAL TABLE IF NOT EXISTS learn3.score1(
id STRING COMMENT "学生ID",
subject_id STRING COMMENT "科目ID",
score int COMMENT "成绩"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","

load data local inpath "/usr/local/soft/hive-3.1.2/data/score.txt" INTO TABLE learn3.score1;

需求:
	1.统计各性别年龄前三
-- HIVE 原先老版本 不支持这种写法 
SELECT
s1.*
FROM learn3.student1 as s1
WHERE 3 > (SELECT count(*) FROM learn3.student1 as s2
WHERE s1.gender = s2.gender and s1.age < s2.age 
)

	2.统计各班级学生总成绩前三名
①算出学生总成绩
WITH score_sum AS (
SELECT
id
,sum(score) as total_score
FROM learn3.score1
GROUP BY id
)

, studen_score AS (
SELECT
T1.name
,T2.total_score
,T1.clazz
FROM learn3.student1 T1
JOIN score_sum T2 ON T1.id = T2.id 
)

--② 排序获取前三名

SELECT
TT.*
FROM (
SELECT
T.name
,T.total_score
,T.clazz
, ROW_NUMBER() OVER(PARTITION BY T.clazz ORDER BY T.total_score DESC) as row_pm
FROM studen_score T
) TT
WHERE TT.row_pm <= 3


SELECT
T.name
,T.total_score
,T.clazz
, ROW_NUMBER() OVER(PARTITION BY T.clazz ORDER BY T.total_score DESC) as row_pm
, dense_rank() OVER(PARTITION BY T.clazz ORDER BY T.total_score DESC) as dense_pm
, rank() OVER(PARTITION BY T.clazz ORDER BY T.total_score DESC) as rank_pm
, percent_rank() OVER(PARTITION BY T.clazz ORDER BY T.total_score DESC) as percent_rank_pm
, cume_dist() OVER(PARTITION BY T.clazz ORDER BY T.total_score DESC) as cume_dist_pm
FROM studen_score T

2、取每个班级总分最大的同学

##### row_number:无并列排名

* 用法: select xxxx, row_number() over(partition by 分组字段 order by 排序字段 desc) as rn from tb group by xxxx

##### dense_rank:有并列排名,并且依次递增

* 用法: select xxxx, dense_rank() over(partition by 分组字段 order by 排序字段 desc) as rn from tb group by xxxx


##### rank:有并列排名,不依次递增

* 用法: select xxxx, rank() over(partition by 分区字段 order by 排序字段 desc) as rn from tb group by xxxx

##### percent_rank:(rank的结果-1)/(分区内数据的个数-1)

* 用法: select xxxx, percent_rank() over(partition by 分组字段 order by 排序字段 desc) as rn from tb group by xxxx



需求1:
	取每个班级总分最大的同学

CREATE TABLE learn3.student_score(
name STRING COMMENT "",
total_score int COMMENT "",
clazz STRING COMMENT ""
);


WITH score_sum AS (
SELECT
id
,sum(score) as total_score
FROM learn3.score1
GROUP BY id
)

INSERT INTO TABLE learn3.student_score
SELECT
T1.name
,T2.total_score
,T1.clazz
FROM learn3.student1 T1
JOIN score_sum T2 ON T1.id = T2.id 


-- 通过max方法取出每个班级分区中的学生成绩最大值 
SELECT
TT.*
FROM (
SELECT
T1.*
,max(T1.total_score) OVER(PARTITION BY T1.clazz ORDER BY T1.total_score DESC) as max_score
FROM learn3.student_score T1
) TT WHERE TT.total_score = TT.max_score


-- 如果max() OVER() 中的窗口over()没有给定分区,那么当前的窗口表示整个学校,得到的数据是整个学校的最高的分数

SELECT
TT.*
FROM (
SELECT
T1.*
,max(T1.total_score) OVER() as max_score
FROM learn3.student_score T1
) TT WHERE TT.total_score = TT.max_score


max
	用法:
			① max(T1.total_score) OVER(PARTITION BY T1.clazz ORDER BY T1.total_score DESC) 
				基于每个partition分区内数据取最大值
			② max(T1.total_score) OVER()
				基于整个数据集取最大值
min、avg、count、sum:与max方法使用一致

八、卡口流量需求分析

1、查询当月的设备及其总数

CREATE TABLE learn3.veh_pass(
id STRING COMMENT "卡口编号",
pass_time STRING COMMENT "进过时间",
pass_num int COMMENT "过车数"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
STORED AS TEXTFILE;

load data local inpath "/usr/local/soft/hive-3.1.2/data/veh_pass.txt" INTO TABLE learn3.veh_pass; 

需求1:查询当月的设备及其总数

-- 写法1
SELECT
T.id
,count(*) OVER()
FROM (
SELECT
id 
,pass_time
FROM learn3.veh_pass
WHERE substr(pass_time,1,7) = substr(current_date,1,7)
) T GROUP BY T.id


-- 错误写法
SELECT
DISTINCT id 
,count(*) OVER()
FROM (
SELECT
id 
,pass_time
FROM learn3.veh_pass
WHERE substr(pass_time,1,7) = substr(current_date,1,7)
)T 


-- 写法2:
SELECT
T1.id
,count(*) OVER()
FROM (
SELECT
DISTINCT id 
FROM (
SELECT
id 
,pass_time
FROM learn3.veh_pass
WHERE substr(pass_time,1,7) = substr(current_date,1,7)
)T )T1


+---------------------+-----------------+
|        t1.id        | count_window_0  |
+---------------------+-----------------+
| 451000000000071117  | 5               |
| 451000000000071116  | 5               |
| 451000000000071115  | 5               |
| 451000000000071114  | 5               |
| 451000000000071113  | 5               |
+---------------------+-----------------+

+---------------------+
|         id          |
+---------------------+
| 451000000000071113  |
| 451000000000071114  |
| 451000000000071115  |
| 451000000000071116  |
| 451000000000071117  |
+---------------------+

2、查询所有流量明细及所有设备月流量总额

SELECT
T1.id
,T1.pass_time
,T1.pass_num
,SUM(T1.pass_num) OVER(PARTITION BY SUBSTRING(T1.pass_time,1,7)) as total_pass
FROM learn3.veh_pass T1


需求3:按设备编号日期顺序展示明细 并求
		1)从第一天开始到当前天数 对流量进行累加
		2)昨天与当前天流量累加
		3)当前天数的前一天与后一天流量累加
		4)当前天与下一天的累加和
		5)当前天数与之后所有天流量累加和

1)从第一天开始到当前天数 对流量进行累加
SELECT
T1.*
,SUM(T1.pass_num) OVER(ORDER BY T1.pass_time ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
FROM (
SELECT 
*
FROM learn3.veh_pass ORDER BY pass_time
) T1

2)昨天与当前天流量累加

SELECT
T1.*
,SUM(T1.pass_num) OVER(ORDER BY T1.pass_time ROWS BETWEEN 1 PRECEDING AND CURRENT ROW)
FROM (
SELECT 
*
FROM learn3.veh_pass ORDER BY pass_time
) T1

3)当前天数的前一天与后一天流量累加

SELECT
T1.*
,SUM(T1.pass_num) OVER(ORDER BY T1.pass_time ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)
FROM (
SELECT 
*
FROM learn3.veh_pass ORDER BY pass_time
) T1

4)当前天与下一天的累加和
SELECT
T1.*
,SUM(T1.pass_num) OVER(ORDER BY T1.pass_time ROWS BETWEEN CURRENT ROW  AND 1 FOLLOWING)
FROM (
SELECT 
*
FROM learn3.veh_pass ORDER BY pass_time
) T1

5)当前天数与之后所有天流量累加和
SELECT
T1.*
,SUM(T1.pass_num) OVER(ORDER BY T1.pass_time ROWS BETWEEN CURRENT ROW  AND UNBOUNDED FOLLOWING)
FROM (
SELECT 
*
FROM learn3.veh_pass ORDER BY pass_time
) T1

需求4:查询每个设备编号上次有数据日期和下一次有数据日期


SELECT
T1.*
, LAG(T1.pass_time,1,"2022-01-01") OVER(PARTITION BY T1.id ORDER BY T1.pass_time) as before_time
, LEAD(T1.pass_time,1,"2022-12-31") OVER(PARTITION BY T1.id ORDER BY T1.pass_time) as after_time
FROM learn3.veh_pass T1


总结:
	OVER(): 会为每条数据都开启一个窗口,默认窗口大小就是当前数据集的大小
	OVER(PARTITION BY) 会按照指定的字段进行分区,在获取一条数据时,窗口大小为整个分区的大小,之后根据分区中的数据进行计算
	OVER(PARTITION BY ... ORDER BY ...) 根据给定的分区,在获取一条数据时,窗口大小为整个分区的大小,并且对分区中的数据进行排序

OVER中的取数据格式
(ROWS | RANGE) BETWEEN (UNBOUNDED | [num]) PRECEDING AND ([num] PRECEDING | CURRENT ROW | (UNBOUNDED | [num]) FOLLOWING)
(ROWS | RANGE) BETWEEN CURRENT ROW AND (CURRENT ROW | (UNBOUNDED | [num]) FOLLOWING)
(ROWS | RANGE) BETWEEN [num] FOLLOWING AND (UNBOUNDED | [num]) FOLLOWING

OVER():指定分析函数工作的数据窗口大小,这个数据窗口大小可能会随着行的改变而变化。
CURRENT ROW:当前行
n PRECEDING:往前n行数据
n FOLLOWING:往后n行数据
UNBOUNDED :起点,
	UNBOUNDED PRECEDING 表示从前面的起点, 
    UNBOUNDED FOLLOWING 表示到后面的终点
LAG(col,n,default_val):往前第n行数据
LEAD(col,n, default_val):往后第n行数据
NTILE(n):把有序窗口的行分发到指定数据的组中,各个组有编号,编号从1开始,对于每一行,NTILE返回此行所属的组的编号。

九、自定义UDF函数

老版本UDF 不推荐使用:
	① 创建自定义类继承UDF 注意 自定义函数名必须使用 evaluate 不然识别不到
public class MyUDFAddString extends UDF {


    /** * 定义函数名 evaluate * 实现将传入的String 增加后缀 ### * * @param col HIVE中使用函数时传入的数据 * @return 一行数据 */
    public String evaluate(String col) {
        return col + "###";
    }

}
	② 将代码打包,添加jar包至HIVE中
		add jar /usr/local/soft/test/HiveCode15-1.0-SNAPSHOT.jar;

	③ 创建临时自定义函数
		CREATE TEMPORARY FUNCTION my_udf as "com.shujia.udf.MyUDFAddString";
    原文作者:刘浩浩yyds
    原文地址: https://blog.csdn.net/weixin_65996579/article/details/124310899
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞