Hive Deployment
Unpack the distribution, rename the directory, and set the environment variables.
In the directory $HIVE_HOME/conf/, rename the template: mv hive-default.xml.template hive-site.xml
hive.metastore.warehouse.dir: the directory where Hive stores table data
hive.exec.scratchdir: the directory for Hive's temporary (scratch) files
(Both settings can be verified from the Hive CLI; see the quick check after these steps.)
In the directory $HIVE_HOME/conf/, rename the template: mv hive-env.sh.template hive-env.sh
Edit Hadoop's configuration file hadoop-env.sh and add the following line:
export HADOOP_CLASSPATH=.:$CLASSPATH:$HADOOP_CLASSPATH:$HADOOP_HOME/bin
In the directory $HIVE_HOME/bin, edit hive-config.sh (this step can be skipped).
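Quick check from the Hive CLI (a sketch, not part of the original steps): entering set followed by a property name and no value prints the current setting, so the two directories above can be confirmed with:
hive> set hive.metastore.warehouse.dir;
hive> set hive.exec.scratchdir;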
hive-site.xml
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
<description>username to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
<description>password to use against metastore database</description>
</property>
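The JDBC settings above assume a MySQL database named hive that is reachable as user hive with password 123456 (all names taken from the properties above). A minimal sketch of the MySQL-side preparation, to be adapted to the actual environment:
mysql> CREATE DATABASE IF NOT EXISTS hive;
mysql> CREATE USER 'hive'@'localhost' IDENTIFIED BY '123456';
mysql> GRANT ALL PRIVILEGES ON hive.* TO 'hive'@'localhost';
mysql> FLUSH PRIVILEGES;
Because the connection URL carries createDatabaseIfNotExist=true, the first statement is optional; the metastore database can also be created on first use.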
Check the Hive (metastore schema) version:
mysql> select * from VERSION;
List the tables registered in the metastore:
mysql> select * from TBLS \G
View the metadata of the HDFS directories backing the tables:
mysql> select * from SDS \G
View the partitions of a given table:
mysql> select * from PARTITIONS where TBL_ID=1 \G
View a table's columns:
mysql> select * from COLUMNS_V2;
View a table's partition key columns:
mysql> select * from PARTITION_KEYS;
Connecting to the metastore database on the bigdata cluster
Look up ConnectionUserName and ConnectionPassword in /usr/local/hive/conf/hive-site.xml (here hive/hive2017).
Log in to bigdata003, then:
mysql -u hive -phive2017 -D hive
Hive vs. HBase
Similarities
1. Both HBase and Hive sit on top of Hadoop and use HDFS as their underlying storage.
2. Functionally, both expose data in table form and provide query-style services.
Differences
1. Hive is a batch-processing system built on Hadoop to reduce the effort of writing MapReduce jobs; HBase is a project that makes up for Hadoop's lack of support for real-time operations.
2. Hive itself neither stores nor computes data; it relies entirely on HDFS and MapReduce. A Hive table is purely logical, a way of describing files that live on HDFS.
3. HBase tables are physical rather than logical: HBase provides a very large in-memory hash-table-like store (search engines, for example, use it to hold indexes), which makes lookups fast.
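Point 2 above is easy to verify in practice: for any Hive table (for instance the city table created just below), show create table prints the HDFS LOCATION that backs the purely logical table definition:
show create table city; // the LOCATION line points at an ordinary directory of files on HDFS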
create table city(
province_code INT,
province_name string,
city_code INT,
city_name string
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n';
load data local inpath '/home/bigdata/hive/city.txt' into table city; // load the data from the local file into the city table
alter table city rename to city_new;
select * from city_new limit 10; // show ten rows
select count(*) from city_new; // total number of rows
select * from city_new order by city_code desc limit 10 ; // the largest city_code values
select province_name,count(*) as cnt from city_new group by province_name order by cnt desc limit 10;
// when ordering by an aggregate, the aggregate must be given an alias so that order by can reference it.
select count(distinct province_name) from city_new ; // number of distinct provinces
select
count(*)
from
(
select
province_name
from
city_new
group by
province_name
)a; // number of provinces (same result, computed via a subquery)
// find the provinces that have only one city
1. First, compute the number of cities in each province.
2. Then keep only the provinces whose city count equals 1.
select province_name,count(*) as su from city_new group by province_name having su = 1;
select province_name,su from (
select province_name,count(*) as su from city_new group by province_name
)a
where su = 1;
External vs. managed (internal) tables
// create an external table over the files under '/user/cll/city/'
create EXTERNAl table city_ex(
province_code int,
province_name String,
city_code int,
city_name String
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n'
location '/user/cll/city/';
// create a managed (internal) table over the files under '/user/cll/city/'
// '/user/cll/city/' is the HDFS path where city.txt was put (hadoop fs -put)
create table city_in(
province_code int,
province_name String,
city_code int,
city_name String
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n'
location '/user/cll/city/';
show create table city_ex;
show create table city_in;
// after dropping the external table, the imported files are still there on HDFS; hadoop fs -ls /user/cll/city still lists them
// after dropping the managed (internal) table, the files are gone as well.
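A minimal way to observe the difference (a sketch, assuming both tables above point at /user/cll/city/):
drop table city_ex; // external table: only the metastore entry is removed, the files on HDFS remain
drop table city_in; // managed table: the data directory is removed together with the metadata
Running hadoop fs -ls /user/cll/city after each drop shows exactly the behavior described above.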
Partitions:
CREATE TABLE user
(
uid INT,
city_code INT,
model string,
access string
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n';
load data local inpath '/home/bigdata/tanqi/hive/user.txt' into table user;
Create a partitioned table:
create table user_daily(
uid int,
city_code int,
model String,
access String
)
partitioned by (p_date String);
// read the rows from the user table and insert them into user_daily under the 2017-09-01 partition
// static partition
insert OVERWRITE table user_daily partition (p_date = '2017-09-01')
select * from user;
// after the insert, list the partitions to see what exists
show partitions user_daily;
// dynamic partition (the partition value is taken from the query instead of being hard-coded)
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE user_daily PARTITION (p_date)
SELECT *, '2017-09-02' FROM user
UNION ALL
SELECT *, '2017-09-03' FROM user
;
ALTER TABLE user_daily PARTITION (p_date='2017-01-01') RENAME TO PARTITION (p_date='20170101');
insert OVERWRITE table user_daily partition (p_date = '2017-01-01')
select * from user;
alter table user_daily drop partition (p_date='20170101');
select * from user_daily where p_date='2017-09-02';
select p_date,count(*) from user_daily group by p_date;
Homework 1:
a) Find the 5 cities with the most users on WIFI.
select city_code, count(uid) as cun from user where access='WIFI' group by city_code order by cun desc limit 5; // prefix the query with explain to inspect the execution plan
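For instance, to look at the plan without running the query (same statement, just prefixed with explain):
explain select city_code, count(uid) as cun from user where access='WIFI' group by city_code order by cun desc limit 5;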
b) Find the (access, city_code) combinations with at least 3 users.
select city_code,access
,count(uid) as cun from user group by city_code,access
having cun>=3;
c) For models containing "ne" (case-insensitive), list each access type and its user count.
select access
,count(uid) from user where lower(model) like '%n%e%' group by access order by access desc;
select sum(if(access='2G',1,0))/count(*) from user; // share of users on a given access type (here 2G) out of the total
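The same sum(if(...)) pattern gives, for example, the share of WIFI users:
select sum(if(access='WIFI',1,0))/count(*) from user; // fraction of all users who are on WIFI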
// count the users whose uid ends in [0-3], [4-7], [8-9]
select
case
when uid % 10 in (0,1,2,3) then '0-3'
when uid % 10 in (4,5,6,7) then '4-7'
else '8-9'
end as `interval`, // backticked because interval is a reserved word in newer Hive versions
count(*) as cun
from user
group by
case
when uid % 10 in (0,1,2,3) then '0-3'
when uid % 10 in (4,5,6,7) then '4-7'
else '8-9'
end;
// collection functions
select collect_set(access) from user; // deduplicated
select collect_list(access) from user; // keeps duplicates
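Both functions are usually combined with group by; a small sketch (column names from the user table above) that collects the distinct models seen for each access type:
select access, collect_set(model) from user group by access; // distinct models per access type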
//join
select user.uid, user.city_code, city_new.city_name
from
(select * from user where uid <= 100) user
right join
(select * from city_new where province_code <= 30) city_new
on (user.city_code = city_new.city_code)
limit 20;
select user.uid, user.city_code, city_new.city_name
from
(select * from user where uid <= 100) user
inner join
(select * from city_new where province_code <= 30) city_new
on (user.city_code = city_new.city_code)
limit 20;
select user.uid, user.city_code, city_new.city_name
from
(select * from user where uid <= 100) user
full join
(select * from city_new where province_code <= 30) city_new
on (user.city_code = city_new.city_code)
limit 20;
Window functions (parts a and b below):
a) Grouped top-N (via a window function)
1. group by access
2. sort within each group and assign row numbers
3. top 1 = the rows whose row number is 1
select access,city_code,uid
from
(
select uid,access,city_code,
row_number() over (partition by access order by
city_code desc) as row_num
from user
)a
where row_num = 1;
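The same query generalizes from top-1 to top-N simply by relaxing the final filter; for example, the top 3 city_code values per access type:
select access,city_code,uid
from
(
select uid,access,city_code,
row_number() over (partition by access order by
city_code desc) as row_num
from user
)a
where row_num <= 3;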
b) Cumulative sum
select p_date,
sum(cun) over (order by p_date asc rows between unbounded preceding and current row)
from
(
select p_date,count(*) as cun
from user_daily
where p_date between '2017-09-01' and '2017-09-30'
group by p_date
)a
;
create table phone (
model String,
os String,
brand String,
resolution String,
launch_date String
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n'
location '/user/cll/phone/';
select * from phone limit 10;
set hive.cli.print.header=true;
select user.uid, user.city_code, city.province_name
from user left join city
on (user.city_code = city.city_code)
limit 20;
select user.model,user.access, user.city_code, phone.launch_date
from user left join phone
on (user.model = phone.model)
limit 20;
create table user_info(
uid int,
access String,
city_code int,
province_name String,
model String,
launch_date String
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n';
// if the insert goes wrong and needs to be redone, use a destructive insert (insert overwrite) instead
insert into table user_info
select user.uid, user.access,user.city_code, city.province_name,user.model,phone.launch_date
from user
left join city on (user.city_code = city.city_code)
left join phone on (user.model = phone.model)
;
select * from phone;
select * from user_info limit 10;
Homework 2
create table phone (
model string,
os string,
brand string,
resolution string,
launch_date string
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n';
load data local inpath '/home/bigdata/tanqi/hive/phone.txt' into table phone;
set hive.cli.print.header=true;
create table user_info (
uid int,
access string,
city_code string,
province_name string,
model string,
launch_date string
)
row FORMAT delimited
fields terminated by ','
lines terminated by '\n'
;
Load the data.
Using user as the main table, left join phone and city to produce a new table user_info.
a) Join user and city on equal city_code; join user and phone on equal model.
b) user_info contains the columns: uid, access, city_code, province_name, model, launch_date
insert into table user_info
select user.uid, user.access, user.city_code, city.province_name, user.model, phone.launch_date
from
user
left join
city
on (user.city_code = city.city_code)
left join
phone
on (user.model = phone.model);
a) For each province, report province_name, uid, and model for the row with the largest uid.
select province_name,uid,model from
(
select province_name,uid,model,
row_number() over (partition by province_name order by uid desc) as row_num
from user_info
)a
where row_num=1;
b) Using launch_date, compute the cumulative total number of users per year (a running total).
select launch_date,
sum(cnt) over (order by launch_date asc rows between unbounded preceding and current row)
from
(
select launch_date,count(*) as cnt from user_info
where launch_date between '2002' and '2016'
group by launch_date
)a
;