基于HDFS
没有专门的数据存储格式
存储结构主要包括:数据库、文件、表、视图
可以直接加载文本文件(.txt文件)
创建表时,指定Hive数据的列分隔符与行分隔符
Hive的数据模型
表
Table 内部表
hive> create table t1
> (tid int,tname string,age int);
hive> create table t2
> (tid int,tname string,age int)
> location '/mytable/hive/t2';
hive> create table t3
> (tid int,tname string,age int)
> row format delimited fields terminated by ',';
hive>create table t3
>(tid int,tname string,age int)
>row format delimited fields terminated by ',';
hive>create table t4
>as
>select * from wordcount;
hive>select * from t4;
#hdfs dfs -ls /user/hive/warehouse/t4/
#hdfs dfs -cat /user/hive/warehouse/t4/000000_0
hive>create table t5
>row format delimited fields terminated by ','
>as
>select * from wordcount;
#hdfs dfs -cat /user/hive/warehouse/t5/000000_0
hive>alter table t1 add columns(english int);
hive>desc t1;
与数据库中的 Table 在概念上是类似的
每一个 Table 在 Hive 中都有一个相应的目录存储数据
所有的 Table 数据(不包括 External Table)都保存在这个目录中
删除表时,元数据与数据都会被删除
Partition 分区表
Partition 对应于数据库的 Partition 列的密集索引
在 Hive 中,表中的一个 Partition 对应于表下的一个目录,所有的 Partition 的数据都存储在对应的目录中
hive>create table partition_table
>(sid int,sname string)
>partitioned by (gender string)
>row format delimited fields terminated by ',';
hive>insert into table partition_table partition(gender='M') select sid,sname from wordcount where gender='M';
hive>insert into table partition_table partition(gender='F') select sid,sname from wordcount where gender='F';
hive>explain select * from sample_data where gender='M';
hive>explain select * from partition_table where gender='M';
External Table 外部表
指向已经在 HDFS 中存在的数据,可以创建 Partition
它和内部表在元数据的组织上是相同的,而实际数据的存储则有较大的差异
外部表只有一个过程,加载数据和创建表同时完成,数据并不会移动到数据仓库目录中,只是与外部数据建立一个链接。当删除一个外部表时,仅删除该链接
#vi student01.txt
1,Tom,23
2,Mary,20
#vi student02.txt
3,Mike,25
#vi student03.txt
4,Scott,21
5,King,20
#hdfs dfs -put student01.txt /input
#hdfs dfs -put student02.txt /input
#hdfs dfs -put student03.txt /input
hive> create external table external_student
> (sid int,sname string,age int)
> row format delimited fields terminated by ','
> location '/input';
hive>select * from external_student;
# hdfs dfs -rm /input/student03.txt
hive>select * from external_student;
#hdfs dfs -put student03.txt /input
hive>select * from external_student;
Bucket Table 桶表
桶表是对数据进行哈希取值,然后放到不同文件中存储。
hive> create table bucket_table
> (sid int,sname string,age int)
> clustered by(sname) into 5 buckets;
视图
视图是一种虚表,是一个逻辑概念;可以跨越多张表
视图建立在已有表的基础上, 视图赖以建立的这些表称为基表
视图可以简化复杂的查询
方法一
hive>create table dept (
>deptno int,
>dname string)
>row format delimited fields terminated by ' ';
hive>insert into table dept
>values
>(10.0,'SALES'),
>(20.0,'RESEARCH'),
>(30.0,'ACCOUNTING');
方法二
#vi a.txt
10 SALES
20 RESEARCH
30 ACCOUNTING
# hdfs dfs -mkdir /dd
# hdfs dfs -put a.txt /dd
hive>create table dept
>(deptno int,dname string)
>row format delimited fields terminated by ' ';
hive>load data inpath '/dd' overwrite into table dept;
hive>select * from dept;
hive>create table emp
>(empno int,
>ename string,
>job string,
>mgr string,
>hiredate date,
>sal int,
>comm int,
>deptno int)
>row format delimited fields terminated by ' ';
hive>insert into table emp
>values
>(7369.0,'SMITH','CLERK',7902,'1980-12-17',800,'NULL',20),
>(7499.0,'ALLEN','SALESMAN',7698,'1981-02-20',1600,300,30),
>(7521.0,'WARD','SALESMAN',7698,'1981-02-22',1250,500,30),
>(7566.0,'JONES','MANAGER',7839,'1981-04-02',2975,'NULL',20),
>(7654.0,'MARTIN','SALESMAN',7698,'1981-09-28',1250,1400,30),
>(7698.0,'BLAKE','MANAGER',7839,'1981-05-01',2850,'NULL',30),
>(7782.0,'CLARK','MANAGER',7839,'1981-06-09',2450,'NULL',10),
>(7788.0,'SCOTT','ANALYST',7566,'1987-04-19',3000,'NULL',20);
hive>insert into table emp
>(empno,ename,job,hiredate,sal,comm,deptno)
>values
>(7839.0,'KING','PRESIDENT','1981-11-17',5000,'NULL',10);
hive>insert into table emp
>values
>(7844.0,'TURNER','SALESMAN',7698,'1981-09-08',1500,0,30),
>(7876.0,'ADAMS','CLERK',7788,'1987-05-23',1100,'NULL',20),
>(7900.0,'JAMES','CLERK',7698,'1981-12-03',950,'NULL',30),
>(7902.0,'FORD','ANALYST',7566,'1981-12-03',3000,'NULL',20),
>(7934.0,'MILLER','CLERK',7782,'1982-01-23',1300,'NULL',10);
--查询员工信息:员工号,姓名,月薪,年薪,部门名称
hive>create view empinfo
>as
>select e.empno,e.ename,e.sal,e.sal*12 annlsal,d.dname
>from emp e,dept d
>where e.deptno=d.deptno;
hive>desc empinfo;
hive>select * from empinfo;