Hive基础

2023年2月12日 262次阅读来源: 洛水青柳2017

一.hive的数据类型

基本类型：整数类型，浮点类型，布尔类型，字符串类型；

复杂的数据类型(新增)：
 数组类型array:
                  创建表：    create table student (sid int,sname string,grade array<float>);
                  插入数据：{1,tom,[80,90,95]}
Map类型：
                        create table student1 (sid int,sname string,grade map<string,float>);

                        {1,tom,<'大学语文',85>}
Array和Map嵌套
                     create table student3 (sid int,sname string,grades array<map<string,float>>);
                      {1,tom,[<'大学语文',80>,<'大学英语',90>]}
                      
Struct类型：
                   create table student4 (sid int,info struct<name:string,age:int,sex:string>);
                   {1,{'tom',10,'男'}}

二.表的类型和表的创建

#内部表,分区表,外部表,桶表
内部表：
     1.与数据库中的表在结构上类似的
     2.每一个table在Hive中都有一个相应的目录存储数据
     3.所有的表数据（不包括External Table）都保存在这个目录里面
     4.删除的时候，元数据和数据都会被清除
        create table t1  (tid int ,name string,age int);  #保存在默认位置
        create table t2  (tid int ,name string,age int) location '/mystable/hive/t2'; #指定文件的位置
        create table t3  (tid int ,name string,age int) row format delimited fields terminated by ','; #指定列分隔符
        create table  t4
            as
            select * from   t1; #根据t1表的查询结果创建t4表,也就复制表
            create table t5
            row format delimited fields terminated by ','
              as
            select * from   t1;

#分区表
    #将大表依据某一列划分成小表
     create table partition_table (sid int ,sname string)
     partitioned by (gender string)
     row format delimited fields terminated by ',';
    #向分区表中插入数据(静态分区时,查询结果中不能包含分区列gender)
    insert into  table  partition_table  partition(gender='M') select tid,name from t1 where gender='M'; #假设t1中含有gender列

#外部表
    1.指向已经在HDFS中存在的数据,可以创建分区表
    2.它和内部表在元数据的组织上是相同的
    3.外部表只是一个连接（相当于桌面的快捷方式）
    4.外部表只是一个过程，加载数据和创建表同时完成，并不会移动数据到数据仓库的目录中
      只是与外部数据建立一个链接，当删除一个外部表的时候，仅删除该链接
    #创建外部表
    create  external table external_student
    (sid int ,sname string)
    row format delimited fields terminated by ',' #指定文件分隔符
    location '/input'; #指定文件位置
#桶表(哈希规则)
    1.经过哈希运算，把一整块数据打散存储数据
    2.降低热块 提升查询效率
    #创建桶表
    create table bucket_table(sid int ,sname string)
    clustered by(sname) into  5 buckets;
#视图
    简化复杂查询
其他通用的操作
    #生成执行计划
    explain HQL语句; #执行计划的阅读顺序：从下往上，从左往右
    #添加新的列
    alter table  t1 add columns(z int);
    #删除表
    drop table t1;

三.文件数据的导入

#数据的操作和插入
load data local inpath '/root/data/student01.txt' into table t2; #local表示从本地文件系统导入；表未指定分隔符时，可在建表时用row format delimited指定
load data       inpath '/root/data/student01.txt' overwrite into table t3; #不带local表示从HDFS路径导入，overwrite会覆盖表中原有数据；分隔符同样在建表时指定
#数据导入分区表
load data local inpath '/root/data/student01.txt' into table partition_table partition(gender='M'); #指定分区表的条件

四.Hive的函数类型和简单举例

#Hive内置的函数
数学函数：
    round,ceil向上取整，floor向下取整
字符串函数：
    lower,upper,length,concat
    ,substr #substr(a,b) 从a中，第b位开始，取右边所有的字符；substr(a,b,c) 从a中，第b位开始，取c个的字符；
    ,trim  #去除字符串两端的空格
    ,lpad-左填充,rpad-右填充 #lpad('abc',10,'*') → *******abc (填充后总长度为10)
集合函数：
    size(map(<key,value>,<key,value>)) #size(map(1,'Tom',2,'Mary')) → 2
格式转化函数：
    cast cast(1 as float);cast('2018-05-26' as date)
#日期函数
    to_date(=date) #to_date('2015-04-23 11:23:11') → '2015-04-23'
    year,month,day,weekofyear,datediff
    ,date_add,date_sub #date_add('2015-04-23',2)
#条件函数
    coalesce()#从左到右返回第一个不为空的数值 select coalesce(a,b,c);
    case when 表达式
#聚合函数
    count,sum,avg(),max(),min() 
#表生成函数
    explode() #select explode(map(1,'tom',2,'marry',3,'mike'))
自定义函数UDF
    继承Java类,重写函数

五.数据的查询操作（与sql没有太大的区别）

#数据查询
    #简单的查询
        select * from t_sec_createrole ; #查询不需要程序转化
        select roleid,serverid from t_sec_createrole ; #需要转化成mapreduce程序
        select roleid,roleid*1 from t_sec_createrole ; #支持算数表达式
        select roleid,roleid*1,roleid*1+nvl(roleid,0) from t_sec_createrole ; #将空值转化为0
        #查询为空值的行
        select * from t_sec_createrole where roleid is null;
        #使用distinct 来去除重复值
        select distinct roleid from t_sec_createrole ;  
        #Fetch Task应对简单查询的功能
        set hive.fetch.task.conversion=more;
        #使用where条件进行过滤,严格区分大小写
        select * from t_sec_createrole  where serverid=1 and gamechannel=15;
        #模糊查询
        select * from t_sec_createrole  where rolename like 'S%';
        #查询包含特殊字符_的查询
        select * from t_sec_createrole  where rolename like '%\\_%';
        #查询结果的排序
        select roleid,serverid from t_sec_createrole  order by serverid ; #order by后面可以跟：列、表达式、别名、序号
        select roleid,serverid from t_sec_createrole  order by 2 ; #根据查询结果的第2列(serverid)进行排序
            #按序号排序需要先开启功能 set hive.groupby.orderby.position.alias=true;
    #连接查询(多表查询)
    #等值查询
        select a.roleid,b.pay
        from createrole as a ,recharge as b 
        where a.roleid=b.roleid
    #不等值查询
        select e.empno,e.name,e.sal
        from  emp e ,salgrade s 
        where  e.sal between s.losal  and s.hisal
    #外连接(左外连接,右外连接)
        select * 
          from createrole as a left outer join gamechannel as b 
            on a.channelid=b.channelid  #右外连接把left换成right即可
    #子查询
        #只支持from和where子句中的子查询;子查询结果中含有空值时可以使用in,但不能使用not in(否则查不出任何结果)
        select roleid from  createrole where roleid  not in (select roleid from recharge where roleid is not null)

    原文作者：洛水青柳2017
    原文地址: https://www.jianshu.com/p/cba8c561c954
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。