-- Show the table-level properties (TBLPROPERTIES) of a table.
show tblproperties table_name;
-- List the tables in the current database.
show tables;
-- Show the detailed metadata of table employees in database mydb.
describe extended mydb.employees;
-- In strict mode Hive refuses to submit a query on a partitioned table
-- when its WHERE clause has no partition filter.
set hive.mapred.mode=strict;
-- List the partitions of a table.
show partitions table_name;
-- List only the partitions matching a partial partition specification.
show partitions table_name partition (country = 'US');
-- Load a local directory into one static partition of employees.
-- ${env:HOME} substitutes the HOME environment variable (the name is
-- case-sensitive). The partition keys are country and state, matching the
-- PARTITIONED BY clause of the employees table.
load data local inpath '${env:HOME}/california-employees'
into table employees
partition (country = 'US', state = 'CA');
-- Add a partition for 2012-01-02 whose data lives at an explicit location.
alter table log_message
add partition (year = 2012, month = 1, day = 2)
location 'hdfs://master_server/data/log_message/2012/01/02';
-- Drop a table; IF EXISTS suppresses the error when the table is absent.
drop table if exists employees;
-- Rename a table.
alter table log_message rename to logmsgs;
-- Adding, modifying and dropping table partitions.
-- IF NOT EXISTS makes the statement a no-op when the partition already exists.
alter table log_message add if not exists
partition (year = 2011, month = 1, day = 1) location '/logs/2011/01/01';
-- Modify a column: rename hms to hours_minutes_seconds, declare it INT,
-- attach a comment and position it after the severity column.
alter table log_message
change column hms hours_minutes_seconds int
comment 'The hours ,minutes,and seconds part of the timestamp'
after severity;
-- Add columns to the end of the existing column list.
-- Hive has no LONG type; the 64-bit integer type is BIGINT.
alter table log_message add columns (
    app_name   string comment 'Application name',
    session_id bigint comment 'The current session_id'
);
-- Storage properties: declare bucketing (CLUSTERED BY) and the sort order
-- inside each bucket (SORTED BY; STORED BY is a different clause that names
-- a storage handler).
-- `exchange` is back-quoted because EXCHANGE is a reserved word in newer
-- Hive releases.
alter table stocks
clustered by (`exchange`, symbol)
sorted by (symbol)
into 48 buckets;
-- Load data into a managed table.
-- OVERWRITE replaces whatever the target partition already contains.
load data local inpath '${env:HOME}/california-employees'
overwrite into table employees
partition (country = 'US', state = 'CA');
-- Dynamic-partition properties; they must be set before the insert runs.
set hive.exec.dynamic.partition=true;
-- nonstrict allows every partition column to be resolved dynamically.
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=1000;

-- Dynamic-partition insert: the partition values come from the LAST columns
-- of the select list, in the order given in the PARTITION clause
-- (country, then state).
-- NOTE(review): the staged_employees column names below are assumed to
-- mirror employees (plus cnty/st from the original select) — confirm them.
insert overwrite table employees
partition (country, state)
select
    se.name,
    se.salary,
    se.subordinates,
    se.deductions,
    se.address,
    se.cnty,
    se.st
from staged_employees se;
-- Create a table from a query (CTAS): pull a subset of a wide table in a
-- single statement. No alias is declared for employees, so the partition
-- column is referenced unqualified.
create table ca_employees
as select name, salary, address
from employees
where state = 'CA';
-- Exporting data.
-- Option 1: copy the files directly with the Hadoop CLI. This is a shell
-- command, not a HiveQL statement:
--     hadoop fs -cp source_path target_path
-- Option 2: write a query result to a local directory.
insert overwrite local directory '/tmp/ca_employees'
select name, salary, address
from employees
where state = 'CA';
-- Multi-insert: name the source once and write to several output
-- directories (the same form can insert into tables).
from staged_employees se
insert overwrite directory '/tmp/or_employees'
    select *
    where se.cty = 'US' and se.st = 'OR'
insert overwrite directory '/tmp/ca_employees'
    select *
    where se.cty = 'US' and se.st = 'CA';
-- Create the employees table with array, map and struct columns,
-- partitioned by country and state.
create table employees (
    name         string,
    salary       float,
    subordinates array<string>,
    deductions   map<string, float>,
    address      struct<street:string, city:string, state:string, zip:int>
)
partitioned by (country string, state string);
-- Basic query.
select name, subordinates from employees;
-- Query ARRAY data: elements are addressed by zero-based index.
select name, subordinates[0] from employees;
-- Query MAP data: values are addressed by key.
select name, deductions["State Taxes"] from employees;
-- Query STRUCT data: fields are addressed with dot notation.
select name, address.city from employees;
-- Select columns with a regular expression: symbol plus every column whose
-- name starts with "price". The pattern must be back-quoted; in single
-- quotes it would be an ordinary string literal.
-- NOTE(review): Hive 0.13+ additionally needs
-- `set hive.support.quoted.identifiers=none;` for regex column specs —
-- confirm against the target version.
select symbol, `price.*`
from stocks;
-- Computing with column values. round() returns the nearest integer as a
-- DOUBLE; floor() and ceil() (round up) take a DOUBLE and return a BIGINT.
-- The map key must match exactly: a trailing space inside the key makes the
-- lookup return NULL.
select upper(name), salary, deductions["Federal Taxes"],
    round(salary * (1 - deductions["Federal Taxes"]))
from employees;
-- Aggregate functions: count, avg.
select count(*), avg(salary)
from employees;
-- Others: count(*), count(distinct expr[, expr...]), sum(col),
-- sum(distinct col), avg(col), min(col), max(col).
-- Set hive.map.aggr to true to improve aggregation performance.
-- Properties are changed with SET, not SELECT.
set hive.map.aggr=true;
select count(*), avg(salary)
from employees;
-- explode(): an empty array produces no rows; otherwise every element of
-- the array produces one new row.
select explode(subordinates) as sub
from employees;
-- LIMIT caps the number of rows returned.
select upper(name), salary, deductions["Federal Taxes"],
    round(salary * (1 - deductions["Federal Taxes"]))
from employees
limit 2;
-- CASE ... WHEN ... THEN: classify each salary into a bracket.
select name, salary,
    case
        when salary < 5000 then 'low'
        when salary >= 5000 and salary < 7000 then 'middle'
        when salary >= 7000 and salary < 10000 then 'high'
        else 'very high'
    end as bracket
from employees;
-- Some queries can run without MapReduce; this is the so-called local mode.
-- When this property is true, Hive tries to run in local mode.
set hive.exec.mode.local.auto=true;
-- LIKE does wildcard matching; RLIKE is a Hive extension.
-- For inner joins, order the tables so their size increases left to right.
-- NOTE(review): table and column `divided` look like a typo for a
-- dividends table — confirm the real names.
select s.ymd, s.symbol, s.price_close, d.divided
from stocks s join divided d
on s.ymd = d.ymd and s.symbol = d.symbol
where s.symbol = 'AAPL';
-- RIGHT OUTER JOIN returns every record of the right table that satisfies
-- the WHERE clause.
-- When only one of the joined tables is small, it can be held entirely in
-- memory while the largest table passes through the mappers.
set hive.auto.convert.join=true;
-- ORDER BY performs a global ordering of the query result.
-- Hive adds SORT BY, which orders the data only within each reducer.
-- DISTRIBUTE BY combined with SORT BY:
-- All data moved within a MapReduce job is organised as key-value pairs.
-- By default the framework hashes the key emitted by the map phase and uses
-- that hash to spread the key-value pairs evenly across the reducers.
-- DISTRIBUTE BY controls how map output is divided among the reducers.
-- Hive requires DISTRIBUTE BY to be written before SORT BY.
select s.ymd, s.symbol, s.price_close
from stocks s
distribute by s.symbol
sort by s.symbol asc, s.ymd asc;
-- CLUSTER BY is equivalent to DISTRIBUTE BY plus SORT BY on the same columns.
select s.ymd, s.symbol, s.price_close
from stocks s
cluster by s.symbol;
-- Type conversion with cast().
select name, salary
from employees
where cast(salary as float) < 100000.0;

-- Casts can be nested: convert b to STRING first, then to DOUBLE.
select (2.0 * cast(cast(b as string) as double))
from src;
-- Block sampling: Hive can sample by percentage. The smallest unit sampled
-- is an HDFS block, so a table smaller than one block (128 MB in these
-- notes) returns all of its rows.
-- Hive (as of these notes) does not support materialized views.
-- NOTE(review): newer Hive releases may support them — verify for the
-- target version.
-- Query over a nested subquery:
from (
    select *
    from people join cart
    on (cart.people = people.id)
    where firstname = 'john'
) a
select a.lastname
where a.id = 3;
-- Creating an index.
-- NOTE(review): Hive indexing was removed in Hive 3.0; these statements
-- apply to older releases only — confirm the target version.

-- Base table. PARTITIONED BY belongs to CREATE TABLE, not to CREATE INDEX.
-- IF NOT EXISTS keeps the script re-runnable when the table already exists.
create table if not exists employees (
    name         string,
    salary       float,
    subordinates array<string>,
    deductions   map<string, float>,
    address      struct<street:string, city:string, state:string, zip:int>
)
partitioned by (country string, state string);

-- Compact index on employees(country), stored in employees_index_table.
-- WITH DEFERRED REBUILD creates the index empty; it is filled by a later
-- ALTER INDEX ... REBUILD. The handler class name is case-sensitive.
-- The original notes also carried `partion by (country,name)` here; the
-- CREATE INDEX grammar has no such clause, so it is omitted.
create index employees_index
on table employees (country)
as 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
with deferred rebuild
idxproperties ('creator' = 'me', 'created_at' = 'some_time')
in table employees_index_table
comment 'Employees indexed by country and name.';
-- Rebuild the index, here for a single partition only.
-- ALTER INDEX and DROP INDEX take `ON table_name`; unlike CREATE INDEX,
-- they do not accept the TABLE keyword.
alter index employees_index
on employees
partition (country = 'US')
rebuild;
-- Show the indexes defined on a table.
show formatted index on employees;
-- Drop an index.
drop index if exists employees_index on employees;
-- Bitmap indexes suit columns that have few distinct values.