join优化-数据倾斜
set hive.optimize.skewjoin=true; 【TODO 细节】
- 数据倾斜时启动两个job进行join 然后再做union
set hive.skewjoin.key=100000;
- 启动skewjoin的key的阈值
map join
set hive.auto.convert.join = true;
hive.mapjoin.smalltable.filesize 默认25mb
- 建议可以设置小点 避免过多的map join 导致对内存占用率太高(每个进程都需要维护一份备份)
select /*+ mapjoin(a) */ a.a,a.b from A a join B b on a.id=b.id
- A为小表
Tips:
- 存在小表
- 需要做不等值join
bucket join
表按照join的字段做clustered by(分桶),且两表的bucket数存在倍数关系
- create table A (id int, price string) clustered by(id) into 32 buckets
- create table B (id int, amount string) clustered by(id) into 32 buckets
- select a.id,price,amount from A a join B b on a.id=b.id
filter 下推(人肉,hive不会自动进行优化)
- select a.id,price,amount from A a join B b on a.id=b.id where a.id>1000 #优化前
- select a.id,price,amount from (select * from A where id > 1000 ) a join (select * from B where id > 1000 ) b on a.id=b.id #优化后
count(distinct id) 优化
select count(distinct id) from A #此处count只会启动一个reduce,但是该reduce需要完成distinct id的计算
select count(1) from (select distinct id from A) t # 此处会启动两个mr job 第一个job 并行计算distinct id ,第二个启动一个reduce计算count
- set mapred.reduce.tasks=n # n>1
select count(1) from (select id from A group by id) t # 此处会启动两个mr job 第一个job 并行计算group by id ,第二个启动一个reduce计算count
- set mapred.reduce.tasks=n # n>1