1. 注意点
1.正则转义,反向引用
-- Hive's string literal parsing consumes one level of backslashes, so '\\w'
-- reaches the regex engine as \w; $1..$3 refer back to the capture groups.
select
    regexp_replace(
        'lat:test,lon:test,province:prov,city:city,area:area,cnt:3',
        '.*province:([\\w]+),city:([\\w]+),area:([\\w]+),.*',
        '$1 $2 $3'
    );
-- result: prov city area
- hive自己本身的正则规范
- shell脚本中,hive正则的规范
#!/bin/sh
# Runs the regexp_replace example through `hive -e` from a shell script.
# Inside a double-quoted shell string every `\\` collapses to a single `\`,
# so `\\\\w` is handed to Hive as `\\w` (the form Hive itself expects), and
# `\$` stops the shell from expanding `$1 $2 $3` as positional parameters.
hive -v -e "
select regexp_replace('lat:test,lon:test,province:prov,city:city,area:area,cnt:3','.*province:([\\\\w]+),city:([\\\\w]+),area:([\\\\w]+),.*','\$1 \$2 \$3');
"
2. cast问题
select cast('12345678901' as int);
-- NULL: 12345678901 does not fit in a 32-bit INT (maximum 2147483647), and the
-- failed conversion yields NULL instead of raising an error.
cast 转换失败(例如数值超出目标类型的范围)时一般不会抛出异常,而是吞掉异常并返回 NULL。
3. coalesce 注意
-- coalesce returns its first non-NULL argument; an empty string is not NULL,
-- so it is returned as-is.
select coalesce('', 'a'); -- => ''
select coalesce(null, 'a'); -- => a
coalesce 只作用于 NULL,而不是空字符串(空字符串不会被替换)。
4. 小文件合并
-- Input side: pack many small files into fewer, larger splits.
-- Upper bound, in bytes, on the size of one combined split.
set mapred.max.split.size=256000000;
-- Minimum number of bytes gathered per node / per rack for a combined split.
set mapred.min.split.size.per.node=100000000;
set mapred.min.split.size.per.rack=100000000;
-- Input format that combines small files into shared splits.
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
-- Output side: merge small result files when a job finishes.
-- Average output file size, in bytes, below which an extra merge job is started.
set hive.merge.smallfiles.avgsize=256000000;
-- Merge the outputs of map-only jobs.
set hive.merge.mapfiles = true;
-- Merge the outputs of map-reduce jobs.
set hive.merge.mapredfiles = true;
-- Target size, in bytes, of the merged files.
set hive.merge.size.per.task = 256000000;
-- Combine small input files so that fewer mappers are spawned.
set hive.hadoop.supports.splittable.combineinputformat=true;
5. 本地测试
-- Self-contained check of a window aggregate: the rows are built inline, so no
-- table is required to try the query locally.
select
    key,
    value,
    sum(value) over (partition by key) as total
from (
    select 'a' as key, 1 as value
    union all
    select 'a' as key, 2 as value
) as a;
-- expected output (row order is not guaranteed without an order by):
-- a 1 3
-- a 2 3
2. 优化点
1. 正则反向
-- Extract several fields in one pass: capture each value in a group and emit
-- the groups through the $1 $2 $3 back-references in the replacement.
select
    regexp_replace(
        'lat:test,lon:test,province:prov,city:city,area:area,cnt:3',
        '.*province:([\\w]+),city:([\\w]+),area:([\\w]+),.*',
        '$1 $2 $3'
    );
2. 任务参数优化
- 向量化
- 并行跑
-- Vectorized execution: process rows in batches instead of one at a time.
set hive.vectorized.execution.enabled=true;
-- Also use vectorized execution on the reduce side.
set hive.vectorized.execution.reduce.enabled=true;
-- Allow independent stages of a query to run in parallel.
set hive.exec.parallel=true;
3. 用 join 做差集(left join 后过滤右表为 NULL 的行)
-- Anti-join: devices present in a that have no match in b (set difference a - b).
-- The selected column is qualified because device exists in both tables and
-- an unqualified reference would be ambiguous.
select a.device
from a
left join b
    on a.device = b.device
where b.device is null;
4. 窗口函数
- 单字段统计
-- For every field_str group: each field's count, its share of the group total
-- (rounded to 4 decimals) and its position when ordered by count, largest first.
select
    field_str,
    field,
    cnt,
    round(cnt / total, 4) as percent,
    row_number() over (partition by field_str order by cnt desc) as rank
from (
    -- attach the per-group total to every row so the share can be computed
    select
        field_str,
        field,
        cnt,
        sum(cnt) over (partition by field_str) as total
    from stat_temp
) as totals
- 数据去重
-- Deduplicate test by id after merging in the 20180101 increment: for each id
-- only the row with the greatest processtime is kept.
insert overwrite table test
select
    id,
    country
from (
    select
        id,
        country,
        row_number() over (partition by id order by processtime desc) as rn
    from (
        select id, country, processtime
        from test
        union all
        -- day is used as the processtime of the incremental rows
        -- NOTE(review): processtime and day must have compatible types for the
        -- union and the ordering to be meaningful; confirm against the schemas.
        select id, country, day as processtime
        from test_incr
        where day = 20180101
    ) as merged
) as ranked
where rn = 1;
5. 对 NULL 好一点
聚合函数(如 collect_set、collect_list、count(col)、sum)在聚合数据的时候,会自动剔除 NULL 这样的值(注意 count(*) 统计的是行数,不会剔除 NULL),这样出来的数据更好看,所以建议把空字符串转换为 NULL(空字符串 => NULL)。