Hive-SQL

每天有多少用户首次流失

select t4.aa,count(*)
from
(
select  t3.role_id role_id,min(t3.date1) aa
from
(
select t2.role_id role_id,t2.date1 date1,t2.date2 date2
from
(
select t1.role_id role_id,t1.date date1,lag(t1.date,1) over (partition by role_id) date2
from
(
select *,row_number() over (partition by role_id order by ta.date asc) as rn
from 
(select distinct role_id,split(time," ")[0] date
from g37.logoutrole
where date between 20160902 and 20161219) ta ) t1) t2
where int(datediff(t2.date2,t2.date1)) > 7 ) t3
group by t3.role_id
) t4
group by t4.aa

分位数

from(
select t1.date a,percentile(t1.a, array(0,0.25,0.5,0.75,1)) over (partition by t1.date) b
from
  (select date, role_id,sum(cast(int(fn.json(source, "$.num")) as bigint)) a
   from g37.currency_xunzhang
   where date between 20161108 and 20161115
     and int(fn.json(source,"$.code_op"))= 1
   group by date, role_id) t1
) t2

每天和每个月的去重登录id量
SELECT
month(time),
date,
COUNT(DISTINCT role_id) AS uv,
GROUPING__ID
FROM g17_loginrole_utf8_log
where date >= 20161001
GROUP BY month(time), date
GROUPING SETS (month(time), date)

正则提取

select regexp_extract(source, "level=(\\d+)", 1),count(regexp_extract(source, "level=(\\d+)", 1))
from xxx
where date between 20160902 and 20160925
and key rlike "xxx"
group by regexp_extract(source, "level=(\\d+)", 1)

转换时间格式

select sum(t1.pj), sum(t1.hp)
from
(
select fn.change_date_format(time, "yyyy-MM-dd HH:mm:ss", "yyyy-MM-dd HH"),regexp_extract(source, "game_server=(\\d+)", 1), regexp_extract(source, "game_uid=(\\d+)", 1), count(pingjia) as pj, sum(pingjia) as hp
from xxx
where date between 20160825 and 20160926
group by 
fn.change_date_format(time, "yyyy-MM-dd HH:mm:ss", "yyyy-MM-dd HH"),regexp_extract(source, "game_server=(\\d+)", 1), regexp_extract(source, "game_uid=(\\d+)",1)
  ) as t1
where t1.pj - t1.hp >= 10

连接字符串

select role_id,split(max(concat(time,"|",get_json_object(source,"$.left_count"))) ,"|")[1]
from xxx as aa
where date between 20160925 and 20160927
group by role_id

最值

select role_id,max(get_json_object(source,"$.left_count")) 
from xxx 
where get_json_object(source,"$.item_name") rlike "aaa" 
and date between 20160925 and 20160926 
group by role_id

时间排序,取最后一条

select t3.server, t3.role_id, t3.cou, t2.cou
from
(
select server,
       role_id,
       max(int(get_json_object(source,"$.left_count"))) as cou
from xxx
where date between 20160915 and 20160926
  and get_json_object(source,"$.item_name") rlike "aaa"
group by server,
         role_id
         ) as t3
join

(
select t1.server, t1.role_id, int(get_json_object(t1.source,"$.left_count")) as cou from 
(
select * , ROW_NUMBER() over (partition by server, role_id ORDER BY time desc) as rn
from xxx
where date between 20160925 and 20160927
and get_json_object(source,"$.item_name") rlike "aaa"
  ) as t1 where t1.rn = 1  ) as t2
on t3.server = t2.server and t3.role_id = t2.role_id

where t3.cou = t2.cou

rlike正则(或)

select player
from xxx
where player rlike "重叠|叠加|冲突|触发"
and date between 20161001 and 20161008

创建外部表

create external table table_name (
  name string,
  urs string,
  ccid int,
  uid int,
  hn int,
  role_id int
  )
  row format delimited fields terminated by "\t"
  stored as textfile
  location '/path/file/'

文件引用为表

insert overwrite directory 'hdfs:///path/path/sjsj'
select a.role_id,time,fn.json(source,"$.role_level") 
from xxx as a where date >=20160401 and date <=20160430
and fn.json(source,"$.reason") ="player-upgrade"
and fn.json(source,"$.type") ="lv"
and fn.json(source,"$.role_level")  in ('20','60')
and a.role_id in
(select server from xxx)
===
and a.role_id in
(select server from xxx)

流失天数

select t1.server,t1.account_id,t1.liushi,t1.cost,t2.role_id,t2.udid
from
(select server,account_id,(20161013-max(date)) liushi,max(int(fn.json(source, "$.total_cost"))) cost
from loginrole
where date between 20161006 and 20161013
 group by server,account_id
) t1
join
(
 select fn.json(source, "$.account_id") account_id,fn.json(source, "$.role_id") role_id,fn.json(source, "$.udid") udid,fn.json(source, "$.server") server
from createrole
where date between 20161006 and 20161012) t2
on t1.server = t2.server
where t1.account_id = t2.account_id

实际登录天数

select t4.server,t4.account_id,count(cc)
from
 (
 select server,account_id,split(time, " ")[0] cc
 from loginrole
 where date between 20161006 and 20161015
 group by server,account_id,split(time, " ")[0]
  ) t4
  group by t4.server,t4.account_id

key是数组

select role_id,count(*)
from xxx
where fn.to_array(fn.json(source, "$.cost[0]"))[0] in ("490001", "490003")
and date between 20160902 and 20160916
group by role_id

SUBSTRING ( expression, start, length )

select distinct aa.account_id
from xxx aa
where date between 20161006 and 20161017
and int(aa.server)<=20000
and int(fn.json(source,"$.level"))>5
and concat(aa.account_id,"|",concat(substring(aa.date,1,4),"-",substring(aa.date,5,2),"-",substring(aa.date,7,2)))
in (
select concat(account_id,"|",date_add(concat(substring(date,1,4),"-",substring(date,5,2),"-",substring(date,7,2)),3)) host_iid_seven
from xxx
where date between 20161006 and 20161017
and int(server)<=20000
)
    原文作者:HouserLin
    原文地址: https://www.jianshu.com/p/106d45289477
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞