一个Hive练习(解答2017年10月份大数据行业大赛样题)

数据如下

[root@master conf]# cat ~/relations 
Zhangsan Wangwu 01:01:01
Zhangsan Zhaoliu 00:11:21
Zhangsan Yuqi 00:19:01
Zhangsan Jingba 00:21:01
Zhangsan Wuxi 01:31:17
Wangwu Zhaoliu 00:51:01
Wangwu Zhaoliu 01:11:19
Wangwu Yuqi 00:00:21
Wangwu Yuqi 00:23:01
Yuqi Zhaoliu 01:18:01
Yuqi Wuxi 00:18:00
Jingba Wangwu 00:01:01
Jingba Wangwu 00:00:06
Jingba Wangwu 00:02:04
Jingba Wangwu 00:02:54
Wangwu Yuqi 01:00:13
Wangwu Yuqi 00:01:01
Wangwu Zhangsan 00:01:01

要求:两个人统计相互通话总时长,并从高到低排列形成result1表,包括通话人和总时长两个字段

解答

字段转换

[root@master conf]# cat test_mapper.py 
import sys
for line in sys.stdin:
  line = line.strip()
  fromstr, tostr, time = line.split('\t')
  hours, minutes, secondes = time.split(':')
  newtime = int(hours)*60*60 + int(minutes)*60 + int(secondes)
  if cmp(fromstr, tostr) == -1:
    #print(cmp(fromstr, tostr))
    fromstr, tostr = tostr, fromstr
  print ' '.join([fromstr + tostr, str(newtime)])

hive脚本

[root@master conf]# cat test.hive 
CREATE TABLE relations_new (
  fromtostr STRING,
  duration INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ' ';

add FILE test_mapper.py;

INSERT OVERWRITE TABLE relations_new
SELECT
  TRANSFORM (fromstr, tostr, duration)
  USING 'python test_mapper.py'
  AS (fromtostr, duration)
FROM relations;

SELECT fromtostr, SUM(duration)
FROM relations_new
GROUP BY fromtostr;

运行hive脚本

[root@master conf]# hive -f test.hive 
    原文作者:tonyemail_st
    原文地址: https://www.jianshu.com/p/171707824901
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞