运用 Python 做数据分析的步骤（在 HQL 中用 Python 脚本实现数据加工）
使用python在hive中实现wordcount统计汇总
此处的数据分析是数据加工
要准备好源数据
-- Create the external source table: one raw text line per row.
create external table docs_table (line STRING);
-- Load the local input file into the source table.
-- Fixed: the original loaded into `docs`, but the table created here (and
-- queried by the final transform statement) is `docs_table`.
load data local inpath '/home/badou/hive_test_3/data/wc.data' overwrite into table docs_table;
a b c d e
b c d e f
d e f g h
g h a b c
g d f b c
a 2
b 4
-- Create the result table holding the final per-word totals,
-- stored as tab-separated text to match the reducer's output format.
-- Fixed: missing comma between the two column definitions.
create table word_count (word STRING, count INT)
row format delimited fields terminated by '\t';
[root@master transform_wc]# cat mapper.py
# mapper.py — map stage of the Hive TRANSFORM wordcount.
# Reads raw text lines from stdin and emits one "<word>\t1" record per word.
# Fixed: the pasted original lost all indentation (syntactically invalid)
# and used the Python-2-only print statement.
import sys

for line in sys.stdin:
    # Split on single spaces; consecutive spaces would yield empty tokens,
    # so skip them to keep the counts clean.
    for word in line.strip().split(' '):
        if word:
            print('%s\t1' % word)
开发 reducer.py：
[root@master transform_wc]# cat reducer.py
# reducer.py — reduce stage of the Hive TRANSFORM wordcount.
# Input: "<word>\t<count>" records already clustered by word (cluster by
# in the HQL guarantees all records for a word arrive consecutively).
# Output: "<word>\t<total>" per distinct word.
# Fixed vs. the pasted original:
#   * missing comma in the tuple unpack ("key count = ...") — syntax error
#   * missing comma in the print tuple — syntax error
#   * the same-key branch overwrote last_count instead of accumulating,
#     so totals were never summed
#   * the final group was never flushed after the loop, dropping the last word
import sys

last_key = None
last_count = 0

for line in sys.stdin:
    key, count = line.strip().split('\t')
    if last_key and last_key != key:
        # Key changed: emit the finished group, then start the new one.
        print('%s\t%d' % (last_key, last_count))
        last_key = key
        last_count = int(count)
    else:
        # First record overall, or another record for the current key.
        last_key = key
        last_count += int(count)

# Flush the final group (the original silently lost it).
if last_key:
    print('%s\t%d' % (last_key, last_count))
加载 Python 脚本：
add file /home/badou/hive_test_3/transform_wc/mapper.py;
add file /home/badou/hive_test_3/transform_wc/reducer.py;
HQL语句中使用python脚本模块实现数据加工
-- Run the wordcount pipeline:
--   inner transform maps each raw line to (word, 1) pairs via mapper.py,
--   cluster by word routes all records for a word to the same reducer,
--   outer transform sums per-word counts via reducer.py.
-- Fixed: missing commas in both transform column lists and in the
-- `as word count` output clause (Hive requires `as (word, count)`).
insert overwrite table word_count
select transform(wc_map.word, wc_map.count)
       using 'python reducer.py'
       as (word, count)
from
(
    select transform(line)
           using 'python mapper.py'
           as (word, count)
    from docs_table
    cluster by word
) wc_map;