建表时申明文件的存储格式,默认为TEXTFILE。如使用压缩常使用分块压缩STORED AS SEQUENCEFILE
数据处理的中间过程和结果使用Snappy算法进行压缩。
– 任务中间压缩
set hive.exec.compress.intermediate=true;
set hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
set hive.intermediate.compression.type=BLOCK;
– map/reduce 输出压缩
set hive.exec.compress.output=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapred.output.compression.type=BLOCK;
实施案例:
现有一表log_db.commodlog共有3362555条数据共1.01GB
创建一个textfile的表
create table ifnot exists com(
unionId string,
brand string,
create_time string)
row format delimited fields terminated by '\t'
lines terminated by '\n'
stored as textfile;
设置操作:
set hive.exec.compress.output=true;set mapred.output.compress=true;set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;set io.compression.codecs=org.apache.hadoop.io.compress.GzipCodec;
导入数据
insert overwrite table fol_db.com select unionId,brand,create_time from log_db.commodlog;
结果:
select count(1)from fol_db.com;
共有3362555条数据共59.74MB
创建一个SEQUENCEFILE的表
create table ifnot exists seqcom(
unionId string,
brand string,
create_time string)
row format delimited fields terminated by '\t'
lines terminated by '\n'
stored as SEQUENCEFILE;
设置操作:
set hive.exec.compress.output=true;set mapred.output.compress=true;set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;set io.compression.codecs=org.apache.hadoop.io.compress.GzipCodec;set mapred.output.compression.type=BLOCK;
导入数据
insert overwrite table fol_db.seqcom select *from fol_db.com;
结果:
select count(1)from fol_db.seqcom;
共有3365370条数据共65.17MB
创建一个RCFILE的表
create table ifnot exists rccom(
unionId string,
brand string,
create_time string)
row format delimited fields terminated by '\t'
lines terminated by '\n'
stored as rcfile;
设置操作:
set hive.exec.compress.output=true;set mapred.output.compress=true;set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;set io.compression.codecs=org.apache.hadoop.io.compress.GzipCodec;
导入数据
insert overwrite table fol_db.rccom select *from fol_db.com;
结果:
select count(1)from fol_db.rccom ;
rc格式耗时4.126
select brand,count(brand) brandCount from fol_db.rccom where create_time like '2019-01-02%' group by brand SORT BY brand ASC;
SEQUENCEFILE格式耗时8.132
select brand,count(brand) brandCount from fol_db.seqcom where create_time like '2019-01-02%' group by brand SORT BY brand ASC;
textfile格式耗时8.132
select brand,count(brand) brandCount from fol_db.com where create_time like '2019-01-02%' group by brand SORT BY brand ASC;9.75
原数据格式耗时80.302
select brand,count(brand) brandCount from log_db.commodlog where create_time like '2019-01-02%' group by brand SORT BY brand ASC;