1、TEXTFILE
create table if not exists xt_format_text(source string, loginv string, uv string, dt string)row format delimitedstored as textfile;
插入数据开启压缩:
set hive.exec.compress.output=true;set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;insert overwrite table xt_format_text select * from xt_test3;
2、SEQUENCEFILE
SequenceFile是Hadoop API提供的一种二进制文件支持,其具有使用方便、可分割、可压缩的特点。SequenceFile支持三种压缩选择:NONE,RECORD,BLOCK。Record压缩率低,一般建议使用BLOCK压缩
create table if not exists xt_format_sequencefile(source string, loginv string, uv string, dt string)row format delimitedstored as sequencefile;
插入数据开启压缩:
set hive.exec.compress.output=true; set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;SET mapred.output.compression.type=BLOCK;insert overwrite table xt_format_sequencefile select * from xt_format_text;
3、RCFILE
RCFILE是一种行列存储相结合的存储方式。首先,其将数据按行分块,保证同一个record在一个块上,避免读一个记录需要读取多个block。其次,块数据列式存储,有利于数据压缩和快速的列存取
create table if not exists xt_format_rcfile(source string, loginv string, uv string, dt string)row format delimited #rcfile时 会自动忽略这个,而使用 ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe' 这个SerDe.stored as rcfile;
插入数据开启压缩:
set hive.exec.compress.output=true; set mapred.output.compress=true;#注意:hive不依赖这个设置,只依赖于hive.exec.compress.output的设置值。set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;insert overwrite table xt_format_rcfile select * from xt_format_sequencefile ;
4、AVRO
CREATE TABLE xt_format_avro COMMENT "just drop the schema right into the HQL" ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' TBLPROPERTIES ( 'avro.schema.literal'='{ "namespace": "com.letv.bigdata.dataplatform", "name": "xt_avro_format_test", "type": "record", "fields": [ { "name":"source","type":"string"},{ "name":"loginv","type":"string"},{ "name":"uv","type":"string"},{ "name":"dt", "type":"string"}]}');
5、ORC 文件格式(The Optimized Row Columnar (ORC))
create table if not exists xt_format_orcfile(source string, loginv string, uv string, dt string)stored as orc;
实际例子:
CREATE TABLE xt_tds_did_user_targ_day( dvc_id string, user_id string, p1 string, p2 string, p3 string, prod_code string, login_ip string, cntry_name string, area_name string, prov_name string, city_name string, chnl_type string, chnl_type_name string, chnl_code string, chnl_name string, login_ref string, net_type string, oper_sys string, oper_sys_ver string, dvc_brand string, dvc_model string, dvc_type string, dvc_dpi string, brows_name string, login_ts bigint, first_login_date string, first_app_ver string, last_login_date string, last_app_ver string, evil_ip bigint, pv bigint, input_pv bigint, ins_pv bigint, qry_pv bigint, outs_pv bigint, coop_pv bigint, vv bigint, cv bigint, pt bigint, vod_vv bigint, vod_cv bigint, vod_pt bigint, live_vv bigint, live_cv bigint, live_pt bigint, ca_vv bigint, ca_cv bigint, ca_pt bigint, try_vv bigint, try_cv bigint, try_pt bigint, pay_vv bigint, pay_cv bigint, pay_pt bigint, off_vv bigint, off_cv bigint, off_pt bigint, block_ts bigint, drag_ts bigint, drag_ahd_ts bigint, drag_bwd_ts bigint, click_ts bigint, instl_ts bigint, stup_ts bigint, movie_vv bigint, movie_cv bigint, movie_pt bigint, tvp_vv bigint, tvp_cv bigint, tvp_pt bigint, cartn_vv bigint, cartn_cv bigint, cartn_pt bigint, var_vv bigint, var_cv bigint, var_pt bigint, amuse_vv bigint, amuse_cv bigint, amuse_pt bigint, sport_vv bigint, sport_cv bigint, sport_pt bigint, music_vv bigint, music_cv bigint, music_pt bigint, fin_vv bigint, fin_cv bigint, fin_pt bigint, hot_vv bigint, hot_cv bigint, hot_pt bigint)PARTITIONED BY ( dt string, pf string) STORED AS RCFILE;set hive.exec.compress.output=true;set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;insert overwrite table xt_tds_did_user_targ_day partition(dt='20150425',pf='tv') select dvc_id, user_id, p1, p2, p3, prod_code, login_ip, cntry_name, area_name, prov_name, city_name, chnl_type, chnl_type_name, chnl_code, chnl_name, login_ref, net_type, oper_sys, oper_sys_ver, dvc_brand, dvc_model, dvc_type, dvc_dpi, brows_name, login_ts, first_login_date, first_app_ver, last_login_date, last_app_ver, evil_ip, pv, input_pv, ins_pv, qry_pv, outs_pv, coop_pv, vv, cv, pt, vod_vv, vod_cv, vod_pt, live_vv, live_cv, live_pt, ca_vv, ca_cv, ca_pt, try_vv, try_cv, try_pt, pay_vv, pay_cv, pay_pt, off_vv, off_cv, off_pt, block_ts, drag_ts, drag_ahd_ts, drag_bwd_ts, click_ts, instl_ts, stup_ts, movie_vv, movie_cv, movie_pt, tvp_vv, tvp_cv, tvp_pt, cartn_vv, cartn_cv, cartn_pt, var_vv, var_cv, var_pt, amuse_vv, amuse_cv, amuse_pt, sport_vv, sport_cv, sport_pt, music_vv, music_cv, music_pt, fin_vv, fin_cv, fin_pt, hot_vv, hot_cv, hot_pt from data_tds.tds_did_user_targ_day where dt='20150425' and pf='tv';