1. 创建原始表,不包含自增列:
-- Staging table WITHOUT the auto-increment column.
-- Stored as plain tab-delimited text so the shell script below can
-- download the raw files and prepend an id column to each line.
CREATE TABLE IF NOT EXISTS `test.wzq_matcheducation_temp`(
  `userid` bigint,
  `value`  string)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  NULL DEFINED AS ''                 -- empty string round-trips as SQL NULL
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
location信息:/user/hive/warehouse/test.db/wzq_matcheducation_temp
2. 创建数据表,包含自增列:
-- Final table WITH the auto-increment column.
-- Schema = staging table plus a leading `id` column; the storage format
-- must match the staging table exactly (tab-delimited text), because the
-- shell script uploads rewritten text files straight into this table's
-- HDFS directory without going through Hive.
CREATE TABLE IF NOT EXISTS `test.wzq_matcheducation`(
  `id`     bigint,                   -- auto-increment id added by the shell script
  `userid` bigint,
  `value`  string)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  NULL DEFINED AS ''                 -- empty string round-trips as SQL NULL
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
location信息:/user/hive/warehouse/test.db/wzq_matcheducation
3. 编写hiveSQL,将数据导入原始表:
-- Load the source rows into the staging table.
-- ORDER BY forces a single reducer / one totally-ordered output, which is
-- what makes the file-level numbering done by the shell script deterministic
-- and reproducible across runs.
insert overwrite table test.wzq_matcheducation_temp
select userid, matcheducation
from bidp.user_cloud
where matcheducation like '%8%'      -- NOTE(review): substring match on '8' looks intentional — confirm the business rule
order by userid;
4. 编写文件自增列数据处理shell脚本(重要)!!!脚本逻辑:下载原始表目录下的全部数据文件,为每一行添加一个全局连续自增的id列,再将处理后的文件上传到数据表目录,详见脚本内注释。
#!/bin/bash
# Prepend a globally increasing, tab-separated id column to every data file
# of a Hive table, then upload the rewritten files into the target table's
# HDFS directory.
#
# Interactive inputs:
#   1) HDFS directory of the staging table (files WITHOUT the id column)
#   2) HDFS directory of the final table   (files WITH the id column)
#
# After it finishes, manually verify: sample a few rows; max(id) must equal
# the table row count.
base_path=/home/work/modify_hbase/shell/column_auto_increment

read -p '------请输入要处理的hive表文件对应的目录路径:' hdfs_path_old
read -p '------请输入有自增列的hive表文件对应的目录路径:' hdfs_path_new

echo -e "------`date "+%y/%m/%d %H:%M:%S"`:正在下载hive表对应目录下的所有文件到linux系统中"
# Start from a clean local workspace on every run.
rm -rf "$base_path/data"
mkdir -p "$base_path/data/tmp_old" "$base_path/data/tmp_new"
hadoop fs -get "$hdfs_path_old"/* "$base_path/data/tmp_old"
echo -e "------`date "+%y/%m/%d %H:%M:%S"`:hive表对应目录下的所有文件下载完毕"

# Abort if the workspace is unreachable; everything below depends on the cwd.
cd "$base_path/data/tmp_old" || exit 1

# Prepend the auto-increment column to every file.
#
# BUG FIX: the original used `cat $file | while read line`, which runs the
# loop body in a subshell — every `let i++` was discarded when the pipeline
# ended, so the counter silently restarted for each file and produced
# duplicate ids across files. Feeding the loop via input redirection keeps
# it in the current shell, so $i keeps counting across all files.
#
# Also hardened:
#   - `for file in *` instead of `$(ls)` (no word-splitting on file names)
#   - `IFS= read -r` so backslashes and leading/trailing blanks in the data
#     pass through untouched
#   - `printf` instead of `echo -e`, which would have re-interpreted
#     backslash escapes inside the data itself
i=1
for file in *
do
    while IFS= read -r line
    do
        printf '%s\t%s\n' "$i" "$line" >> "$base_path/data/tmp_new/$file"
        i=$((i + 1))
    done < "$file"
done

echo -e "------`date "+%y/%m/%d %H:%M:%S"`:每个数据文件的自增列添加完毕,开始上传所有处理完的数据文件..."
hadoop fs -put "$base_path/data/tmp_new"/* "$hdfs_path_new"
echo -e "------`date "+%y/%m/%d %H:%M:%S"`:数据上传完毕,请手动检查结果:1.查询几条数据;2.查询自增列最大值;3.查询表数据量;4.对比2和3的结果是否一致"
exit 0