**
1. Running bin/hive in several clients at once fails with: java.sql.SQLException: Another instance of Derby may have already booted the database /opt/modules/apache/hive-1.2.1/metastore_db. The embedded Derby metastore accepts only one connection at a time; the usual fix is to configure a shared metastore database such as MySQL, or to run a standalone metastore service.
1. Describe table information
desc tablename;
desc extended tablename;
desc formatted tablename;
2. Rename a table
alter table table_oldname rename to new_table_name;
3. Add a column to a table
alter table new_table add columns(age int);
With a column comment:
alter table new_table add columns(sex string comment 'sex');
4. Change a column's name and type
create table test_change(a int, b int, c int);
Rename column a to a1:
alter table test_change change a a1 int;
Rename a1 to a2, change its type to string, and move it after b:
alter table test_change change a1 a2 string after b;
Rename c to c1 and move it to the first position:
alter table test_change change c c1 int first;
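After these changes, the resulting column order and types can be verified with a plain describe:
desc test_change;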
5. Replace columns (individual columns cannot be dropped, but they can be changed or replaced); replace rewrites the entire column list
ALTER TABLE table_name ADD|REPLACE COLUMNS (col_name data_type [COMMENT col_comment], …)
alter table test_change replace columns(foo int , too string);
6. Clear table data with truncate
Only the data is removed; the metadata remains and the table and its structure still exist.
truncate table row_table;
7. Delete a table with drop
drop table row_table;
This removes the data, the table and its structure, and the metadata.
8. Drop a database
drop database test_db CASCADE;
This deletes the database; if the database is not empty, the CASCADE keyword is required.
9. View Hive's built-in functions: show functions;
desc function when;
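For detailed help on a single function, including usage examples, Hive also supports the extended form (upper here is just an illustrative choice):
desc function extended upper;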
**
**
create table dept_tmp(
deptno int,
dname string,
loc string
)
row format delimited fields terminated by '\t';
load data local inpath '/opt/datas/dept.txt' into table dept_tmp;
Data storage path (Location):
Location:hdfs://hadoop01.com:8020/tmp/hive/hadoop/23a93177-f22f-4035-a2e3-c51cf315625f/_tmp_space.db/962463c2-6563-47a8-9e62-2a1e8eb6ed19
When the hive CLI is closed:
the temporary table is dropped automatically;
it can also be dropped manually with drop.
CREATE TEMPORARY TABLE IF NOT EXISTS dept_tmp(
deptno int ,
dname string ,
loc string
)row format delimited fields terminated by '\t'
LOCATION '/user/hive/warehouse/db_emp.db/dept';
When the hive CLI is closed:
the temporary table's data is deleted automatically;
it can also be dropped manually with drop, which removes both the temporary table's data and its data files.
**
17. Hive partitioned tables
**
How partitioned tables are used in practice:
[PARTITIONED BY (col_name data_type [COMMENT col_comment], …)]
Regular table: select * from logs where date='2018120'
Execution: the whole table is scanned, then the filter is applied.
Partitioned table: select * from logs where date='2018120' and hour='00'
Execution: only the data under the matching directory is loaded.
A partition column is logical: it shows up as a directory on HDFS and is not stored in the data files;
it must not be a column that already exists in the data.
【Single-level partitioning】
create table emp_part(
empno int ,
ename string ,
job string ,
mgr int ,
hiredate string,
sal double ,
comm double ,
deptno int
)PARTITIONED BY(date string)
row format delimited fields terminated by '\t';
load data local inpath '/opt/datas/emp.txt' into table emp_part partition(date='2018120');
load data local inpath '/opt/datas/emp.txt' into table emp_part partition(date='2018121');
load data local inpath '/opt/datas/emp.txt' into table emp_part partition(date='2018122');
select * from emp_part where date='2018120';
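To confirm which partitions now exist (standard Hive syntax; one line per partition directory):
show partitions emp_part;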
【Two-level partitioning】
create table emp_part2(
empno int ,
ename string ,
job string ,
mgr int ,
hiredate string,
sal double ,
comm double ,
deptno int
)
PARTITIONED BY(date string,hour string)
row format delimited fields terminated by '\t';
load data local inpath '/opt/datas/emp.txt' into table emp_part2 partition(date='2018120',hour='01');
load data local inpath '/opt/datas/emp.txt' into table emp_part2 partition(date='2018120',hour='02');
select * from emp_part2 where date='2018120';
select * from emp_part2 where date='2018120' and hour='01';
-》The main benefit of partitioned tables is better query efficiency
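Partitions can also be added or dropped directly with ALTER TABLE, which only manipulates metadata and directories (the partition value below is illustrative):
alter table emp_part add partition(date='2018123');
alter table emp_part drop partition(date='2018123');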
**
18. Hive bucketed tables
**
Bucketed tables: higher processing efficiency, better joins, and data sampling
[CLUSTERED BY (col_name, col_name, …) [SORTED BY (col_name [ASC|DESC], …)] INTO num_buckets BUCKETS]
create table emp_bu(
empno int ,
ename string ,
job string ,
mgr int ,
hiredate string,
sal double ,
comm double ,
deptno int
)CLUSTERED BY (empno) INTO 4 BUCKETS
row format delimited fields terminated by '\t';
Create the table first, then set:
set hive.enforce.bucketing = true;
insert overwrite table emp_bu select * from emp;
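One of the advertised benefits, sampling, can then be exercised with TABLESAMPLE; this query reads only the first of the four buckets:
select * from emp_bu tablesample(bucket 1 out of 4 on empno);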
Requirement 1: query all employees in department 10, sorted by salary in descending order with desc (the default is ascending)
select * from emp where deptno=10 order by sal desc;
Result:
7839 KING PRESIDENT NULL 1981-11-17 5000.0 NULL 10
7782 CLARK MANAGER 7839 1981-6-9 2450.0 NULL 10
7934 MILLER CLERK 7782 1982-1-23 1300.0 NULL 10
Requirement 2: show the highest salary of each department as the last column of every row
select empno,ename, deptno,sal, max(sal) over(partition by deptno order by sal desc ) as sal_111 from emp;
Result:
7839 KING 10 5000.0 5000.0
7782 CLARK 10 2450.0 5000.0
7934 MILLER 10 1300.0 5000.0
7788 SCOTT 20 3000.0 3000.0
7902 FORD 20 3000.0 3000.0
7566 JONES 20 2975.0 3000.0
7876 ADAMS 20 1100.0 3000.0
7369 SMITH 20 800.0 3000.0
Requirement 3: number the employees of each department by salary, descending:
select empno,ename, deptno,sal, row_number() over(partition by deptno order by sal desc) as sal_111 from emp;
Result:
7839 KING 10 5000.0 1
7782 CLARK 10 2450.0 2
7934 MILLER 10 1300.0 3
7788 SCOTT 20 3000.0 1
7902 FORD 20 3000.0 2
7566 JONES 20 2975.0 3
7876 ADAMS 20 1100.0 4
7369 SMITH 20 800.0 5
7698 BLAKE 30 2850.0 1
7499 ALLEN 30 1600.0 2
7844 TURNER 30 1500.0 3
7654 MARTIN 30 1250.0 4
7521 WARD 30 1250.0 5
7900 JAMES 30 950.0 6
Requirement 4: get the top two earners of each department (using a nested subquery)
select empno,ename, deptno,sal from(select empno,ename, deptno,sal,row_number() over (partition by deptno order by sal desc) as rn from emp) as tmp where rn < 3;
Result:
7839 KING 10 5000.0
7782 CLARK 10 2450.0
7788 SCOTT 20 3000.0
7902 FORD 20 3000.0
7698 BLAKE 30 2850.0
7499 ALLEN 30 1600.0
【Filtering】
where, limit, distinct, between ... and, is null, is not null
select * from emp where sal >3000;
select * from emp limit 5;
select distinct deptno from emp;
select * from emp where sal between 1000 and 3000;
select empno,ename from emp where comm is null;
select empno,ename from emp where comm is not null;
【Aggregate functions】
count、sum、max、min、avg、group by 、having
select avg(sal) avg_sal from emp;
Group by department to get each department's average salary.
Every column in the select list must either be wrapped in an aggregate function or appear in group by.
select deptno,avg(sal) from emp group by deptno;
10 2916.6666666666665
20 2175.0
30 1566.6666666666667
select deptno,max(job),avg(sal) from emp group by deptno;
10 PRESIDENT 2916.6666666666665
20 MANAGER 2175.0
30 SALESMAN 1566.6666666666667
select deptno,avg(sal) from emp group by deptno,job;
having is used much like where: both filter rows, and they can be used together.
where runs first (before grouping); having runs afterwards, on the aggregated results.
select deptno,avg(sal) avg_sal from emp group by deptno having avg_sal >2000 ;
【Joins】
left join, right join, inner join (equi-join), full join
Create two tables, A and B:
Table A:
ID  Name
1   张三
2   李四
3   王五
5   赵六
Table B:
ID  phone
1   111
2   112
3   113
4   114
drop table A;
drop table B;
create table A(
id int,
name string
)row format delimited fields terminated by '\t';
create table B(
id int,
phone int
)row format delimited fields terminated by '\t';
load data local inpath '/opt/datas/A' into table A;
load data local inpath '/opt/datas/B' into table B;
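The four join types listed above are never actually run against the data; a minimal sketch (ids 1-3 match; id 5/赵六 in A and id 4/114 in B are the unmatched rows that distinguish the variants):
select a.id, a.name, b.phone from A a join B b on a.id = b.id;
select a.id, a.name, b.phone from A a left join B b on a.id = b.id;
select a.id, a.name, b.phone from A a right join B b on a.id = b.id;
select a.id, a.name, b.phone from A a full outer join B b on a.id = b.id;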
1. order by: global ordering; setting multiple reducers has little effect, since the final sort runs in a single reducer
select * from emp order by sal;
2. sort by: local ordering; each reducer's output is sorted. Set the reducer count to 4 (see the note after the command) and export the result to the local filesystem:
insert overwrite local directory '/opt/datas/emp_sort' row format delimited fields terminated by "\t" select * from emp sort by sal;
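The statement itself does not set the reducer count; to actually produce four sorted output files, set it first (mapreduce.job.reduces is the MRv2 property name; older releases use mapred.reduce.tasks):
set mapreduce.job.reduces=4;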
3. distribute by (MR partitioning underneath)
-》partitions rows by the specified column's value
-》partitions first, then sorts; usually combined with sort by
insert overwrite local directory '/opt/datas/emp_dist' row format delimited fields terminated by "\t" select * from emp distribute by deptno sort by sal;
4. cluster by: equivalent to combining distribute by + sort by on the same column
-》the sort order is fixed (ascending); you cannot specify desc or asc
-》rows with the same salary land together; with this data set, equal salaries are rare
insert overwrite local directory '/opt/datas/emp_cluster' row format delimited fields terminated by "\t" select * from emp cluster by sal;
**
22. Hive and MapReduce runtime parameters
**
Set the amount of data each reducer processes (in bytes):
set hive.exec.reducers.bytes.per.reducer=N;
hive.exec.reducers.bytes.per.reducer = 256000000 (default)
Size per reducer. The default is 256MB, i.e. if the input size is 1GB, 4 reducers will be used.
Set the maximum number of reducers (default 1009):
set hive.exec.reducers.max=N;
hive.exec.reducers.max = 1009 (default)
Max number of reducers that will be used. If the value specified in the configuration parameter mapred.reduce.tasks is negative, Hive will use this as the max number of reducers when automatically determining the number of reducers.
Remote metastore (the hive.metastore.uris property): Thrift URI for the remote metastore, used by the metastore client to connect to the remote metastore.
Start the service: bin/hive --service metastore &
Fetch mode
hive.fetch.task.conversion = more (default)
Expects one of [none, minimal, more].
Some select queries can be converted to a single FETCH task, minimizing latency.
Currently the query should be single sourced, not have any subquery, and should not have any aggregations or distincts (which incur RS), lateral views, or joins.
0. none: disable hive.fetch.task.conversion
1. minimal: SELECT STAR, FILTER on partition columns, LIMIT only
2. more: SELECT, FILTER, LIMIT only (supports TABLESAMPLE and virtual columns)
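A quick way to see the difference on the emp table used earlier (assuming it still exists): with none even a trivial query launches an MR job, while with more it is served as a direct fetch:
set hive.fetch.task.conversion=none;
select * from emp limit 5;
set hive.fetch.task.conversion=more;
select * from emp limit 5;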
load data local inpath '/opt/datas/2015082818' into table yhd_source;
load data local inpath '/opt/datas/2015082819' into table yhd_source;
2. 【Data cleansing】
create table yhd_qingxi(
id string,
url string,
guid string,
date string,
hour string
)
row format delimited fields terminated by '\t';
insert into table yhd_qingxi select id,url,guid,substring(trackTime,9,2) date,substring(trackTime,12,2) hour from yhd_source;
select id,url,guid,date,hour from yhd_qingxi;
Create a partitioned table, partitioned by the time fields:
create table yhd_part(
id string,
url string,
guid string
)partitioned by (date string,hour string)
row format delimited fields terminated by '\t';
insert into table yhd_part partition(date='20150828',hour='18')
select id,url,guid from yhd_qingxi where date='28' and hour='18';
insert into table yhd_part partition(date='20150828',hour='19')
select id,url,guid from yhd_qingxi where date='28' and hour='19';
select id,url,guid from yhd_part where date='20150828' and hour='18';
select id,url,guid from yhd_part where date='20150828' and hour='19';
Dynamic partitioning:
Whether dynamic partitioning is enabled:
hive.exec.dynamic.partition = true (default)
Whether or not to allow dynamic partitions in DML/DDL.
Maximum total number of dynamic partitions:
hive.exec.max.dynamic.partitions = 1000 (default)
Maximum number of dynamic partitions allowed to be created in total.
Maximum number of dynamic partitions per node:
hive.exec.max.dynamic.partitions.pernode = 100 (default)
Maximum number of dynamic partitions allowed to be created in each mapper/reducer node.
Using dynamic partitioning requires switching the mode to nonstrict:
hive.exec.dynamic.partition.mode = strict (default)
In strict mode, the user must specify at least one static partition in case the user accidentally overwrites all partitions.
Set it in the session:
set hive.exec.dynamic.partition.mode=nonstrict;
create table yhd_part2(
id string,
url string,
guid string
)partitioned by (date string,hour string)
row format delimited fields terminated by '\t';
The fixed, hard-coded form:
insert into table yhd_part partition(date='20150828',hour='18')
select id,url,guid from yhd_qingxi where date='28' and hour='18';
Dynamic partitioning (much more flexible):
insert into table yhd_part2 partition(date,hour) select * from yhd_qingxi;
select id,url,guid from yhd_part2 where date='20150828' and hour='18';
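With dynamic partitioning, the partition columns must be the last columns of the select list; select * works here only because date and hour are the last columns of yhd_qingxi. The explicit equivalent:
insert into table yhd_part2 partition(date,hour) select id,url,guid,date,hour from yhd_qingxi;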
3. Data analysis
PV (page views):
select date,hour,count(url) pv from yhd_part group by date,hour;
Result:
+-----------+-------+--------+
|   date    | hour  |   pv   |
+-----------+-------+--------+
| 20150828  | 18    | 64972  |
| 20150828  | 19    | 61162  |
+-----------+-------+--------+
UV (unique visitors):
select date,hour,count(distinct guid) uv from yhd_part group by date,hour;
Result:
+-----------+-------+--------+
|   date    | hour  |   uv   |
+-----------+-------+--------+
| 20150828  | 18    | 23938  |
| 20150828  | 19    | 22330  |
+-----------+-------+--------+
Load the final results into a result table:
create table result as select date,hour,count(url) pv,count(distinct guid) uv from yhd_part group by date,hour;
Result:
+--------------+--------------+------------+------------+
| result.date  | result.hour  | result.pv  | result.uv  |
+--------------+--------------+------------+------------+
| 20150828     | 18           | 64972      | 23938      |
| 20150828     | 19           | 61162      | 22330      |
+--------------+--------------+------------+------------+
load data local inpath '${hiveconf:logdir}/${hiveconf:file_path}' into table load_log.load_tab
partition(dateTime='${hiveconf:DAY}',hour='${hiveconf:HOUR}');
1. Rename load_to_hive.sh to load_to_hive_file.sh.
2. Create a load.sql and copy in the statement that followed -e (then adjust it).
For the variables, parameters can be passed with --hiveconf:
load data local inpath '${hiveconf:logdir}/${hiveconf:file_path}' into table load_hive.load_h partition(date='${hiveconf:DAY}',hour='${hiveconf:HOUR}');
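The script then only needs to call the file with -f and pass each variable via --hiveconf (both are standard hive CLI options; the paths and values below are illustrative):
bin/hive --hiveconf logdir=/opt/datas --hiveconf file_path=2015082818 --hiveconf DAY=20150828 --hiveconf HOUR=18 -f load.sql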
**
6. JVM reuse
mapreduce.job.jvm.numtasks defaults to 1: each task of a job runs in its own JVM; raising the value lets one JVM run several tasks of the same job in sequence.
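For example, to let each JVM be reused for up to 10 tasks (this is the property named above; note JVM reuse is an MRv1-era optimization and is ignored by some YARN versions):
set mapreduce.job.jvm.numtasks=10;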
7. Speculative execution:
Drawbacks: it consumes extra resources and is generally not recommended; duplicate attempts may write the same data twice and cause inconsistencies.
8. Hive local mode:
hive.exec.mode.local.auto defaults to false.
Hive normally runs on the Hadoop cluster's resources, scheduled by YARN; in local mode the job does not run across all machines of the cluster but on one machine chosen to run locally, which is usually fast for small data volumes.
Limits: the job input must not exceed 128MB, the number of map tasks must not exceed 4, and the default number of reduce tasks is 1.
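Enable it per session with the property named above:
set hive.exec.mode.local.auto=true;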
Test:
create table result4 as
select
date date,
sum(pv) PV,
count(distinct guid) UV,
count(distinct case when length(user_id)!=0 then guid else null end) login_user,
count(distinct case when length(user_id)=0 then guid else null end) visitor,
avg(stay_time) avg_time,
(count(case when pv>=2 then session_id else null end)/count(session_id)) session_jump,
count(distinct ip) IP
from session_info
where date='2015082818'
group by date;
hive.auto.convert.join = true (default)
Whether Hive enables the optimization about converting common join into mapjoin based on the input file size.
Threshold for triggering a map join, default 10MB:
hive.auto.convert.join.noconditionaltask.size = 10000000 (default)
If hive.auto.convert.join.noconditionaltask is off, this parameter does not take effect. However, if it is on, and the sum of sizes for n-1 of the tables/partitions of an n-way join is smaller than this size, the join is directly converted to a mapjoin (there is no conditional task). The default is 10MB.
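Both knobs can be adjusted per session; the values shown here are the defaults quoted above:
set hive.auto.convert.join=true;
set hive.auto.convert.join.noconditionaltask.size=10000000;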