select name,orderdate,cost,sum(cost) over(partition bymonth(orderdate))
from t_window
执行结果如下:
name orderdate cost sum_window_0
jack 2015-01-0110205
jack 2015-01-0855205
tony 2015-01-0750205
jack 2015-01-0546205
tony 2015-01-0429205
tony 2015-01-0215205
jack 2015-02-032323
mart 2015-04-1394341
jack 2015-04-0642341
mart 2015-04-1175341
mart 2015-04-0968341
mart 2015-04-0862341
neil 2015-05-101212
neil 2015-06-128080
可以看出数据已经按照月进行汇总了.
order by子句
上述的场景,假如我们想要将cost按照月进行累加.这时我们引入order by子句.
order by子句会让输入的数据强制排序(文章前面提到过,窗口函数是SQL语句最后执行的函数,因此可以把SQL结果集想象成输入数据)。Order By子句对于诸如Row_Number(),Lead(),LAG()等函数是必须的,因为如果数据无序,这些函数的结果就没有任何意义。因此如果有了Order By子句,则Count(),Min()等计算出来的结果就没有任何意义。
我们在上面的代码中加入order by
select name,orderdate,cost,sum(cost) over(partition bymonth(orderdate) orderby orderdate )
from t_window
得到的结果如下:(order by默认情况下聚合从起始行当当前行的数据)
name orderdate cost sum_window_0
jack 2015-01-011010
tony 2015-01-021525
tony 2015-01-042954
jack 2015-01-0546100
tony 2015-01-0750150
jack 2015-01-0855205
jack 2015-02-032323
jack 2015-04-064242
mart 2015-04-0862104
mart 2015-04-0968172
mart 2015-04-1175247
mart 2015-04-1394341
neil 2015-05-101212
neil 2015-06-128080
select name,orderdate,cost,
sum(cost) over() as sample1,--所有行相加
sum(cost) over(partition by name) as sample2,--按name分组,组内数据相加
sum(cost) over(partition by name orderby orderdate) as sample3,--按name分组,组内数据累加
sum(cost) over(partition by name orderby orderdate rows between UNBOUNDED PRECEDING andcurrentrow ) as sample4 ,--和sample3一样,由起点到当前行的聚合
sum(cost) over(partition by name orderby orderdate rows between 1 PRECEDING andcurrentrow) as sample5, --当前行和前面一行做聚合
sum(cost) over(partition by name orderby orderdate rows between 1 PRECEDING AND1 FOLLOWING ) as sample6,--当前行和前边一行及后面一行
sum(cost) over(partition by name orderby orderdate rows between currentrowand UNBOUNDED FOLLOWING ) as sample7 --当前行及后面所有行
from t_window;
得到查询结果如下:
name orderdate cost sample1 sample2 sample3 sample4 sample5 sample6 sample7
jack 2015-01-011066117610101056176
jack 2015-01-0546661176565656111166
jack 2015-01-0855661176111111101124120
jack 2015-02-03236611761341347812065
jack 2015-04-0642661176176176656542
mart 2015-04-0862661299626262130299
mart 2015-04-0968661299130130130205237
mart 2015-04-1175661299205205143237169
mart 2015-04-139466129929929916916994
neil 2015-05-1012661921212129292
neil 2015-06-1280661929292929280
tony 2015-01-0215661941515154494
tony 2015-01-0429661944444449479
tony 2015-01-0750661949494797950
窗口函数中的序列函数
主要序列函数是不支持window子句的.
hive中常用的序列函数有下面几个:
NTILE
NTILE(n),用于将分组数据按照顺序切分成n片,返回当前切片值
NTILE不支持ROWS BETWEEN,
比如 NTILE(2) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW)
如果切片不均匀,默认增加第一个切片的分布
这个函数用什么应用场景呢假如我们想要每位顾客购买金额前1/3的交易记录,我们便可以使用这个函数.
select name,orderdate,cost,
ntile(3) over() as sample1 , --全局数据切片
ntile(3) over(partition byname), -- 按照name进行分组,在分组内将数据切成3份
ntile(3) over(order by cost),--全局按照cost升序排列,数据切成3份
ntile(3) over(partition byname order by cost ) --按照name分组,在分组内按照cost升序排列,数据切成3份from t_window
得到的数据如下:
name orderdate cost sample1 sample2 sample3 sample4
jack 2015-01-01103111
jack 2015-02-03233111
jack 2015-04-06422222
jack 2015-01-05462222
jack 2015-01-08552323
mart 2015-04-08622121
mart 2015-04-09681231
mart 2015-04-11751332
mart 2015-04-13941133
neil 2015-05-10121211
neil 2015-06-12801132
tony 2015-01-02153211
tony 2015-01-04293312
tony 2015-01-07502123
select name,orderdate,cost,
lag(orderdate,1,'1900-01-01') over(partition by name orderby orderdate ) as time1,
lag(orderdate,2) over (partition by name orderby orderdate) as time2
from t_window;
查询后的数据为:
name orderdate cost time1 time2
jack 2015-01-01101900-01-01NULL
jack 2015-01-05462015-01-01NULL
jack 2015-01-08552015-01-052015-01-01
jack 2015-02-03232015-01-082015-01-05
jack 2015-04-06422015-02-032015-01-08
mart 2015-04-08621900-01-01NULL
mart 2015-04-09682015-04-08NULL
mart 2015-04-11752015-04-092015-04-08
mart 2015-04-13942015-04-112015-04-09
neil 2015-05-10121900-01-01NULL
neil 2015-06-12802015-05-10NULL
tony 2015-01-02151900-01-01NULL
tony 2015-01-04292015-01-02NULL
tony 2015-01-07502015-01-042015-01-02
select name,orderdate,cost,
first_value(orderdate) over(partition byname order by orderdate) as time1,
last_value(orderdate) over(partition byname order by orderdate) as time2
from t_window
查询结果如下:
name orderdate cost time1 time2
jack 2015-01-01102015-01-012015-01-01
jack 2015-01-05462015-01-012015-01-05
jack 2015-01-08552015-01-012015-01-08
jack 2015-02-03232015-01-012015-02-03
jack 2015-04-06422015-01-012015-04-06
mart 2015-04-08622015-04-082015-04-08
mart 2015-04-09682015-04-082015-04-09
mart 2015-04-11752015-04-082015-04-11
mart 2015-04-13942015-04-082015-04-13
neil 2015-05-10122015-05-102015-05-10
neil 2015-06-12802015-05-102015-06-12
tony 2015-01-02152015-01-022015-01-02
tony 2015-01-04292015-01-022015-01-04
tony 2015-01-07502015-01-022015-01-07