Hive如何实现 count(distinct ) over (partition by )？

使用 size(collect_set(字段) over(partition by ...)) 来实现:

select * from (
    select o.*,
        -- count(distinct package_id) over(partition by user_id) cnt   报错
        size(collect_set(package_id) over(partition by user_id)) cnt
    from o
) a

雾岛与鲸

17249人浏览 · 2020-08-27 16:04:44

雾岛与鲸 · 2020-08-27 16:04:44 发布

一、方式1:

count(distinct ) over(partition by order by) 替换成 size(collect_set() over(partition by order by)) 来实现, 含义为求分组后的去重个数。

测试数据：

-- Sample fixture: five (id, m, k) rows, all string-typed.
--   id = person identifier, m = month as yyyyMM, k = an additional grouping key.
-- Every branch repeats the column aliases so the UNION ALL branches expose the same names.
CREATE TABLE test_distinct AS
SELECT
    '1'      AS id,
    '201808' AS m,
    'a'      AS k
UNION ALL
SELECT
    '2'      AS id,
    '201808' AS m,
    'a'      AS k
UNION ALL
SELECT
    '1'      AS id,
    '201809' AS m,
    'a'      AS k
UNION ALL
SELECT
    '1'      AS id,
    '201808' AS m,
    'b'      AS k
UNION ALL
SELECT
    '2'      AS id,
    '201809' AS m,
    'b'      AS k;

id代表人编号, m代表月份，k代表其他key键。

需求：本月累计人数（即9月份的客户要包含9月以前的客户数）

第一步:

    -- Side-by-side comparison of collect_set() over the default window frame and over an
    -- explicit ROWS frame, followed by the running distinct count derived from the latter.
    -- Alternatives noted by the original author:
    --   count(distinct id) over (partition by k) cnt                    -- marked as raising an error
    --   size(collect_set(id) over (partition by k order by m asc)) cnt  -- same count on the default frame
    -- The two collect_set() columns are deliberately left unaliased so that Hive's generated
    -- column names appear in the output.
    select
        test_distinct.*,
        collect_set(id) over (
            partition by k
            order by m asc
        ),
        collect_set(id) over (
            partition by k
            order by m asc
            rows between unbounded preceding and current row
        ),
        size(
            collect_set(id) over (
                partition by k
                order by m asc
                rows between unbounded preceding and current row
            )
        ) as cnt
    from test_distinct;

结果:
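发现在 k=a 且 m=201808 时，_wcol0 的值是 ["2", "1"]，而 _wcol1 在其中排在前面的那一行的值只有 ["2"]；在 k=b 且 m=201808 时，两列的值都是 ["1"]。这就是窗口加不加
rows BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW 的区别：不加时，带 ORDER BY 的窗口默认是 RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW，排序键 m 相同的行会一起计入；加上后则只累计到当前行为止。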
第一步结果

第二步：

每个 (k, m) 分组内只需要保留 cnt 最大的那一条数据（即该月最后一行的累计去重人数）就可以了

-- Cumulative distinct id count per (k, m).
-- Innermost query: running distinct count of id within each k, ordered by month. The explicit
--   ROWS frame makes the count grow row by row, so rows of the same month can carry
--   different intermediate values.
-- Middle query: rank the rows of each (k, m) by that running count, largest first.
-- Outer query: keep the top-ranked row, i.e. the distinct count accumulated through month m.
-- Output columns: k, m, rk (always 1 after the filter), cnt (cumulative distinct ids).
select
    k,
    m,
    rk,
    cnt
from (
    select
        k,
        m,
        cnt,
        row_number() over (partition by k, m order by cnt desc) as rk
    from (
        select
            k,
            m,
            size(
                collect_set(id) over (
                    partition by k
                    order by m asc
                    rows between unbounded preceding and current row
                )
            ) as cnt
        from test_distinct
    ) temp
) tb
where rk = 1;

结果:
第二步结果

二、方式2:

使用笛卡尔积实现:

第一步:

-- flag = 1 marks the earliest month in which an id appears within a given k.
-- The window is partitioned by (k, id) so that ordering by m is meaningful.
select
    k,
    m,
    row_number() over (partition by k, id order by m asc) as flag
from test_distinct;

第二步：

-- New and cumulative distinct ids per (k, m), computed with a self-join instead of collect_set().
-- Output columns: k, m, ins (ids first seen in month m), total (ids first seen in any month <= m).
with first_seen as (
    -- flag = 1 marks the earliest month in which an id appears within a k.
    select
        k,
        m,
        row_number() over (partition by k, id order by m asc) as flag
    from test_distinct
),
new_ids as (
    -- Number of ids that appear for the first time in each (k, m).
    select
        k,
        m,
        count(*) as cnt
    from first_seen
    where flag = 1
    group by k, m
),
months as (
    -- Every (k, m) present in the data, so months without any new id are still reported.
    select
        k,
        m
    from test_distinct
    group by k, m
)
select
    t1.k,
    t1.m,
    sum(case when t2.m = t1.m then t2.cnt else 0 end) as ins,  -- new in the current month
    sum(t2.cnt) as total                                       -- cumulative up to and including the current month
from months t1
join new_ids t2
    on t1.k = t2.k          -- accumulate within the same k only
where t1.m >= t2.m          -- range condition kept in WHERE; the join key itself is an equality
group by t1.k, t1.m;