0 问题描述
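现有一个用户活跃表user_active(user_id, active_date)和一个用户注册表user_regist(user_id, regist_date),两张表的分区字段均为dt(格式为yyyy-MM-dd),用户字段均为user_id。请设计一张1~180天的注册活跃留存表,表结构如下(即下文最终查询的输出列):regist_date(注册日期)、diff(活跃日期距注册日期的天数,取值1~180)、active_user_cnt(该留存周期下的活跃用户数)、retention_rate(留存率)。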
1 数据分析
完整的代码如下:
-- Registration-cohort retention for day offsets 1..180.
-- Emits one row per (regist_date, diff) with the number of matched activity
-- rows and that number divided by the cohort's registration count.
with regist_cohort as (
    -- Registrations from the last 180 days of dt partitions, each row tagged
    -- with how many rows share the same registration day.
    select
        user_id,
        to_date(regist_date) as regist_date,
        count(user_id) over (partition by to_date(regist_date)) as regist_count
    from user_regist
    where dt >= date_sub(current_date(), 180)
),
active_days as (
    -- Collapse repeated activity so each (user, day) pair appears once.
    select
        user_id,
        to_date(active_date) as active_date
    from user_active
    where dt >= date_sub(current_date(), 180)
    group by
        user_id,
        to_date(active_date)
),
retention_counts as (
    select
        reg.regist_date,
        -- regist_count is constant within a registration day; max() just
        -- carries it through the aggregation.
        max(reg.regist_count) as regis_cnt,
        datediff(act.active_date, reg.regist_date) as diff,
        count(act.user_id) as active_user_cnt
    from regist_cohort reg
    left join active_days act
        on reg.user_id = act.user_id
    -- NOTE(review): this filter rejects the NULL rows produced for users with
    -- no activity, so the LEFT JOIN returns the same rows an INNER JOIN would;
    -- (regist_date, diff) pairs with no activity produce no output row.
    where datediff(act.active_date, reg.regist_date) >= 1
      and datediff(act.active_date, reg.regist_date) <= 180
    group by
        reg.regist_date,
        datediff(act.active_date, reg.regist_date)
)
select
    regist_date,
    diff,
    active_user_cnt,
    -- Yields NULL instead of dividing when the denominator is NULL or zero.
    case
        when nvl(regis_cnt, 0) != 0 then active_user_cnt / regis_cnt
    end as retention_rate
from retention_counts t3
order by
    regist_date,
    diff;
上述代码解析:
步骤一:基于注册表,求出用户的注册日期regist_date、每日的用户注册数量regist_count
-- Step 1: for every registration row in the last 180 days of dt partitions,
-- return the user, the registration day, and the number of registration rows
-- that fall on that same day.
with regist_day as (
    -- Normalise regist_date to a calendar day once, so the window below can
    -- partition on the derived column.
    select
        user_id,
        to_date(regist_date) as regist_date
    from user_regist
    where dt >= date_sub(current_date(), 180)
)
select
    user_id,
    regist_date,
    count(user_id) over (partition by regist_date) as regist_count
from regist_day;
步骤二:将用户注册表作为主表,通过left join关联活跃表,关联键为user_id。两表是一对多的关系:同一个用户的注册记录会与其每一个活跃日期各生成一行(即在同一用户内部形成笛卡尔积)。需要注意:活跃用户表中,每个用户每天可能有多条活跃记录,因此关联前需要先按user_id和活跃日期去重。
-- Step 2: pair every registration row with each distinct day on which the
-- same user was active, and compute the day offset between the two dates.
with regist_cohort as (
    -- Registrations from the last 180 days of dt partitions, with the
    -- per-day registration row count attached.
    select
        user_id,
        to_date(regist_date) as regist_date,
        count(user_id) over (partition by to_date(regist_date)) as regist_count
    from user_regist
    where dt >= date_sub(current_date(), 180)
),
active_days as (
    -- Group by (user, day) so repeated activity on one day yields one row.
    select
        user_id,
        to_date(active_date) as active_date
    from user_active
    where dt >= date_sub(current_date(), 180)
    group by
        user_id,
        to_date(active_date)
)
-- NOTE(review): both t1.user_id and t2.user_id are emitted under the same
-- column name; alias them if this result is ever wrapped in a subquery or
-- written to a table.
select
    t1.regist_date,
    t1.user_id,
    t1.regist_count,
    t2.user_id,
    t2.active_date,
    -- NULL for users with no matching activity row (left join keeps them).
    datediff(t2.active_date, t1.regist_date) as diff
from regist_cohort t1
left join active_days t2
    on t1.user_id = t2.user_id;
步骤三:按注册日期和留存周期(以“天”为单位,只保留1~180天)分组,计算每个留存周期下的活跃用户数。
-- Step 3: count matched activity rows per (registration day, day offset),
-- keeping only offsets from 1 to 180 days.
with regist_cohort as (
    -- Registrations from the last 180 days of dt partitions, with the
    -- per-day registration row count attached.
    select
        user_id,
        to_date(regist_date) as regist_date,
        count(user_id) over (partition by to_date(regist_date)) as regist_count
    from user_regist
    where dt >= date_sub(current_date(), 180)
),
active_days as (
    -- One row per (user, active day).
    select
        user_id,
        to_date(active_date) as active_date
    from user_active
    where dt >= date_sub(current_date(), 180)
    group by
        user_id,
        to_date(active_date)
)
select
    t1.regist_date,
    -- regist_count is the same on every row of a registration day, so max()
    -- simply carries it through the aggregation.
    max(t1.regist_count) as regis_cnt,
    datediff(t2.active_date, t1.regist_date) as diff,
    count(t2.user_id) as active_user_cnt
from regist_cohort t1
left join active_days t2
    on t1.user_id = t2.user_id
-- NOTE(review): rows with no activity have a NULL offset and fail this
-- filter, so the LEFT JOIN behaves like an INNER JOIN here.
where datediff(t2.active_date, t1.regist_date) >= 1
  and datediff(t2.active_date, t1.regist_date) <= 180
group by
    t1.regist_date,
    datediff(t2.active_date, t1.regist_date);
步骤四:计算留存率retention_rate,即该留存周期下的活跃用户数active_user_cnt除以对应注册日期的注册用户数regis_cnt。
-- Step 4: turn the per-(regist_date, diff) counts into a retention rate by
-- dividing the activity count by the registration count of that day.
with regist_cohort as (
    -- Registrations from the last 180 days of dt partitions, with the
    -- per-day registration row count attached.
    select
        user_id,
        to_date(regist_date) as regist_date,
        count(user_id) over (partition by to_date(regist_date)) as regist_count
    from user_regist
    where dt >= date_sub(current_date(), 180)
),
active_days as (
    -- One row per (user, active day); duplicates within a day are collapsed.
    select
        user_id,
        to_date(active_date) as active_date
    from user_active
    where dt >= date_sub(current_date(), 180)
    group by
        user_id,
        to_date(active_date)
),
retention_counts as (
    select
        reg.regist_date,
        max(reg.regist_count) as regis_cnt,
        datediff(act.active_date, reg.regist_date) as diff,
        count(act.user_id) as active_user_cnt
    from regist_cohort reg
    left join active_days act
        on reg.user_id = act.user_id
    -- NOTE(review): the offset is NULL for users without activity, so these
    -- predicates drop them and the LEFT JOIN acts as an INNER JOIN.
    where datediff(act.active_date, reg.regist_date) >= 1
      and datediff(act.active_date, reg.regist_date) <= 180
    group by
        reg.regist_date,
        datediff(act.active_date, reg.regist_date)
)
select
    regist_date,
    diff,
    active_user_cnt,
    -- No ELSE branch: the rate is NULL when regis_cnt is NULL or zero.
    case
        when nvl(regis_cnt, 0) != 0 then active_user_cnt / regis_cnt
    end as retention_rate
from retention_counts t3
order by
    regist_date,
    diff;
2 总结
利用left join将注册表与去重后的活跃表按user_id关联(一对多,在同一用户内部形成笛卡尔积),再按注册日期和留存天数分组统计,即可设计出最近180天的注册活跃留存表。