HIVE解决连续登录问题
目录
HIVE解决连续登录问题
1.解决连续登录问题
如何去分析数据:
2.需求:
3.-- 间隔天数
1.解决连续登录问题
如何去分析数据:
1)查看数据的字段信息
2)需要查看每一行数据代表什么意思
create table learn5.deal_tb(
id string
,datestr string
,amount string
)row format delimited fields terminated by ',';
load data local inpath "/usr/local/soft/hive-3.1.2/data/deal_tb.txt" into table learn5.deal_tb;
2.需求:
统计用户
①连续交易的总额、
②连续登陆天数、
③连续登陆开始和结束时间、
④间隔天数等
连续交易的总额分析:
①什么是连续交易:用户每天都在交易
WITH sum_amount_table AS(
SELECT
T.id
,T.datestr
,SUM(amount) as sum_amount
FROM learn5.deal_tb T
GROUP BY T.id,T.datestr
)
SELECT
id
,datestr
,sum_amount
, ROW_NUMBER() OVER(PARTITION BY id ORDER BY datestr) days
FROM sum_amount_table
+-----+-------------+-------------+-------+
| id | datestr | sum_amount | days |
+-----+-------------+-------------+-------+
| 1 | 2019-02-08 | 12461.55 | 1 |
| 1 | 2019-02-09 | 1052.99 | 2 |
| 1 | 2019-02-10 | 85.69 | 3 |
| 1 | 2019-02-12 | 769.85 | 4 |
| 1 | 2019-02-13 | 943.86 | 5 |
| 1 | 2019-02-14 | 538.42 | 6 |
| 1 | 2019-02-15 | 369.76 | 7 |
| 1 | 2019-02-16 | 369.76 | 8 |
| 1 | 2019-02-18 | 795.15 | 9 |
| 1 | 2019-02-19 | 715.65 | 10 |
| 1 | 2019-02-21 | 537.71 | 11 |
| 2 | 2019-02-08 | 12461.55 | 1 |
| 2 | 2019-02-09 | 1052.99 | 2 |
| 2 | 2019-02-10 | 85.69 | 3 |
| 2 | 2019-02-12 | 769.85 | 4 |
| 2 | 2019-02-13 | 943.86 | 5 |
| 2 | 2019-02-14 | 943.18 | 6 |
| 2 | 2019-02-15 | 369.76 | 7 |
| 2 | 2019-02-18 | 795.15 | 8 |
| 2 | 2019-02-19 | 715.65 | 9 |
| 2 | 2019-02-21 | 537.71 | 10 |
| 3 | 2019-02-08 | 12461.55 | 1 |
| 3 | 2019-02-09 | 1052.99 | 2 |
| 3 | 2019-02-10 | 85.69 | 3 |
| 3 | 2019-02-12 | 769.85 | 4 |
| 3 | 2019-02-13 | 943.86 | 5 |
| 3 | 2019-02-14 | 276.81 | 6 |
| 3 | 2019-02-15 | 369.76 | 7 |
| 3 | 2019-02-16 | 369.76 | 8 |
| 3 | 2019-02-18 | 795.15 | 9 |
| 3 | 2019-02-19 | 715.65 | 10 |
| 3 | 2019-02-21 | 537.71 | 11 |
+-----+-------------+-------------+-------+
我们创建一个表格 将查询语句的结果数据插入到里面 方便查询
CREATE TABLE continue_res (
id string
,start_day STRING
, continue_amount STRING
, continue_days STRING
, continue_start STRING
, continue_end STRING
)
WITH sum_amount_table AS(
SELECT
T.id
,T.datestr
,SUM(amount) as sum_amount
FROM learn5.deal_tb T
GROUP BY T.id,T.datestr
)
, days_table AS (
SELECT
id
,datestr
,sum_amount
, ROW_NUMBER() OVER(PARTITION BY id ORDER BY datestr) days
FROM sum_amount_table
)
, date_sub_table AS(
SELECT
id
,datestr
,sum_amount
,days
,date_sub(datestr,days) start_day --通过日期减去排序得到日期开始的天
FROM days_table
)
--连续登陆天数、连续登陆开始和结束时间
INSERT INTO TABLE continue_res
SELECT
id
,start_day
,sum(sum_amount) as continue_amount -- 通过对起始天数一致的数据进行分组统计得到 连续交易的总额
,count(*) AS continue_days
,min(datestr) AS continue_start
,max(datestr) as continue_end
FROM date_sub_table
GROUP BY id,start_day
select * from continue_res;
SELECT
id
,continue_end
,LEAD(continue_start,1,current_date) OVER(PARTITION BY id ORDER BY start_day) as last_day
FROM continue_res
+-----+-------------+---------------------+----------------+-----------------+-------------- -+
| id | start_day | continue_amount | continue_days | continue_start | continue_end |
+-----+-------------+---------------------+----------------+-----------------+-------------- -+
| 1 | 2019-02-07 | 13600.23 | 3 | 2019-02-08 | 2019-02-10 |
| 1 | 2019-02-08 | 2991.6500000000005 | 5 | 2019-02-12 | 2019-02-16 |
| 1 | 2019-02-09 | 1510.8 | 2 | 2019-02-18 | 2019-02-19 |
| 1 | 2019-02-10 | 537.71 | 1 | 2019-02-21 | 2019-02-21 |
| 2 | 2019-02-07 | 13600.23 | 3 | 2019-02-08 | 2019-02-10 |
| 2 | 2019-02-08 | 3026.6499999999996 | 4 | 2019-02-12 | 2019-02-15 |
| 2 | 2019-02-10 | 1510.8 | 2 | 2019-02-18 | 2019-02-19 |
| 2 | 2019-02-11 | 537.71 | 1 | 2019-02-21 | 2019-02-21 |
| 3 | 2019-02-07 | 13600.23 | 3 | 2019-02-08 | 2019-02-10 |
| 3 | 2019-02-08 | 2730.04 | 5 | 2019-02-12 | 2019-02-16 |
| 3 | 2019-02-09 | 1510.8 | 2 | 2019-02-18 | 2019-02-19 |
| 3 | 2019-02-10 | 537.71 | 1 | 2019-02-21 | 2019-02-21 |
+-----+-------------+---------------------+----------------+-----------------+-------------- -+
3.-- 间隔天数
LAG(col,n,default_val):往前第n行数据
LEAD(col,n, default_val):往后第n行数据
SELECT
T1.id
,T1.continue_end
,T1.last_day
,datediff(T1.last_day,T1.continue_end)
FROM (
SELECT
id
,continue_end
,LEAD(continue_start,1,"2022-04-15") OVER(PARTITION BY id ORDER BY start_day) as last_day
FROM learn5.continue_res
) T1