TAG
在传统数仓场景中,从传统数据库中导入的事实表数据一般是全量导入,按天分区每天都存储一份全量数据,paimon对此提供了Tag机制,创建TAG时,会对当前数据做一份全量快照,在之后对表的数据进行更新也不会影响已经打完TAG的数据。
其实就是对当前的数据做镜像,或者说做备份
维护创建
https://paimon.apache.org/docs/0.9/maintenance/manage-tags/
-- 创建
bin/flink run \
lib/paimon-flink-action-0.9.0.jar \
create_tag \
--warehouse file:///data/soft/paimon/catalog \
--database default \
--table t_tags \
--tag_name first_tag
-- 删除
bin/flink run \
lib/paimon-flink-action-0.9.0.jar \
delete_tag \
--warehouse file:///data/soft/paimon/catalog \
--database default \
--table t_tags \
--tag_name first_tag
-- 回滚到指定tag版本,还原当时的数据
bin/flink run \
lib/paimon-flink-action-0.9.0.jar \
rollback_to \
--warehouse file:///data/soft/paimon/catalog \
--database default \
--table t_tags \
--version first_tag
使用TAG
CREATE TABLE t_tags (
age BIGINT,
money BIGINT,
id STRING,
PRIMARY KEY (id) NOT ENFORCED
);
insert into t_tags values(10,1000,'1');
-- 创建第一个tag
bin/flink run \
lib/paimon-flink-action-0.9.0.jar \
create_tag \
--warehouse file:///data/soft/paimon/catalog \
--database default \
--table t_tags \
--tag_name first_tag
insert into t_tags values(20,2000,'2');
bin/flink run \
lib/paimon-flink-action-0.9.0.jar \
create_tag \
--warehouse file:///data/soft/paimon/catalog \
--database default \
--table t_tags \
--tag_name second_tag
select * from t_tags$tags;
Flink SQL> select * from t_tags$tags;
+------------+-------------+-----------+-------------------------+--------------+-------------+---------------+
| tag_name | snapshot_id | schema_id | commit_time | record_count | create_time | time_retained |
+------------+-------------+-----------+-------------------------+--------------+-------------+---------------+
| first_tag | 1 | 0 | 2024-12-19 15:05:18.802 | 1 | <NULL> | <NULL> |
| second_tag | 2 | 0 | 2024-12-19 15:08:14.165 | 2 | <NULL> | <NULL> |
+------------+-------------+-----------+-------------------------+--------------+-------------+---------------+
Flink SQL> select * from t_tags;
+-----+-------+----+
| age | money | id |
+-----+-------+----+
| 10 | 1000 | 1 |
| 20 | 2000 | 2 |
+-----+-------+----+
Flink SQL> select * from t_tags/*+ OPTIONS('scan.tag-name' = 'first_tag') */;
+-----+-------+----+
| age | money | id |
+-----+-------+----+
| 10 | 1000 | 1 |
+-----+-------+----+
1 row in set
-- 修改tag中数据
insert into t_tags_auto values(20,1000,'1');
Flink SQL> select * from t_tags;
+-----+-------+----+
| age | money | id |
+-----+-------+----+
| 20 | 1000 | 1 |
| 20 | 2000 | 2 |
+-----+-------+----+
2 rows in set
-- 查询tag,发现没有影响
Flink SQL> select * from t_tags/*+ OPTIONS('scan.tag-name' = 'first_tag') */;
+-----+-------+----+
| age | money | id |
+-----+-------+----+
| 10 | 1000 | 1 |
+-----+-------+----+
1 row in set
自动创建TAG
-- Flink SQL
CREATE TABLE t (
k INT PRIMARY KEY NOT ENFORCED,
f0 INT,
...
) WITH (
'tag.automatic-creation' = 'process-time', -- 时间模式 watermark/batch
'tag.creation-period' = 'daily', -- 循环周期 hourly/two-hour
'tag.creation-delay' = '10 m', -- 延迟触发时间
'tag.num-retained-max' = '90' -- tag保存90个,超出自动删除
);