-- Zipper table (SCD2) in practice
-- 1. Create a table to hold each day's incremental data
drop table test.stu_newdata;
CREATE TABLE test.stu_newdata (
    id          int,
    name        string,
    age         string,
    update_time bigint
)
partitioned by (dt string)
stored as parquet;
-- 2. Create a full daily snapshot table
drop table test.stu;
CREATE TABLE test.stu (
    id          int,
    name        string,
    age         string,
    update_time bigint
)
partitioned by (dt string)
stored as parquet;
-- 3. Create a zipper table
drop table test.stu_zip;
CREATE TABLE test.stu_zip (
    id          int,
    name        string,
    age         string,
    update_time bigint,
    start_dt    string,
    end_dt      string
)
partitioned by (dt string)
stored as parquet;
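-- How start_dt/end_dt are read (a hypothetical illustration, not data from these notes):
-- each business record keeps one row per version, and an open end_dt such as '9999-12-31'
-- marks the version that is still current. For an id whose name changed on 2020-09-03:
--   id=1, name='a' , start_dt='2020-09-01', end_dt='2020-09-02'   -- closed historical version
--   id=1, name='a2', start_dt='2020-09-03', end_dt='9999-12-31'   -- open current version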
-- Operations manual -- initial operation date: 2020-09-03
1. Use Sqoop for the initial import (2020-09-03) of the table from the business database
bin/sqoop import \
--connect jdbc:mysql://doitedu01:3306/realtimedw \
--username root \
--password ABC123abc.123 \
--table stu \
--target-dir '/sqoopdata/incr/stu/2020-09-03' \
--delete-target-dir \
--incremental lastmodified \
--check-column update_time \
--last-value '2020-09-01 00:00:00' \
--fields-terminated-by ',' \
--as-parquetfile \
-m 1
2. Load the newly imported incremental data into the Hive incremental table
load data inpath '/sqoopdata/incr/stu/2020-09-03' into table test.stu_newdata partition (dt='2020-09-03');
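Optional sanity check (not in the original notes): confirm the data landed in the expected partition.
select * from test.stu_newdata where dt='2020-09-03' limit 10;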
3. Generate the full table for 2020-09-03
bin/sqoop codegen \
--connect jdbc:mysql://doitedu01:3306/realtimedw \
--username root \
--password ABC123abc.123 \
--table stu \
--bindir /opt/apps/code/stu \
--class-name Stu \
--as-parquetfile
Simulate the full table's 2020-09-02 partition so there is something to merge onto:
insert into table test.stu partition (dt='2020-09-02')
select 0, 'xx', '10', 1598945495000;
Merge the incremental table's 2020-09-03 data onto the full table's 2020-09-02 data to produce the full table's 2020-09-03 data:
bin/sqoop merge \
--new-data /user/hive/warehouse/test.db/stu_newdata/dt=2020-09-03 \
--onto /user/hive/warehouse/test.db/stu/dt=2020-09-02 \
--target-dir /user/hive/warehouse/test.db/stu/dt=2020-09-03 \
--jar-file /opt/apps/code/stu/Stu.jar \
--class-name Stu \
--merge-key id
Register the merged data as the full table's 2020-09-03 partition:
hive> alter table test.stu add partition (dt='2020-09-03');
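Optional check (not in the original notes): confirm the new partition is registered before building the zipper table.
hive> show partitions test.stu;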
4. Using the 2020-09-03 increment and the 2020-09-02 zipper table, generate the 2020-09-03 zipper table (a sketch of the merge SQL follows).
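The notes stop at the step name, so the following is only a minimal HiveQL sketch of that merge. It assumes that end_dt = '9999-12-31' marks an open (current) record and that each dt partition of test.stu_zip holds the full zipper snapshot as of that day; neither convention is spelled out above.

insert overwrite table test.stu_zip partition (dt = '2020-09-03')
select * from (
    -- new or changed records from the 2020-09-03 increment open a new version
    select
        id, name, age, update_time,
        '2020-09-03' as start_dt,
        '9999-12-31' as end_dt
    from test.stu_newdata
    where dt = '2020-09-03'

    union all

    -- carry over yesterday's zipper rows, closing the open row of any id
    -- that also appears in today's increment
    select
        z.id, z.name, z.age, z.update_time,
        z.start_dt,
        case
            when z.end_dt = '9999-12-31' and n.id is not null then '2020-09-02'
            else z.end_dt
        end as end_dt
    from (select * from test.stu_zip where dt = '2020-09-02') z
    left join (select distinct id from test.stu_newdata where dt = '2020-09-03') n
        on z.id = n.id
) merged;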