Real-time data synchronization from SQL Server to ClickHouse with Flink CDC
Flink CDC finally supports SQL Server.
Not many internet companies use SQL Server these days; it mostly lives in older systems at state-owned enterprises. We used to sync data with DataX, but that cannot sync in real time. With Flink CDC, real-time synchronization is now possible.
1. SQL Server version: version 14 or above is required, i.e. SQL Server 2017.
2. Enable CDC on SQL Server. The specifics are easy to find online; one thing to watch out for is that the SQL Server Agent service must be running. A JDBC sketch of the required stored-procedure calls follows this list.
3. Implement a JSON parsing method that deserializes the change records streamed over and inserts them into the target database.
4. If you need to resume from a failure, configure checkpointing. Our hardware is limited, so we use RocksDBStateBackend and keep the state on local disk. If you have a big-data environment, FsStateBackend (file-system state backend on HDFS) is recommended so that the state is stored in HDFS.
5. For dimension-table joins, I put the dimension data in Redis; a lookup sketch also follows this list.
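For step 2, here is a minimal sketch of enabling CDC on the example inventory database and the dbo.products table over JDBC. It assumes the Microsoft mssql-jdbc driver is on the classpath (it is not in the pom below); the host and credentials are the same placeholders used in the Flink job.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class EnableSqlServerCdc {
    public static void main(String[] args) throws Exception {
        // Assumes com.microsoft.sqlserver:mssql-jdbc is on the classpath (not in the pom below).
        // Newer driver versions may additionally need ";encrypt=false" in the URL.
        String url = "jdbc:sqlserver://192.168.10.134:1433;databaseName=inventory";
        try (Connection conn = DriverManager.getConnection(url, "sa", "qwe123==");
             Statement stmt = conn.createStatement()) {
            // Enable CDC at the database level (requires sysadmin).
            stmt.execute("EXEC sys.sp_cdc_enable_db");
            // Enable CDC for the dbo.products table monitored by the Flink job (requires db_owner).
            stmt.execute("EXEC sys.sp_cdc_enable_table "
                    + "@source_schema = N'dbo', "
                    + "@source_name = N'products', "
                    + "@role_name = NULL");
        }
    }
}

The same two stored procedures can of course be run directly in SSMS; note that the capture job created by sp_cdc_enable_table only runs when the SQL Server Agent mentioned above is started.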
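For step 5, here is a minimal sketch of the Redis lookup, assuming the dimension data has already been loaded into Redis and that the Jedis client is added as an extra dependency (it is not in the pom below). The Redis host and the dim:product:<id> key layout are made-up placeholders.

import com.google.gson.Gson;
import com.google.gson.internal.LinkedTreeMap;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;
import redis.clients.jedis.Jedis;

import java.util.HashMap;

// Looks up a dimension value in Redis for every change record before it reaches the sink.
public class RedisDimLookup extends RichMapFunction<String, String> {

    private transient Jedis jedis;
    private transient Gson gson;

    @Override
    public void open(Configuration parameters) {
        // One connection per parallel subtask; switch to JedisPool if parallelism grows.
        jedis = new Jedis("192.168.10.62", 6379);
        gson = new Gson();
    }

    @Override
    public String map(String value) {
        HashMap<String, Object> record = gson.fromJson(value, HashMap.class);
        LinkedTreeMap<String, Object> after = (LinkedTreeMap<String, Object>) record.get("after");
        if (after != null) {
            int id = ((Double) after.get("id")).intValue();
            // Dimension data is assumed to be preloaded into Redis under "dim:product:<id>".
            String category = jedis.get("dim:product:" + id);
            after.put("category", category);
        }
        return gson.toJson(record);
    }

    @Override
    public void close() {
        if (jedis != null) {
            jedis.close();
        }
    }
}

It would sit between source and sink, e.g. env.addSource(sourceFunction).map(new RedisDimLookup()).addSink(new ClickHouseSink()).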
Here is the full project code:
1. pom.xml
<properties>
    <flink.version>1.13.5</flink.version>
    <scala.version>2.11</scala.version>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner-blink_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>com.ververica</groupId>
        <artifactId>flink-connector-sqlserver-cdc</artifactId>
        <!-- The dependency is available only for stable releases, SNAPSHOT dependency need build by yourself. -->
        <version>2.3-SNAPSHOT</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_2.11</artifactId>
        <version>${flink.version}</version>
        <!-- <scope>provided</scope> -->
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-scala_2.11</artifactId>
        <version>${flink.version}</version>
        <!-- <scope>provided</scope> -->
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-runtime-web_2.11</artifactId>
        <version>${flink.version}</version>
        <!-- <scope>provided</scope> -->
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-statebackend-rocksdb_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>joda-time</groupId>
        <artifactId>joda-time</artifactId>
        <version>2.7</version>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.2</version>
    </dependency>
    <dependency>
        <groupId>ru.yandex.clickhouse</groupId>
        <artifactId>clickhouse-jdbc</artifactId>
        <version>0.2.6</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>2.7.0</version>
    </dependency>
</dependencies>
2. The Flink job (SqlServerSourceExample)
package com.cmei.s2c;

import com.ververica.cdc.connectors.sqlserver.SqlServerSource;
import com.ververica.cdc.debezium.JsonDebeziumDeserializationSchema;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.contrib.streaming.state.RocksDBStateBackend;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.runtime.state.memory.MemoryStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

public class SqlServerSourceExample {

    public static void main(String[] args) throws Exception {
        SourceFunction<String> sourceFunction = SqlServerSource.<String>builder()
                .hostname("192.168.10.134")
                .port(1433)
                .database("inventory")      // monitored SQL Server database
                .tableList("dbo.products")  // monitored products table
                .username("sa")
                .password("qwe123==")
                .deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String
                .build();

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);

        // 2. Flink CDC stores the position it has read in the transaction log as state in the checkpoint.
        //    To resume after a failure, start the job from a checkpoint or savepoint.
        // 2.1 Enable checkpointing every 5 seconds
        env.enableCheckpointing(5000L);
        // 2.2 Checkpointing consistency semantics
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        // 2.3 Retain the last checkpoint when the job is cancelled
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
        // 2.4 Restart strategy used when recovering from a checkpoint
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, 2000L));
        // 2.5 Configure the state backend
        env.setStateBackend(new RocksDBStateBackend("file:///usr/local/flink-1.13.5/ck"));
        //env.setStateBackend(new MemoryStateBackend());                        // MemoryStateBackend (in-memory state backend)
        //env.setStateBackend(new FsStateBackend("hdfs://sc2:8020/flinkCDC"));  // FsStateBackend (file-system state backend on HDFS)
        // 2.6 User name used to access HDFS
        //System.setProperty("HADOOP_USER_NAME", "root");

        // use parallelism 1 for the sink to keep message ordering
        env.addSource(sourceFunction).addSink(new ClickHouseSink()).setParallelism(1);
        //env.addSource(sourceFunction).print().setParallelism(1);

        env.execute("1");
    }
}
3. JSON parsing
package com.cmei.s2c;

import com.google.gson.Gson;
import com.ververica.cdc.debezium.DebeziumDeserializationSchema;
import io.debezium.data.Envelope;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.source.SourceRecord;

import java.util.HashMap;

public class JsonDebeziumDeserializationSchema implements DebeziumDeserializationSchema<String> {

    @Override
    public void deserialize(SourceRecord sourceRecord, Collector<String> collector) throws Exception {
        HashMap<String, Object> hashMap = new HashMap<>();

        String topic = sourceRecord.topic();
        String[] split = topic.split("[.]");
        String database = split[1];
        String table = split[2];
        hashMap.put("database", database);
        hashMap.put("table", table);

        // Operation type of this change record
        Envelope.Operation operation = Envelope.operationFor(sourceRecord);

        // The record payload itself
        Struct struct = (Struct) sourceRecord.value();
        Struct after = struct.getStruct("after");
        Struct before = struct.getStruct("before");

        /*
         * 1. Both before and after are present -> update
         * 2. Only before is present            -> delete
         * 3. Only after is present             -> insert
         */
        if (before != null && after != null) {
            // update: keep the new (after) image
            HashMap<String, Object> hm = new HashMap<>();
            for (Field field : after.schema().fields()) {
                hm.put(field.name(), after.get(field.name()));
            }
            hashMap.put("data", hm);
        } else if (after != null) {
            // insert
            HashMap<String, Object> hm = new HashMap<>();
            for (Field field : after.schema().fields()) {
                hm.put(field.name(), after.get(field.name()));
            }
            hashMap.put("data", hm);
        } else if (before != null) {
            // delete: keep the old (before) image
            HashMap<String, Object> hm = new HashMap<>();
            for (Field field : before.schema().fields()) {
                hm.put(field.name(), before.get(field.name()));
            }
            hashMap.put("data", hm);
        }

        String type = operation.toString().toLowerCase();
        if ("create".equals(type)) {
            type = "insert";
        }
        hashMap.put("type", type);

        Gson gson = new Gson();
        collector.collect(gson.toJson(hashMap));
    }

    @Override
    public TypeInformation<String> getProducedType() {
        return BasicTypeInfo.STRING_TYPE_INFO;
    }
}
4. ClickHouseSink: only insert is implemented; the other operations can be filled in as needed
package com.cmei.s2c;

import com.google.gson.Gson;
import com.google.gson.internal.LinkedTreeMap;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.HashMap;

public class ClickHouseSink extends RichSinkFunction<String> {

    Connection connection;
    PreparedStatement iStmt;
    PreparedStatement dStmt;
    PreparedStatement uStmt;

    private Connection getConnection() {
        Connection conn = null;
        try {
            Class.forName("ru.yandex.clickhouse.ClickHouseDriver");
            String url = "jdbc:clickhouse://192.168.10.61:8123/drugdb";
            conn = DriverManager.getConnection(url, "bigdata", "bigdata");
        } catch (Exception e) {
            e.printStackTrace();
        }
        return conn;
    }

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        connection = getConnection();
        String insertSql = "insert into product(id,name,description,weight) values (?,?,?,?)";
        // ClickHouse handles deletes and updates as ALTER TABLE mutations rather than standard DML
        String deleteSql = "alter table product delete where id=?";
        String updateSql = "alter table product update name=?, description=?, weight=? where id=?";
        iStmt = connection.prepareStatement(insertSql);
        dStmt = connection.prepareStatement(deleteSql);
        uStmt = connection.prepareStatement(updateSql);
    }

    // Called once per incoming record
    @Override
    public void invoke(String value, Context context) throws Exception {
        Gson gson = new Gson();
        HashMap<String, Object> hs = gson.fromJson(value, HashMap.class);
        LinkedTreeMap<String, Object> source = (LinkedTreeMap<String, Object>) hs.get("source");
        String database = (String) source.get("db");
        String table = (String) source.get("table");
        String op = (String) hs.get("op");

        /*
         * Example of the raw Debezium JSON handled here:
         * {"before":null,
         *  "after":{"id":109,"name":"spare tire","description":"24 inch spare tire","weight":22.2},
         *  "source":{"version":"1.5.4.Final","connector":"sqlserver","name":"sqlserver_transaction_log_source","ts_ms":1648776173094,"snapshot":"last","db":"inventory","sequence":null,"schema":"dbo","table":"products","change_lsn":null,"commit_lsn":"0000002c:00001a60:0001","event_serial_no":null},
         *  "op":"r","ts_ms":1648776173094,"transaction":null}
         */

        // insert: covers both snapshot reads ("r") and new rows ("c")
        if ("inventory".equals(database) && "products".equals(table)) {
            if ("r".equals(op) || "c".equals(op)) {
                LinkedTreeMap<String, Object> data = (LinkedTreeMap<String, Object>) hs.get("after");
                Double ids = (Double) data.get("id");
                int id = ids.intValue();
                String name = (String) data.get("name");
                String description = (String) data.get("description");
                Double weights = (Double) data.get("weight");
                float weight = 0;
                if (weights != null) {
                    weight = weights.floatValue();
                }
                iStmt.setInt(1, id);
                iStmt.setString(2, name);
                iStmt.setString(3, description);
                iStmt.setFloat(4, weight);
                iStmt.executeUpdate();
            }
            // else if ("d".equals(op)) {   // delete
            //     LinkedTreeMap<String, Object> data = (LinkedTreeMap<String, Object>) hs.get("before");
            //     dStmt.setInt(1, ((Double) data.get("id")).intValue());
            //     dStmt.executeUpdate();
            // }
            // else if ("u".equals(op)) {   // update
            //     LinkedTreeMap<String, Object> data = (LinkedTreeMap<String, Object>) hs.get("after");
            //     uStmt.setString(1, (String) data.get("name"));
            //     uStmt.setString(2, (String) data.get("description"));
            //     uStmt.setFloat(3, ((Double) data.get("weight")).floatValue());
            //     uStmt.setInt(4, ((Double) data.get("id")).intValue());
            //     uStmt.executeUpdate();
            // }
        }
    }

    @Override
    public void close() throws Exception {
        super.close();
        if (iStmt != null) { iStmt.close(); }
        if (dStmt != null) { dStmt.close(); }
        if (uStmt != null) { uStmt.close(); }
        if (connection != null) { connection.close(); }
    }
}
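The sink above assumes a product table already exists in the drugdb database. Here is a sketch of creating it with the clickhouse-jdbc driver that is already in the pom; the column types and the MergeTree engine/ordering are assumptions inferred from the insert statement, not taken from the original project.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class CreateClickHouseTable {
    public static void main(String[] args) throws Exception {
        Class.forName("ru.yandex.clickhouse.ClickHouseDriver");
        String url = "jdbc:clickhouse://192.168.10.61:8123/drugdb";
        try (Connection conn = DriverManager.getConnection(url, "bigdata", "bigdata");
             Statement stmt = conn.createStatement()) {
            // Column types and the MergeTree engine are assumptions matching the sink's insert statement.
            stmt.execute("CREATE TABLE IF NOT EXISTS product ("
                    + " id Int32,"
                    + " name String,"
                    + " description String,"
                    + " weight Float32"
                    + ") ENGINE = MergeTree() ORDER BY id");
        }
    }
}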
git:
classtime2020/sqlServer2ClickHouse at master · zhaobingkun/classtime2020 · GitHub