FlinkSQL: Consuming Kafka and Writing to a Hive Table
Using Flink SQL to consume Kafka data and write it into a Hive table in real time.
Environment versions:
hadoop-3.1.0
hive-3.1.2
flink-1.13.2
I. Development
Maven dependencies (the flink.version, scala.binary.version, and hive.version properties are assumed to be 1.13.2, 2.11, and 3.1.2, matching the environment above):
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-java</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-api-java-bridge_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-hive_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-statebackend-rocksdb_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.11</artifactId>
    <version>${flink.version}</version>
    <!--<scope>provided</scope>-->
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>${hive.version}</version>
</dependency>
<!-- for writing parquet files to HDFS -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-parquet_2.11</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-avro</artifactId>
    <version>${flink.version}</version>
</dependency>
Java code example:
package teld;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;

import java.time.Duration;

/**
 * @Author: lixz
 * @Date: 2022/10/13/9:38
 * @Description: Kafka to Hive (was paused at one point due to a Hive dependency conflict)
 */
public class Kafka2Hive {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Note: partitions are only committed when a checkpoint completes, so checkpointing should be
        // enabled, either here (env.enableCheckpointing(...)) or via execution.checkpointing.interval in flink-conf.yaml
        EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().build();
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env, settings);
        /**
         * Hive environment
         */
        // System.setProperty("HADOOP_USER_NAME","hdfs");
        String name = "myhive";
        String defaultDatabase = "test";
        // The version here must match the version of the hive-exec dependency, otherwise you get:
        // NoSuchMethodException: org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(org.apache.hadoop.hive.conf.HiveConf)
        String hive_version = "3.1.2";
        String hiveConfDir = "/opt/hive-3.1.2/conf";
        HiveCatalog hive = new HiveCatalog(name, defaultDatabase, hiveConfDir, hive_version);
        tEnv.registerCatalog("myhive", hive);
        tEnv.useCatalog("myhive");
        tEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
        // Kafka source
        KafkaSource<String> source = KafkaSource.<String>builder()
                .setBootstrapServers("192.168.78.1:9092")
                .setTopics("test4")
                .setGroupId("my-group")
                .setStartingOffsets(OffsetsInitializer.latest())
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .build();
        // Create the Kafka stream
        DataStreamSource<String> stream = env.fromSource(source,
                WatermarkStrategy.forBoundedOutOfOrderness(Duration.ofSeconds(5)), "Kafka Source");
        DataStream<MyUser> dataStream = stream.map(new MapFunction<String, MyUser>() {
            @Override
            public MyUser map(String s) throws Exception {
                String[] arr = s.split(",");
                return new MyUser(arr[0], arr[1], Integer.valueOf(arr[2]));
            }
        }).returns(MyUser.class);
        // Register the stream as a temporary view
        tEnv.createTemporaryView("MyUser", dataStream);
        // Create the Hive table (if it does not exist in Hive it is created automatically; you can also
        // create it in Hive beforehand, in which case this DDL is unnecessary, because with the Hive
        // catalog Flink SQL will find the table at runtime)
        tEnv.executeSql("CREATE TABLE IF NOT EXISTS `myhive`.`test`.`useroplog` \n" +
                "(\n" +
                "`ID` STRING,\n" +
                "`NAME` STRING,\n" +
                "`AGE` INT\n" +
                ") \n" +
                "partitioned by(`DAY` STRING)\n" +
                "STORED AS parquet TBLPROPERTIES(\n" +
                // Automatic compaction of small files (new in Flink 1.12); solves the small-file problem
                // caused by streaming writes into Hive
                "'auto-compaction'='true',\n" +
                // Maximum file size after compaction
                "'compaction.file-size'='128MB',\n" +
                "'format' = 'parquet',\n" +
                // Compression codec
                "'parquet.compression'='GZIP',\n" +
                // With hourly partitions this could be set to '1 h', meaning data shows up in HDFS one hour
                // late so that a partition is complete before it is committed; with daily partitions such a
                // delay makes little sense, since today's data would only be committed tomorrow, which is far too late
                "'sink.partition-commit.delay'='30 s',\n" +
                // The 'metastore' policy is specifically for writing to Hive; 'success-file' should be set as well:
                // after a checkpoint completes and the data is flushed, a _SUCCESS file is created and the partition
                // is registered in the Hive metastore, so that Hive can actually query the newly written data
                "'sink.partition-commit.policy.kind'='metastore,success-file'\n" +
                ")");
        // Write into the Hive table
        tEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
        tEnv.executeSql("insert into useroplog select *,'2022-10-13' as `DAY` from MyUser");
        // Print (for debugging)
        // tEnv.executeSql("select * from MyUser").print();
        env.execute();
    }
}
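The MyUser class is not shown in the post; below is a minimal POJO sketch that satisfies the constructor call above (field names, order, and accessors are assumptions, so verify that the resulting view's columns line up with ID, NAME, AGE before relying on select *):

package teld;

/**
 * Minimal sketch of the MyUser POJO referenced above (not from the original post; field names are assumed).
 * Flink treats it as a POJO: public class, public no-arg constructor, getters and setters.
 */
public class MyUser {
    private String id;
    private String name;
    private Integer age;

    public MyUser() {
    }

    public MyUser(String id, String name, Integer age) {
        this.id = id;
        this.name = name;
        this.age = age;
    }

    public String getId() { return id; }
    public void setId(String id) { this.id = id; }
    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public Integer getAge() { return age; }
    public void setAge(Integer age) { this.age = age; }
}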
If the target Hive table has not been created, it is created automatically when the job runs. Let's look in Hive at what the automatically created table looks like:
CREATE TABLE `useroplog`(
`id` string,
`name` string,
`age` int)
PARTITIONED BY (
`day` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs://dss0:8020/user/hive/warehouse/test.db/useroplog'
TBLPROPERTIES (
'auto-compaction'='true',
'bucketing_version'='2',
'format'='parquet',
'parquet.compression'='GZIP',
'sink.partition-commit.delay'='1min',
'sink.partition-commit.policy.kind'='success-file',
'transient_lastDdlTime'='1665629249')
Package the code. Do not bundle the dependencies into the jar, to avoid duplicate dependencies; all required dependencies are placed on the cluster.
Submit the job:
flink run-application \
-t yarn-application \
-c teld.Kafka2Hive \
-Dyarn.provided.lib.dirs="hdfs://dss0:8020/user/flink/flink-dependency-1.13.2;hdfs://dss0:8020/user/flink/flink-dependency-1.13.2/lib;hdfs://dss0:8020/user/flink/flink-dependency-1.13.2/plugins" \
-Dyarn.application.name=flink2hivetest \
flink2hivetest-1.0-SNAPSHOT.jar
Screenshot of the successful submission:
Once we send data to Kafka, it is written into Hive; let's look at the file structure generated for the Hive table.
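The job parses each Kafka record as a comma-separated id,name,age string (that is what the map function above splits). For reference, here is a minimal producer sketch for sending test records; the class name and sample values are only illustrative, and it assumes the kafka-clients library (pulled in transitively by flink-connector-kafka) is on the classpath:

package teld;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;

import java.util.Properties;

/** Illustrative sketch: sends a few id,name,age test records to the topic used in the example. */
public class TestProducer {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        // Broker and topic taken from the Kafka2Hive example above
        props.put("bootstrap.servers", "192.168.78.1:9092");
        props.put("key.serializer", StringSerializer.class.getName());
        props.put("value.serializer", StringSerializer.class.getName());
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            // Each record becomes one Hive row: ID, NAME, AGE
            producer.send(new ProducerRecord<>("test4", "1,Tom,25")).get();
            producer.send(new ProducerRecord<>("test4", "2,Jerry,30")).get();
        }
    }
}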
During real-time writing the partitions are created automatically; let's query the data.
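The same check can also be done from Flink SQL in batch mode. A minimal sketch is shown below; the class name is illustrative, and the catalog parameters are the ones used in the job above. Note that only partitions committed after a successful checkpoint will be visible:

package teld;

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.catalog.hive.HiveCatalog;

/** Illustrative sketch: batch-query the partition written by the streaming job. */
public class QueryHiveCheck {
    public static void main(String[] args) {
        EnvironmentSettings settings = EnvironmentSettings.newInstance()
                .useBlinkPlanner()
                .inBatchMode()
                .build();
        TableEnvironment tEnv = TableEnvironment.create(settings);
        // Same catalog parameters as in Kafka2Hive above
        tEnv.registerCatalog("myhive", new HiveCatalog("myhive", "test", "/opt/hive-3.1.2/conf", "3.1.2"));
        tEnv.useCatalog("myhive");
        // The partition column is stored lowercase ("day") in the Hive metastore
        tEnv.executeSql("SELECT * FROM `test`.`useroplog` WHERE `day` = '2022-10-13'").print();
    }
}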
II. Notes
1. The HiveCatalog created in the code must specify the Hive version, and that version must match the version of the hive-exec dependency.
2. The dependencies placed on the cluster's HDFS are as follows:
3. Hive must have the metastore service running:
bin/hive --service metastore >/dev/null 2>&1 &
After starting it, check that port 9083 is listening.
4. hive-site.xml configuration
The metastore URI must be specified:
<property>
    <name>hive.metastore.uris</name>
    <value>thrift://192.168.78.12:9083</value>
</property>