hive对json数据处理的三种方式
数据背景hive (ods)> select * from jsont1;OKjsont1.usernamejsont1.agejsont1.sexjsont1.jsonuser118male{"id":1,"ids":[101,102,103],"total_number":3}user228female{"id":2,"ids":[201,202,203],"total_number":
数据背景
hive (ods)> select * from jsont1;
OK
jsont1.username jsont1.age jsont1.sex jsont1.json
user1 18 male {"id":1,"ids":[101,102,103],"total_number":3}
user2 28 female {"id":2,"ids":[201,202,203],"total_number":3}
user3 38 male {"id":3,"ids":[301,302,303,304],"total_number":4}
user4 48 female {"id":4,"ids":[401,402,403,404,405],"total_number":5}
user5 58 male {"id":5,"ids":[501,502,503],"total_number":3}
user6 68 female {"id":6,"ids":[601,602,603],"total_number":3}
user7 78 male {"id":7,"ids":[701,702,703],"total_number":3}
NULL NULL NULL
Time taken: 0.933 seconds, Fetched: 8 row(s)
一.使用hive内置函数处理
1:通过get_json_object()单个取
hive (ods)> select username,age,sex,get_json_object(json,"$.id") id ,get_json_object(json,"$.ids") ids ,get_json_object(json,"$.total_number") totalNumber
> from jsont1;
OK
username age sex id ids totalnumber
user1 18 male 1 [101,102,103] 3
user2 28 female 2 [201,202,203] 3
user3 38 male 3 [301,302,303,304] 4
user4 48 female 4 [401,402,403,404,405] 5
user5 58 male 5 [501,502,503] 3
user6 68 female 6 [601,602,603] 3
user7 78 male 7 [701,702,703] 3
NULL NULL NULL NULL NULL
Time taken: 0.562 seconds, Fetched: 8 row(s)
还可以取数组中的某个元素
hive (ods)> select username,age,sex,get_json_object(json,"$.id") id ,
> get_json_object(json,"$.ids[0]") ids0,
> get_json_object(json,"$.ids[1]") ids1,
> get_json_object(json,"$.ids[2]") ids2,
> get_json_object(json,"$.ids[3]") ids3,
> get_json_object(json,"$.ids[4]") ids4,
> get_json_object(json,"$.ids[5]") ids5,
> get_json_object(json,"$.total_number") totalNumber
> from jsont1;
OK
username age sex id ids0 ids1 ids2 ids3 ids4 ids5 totalnumber
user1 18 male 1 101 102 103 NULL NULL NULL 3
user2 28 female 2 201 202 203 NULL NULL NULL 3
user3 38 male 3 301 302 303 304 NULL NULL 4
user4 48 female 4 401 402 403 404 405 NULL 5
user5 58 male 5 501 502 503 NULL NULL NULL 3
user6 68 female 6 601 602 603 NULL NULL NULL 3
user7 78 male 7 701 702 703 NULL NULL NULL 3
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
Time taken: 6.033 seconds, Fetched: 8 row(s)
hive (ods)>
2.通过hive内置函数json_tuple()一起取
hive (ods)> select json_tuple(json,'id','ids','total_number')
> from jsont1;
OK
c0 c1 c2
1 [101,102,103] 3
2 [201,202,203] 3
3 [301,302,303,304] 4
4 [401,402,403,404,405] 5
5 [501,502,503] 3
6 [601,602,603] 3
7 [701,702,703] 3
NULL NULL NULL
Time taken: 1.076 seconds, Fetched: 8 row(s)
以上处理方式缺点:
(1).通过hive内置函数解析出的json都为json字符串并非我们想要的字符数组.
我们仔细观察它们俩者之间的不同
"[101,102,103]"-->["101","102","103"]
如果我们要通过hql处理如下:
hive (ods)> select regexp_replace("[101,102,103]","\\[|\\]","");
OK
_c0
101,102,103
Time taken: 0.237 seconds, Fetched: 1 row(s)
hive (ods)> select split(regexp_replace("[101,102,103]","\\[|\\]",""),",");
OK
_c0
["101","102","103"]
(2)通过json_tuple()进行都分个取出,但是发现不能和原字段并列显示
hive (ods)> select username,age,sex,json_tuple(json,'id','ids','total_number')
> from jsont1;
FAILED: SemanticException [Error 10081]: UDTF's are not supported outside the SELECT clause, nor nested in expressions
ps:报错:我自己猜测是hive中没有json数组类型
补救措施 lateral view视图
通过下列方式可以(lateral view 拼接视图)
hive (ods)> select username,age,sex,id,ids,num
> from jsont1
> lateral view json_tuple(json,'id','ids','total_number') t1 as id,ids,num;
OK
username age sex id ids num
user1 18 male 1 [101,102,103] 3
user2 28 female 2 [201,202,203] 3
user3 38 male 3 [301,302,303,304] 4
user4 48 female 4 [401,402,403,404,405] 5
user5 58 male 5 [501,502,503] 3
user6 68 female 6 [601,602,603] 3
user7 78 male 7 [701,702,703] 3
NULL NULL NULL NULL NULL
Time taken: 0.64 seconds, Fetched: 8 row(s)
(3)如果我们需要对字符数组炸裂做进一步处理更加繁琐
hive (ods)> with tmp as (select username,age,sex,id,ids,num
> from jsont1
> lateral view json_tuple(json,'id','ids','total_number') t1 as id,ids,num)
> select username,age,sex,id,ids1,num
> from tmp
> lateral view explode(split(regexp_replace(ids,"\\[|\\]",""),",")) t1 as ids1;
OK
username age sex id ids1 num
user1 18 male 1 101 3
user1 18 male 1 102 3
user1 18 male 1 103 3
user2 28 female 2 201 3
user2 28 female 2 202 3
user2 28 female 2 203 3
user3 38 male 3 301 4
user3 38 male 3 302 4
user3 38 male 3 303 4
user3 38 male 3 304 4
user4 48 female 4 401 5
user4 48 female 4 402 5
user4 48 female 4 403 5
user4 48 female 4 404 5
user4 48 female 4 405 5
user5 58 male 5 501 3
user5 58 male 5 502 3
user5 58 male 5 503 3
user6 68 female 6 601 3
user6 68 female 6 602 3
user6 68 female 6 603 3
user7 78 male 7 701 3
user7 78 male 7 702 3
user7 78 male 7 703 3
Time taken: 0.459 seconds, Fetched: 24 row(s)
hive (ods)>
(4)只能处理简单的json,复杂的json不好处理
二.使用自定义UDF函数处理json数据
1.引入依赖
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>2.3.7</version>
</dependency>
2.自定义UDF
public class ParseJsonArray extends UDF {
public ArrayList<String> evaluate(String jsonStr, String key) {
if (Strings.isNullOrEmpty(jsonStr)) {
return null;
}
try {
JSONObject jsonObject = JSON.parseObject(jsonStr);
JSONArray jsonArray = jsonObject.getJSONArray(key);
ArrayList<String> strings = new ArrayList<>();
for (Object o : jsonArray) {
strings.add(o.toString());
}
return strings;
}catch (JsonException e){
return null;
}
}
@Test
public void junitParseJsonArray(){
String jsonStr = "{\"id\":1,\"ids\":[101,102,103],\"total_number\":3}";
String key = "ids";
ArrayList<String> list = evaluate(jsonStr, key);
System.out.println(JSON.toJSONString(list));//["101","102","103"]
}
}
3.打包使用自定义UDF(我这里是定义临时UDF)
hive (ods)> add jar /root/data/tmp_jar/FlumeInterceptor-1.0-SNAPSHOT.jar ;
Added [/root/data/tmp_jar/FlumeInterceptor-1.0-SNAPSHOT.jar] to class path
Added resources: [/root/data/tmp_jar/FlumeInterceptor-1.0-SNAPSHOT.jar]
hive (ods)> create temporary function json_array_me as "com.lagou.dw.flume.interceptor.hive.udf.ParseJsonArray";
OK
Time taken: 2.046 seconds
hive (ods)>
4.使用自定义UDF
hive (ods)> select username,age,sex,json_array_me(json,“ids”) ids
> from jsont1;
OK
username age sex ids
user1 18 male ["101","102","103"]
user2 28 female ["201","202","203"]
user3 38 male ["301","302","303","304"]
user4 48 female ["401","402","403","404","405"]
user5 58 male ["501","502","503"]
user6 68 female ["601","602","603"]
user7 78 male ["701","702","703"]
NULL NULL NULL
Time taken: 0.748 seconds, Fetched: 8 row(s)
突然发现自定义的好处:不用繁琐解决解析json字符串和普通字段不能同时显示问题.
同时我们对解析后字符数据炸裂更加方便
hive (ods)> select username,age,sex,ids1
> from jsont1
> lateral view explode(json_array_me(json,"ids")) t1 as ids1;
OK
username age sex ids1
user1 18 male 101
user1 18 male 102
user1 18 male 103
user2 28 female 201
user2 28 female 202
user2 28 female 203
user3 38 male 301
user3 38 male 302
user3 38 male 303
user3 38 male 304
user4 48 female 401
user4 48 female 402
user4 48 female 403
user4 48 female 404
user4 48 female 405
user5 58 male 501
user5 58 male 502
user5 58 male 503
user6 68 female 601
user6 68 female 602
user6 68 female 603
user7 78 male 701
user7 78 male 702
user7 78 male 703
Time taken: 1.503 seconds, Fetched: 24 row(s)
拓展:我们如果需要将json都展开呢?
解决办法: 使用两次lateral view处理
hive (ods)> select username,age,sex,id,num,ids1
> from jsont1
> lateral view json_tuple(json,'id','total_number') t1 as id,num
> lateral view explode(json_array_me(json,"ids")) t2 as ids1;
OK
username age sex id num ids1
user1 18 male 1 3 101
user1 18 male 1 3 102
user1 18 male 1 3 103
user2 28 female 2 3 201
user2 28 female 2 3 202
user2 28 female 2 3 203
user3 38 male 3 4 301
user3 38 male 3 4 302
user3 38 male 3 4 303
user3 38 male 3 4 304
user4 48 female 4 5 401
user4 48 female 4 5 402
user4 48 female 4 5 403
user4 48 female 4 5 404
user4 48 female 4 5 405
user5 58 male 5 3 501
user5 58 male 5 3 502
user5 58 male 5 3 503
user6 68 female 6 3 601
user6 68 female 6 3 602
user6 68 female 6 3 603
user7 78 male 7 3 701
user7 78 male 7 3 702
user7 78 male 7 3 703
Time taken: 0.568 seconds, Fetched: 24 row(s)
三.对于纯json文件可以用SerDe解析
创建表可以指定表的序列化方式
TextFile:org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
OrcSerde:org.apache.hadoop.hive.ql.io.orc.OrcSerde
Parquet:org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
例如:指定序列化方式org.apache.hive.hcatalog.data.JsonSerDe
[root@Linux122 source]# cat json_data.txt
{"id":1,"ids":[101,102,103],"total_number":3}
{"id":2,"ids":[201,202,203],"total_number":3}
{"id":3,"ids":[301,302,303,304],"total_number":4}
{"id":4,"ids":[401,402,403,404,405],"total_number":5}
{"id":5,"ids":[501,502,503],"total_number":3}
{"id":6,"ids":[601,602,603],"total_number":3}
{"id":7,"ids":[701,702,703],"total_number":3}
hive (ods)> create table json2(
> id int,
> ids array<string>,
> total_number int)
> row format serde 'org.apache.hive.hcatalog.data.JsonSerDe';
OK
Time taken: 0.702 seconds
hive (ods)> load data local inpath '/root/data/source/json_data.txt' into table json2;
Loading data to table ods.json2
OK
Time taken: 17.651 seconds
hive (ods)> select * from json2;
OK
json2.id json2.ids json2.total_number
1 ["101","102","103"] 3
2 ["201","202","203"] 3
3 ["301","302","303","304"] 4
4 ["401","402","403","404","405"] 5
5 ["501","502","503"] 3
6 ["601","602","603"] 3
7 ["701","702","703"] 3
Time taken: 0.94 seconds, Fetched: 7 row(s)
更多推荐
所有评论(0)