Hive使用python自定义utf函数

    科技2024-04-03  14

    要处理的字段样式:BaseFieldUDF.py

    line_list = ['1601837183327|{"cm":{"ln":"-69.1","sv":"V2.0.8","os":"8.1.4","g":"K7T9H5GM@gmail.com","mid":"1","nw":"4G","l":"es","vc":"19","hw":"640*960","ar":"MX","uid":"1","t":"1601825265783","la":"23.6","md":"Huawei-9","vn":"1.2.1","ba":"Huawei","sr":"T"},"ap":"app","et":[{"ett":"1601833164855","en":"display","kv":{"goodsid":"0","action":"1","extend1":"2","place":"4","category":"48"}},{"ett":"1601776175979","en":"notification","kv":{"ap_time":"1601750646753","action":"1","type":"1","content":""}},{"ett":"1601783920469","en":"active_foreground","kv":{"access":"","push_id":"2"}},{"ett":"1601756375929","en":"comment","kv":{"p_comment_id":1,"addtime":"1601795584064","praise_count":66,"other_id":5,"comment_id":9,"reply_count":129,"userid":2,"content":"韵凉"}},{"ett":"1601763155223","en":"praise","kv":{"target_id":3,"id":9,"type":3,"add_time":"1601773879208","userid":4}}]} 2020-10-05']

    自定义函数:

    # -*- coding:utf8 -*- import sys import json line_list = ['1601837183327|{"cm":{"ln":"-69.1","sv":"V2.0.8","os":"8.1.4","g":"K7T9H5GM@gmail.com","mid":"1","nw":"4G","l":"es","vc":"19","hw":"640*960","ar":"MX","uid":"1","t":"1601825265783","la":"23.6","md":"Huawei-9","vn":"1.2.1","ba":"Huawei","sr":"T"},"ap":"app","et":[{"ett":"1601833164855","en":"display","kv":{"goodsid":"0","action":"1","extend1":"2","place":"4","category":"48"}},{"ett":"1601776175979","en":"notification","kv":{"ap_time":"1601750646753","action":"1","type":"1","content":""}},{"ett":"1601783920469","en":"active_foreground","kv":{"access":"","push_id":"2"}},{"ett":"1601756375929","en":"comment","kv":{"p_comment_id":1,"addtime":"1601795584064","praise_count":66,"other_id":5,"comment_id":9,"reply_count":129,"userid":2,"content":"韵凉"}},{"ett":"1601763155223","en":"praise","kv":{"target_id":3,"id":9,"type":3,"add_time":"1601773879208","userid":4}}]} 2020-10-05'] def get_fields(): public_fileds = "mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,nw,ln,la,t" pf_list = public_fileds.split(",") return pf_list for line in line_list: #for line in sys.stdin: public_val = [] detail_str = line.strip() detail_list = detail_str.split("|") # 校验数据完整性 if len(detail_list) != 2 and detail_list[0] is None: pass # 解析公共字段 try: # print(detail_list[1]) json_str = json.loads(detail_list[1].split(" ")[0]) if "cm" in json_str: pf_json = json_str["cm"] pf_list = get_fields() for field in pf_list: if field in pf_json: public_val.append(pf_json[field]) # public_val.append("\t") else: public_val.append("\t") # 解析服务器时间和时间字段 server_time = detail_list[0] if "ap" in json_str: ap_json = json_str["ap"] public_val.append(str(ap_json)) # public_val.append("\t") else: public_val.append("\t") if "et" in json_str: event_json = json_str["et"] public_val.append(str(event_json)) # public_val.append("\t") else: public_val.append("\t") except Exception as e: print(e) print("\t".join(public_val)) public_val.clear() """此处有个坑,在jion的时候,需要join的字符必须时str,不可以为int,list等别的数据类型"""

    上传BaseFieldUDF.py 至服务器

    测试代码:解析成功

    hive中执行: 出现如下,表示解析成功

    add file /opt/module/script/BaseFieldUDF.py;

    执行sql:

    select transform(e) USING 'python3 BaseFieldUDF.py' AS (mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,nw,ln,la,t,ap,et) from ods_event_log where dt='2020-10-05' limit 1;

    Processed: 0.025, SQL: 9