一.如果是读取xml文件的时候,文件入库需要使用文件读取UDF
创建了1个测试文件
wsdFileRead(): 直接读取文件内容
SELECT wsdFileRead('/home/temp/wsd_test.xml')
Query id: 09b6e5fe-7169-43f7-b001-90e2eeabb8da
┌─wsdFileRead('/home/temp/wsd_test.xml')─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ <?xml version="1.0" encoding="UTF-8"?><des> <context> <ANNOUNCEMENT_START_TIME>20240416</ANNOUNCEMENT_START_TIME> <LAND_DISTRICT>1101202</LAND_DISTRICT> <FIELD_NUM>12</FIELD_NUM> <TRA_AGENCY_CODE>11110000MB03920782F</TRA_AGENCY_CODE> <PUB_SERVICE_PLAT_CODE>121100002400591890H</PUB_SERVICE_PLAT_CODE> <UNIT_ADDRESS>TEST1</UNIT_ADDRESS> <LAND_PROJECT_NAME>TEST2</LAND_PROJECT_NAME> </context></des> │
└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
1 row in set. Elapsed: 0.002 sec.
wsdFileReadB64(): 读取文件内容并base64加密
SELECT wsdFileReadB64('/home/temp/wsd_test.xml')
Query id: 76ea5d06-7802-4f71-98e5-0728d9b0372e
┌─wsdFileReadB64('/home/temp/wsd_test.xml')──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz48ZGVzPiAgPGNvbnRleHQ+ICAgIDxBTk5PVU5DRU1FTlRfU1RBUlRfVElNRT4yMDI0MDQxNjwvQU5OT1VOQ0VNRU5UX1NUQVJUX1RJTUU+ICAgIDxMQU5EX0RJU1RSSUNUPjExMDEyMDI8L0xBTkRfRElTVFJJQ1Q+ICAgIDxGSUVMRF9OVU0+MTI8L0ZJRUxEX05VTT4gICAgPFRSQV9BR0VOQ1lfQ09ERT4xMTExMDAwME1CMDM5MjA3ODJGPC9UUkFfQUdFTkNZX0NPREU+ICAgIDxQVUJfU0VSVklDRV9QTEFUX0NPREU+MTIxMTAwMDAyNDAwNTkxODkwSDwvUFVCX1NFUlZJQ0VfUExBVF9DT0RFPiAgICA8VU5JVF9BRERSRVNTPlRFU1QxPC9VTklUX0FERFJFU1M+ICAgIDxMQU5EX1BST0pFQ1RfTkFNRT5URVNUMjwvTEFORF9QUk9KRUNUX05BTUU+ICA8L2NvbnRleHQ+PC9kZXM+ │
└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
1 row in set. Elapsed: 0.002 sec.
二.如果处理clickhouse的字段含有xml字符串的情况
1.导入自定义UDF
函数名
- wsdXpathArrB64('参数1',参数2)
- wsdXpathB64('参数1‘,'参数2')
2.wsdXpathB64
此函数参数1传递字段值,需要base64编码,保证了xml字段数据如果含有特殊符号也可以解析成功,
参数2传递xpath表达式
这里我随便选取一个时间节点它的表达式是/des/context/ANNOUNCEMENT_START_TIME
具体使用
SELECT decodeURLComponent(base64Decode(submit_content)) AS ROUTE_WAY
FROM
(
SELECT wsdXpathB64(base64Encode(submit_content), '/des/context/ANNOUNCEMENT_START_TIME') AS submit_content
FROM WSD_CONTENT
WHERE sid = '001100005c20f73c4232b626dba00fa921ce'
)
LIMIT 4
Query id: f0dc9ace-c05d-4a47-b12f-f7102093fa57
┌─ROUTE_WAY─┐
│ 20240416 │
└───────────┘
3.wsdXpathArrB64
第一个参数和上面的函数一样,base64编码后的字段
第二个参数传一个xpath表达式的数组
具体如下
SELECT
sid,
decodeURLComponent(base64Decode(submit_content[1])) AS ROUTE_WAY,
decodeURLComponent(base64Decode(submit_content[2])) AS DATA_NO,
decodeURLComponent(base64Decode(submit_content[3])) AS DATA_KEY,
decodeURLComponent(base64Decode(submit_content[4])) AS CA_KEY,
decodeURLComponent(base64Decode(submit_content[5])) AS DATA_EN,
decodeURLComponent(base64Decode(submit_content[6])) AS DATA_TYPE,
FROM
(
SELECT
wsdXpathArrB64(base64Encode(replaceRegexpAll(submit_content, '\\+', ' ')), ['/des/title/ROUTE_WAY', '/des/title/DATA_NO', '/des/title/DATA_KEY', '/des/title/CA_KEY', '/des/title/DATA_EN', '/des/title/DATA_TYPE']) AS submit_content,
sid
FROM CEN_SUBMIT_CONTEXT_600
WHERE sid = '0011726f801b96724c0790f9a38e3593cca7'
)
Query id: 147b7dba-f694-4cbc-84cd-85f781244d20
Row 1:
──────
sid: 0011726f801b96724c0790f9a38e3593cca7
ROUTE_WAY: 0
DATA_NO: 600
DATA_KEY: ff8080818d548044018e59dc7ac10bde
CA_KEY: 88888888889999999999
DATA_EN: DEAL_BEHAVIOR_INFO
DATA_TYPE: 1
4.解析xml文件
##使用不带base64加密的函数
SELECT
wsdXpathB64(base64Encode(wsdFileRead('/home/temp/wsd_test.xml')), '/des/context/ANNOUNCEMENT_START_TIME') AS xx,
base64Decode(xx) AS yy
Query id: d9db0f57-e187-4137-ba92-3024b5fe8e61
┌─xx───────────┬─yy───────┐
│ MjAyNDA0MTY= │ 20240416 │
└──────────────┴──────────┘
1 row in set. Elapsed: 0.004 sec.
###使用带base64加密的函数
SELECT
wsdXpathB64(wsdFileReadB64('/home/temp/wsd_test.xml'), '/des/context/ANNOUNCEMENT_START_TIME') AS xx,
base64Decode(xx) AS yy
Query id: 55a5da4b-46d6-4205-9844-3cd2c17a3a2b
┌─xx───────────┬─yy───────┐
│ MjAyNDA0MTY= │ 20240416 │
└──────────────┴──────────┘
1 row in set. Elapsed: 0.005 sec.