使用Fluent Bit采集TIDB慢查询日志

前言

说到 TIDB 慢查询日志采集真是令人头疼,不得不吐槽一番。

  • 日志格式仿 mysql 慢查询日志,都什么年代了,日志输出格式不能友好一些。即使希望对齐mysql,起码提供一个选项可以支持JSON 格式化方便用户采集,运行日志是支持 JSON 格式化;
  • 格式不固定,看了慢查询日志格式化源码,有十多个逻辑判断,而且这十多个逻辑判断在一些固定字段中穿插;不能把这些固定字段优先输出,然后再根据逻辑判断输出特殊字段,最后输出SQL,这样子不行么?真的是匪夷所思!

来感受一下TIDB慢查询日志格式以及源码,源码不细说,具体可以查看[1]

  • 日志示例
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# Time: 2019-08-14T09:26:59.487776265+08:00
# Txn_start_ts: 410450924122144769
# User@Host: root[root] @ localhost [127.0.0.1]
# Conn_ID: 3086
# Exec_retry_time: 5.1 Exec_retry_count: 3
# Query_time: 1.527627037
# Parse_time: 0.000054933
# Compile_time: 0.000129729
# Rewrite_time: 0.000000003 Preproc_subqueries: 2 Preproc_subqueries_time: 0.000000002
# Optimize_time: 0.00000001
# Wait_TS: 0.00001078
# Process_time: 0.07 Request_count: 1 Total_keys: 131073 Process_keys: 131072 Prewrite_time: 0.335415029 Commit_time: 0.032175429 Get_commit_ts_time: 0.000177098 Local_latch_wait_time: 0.106869448 Write_keys: 131072 Write_size: 3538944 Prewrite_region: 1
# DB: test
# Is_internal: false
# Digest: 50a2e32d2abbd6c1764b1b7f2058d428ef2712b029282b776beb9506a365c0f1
# Stats: t:pseudo
# Num_cop_tasks: 1
# Cop_proc_avg: 0.07 Cop_proc_p90: 0.07 Cop_proc_max: 0.07 Cop_proc_addr: 172.16.5.87:20171
# Cop_wait_avg: 0 Cop_wait_p90: 0 Cop_wait_max: 0 Cop_wait_addr: 172.16.5.87:20171
# Cop_backoff_regionMiss_total_times: 200 Cop_backoff_regionMiss_total_time: 0.2 Cop_backoff_regionMiss_max_time: 0.2 Cop_backoff_regionMiss_max_addr: 127.0.0.1 Cop_backoff_regionMiss_avg_time: 0.2 Cop_backoff_regionMiss_p90_time: 0.2
# Cop_backoff_rpcPD_total_times: 200 Cop_backoff_rpcPD_total_time: 0.2 Cop_backoff_rpcPD_max_time: 0.2 Cop_backoff_rpcPD_max_addr: 127.0.0.1 Cop_backoff_rpcPD_avg_time: 0.2 Cop_backoff_rpcPD_p90_time: 0.2
# Cop_backoff_rpcTiKV_total_times: 200 Cop_backoff_rpcTiKV_total_time: 0.2 Cop_backoff_rpcTiKV_max_time: 0.2 Cop_backoff_rpcTiKV_max_addr: 127.0.0.1 Cop_backoff_rpcTiKV_avg_time: 0.2 Cop_backoff_rpcTiKV_p90_time: 0.2
# Mem_max: 525211
# Disk_max: 65536
# Prepared: false
# Plan_from_cache: false
# Succ: true
# Plan: tidb_decode_plan('ZJAwCTMyXzcJMAkyMAlkYXRhOlRhYmxlU2Nhbl82CjEJMTBfNgkxAR0AdAEY1Dp0LCByYW5nZTpbLWluZiwraW5mXSwga2VlcCBvcmRlcjpmYWxzZSwgc3RhdHM6cHNldWRvCg==')
use test;
insert into t select * from t;
  • 慢日志格式化源码 20231110110011

20231110110045

吐槽一番,正式开搞。出于业务需求,希望能够将TIDB相关慢查询日志告警出来,方便进行下一步优化处理。

本文通过 Fluent bit 的 Multiline Core[3] 插件进行多行匹配,并用 Lua 插件提取相关字段、输出JSON上报到 Loki。Fluent bit 在 1.8 版本引入了新的 Multiline 核心功能 Multiline Core。

使用Tail插件进行多行日志采集

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


# Write the Fluent Bit multiline-parser (slow-log record assembly) and the
# JSON parser (field type coercion) definitions.
# NOTE(review): in the second "cont" rule the lookahead "(?!Time)" is placed
# after the lazy "#.+?", so it does NOT actually exclude "# Time:" lines —
# the rule can still match them. The parser appears to work because the SQL
# rule below transitions back to "start_state", but verify this against the
# multiline state machine before relying on it.
cat << EOF > /data/services/fluent-bit/conf/parsers_multiline.conf
[MULTILINE_PARSER]
    name          multiline-regex
    type          regex
    flush_timeout 1000
    # 开始行的正则表达式
    rule         "start_state"   "/#.+?Time:\s+/"               "cont"
    # 以 # 开头的所有行
    rule         "cont"          "/#.+?(?!Time):\s+/"           "cont"
    # use db ; commit;操作独立行;
    rule         "cont"          "/^(use\s|commit;).*/"           "cont"
    # 根据SQL关键字匹配的行,
    rule         "cont"          "/(.*(CREATE|ADD|ALTER|SET|DROP|DELETE|SELECT|INSERT|UPDATE|create|add|alter|set|drop|delete|select|insert|update|#drc#).*);/"                    "start_state"

### 自定义解析,格式化JSON,并声明字段类型
[PARSER]
    Name        log_json
    Format      json
    Time_Key    time
    Time_Format %Y-%m-%dT%H:%M:%S.%L%z
    Types query_time:float conn_id:integer

EOF

Lua 解析TIDB慢查询日志

按照Fluent bit Lua插件规范,通过正则表达式将所需要的字段提取出来,并通过table形式返回json,第一次写Lua,有点矬。

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
cat << EOF > /data/services/fluent-bit/conf/filters.lua

-- Fluent Bit Lua filter for TiDB slow-query logs.
--
-- record["log"] holds one complete multiline slow-query entry (assembled by
-- the multiline parser).  Each "<Name>: <value>" field is extracted into
-- new_record under its lower-cased name; all values are kept as strings
-- (Txn_start_ts can exceed Lua number precision, and the downstream
-- log_json parser declares the numeric types).
--
-- Returns 1 (record replaced), the unchanged timestamp, and the new record,
-- as required by the Fluent Bit Lua filter contract.
function extract_string(tag, timestamp, record)
    local log = record["log"]
    local new_record = {}
    -- every slow-log field is written as "<Name>: <value>"
    local base_pattern = ":%s"

    -- fields with word-like values; a few need dedicated patterns below
    local str_arr = {
        "Time",
        "DB",
        "User",
        "Host",
        "Index_names",
        "Preproc_subqueries",
        "Is_internal",
        "Prepared",
        "Plan_from_cache",
        "Plan_from_binding",
        "Has_more_results",
        "Succ",
        "Prev_stmt",
        "IsExplicitTxn",
        "IsWriteCacheTable",
        "IsSyncStatsFailed",
        "sql",
    }
    for i = 1, #str_arr do
        local name = str_arr[i]
        local field
        if name == "Time" then
            -- e.g. "# Time: 2019-08-14T09:26:59.487776265+08:00"
            field = string.match(log, "Time:%s(%d+-%d+-%d+T%d+:%d+:%d+%.%d+%+%d+:%d+)")
        elseif name == "DB" then
            -- any non-blank database name; the old pattern "(%w+%p+%w+)"
            -- required a punctuation char and missed plain names like "test"
            field = string.match(log, "DB:%s(%S+)")
        elseif name == "sql" then
            -- statement text after the last "# ..." header line: anchor on
            -- IsSyncStatsFailed when present, otherwise fall back to the
            -- first non-header line (older TiDB versions omit that field)
            field = string.match(log, "#%sIsSyncStatsFailed:%s.*\n(.*;)")
                or string.match(log, "\n([^#\n].*;)")
        elseif name == "Host" then
            -- "# User@Host: root[root] @ localhost [127.0.0.1]" -> 127.0.0.1
            -- (the IP sits in the trailing brackets, not right after "@")
            field = string.match(log, "#%sUser@Host:[^\n]*%[(%d+%.%d+%.%d+%.%d+)%]")
        elseif name == "User" then
            -- captures the bracketed user, e.g. "root[root] @ ..." -> root
            field = string.match(log, "#%sUser@Host:[^\n]*%[([^%]]*)%]%s@")
        else
            field = string.match(log, string.format("%s%s%s", name, base_pattern, "(%w+)"))
        end
        if field then
            new_record[string.lower(name)] = field
        end
    end

    -- integer-valued fields (kept as strings, see header comment)
    local int_arr = {
        "Txn_start_ts",
        "Conn_ID",
        "Num_cop_tasks",
        "Mem_max",
        "Disk_max",
        "Result_rows",
        "Exec_retry_count",
    }
    for i = 1, #int_arr do
        local field = string.match(log, string.format("%s%s%s", int_arr[i], base_pattern, "(%d+)"))
        if field then
            new_record[string.lower(int_arr[i])] = field
        end
    end

    -- coprocessor addresses; the capture group is required here, otherwise
    -- string.match returns the whole "<Name>: <ip>" text, key included
    local addr_arr = {
        "Cop_proc_addr",
        "Cop_wait_addr",
    }
    for i = 1, #addr_arr do
        local field = string.match(log, string.format("%s%s%s", addr_arr[i], base_pattern, "(%d+%.%d+%.%d+%.%d+)"))
        if field then
            new_record[string.lower(addr_arr[i])] = field
        end
    end

    -- duration fields in seconds; the dot is escaped ("." alone matches any
    -- char) and "%d+%.?%d*" also accepts whole numbers like "Cop_wait_avg: 0"
    local float_arr = {
        "Query_time",
        "Cop_time",
        "Parse_time",
        "Compile_time",
        "Rewrite_time",
        "Optimize_time",
        "Wait_TS",
        "Preproc_subqueries_time",
        "Cop_proc_avg",
        "Cop_proc_p90",
        "Cop_proc_max",
        "Cop_wait_avg",
        "Cop_wait_p90",
        "Cop_wait_max",
        "Exec_retry_time",
        "KV_total",
        "PD_total",
        "Backoff_total",
        "Write_sql_response_total",
        "Backoff_Detail",
    }
    for i = 1, #float_arr do
        local field = string.match(log, string.format("%s%s%s", float_arr[i], base_pattern, "(%d+%.?%d*)"))
        if field then
            new_record[string.lower(float_arr[i])] = field
        end
    end

    return 1, timestamp, new_record
end
EOF

Fluent bit 配置

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Resolve the host's private IP via the EC2 instance-metadata service; it is
# interpolated into the Loki "address" label below.
export local_ip=$(curl http://169.254.169.254/latest/meta-data/local-ipv4)

# Write the main Fluent Bit configuration.  The heredoc delimiter is
# unquoted, so ${local_ip} is expanded by the shell when the file is written.
cat << EOF > /data/services/fluent-bit/conf/fluent-bit.conf
[SERVICE]
    flush        1
    log_level    info
    parsers_file parsers_multiline.conf

[INPUT]
    name             tail
    path             /data/tidb_*_log/tidb_slow_query.log
    #read_from_head   true
    multiline.parser multiline-regex
    Tag              tidb.slow
    db               /fluent-bit/tail/tidb-slow.pos

[Filter]
    Name    lua
    Match   tidb.slow
    script  filters.lua   ## lua脚本
    call    extract_string  # 调用函数

[FILTER]
    Name parser
    Match tidb.slow
    Key_Name .
    Parser log_json
    Reserve_Data On

[OUTPUT]
    name        loki
    match       tidb.slow
    host        <loki host,不包含scheme和uri>
    port        443
    tls         on
    tls.verify  on
    http_user   <loki 租户名称>
    http_passwd <loki 租户密码>
    labels      job=tidb-slow,address=${local_ip}
EOF

启动Fluent bit

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
# Run Fluent Bit in Docker: mount the main config read-only, plus the
# multiline parser and the Lua filter next to it; bind-mount the host log
# directories so the tail input can read them, and persist the tail position
# DB under /fluent-bit/tail so offsets survive container restarts.
docker run   --name "fluent-bit" -d \
        --memory=2g --cpus=1 \
        --restart=always \
        -v /data/services/fluent-bit/conf/fluent-bit.conf:/fluent-bit/etc/fluent-bit.conf:ro \
        -v /data/services/fluent-bit/conf/parsers_multiline.conf:/fluent-bit/etc/parsers_multiline.conf \
        -v /data/services/fluent-bit/conf/filters.lua:/fluent-bit/etc/filters.lua \
        -v /data/services/fluent-bit/data:/fluent-bit/tail \
        -v /data/:/data/ \
        -v /var/log:/var/log \
        fluent/fluent-bit \
        "/fluent-bit/bin/fluent-bit" "-c" "/fluent-bit/etc/fluent-bit.conf"

最终效果

20231110113914

参考

  1. TIDB慢查询日志格式化详细参考
  2. TIDB慢查询日志
  3. Multiline Core
0%