diff --git a/.github/workflows/VeniceCI-E2ETests.yml b/.github/workflows/VeniceCI-E2ETests.yml index 823b472131a..1c7637a8d52 100644 --- a/.github/workflows/VeniceCI-E2ETests.yml +++ b/.github/workflows/VeniceCI-E2ETests.yml @@ -21,7 +21,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_1 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -40,7 +40,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_2 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -59,7 +59,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_3 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -78,7 +78,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_4 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -97,7 +97,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_5 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -116,7 +116,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_6 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -135,7 +135,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_7 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -154,7 +154,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_8 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -173,7 +173,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_9 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -192,7 +192,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_10 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -211,7 +211,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_11 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -230,7 +230,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_12 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -249,7 +249,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_13 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -268,7 +268,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_14 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -287,7 +287,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_15 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -306,7 +306,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_16 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -325,7 +325,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_17 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -344,7 +344,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_18 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -363,7 +363,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_19 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -382,7 +382,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_20 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -401,7 +401,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_21 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -420,7 +420,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_22 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -439,7 +439,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_23 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -458,7 +458,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_24 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -477,7 +477,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_25 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -496,7 +496,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_26 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -515,7 +515,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_27 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -534,7 +534,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_28 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -553,7 +553,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_29 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -572,7 +572,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_30 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -591,7 +591,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_31 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -610,7 +610,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_32 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -629,7 +629,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_33 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -648,7 +648,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_34 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -667,7 +667,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_35 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -686,7 +686,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_36 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -705,7 +705,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_37 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -724,7 +724,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_38 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -743,7 +743,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_39 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -762,7 +762,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_40 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -781,7 +781,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_41 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -800,7 +800,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_42 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -819,7 +819,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_43 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -838,7 +838,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_44 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -857,7 +857,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_45 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -876,7 +876,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_46 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -895,7 +895,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_47 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -914,7 +914,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_48 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -933,7 +933,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_49 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -952,7 +952,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_50 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -971,7 +971,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_51 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -990,7 +990,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_52 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1009,7 +1009,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_53 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1028,7 +1028,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_54 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1047,7 +1047,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_55 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1066,7 +1066,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_56 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1085,7 +1085,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_57 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1104,7 +1104,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_58 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1123,7 +1123,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_59 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1142,7 +1142,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_60 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1161,7 +1161,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_61 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1180,7 +1180,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_62 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1199,7 +1199,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_63 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1218,7 +1218,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_64 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1237,7 +1237,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_65 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1256,7 +1256,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_66 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1275,7 +1275,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_67 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1294,7 +1294,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_68 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1313,7 +1313,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_69 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1332,7 +1332,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_70 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1351,7 +1351,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_71 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1370,7 +1370,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_72 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1389,7 +1389,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_73 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1408,7 +1408,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_74 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1427,7 +1427,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_75 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1446,7 +1446,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_76 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1465,7 +1465,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_77 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1484,7 +1484,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_78 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1503,7 +1503,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_79 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1522,7 +1522,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_80 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1541,7 +1541,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_81 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1560,7 +1560,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_82 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1579,7 +1579,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_83 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1598,7 +1598,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_84 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1617,7 +1617,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_85 cancel-in-progress: ${{ github.event_name == 'pull_request' }} @@ -1636,7 +1636,7 @@ jobs: runs-on: ubuntu-latest permissions: contents: read - timeout-minutes: 12 + timeout-minutes: 15 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-jdk17-IntegrationTests_99 cancel-in-progress: ${{ github.event_name == 'pull_request' }} diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/config/VeniceServerConfig.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/config/VeniceServerConfig.java index f5019ff7c13..1868a2fb115 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/config/VeniceServerConfig.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/config/VeniceServerConfig.java @@ -118,6 +118,7 @@ import static com.linkedin.venice.ConfigKeys.SERVER_ENABLE_PARALLEL_BATCH_GET; import static com.linkedin.venice.ConfigKeys.SERVER_FORKED_PROCESS_JVM_ARGUMENT_LIST; import static com.linkedin.venice.ConfigKeys.SERVER_GLOBAL_RT_DIV_ENABLED; +import static com.linkedin.venice.ConfigKeys.SERVER_HEARTBEAT_REPORTER_INTERVAL_SECONDS; import static com.linkedin.venice.ConfigKeys.SERVER_HELIX_JOIN_AS_UNKNOWN; import static com.linkedin.venice.ConfigKeys.SERVER_HTTP2_HEADER_TABLE_SIZE; import static com.linkedin.venice.ConfigKeys.SERVER_HTTP2_INBOUND_ENABLED; @@ -616,6 +617,7 @@ public class VeniceServerConfig extends VeniceClusterConfig { private final boolean recordLevelTimestampEnabled; private final boolean perRecordOtelMetricsEnabled; private final boolean perRecordBatchOtelMetricsEnabled; + private final int heartbeatReporterIntervalSeconds; private final boolean uniqueIngestedKeyCountHllEnabled; private final int uniqueIngestedKeyCountHllLog2K; private final long leaderCompleteStateCheckInFollowerValidIntervalMs; @@ -1066,6 +1068,12 @@ public VeniceServerConfig(VeniceProperties serverProperties, Map(3); + this.writeType = isWriteComputationEnabled ? VeniceStoreWriteType.WRITE_COMPUTE : VeniceStoreWriteType.REGULAR; + this.chunkingStatus = isChunked ? VeniceChunkingStatus.CHUNKED : VeniceChunkingStatus.UNCHUNKED; + this.localRegionName = localRegionName; // Restore in-memory latest consumed version topic position and leader info from the checkpoint version topic // position this.latestProcessedVtPosition = offsetRecord.getCheckpointedLocalVtPosition(); @@ -1470,14 +1490,33 @@ public void setHasResubscribedAfterBootstrapAsCurrentVersion(boolean hasResubscr /** * Get or create a cached HeartbeatKey for the given region. - * Derives storeName/version from the partition replica topic name. + * Derives storeName/version from the partition replica topic name. SLO labels (write type, + * chunking, locality) come from constructor args and are baked into the cached key so per-record + * OTel emission can read them without re-derivation. + * + *

Locality is derived per region by comparing the cached key's region to the local region + * supplied at construction: equal → LOCAL, otherwise REMOTE. When the local region is null or + * empty (lookup-only paths in tests, or unconfigured server region), locality is left null on + * the cached key — defaulting it here would mislabel every region as REMOTE. The OTel emit + * path coerces null → REMOTE at emission time so the metric still ships a concrete label. */ public HeartbeatKey getOrCreateCachedHeartbeatKey(String region) { - return cachedHeartbeatKeys.computeIfAbsent(region, r -> { + /* + * Normalize via RegionUtils so the per-record path produces the same region string as the + * HMS-side initializeEntry path. Both paths feed the same heartbeat-timestamps map and must + * agree on HeartbeatKey identity — otherwise computeIfPresent silently no-ops and the + * heartbeat-lag-driven ready-to-serve check never trips. + */ + String normalizedRegion = RegionUtils.normalizeRegionName(region); + return cachedHeartbeatKeys.computeIfAbsent(normalizedRegion, r -> { String topicName = partitionReplica.getTopicName(); String storeName = Version.parseStoreFromKafkaTopicName(topicName); int version = Version.parseVersionFromKafkaTopicName(topicName); - return new HeartbeatKey(storeName, version, getPartition(), r); + VeniceRegionLocality locality = null; + if (localRegionName != null && !localRegionName.isEmpty()) { + locality = r.equals(localRegionName) ? VeniceRegionLocality.LOCAL : VeniceRegionLocality.REMOTE; + } + return new HeartbeatKey(storeName, version, getPartition(), r, writeType, chunkingStatus, locality); }); } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/StoreIngestionTask.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/StoreIngestionTask.java index 71660fa75e5..5f8c1bcbca2 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/StoreIngestionTask.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/StoreIngestionTask.java @@ -2766,8 +2766,20 @@ private PartitionConsumptionState createAndInstallPartitionConsumptionState(PubS storageMetadataService.getLastOffset(topicPartition.getTopicName(), partition, pubSubContext); LOGGER.info("Creating PCS for replica: {} with offsetRecord: {}", topicPartition, offsetRecord); - PartitionConsumptionState freshPcs = - new PartitionConsumptionState(topicPartition, offsetRecord, pubSubContext, hybridStoreConfig.isPresent()); + /* + * SLO labels are resolved once per PCS at construction. They ride along on every cached + * HeartbeatKey so the per-record OTel emit path reads pre-computed enum references (no string + * allocation, no per-call store/version lookup). Chunking is per-version + * (Version.isChunkingEnabled). Write-compute is store-level today (Store.isWriteComputationEnabled). + */ + PartitionConsumptionState freshPcs = new PartitionConsumptionState( + topicPartition, + offsetRecord, + pubSubContext, + hybridStoreConfig.isPresent(), + isWriteComputationEnabled, + isChunked, + serverConfig.getRegionName()); if (uniqueIngestedKeyCountHllEnabled) { int lgK = serverConfig.getUniqueIngestedKeyCountHllLog2K(); boolean isNewSubscription = PubSubSymbolicPosition.EARLIEST.equals(offsetRecord.getCheckpointedLocalVtPosition()); @@ -2814,8 +2826,14 @@ private PartitionConsumptionState createPlaceholderPartitionConsumptionState( int partition) { OffsetRecord placeholderOffset = new OffsetRecord(partitionStateSerializer, pubSubContext); - PartitionConsumptionState pcs = - new PartitionConsumptionState(topicPartition, placeholderOffset, pubSubContext, hybridStoreConfig.isPresent()); + PartitionConsumptionState pcs = new PartitionConsumptionState( + topicPartition, + placeholderOffset, + pubSubContext, + hybridStoreConfig.isPresent(), + isWriteComputationEnabled, + isChunked, + serverConfig.getRegionName()); pcs.setCurrentVersionSupplier(isCurrentVersion); boolean isFutureVersionReady = isFutureVersionReady(kafkaVersionTopic, storeRepository); @@ -3078,7 +3096,10 @@ private void resetOffset(int partition, PubSubTopicPartition topicPartition, boo new PubSubTopicPartitionImpl(versionTopic, partition), new OffsetRecord(partitionStateSerializer, pubSubContext), pubSubContext, - hybridStoreConfig.isPresent()); + hybridStoreConfig.isPresent(), + isWriteComputationEnabled, + isChunked, + serverConfig.getRegionName()); if (uniqueIngestedKeyCountHllEnabled) { consumptionState.initializeUniqueKeyCountHll(serverConfig.getUniqueIngestedKeyCountHllLog2K()); } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatKey.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatKey.java index 50e90d0c1a7..e51f912c324 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatKey.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatKey.java @@ -1,24 +1,59 @@ package com.linkedin.davinci.stats.ingestion.heartbeat; import com.linkedin.venice.meta.Version; +import com.linkedin.venice.stats.dimensions.VeniceChunkingStatus; +import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; +import com.linkedin.venice.stats.dimensions.VeniceStoreWriteType; import com.linkedin.venice.utils.Utils; /** - * Composite key for the flattened heartbeat timestamp map + * Composite key for the flattened heartbeat timestamp map. + * + *

Identity is {@code (storeName, version, partition, region)}. The SLO classification + * labels {@link #writeType}, {@link #chunkingStatus}, {@link #locality} are passenger fields: + * resolved once when the caller (SIT) builds the key, carried along the per-record hot path + * to avoid repeated derivation, and intentionally excluded from {@link #equals}/{@link #hashCode}. */ public final class HeartbeatKey { final String storeName; final int version; final int partition; final String region; + /* + * Passenger labels: not part of identity. Used by per-record OTel emission so each record + * carries pre-resolved enum references (zero allocation on the hot path). + */ + final VeniceStoreWriteType writeType; + final VeniceChunkingStatus chunkingStatus; + final VeniceRegionLocality locality; private final int hashCode; + /** + * Lookup-only constructor: builds a key without the SLO labels. Use only when the resulting + * key is consumed for identity (map lookup, equality) and never flows to per-record OTel emission. + * The canonical hot-path constructor is the 7-arg variant below — that one is what the SIT/PCS + * pipeline uses so each record carries pre-resolved enum references. + */ public HeartbeatKey(String storeName, int version, int partition, String region) { + this(storeName, version, partition, region, null, null, null); + } + + public HeartbeatKey( + String storeName, + int version, + int partition, + String region, + VeniceStoreWriteType writeType, + VeniceChunkingStatus chunkingStatus, + VeniceRegionLocality locality) { this.storeName = storeName; this.version = version; this.partition = partition; this.region = region; + this.writeType = writeType; + this.chunkingStatus = chunkingStatus; + this.locality = locality; // Manual hash computation avoids Objects.hash() varargs Object[] allocation and Integer autoboxing int h = storeName.hashCode(); h = 31 * h + version; @@ -31,6 +66,27 @@ String getReplicaId() { return Utils.getReplicaId(Version.composeKafkaTopic(storeName, version), partition); } + /** + * Public accessor for cross-package tests; the package-private field is the canonical source. + */ + public VeniceStoreWriteType getWriteType() { + return writeType; + } + + /** + * Public accessor for cross-package tests; the package-private field is the canonical source. + */ + public VeniceChunkingStatus getChunkingStatus() { + return chunkingStatus; + } + + /** + * Public accessor for cross-package tests; the package-private field is the canonical source. + */ + public VeniceRegionLocality getLocality() { + return locality; + } + @Override public boolean equals(Object o) { if (this == o) { diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringService.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringService.java index 2fb96c16385..842fc4ace59 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringService.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringService.java @@ -17,8 +17,12 @@ import com.linkedin.venice.meta.Version; import com.linkedin.venice.meta.VersionImpl; import com.linkedin.venice.service.AbstractVeniceService; +import com.linkedin.venice.stats.dimensions.VeniceChunkingStatus; import com.linkedin.venice.stats.dimensions.VeniceHeartbeatComponent; +import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; +import com.linkedin.venice.stats.dimensions.VeniceStoreWriteType; import com.linkedin.venice.utils.LogContext; +import com.linkedin.venice.utils.RegionUtils; import com.linkedin.venice.utils.Utils; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricConfig; @@ -27,6 +31,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; @@ -84,6 +89,7 @@ public class HeartbeatMonitoringService extends AbstractVeniceService { private final int lagMonitorCleanupCycle; private final boolean recordLevelTimestampEnabled; private final boolean perRecordOtelMetricsEnabled; + private final int heartbeatReporterIntervalSeconds; private HelixCustomizedViewOfflinePushRepository customizedViewRepository; private KafkaStoreIngestionService kafkaStoreIngestionService; @@ -113,12 +119,14 @@ public HeartbeatMonitoringService( leaderHeartbeatTimeStamps, followerHeartbeatTimeStamps, serverConfig.getClusterName()); - this.heartbeatMonitoringServiceStats = heartbeatMonitoringServiceStats; + this.heartbeatMonitoringServiceStats = + Objects.requireNonNull(heartbeatMonitoringServiceStats, "heartbeatMonitoringServiceStats cannot be null"); this.customizedViewRepositoryFuture = customizedViewRepositoryFuture; this.nodeId = Utils.getHelixNodeIdentifier(serverConfig.getListenerHostname(), serverConfig.getListenerPort()); this.lagMonitorCleanupCycle = serverConfig.getLagMonitorCleanupCycle(); this.recordLevelTimestampEnabled = serverConfig.isRecordLevelTimestampEnabled(); this.perRecordOtelMetricsEnabled = serverConfig.isPerRecordOtelMetricsEnabled(); + this.heartbeatReporterIntervalSeconds = serverConfig.getHeartbeatReporterIntervalSeconds(); this.serverConfig = serverConfig; LOGGER.info( "HeartbeatMonitoringService initialized with localRegionName: {}, regionNames: {}", @@ -131,7 +139,9 @@ private synchronized void initializeEntry( Version version, int partition, boolean isFollower, - String replicaId) { + String replicaId, + VeniceStoreWriteType writeType, + VeniceChunkingStatus chunkingStatus) { // We don't monitor heartbeat lag for non-hybrid versions if (version.getHybridStoreConfig() == null) { return; @@ -139,12 +149,30 @@ private synchronized void initializeEntry( String storeName = version.getStoreName(); int versionNum = version.getNumber(); long currentTime = System.currentTimeMillis(); + /* + * SLO labels are baked into every HeartbeatKey we put into the map. The periodic record-lag + * path iterates the map and reads these labels off the stored key; because + * HeartbeatKey.equals/hashCode ignore the labels (passenger fields), later updates never + * replace the stored key object — so the labels MUST be set at insertion time. Labels come + * pre-resolved from updateLagMonitor (the only public entry point), which already holds + * the Store + Version it resolved via waitVersion. + * + * Locality is left null on the stored key when localRegionName is null or empty + * (unconfigured server) — defaulting it here would mislabel every region as REMOTE. + * The OTel emit path coerces null → REMOTE at emission time so the metric still ships + * a concrete label. + */ + boolean haveLocalRegion = localRegionName != null && !localRegionName.isEmpty(); if (version.isActiveActiveReplicationEnabled() && !isFollower) { for (String region: regionNames) { if (Utils.isSeparateTopicRegion(region) && !version.isSeparateRealTimeTopicEnabled()) { continue; } - HeartbeatKey key = new HeartbeatKey(storeName, versionNum, partition, region); + VeniceRegionLocality locality = haveLocalRegion + ? (region.equals(localRegionName) ? VeniceRegionLocality.LOCAL : VeniceRegionLocality.REMOTE) + : null; + HeartbeatKey key = + new HeartbeatKey(storeName, versionNum, partition, region, writeType, chunkingStatus, locality); IngestionTimestampEntry previousEntry = heartbeatTimestamps.putIfAbsent(key, new IngestionTimestampEntry(currentTime, currentTime, false, false)); if (previousEntry == null) { @@ -156,14 +184,29 @@ private synchronized void initializeEntry( } } } else { - HeartbeatKey key = new HeartbeatKey(storeName, versionNum, partition, localRegionName); + /* + * Non-AA branch keys the entry by the local region. Normalize via RegionUtils so an + * unconfigured server (localRegionName null/empty) becomes UNKNOWN_REGION instead of "". + * The OTel base-dimension validator rejects empty strings, and the per-record path also + * normalizes through RegionUtils — using the same normalization here keeps the two + * paths' HeartbeatKey.region values matchable. + */ + String region = RegionUtils.normalizeRegionName(localRegionName); + HeartbeatKey key = new HeartbeatKey( + storeName, + versionNum, + partition, + region, + writeType, + chunkingStatus, + haveLocalRegion ? VeniceRegionLocality.LOCAL : null); IngestionTimestampEntry previousEntry = heartbeatTimestamps.putIfAbsent(key, new IngestionTimestampEntry(currentTime, currentTime, false, false)); if (previousEntry == null) { LOGGER.info( "Initialized heartbeat entry for replica: {}, region: {}, follower: {}", replicaId, - localRegionName, + region, isFollower); } } @@ -189,11 +232,19 @@ private synchronized void removeEntry( * * @param version the version to monitor lag for * @param partition the partition to monitor lag for + * @param replicaId the replica id used for log/cleanup keying + * @param writeType pre-resolved store write-type label baked into every HeartbeatKey for this entry + * @param chunkingStatus pre-resolved version-level chunking label baked into every HeartbeatKey for this entry */ - public void addFollowerLagMonitor(Version version, int partition, String replicaId) { + public void addFollowerLagMonitor( + Version version, + int partition, + String replicaId, + VeniceStoreWriteType writeType, + VeniceChunkingStatus chunkingStatus) { cleanupHeartbeatMap.compute(replicaId, (k, v) -> { - // See comments in {@link #addLeaderLagMonitor(Version, int)} for race condition explanations - initializeEntry(followerHeartbeatTimeStamps, version, partition, true, replicaId); + // See comments in {@link #addLeaderLagMonitor} for race condition explanations + initializeEntry(followerHeartbeatTimeStamps, version, partition, true, replicaId, writeType, chunkingStatus); removeEntry(leaderHeartbeatTimeStamps, version, partition, replicaId); return null; }); @@ -205,8 +256,16 @@ public void addFollowerLagMonitor(Version version, int partition, String replica * * @param version the version to monitor lag for * @param partition the partition to monitor lag for + * @param replicaId the replica id used for log/cleanup keying + * @param writeType pre-resolved store write-type label baked into every HeartbeatKey for this entry + * @param chunkingStatus pre-resolved version-level chunking label baked into every HeartbeatKey for this entry */ - public void addLeaderLagMonitor(Version version, int partition, String replicaId) { + public void addLeaderLagMonitor( + Version version, + int partition, + String replicaId, + VeniceStoreWriteType writeType, + VeniceChunkingStatus chunkingStatus) { cleanupHeartbeatMap.compute(replicaId, (k, v) -> { // Cleanup logic should perform the check and cleanup in a similar compute block to ensure that the following // race won't occur: @@ -214,7 +273,7 @@ public void addLeaderLagMonitor(Version version, int partition, String replicaId // 2. The same replica is reassigned to this node and added to lag monitor before the cleanup thread is able to // remove it. // 3. The cleanup thread removes the newly added lag monitor and the replica will be ingested without lag monitor - initializeEntry(leaderHeartbeatTimeStamps, version, partition, false, replicaId); + initializeEntry(leaderHeartbeatTimeStamps, version, partition, false, replicaId, writeType, chunkingStatus); removeEntry(followerHeartbeatTimeStamps, version, partition, replicaId); return null; }); @@ -367,12 +426,22 @@ public void updateLagMonitor( // It's valid to have null version when trying to remove lag monitor for the deleted resource. version = new VersionImpl(storeName, storeVersion, ""); } + /* + * Resolve SLO labels here, where we already hold the Store + Version that waitVersion just + * returned. This is the single source of truth for labels — both the per-record OTel emit + * path (via PCS-cached HeartbeatKey) and the periodic record-lag path (via these HMS-built + * keys) end up consuming labels resolved from the same Store/Version snapshot. + */ + VeniceStoreWriteType writeType = + store.isWriteComputationEnabled() ? VeniceStoreWriteType.WRITE_COMPUTE : VeniceStoreWriteType.REGULAR; + VeniceChunkingStatus chunkingStatus = + version.isChunkingEnabled() ? VeniceChunkingStatus.CHUNKED : VeniceChunkingStatus.UNCHUNKED; switch (heartbeatLagMonitorAction) { case SET_LEADER_MONITOR: - addLeaderLagMonitor(version, partitionId, replicaId); + addLeaderLagMonitor(version, partitionId, replicaId, writeType, chunkingStatus); break; case SET_FOLLOWER_MONITOR: - addFollowerLagMonitor(version, partitionId, replicaId); + addFollowerLagMonitor(version, partitionId, replicaId, writeType, chunkingStatus); break; case REMOVE_MONITOR: removeLagMonitor(version, partitionId, replicaId); @@ -385,6 +454,7 @@ public void updateLagMonitor( replicaId, heartbeatLagMonitorAction.getTrigger(), e); + heartbeatMonitoringServiceStats.recordHeartbeatExceptionCount(VeniceHeartbeatComponent.LAG_MONITOR_UPDATE); } } @@ -542,7 +612,14 @@ public void recordLeaderRecordTimestamp(HeartbeatKey key, Long timestamp, boolea if (perRecordOtelMetricsEnabled) { long delay = System.currentTimeMillis() - timestamp; - versionStatsReporter.emitPerRecordLeaderOtelMetric(key.storeName, key.version, key.region, delay); + versionStatsReporter.emitPerRecordLeaderOtelMetric( + key.storeName, + key.version, + key.region, + delay, + key.writeType, + key.chunkingStatus, + key.locality); } } @@ -559,8 +636,15 @@ public void recordFollowerRecordTimestamp(HeartbeatKey key, Long timestamp, bool if (perRecordOtelMetricsEnabled) { long delay = System.currentTimeMillis() - timestamp; - versionStatsReporter - .emitPerRecordFollowerOtelMetric(key.storeName, key.version, key.region, delay, isReadyToServe); + versionStatsReporter.emitPerRecordFollowerOtelMetric( + key.storeName, + key.version, + key.region, + delay, + isReadyToServe, + key.writeType, + key.chunkingStatus, + key.locality); } } @@ -642,55 +726,69 @@ protected Map getFollowerHeartbeatTimeSta protected void recordLags( Map heartbeatTimestamps, - ReportLagFunction lagFunction) { + RecordLatencyFunction lagFunction) { for (Map.Entry entry: heartbeatTimestamps.entrySet()) { - HeartbeatKey key = entry.getKey(); - lagFunction.apply( - key.storeName, - key.version, - key.region, - entry.getValue().heartbeatTimestamp, - entry.getValue().readyToServe); + lagFunction.apply(entry.getKey(), entry.getValue().heartbeatTimestamp, entry.getValue().readyToServe); } } - protected void recordRecordLags( + protected void reportRecordLatencies( Map heartbeatTimestamps, - ReportLagFunction lagFunction) { + RecordLatencyFunction reporter) { for (Map.Entry entry: heartbeatTimestamps.entrySet()) { - HeartbeatKey key = entry.getKey(); - lagFunction.apply( - key.storeName, - key.version, - key.region, - entry.getValue().recordTimestamp, - entry.getValue().readyToServe); + reporter.apply(entry.getKey(), entry.getValue().recordTimestamp, entry.getValue().readyToServe); } } protected void record() { - // Record heartbeat message delays + // Record heartbeat message delays — labels are baked on the HeartbeatKey at insertion. recordLags( leaderHeartbeatTimeStamps, - ((storeName, version, region, heartbeatTs, isReadyToServe) -> versionStatsReporter - .recordLeaderLag(storeName, version, region, heartbeatTs))); + ((key, heartbeatTs, isReadyToServe) -> versionStatsReporter.recordLeaderLag( + key.storeName, + key.version, + key.region, + heartbeatTs, + key.writeType, + key.chunkingStatus, + key.locality))); recordLags( followerHeartbeatTimeStamps, - ((storeName, version, region, heartbeatTs, isReadyToServe) -> versionStatsReporter - .recordFollowerLag(storeName, version, region, heartbeatTs, isReadyToServe))); + ((key, heartbeatTs, isReadyToServe) -> versionStatsReporter.recordFollowerLag( + key.storeName, + key.version, + key.region, + heartbeatTs, + isReadyToServe, + key.writeType, + key.chunkingStatus, + key.locality))); // Record record-level delays via OTel periodically. Skip if per-record OTel metrics are already enabled, // since those emit accurate per-message latency and the periodic snapshot would add inaccurate data points // (delay grows by up to the reporting interval since it uses max(recordTimestamp) rather than per-record values). if (recordLevelTimestampEnabled && !perRecordOtelMetricsEnabled) { - recordRecordLags( + reportRecordLatencies( leaderHeartbeatTimeStamps, - ((storeName, version, region, recordTs, isReadyToServe) -> versionStatsReporter - .recordLeaderRecordLag(storeName, version, region, recordTs))); - recordRecordLags( + ((key, recordTs, isReadyToServe) -> versionStatsReporter.recordLeaderRecordLag( + key.storeName, + key.version, + key.region, + recordTs, + key.writeType, + key.chunkingStatus, + key.locality))); + reportRecordLatencies( followerHeartbeatTimeStamps, - ((storeName, version, region, recordTs, isReadyToServe) -> versionStatsReporter - .recordFollowerRecordLag(storeName, version, region, recordTs, isReadyToServe))); + ((key, recordTs, isReadyToServe) -> versionStatsReporter.recordFollowerRecordLag( + key.storeName, + key.version, + key.region, + recordTs, + isReadyToServe, + key.writeType, + key.chunkingStatus, + key.locality))); } } @@ -808,7 +906,7 @@ private void cleanupStoreVersionPartitionMap(Map { // Replica is not assigned to this node based on locally cached customized view - // See comments in {@link #addLeaderLagMonitor(Version, int)} for race condition explanations + // See comments in {@link #addLeaderLagMonitor} for race condition explanations if (v == null) { return 1; } else if (v + 1 >= lagMonitorCleanupCycle) { @@ -874,9 +972,15 @@ public AggregatedHeartbeatLagEntry getMaxFollowerHeartbeatLag() { return getMaxHeartbeatLag(System.currentTimeMillis(), followerHeartbeatTimeStamps); } + /** + * Hands the full {@link HeartbeatKey} to the consumer so both the periodic record-lag path and + * the heartbeat-lag path can piggyback on the SLO labels carried by the key. The {@code timestamp} + * parameter is the raw producer/heartbeat timestamp from the entry — the consumer is expected to + * compute the lag (now − timestamp). + */ @FunctionalInterface - interface ReportLagFunction { - void apply(String storeName, int version, String region, long lag, boolean isReadyToServe); + interface RecordLatencyFunction { + void apply(HeartbeatKey key, long timestamp, boolean isReadyToServe); } private class HeartbeatReporterThread extends Thread { @@ -895,11 +999,11 @@ public void run() { while (heartbeatReporterThreadIsRunning.get()) { try { if (exceptionThrown) { - TimeUnit.SECONDS.sleep(DEFAULT_REPORTER_THREAD_SLEEP_INTERVAL_SECONDS); + TimeUnit.SECONDS.sleep(heartbeatReporterIntervalSeconds); } heartbeatMonitoringServiceStats.recordReporterHeartbeat(); record(); - TimeUnit.SECONDS.sleep(DEFAULT_REPORTER_THREAD_SLEEP_INTERVAL_SECONDS); + TimeUnit.SECONDS.sleep(heartbeatReporterIntervalSeconds); exceptionThrown = false; } catch (InterruptedException e) { // We've received an interrupt which is to be expected, so we'll just leave the loop and log diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelMetricEntity.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelMetricEntity.java index fae2c46f38c..5863c52d8a3 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelMetricEntity.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelMetricEntity.java @@ -1,10 +1,13 @@ package com.linkedin.davinci.stats.ingestion.heartbeat; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CHUNKING_STATUS; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CLUSTER_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_LOCALITY; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_NAME; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_STATE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_WRITE_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_VERSION_ROLE; import static com.linkedin.venice.utils.Utils.setOf; @@ -24,9 +27,12 @@ public enum HeartbeatOtelMetricEntity implements ModuleMetricEntityInterface { VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REGION_NAME, + VENICE_REGION_LOCALITY, VENICE_VERSION_ROLE, VENICE_REPLICA_TYPE, - VENICE_REPLICA_STATE) + VENICE_REPLICA_STATE, + VENICE_STORE_WRITE_TYPE, + VENICE_CHUNKING_STATUS) ); private final MetricEntity metricEntity; diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelStats.java index 7233d330cbe..1b0e6ac12cf 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelStats.java @@ -10,8 +10,11 @@ import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; import com.linkedin.venice.stats.dimensions.ReplicaState; import com.linkedin.venice.stats.dimensions.ReplicaType; +import com.linkedin.venice.stats.dimensions.VeniceChunkingStatus; import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; -import com.linkedin.venice.stats.metrics.MetricEntityStateThreeEnums; +import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; +import com.linkedin.venice.stats.dimensions.VeniceStoreWriteType; +import com.linkedin.venice.stats.metrics.MetricEntityStateFiveEnums; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; import java.util.HashMap; @@ -29,9 +32,11 @@ public class HeartbeatOtelStats { /** * Per-region metric entity states, keyed by region name. Grows lazily via {@code computeIfAbsent} and is bounded - * by the number of distinct regions in the deployment. Entries are not evicted individually. + * by the number of distinct regions in the deployment. Entries are not evicted individually. Locality is baked + * into the per-region state's base dimensions on first call (deterministic function of source region for a + * given server), matching the layout of {@link RecordLevelDelayOtelStats}. */ - private final Map> metricsByRegion; + private final Map> metricsByRegion; // Version info cache for classifying versions as CURRENT/FUTURE/BACKUP private volatile VersionInfo versionInfo = VersionInfo.NON_EXISTING; @@ -69,12 +74,17 @@ public void updateVersionInfo(int currentVersion, int futureVersion) { /** * Records a heartbeat delay with all dimensional attributes to OTel metrics. - * Returns early if OTel metrics are disabled or version is invalid. + * Returns early if OTel metrics are disabled. * * @param version The version number * @param region The region name * @param replicaType The replica type {@link ReplicaType} * @param replicaState The replica state {@link ReplicaState} + * @param writeType Pre-resolved store write-type label (REGULAR / WRITE_COMPUTE) + * @param chunkingStatus Pre-resolved version-level chunking label (CHUNKED / UNCHUNKED) + * @param locality Pre-resolved locality label (LOCAL / REMOTE) — applied to the per-region metric state on first + * call per region; subsequent calls reuse the cached state. Null is coerced to REMOTE: from the + * perspective of an unconfigured server, every source region is "not the local region". * @param delayMs The delay in milliseconds */ public void recordHeartbeatDelayOtelMetrics( @@ -82,34 +92,50 @@ public void recordHeartbeatDelayOtelMetrics( String region, ReplicaType replicaType, ReplicaState replicaState, + VeniceStoreWriteType writeType, + VeniceChunkingStatus chunkingStatus, + VeniceRegionLocality locality, long delayMs) { if (!emitOtelMetrics()) { return; } VersionRole versionRole = OtelVersionedStatsUtils.classifyVersion(version, this.versionInfo); - MetricEntityStateThreeEnums metricState = getOrCreateMetricState(region); + /* + * Locality can be null when the server's local region is unconfigured (PCS / HMS leave it + * null in that case rather than mislabel everything as REMOTE). For the OTel emit path we + * still need a concrete label, so default to REMOTE. + */ + VeniceRegionLocality resolvedLocality = locality != null ? locality : VeniceRegionLocality.REMOTE; - // Records to OTel metrics only - metricState.record(delayMs, versionRole, replicaType, replicaState); + MetricEntityStateFiveEnums metricState = + getOrCreateMetricState(region, resolvedLocality); + + metricState.record(delayMs, versionRole, replicaType, replicaState, writeType, chunkingStatus); } /** - * Gets or creates a metric entity state for a specific region. + * Gets or creates a metric entity state for a specific region. Region locality is supplied by the + * caller and baked into the per-region base dimensions on first call — every subsequent record + * for the same region reuses the cached state. */ - private MetricEntityStateThreeEnums getOrCreateMetricState(String region) { + private MetricEntityStateFiveEnums getOrCreateMetricState( + String region, + VeniceRegionLocality locality) { return metricsByRegion.computeIfAbsent(region, r -> { - // Add region to base dimensions Map regionBaseDimensions = new HashMap<>(baseDimensionsMap); regionBaseDimensions.put(VeniceMetricsDimensions.VENICE_REGION_NAME, r); + regionBaseDimensions.put(VeniceMetricsDimensions.VENICE_REGION_LOCALITY, locality.getDimensionValue()); - return MetricEntityStateThreeEnums.create( + return MetricEntityStateFiveEnums.create( INGESTION_HEARTBEAT_DELAY.getMetricEntity(), otelRepository, regionBaseDimensions, VersionRole.class, ReplicaType.class, - ReplicaState.class); + ReplicaState.class, + VeniceStoreWriteType.class, + VeniceChunkingStatus.class); }); } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatVersionedStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatVersionedStats.java index 4e90aa70ecd..7e529f7dbbc 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatVersionedStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatVersionedStats.java @@ -7,6 +7,9 @@ import com.linkedin.venice.stats.StatsSupplier; import com.linkedin.venice.stats.dimensions.ReplicaState; import com.linkedin.venice.stats.dimensions.ReplicaType; +import com.linkedin.venice.stats.dimensions.VeniceChunkingStatus; +import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; +import com.linkedin.venice.stats.dimensions.VeniceStoreWriteType; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; import java.util.Map; @@ -50,7 +53,14 @@ public HeartbeatVersionedStats( this.recordLevelDelayOtelStatsMap = new VeniceConcurrentHashMap<>(); } - public void recordLeaderLag(String storeName, int version, String region, long heartbeatTs) { + public void recordLeaderLag( + String storeName, + int version, + String region, + long heartbeatTs, + VeniceStoreWriteType writeType, + VeniceChunkingStatus chunkingStatus, + VeniceRegionLocality locality) { // Calculate current time and delay once for both Tehuti and OTel metrics long currentTime = currentTimeSupplier.get(); long delay = currentTime - heartbeatTs; @@ -64,6 +74,9 @@ public void recordLeaderLag(String storeName, int version, String region, long h region, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, // Leaders are always ready to serve + writeType, + chunkingStatus, + locality, delay); } @@ -72,7 +85,10 @@ public void recordFollowerLag( int version, String region, long heartbeatTs, - boolean isReadyToServe) { + boolean isReadyToServe, + VeniceStoreWriteType writeType, + VeniceChunkingStatus chunkingStatus, + VeniceRegionLocality locality) { // Calculate current time and delay once for all metrics long currentTime = currentTimeSupplier.get(); long delay = currentTime - heartbeatTs; @@ -94,16 +110,29 @@ public void recordFollowerLag( region, ReplicaType.FOLLOWER, ReplicaState.READY_TO_SERVE, + writeType, + chunkingStatus, + locality, readyToServeDelay); otelStats.recordHeartbeatDelayOtelMetrics( version, region, ReplicaType.FOLLOWER, ReplicaState.CATCHING_UP, + writeType, + chunkingStatus, + locality, catchingUpDelay); } - public void recordLeaderRecordLag(String storeName, int version, String region, long recordTs) { + public void recordLeaderRecordLag( + String storeName, + int version, + String region, + long recordTs, + VeniceStoreWriteType writeType, + VeniceChunkingStatus chunkingStatus, + VeniceRegionLocality locality) { long currentTime = currentTimeSupplier.get(); long delay = currentTime - recordTs; @@ -113,6 +142,9 @@ public void recordLeaderRecordLag(String storeName, int version, String region, region, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, // Leaders are always ready to serve + writeType, + chunkingStatus, + locality, delay); } @@ -121,7 +153,10 @@ public void recordFollowerRecordLag( int version, String region, long recordTs, - boolean isReadyToServe) { + boolean isReadyToServe, + VeniceStoreWriteType writeType, + VeniceChunkingStatus chunkingStatus, + VeniceRegionLocality locality) { long currentTime = currentTimeSupplier.get(); long delay = currentTime - recordTs; @@ -135,9 +170,19 @@ public void recordFollowerRecordLag( region, ReplicaType.FOLLOWER, ReplicaState.READY_TO_SERVE, + writeType, + chunkingStatus, + locality, readyToServeDelay); - otelStats - .recordRecordDelayOtelMetrics(version, region, ReplicaType.FOLLOWER, ReplicaState.CATCHING_UP, catchingUpDelay); + otelStats.recordRecordDelayOtelMetrics( + version, + region, + ReplicaType.FOLLOWER, + ReplicaState.CATCHING_UP, + writeType, + chunkingStatus, + locality, + catchingUpDelay); } /** No-op: heartbeat stats are loaded lazily when the first heartbeat/record arrives. */ @@ -228,7 +273,11 @@ private HeartbeatOtelStats getOrCreateHeartbeatOtelStats(String storeName) { }); } - /** Same pattern as {@link #getOrCreateHeartbeatOtelStats}. */ + /** + * Same pattern as {@link #getOrCreateHeartbeatOtelStats}. SLO classification dimensions + * (write type, chunking, locality) are now caller-supplied on every emit call — see + * {@link #emitPerRecordLeaderOtelMetric} / {@link #emitPerRecordFollowerOtelMetric}. + */ private RecordLevelDelayOtelStats getOrCreateRecordLevelDelayOtelStats(String storeName) { RecordLevelDelayOtelStats existing = recordLevelDelayOtelStatsMap.get(storeName); if (existing != null) { @@ -247,12 +296,27 @@ private RecordLevelDelayOtelStats getOrCreateRecordLevelDelayOtelStats(String st * Emits a per-record OTel metric for leader record delay (called per record, not aggregated). * Uses {@code get()} as a fast path; falls back to {@code getOrCreate} on first call per store. */ - public void emitPerRecordLeaderOtelMetric(String storeName, int version, String region, long delay) { + public void emitPerRecordLeaderOtelMetric( + String storeName, + int version, + String region, + long delay, + VeniceStoreWriteType writeType, + VeniceChunkingStatus chunkingStatus, + VeniceRegionLocality locality) { RecordLevelDelayOtelStats otelStats = getOrLazilyCreateRecordLevelDelayOtelStats(storeName); if (otelStats == null || !otelStats.emitOtelMetrics()) { return; } - otelStats.recordRecordDelayOtelMetrics(version, region, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, delay); + otelStats.recordRecordDelayOtelMetrics( + version, + region, + ReplicaType.LEADER, + ReplicaState.READY_TO_SERVE, + writeType, + chunkingStatus, + locality, + delay); } /** @@ -264,13 +328,24 @@ public void emitPerRecordFollowerOtelMetric( int version, String region, long delay, - boolean isReadyToServe) { + boolean isReadyToServe, + VeniceStoreWriteType writeType, + VeniceChunkingStatus chunkingStatus, + VeniceRegionLocality locality) { RecordLevelDelayOtelStats otelStats = getOrLazilyCreateRecordLevelDelayOtelStats(storeName); if (otelStats == null || !otelStats.emitOtelMetrics()) { return; } ReplicaState replicaState = isReadyToServe ? ReplicaState.READY_TO_SERVE : ReplicaState.CATCHING_UP; - otelStats.recordRecordDelayOtelMetrics(version, region, ReplicaType.FOLLOWER, replicaState, delay); + otelStats.recordRecordDelayOtelMetrics( + version, + region, + ReplicaType.FOLLOWER, + replicaState, + writeType, + chunkingStatus, + locality, + delay); } /** diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelMetricEntity.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelMetricEntity.java index ad7bd287ad4..8febbe31421 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelMetricEntity.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelMetricEntity.java @@ -1,10 +1,13 @@ package com.linkedin.davinci.stats.ingestion.heartbeat; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CHUNKING_STATUS; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CLUSTER_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_LOCALITY; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_NAME; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_STATE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_WRITE_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_VERSION_ROLE; import static com.linkedin.venice.utils.Utils.setOf; @@ -28,9 +31,12 @@ public enum RecordLevelDelayOtelMetricEntity implements ModuleMetricEntityInterf VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REGION_NAME, + VENICE_REGION_LOCALITY, VENICE_VERSION_ROLE, VENICE_REPLICA_TYPE, - VENICE_REPLICA_STATE) + VENICE_REPLICA_STATE, + VENICE_STORE_WRITE_TYPE, + VENICE_CHUNKING_STATUS) ); private final MetricEntity metricEntity; diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelStats.java index 0dc1eee7bb4..671328b8e00 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelStats.java @@ -10,8 +10,11 @@ import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; import com.linkedin.venice.stats.dimensions.ReplicaState; import com.linkedin.venice.stats.dimensions.ReplicaType; +import com.linkedin.venice.stats.dimensions.VeniceChunkingStatus; import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; -import com.linkedin.venice.stats.metrics.MetricEntityStateThreeEnums; +import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; +import com.linkedin.venice.stats.dimensions.VeniceStoreWriteType; +import com.linkedin.venice.stats.metrics.MetricEntityStateFiveEnums; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; import java.util.HashMap; @@ -21,7 +24,16 @@ /** * OpenTelemetry metrics for record-level delay monitoring. * Tracks delays for regular data records (not heartbeat control messages). - * Note: Tehuti metrics are managed separately in {@link HeartbeatStatReporter}. + * + *

SLO classification dimensions ({@link VeniceStoreWriteType}, {@link VeniceChunkingStatus}, + * {@link VeniceRegionLocality}) are supplied by the caller on every record so they reflect the + * version-level configuration the caller resolved (rather than store-level approximations baked + * in at construction time). Write-type and chunking are emitted as dynamic dimensions; locality + * stays a base dimension because it is a deterministic function of the source region — caller + * provides the resolved enum on the first call per region, and it is cached in that region's + * {@link MetricEntityStateFiveEnums} instance. + * + *

Note: Tehuti metrics are managed separately in {@link HeartbeatStatReporter}. */ public class RecordLevelDelayOtelStats { private final boolean emitOtelMetrics; @@ -32,7 +44,7 @@ public class RecordLevelDelayOtelStats { * Per-region metric entity states, keyed by region name. Grows lazily via {@code computeIfAbsent} and is bounded * by the number of distinct regions in the deployment. Entries are not evicted individually. */ - private final Map> metricsByRegion; + private final Map> metricsByRegion; // Version info cache for classifying versions as CURRENT/FUTURE/BACKUP private volatile VersionInfo versionInfo = VersionInfo.NON_EXISTING; @@ -69,13 +81,20 @@ public void updateVersionInfo(int currentVersion, int futureVersion) { } /** - * Records a record-level delay with all dimensional attributes to OTel metrics. - * Returns early if OTel metrics are disabled or version is invalid. + * Records a record-level delay with all dimensional attributes to OTel metrics. Returns early + * only when OTel emission is disabled. Version classification is best-effort: + * {@link OtelVersionedStatsUtils#classifyVersion} returns {@link VersionRole#BACKUP} for any + * version that is neither current nor future — including unknown/0/-1 — and emission still + * proceeds. * - * @param version The version number - * @param region The region name + * @param version The version number (used to classify VersionRole as CURRENT/FUTURE/BACKUP) + * @param region The record's source region * @param replicaType The replica type {@link ReplicaType} * @param replicaState The replica state {@link ReplicaState} + * @param writeType Pre-resolved write-type label (REGULAR / WRITE_COMPUTE) + * @param chunkingStatus Pre-resolved chunking label (CHUNKED / UNCHUNKED) + * @param locality Pre-resolved locality label (LOCAL / REMOTE) — applied to the per-region metric state on first + * call per region; subsequent calls reuse the cached state * @param delayMs The delay in milliseconds */ public void recordRecordDelayOtelMetrics( @@ -83,34 +102,54 @@ public void recordRecordDelayOtelMetrics( String region, ReplicaType replicaType, ReplicaState replicaState, + VeniceStoreWriteType writeType, + VeniceChunkingStatus chunkingStatus, + VeniceRegionLocality locality, long delayMs) { if (!emitOtelMetrics()) { return; } VersionRole versionRole = OtelVersionedStatsUtils.classifyVersion(version, this.versionInfo); - MetricEntityStateThreeEnums metricState = getOrCreateMetricState(region); + /* + * Locality can be null when the server's local region is unconfigured (PCS / HMS leave it + * null in that case rather than mislabel everything as REMOTE). For the OTel emit path we + * still need a concrete label, so default to REMOTE: from the perspective of an + * unconfigured server, every source region is "not the local region". + */ + VeniceRegionLocality resolvedLocality = locality != null ? locality : VeniceRegionLocality.REMOTE; + + MetricEntityStateFiveEnums metricState = + getOrCreateMetricState(region, resolvedLocality); // Records to OTel metrics only - metricState.record(delayMs, versionRole, replicaType, replicaState); + metricState.record(delayMs, versionRole, replicaType, replicaState, writeType, chunkingStatus); } /** - * Gets or creates a metric entity state for a specific region. + * Gets or creates a metric entity state for a specific region. Region locality is supplied by the + * caller and baked into the per-region base dimensions on first call — every subsequent record + * for the same region reuses the cached {@link MetricEntityStateFiveEnums}. Locality is a + * deterministic function of source region for a given server, so once-per-region caching is safe. */ - private MetricEntityStateThreeEnums getOrCreateMetricState(String region) { + private MetricEntityStateFiveEnums getOrCreateMetricState( + String region, + VeniceRegionLocality locality) { return metricsByRegion.computeIfAbsent(region, r -> { - // Add region to base dimensions + // Add region name and locality to base dimensions Map regionBaseDimensions = new HashMap<>(baseDimensionsMap); regionBaseDimensions.put(VeniceMetricsDimensions.VENICE_REGION_NAME, r); + regionBaseDimensions.put(VeniceMetricsDimensions.VENICE_REGION_LOCALITY, locality.getDimensionValue()); - return MetricEntityStateThreeEnums.create( + return MetricEntityStateFiveEnums.create( INGESTION_RECORD_DELAY.getMetricEntity(), otelRepository, regionBaseDimensions, VersionRole.class, ReplicaType.class, - ReplicaState.class); + ReplicaState.class, + VeniceStoreWriteType.class, + VeniceChunkingStatus.class); }); } diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/helix/LeaderFollowerParticipantModelFactoryTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/helix/LeaderFollowerParticipantModelFactoryTest.java index e5f8f30f1e8..a00fb2c4112 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/helix/LeaderFollowerParticipantModelFactoryTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/helix/LeaderFollowerParticipantModelFactoryTest.java @@ -7,6 +7,7 @@ import com.linkedin.davinci.config.VeniceServerConfig; import com.linkedin.davinci.config.VeniceStoreVersionConfig; import com.linkedin.davinci.ingestion.IngestionBackend; +import com.linkedin.davinci.stats.HeartbeatMonitoringServiceStats; import com.linkedin.davinci.stats.ParticipantStateTransitionStats; import com.linkedin.davinci.stats.ingestion.heartbeat.HeartbeatMonitoringService; import com.linkedin.venice.exceptions.VeniceException; @@ -91,7 +92,7 @@ public void setupTestCase() { new MetricsRepository(), mockReadOnlyStoreRepository, veniceServerConfig, - null, + mock(HeartbeatMonitoringServiceStats.class), CompletableFuture.completedFuture(mockCustomizedViewRepository))); } diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/helix/VeniceLeaderFollowerStateModelTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/helix/VeniceLeaderFollowerStateModelTest.java index de95c154b0d..43f7850313b 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/helix/VeniceLeaderFollowerStateModelTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/helix/VeniceLeaderFollowerStateModelTest.java @@ -17,6 +17,7 @@ import com.linkedin.davinci.config.VeniceServerConfig; import com.linkedin.davinci.config.VeniceStoreVersionConfig; +import com.linkedin.davinci.stats.HeartbeatMonitoringServiceStats; import com.linkedin.davinci.stats.ingestion.heartbeat.HeartbeatLagMonitorAction; import com.linkedin.davinci.stats.ingestion.heartbeat.HeartbeatMonitoringService; import com.linkedin.venice.helix.HelixCustomizedViewOfflinePushRepository; @@ -53,7 +54,7 @@ protected LeaderFollowerPartitionStateModel getParticipantStateModel() { new MetricsRepository(), mockReadOnlyStoreRepository, serverConfig, - null, + mock(HeartbeatMonitoringServiceStats.class), CompletableFuture.completedFuture(mockCustomizedViewRepository)); spyHeartbeatMonitoringService = spy(heartbeatMonitoringService); return new LeaderFollowerPartitionStateModel( @@ -89,7 +90,7 @@ private LeaderFollowerPartitionStateModel getSystemStoreStateModel() { new MetricsRepository(), mockReadOnlyStoreRepository, serverConfig, - null, + mock(HeartbeatMonitoringServiceStats.class), CompletableFuture.completedFuture(mockCustomizedViewRepository)); return new LeaderFollowerPartitionStateModel( mockIngestionBackend, diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/ActiveKeyCountTestUtils.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/ActiveKeyCountTestUtils.java index fe7dbeff7b5..d7f5dda75cf 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/ActiveKeyCountTestUtils.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/ActiveKeyCountTestUtils.java @@ -34,7 +34,10 @@ static PartitionConsumptionState freshPcs(PubSubTopicPartition tp) { AvroProtocolDefinition.PARTITION_STATE.getSerializer(), DEFAULT_PUBSUB_CONTEXT_FOR_UNIT_TESTING), DEFAULT_PUBSUB_CONTEXT_FOR_UNIT_TESTING, - true); + true, + false, + false, + null); } /** Creates a fresh PCS with the given leader/follower state for the default topic partition. */ @@ -58,12 +61,26 @@ static PartitionConsumptionState pcsFromCheckpoint(long activeKeyCount) { AvroProtocolDefinition.PARTITION_STATE.getSerializer(), DEFAULT_PUBSUB_CONTEXT_FOR_UNIT_TESTING); or.setActiveKeyCount(activeKeyCount); - return new PartitionConsumptionState(DEFAULT_TP, or, DEFAULT_PUBSUB_CONTEXT_FOR_UNIT_TESTING, true); + return new PartitionConsumptionState( + DEFAULT_TP, + or, + DEFAULT_PUBSUB_CONTEXT_FOR_UNIT_TESTING, + true, + false, + false, + null); } /** Restores a PCS from a deserialized OffsetRecord checkpoint. */ static PartitionConsumptionState restoreFrom(OffsetRecord checkpoint) { - return new PartitionConsumptionState(DEFAULT_TP, checkpoint, DEFAULT_PUBSUB_CONTEXT_FOR_UNIT_TESTING, true); + return new PartitionConsumptionState( + DEFAULT_TP, + checkpoint, + DEFAULT_PUBSUB_CONTEXT_FOR_UNIT_TESTING, + true, + false, + false, + null); } /** Simulates syncOffset: copies PCS count into OffsetRecord, serializes, and deserializes. */ diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/PartitionConsumptionStateTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/PartitionConsumptionStateTest.java index 4e0e45b4b3d..45d45fcf1bb 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/PartitionConsumptionStateTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/PartitionConsumptionStateTest.java @@ -9,6 +9,7 @@ import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; +import com.linkedin.davinci.stats.ingestion.heartbeat.HeartbeatKey; import com.linkedin.venice.kafka.protocol.Put; import com.linkedin.venice.kafka.validation.checksum.CheckSum; import com.linkedin.venice.kafka.validation.checksum.CheckSumType; @@ -20,6 +21,9 @@ import com.linkedin.venice.pubsub.api.PubSubTopicPartition; import com.linkedin.venice.schema.rmd.RmdSchemaGenerator; import com.linkedin.venice.serialization.avro.AvroProtocolDefinition; +import com.linkedin.venice.stats.dimensions.VeniceChunkingStatus; +import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; +import com.linkedin.venice.stats.dimensions.VeniceStoreWriteType; import com.linkedin.venice.writer.LeaderCompleteState; import com.linkedin.venice.writer.WriterChunkingHelper; import java.nio.ByteBuffer; @@ -47,8 +51,14 @@ public void setUp() { @Test public void testUpdateChecksum() { - PartitionConsumptionState pcs = - new PartitionConsumptionState(TOPIC_PARTITION, mock(OffsetRecord.class), pubSubContext, false); + PartitionConsumptionState pcs = new PartitionConsumptionState( + TOPIC_PARTITION, + mock(OffsetRecord.class), + pubSubContext, + false, + false, + false, + null); pcs.initializeExpectedChecksum(); byte[] rmdPayload = new byte[] { 127 }; byte[] key1 = new byte[] { 1 }; @@ -103,8 +113,14 @@ public void testUpdateChecksum() { */ @Test public void testTransientRecordMap() { - PartitionConsumptionState pcs = - new PartitionConsumptionState(TOPIC_PARTITION, mock(OffsetRecord.class), pubSubContext, false); + PartitionConsumptionState pcs = new PartitionConsumptionState( + TOPIC_PARTITION, + mock(OffsetRecord.class), + pubSubContext, + false, + false, + false, + null); assertEquals(pcs.getPubSubContext(), pubSubContext); PubSubPosition consumedPosition1Mock = mock(PubSubPosition.class); PubSubPosition consumedPosition2Mock = mock(PubSubPosition.class); @@ -154,8 +170,14 @@ public void testTransientRecordMap() { @Test public void testIsLeaderCompleted() { - PartitionConsumptionState pcs = - new PartitionConsumptionState(TOPIC_PARTITION, mock(OffsetRecord.class), pubSubContext, false); + PartitionConsumptionState pcs = new PartitionConsumptionState( + TOPIC_PARTITION, + mock(OffsetRecord.class), + pubSubContext, + false, + false, + false, + null); // default is LEADER_NOT_COMPLETED assertEquals(pcs.getLeaderCompleteState(), LeaderCompleteState.LEADER_NOT_COMPLETED); assertFalse(pcs.isLeaderCompleted()); @@ -170,7 +192,8 @@ public void testAddIncPushVersionToPendingReportList() { List pendingReportIncrementalPush = new ArrayList<>(); OffsetRecord offsetRecord = mock(OffsetRecord.class); doReturn(pendingReportIncrementalPush).when(offsetRecord).getPendingReportIncPushVersionList(); - PartitionConsumptionState pcs = new PartitionConsumptionState(TOPIC_PARTITION, offsetRecord, pubSubContext, false); + PartitionConsumptionState pcs = + new PartitionConsumptionState(TOPIC_PARTITION, offsetRecord, pubSubContext, false, false, false, null); pcs.addIncPushVersionToPendingReportList("a"); Assert.assertEquals(pcs.getPendingReportIncPushVersionList().size(), 1); for (int i = 0; i < 50; i++) { @@ -182,8 +205,14 @@ public void testAddIncPushVersionToPendingReportList() { @Test public void testDolStateOperations() { - PartitionConsumptionState pcs = - new PartitionConsumptionState(TOPIC_PARTITION, mock(OffsetRecord.class), pubSubContext, false); + PartitionConsumptionState pcs = new PartitionConsumptionState( + TOPIC_PARTITION, + mock(OffsetRecord.class), + pubSubContext, + false, + false, + false, + null); // Initially, DoL state should be null assertNull(pcs.getDolState()); @@ -203,8 +232,14 @@ public void testDolStateOperations() { @Test public void testHighestLeadershipTermOperations() { - PartitionConsumptionState pcs = - new PartitionConsumptionState(TOPIC_PARTITION, mock(OffsetRecord.class), pubSubContext, false); + PartitionConsumptionState pcs = new PartitionConsumptionState( + TOPIC_PARTITION, + mock(OffsetRecord.class), + pubSubContext, + false, + false, + false, + null); // Initially, highest leadership term should be -1 (default uninitialized value) assertEquals(pcs.getHighestLeadershipTerm(), -1L); @@ -224,8 +259,14 @@ public void testHighestLeadershipTermOperations() { @Test public void testDolStateWithMultipleUpdates() { - PartitionConsumptionState pcs = - new PartitionConsumptionState(TOPIC_PARTITION, mock(OffsetRecord.class), pubSubContext, false); + PartitionConsumptionState pcs = new PartitionConsumptionState( + TOPIC_PARTITION, + mock(OffsetRecord.class), + pubSubContext, + false, + false, + false, + null); // Set first DolStamp DolStamp dolStamp1 = new DolStamp(1L, "host-1"); @@ -284,14 +325,20 @@ public void testHllInitBranchingLogic() { doReturn(ByteBuffer.wrap(serialized)).when(restoredRecord).getUniqueIngestedKeyCountHllSketch(); doReturn(null).when(restoredRecord).getLeaderTopic(); PartitionConsumptionState restoredPcs = - new PartitionConsumptionState(TOPIC_PARTITION, restoredRecord, pubSubContext, false); + new PartitionConsumptionState(TOPIC_PARTITION, restoredRecord, pubSubContext, false, false, false, null); restoredPcs.restoreUniqueKeyCountHll(); assertTrue(restoredPcs.hasUniqueIngestedKeyCountHll()); assertEquals(restoredPcs.getEstimatedUniqueIngestedKeyCount(), 2); // Case 3: Pre-deployment version — don't call any init, HLL stays null - PartitionConsumptionState preDeployPcs = - new PartitionConsumptionState(TOPIC_PARTITION, mock(OffsetRecord.class), pubSubContext, false); + PartitionConsumptionState preDeployPcs = new PartitionConsumptionState( + TOPIC_PARTITION, + mock(OffsetRecord.class), + pubSubContext, + false, + false, + false, + null); preDeployPcs.trackKeyIngested("key1".getBytes()); assertFalse(preDeployPcs.hasUniqueIngestedKeyCountHll()); assertEquals(preDeployPcs.getEstimatedUniqueIngestedKeyCount(), 0); @@ -428,13 +475,19 @@ public void testSyncOffsetHllPersistencePath() { doReturn(restored.getUniqueIngestedKeyCountHllSketch()).when(restoredForPcs).getUniqueIngestedKeyCountHllSketch(); doReturn(null).when(restoredForPcs).getLeaderTopic(); PartitionConsumptionState restoredPcs = - new PartitionConsumptionState(TOPIC_PARTITION, restoredForPcs, pubSubContext, false); + new PartitionConsumptionState(TOPIC_PARTITION, restoredForPcs, pubSubContext, false, false, false, null); restoredPcs.restoreUniqueKeyCountHll(); assertEquals(restoredPcs.getEstimatedUniqueIngestedKeyCount(), originalEstimate); // --- HLL disabled path: no bytes should be set --- - PartitionConsumptionState disabledPcs = - new PartitionConsumptionState(TOPIC_PARTITION, mock(OffsetRecord.class), pubSubContext, false); + PartitionConsumptionState disabledPcs = new PartitionConsumptionState( + TOPIC_PARTITION, + mock(OffsetRecord.class), + pubSubContext, + false, + false, + false, + null); // Don't init HLL assertFalse(disabledPcs.hasUniqueIngestedKeyCountHll()); assertNull(disabledPcs.serializeUniqueIngestedKeyCountHll()); @@ -448,25 +501,110 @@ private PartitionConsumptionState createPcsWithHll(int lgK) { OffsetRecord offsetRecord = mock(OffsetRecord.class); doReturn(null).when(offsetRecord).getUniqueIngestedKeyCountHllSketch(); doReturn(null).when(offsetRecord).getLeaderTopic(); - PartitionConsumptionState pcs = new PartitionConsumptionState(TOPIC_PARTITION, offsetRecord, pubSubContext, false); + PartitionConsumptionState pcs = + new PartitionConsumptionState(TOPIC_PARTITION, offsetRecord, pubSubContext, false, false, false, null); pcs.initializeUniqueKeyCountHll(lgK); return pcs; } @Test public void testStoreLevelPausedDefaultsToFalse() { - PartitionConsumptionState pcs = - new PartitionConsumptionState(TOPIC_PARTITION, mock(OffsetRecord.class), pubSubContext, false); + PartitionConsumptionState pcs = new PartitionConsumptionState( + TOPIC_PARTITION, + mock(OffsetRecord.class), + pubSubContext, + false, + false, + false, + null); assertFalse(pcs.isStoreLevelPaused()); } @Test public void testSetAndGetStoreLevelPaused() { - PartitionConsumptionState pcs = - new PartitionConsumptionState(TOPIC_PARTITION, mock(OffsetRecord.class), pubSubContext, false); + PartitionConsumptionState pcs = new PartitionConsumptionState( + TOPIC_PARTITION, + mock(OffsetRecord.class), + pubSubContext, + false, + false, + false, + null); pcs.setStoreLevelPaused(true); assertTrue(pcs.isStoreLevelPaused()); pcs.setStoreLevelPaused(false); assertFalse(pcs.isStoreLevelPaused()); } + + @Test + public void testGetOrCreateCachedHeartbeatKeyCarriesResolvedSloLabels() { + PubSubContext pubSubContext = DEFAULT_PUBSUB_CONTEXT_FOR_UNIT_TESTING; + PartitionConsumptionState pcs = new PartitionConsumptionState( + TOPIC_PARTITION, + mock(OffsetRecord.class), + pubSubContext, + false, + true, + true, + "us-west"); + + HeartbeatKey local = pcs.getOrCreateCachedHeartbeatKey("us-west"); + assertEquals(local.getWriteType(), VeniceStoreWriteType.WRITE_COMPUTE); + assertEquals(local.getChunkingStatus(), VeniceChunkingStatus.CHUNKED); + assertEquals(local.getLocality(), VeniceRegionLocality.LOCAL); + + HeartbeatKey remote = pcs.getOrCreateCachedHeartbeatKey("us-east"); + assertEquals(remote.getWriteType(), VeniceStoreWriteType.WRITE_COMPUTE); + assertEquals(remote.getChunkingStatus(), VeniceChunkingStatus.CHUNKED); + assertEquals(remote.getLocality(), VeniceRegionLocality.REMOTE); + + // Subsequent call for the same region returns the cached instance. + assertTrue( + local == pcs.getOrCreateCachedHeartbeatKey("us-west"), + "Same-region calls must return the cached HeartbeatKey instance"); + } + + @Test + public void testGetOrCreateCachedHeartbeatKeyLeavesLocalityNullWhenLocalRegionUnset() { + PubSubContext pubSubContext = DEFAULT_PUBSUB_CONTEXT_FOR_UNIT_TESTING; + // Empty local region — observed in unconfigured deployments — must NOT default to REMOTE. + PartitionConsumptionState empty = new PartitionConsumptionState( + TOPIC_PARTITION, + mock(OffsetRecord.class), + pubSubContext, + false, + false, + false, + ""); + HeartbeatKey k1 = empty.getOrCreateCachedHeartbeatKey("us-west"); + assertNull(k1.getLocality(), "Empty localRegionName must leave locality null, not REMOTE"); + + PartitionConsumptionState nullRegion = new PartitionConsumptionState( + TOPIC_PARTITION, + mock(OffsetRecord.class), + pubSubContext, + false, + false, + false, + null); + HeartbeatKey k2 = nullRegion.getOrCreateCachedHeartbeatKey("us-west"); + assertNull(k2.getLocality(), "Null localRegionName must leave locality null"); + } + + @Test + public void testGetOrCreateCachedHeartbeatKeyResolvesRegularUnchunked() { + PubSubContext pubSubContext = DEFAULT_PUBSUB_CONTEXT_FOR_UNIT_TESTING; + PartitionConsumptionState pcs = new PartitionConsumptionState( + TOPIC_PARTITION, + mock(OffsetRecord.class), + pubSubContext, + false, + false, + false, + "us-west"); + HeartbeatKey k = pcs.getOrCreateCachedHeartbeatKey("us-west"); + assertEquals(k.getWriteType(), VeniceStoreWriteType.REGULAR); + assertEquals(k.getChunkingStatus(), VeniceChunkingStatus.UNCHUNKED); + assertEquals(k.getLocality(), VeniceRegionLocality.LOCAL); + } } diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/SITFastReadyToServeTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/SITFastReadyToServeTest.java index f8277be7f29..0c0f8df946e 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/SITFastReadyToServeTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/SITFastReadyToServeTest.java @@ -37,7 +37,8 @@ public void testReadyToServeSerialization() { OffsetRecord offsetRecord = new OffsetRecord(AvroProtocolDefinition.PARTITION_STATE.getSerializer(), pubSubContext); // Create PartitionConsumptionState - PartitionConsumptionState pcs = new PartitionConsumptionState(topicPartition, offsetRecord, pubSubContext, false); + PartitionConsumptionState pcs = + new PartitionConsumptionState(topicPartition, offsetRecord, pubSubContext, false, false, false, null); // Verify initial state Assert.assertFalse(pcs.getReadyToServeInOffsetRecord(), "Should not be ready to serve initially"); @@ -53,7 +54,7 @@ public void testReadyToServeSerialization() { // Create a new PCS with the deserialized offset record PartitionConsumptionState newPcs = - new PartitionConsumptionState(topicPartition, deserialized, pubSubContext, false); + new PartitionConsumptionState(topicPartition, deserialized, pubSubContext, false, false, false, null); // Verify after deserialization Assert.assertTrue(newPcs.getReadyToServeInOffsetRecord(), "Should still be ready to serve after deserialization"); diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/StorageUtilizationManagerTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/StorageUtilizationManagerTest.java index 01e088106b6..3876ced974e 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/StorageUtilizationManagerTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/StorageUtilizationManagerTest.java @@ -69,7 +69,10 @@ public void buildNewQuotaEnforcer() { new PubSubTopicPartitionImpl(VERSION_TOPIC, i), mock(OffsetRecord.class), pubSubContext, - true); + true, + false, + false, + null); partitionConsumptionStateMap.put(i, pcs); } @@ -80,7 +83,10 @@ public void buildNewQuotaEnforcer() { new PubSubTopicPartitionImpl(VERSION_TOPIC, i), mockOffsetRecord, pubSubContext, - true); + true, + false, + false, + null); pcs.setLeaderFollowerState(LEADER); hybridPartitionConsumptionStateMap.put(i, pcs); } diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/StoreIngestionTaskTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/StoreIngestionTaskTest.java index 45f0a285cd3..8d30be672d5 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/StoreIngestionTaskTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/kafka/consumer/StoreIngestionTaskTest.java @@ -4089,7 +4089,10 @@ public void testGetAndUpdateLeaderCompletedState(HybridConfig hybridConfig, Node new PubSubTopicPartitionImpl(pubSubTopic, PARTITION_FOO), mockOffsetRecord, pubSubContext, - true); + true, + false, + false, + null); long producerTimestamp = System.currentTimeMillis(); LeaderMetadataWrapper mockLeaderMetadataWrapper = mock(LeaderMetadataWrapper.class); @@ -4385,7 +4388,10 @@ public void testLeaderShouldSubscribeToCorrectVTOffset() { new PubSubTopicPartitionImpl(versionTopic, 0), offsetRecord, pubSubContext, - false); + false, + false, + false, + null); PubSubPosition localVersionTopicOffset = InMemoryPubSubPosition.of(100L); PubSubPosition remoteVersionTopicOffset = InMemoryPubSubPosition.of(200L); partitionConsumptionState.setLatestProcessedVtPosition(localVersionTopicOffset); @@ -5219,8 +5225,14 @@ public void testBatchOnlyStoreDataRecovery() { null); OffsetRecord offsetRecord = mock(OffsetRecord.class); doReturn(pubSubTopic).when(offsetRecord).getLeaderTopic(any()); - PartitionConsumptionState partitionConsumptionState = - new PartitionConsumptionState(new PubSubTopicPartitionImpl(pubSubTopic, 0), offsetRecord, pubSubContext, false); + PartitionConsumptionState partitionConsumptionState = new PartitionConsumptionState( + new PubSubTopicPartitionImpl(pubSubTopic, 0), + offsetRecord, + pubSubContext, + false, + false, + false, + null); storeIngestionTaskUnderTest.updateLeaderTopicOnFollower(partitionConsumptionState); storeIngestionTaskUnderTest.startConsumingAsLeader(partitionConsumptionState); String dataRecoverySourceTopic = Version.composeKafkaTopic(storeNameWithoutVersionInfo, 1); diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatKeyTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatKeyTest.java new file mode 100644 index 00000000000..2bdeccd812c --- /dev/null +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatKeyTest.java @@ -0,0 +1,164 @@ +package com.linkedin.davinci.stats.ingestion.heartbeat; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotEquals; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNotSame; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertSame; + +import com.linkedin.venice.stats.dimensions.VeniceChunkingStatus; +import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; +import com.linkedin.venice.stats.dimensions.VeniceStoreWriteType; +import java.util.HashMap; +import java.util.Map; +import org.testng.annotations.Test; + + +/** + * Locks down the contract of {@link HeartbeatKey}: identity is {@code (storeName, version, partition, region)}. + * The SLO labels (write type, chunking, locality) are passenger fields — present for the per-record OTel + * emit path, but explicitly NOT part of equality or hash. Maps keyed by a HeartbeatKey rely on this so + * later inserts with different labels never replace the originally-stored key (and the periodic record-lag + * path can iterate the map and pull labels off the stored keys without divergence). + */ +public class HeartbeatKeyTest { + private static final String STORE = "test_store"; + private static final int VERSION = 7; + private static final int PARTITION = 3; + private static final String REGION = "us-west"; + + @Test + public void test4ArgConstructorLeavesPassengerFieldsNull() { + HeartbeatKey key = new HeartbeatKey(STORE, VERSION, PARTITION, REGION); + assertNull(key.writeType, "writeType should be null on the lookup-only constructor"); + assertNull(key.chunkingStatus, "chunkingStatus should be null on the lookup-only constructor"); + assertNull(key.locality, "locality should be null on the lookup-only constructor"); + } + + @Test + public void test7ArgConstructorPropagatesPassengerFields() { + HeartbeatKey key = new HeartbeatKey( + STORE, + VERSION, + PARTITION, + REGION, + VeniceStoreWriteType.WRITE_COMPUTE, + VeniceChunkingStatus.CHUNKED, + VeniceRegionLocality.LOCAL); + assertEquals(key.writeType, VeniceStoreWriteType.WRITE_COMPUTE); + assertEquals(key.chunkingStatus, VeniceChunkingStatus.CHUNKED); + assertEquals(key.locality, VeniceRegionLocality.LOCAL); + } + + @Test + public void test4ArgAnd7ArgKeysWithSameIdentityAreEqualAndHashEqual() { + HeartbeatKey lookup = new HeartbeatKey(STORE, VERSION, PARTITION, REGION); + HeartbeatKey labeled = new HeartbeatKey( + STORE, + VERSION, + PARTITION, + REGION, + VeniceStoreWriteType.WRITE_COMPUTE, + VeniceChunkingStatus.CHUNKED, + VeniceRegionLocality.LOCAL); + assertEquals(lookup, labeled, "Identity-equal keys must be equal even when labels differ"); + assertEquals(labeled, lookup, "Equality must be symmetric"); + assertEquals(lookup.hashCode(), labeled.hashCode(), "Identity-equal keys must hash equal"); + } + + @Test + public void testTwoLabeledKeysWithSameIdentityButDifferentLabelsAreEqual() { + HeartbeatKey a = new HeartbeatKey( + STORE, + VERSION, + PARTITION, + REGION, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL); + HeartbeatKey b = new HeartbeatKey( + STORE, + VERSION, + PARTITION, + REGION, + VeniceStoreWriteType.WRITE_COMPUTE, + VeniceChunkingStatus.CHUNKED, + VeniceRegionLocality.REMOTE); + assertEquals(a, b, "Passenger fields must not affect equality"); + assertEquals(a.hashCode(), b.hashCode(), "Passenger fields must not affect hashCode"); + } + + @Test + public void testKeysDifferingInIdentityAreNotEqual() { + HeartbeatKey base = new HeartbeatKey(STORE, VERSION, PARTITION, REGION); + assertNotEquals(base, new HeartbeatKey("other_store", VERSION, PARTITION, REGION)); + assertNotEquals(base, new HeartbeatKey(STORE, VERSION + 1, PARTITION, REGION)); + assertNotEquals(base, new HeartbeatKey(STORE, VERSION, PARTITION + 1, REGION)); + assertNotEquals(base, new HeartbeatKey(STORE, VERSION, PARTITION, "us-east")); + } + + @Test + public void testPutIfAbsentSemanticsKeepFirstStoredKey() { + Map map = new HashMap<>(); + HeartbeatKey first = new HeartbeatKey( + STORE, + VERSION, + PARTITION, + REGION, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL); + HeartbeatKey second = new HeartbeatKey( + STORE, + VERSION, + PARTITION, + REGION, + VeniceStoreWriteType.WRITE_COMPUTE, + VeniceChunkingStatus.CHUNKED, + VeniceRegionLocality.REMOTE); + map.putIfAbsent(first, "v1"); + map.putIfAbsent(second, "v2"); // identity-equal → put is a no-op, the original key stays + + assertEquals(map.size(), 1, "Identity-equal putIfAbsent must not add a second entry"); + HeartbeatKey stored = map.keySet().iterator().next(); + assertSame(stored, first, "Map must keep the first-inserted key reference"); + assertNotSame(stored, second, "Sanity: 'second' is a different object even though equal"); + // The whole point of the contract: stored key still carries the FIRST set of labels + assertEquals(stored.writeType, VeniceStoreWriteType.REGULAR); + assertEquals(stored.chunkingStatus, VeniceChunkingStatus.UNCHUNKED); + assertEquals(stored.locality, VeniceRegionLocality.LOCAL); + } + + @Test + public void testHashCodeStableAcrossLabelChanges() { + int reference = new HeartbeatKey(STORE, VERSION, PARTITION, REGION).hashCode(); + int withLabels = new HeartbeatKey( + STORE, + VERSION, + PARTITION, + REGION, + VeniceStoreWriteType.WRITE_COMPUTE, + VeniceChunkingStatus.CHUNKED, + VeniceRegionLocality.REMOTE).hashCode(); + assertEquals(withLabels, reference); + } + + @Test + public void testToStringIsStableAcrossLabelChanges() { + HeartbeatKey lookup = new HeartbeatKey(STORE, VERSION, PARTITION, REGION); + HeartbeatKey labeled = new HeartbeatKey( + STORE, + VERSION, + PARTITION, + REGION, + VeniceStoreWriteType.WRITE_COMPUTE, + VeniceChunkingStatus.CHUNKED, + VeniceRegionLocality.REMOTE); + assertNotNull(lookup.toString()); + assertEquals( + lookup.toString(), + labeled.toString(), + "toString is identity-derived and must not include passenger labels"); + } +} diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringServiceTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringServiceTest.java index 52c183b09c7..24bf4750ed0 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringServiceTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringServiceTest.java @@ -26,6 +26,7 @@ import com.linkedin.davinci.kafka.consumer.LeaderFollowerStateType; import com.linkedin.davinci.kafka.consumer.PartitionConsumptionState; import com.linkedin.davinci.kafka.consumer.StoreIngestionTask; +import com.linkedin.davinci.stats.HeartbeatMonitoringServiceStats; import com.linkedin.venice.exceptions.VeniceNoHelixResourceException; import com.linkedin.venice.helix.HelixCustomizedViewOfflinePushRepository; import com.linkedin.venice.meta.BufferReplayPolicy; @@ -39,6 +40,9 @@ import com.linkedin.venice.meta.StoreVersionInfo; import com.linkedin.venice.meta.Version; import com.linkedin.venice.meta.VersionImpl; +import com.linkedin.venice.stats.dimensions.VeniceChunkingStatus; +import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; +import com.linkedin.venice.stats.dimensions.VeniceStoreWriteType; import com.linkedin.venice.utils.DataProviderUtils; import com.linkedin.venice.utils.Utils; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; @@ -333,7 +337,7 @@ public void testAddLeaderLagMonitor(boolean enableSepRT) { mockMetricsRepository, mockReadOnlyRepository, serverConfig, - null, + mock(HeartbeatMonitoringServiceStats.class), CompletableFuture.completedFuture(mockCustomizedViewOfflinePushRepository)); // Let's emit some heartbeats that don't exist in the registry yet @@ -347,26 +351,62 @@ public void testAddLeaderLagMonitor(boolean enableSepRT) { // Let's do some state transitions! // Follower state transitions - heartbeatMonitoringService - .addFollowerLagMonitor(currentVersion, 0, Utils.getReplicaId(currentVersion.kafkaTopicName(), 0)); - heartbeatMonitoringService - .addFollowerLagMonitor(currentVersion, 1, Utils.getReplicaId(currentVersion.kafkaTopicName(), 1)); - heartbeatMonitoringService - .addFollowerLagMonitor(currentVersion, 2, Utils.getReplicaId(currentVersion.kafkaTopicName(), 2)); + heartbeatMonitoringService.addFollowerLagMonitor( + currentVersion, + 0, + Utils.getReplicaId(currentVersion.kafkaTopicName(), 0), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); + heartbeatMonitoringService.addFollowerLagMonitor( + currentVersion, + 1, + Utils.getReplicaId(currentVersion.kafkaTopicName(), 1), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); + heartbeatMonitoringService.addFollowerLagMonitor( + currentVersion, + 2, + Utils.getReplicaId(currentVersion.kafkaTopicName(), 2), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); - heartbeatMonitoringService - .addFollowerLagMonitor(backupVersion, 0, Utils.getReplicaId(backupVersion.kafkaTopicName(), 0)); - heartbeatMonitoringService - .addFollowerLagMonitor(backupVersion, 1, Utils.getReplicaId(backupVersion.kafkaTopicName(), 1)); - heartbeatMonitoringService - .addFollowerLagMonitor(backupVersion, 2, Utils.getReplicaId(backupVersion.kafkaTopicName(), 2)); + heartbeatMonitoringService.addFollowerLagMonitor( + backupVersion, + 0, + Utils.getReplicaId(backupVersion.kafkaTopicName(), 0), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); + heartbeatMonitoringService.addFollowerLagMonitor( + backupVersion, + 1, + Utils.getReplicaId(backupVersion.kafkaTopicName(), 1), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); + heartbeatMonitoringService.addFollowerLagMonitor( + backupVersion, + 2, + Utils.getReplicaId(backupVersion.kafkaTopicName(), 2), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); - heartbeatMonitoringService - .addFollowerLagMonitor(futureVersion, 0, Utils.getReplicaId(futureVersion.kafkaTopicName(), 0)); - heartbeatMonitoringService - .addFollowerLagMonitor(futureVersion, 1, Utils.getReplicaId(futureVersion.kafkaTopicName(), 1)); - heartbeatMonitoringService - .addFollowerLagMonitor(futureVersion, 2, Utils.getReplicaId(futureVersion.kafkaTopicName(), 2)); + heartbeatMonitoringService.addFollowerLagMonitor( + futureVersion, + 0, + Utils.getReplicaId(futureVersion.kafkaTopicName(), 0), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); + heartbeatMonitoringService.addFollowerLagMonitor( + futureVersion, + 1, + Utils.getReplicaId(futureVersion.kafkaTopicName(), 1), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); + heartbeatMonitoringService.addFollowerLagMonitor( + futureVersion, + 2, + Utils.getReplicaId(futureVersion.kafkaTopicName(), 2), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); // The above calls initialize entries with current time, and followers will retain the highest timestamp. // we'll note the current time that comes AFTER the initialization and use that from which to increment the time. @@ -438,18 +478,42 @@ public void testAddLeaderLagMonitor(boolean enableSepRT) { Assert.assertTrue(entry.heartbeatTimestamp >= baseTimeStamp + 1001L); // Leader state transitions - heartbeatMonitoringService - .addLeaderLagMonitor(currentVersion, 1, Utils.getReplicaId(currentVersion.kafkaTopicName(), 1)); - heartbeatMonitoringService - .addLeaderLagMonitor(currentVersion, 2, Utils.getReplicaId(currentVersion.kafkaTopicName(), 2)); - heartbeatMonitoringService - .addLeaderLagMonitor(backupVersion, 1, Utils.getReplicaId(backupVersion.kafkaTopicName(), 1)); - heartbeatMonitoringService - .addLeaderLagMonitor(backupVersion, 2, Utils.getReplicaId(backupVersion.kafkaTopicName(), 2)); - heartbeatMonitoringService - .addLeaderLagMonitor(futureVersion, 1, Utils.getReplicaId(futureVersion.kafkaTopicName(), 1)); - heartbeatMonitoringService - .addLeaderLagMonitor(futureVersion, 2, Utils.getReplicaId(futureVersion.kafkaTopicName(), 2)); + heartbeatMonitoringService.addLeaderLagMonitor( + currentVersion, + 1, + Utils.getReplicaId(currentVersion.kafkaTopicName(), 1), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); + heartbeatMonitoringService.addLeaderLagMonitor( + currentVersion, + 2, + Utils.getReplicaId(currentVersion.kafkaTopicName(), 2), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); + heartbeatMonitoringService.addLeaderLagMonitor( + backupVersion, + 1, + Utils.getReplicaId(backupVersion.kafkaTopicName(), 1), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); + heartbeatMonitoringService.addLeaderLagMonitor( + backupVersion, + 2, + Utils.getReplicaId(backupVersion.kafkaTopicName(), 2), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); + heartbeatMonitoringService.addLeaderLagMonitor( + futureVersion, + 1, + Utils.getReplicaId(futureVersion.kafkaTopicName(), 1), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); + heartbeatMonitoringService.addLeaderLagMonitor( + futureVersion, + 2, + Utils.getReplicaId(futureVersion.kafkaTopicName(), 2), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); // alright, no longer null Assert.assertTrue(hasStore(heartbeatMonitoringService.getLeaderHeartbeatTimeStamps(), TEST_STORE)); @@ -485,12 +549,24 @@ public void testAddLeaderLagMonitor(boolean enableSepRT) { 2 + (enableSepRT ? 1 : 0)); // Go back to follower - heartbeatMonitoringService - .addFollowerLagMonitor(currentVersion, 1, Utils.getReplicaId(currentVersion.kafkaTopicName(), 1)); - heartbeatMonitoringService - .addFollowerLagMonitor(backupVersion, 1, Utils.getReplicaId(backupVersion.kafkaTopicName(), 1)); - heartbeatMonitoringService - .addFollowerLagMonitor(futureVersion, 1, Utils.getReplicaId(futureVersion.kafkaTopicName(), 1)); + heartbeatMonitoringService.addFollowerLagMonitor( + currentVersion, + 1, + Utils.getReplicaId(currentVersion.kafkaTopicName(), 1), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); + heartbeatMonitoringService.addFollowerLagMonitor( + backupVersion, + 1, + Utils.getReplicaId(backupVersion.kafkaTopicName(), 1), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); + heartbeatMonitoringService.addFollowerLagMonitor( + futureVersion, + 1, + Utils.getReplicaId(futureVersion.kafkaTopicName(), 1), + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); // make sure non hybrid is still not in there Assert.assertEquals( @@ -645,12 +721,22 @@ public void testUpdateLagMonitor() { heartbeatMonitoringService .updateLagMonitor(resourceName, partition, HeartbeatLagMonitorAction.SET_LEADER_MONITOR, replicaId); verify(metadataRepo).waitVersion(eq(storeName), eq(storeVersion), any(Duration.class), anyLong()); - verify(heartbeatMonitoringService, never()).addLeaderLagMonitor(any(Version.class), anyInt(), anyString()); + verify(heartbeatMonitoringService, never()).addLeaderLagMonitor( + any(Version.class), + anyInt(), + anyString(), + any(VeniceStoreWriteType.class), + any(VeniceChunkingStatus.class)); heartbeatMonitoringService .updateLagMonitor(resourceName, partition, HeartbeatLagMonitorAction.SET_FOLLOWER_MONITOR, replicaId); verify(metadataRepo, times(2)).waitVersion(eq(storeName), eq(storeVersion), any(Duration.class), anyLong()); - verify(heartbeatMonitoringService, never()).addFollowerLagMonitor(any(Version.class), anyInt(), anyString()); + verify(heartbeatMonitoringService, never()).addFollowerLagMonitor( + any(Version.class), + anyInt(), + anyString(), + any(VeniceStoreWriteType.class), + any(VeniceChunkingStatus.class)); heartbeatMonitoringService .updateLagMonitor(resourceName, partition, HeartbeatLagMonitorAction.REMOVE_MONITOR, replicaId); @@ -663,12 +749,22 @@ public void testUpdateLagMonitor() { heartbeatMonitoringService .updateLagMonitor(resourceName, partition, HeartbeatLagMonitorAction.SET_LEADER_MONITOR, replicaId); verify(metadataRepo, times(4)).waitVersion(eq(storeName), eq(storeVersion), any(Duration.class), anyLong()); - verify(heartbeatMonitoringService, never()).addLeaderLagMonitor(any(Version.class), anyInt(), anyString()); + verify(heartbeatMonitoringService, never()).addLeaderLagMonitor( + any(Version.class), + anyInt(), + anyString(), + any(VeniceStoreWriteType.class), + any(VeniceChunkingStatus.class)); heartbeatMonitoringService .updateLagMonitor(resourceName, partition, HeartbeatLagMonitorAction.SET_FOLLOWER_MONITOR, replicaId); verify(metadataRepo, times(5)).waitVersion(eq(storeName), eq(storeVersion), any(Duration.class), anyLong()); - verify(heartbeatMonitoringService, never()).addFollowerLagMonitor(any(Version.class), anyInt(), anyString()); + verify(heartbeatMonitoringService, never()).addFollowerLagMonitor( + any(Version.class), + anyInt(), + anyString(), + any(VeniceStoreWriteType.class), + any(VeniceChunkingStatus.class)); heartbeatMonitoringService .updateLagMonitor(resourceName, partition, HeartbeatLagMonitorAction.REMOVE_MONITOR, replicaId); @@ -681,12 +777,22 @@ public void testUpdateLagMonitor() { heartbeatMonitoringService .updateLagMonitor(resourceName, partition, HeartbeatLagMonitorAction.SET_LEADER_MONITOR, replicaId); verify(metadataRepo, times(7)).waitVersion(eq(storeName), eq(storeVersion), any(Duration.class), anyLong()); - verify(heartbeatMonitoringService).addLeaderLagMonitor(version, partition, replicaId); + verify(heartbeatMonitoringService).addLeaderLagMonitor( + version, + partition, + replicaId, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); heartbeatMonitoringService .updateLagMonitor(resourceName, partition, HeartbeatLagMonitorAction.SET_FOLLOWER_MONITOR, replicaId); verify(metadataRepo, times(8)).waitVersion(eq(storeName), eq(storeVersion), any(Duration.class), anyLong()); - verify(heartbeatMonitoringService).addFollowerLagMonitor(version, partition, replicaId); + verify(heartbeatMonitoringService).addFollowerLagMonitor( + version, + partition, + replicaId, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED); heartbeatMonitoringService .updateLagMonitor(resourceName, partition, HeartbeatLagMonitorAction.REMOVE_MONITOR, replicaId); @@ -765,7 +871,7 @@ public void testCleanupLagMonitor() { mockMetricsRepository, mockReadOnlyRepository, serverConfig, - null, + mock(HeartbeatMonitoringServiceStats.class), mockCVRepositoryFuture); // Initialize lag monitor for leader of p0 and follower of p1 and p2 for v1 heartbeatMonitoringService.updateLagMonitor( @@ -853,7 +959,7 @@ public void testCleanupLagMonitor() { mockMetricsRepository, mockReadOnlyRepository, serverConfig, - null, + mock(HeartbeatMonitoringServiceStats.class), mockCVRepositoryFuture); // Initialize the new lag monitor for leader of p0 and follower of p1 and p2 for v1 newHeartbeatMonitoringService.updateLagMonitor( @@ -997,7 +1103,7 @@ public void testRecordLevelTimestampTracking(boolean recordLevelTimestampEnabled mockMetricsRepository, mockReadOnlyRepository, serverConfig, - null, + mock(HeartbeatMonitoringServiceStats.class), mockCVRepositoryFuture); // Add leader lag monitor - this initializes the map structures @@ -1099,7 +1205,7 @@ public void testFollowerRecordLevelTimestampTracking(boolean recordLevelTimestam mockMetricsRepository, mockReadOnlyRepository, serverConfig, - null, + mock(HeartbeatMonitoringServiceStats.class), mockCVRepositoryFuture); // Add follower lag monitor @@ -1181,7 +1287,7 @@ public void testPerRecordOtelMetricsEmission(boolean perRecordOtelMetricsEnabled mockMetricsRepository, mockReadOnlyRepository, serverConfig, - null, + mock(HeartbeatMonitoringServiceStats.class), mockCVRepositoryFuture); // Add leader lag monitor @@ -1249,7 +1355,7 @@ public void testPerRecordOtelMetricsNotEmittedWhenRecordLevelTimestampDisabled() mockMetricsRepository, mockReadOnlyRepository, serverConfig, - null, + mock(HeartbeatMonitoringServiceStats.class), mockCVRepositoryFuture); // Add leader lag monitor @@ -1324,7 +1430,7 @@ public void testRecordTimestampConsidersHeartbeatTimestamp() { mockMetricsRepository, mockReadOnlyRepository, serverConfig, - null, + mock(HeartbeatMonitoringServiceStats.class), mockCVRepositoryFuture); // Add leader lag monitor @@ -1415,7 +1521,7 @@ public void testCachedHeartbeatKeyRecordingMethods() { mockMetricsRepository, mockReadOnlyRepository, serverConfig, - null, + mock(HeartbeatMonitoringServiceStats.class), mockCVRepositoryFuture); // Add leader lag monitor @@ -1522,7 +1628,7 @@ public void testConsumedFromUpstreamOnlyFlippedByHeartbeat() { mockMetricsRepository, mockReadOnlyRepository, serverConfig, - null, + mock(HeartbeatMonitoringServiceStats.class), mockCVRepositoryFuture); // ---- Follower path ---- @@ -1638,7 +1744,7 @@ public void testHeartbeatLagIsMaxValueBeforeFirstHeartbeat() { mockMetricsRepository, mockReadOnlyRepository, serverConfig, - null, + mock(HeartbeatMonitoringServiceStats.class), mockCVRepositoryFuture); String versionTopic = Version.composeKafkaTopic(TEST_STORE, 1); @@ -1675,6 +1781,68 @@ public void testHeartbeatLagIsMaxValueBeforeFirstHeartbeat() { Assert.assertEquals(lag, 5_000L, "Lag must equal now - heartbeatTimestamp after a heartbeat arrives"); } + /** + * Covers the WC + CHUNKED resolution branches inside {@code updateLagMonitor → addLeaderLagMonitor → + * initializeEntry}. The default mock store/version returns false for both flags so the (REGULAR, + * UNCHUNKED) path is well-exercised by other tests; this test forces the alternate values and + * asserts they propagate end-to-end onto the stored HeartbeatKey. + */ + @Test + public void testUpdateLagMonitorBakesWriteComputeAndChunkedLabelsOnStoredKey() { + HybridStoreConfig hybridStoreConfig = new HybridStoreConfigImpl(1L, 1L, 1L, BufferReplayPolicy.REWIND_FROM_SOP); + Version version = new VersionImpl(TEST_STORE, 1, "1"); + version.setHybridStoreConfig(hybridStoreConfig); + version.setChunkingEnabled(true); + + Store mockStore = mock(Store.class); + when(mockStore.getName()).thenReturn(TEST_STORE); + when(mockStore.isWriteComputationEnabled()).thenReturn(true); + when(mockStore.getHybridStoreConfig()).thenReturn(hybridStoreConfig); + when(mockStore.getVersion(1)).thenReturn(version); + + MetricsRepository mockMetricsRepository = new MetricsRepository(); + ReadOnlyStoreRepository mockReadOnlyRepository = mock(ReadOnlyStoreRepository.class); + when(mockReadOnlyRepository.getStoreOrThrow(TEST_STORE)).thenReturn(mockStore); + when(mockReadOnlyRepository.waitVersion(eq(TEST_STORE), eq(1), any(), anyLong())) + .thenReturn(new StoreVersionInfo(mockStore, version)); + + Set regions = new HashSet<>(); + regions.add(LOCAL_FABRIC); + + VeniceServerConfig serverConfig = mock(VeniceServerConfig.class); + when(serverConfig.getRegionNames()).thenReturn(regions); + when(serverConfig.getRegionName()).thenReturn(LOCAL_FABRIC); + when(serverConfig.getServerMaxWaitForVersionInfo()).thenReturn(Duration.ofSeconds(5)); + when(serverConfig.getListenerHostname()).thenReturn("localhost"); + when(serverConfig.getListenerPort()).thenReturn(123); + when(serverConfig.getLagMonitorCleanupCycle()).thenReturn(5); + + HeartbeatMonitoringService heartbeatMonitoringService = new HeartbeatMonitoringService( + mockMetricsRepository, + mockReadOnlyRepository, + serverConfig, + mock(HeartbeatMonitoringServiceStats.class), + new CompletableFuture<>()); + + String versionTopic = Version.composeKafkaTopic(TEST_STORE, 1); + heartbeatMonitoringService.updateLagMonitor( + versionTopic, + 0, + HeartbeatLagMonitorAction.SET_LEADER_MONITOR, + Utils.getReplicaId(versionTopic, 0)); + + // Pull the stored key out of the leader map and assert the labels match what the Store/Version mocks said. + HeartbeatKey stored = heartbeatMonitoringService.getLeaderHeartbeatTimeStamps() + .keySet() + .stream() + .filter(k -> k.storeName.equals(TEST_STORE) && k.version == 1 && k.partition == 0) + .findFirst() + .orElseThrow(() -> new AssertionError("No leader heartbeat entry was stored")); + Assert.assertEquals(stored.writeType, VeniceStoreWriteType.WRITE_COMPUTE); + Assert.assertEquals(stored.chunkingStatus, VeniceChunkingStatus.CHUNKED); + Assert.assertEquals(stored.locality, VeniceRegionLocality.LOCAL); + } + // Helper methods for flat map assertions private static boolean hasStore(Map map, String storeName) { diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelMetricEntityTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelMetricEntityTest.java index 3db700933db..8cf02e42f41 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelMetricEntityTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelMetricEntityTest.java @@ -1,10 +1,13 @@ package com.linkedin.davinci.stats.ingestion.heartbeat; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CHUNKING_STATUS; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CLUSTER_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_LOCALITY; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_NAME; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_STATE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_WRITE_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_VERSION_ROLE; import static com.linkedin.venice.utils.Utils.setOf; @@ -36,9 +39,12 @@ private static Map expectedD VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REGION_NAME, + VENICE_REGION_LOCALITY, VENICE_VERSION_ROLE, VENICE_REPLICA_TYPE, - VENICE_REPLICA_STATE))); + VENICE_REPLICA_STATE, + VENICE_STORE_WRITE_TYPE, + VENICE_CHUNKING_STATUS))); return map; } } diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelStatsTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelStatsTest.java index b724d7e9d16..576400b989a 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelStatsTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelStatsTest.java @@ -3,11 +3,14 @@ import static com.linkedin.davinci.stats.ServerMetricEntity.SERVER_METRIC_ENTITIES; import static com.linkedin.davinci.stats.ingestion.heartbeat.HeartbeatOtelMetricEntity.INGESTION_HEARTBEAT_DELAY; import static com.linkedin.venice.meta.Store.NON_EXISTING_VERSION; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CHUNKING_STATUS; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CLUSTER_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_LOCALITY; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_NAME; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_STATE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_WRITE_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_VERSION_ROLE; import static com.linkedin.venice.utils.OpenTelemetryDataTestUtils.validateExponentialHistogramPointData; import static org.testng.Assert.assertEquals; @@ -21,6 +24,9 @@ import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.stats.dimensions.ReplicaState; import com.linkedin.venice.stats.dimensions.ReplicaType; +import com.linkedin.venice.stats.dimensions.VeniceChunkingStatus; +import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; +import com.linkedin.venice.stats.dimensions.VeniceStoreWriteType; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.sdk.testing.exporter.InMemoryMetricReader; import io.tehuti.metrics.MetricsRepository; @@ -105,6 +111,9 @@ public void testUpdateVersionInfo() { REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 100L); // Verify metric was recorded with CURRENT version type @@ -127,6 +136,9 @@ public void testRecordHeartbeatDelayOtelMetricsForCurrentVersion() { REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, delay); validateHeartbeatMetric( @@ -148,6 +160,9 @@ public void testRecordHeartbeatDelayOtelMetricsForFutureVersion() { REGION_US_WEST, ReplicaType.FOLLOWER, ReplicaState.CATCHING_UP, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, delay); validateHeartbeatMetric( @@ -169,6 +184,9 @@ public void testRecordHeartbeatDelayOtelMetricsForBackupVersion() { REGION_US_WEST, ReplicaType.FOLLOWER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, delay); validateHeartbeatMetric( @@ -190,6 +208,9 @@ public void testRecordHeartbeatDelayOtelMetricsForNonExistingVersion() { REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 100L); // Verify metric was recorded with BACKUP version type @@ -212,6 +233,9 @@ public void testRecordHeartbeatDelayOtelMetricsWithMultipleRegions() { REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 100L); // Record for US-EAST @@ -220,6 +244,9 @@ public void testRecordHeartbeatDelayOtelMetricsWithMultipleRegions() { REGION_US_EAST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 200L); // Verify both regions recorded metrics @@ -250,6 +277,9 @@ public void testRecordMultipleHeartbeatsForSameRegion() { REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 100L); heartbeatOtelStats.recordHeartbeatDelayOtelMetrics( @@ -257,6 +287,9 @@ public void testRecordMultipleHeartbeatsForSameRegion() { REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 150L); heartbeatOtelStats.recordHeartbeatDelayOtelMetrics( @@ -264,6 +297,9 @@ public void testRecordMultipleHeartbeatsForSameRegion() { REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 200L); // Verify aggregate metrics (min=100, max=200, sum=450, count=3) @@ -293,6 +329,9 @@ public void testAllVersionRoles(VersionRole expectedVersionRole, int version) { REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 100L); validateHeartbeatMetric( @@ -318,6 +357,9 @@ public void testAllReplicaTypes(ReplicaType replicaType) { REGION_US_WEST, replicaType, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 100L); validateHeartbeatMetric(REGION_US_WEST, VersionRole.CURRENT, replicaType, ReplicaState.READY_TO_SERVE, 100.0, 1); @@ -332,8 +374,15 @@ public Object[][] replicaStateProvider() { public void testAllReplicaStates(ReplicaState replicaState) { heartbeatOtelStats.updateVersionInfo(CURRENT_VERSION, FUTURE_VERSION); - heartbeatOtelStats - .recordHeartbeatDelayOtelMetrics(CURRENT_VERSION, REGION_US_WEST, ReplicaType.FOLLOWER, replicaState, 100L); + heartbeatOtelStats.recordHeartbeatDelayOtelMetrics( + CURRENT_VERSION, + REGION_US_WEST, + ReplicaType.FOLLOWER, + replicaState, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, + 100L); validateHeartbeatMetric(REGION_US_WEST, VersionRole.CURRENT, ReplicaType.FOLLOWER, replicaState, 100.0, 1); } @@ -349,6 +398,9 @@ public void testVersionInfoUpdateChangesClassification() { REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 100L); validateHeartbeatMetric( @@ -368,6 +420,9 @@ public void testVersionInfoUpdateChangesClassification() { REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 150L); // Verify version 1 is now classified as BACKUP @@ -398,6 +453,9 @@ public void testNoMetricsRecordedWhenOtelDisabled() { REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 100L); // Verify no metrics were recorded @@ -419,6 +477,9 @@ public void testVersionInfoWithNonExistingVersions() { REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 100L); validateHeartbeatMetric( @@ -440,6 +501,9 @@ public void testRecordZeroDelay() { REGION_US_WEST, ReplicaType.FOLLOWER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 0L); validateHeartbeatMetric( @@ -462,6 +526,9 @@ public void testRecordLargeDelay() { REGION_US_WEST, ReplicaType.FOLLOWER, ReplicaState.CATCHING_UP, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, largeDelay); validateHeartbeatMetric( @@ -498,6 +565,9 @@ public void testDifferentDimensionCombinations() { REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 100L); heartbeatOtelStats.recordHeartbeatDelayOtelMetrics( @@ -505,6 +575,9 @@ public void testDifferentDimensionCombinations() { REGION_US_WEST, ReplicaType.FOLLOWER, ReplicaState.CATCHING_UP, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 200L); heartbeatOtelStats.recordHeartbeatDelayOtelMetrics( @@ -512,6 +585,9 @@ public void testDifferentDimensionCombinations() { REGION_US_EAST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, 300L); // Verify each combination has its own metric @@ -579,9 +655,18 @@ private void validateHeartbeatMetric( .put(VENICE_STORE_NAME.getDimensionNameInDefaultFormat(), STORE_NAME) .put(VENICE_CLUSTER_NAME.getDimensionNameInDefaultFormat(), CLUSTER_NAME) .put(VENICE_REGION_NAME.getDimensionNameInDefaultFormat(), region) + // SLO dimensions: existing tests record with the default-value combo (REGULAR, UNCHUNKED, LOCAL). + // Tests that exercise other combos build their own expectedAttributes inline. + .put(VENICE_REGION_LOCALITY.getDimensionNameInDefaultFormat(), VeniceRegionLocality.LOCAL.getDimensionValue()) .put(VENICE_VERSION_ROLE.getDimensionNameInDefaultFormat(), versionRole.getDimensionValue()) .put(VENICE_REPLICA_TYPE.getDimensionNameInDefaultFormat(), replicaType.getDimensionValue()) .put(VENICE_REPLICA_STATE.getDimensionNameInDefaultFormat(), replicaState.getDimensionValue()) + .put( + VENICE_STORE_WRITE_TYPE.getDimensionNameInDefaultFormat(), + VeniceStoreWriteType.REGULAR.getDimensionValue()) + .put( + VENICE_CHUNKING_STATUS.getDimensionNameInDefaultFormat(), + VeniceChunkingStatus.UNCHUNKED.getDimensionValue()) .build(); validateExponentialHistogramPointData( diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatVersionedStatsTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatVersionedStatsTest.java index 2794b1d6c23..3501e9fb2ff 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatVersionedStatsTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatVersionedStatsTest.java @@ -3,11 +3,14 @@ import static com.linkedin.davinci.stats.ServerMetricEntity.SERVER_METRIC_ENTITIES; import static com.linkedin.davinci.stats.ingestion.heartbeat.HeartbeatOtelMetricEntity.INGESTION_HEARTBEAT_DELAY; import static com.linkedin.davinci.stats.ingestion.heartbeat.RecordLevelDelayOtelMetricEntity.INGESTION_RECORD_DELAY; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CHUNKING_STATUS; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CLUSTER_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_LOCALITY; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_NAME; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_STATE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_WRITE_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_VERSION_ROLE; import static com.linkedin.venice.utils.OpenTelemetryDataTestUtils.validateExponentialHistogramPointData; import static org.mockito.Mockito.mock; @@ -28,6 +31,9 @@ import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.stats.dimensions.ReplicaState; import com.linkedin.venice.stats.dimensions.ReplicaType; +import com.linkedin.venice.stats.dimensions.VeniceChunkingStatus; +import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; +import com.linkedin.venice.stats.dimensions.VeniceStoreWriteType; import com.linkedin.venice.utils.DataProviderUtils; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.opentelemetry.api.common.Attributes; @@ -57,6 +63,10 @@ public class HeartbeatVersionedStatsTest { private static final int FUTURE_VERSION = 3; private static final String TEST_PREFIX = "test_prefix"; private static final long FIXED_CURRENT_TIME = 1000000L; + // Default SLO labels for record-level delay tests + private static final VeniceStoreWriteType WRITE_TYPE = VeniceStoreWriteType.REGULAR; + private static final VeniceChunkingStatus CHUNKING_STATUS = VeniceChunkingStatus.UNCHUNKED; + private static final VeniceRegionLocality LOCALITY = VeniceRegionLocality.LOCAL; private InMemoryMetricReader inMemoryMetricReader; private VeniceMetricsRepository metricsRepository; @@ -92,7 +102,11 @@ public void setUp() { versions.add(futureVersion); when(mockStore.getVersions()).thenReturn(versions); + when(mockStore.isWriteComputationEnabled()).thenReturn(false); + when(mockStore.isChunkingEnabled()).thenReturn(false); + when(mockMetadataRepository.getStoreOrThrow(STORE_NAME)).thenReturn(mockStore); + when(mockMetadataRepository.getStore(STORE_NAME)).thenReturn(mockStore); when(mockMetadataRepository.hasStore(STORE_NAME)).thenReturn(true); when(mockMetadataRepository.getAllStores()).thenReturn(Collections.singletonList(mockStore)); @@ -132,9 +146,30 @@ public void testRecordLeaderLag() { heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // Record multiple leader lags (delays: 100ms, 200ms, 150ms) - heartbeatVersionedStats.recordLeaderLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 100); - heartbeatVersionedStats.recordLeaderLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 200); - heartbeatVersionedStats.recordLeaderLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 150); + heartbeatVersionedStats.recordLeaderLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 100, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.recordLeaderLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 200, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.recordLeaderLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 150, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Verify Tehuti accumulated correctly HeartbeatStat tehutiStats = heartbeatVersionedStats.getStatsForTesting(STORE_NAME, CURRENT_VERSION); @@ -149,12 +184,33 @@ public void testRecordFollowerLag(boolean isReadyToServe) { heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // Record multiple follower lags (delays: 100ms, 200ms, 150ms) - heartbeatVersionedStats - .recordFollowerLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 100, isReadyToServe); - heartbeatVersionedStats - .recordFollowerLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 200, isReadyToServe); - heartbeatVersionedStats - .recordFollowerLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 150, isReadyToServe); + heartbeatVersionedStats.recordFollowerLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 100, + isReadyToServe, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.recordFollowerLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 200, + isReadyToServe, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.recordFollowerLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 150, + isReadyToServe, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Verify Tehuti metrics HeartbeatStat tehutiStats = heartbeatVersionedStats.getStatsForTesting(STORE_NAME, CURRENT_VERSION); @@ -187,7 +243,14 @@ public void testHandleStoreDeleted() { heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // Record some metrics to create OTel stats for the store - heartbeatVersionedStats.recordLeaderLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 100); + heartbeatVersionedStats.recordLeaderLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 100, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); HeartbeatOtelStats otelStatsBefore = heartbeatVersionedStats.getOtelStatsForTesting(STORE_NAME); assertNotNull(otelStatsBefore, "OTel stats should exist before deletion"); @@ -199,7 +262,14 @@ public void testHandleStoreDeleted() { assertNull(heartbeatVersionedStats.getOtelStatsForTesting(STORE_NAME), "OTel stats should be null after deletion"); // After deletion, recording new metrics should still work (creates new stats) - heartbeatVersionedStats.recordLeaderLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 200); + heartbeatVersionedStats.recordLeaderLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 200, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Verify a fresh OTel stats instance was created (different object from before deletion) HeartbeatOtelStats otelStatsAfter = heartbeatVersionedStats.getOtelStatsForTesting(STORE_NAME); @@ -234,7 +304,14 @@ public void testVersionInfoInitializedCorrectly() { // Record a metric to trigger store initialization via getVersionedStats -> addStore heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); - heartbeatVersionedStats.recordLeaderLag(newStoreName, newCurrentVersion, REGION, FIXED_CURRENT_TIME - 100); + heartbeatVersionedStats.recordLeaderLag( + newStoreName, + newCurrentVersion, + REGION, + FIXED_CURRENT_TIME - 100, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Verify version info was initialized correctly HeartbeatStat stats = heartbeatVersionedStats.getStatsForTesting(newStoreName, newCurrentVersion); @@ -251,7 +328,14 @@ public void testOnVersionInfoUpdatedCalledOnStoreChange() { heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // First, record a metric to ensure the store is tracked - heartbeatVersionedStats.recordLeaderLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 100); + heartbeatVersionedStats.recordLeaderLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 100, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Get the OTel stats and verify initial version info HeartbeatOtelStats otelStats = heartbeatVersionedStats.getOtelStatsForTesting(STORE_NAME); @@ -316,7 +400,8 @@ public void testFutureVersionComputedFromStartedAndPushedStatus() { // Record metric to trigger initialization heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); - heartbeatVersionedStats.recordLeaderLag(storeName, 1, REGION, FIXED_CURRENT_TIME - 100); + heartbeatVersionedStats + .recordLeaderLag(storeName, 1, REGION, FIXED_CURRENT_TIME - 100, WRITE_TYPE, CHUNKING_STATUS, LOCALITY); // Verify future version is the highest STARTED/PUSHED version (3) HeartbeatOtelStats otelStats = heartbeatVersionedStats.getOtelStatsForTesting(storeName); @@ -348,7 +433,8 @@ public void testNoFutureVersionWhenAllOnline() { leaderMonitors.put(new HeartbeatKey(storeName, 2, 0, REGION), new IngestionTimestampEntry(0, false, false)); heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); - heartbeatVersionedStats.recordLeaderLag(storeName, 2, REGION, FIXED_CURRENT_TIME - 100); + heartbeatVersionedStats + .recordLeaderLag(storeName, 2, REGION, FIXED_CURRENT_TIME - 100, WRITE_TYPE, CHUNKING_STATUS, LOCALITY); HeartbeatOtelStats otelStats = heartbeatVersionedStats.getOtelStatsForTesting(storeName); assertNotNull(otelStats); @@ -365,9 +451,13 @@ private Attributes buildAttributes(ReplicaType replicaType, ReplicaState replica .put(VENICE_STORE_NAME.getDimensionNameInDefaultFormat(), STORE_NAME) .put(VENICE_CLUSTER_NAME.getDimensionNameInDefaultFormat(), CLUSTER_NAME) .put(VENICE_REGION_NAME.getDimensionNameInDefaultFormat(), REGION) + // SLO dimensions: tests record with the default-value combo (REGULAR, UNCHUNKED, LOCAL). + .put(VENICE_REGION_LOCALITY.getDimensionNameInDefaultFormat(), LOCALITY.getDimensionValue()) .put(VENICE_VERSION_ROLE.getDimensionNameInDefaultFormat(), VersionRole.CURRENT.getDimensionValue()) .put(VENICE_REPLICA_TYPE.getDimensionNameInDefaultFormat(), replicaType.getDimensionValue()) .put(VENICE_REPLICA_STATE.getDimensionNameInDefaultFormat(), replicaState.getDimensionValue()) + .put(VENICE_STORE_WRITE_TYPE.getDimensionNameInDefaultFormat(), WRITE_TYPE.getDimensionValue()) + .put(VENICE_CHUNKING_STATUS.getDimensionNameInDefaultFormat(), CHUNKING_STATUS.getDimensionValue()) .build(); } @@ -394,9 +484,30 @@ public void testRecordLeaderRecordLag() { heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // Record multiple leader record lags (delays: 100ms, 200ms, 150ms) - heartbeatVersionedStats.recordLeaderRecordLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 100); - heartbeatVersionedStats.recordLeaderRecordLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 200); - heartbeatVersionedStats.recordLeaderRecordLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 150); + heartbeatVersionedStats.recordLeaderRecordLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 100, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.recordLeaderRecordLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 200, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.recordLeaderRecordLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 150, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Verify OTel accumulated correctly (min=100, max=200, count=3, sum=450) validateRecordOtelHistogram(ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, 100.0, 200.0, 3, 450.0); @@ -407,12 +518,33 @@ public void testRecordFollowerRecordLag(boolean isReadyToServe) { heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // Record multiple follower record lags (delays: 100ms, 200ms, 150ms) - heartbeatVersionedStats - .recordFollowerRecordLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 100, isReadyToServe); - heartbeatVersionedStats - .recordFollowerRecordLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 200, isReadyToServe); - heartbeatVersionedStats - .recordFollowerRecordLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 150, isReadyToServe); + heartbeatVersionedStats.recordFollowerRecordLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 100, + isReadyToServe, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.recordFollowerRecordLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 200, + isReadyToServe, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.recordFollowerRecordLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 150, + isReadyToServe, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Verify OTel metrics: active has min=100, max=200, count=3, sum=450; squelched has all 0s validateRecordOtelHistogram( @@ -431,6 +563,24 @@ public void testRecordFollowerRecordLag(boolean isReadyToServe) { isReadyToServe ? 0.0 : 450.0); } + private Attributes buildRecordLevelAttributes(ReplicaType replicaType, ReplicaState replicaState) { + return Attributes.builder() + .put(VENICE_STORE_NAME.getDimensionNameInDefaultFormat(), STORE_NAME) + .put(VENICE_CLUSTER_NAME.getDimensionNameInDefaultFormat(), CLUSTER_NAME) + .put(VENICE_REGION_NAME.getDimensionNameInDefaultFormat(), REGION) + .put(VENICE_REGION_LOCALITY.getDimensionNameInDefaultFormat(), VeniceRegionLocality.LOCAL.getDimensionValue()) + .put(VENICE_VERSION_ROLE.getDimensionNameInDefaultFormat(), VersionRole.CURRENT.getDimensionValue()) + .put(VENICE_REPLICA_TYPE.getDimensionNameInDefaultFormat(), replicaType.getDimensionValue()) + .put(VENICE_REPLICA_STATE.getDimensionNameInDefaultFormat(), replicaState.getDimensionValue()) + .put( + VENICE_STORE_WRITE_TYPE.getDimensionNameInDefaultFormat(), + VeniceStoreWriteType.REGULAR.getDimensionValue()) + .put( + VENICE_CHUNKING_STATUS.getDimensionNameInDefaultFormat(), + VeniceChunkingStatus.UNCHUNKED.getDimensionValue()) + .build(); + } + private void validateRecordOtelHistogram( ReplicaType replicaType, ReplicaState replicaState, @@ -444,7 +594,7 @@ private void validateRecordOtelHistogram( expectedMax, expectedCount, expectedSum, - buildAttributes(replicaType, replicaState), + buildRecordLevelAttributes(replicaType, replicaState), INGESTION_RECORD_DELAY.getMetricEntity().getMetricName(), TEST_PREFIX); } @@ -455,12 +605,22 @@ public void testEmitPerRecordLeaderOtelMetric() { // Initialize the OTel stats for this store by calling recordLeaderRecordLag first // This simulates the normal flow where periodic emission initializes the stats - heartbeatVersionedStats.recordLeaderRecordLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 50); + heartbeatVersionedStats.recordLeaderRecordLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 50, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Now emit per-record leader OTel metrics immediately (delays: 100ms, 200ms, 150ms) - heartbeatVersionedStats.emitPerRecordLeaderOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 100); - heartbeatVersionedStats.emitPerRecordLeaderOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 200); - heartbeatVersionedStats.emitPerRecordLeaderOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 150); + heartbeatVersionedStats + .emitPerRecordLeaderOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 100, WRITE_TYPE, CHUNKING_STATUS, LOCALITY); + heartbeatVersionedStats + .emitPerRecordLeaderOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 200, WRITE_TYPE, CHUNKING_STATUS, LOCALITY); + heartbeatVersionedStats + .emitPerRecordLeaderOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 150, WRITE_TYPE, CHUNKING_STATUS, LOCALITY); // Verify OTel accumulated correctly: initial 50 + 100 + 200 + 150 = 500 sum, count=4 validateRecordOtelHistogram(ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, 50.0, 200.0, 4, 500.0); @@ -472,14 +632,45 @@ public void testEmitPerRecordFollowerOtelMetric(boolean isReadyToServe) { // Initialize the OTel stats for this store by calling recordFollowerRecordLag first // Note: recordFollowerRecordLag records to BOTH states (one with actual value, one with 0) - heartbeatVersionedStats - .recordFollowerRecordLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 50, isReadyToServe); + heartbeatVersionedStats.recordFollowerRecordLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 50, + isReadyToServe, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Now emit per-record follower OTel metrics (delays: 100ms, 200ms, 150ms) // Note: emitPerRecordFollowerOtelMetric only records to the active state (no squelching) - heartbeatVersionedStats.emitPerRecordFollowerOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 100, isReadyToServe); - heartbeatVersionedStats.emitPerRecordFollowerOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 200, isReadyToServe); - heartbeatVersionedStats.emitPerRecordFollowerOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 150, isReadyToServe); + heartbeatVersionedStats.emitPerRecordFollowerOtelMetric( + STORE_NAME, + CURRENT_VERSION, + REGION, + 100, + isReadyToServe, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.emitPerRecordFollowerOtelMetric( + STORE_NAME, + CURRENT_VERSION, + REGION, + 200, + isReadyToServe, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.emitPerRecordFollowerOtelMetric( + STORE_NAME, + CURRENT_VERSION, + REGION, + 150, + isReadyToServe, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Verify OTel metrics: // - recordFollowerRecordLag: records 50 to active state, 0 to inactive state (count=1 each) @@ -498,8 +689,10 @@ public void testEmitPerRecordFollowerOtelMetric(boolean isReadyToServe) { @Test public void testEmitPerRecordOtelMetricWhenStoreNotInMetadataRepository() { // Should be a graceful no-op: no exception thrown, no stats entry created - heartbeatVersionedStats.emitPerRecordLeaderOtelMetric("unknown_store", 1, REGION, 100); - heartbeatVersionedStats.emitPerRecordFollowerOtelMetric("unknown_store", 1, REGION, 100, true); + heartbeatVersionedStats + .emitPerRecordLeaderOtelMetric("unknown_store", 1, REGION, 100, WRITE_TYPE, CHUNKING_STATUS, LOCALITY); + heartbeatVersionedStats + .emitPerRecordFollowerOtelMetric("unknown_store", 1, REGION, 100, true, WRITE_TYPE, CHUNKING_STATUS, LOCALITY); assertNull( heartbeatVersionedStats.getRecordLevelDelayOtelStatsForTesting("unknown_store"), @@ -515,9 +708,12 @@ public void testEmitPerRecordLeaderOtelMetricWithoutPeriodicInitialization() { heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // Call emitPerRecordLeaderOtelMetric directly without any prior recordLeaderRecordLag call - heartbeatVersionedStats.emitPerRecordLeaderOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 100); - heartbeatVersionedStats.emitPerRecordLeaderOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 200); - heartbeatVersionedStats.emitPerRecordLeaderOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 150); + heartbeatVersionedStats + .emitPerRecordLeaderOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 100, WRITE_TYPE, CHUNKING_STATUS, LOCALITY); + heartbeatVersionedStats + .emitPerRecordLeaderOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 200, WRITE_TYPE, CHUNKING_STATUS, LOCALITY); + heartbeatVersionedStats + .emitPerRecordLeaderOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 150, WRITE_TYPE, CHUNKING_STATUS, LOCALITY); validateRecordOtelHistogram(ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, 100.0, 200.0, 3, 450.0); } @@ -527,9 +723,33 @@ public void testEmitPerRecordFollowerOtelMetricWithoutPeriodicInitialization(boo heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // Call emitPerRecordFollowerOtelMetric directly without any prior recordFollowerRecordLag call - heartbeatVersionedStats.emitPerRecordFollowerOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 100, isReadyToServe); - heartbeatVersionedStats.emitPerRecordFollowerOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 200, isReadyToServe); - heartbeatVersionedStats.emitPerRecordFollowerOtelMetric(STORE_NAME, CURRENT_VERSION, REGION, 150, isReadyToServe); + heartbeatVersionedStats.emitPerRecordFollowerOtelMetric( + STORE_NAME, + CURRENT_VERSION, + REGION, + 100, + isReadyToServe, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.emitPerRecordFollowerOtelMetric( + STORE_NAME, + CURRENT_VERSION, + REGION, + 200, + isReadyToServe, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.emitPerRecordFollowerOtelMetric( + STORE_NAME, + CURRENT_VERSION, + REGION, + 150, + isReadyToServe, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); ReplicaState activeState = isReadyToServe ? ReplicaState.READY_TO_SERVE : ReplicaState.CATCHING_UP; validateRecordOtelHistogram(ReplicaType.FOLLOWER, activeState, 100.0, 200.0, 3, 450.0); @@ -565,7 +785,14 @@ public void testOtelStatsCreatedLazilyWithCorrectVersionInfo() { assertNull(heartbeatVersionedStats.getRecordLevelDelayOtelStatsForTesting(newStoreName)); // Record a heartbeat — creates heartbeat OTel stats only (not record-level) - heartbeatVersionedStats.recordLeaderLag(newStoreName, currentVer, REGION, FIXED_CURRENT_TIME - 100); + heartbeatVersionedStats.recordLeaderLag( + newStoreName, + currentVer, + REGION, + FIXED_CURRENT_TIME - 100, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); HeartbeatOtelStats heartbeatOtelStats = heartbeatVersionedStats.getOtelStatsForTesting(newStoreName); assertNotNull(heartbeatOtelStats, "Heartbeat OTel stats should be created on first heartbeat"); @@ -578,7 +805,14 @@ public void testOtelStatsCreatedLazilyWithCorrectVersionInfo() { "Record-level stats should not be created by a heartbeat call"); // Now record a record-level lag — creates record-level OTel stats - heartbeatVersionedStats.recordLeaderRecordLag(newStoreName, currentVer, REGION, FIXED_CURRENT_TIME - 50); + heartbeatVersionedStats.recordLeaderRecordLag( + newStoreName, + currentVer, + REGION, + FIXED_CURRENT_TIME - 50, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); RecordLevelDelayOtelStats recordStats = heartbeatVersionedStats.getRecordLevelDelayOtelStatsForTesting(newStoreName); @@ -596,8 +830,22 @@ public void testHandleStoreChangedUpdatesVersionInfoInBothMaps() { heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // Initialize both OTel stats by recording both types - heartbeatVersionedStats.recordLeaderLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 100); - heartbeatVersionedStats.recordLeaderRecordLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 50); + heartbeatVersionedStats.recordLeaderLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 100, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.recordLeaderRecordLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 50, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Verify initial version info in both maps HeartbeatOtelStats heartbeatStats = heartbeatVersionedStats.getOtelStatsForTesting(STORE_NAME); @@ -630,8 +878,22 @@ public void testHandleStoreDeletedCleansBothMaps() { heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // Initialize both OTel stats - heartbeatVersionedStats.recordLeaderLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 100); - heartbeatVersionedStats.recordLeaderRecordLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 50); + heartbeatVersionedStats.recordLeaderLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 100, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.recordLeaderRecordLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 50, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); assertNotNull(heartbeatVersionedStats.getOtelStatsForTesting(STORE_NAME)); assertNotNull(heartbeatVersionedStats.getRecordLevelDelayOtelStatsForTesting(STORE_NAME)); @@ -655,13 +917,41 @@ public void testRecreationAfterDeletionInitializesVersionInfo() { heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // Create and then delete - heartbeatVersionedStats.recordLeaderLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 100); - heartbeatVersionedStats.recordLeaderRecordLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 50); + heartbeatVersionedStats.recordLeaderLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 100, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.recordLeaderRecordLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 50, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); heartbeatVersionedStats.handleStoreDeleted(STORE_NAME); // Recreate by recording again - heartbeatVersionedStats.recordLeaderLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 200); - heartbeatVersionedStats.recordLeaderRecordLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 75); + heartbeatVersionedStats.recordLeaderLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 200, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + heartbeatVersionedStats.recordLeaderRecordLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 75, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Verify fresh stats have correct version info HeartbeatOtelStats newHeartbeatStats = heartbeatVersionedStats.getOtelStatsForTesting(STORE_NAME); @@ -741,7 +1031,14 @@ public void testVersionInfoFromHandleStoreChangedUsedByLazyCreation() { // Now record a metric — triggers lazy OTel stats creation heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); - heartbeatVersionedStats.recordLeaderLag(trackedStore, updatedCurrentVersion, REGION, FIXED_CURRENT_TIME - 100); + heartbeatVersionedStats.recordLeaderLag( + trackedStore, + updatedCurrentVersion, + REGION, + FIXED_CURRENT_TIME - 100, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Verify OTel stats were created with version info from the metadata repository HeartbeatOtelStats heartbeatStats = heartbeatVersionedStats.getOtelStatsForTesting(trackedStore); @@ -756,7 +1053,14 @@ public void testVersionInfoFromHandleStoreChangedUsedByLazyCreation() { "Future version should match the metadata repository"); // Also verify record-level stats pick up the same version info - heartbeatVersionedStats.recordLeaderRecordLag(trackedStore, updatedCurrentVersion, REGION, FIXED_CURRENT_TIME - 50); + heartbeatVersionedStats.recordLeaderRecordLag( + trackedStore, + updatedCurrentVersion, + REGION, + FIXED_CURRENT_TIME - 50, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); RecordLevelDelayOtelStats recordStats = heartbeatVersionedStats.getRecordLevelDelayOtelStatsForTesting(trackedStore); assertNotNull(recordStats, "Record-level OTel stats should be created on first record-level call"); @@ -778,7 +1082,14 @@ public void testVersionRoleTaggingInHeartbeatMetrics() { heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // Record for current version - heartbeatVersionedStats.recordLeaderLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 100); + heartbeatVersionedStats.recordLeaderLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 100, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Verify CURRENT role tagging Attributes currentAttributes = Attributes.builder() @@ -788,6 +1099,9 @@ public void testVersionRoleTaggingInHeartbeatMetrics() { .put(VENICE_VERSION_ROLE.getDimensionNameInDefaultFormat(), VersionRole.CURRENT.getDimensionValue()) .put(VENICE_REPLICA_TYPE.getDimensionNameInDefaultFormat(), ReplicaType.LEADER.getDimensionValue()) .put(VENICE_REPLICA_STATE.getDimensionNameInDefaultFormat(), ReplicaState.READY_TO_SERVE.getDimensionValue()) + .put(VENICE_REGION_LOCALITY.getDimensionNameInDefaultFormat(), LOCALITY.getDimensionValue()) + .put(VENICE_STORE_WRITE_TYPE.getDimensionNameInDefaultFormat(), WRITE_TYPE.getDimensionValue()) + .put(VENICE_CHUNKING_STATUS.getDimensionNameInDefaultFormat(), CHUNKING_STATUS.getDimensionValue()) .build(); validateExponentialHistogramPointData( inMemoryMetricReader, @@ -800,7 +1114,14 @@ public void testVersionRoleTaggingInHeartbeatMetrics() { TEST_PREFIX); // Record for future version - heartbeatVersionedStats.recordLeaderLag(STORE_NAME, FUTURE_VERSION, REGION, FIXED_CURRENT_TIME - 200); + heartbeatVersionedStats.recordLeaderLag( + STORE_NAME, + FUTURE_VERSION, + REGION, + FIXED_CURRENT_TIME - 200, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Verify FUTURE role tagging Attributes futureAttributes = Attributes.builder() @@ -810,6 +1131,9 @@ public void testVersionRoleTaggingInHeartbeatMetrics() { .put(VENICE_VERSION_ROLE.getDimensionNameInDefaultFormat(), VersionRole.FUTURE.getDimensionValue()) .put(VENICE_REPLICA_TYPE.getDimensionNameInDefaultFormat(), ReplicaType.LEADER.getDimensionValue()) .put(VENICE_REPLICA_STATE.getDimensionNameInDefaultFormat(), ReplicaState.READY_TO_SERVE.getDimensionValue()) + .put(VENICE_REGION_LOCALITY.getDimensionNameInDefaultFormat(), LOCALITY.getDimensionValue()) + .put(VENICE_STORE_WRITE_TYPE.getDimensionNameInDefaultFormat(), WRITE_TYPE.getDimensionValue()) + .put(VENICE_CHUNKING_STATUS.getDimensionNameInDefaultFormat(), CHUNKING_STATUS.getDimensionValue()) .build(); validateExponentialHistogramPointData( inMemoryMetricReader, @@ -830,38 +1154,43 @@ public void testVersionRoleTaggingInRecordLevelDelayMetrics() { heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // Record for current version - heartbeatVersionedStats.recordLeaderRecordLag(STORE_NAME, CURRENT_VERSION, REGION, FIXED_CURRENT_TIME - 100); - - // Verify CURRENT role tagging - Attributes currentAttributes = Attributes.builder() - .put(VENICE_STORE_NAME.getDimensionNameInDefaultFormat(), STORE_NAME) - .put(VENICE_CLUSTER_NAME.getDimensionNameInDefaultFormat(), CLUSTER_NAME) - .put(VENICE_REGION_NAME.getDimensionNameInDefaultFormat(), REGION) - .put(VENICE_VERSION_ROLE.getDimensionNameInDefaultFormat(), VersionRole.CURRENT.getDimensionValue()) - .put(VENICE_REPLICA_TYPE.getDimensionNameInDefaultFormat(), ReplicaType.LEADER.getDimensionValue()) - .put(VENICE_REPLICA_STATE.getDimensionNameInDefaultFormat(), ReplicaState.READY_TO_SERVE.getDimensionValue()) - .build(); - validateExponentialHistogramPointData( - inMemoryMetricReader, - 100.0, - 100.0, - 1, - 100.0, - currentAttributes, - INGESTION_RECORD_DELAY.getMetricEntity().getMetricName(), - TEST_PREFIX); + heartbeatVersionedStats.recordLeaderRecordLag( + STORE_NAME, + CURRENT_VERSION, + REGION, + FIXED_CURRENT_TIME - 100, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); + + // Verify CURRENT role tagging (includes new SLO dimensions) + validateRecordOtelHistogram(ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, 100.0, 100.0, 1, 100.0); // Record for future version - heartbeatVersionedStats.recordLeaderRecordLag(STORE_NAME, FUTURE_VERSION, REGION, FIXED_CURRENT_TIME - 200); + heartbeatVersionedStats.recordLeaderRecordLag( + STORE_NAME, + FUTURE_VERSION, + REGION, + FIXED_CURRENT_TIME - 200, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Verify FUTURE role tagging Attributes futureAttributes = Attributes.builder() .put(VENICE_STORE_NAME.getDimensionNameInDefaultFormat(), STORE_NAME) .put(VENICE_CLUSTER_NAME.getDimensionNameInDefaultFormat(), CLUSTER_NAME) .put(VENICE_REGION_NAME.getDimensionNameInDefaultFormat(), REGION) + .put(VENICE_REGION_LOCALITY.getDimensionNameInDefaultFormat(), VeniceRegionLocality.LOCAL.getDimensionValue()) .put(VENICE_VERSION_ROLE.getDimensionNameInDefaultFormat(), VersionRole.FUTURE.getDimensionValue()) .put(VENICE_REPLICA_TYPE.getDimensionNameInDefaultFormat(), ReplicaType.LEADER.getDimensionValue()) .put(VENICE_REPLICA_STATE.getDimensionNameInDefaultFormat(), ReplicaState.READY_TO_SERVE.getDimensionValue()) + .put( + VENICE_STORE_WRITE_TYPE.getDimensionNameInDefaultFormat(), + VeniceStoreWriteType.REGULAR.getDimensionValue()) + .put( + VENICE_CHUNKING_STATUS.getDimensionNameInDefaultFormat(), + VeniceChunkingStatus.UNCHUNKED.getDimensionValue()) .build(); validateExponentialHistogramPointData( inMemoryMetricReader, @@ -892,8 +1221,10 @@ public void testMultipleStoresHaveIndependentVersionInfo() { heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // Initialize both stores - heartbeatVersionedStats.recordLeaderLag(storeA, 5, REGION, FIXED_CURRENT_TIME - 100); - heartbeatVersionedStats.recordLeaderLag(storeB, 10, REGION, FIXED_CURRENT_TIME - 200); + heartbeatVersionedStats + .recordLeaderLag(storeA, 5, REGION, FIXED_CURRENT_TIME - 100, WRITE_TYPE, CHUNKING_STATUS, LOCALITY); + heartbeatVersionedStats + .recordLeaderLag(storeB, 10, REGION, FIXED_CURRENT_TIME - 200, WRITE_TYPE, CHUNKING_STATUS, LOCALITY); // Verify independent version info HeartbeatOtelStats statsA = heartbeatVersionedStats.getOtelStatsForTesting(storeA); @@ -924,7 +1255,14 @@ public void testVersionRoleChangesAfterPromotion() { heartbeatVersionedStats.setCurrentTimeSupplier(() -> FIXED_CURRENT_TIME); // Record for FUTURE_VERSION — should be tagged as FUTURE - heartbeatVersionedStats.recordLeaderLag(STORE_NAME, FUTURE_VERSION, REGION, FIXED_CURRENT_TIME - 100); + heartbeatVersionedStats.recordLeaderLag( + STORE_NAME, + FUTURE_VERSION, + REGION, + FIXED_CURRENT_TIME - 100, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); HeartbeatOtelStats otelStats = heartbeatVersionedStats.getOtelStatsForTesting(STORE_NAME); assertEquals(otelStats.getVersionInfo().getCurrentVersion(), CURRENT_VERSION); @@ -940,7 +1278,14 @@ public void testVersionRoleChangesAfterPromotion() { assertEquals(otelStats.getVersionInfo().getFutureVersion(), newFutureVersion); // Record again for the SAME version (3) — now tagged as CURRENT - heartbeatVersionedStats.recordLeaderLag(STORE_NAME, FUTURE_VERSION, REGION, FIXED_CURRENT_TIME - 200); + heartbeatVersionedStats.recordLeaderLag( + STORE_NAME, + FUTURE_VERSION, + REGION, + FIXED_CURRENT_TIME - 200, + WRITE_TYPE, + CHUNKING_STATUS, + LOCALITY); // Verify the metric was tagged with CURRENT role (not FUTURE) for the second recording Attributes currentAttributes = Attributes.builder() @@ -950,6 +1295,9 @@ public void testVersionRoleChangesAfterPromotion() { .put(VENICE_VERSION_ROLE.getDimensionNameInDefaultFormat(), VersionRole.CURRENT.getDimensionValue()) .put(VENICE_REPLICA_TYPE.getDimensionNameInDefaultFormat(), ReplicaType.LEADER.getDimensionValue()) .put(VENICE_REPLICA_STATE.getDimensionNameInDefaultFormat(), ReplicaState.READY_TO_SERVE.getDimensionValue()) + .put(VENICE_REGION_LOCALITY.getDimensionNameInDefaultFormat(), LOCALITY.getDimensionValue()) + .put(VENICE_STORE_WRITE_TYPE.getDimensionNameInDefaultFormat(), WRITE_TYPE.getDimensionValue()) + .put(VENICE_CHUNKING_STATUS.getDimensionNameInDefaultFormat(), CHUNKING_STATUS.getDimensionValue()) .build(); validateExponentialHistogramPointData( inMemoryMetricReader, diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelMetricEntityTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelMetricEntityTest.java index 7af4b8c6ae5..91b9ec1b47b 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelMetricEntityTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelMetricEntityTest.java @@ -1,10 +1,13 @@ package com.linkedin.davinci.stats.ingestion.heartbeat; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CHUNKING_STATUS; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CLUSTER_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_LOCALITY; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_NAME; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_STATE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_WRITE_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_VERSION_ROLE; import static com.linkedin.venice.utils.Utils.setOf; @@ -36,9 +39,12 @@ private static Map ex VENICE_STORE_NAME, VENICE_CLUSTER_NAME, VENICE_REGION_NAME, + VENICE_REGION_LOCALITY, VENICE_VERSION_ROLE, VENICE_REPLICA_TYPE, - VENICE_REPLICA_STATE))); + VENICE_REPLICA_STATE, + VENICE_STORE_WRITE_TYPE, + VENICE_CHUNKING_STATUS))); return map; } } diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelStatsTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelStatsTest.java index d38f3c70ce1..a73cab1dec6 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelStatsTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelStatsTest.java @@ -3,11 +3,14 @@ import static com.linkedin.davinci.stats.ServerMetricEntity.SERVER_METRIC_ENTITIES; import static com.linkedin.davinci.stats.ingestion.heartbeat.RecordLevelDelayOtelMetricEntity.INGESTION_RECORD_DELAY; import static com.linkedin.venice.meta.Store.NON_EXISTING_VERSION; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CHUNKING_STATUS; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CLUSTER_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_LOCALITY; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_NAME; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_STATE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_WRITE_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_VERSION_ROLE; import static com.linkedin.venice.utils.OpenTelemetryDataTestUtils.validateExponentialHistogramPointData; import static org.testng.Assert.assertEquals; @@ -20,6 +23,9 @@ import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.stats.dimensions.ReplicaState; import com.linkedin.venice.stats.dimensions.ReplicaType; +import com.linkedin.venice.stats.dimensions.VeniceChunkingStatus; +import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; +import com.linkedin.venice.stats.dimensions.VeniceStoreWriteType; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.sdk.testing.exporter.InMemoryMetricReader; import io.tehuti.metrics.MetricsRepository; @@ -31,11 +37,17 @@ public class RecordLevelDelayOtelStatsTest { private static final String STORE_NAME = "test_store"; private static final String CLUSTER_NAME = "test_cluster"; - private static final String REGION_US_WEST = "us-west"; + private static final String LOCAL_REGION = "us-west"; + private static final String REMOTE_REGION = "us-east"; + private static final String REGION_US_WEST = LOCAL_REGION; private static final int CURRENT_VERSION = 2; private static final int FUTURE_VERSION = 3; private static final String TEST_PREFIX = "test_prefix"; + // Default SLO dimensions for most tests: non-WC, non-chunked + private static final VeniceStoreWriteType DEFAULT_WRITE_TYPE = VeniceStoreWriteType.REGULAR; + private static final VeniceChunkingStatus DEFAULT_CHUNKING_STATUS = VeniceChunkingStatus.UNCHUNKED; + private InMemoryMetricReader inMemoryMetricReader; private VeniceMetricsRepository metricsRepository; private RecordLevelDelayOtelStats recordLevelDelayOtelStats; @@ -52,6 +64,28 @@ public void setUp() { recordLevelDelayOtelStats = new RecordLevelDelayOtelStats(metricsRepository, STORE_NAME, CLUSTER_NAME); } + /** + * Helper that records a metric using the default REGULAR/UNCHUNKED labels, deriving locality + * from {@code region} (LOCAL when it matches {@link #LOCAL_REGION}, otherwise REMOTE). + */ + private void recordWithDefaultLabels( + RecordLevelDelayOtelStats stats, + int version, + String region, + ReplicaType replicaType, + ReplicaState replicaState, + long delay) { + stats.recordRecordDelayOtelMetrics( + version, + region, + replicaType, + replicaState, + DEFAULT_WRITE_TYPE, + DEFAULT_CHUNKING_STATUS, + region.equals(LOCAL_REGION) ? VeniceRegionLocality.LOCAL : VeniceRegionLocality.REMOTE, + delay); + } + @AfterMethod public void tearDown() { if (metricsRepository != null) { @@ -97,7 +131,8 @@ public void testUpdateVersionInfo() { recordLevelDelayOtelStats.updateVersionInfo(CURRENT_VERSION, FUTURE_VERSION); // Record metrics - should work with updated version info - recordLevelDelayOtelStats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + recordLevelDelayOtelStats, CURRENT_VERSION, REGION_US_WEST, ReplicaType.LEADER, @@ -119,7 +154,8 @@ public void testRecordRecordDelayOtelMetricsForCurrentVersion() { recordLevelDelayOtelStats.updateVersionInfo(CURRENT_VERSION, FUTURE_VERSION); long delay = 150L; - recordLevelDelayOtelStats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + recordLevelDelayOtelStats, CURRENT_VERSION, REGION_US_WEST, ReplicaType.LEADER, @@ -140,7 +176,8 @@ public void testRecordRecordDelayOtelMetricsForFutureVersion() { recordLevelDelayOtelStats.updateVersionInfo(CURRENT_VERSION, FUTURE_VERSION); long delay = 200L; - recordLevelDelayOtelStats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + recordLevelDelayOtelStats, FUTURE_VERSION, REGION_US_WEST, ReplicaType.FOLLOWER, @@ -161,19 +198,22 @@ public void testRecordRecordDelayOtelMetricsMultipleRecords() { recordLevelDelayOtelStats.updateVersionInfo(CURRENT_VERSION, FUTURE_VERSION); // Record three delays: 100ms, 200ms, 150ms - recordLevelDelayOtelStats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + recordLevelDelayOtelStats, CURRENT_VERSION, REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, 100L); - recordLevelDelayOtelStats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + recordLevelDelayOtelStats, CURRENT_VERSION, REGION_US_WEST, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, 200L); - recordLevelDelayOtelStats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + recordLevelDelayOtelStats, CURRENT_VERSION, REGION_US_WEST, ReplicaType.LEADER, @@ -204,7 +244,8 @@ public void testRecordRecordDelayOtelMetricsWhenOtelDisabled() { stats.updateVersionInfo(CURRENT_VERSION, FUTURE_VERSION); // Record metrics - should be no-op - stats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + stats, CURRENT_VERSION, REGION_US_WEST, ReplicaType.LEADER, @@ -237,7 +278,8 @@ private void validateRecordMetric( } /** - * Helper method to validate record histogram metrics with explicit min/max/sum + * Helper method to validate record histogram metrics with explicit min/max/sum. + * Uses the default SLO dimensions (non-WC, non-chunked, local region). */ private void validateRecordMetric( String region, @@ -248,13 +290,21 @@ private void validateRecordMetric( double expectedMax, double expectedSum, long expectedCount) { + VeniceRegionLocality locality = + region.equals(LOCAL_REGION) ? VeniceRegionLocality.LOCAL : VeniceRegionLocality.REMOTE; + VeniceStoreWriteType wcStatus = DEFAULT_WRITE_TYPE; + VeniceChunkingStatus chunkStatus = DEFAULT_CHUNKING_STATUS; + Attributes expectedAttributes = Attributes.builder() .put(VENICE_STORE_NAME.getDimensionNameInDefaultFormat(), STORE_NAME) .put(VENICE_CLUSTER_NAME.getDimensionNameInDefaultFormat(), CLUSTER_NAME) .put(VENICE_REGION_NAME.getDimensionNameInDefaultFormat(), region) + .put(VENICE_REGION_LOCALITY.getDimensionNameInDefaultFormat(), locality.getDimensionValue()) .put(VENICE_VERSION_ROLE.getDimensionNameInDefaultFormat(), versionRole.getDimensionValue()) .put(VENICE_REPLICA_TYPE.getDimensionNameInDefaultFormat(), replicaType.getDimensionValue()) .put(VENICE_REPLICA_STATE.getDimensionNameInDefaultFormat(), replicaState.getDimensionValue()) + .put(VENICE_STORE_WRITE_TYPE.getDimensionNameInDefaultFormat(), wcStatus.getDimensionValue()) + .put(VENICE_CHUNKING_STATUS.getDimensionNameInDefaultFormat(), chunkStatus.getDimensionValue()) .build(); validateExponentialHistogramPointData( @@ -297,7 +347,8 @@ public void testInitialVersionInfoIsNonExisting() { @Test public void testRecordBeforeUpdateVersionInfoTagsAsBackup() { // Do NOT call updateVersionInfo — version info remains NON_EXISTING_VERSION - recordLevelDelayOtelStats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + recordLevelDelayOtelStats, CURRENT_VERSION, REGION_US_WEST, ReplicaType.LEADER, @@ -317,7 +368,8 @@ public void testBackupVersionRoleTagging() { recordLevelDelayOtelStats.updateVersionInfo(CURRENT_VERSION, FUTURE_VERSION); int backupVersion = 1; // Neither current (2) nor future (3) - recordLevelDelayOtelStats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + recordLevelDelayOtelStats, backupVersion, REGION_US_WEST, ReplicaType.LEADER, @@ -336,7 +388,8 @@ public void testUpdateVersionInfoMultipleTimesReflectsLatest() { recordLevelDelayOtelStats.updateVersionInfo(CURRENT_VERSION, FUTURE_VERSION); // Record for FUTURE_VERSION — should be FUTURE - recordLevelDelayOtelStats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + recordLevelDelayOtelStats, FUTURE_VERSION, REGION_US_WEST, ReplicaType.LEADER, @@ -352,7 +405,8 @@ public void testUpdateVersionInfoMultipleTimesReflectsLatest() { assertEquals(recordLevelDelayOtelStats.getVersionInfo().getFutureVersion(), newFutureVersion); // Record same version (3) again — now tagged as CURRENT - recordLevelDelayOtelStats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + recordLevelDelayOtelStats, FUTURE_VERSION, REGION_US_WEST, ReplicaType.LEADER, @@ -378,7 +432,8 @@ public void testMultipleRegionsHaveIndependentMetrics() { recordLevelDelayOtelStats.updateVersionInfo(CURRENT_VERSION, FUTURE_VERSION); // Record 100ms in us-west - recordLevelDelayOtelStats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + recordLevelDelayOtelStats, CURRENT_VERSION, REGION_US_WEST, ReplicaType.LEADER, @@ -386,7 +441,8 @@ public void testMultipleRegionsHaveIndependentMetrics() { 100L); // Record 200ms in us-east - recordLevelDelayOtelStats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + recordLevelDelayOtelStats, CURRENT_VERSION, regionEast, ReplicaType.LEADER, @@ -402,14 +458,19 @@ public void testMultipleRegionsHaveIndependentMetrics() { 100.0, 1); - // Validate us-east independently + // Validate us-east independently (REMOTE since it differs from LOCAL_REGION) + VeniceStoreWriteType wcStatus = DEFAULT_WRITE_TYPE; + VeniceChunkingStatus chunkStatus = DEFAULT_CHUNKING_STATUS; Attributes eastAttributes = Attributes.builder() .put(VENICE_STORE_NAME.getDimensionNameInDefaultFormat(), STORE_NAME) .put(VENICE_CLUSTER_NAME.getDimensionNameInDefaultFormat(), CLUSTER_NAME) .put(VENICE_REGION_NAME.getDimensionNameInDefaultFormat(), regionEast) + .put(VENICE_REGION_LOCALITY.getDimensionNameInDefaultFormat(), VeniceRegionLocality.REMOTE.getDimensionValue()) .put(VENICE_VERSION_ROLE.getDimensionNameInDefaultFormat(), VersionRole.CURRENT.getDimensionValue()) .put(VENICE_REPLICA_TYPE.getDimensionNameInDefaultFormat(), ReplicaType.LEADER.getDimensionValue()) .put(VENICE_REPLICA_STATE.getDimensionNameInDefaultFormat(), ReplicaState.READY_TO_SERVE.getDimensionValue()) + .put(VENICE_STORE_WRITE_TYPE.getDimensionNameInDefaultFormat(), wcStatus.getDimensionValue()) + .put(VENICE_CHUNKING_STATUS.getDimensionNameInDefaultFormat(), chunkStatus.getDimensionValue()) .build(); validateExponentialHistogramPointData( inMemoryMetricReader, @@ -431,7 +492,8 @@ public void testCloseAndReuse() { recordLevelDelayOtelStats.updateVersionInfo(CURRENT_VERSION, FUTURE_VERSION); // Record a metric, then close - recordLevelDelayOtelStats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + recordLevelDelayOtelStats, CURRENT_VERSION, REGION_US_WEST, ReplicaType.LEADER, @@ -444,7 +506,8 @@ public void testCloseAndReuse() { assertEquals(recordLevelDelayOtelStats.getVersionInfo().getFutureVersion(), FUTURE_VERSION); // Recording after close should re-create metric state and work - recordLevelDelayOtelStats.recordRecordDelayOtelMetrics( + recordWithDefaultLabels( + recordLevelDelayOtelStats, CURRENT_VERSION, REGION_US_WEST, ReplicaType.LEADER, @@ -463,4 +526,153 @@ public void testCloseAndReuse() { 300.0, 2); } + + // ================================================================================== + // Tests for SLO classification dimensions: region locality, write compute, chunking + // ================================================================================== + + /** + * Verifies that local region records are tagged with locality=LOCAL and remote region + * records are tagged with locality=REMOTE. + */ + @Test + public void testRegionLocalityDimension() { + recordLevelDelayOtelStats.updateVersionInfo(CURRENT_VERSION, FUTURE_VERSION); + + // Record from local region + recordWithDefaultLabels( + recordLevelDelayOtelStats, + CURRENT_VERSION, + LOCAL_REGION, + ReplicaType.LEADER, + ReplicaState.READY_TO_SERVE, + 100L); + + // Record from remote region + recordWithDefaultLabels( + recordLevelDelayOtelStats, + CURRENT_VERSION, + REMOTE_REGION, + ReplicaType.LEADER, + ReplicaState.READY_TO_SERVE, + 200L); + + // Validate local — uses validateRecordMetric which checks LOCAL for LOCAL_REGION + validateRecordMetric(LOCAL_REGION, VersionRole.CURRENT, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, 100.0, 1); + + // Validate remote — explicit attributes check + Attributes remoteAttributes = Attributes.builder() + .put(VENICE_STORE_NAME.getDimensionNameInDefaultFormat(), STORE_NAME) + .put(VENICE_CLUSTER_NAME.getDimensionNameInDefaultFormat(), CLUSTER_NAME) + .put(VENICE_REGION_NAME.getDimensionNameInDefaultFormat(), REMOTE_REGION) + .put(VENICE_REGION_LOCALITY.getDimensionNameInDefaultFormat(), VeniceRegionLocality.REMOTE.getDimensionValue()) + .put(VENICE_VERSION_ROLE.getDimensionNameInDefaultFormat(), VersionRole.CURRENT.getDimensionValue()) + .put(VENICE_REPLICA_TYPE.getDimensionNameInDefaultFormat(), ReplicaType.LEADER.getDimensionValue()) + .put(VENICE_REPLICA_STATE.getDimensionNameInDefaultFormat(), ReplicaState.READY_TO_SERVE.getDimensionValue()) + .put( + VENICE_STORE_WRITE_TYPE.getDimensionNameInDefaultFormat(), + VeniceStoreWriteType.REGULAR.getDimensionValue()) + .put( + VENICE_CHUNKING_STATUS.getDimensionNameInDefaultFormat(), + VeniceChunkingStatus.UNCHUNKED.getDimensionValue()) + .build(); + validateExponentialHistogramPointData( + inMemoryMetricReader, + 200.0, + 200.0, + 1, + 200.0, + remoteAttributes, + INGESTION_RECORD_DELAY.getMetricEntity().getMetricName(), + TEST_PREFIX); + } + + /** + * Verifies that write-compute-enabled stores emit venice.store.write_type=write_compute. + */ + @Test + public void testWriteComputeEnabledDimension() { + RecordLevelDelayOtelStats wcStats = new RecordLevelDelayOtelStats(metricsRepository, "wc_store", CLUSTER_NAME); + wcStats.updateVersionInfo(CURRENT_VERSION, FUTURE_VERSION); + + wcStats.recordRecordDelayOtelMetrics( + CURRENT_VERSION, + LOCAL_REGION, + ReplicaType.LEADER, + ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.WRITE_COMPUTE, + VeniceChunkingStatus.UNCHUNKED, + VeniceRegionLocality.LOCAL, + 150L); + + Attributes expectedAttributes = Attributes.builder() + .put(VENICE_STORE_NAME.getDimensionNameInDefaultFormat(), "wc_store") + .put(VENICE_CLUSTER_NAME.getDimensionNameInDefaultFormat(), CLUSTER_NAME) + .put(VENICE_REGION_NAME.getDimensionNameInDefaultFormat(), LOCAL_REGION) + .put(VENICE_REGION_LOCALITY.getDimensionNameInDefaultFormat(), VeniceRegionLocality.LOCAL.getDimensionValue()) + .put(VENICE_VERSION_ROLE.getDimensionNameInDefaultFormat(), VersionRole.CURRENT.getDimensionValue()) + .put(VENICE_REPLICA_TYPE.getDimensionNameInDefaultFormat(), ReplicaType.LEADER.getDimensionValue()) + .put(VENICE_REPLICA_STATE.getDimensionNameInDefaultFormat(), ReplicaState.READY_TO_SERVE.getDimensionValue()) + .put( + VENICE_STORE_WRITE_TYPE.getDimensionNameInDefaultFormat(), + VeniceStoreWriteType.WRITE_COMPUTE.getDimensionValue()) + .put( + VENICE_CHUNKING_STATUS.getDimensionNameInDefaultFormat(), + VeniceChunkingStatus.UNCHUNKED.getDimensionValue()) + .build(); + + validateExponentialHistogramPointData( + inMemoryMetricReader, + 150.0, + 150.0, + 1, + 150.0, + expectedAttributes, + INGESTION_RECORD_DELAY.getMetricEntity().getMetricName(), + TEST_PREFIX); + } + + /** + * Verifies that chunking-enabled stores emit chunking_status=chunked. + */ + @Test + public void testChunkingEnabledDimension() { + RecordLevelDelayOtelStats chunkedStats = + new RecordLevelDelayOtelStats(metricsRepository, "chunked_store", CLUSTER_NAME); + chunkedStats.updateVersionInfo(CURRENT_VERSION, FUTURE_VERSION); + + chunkedStats.recordRecordDelayOtelMetrics( + CURRENT_VERSION, + LOCAL_REGION, + ReplicaType.LEADER, + ReplicaState.READY_TO_SERVE, + VeniceStoreWriteType.REGULAR, + VeniceChunkingStatus.CHUNKED, + VeniceRegionLocality.LOCAL, + 300L); + + Attributes expectedAttributes = Attributes.builder() + .put(VENICE_STORE_NAME.getDimensionNameInDefaultFormat(), "chunked_store") + .put(VENICE_CLUSTER_NAME.getDimensionNameInDefaultFormat(), CLUSTER_NAME) + .put(VENICE_REGION_NAME.getDimensionNameInDefaultFormat(), LOCAL_REGION) + .put(VENICE_REGION_LOCALITY.getDimensionNameInDefaultFormat(), VeniceRegionLocality.LOCAL.getDimensionValue()) + .put(VENICE_VERSION_ROLE.getDimensionNameInDefaultFormat(), VersionRole.CURRENT.getDimensionValue()) + .put(VENICE_REPLICA_TYPE.getDimensionNameInDefaultFormat(), ReplicaType.LEADER.getDimensionValue()) + .put(VENICE_REPLICA_STATE.getDimensionNameInDefaultFormat(), ReplicaState.READY_TO_SERVE.getDimensionValue()) + .put( + VENICE_STORE_WRITE_TYPE.getDimensionNameInDefaultFormat(), + VeniceStoreWriteType.REGULAR.getDimensionValue()) + .put(VENICE_CHUNKING_STATUS.getDimensionNameInDefaultFormat(), VeniceChunkingStatus.CHUNKED.getDimensionValue()) + .build(); + + validateExponentialHistogramPointData( + inMemoryMetricReader, + 300.0, + 300.0, + 1, + 300.0, + expectedAttributes, + INGESTION_RECORD_DELAY.getMetricEntity().getMetricName(), + TEST_PREFIX); + } } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceHeartbeatComponent.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceHeartbeatComponent.java index 78d3a8c415d..050c96d45c4 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceHeartbeatComponent.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceHeartbeatComponent.java @@ -10,7 +10,9 @@ public enum VeniceHeartbeatComponent implements VeniceDimensionInterface { /** The heartbeat reporter thread that publishes lag metrics. */ REPORTER, /** The heartbeat logger thread that logs lag delays and cleans up stale monitors. */ - LOGGER; + LOGGER, + /** The lag monitor update path (Helix state-transition triggered). */ + LAG_MONITOR_UPDATE; @Override public VeniceMetricsDimensions getDimensionName() { diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensions.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensions.java index 080fae44376..7de901d655e 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensions.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensions.java @@ -183,7 +183,10 @@ public enum VeniceMetricsDimensions { VENICE_HELIX_STATE("venice.helix.state"), /** {@link VeniceRecordTransformerOperation} Record transformer operation: put or delete. */ - VENICE_RECORD_TRANSFORMER_OPERATION("venice.record_transformer.operation"); + VENICE_RECORD_TRANSFORMER_OPERATION("venice.record_transformer.operation"), + + /** {@link VeniceStoreWriteType} Store write type: regular or write_compute. */ + VENICE_STORE_WRITE_TYPE("venice.store.write_type"); private final String[] dimensionName = new String[VeniceOpenTelemetryMetricNamingFormat.SIZE]; diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceStoreWriteType.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceStoreWriteType.java new file mode 100644 index 00000000000..26567bc475c --- /dev/null +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/dimensions/VeniceStoreWriteType.java @@ -0,0 +1,17 @@ +package com.linkedin.venice.stats.dimensions; + +/** + * Dimension to classify a store's write type for SLO tier classification. + * Partial update (write compute) stores have higher latency due to read-merge-write on leaders. + */ +public enum VeniceStoreWriteType implements VeniceDimensionInterface { + /** Store uses regular full-value writes. */ + REGULAR, + /** Store uses write compute (partial update / read-merge-write on leaders). */ + WRITE_COMPUTE; + + @Override + public VeniceMetricsDimensions getDimensionName() { + return VeniceMetricsDimensions.VENICE_STORE_WRITE_TYPE; + } +} diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateFiveEnums.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateFiveEnums.java new file mode 100644 index 00000000000..060348f8953 --- /dev/null +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateFiveEnums.java @@ -0,0 +1,240 @@ +package com.linkedin.venice.stats.metrics; + +import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; +import com.linkedin.venice.stats.dimensions.VeniceDimensionInterface; +import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; +import io.opentelemetry.api.common.Attributes; +import io.tehuti.metrics.MeasurableStat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.EnumMap; +import java.util.List; +import java.util.Map; +import javax.annotation.Nonnull; + + +/** + * Similar to {@link MetricEntityStateOneEnum} but with Five dynamic dimensions and 5 level EnumMap + */ +public class MetricEntityStateFiveEnums & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface, E3 extends Enum & VeniceDimensionInterface, E4 extends Enum & VeniceDimensionInterface, E5 extends Enum & VeniceDimensionInterface> + extends MetricEntityState { + private final EnumMap>>>> metricAttributesDataEnumMap; + + private final Class enumTypeClass1; + private final Class enumTypeClass2; + private final Class enumTypeClass3; + private final Class enumTypeClass4; + private final Class enumTypeClass5; + + /** should not be called directly, call {@link #create} instead */ + private MetricEntityStateFiveEnums( + MetricEntity metricEntity, + VeniceOpenTelemetryMetricsRepository otelRepository, + Map baseDimensionsMap, + Class enumTypeClass1, + Class enumTypeClass2, + Class enumTypeClass3, + Class enumTypeClass4, + Class enumTypeClass5) { + this( + metricEntity, + otelRepository, + null, + null, + Collections.EMPTY_LIST, + baseDimensionsMap, + enumTypeClass1, + enumTypeClass2, + enumTypeClass3, + enumTypeClass4, + enumTypeClass5); + } + + /** should not be called directly, call {@link #create} instead */ + private MetricEntityStateFiveEnums( + MetricEntity metricEntity, + VeniceOpenTelemetryMetricsRepository otelRepository, + TehutiSensorRegistrationFunction registerTehutiSensorFn, + TehutiMetricNameEnum tehutiMetricNameEnum, + List tehutiMetricStats, + Map baseDimensionsMap, + Class enumTypeClass1, + Class enumTypeClass2, + Class enumTypeClass3, + Class enumTypeClass4, + Class enumTypeClass5) { + super( + metricEntity, + otelRepository, + baseDimensionsMap, + registerTehutiSensorFn, + tehutiMetricNameEnum, + tehutiMetricStats); + validateRequiredDimensions( + metricEntity, + null, + baseDimensionsMap, + enumTypeClass1, + enumTypeClass2, + enumTypeClass3, + enumTypeClass4, + enumTypeClass5); + this.enumTypeClass1 = enumTypeClass1; + this.enumTypeClass2 = enumTypeClass2; + this.enumTypeClass3 = enumTypeClass3; + this.enumTypeClass4 = enumTypeClass4; + this.enumTypeClass5 = enumTypeClass5; + this.metricAttributesDataEnumMap = createMetricAttributesDataEnumMap(); + registerObservableCounterIfNeeded(); + } + + /** Factory method with named parameters to ensure the passed in enumTypeClass are in the same order as E */ + public static & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface, E3 extends Enum & VeniceDimensionInterface, E4 extends Enum & VeniceDimensionInterface, E5 extends Enum & VeniceDimensionInterface> MetricEntityStateFiveEnums create( + MetricEntity metricEntity, + VeniceOpenTelemetryMetricsRepository otelRepository, + Map baseDimensionsMap, + Class enumTypeClass1, + Class enumTypeClass2, + Class enumTypeClass3, + Class enumTypeClass4, + Class enumTypeClass5) { + return new MetricEntityStateFiveEnums<>( + metricEntity, + otelRepository, + baseDimensionsMap, + enumTypeClass1, + enumTypeClass2, + enumTypeClass3, + enumTypeClass4, + enumTypeClass5); + } + + /** Overloaded Factory method for constructor with Tehuti parameters */ + public static & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface, E3 extends Enum & VeniceDimensionInterface, E4 extends Enum & VeniceDimensionInterface, E5 extends Enum & VeniceDimensionInterface> MetricEntityStateFiveEnums create( + MetricEntity metricEntity, + VeniceOpenTelemetryMetricsRepository otelRepository, + TehutiSensorRegistrationFunction registerTehutiSensorFn, + TehutiMetricNameEnum tehutiMetricNameEnum, + List tehutiMetricStats, + Map baseDimensionsMap, + Class enumTypeClass1, + Class enumTypeClass2, + Class enumTypeClass3, + Class enumTypeClass4, + Class enumTypeClass5) { + return new MetricEntityStateFiveEnums<>( + metricEntity, + otelRepository, + registerTehutiSensorFn, + tehutiMetricNameEnum, + tehutiMetricStats, + baseDimensionsMap, + enumTypeClass1, + enumTypeClass2, + enumTypeClass3, + enumTypeClass4, + enumTypeClass5); + } + + /** + * Creates an EnumMap of {@link MetricAttributesData} which will be used to lazy initialize the + * Attributes and optionally a LongAdder for ASYNC_COUNTER_FOR_HIGH_PERF_CASES metrics. + */ + private EnumMap>>>> createMetricAttributesDataEnumMap() { + if (!emitOpenTelemetryMetrics()) { + return null; + } + + return new EnumMap<>(enumTypeClass1); + } + + /** + * Manages the nested EnumMap structure for lazy initialization of MetricAttributesData. + * The structure is a five-level nested EnumMap: + * EnumMap>>>>. + * This allows efficient retrieval of state based on five enum dimensions (E1, E2, E3, E4, E5). + * + * For thread safety considerations, see the class-level notes on {@link MetricEntityStateOneEnum}. + */ + private MetricAttributesData getMetricAttributesData( + E1 dimension1, + E2 dimension2, + E3 dimension3, + E4 dimension4, + E5 dimension5) { + if (!emitOpenTelemetryMetrics()) { + return null; + } + + return metricAttributesDataEnumMap.computeIfAbsent(dimension1, k -> { + validateInputDimension(k); + return new EnumMap<>(enumTypeClass2); + }).computeIfAbsent(dimension2, k -> { + validateInputDimension(k); + return new EnumMap<>(enumTypeClass3); + }).computeIfAbsent(dimension3, k -> { + validateInputDimension(k); + return new EnumMap<>(enumTypeClass4); + }).computeIfAbsent(dimension4, k -> { + validateInputDimension(k); + return new EnumMap<>(enumTypeClass5); + }).computeIfAbsent(dimension5, k -> { + validateInputDimension(k); + Attributes attrs = createAttributes(dimension1, dimension2, dimension3, dimension4, dimension5); + return new MetricAttributesData(attrs, isObservableCounter()); + }); + } + + /** + * Returns the Attributes for the given dimensions. + */ + public Attributes getAttributes(E1 dimension1, E2 dimension2, E3 dimension3, E4 dimension4, E5 dimension5) { + MetricAttributesData holder = getMetricAttributesData(dimension1, dimension2, dimension3, dimension4, dimension5); + return holder != null ? holder.getAttributes() : null; + } + + public void record( + double value, + @Nonnull E1 dimension1, + @Nonnull E2 dimension2, + @Nonnull E3 dimension3, + @Nonnull E4 dimension4, + @Nonnull E5 dimension5) { + super.record(value, getMetricAttributesData(dimension1, dimension2, dimension3, dimension4, dimension5)); + } + + public void record( + long value, + @Nonnull E1 dimension1, + @Nonnull E2 dimension2, + @Nonnull E3 dimension3, + @Nonnull E4 dimension4, + @Nonnull E5 dimension5) { + super.record(value, getMetricAttributesData(dimension1, dimension2, dimension3, dimension4, dimension5)); + } + + @Override + protected Iterable getAllMetricAttributesData() { + if (metricAttributesDataEnumMap == null) { + return null; + } + + List allData = new ArrayList<>(); + for (EnumMap>>> level2Map: metricAttributesDataEnumMap + .values()) { + for (EnumMap>> level3Map: level2Map.values()) { + for (EnumMap> level4Map: level3Map.values()) { + for (EnumMap level5Map: level4Map.values()) { + allData.addAll(level5Map.values()); + } + } + } + } + return allData; + } + + /** visible for testing */ + public EnumMap>>>> getMetricAttributesDataEnumMap() { + return metricAttributesDataEnumMap; + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceHeartbeatComponentTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceHeartbeatComponentTest.java index be9571dca51..4ea5f0e88da 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceHeartbeatComponentTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceHeartbeatComponentTest.java @@ -12,6 +12,7 @@ public void testDimensionInterface() { CollectionUtils.mapBuilder() .put(VeniceHeartbeatComponent.REPORTER, "reporter") .put(VeniceHeartbeatComponent.LOGGER, "logger") + .put(VeniceHeartbeatComponent.LAG_MONITOR_UPDATE, "lag_monitor_update") .build(); new VeniceDimensionTestFixture<>( VeniceHeartbeatComponent.class, diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensionsTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensionsTest.java index ccd9c78bfd9..b60bdbe704f 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensionsTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceMetricsDimensionsTest.java @@ -187,6 +187,9 @@ public void testGetDimensionNameInSnakeCase() { case VENICE_RECORD_TRANSFORMER_OPERATION: assertEquals(dimension.getDimensionName(format), "venice.record_transformer.operation"); break; + case VENICE_STORE_WRITE_TYPE: + assertEquals(dimension.getDimensionName(format), "venice.store.write_type"); + break; default: throw new IllegalArgumentException("Unknown dimension: " + dimension); } @@ -372,6 +375,9 @@ public void testGetDimensionNameInCamelCase() { case VENICE_RECORD_TRANSFORMER_OPERATION: assertEquals(dimension.getDimensionName(format), "venice.recordTransformer.operation"); break; + case VENICE_STORE_WRITE_TYPE: + assertEquals(dimension.getDimensionName(format), "venice.store.writeType"); + break; default: throw new IllegalArgumentException("Unknown dimension: " + dimension); } @@ -557,6 +563,9 @@ public void testGetDimensionNameInPascalCase() { case VENICE_RECORD_TRANSFORMER_OPERATION: assertEquals(dimension.getDimensionName(format), "Venice.RecordTransformer.Operation"); break; + case VENICE_STORE_WRITE_TYPE: + assertEquals(dimension.getDimensionName(format), "Venice.Store.WriteType"); + break; default: throw new IllegalArgumentException("Unknown dimension: " + dimension); } diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceStoreWriteTypeTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceStoreWriteTypeTest.java new file mode 100644 index 00000000000..e2f971e2e48 --- /dev/null +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/dimensions/VeniceStoreWriteTypeTest.java @@ -0,0 +1,20 @@ +package com.linkedin.venice.stats.dimensions; + +import com.linkedin.venice.utils.CollectionUtils; +import java.util.Map; +import org.testng.annotations.Test; + + +public class VeniceStoreWriteTypeTest { + @Test + public void testDimensionInterface() { + Map expectedValues = CollectionUtils.mapBuilder() + .put(VeniceStoreWriteType.REGULAR, "regular") + .put(VeniceStoreWriteType.WRITE_COMPUTE, "write_compute") + .build(); + new VeniceDimensionTestFixture<>( + VeniceStoreWriteType.class, + VeniceMetricsDimensions.VENICE_STORE_WRITE_TYPE, + expectedValues).assertAll(); + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateFiveEnumsTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateFiveEnumsTest.java new file mode 100644 index 00000000000..c233eca7cb0 --- /dev/null +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateFiveEnumsTest.java @@ -0,0 +1,539 @@ +package com.linkedin.venice.stats.metrics; + +import static com.linkedin.venice.read.RequestType.MULTI_GET_STREAMING; +import static com.linkedin.venice.stats.dimensions.RequestRetryAbortReason.SLOW_ROUTE; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_METHOD; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_RETRY_ABORT_REASON; +import static com.linkedin.venice.stats.metrics.MetricEntityStateTest.DimensionEnum1.DIMENSION_ONE; +import static com.linkedin.venice.stats.metrics.MetricEntityStateTest.DimensionEnum1.DIMENSION_TWO; +import static com.linkedin.venice.stats.metrics.MetricType.COUNTER; +import static org.mockito.Mockito.doReturn; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; + +import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; +import io.opentelemetry.api.common.Attributes; +import java.util.EnumMap; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import org.mockito.Mockito; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + + +/** + * Test class for {@link MetricEntityStateFiveEnums}. + */ +public class MetricEntityStateFiveEnumsTest extends MetricEntityStateEnumTestBase { + private final Map attributesMap = new HashMap<>(); + + @BeforeMethod + public void setUp() { + setUpCommonMocks(); + + Set dimensionsSet = createDimensionSet( + DIMENSION_ONE.getDimensionName(), + MetricEntityStateTest.DimensionEnum2.DIMENSION_ONE.getDimensionName(), + MetricEntityStateTest.DimensionEnum3.DIMENSION_ONE.getDimensionName(), + MetricEntityStateTest.DimensionEnum4.DIMENSION_ONE.getDimensionName(), + MetricEntityStateTest.DimensionEnum5.DIMENSION_ONE.getDimensionName(), + MetricEntityStateTest.DimensionEnum1Duplicate.DIMENSION_ONE.getDimensionName() // Duplicate: ignored + ); + setupMockMetricEntity(dimensionsSet); + + // Pre-build attributes for all enum combinations + for (MetricEntityStateTest.DimensionEnum1 enum1: MetricEntityStateTest.DimensionEnum1.values()) { + for (MetricEntityStateTest.DimensionEnum2 enum2: MetricEntityStateTest.DimensionEnum2.values()) { + for (MetricEntityStateTest.DimensionEnum3 enum3: MetricEntityStateTest.DimensionEnum3.values()) { + for (MetricEntityStateTest.DimensionEnum4 enum4: MetricEntityStateTest.DimensionEnum4.values()) { + for (MetricEntityStateTest.DimensionEnum5 enum5: MetricEntityStateTest.DimensionEnum5.values()) { + Attributes attributes = buildAttributes(enum1, enum2, enum3, enum4, enum5); + String attributeName = createAttributeName("attributesDimension", enum1, enum2, enum3, enum4, enum5); + attributesMap.put(attributeName, attributes); + } + } + } + } + } + } + + @Test + public void testConstructorWithoutOtelRepo() { + MetricEntityStateFiveEnums metricEntityState = + MetricEntityStateFiveEnums.create( + mockMetricEntity, + null, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class, + MetricEntityStateTest.DimensionEnum5.class); + assertNotNull(metricEntityState); + assertNull(metricEntityState.getMetricAttributesDataEnumMap()); + for (MetricEntityStateTest.DimensionEnum1 enum1: MetricEntityStateTest.DimensionEnum1.values()) { + for (MetricEntityStateTest.DimensionEnum2 enum2: MetricEntityStateTest.DimensionEnum2.values()) { + for (MetricEntityStateTest.DimensionEnum3 enum3: MetricEntityStateTest.DimensionEnum3.values()) { + for (MetricEntityStateTest.DimensionEnum4 enum4: MetricEntityStateTest.DimensionEnum4.values()) { + for (MetricEntityStateTest.DimensionEnum5 enum5: MetricEntityStateTest.DimensionEnum5.values()) { + assertNull(metricEntityState.getAttributes(enum1, enum2, enum3, enum4, enum5)); + } + } + } + } + } + } + + @Test + public void testConstructorWithOtelRepo() { + MetricEntityStateFiveEnums metricEntityState = + MetricEntityStateFiveEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class, + MetricEntityStateTest.DimensionEnum5.class); + assertNotNull(metricEntityState); + assertEquals(metricEntityState.getMetricAttributesDataEnumMap().size(), 0); + } + + @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = ".*has no constants.*") + public void testCreateAttributesEnumMapWithEmptyEnum() { + MetricEntityStateFiveEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.EmptyDimensionEnum.class, + MetricEntityStateTest.EmptyDimensionEnum.class, + MetricEntityStateTest.EmptyDimensionEnum.class, + MetricEntityStateTest.EmptyDimensionEnum.class, + MetricEntityStateTest.EmptyDimensionEnum.class); + } + + @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "The input Otel dimension cannot be null.*") + public void testGetAttributesWithNullDimension() { + MetricEntityStateFiveEnums metricEntityState = + MetricEntityStateFiveEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class, + MetricEntityStateTest.DimensionEnum5.class); + metricEntityState.getAttributes(null, null, null, null, null); + } + + @Test(expectedExceptions = ClassCastException.class) + public void testGetAttributesWithInvalidDimensionType() { + MetricEntityStateFiveEnums metricEntityState = MetricEntityStateFiveEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class, + MetricEntityStateTest.DimensionEnum5.class); + metricEntityState.getAttributes( + MULTI_GET_STREAMING, + MULTI_GET_STREAMING, + MULTI_GET_STREAMING, + MULTI_GET_STREAMING, + MULTI_GET_STREAMING); + } + + @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = ".*has duplicate dimensions for MetricEntity.*") + public void testConstructorWithDuplicateClasses() { + MetricEntity mockMetricEntity = Mockito.mock(MetricEntity.class); + Set dimensionsSet = new HashSet<>(); + dimensionsSet.add(VENICE_REQUEST_METHOD); // part of baseDimensionsMap + dimensionsSet.add(DIMENSION_ONE.getDimensionName()); + dimensionsSet.add(MetricEntityStateTest.DimensionEnum2.DIMENSION_ONE.getDimensionName()); + dimensionsSet.add(MetricEntityStateTest.DimensionEnum3.DIMENSION_ONE.getDimensionName()); + dimensionsSet.add(MetricEntityStateTest.DimensionEnum4.DIMENSION_ONE.getDimensionName()); + dimensionsSet.add(MetricEntityStateTest.DimensionEnum1Duplicate.DIMENSION_ONE.getDimensionName()); + doReturn(dimensionsSet).when(mockMetricEntity).getDimensionsList(); + doReturn(COUNTER).when(mockMetricEntity).getMetricType(); + MetricEntityStateFiveEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum1Duplicate.class, // duplicate + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class); + } + + @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = ".*doesn't match with the required dimensions.*") + public void testConstructorWithDuplicateBaseDimension() { + Map baseDimensionsMap = new HashMap<>(); + baseDimensionsMap.put(VENICE_REQUEST_METHOD, MULTI_GET_STREAMING.getDimensionValue()); + baseDimensionsMap.put(DIMENSION_ONE.getDimensionName(), DIMENSION_ONE.getDimensionValue()); // duplicate + MetricEntityStateFiveEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class, + MetricEntityStateTest.DimensionEnum5.class); + } + + @Test + public void testGetAttributesWithValidDimension() { + MetricEntityStateFiveEnums metricEntityState = + MetricEntityStateFiveEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class, + MetricEntityStateTest.DimensionEnum5.class); + + // getAttributes will work similarly for all cases as the attributes are either pre created + // or on demand + Attributes attributes = metricEntityState.getAttributes( + MetricEntityStateTest.DimensionEnum1.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum2.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum3.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum4.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum5.DIMENSION_ONE); + assertNotNull(attributes); + assertEquals(attributes.size(), 6); + assertEquals( + attributes, + attributesMap.get( + "attributesDimensionEnum1DIMENSION_ONEEnum2DIMENSION_ONEEnum3DIMENSION_ONEEnum4DIMENSION_ONEEnum5DIMENSION_ONE")); + + validateAttributesEnumMap( + metricEntityState, + 1, + MetricEntityStateTest.DimensionEnum1.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum2.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum3.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum4.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum5.DIMENSION_ONE, + attributes); + + attributes = metricEntityState.getAttributes( + MetricEntityStateTest.DimensionEnum1.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum2.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum3.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum4.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum5.DIMENSION_TWO); + assertNotNull(attributes); + assertEquals(attributes.size(), 6); + assertEquals( + attributes, + attributesMap.get( + "attributesDimensionEnum1DIMENSION_TWOEnum2DIMENSION_TWOEnum3DIMENSION_TWOEnum4DIMENSION_TWOEnum5DIMENSION_TWO")); + + validateAttributesEnumMap( + metricEntityState, + 2, + MetricEntityStateTest.DimensionEnum1.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum2.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum3.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum4.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum5.DIMENSION_TWO, + attributes); + } + + /** + * Records a "diagonal" combo where consecutive dimensions alternate ONE/TWO. If the + * computeIfAbsent chain ever nested an EnumMap of the wrong type at one level (e.g. Enum4 where + * Enum3 belongs), retrieving the resulting Attributes would produce the wrong attribute set. + * Aligned-only combos (all-ONE / all-TWO) cannot detect that swap. + */ + @Test + public void testGetAttributesWithDiagonalDimensionCombo() { + MetricEntityStateFiveEnums metricEntityState = + MetricEntityStateFiveEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class, + MetricEntityStateTest.DimensionEnum5.class); + + Attributes diag1 = metricEntityState.getAttributes( + MetricEntityStateTest.DimensionEnum1.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum2.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum3.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum4.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum5.DIMENSION_ONE); + assertNotNull(diag1); + assertEquals(diag1.size(), 6); + assertEquals( + diag1, + attributesMap.get( + "attributesDimensionEnum1DIMENSION_ONEEnum2DIMENSION_TWOEnum3DIMENSION_ONEEnum4DIMENSION_TWOEnum5DIMENSION_ONE")); + + // Inverse diagonal — different combo must produce a different Attributes. + Attributes diag2 = metricEntityState.getAttributes( + MetricEntityStateTest.DimensionEnum1.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum2.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum3.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum4.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum5.DIMENSION_TWO); + assertNotNull(diag2); + assertEquals( + diag2, + attributesMap.get( + "attributesDimensionEnum1DIMENSION_TWOEnum2DIMENSION_ONEEnum3DIMENSION_TWOEnum4DIMENSION_ONEEnum5DIMENSION_TWO")); + org.testng.Assert.assertNotEquals(diag1, diag2); + } + + @Test + public void testRecordWithValidDimension() { + MetricEntityStateFiveEnums metricEntityState = + MetricEntityStateFiveEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class, + MetricEntityStateTest.DimensionEnum5.class); + metricEntityState.record( + 100L, + DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum2.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum3.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum4.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum5.DIMENSION_ONE); + validateAttributesEnumMap( + metricEntityState, + 1, + MetricEntityStateTest.DimensionEnum1.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum2.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum3.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum4.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum5.DIMENSION_ONE, + attributesMap.get( + "attributesDimensionEnum1DIMENSION_ONEEnum2DIMENSION_ONEEnum3DIMENSION_ONEEnum4DIMENSION_ONEEnum5DIMENSION_ONE")); + metricEntityState.record( + 100.5, + DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum2.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum3.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum4.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum5.DIMENSION_ONE); + validateAttributesEnumMap( + metricEntityState, + 1, + MetricEntityStateTest.DimensionEnum1.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum2.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum3.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum4.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum5.DIMENSION_ONE, + attributesMap.get( + "attributesDimensionEnum1DIMENSION_ONEEnum2DIMENSION_ONEEnum3DIMENSION_ONEEnum4DIMENSION_ONEEnum5DIMENSION_ONE")); + metricEntityState.record( + 100L, + DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum2.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum3.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum4.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum5.DIMENSION_TWO); + validateAttributesEnumMap( + metricEntityState, + 2, + MetricEntityStateTest.DimensionEnum1.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum2.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum3.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum4.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum5.DIMENSION_TWO, + attributesMap.get( + "attributesDimensionEnum1DIMENSION_TWOEnum2DIMENSION_TWOEnum3DIMENSION_TWOEnum4DIMENSION_TWOEnum5DIMENSION_TWO")); + metricEntityState.record( + 100.5, + DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum2.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum3.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum4.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum5.DIMENSION_TWO); + validateAttributesEnumMap( + metricEntityState, + 2, + MetricEntityStateTest.DimensionEnum1.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum2.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum3.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum4.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum5.DIMENSION_TWO, + attributesMap.get( + "attributesDimensionEnum1DIMENSION_TWOEnum2DIMENSION_TWOEnum3DIMENSION_TWOEnum4DIMENSION_TWOEnum5DIMENSION_TWO")); + } + + private void validateAttributesEnumMap( + MetricEntityStateFiveEnums metricEntityState, + int attributesEnumMapSize, + MetricEntityStateTest.DimensionEnum1 dimension1, + MetricEntityStateTest.DimensionEnum2 dimension2, + MetricEntityStateTest.DimensionEnum3 dimension3, + MetricEntityStateTest.DimensionEnum4 dimension4, + MetricEntityStateTest.DimensionEnum5 dimension5, + Attributes attributes) { + EnumMap>>>> metricAttributesDataEnumMap = + metricEntityState.getMetricAttributesDataEnumMap(); + // verify whether the attributes are cached + assertNotNull(metricAttributesDataEnumMap); + assertEquals(metricAttributesDataEnumMap.size(), attributesEnumMapSize); + + EnumMap>>> mapE2 = + metricAttributesDataEnumMap.get(dimension1); + assertNotNull(mapE2); + assertEquals(mapE2.size(), 1); + EnumMap>> mapE3 = + mapE2.get(dimension2); + assertNotNull(mapE3); + assertEquals(mapE3.size(), 1); + EnumMap> mapE4 = + mapE3.get(dimension3); + assertNotNull(mapE4); + assertEquals(mapE4.size(), 1); + EnumMap mapE5 = mapE4.get(dimension4); + assertNotNull(mapE5); + assertEquals(mapE5.size(), 1); + assertEquals(mapE5.get(dimension5).getAttributes(), attributes); + } + + @Test + public void testValidateRequiredDimensions() { + Map baseDimensionsMap = new HashMap<>(); + // case 1: right values + baseDimensionsMap.put(VENICE_REQUEST_METHOD, MULTI_GET_STREAMING.getDimensionValue()); + MetricEntityStateFiveEnums metricEntityState = + MetricEntityStateFiveEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class, + MetricEntityStateTest.DimensionEnum5.class); + assertNotNull(metricEntityState); + + // case 2: baseDimensionsMap has extra values + baseDimensionsMap.clear(); + baseDimensionsMap.put(VENICE_REQUEST_METHOD, MULTI_GET_STREAMING.getDimensionValue()); + baseDimensionsMap.put(VENICE_REQUEST_RETRY_ABORT_REASON, SLOW_ROUTE.getDimensionValue()); + try { + MetricEntityStateFiveEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class, + MetricEntityStateTest.DimensionEnum5.class); + fail(); + } catch (IllegalArgumentException e) { + assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); + } + + // case 3: baseDimensionsMap has less values + baseDimensionsMap.clear(); + try { + MetricEntityStateFiveEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class, + MetricEntityStateTest.DimensionEnum5.class); + fail(); + } catch (IllegalArgumentException e) { + assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); + } + + // case 4: baseDimensionsMap has same count, but different dimensions + baseDimensionsMap.clear(); + baseDimensionsMap.put(VENICE_REQUEST_RETRY_ABORT_REASON, SLOW_ROUTE.getDimensionValue()); + try { + MetricEntityStateFiveEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class, + MetricEntityStateTest.DimensionEnum5.class); + fail(); + } catch (IllegalArgumentException e) { + assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); + } + } + + @Test + public void testGetAllMetricAttributesData() { + MetricEntityStateFiveEnums state = + MetricEntityStateFiveEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class, + MetricEntityStateTest.DimensionEnum5.class); + // Populate the 5-level EnumMap with 2 distinct entries spanning all enum levels + state.record( + 100L, + MetricEntityStateTest.DimensionEnum1.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum2.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum3.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum4.DIMENSION_ONE, + MetricEntityStateTest.DimensionEnum5.DIMENSION_ONE); + state.record( + 200L, + MetricEntityStateTest.DimensionEnum1.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum2.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum3.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum4.DIMENSION_TWO, + MetricEntityStateTest.DimensionEnum5.DIMENSION_TWO); + + // Verify iteration walks all 5 nested EnumMap levels and surfaces both entries. + int count = 0; + for (MetricAttributesData md: state.getAllMetricAttributesData()) { + assertNotNull(md); + count++; + } + assertEquals(count, 2); + + // Otel disabled → null sentinel branch + MetricEntityStateFiveEnums disabled = + MetricEntityStateFiveEnums.create( + mockMetricEntity, + null, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + MetricEntityStateTest.DimensionEnum2.class, + MetricEntityStateTest.DimensionEnum3.class, + MetricEntityStateTest.DimensionEnum4.class, + MetricEntityStateTest.DimensionEnum5.class); + assertNull(disabled.getAllMetricAttributesData()); + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java index 4388eac3132..32aabdfabfc 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java @@ -4,6 +4,7 @@ import static com.linkedin.venice.stats.dimensions.RequestRetryType.ERROR_RETRY; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.HTTP_RESPONSE_STATUS_CODE_CATEGORY; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_FANOUT_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_METHOD; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REQUEST_RETRY_TYPE; import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; @@ -540,6 +541,26 @@ public String getDimensionValue() { } } + enum DimensionEnum5 implements VeniceDimensionInterface { + DIMENSION_ONE(), DIMENSION_TWO(); + + private final String dimensionValue; + + DimensionEnum5() { + this.dimensionValue = "value_" + name().toLowerCase(); + } + + @Override + public VeniceMetricsDimensions getDimensionName() { + return VENICE_REQUEST_FANOUT_TYPE; // Dummy dimension + } + + @Override + public String getDimensionValue() { + return dimensionValue; + } + } + enum EmptyDimensionEnum implements VeniceDimensionInterface { ; // Empty enum @Override diff --git a/internal/venice-common/src/main/java/com/linkedin/venice/ConfigKeys.java b/internal/venice-common/src/main/java/com/linkedin/venice/ConfigKeys.java index fc69946f14f..2308146cb56 100644 --- a/internal/venice-common/src/main/java/com/linkedin/venice/ConfigKeys.java +++ b/internal/venice-common/src/main/java/com/linkedin/venice/ConfigKeys.java @@ -2754,6 +2754,14 @@ private ConfigKeys() { public static final String SERVER_PER_RECORD_BATCH_OTEL_METRICS_ENABLED = "server.per.record.batch.otel.metrics.enabled"; + /** + * Sleep interval (in seconds) between heartbeat reporter cycles in {@code HeartbeatMonitoringService}. + * The reporter thread iterates the heartbeat-timestamp map and emits aggregate lag metrics on + * each cycle. Default: 60s in production. Tests typically override to 1s so SLO/heartbeat-delay + * metrics show up before the test method timeout. + */ + public static final String SERVER_HEARTBEAT_REPORTER_INTERVAL_SECONDS = "server.heartbeat.reporter.interval.seconds"; + /** * Whether to enable HyperLogLog-based unique key count tracking during ingestion. * When enabled, each partition maintains an HLL sketch (~8KB at lgK=13) that estimates diff --git a/internal/venice-test-common/build.gradle b/internal/venice-test-common/build.gradle index 52e3c43d83e..fafc75a1f8a 100644 --- a/internal/venice-test-common/build.gradle +++ b/internal/venice-test-common/build.gradle @@ -266,7 +266,7 @@ task generateGHCI() { def jobs = [] - def integrationTestTimeout = 12 // minutes + def integrationTestTimeout = 15 // minutes integrationTestBuckets.each { name, patterns -> def flowName = "IntegrationTests_" + name jobs << flowName diff --git a/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/endToEnd/NearlineE2ELatencyTest.java b/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/endToEnd/NearlineE2ELatencyTest.java index 0a3381a09c6..30e4ff135e7 100644 --- a/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/endToEnd/NearlineE2ELatencyTest.java +++ b/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/endToEnd/NearlineE2ELatencyTest.java @@ -1,7 +1,13 @@ package com.linkedin.venice.endToEnd; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_CHUNKING_STATUS; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REGION_LOCALITY; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_REPLICA_TYPE; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_WRITE_TYPE; import static com.linkedin.venice.utils.IntegrationTestPushUtils.getSamzaProducer; import static com.linkedin.venice.utils.IntegrationTestPushUtils.sendStreamingRecord; +import static com.linkedin.venice.utils.TestWriteUtils.NAME_RECORD_V1_SCHEMA; import static com.linkedin.venice.utils.TestWriteUtils.STRING_SCHEMA; import com.linkedin.davinci.stats.IngestionStats; @@ -9,6 +15,7 @@ import com.linkedin.venice.client.store.ClientConfig; import com.linkedin.venice.client.store.ClientFactory; import com.linkedin.venice.controllerapi.ControllerClient; +import com.linkedin.venice.controllerapi.ControllerResponse; import com.linkedin.venice.controllerapi.JobStatusQueryResponse; import com.linkedin.venice.controllerapi.UpdateStoreQueryParams; import com.linkedin.venice.controllerapi.VersionCreationResponse; @@ -17,24 +24,43 @@ import com.linkedin.venice.integration.utils.PubSubBrokerWrapper; import com.linkedin.venice.integration.utils.VeniceClusterWrapper; import com.linkedin.venice.integration.utils.VeniceMultiClusterWrapper; +import com.linkedin.venice.integration.utils.VeniceServerWrapper; import com.linkedin.venice.meta.Version; import com.linkedin.venice.pubsub.PubSubPositionTypeRegistry; import com.linkedin.venice.pubsub.PubSubProducerAdapterFactory; +import com.linkedin.venice.samza.VeniceSystemProducer; import com.linkedin.venice.server.VeniceServer; +import com.linkedin.venice.stats.VeniceMetricsRepository; +import com.linkedin.venice.stats.dimensions.ReplicaType; +import com.linkedin.venice.stats.dimensions.VeniceChunkingStatus; +import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; +import com.linkedin.venice.stats.dimensions.VeniceStoreWriteType; +import com.linkedin.venice.utils.DataProviderUtils; +import com.linkedin.venice.utils.IntegrationTestPushUtils; import com.linkedin.venice.utils.TestUtils; import com.linkedin.venice.utils.Utils; +import io.opentelemetry.api.common.AttributeKey; +import io.opentelemetry.sdk.metrics.data.ExponentialHistogramData; +import io.opentelemetry.sdk.metrics.data.ExponentialHistogramPointData; +import io.opentelemetry.sdk.metrics.data.MetricData; +import io.opentelemetry.sdk.testing.exporter.InMemoryMetricReader; import io.tehuti.metrics.MetricsRepository; import java.util.AbstractMap; +import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Collectors; import java.util.stream.IntStream; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.samza.system.SystemProducer; @@ -46,6 +72,8 @@ public class NearlineE2ELatencyTest extends AbstractMultiRegionTest { private static final Logger LOGGER = LogManager.getLogger(NearlineE2ELatencyTest.class); private static final long TEST_TIMEOUT = 120_000; + // Worst-case budget: 60s push + 30s read availability + 90s heartbeat reporter cycle. + private static final long SLO_DIMS_TEST_TIMEOUT = 240_000; private PubSubPositionTypeRegistry pubSubPositionTypeRegistry; @@ -164,4 +192,219 @@ public void testEndToEndNearlineMetric() { Assert.assertTrue(producerToLocalBroker.get()); Assert.assertTrue(producerToLocalBrokerLatencies.stream().anyMatch(v -> v > 0)); } + + /** + * Validates that the record-level delay OTel metric (ingestion.replication.record.delay) includes + * the SLO classification dimensions: region locality, store write type, and chunking status. + * + *

Uses the same hybrid store setup as {@link #testEndToEndNearlineMetric()}: non-WC, non-chunked, + * local-region ingestion. After streaming records and waiting for ingestion, verifies that at least + * one server emitted the histogram with the expected dimension values. + */ + @Test(dataProvider = "Two-True-and-False", dataProviderClass = DataProviderUtils.class, timeOut = SLO_DIMS_TEST_TIMEOUT) + public void testRecordLevelDelaySloDimensions(boolean writeComputeEnabled, boolean chunkingEnabled) { + String storeName = Utils.getUniqueString("test-slo-dims"); + String parentControllerUrls = parentController.getControllerUrl(); + int versionNumber; + try (ControllerClient parentControllerCli = new ControllerClient(CLUSTER_NAME, parentControllerUrls); + ControllerClient dc0Client = + new ControllerClient(CLUSTER_NAME, childDatacenters.get(0).getControllerConnectString()); + ControllerClient dc1Client = + new ControllerClient(CLUSTER_NAME, childDatacenters.get(1).getControllerConnectString())) { + List dcControllerClientList = Arrays.asList(dc0Client, dc1Client); + // Use NAME_RECORD_V1_SCHEMA (record type with default values) so write-compute can be + // enabled — the controller rejects setWriteComputationEnabled(true) on primitive schemas. + TestUtils.createAndVerifyStoreInAllRegions( + storeName, + parentControllerCli, + dcControllerClientList, + STRING_SCHEMA.toString(), + NAME_RECORD_V1_SCHEMA.toString()); + /* + * Active-active replication so each fabric's leaders pull from EVERY fabric's RT — + * a leader processing a record from a remote-fabric RT emits locality=REMOTE. + * Without AA, only the source-fabric's RT carries records, and the cross-region branch + * isn't exercised at all (verified empirically: dc1 won't even ingest "19" within 120s + * under plain NR with a single dc0 producer). + */ + UpdateStoreQueryParams params = new UpdateStoreQueryParams().setHybridRewindSeconds(10) + .setHybridOffsetLagThreshold(5) + .setNativeReplicationEnabled(true) + .setActiveActiveReplicationEnabled(true) + .setPartitionCount(1) + .setChunkingEnabled(chunkingEnabled); + if (writeComputeEnabled) { + params.setWriteComputationEnabled(true); + } + Assert.assertFalse(parentControllerCli.updateStore(storeName, params).isError()); + TestUtils.verifyDCConfigNativeAndActiveRepl(storeName, true, true, dc0Client, dc1Client); + // Empty push to materialize the hybrid version — no batch records, just the version + // boundary so the streaming records can be ingested. + ControllerResponse emptyPushResponse = parentControllerCli + .sendEmptyPushAndWait(storeName, Utils.getUniqueString("empty-slo-dims-push"), 1L, 60_000L); + Assert.assertFalse(emptyPushResponse.isError()); + Assert.assertTrue(emptyPushResponse instanceof JobStatusQueryResponse); + versionNumber = ((JobStatusQueryResponse) emptyPushResponse).getVersion(); + // Wait for the empty push to be current in BOTH fabrics so each fabric's leaders are ready + // to ingest streaming records. + TestUtils.waitForNonDeterministicAssertion(60, TimeUnit.SECONDS, true, true, () -> { + for (ControllerClient cc: dcControllerClientList) { + Assert.assertEquals(cc.getStore(storeName).getStore().getCurrentVersion(), versionNumber); + } + }); + } + + /* + * Write streaming records via per-fabric Samza producers. With AA enabled, each fabric's + * leader pulls EVERY fabric's RT — so each leader sees records originating from BOTH its + * local RT (locality=LOCAL) and the remote RT (locality=REMOTE). + */ + Map fabricToProducer = new HashMap<>(); + int recordsPerFabric = 10; + try { + for (int dcIndex = 0; dcIndex < childDatacenters.size(); dcIndex++) { + VeniceSystemProducer producer = + IntegrationTestPushUtils.getSamzaProducerForStream(multiRegionMultiClusterWrapper, dcIndex, storeName); + fabricToProducer.put(dcIndex, producer); + String keyPrefix = "dc-" + dcIndex + "_key_"; + for (int i = 0; i < recordsPerFabric; i++) { + GenericRecord value = new GenericData.Record(NAME_RECORD_V1_SCHEMA); + value.put("firstName", "stream_" + i); + value.put("lastName", "dc-" + dcIndex); + sendStreamingRecord(producer, storeName, keyPrefix + i, value); + } + } + + // Wait for cross-fabric replication: each fabric should serve records produced in BOTH + // fabrics. This guarantees leaders on each fabric processed at least one record from + // each region, which in turn guarantees both LOCAL and REMOTE record-delay points are + // emitted before the metric assertion runs. + for (VeniceMultiClusterWrapper dc: childDatacenters) { + VeniceClusterWrapper cluster = dc.getClusters().get(CLUSTER_NAME); + try (AvroGenericStoreClient client = ClientFactory.getAndStartGenericAvroClient( + ClientConfig.defaultGenericClientConfig(storeName).setVeniceURL(cluster.getRandomRouterURL()))) { + TestUtils.waitForNonDeterministicAssertion(60, TimeUnit.SECONDS, true, true, () -> { + for (int dcIndex = 0; dcIndex < childDatacenters.size(); dcIndex++) { + String key = "dc-" + dcIndex + "_key_" + (recordsPerFabric - 1); + try { + Assert.assertNotNull( + client.get(key).get(), + "Records from fabric dc-" + dcIndex + " not yet replicated to " + cluster.getRandomRouterURL() + + " (key=" + key + ")"); + } catch (Exception e) { + throw new VeniceException(e); + } + } + }); + } + } + } finally { + for (VeniceSystemProducer producer: fabricToProducer.values()) { + producer.stop(); + } + } + + String localityKey = VENICE_REGION_LOCALITY.getDimensionNameInDefaultFormat(); + String writeTypeKey = VENICE_STORE_WRITE_TYPE.getDimensionNameInDefaultFormat(); + String chunkingKey = VENICE_CHUNKING_STATUS.getDimensionNameInDefaultFormat(); + String storeNameKey = VENICE_STORE_NAME.getDimensionNameInDefaultFormat(); + String replicaTypeKey = VENICE_REPLICA_TYPE.getDimensionNameInDefaultFormat(); + + /* + * Collect every (replica_type, locality) combination observed for this store across servers + * in BOTH fabrics. With AA enabled and writes from both regions, each fabric's leaders pull + * EVERY fabric's RT — so leaders emit BOTH locality=LOCAL (records from local-fabric RT) + * AND locality=REMOTE (records from remote-fabric RT). + * + * The write_type and chunking_status dims are filtered to the values driven by the test + * parameters — points with the wrong combo are ignored, which would itself indicate a bug + * since this store should only emit one combo. + * + * NOTE: Followers always emit locality=LOCAL today because they read from local VT only. + * If/when followers start emitting per-region (e.g., follower-side cross-region tracking + * lands), update this test to also assert (FOLLOWER, REMOTE). + */ + String expectedWriteType = + (writeComputeEnabled ? VeniceStoreWriteType.WRITE_COMPUTE : VeniceStoreWriteType.REGULAR).getDimensionValue(); + String expectedChunking = + (chunkingEnabled ? VeniceChunkingStatus.CHUNKED : VeniceChunkingStatus.UNCHUNKED).getDimensionValue(); + Set observedLeaderLocalities = ConcurrentHashMap.newKeySet(); + Set observedFollowerLocalities = ConcurrentHashMap.newKeySet(); + List serversInBothFabrics = new ArrayList<>(); + for (VeniceMultiClusterWrapper dc: childDatacenters) { + serversInBothFabrics.addAll(dc.getClusters().get(CLUSTER_NAME).getVeniceServers()); + } + + TestUtils.waitForNonDeterministicAssertion(90, TimeUnit.SECONDS, true, true, () -> { + observedLeaderLocalities.clear(); + observedFollowerLocalities.clear(); + for (VeniceServerWrapper sw: serversInBothFabrics) { + InMemoryMetricReader reader = getOtelReader(sw); + if (reader == null) { + continue; + } + for (MetricData metric: reader.collectAllMetrics()) { + if (!metric.getName().contains("ingestion.replication.record.delay")) { + continue; + } + ExponentialHistogramData histData = metric.getExponentialHistogramData(); + if (histData == null) { + continue; + } + for (ExponentialHistogramPointData point: histData.getPoints()) { + if (point.getCount() == 0 || point.getSum() <= 0 || point.getMax() <= 0) { + continue; + } + String pointStoreName = point.getAttributes().get(AttributeKey.stringKey(storeNameKey)); + String replicaType = point.getAttributes().get(AttributeKey.stringKey(replicaTypeKey)); + String locality = point.getAttributes().get(AttributeKey.stringKey(localityKey)); + String writeType = point.getAttributes().get(AttributeKey.stringKey(writeTypeKey)); + String chunking = point.getAttributes().get(AttributeKey.stringKey(chunkingKey)); + + if (!storeName.equals(pointStoreName) || !expectedWriteType.equals(writeType) + || !expectedChunking.equals(chunking)) { + continue; + } + if (ReplicaType.LEADER.getDimensionValue().equals(replicaType)) { + observedLeaderLocalities.add(locality); + } else if (ReplicaType.FOLLOWER.getDimensionValue().equals(replicaType)) { + observedFollowerLocalities.add(locality); + } + } + } + } + LOGGER.info( + "[wc={}, chunked={}] Observed leader localities: {}, follower localities: {}", + writeComputeEnabled, + chunkingEnabled, + observedLeaderLocalities, + observedFollowerLocalities); + + // Leader replicas emit BOTH localities: LOCAL when consuming local-fabric RT records, + // REMOTE when consuming remote-fabric RT records (AA cross-region pull). + Assert.assertTrue( + observedLeaderLocalities.contains(VeniceRegionLocality.LOCAL.getDimensionValue()), + "No LEADER point with (writeType=" + expectedWriteType + ", chunking=" + expectedChunking + + ", locality=LOCAL). Observed leader localities: " + observedLeaderLocalities); + Assert.assertTrue( + observedLeaderLocalities.contains(VeniceRegionLocality.REMOTE.getDimensionValue()), + "No LEADER point with (writeType=" + expectedWriteType + ", chunking=" + expectedChunking + + ", locality=REMOTE) — expected from AA cross-region pull. Observed leader localities: " + + observedLeaderLocalities); + // Follower replicas only emit LOCAL today (they read from local VT). + Assert.assertTrue( + observedFollowerLocalities.contains(VeniceRegionLocality.LOCAL.getDimensionValue()), + "No FOLLOWER point with (writeType=" + expectedWriteType + ", chunking=" + expectedChunking + + ", locality=LOCAL). Observed follower localities: " + observedFollowerLocalities); + }); + } + + private static InMemoryMetricReader getOtelReader(VeniceServerWrapper server) { + MetricsRepository metricsRepo = server.getMetricsRepository(); + if (!(metricsRepo instanceof VeniceMetricsRepository)) { + return null; + } + Object reader = ((VeniceMetricsRepository) metricsRepo).getVeniceMetricsConfig().getOtelAdditionalMetricsReader(); + return reader instanceof InMemoryMetricReader ? (InMemoryMetricReader) reader : null; + } } diff --git a/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/integration/utils/VeniceServerWrapper.java b/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/integration/utils/VeniceServerWrapper.java index 0bac8bd18a9..597490ec77c 100644 --- a/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/integration/utils/VeniceServerWrapper.java +++ b/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/integration/utils/VeniceServerWrapper.java @@ -267,7 +267,8 @@ static StatefulServiceProvider generateService( .put(SERVER_RESUBSCRIPTION_CHECK_INTERVAL_IN_SECONDS, 1) .put(SERVER_DELETE_UNASSIGNED_PARTITIONS_ON_STARTUP, serverDeleteUnassignedPartitionsOnStartup) .put(OTEL_VENICE_METRICS_ENABLED, Boolean.TRUE.toString()) - .put(SERVER_RECORD_LEVEL_TIMESTAMP_ENABLED, Boolean.TRUE.toString()); + .put(SERVER_RECORD_LEVEL_TIMESTAMP_ENABLED, Boolean.TRUE.toString()) + .put(SERVER_PER_RECORD_OTEL_METRICS_ENABLED, Boolean.TRUE.toString()); if (sslToKafka) { serverPropsBuilder.put(PUBSUB_SECURITY_PROTOCOL_LEGACY, PubSubSecurityProtocol.SSL.name()); serverPropsBuilder.put(KafkaTestUtils.getLocalCommonKafkaSSLConfig(SslUtils.getTlsConfiguration())); diff --git a/internal/venice-test-common/test-shard-assignments.json b/internal/venice-test-common/test-shard-assignments.json index 0653083f47d..47d34b1ce68 100644 --- a/internal/venice-test-common/test-shard-assignments.json +++ b/internal/venice-test-common/test-shard-assignments.json @@ -95,8 +95,7 @@ "com.linkedin.venice.router.TestConnectionWarmingForApacheAsyncClient" ], "28": [ - "com.linkedin.venice.server.VeniceServerTest", - "com.linkedin.venice.controller.server.TestAdminSparkServerWithMultiServers" + "com.linkedin.venice.server.VeniceServerTest" ], "29": [ "com.linkedin.venice.controller.TestHAASController", @@ -434,7 +433,8 @@ "com.linkedin.venice.helix.TestHelixReadWriteDarkClusterConfigRepository", "com.linkedin.venice.helix.ZkRoutersClusterManagerConfigChangeTest", "com.linkedin.venice.controller.migration.TestMigrationPushStrategyZKAccessor", - "com.linkedin.venice.integration.utils.SystemExitPrevention" + "com.linkedin.venice.integration.utils.SystemExitPrevention", + "com.linkedin.venice.controller.server.TestAdminSparkServerWithMultiServers" ] } } diff --git a/services/venice-server/src/test/java/com/linkedin/venice/listener/StorageReadRequestHandlerTest.java b/services/venice-server/src/test/java/com/linkedin/venice/listener/StorageReadRequestHandlerTest.java index 8403b0ee3bf..a3c8cb0c0eb 100644 --- a/services/venice-server/src/test/java/com/linkedin/venice/listener/StorageReadRequestHandlerTest.java +++ b/services/venice-server/src/test/java/com/linkedin/venice/listener/StorageReadRequestHandlerTest.java @@ -570,7 +570,10 @@ public void testAdminRequestsPassInStorageExecutionHandler() throws Exception { new PubSubTopicPartitionImpl(topic, expectedPartitionId), new OffsetRecord(AvroProtocolDefinition.PARTITION_STATE.getSerializer(), pubSubContext), pubSubContext, - false); + false, + false, + false, + null); expectedAdminResponse.addPartitionConsumptionState(state); doReturn(expectedAdminResponse).when(ingestionMetadataRetriever) .getConsumptionSnapshots(eq(topic.getName()), any());