Skip to content

Commit bf3525b

Browse files
bugfix: S3C-3955 ignore errors from readRecords during LogReader setup
The getRaftLog route returns an error 500 when a queried raft session is not in its cache. When the queue populator had multiple raft sessions to manage, this caused the whole setup to fail if any one of them failed. Previously, before S3C-3835 was fixed, the error happened during regular cron job batches, where it only impacted the missing raft session: an error was logged without preventing progress on the other raft sessions. The fix consists of not raising a global error on an error 500 from one raft session, and instead falling back to starting from log offset 1. This looks acceptable as a quick fix. It would be better to have metadata return a proper success status whenever raft sessions are missing from its cache, but the scope of that change is larger, so this improvement over the current fix is postponed for now. Another option would be to internally retry fetching the offset while allowing the rest of the log readers to fetch their respective raft session logs, but this would also be a significant change and does not look like the best approach to me either.
1 parent 0d50c85 commit bf3525b

File tree

2 files changed

+47
-11
lines changed

2 files changed

+47
-11
lines changed

lib/queuePopulator/LogReader.js

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -148,11 +148,23 @@ class LogReader {
148148
});
149149
this.logConsumer.readRecords({ limit: 1 }, (err, res) => {
150150
if (err) {
151-
this.log.error('error while reading log', {
152-
method: 'LogReader._initializeLogOffset',
153-
error: err,
154-
});
155-
return done(err);
151+
// FIXME: getRaftLog metadata route returns 500 when
152+
// its cache does not contain the queried raft
153+
// session. For the sake of simplicity, in order to
154+
// allow the populator to make progress, we choose to
155+
// accept errors fetching the current offset during
156+
// setup phase and fall back to starting from offset
157+
// 1. It would be better to have metadata return
158+
// special success statuses in such case.
159+
this.log.warn(
160+
'error reading initial log offset, ' +
161+
'default to initial offset 1', {
162+
method: 'LogReader._initializeLogOffset',
163+
zkPath: pathToLogOffset,
164+
logOffset: 1,
165+
error: err,
166+
});
167+
return done(null, 1);
156168
}
157169
const logOffset = res.info.cseq + 1;
158170
this.log.info('starting after latest log sequence', {

tests/unit/lib/queuePopulator/LogReader.spec.js

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,29 @@ const assert = require('assert');
22

33
const ZookeeperMock = require('zookeeper-mock');
44

5-
const { versioning } = require('arsenal');
5+
const { errors, versioning } = require('arsenal');
66
const { DbPrefixes } = versioning.VersioningConstants;
77

88
const { Logger } = require('werelogs');
99

1010
const LogReader = require('../../../../lib/queuePopulator/LogReader');
1111

1212
class MockLogConsumer {
13+
constructor(params) {
14+
this.params = params || {};
15+
}
16+
1317
readRecords(params, cb) {
1418
process.nextTick(() => {
15-
cb(null, {
16-
info: {
17-
cseq: 12345,
18-
},
19-
});
19+
if (this.params.readRecordsError) {
20+
cb(this.params.readRecordsError);
21+
} else {
22+
cb(null, {
23+
info: {
24+
cseq: 12345,
25+
},
26+
});
27+
}
2028
});
2129
}
2230
}
@@ -78,4 +86,20 @@ describe('LogReader', () => {
7886
done();
7987
});
8088
});
89+
90+
it('should start from offset 1 on log consumer readRecords error', done => {
91+
const errorLogReader = new LogReader({
92+
logId: 'test-log-reader',
93+
zkClient: zkMock.createClient('localhost:2181'),
94+
logConsumer: new MockLogConsumer({
95+
readRecordsError: errors.InternalError,
96+
}),
97+
logger: new Logger('test:ErrorLogReader'),
98+
});
99+
errorLogReader.setup(err => {
100+
assert.ifError(err);
101+
assert.strictEqual(errorLogReader.logOffset, 1);
102+
done();
103+
});
104+
});
81105
});

0 commit comments

Comments
 (0)