subservers: fail LiT startup when integrated sub-server boot fails #1183
base: master
Changes from all commits: 7f5673e, 35954a9, 75f58f2, 334bf77, 381d0b1
```diff
@@ -4,13 +4,15 @@ import (
 	"context"
 	"fmt"
 	"io/ioutil"
+	"sort"
 	"sync"
 	"time"

 	restProxy "github.com/grpc-ecosystem/grpc-gateway/v2/runtime"
 	"github.com/lightninglabs/lightning-terminal/perms"
 	"github.com/lightninglabs/lightning-terminal/status"
 	"github.com/lightninglabs/lndclient"
+	"github.com/lightningnetwork/lnd/fn"
 	"github.com/lightningnetwork/lnd/lncfg"
 	"github.com/lightningnetwork/lnd/lnrpc"
 	grpcProxy "github.com/mwitkow/grpc-proxy/proxy"
@@ -29,6 +31,11 @@ var (
 	// defaultConnectTimeout is the default timeout for connecting to the
 	// backend.
 	defaultConnectTimeout = 15 * time.Second
+
+	// criticalIntegratedSubServers lists integrated sub-servers that must
+	// succeed during startup. Failures from these sub-servers are surfaced
+	// to LiT and abort the startup sequence.
+	criticalIntegratedSubServers = fn.NewSet[string](TAP)
 )

 // Manager manages a set of subServer objects.
@@ -104,14 +111,37 @@ func (s *Manager) GetServer(name string) (SubServer, bool) {
 }

 // StartIntegratedServers starts all the manager's sub-servers that should be
-// started in integrated mode.
+// started in integrated mode. An error is returned if any critical integrated
+// sub-server fails to start.
 func (s *Manager) StartIntegratedServers(lndClient lnrpc.LightningClient,
-	lndGrpc *lndclient.GrpcLndServices, withMacaroonService bool) {
+	lndGrpc *lndclient.GrpcLndServices, withMacaroonService bool) error {

 	s.mu.Lock()
 	defer s.mu.Unlock()

+	// Sort for deterministic startup: critical integrated sub-servers
+	// first, then alphabetical to keep the order stable across runs.
+	servers := make([]*subServerWrapper, 0, len(s.servers))
+	for _, ss := range s.servers {
+		servers = append(servers, ss)
+	}
+
+	sort.Slice(servers, func(i, j int) bool {
+		iCritical := criticalIntegratedSubServers.Contains(
+			servers[i].Name(),
+		)
+		jCritical := criticalIntegratedSubServers.Contains(
+			servers[j].Name(),
+		)
+
+		if iCritical != jCritical {
+			return iCritical
+		}
+
+		return servers[i].Name() < servers[j].Name()
+	})
+
-	for _, ss := range s.servers {
+	for _, ss := range servers {
 		if ss.Remote() {
 			continue
 		}
@@ -126,11 +156,18 @@ func (s *Manager) StartIntegratedServers(lndClient lnrpc.LightningClient,
 		)
 		if err != nil {
 			s.statusServer.SetErrored(ss.Name(), err.Error())
+
+			if criticalIntegratedSubServers.Contains(ss.Name()) {
```
Contributor: Thanks 🙏! Note that the commit message is outdated and does not cover the "criticalIntegratedSubServers" part.
```diff
+				return fmt.Errorf("%s: %v", ss.Name(), err)
+			}
+
```
Member, commenting on lines +160 to +163: But I think this will then result in a complete shutdown, which I don't think we want? We want the status server to remain running.
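One possible way to address that concern, sketched here with stand-in names (`statusManager`, `startIntegratedServers`, and the `"lit"` key are illustrative only, not LiT's actual startup code): the caller could record the critical failure on the status server and keep its listeners serving instead of exiting immediately. The hunk continues below the sketch.

```go
package main

import (
	"errors"
	"fmt"
)

// statusManager is a minimal stand-in for LiT's status manager; only the
// behaviour needed for this sketch is modelled.
type statusManager struct {
	errs map[string]string
}

// SetErrored records a startup error for the named component.
func (s *statusManager) SetErrored(name, errStr string) {
	s.errs[name] = errStr
}

// startIntegratedServers stands in for Manager.StartIntegratedServers after
// this PR: a critical sub-server boot failure is returned as an error.
func startIntegratedServers() error {
	return errors.New("critical sub-server failed to start")
}

func main() {
	status := &statusManager{errs: make(map[string]string)}

	// On a critical failure, record it and keep the status (and UI)
	// listeners serving instead of tearing the whole process down, so the
	// failure stays visible to operators.
	if err := startIntegratedServers(); err != nil {
		status.SetErrored("lit", err.Error())
	}

	fmt.Println(status.errs)
}
```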
```diff
 			continue
 		}

 		s.statusServer.SetRunning(ss.Name())
 	}
+
+	return nil
 }

 // ConnectRemoteSubServers creates connections to all the manager's sub-servers
```
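For illustration, here is a small self-contained sketch of the startup order the new comparator yields. The sub-server names and the contents of the critical set below are made up for the example; only the comparator logic mirrors the hunk above.

```go
package main

import (
	"fmt"
	"sort"
)

func main() {
	// Hypothetical sub-server names; "tap" stands in for the single entry
	// of criticalIntegratedSubServers in the PR.
	critical := map[string]bool{"tap": true}
	servers := []string{"pool", "tap", "loop"}

	// Same comparator shape as the PR: critical sub-servers first, then
	// alphabetical order for the rest.
	sort.Slice(servers, func(i, j int) bool {
		iCritical := critical[servers[i]]
		jCritical := critical[servers[j]]

		if iCritical != jCritical {
			return iCritical
		}

		return servers[i] < servers[j]
	})

	// Prints: [tap loop pool]
	fmt.Println(servers)
}
```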
New test file (189 lines added):

```go
package subservers

import (
	"context"
	"errors"
	"testing"

	restProxy "github.com/grpc-ecosystem/grpc-gateway/v2/runtime"
	"github.com/lightninglabs/lightning-terminal/litrpc"
	"github.com/lightninglabs/lightning-terminal/perms"
	"github.com/lightninglabs/lightning-terminal/status"
	"github.com/lightninglabs/lndclient"
	tafn "github.com/lightninglabs/taproot-assets/fn"
	"github.com/lightningnetwork/lnd/lnrpc"
	"github.com/stretchr/testify/require"
	"google.golang.org/grpc"
	"gopkg.in/macaroon-bakery.v2/bakery"
)

// mockSubServer is a lightweight SubServer test double.
type mockSubServer struct {
	// name is returned by Name().
	name string

	// remote toggles Remote() return value.
	remote bool

	// startErr is returned from Start() when set.
	startErr error

	// started tracks whether Start() succeeded.
	started bool
}

// Name returns the mock sub-server name.
func (t *mockSubServer) Name() string {
	return t.name
}

// Remote indicates whether the sub-server runs remotely.
func (t *mockSubServer) Remote() bool {
	return t.remote
}

// RemoteConfig returns nil for the mock.
func (t *mockSubServer) RemoteConfig() *RemoteDaemonConfig {
	return nil
}

// Start marks the server started unless startErr is set.
func (t *mockSubServer) Start(_ lnrpc.LightningClient,
	_ *lndclient.GrpcLndServices, _ bool) error {

	if t.startErr != nil {
		return t.startErr
	}

	t.started = true

	return nil
}

// Stop marks the server as stopped.
func (t *mockSubServer) Stop() error {
	t.started = false
	return nil
}

// RegisterGrpcService is a no-op for the mock.
func (t *mockSubServer) RegisterGrpcService(_ grpc.ServiceRegistrar) {}

// RegisterRestService is a no-op for the mock.
func (t *mockSubServer) RegisterRestService(_ context.Context,
	_ *restProxy.ServeMux, _ string, _ []grpc.DialOption) error {

	return nil
}

// ServerErrChan returns nil for the mock.
func (t *mockSubServer) ServerErrChan() chan error {
	return nil
}

// MacPath returns an empty string for the mock.
func (t *mockSubServer) MacPath() string {
	return ""
}

// Permissions returns nil for the mock.
func (t *mockSubServer) Permissions() map[string][]bakery.Op {
	return nil
}

// WhiteListedURLs returns nil for the mock.
func (t *mockSubServer) WhiteListedURLs() map[string]struct{} {
	return nil
}

// Impl returns an empty option for the mock.
func (t *mockSubServer) Impl() tafn.Option[any] {
	return tafn.None[any]()
}

// ValidateMacaroon always succeeds for the mock.
func (t *mockSubServer) ValidateMacaroon(context.Context,
	[]bakery.Op, string) error {

	return nil
}

// newTestManager creates a Manager and status Manager with permissive perms.
func newTestManager(t *testing.T) (*Manager, *status.Manager) {
	t.Helper()

	permsMgr, err := perms.NewManager(true)
	require.NoError(t, err)

	statusMgr := status.NewStatusManager()

	return NewManager(permsMgr, statusMgr), statusMgr
}

// TestStartIntegratedServersCriticalFailureStopsStartup ensures critical
// startup errors abort integrated startup.
func TestStartIntegratedServersCriticalFailureStopsStartup(t *testing.T) {
	manager, statusMgr := newTestManager(t)

	nonCritical := &mockSubServer{name: "loop"}
	critical := &mockSubServer{
		name:     TAP,
		startErr: errors.New("boom"),
	}

	require.NoError(t, manager.AddServer(nonCritical, true))
	require.NoError(t, manager.AddServer(critical, true))

	err := manager.StartIntegratedServers(nil, nil, true)
	require.Error(t, err)
	require.Contains(t, err.Error(), TAP)

	resp, err := statusMgr.SubServerStatus(
		context.Background(), &litrpc.SubServerStatusReq{},
	)
	require.NoError(t, err)

	statuses := resp.SubServers
	require.Contains(t, statuses, TAP)
	require.Equal(t, "boom", statuses[TAP].Error)
	require.False(t, statuses[TAP].Running)

	require.False(
		t, nonCritical.started, "non-critical sub-server should not "+
			"start after critical failure",
	)
}

// TestStartIntegratedServersNonCriticalFailureContinues verifies non-critical
// startup failures are tolerated.
func TestStartIntegratedServersNonCriticalFailureContinues(t *testing.T) {
	manager, statusMgr := newTestManager(t)

	failing := &mockSubServer{
		name:     "loop",
		startErr: errors.New("start failed"),
	}
	succeeding := &mockSubServer{name: "pool"}

	require.NoError(t, manager.AddServer(failing, true))
	require.NoError(t, manager.AddServer(succeeding, true))

	err := manager.StartIntegratedServers(nil, nil, true)
	require.NoError(t, err)

	resp, err := statusMgr.SubServerStatus(
		context.Background(), &litrpc.SubServerStatusReq{},
	)
	require.NoError(t, err)

	statuses := resp.SubServers

	require.Contains(t, statuses, failing.name)
	require.Equal(t, "start failed", statuses[failing.name].Error)
	require.False(t, statuses[failing.name].Running)

	require.True(t, succeeding.started)
	require.Contains(t, statuses, succeeding.name)
	require.True(t, statuses[succeeding.name].Running)
	require.Empty(t, statuses[succeeding.name].Error)
}
```
Thanks for this 🙏! Similar to the unit tests, we should also have an itest which covers a startup when a non-critical sub-server errors during startup.