From 0102e5542df35ca6d94d6d9b54a47dff531caf90 Mon Sep 17 00:00:00 2001 From: dazzling-no-more <278675588+dazzling-no-more@users.noreply.github.com> Date: Sat, 25 Apr 2026 16:49:04 +0400 Subject: [PATCH] fix(android): tighten VPN session lifecycle reliability --- .../java/com/therealaleph/mhrv/MhrvApp.kt | 17 ++-- .../com/therealaleph/mhrv/MhrvVpnService.kt | 84 ++++++++++++++----- .../com/therealaleph/mhrv/ui/HomeScreen.kt | 46 ++++++---- 3 files changed, 108 insertions(+), 39 deletions(-) diff --git a/android/app/src/main/java/com/therealaleph/mhrv/MhrvApp.kt b/android/app/src/main/java/com/therealaleph/mhrv/MhrvApp.kt index ffa7154..1e721ec 100644 --- a/android/app/src/main/java/com/therealaleph/mhrv/MhrvApp.kt +++ b/android/app/src/main/java/com/therealaleph/mhrv/MhrvApp.kt @@ -42,11 +42,18 @@ class MhrvApp : Application() { ) val previous = Thread.getDefaultUncaughtExceptionHandler() Thread.setDefaultUncaughtExceptionHandler { thread, throwable -> - Log.e( - CRASH_TAG, - "uncaught on thread=${thread.name} (id=${thread.id}): ${throwable.message}", - throwable, - ) + // Log.e itself can throw on extreme conditions (logd dead, + // OOM allocating the formatted message). If we let that + // bubble up, we'd recurse into our own handler with a + // half-handled original exception; swallow it so the + // previous handler still fires with the real failure. + try { + Log.e( + CRASH_TAG, + "uncaught on thread=${thread.name} (id=${thread.id}): ${throwable.message}", + throwable, + ) + } catch (_: Throwable) { } // Let the default handler still terminate the process and // show the system "app closed" dialog — we just wanted to // get a log line out the door first. diff --git a/android/app/src/main/java/com/therealaleph/mhrv/MhrvVpnService.kt b/android/app/src/main/java/com/therealaleph/mhrv/MhrvVpnService.kt index c630a8d..121b9af 100644 --- a/android/app/src/main/java/com/therealaleph/mhrv/MhrvVpnService.kt +++ b/android/app/src/main/java/com/therealaleph/mhrv/MhrvVpnService.kt @@ -242,13 +242,14 @@ class MhrvVpnService : VpnService() { tun = parcelFd // 3) Start tun2proxy on a worker thread. It blocks until stop() or - // shutdown. We detach the fd so ownership transfers cleanly; the - // ParcelFileDescriptor (`tun`) still holds a reference, so closing - // it at teardown reliably tears down the TUN even if tun2proxy - // doesn't cleanly exit. + // shutdown. We detach the fd so ownership transfers cleanly to + // tun2proxy (closeFdOnDrop = true closes it on return from run()). + // The ParcelFileDescriptor (`tun`) we keep is post-detach — its + // own close() is a no-op for the underlying fd, so the worker is + // the sole owner once it's running. val detachedFd = parcelFd.detachFd() tun2proxyRunning.set(true) - tun2proxyThread = Thread({ + val worker = Thread({ try { val rc = Tun2proxy.run( "socks5://127.0.0.1:$socks5Port", @@ -264,7 +265,29 @@ class MhrvVpnService : VpnService() { } finally { tun2proxyRunning.set(false) } - }, "tun2proxy").apply { start() } + }, "tun2proxy") + try { + worker.start() + tun2proxyThread = worker + } catch (t: Throwable) { + // Thread.start can throw OutOfMemoryError under extreme memory + // pressure. The fd we just detached has no owner — without an + // explicit close it leaks for the life of the process. Adopt + // it into a fresh ParcelFileDescriptor purely so we can call + // close() on it. + Log.e(TAG, "tun2proxy thread start failed: ${t.message}", t) + tun2proxyRunning.set(false) + try { + ParcelFileDescriptor.adoptFd(detachedFd).close() + } catch (closeErr: Throwable) { + Log.w(TAG, "adoptFd($detachedFd).close failed: ${closeErr.message}") + } + Native.stopProxy(proxyHandle) + proxyHandle = 0L + try { stopForeground(STOP_FOREGROUND_REMOVE) } catch (_: Throwable) {} + stopSelf() + return + } // (startForeground was already called at the top of this method // to satisfy Android 8+'s foreground-service contract — see the @@ -291,12 +314,23 @@ class MhrvVpnService : VpnService() { * tun2proxy still forwarding packets into a half-dead Rust runtime * while the runtime is force-aborting its tasks — that's the scenario * that manifested as "Stop crashes the app" when there were in-flight - * relay requests piled up against a dead Apps Script deployment. The - * correct order is: - * 1. Signal tun2proxy to stop (cooperative). - * 2. Close the TUN fd — forces tun2proxy's read() to return EBADF. - * 3. Join the tun2proxy thread (now it really will exit). - * 4. Shut down the Rust proxy runtime (nothing left to forward to). + * relay requests piled up against a dead Apps Script deployment. + * + * Steps, with the bound on each one called out so a hung native call + * cannot stall the whole teardown thread: + * 1. Signal tun2proxy to stop (cooperative). Bounded by a 2s + * side-thread join — if the JNI call hangs we proceed anyway. + * 2. Drop our `ParcelFileDescriptor` reference. Because we already + * called detachFd() at startup, this is a no-op for the + * underlying fd — the worker (closeFdOnDrop=true) owns it. + * We keep the call only so the PROXY_ONLY / failed-establish + * paths still null out the field cleanly. + * 3. Join the tun2proxy thread, bounded at 4s. If the worker is + * stuck we log and move on — the runtime shutdown below will + * knock the rest of the world over. + * 4. Shut down the Rust proxy runtime, bounded by `rt.shutdown_timeout` + * on the Rust side (5s). This is the hard backstop: the listener + * socket is released here regardless of what the worker is doing. */ private fun teardown() { // Idempotency guard. Without this, onDestroy racing the @@ -315,17 +349,29 @@ class MhrvVpnService : VpnService() { "(tun2proxy running=${tun2proxyRunning.get()}, proxyHandle=$proxyHandle)", ) - // 1. Cooperative stop signal. + // 1. Cooperative stop signal — bounded so a hung Rust call cannot + // stall the entire teardown thread. We've never observed + // Tun2proxy.stop() block in practice, but the contract isn't + // documented as bounded and the rest of teardown already takes + // care to be timeout-bounded; this closes the gap. if (tun2proxyRunning.get()) { - try { Tun2proxy.stop() } catch (t: Throwable) { - Log.w(TAG, "Tun2proxy.stop: ${t.message}") + val stopper = Thread({ + try { Tun2proxy.stop() } catch (t: Throwable) { + Log.w(TAG, "Tun2proxy.stop: ${t.message}") + } + }, "mhrv-tun2proxy-stop").apply { start() } + try { stopper.join(2_000) } catch (_: InterruptedException) {} + if (stopper.isAlive) { + Log.w(TAG, "Tun2proxy.stop did not return within 2s — proceeding") } } - // 2. Close the TUN fd. Since we called detachFd earlier the - // ParcelFileDescriptor no longer owns the fd and close() here - // is a no-op; the real fd is owned by tun2proxy (closeFdOnDrop - // = true), which closes it on return from run(). + // 2. Drop our PFD reference. detachFd at startup means this + // close() is a no-op for the underlying fd — tun2proxy owns + // it (closeFdOnDrop = true) and closes it on return from + // run(). The call is kept only to null the field cleanly on + // paths that never reached detachFd (PROXY_ONLY, or an + // establish() that failed mid-builder). try { tun?.close() } catch (t: Throwable) { Log.w(TAG, "tun.close: ${t.message}") } diff --git a/android/app/src/main/java/com/therealaleph/mhrv/ui/HomeScreen.kt b/android/app/src/main/java/com/therealaleph/mhrv/ui/HomeScreen.kt index dd69ffd..6b8ee13 100644 --- a/android/app/src/main/java/com/therealaleph/mhrv/ui/HomeScreen.kt +++ b/android/app/src/main/java/com/therealaleph/mhrv/ui/HomeScreen.kt @@ -46,8 +46,10 @@ import com.therealaleph.mhrv.ui.theme.ErrRed import com.therealaleph.mhrv.ui.theme.OkGreen import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.delay +import kotlinx.coroutines.flow.first import kotlinx.coroutines.launch import kotlinx.coroutines.withContext +import kotlinx.coroutines.withTimeoutOrNull import org.json.JSONObject /** @@ -122,18 +124,31 @@ fun HomeScreen( } } - // Cooldown on Start/Stop. Rapid taps during a VPN transition trigger - // an emulator-specific EGL renderer crash - // (F OpenGLRenderer: EGL_NOT_INITIALIZED during rendering) — the - // service survives, but the Compose UI process dies and the app - // appears to close. On real hardware this is rare, but debouncing - // is useful UX anyway: neither start nor stop is truly instant, - // and the user gets no feedback if they tap while one is in flight. - var transitionCooldown by remember { mutableStateOf(false) } - LaunchedEffect(transitionCooldown) { - if (transitionCooldown) { - delay(2000) - transitionCooldown = false + // Gate Start/Stop on the service's actual state transition rather + // than a fixed timer. The previous 2s cooldown was shorter than the + // worst-case teardown (Tun2proxy.stop + 4s join + 5s rt.shutdown_timeout + // ≈ 9s on the slowest path), which let the user fire a fresh Connect + // while the previous Stop's native cleanup was still releasing the + // listener port — the new startProxy then failed with "Address already + // in use". + // + // `awaitingRunning` holds the value we expect VpnState.isRunning to + // settle on after the user's action; null means "no transition in + // flight". The LaunchedEffect below suspends on the StateFlow until + // the predicate matches, with a 12s backstop in case the service + // failed before flipping the flag (e.g., establish() returned null). + // Side benefit: this also debounces the rapid-tap EGL renderer crash + // the old timer was guarding against. + var awaitingRunning by remember { mutableStateOf(null) } + val transitioning = awaitingRunning != null + LaunchedEffect(awaitingRunning) { + val target = awaitingRunning ?: return@LaunchedEffect + try { + withTimeoutOrNull(12_000) { + VpnState.isRunning.first { it == target } + } + } finally { + awaitingRunning = null } } @@ -373,10 +388,11 @@ fun HomeScreen( val isVpnRunning by VpnState.isRunning.collectAsState() Button( onClick = { - transitionCooldown = true if (isVpnRunning) { + awaitingRunning = false onStop() } else { + awaitingRunning = true // Connect flow: auto-resolve google_ip so we don't // hand the proxy a stale anycast target; repair // front_domain if it got corrupted into an IP @@ -418,7 +434,7 @@ fun HomeScreen( }, enabled = (isVpnRunning || cfg.mode == Mode.GOOGLE_ONLY || - (cfg.hasDeploymentId && cfg.authKey.isNotBlank())) && !transitionCooldown, + (cfg.hasDeploymentId && cfg.authKey.isNotBlank())) && !transitioning, colors = ButtonDefaults.buttonColors( containerColor = if (isVpnRunning) ErrRed else OkGreen, contentColor = androidx.compose.ui.graphics.Color.White, @@ -430,7 +446,7 @@ fun HomeScreen( ) { Text( when { - transitionCooldown -> "…" + transitioning -> "…" isVpnRunning -> stringResource(R.string.btn_disconnect) else -> stringResource(R.string.btn_connect) },