mirror of
https://github.com/therealaleph/MasterHttpRelayVPN-RUST.git
synced 2026-05-18 23:54:48 +03:00
v0.9.3: accept-loop backoff on EMFILE + louder rlimit diagnostics (issue #18)
@Behzad9 on #18: the OpenWRT 'No file descriptors available' errors are back in v0.8.0+, this time logged as a wall of thousands of identical ERRORs within seconds of activating the proxy. Two real bugs, now fixed: === 1. accept() loop had no backoff === Previous code: loop { match listener.accept().await { Ok(x) => ..., Err(e) => { tracing::error!(...); continue; } // tight loop } } On EMFILE (RLIMIT_NOFILE exhausted), accept() returns synchronously, the match re-runs instantly, accept() EMFILEs again, forever. The tight loop ALSO starves the tokio runtime of CPU that existing connections need to finish and close their fds — so the problem never clears on its own. It's a self-sustaining meltdown. New accept_backoff() helper (in proxy_server.rs) wraps both the HTTP and SOCKS5 accept loops: - Detects EMFILE/ENFILE via raw_os_error (24 or 23). - Sleeps proportional to how long the pressure has lasted (50 ms first hit, ramping to a 2 s cap around hit #40). Gives existing connections a chance to finish and free fds. - Rate-limits the log line: one WARN on the first EMFILE with fix instructions, then one every 100 retries. No more walls of identical errors. - Resets the counter on the next successful accept. - Non-EMFILE errors (ECONNABORTED from clients that went away during handshake, etc.) get a plain single-line error + 5 ms sleep so we still don't tight-loop on any unexpected error. End-to-end verified: ran mhrv-rs under a deliberately lowered fd limit (`ulimit -n`), flooded the SOCKS5 port with 247 concurrent connections to trip EMFILE. Before: log would have been 1000s of identical lines. After: exactly 1 warning, listener stayed quiet, fds drained, accept resumed. === 2. RLIMIT_NOFILE bump was too conservative + silent === Previous behavior: target 16384 soft, cap to existing hard limit, no log. On constrained systems where hard is already tiny, we'd stay at the tiny limit silently. rlimit.rs now: - Targets 65536 soft.
- ALSO tries to raise the hard limit up to /proc/sys/fs/nr_open on Linux (Linux allows a process with CAP_SYS_RESOURCE — e.g. one running as root — to bump its own hard limit up to the kernel ceiling, usually 1048576 on modern kernels; without that capability the attempt fails and we fall back to soft-only). On macOS/BSD we skip this — only bump soft. - Logs WARN on startup if soft ends up <4096 with the exact fix ('ulimit -n 65536' or use the procd init). No more silent failure. - Logs INFO with the before/after limits otherwise, so field bug reports tell us immediately whether the kernel cap is the real bottleneck. Moved the rlimit call from main() pre-logging to post-init_logging so its tracing output actually lands in the log panel + stderr. Small reorganization only. 49 tests pass, musl x86_64 cross-compile verified locally.
This commit is contained in:
Generated
+1
-1
@@ -1317,7 +1317,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "mhrv-rs"
|
||||
version = "0.9.2"
|
||||
version = "0.9.3"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "mhrv-rs"
|
||||
version = "0.9.2"
|
||||
version = "0.9.3"
|
||||
edition = "2021"
|
||||
description = "Rust port of MasterHttpRelayVPN -- DPI bypass via Google Apps Script relay with domain fronting"
|
||||
license = "MIT"
|
||||
|
||||
+7
-4
@@ -121,10 +121,6 @@ async fn main() -> ExitCode {
|
||||
// Install default rustls crypto provider (ring).
|
||||
let _ = rustls::crypto::ring::default_provider().install_default();
|
||||
|
||||
// Bump RLIMIT_NOFILE where possible — OpenWRT/Alpine hosts often ship a
|
||||
// default so low the proxy runs out of fds under normal browser load.
|
||||
mhrv_rs::rlimit::raise_nofile_limit_best_effort();
|
||||
|
||||
let args = match parse_args() {
|
||||
Ok(a) => a,
|
||||
Err(e) => {
|
||||
@@ -170,6 +166,13 @@ async fn main() -> ExitCode {
|
||||
|
||||
init_logging(&config.log_level);
|
||||
|
||||
// Bump RLIMIT_NOFILE now that tracing is live — OpenWRT/Alpine hosts
|
||||
// often ship a default so low (issue #8, issue #18) that we run out
|
||||
// of fds under normal proxy load. This logs the before/after values
|
||||
// at info level so field reports tell us whether the kernel cap is
|
||||
// the real culprit.
|
||||
mhrv_rs::rlimit::raise_nofile_limit_best_effort();
|
||||
|
||||
match args.command {
|
||||
Command::Test => {
|
||||
let ok = test_cmd::run(&config).await;
|
||||
|
||||
+66
-4
@@ -180,11 +180,15 @@ impl ProxyServer {
|
||||
let http_mitm = self.mitm.clone();
|
||||
let http_ctx = self.rewrite_ctx.clone();
|
||||
let mut http_task = tokio::spawn(async move {
|
||||
let mut fd_exhaust_count: u64 = 0;
|
||||
loop {
|
||||
let (sock, peer) = match http_listener.accept().await {
|
||||
Ok(x) => x,
|
||||
Ok(x) => {
|
||||
fd_exhaust_count = 0;
|
||||
x
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("accept (http): {}", e);
|
||||
accept_backoff("http", &e, &mut fd_exhaust_count).await;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
@@ -204,11 +208,15 @@ impl ProxyServer {
|
||||
let socks_mitm = self.mitm.clone();
|
||||
let socks_ctx = self.rewrite_ctx.clone();
|
||||
let mut socks_task = tokio::spawn(async move {
|
||||
let mut fd_exhaust_count: u64 = 0;
|
||||
loop {
|
||||
let (sock, peer) = match socks_listener.accept().await {
|
||||
Ok(x) => x,
|
||||
Ok(x) => {
|
||||
fd_exhaust_count = 0;
|
||||
x
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("accept (socks): {}", e);
|
||||
accept_backoff("socks", &e, &mut fd_exhaust_count).await;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
@@ -240,6 +248,60 @@ impl ProxyServer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Back-off helper for the accept() loop.
|
||||
///
|
||||
/// Motivated by issue #18: when the process hits its file-descriptor limit
|
||||
/// (EMFILE — `No file descriptors available`), `accept()` returns that
|
||||
/// error synchronously and is immediately ready to fire again. The old
|
||||
/// loop just `continue`'d, producing a wall of identical ERROR lines
|
||||
/// thousands per second and starving the tokio runtime of CPU that
|
||||
/// existing connections would have used to drain and close.
|
||||
///
|
||||
/// Two things this does right:
|
||||
/// 1. Sleeps when `EMFILE` / `ENFILE` are seen, proportional to how long
|
||||
/// the problem has been going on (exponential-ish, capped at 2s).
|
||||
/// Gives existing connections a chance to finish and free fds.
|
||||
/// 2. Rate-limits the log line: first occurrence logs a full warning
|
||||
/// with fix instructions, subsequent ones log once per 100 errors
|
||||
/// so the log doesn't fill up.
|
||||
async fn accept_backoff(kind: &str, err: &std::io::Error, count: &mut u64) {
|
||||
let is_fd_limit = matches!(
|
||||
err.raw_os_error(),
|
||||
Some(libc_emfile) if libc_emfile == 24 || libc_emfile == 23
|
||||
);
|
||||
|
||||
*count = count.saturating_add(1);
|
||||
|
||||
if is_fd_limit {
|
||||
if *count == 1 {
|
||||
tracing::warn!(
|
||||
"accept ({}) hit RLIMIT_NOFILE: {}. Backing off. Raise the fd limit: \
|
||||
`ulimit -n 65536` before starting, or (OpenWRT) use the shipped procd \
|
||||
init which sets nofile=16384. The listener will keep retrying.",
|
||||
kind,
|
||||
err
|
||||
);
|
||||
} else if *count % 100 == 0 {
|
||||
tracing::warn!(
|
||||
"accept ({}) still fd-limited after {} retries. Current connections \
|
||||
need to finish before we can accept new ones.",
|
||||
kind,
|
||||
*count
|
||||
);
|
||||
}
|
||||
// Back off exponentially-ish up to 2s. First hit: 50ms, 10th hit:
|
||||
// ~500ms, 50th+: 2s cap.
|
||||
let backoff_ms = (50u64 * (*count).min(40)).min(2000);
|
||||
tokio::time::sleep(std::time::Duration::from_millis(backoff_ms)).await;
|
||||
} else {
|
||||
// Transient non-EMFILE error (e.g. ECONNABORTED from a client that
|
||||
// went away during the handshake). One-line log, short sleep to
|
||||
// avoid a tight loop in case it repeats.
|
||||
tracing::error!("accept ({}): {}", kind, err);
|
||||
tokio::time::sleep(std::time::Duration::from_millis(5)).await;
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_http_client(
|
||||
mut sock: TcpStream,
|
||||
fronter: Arc<DomainFronter>,
|
||||
|
||||
+100
-36
@@ -1,23 +1,27 @@
|
||||
//! Best-effort file descriptor limit bump on Unix.
|
||||
//!
|
||||
//! Context (issue #8): on OpenWRT routers — and some minimal Alpine / BSD
|
||||
//! installs — the default `RLIMIT_NOFILE` is so low (often 1024 or even
|
||||
//! 512) that a browser's burst of ~30 parallel subresource requests fills
|
||||
//! the limit within seconds. Once the limit is hit `accept(2)` returns
|
||||
//! `EMFILE` and the user sees:
|
||||
//! Context (issues #8 + #18): on OpenWRT routers — and some minimal
|
||||
//! Alpine / BSD installs — the default `RLIMIT_NOFILE` is so low
|
||||
//! (often 1024 or even 256 / 128 on constrained devices) that a
|
||||
//! browser's burst of ~30 parallel subresource requests, or a DNS-over-
|
||||
//! SOCKS5 flood from a client like v2ray, fills the limit within seconds.
|
||||
//! Once the limit is hit `accept(2)` returns `EMFILE` and the user sees:
|
||||
//!
|
||||
//! ERROR accept (socks): No file descriptors available (os error 24)
|
||||
//!
|
||||
//! This helper raises the soft limit up to the hard limit (without
|
||||
//! requiring root), so the user gets whatever headroom the kernel
|
||||
//! already allows them. Failures are logged and swallowed.
|
||||
//! Approach:
|
||||
//! - Try to raise the SOFT limit to a generous target.
|
||||
//! - If the HARD limit is also low, try to raise THAT too — Linux lets
|
||||
//! a process with CAP_SYS_RESOURCE (e.g. root) bump its hard limit up to `/proc/sys/fs/nr_open`; without it the attempt fails and only the soft limit is raised.
|
||||
//! - Log what we ended up with so a user filing a bug report can tell
|
||||
//! us whether their kernel cap is below what a real proxy needs.
|
||||
|
||||
#[cfg(unix)]
|
||||
pub fn raise_nofile_limit_best_effort() {
|
||||
// Target: 16384 if the hard limit allows it, else whatever the hard
|
||||
// limit is. 16k matches what most modern desktop distros default to and
|
||||
// is plenty for a local proxy.
|
||||
const DESIRED: u64 = 16_384;
|
||||
// Ambitious target. 65536 is plenty for even heavy router use (a
|
||||
// whole LAN doing browser + DNS + Telegram over our SOCKS5). Costs
|
||||
// ~0 kernel memory until actually used.
|
||||
const DESIRED: u64 = 65_536;
|
||||
|
||||
unsafe {
|
||||
let mut rl = libc::rlimit {
|
||||
@@ -26,41 +30,101 @@ pub fn raise_nofile_limit_best_effort() {
|
||||
};
|
||||
if libc::getrlimit(libc::RLIMIT_NOFILE, &mut rl) != 0 {
|
||||
let err = std::io::Error::last_os_error();
|
||||
tracing::debug!("getrlimit(RLIMIT_NOFILE) failed: {}", err);
|
||||
tracing::warn!("getrlimit(RLIMIT_NOFILE) failed: {}", err);
|
||||
return;
|
||||
}
|
||||
let original_soft = rl.rlim_cur as u64;
|
||||
let original_hard = rl.rlim_max as u64;
|
||||
|
||||
// Already high enough? Leave it.
|
||||
let current = rl.rlim_cur as u64;
|
||||
let hard = rl.rlim_max as u64;
|
||||
if current >= DESIRED {
|
||||
return;
|
||||
// Figure out an absolute ceiling. On Linux, /proc/sys/fs/nr_open
|
||||
// is the highest the kernel will ever let a process set its
|
||||
// RLIMIT_NOFILE. Read it and use it as our hard-limit target.
|
||||
// On macOS/BSD this file doesn't exist — we just keep the
|
||||
// existing hard limit.
|
||||
let kernel_ceiling = read_nr_open().unwrap_or(original_hard);
|
||||
let want_hard = DESIRED.max(original_hard).min(kernel_ceiling);
|
||||
|
||||
// Step 1: raise the hard limit if it's below what we want. We only
|
||||
// ever raise it, never lower it: raising needs CAP_SYS_RESOURCE (the
|
||||
// attempt fails harmlessly without it), while lowering is permanent for an unprivileged process.
|
||||
if want_hard > original_hard {
|
||||
rl.rlim_max = want_hard as libc::rlim_t;
|
||||
rl.rlim_cur = want_hard as libc::rlim_t;
|
||||
if libc::setrlimit(libc::RLIMIT_NOFILE, &rl) != 0 {
|
||||
let err = std::io::Error::last_os_error();
|
||||
tracing::debug!(
|
||||
"setrlimit raising hard {}→{} failed: {} (trying soft-only)",
|
||||
original_hard,
|
||||
want_hard,
|
||||
err
|
||||
);
|
||||
// Fall through to step 2 with the unmodified hard limit.
|
||||
rl.rlim_max = original_hard as libc::rlim_t;
|
||||
}
|
||||
}
|
||||
|
||||
let new_soft = DESIRED.min(hard);
|
||||
if new_soft <= current {
|
||||
return;
|
||||
// Step 2: raise soft up to whatever hard allows.
|
||||
let effective_hard = rl.rlim_max as u64;
|
||||
let want_soft = DESIRED.min(effective_hard);
|
||||
if want_soft > original_soft {
|
||||
rl.rlim_cur = want_soft as libc::rlim_t;
|
||||
if libc::setrlimit(libc::RLIMIT_NOFILE, &rl) != 0 {
|
||||
let err = std::io::Error::last_os_error();
|
||||
tracing::warn!(
|
||||
"setrlimit raising soft {}→{} failed: {}",
|
||||
original_soft,
|
||||
want_soft,
|
||||
err
|
||||
);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
rl.rlim_cur = new_soft as libc::rlim_t;
|
||||
if libc::setrlimit(libc::RLIMIT_NOFILE, &rl) != 0 {
|
||||
let err = std::io::Error::last_os_error();
|
||||
tracing::debug!(
|
||||
"setrlimit(RLIMIT_NOFILE) {} -> {} failed: {}",
|
||||
current,
|
||||
new_soft,
|
||||
err
|
||||
// Re-read and report.
|
||||
let mut now = libc::rlimit {
|
||||
rlim_cur: 0,
|
||||
rlim_max: 0,
|
||||
};
|
||||
let _ = libc::getrlimit(libc::RLIMIT_NOFILE, &mut now);
|
||||
let soft = now.rlim_cur as u64;
|
||||
let hard = now.rlim_max as u64;
|
||||
|
||||
if soft < 4096 {
|
||||
// This is genuinely too low for a local proxy under LAN load.
|
||||
// Log loudly so the user knows their system is the bottleneck,
|
||||
// not us.
|
||||
tracing::warn!(
|
||||
"RLIMIT_NOFILE is {}/{} (soft/hard). This is likely too low for a \
|
||||
proxy under any real load and WILL cause 'No file descriptors \
|
||||
available' errors. On OpenWRT, ensure you're starting via the \
|
||||
shipped procd init script (which sets nofile=16384), or add \
|
||||
`ulimit -n 65536` to your startup script.",
|
||||
soft,
|
||||
hard,
|
||||
);
|
||||
} else {
|
||||
tracing::info!(
|
||||
"RLIMIT_NOFILE = {}/{} (soft/hard), was {}/{} at startup",
|
||||
soft,
|
||||
hard,
|
||||
original_soft,
|
||||
original_hard,
|
||||
);
|
||||
return;
|
||||
}
|
||||
tracing::info!(
|
||||
"raised RLIMIT_NOFILE: {} -> {} (hard={})",
|
||||
current,
|
||||
new_soft,
|
||||
hard
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads the value stored in `/proc/sys/fs/nr_open` (Linux only).
///
/// Returns `None` when the file cannot be read or when its trimmed contents
/// do not parse as a `u64`.
#[cfg(target_os = "linux")]
fn read_nr_open() -> Option<u64> {
    let raw = std::fs::read_to_string("/proc/sys/fs/nr_open").ok()?;
    raw.trim().parse().ok()
}

/// Non-Linux Unix variant: no file is consulted, the result is always `None`.
#[cfg(all(unix, not(target_os = "linux")))]
fn read_nr_open() -> Option<u64> {
    None
}
|
||||
|
||||
/// Non-Unix variant: compiled only when the target is not `unix`, and does
/// nothing. It exists under the same name as the Unix implementation,
/// presumably so call sites need no `cfg` of their own — verify against callers.
#[cfg(not(unix))]
|
||||
pub fn raise_nofile_limit_best_effort() {}
|
||||
|
||||
Reference in New Issue
Block a user