mirror of
https://github.com/therealaleph/MasterHttpRelayVPN-RUST.git
synced 2026-05-17 21:24:48 +03:00
feat(tunnel-client): blacklist deployments after sustained timeouts (#319)
This commit is contained in:
+75
-2
@@ -102,6 +102,13 @@ pub struct DomainFronter {
|
||||
inflight: Arc<Mutex<HashMap<String, broadcast::Sender<Vec<u8>>>>>,
|
||||
coalesced: AtomicU64,
|
||||
blacklist: Arc<std::sync::Mutex<HashMap<String, Instant>>>,
|
||||
/// Per-deployment rolling timeout counter. Maps `script_id` →
|
||||
/// `(window_start, strike_count)`. Reset when the window expires
|
||||
/// or when a batch succeeds. Triggers a short-cooldown blacklist
|
||||
/// at `TIMEOUT_STRIKE_LIMIT`. Distinct from `blacklist` because
|
||||
/// strike state is per-deployment health bookkeeping, not the
|
||||
/// permanent ban list.
|
||||
script_timeouts: Arc<std::sync::Mutex<HashMap<String, (Instant, u32)>>>,
|
||||
relay_calls: AtomicU64,
|
||||
relay_failures: AtomicU64,
|
||||
bytes_relayed: AtomicU64,
|
||||
@@ -146,6 +153,21 @@ impl HostStat {
|
||||
|
||||
const BLACKLIST_COOLDOWN_SECS: u64 = 600;
|
||||
|
||||
/// Sliding window for the timeout-strike blacklist heuristic. Three
|
||||
/// timeouts within this window on a single deployment trip the
|
||||
/// blacklist. Tuned so a single cold-start stall plus one transient
|
||||
/// network blip won't false-trigger, but a deployment that's actually
|
||||
/// dead (stale `TUNNEL_SERVER_URL`, paused project, dropped script)
|
||||
/// fails fast instead of poisoning round-robin until the user notices.
|
||||
const TIMEOUT_STRIKE_WINDOW: Duration = Duration::from_secs(30);
|
||||
const TIMEOUT_STRIKE_LIMIT: u32 = 3;
|
||||
|
||||
/// Cooldown for a deployment blacklisted via the timeout-strike path.
|
||||
/// Distinct from `BLACKLIST_COOLDOWN_SECS` (10 min) because timeouts
|
||||
/// are a much noisier signal than quota errors — if the deployment
|
||||
/// recovers, we want to rejoin in minutes, not after a 10-min penalty.
|
||||
const TIMEOUT_BLACKLIST_COOLDOWN_SECS: u64 = 120;
|
||||
|
||||
/// Request payload sent to Apps Script (single, non-batch).
|
||||
#[derive(Serialize)]
|
||||
struct RelayRequest<'a> {
|
||||
@@ -258,6 +280,7 @@ impl DomainFronter {
|
||||
inflight: Arc::new(Mutex::new(HashMap::new())),
|
||||
coalesced: AtomicU64::new(0),
|
||||
blacklist: Arc::new(std::sync::Mutex::new(HashMap::new())),
|
||||
script_timeouts: Arc::new(std::sync::Mutex::new(HashMap::new())),
|
||||
relay_calls: AtomicU64::new(0),
|
||||
relay_failures: AtomicU64::new(0),
|
||||
bytes_relayed: AtomicU64::new(0),
|
||||
@@ -414,17 +437,67 @@ impl DomainFronter {
|
||||
}
|
||||
|
||||
fn blacklist_script(&self, script_id: &str, reason: &str) {
|
||||
let until = Instant::now() + Duration::from_secs(BLACKLIST_COOLDOWN_SECS);
|
||||
self.blacklist_script_for(
|
||||
script_id,
|
||||
Duration::from_secs(BLACKLIST_COOLDOWN_SECS),
|
||||
reason,
|
||||
);
|
||||
}
|
||||
|
||||
fn blacklist_script_for(&self, script_id: &str, cooldown: Duration, reason: &str) {
|
||||
let until = Instant::now() + cooldown;
|
||||
let mut bl = self.blacklist.lock().unwrap();
|
||||
bl.insert(script_id.to_string(), until);
|
||||
tracing::warn!(
|
||||
"blacklisted script {} for {}s: {}",
|
||||
mask_script_id(script_id),
|
||||
BLACKLIST_COOLDOWN_SECS,
|
||||
cooldown.as_secs(),
|
||||
reason
|
||||
);
|
||||
}
|
||||
|
||||
/// Record a batch timeout against `script_id`. After
|
||||
/// `TIMEOUT_STRIKE_LIMIT` timeouts inside `TIMEOUT_STRIKE_WINDOW`
|
||||
/// the deployment is blacklisted with a short cooldown so the
|
||||
/// round-robin stops sending real traffic to a deployment that's
|
||||
/// hung (most commonly: stale `TUNNEL_SERVER_URL` after the
|
||||
/// tunnel-node moved hosts).
|
||||
pub(crate) fn record_timeout_strike(&self, script_id: &str) {
|
||||
let now = Instant::now();
|
||||
let mut counts = self.script_timeouts.lock().unwrap();
|
||||
let entry = counts
|
||||
.entry(script_id.to_string())
|
||||
.or_insert((now, 0));
|
||||
if now.duration_since(entry.0) > TIMEOUT_STRIKE_WINDOW {
|
||||
*entry = (now, 1);
|
||||
} else {
|
||||
entry.1 += 1;
|
||||
}
|
||||
let strikes = entry.1;
|
||||
if strikes >= TIMEOUT_STRIKE_LIMIT {
|
||||
counts.remove(script_id);
|
||||
drop(counts);
|
||||
self.blacklist_script_for(
|
||||
script_id,
|
||||
Duration::from_secs(TIMEOUT_BLACKLIST_COOLDOWN_SECS),
|
||||
&format!(
|
||||
"{} timeouts in {}s",
|
||||
strikes,
|
||||
TIMEOUT_STRIKE_WINDOW.as_secs()
|
||||
),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Clear the timeout strike counter for `script_id`. Called after
|
||||
/// a batch succeeds so a recovered deployment doesn't keep stale
|
||||
/// strikes from hours ago — three strikes must occur within one
|
||||
/// real failure burst, not accumulate across unrelated incidents.
|
||||
pub(crate) fn record_batch_success(&self, script_id: &str) {
|
||||
let mut counts = self.script_timeouts.lock().unwrap();
|
||||
counts.remove(script_id);
|
||||
}
|
||||
|
||||
/// Log a relay failure with extra guidance on cert-validation cases.
|
||||
/// Rate-limited so a flood of identical "UnknownIssuer" errors doesn't
|
||||
/// fill the log.
|
||||
|
||||
+15
-1
@@ -23,7 +23,7 @@ use tokio::io::{AsyncReadExt, AsyncWrite, AsyncWriteExt};
|
||||
use tokio::net::TcpStream;
|
||||
use tokio::sync::{mpsc, oneshot, Semaphore};
|
||||
|
||||
use crate::domain_fronter::{BatchOp, DomainFronter, TunnelResponse};
|
||||
use crate::domain_fronter::{BatchOp, DomainFronter, FronterError, TunnelResponse};
|
||||
|
||||
/// Apps Script allows 30 concurrent executions per account / deployment.
|
||||
const CONCURRENCY_PER_DEPLOYMENT: usize = 30;
|
||||
@@ -827,6 +827,7 @@ async fn fire_batch(
|
||||
|
||||
match result {
|
||||
Ok(Ok(batch_resp)) => {
|
||||
f.record_batch_success(&script_id);
|
||||
for (idx, reply) in data_replies {
|
||||
if let Some(resp) = batch_resp.r.get(idx) {
|
||||
let _ = reply.send(Ok((resp.clone(), script_id.clone())));
|
||||
@@ -836,6 +837,15 @@ async fn fire_batch(
|
||||
}
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
// Read-side timeout from `domain_fronter`: Apps Script didn't
|
||||
// start streaming response bytes within the per-read deadline.
|
||||
// Common cause: deployment's `TUNNEL_SERVER_URL` points at a
|
||||
// dead host, so UrlFetchApp inside Apps Script hangs until its
|
||||
// own internal connect timeout. Strike-counter blacklists the
|
||||
// deployment after a sustained pattern.
|
||||
if matches!(e, FronterError::Timeout) {
|
||||
f.record_timeout_strike(&script_id);
|
||||
}
|
||||
let err_msg = format!("{}", e);
|
||||
tracing::warn!("batch failed: {}", err_msg);
|
||||
for (_, reply) in data_replies {
|
||||
@@ -843,6 +853,10 @@ async fn fire_batch(
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
// Whole-batch budget (`BATCH_TIMEOUT`, 30 s) elapsed. Even
|
||||
// stronger signal than a per-read timeout — count it the same
|
||||
// way so a truly-stuck deployment exits round-robin fast.
|
||||
f.record_timeout_strike(&script_id);
|
||||
tracing::warn!("batch timed out after {:?} ({} ops)", BATCH_TIMEOUT, n_ops);
|
||||
for (_, reply) in data_replies {
|
||||
let _ = reply.send(Err("batch timed out".into()));
|
||||
|
||||
Reference in New Issue
Block a user