feat(tunnel-client): blacklist deployments after sustained timeouts (#319)

This commit is contained in:
dazzling-no-more
2026-04-27 15:18:42 +04:00
committed by GitHub
parent 8758a75a27
commit d6d1006f32
2 changed files with 90 additions and 3 deletions
+75 -2
View File
@@ -102,6 +102,13 @@ pub struct DomainFronter {
inflight: Arc<Mutex<HashMap<String, broadcast::Sender<Vec<u8>>>>>,
coalesced: AtomicU64,
blacklist: Arc<std::sync::Mutex<HashMap<String, Instant>>>,
/// Per-deployment rolling timeout counter. Maps `script_id` →
/// `(window_start, strike_count)`. Reset when the window expires
/// or when a batch succeeds. Triggers a short-cooldown blacklist
/// at `TIMEOUT_STRIKE_LIMIT`. Distinct from `blacklist` because
/// strike state is per-deployment health bookkeeping, not the
/// permanent ban list.
script_timeouts: Arc<std::sync::Mutex<HashMap<String, (Instant, u32)>>>,
relay_calls: AtomicU64,
relay_failures: AtomicU64,
bytes_relayed: AtomicU64,
@@ -146,6 +153,21 @@ impl HostStat {
const BLACKLIST_COOLDOWN_SECS: u64 = 600;
/// Sliding window for the timeout-strike blacklist heuristic. Three
/// timeouts within this window on a single deployment trip the
/// blacklist. Tuned so a single cold-start stall plus one transient
/// network blip won't false-trigger, but a deployment that's actually
/// dead (stale `TUNNEL_SERVER_URL`, paused project, dropped script)
/// fails fast instead of poisoning round-robin until the user notices.
const TIMEOUT_STRIKE_WINDOW: Duration = Duration::from_secs(30);
const TIMEOUT_STRIKE_LIMIT: u32 = 3;
/// Cooldown for a deployment blacklisted via the timeout-strike path.
/// Distinct from `BLACKLIST_COOLDOWN_SECS` (10 min) because timeouts
/// are a much noisier signal than quota errors — if the deployment
/// recovers, we want to rejoin in minutes, not after a 10-min penalty.
const TIMEOUT_BLACKLIST_COOLDOWN_SECS: u64 = 120;
/// Request payload sent to Apps Script (single, non-batch).
#[derive(Serialize)]
struct RelayRequest<'a> {
@@ -258,6 +280,7 @@ impl DomainFronter {
inflight: Arc::new(Mutex::new(HashMap::new())),
coalesced: AtomicU64::new(0),
blacklist: Arc::new(std::sync::Mutex::new(HashMap::new())),
script_timeouts: Arc::new(std::sync::Mutex::new(HashMap::new())),
relay_calls: AtomicU64::new(0),
relay_failures: AtomicU64::new(0),
bytes_relayed: AtomicU64::new(0),
@@ -414,17 +437,67 @@ impl DomainFronter {
}
fn blacklist_script(&self, script_id: &str, reason: &str) {
let until = Instant::now() + Duration::from_secs(BLACKLIST_COOLDOWN_SECS);
self.blacklist_script_for(
script_id,
Duration::from_secs(BLACKLIST_COOLDOWN_SECS),
reason,
);
}
fn blacklist_script_for(&self, script_id: &str, cooldown: Duration, reason: &str) {
let until = Instant::now() + cooldown;
let mut bl = self.blacklist.lock().unwrap();
bl.insert(script_id.to_string(), until);
tracing::warn!(
"blacklisted script {} for {}s: {}",
mask_script_id(script_id),
BLACKLIST_COOLDOWN_SECS,
cooldown.as_secs(),
reason
);
}
/// Record a batch timeout against `script_id`. After
/// `TIMEOUT_STRIKE_LIMIT` timeouts inside `TIMEOUT_STRIKE_WINDOW`
/// the deployment is blacklisted with a short cooldown so the
/// round-robin stops sending real traffic to a deployment that's
/// hung (most commonly: stale `TUNNEL_SERVER_URL` after the
/// tunnel-node moved hosts).
pub(crate) fn record_timeout_strike(&self, script_id: &str) {
let now = Instant::now();
let mut counts = self.script_timeouts.lock().unwrap();
let entry = counts
.entry(script_id.to_string())
.or_insert((now, 0));
if now.duration_since(entry.0) > TIMEOUT_STRIKE_WINDOW {
*entry = (now, 1);
} else {
entry.1 += 1;
}
let strikes = entry.1;
if strikes >= TIMEOUT_STRIKE_LIMIT {
counts.remove(script_id);
drop(counts);
self.blacklist_script_for(
script_id,
Duration::from_secs(TIMEOUT_BLACKLIST_COOLDOWN_SECS),
&format!(
"{} timeouts in {}s",
strikes,
TIMEOUT_STRIKE_WINDOW.as_secs()
),
);
}
}
/// Clear the timeout strike counter for `script_id`. Called after
/// a batch succeeds so a recovered deployment doesn't keep stale
/// strikes from hours ago — three strikes must occur within one
/// real failure burst, not accumulate across unrelated incidents.
pub(crate) fn record_batch_success(&self, script_id: &str) {
let mut counts = self.script_timeouts.lock().unwrap();
counts.remove(script_id);
}
/// Log a relay failure with extra guidance on cert-validation cases.
/// Rate-limited so a flood of identical "UnknownIssuer" errors doesn't
/// fill the log.
+15 -1
View File
@@ -23,7 +23,7 @@ use tokio::io::{AsyncReadExt, AsyncWrite, AsyncWriteExt};
use tokio::net::TcpStream;
use tokio::sync::{mpsc, oneshot, Semaphore};
use crate::domain_fronter::{BatchOp, DomainFronter, TunnelResponse};
use crate::domain_fronter::{BatchOp, DomainFronter, FronterError, TunnelResponse};
/// Apps Script allows 30 concurrent executions per account / deployment.
const CONCURRENCY_PER_DEPLOYMENT: usize = 30;
@@ -827,6 +827,7 @@ async fn fire_batch(
match result {
Ok(Ok(batch_resp)) => {
f.record_batch_success(&script_id);
for (idx, reply) in data_replies {
if let Some(resp) = batch_resp.r.get(idx) {
let _ = reply.send(Ok((resp.clone(), script_id.clone())));
@@ -836,6 +837,15 @@ async fn fire_batch(
}
}
Ok(Err(e)) => {
// Read-side timeout from `domain_fronter`: Apps Script didn't
// start streaming response bytes within the per-read deadline.
// Common cause: deployment's `TUNNEL_SERVER_URL` points at a
// dead host, so UrlFetchApp inside Apps Script hangs until its
// own internal connect timeout. Strike-counter blacklists the
// deployment after a sustained pattern.
if matches!(e, FronterError::Timeout) {
f.record_timeout_strike(&script_id);
}
let err_msg = format!("{}", e);
tracing::warn!("batch failed: {}", err_msg);
for (_, reply) in data_replies {
@@ -843,6 +853,10 @@ async fn fire_batch(
}
}
Err(_) => {
// Whole-batch budget (`BATCH_TIMEOUT`, 30 s) elapsed. Even
// stronger signal than a per-read timeout — count it the same
// way so a truly-stuck deployment exits round-robin fast.
f.record_timeout_strike(&script_id);
tracing::warn!("batch timed out after {:?} ({} ops)", BATCH_TIMEOUT, n_ops);
for (_, reply) in data_replies {
let _ = reply.send(Err("batch timed out".into()));