diff --git a/config.example.json b/config.example.json index b02faa6..387a0ed 100644 --- a/config.example.json +++ b/config.example.json @@ -8,5 +8,7 @@ "listen_host": "127.0.0.1", "listen_port": 8085, "log_level": "INFO", - "verify_ssl": true + "verify_ssl": true, + "_hosts_comment": "Optional SNI-rewrite overrides. YouTube, googlevideo, gstatic, fonts.googleapis.com, ytimg, ggpht, doubleclick, etc. are ALREADY handled automatically (routed via google_ip with SNI=front_domain, same trick as the Xray MITM-DomainFronting config). Add entries here only for custom domains, e.g. \"example.com\": \"216.239.38.120\".", + "hosts": {} } diff --git a/domain_fronter.py b/domain_fronter.py index 0d98768..a623cab 100644 --- a/domain_fronter.py +++ b/domain_fronter.py @@ -319,7 +319,8 @@ class DomainFronter: """Send periodic pings to keep Apps Script warm + H2 connection alive.""" while True: try: - await asyncio.sleep(180) # 3 minutes (ahead of Google's ~4min timeout) + await asyncio.sleep(240) # 4 minutes — saves ~90 quota hits/day vs 180s + # Google's container timeout is ~5 min idle if not self._h2 or not self._h2.is_connected: try: await self._h2.reconnect() @@ -581,7 +582,9 @@ class DomainFronter: status, resp_hdrs, resp_body = self._split_raw_response(first_resp) - # No range support → return the single response as-is + # No range support → return the single response as-is (status 200 + # from the origin). The client sent a plain GET, so 200 is what it + # expects. if status != 206: return first_resp @@ -589,12 +592,16 @@ class DomainFronter: content_range = resp_hdrs.get("content-range", "") m = re.search(r"/(\d+)", content_range) if not m: - return first_resp + # Can't parse — downgrade to 200 so the client (which sent a + # plain GET) doesn't get confused by 206 + Content-Range. + return self._rewrite_206_to_200(first_resp) total_size = int(m.group(1)) - # Small file: probe already fetched it all + # Small file: probe already fetched it all. 
class DomainFronter:
    @staticmethod
    def _rewrite_206_to_200(raw: bytes) -> bytes:
        """Rewrite a 206 Partial Content response to 200 OK.

        Used when the relay probed the origin with a synthetic Range header
        but the client never asked for one. Handing a 206 back for a plain
        GET breaks XHR/fetch on some sites, which treat it as an
        aborted/partial response.

        The header section is edited as bytes, never decoded, so header
        values that are not valid UTF-8 pass through untouched.

        Args:
            raw: A complete raw HTTP response (status line, headers, blank
                line, body).

        Returns:
            The response with a ``206`` status line replaced by ``200 OK``,
            every Content-Range and Content-Length header removed, and one
            fresh ``Content-Length`` equal to the body size appended. If
            ``raw`` has no header/body separator it is returned unchanged.
        """
        head, sep, body = raw.partition(b"\r\n\r\n")
        if not sep:
            return raw

        status_line, *header_lines = head.split(b"\r\n")

        # Parse "<version> <code> <reason>" rather than substring-replacing,
        # so an unusual reason phrase cannot leave debris after "200 OK".
        pieces = status_line.split(b" ", 2)
        if len(pieces) >= 2 and pieces[1] == b"206":
            status_line = pieces[0] + b" 200 OK"

        def is_stale_framing(line: bytes) -> bool:
            # Content-Range no longer applies; Content-Length is recomputed.
            name, colon, _ = line.partition(b":")
            return bool(colon) and name.lower() in (
                b"content-range", b"content-length",
            )

        kept = [ln for ln in header_lines if not is_stale_framing(ln)]
        kept.append(b"Content-Length: %d" % len(body))
        return b"\r\n".join([status_line, *kept]) + b"\r\n\r\n" + body
class DomainFronter:
    @staticmethod
    def _split_set_cookie(blob: str) -> list[str]:
        """Break a combined Set-Cookie string into individual cookies.

        Several Set-Cookie values may arrive joined with ", ". That
        separator collides with the comma inside an ``Expires`` date
        (e.g. "Expires=Wed, 21 Oct 2026 ..."). A comma therefore counts as
        a boundary only when the text after it starts with a cookie
        ``name=`` token; date commas are left alone.

        Returns the stripped, non-empty cookie strings in order, or an
        empty list for empty input.
        """
        if not blob:
            return []
        # A comma, optional whitespace, then a lookahead for "<token>=".
        boundary = r",\s*(?=[A-Za-z0-9!#$%&'*+\-.^_`|~]+=)"
        cookies = []
        for piece in re.split(boundary, blob):
            trimmed = piece.strip()
            if trimmed:
                cookies.append(trimmed)
        return cookies
+ self._hosts: dict[str, str] = config.get("hosts", {}) + if self.mode == "apps_script": try: from mitm import MITMCertManager @@ -185,9 +190,16 @@ class ProxyServer: await writer.drain() if self.mode == "apps_script": - # Google services: tunnel directly (no MITM) to avoid - # Google's anti-bot detection from Apps Script IPs/UA. - if self._is_google_domain(host): + override_ip = self._sni_rewrite_ip(host) + if override_ip: + # SNI-blocked domain: MITM-decrypt from browser, then + # re-connect to the override IP with SNI=front_domain so + # the ISP never sees the blocked hostname in the TLS handshake. + log.info("SNI-rewrite tunnel → %s via %s (SNI: %s)", + host, override_ip, self.fronter.sni_host) + await self._do_sni_rewrite_tunnel(host, port, reader, writer, + connect_ip=override_ip) + elif self._is_google_domain(host): log.info("Direct tunnel → %s (Google domain, skipping relay)", host) await self._do_direct_tunnel(host, port, reader, writer) else: @@ -195,11 +207,70 @@ class ProxyServer: else: await self.fronter.tunnel(host, port, reader, writer) + # ── Hosts override (fake DNS) ───────────────────────────────── + + # Built-in list of domains that must be reached via Google's frontend IP + # with SNI rewritten to `front_domain` (default: www.google.com). + # These are Google-owned services whose real SNI is DPI-blocked in some + # countries, but that Google serves from the same edge IP as www.google.com. + # Users don't need to configure anything — any host matching one of these + # suffixes is transparently SNI-rewritten to the configured `google_ip`. + # Config's "hosts" map still takes precedence (for custom overrides). 
+ _SNI_REWRITE_SUFFIXES = ( + "youtube.com", + "youtu.be", + "youtube-nocookie.com", + "ytimg.com", + "ggpht.com", + "gvt1.com", + "gvt2.com", + "doubleclick.net", + "googlesyndication.com", + "googleadservices.com", + "google-analytics.com", + "googletagmanager.com", + "googletagservices.com", + "fonts.googleapis.com", + ) + + def _sni_rewrite_ip(self, host: str) -> str | None: + """Return the IP to SNI-rewrite `host` through, or None. + + Order of precedence: + 1. Explicit entry in config `hosts` map (exact or suffix match). + 2. Built-in `_SNI_REWRITE_SUFFIXES` → mapped to config `google_ip`. + """ + ip = self._hosts_ip(host) + if ip: + return ip + h = host.lower().rstrip(".") + for suffix in self._SNI_REWRITE_SUFFIXES: + if h == suffix or h.endswith("." + suffix): + return self.fronter.connect_host # configured google_ip + return None + + def _hosts_ip(self, host: str) -> str | None: + """Return override IP for host if defined in config 'hosts', else None. + + Supports exact match and suffix match (e.g. 'youtube.com' matches + 'www.youtube.com', 'm.youtube.com', etc.). + """ + h = host.lower().rstrip(".") + if h in self._hosts: + return self._hosts[h] + # suffix match: check every parent label + parts = h.split(".") + for i in range(1, len(parts)): + parent = ".".join(parts[i:]) + if parent in self._hosts: + return self._hosts[parent] + return None + # ── Google domain detection ─────────────────────────────────── - # Only domains whose SNI the ISP does NOT block. - # YouTube/googlevideo are blocked by SNI inspection in Iran, - # so they MUST go through the MITM relay (domain-fronted). + # Only domains whose SNI the ISP does NOT block — direct tunnel is safe. + # YouTube/googlevideo SNIs are blocked; they go through _do_sni_rewrite_tunnel + # via the hosts map instead. 
_GOOGLE_SUFFIXES = ( ".google.com", ".google.co", ".googleapis.com", ".gstatic.com", @@ -223,21 +294,22 @@ class ProxyServer: async def _do_direct_tunnel(self, host: str, port: int, reader: asyncio.StreamReader, - writer: asyncio.StreamWriter): + writer: asyncio.StreamWriter, + connect_ip: str | None = None): """Pipe raw TLS bytes directly to the target server. - Used for Google domains: the browser's TLS goes end-to-end - with Google, preserving real User-Agent and avoiding - Apps Script IP/bot-detection issues. + connect_ip overrides DNS: the TCP connection goes to that IP + while the browser's TLS (SNI=host) is piped through unchanged. + Defaults to the configured google_ip for Google-category domains. """ - google_ip = self.fronter.connect_host + target_ip = connect_ip or self.fronter.connect_host try: r_remote, w_remote = await asyncio.wait_for( - asyncio.open_connection(google_ip, port), timeout=10 + asyncio.open_connection(target_ip, port), timeout=10 ) except Exception as e: log.error("Direct tunnel connect failed (%s via %s): %s", - host, google_ip, e) + host, target_ip, e) return async def pipe(src, dst, label): @@ -263,6 +335,76 @@ class ProxyServer: pipe(r_remote, writer, f"{host}→client"), ) + # ── SNI-rewrite tunnel ──────────────────────────────────────── + + async def _do_sni_rewrite_tunnel(self, host: str, port: int, reader, writer, + connect_ip: str | None = None): + """MITM-decrypt TLS from browser, then re-encrypt toward connect_ip + using SNI=front_domain (e.g. www.google.com). + + The ISP only ever sees SNI=www.google.com in the outgoing handshake, + hiding the blocked hostname (e.g. www.youtube.com). + """ + target_ip = connect_ip or self.fronter.connect_host + sni_out = self.fronter.sni_host # e.g. 
"www.google.com" + + # Step 1: MITM — accept TLS from the browser + ssl_ctx_server = self.mitm.get_server_context(host) + loop = asyncio.get_event_loop() + transport = writer.transport + protocol = transport.get_protocol() + try: + new_transport = await loop.start_tls( + transport, protocol, ssl_ctx_server, server_side=True, + ) + except Exception as e: + log.debug("SNI-rewrite TLS accept failed (%s): %s", host, e) + return + writer._transport = new_transport + + # Step 2: open outgoing TLS to target IP with the safe SNI + ssl_ctx_client = ssl.create_default_context() + if not self.fronter.verify_ssl: + ssl_ctx_client.check_hostname = False + ssl_ctx_client.verify_mode = ssl.CERT_NONE + try: + r_out, w_out = await asyncio.wait_for( + asyncio.open_connection( + target_ip, port, + ssl=ssl_ctx_client, + server_hostname=sni_out, + ), + timeout=10, + ) + except Exception as e: + log.error("SNI-rewrite outbound connect failed (%s via %s): %s", + host, target_ip, e) + return + + # Step 3: pipe application-layer bytes between the two TLS sessions + async def pipe(src, dst, label): + try: + while True: + data = await src.read(65536) + if not data: + break + dst.write(data) + await dst.drain() + except (ConnectionError, asyncio.CancelledError): + pass + except Exception as exc: + log.debug("Pipe %s ended: %s", label, exc) + finally: + try: + dst.close() + except Exception: + pass + + await asyncio.gather( + pipe(reader, w_out, f"client→{host}"), + pipe(r_out, writer, f"{host}→client"), + ) + # ── MITM CONNECT (apps_script mode) ─────────────────────────── async def _do_mitm_connect(self, host: str, port: int, reader, writer): @@ -430,44 +572,61 @@ class ProxyServer: @staticmethod def _inject_cors_headers(response: bytes, origin: str) -> bytes: - """Overwrite any existing CORS headers and inject permissive ones.""" + """Inject CORS headers only if the upstream response lacks them. 
+ + We must NOT overwrite the origin server's CORS headers: sites like + x.com return carefully-scoped Access-Control-Allow-Headers that list + specific custom headers (e.g. x-csrf-token). Replacing them with + wildcards together with Allow-Credentials: true makes browsers + reject the response (per the Fetch spec, "*" is literal when + credentials are included), which the site then blames on privacy + extensions. So we only fill in what the server omitted. + """ sep = b"\r\n\r\n" if sep not in response: return response header_section, body = response.split(sep, 1) lines = header_section.decode(errors="replace").split("\r\n") - # Drop existing Access-Control-* headers - lines = [ln for ln in lines if not ln.lower().startswith("access-control-")] + + existing = {ln.split(":", 1)[0].strip().lower() + for ln in lines if ":" in ln} + + # If the upstream already handled CORS, leave it completely alone. + if "access-control-allow-origin" in existing: + return response + + # Otherwise inject a minimal, credential-safe set (no wildcards, + # since wildcards combined with credentials are invalid). allow_origin = origin or "*" - lines += [ - f"Access-Control-Allow-Origin: {allow_origin}", - "Access-Control-Allow-Credentials: true", - "Access-Control-Allow-Methods: GET, POST, PUT, DELETE, PATCH, OPTIONS", - "Access-Control-Allow-Headers: *", - "Access-Control-Expose-Headers: *", - ] - return ("\r\n".join(lines) + "\r\n\r\n").encode() + body + additions = [f"Access-Control-Allow-Origin: {allow_origin}"] + if allow_origin != "*": + additions.append("Access-Control-Allow-Credentials: true") + additions.append("Vary: Origin") + return ("\r\n".join(lines + additions) + "\r\n\r\n").encode() + body async def _relay_smart(self, method, url, headers, body): """Choose optimal relay strategy based on request type. - ALL GET requests go through relay_parallel: it does one probe - request and only splits into parallel chunks if the response - is large and the server supports ranges. 
Small responses still - use a single request (no overhead). + - GET requests for likely-large downloads use parallel-range. + - All other requests (API calls, HTML, JSON, XHR) go through the + single-request relay. This avoids injecting a synthetic Range + header on normal traffic, which some origins honor by returning + 206 — breaking fetch()/XHR on sites like x.com or Cloudflare + challenge pages. """ if method == "GET" and not body: - # Skip parallel-range if the client already sent a Range header - # (we must forward it verbatim, not modify it). + # Respect client's own Range header verbatim. if headers: for k in headers: if k.lower() == "range": return await self.fronter.relay( method, url, headers, body ) - return await self.fronter.relay_parallel( - method, url, headers, body - ) + # Only probe with Range when the URL looks like a big file. + if self._is_likely_download(url, headers): + return await self.fronter.relay_parallel( + method, url, headers, body + ) return await self.fronter.relay(method, url, headers, body) def _is_likely_download(self, url: str, headers: dict) -> bool: