Youtube & x.com fixed

2026-05-17 21:24:37 +03:00 · 2026-04-21 10:14:24 +03:30
parent f7fe687b6c
commit e54994a679
3 changed files with 273 additions and 41 deletions
@@ -8,5 +8,7 @@
 	"listen_host": "127.0.0.1",
 	"listen_port": 8085,
 	"log_level": "INFO",
-	"verify_ssl": true
+	"verify_ssl": true,
 	"_hosts_comment": "Optional SNI-rewrite overrides. YouTube, googlevideo, gstatic, fonts.googleapis.com, ytimg, ggpht, doubleclick, etc. are ALREADY handled automatically (routed via google_ip with SNI=front_domain, same trick as the Xray MITM-DomainFronting config). Add entries here only for custom domains, e.g. \"example.com\": \"216.239.38.120\".",
 	"hosts": {}
 }
@@ -319,7 +319,8 @@ class DomainFronter:
        """Send periodic pings to keep Apps Script warm + H2 connection alive."""
        while True:
            try:
-                await asyncio.sleep(180)  # 3 minutes (ahead of Google's ~4min timeout)
+                await asyncio.sleep(240)  # 4 minutes — saves ~90 quota hits/day vs 180s
                                          # Google's container timeout is ~5 min idle
                if not self._h2 or not self._h2.is_connected:
                    try:
                        await self._h2.reconnect()
@@ -581,7 +582,9 @@ class DomainFronter:
        status, resp_hdrs, resp_body = self._split_raw_response(first_resp)
-        # No range support → return the single response as-is
+        # No range support → return the single response as-is (status 200
        # from the origin). The client sent a plain GET, so 200 is what it
        # expects.
        if status != 206:
            return first_resp
@@ -589,12 +592,16 @@ class DomainFronter:
        content_range = resp_hdrs.get("content-range", "")
        m = re.search(r"/(\d+)", content_range)
        if not m:
-            return first_resp
+            # Can't parse — downgrade to 200 so the client (which sent a
            # plain GET) doesn't get confused by 206 + Content-Range.
            return self._rewrite_206_to_200(first_resp)
        total_size = int(m.group(1))
-        # Small file: probe already fetched it all
+        # Small file: probe already fetched it all. MUST rewrite to 200
        # because the client never sent a Range header — a stray 206 here
        # breaks fetch()/XHR on sites like x.com and Cloudflare challenges.
        if total_size <= chunk_size or len(resp_body) >= total_size:
-            return first_resp
+            return self._rewrite_206_to_200(first_resp)
        # Calculate remaining ranges
        ranges = []
@@ -665,6 +672,40 @@ class DomainFronter:
        result += "\r\n"
        return result.encode() + full_body
    @staticmethod
    def _rewrite_206_to_200(raw: bytes) -> bytes:
        """Rewrite a 206 Partial Content response to 200 OK.
        Used when we probed with a synthetic Range header but the client
        never asked for one. Handing a 206 back to the browser for a plain
        GET breaks XHR/fetch on sites like x.com and Cloudflare challenges
        (they see it as an aborted/partial response). We drop the
        Content-Range header and set Content-Length to the body size.
        """
        sep = b"\r\n\r\n"
        if sep not in raw:
            return raw
        header_section, body = raw.split(sep, 1)
        lines = header_section.decode(errors="replace").split("\r\n")
        if not lines:
            return raw
        # Replace status line
        first = lines[0]
        if " 206" in first:
            lines[0] = first.replace(" 206 Partial Content", " 200 OK")\
                             .replace(" 206", " 200 OK")
        # Drop Content-Range and recalculate Content-Length
        filtered = [lines[0]]
        for ln in lines[1:]:
            low = ln.lower()
            if low.startswith("content-range:"):
                continue
            if low.startswith("content-length:"):
                continue
            filtered.append(ln)
        filtered.append(f"Content-Length: {len(body)}")
        return ("\r\n".join(filtered) + "\r\n\r\n").encode() + body
    def _build_payload(self, method, url, headers, body):
        """Build the JSON relay payload dict."""
        payload = {
@@ -1127,12 +1168,42 @@ class DomainFronter:
        skip = {"transfer-encoding", "connection", "keep-alive",
                "content-length", "content-encoding"}
        for k, v in resp_headers.items():
-            if k.lower() not in skip:
+            if k.lower() in skip:
-                result += f"{k}: {v}\r\n"
+                continue
            # Apps Script returns multi-valued headers (e.g. Set-Cookie) as a
            # JavaScript array. Emit each value as its own header line.
            # A single string that holds multiple Set-Cookie values joined
            # with ", " also needs to be split, otherwise the browser sees
            # one malformed cookie and sites like x.com fail.
            values = v if isinstance(v, list) else [v]
            if k.lower() == "set-cookie":
                expanded = []
                for item in values:
                    expanded.extend(self._split_set_cookie(str(item)))
                values = expanded
            for val in values:
                result += f"{k}: {val}\r\n"
        result += f"Content-Length: {len(resp_body)}\r\n"
        result += "\r\n"
        return result.encode() + resp_body
    @staticmethod
    def _split_set_cookie(blob: str) -> list[str]:
        """Split a Set-Cookie string that may contain multiple cookies.
        Apps Script sometimes joins multiple Set-Cookie values with ", ",
        which collides with the comma that legitimately appears inside the
        `Expires` attribute (e.g. "Expires=Wed, 21 Oct 2026 ..."). We split
        only on commas that are immediately followed by a cookie name=value
        pair (token '=' ...), leaving date commas intact.
        """
        if not blob:
            return []
        # Split on ", " but only when the following text looks like the start
        # of a new cookie (a token followed by '=').
        parts = re.split(r",\s*(?=[A-Za-z0-9!#$%&'*+\-.^_`|~]+=)", blob)
        return [p.strip() for p in parts if p.strip()]
    def _split_raw_response(self, raw: bytes):
        """Split a raw HTTP response into (status, headers_dict, body)."""
        if b"\r\n\r\n" not in raw:
@@ -12,6 +12,7 @@ Supports:
 import asyncio
 import logging
 import re
 import ssl
 import time
 from domain_fronter import DomainFronter
@@ -113,6 +114,10 @@ class ProxyServer:
        self._http_tunnels: dict = {}
        self._tunnel_lock = asyncio.Lock()
        # hosts override — DNS fake-map: domain/suffix → IP
        # Checked before any real DNS lookup; supports exact and suffix matching.
        self._hosts: dict[str, str] = config.get("hosts", {})
        if self.mode == "apps_script":
            try:
                from mitm import MITMCertManager
@@ -185,9 +190,16 @@ class ProxyServer:
        await writer.drain()
        if self.mode == "apps_script":
-            # Google services: tunnel directly (no MITM) to avoid
+            override_ip = self._sni_rewrite_ip(host)
-            # Google's anti-bot detection from Apps Script IPs/UA.
+            if override_ip:
-            if self._is_google_domain(host):
+                # SNI-blocked domain: MITM-decrypt from browser, then
                # re-connect to the override IP with SNI=front_domain so
                # the ISP never sees the blocked hostname in the TLS handshake.
                log.info("SNI-rewrite tunnel → %s via %s (SNI: %s)",
                         host, override_ip, self.fronter.sni_host)
                await self._do_sni_rewrite_tunnel(host, port, reader, writer,
                                                  connect_ip=override_ip)
            elif self._is_google_domain(host):
                log.info("Direct tunnel → %s (Google domain, skipping relay)", host)
                await self._do_direct_tunnel(host, port, reader, writer)
            else:
@@ -195,11 +207,70 @@ class ProxyServer:
        else:
            await self.fronter.tunnel(host, port, reader, writer)
    # ── Hosts override (fake DNS) ─────────────────────────────────
    # Built-in list of domains that must be reached via Google's frontend IP
    # with SNI rewritten to `front_domain` (default: www.google.com).
    # These are Google-owned services whose real SNI is DPI-blocked in some
    # countries, but that Google serves from the same edge IP as www.google.com.
    # Users don't need to configure anything — any host matching one of these
    # suffixes is transparently SNI-rewritten to the configured `google_ip`.
    # Config's "hosts" map still takes precedence (for custom overrides).
    # Entries are bare domain names without a leading dot. _sni_rewrite_ip
    # lower-cases the host and strips a trailing dot, then accepts it when it
    # equals an entry or ends with "." + entry, so every subdomain of an
    # entry matches but look-alike names (e.g. "notyoutube.com") do not.
    _SNI_REWRITE_SUFFIXES = (
        "youtube.com",
        "youtu.be",
        "youtube-nocookie.com",
        "ytimg.com",
        "ggpht.com",
        "gvt1.com",
        "gvt2.com",
        "doubleclick.net",
        "googlesyndication.com",
        "googleadservices.com",
        "google-analytics.com",
        "googletagmanager.com",
        "googletagservices.com",
        # A specific subdomain rather than all of googleapis.com.
        "fonts.googleapis.com",
    )
    def _sni_rewrite_ip(self, host: str) -> str | None:
        """Return the IP to SNI-rewrite `host` through, or None.
        Order of precedence:
          1. Explicit entry in config `hosts` map (exact or suffix match).
          2. Built-in `_SNI_REWRITE_SUFFIXES` → mapped to config `google_ip`.
        """
        ip = self._hosts_ip(host)
        if ip:
            return ip
        h = host.lower().rstrip(".")
        for suffix in self._SNI_REWRITE_SUFFIXES:
            if h == suffix or h.endswith("." + suffix):
                return self.fronter.connect_host  # configured google_ip
        return None
    def _hosts_ip(self, host: str) -> str | None:
        """Return override IP for host if defined in config 'hosts', else None.
        Supports exact match and suffix match (e.g. 'youtube.com' matches
        'www.youtube.com', 'm.youtube.com', etc.).
        """
        h = host.lower().rstrip(".")
        if h in self._hosts:
            return self._hosts[h]
        # suffix match: check every parent label
        parts = h.split(".")
        for i in range(1, len(parts)):
            parent = ".".join(parts[i:])
            if parent in self._hosts:
                return self._hosts[parent]
        return None
    # ── Google domain detection ───────────────────────────────────
-    # Only domains whose SNI the ISP does NOT block.
+    # Only domains whose SNI the ISP does NOT block — direct tunnel is safe.
-    # YouTube/googlevideo are blocked by SNI inspection in Iran,
+    # YouTube/googlevideo SNIs are blocked; they go through _do_sni_rewrite_tunnel
-    # so they MUST go through the MITM relay (domain-fronted).
+    # via the hosts map instead.
    _GOOGLE_SUFFIXES = (
        ".google.com", ".google.co",
        ".googleapis.com", ".gstatic.com",
@@ -223,21 +294,22 @@ class ProxyServer:
    async def _do_direct_tunnel(self, host: str, port: int,
                                reader: asyncio.StreamReader,
-                                writer: asyncio.StreamWriter):
+                                writer: asyncio.StreamWriter,
                                connect_ip: str | None = None):
        """Pipe raw TLS bytes directly to the target server.
-        Used for Google domains: the browser's TLS goes end-to-end
+        connect_ip overrides DNS: the TCP connection goes to that IP
-        with Google, preserving real User-Agent and avoiding
+        while the browser's TLS (SNI=host) is piped through unchanged.
-        Apps Script IP/bot-detection issues.
+        Defaults to the configured google_ip for Google-category domains.
        """
-        google_ip = self.fronter.connect_host
+        target_ip = connect_ip or self.fronter.connect_host
        try:
            r_remote, w_remote = await asyncio.wait_for(
-                asyncio.open_connection(google_ip, port), timeout=10
+                asyncio.open_connection(target_ip, port), timeout=10
            )
        except Exception as e:
            log.error("Direct tunnel connect failed (%s via %s): %s",
-                      host, google_ip, e)
+                      host, target_ip, e)
            return
        async def pipe(src, dst, label):
@@ -263,6 +335,76 @@ class ProxyServer:
            pipe(r_remote, writer, f"{host}→client"),
        )
    # ── SNI-rewrite tunnel ────────────────────────────────────────
    async def _do_sni_rewrite_tunnel(self, host: str, port: int, reader, writer,
                                     connect_ip: str | None = None):
        """MITM-decrypt TLS from the browser, then re-encrypt toward connect_ip
        using SNI=front_domain (e.g. www.google.com).

        The ISP only ever sees SNI=www.google.com in the outgoing handshake,
        hiding the blocked hostname (e.g. www.youtube.com).

        Args:
            host: Hostname the client asked to CONNECT to; used to obtain the
                server-side TLS context and in log/pipe labels.
            port: Destination port for the outbound connection.
            reader: Stream reader for the client connection.
            writer: Stream writer for the client connection; its transport is
                upgraded to TLS in place.
            connect_ip: IP to dial for the outbound leg. Falls back to
                `self.fronter.connect_host` when None or empty.

        Returns:
            None. Returns early (after logging) if either the client-side TLS
            accept or the outbound connect fails.
        """
        target_ip = connect_ip or self.fronter.connect_host
        sni_out   = self.fronter.sni_host  # e.g. "www.google.com"
        # Step 1: MITM — accept TLS from the browser
        # presumably an SSLContext presenting a certificate for `host` —
        # confirm against MITMCertManager.
        ssl_ctx_server = self.mitm.get_server_context(host)
        loop = asyncio.get_event_loop()
        transport = writer.transport
        protocol  = transport.get_protocol()
        try:
            # Upgrade the already-accepted plain connection to TLS, acting as
            # the server side of the handshake.
            new_transport = await loop.start_tls(
                transport, protocol, ssl_ctx_server, server_side=True,
            )
        except Exception as e:
            # Logged at debug level only; the tunnel is simply abandoned.
            log.debug("SNI-rewrite TLS accept failed (%s): %s", host, e)
            return
        # Point the writer at the TLS transport returned by start_tls.
        # NOTE(review): `_transport` is a private StreamWriter attribute —
        # confirm this keeps working on the supported Python versions.
        writer._transport = new_transport
        # Step 2: open outgoing TLS to target IP with the safe SNI
        ssl_ctx_client = ssl.create_default_context()
        if not self.fronter.verify_ssl:
            # Verification disabled by config: skip both the hostname check
            # and certificate validation.
            ssl_ctx_client.check_hostname = False
            ssl_ctx_client.verify_mode = ssl.CERT_NONE
        try:
            # TCP goes to target_ip, while the TLS handshake carries sni_out
            # as the server name. The whole connect is capped at 10 seconds.
            r_out, w_out = await asyncio.wait_for(
                asyncio.open_connection(
                    target_ip, port,
                    ssl=ssl_ctx_client,
                    server_hostname=sni_out,
                ),
                timeout=10,
            )
        except Exception as e:
            log.error("SNI-rewrite outbound connect failed (%s via %s): %s",
                      host, target_ip, e)
            # NOTE(review): the client-side `writer` is not closed on this
            # path within this method — presumably the caller does; confirm.
            return
        # Step 3: pipe application-layer bytes between the two TLS sessions
        async def pipe(src, dst, label):
            # Copy src → dst in chunks of up to 64 KiB until src reports EOF.
            try:
                while True:
                    data = await src.read(65536)
                    if not data:
                        break
                    dst.write(data)
                    await dst.drain()
            except (ConnectionError, asyncio.CancelledError):
                # Connection errors and cancellation end the pipe silently.
                pass
            except Exception as exc:
                log.debug("Pipe %s ended: %s", label, exc)
            finally:
                # Always close the destination when this direction ends;
                # errors from close() itself are ignored.
                try:
                    dst.close()
                except Exception:
                    pass
        # Run both directions concurrently; completes once both pipes end.
        await asyncio.gather(
            pipe(reader, w_out, f"client→{host}"),
            pipe(r_out,  writer, f"{host}→client"),
        )
    # ── MITM CONNECT (apps_script mode) ───────────────────────────
    async def _do_mitm_connect(self, host: str, port: int, reader, writer):
@@ -430,44 +572,61 @@ class ProxyServer:
    @staticmethod
    def _inject_cors_headers(response: bytes, origin: str) -> bytes:
-        """Overwrite any existing CORS headers and inject permissive ones."""
+        """Inject CORS headers only if the upstream response lacks them.
        We must NOT overwrite the origin server's CORS headers: sites like
        x.com return carefully-scoped Access-Control-Allow-Headers that list
        specific custom headers (e.g. x-csrf-token). Replacing them with
        wildcards together with Allow-Credentials: true makes browsers
        reject the response (per the Fetch spec, "*" is literal when
        credentials are included), which the site then blames on privacy
        extensions. So we only fill in what the server omitted.
        """
        sep = b"\r\n\r\n"
        if sep not in response:
            return response
        header_section, body = response.split(sep, 1)
        lines = header_section.decode(errors="replace").split("\r\n")
-        # Drop existing Access-Control-* headers
+
-        lines = [ln for ln in lines if not ln.lower().startswith("access-control-")]
+        existing = {ln.split(":", 1)[0].strip().lower()
                    for ln in lines if ":" in ln}
        # If the upstream already handled CORS, leave it completely alone.
        if "access-control-allow-origin" in existing:
            return response
        # Otherwise inject a minimal, credential-safe set (no wildcards,
        # since wildcards combined with credentials are invalid).
        allow_origin = origin or "*"
-        lines += [
+        additions = [f"Access-Control-Allow-Origin: {allow_origin}"]
-            f"Access-Control-Allow-Origin: {allow_origin}",
+        if allow_origin != "*":
-            "Access-Control-Allow-Credentials: true",
+            additions.append("Access-Control-Allow-Credentials: true")
-            "Access-Control-Allow-Methods: GET, POST, PUT, DELETE, PATCH, OPTIONS",
+            additions.append("Vary: Origin")
-            "Access-Control-Allow-Headers: *",
+        return ("\r\n".join(lines + additions) + "\r\n\r\n").encode() + body
            "Access-Control-Expose-Headers: *",
        ]
        return ("\r\n".join(lines) + "\r\n\r\n").encode() + body
    async def _relay_smart(self, method, url, headers, body):
        """Choose optimal relay strategy based on request type.
-        ALL GET requests go through relay_parallel: it does one probe
+        - GET requests for likely-large downloads use parallel-range.
-        request and only splits into parallel chunks if the response
+        - All other requests (API calls, HTML, JSON, XHR) go through the
-        is large and the server supports ranges. Small responses still
+          single-request relay. This avoids injecting a synthetic Range
-        use a single request (no overhead).
+          header on normal traffic, which some origins honor by returning
          206 — breaking fetch()/XHR on sites like x.com or Cloudflare
          challenge pages.
        """
        if method == "GET" and not body:
-            # Skip parallel-range if the client already sent a Range header
+            # Respect client's own Range header verbatim.
            # (we must forward it verbatim, not modify it).
            if headers:
                for k in headers:
                    if k.lower() == "range":
                        return await self.fronter.relay(
                            method, url, headers, body
                        )
-            return await self.fronter.relay_parallel(
+            # Only probe with Range when the URL looks like a big file.
-                method, url, headers, body
+            if self._is_likely_download(url, headers):
-            )
+                return await self.fronter.relay_parallel(
                    method, url, headers, body
                )
        return await self.fronter.relay(method, url, headers, body)
    def _is_likely_download(self, url: str, headers: dict) -> bool: