Files
thefeed/internal/server/media_http.go
T
2026-04-30 16:57:11 +03:30

149 lines
4.6 KiB
Go

package server
import (
"context"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"path"
"strings"
"time"
"github.com/sartoopjj/thefeed/internal/protocol"
)
// httpMediaClient is the single HTTP client used for every media download in
// this file. The whole exchange is capped at 60 seconds so one slow media host
// cannot stall the rest of a fetch cycle.
//
// Redirects are followed with two restrictions: a chain is abandoned once more
// than five requests precede the next hop, and a hop is refused unless its
// scheme is exactly "http" or "https". The hop-count check runs first, so an
// over-long chain reports "too many redirects" regardless of the scheme.
var httpMediaClient = &http.Client{
	CheckRedirect: func(next *http.Request, hops []*http.Request) error {
		// hops holds the requests already made, oldest first.
		const maxHops = 5
		if len(hops) > maxHops {
			return errors.New("too many redirects")
		}
		switch next.URL.Scheme {
		case "http", "https":
			return nil
		default:
			return fmt.Errorf("disallowed redirect scheme %q", next.URL.Scheme)
		}
	},
	Timeout: 60 * time.Second,
}
// allowedMediaSchemes lists the URL schemes downloadHTTPMedia is willing to
// fetch. Looking up any scheme that is not a key here yields false, so the map
// can be indexed directly as a predicate.
var allowedMediaSchemes = map[string]bool{
	"https": true,
	"http":  true,
}
// downloadHTTPMedia downloads rawURL and stores the body in cache under the
// key tag + ":url:" + <parsed URL string>. A cache hit for that key is
// returned immediately without any network traffic.
//
// Result contract:
//   - (meta, true) when the entry was already cached, was stored
//     successfully, or cache.Store reported ErrTooLarge (its meta is passed
//     through as-is).
//   - (MediaMeta{Tag, Size}, true) when the payload exceeds
//     cache.MaxAcceptableBytes(), detected either from Content-Length or
//     while reading; nothing is stored in that case.
//   - (zero MediaMeta, false) for a nil cache, empty or unparseable URL,
//     a scheme outside allowedMediaSchemes, a transport or read error, a
//     non-200 status, an HTML/XHTML response, or any other Store error.
//
// A non-positive MaxAcceptableBytes disables both size checks. No
// private-network filtering is performed here; callers are expected to pass
// only URLs they scraped from upstream responses.
func downloadHTTPMedia(ctx context.Context, cache *MediaCache, tag, rawURL string) (protocol.MediaMeta, bool) {
	var none protocol.MediaMeta

	if cache == nil || rawURL == "" {
		return none, false
	}

	u, err := url.Parse(rawURL)
	if err != nil {
		return none, false
	}
	if !allowedMediaSchemes[u.Scheme] {
		return none, false
	}
	target := u.String()

	// Keyed on the re-serialized URL: an upstream link rotation produces a
	// new entry, while an unchanged URL resolves to the existing one.
	key := tag + ":url:" + target
	if cached, hit := cache.Lookup(key); hit {
		return cached, true
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, target, nil)
	if err != nil {
		return none, false
	}
	req.Header.Set("Accept", "image/*, application/octet-stream;q=0.9, */*;q=0.5")
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; thefeed/1.0)")

	resp, err := httpMediaClient.Do(req)
	if err != nil {
		logfMedia("[media-http] %s: request failed: %v", target, err)
		return none, false
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return none, false
	}

	// Refuse HTML/XHTML outright so that a "file" link which actually lands
	// on a web page is never cached as the downloadable file. Only the media
	// type is compared; parameters after ';' are ignored.
	rawType := resp.Header.Get("Content-Type")
	mediaType := rawType
	if semi := strings.IndexByte(rawType, ';'); semi >= 0 {
		mediaType = rawType[:semi]
	}
	mediaType = strings.ToLower(strings.TrimSpace(mediaType))
	switch mediaType {
	case "text/html", "application/xhtml+xml":
		logfMedia("[media-http] %s: refusing HTML response (got %s)", target, mediaType)
		return none, false
	}

	// oversize builds the "too large, not stored" result.
	oversize := func(n int64) (protocol.MediaMeta, bool) {
		return protocol.MediaMeta{Tag: tag, Size: n}, true
	}

	maxBytes := cache.MaxAcceptableBytes()
	capped := maxBytes > 0

	// First gate: trust the advertised length when it already exceeds the cap.
	if capped && resp.ContentLength > maxBytes {
		return oversize(resp.ContentLength)
	}

	// Second gate: bound the actual read. Reading one byte past the cap lets
	// an over-limit body be told apart from one that is exactly at the limit.
	var src io.Reader = resp.Body
	if capped {
		if readCap := maxBytes + 1; readCap > 0 {
			src = io.LimitReader(resp.Body, readCap)
		}
	}
	payload, err := io.ReadAll(src)
	if err != nil {
		logfMedia("[media-http] %s: read failed: %v", target, err)
		return none, false
	}
	if got := int64(len(payload)); capped && got > maxBytes {
		return oversize(got)
	}

	stored, err := cache.Store(key, tag, payload, rawType, urlBaseName(u))
	if err == nil || errors.Is(err, ErrTooLarge) {
		return stored, true
	}
	return none, false
}
// urlBaseName returns the last segment of u's path as a best-effort filename,
// e.g. for a Content-Disposition header at the HTTP layer.
//
// Anything from a literal '?' onward in that segment is dropped. The empty
// string is returned when u is nil or when the remaining segment is not a
// usable filename: empty, "/", ".", or "..".
func urlBaseName(u *url.URL) string {
	if u == nil {
		return ""
	}
	name := path.Base(u.Path)
	// Defensive: the query normally lives in u.RawQuery, but a '?' can still
	// end up in Path (e.g. a hand-built URL), so cut it off here.
	if i := strings.IndexByte(name, '?'); i >= 0 {
		name = name[:i]
	}
	// Checked after truncation so ".?x" or "..?x" cannot slip through as a
	// directory placeholder.
	switch name {
	case "", "/", ".", "..":
		return ""
	}
	return name
}