mirror of
https://github.com/sartoopjj/thefeed.git
synced 2026-05-19 06:54:34 +03:00
149 lines
4.6 KiB
Go
149 lines
4.6 KiB
Go
package server
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"path"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/sartoopjj/thefeed/internal/protocol"
|
|
)
|
|
|
|
// httpMediaClient is a small shared client for fetching media URLs the
|
|
// public-Telegram and X readers extract. It deliberately uses a relatively
|
|
// short timeout — media downloads must not stall the rest of a fetch cycle.
|
|
var httpMediaClient = &http.Client{
|
|
Timeout: 60 * time.Second,
|
|
// Disallow redirects to non-http(s) schemes; Telegram CDN sometimes
|
|
// redirects through 301/302 to a regional host which is fine.
|
|
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
|
if len(via) > 5 {
|
|
return errors.New("too many redirects")
|
|
}
|
|
if req.URL.Scheme != "http" && req.URL.Scheme != "https" {
|
|
return fmt.Errorf("disallowed redirect scheme %q", req.URL.Scheme)
|
|
}
|
|
return nil
|
|
},
|
|
}
|
|
|
|
// allowedMediaSchemes is the set of URL schemes downloadHTTPMedia will load.
|
|
var allowedMediaSchemes = map[string]bool{
|
|
"http": true,
|
|
"https": true,
|
|
}
|
|
|
|
// downloadHTTPMedia fetches the bytes at rawURL and stores them in cache,
|
|
// using the URL itself as the cache key (so refreshing the same channel
|
|
// every 10 min just bumps TTL on hit).
|
|
//
|
|
// It enforces the configured max-size both up-front (Content-Length) and on
|
|
// the wire (LimitReader) so a server lying about size can't blow past the
|
|
// limit. URLs are validated against allowedMediaSchemes; private-network
|
|
// targets are not blocked here because callers (PublicReader, XPublicReader)
|
|
// only pass URLs scraped from Telegram/Nitter responses.
|
|
func downloadHTTPMedia(ctx context.Context, cache *MediaCache, tag, rawURL string) (protocol.MediaMeta, bool) {
|
|
if cache == nil || rawURL == "" {
|
|
return protocol.MediaMeta{}, false
|
|
}
|
|
parsed, err := url.Parse(rawURL)
|
|
if err != nil || !allowedMediaSchemes[parsed.Scheme] {
|
|
return protocol.MediaMeta{}, false
|
|
}
|
|
|
|
// Cache key is the canonical URL — image-link rotation on the upstream
|
|
// side will create a fresh entry, but identical URLs across fetches will
|
|
// just refresh TTL.
|
|
cacheKey := tag + ":url:" + parsed.String()
|
|
if meta, ok := cache.Lookup(cacheKey); ok {
|
|
return meta, true
|
|
}
|
|
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, parsed.String(), nil)
|
|
if err != nil {
|
|
return protocol.MediaMeta{}, false
|
|
}
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; thefeed/1.0)")
|
|
req.Header.Set("Accept", "image/*, application/octet-stream;q=0.9, */*;q=0.5")
|
|
|
|
resp, err := httpMediaClient.Do(req)
|
|
if err != nil {
|
|
logfMedia("[media-http] %s: request failed: %v", parsed.String(), err)
|
|
return protocol.MediaMeta{}, false
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode != http.StatusOK {
|
|
return protocol.MediaMeta{}, false
|
|
}
|
|
// Defense in depth: reject HTML/XHTML responses outright. Telegram's
|
|
// public web view sometimes redirects "file" links to the channel page
|
|
// itself; without this check we'd happily cache the channel's HTML as
|
|
// the user's downloadable file.
|
|
ctype := strings.ToLower(strings.TrimSpace(strings.Split(resp.Header.Get("Content-Type"), ";")[0]))
|
|
if ctype == "text/html" || ctype == "application/xhtml+xml" {
|
|
logfMedia("[media-http] %s: refusing HTML response (got %s)", parsed.String(), ctype)
|
|
return protocol.MediaMeta{}, false
|
|
}
|
|
|
|
maxBytes := cache.MaxAcceptableBytes()
|
|
if maxBytes > 0 && resp.ContentLength > 0 && resp.ContentLength > maxBytes {
|
|
size := resp.ContentLength
|
|
return protocol.MediaMeta{
|
|
Tag: tag,
|
|
Size: size,
|
|
Relays: nil,
|
|
}, true
|
|
}
|
|
|
|
limit := int64(-1)
|
|
if maxBytes > 0 {
|
|
limit = maxBytes + 1 // +1 to detect overflow vs exact match
|
|
}
|
|
var body io.Reader = resp.Body
|
|
if limit > 0 {
|
|
body = io.LimitReader(resp.Body, limit)
|
|
}
|
|
bytes, err := io.ReadAll(body)
|
|
if err != nil {
|
|
logfMedia("[media-http] %s: read failed: %v", parsed.String(), err)
|
|
return protocol.MediaMeta{}, false
|
|
}
|
|
if maxBytes > 0 && int64(len(bytes)) > maxBytes {
|
|
return protocol.MediaMeta{
|
|
Tag: tag,
|
|
Size: int64(len(bytes)),
|
|
Relays: nil,
|
|
}, true
|
|
}
|
|
|
|
meta, err := cache.Store(cacheKey, tag, bytes, resp.Header.Get("Content-Type"), urlBaseName(parsed))
|
|
if err != nil {
|
|
if errors.Is(err, ErrTooLarge) {
|
|
return meta, true
|
|
}
|
|
return protocol.MediaMeta{}, false
|
|
}
|
|
return meta, true
|
|
}
|
|
|
|
// urlBaseName returns the trailing path segment, stripped of its query, as a
|
|
// best-effort filename for HTTP layer Content-Disposition headers.
|
|
func urlBaseName(u *url.URL) string {
|
|
if u == nil {
|
|
return ""
|
|
}
|
|
base := path.Base(u.Path)
|
|
if base == "" || base == "/" || base == "." {
|
|
return ""
|
|
}
|
|
if i := strings.IndexByte(base, '?'); i >= 0 {
|
|
base = base[:i]
|
|
}
|
|
return base
|
|
}
|