mirror of
https://github.com/sartoopjj/thefeed.git
synced 2026-05-19 09:54:36 +03:00
009d4917f0
This ensures linkify in the web UI can render them as proper clickable anchors with the label as display text.
559 lines
14 KiB
Go
559 lines
14 KiB
Go
package server
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"net/url"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"golang.org/x/net/html"
|
|
|
|
"github.com/sartoopjj/thefeed/internal/protocol"
|
|
)
|
|
|
|
// PublicReader fetches recent posts from public Telegram channels via the web view.
|
|
type PublicReader struct {
|
|
channels []string
|
|
feed *Feed
|
|
msgLimit int
|
|
baseCh int
|
|
|
|
client *http.Client
|
|
baseURL string
|
|
|
|
mu sync.RWMutex
|
|
cache map[string]cachedMessages
|
|
cacheTTL time.Duration
|
|
|
|
refreshCh chan struct{}
|
|
}
|
|
|
|
// NewPublicReader creates a reader for public channels without Telegram login.
|
|
func NewPublicReader(channelUsernames []string, feed *Feed, msgLimit int, baseCh int) *PublicReader {
|
|
cleaned := make([]string, len(channelUsernames))
|
|
for i, u := range channelUsernames {
|
|
cleaned[i] = strings.TrimPrefix(strings.TrimSpace(u), "@")
|
|
}
|
|
if msgLimit <= 0 {
|
|
msgLimit = 15
|
|
}
|
|
if baseCh <= 0 {
|
|
baseCh = 1
|
|
}
|
|
return &PublicReader{
|
|
channels: cleaned,
|
|
feed: feed,
|
|
msgLimit: msgLimit,
|
|
baseCh: baseCh,
|
|
client: &http.Client{
|
|
Timeout: 30 * time.Second,
|
|
},
|
|
baseURL: "https://t.me/s",
|
|
cache: make(map[string]cachedMessages),
|
|
cacheTTL: 10 * time.Minute,
|
|
refreshCh: make(chan struct{}, 1),
|
|
}
|
|
}
|
|
|
|
// Run starts the periodic public-channel fetch loop.
|
|
func (pr *PublicReader) Run(ctx context.Context) error {
|
|
pr.feed.SetTelegramLoggedIn(false)
|
|
pr.fetchAll(ctx)
|
|
|
|
ticker := time.NewTicker(10 * time.Minute)
|
|
defer ticker.Stop()
|
|
pr.feed.SetNextFetch(uint32(time.Now().Add(10 * time.Minute).Unix()))
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
case <-ticker.C:
|
|
pr.fetchAll(ctx)
|
|
pr.feed.SetNextFetch(uint32(time.Now().Add(10 * time.Minute).Unix()))
|
|
case <-pr.refreshCh:
|
|
pr.mu.Lock()
|
|
pr.cache = make(map[string]cachedMessages)
|
|
pr.mu.Unlock()
|
|
pr.fetchAll(ctx)
|
|
ticker.Reset(10 * time.Minute)
|
|
pr.feed.SetNextFetch(uint32(time.Now().Add(10 * time.Minute).Unix()))
|
|
}
|
|
}
|
|
}
|
|
|
|
// RequestRefresh signals the fetch loop to re-fetch immediately.
|
|
func (pr *PublicReader) RequestRefresh() {
|
|
select {
|
|
case pr.refreshCh <- struct{}{}:
|
|
default:
|
|
}
|
|
}
|
|
|
|
// UpdateChannels replaces the channel list and updates Feed metadata.
|
|
func (pr *PublicReader) UpdateChannels(channels []string) {
|
|
cleaned := make([]string, len(channels))
|
|
for i, u := range channels {
|
|
cleaned[i] = strings.TrimPrefix(strings.TrimSpace(u), "@")
|
|
}
|
|
pr.mu.Lock()
|
|
pr.channels = cleaned
|
|
pr.cache = make(map[string]cachedMessages)
|
|
pr.mu.Unlock()
|
|
}
|
|
|
|
func (pr *PublicReader) fetchAll(ctx context.Context) {
|
|
log.Printf("[public] fetch cycle started for %d channels", len(pr.channels))
|
|
start := time.Now()
|
|
var fetched, failed int
|
|
for i, username := range pr.channels {
|
|
chNum := pr.baseCh + i
|
|
|
|
pr.mu.RLock()
|
|
cached, ok := pr.cache[username]
|
|
pr.mu.RUnlock()
|
|
if ok && time.Since(cached.fetched) < pr.cacheTTL {
|
|
continue
|
|
}
|
|
|
|
msgs, err := pr.fetchChannel(ctx, username)
|
|
if err != nil {
|
|
log.Printf("[public] fetch %s: %v", username, err)
|
|
failed++
|
|
continue
|
|
}
|
|
|
|
// Merge new messages with previously cached ones to accumulate history.
|
|
if ok && len(cached.msgs) > 0 {
|
|
msgs = mergeMessages(cached.msgs, msgs)
|
|
}
|
|
if pr.msgLimit > 0 && len(msgs) > pr.msgLimit {
|
|
msgs = msgs[:pr.msgLimit]
|
|
}
|
|
|
|
pr.mu.Lock()
|
|
pr.cache[username] = cachedMessages{msgs: msgs, fetched: time.Now()}
|
|
pr.mu.Unlock()
|
|
|
|
pr.feed.UpdateChannel(chNum, msgs)
|
|
pr.feed.SetChatInfo(chNum, protocol.ChatTypeChannel, false)
|
|
fetched++
|
|
log.Printf("[public] updated %s: %d messages", username, len(msgs))
|
|
}
|
|
log.Printf("[public] fetch cycle done in %s: %d fetched, %d failed, %d total", time.Since(start).Round(time.Millisecond), fetched, failed, len(pr.channels))
|
|
}
|
|
|
|
func (pr *PublicReader) fetchChannel(ctx context.Context, username string) ([]protocol.Message, error) {
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, pr.baseURL+"/"+url.PathEscape(username), nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; thefeed/1.0; +https://github.com/sartoopjj/thefeed)")
|
|
|
|
resp, err := pr.client.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, fmt.Errorf("unexpected HTTP status: %s", resp.Status)
|
|
}
|
|
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return parsePublicMessages(body)
|
|
}
|
|
|
|
// publicMessage is the intermediate form of one post while a page is being
// parsed, before it is converted to a protocol.Message.
type publicMessage struct {
	// id is the numeric post ID taken from the data-post attribute.
	id uint32
	// timestamp is the post time as Unix seconds.
	timestamp uint32
	// text is the final message text including any media or reply prefix.
	text string
}
|
|
|
|
// mergeMessages combines old cached messages with newly fetched ones.
|
|
// New messages win on ID conflicts (edits). Result is sorted by ID descending.
|
|
func mergeMessages(old, new []protocol.Message) []protocol.Message {
|
|
byID := make(map[uint32]protocol.Message, len(old)+len(new))
|
|
for _, m := range old {
|
|
byID[m.ID] = m
|
|
}
|
|
for _, m := range new {
|
|
byID[m.ID] = m // new overwrites old (edits)
|
|
}
|
|
merged := make([]protocol.Message, 0, len(byID))
|
|
for _, m := range byID {
|
|
merged = append(merged, m)
|
|
}
|
|
sort.Slice(merged, func(i, j int) bool {
|
|
return merged[i].ID > merged[j].ID
|
|
})
|
|
return merged
|
|
}
|
|
|
|
func parsePublicMessages(body []byte) ([]protocol.Message, error) {
|
|
doc, err := html.Parse(strings.NewReader(string(body)))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("parse html: %w", err)
|
|
}
|
|
|
|
var collected []publicMessage
|
|
visitNodes(doc, func(n *html.Node) {
|
|
post := attrValue(n, "data-post")
|
|
if post == "" {
|
|
return
|
|
}
|
|
id, err := parsePostID(post)
|
|
if err != nil {
|
|
return
|
|
}
|
|
text := strings.TrimSpace(extractMessageText(findMessageBodyNode(n)))
|
|
mediaPrefix := ""
|
|
switch {
|
|
case findFirstByClass(n, "tgme_widget_message_photo_wrap") != nil:
|
|
mediaPrefix = protocol.MediaImage
|
|
case findFirstByClass(n, "tgme_widget_message_video_player") != nil ||
|
|
findFirstByClass(n, "tgme_widget_message_roundvideo_player") != nil:
|
|
mediaPrefix = protocol.MediaVideo
|
|
case findFirstByClass(n, "tgme_widget_message_sticker_wrap") != nil:
|
|
mediaPrefix = protocol.MediaSticker
|
|
case findFirstByClass(n, "tgme_widget_message_voice") != nil:
|
|
mediaPrefix = protocol.MediaAudio
|
|
case findFirstByClass(n, "tgme_widget_message_poll") != nil:
|
|
mediaPrefix = protocol.MediaPoll
|
|
pollBody := extractPollData(n)
|
|
if pollBody != "" {
|
|
if text != "" {
|
|
text = mediaPrefix + "\n" + pollBody + "\n" + text
|
|
} else {
|
|
text = mediaPrefix + "\n" + pollBody
|
|
}
|
|
mediaPrefix = "" // already handled
|
|
}
|
|
case findFirstByClass(n, "tgme_widget_message_location_wrap") != nil ||
|
|
findFirstByClass(n, "tgme_widget_message_venue_wrap") != nil:
|
|
mediaPrefix = protocol.MediaLocation
|
|
case findFirstByClass(n, "tgme_widget_message_contact_wrap") != nil:
|
|
mediaPrefix = protocol.MediaContact
|
|
case findFirstByClass(n, "tgme_widget_message_document_wrap") != nil:
|
|
mediaPrefix = protocol.MediaFile
|
|
case findFirstByClass(n, "message_media_not_supported") != nil:
|
|
// Telegram shows "Please open Telegram to view this post" for
|
|
// unsupported content like polls, quizzes, etc. Tag it so
|
|
// the message is not silently dropped.
|
|
mediaPrefix = protocol.MediaPoll
|
|
}
|
|
if mediaPrefix != "" {
|
|
if text != "" {
|
|
text = mediaPrefix + "\n" + text
|
|
} else {
|
|
text = mediaPrefix
|
|
}
|
|
}
|
|
if text == "" {
|
|
return
|
|
}
|
|
// Detect replies by checking for the reply preview element.
|
|
if replyNode := findFirstByClass(n, "tgme_widget_message_reply"); replyNode != nil {
|
|
replyID := extractReplyID(replyNode)
|
|
if replyID > 0 {
|
|
text = fmt.Sprintf("%s:%d\n%s", protocol.MediaReply, replyID, text)
|
|
} else {
|
|
text = protocol.MediaReply + "\n" + text
|
|
}
|
|
}
|
|
collected = append(collected, publicMessage{
|
|
id: id,
|
|
timestamp: extractMessageTimestamp(n),
|
|
text: text,
|
|
})
|
|
})
|
|
|
|
if len(collected) == 0 {
|
|
return nil, fmt.Errorf("no public messages found")
|
|
}
|
|
|
|
sort.Slice(collected, func(i, j int) bool {
|
|
return collected[i].id > collected[j].id
|
|
})
|
|
|
|
msgs := make([]protocol.Message, 0, len(collected))
|
|
for _, msg := range collected {
|
|
msgs = append(msgs, protocol.Message{ID: msg.id, Timestamp: msg.timestamp, Text: msg.text})
|
|
}
|
|
return msgs, nil
|
|
}
|
|
|
|
func visitNodes(n *html.Node, fn func(*html.Node)) {
|
|
if n == nil {
|
|
return
|
|
}
|
|
fn(n)
|
|
for child := n.FirstChild; child != nil; child = child.NextSibling {
|
|
visitNodes(child, fn)
|
|
}
|
|
}
|
|
|
|
func findFirstByClass(n *html.Node, class string) *html.Node {
|
|
var found *html.Node
|
|
visitNodes(n, func(cur *html.Node) {
|
|
if found != nil {
|
|
return
|
|
}
|
|
if hasClass(cur, class) {
|
|
found = cur
|
|
}
|
|
})
|
|
return found
|
|
}
|
|
|
|
func hasClass(n *html.Node, class string) bool {
|
|
if n == nil || n.Type != html.ElementNode {
|
|
return false
|
|
}
|
|
for _, attr := range n.Attr {
|
|
if attr.Key != "class" {
|
|
continue
|
|
}
|
|
for _, token := range strings.Fields(attr.Val) {
|
|
if token == class {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func attrValue(n *html.Node, key string) string {
|
|
if n == nil {
|
|
return ""
|
|
}
|
|
for _, attr := range n.Attr {
|
|
if attr.Key == key {
|
|
return attr.Val
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// parsePostID returns the numeric message ID found in the last path segment
// of post, which may be a data-post value such as "channel/123" or a link
// such as "https://t.me/channel/123".
//
// A query string or fragment after the ID ("?..." or "#...") is ignored, so
// "https://t.me/channel/123?single" yields 123. An error is returned when
// there is no "/" or nothing follows it, or when the segment is not a decimal
// number that fits in 32 bits.
func parsePostID(post string) (uint32, error) {
	// Drop the query/fragment first so that neither a trailing "?x" nor a
	// "/" inside the query can be mistaken for part of the path.
	if cut := strings.IndexAny(post, "?#"); cut >= 0 {
		post = post[:cut]
	}
	idx := strings.LastIndex(post, "/")
	if idx == -1 || idx+1 >= len(post) {
		return 0, fmt.Errorf("invalid post id")
	}
	id, err := strconv.ParseUint(post[idx+1:], 10, 32)
	if err != nil {
		return 0, err
	}
	return uint32(id), nil
}
|
|
|
|
func extractMessageTimestamp(n *html.Node) uint32 {
|
|
timeNode := findFirstByClass(n, "tgme_widget_message_date")
|
|
if timeNode == nil {
|
|
timeNode = findFirstElement(n, "time")
|
|
}
|
|
if timeNode == nil {
|
|
return uint32(time.Now().Unix())
|
|
}
|
|
datetime := attrValue(timeNode, "datetime")
|
|
if datetime == "" {
|
|
timeChild := findFirstElement(timeNode, "time")
|
|
datetime = attrValue(timeChild, "datetime")
|
|
}
|
|
if datetime == "" {
|
|
return uint32(time.Now().Unix())
|
|
}
|
|
ts, err := time.Parse(time.RFC3339, datetime)
|
|
if err != nil {
|
|
return uint32(time.Now().Unix())
|
|
}
|
|
return uint32(ts.Unix())
|
|
}
|
|
|
|
func findFirstElement(n *html.Node, tag string) *html.Node {
|
|
var found *html.Node
|
|
visitNodes(n, func(cur *html.Node) {
|
|
if found == nil && cur.Type == html.ElementNode && cur.Data == tag {
|
|
found = cur
|
|
}
|
|
})
|
|
return found
|
|
}
|
|
|
|
// findMessageBodyNode returns the main post text node while skipping reply preview
|
|
// snippets. In Telegram public HTML, quoted/replied text may appear before the
|
|
// real message body and can otherwise be mistakenly parsed as the post text.
|
|
func findMessageBodyNode(n *html.Node) *html.Node {
|
|
var found *html.Node
|
|
visitNodes(n, func(cur *html.Node) {
|
|
if found != nil || !hasClass(cur, "tgme_widget_message_text") {
|
|
return
|
|
}
|
|
for p := cur.Parent; p != nil; p = p.Parent {
|
|
if hasClass(p, "tgme_widget_message_reply") {
|
|
return
|
|
}
|
|
}
|
|
found = cur
|
|
})
|
|
if found != nil {
|
|
return found
|
|
}
|
|
return findFirstByClass(n, "tgme_widget_message_text")
|
|
}
|
|
|
|
func extractMessageText(n *html.Node) string {
|
|
if n == nil {
|
|
return ""
|
|
}
|
|
var b strings.Builder
|
|
var walk func(*html.Node)
|
|
walk = func(cur *html.Node) {
|
|
if cur == nil {
|
|
return
|
|
}
|
|
if cur.Type == html.TextNode {
|
|
text := strings.TrimSpace(cur.Data)
|
|
if text != "" {
|
|
if b.Len() > 0 {
|
|
last := b.String()[b.Len()-1]
|
|
if last != '\n' && last != ' ' {
|
|
b.WriteByte(' ')
|
|
}
|
|
}
|
|
b.WriteString(text)
|
|
}
|
|
}
|
|
if cur.Type == html.ElementNode && cur.Data == "br" {
|
|
trimTrailingSpace(&b)
|
|
if b.Len() > 0 {
|
|
b.WriteByte('\n')
|
|
}
|
|
}
|
|
// Preserve hyperlinks: if <a href="URL"> and the link text differs from the URL, append it.
|
|
if cur.Type == html.ElementNode && cur.Data == "a" {
|
|
href := attrValue(cur, "href")
|
|
// Only preserve safe http(s) URLs; reject javascript: and other schemes.
|
|
if href != "" && !strings.HasPrefix(href, "http://") && !strings.HasPrefix(href, "https://") {
|
|
href = ""
|
|
}
|
|
linkText := extractInnerText(cur)
|
|
if href != "" && linkText != "" && linkText != href {
|
|
if b.Len() > 0 {
|
|
last := b.String()[b.Len()-1]
|
|
if last != '\n' && last != ' ' {
|
|
b.WriteByte(' ')
|
|
}
|
|
}
|
|
b.WriteString("[" + linkText + "](" + href + ")")
|
|
return // skip walking children, already consumed
|
|
} else if href != "" && (linkText == "" || linkText == href) {
|
|
if b.Len() > 0 {
|
|
last := b.String()[b.Len()-1]
|
|
if last != '\n' && last != ' ' {
|
|
b.WriteByte(' ')
|
|
}
|
|
}
|
|
b.WriteString(href)
|
|
return // skip walking children
|
|
}
|
|
}
|
|
for child := cur.FirstChild; child != nil; child = child.NextSibling {
|
|
walk(child)
|
|
}
|
|
}
|
|
walk(n)
|
|
return strings.TrimSpace(strings.ReplaceAll(b.String(), " \n", "\n"))
|
|
}
|
|
|
|
// trimTrailingSpace removes all trailing ' ' characters from the content of
// b. Other whitespace, such as a trailing newline, is left alone.
//
// A strings.Builder cannot be shortened in place, so the content has to be
// reset and rewritten; that copy is skipped when there is nothing to trim.
func trimTrailingSpace(b *strings.Builder) {
	s := b.String()
	trimmed := strings.TrimRight(s, " ")
	if len(trimmed) == len(s) {
		return
	}
	b.Reset()
	b.WriteString(trimmed)
}
|
|
|
|
// extractInnerText returns the concatenated text content of a node and its children.
|
|
func extractInnerText(n *html.Node) string {
|
|
if n == nil {
|
|
return ""
|
|
}
|
|
var b strings.Builder
|
|
var walk func(*html.Node)
|
|
walk = func(cur *html.Node) {
|
|
if cur.Type == html.TextNode {
|
|
b.WriteString(cur.Data)
|
|
}
|
|
for child := cur.FirstChild; child != nil; child = child.NextSibling {
|
|
walk(child)
|
|
}
|
|
}
|
|
walk(n)
|
|
return strings.TrimSpace(b.String())
|
|
}
|
|
|
|
// extractPollData extracts poll question and options from the public HTML widget.
|
|
// Telegram's poll HTML uses these classes:
|
|
// - tgme_widget_message_poll_question → question text
|
|
// - tgme_widget_message_poll_option_text → each option's text
|
|
func extractPollData(n *html.Node) string {
|
|
question := ""
|
|
if qNode := findFirstByClass(n, "tgme_widget_message_poll_question"); qNode != nil {
|
|
question = strings.TrimSpace(extractMessageText(qNode))
|
|
}
|
|
var options []string
|
|
visitNodes(n, func(cur *html.Node) {
|
|
if hasClass(cur, "tgme_widget_message_poll_option_text") {
|
|
opt := strings.TrimSpace(extractMessageText(cur))
|
|
if opt != "" {
|
|
options = append(options, "○ "+opt)
|
|
}
|
|
}
|
|
})
|
|
if question == "" && len(options) == 0 {
|
|
return ""
|
|
}
|
|
result := "📊 " + question
|
|
if len(options) > 0 {
|
|
result += "\n" + strings.Join(options, "\n")
|
|
}
|
|
return result
|
|
}
|
|
|
|
// extractReplyID parses the href of the reply element to get the replied-to message ID.
|
|
// The href typically looks like "https://t.me/channel/123" or "?single&reply=123".
|
|
func extractReplyID(replyNode *html.Node) uint32 {
|
|
href := ""
|
|
// The reply element itself may be an <a> or contain one.
|
|
if replyNode.Type == html.ElementNode && replyNode.Data == "a" {
|
|
href = attrValue(replyNode, "href")
|
|
}
|
|
if href == "" {
|
|
linkNode := findFirstElement(replyNode, "a")
|
|
if linkNode != nil {
|
|
href = attrValue(linkNode, "href")
|
|
}
|
|
}
|
|
if href == "" {
|
|
return 0
|
|
}
|
|
// Parse the last path segment as the message ID.
|
|
id, err := parsePostID(href)
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
return id
|
|
}
|