Merge pull request #19 from sepehr-alipour/fix/linkify-raw-text-escaping

fix: linkify raw text to preserve URLs with & in query params
This commit is contained in:
Sarto
2026-04-20 13:42:31 +03:30
committed by GitHub
5 changed files with 39 additions and 27 deletions
+3 -3
View File
@@ -114,7 +114,7 @@ func TestApplyTextURLEntities(t *testing.T) {
entities: []tg.MessageEntityClass{
&tg.MessageEntityTextURL{Offset: 10, Length: 9, URL: "https://example.com"},
},
want: "Check out this link (https://example.com) for details",
want: "Check out [this link](https://example.com) for details",
},
{
name: "display text equals url",
@@ -139,7 +139,7 @@ func TestApplyTextURLEntities(t *testing.T) {
&tg.MessageEntityTextURL{Offset: 4, Length: 5, URL: "https://one.com"},
&tg.MessageEntityTextURL{Offset: 14, Length: 6, URL: "https://two.com"},
},
want: "see first (https://one.com) and second (https://two.com) links",
want: "see [first](https://one.com) and [second](https://two.com) links",
},
{
name: "emoji in text (surrogate pair)",
@@ -147,7 +147,7 @@ func TestApplyTextURLEntities(t *testing.T) {
entities: []tg.MessageEntityClass{
&tg.MessageEntityTextURL{Offset: 3, Length: 10, URL: "https://poll.com"},
},
want: "📊 click here (https://poll.com)",
want: "📊 [click here](https://poll.com)",
},
{
name: "non-text-url entities ignored",
+1 -1
View File
@@ -455,7 +455,7 @@ func extractMessageText(n *html.Node) string {
b.WriteByte(' ')
}
}
b.WriteString(linkText + " (" + href + ")")
b.WriteString("[" + linkText + "](" + href + ")")
return // skip walking children, already consumed
} else if href != "" && (linkText == "" || linkText == href) {
if b.Len() > 0 {
+1 -1
View File
@@ -196,7 +196,7 @@ func TestExtractMessageTextPreservesLinks(t *testing.T) {
}
node := findFirstByClass(doc, "tgme_widget_message_text")
text := extractMessageText(node)
want := "Check out this link (https://example.com) for details"
want := "Check out [this link](https://example.com) for details"
if text != want {
t.Fatalf("extractMessageText = %q, want %q", text, want)
}
+6 -5
View File
@@ -466,7 +466,7 @@ func (tr *TelegramReader) extractText(msg *tg.Message) string {
}
// applyTextURLEntities embeds hyperlink URLs from MessageEntityTextURL entities
// into the message text, producing output like "display text (https://url)".
// into the message text, producing output like "[display text](https://url)".
// This mirrors what the public HTML reader does when it extracts <a> tags.
// Offsets are in UTF-16 code units per the Telegram API spec.
func applyTextURLEntities(text string, entities []tg.MessageEntityClass) string {
@@ -519,10 +519,11 @@ func applyTextURLEntities(text string, entities []tg.MessageEntityClass) string
if string(runes[startIdx:endIdx]) == u.url {
continue
}
ins := []rune(" (" + u.url + ")")
newRunes := make([]rune, 0, len(runes)+len(ins))
newRunes = append(newRunes, runes[:endIdx]...)
newRunes = append(newRunes, ins...)
label := string(runes[startIdx:endIdx])
replacement := []rune("[" + label + "](" + u.url + ")")
newRunes := make([]rune, 0, len(runes)-len([]rune(label))+len(replacement))
newRunes = append(newRunes, runes[:startIdx]...)
newRunes = append(newRunes, replacement...)
newRunes = append(newRunes, runes[endIdx:]...)
runes = newRunes
}
+28 -17
View File
@@ -3204,7 +3204,7 @@
html += '<div class="poll-option">' + esc(ln) + '</div>';
hasContent = true;
} else if (ln.trim()) {
html += '<div>' + linkify(esc(ln)) + '</div>';
html += '<div>' + linkify(ln) + '</div>';
hasContent = true;
}
}
@@ -3255,7 +3255,7 @@
var timeStr = ts.toLocaleTimeString(dateLocale, { hour: '2-digit', minute: '2-digit' });
var text = msg.Text || msg.text || '';
currentMsgTexts.push(text);
var mediaHtml = '', textHtml = linkify(esc(text)).replace(/\uD83C\uDDEE\uD83C\uDDF7/g, '<img src="/static/iran-lion-sun.svg" alt="\u{1F981}\u2600\uFE0F" style="height:1.1em;vertical-align:middle">');
var mediaHtml = '', textHtml = linkify(text).replace(/\uD83C\uDDEE\uD83C\uDDF7/g, '<img src="/static/iran-lion-sun.svg" alt="\u{1F981}\u2600\uFE0F" style="height:1.1em;vertical-align:middle">');
// Check for [REPLY]:ID or [REPLY] format (backward compat: also [REPLY:ID])
var replyMatch = text.match(/^\[REPLY\](?::(\d+))?/) || text.match(/^\[REPLY:(\d+)\]/);
if (replyMatch) {
@@ -3269,7 +3269,7 @@
textHtml = renderPollCard(rpPollBody);
mediaHtml += '<div class="media-tag">[POLL]</div>';
} else {
textHtml = linkify(esc(replyBody)).replace(/\uD83C\uDDEE\uD83C\uDDF7/g, '<img src="/static/iran-lion-sun.svg" alt="\u{1F981}\u2600\uFE0F" style="height:1.1em;vertical-align:middle">');
textHtml = linkify(replyBody).replace(/\uD83C\uDDEE\uD83C\uDDF7/g, '<img src="/static/iran-lion-sun.svg" alt="\u{1F981}\u2600\uFE0F" style="height:1.1em;vertical-align:middle">');
}
if (replyId > 0 && msgByID[replyId]) {
var rpText = (msgByID[replyId].Text || msgByID[replyId].text || '').replace(/^\[(?:IMAGE|VIDEO|FILE|AUDIO|STICKER|GIF|POLL|CONTACT|LOCATION|REPLY)[^\]]*\](?::\d+)?\n?/, '');
@@ -3287,7 +3287,7 @@
for (var m = 0; m < mediaTypes.length; m++) {
if (text.indexOf(mediaTypes[m]) === 0) {
mediaHtml = '<div class="media-tag">' + mediaTypes[m] + '</div>';
textHtml = linkify(esc(text.substring(mediaTypes[m].length).replace(/^\n/, ''))).replace(/\uD83C\uDDEE\uD83C\uDDF7/g, '<img src="/static/iran-lion-sun.svg" alt="\u{1F981}\u2600\uFE0F" style="height:1.1em;vertical-align:middle">'); break
textHtml = linkify(text.substring(mediaTypes[m].length).replace(/^\n/, '')).replace(/\uD83C\uDDEE\uD83C\uDDF7/g, '<img src="/static/iran-lion-sun.svg" alt="\u{1F981}\u2600\uFE0F" style="height:1.1em;vertical-align:middle">'); break
}
}
}
@@ -3608,20 +3608,31 @@
// ===== UTILITIES =====
// esc returns `s` encoded for use as HTML text content. It puts `s` into a
// detached <div> as a text node and reads back the div's innerHTML, so the
// browser's own HTML serializer performs the escaping.
// NOTE(review): the serializer is expected to escape &, < and > but not quote
// characters — use escAttr for values placed inside a quoted attribute.
function esc(s) { var d = document.createElement('div'); d.appendChild(document.createTextNode(s)); return d.innerHTML }
// escAttr returns `s` encoded for use inside a quoted HTML attribute value:
// it applies esc() and then replaces every double quote with &quot; and every
// single quote with &#39;, so the value cannot terminate the attribute.
function escAttr(s) { return esc(s).replace(/"/g, '&quot;').replace(/'/g, '&#39;') }
function linkify(s) {
return s.replace(/(https?:\/\/[^\s<>&"']+)/g, function(url) {
var trail = '';
// Strip trailing punctuation that's not part of the URL
while (url.length > 1) {
var ch = url[url.length - 1];
if (ch === ')' && url.split('(').length <= url.split(')').length - 1) {
trail = ch + trail; url = url.slice(0, -1);
} else if (/[.,;:!?>\u200C\u200F]/.test(ch)) {
trail = ch + trail; url = url.slice(0, -1);
} else { break }
function linkify(raw) {
// Accepts raw (unescaped) text. Handles [label](url) markdown links and
// plain URLs. Escapes HTML in non-URL segments so & in URLs is preserved.
var result = '', last = 0, m;
var re = /\[([^\]]+)\]\((https?:\/\/[^\s)]+)\)|(https?:\/\/[^\s<>"']+)/g;
while ((m = re.exec(raw)) !== null) {
result += esc(raw.slice(last, m.index));
if (m[2]) {
result += '<a href="' + escAttr(m[2]) + '" target="_blank" rel="noopener" dir="ltr">' + esc(m[1]) + '</a>';
} else {
var url = m[3], trail = '';
while (url.length > 1) {
var ch = url[url.length - 1];
if (ch === ')' && url.split('(').length <= url.split(')').length - 1) {
trail = ch + trail; url = url.slice(0, -1);
} else if (/[.,;:!?>\u200C\u200F]/.test(ch)) {
trail = ch + trail; url = url.slice(0, -1);
} else { break; }
}
result += '<a href="' + escAttr(url) + '" target="_blank" rel="noopener" dir="ltr">' + esc(url) + '</a>' + esc(trail);
}
return '<a href="' + escAttr(url) + '" target="_blank" rel="noopener" dir="ltr">' + url + '</a>' + trail;
});
last = m.index + m[0].length;
}
result += esc(raw.slice(last));
return result;
}
function scrollToMsg(id) {
var els = document.querySelectorAll('.msg');