Merge pull request #19 from sepehr-alipour/fix/linkify-raw-text-escaping

fix: linkify raw text to preserve URLs with & in query params
2026-05-19 08:54:36 +03:00 · 2026-04-20 13:42:31 +03:30
parent 2f73101677 fcd9db7047
commit 1668619c4d
5 changed files with 39 additions and 27 deletions
@@ -114,7 +114,7 @@ func TestApplyTextURLEntities(t *testing.T) {
 			entities: []tg.MessageEntityClass{
 				&tg.MessageEntityTextURL{Offset: 10, Length: 9, URL: "https://example.com"},
 			},
-			want: "Check out this link (https://example.com) for details",
+			want: "Check out [this link](https://example.com) for details",
 		},
 		{
 			name: "display text equals url",
@@ -139,7 +139,7 @@ func TestApplyTextURLEntities(t *testing.T) {
 				&tg.MessageEntityTextURL{Offset: 4, Length: 5, URL: "https://one.com"},
 				&tg.MessageEntityTextURL{Offset: 14, Length: 6, URL: "https://two.com"},
 			},
-			want: "see first (https://one.com) and second (https://two.com) links",
+			want: "see [first](https://one.com) and [second](https://two.com) links",
 		},
 		{
 			name: "emoji in text (surrogate pair)",
@@ -147,7 +147,7 @@ func TestApplyTextURLEntities(t *testing.T) {
 			entities: []tg.MessageEntityClass{
 				&tg.MessageEntityTextURL{Offset: 3, Length: 10, URL: "https://poll.com"},
 			},
-			want: "📊 click here (https://poll.com)",
+			want: "📊 [click here](https://poll.com)",
 		},
 		{
 			name: "non-text-url entities ignored",
@@ -455,7 +455,7 @@ func extractMessageText(n *html.Node) string {
 						b.WriteByte(' ')
 					}
 				}
-				b.WriteString(linkText + " (" + href + ")")
+				b.WriteString("[" + linkText + "](" + href + ")")
 				return // skip walking children, already consumed
 			} else if href != "" && (linkText == "" || linkText == href) {
 				if b.Len() > 0 {
@@ -196,7 +196,7 @@ func TestExtractMessageTextPreservesLinks(t *testing.T) {
 	}
 	node := findFirstByClass(doc, "tgme_widget_message_text")
 	text := extractMessageText(node)
-	want := "Check out this link (https://example.com) for details"
+	want := "Check out [this link](https://example.com) for details"
 	if text != want {
 		t.Fatalf("extractMessageText = %q, want %q", text, want)
 	}
@@ -466,7 +466,7 @@ func (tr *TelegramReader) extractText(msg *tg.Message) string {
 }

 // applyTextURLEntities embeds hyperlink URLs from MessageEntityTextURL entities
-// into the message text, producing output like "display text (https://url)".
+// into the message text, producing output like "[display text](https://url)".
 // This mirrors what the public HTML reader does when it extracts <a> tags.
 // Offsets are in UTF-16 code units per the Telegram API spec.
 func applyTextURLEntities(text string, entities []tg.MessageEntityClass) string {
@@ -519,10 +519,11 @@ func applyTextURLEntities(text string, entities []tg.MessageEntityClass) string
 		if string(runes[startIdx:endIdx]) == u.url {
 			continue
 		}
-		ins := []rune(" (" + u.url + ")")
-		newRunes := make([]rune, 0, len(runes)+len(ins))
-		newRunes = append(newRunes, runes[:endIdx]...)
-		newRunes = append(newRunes, ins...)
+		label := string(runes[startIdx:endIdx])
+		replacement := []rune("[" + label + "](" + u.url + ")")
+		newRunes := make([]rune, 0, len(runes)-len([]rune(label))+len(replacement))
+		newRunes = append(newRunes, runes[:startIdx]...)
+		newRunes = append(newRunes, replacement...)
 		newRunes = append(newRunes, runes[endIdx:]...)
 		runes = newRunes
 	}
@@ -3204,7 +3204,7 @@
          html += '<div class="poll-option">' + esc(ln) + '</div>';
          hasContent = true;
        } else if (ln.trim()) {
-          html += '<div>' + linkify(esc(ln)) + '</div>';
+          html += '<div>' + linkify(ln) + '</div>';
          hasContent = true;
        }
      }
@@ -3255,7 +3255,7 @@
        var timeStr = ts.toLocaleTimeString(dateLocale, { hour: '2-digit', minute: '2-digit' });
        var text = msg.Text || msg.text || '';
        currentMsgTexts.push(text);
-        var mediaHtml = '', textHtml = linkify(esc(text)).replace(/\uD83C\uDDEE\uD83C\uDDF7/g, '<img src="/static/iran-lion-sun.svg" alt="\u{1F981}\u2600\uFE0F" style="height:1.1em;vertical-align:middle">');
+        var mediaHtml = '', textHtml = linkify(text).replace(/\uD83C\uDDEE\uD83C\uDDF7/g, '<img src="/static/iran-lion-sun.svg" alt="\u{1F981}\u2600\uFE0F" style="height:1.1em;vertical-align:middle">');
        // Check for [REPLY]:ID or [REPLY] format (backward compat: also [REPLY:ID])
        var replyMatch = text.match(/^\[REPLY\](?::(\d+))?/) || text.match(/^\[REPLY:(\d+)\]/);
        if (replyMatch) {
@@ -3269,7 +3269,7 @@
            textHtml = renderPollCard(rpPollBody);
            mediaHtml += '<div class="media-tag">[POLL]</div>';
          } else {
-            textHtml = linkify(esc(replyBody)).replace(/\uD83C\uDDEE\uD83C\uDDF7/g, '<img src="/static/iran-lion-sun.svg" alt="\u{1F981}\u2600\uFE0F" style="height:1.1em;vertical-align:middle">');
+            textHtml = linkify(replyBody).replace(/\uD83C\uDDEE\uD83C\uDDF7/g, '<img src="/static/iran-lion-sun.svg" alt="\u{1F981}\u2600\uFE0F" style="height:1.1em;vertical-align:middle">');
          }
          if (replyId > 0 && msgByID[replyId]) {
            var rpText = (msgByID[replyId].Text || msgByID[replyId].text || '').replace(/^\[(?:IMAGE|VIDEO|FILE|AUDIO|STICKER|GIF|POLL|CONTACT|LOCATION|REPLY)[^\]]*\](?::\d+)?\n?/, '');
@@ -3287,7 +3287,7 @@
          for (var m = 0; m < mediaTypes.length; m++) {
            if (text.indexOf(mediaTypes[m]) === 0) {
              mediaHtml = '<div class="media-tag">' + mediaTypes[m] + '</div>';
-              textHtml = linkify(esc(text.substring(mediaTypes[m].length).replace(/^\n/, ''))).replace(/\uD83C\uDDEE\uD83C\uDDF7/g, '<img src="/static/iran-lion-sun.svg" alt="\u{1F981}\u2600\uFE0F" style="height:1.1em;vertical-align:middle">'); break
+              textHtml = linkify(text.substring(mediaTypes[m].length).replace(/^\n/, '')).replace(/\uD83C\uDDEE\uD83C\uDDF7/g, '<img src="/static/iran-lion-sun.svg" alt="\u{1F981}\u2600\uFE0F" style="height:1.1em;vertical-align:middle">'); break
            }
          }
        }
@@ -3608,20 +3608,31 @@
    // ===== UTILITIES =====
    function esc(s) { var d = document.createElement('div'); d.appendChild(document.createTextNode(s)); return d.innerHTML }
    function escAttr(s) { return esc(s).replace(/"/g, '&quot;').replace(/'/g, '&#39;') }
-    function linkify(s) {
-      return s.replace(/(https?:\/\/[^\s<>&"']+)/g, function(url) {
-        var trail = '';
-        // Strip trailing punctuation that's not part of the URL
-        while (url.length > 1) {
-          var ch = url[url.length - 1];
-          if (ch === ')' && url.split('(').length <= url.split(')').length - 1) {
-            trail = ch + trail; url = url.slice(0, -1);
-          } else if (/[.,;:!?>\u200C\u200F]/.test(ch)) {
-            trail = ch + trail; url = url.slice(0, -1);
-          } else { break }
+    function linkify(raw) {
+      // Accepts raw (unescaped) text. Matches [label](url) markdown links and plain URLs on the raw
+      // string, so & in a query string stays part of the URL; every emitted piece is then HTML-escaped.
+      var result = '', last = 0, m;
+      var re = /\[([^\]]+)\]\((https?:\/\/[^\s)]+)\)|(https?:\/\/[^\s<>"']+)/g;
+      while ((m = re.exec(raw)) !== null) {
+        result += esc(raw.slice(last, m.index));
+        if (m[2]) {
+          result += '<a href="' + escAttr(m[2]) + '" target="_blank" rel="noopener" dir="ltr">' + esc(m[1]) + '</a>';
+        } else {
+          var url = m[3], trail = '';
+          while (url.length > 1) {
+            var ch = url[url.length - 1];
+            if (ch === ')' && url.split('(').length <= url.split(')').length - 1) {
+              trail = ch + trail; url = url.slice(0, -1);
+            } else if (/[.,;:!?>\u200C\u200F]/.test(ch)) {
+              trail = ch + trail; url = url.slice(0, -1);
+            } else { break; }
+          }
+          result += '<a href="' + escAttr(url) + '" target="_blank" rel="noopener" dir="ltr">' + esc(url) + '</a>' + esc(trail);
        }
-        return '<a href="' + escAttr(url) + '" target="_blank" rel="noopener" dir="ltr">' + url + '</a>' + trail;
-      });
+        last = m.index + m[0].length;
+      }
+      result += esc(raw.slice(last));
+      return result;
    }
    function scrollToMsg(id) {
      var els = document.querySelectorAll('.msg');