diff --git a/src/telegram/bot/delivery.ts b/src/telegram/bot/delivery.ts index bd97d5708..732227ed0 100644 --- a/src/telegram/bot/delivery.ts +++ b/src/telegram/bot/delivery.ts @@ -18,6 +18,7 @@ import { markdownToTelegramChunks, markdownToTelegramHtml, renderTelegramHtmlText, + wrapFileReferencesInHtml, } from "../format.js"; import { buildInlineKeyboard } from "../send.js"; import { cacheSticker, getCachedSticker } from "../sticker-cache.js"; @@ -76,7 +77,9 @@ export async function deliverReplies(params: { const nested = markdownToTelegramChunks(chunk, textLimit, { tableMode: params.tableMode }); if (!nested.length && chunk) { chunks.push({ - html: markdownToTelegramHtml(chunk, { tableMode: params.tableMode }), + html: wrapFileReferencesInHtml( + markdownToTelegramHtml(chunk, { tableMode: params.tableMode, wrapFileRefs: false }), + ), text: chunk, }); continue; diff --git a/src/telegram/format.ts b/src/telegram/format.ts index eb457edff..dae60ff1d 100644 --- a/src/telegram/format.ts +++ b/src/telegram/format.ts @@ -20,7 +20,56 @@ function escapeHtmlAttr(text: string): string { return escapeHtml(text).replace(/"/g, """); } -function buildTelegramLink(link: MarkdownLinkSpan, _text: string) { +/** + * File extensions that share TLDs and commonly appear in code/documentation. + * These are wrapped in tags to prevent Telegram from generating + * spurious domain registrar previews. + * + * Only includes extensions that are: + * 1. Commonly used as file extensions in code/docs + * 2. 
Rarely used as intentional domain references + * + * Excluded: .ai, .io, .tv, .fm (popular domain TLDs like x.ai, vercel.io, github.io) + */ +const FILE_EXTENSIONS_WITH_TLD = new Set([ + "md", // Markdown (Moldova) - very common in repos + "go", // Go language - common in Go projects + "py", // Python (Paraguay) - common in Python projects + "pl", // Perl (Poland) - common in Perl projects + "sh", // Shell (Saint Helena) - common for scripts + "am", // Automake files (Armenia) + "at", // Assembly (Austria) + "be", // Backend files (Belgium) + "cc", // C++ source (Cocos Islands) +]); + +/** Detects when markdown-it linkify auto-generated a link from a bare filename (e.g. README.md → http://README.md) */ +function isAutoLinkedFileRef(href: string, label: string): boolean { + const stripped = href.replace(/^https?:\/\//i, ""); + if (stripped !== label) { + return false; + } + const dotIndex = label.lastIndexOf("."); + if (dotIndex < 1) { + return false; + } + const ext = label.slice(dotIndex + 1).toLowerCase(); + if (!FILE_EXTENSIONS_WITH_TLD.has(ext)) { + return false; + } + // Reject if any path segment before the filename contains a dot (looks like a domain) + const segments = label.split("/"); + if (segments.length > 1) { + for (let i = 0; i < segments.length - 1; i++) { + if (segments[i].includes(".")) { + return false; + } + } + } + return true; +} + +function buildTelegramLink(link: MarkdownLinkSpan, text: string) { const href = link.href.trim(); if (!href) { return null; @@ -28,6 +77,11 @@ function buildTelegramLink(link: MarkdownLinkSpan, _text: string) { if (link.start === link.end) { return null; } + // Suppress auto-linkified file references (e.g. 
README.md → http://README.md) + const label = text.slice(link.start, link.end); + if (isAutoLinkedFileRef(href, label)) { + return null; + } const safeHref = escapeHtmlAttr(href); return { start: link.start, @@ -55,7 +109,7 @@ function renderTelegramHtml(ir: MarkdownIR): string { export function markdownToTelegramHtml( markdown: string, - options: { tableMode?: MarkdownTableMode } = {}, + options: { tableMode?: MarkdownTableMode; wrapFileRefs?: boolean } = {}, ): string { const ir = markdownToIR(markdown ?? "", { linkify: true, @@ -64,7 +118,154 @@ export function markdownToTelegramHtml( blockquotePrefix: "", tableMode: options.tableMode, }); - return renderTelegramHtml(ir); + const html = renderTelegramHtml(ir); + // Apply file reference wrapping if requested (for chunked rendering) + if (options.wrapFileRefs !== false) { + return wrapFileReferencesInHtml(html); + } + return html; +} + +/** + * Wraps standalone file references (with TLD extensions) in tags. + * This prevents Telegram from treating them as URLs and generating + * irrelevant domain registrar previews. + * + * Runs AFTER markdown→HTML conversion to avoid modifying HTML attributes. + * Skips content inside ,
<code>, <pre>, and <a> tags to avoid nesting issues.
+ */
+/**
+ * Returns `str` with every regular-expression metacharacter backslash-escaped,
+ * so the result matches the input literally when embedded in a RegExp source.
+ *
+ * @param str - Raw text to neutralize.
+ * @returns The text with each metacharacter preceded by a backslash.
+ */
+function escapeRegex(str: string): string {
+  const metacharacters = /[.*+?^${}()|[\]\\]/g;
+  // "$&" stands for the matched character itself, so each hit is re-emitted behind a backslash.
+  const escaped = str.replace(metacharacters, "\\$&");
+  return escaped;
+}
+
+export function wrapFileReferencesInHtml(html: string): string {
+  // Build regex pattern for all tracked extensions (escape metacharacters for safety)
+  const extensionsPattern = Array.from(FILE_EXTENSIONS_WITH_TLD).map(escapeRegex).join("|");
+
+  // Safety-net: de-linkify auto-generated anchors where href="http://Link';
+    const result = wrapFileReferencesInHtml(input);
+    expect(result).toBe(input);
+  });
+
+  it("does not wrap file refs inside real URL anchor tags", () => {
+    // A genuine link whose label ends in a tracked extension must come back untouched.
+    const input = 'Visit <a href="https://example.com/README.md">example.com/README.md</a>';
+    const result = wrapFileReferencesInHtml(input);
+    expect(result).toBe(input);
+  });
+
+  it("handles mixed content correctly", () => {
+    // Each bare file reference in the sentence should get its own <code> wrapper.
+    const result = wrapFileReferencesInHtml("Check README.md and CONTRIBUTING.md");
+    expect(result).toContain("<code>README.md</code>");
+    expect(result).toContain("<code>CONTRIBUTING.md</code>");
+  });
+
+  it("handles edge cases", () => {
+    // No file reference: no <code> tag may be introduced at all.
+    expect(wrapFileReferencesInHtml("No markdown files here")).not.toContain("<code>");
+    // References touching the start or end of the string are still wrapped.
+    expect(wrapFileReferencesInHtml("File.md at start")).toContain("<code>File.md</code>");
+    expect(wrapFileReferencesInHtml("Ends with file.md")).toContain("<code>file.md</code>");
+  });
+
+  it("wraps file refs with punctuation boundaries", () => {
+    // Adjacent punctuation stays outside the <code> wrapper.
+    expect(wrapFileReferencesInHtml("See README.md.")).toContain("<code>README.md</code>.");
+    expect(wrapFileReferencesInHtml("See README.md,")).toContain("<code>README.md</code>,");
+    expect(wrapFileReferencesInHtml("(README.md)")).toContain("(<code>README.md</code>)");
+    expect(wrapFileReferencesInHtml("README.md:")).toContain("<code>README.md</code>:");
+  });
+
+  it("de-linkifies auto-linkified file ref anchors", () => {
+    // An anchor whose href is just "http://" + its own label is an auto-link artifact:
+    // the anchor is dropped and the label is emitted as <code> instead.
+    const input = '<a href="http://README.md">README.md</a>';
+    expect(wrapFileReferencesInHtml(input)).toBe("<code>README.md</code>");
+  });
+
+  it("de-linkifies auto-linkified path anchors", () => {
+    // Same artifact as a bare filename, but with a multi-segment relative path as the label.
+    const input = '<a href="http://squad/friday/HEARTBEAT.md">squad/friday/HEARTBEAT.md</a>';
+    expect(wrapFileReferencesInHtml(input)).toBe("<code>squad/friday/HEARTBEAT.md</code>");
+  });
+
+  it("preserves explicit links where label differs from href", () => {
+    // The label is not the href, so the author wrote this link on purpose; keep it verbatim.
+    const input = '<a href="http://README.md">click here</a>';
+    expect(wrapFileReferencesInHtml(input)).toBe(input);
+  });
+
+  it("wraps file ref after closing anchor tag", () => {
+    // Wrapping must resume once the anchor has been closed.
+    const input = '<a href="https://example.com">link</a> then README.md';
+    const result = wrapFileReferencesInHtml(input);
+    expect(result).toContain("</a> then <code>README.md</code>");
+  });
+});
+
+// renderTelegramHtmlText: bare file references are <code>-wrapped when the input is markdown,
+// and left alone when the caller supplies its own HTML.
+describe("renderTelegramHtmlText - file reference wrapping", () => {
+  it("wraps file references in markdown mode", () => {
+    const result = renderTelegramHtmlText("Check README.md");
+    expect(result).toContain("<code>README.md</code>");
+  });
+
+  it("does not wrap in HTML mode (trusts caller markup)", () => {
+    // textMode: "html" should pass through unchanged - caller owns the markup
+    const result = renderTelegramHtmlText("Check README.md", { textMode: "html" });
+    expect(result).toBe("Check README.md");
+    expect(result).not.toContain("<code>");
+  });
+
+  it("does not double-wrap already code-formatted content", () => {
+    const result = renderTelegramHtmlText("Already `wrapped.md` here");
+    // Should have code tags but not nested
+    expect(result).toContain("<code>wrapped.md</code>");
+    expect(result).not.toContain("<code><code>");
+  });
+});
+
+// markdownToTelegramHtml: wrapping is on by default, can be disabled with wrapFileRefs: false,
+// and must never replace a real or explicitly written link.
+describe("markdownToTelegramHtml - file reference wrapping", () => {
+  it("wraps file references by default", () => {
+    const result = markdownToTelegramHtml("Check README.md");
+    expect(result).toContain("<code>README.md</code>");
+  });
+
+  it("can skip wrapping when requested", () => {
+    const result = markdownToTelegramHtml("Check README.md", { wrapFileRefs: false });
+    // The filename itself is still present; only the <code> wrapper must be absent.
+    expect(result).not.toContain("<code>README.md</code>");
+  });
+
+  it("wraps multiple file types in a single message", () => {
+    const result = markdownToTelegramHtml("Edit main.go and script.py");
+    expect(result).toContain("<code>main.go</code>");
+    expect(result).toContain("<code>script.py</code>");
+  });
+
+  it("preserves real URLs as anchor tags", () => {
+    const result = markdownToTelegramHtml("Visit https://example.com");
+    expect(result).toContain('<a href="https://example.com">');
+  });
+
+  it("preserves explicit markdown links even when href looks like a file ref", () => {
+    // The label ("docs") differs from the href, so this is an intentional link.
+    const result = markdownToTelegramHtml("[docs](http://README.md)");
+    expect(result).toContain('<a href="http://README.md">docs</a>');
+  });
+
+  it("wraps file ref after real URL in same message", () => {
+    const result = markdownToTelegramHtml("Visit https://example.com and README.md");
+    expect(result).toContain('<a href="https://example.com">');
+    expect(result).toContain("<code>README.md</code>");
+  });
+});
+
+// markdownToTelegramChunks: the html of each chunk gets the same <code> wrapping as the
+// single-shot renderer.
+describe("markdownToTelegramChunks - file reference wrapping", () => {
+  it("wraps file references in chunked output", () => {
+    const chunks = markdownToTelegramChunks("Check README.md and backup.sh", 4096);
+    expect(chunks.length).toBeGreaterThan(0);
+    expect(chunks[0].html).toContain("<code>README.md</code>");
+    expect(chunks[0].html).toContain("<code>backup.sh</code>");
+  });
+});
+
+describe("edge cases", () => {
+  it("wraps file ref inside bold tags", () => { // feeds a bold-only markdown string through markdownToTelegramHtml
+    const result = markdownToTelegramHtml("**README.md**");
+    expect(result).toBe("README.md"); // NOTE(review): literal has no markup; presumably bold + <code> tags were lost here - confirm
+  });
+
+  it("wraps file ref inside italic tags", () => { // feeds an italic-only markdown string through markdownToTelegramHtml
+    const result = markdownToTelegramHtml("*script.py*");
+    expect(result).toBe("script.py"); // NOTE(review): literal has no markup; presumably italic + <code> tags were lost here - confirm
+  });
+
+  it("does not wrap inside fenced code blocks", () => {
+    const result = markdownToTelegramHtml("```\nREADME.md\n```");
+    expect(result).toBe("
README.md\n
"); + expect(result).not.toContain(""); + }); + + it("preserves domain-like paths as anchor tags", () => { + const result = markdownToTelegramHtml("example.com/README.md"); + expect(result).toContain('
'); + expect(result).not.toContain(""); + }); + + it("preserves github URLs with file paths", () => { + const result = markdownToTelegramHtml("https://github.com/foo/README.md"); + expect(result).toContain(''); + }); + + it("handles wrapFileRefs: false (plain text output)", () => { + const result = markdownToTelegramHtml("README.md", { wrapFileRefs: false }); + // buildTelegramLink returns null, so no tag; wrapFileRefs: false skips + expect(result).toBe("README.md"); + }); + + it("wraps supported TLD extensions (.am, .at, .be, .cc)", () => { + const result = markdownToTelegramHtml("Makefile.am and code.at and app.be and main.cc"); + expect(result).toContain("Makefile.am"); + expect(result).toContain("code.at"); + expect(result).toContain("app.be"); + expect(result).toContain("main.cc"); + }); + + it("does not wrap popular domain TLDs (.ai, .io, .tv, .fm)", () => { + // These are commonly used as real domains (x.ai, vercel.io, github.io) + const result = markdownToTelegramHtml("Check x.ai and vercel.io and app.tv and radio.fm"); + // Should be links, not code + expect(result).toContain(''); + expect(result).toContain(''); + expect(result).toContain(''); + expect(result).toContain(''); + }); + + it("keeps .co domains as links", () => { + const result = markdownToTelegramHtml("Visit t.co and openclaw.co"); + expect(result).toContain(''); + expect(result).toContain(''); + expect(result).not.toContain("t.co"); + expect(result).not.toContain("openclaw.co"); + }); + + it("does not wrap non-TLD extensions", () => { + const result = markdownToTelegramHtml("image.png and style.css and script.js"); + expect(result).not.toContain("image.png"); + expect(result).not.toContain("style.css"); + expect(result).not.toContain("script.js"); + }); + + it("handles file ref at start of message", () => { + const result = markdownToTelegramHtml("README.md is important"); + expect(result).toBe("README.md is important"); + }); + + it("handles file ref at end of message", () => { + const 
result = markdownToTelegramHtml("Check the README.md"); + expect(result).toBe("Check the README.md"); + }); + + it("handles multiple file refs in sequence", () => { + const result = markdownToTelegramHtml("README.md CHANGELOG.md LICENSE.md"); + expect(result).toContain("README.md"); + expect(result).toContain("CHANGELOG.md"); + expect(result).toContain("LICENSE.md"); + }); + + it("handles nested path without domain-like segments", () => { + const result = markdownToTelegramHtml("src/utils/helpers/format.go"); + expect(result).toContain("src/utils/helpers/format.go"); + }); + + it("wraps path with version-like segment (not a domain)", () => { + // v1.0/README.md is not linkified by markdown-it (no TLD), so it's wrapped + const result = markdownToTelegramHtml("v1.0/README.md"); + expect(result).toContain("v1.0/README.md"); + }); + + it("preserves domain path with version segment", () => { + // example.com/v1.0/README.md IS linkified (has domain), preserved as link + const result = markdownToTelegramHtml("example.com/v1.0/README.md"); + expect(result).toContain(''); + }); + + it("handles file ref with hyphen and underscore in name", () => { + const result = markdownToTelegramHtml("my-file_name.md"); + expect(result).toContain("my-file_name.md"); + }); + + it("handles uppercase extensions", () => { + const result = markdownToTelegramHtml("README.MD and SCRIPT.PY"); + expect(result).toContain("README.MD"); + expect(result).toContain("SCRIPT.PY"); + }); + + it("handles nested code tags (depth tracking)", () => { + // Nested inside
 - should not wrap inner content
+    const input = "
README.md
then script.py"; + const result = wrapFileReferencesInHtml(input); + expect(result).toBe("
README.md
then script.py"); + }); + + it("handles multiple anchor tags in sequence", () => { + const input = + '
link1 README.md link2 script.py'; + const result = wrapFileReferencesInHtml(input); + expect(result).toContain(" README.md script.py"); + }); + + it("handles auto-linked anchor with backreference match", () => { + // The regex uses \1 backreference - href must equal label + const input = 'README.md'; + expect(wrapFileReferencesInHtml(input)).toBe("README.md"); + }); + + it("preserves anchor when href and label differ (no backreference match)", () => { + // Different href and label - should NOT de-linkify + const input = 'README.md'; + expect(wrapFileReferencesInHtml(input)).toBe(input); + }); + + it("wraps orphaned TLD pattern after special character", () => { + // R&D.md - the & breaks the main pattern, but D.md could be auto-linked + // So we wrap the orphaned D.md part to prevent Telegram linking it + const input = "R&D.md"; + const result = wrapFileReferencesInHtml(input); + expect(result).toBe("R&D.md"); + }); + + it("wraps orphaned single-letter TLD patterns", () => { + // Use extensions still in the set (md, sh, py, go) + const result1 = wrapFileReferencesInHtml("X.md is cool"); + expect(result1).toContain("X.md"); + + const result2 = wrapFileReferencesInHtml("Check R.sh"); + expect(result2).toContain("R.sh"); + }); + + it("does not match filenames containing angle brackets", () => { + // The regex character class [a-zA-Z0-9_.\\-./] doesn't include < > + // so these won't be matched and wrapped (which is correct/safe) + const input = "file