From 35ad5fc467613de80f5a32e77a4755f26083fcd9 Mon Sep 17 00:00:00 2001 From: Lina Tawfik Date: Thu, 29 May 2025 16:35:50 -0700 Subject: [PATCH] Add enhanced text sanitization (#83) * Add enhanced text sanitization * Format code with prettier * Refactor tests to remove redundancy and improve structure - Remove redundant 'mixed input patterns' test from sanitizer.test.ts - Consolidate integration tests into 2 focused real-world scenarios - Add HTML comment stripping to sanitizeContent function - Update test expectations to match sanitization behavior - Maintain full coverage with fewer, more focused tests * Fix prettier formatting * Remove rendered.html from repository * Remove test-markdown.json and update .gitignore * Revert .gitignore changes --- src/create-prompt/index.ts | 6 +- src/github/data/formatter.ts | 33 +++- src/github/utils/sanitizer.ts | 65 +++++++ test/data-formatter.test.ts | 176 ++--------------- test/integration-sanitization.test.ts | 134 +++++++++++++ test/sanitizer.test.ts | 259 ++++++++++++++++++++++++++ 6 files changed, 498 insertions(+), 175 deletions(-) create mode 100644 src/github/utils/sanitizer.ts create mode 100644 test/integration-sanitization.test.ts create mode 100644 test/sanitizer.test.ts diff --git a/src/create-prompt/index.ts b/src/create-prompt/index.ts index 2d8db0e..f9a69ef 100644 --- a/src/create-prompt/index.ts +++ b/src/create-prompt/index.ts @@ -9,8 +9,8 @@ import { formatComments, formatReviewComments, formatChangedFilesWithSHA, - stripHtmlComments, } from "../github/data/formatter"; +import { sanitizeContent } from "../github/utils/sanitizer"; import { isIssuesEvent, isIssueCommentEvent, @@ -436,14 +436,14 @@ ${ eventData.eventName === "pull_request_review") && eventData.commentBody ? ` -${stripHtmlComments(eventData.commentBody)} +${sanitizeContent(eventData.commentBody)} ` : "" } ${ context.directPrompt ? ` -${stripHtmlComments(context.directPrompt)} +${sanitizeContent(context.directPrompt)} ` : "" } diff --git a/src/github/data/formatter.ts b/src/github/data/formatter.ts index df7b485..3ecc579 100644 --- a/src/github/data/formatter.ts +++ b/src/github/data/formatter.ts @@ -6,10 +6,7 @@ import type { GitHubReview, } from "../types"; import type { GitHubFileWithSHA } from "./fetcher"; - -export function stripHtmlComments(text: string): string { - return text.replace(//g, ""); -} +import { sanitizeContent } from "../utils/sanitizer"; export function formatContext( contextData: GitHubPullRequest | GitHubIssue, @@ -37,13 +34,14 @@ export function formatBody( body: string, imageUrlMap: Map, ): string { - let processedBody = stripHtmlComments(body); + let processedBody = body; - // Replace image URLs with local paths for (const [originalUrl, localPath] of imageUrlMap) { processedBody = processedBody.replaceAll(originalUrl, localPath); } + processedBody = sanitizeContent(processedBody); + return processedBody; } @@ -53,15 +51,16 @@ export function formatComments( ): string { return comments .map((comment) => { - let body = stripHtmlComments(comment.body); + let body = comment.body; - // Replace image URLs with local paths if we have a mapping if (imageUrlMap && body) { for (const [originalUrl, localPath] of imageUrlMap) { body = body.replaceAll(originalUrl, localPath); } } + body = sanitizeContent(body); + return `[${comment.author.login} at ${comment.createdAt}]: ${body}`; }) .join("\n\n"); @@ -78,6 +77,19 @@ export function formatReviewComments( const formattedReviews = reviewData.nodes.map((review) => { let reviewOutput = `[Review by ${review.author.login} at ${review.submittedAt}]: ${review.state}`; + if (review.body && review.body.trim()) { + let body = review.body; + + if (imageUrlMap) { + for (const [originalUrl, localPath] of imageUrlMap) { + body = body.replaceAll(originalUrl, localPath); + } + } + + const sanitizedBody = sanitizeContent(body); + reviewOutput += `\n${sanitizedBody}`; + } + if ( review.comments && review.comments.nodes && @@ -85,15 +97,16 @@ export function formatReviewComments( ) { const comments = review.comments.nodes .map((comment) => { - let body = stripHtmlComments(comment.body); + let body = comment.body; - // Replace image URLs with local paths if we have a mapping if (imageUrlMap) { for (const [originalUrl, localPath] of imageUrlMap) { body = body.replaceAll(originalUrl, localPath); } } + body = sanitizeContent(body); + return ` [Comment on ${comment.path}:${comment.line || "?"}]: ${body}`; }) .join("\n"); diff --git a/src/github/utils/sanitizer.ts b/src/github/utils/sanitizer.ts new file mode 100644 index 0000000..ef5d3cc --- /dev/null +++ b/src/github/utils/sanitizer.ts @@ -0,0 +1,65 @@ +export function stripInvisibleCharacters(content: string): string { + content = content.replace(/[\u200B\u200C\u200D\uFEFF]/g, ""); + content = content.replace( + /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F]/g, + "", + ); + content = content.replace(/\u00AD/g, ""); + content = content.replace(/[\u202A-\u202E\u2066-\u2069]/g, ""); + return content; +} + +export function stripMarkdownImageAltText(content: string): string { + return content.replace(/!\[[^\]]*\]\(/g, "![]("); +} + +export function stripMarkdownLinkTitles(content: string): string { + content = content.replace(/(\[[^\]]*\]\([^)]+)\s+"[^"]*"/g, "$1"); + content = content.replace(/(\[[^\]]*\]\([^)]+)\s+'[^']*'/g, "$1"); + return content; +} + +export function stripHiddenAttributes(content: string): string { + content = content.replace(/\salt\s*=\s*["'][^"']*["']/gi, ""); + content = content.replace(/\salt\s*=\s*[^\s>]+/gi, ""); + content = content.replace(/\stitle\s*=\s*["'][^"']*["']/gi, ""); + content = content.replace(/\stitle\s*=\s*[^\s>]+/gi, ""); + content = content.replace(/\saria-label\s*=\s*["'][^"']*["']/gi, ""); + content = content.replace(/\saria-label\s*=\s*[^\s>]+/gi, ""); + content = content.replace(/\sdata-[a-zA-Z0-9-]+\s*=\s*["'][^"']*["']/gi, ""); + content = content.replace(/\sdata-[a-zA-Z0-9-]+\s*=\s*[^\s>]+/gi, ""); + content = content.replace(/\splaceholder\s*=\s*["'][^"']*["']/gi, ""); + content = content.replace(/\splaceholder\s*=\s*[^\s>]+/gi, ""); + return content; +} + +export function normalizeHtmlEntities(content: string): string { + content = content.replace(/&#(\d+);/g, (_, dec) => { + const num = parseInt(dec, 10); + if (num >= 32 && num <= 126) { + return String.fromCharCode(num); + } + return ""; + }); + content = content.replace(/&#x([0-9a-fA-F]+);/g, (_, hex) => { + const num = parseInt(hex, 16); + if (num >= 32 && num <= 126) { + return String.fromCharCode(num); + } + return ""; + }); + return content; +} + +export function sanitizeContent(content: string): string { + content = stripHtmlComments(content); + content = stripInvisibleCharacters(content); + content = stripMarkdownImageAltText(content); + content = stripMarkdownLinkTitles(content); + content = stripHiddenAttributes(content); + content = normalizeHtmlEntities(content); + return content; +} + +export const stripHtmlComments = (content: string) => + content.replace(//g, ""); diff --git a/test/data-formatter.test.ts b/test/data-formatter.test.ts index 1729732..3181032 100644 --- a/test/data-formatter.test.ts +++ b/test/data-formatter.test.ts @@ -6,7 +6,6 @@ import { formatReviewComments, formatChangedFiles, formatChangedFilesWithSHA, - stripHtmlComments, } from "../src/github/data/formatter"; import type { GitHubPullRequest, @@ -99,9 +98,9 @@ Some more text.`; const result = formatBody(body, imageUrlMap); expect(result) - .toBe(`Here is some text with an image: ![screenshot](/tmp/github-images/image-1234-0.png) + .toBe(`Here is some text with an image: ![](/tmp/github-images/image-1234-0.png) -And another one: ![another](/tmp/github-images/image-1234-1.jpg) +And another one: ![](/tmp/github-images/image-1234-1.jpg) Some more text.`); }); @@ -124,7 +123,7 @@ Some more text.`); ]); const result = formatBody(body, imageUrlMap); - expect(result).toBe("![image](https://example.com/image.png)"); + expect(result).toBe("![](https://example.com/image.png)"); }); test("handles multiple occurrences of same image", () => { @@ -139,8 +138,8 @@ Second: ![img](https://github.com/user-attachments/assets/test.png)`; ]); const result = formatBody(body, imageUrlMap); - expect(result).toBe(`First: ![img](/tmp/github-images/image-1234-0.png) -Second: ![img](/tmp/github-images/image-1234-0.png)`); + expect(result).toBe(`First: ![](/tmp/github-images/image-1234-0.png) +Second: ![](/tmp/github-images/image-1234-0.png)`); }); }); @@ -205,7 +204,7 @@ describe("formatComments", () => { const result = formatComments(comments, imageUrlMap); expect(result).toBe( - `[user1 at 2023-01-01T00:00:00Z]: Check out this screenshot: ![screenshot](/tmp/github-images/image-1234-0.png)\n\n[user2 at 2023-01-02T00:00:00Z]: Here's another image: ![bug](/tmp/github-images/image-1234-1.jpg)`, + `[user1 at 2023-01-01T00:00:00Z]: Check out this screenshot: ![](/tmp/github-images/image-1234-0.png)\n\n[user2 at 2023-01-02T00:00:00Z]: Here's another image: ![](/tmp/github-images/image-1234-1.jpg)`, ); }); @@ -233,7 +232,7 @@ describe("formatComments", () => { const result = formatComments(comments, imageUrlMap); expect(result).toBe( - `[user1 at 2023-01-01T00:00:00Z]: Two images: ![first](/tmp/github-images/image-1234-0.png) and ![second](/tmp/github-images/image-1234-1.png)`, + `[user1 at 2023-01-01T00:00:00Z]: Two images: ![](/tmp/github-images/image-1234-0.png) and ![](/tmp/github-images/image-1234-1.png)`, ); }); @@ -250,7 +249,7 @@ describe("formatComments", () => { const result = formatComments(comments); expect(result).toBe( - `[user1 at 2023-01-01T00:00:00Z]: Image: ![test](https://github.com/user-attachments/assets/test.png)`, + `[user1 at 2023-01-01T00:00:00Z]: Image: ![](https://github.com/user-attachments/assets/test.png)`, ); }); }); @@ -294,7 +293,7 @@ describe("formatReviewComments", () => { const result = formatReviewComments(reviewData); expect(result).toBe( - `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n [Comment on src/index.ts:42]: Nice implementation\n [Comment on src/utils.ts:?]: Consider adding error handling`, + `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nThis is a great PR! LGTM.\n [Comment on src/index.ts:42]: Nice implementation\n [Comment on src/utils.ts:?]: Consider adding error handling`, ); }); @@ -317,7 +316,7 @@ describe("formatReviewComments", () => { const result = formatReviewComments(reviewData); expect(result).toBe( - `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED`, + `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nLooks good to me!`, ); }); @@ -384,7 +383,7 @@ describe("formatReviewComments", () => { const result = formatReviewComments(reviewData); expect(result).toBe( - `[Review by reviewer1 at 2023-01-01T00:00:00Z]: CHANGES_REQUESTED\n\n[Review by reviewer2 at 2023-01-02T00:00:00Z]: APPROVED`, + `[Review by reviewer1 at 2023-01-01T00:00:00Z]: CHANGES_REQUESTED\nNeeds changes\n\n[Review by reviewer2 at 2023-01-02T00:00:00Z]: APPROVED\nLGTM`, ); }); @@ -438,7 +437,7 @@ describe("formatReviewComments", () => { const result = formatReviewComments(reviewData, imageUrlMap); expect(result).toBe( - `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n [Comment on src/index.ts:42]: Comment with image: ![comment-img](/tmp/github-images/image-1234-1.png)`, + `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nReview with image: ![](/tmp/github-images/image-1234-0.png)\n [Comment on src/index.ts:42]: Comment with image: ![](/tmp/github-images/image-1234-1.png)`, ); }); @@ -482,7 +481,7 @@ describe("formatReviewComments", () => { const result = formatReviewComments(reviewData, imageUrlMap); expect(result).toBe( - `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n [Comment on src/main.ts:15]: Two issues: ![issue1](/tmp/github-images/image-1234-0.png) and ![issue2](/tmp/github-images/image-1234-1.png)`, + `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nGood work\n [Comment on src/main.ts:15]: Two issues: ![](/tmp/github-images/image-1234-0.png) and ![](/tmp/github-images/image-1234-1.png)`, ); }); @@ -515,7 +514,7 @@ describe("formatReviewComments", () => { const result = formatReviewComments(reviewData); expect(result).toBe( - `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n [Comment on src/index.ts:42]: Image: ![test](https://github.com/user-attachments/assets/test.png)`, + `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\nReview body\n [Comment on src/index.ts:42]: Image: ![](https://github.com/user-attachments/assets/test.png)`, ); }); }); @@ -579,150 +578,3 @@ describe("formatChangedFilesWithSHA", () => { expect(result).toBe(""); }); }); - -describe("stripHtmlComments", () => { - test("strips simple HTML comments", () => { - const text = "Hello world"; - expect(stripHtmlComments(text)).toBe("Hello world"); - }); - - test("strips multiple HTML comments", () => { - const text = "Start middle end"; - expect(stripHtmlComments(text)).toBe("Start middle end"); - }); - - test("strips multi-line HTML comments", () => { - const text = `Line 1 - -Line 2`; - expect(stripHtmlComments(text)).toBe(`Line 1 - -Line 2`); - }); - - test("strips nested comment-like content", () => { - const text = "Text still in comment --> after"; - // HTML doesn't support true nested comments - the first --> ends the comment - expect(stripHtmlComments(text)).toBe("Text still in comment --> after"); - }); - - test("handles empty string", () => { - expect(stripHtmlComments("")).toBe(""); - }); - - test("handles text without comments", () => { - const text = "No comments here!"; - expect(stripHtmlComments(text)).toBe("No comments here!"); - }); - - test("strips complex hidden content with XML tags", () => { - const text = `Normal request - -More normal text`; - expect(stripHtmlComments(text)).toBe(`Normal request - -More normal text`); - }); - - test("handles malformed comments - no closing", () => { - const text = "Text is not stripped - expect(stripHtmlComments(text)).toBe("Text comment"; - // Just --> without opening comment"); - }); - - test("preserves legitimate HTML-like content outside comments", () => { - const text = "Use the
tag and
closing tag"; - expect(stripHtmlComments(text)).toBe( - "Use the
tag and
closing tag", - ); - }); -}); - -describe("formatBody with HTML comment stripping", () => { - test("strips HTML comments from body", () => { - const body = "Issue description visible text"; - const imageUrlMap = new Map(); - - const result = formatBody(body, imageUrlMap); - expect(result).toBe("Issue description visible text"); - }); - - test("strips HTML comments and replaces images", () => { - const body = `Check this ![img](https://github.com/user-attachments/assets/test.png)`; - const imageUrlMap = new Map([ - [ - "https://github.com/user-attachments/assets/test.png", - "/tmp/github-images/image-1234-0.png", - ], - ]); - - const result = formatBody(body, imageUrlMap); - expect(result).toBe( - "Check this ![img](/tmp/github-images/image-1234-0.png)", - ); - }); -}); - -describe("formatComments with HTML comment stripping", () => { - test("strips HTML comments from comment bodies", () => { - const comments: GitHubComment[] = [ - { - id: "1", - databaseId: "100001", - body: "Good work on this PR", - author: { login: "user1" }, - createdAt: "2023-01-01T00:00:00Z", - }, - ]; - - const result = formatComments(comments); - expect(result).toBe( - "[user1 at 2023-01-01T00:00:00Z]: Good work on this PR", - ); - }); -}); - -describe("formatReviewComments with HTML comment stripping", () => { - test("strips HTML comments from review comment bodies", () => { - const reviewData = { - nodes: [ - { - id: "review1", - databaseId: "300001", - author: { login: "reviewer1" }, - body: "LGTM", - state: "APPROVED", - submittedAt: "2023-01-01T00:00:00Z", - comments: { - nodes: [ - { - id: "comment1", - databaseId: "200001", - body: "Nice work here", - author: { login: "reviewer1" }, - createdAt: "2023-01-01T00:00:00Z", - path: "src/index.ts", - line: 42, - }, - ], - }, - }, - ], - }; - - const result = formatReviewComments(reviewData); - expect(result).toBe( - `[Review by reviewer1 at 2023-01-01T00:00:00Z]: APPROVED\n [Comment on src/index.ts:42]: Nice work here`, - ); - }); -}); diff --git a/test/integration-sanitization.test.ts b/test/integration-sanitization.test.ts new file mode 100644 index 0000000..13ba45a --- /dev/null +++ b/test/integration-sanitization.test.ts @@ -0,0 +1,134 @@ +import { describe, expect, it } from "bun:test"; +import { formatBody, formatComments } from "../src/github/data/formatter"; +import type { GitHubComment } from "../src/github/types"; + +describe("Sanitization Integration", () => { + it("should sanitize complete issue/PR body with various hidden content patterns", () => { + const issueBody = ` +# Feature Request: Add user dashboard + +## Description +We need a new dashboard for users to track their activity. + + + +## Technical Details +The dashboard should display: +- User statistics ![dashboard mockup with hidden​‌‍text](dashboard.png) +- Activity graphs example graph description +- Recent actions + +## Implementation Notes +See [documentation](https://docs.example.com "internal docs title") for API details. + +
+ The implementation should follow our standard patterns. +
+ +Additional notes: Text­with­soft­hyphens and Hidden encoded content. + + + +Direction override test: ‮reversed‬ text should be normalized.`; + + const imageUrlMap = new Map(); + const result = formatBody(issueBody, imageUrlMap); + + // Verify hidden content is removed + expect(result).not.toContain(" + +I've updated the proposal based on your suggestions. + +Test note: All systems checked. + +Ready for implementation`, + author: { login: "author1" }, + createdAt: "2023-01-01T12:00:00Z", + }, + ]; + + const result = formatComments(comments); + + // Verify hidden content is removed + expect(result).not.toContain(" + example alt text + ![example image description](screenshot.png) + [click here](https://example.com "example title") +
+ Normal text with hidden\u200Bcharacters +
+ Hidden message + `; + + const sanitized = sanitizeContent(testContent); + + expect(sanitized).not.toContain(""); + expect(sanitized).not.toContain("example alt text"); + expect(sanitized).not.toContain("example image description"); + expect(sanitized).not.toContain("example title"); + expect(sanitized).not.toContain("example data"); + expect(sanitized).not.toContain("example label"); + expect(sanitized).not.toContain("\u200B"); + expect(sanitized).not.toContain("alt="); + expect(sanitized).not.toContain("data-prompt="); + expect(sanitized).not.toContain("aria-label="); + + expect(sanitized).toContain("Normal text with hiddencharacters"); + expect(sanitized).toContain("Hidden message"); + expect(sanitized).toContain(''); + expect(sanitized).toContain("![](screenshot.png)"); + expect(sanitized).toContain("[click here](https://example.com)"); + }); + + it("should handle complex nested patterns", () => { + const complexContent = ` + Text with ![alt \u200B text](image.png) and more. + Link +
Content
+ `; + + const sanitized = sanitizeContent(complexContent); + + expect(sanitized).not.toContain("\u200B"); + expect(sanitized).not.toContain("\u00AD"); + expect(sanitized).not.toContain("alt "); + expect(sanitized).not.toContain('title="'); + expect(sanitized).not.toContain('data-x="'); + expect(sanitized).toContain("![](image.png)"); + expect(sanitized).toContain('Link'); + }); + + it("should preserve legitimate markdown and HTML", () => { + const legitimateContent = ` + # Heading + + This is **bold** and *italic* text. + + Here's a normal image: ![](normal.jpg) + And a normal link: [Click here](https://example.com) + +
+

Normal paragraph

+ +
+ `; + + const sanitized = sanitizeContent(legitimateContent); + + expect(sanitized).toBe(legitimateContent); + }); + + it("should handle entity-encoded text", () => { + const encodedText = ` + Hidden message +
Test
+ `; + + const sanitized = sanitizeContent(encodedText); + + expect(sanitized).toContain("Hidden message"); + expect(sanitized).not.toContain('title="'); + expect(sanitized).toContain("
Test
"); + }); +}); + +describe("stripHtmlComments (legacy)", () => { + it("should remove HTML comments", () => { + expect(stripHtmlComments("Hello World")).toBe( + "Hello World", + ); + expect(stripHtmlComments("Text")).toBe("Text"); + expect(stripHtmlComments("Text")).toBe("Text"); + }); + + it("should handle multiline comments", () => { + expect(stripHtmlComments("Hello World")).toBe( + "Hello World", + ); + }); +});