Add enhanced text sanitization (#83)

* Add enhanced text sanitization

* Format code with prettier

* Refactor tests to remove redundancy and improve structure

- Remove redundant 'mixed input patterns' test from sanitizer.test.ts
- Consolidate integration tests into 2 focused real-world scenarios
- Add HTML comment stripping to sanitizeContent function
- Update test expectations to match sanitization behavior
- Maintain full coverage with fewer, more focused tests

* Fix prettier formatting

* Remove rendered.html from repository

* Remove test-markdown.json and update .gitignore

* Revert .gitignore changes
This commit is contained in:
Lina Tawfik
2025-05-29 16:35:50 -07:00
committed by GitHub
parent fb7365fba9
commit 35ad5fc467
6 changed files with 498 additions and 175 deletions

View File

@@ -6,10 +6,7 @@ import type {
GitHubReview,
} from "../types";
import type { GitHubFileWithSHA } from "./fetcher";
export function stripHtmlComments(text: string): string {
return text.replace(/<!--[\s\S]*?-->/g, "");
}
import { sanitizeContent } from "../utils/sanitizer";
export function formatContext(
contextData: GitHubPullRequest | GitHubIssue,
@@ -37,13 +34,14 @@ export function formatBody(
body: string,
imageUrlMap: Map<string, string>,
): string {
let processedBody = stripHtmlComments(body);
let processedBody = body;
// Replace image URLs with local paths
for (const [originalUrl, localPath] of imageUrlMap) {
processedBody = processedBody.replaceAll(originalUrl, localPath);
}
processedBody = sanitizeContent(processedBody);
return processedBody;
}
@@ -53,15 +51,16 @@ export function formatComments(
): string {
return comments
.map((comment) => {
let body = stripHtmlComments(comment.body);
let body = comment.body;
// Replace image URLs with local paths if we have a mapping
if (imageUrlMap && body) {
for (const [originalUrl, localPath] of imageUrlMap) {
body = body.replaceAll(originalUrl, localPath);
}
}
body = sanitizeContent(body);
return `[${comment.author.login} at ${comment.createdAt}]: ${body}`;
})
.join("\n\n");
@@ -78,6 +77,19 @@ export function formatReviewComments(
const formattedReviews = reviewData.nodes.map((review) => {
let reviewOutput = `[Review by ${review.author.login} at ${review.submittedAt}]: ${review.state}`;
if (review.body && review.body.trim()) {
let body = review.body;
if (imageUrlMap) {
for (const [originalUrl, localPath] of imageUrlMap) {
body = body.replaceAll(originalUrl, localPath);
}
}
const sanitizedBody = sanitizeContent(body);
reviewOutput += `\n${sanitizedBody}`;
}
if (
review.comments &&
review.comments.nodes &&
@@ -85,15 +97,16 @@ export function formatReviewComments(
) {
const comments = review.comments.nodes
.map((comment) => {
let body = stripHtmlComments(comment.body);
let body = comment.body;
// Replace image URLs with local paths if we have a mapping
if (imageUrlMap) {
for (const [originalUrl, localPath] of imageUrlMap) {
body = body.replaceAll(originalUrl, localPath);
}
}
body = sanitizeContent(body);
return ` [Comment on ${comment.path}:${comment.line || "?"}]: ${body}`;
})
.join("\n");

View File

@@ -0,0 +1,65 @@
/**
 * Deletes characters that take up no visible space in the text.
 *
 * Four character classes are removed, one after another: zero-width
 * characters and the byte-order mark, C0/C1 control characters (tab, line
 * feed and carriage return are kept), the soft hyphen, and bidirectional
 * embedding/override/isolate controls.
 *
 * @param content - Text to clean.
 * @returns The text with every listed character removed.
 */
export function stripInvisibleCharacters(content: string): string {
  const invisiblePatterns: readonly RegExp[] = [
    // Zero-width space / non-joiner / joiner and the BOM.
    /[\u200B\u200C\u200D\uFEFF]/g,
    // Control characters, skipping \t (0009), \n (000A) and \r (000D).
    /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F]/g,
    // Soft hyphen.
    /\u00AD/g,
    // Bidirectional formatting controls.
    /[\u202A-\u202E\u2066-\u2069]/g,
  ];
  return invisiblePatterns.reduce(
    (text, pattern) => text.replace(pattern, ""),
    content,
  );
}
/**
 * Empties the alt text of Markdown images, turning `![anything](` into `![](`.
 * The image target that follows the opening parenthesis is left untouched.
 *
 * @param content - Markdown text to process.
 * @returns The text with every image alt text removed.
 */
export function stripMarkdownImageAltText(content: string): string {
  const imageOpener = /!\[[^\]]*\]\(/g;
  const emptyAltOpener = "![](";
  return content.replace(imageOpener, emptyAltOpener);
}
/**
 * Drops the optional title from Markdown links, so `[text](url "title")`
 * and `[text](url 'title')` both become `[text](url)`.
 *
 * @param content - Markdown text to process.
 * @returns The text with link titles removed.
 */
export function stripMarkdownLinkTitles(content: string): string {
  // Double-quoted titles are handled first, then single-quoted ones.
  const doubleQuotedTitle = /(\[[^\]]*\]\([^)]+)\s+"[^"]*"/g;
  const singleQuotedTitle = /(\[[^\]]*\]\([^)]+)\s+'[^']*'/g;
  return content
    .replace(doubleQuotedTitle, "$1")
    .replace(singleQuotedTitle, "$1");
}
/**
 * Removes the `alt`, `title`, `aria-label`, `data-*` and `placeholder`
 * attributes (name and value) from HTML-like markup.
 *
 * Attribute names are matched case-insensitively and must be preceded by
 * whitespace. For each name, quoted values are removed first, then any
 * remaining unquoted value (a run of characters up to whitespace or `>`).
 *
 * @param content - Text that may contain HTML tags.
 * @returns The text with the listed attributes removed.
 */
export function stripHiddenAttributes(content: string): string {
  // Regex sources for the attribute names, in the order they are processed.
  const attributeNames = [
    "alt",
    "title",
    "aria-label",
    "data-[a-zA-Z0-9-]+",
    "placeholder",
  ];

  let result = content;
  for (const name of attributeNames) {
    // The closing quote must be the same character as the opening one.
    // Accepting either quote as the terminator would cut a value such as
    // title="it's hidden" at the apostrophe and leave the rest in the output.
    const quotedValue = new RegExp(
      `\\s${name}\\s*=\\s*(?:"[^"]*"|'[^']*')`,
      "gi",
    );
    // Fallback for unquoted (or unterminated) values.
    const bareValue = new RegExp(`\\s${name}\\s*=\\s*[^\\s>]+`, "gi");
    result = result.replace(quotedValue, "").replace(bareValue, "");
  }
  return result;
}
/**
 * Resolves numeric HTML character references.
 *
 * Decimal references (`&#72;`) are processed first, then hexadecimal ones
 * (`&#x48;`). A reference whose code lies in the printable ASCII range
 * (32-126) is replaced by that character; any other reference is deleted.
 *
 * @param content - Text that may contain numeric character references.
 * @returns The text with numeric references decoded or removed.
 */
export function normalizeHtmlEntities(content: string): string {
  // Printable ASCII is decoded; everything else collapses to nothing.
  const toPrintableAscii = (code: number): string =>
    code >= 32 && code <= 126 ? String.fromCharCode(code) : "";

  const decimalResolved = content.replace(
    /&#(\d+);/g,
    (_match, digits: string) => toPrintableAscii(parseInt(digits, 10)),
  );
  return decimalResolved.replace(
    /&#x([0-9a-fA-F]+);/g,
    (_match, digits: string) => toPrintableAscii(parseInt(digits, 16)),
  );
}
/**
 * Runs the full sanitization pipeline over a piece of text.
 *
 * Steps are applied in this fixed order: HTML comments, invisible
 * characters, Markdown image alt text, Markdown link titles, hidden HTML
 * attributes, numeric HTML entities.
 *
 * @param content - Raw text to sanitize.
 * @returns The text after all sanitization steps.
 */
export function sanitizeContent(content: string): string {
  const pipeline: ReadonlyArray<(text: string) => string> = [
    stripHtmlComments,
    stripInvisibleCharacters,
    stripMarkdownImageAltText,
    stripMarkdownLinkTitles,
    stripHiddenAttributes,
    normalizeHtmlEntities,
  ];
  let sanitized = content;
  for (const step of pipeline) {
    sanitized = step(sanitized);
  }
  return sanitized;
}
/**
 * Removes every `<!-- ... -->` HTML comment, including comments that span
 * several lines. A `<!--` with no closing `-->` is left in place.
 *
 * @param content - Text that may contain HTML comments.
 * @returns The text with complete HTML comments removed.
 */
export const stripHtmlComments = (content: string): string => {
  // Non-greedy so that each comment ends at its own nearest `-->`.
  const htmlComment = /<!--[\s\S]*?-->/g;
  return content.replace(htmlComment, "");
};