Add enhanced text sanitization (#83)
* Add enhanced text sanitization
* Format code with prettier
* Refactor tests to remove redundancy and improve structure
  - Remove redundant 'mixed input patterns' test from sanitizer.test.ts
  - Consolidate integration tests into 2 focused real-world scenarios
  - Add HTML comment stripping to sanitizeContent function
  - Update test expectations to match sanitization behavior
  - Maintain full coverage with fewer, more focused tests
* Fix prettier formatting
* Remove rendered.html from repository
* Remove test-markdown.json and update .gitignore
* Revert .gitignore changes
This commit is contained in:
@@ -6,10 +6,7 @@ import type {
|
||||
GitHubReview,
|
||||
} from "../types";
|
||||
import type { GitHubFileWithSHA } from "./fetcher";
|
||||
|
||||
/**
 * Removes every HTML comment (`<!-- ... -->`) from the given text.
 *
 * The match is non-greedy and spans newlines, so each comment is removed
 * individually and multi-line comments are handled. A `<!--` without a
 * closing `-->` is left untouched.
 *
 * @param text - Raw text that may contain HTML comments.
 * @returns The text with all complete HTML comments removed.
 */
export function stripHtmlComments(text: string): string {
  return text.replace(/<!--[\s\S]*?-->/g, "");
}
|
||||
import { sanitizeContent } from "../utils/sanitizer";
|
||||
|
||||
export function formatContext(
|
||||
contextData: GitHubPullRequest | GitHubIssue,
|
||||
@@ -37,13 +34,14 @@ export function formatBody(
|
||||
body: string,
|
||||
imageUrlMap: Map<string, string>,
|
||||
): string {
|
||||
let processedBody = stripHtmlComments(body);
|
||||
let processedBody = body;
|
||||
|
||||
// Replace image URLs with local paths
|
||||
for (const [originalUrl, localPath] of imageUrlMap) {
|
||||
processedBody = processedBody.replaceAll(originalUrl, localPath);
|
||||
}
|
||||
|
||||
processedBody = sanitizeContent(processedBody);
|
||||
|
||||
return processedBody;
|
||||
}
|
||||
|
||||
@@ -53,15 +51,16 @@ export function formatComments(
|
||||
): string {
|
||||
return comments
|
||||
.map((comment) => {
|
||||
let body = stripHtmlComments(comment.body);
|
||||
let body = comment.body;
|
||||
|
||||
// Replace image URLs with local paths if we have a mapping
|
||||
if (imageUrlMap && body) {
|
||||
for (const [originalUrl, localPath] of imageUrlMap) {
|
||||
body = body.replaceAll(originalUrl, localPath);
|
||||
}
|
||||
}
|
||||
|
||||
body = sanitizeContent(body);
|
||||
|
||||
return `[${comment.author.login} at ${comment.createdAt}]: ${body}`;
|
||||
})
|
||||
.join("\n\n");
|
||||
@@ -78,6 +77,19 @@ export function formatReviewComments(
|
||||
const formattedReviews = reviewData.nodes.map((review) => {
|
||||
let reviewOutput = `[Review by ${review.author.login} at ${review.submittedAt}]: ${review.state}`;
|
||||
|
||||
if (review.body && review.body.trim()) {
|
||||
let body = review.body;
|
||||
|
||||
if (imageUrlMap) {
|
||||
for (const [originalUrl, localPath] of imageUrlMap) {
|
||||
body = body.replaceAll(originalUrl, localPath);
|
||||
}
|
||||
}
|
||||
|
||||
const sanitizedBody = sanitizeContent(body);
|
||||
reviewOutput += `\n${sanitizedBody}`;
|
||||
}
|
||||
|
||||
if (
|
||||
review.comments &&
|
||||
review.comments.nodes &&
|
||||
@@ -85,15 +97,16 @@ export function formatReviewComments(
|
||||
) {
|
||||
const comments = review.comments.nodes
|
||||
.map((comment) => {
|
||||
let body = stripHtmlComments(comment.body);
|
||||
let body = comment.body;
|
||||
|
||||
// Replace image URLs with local paths if we have a mapping
|
||||
if (imageUrlMap) {
|
||||
for (const [originalUrl, localPath] of imageUrlMap) {
|
||||
body = body.replaceAll(originalUrl, localPath);
|
||||
}
|
||||
}
|
||||
|
||||
body = sanitizeContent(body);
|
||||
|
||||
return ` [Comment on ${comment.path}:${comment.line || "?"}]: ${body}`;
|
||||
})
|
||||
.join("\n");
|
||||
|
||||
65
src/github/utils/sanitizer.ts
Normal file
65
src/github/utils/sanitizer.ts
Normal file
@@ -0,0 +1,65 @@
|
||||
/**
 * Removes characters that are invisible when rendered but still present in
 * the text.
 *
 * Tab (U+0009), line feed (U+000A) and carriage return (U+000D) are kept.
 *
 * @param content - Text to clean.
 * @returns The text without the invisible characters listed below.
 */
export function stripInvisibleCharacters(content: string): string {
  // Zero-width space / non-joiner / joiner and the BOM (zero-width no-break space).
  content = content.replace(/[\u200B\u200C\u200D\uFEFF]/g, "");
  // C0 control characters (except tab, LF, CR), DEL, and the C1 control range.
  content = content.replace(
    /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F]/g,
    "",
  );
  // Soft hyphen.
  content = content.replace(/\u00AD/g, "");
  // Bidirectional embedding/override and isolate controls.
  content = content.replace(/[\u202A-\u202E\u2066-\u2069]/g, "");
  return content;
}
|
||||
|
||||
/**
 * Blanks the alt text of Markdown images, turning `![alt](url)` into
 * `![](url)`.
 *
 * Only the `![...](` prefix is rewritten; the URL and anything after it are
 * left as they are.
 *
 * @param content - Markdown text.
 * @returns The text with every image alt text emptied.
 */
export function stripMarkdownImageAltText(content: string): string {
  return content.replace(/!\[[^\]]*\]\(/g, "![](");
}
|
||||
|
||||
/**
 * Removes the optional title from Markdown links and images, turning
 * `[text](url "title")` or `[text](url 'title')` into `[text](url)`.
 *
 * @param content - Markdown text.
 * @returns The text with double- and single-quoted link titles removed.
 */
export function stripMarkdownLinkTitles(content: string): string {
  // `$1` keeps the `[text](url` part; the whitespace and quoted title are dropped.
  content = content.replace(/(\[[^\]]*\]\([^)]+)\s+"[^"]*"/g, "$1");
  content = content.replace(/(\[[^\]]*\]\([^)]+)\s+'[^']*'/g, "$1");
  return content;
}
|
||||
|
||||
/**
 * Patterns for attributes whose values are not shown in rendered output.
 * Each attribute has a quoted form followed by an unquoted fallback; the
 * order matters because the patterns are applied in sequence.
 *
 * A quoted value must be closed by the same quote character that opened it,
 * so a value such as `"it's hidden"` is removed in full rather than being
 * cut at the apostrophe.
 */
const HIDDEN_ATTRIBUTE_PATTERNS: readonly RegExp[] = [
  /\salt\s*=\s*(?:"[^"]*"|'[^']*')/gi,
  /\salt\s*=\s*[^\s>]+/gi,
  /\stitle\s*=\s*(?:"[^"]*"|'[^']*')/gi,
  /\stitle\s*=\s*[^\s>]+/gi,
  /\saria-label\s*=\s*(?:"[^"]*"|'[^']*')/gi,
  /\saria-label\s*=\s*[^\s>]+/gi,
  /\sdata-[a-zA-Z0-9-]+\s*=\s*(?:"[^"]*"|'[^']*')/gi,
  /\sdata-[a-zA-Z0-9-]+\s*=\s*[^\s>]+/gi,
  /\splaceholder\s*=\s*(?:"[^"]*"|'[^']*')/gi,
  /\splaceholder\s*=\s*[^\s>]+/gi,
];

/**
 * Removes `alt`, `title`, `aria-label`, `data-*` and `placeholder`
 * attributes (with their values) from the text.
 *
 * Matching is case-insensitive and requires whitespace before the attribute
 * name. Both quoted and unquoted values are handled.
 *
 * @param content - Text that may contain HTML tags.
 * @returns The text with those attributes removed.
 */
export function stripHiddenAttributes(content: string): string {
  for (const pattern of HIDDEN_ATTRIBUTE_PATTERNS) {
    content = content.replace(pattern, "");
  }
  return content;
}
|
||||
|
||||
/**
 * Resolves numeric HTML character references.
 *
 * Decimal (`&#72;`) and hexadecimal (`&#x48;` / `&#X48;`) references that
 * denote printable ASCII (code points 32-126) are replaced by the character
 * itself; every other numeric reference is removed. Named entities such as
 * `&amp;` are left unchanged.
 *
 * Decimal references are processed first, then hexadecimal ones.
 *
 * @param content - Text that may contain numeric character references.
 * @returns The text with numeric references decoded or removed.
 */
export function normalizeHtmlEntities(content: string): string {
  // Printable ASCII is decoded; anything else is dropped.
  const decodePrintable = (codePoint: number): string =>
    codePoint >= 32 && codePoint <= 126 ? String.fromCharCode(codePoint) : "";

  content = content.replace(/&#(\d+);/g, (_match: string, dec: string) =>
    decodePrintable(parseInt(dec, 10)),
  );
  // The hex marker may be written as `x` or `X`.
  content = content.replace(
    /&#[xX]([0-9a-fA-F]+);/g,
    (_match: string, hex: string) => decodePrintable(parseInt(hex, 16)),
  );
  return content;
}
|
||||
|
||||
/**
 * Runs the full sanitization pipeline over a piece of text.
 *
 * Steps, in order: strip HTML comments, strip invisible characters, blank
 * Markdown image alt text, strip Markdown link titles, strip hidden HTML
 * attributes, and normalize numeric HTML entities.
 *
 * @param content - Raw text to sanitize.
 * @returns The sanitized text.
 */
export function sanitizeContent(content: string): string {
  content = stripHtmlComments(content);
  content = stripInvisibleCharacters(content);
  content = stripMarkdownImageAltText(content);
  content = stripMarkdownLinkTitles(content);
  content = stripHiddenAttributes(content);
  content = normalizeHtmlEntities(content);
  return content;
}
|
||||
|
||||
/**
 * Removes every HTML comment (`<!-- ... -->`) from the given text.
 *
 * The match is non-greedy and spans newlines, so each comment is removed
 * individually and multi-line comments are handled. A `<!--` without a
 * closing `-->` is left untouched.
 *
 * @param content - Text that may contain HTML comments.
 * @returns The text with all complete HTML comments removed.
 */
export const stripHtmlComments = (content: string): string =>
  content.replace(/<!--[\s\S]*?-->/g, "");
|
||||
Reference in New Issue
Block a user