Add enhanced text sanitization (#83)

* Add enhanced text sanitization
* Format code with prettier
* Refactor tests to remove redundancy and improve structure
  - Remove redundant 'mixed input patterns' test from sanitizer.test.ts
  - Consolidate integration tests into 2 focused real-world scenarios
  - Add HTML comment stripping to sanitizeContent function
  - Update test expectations to match sanitization behavior
  - Maintain full coverage with fewer, more focused tests
* Fix prettier formatting
* Remove rendered.html from repository
* Remove test-markdown.json and update .gitignore
* Revert .gitignore changes
2025-05-29 16:35:50 -07:00
parent fb7365fba9
commit 35ad5fc467
6 changed files with 498 additions and 175 deletions
--- a/src/github/data/formatter.ts
+++ b/src/github/data/formatter.ts
@@ -6,10 +6,7 @@ import type {
  GitHubReview,
 } from "../types";
 import type { GitHubFileWithSHA } from "./fetcher";
-
-export function stripHtmlComments(text: string): string {
-  return text.replace(/<!--[\s\S]*?-->/g, "");
-}
+import { sanitizeContent } from "../utils/sanitizer";

 export function formatContext(
  contextData: GitHubPullRequest | GitHubIssue,
@@ -37,13 +34,14 @@ export function formatBody(
  body: string,
  imageUrlMap: Map<string, string>,
 ): string {
-  let processedBody = stripHtmlComments(body);
+  let processedBody = body;

-  // Replace image URLs with local paths
  for (const [originalUrl, localPath] of imageUrlMap) {
    processedBody = processedBody.replaceAll(originalUrl, localPath);
  }

+  processedBody = sanitizeContent(processedBody);
+
  return processedBody;
 }

@@ -53,15 +51,16 @@ export function formatComments(
 ): string {
  return comments
    .map((comment) => {
-      let body = stripHtmlComments(comment.body);
+      let body = comment.body;

-      // Replace image URLs with local paths if we have a mapping
      if (imageUrlMap && body) {
        for (const [originalUrl, localPath] of imageUrlMap) {
          body = body.replaceAll(originalUrl, localPath);
        }
      }

+      body = sanitizeContent(body);
+
      return `[${comment.author.login} at ${comment.createdAt}]: ${body}`;
    })
    .join("\n\n");
@@ -78,6 +77,19 @@ export function formatReviewComments(
  const formattedReviews = reviewData.nodes.map((review) => {
    let reviewOutput = `[Review by ${review.author.login} at ${review.submittedAt}]: ${review.state}`;

+    if (review.body && review.body.trim()) {
+      let body = review.body;
+
+      if (imageUrlMap) {
+        for (const [originalUrl, localPath] of imageUrlMap) {
+          body = body.replaceAll(originalUrl, localPath);
+        }
+      }
+
+      const sanitizedBody = sanitizeContent(body);
+      reviewOutput += `\n${sanitizedBody}`;
+    }
+
    if (
      review.comments &&
      review.comments.nodes &&
@@ -85,15 +97,16 @@ export function formatReviewComments(
    ) {
      const comments = review.comments.nodes
        .map((comment) => {
-          let body = stripHtmlComments(comment.body);
+          let body = comment.body;

-          // Replace image URLs with local paths if we have a mapping
          if (imageUrlMap) {
            for (const [originalUrl, localPath] of imageUrlMap) {
              body = body.replaceAll(originalUrl, localPath);
            }
          }

+          body = sanitizeContent(body);
+
          return `  [Comment on ${comment.path}:${comment.line || "?"}]: ${body}`;
        })
        .join("\n");
--- a/src/github/utils/sanitizer.ts
+++ b/src/github/utils/sanitizer.ts
@@ -0,0 +1,93 @@
+/** Matches HTML comments, including ones that span several lines. */
+const HTML_COMMENT = /<!--[\s\S]*?-->/g;
+
+/**
+ * Matches characters that render as nothing: zero-width and directional marks,
+ * the word joiner, the BOM, C0/C1 controls (tab, LF and CR are kept), the soft
+ * hyphen, bidi embedding/override/isolate controls and Unicode tag characters.
+ */
+const INVISIBLE_CHARACTERS =
+  /[\u200B-\u200F\u2060\uFEFF\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F\u00AD\u202A-\u202E\u2066-\u2069\u{E0000}-\u{E007F}]/gu;
+
+/**
+ * Matches an HTML attribute whose text a reader never sees (`alt`, `title`,
+ * `aria-label`, `placeholder`, `data-*`) together with its value. Double-quoted,
+ * single-quoted and unquoted values are separate alternatives so that a value
+ * such as `"it's"` is consumed up to its real closing quote.
+ */
+const HIDDEN_ATTRIBUTE =
+  /\s(?:alt|title|aria-label|placeholder|data-[a-zA-Z0-9-]+)\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)/gi;
+
+/** Matches decimal (`&#72;`) and hexadecimal (`&#x48;`, `&#X48;`) character references. */
+const NUMERIC_ENTITY = /&#(?:[xX]([0-9a-fA-F]+)|(\d+));/g;
+
+/** Removes every HTML comment from `content`. */
+export function stripHtmlComments(content: string): string {
+  return content.replace(HTML_COMMENT, "");
+}
+
+/** Removes characters that are invisible when rendered but still reach the reader of the raw text. */
+export function stripInvisibleCharacters(content: string): string {
+  return content.replace(INVISIBLE_CHARACTERS, "");
+}
+
+/** Empties the alt text of Markdown images: `![text](url)` becomes `![](url)`. */
+export function stripMarkdownImageAltText(content: string): string {
+  return content.replace(/!\[[^\]]*\]\(/g, "![](");
+}
+
+/**
+ * Drops the quoted title from Markdown links and images:
+ * `[text](url "title")` becomes `[text](url)`.
+ */
+export function stripMarkdownLinkTitles(content: string): string {
+  // Two passes on purpose: one alternation would stop after the first title and
+  // leave a differently quoted second one behind.
+  content = content.replace(/(\[[^\]]*\]\([^)]+)\s+"[^"]*"/g, "$1");
+  content = content.replace(/(\[[^\]]*\]\([^)]+)\s+'[^']*'/g, "$1");
+  return content;
+}
+
+/** Removes `alt`, `title`, `aria-label`, `placeholder` and `data-*` attributes with their values. */
+export function stripHiddenAttributes(content: string): string {
+  return content.replace(HIDDEN_ATTRIBUTE, "");
+}
+
+/**
+ * Decodes numeric character references for printable ASCII (32-126) and deletes
+ * every other numeric reference. Each reference is decoded exactly once, so
+ * `&#38;#60;` becomes the text `&#60;` rather than `<`.
+ */
+export function normalizeHtmlEntities(content: string): string {
+  return content.replace(
+    NUMERIC_ENTITY,
+    (_match, hex: string | undefined, dec: string | undefined) => {
+      const codePoint =
+        hex !== undefined ? parseInt(hex, 16) : parseInt(dec ?? "", 10);
+      return codePoint >= 32 && codePoint <= 126
+        ? String.fromCharCode(codePoint)
+        : "";
+    },
+  );
+}
+
+/**
+ * Runs every sanitizer over `content` and returns the cleaned text.
+ *
+ * The markup passes run a second time after entity decoding, because decoding
+ * can reveal markup (for example `&#60;!-- ... --&#62;`) that was encoded to
+ * slip past the first round.
+ */
+export function sanitizeContent(content: string): string {
+  content = stripHtmlComments(content);
+  content = stripInvisibleCharacters(content);
+  content = stripMarkdownImageAltText(content);
+  content = stripMarkdownLinkTitles(content);
+  content = stripHiddenAttributes(content);
+  content = normalizeHtmlEntities(content);
+  content = stripHtmlComments(content);
+  content = stripMarkdownImageAltText(content);
+  content = stripMarkdownLinkTitles(content);
+  content = stripHiddenAttributes(content);
+  return content;
+}