Add enhanced text sanitization (#83)
* Add enhanced text sanitization * Format code with prettier * Refactor tests to remove redundancy and improve structure - Remove redundant 'mixed input patterns' test from sanitizer.test.ts - Consolidate integration tests into 2 focused real-world scenarios - Add HTML comment stripping to sanitizeContent function - Update test expectations to match sanitization behavior - Maintain full coverage with fewer, more focused tests * Fix prettier formatting * Remove rendered.html from repository * Remove test-markdown.json and update .gitignore * Revert .gitignore changes
This commit is contained in:
134
test/integration-sanitization.test.ts
Normal file
134
test/integration-sanitization.test.ts
Normal file
@@ -0,0 +1,134 @@
|
||||
import { describe, expect, it } from "bun:test";
|
||||
import { formatBody, formatComments } from "../src/github/data/formatter";
|
||||
import type { GitHubComment } from "../src/github/types";
|
||||
|
||||
describe("Sanitization Integration", () => {
|
||||
it("should sanitize complete issue/PR body with various hidden content patterns", () => {
|
||||
const issueBody = `
|
||||
# Feature Request: Add user dashboard
|
||||
|
||||
## Description
|
||||
We need a new dashboard for users to track their activity.
|
||||
|
||||
<!-- HTML comment that should be removed -->
|
||||
|
||||
## Technical Details
|
||||
The dashboard should display:
|
||||
- User statistics 
|
||||
- Activity graphs <img alt="example graph description" src="graph.jpg">
|
||||
- Recent actions
|
||||
|
||||
## Implementation Notes
|
||||
See [documentation](https://docs.example.com "internal docs title") for API details.
|
||||
|
||||
<div data-instruction="example instruction" aria-label="dashboard label" title="hover text">
|
||||
The implementation should follow our standard patterns.
|
||||
</div>
|
||||
|
||||
Additional notes: Textwithsofthyphens and Hidden encoded content.
|
||||
|
||||
<input placeholder="search placeholder" type="text" />
|
||||
|
||||
Direction override test: reversed text should be normalized.`;
|
||||
|
||||
const imageUrlMap = new Map<string, string>();
|
||||
const result = formatBody(issueBody, imageUrlMap);
|
||||
|
||||
// Verify hidden content is removed
|
||||
expect(result).not.toContain("<!-- HTML comment");
|
||||
expect(result).not.toContain("hiddentext");
|
||||
expect(result).not.toContain("example graph description");
|
||||
expect(result).not.toContain("internal docs title");
|
||||
expect(result).not.toContain("example instruction");
|
||||
expect(result).not.toContain("dashboard label");
|
||||
expect(result).not.toContain("hover text");
|
||||
expect(result).not.toContain("search placeholder");
|
||||
expect(result).not.toContain("\u200B");
|
||||
expect(result).not.toContain("\u200C");
|
||||
expect(result).not.toContain("\u200D");
|
||||
expect(result).not.toContain("\u00AD");
|
||||
expect(result).not.toContain("\u202E");
|
||||
expect(result).not.toContain("H");
|
||||
|
||||
// Verify legitimate content is preserved
|
||||
expect(result).toContain("# Feature Request: Add user dashboard");
|
||||
expect(result).toContain("## Description");
|
||||
expect(result).toContain("We need a new dashboard");
|
||||
expect(result).toContain("User statistics");
|
||||
expect(result).toContain("");
|
||||
expect(result).toContain('<img src="graph.jpg">');
|
||||
expect(result).toContain("[documentation](https://docs.example.com)");
|
||||
expect(result).toContain(
|
||||
"The implementation should follow our standard patterns",
|
||||
);
|
||||
expect(result).toContain("Hidden encoded content");
|
||||
expect(result).toContain('<input type="text" />');
|
||||
});
|
||||
|
||||
it("should sanitize GitHub comments preserving discussion flow", () => {
|
||||
const comments: GitHubComment[] = [
|
||||
{
|
||||
id: "1",
|
||||
databaseId: "100001",
|
||||
body: `Great idea! Here are my thoughts:
|
||||
|
||||
1. We should consider the performance impact
|
||||
2. The UI mockup looks good: 
|
||||
3. Check the [API docs](https://api.example.com "api reference") for rate limits
|
||||
|
||||
<div aria-label="comment metadata" data-comment-type="review">
|
||||
This change would affect multiple systems.
|
||||
</div>
|
||||
|
||||
Note: Implementationshouldfollowbestpractices.`,
|
||||
author: { login: "reviewer1" },
|
||||
createdAt: "2023-01-01T10:00:00Z",
|
||||
},
|
||||
{
|
||||
id: "2",
|
||||
databaseId: "100002",
|
||||
body: `Thanks for the feedback!
|
||||
|
||||
<!-- Internal note: discussed with team -->
|
||||
|
||||
I've updated the proposal based on your suggestions.
|
||||
|
||||
Test note: All systems checked.
|
||||
|
||||
<span title="status update" data-status="approved">Ready for implementation</span>`,
|
||||
author: { login: "author1" },
|
||||
createdAt: "2023-01-01T12:00:00Z",
|
||||
},
|
||||
];
|
||||
|
||||
const result = formatComments(comments);
|
||||
|
||||
// Verify hidden content is removed
|
||||
expect(result).not.toContain("<!-- Internal note");
|
||||
expect(result).not.toContain("api reference");
|
||||
expect(result).not.toContain("comment metadata");
|
||||
expect(result).not.toContain('data-comment-type="review"');
|
||||
expect(result).not.toContain("status update");
|
||||
expect(result).not.toContain('data-status="approved"');
|
||||
expect(result).not.toContain("\u200B");
|
||||
expect(result).not.toContain("T");
|
||||
|
||||
// Verify discussion flow is preserved
|
||||
expect(result).toContain("Great idea! Here are my thoughts:");
|
||||
expect(result).toContain("1. We should consider the performance impact");
|
||||
expect(result).toContain("2. The UI mockup looks good: ");
|
||||
expect(result).toContain(
|
||||
"3. Check the [API docs](https://api.example.com)",
|
||||
);
|
||||
expect(result).toContain("This change would affect multiple systems.");
|
||||
expect(result).toContain("Implementationshouldfollowbestpractices");
|
||||
expect(result).toContain("Thanks for the feedback!");
|
||||
expect(result).toContain(
|
||||
"I've updated the proposal based on your suggestions.",
|
||||
);
|
||||
expect(result).toContain("Test note: All systems checked.");
|
||||
expect(result).toContain("Ready for implementation");
|
||||
expect(result).toContain("[reviewer1 at");
|
||||
expect(result).toContain("[author1 at");
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user