Add enhanced text sanitization (#83)

* Add enhanced text sanitization

* Format code with prettier

* Refactor tests to remove redundancy and improve structure

- Remove redundant 'mixed input patterns' test from sanitizer.test.ts
- Consolidate integration tests into 2 focused real-world scenarios
- Add HTML comment stripping to sanitizeContent function
- Update test expectations to match sanitization behavior
- Maintain full coverage with fewer, more focused tests

* Fix prettier formatting

* Remove rendered.html from repository

* Remove test-markdown.json and update .gitignore

* Revert .gitignore changes
This commit is contained in:
Lina Tawfik
2025-05-29 16:35:50 -07:00
committed by GitHub
parent fb7365fba9
commit 35ad5fc467
6 changed files with 498 additions and 175 deletions

View File

@@ -0,0 +1,65 @@
export function stripInvisibleCharacters(content: string): string {
content = content.replace(/[\u200B\u200C\u200D\uFEFF]/g, "");
content = content.replace(
/[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F]/g,
"",
);
content = content.replace(/\u00AD/g, "");
content = content.replace(/[\u202A-\u202E\u2066-\u2069]/g, "");
return content;
}
export function stripMarkdownImageAltText(content: string): string {
return content.replace(/!\[[^\]]*\]\(/g, "![](");
}
export function stripMarkdownLinkTitles(content: string): string {
content = content.replace(/(\[[^\]]*\]\([^)]+)\s+"[^"]*"/g, "$1");
content = content.replace(/(\[[^\]]*\]\([^)]+)\s+'[^']*'/g, "$1");
return content;
}
export function stripHiddenAttributes(content: string): string {
content = content.replace(/\salt\s*=\s*["'][^"']*["']/gi, "");
content = content.replace(/\salt\s*=\s*[^\s>]+/gi, "");
content = content.replace(/\stitle\s*=\s*["'][^"']*["']/gi, "");
content = content.replace(/\stitle\s*=\s*[^\s>]+/gi, "");
content = content.replace(/\saria-label\s*=\s*["'][^"']*["']/gi, "");
content = content.replace(/\saria-label\s*=\s*[^\s>]+/gi, "");
content = content.replace(/\sdata-[a-zA-Z0-9-]+\s*=\s*["'][^"']*["']/gi, "");
content = content.replace(/\sdata-[a-zA-Z0-9-]+\s*=\s*[^\s>]+/gi, "");
content = content.replace(/\splaceholder\s*=\s*["'][^"']*["']/gi, "");
content = content.replace(/\splaceholder\s*=\s*[^\s>]+/gi, "");
return content;
}
export function normalizeHtmlEntities(content: string): string {
content = content.replace(/&#(\d+);/g, (_, dec) => {
const num = parseInt(dec, 10);
if (num >= 32 && num <= 126) {
return String.fromCharCode(num);
}
return "";
});
content = content.replace(/&#x([0-9a-fA-F]+);/g, (_, hex) => {
const num = parseInt(hex, 16);
if (num >= 32 && num <= 126) {
return String.fromCharCode(num);
}
return "";
});
return content;
}
export function sanitizeContent(content: string): string {
content = stripHtmlComments(content);
content = stripInvisibleCharacters(content);
content = stripMarkdownImageAltText(content);
content = stripMarkdownLinkTitles(content);
content = stripHiddenAttributes(content);
content = normalizeHtmlEntities(content);
return content;
}
export const stripHtmlComments = (content: string) =>
content.replace(/<!--[\s\S]*?-->/g, "");