package tools import ( "strings" "testing" ) func TestHtmlToMarkdown_Headings(t *testing.T) { html := `
First paragraph.
Second paragraph.
` got := htmlToMarkdown(html) if !strings.Contains(got, "First paragraph.") || !strings.Contains(got, "Second paragraph.") { t.Errorf("unexpected output:\n%s", got) } } func TestHtmlToMarkdown_Links(t *testing.T) { html := `Visit Example site.
` got := htmlToMarkdown(html) if !strings.Contains(got, "[Example](https://example.com)") { t.Errorf("missing link in:\n%s", got) } } func TestHtmlToMarkdown_Images(t *testing.T) { html := `
`
got := htmlToMarkdown(html)
if !strings.Contains(got, "") {
t.Errorf("missing image in:\n%s", got)
}
}
func TestHtmlToMarkdown_BoldItalic(t *testing.T) {
html := `bold and italic
` got := htmlToMarkdown(html) if !strings.Contains(got, "**bold**") { t.Errorf("missing bold in:\n%s", got) } if !strings.Contains(got, "*italic*") { t.Errorf("missing italic in:\n%s", got) } } func TestHtmlToMarkdown_PreCode(t *testing.T) { html := `func main() {}`
got := htmlToMarkdown(html)
if !strings.Contains(got, "```go") {
t.Errorf("missing fenced code block with language in:\n%s", got)
}
if !strings.Contains(got, "func main() {}") {
t.Errorf("missing code content in:\n%s", got)
}
if strings.Count(got, "```") < 2 {
t.Errorf("missing closing fence in:\n%s", got)
}
}
func TestHtmlToMarkdown_InlineCode(t *testing.T) {
html := `Use fmt.Println to print.
` got := htmlToMarkdown(html) if !strings.Contains(got, "> ") { t.Errorf("missing blockquote prefix in:\n%s", got) } if !strings.Contains(got, "A wise quote.") { t.Errorf("missing quote content in:\n%s", got) } } func TestHtmlToMarkdown_UnorderedList(t *testing.T) { html := `A wise quote.
| Name | Age |
|---|---|
| Alice | 30 |
Above
Below
` got := htmlToMarkdown(html) if !strings.Contains(got, "---") { t.Errorf("missing horizontal rule in:\n%s", got) } } // --- Stripping non-content elements --- func TestHtmlToMarkdown_StripsHead(t *testing.T) { html := `Content
` got := htmlToMarkdown(html) if strings.Contains(got, "Page Title") { t.Errorf("head title should not appear in output:\n%s", got) } if strings.Contains(got, "color:red") || strings.Contains(got, "var x=1") { t.Errorf("head CSS/JS leaked into output:\n%s", got) } if !strings.Contains(got, "Content") { t.Errorf("body content missing:\n%s", got) } } func TestHtmlToMarkdown_StripsScript(t *testing.T) { html := `Hello
World
` got := htmlToMarkdown(html) if strings.Contains(got, "alert") || strings.Contains(got, "xss") { t.Errorf("script content leaked:\n%s", got) } } func TestHtmlToMarkdown_StripsStyle(t *testing.T) { html := `Text
` got := htmlToMarkdown(html) if strings.Contains(got, "display") || strings.Contains(got, ".foo") { t.Errorf("style content leaked:\n%s", got) } } func TestHtmlToMarkdown_StripsNoscript(t *testing.T) { html := `Real content
` got := htmlToMarkdown(html) if strings.Contains(got, "Enable JavaScript") || strings.Contains(got, "color:blue") { t.Errorf("noscript content leaked:\n%s", got) } if !strings.Contains(got, "Real content") { t.Errorf("real content missing:\n%s", got) } } func TestHtmlToMarkdown_StripsSvg(t *testing.T) { html := `Content
` got := htmlToMarkdown(html) if strings.Contains(got, "M10") || strings.Contains(got, "icon") { t.Errorf("SVG content leaked:\n%s", got) } } func TestHtmlToMarkdown_StripsNav(t *testing.T) { html := `Article
` got := htmlToMarkdown(html) if strings.Contains(got, "Home") || strings.Contains(got, "About") { t.Errorf("nav content leaked:\n%s", got) } } func TestHtmlToMarkdown_StripsFooter(t *testing.T) { html := `Article
` got := htmlToMarkdown(html) if strings.Contains(got, "Copyright") { t.Errorf("footer content leaked:\n%s", got) } } func TestHtmlToMarkdown_StripsForm(t *testing.T) { html := `Content
` got := htmlToMarkdown(html) if strings.Contains(got, "Submit") || strings.Contains(got, "name") { t.Errorf("form content leaked:\n%s", got) } } func TestHtmlToMarkdown_StripsIframe(t *testing.T) { html := `Content
` got := htmlToMarkdown(html) if strings.Contains(got, "Fallback") || strings.Contains(got, "ads.example") { t.Errorf("iframe content leaked:\n%s", got) } } func TestHtmlToMarkdown_StripsTemplate(t *testing.T) { html := `Visible
` got := htmlToMarkdown(html) if strings.Contains(got, "Template content") { t.Errorf("template content leaked:\n%s", got) } } // --- Entity handling --- func TestHtmlToMarkdown_Entities(t *testing.T) { html := `A & B < C > D "E" 'F'
` got := htmlToMarkdown(html) if !strings.Contains(got, `A & B < C > D "E" 'F'`) { t.Errorf("entities not decoded properly:\n%s", got) } } // --- Whitespace handling --- func TestHtmlToMarkdown_WhitespaceCollapse(t *testing.T) { html := `lots of spaces
` got := htmlToMarkdown(html) if strings.Contains(got, " ") { t.Errorf("whitespace not collapsed:\n%s", got) } } func TestHtmlToMarkdown_PreservesPreWhitespace(t *testing.T) { html := `line1 line2 line3` got := htmlToMarkdown(html) if !strings.Contains(got, " line1\n line2\n line3") { t.Errorf("pre whitespace not preserved:\n%s", got) } } // --- Malformed HTML --- func TestHtmlToMarkdown_MalformedHTML(t *testing.T) { html := `
Unclosed paragraph
OK
` got := htmlToMarkdown(html) if !strings.Contains(got, "Unclosed paragraph") || !strings.Contains(got, "bold") || !strings.Contains(got, "OK") { t.Errorf("malformed HTML not handled:\n%s", got) } } // --- Text mode --- func TestHtmlToText_NoMarkdownFormatting(t *testing.T) { html := `Text with bold and link.
` got := htmlToText(html) if strings.Contains(got, "#") || strings.Contains(got, "**") || strings.Contains(got, "[link]") { t.Errorf("markdown formatting in text mode:\n%s", got) } if !strings.Contains(got, "Title") || !strings.Contains(got, "bold") || !strings.Contains(got, "link") { t.Errorf("content missing in text mode:\n%s", got) } } func TestHtmlToText_StripsHeader(t *testing.T) { html := `Article
` got := htmlToText(html) if strings.Contains(got, "Site Name") { t.Errorf("header content should be stripped in text mode:\n%s", got) } if !strings.Contains(got, "Article") { t.Errorf("body content missing:\n%s", got) } } func TestHtmlToText_StripsAside(t *testing.T) { html := `Main content
` got := htmlToText(html) if strings.Contains(got, "Sidebar") { t.Errorf("aside content should be stripped in text mode:\n%s", got) } } // --- Realistic SPA page --- func TestHtmlToMarkdown_SPAPage(t *testing.T) { html := `This is the main content of the page.