package tools

import (
	"strings"
	"testing"
)

// NOTE(review): the HTML fixtures in this file were rebuilt from what each
// test asserts after their markup had been stripped; confirm the exact tags
// against the fixtures the converter was originally tested with.

// TestHtmlToMarkdown_Headings checks that h1-h3 become #, ## and ### headings.
func TestHtmlToMarkdown_Headings(t *testing.T) {
	html := `
<h1>Title</h1>
<h2>Subtitle</h2>
<h3>Section</h3>
`
	got := htmlToMarkdown(html)
	for _, want := range []string{"# Title", "## Subtitle", "### Section"} {
		if !strings.Contains(got, want) {
			t.Errorf("missing %q in:\n%s", want, got)
		}
	}
}

// TestHtmlToMarkdown_Paragraphs checks that the text of every paragraph survives.
func TestHtmlToMarkdown_Paragraphs(t *testing.T) {
	html := `
<p>First paragraph.</p>
<p>Second paragraph.</p>
`
	got := htmlToMarkdown(html)
	if !strings.Contains(got, "First paragraph.") || !strings.Contains(got, "Second paragraph.") {
		t.Errorf("unexpected output:\n%s", got)
	}
}

// TestHtmlToMarkdown_Links checks that anchors are rendered as [text](href).
func TestHtmlToMarkdown_Links(t *testing.T) {
	html := `
<p>Visit <a href="https://example.com">Example</a> site.</p>
`
	got := htmlToMarkdown(html)
	if !strings.Contains(got, "[Example](https://example.com)") {
		t.Errorf("missing link in:\n%s", got)
	}
}

// TestHtmlToMarkdown_Images checks that images are rendered as ![alt](src).
func TestHtmlToMarkdown_Images(t *testing.T) {
	html := `<img src="logo.png" alt="logo">`
	got := htmlToMarkdown(html)
	if !strings.Contains(got, "![logo](logo.png)") {
		t.Errorf("missing image in:\n%s", got)
	}
}

// TestHtmlToMarkdown_BoldItalic checks the ** and * emphasis markers.
func TestHtmlToMarkdown_BoldItalic(t *testing.T) {
	html := `
<p><strong>bold</strong> and <em>italic</em></p>
`
	got := htmlToMarkdown(html)
	if !strings.Contains(got, "**bold**") {
		t.Errorf("missing bold in:\n%s", got)
	}
	if !strings.Contains(got, "*italic*") {
		t.Errorf("missing italic in:\n%s", got)
	}
}

// TestHtmlToMarkdown_PreCode checks that a pre/code block becomes a fenced
// block that carries the language tag and has both an opening and a closing
// fence.
func TestHtmlToMarkdown_PreCode(t *testing.T) {
	html := `<pre><code class="language-go">func main() {}</code></pre>`
	got := htmlToMarkdown(html)
	if !strings.Contains(got, "```go") {
		t.Errorf("missing fenced code block with language in:\n%s", got)
	}
	if !strings.Contains(got, "func main() {}") {
		t.Errorf("missing code content in:\n%s", got)
	}
	if strings.Count(got, "```") < 2 {
		t.Errorf("missing closing fence in:\n%s", got)
	}
}

// TestHtmlToMarkdown_InlineCode checks that inline code is wrapped in backticks.
func TestHtmlToMarkdown_InlineCode(t *testing.T) {
	html := `
<p>Use <code>fmt.Println</code> to print.</p>
`
	got := htmlToMarkdown(html)
	if !strings.Contains(got, "`fmt.Println`") {
		t.Errorf("missing inline code in:\n%s", got)
	}
}

// TestHtmlToMarkdown_Blockquote checks for the "> " prefix and the quoted text.
func TestHtmlToMarkdown_Blockquote(t *testing.T) {
	html := `
<blockquote><p>A wise quote.</p></blockquote>
`
	got := htmlToMarkdown(html)
	if !strings.Contains(got, "> ") {
		t.Errorf("missing blockquote prefix in:\n%s", got)
	}
	if !strings.Contains(got, "A wise quote.") {
		t.Errorf("missing quote content in:\n%s", got)
	}
}

// TestHtmlToMarkdown_UnorderedList checks that ul items get a "- " bullet.
func TestHtmlToMarkdown_UnorderedList(t *testing.T) {
	html := `<ul><li>One</li><li>Two</li><li>Three</li></ul>`
	got := htmlToMarkdown(html)
	if !strings.Contains(got, "- One") || !strings.Contains(got, "- Two") || !strings.Contains(got, "- Three") {
		t.Errorf("missing list items in:\n%s", got)
	}
}

// TestHtmlToMarkdown_OrderedList checks that ol items are numbered from 1.
func TestHtmlToMarkdown_OrderedList(t *testing.T) {
	html := `<ol><li>First</li><li>Second</li></ol>`
	got := htmlToMarkdown(html)
	if !strings.Contains(got, "1. First") || !strings.Contains(got, "2. Second") {
		t.Errorf("missing ordered list items in:\n%s", got)
	}
}

// TestHtmlToMarkdown_NestedList checks that items of an inner list are
// indented beneath their parent item.
func TestHtmlToMarkdown_NestedList(t *testing.T) {
	html := `<ul><li>A<ul><li>A1</li><li>A2</li></ul></li></ul>`
	got := htmlToMarkdown(html)
	// The leading spaces are what distinguish a nested bullet from a top-level one.
	if !strings.Contains(got, "- A") || !strings.Contains(got, "  - A1") || !strings.Contains(got, "  - A2") {
		t.Errorf("missing nested list in:\n%s", got)
	}
}

// TestHtmlToMarkdown_Table checks for a pipe table with header cells, a
// separator row and the data cells.
func TestHtmlToMarkdown_Table(t *testing.T) {
	html := `
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>Alice</td><td>30</td></tr>
</table>
`
	got := htmlToMarkdown(html)
	if !strings.Contains(got, "| Name") || !strings.Contains(got, "| Age") {
		t.Errorf("missing table header in:\n%s", got)
	}
	if !strings.Contains(got, "| ---") {
		t.Errorf("missing table separator in:\n%s", got)
	}
	if !strings.Contains(got, "Alice") || !strings.Contains(got, "30") {
		t.Errorf("missing table data in:\n%s", got)
	}
}

// TestHtmlToMarkdown_HorizontalRule checks that hr is rendered as ---.
func TestHtmlToMarkdown_HorizontalRule(t *testing.T) {
	html := `
<p>Above</p>
<hr>
<p>Below</p>
`
	got := htmlToMarkdown(html)
	if !strings.Contains(got, "---") {
		t.Errorf("missing horizontal rule in:\n%s", got)
	}
}

// --- Stripping non-content elements ---

// TestHtmlToMarkdown_StripsHead checks that nothing from <head> (title, CSS,
// JS) reaches the output while the body text does.
func TestHtmlToMarkdown_StripsHead(t *testing.T) {
	html := `<html><head><title>Page Title</title>
<style>body{color:red}</style>
<script>var x=1</script>
</head><body>
<p>Content</p>
</body></html>
`
	got := htmlToMarkdown(html)
	if strings.Contains(got, "Page Title") {
		t.Errorf("head title should not appear in output:\n%s", got)
	}
	if strings.Contains(got, "color:red") || strings.Contains(got, "var x=1") {
		t.Errorf("head CSS/JS leaked into output:\n%s", got)
	}
	if !strings.Contains(got, "Content") {
		t.Errorf("body content missing:\n%s", got)
	}
}

// TestHtmlToMarkdown_StripsScript checks that inline script bodies are dropped.
func TestHtmlToMarkdown_StripsScript(t *testing.T) {
	html := `
<p>Hello</p>
<script>alert("xss")</script>
<p>World</p>
`
	got := htmlToMarkdown(html)
	if strings.Contains(got, "alert") || strings.Contains(got, "xss") {
		t.Errorf("script content leaked:\n%s", got)
	}
}

// TestHtmlToMarkdown_StripsStyle checks that inline stylesheets are dropped.
func TestHtmlToMarkdown_StripsStyle(t *testing.T) {
	html := `
<style>.foo { display: none; }</style>
<p>Text</p>
`
	got := htmlToMarkdown(html)
	if strings.Contains(got, "display") || strings.Contains(got, ".foo") {
		t.Errorf("style content leaked:\n%s", got)
	}
}

// TestHtmlToMarkdown_StripsNoscript checks that noscript fallbacks are dropped
// and the surrounding content is kept.
func TestHtmlToMarkdown_StripsNoscript(t *testing.T) {
	html := `
<noscript><p>Enable JavaScript</p><style>body{color:blue}</style></noscript>
<p>Real content</p>
`
	got := htmlToMarkdown(html)
	if strings.Contains(got, "Enable JavaScript") || strings.Contains(got, "color:blue") {
		t.Errorf("noscript content leaked:\n%s", got)
	}
	if !strings.Contains(got, "Real content") {
		t.Errorf("real content missing:\n%s", got)
	}
}

// TestHtmlToMarkdown_StripsSvg checks that neither SVG path data nor SVG text
// reaches the output.
func TestHtmlToMarkdown_StripsSvg(t *testing.T) {
	html := `<svg><title>icon</title><path d="M10 20L30 40"/></svg>
<p>Content</p>
`
	got := htmlToMarkdown(html)
	if strings.Contains(got, "M10") || strings.Contains(got, "icon") {
		t.Errorf("SVG content leaked:\n%s", got)
	}
}

// TestHtmlToMarkdown_StripsNav checks that navigation links are dropped.
func TestHtmlToMarkdown_StripsNav(t *testing.T) {
	html := `
<nav><a href="/">Home</a> <a href="/about">About</a></nav>
<p>Article</p>
`
	got := htmlToMarkdown(html)
	if strings.Contains(got, "Home") || strings.Contains(got, "About") {
		t.Errorf("nav content leaked:\n%s", got)
	}
}

// TestHtmlToMarkdown_StripsFooter checks that footer text is dropped.
func TestHtmlToMarkdown_StripsFooter(t *testing.T) {
	html := `
<p>Article</p>
<footer>Copyright 2024</footer>
`
	got := htmlToMarkdown(html)
	if strings.Contains(got, "Copyright") {
		t.Errorf("footer content leaked:\n%s", got)
	}
}

// TestHtmlToMarkdown_StripsForm checks that form controls and their
// attributes are dropped.
func TestHtmlToMarkdown_StripsForm(t *testing.T) {
	html := `
<form action="/submit"><input type="text" name="name"><button>Submit</button></form>
<p>Content</p>
`
	got := htmlToMarkdown(html)
	if strings.Contains(got, "Submit") || strings.Contains(got, "name") {
		t.Errorf("form content leaked:\n%s", got)
	}
}

// TestHtmlToMarkdown_StripsIframe checks that an iframe's URL and fallback
// text are dropped.
func TestHtmlToMarkdown_StripsIframe(t *testing.T) {
	html := `
<iframe src="https://ads.example.com">Fallback</iframe>
<p>Content</p>
`
	got := htmlToMarkdown(html)
	if strings.Contains(got, "Fallback") || strings.Contains(got, "ads.example") {
		t.Errorf("iframe content leaked:\n%s", got)
	}
}

// TestHtmlToMarkdown_StripsTemplate checks that template contents are dropped.
func TestHtmlToMarkdown_StripsTemplate(t *testing.T) {
	html := `
<template><p>Template content</p></template>
<p>Visible</p>
`
	got := htmlToMarkdown(html)
	if strings.Contains(got, "Template content") {
		t.Errorf("template content leaked:\n%s", got)
	}
}

// --- Entity handling ---

// TestHtmlToMarkdown_Entities checks that named and numeric character
// references are decoded to their literal characters.
func TestHtmlToMarkdown_Entities(t *testing.T) {
	html := `
<p>A &amp; B &lt; C &gt; D &quot;E&quot; &#39;F&#39;</p>
`
	got := htmlToMarkdown(html)
	if !strings.Contains(got, `A & B < C > D "E" 'F'`) {
		t.Errorf("entities not decoded properly:\n%s", got)
	}
}

// --- Whitespace handling ---

// TestHtmlToMarkdown_WhitespaceCollapse checks that runs of spaces in flowing
// text are reduced to a single space.
func TestHtmlToMarkdown_WhitespaceCollapse(t *testing.T) {
	html := `
<p>lots    of     spaces</p>
`
	got := htmlToMarkdown(html)
	// Look for a doubled space: a single space is legitimate word separation.
	if strings.Contains(got, "  ") {
		t.Errorf("whitespace not collapsed:\n%s", got)
	}
}

// TestHtmlToMarkdown_PreservesPreWhitespace checks that indentation and line
// breaks inside <pre> are kept verbatim.
func TestHtmlToMarkdown_PreservesPreWhitespace(t *testing.T) {
	html := "<pre>\n  line1\n  line2\n  line3\n</pre>"
	got := htmlToMarkdown(html)
	if !strings.Contains(got, "  line1\n  line2\n  line3") {
		t.Errorf("pre whitespace not preserved:\n%s", got)
	}
}

// --- Malformed HTML ---

// TestHtmlToMarkdown_MalformedHTML checks that unclosed and mis-nested tags do
// not cause text to be lost.
func TestHtmlToMarkdown_MalformedHTML(t *testing.T) {
	html := `
<p>Unclosed paragraph
<div><b>Nested <i>bold</b></div>
<p>OK</p>
`
	got := htmlToMarkdown(html)
	if !strings.Contains(got, "Unclosed paragraph") || !strings.Contains(got, "bold") || !strings.Contains(got, "OK") {
		t.Errorf("malformed HTML not handled:\n%s", got)
	}
}

// --- Text mode ---

// TestHtmlToText_NoMarkdownFormatting checks that text mode keeps the words
// but emits no heading, emphasis or link syntax.
func TestHtmlToText_NoMarkdownFormatting(t *testing.T) {
	html := `
<h1>Title</h1>
<p>Text with <strong>bold</strong> and <a href="https://example.com">link</a>.</p>
`
	got := htmlToText(html)
	if strings.Contains(got, "#") || strings.Contains(got, "**") || strings.Contains(got, "[link]") {
		t.Errorf("markdown formatting in text mode:\n%s", got)
	}
	if !strings.Contains(got, "Title") || !strings.Contains(got, "bold") || !strings.Contains(got, "link") {
		t.Errorf("content missing in text mode:\n%s", got)
	}
}

// TestHtmlToText_StripsHeader checks that text mode drops <header> content and
// keeps the body text.
func TestHtmlToText_StripsHeader(t *testing.T) {
	html := `
<header><h1>Site Name</h1></header>
<p>Article</p>
`
	got := htmlToText(html)
	if strings.Contains(got, "Site Name") {
		t.Errorf("header content should be stripped in text mode:\n%s", got)
	}
	if !strings.Contains(got, "Article") {
		t.Errorf("body content missing:\n%s", got)
	}
}

// TestHtmlToText_StripsAside checks that text mode drops <aside> content.
func TestHtmlToText_StripsAside(t *testing.T) {
	html := `
<aside>Sidebar</aside>
<p>Main content</p>
`
	got := htmlToText(html)
	if strings.Contains(got, "Sidebar") {
		t.Errorf("aside content should be stripped in text mode:\n%s", got)
	}
}

// --- Realistic SPA page ---

// TestHtmlToMarkdown_SPAPage runs a full single-page-app style document
// through the converter and checks that only the article content survives:
// no head metadata, CSS, JS, noscript notice, navigation or footer.
func TestHtmlToMarkdown_SPAPage(t *testing.T) {
	html := `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>My App</title>
<link rel="stylesheet" href="/static/main.abc123.css">
<style>
body { margin: 0; font-family: sans-serif; }
.topbar { background: #333; }
</style>
<script>window.__INITIAL_STATE__ = {"theme": "dark"};</script>
</head>
<body>
<noscript>You need to enable JavaScript to run this app.</noscript>
<nav><a href="/">Home</a> <a href="/about">About</a> <a href="/contact">Contact</a></nav>
<main>
<h1>Welcome to My App</h1>
<p>This is the main content of the page.</p>
<ul><li>Feature 1</li><li>Feature 2</li></ul>
</main>
<footer>&copy; 2024 My App. All rights reserved.</footer>
<script src="/static/main.def456.js"></script>
<script>console.log("analytics loaded");</script>
</body>
</html>
`
	got := htmlToMarkdown(html)

	// Should NOT contain CSS/JS artifacts
	for _, bad := range []string{
		"margin: 0", "sans-serif", "background: #333",
		"__INITIAL_STATE__", "theme", "dark",
		"analytics loaded", "console.log",
		"main.abc123.css", "main.def456.js",
		"enable JavaScript",
		// meta from head
		"viewport",
		// footer
		"All rights",
		// nav
		"Home", "About", "Contact",
	} {
		if strings.Contains(got, bad) {
			t.Errorf("non-content %q leaked into output:\n%s", bad, got)
		}
	}

	// Should contain actual content
	for _, good := range []string{
		"# Welcome to My App",
		"main content of the page",
		"- Feature 1",
		"- Feature 2",
	} {
		if !strings.Contains(got, good) {
			t.Errorf("expected content %q missing from output:\n%s", good, got)
		}
	}
}