// goclaw/internal/tools/web_fetch_convert.go

package tools

import (
	"encoding/json"
	"strings"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// extractJSON pretty-prints JSON content.
//
// It returns the formatted text plus a format label: "json" when body is a
// single valid JSON document (re-indented with two spaces), or "raw" together
// with the unmodified body when it is not.
//
// Numbers are decoded as json.Number instead of float64 so their original
// text survives the round trip; going through float64 silently corrupts
// integers above 2^53 (for example 64-bit IDs).
func extractJSON(body []byte) (string, string) {
	// Validate the whole input first: a Decoder reads only the first value
	// and would otherwise accept trailing garbage that Unmarshal rejects.
	if !json.Valid(body) {
		return string(body), "raw"
	}

	dec := json.NewDecoder(strings.NewReader(string(body)))
	dec.UseNumber()

	var data any
	if err := dec.Decode(&data); err != nil {
		return string(body), "raw"
	}

	formatted, err := json.MarshalIndent(data, "", "  ")
	if err != nil {
		// Fall back to the untouched payload rather than returning an
		// empty string labelled as JSON.
		return string(body), "raw"
	}
	return string(formatted), "json"
}

// --- DOM-based HTML extraction ---

// convertMode selects which output flavour a converter emits.
type convertMode int

const (
	// modeMarkdown is the mode htmlToMarkdown runs the converter in.
	modeMarkdown convertMode = iota
	// modeText is the mode htmlToText runs the converter in; walk also
	// drops the skipInTextMode elements while it is active.
	modeText
)

// converter walks a parsed HTML DOM tree and emits markdown or plain text.
//
// Output accumulates in buf, and mode selects the flavour (modeMarkdown or
// modeText). The remaining fields hold traversal state for the element
// handlers.
// NOTE(review): those handlers are not visible in this part of the file, so
// the exact semantics of inPre, listDepth and inLink are inferred from their
// names only — confirm against the handlers before relying on them.
type converter struct {
	buf       strings.Builder
	mode      convertMode
	inPre     bool
	listDepth int
	listType  []atom.Atom // stack: atom.Ul / atom.Ol
	listIndex []int       // ordered list counters
	inLink    bool
}

// skipElements is the set of tags that walk drops outright in every mode:
// neither the element nor any of its descendants produce output.
var skipElements = map[atom.Atom]bool{
	// Document metadata, scripting and styling.
	atom.Head:     true,
	atom.Script:   true,
	atom.Style:    true,
	atom.Noscript: true,
	atom.Template: true,

	// Embedded and media content.
	atom.Svg:     true,
	atom.Iframe:  true,
	atom.Picture: true,
	atom.Source:  true,

	// Forms and their controls.
	atom.Form:   true,
	atom.Select: true,
	atom.Option: true,
	atom.Button: true,
	atom.Input:  true,

	// Page navigation and footer.
	atom.Nav:    true,
	atom.Footer: true,
}

// skipInTextMode lists the extra tags that walk drops (with their
// descendants) only while the converter runs in modeText.
var skipInTextMode = map[atom.Atom]bool{
	atom.Aside:  true,
	atom.Header: true,
}

// blockElements is the set of block-level tags whose content must be
// surrounded by newlines. walk consults it for tags that have no dedicated
// handler.
var blockElements = map[atom.Atom]bool{
	// Headings.
	atom.H1: true, atom.H2: true, atom.H3: true,
	atom.H4: true, atom.H5: true, atom.H6: true,

	// Sectioning and generic containers.
	atom.Div: true, atom.Section: true, atom.Article: true, atom.Main: true,

	// Paragraph-level text.
	atom.P: true, atom.Blockquote: true, atom.Pre: true, atom.Address: true,

	// Lists and definition lists.
	atom.Ul: true, atom.Ol: true, atom.Li: true,
	atom.Dl: true, atom.Dt: true, atom.Dd: true,

	// Tables and horizontal rules.
	atom.Table: true, atom.Tr: true, atom.Hr: true,

	// Figures and disclosure widgets.
	atom.Figure: true, atom.Figcaption: true,
	atom.Details: true, atom.Summary: true,
}

// htmlToMarkdown renders rawHTML as markdown-like text.
//
// The input is parsed into a DOM tree; the subtree selected by findBody is
// walked by a converter in modeMarkdown and the result is passed through
// cleanOutput. If the parser reports an error, the result of
// stripTagsFallback on the raw input is returned instead.
func htmlToMarkdown(rawHTML string) string {
	root, parseErr := html.Parse(strings.NewReader(rawHTML))
	if parseErr != nil {
		return stripTagsFallback(rawHTML)
	}

	conv := new(converter)
	conv.mode = modeMarkdown
	conv.walkChildren(findBody(root))

	return cleanOutput(conv.buf.String())
}

// htmlToText renders rawHTML as plain text.
//
// The input is parsed into a DOM tree; the subtree selected by findBody is
// walked by a converter in modeText and the result is passed through
// cleanTextOutput. If the parser reports an error, the result of
// stripTagsFallback on the raw input is returned instead.
func htmlToText(rawHTML string) string {
	root, parseErr := html.Parse(strings.NewReader(rawHTML))
	if parseErr != nil {
		return stripTagsFallback(rawHTML)
	}

	conv := new(converter)
	conv.mode = modeText
	conv.walkChildren(findBody(root))

	return cleanTextOutput(conv.buf.String())
}

// walk emits output for a single DOM node and, where appropriate, its
// subtree.
//
// Text nodes go to handleText, document nodes are descended into
// transparently, and any other non-element node produces nothing. Elements
// are dropped together with their descendants when they are hidden or listed
// in skipElements (or in skipInTextMode while in modeText); everything else
// is dispatched to a tag-specific handler, falling back to a plain recursive
// walk for tags without one.
func (c *converter) walk(n *html.Node) {
	if n.Type == html.TextNode {
		c.handleText(n)
		return
	}
	if n.Type == html.DocumentNode {
		c.walkChildren(n)
		return
	}
	if n.Type != html.ElementNode {
		return
	}

	// Hidden content (display:none, hidden attr, aria-hidden, etc.) is never
	// emitted, to prevent hidden-text prompt injection attacks.
	if isHiddenElement(n) {
		return
	}

	tag := n.DataAtom
	excluded := skipElements[tag] || (c.mode == modeText && skipInTextMode[tag])
	if excluded {
		return
	}

	switch tag {
	// Block-level structure.
	case atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
		c.handleHeading(n)
	case atom.P:
		c.handleParagraph(n)
	case atom.Blockquote:
		c.handleBlockquote(n)
	case atom.Pre:
		c.handlePre(n)
	case atom.Ul, atom.Ol:
		c.handleList(n)
	case atom.Li:
		c.handleListItem(n)
	case atom.Dt:
		c.handleDefinitionTerm(n)
	case atom.Dd:
		c.handleDefinitionDesc(n)
	case atom.Table:
		c.handleTable(n)
	case atom.Hr:
		c.ensureNewline()
		if c.mode != modeMarkdown {
			// Text mode only gets the line break, no rule marker.
			return
		}
		c.buf.WriteString("---\n")

	// Inline content.
	case atom.A:
		c.handleLink(n)
	case atom.Img:
		c.handleImage(n)
	case atom.Code:
		c.handleCode(n)
	case atom.Strong, atom.B:
		c.handleStrong(n)
	case atom.Em, atom.I:
		c.handleEmphasis(n)
	case atom.Br:
		c.buf.WriteByte('\n')

	default:
		// No dedicated handler: recurse, padding block-level tags with
		// newlines on both sides.
		isBlock := blockElements[tag]
		if isBlock {
			c.ensureNewline()
		}
		c.walkChildren(n)
		if isBlock {
			c.ensureNewline()
		}
	}
}

// walkChildren walks every direct child of n in sibling order, letting walk
// recurse into each subtree.
//
// A nil n is treated as an empty subtree instead of dereferencing it. The
// entry points pass the result of findBody straight in, and the HTML parser
// builds no <body> element for frameset documents, so a missing root must not
// turn untrusted input into a nil-pointer panic.
// NOTE(review): findBody is defined outside this part of the file — confirm
// whether it can actually return nil.
func (c *converter) walkChildren(n *html.Node) {
	if n == nil {
		return
	}
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		c.walk(child)
	}
}