Files
goclaw/internal/tools/web_fetch_convert.go
viettranx bdb60de7ae chore: upgrade Go 1.25 → 1.26 and apply go fix modernizations
- Update go.mod and Dockerfile to Go 1.26
- Apply `go fix ./...` stdlib modernizations across 170+ files
- Add `go fix` to post-implementation checklist in CLAUDE.md
- Fix go fix misapplied rewrite in loop_history.go
2026-03-10 00:09:15 +07:00

183 lines
4.1 KiB
Go

package tools
import (
"encoding/json"
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
// extractJSON pretty-prints JSON content.
//
// It returns the text to show and a format label: the re-indented document
// and "json" when body is exactly one valid JSON value, or body unchanged and
// "raw" otherwise (including empty input).
//
// Numbers are decoded as json.Number rather than float64 so that their
// original textual form is kept; decoding into float64 would silently corrupt
// integers above 2^53 (for example 64-bit IDs).
func extractJSON(body []byte) (string, string) {
	raw := string(body)

	dec := json.NewDecoder(strings.NewReader(raw))
	dec.UseNumber()

	var data any
	if err := dec.Decode(&data); err != nil {
		return raw, "raw"
	}

	// Decode stops after the first value. Reject anything but JSON whitespace
	// after it so that inputs with trailing garbage are still reported as raw,
	// matching json.Unmarshal's strictness.
	if strings.Trim(raw[dec.InputOffset():], " \t\r\n") != "" {
		return raw, "raw"
	}

	formatted, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		// Should not happen for data that was just decoded, but never return
		// an empty "json" result if it does.
		return raw, "raw"
	}
	return string(formatted), "json"
}
// --- DOM-based HTML extraction ---
// convertMode selects which output format a converter emits.
type convertMode int
const (
// modeMarkdown emits markdown-like output. It is the zero value.
modeMarkdown convertMode = iota
// modeText emits plain text.
modeText
)
// converter walks a parsed HTML DOM tree and emits markdown or plain text.
// Output accumulates in buf; the remaining fields are traversal state.
// Because it embeds a strings.Builder, a converter must not be copied once
// it has been written to.
type converter struct {
// buf collects the generated output.
buf strings.Builder
// mode selects markdown or plain-text output.
mode convertMode
// inPre presumably marks that the walk is inside a <pre> element; it is
// maintained by handlers outside this view.
inPre bool
// listDepth presumably tracks the current list nesting level; it is
// maintained by handlers outside this view.
listDepth int
listType []atom.Atom // stack: atom.Ul / atom.Ol
listIndex []int // ordered list counters
// inLink presumably marks that the walk is inside an <a> element; it is
// maintained by handlers outside this view.
inLink bool
}
// skipElements lists elements to skip entirely (element + all descendants).
// walk checks a node's DataAtom against this set and returns before visiting
// any children, so nothing inside these elements reaches the output. The set
// applies in both markdown and text mode.
var skipElements = map[atom.Atom]bool{
atom.Head: true,
atom.Script: true,
atom.Style: true,
atom.Noscript: true,
atom.Svg: true,
atom.Template: true,
atom.Iframe: true,
atom.Select: true,
atom.Option: true,
atom.Button: true,
atom.Input: true,
atom.Form: true,
atom.Nav: true,
atom.Footer: true,
atom.Picture: true,
atom.Source: true,
}
// skipInTextMode lists additional elements to skip in text mode only.
// walk consults it only when the converter's mode is modeText; in markdown
// mode these elements are processed normally.
var skipInTextMode = map[atom.Atom]bool{
atom.Header: true,
atom.Aside: true,
}
// blockElements lists block-level elements that need surrounding newlines.
// walk consults it only for tags that have no dedicated handler, wrapping the
// element's children in ensureNewline calls. Tags such as atom.P or atom.Ul
// are dispatched to their own handlers before this lookup happens; whether
// those handlers (outside this view) also read this set is not visible here.
var blockElements = map[atom.Atom]bool{
atom.P: true, atom.Div: true, atom.Section: true, atom.Article: true,
atom.Main: true, atom.H1: true, atom.H2: true, atom.H3: true,
atom.H4: true, atom.H5: true, atom.H6: true, atom.Blockquote: true,
atom.Pre: true, atom.Ul: true, atom.Ol: true, atom.Li: true,
atom.Table: true, atom.Tr: true, atom.Hr: true, atom.Dl: true,
atom.Dt: true, atom.Dd: true, atom.Figure: true, atom.Figcaption: true,
atom.Details: true, atom.Summary: true, atom.Address: true,
}
// htmlToMarkdown renders rawHTML as markdown-like text.
//
// The input is parsed into a DOM tree, the children of the node returned by
// findBody are walked in markdown mode, and the accumulated output is passed
// through cleanOutput. If the parser reports an error the result of
// stripTagsFallback on the original input is returned instead.
func htmlToMarkdown(rawHTML string) string {
	root, parseErr := html.Parse(strings.NewReader(rawHTML))
	if parseErr != nil {
		return stripTagsFallback(rawHTML)
	}

	conv := converter{mode: modeMarkdown}
	conv.walkChildren(findBody(root))
	return cleanOutput(conv.buf.String())
}
// htmlToText renders rawHTML as plain text.
//
// The input is parsed into a DOM tree, the children of the node returned by
// findBody are walked in text mode, and the accumulated output is passed
// through cleanTextOutput. If the parser reports an error the result of
// stripTagsFallback on the original input is returned instead.
func htmlToText(rawHTML string) string {
	root, parseErr := html.Parse(strings.NewReader(rawHTML))
	if parseErr != nil {
		return stripTagsFallback(rawHTML)
	}

	conv := converter{mode: modeText}
	conv.walkChildren(findBody(root))
	return cleanTextOutput(conv.buf.String())
}
// walk emits output for the subtree rooted at n.
//
// Text nodes go to handleText, document nodes are descended into, and every
// node type other than element (comments, doctype, ...) is ignored. Element
// nodes are filtered first (hidden elements, skipElements, and in text mode
// skipInTextMode) and then dispatched to a tag-specific handler. Tags with no
// handler simply have their children walked, wrapped in newlines when the tag
// is listed in blockElements.
func (c *converter) walk(n *html.Node) {
	if n.Type == html.TextNode {
		c.handleText(n)
		return
	}
	if n.Type == html.DocumentNode {
		c.walkChildren(n)
		return
	}
	if n.Type != html.ElementNode {
		return
	}

	// Skip hidden elements (display:none, hidden attr, aria-hidden, etc.)
	// to prevent hidden-text prompt injection attacks.
	if isHiddenElement(n) {
		return
	}

	tag := n.DataAtom
	if skipElements[tag] || (c.mode == modeText && skipInTextMode[tag]) {
		return
	}

	switch tag {
	case atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
		c.handleHeading(n)
	case atom.P:
		c.handleParagraph(n)
	case atom.A:
		c.handleLink(n)
	case atom.Img:
		c.handleImage(n)
	case atom.Pre:
		c.handlePre(n)
	case atom.Code:
		c.handleCode(n)
	case atom.Blockquote:
		c.handleBlockquote(n)
	case atom.Strong, atom.B:
		c.handleStrong(n)
	case atom.Em, atom.I:
		c.handleEmphasis(n)
	case atom.Br:
		c.buf.WriteByte('\n')
	case atom.Hr:
		// A rule always starts on a fresh line; only markdown draws it.
		c.ensureNewline()
		if c.mode == modeMarkdown {
			c.buf.WriteString("---\n")
		}
	case atom.Ul, atom.Ol:
		c.handleList(n)
	case atom.Li:
		c.handleListItem(n)
	case atom.Table:
		c.handleTable(n)
	case atom.Dt:
		c.handleDefinitionTerm(n)
	case atom.Dd:
		c.handleDefinitionDesc(n)
	default:
		// Generic element: recurse, isolating block-level content on its
		// own lines.
		isBlock := blockElements[tag]
		if isBlock {
			c.ensureNewline()
		}
		c.walkChildren(n)
		if isBlock {
			c.ensureNewline()
		}
	}
}
// walkChildren calls walk on each direct child of n, in document order.
func (c *converter) walkChildren(n *html.Node) {
	node := n.FirstChild
	for node != nil {
		c.walk(node)
		node = node.NextSibling
	}
}