Files
goclaw/internal/tools/web_fetch_convert_utils.go
viettranx df684e2582 refactor(security): fix false positives and deduplicate web security hardening
- Fix opacity/font-size zero detection false positives (0.5, 0.8 matched as zero)
- Extract scanWebToolResult helper to eliminate DRY violation in agent loop
- Move hidden element detection to dedicated web_fetch_hidden.go file
- Replace map false-entries with comments for hiddenClasses documentation
2026-03-07 19:35:19 +07:00

160 lines
3.6 KiB
Go

package tools
import (
"regexp"
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
// getAttr looks up an attribute on n by its key.
//
// It returns the Val of the first attribute whose Key is exactly equal to
// key, or the empty string when n carries no such attribute. An attribute
// that is present with an empty value is therefore indistinguishable from a
// missing one.
func getAttr(n *html.Node, key string) string {
	for i := range n.Attr {
		if attr := &n.Attr[i]; attr.Key == key {
			return attr.Val
		}
	}
	return ""
}
// findChild returns the first direct child of n that is an element node with
// the given tag atom. Only immediate children are inspected (no descent into
// grandchildren). It returns nil when no child matches.
func findChild(n *html.Node, tag atom.Atom) *html.Node {
	child := n.FirstChild
	for child != nil {
		if child.Type == html.ElementNode && child.DataAtom == tag {
			return child
		}
		child = child.NextSibling
	}
	return nil
}
// findBody locates the <body> element of a parsed document.
//
// The tree rooted at doc (doc itself included) is searched depth-first in
// document order and the first element whose atom is Body is returned. When
// the tree contains no such element, doc is returned unchanged so callers
// always get a usable root.
func findBody(doc *html.Node) *html.Node {
	var search func(node *html.Node) *html.Node
	search = func(node *html.Node) *html.Node {
		isBody := node.Type == html.ElementNode && node.DataAtom == atom.Body
		if isBody {
			return node
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			hit := search(child)
			if hit != nil {
				return hit
			}
		}
		return nil
	}

	body := search(doc)
	if body == nil {
		return doc
	}
	return body
}
// collapseWhitespace replaces every run of space, tab, newline, carriage
// return and form feed characters in s with a single space.
//
// Leading and trailing runs are collapsed as well, not removed, so the result
// may still begin or end with one space. All other runes, including
// non-ASCII whitespace such as U+00A0, are copied through unchanged.
func collapseWhitespace(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	prevSpace := false
	for _, r := range s {
		switch r {
		case ' ', '\t', '\n', '\r', '\f':
			// Only the first character of a whitespace run emits output.
			if prevSpace {
				continue
			}
			out.WriteByte(' ')
			prevSpace = true
		default:
			out.WriteRune(r)
			prevSpace = false
		}
	}
	return out.String()
}
// ensureNewline makes sure the buffered output ends with a newline.
//
// An empty buffer is left empty, and a buffer that already ends in '\n' is
// left untouched; otherwise a single '\n' is appended.
func (c *converter) ensureNewline() {
	n := c.buf.Len()
	if n > 0 && c.buf.String()[n-1] != '\n' {
		c.buf.WriteByte('\n')
	}
}
// ensureDoubleNewline makes sure the buffered output ends with at least two
// consecutive newlines (i.e. a blank line).
//
// An empty buffer is left empty. Otherwise exactly as many '\n' bytes as are
// missing (zero, one or two) are appended.
func (c *converter) ensureDoubleNewline() {
	n := c.buf.Len()
	if n == 0 {
		return
	}

	out := c.buf.String()
	switch {
	case out[n-1] != '\n':
		// No trailing newline at all: add both.
		c.buf.WriteString("\n\n")
	case n < 2 || out[n-2] != '\n':
		// Exactly one trailing newline: add the second.
		c.buf.WriteByte('\n')
	}
}
// collectTableRows gathers the rows of a table as slices of cell text.
//
// The subtree under table is walked depth-first; every <tr> element found
// contributes one row made of its direct <td>/<th> children, in order. Each
// cell is rendered by a fresh converter in the given mode and the result is
// whitespace-trimmed. Rows without any cell are skipped, and the walk does
// not descend into a <tr> looking for further rows. The result is nil when
// no row was collected.
func collectTableRows(table *html.Node, mode convertMode) [][]string {
	var rows [][]string

	isCell := func(n *html.Node) bool {
		return n.Type == html.ElementNode && (n.DataAtom == atom.Td || n.DataAtom == atom.Th)
	}

	var visit func(*html.Node)
	visit = func(node *html.Node) {
		if node.Type != html.ElementNode || node.DataAtom != atom.Tr {
			// Not a row: keep searching below this node.
			for child := node.FirstChild; child != nil; child = child.NextSibling {
				visit(child)
			}
			return
		}

		var cells []string
		for cell := node.FirstChild; cell != nil; cell = cell.NextSibling {
			if !isCell(cell) {
				continue
			}
			// Render each cell in isolation so its text does not leak
			// into any other buffer.
			sub := &converter{mode: mode}
			sub.walkChildren(cell)
			cells = append(cells, strings.TrimSpace(sub.buf.String()))
		}
		if len(cells) > 0 {
			rows = append(rows, cells)
		}
	}

	visit(table)
	return rows
}
// --- output cleanup ---

// reMultiNL matches any run of three or more consecutive newlines.
var reMultiNL = regexp.MustCompile(`\n{3,}`)

// cleanOutput normalises converted output: runs of three or more newlines are
// reduced to exactly two, then leading and trailing whitespace is trimmed.
func cleanOutput(s string) string {
	collapsed := reMultiNL.ReplaceAllString(s, "\n\n")
	return strings.TrimSpace(collapsed)
}
// cleanTextOutput normalises plain-text output.
//
// Trailing spaces and tabs are removed from every line first, so that lines
// containing only such whitespace become empty. Runs of three or more
// newlines are then reduced to two, and finally leading and trailing
// whitespace of the whole text is trimmed.
func cleanTextOutput(s string) string {
	lines := strings.Split(s, "\n")
	for i := range lines {
		lines[i] = strings.TrimRight(lines[i], " \t")
	}
	joined := strings.Join(lines, "\n")
	return strings.TrimSpace(reMultiNL.ReplaceAllString(joined, "\n\n"))
}
// reStripTags matches a '<', one or more characters other than '>', and the
// closing '>'.
var reStripTags = regexp.MustCompile(`<[^>]+>`)

// stripTagsFallback is a last-resort fallback if the HTML parser fails.
//
// Every substring matched by reStripTags is deleted and the remainder is
// whitespace-trimmed. Text between tags is kept verbatim.
func stripTagsFallback(s string) string {
	withoutTags := reStripTags.ReplaceAllString(s, "")
	return strings.TrimSpace(withoutTags)
}
// Patterns used by markdownToText, compiled once at package initialisation
// rather than on every call.
var (
	// reMDHeading matches a leading run of one to six '#' characters plus the
	// whitespace that follows it, at the start of any line.
	reMDHeading = regexp.MustCompile(`(?m)^#{1,6}\s+`)
	// reMDInlineCode matches a non-empty inline code span; group 1 is the
	// text between the backticks.
	reMDInlineCode = regexp.MustCompile("`([^`]+)`")
	// reMDImage matches ![alt](target); group 1 is the (possibly empty) alt
	// text.
	reMDImage = regexp.MustCompile(`!\[([^\]]*)\]\([^)]+\)`)
	// reMDLink matches [label](target); group 1 is the non-empty label.
	reMDLink = regexp.MustCompile(`\[([^\]]+)\]\([^)]+\)`)
)

// markdownToText strips markdown formatting for text mode.
//
// The following transformations are applied, in order:
//   - heading markers ("# " .. "###### ") at line starts are removed;
//   - every "**" and "__" is removed;
//   - inline code spans lose their surrounding backticks;
//   - images are replaced by their alt text;
//   - links are replaced by their label;
//   - runs of three or more newlines are reduced to two;
//   - leading and trailing whitespace is trimmed.
func markdownToText(md string) string {
	s := reMDHeading.ReplaceAllString(md, "")
	s = strings.ReplaceAll(s, "**", "")
	s = strings.ReplaceAll(s, "__", "")
	s = reMDInlineCode.ReplaceAllString(s, "${1}")
	// Images must be handled before links: the link pattern also matches the
	// "[alt](target)" tail of an image and would otherwise leave a stray '!'
	// in front of the alt text.
	s = reMDImage.ReplaceAllString(s, "${1}")
	s = reMDLink.ReplaceAllString(s, "${1}")
	s = reMultiNL.ReplaceAllString(s, "\n\n")
	return strings.TrimSpace(s)
}