mirror of
https://github.com/tiennm99/goclaw.git
synced 2026-06-10 00:13:42 +00:00
38dfcf8bb0
Add Cloudflare Worker (fetch.goclaw.sh) as primary markdown extractor with waterfall fallback to built-in HTML→Markdown converter. Architecture: - ExtractorChain pattern with quality gate (min 100 chars, 10 words) - Settings stored in builtin_tools DB table (not config.json5) - ResolveExtractorChain reads chain from context per-request - InProcessExtractor delegates to fetchRawContent (full SSRF + domain policy checks on redirects) - DefuddleExtractor with configurable base_url + timeout - Seed default chain [defuddle, html-to-markdown] for new deployments - Backfill migration for existing deployments Web UI: - Dedicated DnD extractor chain form on builtin tools page - Drag-and-drop ordering, enable/disable per extractor - Timeout + base URL config for Defuddle - i18n support (en/vi/zh)
29 lines
849 B
Go
29 lines
849 B
Go
package tools
|
|
|
|
import (
|
|
"context"
|
|
)
|
|
|
|
// InProcessExtractor delegates to WebFetchTool.fetchRawContent for HTML→markdown
|
|
// extraction with full security checks (SSRF, domain policy on redirects).
|
|
// This is the fallback when external extractors (Defuddle) are unavailable.
|
|
type InProcessExtractor struct {
|
|
tool *WebFetchTool
|
|
}
|
|
|
|
// Name reports the identifier of this extractor, "html-to-markdown".
func (e *InProcessExtractor) Name() string {
	return "html-to-markdown"
}
|
|
|
|
// Extract fetches the URL via the tool's fetchRawContent (full security checks)
|
|
// and returns the raw extracted markdown content.
|
|
func (e *InProcessExtractor) Extract(ctx context.Context, rawURL string) (string, error) {
|
|
e.tool.mu.RLock()
|
|
policy := e.tool.policy
|
|
e.tool.mu.RUnlock()
|
|
|
|
raw, err := e.tool.fetchRawContent(ctx, rawURL, "markdown", defaultFetchMaxChars, policy)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return raw.content, nil
|
|
}
|