Files
viettranx 6dcac3d7e3 fix(kg): sanitize LLM JSON output before parsing (#167)
Add sanitizeJSON() to fix malformed decimal numbers (e.g. '0. 85' → '0.85')
and trailing commas before closing brackets. Fixes extraction failures with
Gemini 2.5 Flash which occasionally produces invalid JSON.
2026-03-14 16:22:48 +07:00

164 lines
3.9 KiB
Go

package knowledgegraph
import (
"encoding/json"
"testing"
)
func TestSanitizeJSON(t *testing.T) {
tests := []struct {
name string
input string
want string
}{
{
name: "valid JSON unchanged",
input: `{"confidence": 0.85}`,
want: `{"confidence": 0.85}`,
},
{
name: "fix spaced decimal",
input: `{"confidence": 0. 85}`,
want: `{"confidence": 0.85}`,
},
{
name: "fix multiple spaced decimals",
input: `{"a": 0. 9, "b": 1. 0}`,
want: `{"a": 0.9, "b": 1.0}`,
},
{
name: "fix spaced decimal with multiple spaces",
input: `{"confidence": 0. 75}`,
want: `{"confidence": 0.75}`,
},
{
name: "preserve decimal-like pattern in strings",
input: `{"description": "Founded in 2023. 15 employees"}`,
want: `{"description": "Founded in 2023. 15 employees"}`,
},
{
name: "preserve spaced period in strings (no digit before dot)",
input: `{"description": "Mr. Smith leads the team"}`,
want: `{"description": "Mr. Smith leads the team"}`,
},
{
name: "mixed: fix value decimal, preserve string decimal",
input: `{"description": "Version 2. 0 alpha", "confidence": 0. 9}`,
want: `{"description": "Version 2. 0 alpha", "confidence": 0.9}`,
},
{
name: "fix trailing comma in array",
input: `{"items": [1, 2, 3,]}`,
want: `{"items": [1, 2, 3]}`,
},
{
name: "fix trailing comma in object",
input: `{"a": 1, "b": 2,}`,
want: `{"a": 1, "b": 2}`,
},
{
name: "trailing comma with whitespace",
input: `{"a": 1, }`,
want: `{"a": 1 }`,
},
{
name: "trailing comma with newline",
input: "{\"a\": 1,\n}",
want: "{\"a\": 1\n}",
},
{
name: "preserve comma in string value",
input: `{"text": "hello, world,"}`,
want: `{"text": "hello, world,"}`,
},
{
name: "escaped quote in string",
input: `{"text": "she said \"0. 5\" loudly", "val": 0. 5}`,
want: `{"text": "she said \"0. 5\" loudly", "val": 0.5}`,
},
{
name: "nested structure",
input: `{"entities": [{"confidence": 0. 85,}]}`,
want: `{"entities": [{"confidence": 0.85}]}`,
},
{
name: "empty input",
input: "",
want: "",
},
{
name: "no fixes needed",
input: `{"entities": [], "relations": []}`,
want: `{"entities": [], "relations": []}`,
},
{
name: "truncated LLM response does not crash",
input: `{"entities": [{"name": "Facebook Ads", "confidence": 0.9}, {"name": "TikT`,
want: `{"entities": [{"name": "Facebook Ads", "confidence": 0.9}, {"name": "TikT`,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := sanitizeJSON(tt.input)
if got != tt.want {
t.Errorf("sanitizeJSON():\n input: %s\n got: %s\n want: %s", tt.input, got, tt.want)
}
})
}
}
func TestTruncatedJSONParseFails(t *testing.T) {
truncated := `{"entities": [{"name": "Facebook Ads", "confidence": 0.9}, {"name": "TikT`
sanitized := sanitizeJSON(truncated)
var result ExtractionResult
err := json.Unmarshal([]byte(sanitized), &result)
if err == nil {
t.Error("expected parse error for truncated JSON, got nil")
}
}
func TestStripCodeBlock(t *testing.T) {
tests := []struct {
name string
input string
want string
}{
{
name: "no code block",
input: `{"entities": []}`,
want: `{"entities": []}`,
},
{
name: "json code block",
input: "```json\n{\"entities\": []}\n```",
want: `{"entities": []}`,
},
{
name: "plain code block",
input: "```\n{\"entities\": []}\n```",
want: `{"entities": []}`,
},
{
name: "code block with surrounding whitespace",
input: " ```json\n{\"entities\": []}\n``` ",
want: `{"entities": []}`,
},
{
name: "empty input",
input: "",
want: "",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := stripCodeBlock(tt.input)
if got != tt.want {
t.Errorf("stripCodeBlock():\n got: %s\n want: %s", got, tt.want)
}
})
}
}