Skip to content

Commit b39e2c4

Browse files
committed
more xml tests
1 parent 7385ae4 commit b39e2c4

File tree

1 file changed

+105
-53
lines changed

1 file changed

+105
-53
lines changed

internal/pkg/crawl/extractor/xml_test.go

+105-53
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,26 @@ package extractor
22

33
import (
44
"bytes"
5+
"encoding/xml"
56
"io"
67
"net/http"
78
"net/url"
89
"os"
10+
"strings"
911
"testing"
1012
)
1113

1214
func TestXML(t *testing.T) {
1315
tests := []struct {
14-
name string
15-
xmlBody string
16-
wantURLs []*url.URL
17-
wantURLsCount int
18-
wantErr bool
19-
sitemap bool
16+
name string
17+
xmlBody string
18+
wantURLsLax []*url.URL
19+
wantURLsStric []*url.URL
20+
wantURLsCountLax int
21+
wantURLsCountStric int
22+
wantErrLax bool
23+
wantErrStrict bool
24+
sitemap bool
2025
}{
2126
{
2227
name: "Valid XML with URLs",
@@ -28,26 +33,49 @@ func TestXML(t *testing.T) {
2833
</nested>
2934
<noturl>just some text</noturl>
3035
</root>`,
31-
wantURLs: []*url.URL{
36+
wantURLsLax: []*url.URL{
37+
{Scheme: "http", Host: "example.com"},
38+
{Scheme: "https", Host: "example.org"},
39+
},
40+
wantURLsStric: []*url.URL{
3241
{Scheme: "http", Host: "example.com"},
3342
{Scheme: "https", Host: "example.org"},
3443
},
3544
sitemap: false,
36-
wantErr: false,
3745
},
3846
{
39-
name: "Empty XML",
40-
xmlBody: `<root></root>`,
41-
wantURLs: nil,
42-
wantErr: false,
43-
sitemap: false,
47+
name: "unbalanced XML with URLs",
48+
xmlBody: `
49+
<unbalance>
50+
<url>http://example.com</url>
51+
</unbalance></unbalance></unbalance>
52+
<outsideurl>https://unclosed.example.com</outsideurl>`,
53+
wantURLsStric: []*url.URL{
54+
{Scheme: "http", Host: "example.com"},
55+
},
56+
wantURLsLax: []*url.URL{
57+
{Scheme: "http", Host: "example.com"},
58+
{Scheme: "https", Host: "unclosed.example.com"},
59+
},
60+
wantErrStrict: true,
61+
wantErrLax: false,
62+
sitemap: false,
63+
},
64+
{
65+
name: "Empty XML",
66+
xmlBody: `<root></root>`,
67+
wantURLsStric: nil,
68+
wantURLsLax: nil,
69+
sitemap: false,
4470
},
4571
{
46-
name: "Invalid XML",
47-
xmlBody: `<root><unclosed>`,
48-
wantURLs: nil,
49-
wantErr: true,
50-
sitemap: false,
72+
name: "alien XML",
73+
xmlBody: `<h4 73><?/>/<AS "='AS "ASD@'SD>,as;g^&R$W#Sf)(U><l;rpkv ]])`,
74+
wantURLsStric: nil,
75+
wantURLsLax: nil,
76+
wantErrStrict: true,
77+
wantErrLax: true,
78+
sitemap: false,
5179
},
5280
{
5381
name: "XML with invalid URL",
@@ -56,48 +84,54 @@ func TestXML(t *testing.T) {
5684
<item>http://example.com</item>
5785
<item>not a valid url</item>
5886
</root>`,
59-
wantURLs: []*url.URL{
87+
wantURLsStric: []*url.URL{
6088
{Scheme: "http", Host: "example.com"},
6189
},
62-
wantErr: false,
63-
sitemap: false,
90+
wantURLsLax: []*url.URL{
91+
{Scheme: "http", Host: "example.com"},
92+
},
93+
wantErrStrict: false,
94+
wantErrLax: false,
95+
sitemap: false,
6496
},
6597
{
66-
name: "Huge sitemap",
67-
xmlBody: loadTestFile(t, "xml_test_sitemap.xml"),
68-
wantURLsCount: 100002,
69-
wantErr: false,
70-
sitemap: true,
98+
name: "Huge sitemap",
99+
xmlBody: loadTestFile(t, "xml_test_sitemap.xml"),
100+
wantURLsCountStric: 100002,
101+
wantURLsCountLax: 100002,
102+
wantErrStrict: false,
103+
wantErrLax: false,
104+
sitemap: true,
71105
},
72106
}
73107

74108
for _, tt := range tests {
75109
t.Run(tt.name, func(t *testing.T) {
76-
resp := &http.Response{
77-
Body: io.NopCloser(bytes.NewBufferString(tt.xmlBody)),
78-
}
79-
80-
gotURLs, sitemap, err := XML(resp)
81-
if (err != nil) != tt.wantErr {
82-
t.Errorf("XML() error = %v, wantErr %v", err, tt.wantErr)
83-
return
84-
}
85-
86-
if tt.wantURLsCount != 0 {
87-
if len(gotURLs) != tt.wantURLsCount {
88-
t.Errorf("XML() gotURLs count = %v, want %v", len(gotURLs), tt.wantURLsCount)
110+
testMode := func(strict bool, wantErr bool, wantURLs []*url.URL, wantURLsCount int) {
111+
resp := &http.Response{
112+
Body: io.NopCloser(bytes.NewBufferString(tt.xmlBody)),
89113
}
90-
}
91-
92-
if tt.wantURLs != nil {
93-
if !compareURLs(gotURLs, tt.wantURLs) {
94-
t.Errorf("XML() gotURLs = %v, want %v", gotURLs, tt.wantURLs)
114+
gotURLs, sitemap, err := XML(resp, strict)
115+
if (err != nil) != wantErr {
116+
t.Errorf("XML() strict = %v, error = %v, wantErr %v", strict, err, wantErr)
117+
return
118+
}
119+
if wantURLsCount != 0 && len(gotURLs) != wantURLsCount {
120+
t.Errorf("XML() strict = %v, gotURLs count = %v, want %v", strict, len(gotURLs), wantURLsCount)
121+
}
122+
if wantURLs != nil && !compareURLs(gotURLs, wantURLs) {
123+
t.Errorf("XML() strict = %v, gotURLs = %v, want %v", strict, gotURLs, wantURLs)
124+
}
125+
if tt.sitemap != sitemap {
126+
t.Errorf("XML() strict = %v, sitemap = %v, want %v", strict, sitemap, tt.sitemap)
95127
}
96128
}
97129

98-
if tt.sitemap != sitemap {
99-
t.Errorf("XML() sitemap = %v, want %v", sitemap, tt.sitemap)
100-
}
130+
// Strict mode
131+
testMode(true, tt.wantErrStrict, tt.wantURLsStric, tt.wantURLsCountStric)
132+
133+
// Lax mode
134+
testMode(false, tt.wantErrLax, tt.wantURLsLax, tt.wantURLsCountLax)
101135
})
102136
}
103137
}
@@ -116,14 +150,32 @@ func loadTestFile(t *testing.T, path string) string {
116150
return string(b)
117151
}
118152

119-
func TestXMLBodyReadError(t *testing.T) {
153+
func TestXMLBodySyntaxEOFErrorStrict(t *testing.T) {
154+
wantErr := xml.SyntaxError{Line: 3, Msg: "unexpected EOF"}
120155
resp := &http.Response{
121-
Body: io.NopCloser(bytes.NewReader([]byte{})), // Empty reader to simulate EOF
156+
Body: io.NopCloser(strings.NewReader(
157+
`<unclosed>
158+
<closed>
159+
</closed> <!-- Syntax EOF here -->`)),
122160
}
123-
resp.Body.Close() // Close the body to simulate a read error
124-
125-
_, _, err := XML(resp)
161+
_, _, err := XML(resp, true)
126162
if err == nil {
127-
t.Errorf("XML() expected error, got nil")
163+
t.Errorf("XML() error = %v, wantErr %v", err, wantErr)
164+
return
165+
}
166+
if err.Error() != wantErr.Error() {
167+
t.Errorf("XML() error = %v, wantErr %v", err, wantErr)
168+
}
169+
}
170+
171+
func TestXMLBodySyntaxEOFErrorLax(t *testing.T) {
172+
resp := &http.Response{
173+
Body: io.NopCloser(strings.NewReader(`<unclosed>
174+
<closed>
175+
</closed> <!-- ignore Syntax EOF here -->`)),
176+
}
177+
_, _, err := XML(resp, false)
178+
if err != nil {
179+
t.Errorf("XML() error = %v, wantErr nil", err)
128180
}
129181
}

0 commit comments

Comments
 (0)