diff --git a/internal/pkg/crawl/extractor/xml_test.go b/internal/pkg/crawl/extractor/xml_test.go
index 3dbb1cfa..5c975c86 100644
--- a/internal/pkg/crawl/extractor/xml_test.go
+++ b/internal/pkg/crawl/extractor/xml_test.go
@@ -2,21 +2,26 @@ package extractor
import (
"bytes"
+ "encoding/xml"
"io"
"net/http"
"net/url"
"os"
+ "strings"
"testing"
)
func TestXML(t *testing.T) {
tests := []struct {
- name string
- xmlBody string
- wantURLs []*url.URL
- wantURLsCount int
- wantErr bool
- sitemap bool
+ name string
+ xmlBody string
+ wantURLsLax []*url.URL
+ wantURLsStric []*url.URL
+ wantURLsCountLax int
+ wantURLsCountStric int
+ wantErrLax bool
+ wantErrStrict bool
+ sitemap bool
}{
{
name: "Valid XML with URLs",
@@ -28,26 +33,49 @@ func TestXML(t *testing.T) {
just some text
`,
- wantURLs: []*url.URL{
+ wantURLsLax: []*url.URL{
+ {Scheme: "http", Host: "example.com"},
+ {Scheme: "https", Host: "example.org"},
+ },
+ wantURLsStric: []*url.URL{
{Scheme: "http", Host: "example.com"},
{Scheme: "https", Host: "example.org"},
},
sitemap: false,
- wantErr: false,
},
{
- name: "Empty XML",
- xmlBody: ``,
- wantURLs: nil,
- wantErr: false,
- sitemap: false,
+ name: "unbalanced XML with URLs",
+ xmlBody: `
+
+ http://example.com
+
+ https://unclosed.example.com`,
+ wantURLsStric: []*url.URL{
+ {Scheme: "http", Host: "example.com"},
+ },
+ wantURLsLax: []*url.URL{
+ {Scheme: "http", Host: "example.com"},
+ {Scheme: "https", Host: "unclosed.example.com"},
+ },
+ wantErrStrict: true,
+ wantErrLax: false,
+ sitemap: false,
+ },
+ {
+ name: "Empty XML",
+ xmlBody: ``,
+ wantURLsStric: nil,
+ wantURLsLax: nil,
+ sitemap: false,
},
{
- name: "Invalid XML",
- xmlBody: ``,
- wantURLs: nil,
- wantErr: true,
- sitemap: false,
+ name: "alien XML",
+ xmlBody: `/>/,as;g^&R$W#Sf)(U>http://example.com
- not a valid url
`,
- wantURLs: []*url.URL{
+ wantURLsStric: []*url.URL{
{Scheme: "http", Host: "example.com"},
},
- wantErr: false,
- sitemap: false,
+ wantURLsLax: []*url.URL{
+ {Scheme: "http", Host: "example.com"},
+ },
+ wantErrStrict: false,
+ wantErrLax: false,
+ sitemap: false,
},
{
- name: "Huge sitemap",
- xmlBody: loadTestFile(t, "xml_test_sitemap.xml"),
- wantURLsCount: 100002,
- wantErr: false,
- sitemap: true,
+ name: "Huge sitemap",
+ xmlBody: loadTestFile(t, "xml_test_sitemap.xml"),
+ wantURLsCountStric: 100002,
+ wantURLsCountLax: 100002,
+ wantErrStrict: false,
+ wantErrLax: false,
+ sitemap: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- resp := &http.Response{
- Body: io.NopCloser(bytes.NewBufferString(tt.xmlBody)),
- }
-
- gotURLs, sitemap, err := XML(resp)
- if (err != nil) != tt.wantErr {
- t.Errorf("XML() error = %v, wantErr %v", err, tt.wantErr)
- return
- }
-
- if tt.wantURLsCount != 0 {
- if len(gotURLs) != tt.wantURLsCount {
- t.Errorf("XML() gotURLs count = %v, want %v", len(gotURLs), tt.wantURLsCount)
+ testMode := func(strict bool, wantErr bool, wantURLs []*url.URL, wantURLsCount int) {
+ resp := &http.Response{
+ Body: io.NopCloser(bytes.NewBufferString(tt.xmlBody)),
}
- }
-
- if tt.wantURLs != nil {
- if !compareURLs(gotURLs, tt.wantURLs) {
- t.Errorf("XML() gotURLs = %v, want %v", gotURLs, tt.wantURLs)
+ gotURLs, sitemap, err := XML(resp, strict)
+ if (err != nil) != wantErr {
+ t.Errorf("XML() strict = %v, error = %v, wantErr %v", strict, err, wantErr)
+ return
+ }
+ if wantURLsCount != 0 && len(gotURLs) != wantURLsCount {
+ t.Errorf("XML() strict = %v, gotURLs count = %v, want %v", strict, len(gotURLs), wantURLsCount)
+ }
+ if wantURLs != nil && !compareURLs(gotURLs, wantURLs) {
+ t.Errorf("XML() strict = %v, gotURLs = %v, want %v", strict, gotURLs, wantURLs)
+ }
+ if tt.sitemap != sitemap {
+ t.Errorf("XML() strict = %v, sitemap = %v, want %v", strict, sitemap, tt.sitemap)
}
}
- if tt.sitemap != sitemap {
- t.Errorf("XML() sitemap = %v, want %v", sitemap, tt.sitemap)
- }
+ // Strict mode
+ testMode(true, tt.wantErrStrict, tt.wantURLsStric, tt.wantURLsCountStric)
+
+ // Lax mode
+ testMode(false, tt.wantErrLax, tt.wantURLsLax, tt.wantURLsCountLax)
})
}
}
@@ -116,14 +150,32 @@ func loadTestFile(t *testing.T, path string) string {
return string(b)
}
-func TestXMLBodyReadError(t *testing.T) {
+func TestXMLBodySyntaxEOFErrorStrict(t *testing.T) {
+ wantErr := xml.SyntaxError{Line: 3, Msg: "unexpected EOF"}
resp := &http.Response{
- Body: io.NopCloser(bytes.NewReader([]byte{})), // Empty reader to simulate EOF
+ Body: io.NopCloser(strings.NewReader(
+ `
+
+ `)),
}
- resp.Body.Close() // Close the body to simulate a read error
-
- _, _, err := XML(resp)
+ _, _, err := XML(resp, true)
if err == nil {
- t.Errorf("XML() expected error, got nil")
+ t.Errorf("XML() error = %v, wantErr %v", err, wantErr)
+ return
+ }
+ if err.Error() != wantErr.Error() {
+ t.Errorf("XML() error = %v, wantErr %v", err, wantErr)
+ }
+}
+
+func TestXMLBodySyntaxEOFErrorLax(t *testing.T) {
+ resp := &http.Response{
+ Body: io.NopCloser(strings.NewReader(`
+
+ `)),
+ }
+ _, _, err := XML(resp, false)
+ if err != nil {
+ t.Errorf("XML() error = %v, wantErr nil", err)
}
}