@@ -2,21 +2,26 @@ package extractor
2
2
3
3
import (
4
4
"bytes"
5
+ "encoding/xml"
5
6
"io"
6
7
"net/http"
7
8
"net/url"
8
9
"os"
10
+ "strings"
9
11
"testing"
10
12
)
11
13
12
14
func TestXML (t * testing.T ) {
13
15
tests := []struct {
14
- name string
15
- xmlBody string
16
- wantURLs []* url.URL
17
- wantURLsCount int
18
- wantErr bool
19
- sitemap bool
16
+ name string
17
+ xmlBody string
18
+ wantURLsLax []* url.URL
19
+ wantURLsStric []* url.URL
20
+ wantURLsCountLax int
21
+ wantURLsCountStric int
22
+ wantErrLax bool
23
+ wantErrStrict bool
24
+ sitemap bool
20
25
}{
21
26
{
22
27
name : "Valid XML with URLs" ,
@@ -28,26 +33,49 @@ func TestXML(t *testing.T) {
28
33
</nested>
29
34
<noturl>just some text</noturl>
30
35
</root>` ,
31
- wantURLs : []* url.URL {
36
+ wantURLsLax : []* url.URL {
37
+ {Scheme : "http" , Host : "example.com" },
38
+ {Scheme : "https" , Host : "example.org" },
39
+ },
40
+ wantURLsStric : []* url.URL {
32
41
{Scheme : "http" , Host : "example.com" },
33
42
{Scheme : "https" , Host : "example.org" },
34
43
},
35
44
sitemap : false ,
36
- wantErr : false ,
37
45
},
38
46
{
39
- name : "Empty XML" ,
40
- xmlBody : `<root></root>` ,
41
- wantURLs : nil ,
42
- wantErr : false ,
43
- sitemap : false ,
47
+ name : "unbalanced XML with URLs" ,
48
+ xmlBody : `
49
+ <unbalance>
50
+ <url>http://example.com</url>
51
+ </unbalance></unbalance></unbalance>
52
+ <outsideurl>https://unclosed.example.com</outsideurl>` ,
53
+ wantURLsStric : []* url.URL {
54
+ {Scheme : "http" , Host : "example.com" },
55
+ },
56
+ wantURLsLax : []* url.URL {
57
+ {Scheme : "http" , Host : "example.com" },
58
+ {Scheme : "https" , Host : "unclosed.example.com" },
59
+ },
60
+ wantErrStrict : true ,
61
+ wantErrLax : false ,
62
+ sitemap : false ,
63
+ },
64
+ {
65
+ name : "Empty XML" ,
66
+ xmlBody : `<root></root>` ,
67
+ wantURLsStric : nil ,
68
+ wantURLsLax : nil ,
69
+ sitemap : false ,
44
70
},
45
71
{
46
- name : "Invalid XML" ,
47
- xmlBody : `<root><unclosed>` ,
48
- wantURLs : nil ,
49
- wantErr : true ,
50
- sitemap : false ,
72
+ name : "alien XML" ,
73
+ xmlBody : `<h4 73><?/>/<AS "='AS "ASD@'SD>,as;g^&R$W#Sf)(U><l;rpkv ]])` ,
74
+ wantURLsStric : nil ,
75
+ wantURLsLax : nil ,
76
+ wantErrStrict : true ,
77
+ wantErrLax : true ,
78
+ sitemap : false ,
51
79
},
52
80
{
53
81
name : "XML with invalid URL" ,
@@ -56,48 +84,54 @@ func TestXML(t *testing.T) {
56
84
<item>http://example.com</item>
57
85
<item>not a valid url</item>
58
86
</root>` ,
59
- wantURLs : []* url.URL {
87
+ wantURLsStric : []* url.URL {
60
88
{Scheme : "http" , Host : "example.com" },
61
89
},
62
- wantErr : false ,
63
- sitemap : false ,
90
+ wantURLsLax : []* url.URL {
91
+ {Scheme : "http" , Host : "example.com" },
92
+ },
93
+ wantErrStrict : false ,
94
+ wantErrLax : false ,
95
+ sitemap : false ,
64
96
},
65
97
{
66
- name : "Huge sitemap" ,
67
- xmlBody : loadTestFile (t , "xml_test_sitemap.xml" ),
68
- wantURLsCount : 100002 ,
69
- wantErr : false ,
70
- sitemap : true ,
98
+ name : "Huge sitemap" ,
99
+ xmlBody : loadTestFile (t , "xml_test_sitemap.xml" ),
100
+ wantURLsCountStric : 100002 ,
101
+ wantURLsCountLax : 100002 ,
102
+ wantErrStrict : false ,
103
+ wantErrLax : false ,
104
+ sitemap : true ,
71
105
},
72
106
}
73
107
74
108
for _ , tt := range tests {
75
109
t .Run (tt .name , func (t * testing.T ) {
76
- resp := & http.Response {
77
- Body : io .NopCloser (bytes .NewBufferString (tt .xmlBody )),
78
- }
79
-
80
- gotURLs , sitemap , err := XML (resp )
81
- if (err != nil ) != tt .wantErr {
82
- t .Errorf ("XML() error = %v, wantErr %v" , err , tt .wantErr )
83
- return
84
- }
85
-
86
- if tt .wantURLsCount != 0 {
87
- if len (gotURLs ) != tt .wantURLsCount {
88
- t .Errorf ("XML() gotURLs count = %v, want %v" , len (gotURLs ), tt .wantURLsCount )
110
+ testMode := func (strict bool , wantErr bool , wantURLs []* url.URL , wantURLsCount int ) {
111
+ resp := & http.Response {
112
+ Body : io .NopCloser (bytes .NewBufferString (tt .xmlBody )),
89
113
}
90
- }
91
-
92
- if tt .wantURLs != nil {
93
- if ! compareURLs (gotURLs , tt .wantURLs ) {
94
- t .Errorf ("XML() gotURLs = %v, want %v" , gotURLs , tt .wantURLs )
114
+ gotURLs , sitemap , err := XML (resp , strict )
115
+ if (err != nil ) != wantErr {
116
+ t .Errorf ("XML() strict = %v, error = %v, wantErr %v" , strict , err , wantErr )
117
+ return
118
+ }
119
+ if wantURLsCount != 0 && len (gotURLs ) != wantURLsCount {
120
+ t .Errorf ("XML() strict = %v, gotURLs count = %v, want %v" , strict , len (gotURLs ), wantURLsCount )
121
+ }
122
+ if wantURLs != nil && ! compareURLs (gotURLs , wantURLs ) {
123
+ t .Errorf ("XML() strict = %v, gotURLs = %v, want %v" , strict , gotURLs , wantURLs )
124
+ }
125
+ if tt .sitemap != sitemap {
126
+ t .Errorf ("XML() strict = %v, sitemap = %v, want %v" , strict , sitemap , tt .sitemap )
95
127
}
96
128
}
97
129
98
- if tt .sitemap != sitemap {
99
- t .Errorf ("XML() sitemap = %v, want %v" , sitemap , tt .sitemap )
100
- }
130
+ // Strict mode
131
+ testMode (true , tt .wantErrStrict , tt .wantURLsStric , tt .wantURLsCountStric )
132
+
133
+ // Lax mode
134
+ testMode (false , tt .wantErrLax , tt .wantURLsLax , tt .wantURLsCountLax )
101
135
})
102
136
}
103
137
}
@@ -116,14 +150,32 @@ func loadTestFile(t *testing.T, path string) string {
116
150
return string (b )
117
151
}
118
152
119
- func TestXMLBodyReadError (t * testing.T ) {
153
+ func TestXMLBodySyntaxEOFErrorStrict (t * testing.T ) {
154
+ wantErr := xml.SyntaxError {Line : 3 , Msg : "unexpected EOF" }
120
155
resp := & http.Response {
121
- Body : io .NopCloser (bytes .NewReader ([]byte {})), // Empty reader to simulate EOF
156
+ Body : io .NopCloser (strings .NewReader (
157
+ `<unclosed>
158
+ <closed>
159
+ </closed> <!-- Syntax EOF here -->` )),
122
160
}
123
- resp .Body .Close () // Close the body to simulate a read error
124
-
125
- _ , _ , err := XML (resp )
161
+ _ , _ , err := XML (resp , true )
126
162
if err == nil {
127
- t .Errorf ("XML() expected error, got nil" )
163
+ t .Errorf ("XML() error = %v, wantErr %v" , err , wantErr )
164
+ return
165
+ }
166
+ if err .Error () != wantErr .Error () {
167
+ t .Errorf ("XML() error = %v, wantErr %v" , err , wantErr )
168
+ }
169
+ }
170
+
171
+ func TestXMLBodySyntaxEOFErrorLax (t * testing.T ) {
172
+ resp := & http.Response {
173
+ Body : io .NopCloser (strings .NewReader (`<unclosed>
174
+ <closed>
175
+ </closed> <!-- ignore Syntax EOF here -->` )),
176
+ }
177
+ _ , _ , err := XML (resp , false )
178
+ if err != nil {
179
+ t .Errorf ("XML() error = %v, wantErr nil" , err )
128
180
}
129
181
}
0 commit comments