// extract_href is a command line tool for extracting urls from an HTML web page,
// writing each url on a new line.
// Each matched url is:
// * absolute (resolved against the source url)
// * unique - (no duplicates are added to the list)
// * refers to a separate resource - (no url fragments)
// It uses a jquery-style selector to search the HTML document for elements that have an href attribute
// and builds a de-duplicated list from those attributes.
// It has three major command line options:
// * u - (required) url of the html document to fetch
// * o - path to an output file; when omitted, results are written to stderr
// * s - (default: "a") jquery-style selector to match html elements
// example use:
// ```
// ./extract_href -u https://www.epa.gov/endangered-species/biological-evaluation-chapters-chlorpyrifos-esa-assessment -s '.main-column.clearfix a'
// ```
// This will fetch the epa.gov url, select all "a" tags in the document that are descendants of any element with the classes "main-column" and "clearfix",
// and build a deduplicated list of absolute urls from the `href` attribute of each anchor tag found. Run that same command with `-o urls.txt` added to save
// the results to a file and print output stats to stdout instead.
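// For example, the same command again with the results saved to urls.txt:
// ```
// ./extract_href -u https://www.epa.gov/endangered-species/biological-evaluation-chapters-chlorpyrifos-esa-assessment -s '.main-column.clearfix a' -o urls.txt
// ```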
// Picking the right jquery selector is a bit of an art; the goal is to isolate the most general part of the page that contains all of the links
// that you're after. For more information on jquery selectors and how they work, have a look here: https://learn.jquery.com/using-jquery-core/selecting-elements/
// When in doubt, it's often fine to leave the default "a" selector, which will generate lots of links you may not want,
// and manually remove them from the output file.
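// As a purely illustrative (hypothetical) alternative, a selector like 'div.content a' would match only anchor
// tags nested inside <div class="content"> elements, producing a much shorter list than the bare "a" selector.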
package main

import (
	"flag"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"

	"github.com/PuerkitoBio/goquery"
)

var (
	showHelp bool
	outFile  string
	rootUrl  string
	selector string
)

func init() {
	flag.BoolVar(&showHelp, "h", false, "print help text")
	flag.StringVar(&outFile, "o", "", "path to write file to")
	flag.StringVar(&rootUrl, "u", "", "url to fetch links from")
	flag.StringVar(&selector, "s", "a", "jquery-style selector to scope url search to, default is 'a'")
}

func main() {
	// parse flags, grabbing values from the command line
	flag.Parse()

	if len(os.Args) == 1 || showHelp {
		PrintHelpText()
		return
	}

	// allocate a new results writer
	w, err := NewResultsWriter(outFile)
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	stats, err := FetchAndWriteHrefAttrs(rootUrl, selector, w)
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	// if results are being written to a file rather than stderr, report stats on stdout
	if w != os.Stderr {
		fmt.Println(stats)
	}

	// check to see if our writer implements the closer interface,
	// call close if so
	if closer, ok := w.(io.Closer); ok {
		if err := closer.Close(); err != nil {
			fmt.Println(err.Error())
			return
		}
	}
}

// Stats tracks state for extracting hrefs from a given document
type Stats struct {
	// elements matched by selector
	Elements int
	// elements that have an "href" attribute
	WithHref int
	// elements whose absolute url was a duplicate (or pointed back to the source url)
	Duplicates int
	// elements with a valid, unique url
	ValidUrl int
}

// String implements the fmt.Stringer interface, giving Stats a human-readable form
func (s *Stats) String() string {
	return fmt.Sprintf("%d matched HTML elements\n%d had an href attribute\n%d were duplicates\n%d were valid\n", s.Elements, s.WithHref, s.Duplicates, s.ValidUrl)
}

// NewResultsWriter returns a writer for results: the named file when a path is provided, os.Stderr otherwise
func NewResultsWriter(path string) (io.Writer, error) {
	if path != "" {
		return os.Create(path)
	}
	return os.Stderr, nil
}

// FetchAndWriteHrefAttrs fetches the given url, uses the provided jquery-style selector to find
// elements with an "href" attribute in the returned HTML document, and writes a line-delimited
// list of deduplicated absolute urls to w
func FetchAndWriteHrefAttrs(rootUrl, selector string, w io.Writer) (*Stats, error) {
	// check for required params
	if rootUrl == "" {
		return nil, fmt.Errorf("url is required")
	}

	root, err := url.Parse(rootUrl)
	if err != nil {
		return nil, err
	}

	doc, err := fetchGoqueryUrl(rootUrl)
	if err != nil {
		return nil, err
	}

	// find the selected HTML elements
	elements := doc.Find(selector)
	// create a stats object with the total number of matched elements
	stats := &Stats{Elements: elements.Length()}
	// added records urls that have already been written, for deduplication
	added := map[string]bool{}

	// iterate through matched elements
	for i := range elements.Nodes {
		el := elements.Eq(i)
		if href, exists := el.Attr("href"); exists {
			stats.WithHref++
			// resolve any relative url references by parsing the href
			// relative to the root url
			abs, err := root.Parse(href)
			if err != nil {
				// TODO - handle error here
				continue
			}
			// strip the fragment so urls that differ only by fragment
			// are treated as the same resource
			abs.Fragment = ""
			absStr := abs.String()
			if absStr != rootUrl && !added[absStr] {
				added[absStr] = true
				stats.ValidUrl++
				// write the url as a line to the writer
				if _, err := fmt.Fprintln(w, absStr); err != nil {
					return stats, err
				}
			} else {
				// self-references and repeats both count as duplicates
				stats.Duplicates++
			}
		}
	}

	return stats, nil
}

// fetchGoqueryUrl performs a GET against the passed-in url, returning a parsed goquery document
func fetchGoqueryUrl(urlstr string) (*goquery.Document, error) {
	resp, err := http.Get(urlstr)
	if err != nil {
		return nil, err
	}
	// close the response body once it has been parsed
	defer resp.Body.Close()
	return goquery.NewDocumentFromReader(resp.Body)
}

// PrintHelpText outputs instructions for using this program to os.Stdout
func PrintHelpText() {
	fmt.Println(`
extract_href is a command line tool for extracting urls from an HTML web page,
writing each url on a new line.
Each matched url is:
* absolute (resolved against the source url)
* unique - (no duplicates are added to the list)
* refers to a separate resource - (no url fragments)
extract_href uses a jquery-style selector to search the HTML document for elements that have an href attribute
to construct a de-duplicated list of href attributes.

options:`)
	flag.PrintDefaults()
}