fetcher.go
package main

import (
	"bytes"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"

	markdown "github.com/JohannesKaufmann/html-to-markdown"
	"github.com/PuerkitoBio/goquery"
	"github.com/go-shiori/go-readability"
	"golang.org/x/net/html"
)
// Options holds the crawl settings shared by every fetch.
type Options struct {
	Concurrency     int      // number of concurrent fetch workers
	Silent          bool     // suppress informational log output
	Limit           int      // maximum number of pages to collect (0 means no limit)
	Matches         []string // URL match patterns (consumed outside this file)
	ContentSelector string   // optional CSS selector for the main content element
}
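
// For illustration only, a caller might configure a crawl like this
// (field values are hypothetical, not defaults from this project):
//
//	opts := Options{
//		Concurrency:     8,
//		Limit:           50,
//		ContentSelector: "main article",
//	}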
// Page is one crawled document: its title, source URL, and extracted
// content as Markdown.
type Page struct {
	Title   string `json:"title"`
	URL     string `json:"url"`
	Content string `json:"content"`
}
// fetchPage downloads a single URL, queues any same-host links it
// finds, extracts the readable content as Markdown, and stores the
// result in the shared pages map.
func fetchPage(urlStr string, logger *Logger, opts Options) {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		logger.Warn("Invalid URL:", urlStr)
		return
	}

	client := &http.Client{
		Timeout: 15 * time.Second,
	}
	resp, err := client.Get(urlStr)
	if err != nil {
		logger.Warn("Failed to fetch", urlStr, ":", err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		logger.Warn("Failed to fetch", urlStr, "status:", resp.Status)
		return
	}

	// Skip non-HTML responses (PDFs, images, and the like).
	contentType := resp.Header.Get("Content-Type")
	if !strings.Contains(contentType, "text/html") {
		logger.Warn("Not an HTML page:", urlStr)
		return
	}

	// resp.Request.URL is the final URL after any redirects; refuse
	// pages that were redirected off the original host.
	if resp.Request.URL.Host != parsedURL.Host {
		logger.Warn("Redirected from", parsedURL.Host, "to", resp.Request.URL.Host)
		return
	}

	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		logger.Warn("Failed to read body from", urlStr, ":", err)
		return
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(bodyBytes))
	if err != nil {
		logger.Warn("Failed to parse HTML for", urlStr, ":", err)
		return
	}

	// Drop elements that carry no readable text before extraction.
	doc.Find("script, style, link, img, video").Remove()
	// Extract linked URLs in the order that they appear and queue the
	// same-host ones for crawling.
	doc.Find("a").Each(func(i int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if !exists || strings.TrimSpace(href) == "" {
			return
		}
		linkURL, err := url.Parse(href)
		if err != nil {
			return
		}
		// Resolve relative hrefs against the page URL and stay on-host.
		absURL := parsedURL.ResolveReference(linkURL)
		if absURL.Host != parsedURL.Host {
			return
		}
		enqueue(absURL.String(), false, opts, logger)
	})
	pageTitle := doc.Find("title").Text()

	// If a content selector was given, render just the first matching
	// element; otherwise fall back to the whole document below.
	var htmlContent string
	if opts.ContentSelector != "" {
		selection := doc.Find(opts.ContentSelector).First()
		if selection.Length() > 0 {
			var buf bytes.Buffer
			for _, node := range selection.Nodes {
				if err := html.Render(&buf, node); err == nil {
					htmlContent = buf.String()
					break
				}
			}
		}
	}
	if htmlContent == "" {
		htmlContent, err = doc.Html()
		if err != nil {
			logger.Warn("Failed to get HTML content for", urlStr)
			return
		}
	}
	// Run readability to isolate the article body, then convert it to
	// Markdown.
	article, err := readability.FromReader(strings.NewReader(htmlContent), parsedURL)
	if err != nil {
		logger.Warn("Failed to parse article for", urlStr, ":", err)
		return
	}

	converter := markdown.NewConverter("", true, nil)
	mdContent, err := converter.ConvertString(article.Content)
	if err != nil {
		logger.Warn("Failed to convert HTML to Markdown for", urlStr, ":", err)
		return
	}

	// Prefer the title readability found; fall back to the <title> tag.
	finalTitle := article.Title
	if finalTitle == "" {
		finalTitle = pageTitle
	}
	// Store the page under its normalized URL, respecting the page limit.
	norm := normalizeURL(urlStr)
	pagesMu.Lock()
	if opts.Limit == 0 || len(pages) < opts.Limit {
		pages[norm] = Page{
			Title:   finalTitle,
			URL:     urlStr,
			Content: mdContent,
		}
	}
	pagesMu.Unlock()
}
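
fetcher.go refers to several identifiers defined elsewhere in package main: Logger, enqueue, normalizeURL, pages, and pagesMu. Those files are not shown here. As a rough sketch of the shapes this code assumes (names from the source; bodies and signatures are hypothetical beyond what the call sites require):

package main

import (
	"log"
	"strings"
	"sync"
)

// Shared crawl state assumed by fetchPage: collected pages keyed by
// normalized URL, guarded by a mutex.
var (
	pagesMu sync.Mutex
	pages   = make(map[string]Page)
)

// Logger is assumed to be a thin logging wrapper, presumably honoring
// Options.Silent.
type Logger struct {
	Silent bool
}

// Warn matches the variadic calls in fetchPage.
func (l *Logger) Warn(args ...interface{}) {
	if !l.Silent {
		log.Println(args...)
	}
}

// enqueue is assumed to schedule a URL for crawling; deduplication and
// filtering against opts.Matches would live in the queue, not here. The
// meaning of the second argument is not visible in fetcher.go.
func enqueue(urlStr string, flag bool, opts Options, logger *Logger) {
	// queueing logic lives in another file
}

// normalizeURL is assumed to canonicalize a URL for use as a map key,
// for example by trimming a trailing slash.
func normalizeURL(urlStr string) string {
	return strings.TrimRight(urlStr, "/")
}

With those pieces in place, a caller would presumably seed the crawl and let workers invoke fetchPage along these lines (values illustrative; the real entry point and worker pool are in files not shown):

	opts := Options{Concurrency: 8, Limit: 100}
	logger := &Logger{Silent: opts.Silent}
	fetchPage("https://example.com/docs/", logger, opts)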