|
4 | 4 | package main
|
5 | 5 |
|
6 | 6 | import (
|
7 |
| - "bytes" |
| 7 | + "bufio" |
| 8 | + "compress/gzip" |
8 | 9 | "encoding/xml"
|
9 | 10 | "flag"
|
10 | 11 | "fmt"
|
11 |
| - "io/ioutil" |
12 | 12 | "log"
|
13 |
| - "net/http" |
14 |
| - "sync" |
| 13 | + "os" |
15 | 14 | )
|
16 | 15 |
|
// Command-line flags.
var (
	// dir is the directory that main reads Posts.xml and PostHistory.xml from.
	dir = flag.String("dir", "", "Directory which holds Posts.xml and PostHistory.xml files")
	// output is the path of the gzip-compressed RDF file that main writes.
	output = flag.String("output", "posts.rdf.gz", "Output rdf.gz file")
)
|
21 | 20 |
|
22 | 21 | type PostHistory struct {
|
@@ -46,14 +45,6 @@ type Post struct {
|
46 | 45 | OwnerUserId string `xml:",attr"`
|
47 | 46 | }
|
48 | 47 |
|
49 |
| -type Posts struct { |
50 |
| - Rows []Post `xml:"row"` |
51 |
| -} |
52 |
| - |
53 |
| -type Logs struct { |
54 |
| - Rows []PostHistory `xml:"row"` |
55 |
| -} |
56 |
| - |
57 | 48 | func check(err error) {
|
58 | 49 | if err != nil {
|
59 | 50 | log.Fatal(err)
|
@@ -88,152 +79,197 @@ func parseTags(tagString string) []string {
|
88 | 79 | func main() {
|
89 | 80 | flag.Parse()
|
90 | 81 |
|
91 |
| - data, err := ioutil.ReadFile(*dir + "/Posts.xml") |
| 82 | + err := os.RemoveAll(*output) |
92 | 83 | check(err)
|
93 |
| - var posts Posts |
94 |
| - check(xml.Unmarshal(data, &posts)) |
95 | 84 |
|
96 |
| - data, err = ioutil.ReadFile(*dir + "/PostHistory.xml") |
| 85 | + o, err := os.OpenFile(*output, os.O_WRONLY|os.O_CREATE, 0755) |
97 | 86 | check(err)
|
98 |
| - var logs Logs |
99 |
| - check(xml.Unmarshal(data, &logs)) |
100 |
| - |
101 |
| - fmt.Println("dryrun: ", *dryRun) |
102 |
| - var wg sync.WaitGroup |
103 |
| - limiter := make(chan struct{}, 80) |
104 |
| - if *dryRun { |
105 |
| - limiter = make(chan struct{}, 1) |
106 |
| - } |
107 | 87 |
|
108 |
| - send := func(b *bytes.Buffer) { |
109 |
| - limiter <- struct{}{} |
110 |
| - // fmt.Println(b.String()) |
111 |
| - if *dryRun == false { |
112 |
| - //fmt.Println("POSTing") |
113 |
| - resp, err := http.Post("http://localhost:8080/query", "", b) |
114 |
| - check(err) |
115 |
| - _, err = ioutil.ReadAll(resp.Body) |
116 |
| - check(err) |
117 |
| - check(resp.Body.Close()) |
118 |
| - } |
119 |
| - wg.Done() |
120 |
| - <-limiter |
121 |
| - } |
122 |
| - |
123 |
| - // First generate all the versions. |
124 |
| - for _, p := range posts.Rows { |
125 |
| - var b bytes.Buffer |
| 88 | + pf, err := os.Open(*dir + "/Posts.xml") |
| 89 | + check(err) |
| 90 | + phf, err := os.Open(*dir + "/PostHistory.xml") |
| 91 | + check(err) |
| 92 | + w := gzip.NewWriter(o) |
126 | 93 |
|
127 |
| - node := "p" + p.Id |
128 |
| - b.WriteString("mutation { set { ") |
| 94 | + log.Println("1/2 Reading file") |
| 95 | + pc := bufio.NewReader(pf) |
| 96 | + pcd := xml.NewDecoder(pc) |
129 | 97 |
|
130 |
| - if len(p.LastEditDate) == 0 { |
131 |
| - p.LastEditDate = p.LastActivityDate |
132 |
| - } |
133 |
| - if len(p.LastEditorUserId) == 0 { |
134 |
| - p.LastEditorUserId = p.OwnerUserId |
135 |
| - } |
136 |
| - if len(p.LastEditorUserId) == 0 || len(p.LastEditDate) == 0 { |
137 |
| - continue |
138 |
| - } |
| 98 | + phc := bufio.NewReader(phf) |
| 99 | + phd := xml.NewDecoder(phc) |
139 | 100 |
|
140 |
| - // First create the versions correctly, and attach them to the node. |
141 |
| - { |
142 |
| - b.WriteString(fmt.Sprintf("_:newTitle <Timestamp> %q .\n", p.LastEditDate)) |
143 |
| - b.WriteString(fmt.Sprintf("_:newTitle <Author> <u%s> .\n", p.LastEditorUserId)) |
144 |
| - b.WriteString(fmt.Sprintf("_:newTitle <Post> <%s> .\n", node)) |
145 |
| - b.WriteString(fmt.Sprintf("_:newTitle <Text> %q .\n", p.Title)) |
146 |
| - b.WriteString(fmt.Sprintf("_:newTitle <Type> \"Title\" .\n")) |
| 101 | + var str string |
| 102 | + postHistoryIdx := 0 |
147 | 103 |
|
148 |
| - b.WriteString(fmt.Sprintf("<%s> <Title> _:newTitle .\n", node)) |
| 104 | + // First generate all the versions. |
| 105 | + for { |
| 106 | + t, _ := pcd.Token() |
| 107 | + if t == nil { |
| 108 | + break |
149 | 109 | }
|
150 | 110 |
|
151 |
| - // Generate tag node for each tag in the tag string |
152 |
| - tagList := parseTags(p.Tags) |
153 |
| - for idx, tag := range tagList { |
154 |
| - b.WriteString(fmt.Sprintf("<t-%v> <Timestamp> %q .\n", idx, p.LastEditDate)) |
155 |
| - b.WriteString(fmt.Sprintf("<t-%v> <Author> <u%s> .\n", idx, p.LastEditorUserId)) |
156 |
| - b.WriteString(fmt.Sprintf("<t-%v> <Post> <%s> .\n", idx, node)) |
157 |
| - b.WriteString(fmt.Sprintf("<t-%v> <Text> %q .\n", idx, tag)) |
158 |
| - b.WriteString(fmt.Sprintf("<t-%v> <Type> \"Tag\" .\n", idx)) |
159 |
| - |
160 |
| - b.WriteString(fmt.Sprintf("<%s> <Tags> <t-%v> .\n", node, idx)) |
| 111 | + switch se := t.(type) { |
| 112 | + case xml.StartElement: |
| 113 | + if se.Name.Local == "row" { |
| 114 | + var p Post |
| 115 | + pcd.DecodeElement(&p, &se) |
| 116 | + |
| 117 | + node := "p" + p.Id |
| 118 | + |
| 119 | + if len(p.LastEditDate) == 0 { |
| 120 | + p.LastEditDate = p.LastActivityDate |
| 121 | + } |
| 122 | + if len(p.LastEditorUserId) == 0 { |
| 123 | + p.LastEditorUserId = p.OwnerUserId |
| 124 | + } |
| 125 | + if len(p.LastEditorUserId) == 0 || len(p.LastEditDate) == 0 { |
| 126 | + continue |
| 127 | + } |
| 128 | + |
| 129 | + // First create the versions correctly, and attach them to the node. |
| 130 | + { |
| 131 | + str = fmt.Sprintf("<ph%v> <Timestamp> %q .\n", postHistoryIdx, p.LastEditDate) |
| 132 | + w.Write([]byte(str)) |
| 133 | + str = fmt.Sprintf("<ph%v> <Author> <u%s> .\n", postHistoryIdx, p.LastEditorUserId) |
| 134 | + w.Write([]byte(str)) |
| 135 | + str = fmt.Sprintf("<ph%v> <Post> <%s> .\n", postHistoryIdx, node) |
| 136 | + w.Write([]byte(str)) |
| 137 | + str = fmt.Sprintf("<ph%v> <Text> %q .\n", postHistoryIdx, p.Title) |
| 138 | + w.Write([]byte(str)) |
| 139 | + str = fmt.Sprintf("<ph%v> <Type> \"Title\" .\n", postHistoryIdx) |
| 140 | + w.Write([]byte(str)) |
| 141 | + str = fmt.Sprintf("<%s> <Title> <ph%v> .\n", node, postHistoryIdx) |
| 142 | + w.Write([]byte(str)) |
| 143 | + postHistoryIdx++ |
| 144 | + } |
| 145 | + |
| 146 | + // Generate tag node for each tag in the tag string |
| 147 | + tagList := parseTags(p.Tags) |
| 148 | + for idx, tag := range tagList { |
| 149 | + str = fmt.Sprintf("<t-%v> <Timestamp> %q .\n", idx, p.LastEditDate) |
| 150 | + w.Write([]byte(str)) |
| 151 | + str = fmt.Sprintf("<t-%v> <Author> <u%s> .\n", idx, p.LastEditorUserId) |
| 152 | + w.Write([]byte(str)) |
| 153 | + str = fmt.Sprintf("<t-%v> <Post> <%s> .\n", idx, node) |
| 154 | + w.Write([]byte(str)) |
| 155 | + str = fmt.Sprintf("<t-%v> <Text> %q .\n", idx, tag) |
| 156 | + w.Write([]byte(str)) |
| 157 | + str = fmt.Sprintf("<t-%v> <Type> \"Tag\" .\n", idx) |
| 158 | + w.Write([]byte(str)) |
| 159 | + str = fmt.Sprintf("<%s> <Tags> <t-%v> .\n", node, idx) |
| 160 | + w.Write([]byte(str)) |
| 161 | + } |
| 162 | + |
| 163 | + { |
| 164 | + str = fmt.Sprintf("<ph%v> <Timestamp> %q .\n", postHistoryIdx, p.LastEditDate) |
| 165 | + w.Write([]byte(str)) |
| 166 | + str = fmt.Sprintf("<ph%v> <Author> <u%s> .\n", postHistoryIdx, p.LastEditorUserId) |
| 167 | + w.Write([]byte(str)) |
| 168 | + str = fmt.Sprintf("<ph%v> <Post> <%s> .\n", postHistoryIdx, node) |
| 169 | + w.Write([]byte(str)) |
| 170 | + str = fmt.Sprintf("<ph%v> <Text> %q .\n", postHistoryIdx, p.Body) |
| 171 | + w.Write([]byte(str)) |
| 172 | + str = fmt.Sprintf("<ph%v> <Type> \"Body\" .\n", postHistoryIdx) |
| 173 | + w.Write([]byte(str)) |
| 174 | + str = fmt.Sprintf("<%s> <Body> <ph%v> .\n", node, postHistoryIdx) |
| 175 | + w.Write([]byte(str)) |
| 176 | + postHistoryIdx++ |
| 177 | + } |
| 178 | + |
| 179 | + // Now create the actual post. |
| 180 | + if p.PostTypeId == 1 { |
| 181 | + str = fmt.Sprintf("<%s> <Type> \"Question\" .\n", node) |
| 182 | + w.Write([]byte(str)) |
| 183 | + |
| 184 | + // Relation from question to accepted answer. |
| 185 | + if len(p.AcceptedAnswerId) > 0 { |
| 186 | + str = fmt.Sprintf("<%s> <Chosen.Answer> <p%s> .\n", node, p.AcceptedAnswerId) |
| 187 | + w.Write([]byte(str)) |
| 188 | + str = fmt.Sprintf("<%s> <Has.Answer> <p%s> .\n", node, p.AcceptedAnswerId) |
| 189 | + w.Write([]byte(str)) |
| 190 | + } |
| 191 | + |
| 192 | + } else if p.PostTypeId == 2 { |
| 193 | + str = fmt.Sprintf("<%s> <Type> \"Answer\" .\n", node) |
| 194 | + w.Write([]byte(str)) |
| 195 | + |
| 196 | + // Relation from question to answer. |
| 197 | + if len(p.ParentId) > 0 { |
| 198 | + str = fmt.Sprintf("<p%s> <Has.Answer> <%s> .\n", p.ParentId, node) |
| 199 | + w.Write([]byte(str)) |
| 200 | + } |
| 201 | + } else { |
| 202 | + // Not sure what this is. It isn't documented. |
| 203 | + continue |
| 204 | + } |
| 205 | + |
| 206 | + if len(p.OwnerUserId) > 0 { |
| 207 | + str = fmt.Sprintf("<%s> <Owner> <u%s> .\n", node, p.OwnerUserId) |
| 208 | + w.Write([]byte(str)) |
| 209 | + } |
| 210 | + //b.WriteString(fmt.Sprintf("<%s> <Score> \"%d\" .\n", node, p.Score)) |
| 211 | + str = fmt.Sprintf("<%s> <ViewCount> \"%d\" .\n", node, p.ViewCount) |
| 212 | + w.Write([]byte(str)) |
| 213 | + str = fmt.Sprintf("<%s> <Timestamp> %q .\n", node, p.CreationDate) |
| 214 | + w.Write([]byte(str)) |
| 215 | + } |
161 | 216 | }
|
| 217 | + } |
162 | 218 |
|
163 |
| - { |
164 |
| - b.WriteString(fmt.Sprintf("_:newBody <Timestamp> %q .\n", p.LastEditDate)) |
165 |
| - b.WriteString(fmt.Sprintf("_:newBody <Author> <u%s> .\n", p.LastEditorUserId)) |
166 |
| - b.WriteString(fmt.Sprintf("_:newBody <Post> <%s> .\n", node)) |
167 |
| - b.WriteString(fmt.Sprintf("_:newBody <Text> %q .\n", p.Body)) |
168 |
| - b.WriteString(fmt.Sprintf("_:newBody <Type> \"Body\" .\n")) |
169 |
| - |
170 |
| - b.WriteString(fmt.Sprintf("<%s> <Body> _:newBody .\n", node)) |
| 219 | + for { |
| 220 | + t, _ := phd.Token() |
| 221 | + if t == nil { |
| 222 | + break |
171 | 223 | }
|
172 | 224 |
|
173 |
| - // Now create the actual post. |
174 |
| - if p.PostTypeId == 1 { |
175 |
| - b.WriteString(fmt.Sprintf("<%s> <Type> \"Question\" .\n", node)) |
176 |
| - |
177 |
| - // Relation from question to accepted answer. |
178 |
| - if len(p.AcceptedAnswerId) > 0 { |
179 |
| - b.WriteString(fmt.Sprintf("<%s> <Chosen.Answer> <p%s> .\n", node, p.AcceptedAnswerId)) |
180 |
| - b.WriteString(fmt.Sprintf("<%s> <Has.Answer> <p%s> .\n", node, p.AcceptedAnswerId)) |
| 225 | + switch se := t.(type) { |
| 226 | + case xml.StartElement: |
| 227 | + if se.Name.Local == "row" { |
| 228 | + var l PostHistory |
| 229 | + phd.DecodeElement(&l, &se) |
| 230 | + |
| 231 | + if l.PostHistoryTypeId > 9 { |
| 232 | + // Ignore for this demo. |
| 233 | + continue |
| 234 | + } |
| 235 | + |
| 236 | + str = fmt.Sprintf("<ph%v> <Timestamp> %q .\n", postHistoryIdx, l.CreationDate) |
| 237 | + w.Write([]byte(str)) |
| 238 | + str = fmt.Sprintf("<ph%v> <Author> <u%s> .\n", postHistoryIdx, l.UserId) |
| 239 | + w.Write([]byte(str)) |
| 240 | + str = fmt.Sprintf("<ph%v> <Post> <p%s> .\n", postHistoryIdx, l.PostId) |
| 241 | + w.Write([]byte(str)) |
| 242 | + |
| 243 | + tid := l.PostHistoryTypeId % 3 |
| 244 | + |
| 245 | + switch tid { |
| 246 | + case 0: // Tags |
| 247 | + str = fmt.Sprintf("<ph%v> <Text> %q .\n", postHistoryIdx, l.Text) |
| 248 | + w.Write([]byte(str)) |
| 249 | + str = fmt.Sprintf("<ph%v> <Type> \"Tags\" .\n") |
| 250 | + w.Write([]byte(str)) |
| 251 | + case 1: // Title |
| 252 | + str = fmt.Sprintf("<ph%v> <Text> %q .\n", postHistoryIdx, l.Text) |
| 253 | + w.Write([]byte(str)) |
| 254 | + str = fmt.Sprintf("<ph%v> <Type> \"Title\" .\n") |
| 255 | + w.Write([]byte(str)) |
| 256 | + case 2: // Body |
| 257 | + str = fmt.Sprintf("<ph%v> <Text> %q .\n", postHistoryIdx, l.Text) |
| 258 | + w.Write([]byte(str)) |
| 259 | + str = fmt.Sprintf("<ph%v> <Type> \"Body\" .\n") |
| 260 | + w.Write([]byte(str)) |
| 261 | + } |
181 | 262 | }
|
182 |
| - |
183 |
| - } else if p.PostTypeId == 2 { |
184 |
| - b.WriteString(fmt.Sprintf("<%s> <Type> \"Answer\" .\n", node)) |
185 |
| - |
186 |
| - // Relation from question to answer. |
187 |
| - if len(p.ParentId) > 0 { |
188 |
| - b.WriteString(fmt.Sprintf("<p%s> <Has.Answer> <%s> .\n", p.ParentId, node)) |
189 |
| - } |
190 |
| - } else { |
191 |
| - // Not sure what this is. It isn't documented. |
192 |
| - continue |
193 |
| - } |
194 |
| - |
195 |
| - if len(p.OwnerUserId) > 0 { |
196 |
| - b.WriteString(fmt.Sprintf("<%s> <Owner> <u%s> .\n", node, p.OwnerUserId)) |
197 | 263 | }
|
198 |
| - //b.WriteString(fmt.Sprintf("<%s> <Score> \"%d\" .\n", node, p.Score)) |
199 |
| - b.WriteString(fmt.Sprintf("<%s> <ViewCount> \"%d\" .\n", node, p.ViewCount)) |
200 |
| - b.WriteString(fmt.Sprintf("<%s> <Timestamp> %q .\n", node, p.CreationDate)) |
201 |
| - |
202 |
| - b.WriteString("}}") |
203 |
| - wg.Add(1) |
204 |
| - go send(&b) |
205 | 264 | }
|
206 | 265 |
|
207 |
| - for _, l := range logs.Rows { |
208 |
| - if l.PostHistoryTypeId > 9 { |
209 |
| - // Ignore for this demo. |
210 |
| - continue |
211 |
| - } |
212 |
| - var b bytes.Buffer |
213 |
| - |
214 |
| - b.WriteString("mutation { set { ") |
215 |
| - b.WriteString(fmt.Sprintf("_:new <Timestamp> %q .\n", l.CreationDate)) |
216 |
| - b.WriteString(fmt.Sprintf("_:new <Author> <u%s> .\n", l.UserId)) |
217 |
| - b.WriteString(fmt.Sprintf("_:new <Post> <p%s> .\n", l.PostId)) |
218 |
| - |
219 |
| - tid := l.PostHistoryTypeId % 3 |
220 |
| - |
221 |
| - switch tid { |
222 |
| - case 0: // Tags |
223 |
| - b.WriteString(fmt.Sprintf("_:new <Text> %q .\n", l.Text)) |
224 |
| - b.WriteString(fmt.Sprintf("_:new <Type> \"Tags\" .\n")) |
225 |
| - case 1: // Title |
226 |
| - b.WriteString(fmt.Sprintf("_:new <Text> %q .\n", l.Text)) |
227 |
| - b.WriteString(fmt.Sprintf("_:new <Type> \"Title\" .\n")) |
228 |
| - case 2: // Body |
229 |
| - b.WriteString(fmt.Sprintf("_:new <Text> %q .\n", l.Text)) |
230 |
| - b.WriteString(fmt.Sprintf("_:new <Type> \"Body\" .\n")) |
231 |
| - } |
232 |
| - b.WriteString("}}") |
233 |
| - wg.Add(1) |
234 |
| - go send(&b) |
235 |
| - } |
236 |
| - wg.Wait() |
| 266 | + log.Println("Finished generating RDF.") |
| 267 | + err = w.Flush() |
| 268 | + check(err) |
| 269 | + |
| 270 | + err = w.Close() |
| 271 | + check(err) |
237 | 272 |
|
238 |
| - fmt.Println(len(posts.Rows), "processed") |
| 273 | + err = o.Close() |
| 274 | + check(err) |
239 | 275 | }
|
0 commit comments