Skip to content
This repository was archived by the owner on Jul 16, 2024. It is now read-only.

Commit 709334d

Browse files
author
Sung Won Cho
committed
Generate RDF for posts
1 parent bcb0f3f commit 709334d

File tree

2 files changed

+179
-143
lines changed

2 files changed

+179
-143
lines changed

posts/main.go

+178 -142
Original file line number | Diff line number | Diff line change
@@ -4,19 +4,18 @@
44
package main
55

66
import (
7-
"bytes"
7+
"bufio"
8+
"compress/gzip"
89
"encoding/xml"
910
"flag"
1011
"fmt"
11-
"io/ioutil"
1212
"log"
13-
"net/http"
14-
"sync"
13+
"os"
1514
)
1615

1716
var (
1817
dir = flag.String("dir", "", "Directory which holds Users.xml file")
19-
dryRun = flag.Bool("dry", true, "Only show mutations.")
18+
output = flag.String("output", "posts.rdf.gz", "Output rdf.gz file")
2019
)
2120

2221
type PostHistory struct {
@@ -46,14 +45,6 @@ type Post struct {
4645
OwnerUserId string `xml:",attr"`
4746
}
4847

49-
type Posts struct {
50-
Rows []Post `xml:"row"`
51-
}
52-
53-
type Logs struct {
54-
Rows []PostHistory `xml:"row"`
55-
}
56-
5748
func check(err error) {
5849
if err != nil {
5950
log.Fatal(err)
@@ -88,152 +79,197 @@ func parseTags(tagString string) []string {
8879
func main() {
8980
flag.Parse()
9081

91-
data, err := ioutil.ReadFile(*dir + "/Posts.xml")
82+
err := os.RemoveAll(*output)
9283
check(err)
93-
var posts Posts
94-
check(xml.Unmarshal(data, &posts))
9584

96-
data, err = ioutil.ReadFile(*dir + "/PostHistory.xml")
85+
o, err := os.OpenFile(*output, os.O_WRONLY|os.O_CREATE, 0755)
9786
check(err)
98-
var logs Logs
99-
check(xml.Unmarshal(data, &logs))
100-
101-
fmt.Println("dryrun: ", *dryRun)
102-
var wg sync.WaitGroup
103-
limiter := make(chan struct{}, 80)
104-
if *dryRun {
105-
limiter = make(chan struct{}, 1)
106-
}
10787

108-
send := func(b *bytes.Buffer) {
109-
limiter <- struct{}{}
110-
// fmt.Println(b.String())
111-
if *dryRun == false {
112-
//fmt.Println("POSTing")
113-
resp, err := http.Post("http://localhost:8080/query", "", b)
114-
check(err)
115-
_, err = ioutil.ReadAll(resp.Body)
116-
check(err)
117-
check(resp.Body.Close())
118-
}
119-
wg.Done()
120-
<-limiter
121-
}
122-
123-
// First generate all the versions.
124-
for _, p := range posts.Rows {
125-
var b bytes.Buffer
88+
pf, err := os.Open(*dir + "/Posts.xml")
89+
check(err)
90+
phf, err := os.Open(*dir + "/PostHistory.xml")
91+
check(err)
92+
w := gzip.NewWriter(o)
12693

127-
node := "p" + p.Id
128-
b.WriteString("mutation { set { ")
94+
log.Println("1/2 Reading file")
95+
pc := bufio.NewReader(pf)
96+
pcd := xml.NewDecoder(pc)
12997

130-
if len(p.LastEditDate) == 0 {
131-
p.LastEditDate = p.LastActivityDate
132-
}
133-
if len(p.LastEditorUserId) == 0 {
134-
p.LastEditorUserId = p.OwnerUserId
135-
}
136-
if len(p.LastEditorUserId) == 0 || len(p.LastEditDate) == 0 {
137-
continue
138-
}
98+
phc := bufio.NewReader(phf)
99+
phd := xml.NewDecoder(phc)
139100

140-
// First create the versions correctly, and attach them to the node.
141-
{
142-
b.WriteString(fmt.Sprintf("_:newTitle <Timestamp> %q .\n", p.LastEditDate))
143-
b.WriteString(fmt.Sprintf("_:newTitle <Author> <u%s> .\n", p.LastEditorUserId))
144-
b.WriteString(fmt.Sprintf("_:newTitle <Post> <%s> .\n", node))
145-
b.WriteString(fmt.Sprintf("_:newTitle <Text> %q .\n", p.Title))
146-
b.WriteString(fmt.Sprintf("_:newTitle <Type> \"Title\" .\n"))
101+
var str string
102+
postHistoryIdx := 0
147103

148-
b.WriteString(fmt.Sprintf("<%s> <Title> _:newTitle .\n", node))
104+
// First generate all the versions.
105+
for {
106+
t, _ := pcd.Token()
107+
if t == nil {
108+
break
149109
}
150110

151-
// Generate tag node for each tag in the tag string
152-
tagList := parseTags(p.Tags)
153-
for idx, tag := range tagList {
154-
b.WriteString(fmt.Sprintf("<t-%v> <Timestamp> %q .\n", idx, p.LastEditDate))
155-
b.WriteString(fmt.Sprintf("<t-%v> <Author> <u%s> .\n", idx, p.LastEditorUserId))
156-
b.WriteString(fmt.Sprintf("<t-%v> <Post> <%s> .\n", idx, node))
157-
b.WriteString(fmt.Sprintf("<t-%v> <Text> %q .\n", idx, tag))
158-
b.WriteString(fmt.Sprintf("<t-%v> <Type> \"Tag\" .\n", idx))
159-
160-
b.WriteString(fmt.Sprintf("<%s> <Tags> <t-%v> .\n", node, idx))
111+
switch se := t.(type) {
112+
case xml.StartElement:
113+
if se.Name.Local == "row" {
114+
var p Post
115+
pcd.DecodeElement(&p, &se)
116+
117+
node := "p" + p.Id
118+
119+
if len(p.LastEditDate) == 0 {
120+
p.LastEditDate = p.LastActivityDate
121+
}
122+
if len(p.LastEditorUserId) == 0 {
123+
p.LastEditorUserId = p.OwnerUserId
124+
}
125+
if len(p.LastEditorUserId) == 0 || len(p.LastEditDate) == 0 {
126+
continue
127+
}
128+
129+
// First create the versions correctly, and attach them to the node.
130+
{
131+
str = fmt.Sprintf("<ph%v> <Timestamp> %q .\n", postHistoryIdx, p.LastEditDate)
132+
w.Write([]byte(str))
133+
str = fmt.Sprintf("<ph%v> <Author> <u%s> .\n", postHistoryIdx, p.LastEditorUserId)
134+
w.Write([]byte(str))
135+
str = fmt.Sprintf("<ph%v> <Post> <%s> .\n", postHistoryIdx, node)
136+
w.Write([]byte(str))
137+
str = fmt.Sprintf("<ph%v> <Text> %q .\n", postHistoryIdx, p.Title)
138+
w.Write([]byte(str))
139+
str = fmt.Sprintf("<ph%v> <Type> \"Title\" .\n", postHistoryIdx)
140+
w.Write([]byte(str))
141+
str = fmt.Sprintf("<%s> <Title> <ph%v> .\n", node, postHistoryIdx)
142+
w.Write([]byte(str))
143+
postHistoryIdx++
144+
}
145+
146+
// Generate tag node for each tag in the tag string
147+
tagList := parseTags(p.Tags)
148+
for idx, tag := range tagList {
149+
str = fmt.Sprintf("<t-%v> <Timestamp> %q .\n", idx, p.LastEditDate)
150+
w.Write([]byte(str))
151+
str = fmt.Sprintf("<t-%v> <Author> <u%s> .\n", idx, p.LastEditorUserId)
152+
w.Write([]byte(str))
153+
str = fmt.Sprintf("<t-%v> <Post> <%s> .\n", idx, node)
154+
w.Write([]byte(str))
155+
str = fmt.Sprintf("<t-%v> <Text> %q .\n", idx, tag)
156+
w.Write([]byte(str))
157+
str = fmt.Sprintf("<t-%v> <Type> \"Tag\" .\n", idx)
158+
w.Write([]byte(str))
159+
str = fmt.Sprintf("<%s> <Tags> <t-%v> .\n", node, idx)
160+
w.Write([]byte(str))
161+
}
162+
163+
{
164+
str = fmt.Sprintf("<ph%v> <Timestamp> %q .\n", postHistoryIdx, p.LastEditDate)
165+
w.Write([]byte(str))
166+
str = fmt.Sprintf("<ph%v> <Author> <u%s> .\n", postHistoryIdx, p.LastEditorUserId)
167+
w.Write([]byte(str))
168+
str = fmt.Sprintf("<ph%v> <Post> <%s> .\n", postHistoryIdx, node)
169+
w.Write([]byte(str))
170+
str = fmt.Sprintf("<ph%v> <Text> %q .\n", postHistoryIdx, p.Body)
171+
w.Write([]byte(str))
172+
str = fmt.Sprintf("<ph%v> <Type> \"Body\" .\n", postHistoryIdx)
173+
w.Write([]byte(str))
174+
str = fmt.Sprintf("<%s> <Body> <ph%v> .\n", node, postHistoryIdx)
175+
w.Write([]byte(str))
176+
postHistoryIdx++
177+
}
178+
179+
// Now create the actual post.
180+
if p.PostTypeId == 1 {
181+
str = fmt.Sprintf("<%s> <Type> \"Question\" .\n", node)
182+
w.Write([]byte(str))
183+
184+
// Relation from question to accepted answer.
185+
if len(p.AcceptedAnswerId) > 0 {
186+
str = fmt.Sprintf("<%s> <Chosen.Answer> <p%s> .\n", node, p.AcceptedAnswerId)
187+
w.Write([]byte(str))
188+
str = fmt.Sprintf("<%s> <Has.Answer> <p%s> .\n", node, p.AcceptedAnswerId)
189+
w.Write([]byte(str))
190+
}
191+
192+
} else if p.PostTypeId == 2 {
193+
str = fmt.Sprintf("<%s> <Type> \"Answer\" .\n", node)
194+
w.Write([]byte(str))
195+
196+
// Relation from question to answer.
197+
if len(p.ParentId) > 0 {
198+
str = fmt.Sprintf("<p%s> <Has.Answer> <%s> .\n", p.ParentId, node)
199+
w.Write([]byte(str))
200+
}
201+
} else {
202+
// Not sure what this is. It isn't documented.
203+
continue
204+
}
205+
206+
if len(p.OwnerUserId) > 0 {
207+
str = fmt.Sprintf("<%s> <Owner> <u%s> .\n", node, p.OwnerUserId)
208+
w.Write([]byte(str))
209+
}
210+
//b.WriteString(fmt.Sprintf("<%s> <Score> \"%d\" .\n", node, p.Score))
211+
str = fmt.Sprintf("<%s> <ViewCount> \"%d\" .\n", node, p.ViewCount)
212+
w.Write([]byte(str))
213+
str = fmt.Sprintf("<%s> <Timestamp> %q .\n", node, p.CreationDate)
214+
w.Write([]byte(str))
215+
}
161216
}
217+
}
162218

163-
{
164-
b.WriteString(fmt.Sprintf("_:newBody <Timestamp> %q .\n", p.LastEditDate))
165-
b.WriteString(fmt.Sprintf("_:newBody <Author> <u%s> .\n", p.LastEditorUserId))
166-
b.WriteString(fmt.Sprintf("_:newBody <Post> <%s> .\n", node))
167-
b.WriteString(fmt.Sprintf("_:newBody <Text> %q .\n", p.Body))
168-
b.WriteString(fmt.Sprintf("_:newBody <Type> \"Body\" .\n"))
169-
170-
b.WriteString(fmt.Sprintf("<%s> <Body> _:newBody .\n", node))
219+
for {
220+
t, _ := phd.Token()
221+
if t == nil {
222+
break
171223
}
172224

173-
// Now create the actual post.
174-
if p.PostTypeId == 1 {
175-
b.WriteString(fmt.Sprintf("<%s> <Type> \"Question\" .\n", node))
176-
177-
// Relation from question to accepted answer.
178-
if len(p.AcceptedAnswerId) > 0 {
179-
b.WriteString(fmt.Sprintf("<%s> <Chosen.Answer> <p%s> .\n", node, p.AcceptedAnswerId))
180-
b.WriteString(fmt.Sprintf("<%s> <Has.Answer> <p%s> .\n", node, p.AcceptedAnswerId))
225+
switch se := t.(type) {
226+
case xml.StartElement:
227+
if se.Name.Local == "row" {
228+
var l PostHistory
229+
phd.DecodeElement(&l, &se)
230+
231+
if l.PostHistoryTypeId > 9 {
232+
// Ignore for this demo.
233+
continue
234+
}
235+
236+
str = fmt.Sprintf("<ph%v> <Timestamp> %q .\n", postHistoryIdx, l.CreationDate)
237+
w.Write([]byte(str))
238+
str = fmt.Sprintf("<ph%v> <Author> <u%s> .\n", postHistoryIdx, l.UserId)
239+
w.Write([]byte(str))
240+
str = fmt.Sprintf("<ph%v> <Post> <p%s> .\n", postHistoryIdx, l.PostId)
241+
w.Write([]byte(str))
242+
243+
tid := l.PostHistoryTypeId % 3
244+
245+
switch tid {
246+
case 0: // Tags
247+
str = fmt.Sprintf("<ph%v> <Text> %q .\n", postHistoryIdx, l.Text)
248+
w.Write([]byte(str))
249+
str = fmt.Sprintf("<ph%v> <Type> \"Tags\" .\n")
250+
w.Write([]byte(str))
251+
case 1: // Title
252+
str = fmt.Sprintf("<ph%v> <Text> %q .\n", postHistoryIdx, l.Text)
253+
w.Write([]byte(str))
254+
str = fmt.Sprintf("<ph%v> <Type> \"Title\" .\n")
255+
w.Write([]byte(str))
256+
case 2: // Body
257+
str = fmt.Sprintf("<ph%v> <Text> %q .\n", postHistoryIdx, l.Text)
258+
w.Write([]byte(str))
259+
str = fmt.Sprintf("<ph%v> <Type> \"Body\" .\n")
260+
w.Write([]byte(str))
261+
}
181262
}
182-
183-
} else if p.PostTypeId == 2 {
184-
b.WriteString(fmt.Sprintf("<%s> <Type> \"Answer\" .\n", node))
185-
186-
// Relation from question to answer.
187-
if len(p.ParentId) > 0 {
188-
b.WriteString(fmt.Sprintf("<p%s> <Has.Answer> <%s> .\n", p.ParentId, node))
189-
}
190-
} else {
191-
// Not sure what this is. It isn't documented.
192-
continue
193-
}
194-
195-
if len(p.OwnerUserId) > 0 {
196-
b.WriteString(fmt.Sprintf("<%s> <Owner> <u%s> .\n", node, p.OwnerUserId))
197263
}
198-
//b.WriteString(fmt.Sprintf("<%s> <Score> \"%d\" .\n", node, p.Score))
199-
b.WriteString(fmt.Sprintf("<%s> <ViewCount> \"%d\" .\n", node, p.ViewCount))
200-
b.WriteString(fmt.Sprintf("<%s> <Timestamp> %q .\n", node, p.CreationDate))
201-
202-
b.WriteString("}}")
203-
wg.Add(1)
204-
go send(&b)
205264
}
206265

207-
for _, l := range logs.Rows {
208-
if l.PostHistoryTypeId > 9 {
209-
// Ignore for this demo.
210-
continue
211-
}
212-
var b bytes.Buffer
213-
214-
b.WriteString("mutation { set { ")
215-
b.WriteString(fmt.Sprintf("_:new <Timestamp> %q .\n", l.CreationDate))
216-
b.WriteString(fmt.Sprintf("_:new <Author> <u%s> .\n", l.UserId))
217-
b.WriteString(fmt.Sprintf("_:new <Post> <p%s> .\n", l.PostId))
218-
219-
tid := l.PostHistoryTypeId % 3
220-
221-
switch tid {
222-
case 0: // Tags
223-
b.WriteString(fmt.Sprintf("_:new <Text> %q .\n", l.Text))
224-
b.WriteString(fmt.Sprintf("_:new <Type> \"Tags\" .\n"))
225-
case 1: // Title
226-
b.WriteString(fmt.Sprintf("_:new <Text> %q .\n", l.Text))
227-
b.WriteString(fmt.Sprintf("_:new <Type> \"Title\" .\n"))
228-
case 2: // Body
229-
b.WriteString(fmt.Sprintf("_:new <Text> %q .\n", l.Text))
230-
b.WriteString(fmt.Sprintf("_:new <Type> \"Body\" .\n"))
231-
}
232-
b.WriteString("}}")
233-
wg.Add(1)
234-
go send(&b)
235-
}
236-
wg.Wait()
266+
log.Println("Finished generating RDF.")
267+
err = w.Flush()
268+
check(err)
269+
270+
err = w.Close()
271+
check(err)
237272

238-
fmt.Println(len(posts.Rows), "processed")
273+
err = o.Close()
274+
check(err)
239275
}

votes/main.go

+1 -1
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ import (
1717

1818
var (
1919
dir = flag.String("dir", "", "Directory which holds Votes.xml file")
20-
output = flag.String("output", "out.rdf.gz", "Output rdf.gz file")
20+
output = flag.String("output", "votes.rdf.gz", "Output rdf.gz file")
2121
)
2222

2323
// random generates a random integer given a range

0 commit comments

Comments
 (0)