Skip to content

Commit ee6970e

Browse files
metalmatze and gernest authored
pqarrow/arrowutils: Add SortRecord and ReorderRecord (#628)
* pqarrow/arrowutils: Add SortRecord and ReorderRecord
  This was extracted from a previous PR #461.
* pqarrow/arrowutils: Update SortRecord to allow for multiple sort columns
  This isn't implemented yet, just the function signature is future-proof.
* pqarrow/arrowutils: Use compute.Take for ReorderRecord
* pqarrow/arrowutils: Add support for sorting NULL
  NULL always gets sorted to the back. This seems to be the default for other language implementations. It can be made configurable in the future.
* Update pqarrow/arrowutils/sort.go
  Co-authored-by: Geofrey Ernest <[email protected]>
* Update pqarrow/arrowutils/sort.go
  Co-authored-by: Geofrey Ernest <[email protected]>
* Update pqarrow/arrowutils/sort.go
  Co-authored-by: Geofrey Ernest <[email protected]>
* Update pqarrow/arrowutils/sort.go
  Co-authored-by: Geofrey Ernest <[email protected]>
* Update pqarrow/arrowutils/sort.go
  Co-authored-by: Geofrey Ernest <[email protected]>
* pqarrow/arrowutils: Remove sorting *array.Binary
  This isn't properly unit tested and was more of an experiment.
* pqarrow/arrowutils: Add context and reserve indices length
---------
Co-authored-by: Geofrey Ernest <[email protected]>
1 parent b41edc6 commit ee6970e

File tree

4 files changed

+180
-2
lines changed

4 files changed

+180
-2
lines changed

go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ require (
2727
)
2828

2929
require (
30+
github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c // indirect
3031
github.com/andybalholm/brotli v1.0.5 // indirect
3132
github.com/benbjohnson/clock v1.3.5 // indirect
3233
github.com/benbjohnson/immutable v0.4.0 // indirect

go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
2+
github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c h1:RGWPOewvKIROun94nF7v2cua9qP+thov/7M50KEoeSU=
3+
github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c/go.mod h1:X0CRv0ky0k6m906ixxpzmDRLvX58TFUKS2eePweuyxk=
24
github.com/RoaringBitmap/roaring v0.9.4 h1:ckvZSX5gwCRaJYBNe7syNawCU5oruY9gQmjXlp4riwo=
35
github.com/RoaringBitmap/roaring v0.9.4/go.mod h1:icnadbWcNyfEHlYdr+tDlOTih1Bf/h+rzPpv4sbomAA=
46
github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs=
@@ -79,8 +81,6 @@ github.com/oklog/ulid v1.3.1 h1:EGfNDEx6MqHz8B3uNV6QAib1UR2Lm97sHi3ocA6ESJ4=
7981
github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
8082
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
8183
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
82-
github.com/parquet-go/parquet-go v0.19.1-0.20231129084429-9010539a4f7a h1:NxS5GxNgZa5nJeLjJFidbzhwn+YuhdV5pXHtOw7VKB8=
83-
github.com/parquet-go/parquet-go v0.19.1-0.20231129084429-9010539a4f7a/go.mod h1:4YfUo8TkoGoqwzhA/joZKZ8f77wSMShOLHESY4Ys0bY=
8484
github.com/parquet-go/parquet-go v0.20.0 h1:a6tV5XudF893P1FMuyp01zSReXbBelquKQgRxBgJ29w=
8585
github.com/parquet-go/parquet-go v0.20.0/go.mod h1:4YfUo8TkoGoqwzhA/joZKZ8f77wSMShOLHESY4Ys0bY=
8686
github.com/pierrec/lz4/v4 v4.1.18 h1:xaKrnTkyoqfh1YItXl56+6KJNVYWlEEPuAQW9xsplYQ=

pqarrow/arrowutils/sort.go

+94
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
package arrowutils
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"sort"
7+
8+
"github.com/apache/arrow/go/v14/arrow"
9+
"github.com/apache/arrow/go/v14/arrow/array"
10+
"github.com/apache/arrow/go/v14/arrow/compute"
11+
"github.com/apache/arrow/go/v14/arrow/memory"
12+
)
13+
14+
// SortRecord sorts the given record's rows by the given column. Currently only supports int64, string and binary columns.
15+
func SortRecord(mem memory.Allocator, r arrow.Record, cols []int) (*array.Int64, error) {
16+
if len(cols) > 1 {
17+
return nil, fmt.Errorf("sorting by multiple columns isn't implemented yet")
18+
}
19+
indicesBuilder := array.NewInt64Builder(mem)
20+
21+
if r.NumRows() == 0 {
22+
return indicesBuilder.NewInt64Array(), nil
23+
}
24+
if r.NumRows() == 1 {
25+
indicesBuilder.Append(0)
26+
return indicesBuilder.NewInt64Array(), nil
27+
}
28+
29+
indices := make([]int64, r.NumRows())
30+
// populate indices
31+
for i := range indices {
32+
indices[i] = int64(i)
33+
}
34+
35+
switch c := r.Column(cols[0]).(type) {
36+
case *array.Int64:
37+
sort.Sort(orderedSorter[int64]{array: c, indices: indices})
38+
case *array.String:
39+
sort.Sort(orderedSorter[string]{array: c, indices: indices})
40+
default:
41+
return nil, fmt.Errorf("unsupported column type for sorting %T", c)
42+
}
43+
44+
indicesBuilder.Reserve(len(indices))
45+
for _, i := range indices {
46+
indicesBuilder.Append(i)
47+
}
48+
49+
return indicesBuilder.NewInt64Array(), nil
50+
}
51+
52+
// ReorderRecord reorders the given record's rows by the given indices.
53+
// This is a wrapper around compute.Take which handles the type castings.
54+
func ReorderRecord(ctx context.Context, r arrow.Record, indices arrow.Array) (arrow.Record, error) {
55+
res, err := compute.Take(
56+
ctx,
57+
*compute.DefaultTakeOptions(),
58+
compute.NewDatum(r),
59+
compute.NewDatum(indices),
60+
)
61+
if err != nil {
62+
return nil, err
63+
}
64+
return res.(*compute.RecordDatum).Value, nil
65+
}
66+
67+
// orderedArray is the part of an array's API that orderedSorter relies on:
// positional access to values of an ordered type plus a NULL check.
type orderedArray[T int64 | float64 | string] interface {
	// Value returns the value stored at the given position.
	Value(int) T
	// IsNull reports whether the given position holds NULL.
	IsNull(int) bool
	// Len returns the number of positions in the array.
	Len() int
}

// orderedSorter implements sort.Interface over a slice of positions into an
// array. Sorting permutes indices only; the array itself is never modified.
// After sorting, indices lists the positions in ascending value order with
// NULL positions at the back.
type orderedSorter[T int64 | float64 | string] struct {
	array   orderedArray[T]
	indices []int64
}

// Len returns the number of indices being sorted. It is deliberately based on
// indices rather than on the array: Less and Swap index into indices, so
// reporting a larger array length would make sort read past its end.
func (s orderedSorter[T]) Len() int {
	return len(s.indices)
}

// Less reports whether the value referenced by indices[i] sorts before the
// one referenced by indices[j]. NULL never sorts before anything, and every
// non-NULL value sorts before NULL, which places NULLs at the back.
func (s orderedSorter[T]) Less(i, j int) bool {
	a, b := int(s.indices[i]), int(s.indices[j])
	if s.array.IsNull(a) {
		return false
	}
	if s.array.IsNull(b) {
		return true
	}
	return s.array.Value(a) < s.array.Value(b)
}

// Swap exchanges the indices at positions i and j.
func (s orderedSorter[T]) Swap(i, j int) {
	s.indices[i], s.indices[j] = s.indices[j], s.indices[i]
}

pqarrow/arrowutils/sort_test.go

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
package arrowutils
2+
3+
import (
4+
"context"
5+
"testing"
6+
7+
"github.com/apache/arrow/go/v14/arrow"
8+
"github.com/apache/arrow/go/v14/arrow/array"
9+
"github.com/apache/arrow/go/v14/arrow/memory"
10+
"github.com/stretchr/testify/require"
11+
)
12+
13+
func TestSortRecord(t *testing.T) {
14+
ctx := context.Background()
15+
schema := arrow.NewSchema(
16+
[]arrow.Field{
17+
{Name: "int", Type: arrow.PrimitiveTypes.Int64},
18+
{Name: "string", Type: arrow.BinaryTypes.String},
19+
},
20+
nil,
21+
)
22+
23+
mem := memory.DefaultAllocator
24+
ib := array.NewInt64Builder(mem)
25+
ib.Append(0)
26+
ib.AppendNull()
27+
ib.Append(3)
28+
ib.Append(5)
29+
ib.Append(1)
30+
31+
sb := array.NewStringBuilder(mem)
32+
sb.Append("d")
33+
sb.Append("c")
34+
sb.Append("b")
35+
sb.AppendNull()
36+
sb.Append("a")
37+
38+
record := array.NewRecord(schema, []arrow.Array{ib.NewArray(), sb.NewArray()}, int64(5))
39+
40+
// Sort the record by the first column - int64
41+
{
42+
sortedIndices, err := SortRecord(mem, record, []int{record.Schema().FieldIndices("int")[0]})
43+
require.NoError(t, err)
44+
require.Equal(t, []int64{0, 4, 2, 3, 1}, sortedIndices.Int64Values())
45+
46+
sortedByInts, err := ReorderRecord(ctx, record, sortedIndices)
47+
require.NoError(t, err)
48+
49+
// check that the column got sortedIndices
50+
intCol := sortedByInts.Column(0).(*array.Int64)
51+
require.Equal(t, []int64{0, 1, 3, 5, 0}, intCol.Int64Values())
52+
require.True(t, intCol.IsNull(intCol.Len()-1)) // last is NULL
53+
// make sure the other column got updated too
54+
strings := make([]string, sortedByInts.NumRows())
55+
stringCol := sortedByInts.Column(1).(*array.String)
56+
for i := 0; i < int(sortedByInts.NumRows()); i++ {
57+
strings[i] = stringCol.Value(i)
58+
}
59+
require.Equal(t, []string{"d", "a", "b", "", "c"}, strings)
60+
}
61+
62+
// Sort the record by the second column - string
63+
{
64+
sortedIndices, err := SortRecord(mem, record, []int{record.Schema().FieldIndices("string")[0]})
65+
require.NoError(t, err)
66+
require.Equal(t, []int64{4, 2, 1, 0, 3}, sortedIndices.Int64Values())
67+
68+
sortedByStrings, err := ReorderRecord(ctx, record, sortedIndices)
69+
require.NoError(t, err)
70+
71+
// check that the column got sortedByInts
72+
intCol := sortedByStrings.Column(0).(*array.Int64)
73+
require.Equal(t, []int64{1, 3, 0, 0, 5}, intCol.Int64Values())
74+
// make sure the other column got updated too
75+
strings := make([]string, sortedByStrings.NumRows())
76+
stringCol := sortedByStrings.Column(1).(*array.String)
77+
for i := 0; i < int(sortedByStrings.NumRows()); i++ {
78+
strings[i] = stringCol.Value(i)
79+
}
80+
require.Equal(t, []string{"a", "b", "c", "d", ""}, strings)
81+
require.True(t, stringCol.IsNull(stringCol.Len()-1)) // last is NULL
82+
}
83+
}

0 commit comments

Comments
 (0)