Skip to content

Commit 45d5c9d

Browse files
stevemk14ebr and ViRb3 authored
Optimize and omit duplicate pattern matches (#66) (#68)
Co-authored-by: ViRb3 <[email protected]>
1 parent 54a6712 commit 45d5c9d

File tree

3 files changed

+86
-20
lines changed

3 files changed

+86
-20
lines changed

objfile/patterns.go

+55-15
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package objfile
22

33
import (
44
"errors"
5+
"sort"
56
"strconv"
67
"strings"
78

@@ -238,11 +239,48 @@ func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) {
238239
return &RegexAndNeedle{patLen, regex_pattern, r, needleOffset, needle}, nil
239240
}
240241

241-
func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int {
242+
// getOrSetRegion reports whether the (start, end) pair has already been
// recorded in regionMap, and records it if it has not.
//
// regionMap is keyed by start; each inner map is keyed by end. The function
// returns true when the pair was already present with a true value, and false
// when this call inserted it. A start key whose inner map is nil is treated
// the same as a missing key, so a fresh inner map is allocated instead of
// writing into the nil map (which would panic).
//
// regionMap itself must be non-nil, because the function writes to it.
func getOrSetRegion(regionMap map[int]map[int]bool, start, end int) bool {
	ends := regionMap[start]
	if ends == nil {
		// Missing start key, or a nil placeholder: create the inner set.
		regionMap[start] = map[int]bool{end: true}
		return false
	}
	if ends[end] {
		return true
	}
	ends[end] = true
	return false
}
255+
256+
// regionMapToSlices flattens a start -> end-set map into a list of
// two-element {start, end} slices.
//
// The result is ordered by ascending start and, for equal starts, by
// ascending end. Every end key present in an inner map is emitted. The
// returned slice is never nil, even when regionMap is empty.
func regionMapToSlices(regionMap map[int]map[int]bool) [][]int {
	// Count the pairs up front so the result is allocated exactly once.
	pairCount := 0
	for _, ends := range regionMap {
		pairCount += len(ends)
	}

	pairs := make([][]int, 0, pairCount)
	for start, ends := range regionMap {
		for end := range ends {
			pairs = append(pairs, []int{start, end})
		}
	}

	// Map iteration order is random; each (start, end) pair is unique, so a
	// single lexicographic sort yields a deterministic order.
	sort.Slice(pairs, func(i, j int) bool {
		if pairs[i][0] != pairs[j][0] {
			return pairs[i][0] < pairs[j][0]
		}
		return pairs[i][1] < pairs[j][1]
	})
	return pairs
}
277+
278+
func FindRegex(data []byte, regexInfo *RegexAndNeedle) [][]int {
242279
data_len := len(data)
243-
matches := make([]int, 0)
280+
matchMap := make(map[int]map[int]bool)
281+
cacheMap := make(map[int]map[int]bool)
244282

245-
// use an optimized memscan to find some candidates chunks from the much larger haystack
283+
// use an optimized memscan to find all candidates chunks from the much larger haystack
246284
needleMatches := findAllOccurrences(data, [][]byte{regexInfo.needle})
247285
for _, needleMatch := range needleMatches {
248286
// adjust the window to the pattern start and end
@@ -258,35 +296,37 @@ func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int {
258296
data_end = data_len
259297
}
260298

299+
// don't repeat previously scanned chunks
300+
if getOrSetRegion(cacheMap, data_start, data_end) {
301+
continue
302+
}
261303
// do the full regex scan on a very small chunk
262304
for _, reMatch := range regexInfo.re.FindAllIndex(data[data_start:data_end], -1) {
263305
// the match offset is the start index of the chunk + reMatch index
264306
start := reMatch[0] + data_start
307+
end := reMatch[1] + data_start
308+
getOrSetRegion(matchMap, start, end)
265309

266-
//end := reMatch[1] + data_start
267-
matches = append(matches, start)
268-
269-
// special case to handle sub-matches, which are skipped by regex but matched by YARA:
270-
// AA AA BB CC
271-
// { AA [0-1] BB CC }
272-
// must produce:
273-
// AA AA BB CC
274-
// AA BB CC
310+
// handle sub-matches, which are skipped by regex but matched by YARA
275311
subStart := start + 1
276312
for {
313+
// don't repeat previously scanned chunks
314+
if getOrSetRegion(cacheMap, subStart, data_end) {
315+
break
316+
}
277317
subMatches := regexInfo.re.FindAllIndex(data[subStart:data_end], -1)
278318
if len(subMatches) == 0 {
279319
break
280320
}
281-
282321
for _, match := range subMatches {
283-
matches = append(matches, match[0]+subStart)
322+
getOrSetRegion(matchMap, match[0]+subStart, match[1]+subStart)
284323
}
285324
subStart += subMatches[0][0] + 1
286325
}
287326
}
288327
}
289-
return matches
328+
329+
return regionMapToSlices(matchMap)
290330
}
291331

292332
type RegexAndNeedle struct {

objfile/patterns_test.go

+26
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package objfile
22

33
import (
44
"bytes"
5+
"reflect"
56
"testing"
67

78
"rsc.io/binaryregexp"
@@ -265,4 +266,29 @@ func TestRegexpPatternFromYaraPattern(t *testing.T) {
265266
t.Errorf("incorrect needle")
266267
}
267268
})
269+
270+
t.Run("Repeat", func(t *testing.T) {
271+
reg, err := RegexpPatternFromYaraPattern("{ AA [0-512] BB }")
272+
273+
if err != nil {
274+
t.Errorf("pattern errored")
275+
}
276+
277+
if reg.len != 514 {
278+
t.Errorf("incorrect pattern length")
279+
}
280+
281+
if reg.needleOffset != 0 {
282+
t.Errorf("incorrect needle offset")
283+
}
284+
285+
if !bytes.Equal(reg.needle, []byte{0xAA}) {
286+
t.Errorf("incorrect needle")
287+
}
288+
289+
results := FindRegex([]byte{0xAA, 0xAA, 0xAA, 0xBB, 0xAA, 0xAA, 0xBB, 0xAA, 0xBB, 0xCC}, reg)
290+
if !reflect.DeepEqual(results, [][]int{{0, 4}, {1, 4}, {2, 4}, {4, 7}, {5, 7}, {7, 9}}) {
291+
t.Errorf("incorrect match indexes")
292+
}
293+
})
268294
}

objfile/scanner.go

+5-5
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
9494
}
9595

9696
for _, match := range FindRegex(data, x64reg) {
97-
sigPtr := uint64(match) // from int
97+
sigPtr := uint64(match[0]) // from int
9898

9999
// this is the pointer offset stored in the instruction
100100
// 0x44E06A: 48 8D 0D 4F F0 24 00 lea rcx, off_69D0C0 (result: 0x24f04f)
@@ -119,7 +119,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
119119
}
120120

121121
for _, match := range FindRegex(data, x86reg) {
122-
sigPtr := uint64(match) // from int
122+
sigPtr := uint64(match[0]) // from int
123123

124124
moduleDataPtr := uint64(binary.LittleEndian.Uint32(data[sigPtr+x86sig.moduleDataPtrLoc:][:4]))
125125
matches = append(matches, SignatureMatch{
@@ -138,7 +138,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
138138
}
139139

140140
for _, match := range FindRegex(data, arm64reg) {
141-
sigPtr := uint64(match) // from int
141+
sigPtr := uint64(match[0]) // from int
142142

143143
adrp := binary.LittleEndian.Uint32(data[sigPtr+ARM64_sig.moduleDataPtrADRP:][:4])
144144
add := binary.LittleEndian.Uint32(data[sigPtr+ARM64_sig.moduleDataPtrADD:][:4])
@@ -169,7 +169,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
169169
}
170170

171171
for _, match := range FindRegex(data, arm32reg) {
172-
sigPtr := uint64(match) // from int
172+
sigPtr := uint64(match[0]) // from int
173173
ldr := binary.LittleEndian.Uint32(data[sigPtr+ARM32_sig.moduleDataPtrLDR:][:4])
174174
// ARM PC relative is always +8 due to legacy nonsense
175175
ldr_pointer_stub := uint64((ldr & 0x00000FFF) + 8)
@@ -190,7 +190,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
190190
}
191191

192192
for _, match := range FindRegex(data, ppcBEreg) {
193-
sigPtr := uint64(match) // from int
193+
sigPtr := uint64(match[0]) // from int
194194
moduleDataPtrHi := int64(binary.BigEndian.Uint16(data[sigPtr+PPC_BE_sig.moduleDataPtrHi:][:2]))
195195
// addi takes a signed immediate
196196
moduleDataPtrLo := int64(int16(binary.BigEndian.Uint16(data[sigPtr+PPC_BE_sig.moduleDataPtrLo:][:2])))

0 commit comments

Comments
 (0)