@@ -2,6 +2,7 @@ package objfile
2
2
3
3
import (
4
4
"errors"
5
+ "sort"
5
6
"strconv"
6
7
"strings"
7
8
@@ -238,11 +239,48 @@ func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) {
238
239
return & RegexAndNeedle {patLen , regex_pattern , r , needleOffset , needle }, nil
239
240
}
240
241
241
- func FindRegex (data []byte , regexInfo * RegexAndNeedle ) []int {
242
// getOrSetRegion reports whether the (start, end) pair has already been
// recorded in regionMap, and records it when it has not.
//
// regionMap is a two-level set: the outer key is the region start and the
// inner key is the region end. The function returns true when the pair was
// already present, and false when this call inserted it. regionMap itself
// must be non-nil because the function writes to it.
func getOrSetRegion(regionMap map[int]map[int]bool, start, end int) bool {
	ends := regionMap[start]
	if ends == nil {
		// Covers both a missing start key and a start key that maps to a nil
		// inner set; writing into a nil map would panic.
		regionMap[start] = map[int]bool{end: true}
		return false
	}
	if ends[end] {
		return true
	}
	ends[end] = true
	return false
}
255
+
256
// regionMapToSlices flattens a two-level region set into a list of
// [start, end] pairs.
//
// The outer key of regionMap is a region start and the inner key is a region
// end. Only inner entries whose flag is true are emitted. The result is
// ordered by ascending start and, for equal starts, by ascending end, so the
// output is deterministic despite Go's randomized map iteration order. The
// returned slice is never nil, even for an empty or nil regionMap.
func regionMapToSlices(regionMap map[int]map[int]bool) [][]int {
	// Collect the start offsets and an upper bound on the number of pairs so
	// that the result can be allocated once.
	totalSize := 0
	starts := make([]int, 0, len(regionMap))
	for start, endSet := range regionMap {
		starts = append(starts, start)
		totalSize += len(endSet)
	}
	sort.Ints(starts)

	result := make([][]int, 0, totalSize)
	for _, start := range starts {
		endSet := regionMap[start]
		ends := make([]int, 0, len(endSet))
		for end, present := range endSet {
			// A false flag means "not in the set"; skip it rather than
			// reporting a region that was never recorded.
			if present {
				ends = append(ends, end)
			}
		}
		sort.Ints(ends)
		for _, end := range ends {
			result = append(result, []int{start, end})
		}
	}
	return result
}
277
+
278
+ func FindRegex (data []byte , regexInfo * RegexAndNeedle ) [][]int {
242
279
data_len := len (data )
243
- matches := make ([]int , 0 )
280
+ matchMap := make (map [int ]map [int ]bool )
281
+ cacheMap := make (map [int ]map [int ]bool )
244
282
245
- // use an optimized memscan to find some candidates chunks from the much larger haystack
283
+ // use an optimized memscan to find all candidate chunks from the much larger haystack
246
284
needleMatches := findAllOccurrences (data , [][]byte {regexInfo .needle })
247
285
for _ , needleMatch := range needleMatches {
248
286
// adjust the window to the pattern start and end
@@ -258,35 +296,37 @@ func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int {
258
296
data_end = data_len
259
297
}
260
298
299
+ // don't repeat previously scanned chunks
300
+ if getOrSetRegion (cacheMap , data_start , data_end ) {
301
+ continue
302
+ }
261
303
// do the full regex scan on a very small chunk
262
304
for _ , reMatch := range regexInfo .re .FindAllIndex (data [data_start :data_end ], - 1 ) {
263
305
// the match offset is the start index of the chunk + reMatch index
264
306
start := reMatch [0 ] + data_start
307
+ end := reMatch [1 ] + data_start
308
+ getOrSetRegion (matchMap , start , end )
265
309
266
- //end := reMatch[1] + data_start
267
- matches = append (matches , start )
268
-
269
- // special case to handle sub-matches, which are skipped by regex but matched by YARA:
270
- // AA AA BB CC
271
- // { AA [0-1] BB CC }
272
- // must produce:
273
- // AA AA BB CC
274
- // AA BB CC
310
+ // handle sub-matches, which are skipped by regex but matched by YARA
275
311
subStart := start + 1
276
312
for {
313
+ // don't repeat previously scanned chunks
314
+ if getOrSetRegion (cacheMap , subStart , data_end ) {
315
+ break
316
+ }
277
317
subMatches := regexInfo .re .FindAllIndex (data [subStart :data_end ], - 1 )
278
318
if len (subMatches ) == 0 {
279
319
break
280
320
}
281
-
282
321
for _ , match := range subMatches {
283
- matches = append ( matches , match [0 ]+ subStart )
322
+ getOrSetRegion ( matchMap , match [0 ] + subStart , match [ 1 ]+ subStart )
284
323
}
285
324
subStart += subMatches [0 ][0 ] + 1
286
325
}
287
326
}
288
327
}
289
- return matches
328
+
329
+ return regionMapToSlices (matchMap )
290
330
}
291
331
292
332
type RegexAndNeedle struct {
0 commit comments