Skip to content

Commit

Permalink
Text splitters (#9)
Browse files Browse the repository at this point in the history
* feat: add document and text splitters
---------

Signed-off-by: Milos Gajdos <[email protected]>
  • Loading branch information
milosgajdos authored Dec 2, 2023
1 parent dd6ea80 commit 06af13f
Show file tree
Hide file tree
Showing 7 changed files with 441 additions and 0 deletions.
55 changes: 55 additions & 0 deletions cmd/splitter/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package main

import (
"flag"
"fmt"
"log"
"os"

"github.com/milosgajdos/go-embeddings/document/text"
)

// Command-line options, populated by the flag package in init/main.
var (
	// input is the path of the document to split.
	input string
	// chunkSize is the chunk size passed to the splitter.
	chunkSize int
	// chunkOverlap is the chunk overlap passed to the splitter.
	chunkOverlap int
	// trimSpace controls whether empty space chars are trimmed from chunks.
	trimSpace bool
	// keepSep controls whether separators are kept in chunks.
	keepSep bool
)

// init registers the command-line flags and their default values.
func init() {
	flag.StringVar(&input, "input", "", "document input")
	flag.IntVar(&chunkSize, "chunk-size", 100, "chunk size")
	flag.IntVar(&chunkOverlap, "chunk-overlap", 10, "chunk overlap")
	flag.BoolVar(&trimSpace, "trim", false, "trim empty space chars from chunks")
	flag.BoolVar(&keepSep, "keep-separator", false, "keep separator in chunks")
}

// main reads the file named by -input, splits its content with a
// recursive character splitter configured from the command-line flags,
// and prints the number of chunks followed by each chunk with its index.
// It exits with a fatal log message when -input is empty or unreadable.
func main() {
	flag.Parse()

	if input == "" {
		log.Fatal("empty input path")
	}

	content, err := os.ReadFile(input)
	if err != nil {
		log.Fatal(err)
	}

	// Honour the -trim and -keep-separator flags; they were previously
	// parsed but ignored because both options were hard-coded to true.
	s := text.NewSplitter().
		WithChunkSize(chunkSize).
		WithChunkOverlap(chunkOverlap).
		WithTrimSpace(trimSpace).
		WithKeepSep(keepSep)

	rs := text.NewRecursiveCharSplitter().
		WithSplitter(s)

	splits := rs.Split(string(content))

	fmt.Println(len(splits))
	// Use a distinct loop variable so the splitter s is not shadowed.
	for i, split := range splits {
		fmt.Println(i, split)
	}
}
12 changes: 12 additions & 0 deletions document/document.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package document

// Document stores data and associated metadata.
// Both fields are exported and carry JSON struct tags, so a Document
// serializes under the keys "content" and "metadata".
type Document struct {
// Content is the document's content as a string.
Content string `json:"content"`
// Metadata holds arbitrary key/value pairs associated with the document.
Metadata map[string]any `json:"metadata"`
}

// Splitter splits documents into chunks.
// Implementations take a Document and return the resulting chunks as strings.
type Splitter interface {
// Split returns the chunks produced from the given Document.
Split(Document) []string
}
51 changes: 51 additions & 0 deletions document/text/character.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package text

import (
"regexp"
)

// CharSplitter is a character text splitter.
// It splits texts into chunks over
// a separator which is either a string
// or a regular expression.
type CharSplitter struct {
// Splitter is embedded; Split relies on its keepSep option and its
// splitText and merge helpers.
*Splitter
// sep is the separator the text is split over.
sep string
// isSepRegex reports whether sep is a regular expression; when false,
// Split escapes sep with regexp.QuoteMeta before splitting.
isSepRegex bool
}

// NewCharSplitter creates a new character splitter and returns it.
// The returned splitter embeds a fresh Splitter obtained from NewSplitter
// and uses DefaultSeparator as a literal (non-regex) separator.
func NewCharSplitter() *CharSplitter {
	cs := new(CharSplitter)
	cs.Splitter = NewSplitter()
	cs.sep = DefaultSeparator
	return cs
}

// WithSplitter sets the splitter.
// It replaces the embedded Splitter with splitter and returns s,
// so calls can be chained.
func (s *CharSplitter) WithSplitter(splitter *Splitter) *CharSplitter {
s.Splitter = splitter
return s
}

// WithSep sets the separator.
// isSepRegex tells whether sep is a regular expression (true) or a
// literal string (false). It returns s so that calls can be chained,
// like the other With* setters.
func (s *CharSplitter) WithSep(sep string, isSepRegex bool) *CharSplitter {
	s.sep = sep
	s.isSepRegex = isSepRegex
	// Return the receiver, not nil: a nil result breaks method chaining
	// and makes the caller lose the configured splitter.
	return s
}

// Split splits text into chunks.
// The configured separator is used as the split pattern; a literal
// (non-regex) separator is escaped first. The resulting pieces are then
// merged with an empty separator when separators are kept in the pieces,
// and with the raw separator otherwise.
func (s *CharSplitter) Split(text string) []string {
	pattern := s.sep
	if !s.isSepRegex {
		pattern = regexp.QuoteMeta(pattern)
	}
	pieces := s.splitText(text, pattern)

	if s.keepSep {
		return s.merge(pieces, "")
	}
	return s.merge(pieces, s.sep)
}
106 changes: 106 additions & 0 deletions document/text/recursive.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
package text

import (
"regexp"
)

// RecursiveCharSplitter is a recursive
// character text splitter.
// It tries to split text recursively by different
// separators to find one that works.
type RecursiveCharSplitter struct {
// Splitter is embedded; split relies on its chunkSize, keepSep and
// lenFunc options and its splitText and merge helpers.
*Splitter
// seps is the list of separators tried in order.
seps []string
// isSepRegex reports whether the separators are regular expressions;
// when false they are escaped with regexp.QuoteMeta before use.
isSepRegex bool
}

// NewRecursiveCharSplitter creates a new recursive character splitter
// and returns it. The returned splitter embeds a fresh Splitter obtained
// from NewSplitter and uses DefaultSeparators as literal (non-regex)
// separators.
func NewRecursiveCharSplitter() *RecursiveCharSplitter {
	rs := new(RecursiveCharSplitter)
	rs.Splitter = NewSplitter()
	rs.seps = DefaultSeparators
	return rs
}

// WithSplitter sets the splitter.
// It replaces the embedded Splitter with splitter and returns r,
// so calls can be chained.
func (r *RecursiveCharSplitter) WithSplitter(splitter *Splitter) *RecursiveCharSplitter {
r.Splitter = splitter
return r
}

// WithSeps sets separators.
// isSepRegex tells whether seps are regular expressions (true) or
// literal strings (false). It returns r so that calls can be chained,
// like the other With* setters.
func (r *RecursiveCharSplitter) WithSeps(seps []string, isSepRegex bool) *RecursiveCharSplitter {
	r.seps = seps
	r.isSepRegex = isSepRegex
	// Return the receiver, not nil: a nil result breaks method chaining
	// and makes the caller lose the configured splitter.
	return r
}

// split recursively splits text into chunks.
//
// It picks the first separator in seps that occurs in text (an empty
// separator always qualifies) and falls back to the last separator when
// none matches. The text is split over that separator. Pieces shorter
// than the chunk size are collected and merged; a piece that is too long
// is split again with the remaining separators, or emitted as-is when no
// separators remain.
//
// Literal (non-regex) separators are escaped exactly once to build the
// split pattern, and the raw separator is used when merging, matching
// CharSplitter.Split. When seps is empty there is nothing to split on:
// text is returned as a single chunk (nil for empty text).
//
// NOTE(review): splitText presumably splits by a regexp pattern and merge
// presumably joins pieces with the given separator - confirm against
// their definitions.
func (r *RecursiveCharSplitter) split(text string, seps []string) []string {
	// Guard against an empty separator list, which would otherwise
	// panic on the fallback index below.
	if len(seps) == 0 {
		if text == "" {
			return nil
		}
		return []string{text}
	}

	var (
		resChunks []string
		newSeps   []string
	)

	// sep always holds the raw (unescaped) separator.
	sep := seps[len(seps)-1]

	for i, s := range seps {
		if s == "" {
			sep = s
			break
		}
		pattern := s
		if !r.isSepRegex {
			pattern = regexp.QuoteMeta(s)
		}
		// An invalid pattern yields an error and no match; it is
		// deliberately treated as "separator not found".
		if match, _ := regexp.MatchString(pattern, text); match {
			sep = s
			newSeps = seps[i+1:]
			break
		}
	}

	// Escape the raw separator once for splitting.
	splitSep := sep
	if !r.isSepRegex {
		splitSep = regexp.QuoteMeta(sep)
	}
	chunks := r.splitText(text, splitSep)

	// When separators are kept inside the chunks, merge with no
	// separator; otherwise re-join with the raw separator.
	mergeSep := sep
	if r.keepSep {
		mergeSep = ""
	}

	var goodChunks []string

	for _, chunk := range chunks {
		if r.lenFunc(chunk) < r.chunkSize {
			goodChunks = append(goodChunks, chunk)
			continue
		}

		// Flush the small chunks collected so far before handling
		// the oversized one, to preserve ordering.
		if len(goodChunks) > 0 {
			resChunks = append(resChunks, r.merge(goodChunks, mergeSep)...)
			goodChunks = nil
		}

		if len(newSeps) == 0 {
			resChunks = append(resChunks, chunk)
			continue
		}

		resChunks = append(resChunks, r.split(chunk, newSeps)...)
	}

	if len(goodChunks) > 0 {
		resChunks = append(resChunks, r.merge(goodChunks, mergeSep)...)
	}

	return resChunks
}

// Split splits text into chunks.
// It delegates to split, starting with the full list of configured
// separators (r.seps).
func (r *RecursiveCharSplitter) Split(text string) []string {
return r.split(text, r.seps)
}
Loading

0 comments on commit 06af13f

Please sign in to comment.