Text splitters (#9)

* feat: add document and text splitters --------- Signed-off-by: Milos Gajdos <[email protected]>
milosgajdos · Dec 2, 2023 · 06af13f · 06af13f
1 parent dd6ea80
commit 06af13f
Show file tree

Hide file tree

Showing 7 changed files with 441 additions and 0 deletions.
diff --git a/cmd/splitter/main.go b/cmd/splitter/main.go
@@ -0,0 +1,55 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+
+	"github.com/milosgajdos/go-embeddings/document/text"
+)
+
+// Command-line options; init binds each one to a flag.
+var (
+	// input is the path of the file whose content is split.
+	input string
+	// chunkSize and chunkOverlap are handed to the text splitter.
+	chunkSize, chunkOverlap int
+	// trimSpace and keepSep back the -trim and -keep-separator flags.
+	trimSpace, keepSep bool
+)
+
+// init registers the command-line flags on the default flag set
+// and binds them to the package-level option variables.
+// The flags are parsed later, in main.
+func init() {
+	flag.StringVar(&input, "input", "", "document input")
+	flag.IntVar(&chunkSize, "chunk-size", 100, "chunk size")
+	flag.IntVar(&chunkOverlap, "chunk-overlap", 10, "chunk overlap")
+	flag.BoolVar(&trimSpace, "trim", false, "trim empty space chars from chunks")
+	flag.BoolVar(&keepSep, "keep-separator", false, "keep separator in chunks")
+}
+
+// main reads the file named by -input, splits its content with a
+// recursive character splitter configured from the command-line flags,
+// and prints the number of chunks followed by every chunk prefixed
+// with its index. It exits via log.Fatal when -input is empty or the
+// file cannot be read.
+func main() {
+	flag.Parse()
+
+	if input == "" {
+		log.Fatal("empty input path")
+	}
+
+	content, err := os.ReadFile(input)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Every option comes from its flag, including -trim and
+	// -keep-separator, so the flags actually control the output.
+	s := text.NewSplitter().
+		WithChunkSize(chunkSize).
+		WithChunkOverlap(chunkOverlap).
+		WithTrimSpace(trimSpace).
+		WithKeepSep(keepSep)
+
+	rs := text.NewRecursiveCharSplitter().
+		WithSplitter(s)
+
+	splits := rs.Split(string(content))
+
+	fmt.Println(len(splits))
+	// The loop variable gets its own name so it does not shadow
+	// the splitter s declared above.
+	for i, split := range splits {
+		fmt.Println(i, split)
+	}
+}
diff --git a/document/document.go b/document/document.go
@@ -0,0 +1,12 @@
+package document
+
+// Document stores data and associated metadata.
+// Both fields carry JSON tags, so a Document is encoded
+// under the keys "content" and "metadata".
+type Document struct {
+	Content  string         `json:"content"`  // the document's data as a string
+	Metadata map[string]any `json:"metadata"` // arbitrary values keyed by name
+}
+
+// Splitter splits documents into chunks.
+//
+// NOTE(review): the Split methods in document/text take a string
+// rather than a Document, so they do not satisfy this interface
+// as written; confirm whether that is intended.
+type Splitter interface {
+	// Split returns the chunks of the given Document as strings.
+	Split(Document) []string
+}
diff --git a/document/text/character.go b/document/text/character.go
@@ -0,0 +1,51 @@
+package text
+
+import (
+	"regexp"
+)
+
+// CharSplitter is a character text splitter.
+// It splits texts into chunks over
+// a separator which is either a string
+// or a regular expression.
+// It embeds *Splitter, whose fields and methods
+// (such as keepSep, splitText and merge) are used by Split.
+type CharSplitter struct {
+	*Splitter
+	sep        string // separator to split on
+	isSepRegex bool   // when false, Split quotes sep with regexp.QuoteMeta
+}
+
+// NewCharSplitter creates a new character splitter and returns it.
+// The splitter is backed by the Splitter returned from NewSplitter
+// and uses DefaultSeparator as its separator.
+func NewCharSplitter() *CharSplitter {
+	cs := new(CharSplitter)
+	cs.Splitter = NewSplitter()
+	cs.sep = DefaultSeparator
+	return cs
+}
+
+// WithSplitter replaces the embedded Splitter with splitter
+// and returns s so that calls can be chained.
+func (s *CharSplitter) WithSplitter(splitter *Splitter) *CharSplitter {
+	s.Splitter = splitter
+	return s
+}
+
+// WithSep sets the separator used by Split.
+// If isSepRegex is true, sep is used as a regular expression;
+// otherwise Split quotes it so that it is matched literally.
+// It returns s so that calls can be chained.
+func (s *CharSplitter) WithSep(sep string, isSepRegex bool) *CharSplitter {
+	s.sep = sep
+	s.isSepRegex = isSepRegex
+	// Return the receiver, like WithSplitter does: a nil result
+	// would make any chained call dereference a nil pointer.
+	return s
+}
+
+// Split splits text into chunks.
+// The text is first cut on the configured separator, which is
+// quoted with regexp.QuoteMeta unless it was declared a regular
+// expression, and the resulting pieces are then merged.
+func (s *CharSplitter) Split(text string) []string {
+	pattern := s.sep
+	if !s.isSepRegex {
+		pattern = regexp.QuoteMeta(pattern)
+	}
+	pieces := s.splitText(text, pattern)
+
+	// When separators are kept, the pieces are merged with an empty
+	// separator; otherwise the raw separator is passed to merge.
+	if s.keepSep {
+		return s.merge(pieces, "")
+	}
+	return s.merge(pieces, s.sep)
+}
diff --git a/document/text/recursive.go b/document/text/recursive.go
@@ -0,0 +1,106 @@
+package text
+
+import (
+	"regexp"
+)
+
+// RecursiveCharSplitter is a recursive
+// character text splitter.
+// It tries to split text recursively by different
+// separators to find one that works.
+// It embeds *Splitter, whose fields and methods
+// (such as chunkSize, lenFunc, splitText and merge) drive the split.
+type RecursiveCharSplitter struct {
+	*Splitter
+	seps       []string // separators, tried in order
+	isSepRegex bool     // when false, separators are quoted with regexp.QuoteMeta
+}
+
+// NewRecursiveCharSplitter creates a new recursive character
+// splitter and returns it.
+// The splitter is backed by the Splitter returned from NewSplitter
+// and uses DefaultSeparators as its separators.
+func NewRecursiveCharSplitter() *RecursiveCharSplitter {
+	rs := new(RecursiveCharSplitter)
+	rs.Splitter = NewSplitter()
+	rs.seps = DefaultSeparators
+	return rs
+}
+
+// WithSplitter replaces the embedded Splitter with splitter
+// and returns r so that calls can be chained.
+func (r *RecursiveCharSplitter) WithSplitter(splitter *Splitter) *RecursiveCharSplitter {
+	r.Splitter = splitter
+	return r
+}
+
+// WithSeps sets the separators tried by Split, in order.
+// If isSepRegex is true, every separator is used as a regular
+// expression; otherwise separators are quoted so that they are
+// matched literally.
+// It returns r so that calls can be chained.
+func (r *RecursiveCharSplitter) WithSeps(seps []string, isSepRegex bool) *RecursiveCharSplitter {
+	r.seps = seps
+	r.isSepRegex = isSepRegex
+	// Return the receiver, like WithSplitter does: a nil result
+	// would make any chained call dereference a nil pointer.
+	return r
+}
+
+// split recursively splits text into chunks using seps.
+//
+// It picks the first separator in seps that occurs in text (an empty
+// separator always qualifies), falling back to the last separator when
+// none does, and splits text on it. Pieces shorter than the chunk size
+// are collected and merged; a piece that is not is split again with the
+// separators that follow the chosen one, or emitted as is when there
+// are no separators left to try.
+//
+// With an empty seps the text is returned unsplit as a single chunk.
+func (r *RecursiveCharSplitter) split(text string, seps []string) []string {
+	if len(seps) == 0 {
+		// Nothing to split on; indexing the last separator would panic.
+		return []string{text}
+	}
+
+	// pattern turns a raw separator into the regular expression used
+	// for matching and splitting. Separators are kept raw everywhere
+	// else so that they are escaped exactly once.
+	pattern := func(sep string) string {
+		if r.isSepRegex {
+			return sep
+		}
+		return regexp.QuoteMeta(sep)
+	}
+
+	var (
+		resChunks []string
+		newSeps   []string
+	)
+
+	sep := seps[len(seps)-1]
+
+	for i, s := range seps {
+		if s == "" {
+			sep = s
+			break
+		}
+		// The error is deliberately ignored: a pattern that fails
+		// to compile is treated as not matching.
+		if match, _ := regexp.MatchString(pattern(s), text); match {
+			sep = s
+			newSeps = seps[i+1:]
+			break
+		}
+	}
+
+	chunks := r.splitText(text, pattern(sep))
+
+	// Chunks are merged with the raw separator, as CharSplitter.Split
+	// does, or with nothing when separators are kept in the chunks.
+	mergeSep := sep
+	if r.keepSep {
+		mergeSep = ""
+	}
+
+	var goodChunks []string
+
+	// flush merges the small chunks collected so far into the result.
+	flush := func() {
+		if len(goodChunks) == 0 {
+			return
+		}
+		resChunks = append(resChunks, r.merge(goodChunks, mergeSep)...)
+		goodChunks = nil
+	}
+
+	for _, chunk := range chunks {
+		if r.lenFunc(chunk) < r.chunkSize {
+			goodChunks = append(goodChunks, chunk)
+			continue
+		}
+
+		// Flush before handling the large chunk to preserve ordering.
+		flush()
+
+		if len(newSeps) == 0 {
+			resChunks = append(resChunks, chunk)
+			continue
+		}
+
+		resChunks = append(resChunks, r.split(chunk, newSeps)...)
+	}
+
+	flush()
+
+	return resChunks
+}
+
+// Split splits text into chunks by running the recursive
+// split over the configured separators.
+func (r *RecursiveCharSplitter) Split(text string) []string {
+	return r.split(text, r.seps)
+}