generated from milosgajdos/go-repo-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* feat: add document and text splitters --------- Signed-off-by: Milos Gajdos <[email protected]>
- Loading branch information
1 parent
dd6ea80
commit 06af13f
Showing
7 changed files
with
441 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
package main | ||
|
||
import ( | ||
"flag" | ||
"fmt" | ||
"log" | ||
"os" | ||
|
||
"github.com/milosgajdos/go-embeddings/document/text" | ||
) | ||
|
||
// Command-line options. Each variable is bound to a flag in init
// and read by main after flag.Parse.
var (
	// input is the path of the document file to read and split.
	input string
	// chunkSize and chunkOverlap are handed to the text splitter
	// as its chunk size and chunk overlap.
	chunkSize, chunkOverlap int
	// trimSpace and keepSep back the -trim and -keep-separator flags.
	trimSpace, keepSep bool
)
|
||
func init() { | ||
flag.StringVar(&input, "input", "", "document input") | ||
flag.IntVar(&chunkSize, "chunk-size", 100, "chunk size") | ||
flag.IntVar(&chunkOverlap, "chunk-overlap", 10, "chunk overlap") | ||
flag.BoolVar(&trimSpace, "trim", false, "trim empty space chars from chunks") | ||
flag.BoolVar(&keepSep, "keep-separator", false, "keep separator in chunks") | ||
} | ||
|
||
func main() { | ||
flag.Parse() | ||
|
||
if input == "" { | ||
log.Fatal("empty input path") | ||
} | ||
|
||
content, err := os.ReadFile(input) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
|
||
s := text.NewSplitter(). | ||
WithChunkSize(chunkSize). | ||
WithChunkOverlap(chunkOverlap). | ||
WithTrimSpace(true). | ||
WithKeepSep(true) | ||
|
||
rs := text.NewRecursiveCharSplitter(). | ||
WithSplitter(s) | ||
|
||
splits := rs.Split(string(content)) | ||
|
||
fmt.Println(len(splits)) | ||
for i, s := range splits { | ||
fmt.Println(i, s) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
package document | ||
|
||
// Document couples a piece of textual content with
// free-form metadata describing it.
type Document struct {
	// Content is the document text; it is encoded as "content" in JSON.
	Content string `json:"content"`
	// Metadata maps string keys to values of any type; it is
	// encoded as "metadata" in JSON.
	Metadata map[string]any `json:"metadata"`
}
|
||
// Splitter splits documents into chunks. | ||
type Splitter interface { | ||
Split(Document) []string | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
package text | ||
|
||
import ( | ||
"regexp" | ||
) | ||
|
||
// CharSplitter is a character text splitter. | ||
// It splits texts into chunks over | ||
// a separator which is either a string | ||
// or a regular expression. | ||
type CharSplitter struct { | ||
*Splitter | ||
sep string | ||
isSepRegex bool | ||
} | ||
|
||
// NewSplitter creates a new splitter | ||
// with default options and returns it. | ||
func NewCharSplitter() *CharSplitter { | ||
return &CharSplitter{ | ||
Splitter: NewSplitter(), | ||
sep: DefaultSeparator, | ||
} | ||
} | ||
|
||
// WithSplitter sets the splitter | ||
func (s *CharSplitter) WithSplitter(splitter *Splitter) *CharSplitter { | ||
s.Splitter = splitter | ||
return s | ||
} | ||
|
||
// WithSep sets the separator. | ||
func (s *CharSplitter) WithSep(sep string, isSepRegex bool) *CharSplitter { | ||
s.sep = sep | ||
s.isSepRegex = isSepRegex | ||
return nil | ||
} | ||
|
||
// Split splits text into chunks. | ||
func (s *CharSplitter) Split(text string) []string { | ||
sep := s.sep | ||
if !s.isSepRegex { | ||
sep = regexp.QuoteMeta(s.sep) | ||
} | ||
chunks := s.splitText(text, sep) | ||
sep = "" | ||
if !s.keepSep { | ||
sep = s.sep | ||
} | ||
return s.merge(chunks, sep) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
package text | ||
|
||
import ( | ||
"regexp" | ||
) | ||
|
||
// RecursiveCharSplitter is a recursive | ||
// character text splitter. | ||
// It tries to split text recursively by different | ||
// separators to find one that works. | ||
type RecursiveCharSplitter struct { | ||
*Splitter | ||
seps []string | ||
isSepRegex bool | ||
} | ||
|
||
// NewSplitter creates a new splitter and returns it. | ||
func NewRecursiveCharSplitter() *RecursiveCharSplitter { | ||
return &RecursiveCharSplitter{ | ||
Splitter: NewSplitter(), | ||
seps: DefaultSeparators, | ||
} | ||
} | ||
|
||
// WithSplitter sets the splitter | ||
func (r *RecursiveCharSplitter) WithSplitter(splitter *Splitter) *RecursiveCharSplitter { | ||
r.Splitter = splitter | ||
return r | ||
} | ||
|
||
// WithSeps sets separators | ||
func (r *RecursiveCharSplitter) WithSeps(seps []string, isSepRegex bool) *RecursiveCharSplitter { | ||
r.seps = seps | ||
r.isSepRegex = isSepRegex | ||
return nil | ||
} | ||
|
||
func (r *RecursiveCharSplitter) split(text string, seps []string) []string { | ||
var ( | ||
resChunks []string | ||
newSeps []string | ||
) | ||
|
||
sep := seps[len(seps)-1] | ||
|
||
for i, s := range seps { | ||
if !r.isSepRegex { | ||
s = regexp.QuoteMeta(s) | ||
} | ||
if s == "" { | ||
sep = s | ||
break | ||
} | ||
if match, _ := regexp.MatchString(s, text); match { | ||
sep = s | ||
newSeps = seps[i+1:] | ||
break | ||
} | ||
} | ||
|
||
// TODO should we escape again? Seems weird. | ||
newSep := sep | ||
if !r.isSepRegex { | ||
newSep = regexp.QuoteMeta(sep) | ||
} | ||
chunks := r.splitText(text, newSep) | ||
|
||
var goodChunks []string | ||
|
||
if r.keepSep { | ||
newSep = "" | ||
} | ||
|
||
for _, chunk := range chunks { | ||
if r.lenFunc(chunk) < r.chunkSize { | ||
goodChunks = append(goodChunks, chunk) | ||
continue | ||
} | ||
|
||
if len(goodChunks) > 0 { | ||
mergedText := r.merge(goodChunks, newSep) | ||
resChunks = append(resChunks, mergedText...) | ||
goodChunks = nil | ||
} | ||
|
||
if len(newSeps) == 0 { | ||
resChunks = append(resChunks, chunk) | ||
continue | ||
} | ||
|
||
otherChunks := r.split(chunk, newSeps) | ||
resChunks = append(resChunks, otherChunks...) | ||
} | ||
|
||
if len(goodChunks) > 0 { | ||
mergedText := r.merge(goodChunks, newSep) | ||
resChunks = append(resChunks, mergedText...) | ||
} | ||
|
||
return resChunks | ||
} | ||
|
||
// Split splits text into chunks. | ||
func (r *RecursiveCharSplitter) Split(text string) []string { | ||
return r.split(text, r.seps) | ||
} |
Oops, something went wrong.