filter empty text, padding_side
jambayk committed Feb 14, 2025
1 parent 3c32603 commit 0d9b7cc
Showing 2 changed files with 13 additions and 3 deletions.
4 changes: 2 additions & 2 deletions olive/data/component/pre_process_data.py
@@ -91,7 +91,7 @@ def huggingface_pre_process(
     def _tokenizer_and_align_labels(examples):
         tokenizer = get_tokenizer(model_name, trust_remote_code=trust_remote_code)
         tokenized_inputs = tokenizer(
-            *[examples[input_col] for input_col in input_cols],
+            *[examples[input_col] for input_col in input_cols if examples[input_col]],
             padding=kwargs.get("padding", True),
             truncation=kwargs.get("truncation", True),
             max_length=kwargs.get("max_length"),
@@ -160,7 +160,7 @@ def _align_labels_with_tokens(labels, word_ids):
     def _tokenizer_and_align_labels(examples):
         tokenizer = get_tokenizer(model_name, trust_remote_code=trust_remote_code)
         tokenized_inputs = tokenizer(
-            *[examples[input_col] for input_col in input_cols],
+            *[examples[input_col] for input_col in input_cols if examples[input_col]],
             padding=kwargs.get("padding", True),
             truncation=kwargs.get("truncation", True),
             is_split_into_words=kwargs.get("is_split_into_words", True),
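
Note on the pre_process_data.py change: the comprehension now drops any input column whose value is empty, so the tokenizer is never handed an empty positional text (or text_pair) argument. A minimal standalone sketch of the same pattern; the model name, column names, and example record are illustrative, not from Olive:

# Standalone sketch of the empty-column filter (names are placeholders, not Olive's API).
# Without the filter, an empty second column would be passed to the tokenizer as an empty
# text_pair argument; dropping falsy values avoids that.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

example = {"question": "What does this pass do?", "context": ""}  # empty second column
input_cols = ["question", "context"]

texts = [example[col] for col in input_cols if example[col]]  # -> ["What does this pass do?"]
encoded = tokenizer(*texts, padding=True, truncation=True)
print(encoded["input_ids"])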
12 changes: 11 additions & 1 deletion olive/data/component/text_generation.py
@@ -43,6 +43,7 @@ class TextGenParams(ConfigBase):
     # might have to expose collator for dataloader to support dynamic padding of batches
     # if false, cannot guarantee all sequences are same length. data loader will have to handle this during collation
     pad_to_max_len: bool = True  # pad sequences to max_len, ignored for JOIN corpus strategy
+    padding_side: str = "right"  # pad to the right or left
     drop_short_sequences: bool = False  # drop sequences shorter than max_len. Mutually exclusive with pad_to_max_len
     use_attention_mask: bool = True  # add attention mask to each example
     # either use chat template or text
@@ -77,6 +78,12 @@ class TextGenParams(ConfigBase):
     extended_mask_type: str = None  # use causal mask for language modeling tasks
     extended_mask_value: float = None  # value to use for causal mask, None for minimum value for torch float32 dtype
 
+    @validator("padding_side", always=True)
+    def _check_padding_side(cls, v):
+        if v not in ["left", "right"]:
+            raise ValueError("padding_side must be either left or right")
+        return v
+
     @validator("drop_short_sequences", always=True)
     def _check_padding(cls, v, values):
         if "pad_to_max_len" not in values:
@@ -170,6 +177,9 @@ def text_gen_pre_process(dataset, tokenizer, all_kwargs):
 
     args = validate_config(all_kwargs, TextGenParams, warn_unused_keys=True)
 
+    # set tokenizer padding side
+    tokenizer.padding_side = args.padding_side
+
     if isinstance(args.text_formatting_func, str):
         # load text_formatting_func
         args.text_formatting_func = args.get_user_module_loader().load_object(args.text_formatting_func)
@@ -188,7 +198,7 @@ def text_gen_pre_process(dataset, tokenizer, all_kwargs):
             )
         }
     )
-    text_list = dataset["text"]
+    text_list = [text for text in dataset["text"] if text]  # remove empty strings
     total_examples = len(text_list)  # total number of examples
 
     tokenized_inputs = {"input_ids": [], "labels": [], "attention_mask": []}
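
Note on the text_gen_pre_process changes: padding_side is a standard Hugging Face tokenizer attribute, and empty strings are now filtered out before tokenization. A rough sketch of both behaviors; the model name and sample texts are placeholders, not from the commit:

# Rough sketch: left vs right padding and the empty-string filter.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 defines no pad token by default

dataset_text = ["hello world", "", "a longer example sentence"]
text_list = [text for text in dataset_text if text]  # drop empty strings, as in the commit

tokenizer.padding_side = "left"   # pad tokens are prepended, common for decoder-only generation
left = tokenizer(text_list, padding=True)["input_ids"]

tokenizer.padding_side = "right"  # pad tokens are appended
right = tokenizer(text_list, padding=True)["input_ids"]

print(left[0])   # pad ids at the front
print(right[0])  # pad ids at the back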
