filter empty text, padding_side
jambayk committed Feb 14, 2025
1 parent 3c32603 commit 0d9b7cc
Showing 2 changed files with 13 additions and 3 deletions.
4 changes: 2 additions & 2 deletions olive/data/component/pre_process_data.py
@@ -91,7 +91,7 @@ def huggingface_pre_process(
     def _tokenizer_and_align_labels(examples):
         tokenizer = get_tokenizer(model_name, trust_remote_code=trust_remote_code)
         tokenized_inputs = tokenizer(
-            *[examples[input_col] for input_col in input_cols],
+            *[examples[input_col] for input_col in input_cols if examples[input_col]],
             padding=kwargs.get("padding", True),
             truncation=kwargs.get("truncation", True),
             max_length=kwargs.get("max_length"),
@@ -160,7 +160,7 @@ def _align_labels_with_tokens(labels, word_ids):
     def _tokenizer_and_align_labels(examples):
         tokenizer = get_tokenizer(model_name, trust_remote_code=trust_remote_code)
         tokenized_inputs = tokenizer(
-            *[examples[input_col] for input_col in input_cols],
+            *[examples[input_col] for input_col in input_cols if examples[input_col]],
             padding=kwargs.get("padding", True),
             truncation=kwargs.get("truncation", True),
             is_split_into_words=kwargs.get("is_split_into_words", True),
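
Note on the pre_process_data.py change: the comprehension now drops any input column whose value is empty, so the tokenizer is never handed an empty positional text (or text_pair) argument. A minimal standalone sketch of the same pattern; the model name, column names, and example record are illustrative, not from Olive:

# Standalone sketch of the empty-column filter (names are placeholders, not Olive's API).
# Without the filter, an empty second column would be passed to the tokenizer as an empty
# text_pair argument; dropping falsy values avoids that.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

example = {"question": "What does this pass do?", "context": ""}  # empty second column
input_cols = ["question", "context"]

texts = [example[col] for col in input_cols if example[col]]  # -> ["What does this pass do?"]
encoded = tokenizer(*texts, padding=True, truncation=True)
print(encoded["input_ids"])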
12 changes: 11 additions & 1 deletion olive/data/component/text_generation.py
@@ -43,6 +43,7 @@ class TextGenParams(ConfigBase):
     # might have to expose collator for dataloader to support dynamic padding of batches
     # if false, cannot guarantee all sequences are same length. data loader will have to handle this during collation
     pad_to_max_len: bool = True  # pad sequences to max_len, ignored for JOIN corpus strategy
+    padding_side: str = "right"  # pad to the right or left
     drop_short_sequences: bool = False  # drop sequences shorter than max_len. Mutually exclusive with pad_to_max_len
     use_attention_mask: bool = True  # add attention mask to each example
     # either use chat template or text
@@ -77,6 +78,12 @@ class TextGenParams(ConfigBase):
     extended_mask_type: str = None  # use causal mask for language modeling tasks
     extended_mask_value: float = None  # value to use for causal mask, None for minimum value for torch float32 dtype
 
+    @validator("padding_side", always=True)
+    def _check_padding_side(cls, v):
+        if v not in ["left", "right"]:
+            raise ValueError("padding_side must be either left or right")
+        return v
+
     @validator("drop_short_sequences", always=True)
     def _check_padding(cls, v, values):
         if "pad_to_max_len" not in values:
@@ -170,6 +177,9 @@ def text_gen_pre_process(dataset, tokenizer, all_kwargs):
 
     args = validate_config(all_kwargs, TextGenParams, warn_unused_keys=True)
 
+    # set tokenizer padding side
+    tokenizer.padding_side = args.padding_side
+
     if isinstance(args.text_formatting_func, str):
         # load text_formatting_func
         args.text_formatting_func = args.get_user_module_loader().load_object(args.text_formatting_func)
@@ -188,7 +198,7 @@ def text_gen_pre_process(dataset, tokenizer, all_kwargs):
             )
         }
     )
-    text_list = dataset["text"]
+    text_list = [text for text in dataset["text"] if text]  # remove empty strings
     total_examples = len(text_list)  # total number of examples
 
     tokenized_inputs = {"input_ids": [], "labels": [], "attention_mask": []}
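
Note on the text_gen_pre_process changes: padding_side is a standard Hugging Face tokenizer attribute, and empty strings are now filtered out before tokenization. A rough sketch of both behaviors; the model name and sample texts are placeholders, not from the commit:

# Rough sketch: left vs right padding and the empty-string filter.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 defines no pad token by default

dataset_text = ["hello world", "", "a longer example sentence"]
text_list = [text for text in dataset_text if text]  # drop empty strings, as in the commit

tokenizer.padding_side = "left"   # pad tokens are prepended, common for decoder-only generation
left = tokenizer(text_list, padding=True)["input_ids"]

tokenizer.padding_side = "right"  # pad tokens are appended
right = tokenizer(text_list, padding=True)["input_ids"]

print(left[0])   # pad ids at the front
print(right[0])  # pad ids at the back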
