Skip to content

Commit fc1d9db

Browse files
minor fixes to tools prepare_data validators (#47) (#26)
* Ensure that only a single whitespace is prepended. Ensure the message regarding the prompt separator is displayed only if a prompt separator exists.
* Change the pandas `str.contains` calls to not use regex, which can trip if the common_suffix contains regex metacharacters.

Co-authored-by: Boris Power <[email protected]>
1 parent d92502f commit fc1d9db

File tree

2 files changed

+34
-17
lines changed

2 files changed

+34
-17
lines changed

openai/validators.py

+33-16
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ def common_prompt_suffix_validator(df):
176176
if suffix_option == " ->":
177177
if df.prompt.str.contains("\n").any():
178178
continue
179-
if df.prompt.str.contains(suffix_option).any():
179+
if df.prompt.str.contains(suffix_option, regex=False).any():
180180
continue
181181
suggested_suffix = suffix_option
182182
break
@@ -202,7 +202,11 @@ def add_suffix(x, suffix):
202202
)
203203
if len(common_suffix) > 10:
204204
immediate_msg += f". This suffix seems very long. Consider replacing with a shorter suffix, such as `{display_suggested_suffix}`"
205-
if df.prompt.str[: -len(common_suffix)].str.contains(common_suffix).any():
205+
if (
206+
df.prompt.str[: -len(common_suffix)]
207+
.str.contains(common_suffix, regex=False)
208+
.any()
209+
):
206210
immediate_msg += f"\n WARNING: Some of your prompts contain the suffix `{common_suffix}` more than once. We strongly suggest that you review your prompts and add a unique suffix"
207211

208212
else:
@@ -271,11 +275,15 @@ def common_completion_prefix_validator(df):
271275
MAX_PREFIX_LEN = 5
272276

273277
common_prefix = get_common_xfix(df.completion, xfix="prefix")
278+
ws_prefix = len(common_prefix) > 0 and common_prefix[0] == " "
274279
if len(common_prefix) < MAX_PREFIX_LEN:
275280
return Remediation(name="common_prefix")
276281

277-
def remove_common_prefix(x, prefix):
282+
def remove_common_prefix(x, prefix, ws_prefix):
278283
x["completion"] = x["completion"].str[len(prefix) :]
284+
if ws_prefix:
285+
# keep the single whitespace as prefix
286+
x["completion"] = " " + x["completion"]
279287
return x
280288

281289
if (df.completion == common_prefix).all():
@@ -286,7 +294,7 @@ def remove_common_prefix(x, prefix):
286294
optional_msg = f"Remove prefix `{common_prefix}` from all completions"
287295

288296
def optional_fn(x):
289-
return remove_common_prefix(x, common_prefix)
297+
return remove_common_prefix(x, common_prefix, ws_prefix)
290298

291299
return Remediation(
292300
name="common_completion_prefix",
@@ -305,6 +313,15 @@ def common_completion_suffix_validator(df):
305313
optional_msg = None
306314
optional_fn = None
307315

316+
ft_type = infer_task_type(df)
317+
if ft_type == "open-ended generation" or ft_type == "classification":
318+
return Remediation(name="common_suffix")
319+
320+
common_suffix = get_common_xfix(df.completion, xfix="suffix")
321+
if (df.completion == common_suffix).all():
322+
error_msg = f"All completions are identical: `{common_suffix}`\nEnsure completions are different, otherwise the model will just repeat `{common_suffix}`"
323+
return Remediation(name="common_suffix", error_msg=error_msg)
324+
308325
# Find a suffix which is not contained within the completion otherwise
309326
suggested_suffix = " [END]"
310327
suffix_options = [
@@ -319,33 +336,28 @@ def common_completion_suffix_validator(df):
319336
"%%%",
320337
]
321338
for suffix_option in suffix_options:
322-
if df.completion.str.contains(suffix_option).any():
339+
if df.completion.str.contains(suffix_option, regex=False).any():
323340
continue
324341
suggested_suffix = suffix_option
325342
break
326343
display_suggested_suffix = suggested_suffix.replace("\n", "\\n")
327344

328-
ft_type = infer_task_type(df)
329-
if ft_type == "open-ended generation" or ft_type == "classification":
330-
return Remediation(name="common_suffix")
331-
332345
def add_suffix(x, suffix):
333346
x["completion"] += suffix
334347
return x
335348

336-
common_suffix = get_common_xfix(df.completion, xfix="suffix")
337-
if (df.completion == common_suffix).all():
338-
error_msg = f"All completions are identical: `{common_suffix}`\nEnsure completions are different, otherwise the model will just repeat `{common_suffix}`"
339-
return Remediation(name="common_suffix", error_msg=error_msg)
340-
341349
if common_suffix != "":
342350
common_suffix_new_line_handled = common_suffix.replace("\n", "\\n")
343351
immediate_msg = (
344352
f"\n- All completions end with suffix `{common_suffix_new_line_handled}`"
345353
)
346354
if len(common_suffix) > 10:
347355
immediate_msg += f". This suffix seems very long. Consider replacing with a shorter suffix, such as `{display_suggested_suffix}`"
348-
if df.completion.str[: -len(common_suffix)].str.contains(common_suffix).any():
356+
if (
357+
df.completion.str[: -len(common_suffix)]
358+
.str.contains(common_suffix, regex=False)
359+
.any()
360+
):
349361
immediate_msg += f"\n WARNING: Some of your completions contain the suffix `{common_suffix}` more than once. We suggest that you review your completions and add a unique ending"
350362

351363
else:
@@ -617,8 +629,13 @@ def write_out_file(df, fname, any_remediations):
617629
# Add -v VALID_FILE if we split the file into train / valid
618630
files_string = ("s" if split else "") + " to `" + ("` and `".join(outfnames))
619631
valid_string = f' -v "{outfnames[1]}"' if split else ""
632+
separator_reminder = (
633+
""
634+
if len(common_prompt_suffix_new_line_handled) == 0
635+
else f"After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `{common_prompt_suffix_new_line_handled}` for the model to start generating completions, rather than continuing with the prompt."
636+
)
620637
sys.stdout.write(
621-
f'\nWrote modified file{files_string}`\nFeel free to take a look!\n\nNow use that file when fine-tuning:\n> openai api fine_tunes.create -t "{outfnames[0]}"{valid_string}{packing_param}\n\nAfter you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `{common_prompt_suffix_new_line_handled}` for the model to start generating completions, rather than continuing with the prompt.{optional_ending_string}\n'
638+
f'\nWrote modified file{files_string}`\nFeel free to take a look!\n\nNow use that file when fine-tuning:\n> openai api fine_tunes.create -t "{outfnames[0]}"{valid_string}{packing_param}\n\n{separator_reminder}{optional_ending_string}\n'
622639
)
623640
else:
624641
sys.stdout.write("Aborting... did not write the file\n")

openai/version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
VERSION = "0.9.3"
1+
VERSION = "0.9.4"

0 commit comments

Comments
 (0)