Skip to content

Commit

Permalink
fix(smart-apply): stripping of markdown code blocks (#7105)
Browse files Browse the repository at this point in the history
## Description

Issue:
https://linear.app/sourcegraph/issue/CODY-4428/markdown-code-blocks-render-incorrectly-due-to-unintended-triple

## Test plan
Ask Cody to generate a readme that documents a file with code examples.
Notice when applying the changes that the code blocks remain

<!-- Required. See
https://docs-legacy.sourcegraph.com/dev/background-information/testing_principles.
-->

---------

Co-authored-by: Tom Ross <[email protected]>
  • Loading branch information
ichim-david and umpox authored Feb 19, 2025
1 parent df723e6 commit 74aeb4b
Show file tree
Hide file tree
Showing 7 changed files with 136 additions and 42 deletions.
4 changes: 4 additions & 0 deletions lib/shared/src/chat/preamble.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ For executable terminal commands: enclose each command in individual "bash" lang

const CHAT_PREAMBLE = DEFAULT_PREAMBLE.concat(SMART_APPLY_PREAMBLE)

export function getChatPreamble(): PromptString {
return CHAT_PREAMBLE
}

export function getSimplePreamble(
model: ChatModel | EditModel | undefined,
apiVersion: number,
Expand Down
2 changes: 1 addition & 1 deletion lib/shared/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ export {
} from './models/utils'
export { BotResponseMultiplexer } from './chat/bot-response-multiplexer'
export { ChatClient } from './chat/chat'
export { getDefaultSystemPrompt, getSimplePreamble } from './chat/preamble'
export { getDefaultSystemPrompt, getChatPreamble, getSimplePreamble } from './chat/preamble'
export type {
SerializedChatInteraction,
SerializedChatTranscript,
Expand Down
49 changes: 21 additions & 28 deletions vscode/src/chat/chat-view/prompt.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import {
type ModelsData,
contextFiltersProvider,
createModel,
getChatPreamble,
graphqlClient,
mockAuthStatus,
mockClientCapabilities,
Expand Down Expand Up @@ -79,25 +80,22 @@ describe('DefaultPrompter', () => {
chat.addHumanMessage({ text: ps`Hello` })

const { prompt, context } = await new DefaultPrompter([], []).makePrompt(chat, 0)
expect(prompt).toMatchInlineSnapshot(`
[
expect(prompt).toEqual([
{
"speaker": "human",
"text": "You are Cody, an AI coding assistant from Sourcegraph.If your answer contains fenced code blocks in Markdown, include the relevant full file path in the code block tag using this structure: \`\`\`$LANGUAGE:$FILEPATH\`\`\`
For executable terminal commands: enclose each command in individual "bash" language code block without comments and new lines inside.",
speaker: 'human',
text: getChatPreamble(),
},
{
"speaker": "assistant",
"text": "I am Cody, an AI coding assistant from Sourcegraph.",
speaker: 'assistant',
text: ps`I am Cody, an AI coding assistant from Sourcegraph.`,
},
{
"contextAlternatives": undefined,
"contextFiles": undefined,
"speaker": "human",
"text": "Hello",
contextAlternatives: undefined,
contextFiles: undefined,
speaker: 'human',
text: ps`Hello`,
},
]
`)
])
expect(context.used).toEqual([])
expect(context.ignored).toEqual([])
})
Expand Down Expand Up @@ -160,27 +158,22 @@ describe('DefaultPrompter', () => {
chat.addHumanMessage({ text: ps`Hello` })

const { prompt, context } = await new DefaultPrompter([], []).makePrompt(chat, 0)
expect(prompt).toMatchInlineSnapshot(`
[
expect(prompt).toEqual([
{
"speaker": "human",
"text": "You are Cody, an AI coding assistant from Sourcegraph.If your answer contains fenced code blocks in Markdown, include the relevant full file path in the code block tag using this structure: \`\`\`$LANGUAGE:$FILEPATH\`\`\`
For executable terminal commands: enclose each command in individual "bash" language code block without comments and new lines inside.
Always respond with 🧀 emojis",
speaker: 'human',
text: ps`${getChatPreamble()}\n\nAlways respond with 🧀 emojis`,
},
{
"speaker": "assistant",
"text": "I am Cody, an AI coding assistant from Sourcegraph.",
speaker: 'assistant',
text: ps`I am Cody, an AI coding assistant from Sourcegraph.`,
},
{
"contextAlternatives": undefined,
"contextFiles": undefined,
"speaker": "human",
"text": "Hello",
contextAlternatives: undefined,
contextFiles: undefined,
speaker: 'human',
text: ps`Hello`,
},
]
`)
])
expect(context.used).toEqual([])
expect(context.ignored).toEqual([])
})
Expand Down
31 changes: 24 additions & 7 deletions vscode/src/edit/output/response-transformer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,29 @@ const MARKDOWN_CODE_BLOCK_REGEX = new RegExp(
const LEADING_SPACES_AND_NEW_LINES = /^\s*\n/
const LEADING_SPACES = /^[ ]+/

/**
* Strips the text of any unnecessary content.
* This includes:
* 1. Prompt topics, e.g. <CODE511>. These are used by the LLM to wrap the output code.
* 2. Markdown code blocks, e.g. ```typescript. Most LLMs are trained to produce Markdown-suitable responses.
*/
function stripText(text: string, task: FixupTask): string {
const strippedText = text
// Strip specific XML tags referenced in the prompt, e.g. <CODE511>
.replaceAll(PROMPT_TOPIC_REGEX, '')

if (task.document.languageId === 'markdown') {
// Return this text as is, we do not want to strip Markdown blocks as they may be valuable
// in Markdown files
return strippedText
}

// Strip Markdown syntax for code blocks, e.g. ```typescript.
return strippedText.replaceAll(MARKDOWN_CODE_BLOCK_REGEX, block =>
block.replace(MARKDOWN_CODE_BLOCK_START, '').replace(MARKDOWN_CODE_BLOCK_END, '')
)
}

/**
* Given the LLM response for a FixupTask, transforms the response
* to make it suitable to insert as code.
Expand All @@ -43,13 +66,7 @@ export function responseTransformer(
task: FixupTask,
isMessageInProgress: boolean
): string {
const strippedText = text
// Strip specific XML tags referenced in the prompt, e.g. <CODE511>
.replaceAll(PROMPT_TOPIC_REGEX, '')
// Strip Markdown syntax for code blocks, e.g. ```typescript.
.replaceAll(MARKDOWN_CODE_BLOCK_REGEX, block =>
block.replace(MARKDOWN_CODE_BLOCK_START, '').replace(MARKDOWN_CODE_BLOCK_END, '')
)
const strippedText = stripText(text, task)

// Trim leading spaces
// - For `add` insertions, the LLM will attempt to continue the code from the position of the cursor, we handle the `insertionPoint`
Expand Down
22 changes: 21 additions & 1 deletion vscode/src/edit/output/test-fixtures.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,21 @@ const CLEAN_RESPONSE = `export function logDebug<T extends string>(filterLabel:
log<T>('error', filterLabel, text, ...args)
}`

const DEFAULT_TASK = {} as FixupTask
const CLEAN_MARKDOWN_RESPONSE = `## Example heading
Some text
\`\`\`typescript
${CLEAN_RESPONSE}
\`\`\`
Some more text
\`\`\typescript
${CLEAN_RESPONSE}
\`\`\``

const DEFAULT_TASK = { document: { languageId: 'typescript' } } as FixupTask

export const RESPONSE_TEST_FIXTURES: Record<string, ResponseTestFixture> = {
clean: {
Expand Down Expand Up @@ -65,4 +79,10 @@ export const RESPONSE_TEST_FIXTURES: Record<string, ResponseTestFixture> = {
expected: '\n\n' + CLEAN_RESPONSE,
task: { ...DEFAULT_TASK, intent: 'add' } as FixupTask,
},
inMarkdownFile: {
response: CLEAN_MARKDOWN_RESPONSE,
// Markdown files should not strip Markdown code blocks
expected: CLEAN_MARKDOWN_RESPONSE,
task: { ...DEFAULT_TASK, document: { languageId: 'markdown' } } as FixupTask,
},
}
39 changes: 39 additions & 0 deletions vscode/webviews/components/MarkdownFromCody.test.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ const complicatedMarkdown = [
'Escaped \\* markdown and escaped html code &amp;lt;',
].join('\n')

const complicatedMarkdownCodeBlock = '```markdown' + complicatedMarkdown + '```'

describe('MarkdownFromCody', () => {
it('renders code blocks, with syntax highlighting', () => {
expect(render(complicatedMarkdown)).toMatchInlineSnapshot(`
Expand Down Expand Up @@ -72,6 +74,43 @@ describe('MarkdownFromCody', () => {
`)
// TODO(sqs): Not sure why the '<b>inline html</b>' B tags aren't passing through.
})
it('renders Markdown code blocks, with child code blocks preserved', () => {
expect(render(complicatedMarkdownCodeBlock)).toMatchInlineSnapshot(`
"<pre><code class="hljs language-markdown#">
## This is a subheading
Some text
in the same paragraph
with a [link](./destination).
\`\`\`ts
const someTypeScriptCode = funcCall()
\`\`\`
- bullet list item 1
- bullet list item 2
1. item 1
\`\`\`ts
const codeInsideTheBulletPoint = "string"
\`\`\`
1. item 2
> quoted
> text
| col 1 | col 2 |
|-------|-------|
| A | B |
![image alt text](./src.jpg)
<b>inline html</b>
Escaped \\* markdown and escaped html code &amp;lt;\`\`\`\`
</code></pre>"
`)
})
it('sanitizes script tags', () => {
expect(render('<script>evil();</script>')).toBe('')
})
Expand Down
31 changes: 26 additions & 5 deletions vscode/webviews/components/MarkdownFromCody.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,37 @@ const URL_PROCESSORS: Partial<Record<CodyIDE, UrlTransform>> = {
[CodyIDE.VSCode]: wrapLinksWithCodyOpenCommand,
}

export const MarkdownFromCody: FunctionComponent<{ className?: string; children: string }> = ({
className,
children,
}) => {
/**
* Transforms the children string by wrapping it in one extra backtick if we find '```markdown'.
* This is used to preserve the formatting of Markdown code blocks within the Markdown content.
* Such cases happen when you ask Cody to create a Markdown file or when you load a history chat
* that contains replies for creating Markdown files.
*
* @param children - The string to transform.
* @returns The transformed string.
*/
const childrenTransform = (children: string): string => {
if (children.indexOf('```markdown') === -1) {
return children
}
children = children.replace('```markdown', '````markdown')
const lastIdx = children.lastIndexOf('```')

// Replace the last three backticks with four backticks
return children.slice(0, lastIdx) + '````' + children.slice(lastIdx + 3)
}

export const MarkdownFromCody: FunctionComponent<{
className?: string
children: string
}> = ({ className, children }) => {
const clientType = useConfig().clientCapabilities.agentIDE
const urlTransform = useMemo(() => URL_PROCESSORS[clientType] ?? defaultUrlProcessor, [clientType])
const chatReplyTransformed = childrenTransform(children)

return (
<Markdown className={className} {...markdownPluginProps()} urlTransform={urlTransform}>
{children}
{chatReplyTransformed}
</Markdown>
)
}
Expand Down

0 comments on commit 74aeb4b

Please sign in to comment.