From 7901207426517bf614bb9e22f64867a5bd4861eb Mon Sep 17 00:00:00 2001 From: Hillary Mutisya <150286414+hillary-mutisya@users.noreply.github.com> Date: Mon, 3 Feb 2025 17:57:13 -0800 Subject: [PATCH] Determining page layout (#661) --- .../browser/src/agent/browserConnector.mts | 6 +- .../src/agent/discovery/actionHandler.mts | 21 +++++ .../discovery/schema/discoveryActions.mts | 6 -- .../src/agent/discovery/schema/pageLayout.mts | 9 ++ .../src/agent/discovery/translator.mts | 86 ++++++++++++++++--- .../browser/src/extension/htmlReducer.ts | 5 ++ .../browser/src/extension/serviceWorker.ts | 7 +- 7 files changed, 119 insertions(+), 21 deletions(-) create mode 100644 ts/packages/agents/browser/src/agent/discovery/schema/pageLayout.mts diff --git a/ts/packages/agents/browser/src/agent/browserConnector.mts b/ts/packages/agents/browser/src/agent/browserConnector.mts index a9090d875..605fe44e5 100644 --- a/ts/packages/agents/browser/src/agent/browserConnector.mts +++ b/ts/packages/agents/browser/src/agent/browserConnector.mts @@ -80,13 +80,17 @@ export class BrowserConnector { return []; } - async getFilteredHtmlFragments(inputHtmlFragments: any[]) { + async getFilteredHtmlFragments( + inputHtmlFragments: any[], + cssSelectorsToKeep: string[], + ) { let htmlFragments: any[] = []; const timeoutPromise = new Promise((f) => setTimeout(f, 5000)); const filterAction = { actionName: "getFilteredHTMLFragments", parameters: { fragments: inputHtmlFragments, + cssSelectorsToKeep: cssSelectorsToKeep, }, }; diff --git a/ts/packages/agents/browser/src/agent/discovery/actionHandler.mts b/ts/packages/agents/browser/src/agent/discovery/actionHandler.mts index 98f532fba..60c57253c 100644 --- a/ts/packages/agents/browser/src/agent/discovery/actionHandler.mts +++ b/ts/packages/agents/browser/src/agent/discovery/actionHandler.mts @@ -27,6 +27,9 @@ export async function handleSchemaDiscoveryAction( case "summarizePage": await handleGetPageSummary(action); break; + case "findPageComponents": + await handleGetPageComponents(action); + break; } async function handleFindUserActions(action: any) { @@ -85,5 +88,23 @@ export async function handleSchemaDiscoveryAction( return response.data; } + async function handleGetPageComponents(action: any) { + const htmlFragments = await browser.getHtmlFragments(); + const timerName = `Getting page layout`; + console.time(timerName); + const response = await agent.getPageLayout(undefined, htmlFragments); + + if (!response.success) { + console.error("Attempt to get page layout failed"); + console.error(response.message); + return; + } + + console.timeEnd(timerName); + message = "Page layout: \n" + JSON.stringify(response.data, null, 2); + + return response.data; + } + return message; } diff --git a/ts/packages/agents/browser/src/agent/discovery/schema/discoveryActions.mts b/ts/packages/agents/browser/src/agent/discovery/schema/discoveryActions.mts index 1ebc8189d..e45ab0de1 100644 --- a/ts/packages/agents/browser/src/agent/discovery/schema/discoveryActions.mts +++ b/ts/packages/agents/browser/src/agent/discovery/schema/discoveryActions.mts @@ -7,16 +7,10 @@ export type FindPageComponents = { export type FindUserActions = { actionName: "findUserActions"; - parameters: { - allowDuplicates?: boolean; - }; }; export type SummarizePage = { actionName: "summarizePage"; - parameters: { - allowDuplicates?: boolean; - }; }; export type SaveUserActions = { diff --git a/ts/packages/agents/browser/src/agent/discovery/schema/pageLayout.mts b/ts/packages/agents/browser/src/agent/discovery/schema/pageLayout.mts new file mode 100644 index 000000000..f26e66281 --- /dev/null +++ b/ts/packages/agents/browser/src/agent/discovery/schema/pageLayout.mts @@ -0,0 +1,9 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +export type PageLayout = { + headerCSSSelector: string; + footerCSSSelector: string; + navigationLinksCSSSelector: string; + mainContentCSSSelector: string; +}; diff --git a/ts/packages/agents/browser/src/agent/discovery/translator.mts b/ts/packages/agents/browser/src/agent/discovery/translator.mts index 8fe660591..ba5071389 100644 --- a/ts/packages/agents/browser/src/agent/discovery/translator.mts +++ b/ts/packages/agents/browser/src/agent/discovery/translator.mts @@ -79,17 +79,18 @@ function getScreenshotPromptSection( url: screenshot, }, }); - } - if (fragments) { - const textFragments = fragments.map((a) => a.text); - screenshotSection.push({ - type: "text", - text: `Here is the text content of the page + + if (fragments) { + const textFragments = fragments.map((a) => a.text); + screenshotSection.push({ + type: "text", + text: `Here is the text content of the page ''' ${textFragments} ''' `, - }); + }); + } } return screenshotSection; } @@ -176,7 +177,6 @@ export class SchemaDiscoveryAgent { requestSection.push({ type: "text", text: ` - Here is user request ''' ${userRequest} @@ -266,7 +266,6 @@ export class SchemaDiscoveryAgent { requestSection.push({ type: "text", text: ` - Here is user request ''' ${userRequest} @@ -366,7 +365,74 @@ export class SchemaDiscoveryAgent { type: "text", text: ` Examine the layout information provided and determine the content of the page and the actions users can take on it. - Once you have this list, a SINGLE "PageDescription" response using the typescript schema below. + Once you have this list, a SINGLE "${bootstrapTranslator.validator.getTypeName()}" response using the typescript schema below. + + ''' + ${bootstrapTranslator.validator.getSchemaText()} + ''' + `, + }, + ...requestSection, + { + type: "text", + text: ` + The following is the COMPLETE JSON response object with 2 spaces of indentation and no properties with the value undefined: + `, + }, + ]; + + const response = await bootstrapTranslator.translate("", [ + { role: "user", content: JSON.stringify(promptSections) }, + ]); + return response; + } + + async getPageLayout( + userRequest?: string, + fragments?: HtmlFragments[], + screenshot?: string, + ) { + const packageRoot = path.join("..", "..", ".."); + const resultsSchema = await fs.promises.readFile( + fileURLToPath( + new URL( + path.join(packageRoot, "./src/agent/discovery/schema/PageLayout.mts"), + import.meta.url, + ), + ), + "utf8", + ); + + const bootstrapTranslator = this.getBootstrapTranslator( + "PageLayout", + resultsSchema, + ); + + const screenshotSection = getScreenshotPromptSection(screenshot, fragments); + const htmlSection = getHtmlPromptSection(fragments); + const prefixSection = getBootstrapPrefixPromptSection(); + let requestSection = []; + if (userRequest) { + requestSection.push({ + type: "text", + text: ` + + Here is user request + ''' + ${userRequest} + ''' + `, + }); + } + const promptSections = [ + ...prefixSection, + ...screenshotSection, + ...htmlSection, + { + type: "text", + text: ` + Examine the layout information provided and determine the content of the page and the actions users can take on it. + Once you have this list, a SINGLE "${bootstrapTranslator.validator.getTypeName()}" response using the typescript schema below. ''' ${bootstrapTranslator.validator.getSchemaText()} diff --git a/ts/packages/agents/browser/src/extension/htmlReducer.ts b/ts/packages/agents/browser/src/extension/htmlReducer.ts index aad29703a..ad4578e3b 100644 --- a/ts/packages/agents/browser/src/extension/htmlReducer.ts +++ b/ts/packages/agents/browser/src/extension/htmlReducer.ts @@ -46,6 +46,7 @@ export class HTMLReducer { "nocontent", "noscript", "template", + "img", ]; mediaElementSelectors: string[] = [ @@ -68,6 +69,10 @@ export class HTMLReducer { "clickid", "fetchpriority", "srcset", + "aria-busy", + "aria-haspopup", + "aria-autocomplete", + "href", ]; attribsToReplace: Set = new Set(["href", "src"]); diff --git a/ts/packages/agents/browser/src/extension/serviceWorker.ts b/ts/packages/agents/browser/src/extension/serviceWorker.ts index ad27d37a3..7eb21a2cb 100644 --- a/ts/packages/agents/browser/src/extension/serviceWorker.ts +++ b/ts/packages/agents/browser/src/extension/serviceWorker.ts @@ -730,6 +730,7 @@ async function getTabHTMLFragmentsBySize( async function getFilteredHTMLFragments( targetTab: chrome.tabs.Tab, inputHtmlFragments: any[], + cssSelectorsToKeep: string[], ) { let htmlFragments: any[] = []; @@ -740,10 +741,7 @@ async function getFilteredHTMLFragments( { type: "get_filtered_html_fragments", inputHtml: inputHtmlFragments[i].content, - cssSelectors: [ - inputHtmlFragments[i].cssSelectorAcross, - inputHtmlFragments[i].cssSelectorDown, - ].join(", "), + cssSelectors: cssSelectorsToKeep.join(", "), frameId: inputHtmlFragments[i].frameId, }, { frameId: inputHtmlFragments[i].frameId }, @@ -1165,6 +1163,7 @@ async function runBrowserAction(action: any) { responseObject = await getFilteredHTMLFragments( targetTab, action.parameters.fragments, + action.parameters.cssSelectorsToKeep, ); break; }