Skip to content

Commit f4c49f0

Browse files
Schema discovery: Add page summaries to help ground the candidate user action responses (#659)
1 parent 3588ffd commit f4c49f0

File tree

6 files changed

+214
-39
lines changed

6 files changed

+214
-39
lines changed

ts/packages/agents/browser/src/agent/discovery/actionHandler.mts

+38-1
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,36 @@ export async function handleSchemaDiscoveryAction(
2424
case "findUserActions":
2525
await handleFindUserActions(action);
2626
break;
27+
case "summarizePage":
28+
await handleGetPageSummary(action);
29+
break;
2730
}
2831

2932
async function handleFindUserActions(action: any) {
3033
const htmlFragments = await browser.getHtmlFragments();
34+
// const screenshot = await browser.getCurrentPageScreenshot();
35+
const screenshot = "";
36+
let pageSummary = "";
37+
38+
const summaryResponse = await agent.getPageSummary(
39+
undefined,
40+
htmlFragments,
41+
screenshot,
42+
);
43+
44+
if (summaryResponse.success) {
45+
pageSummary =
46+
"Page summary: \n" + JSON.stringify(summaryResponse.data, null, 2);
47+
}
48+
3149
const timerName = `Analyzing page actions`;
3250
console.time(timerName);
51+
3352
const response = await agent.getCandidateUserActions(
3453
undefined,
3554
htmlFragments,
36-
undefined,
55+
screenshot,
56+
pageSummary,
3757
);
3858

3959
if (!response.success) {
@@ -48,5 +68,22 @@ export async function handleSchemaDiscoveryAction(
4868
return response.data;
4969
}
5070

71+
// Handles the "summarizePage" action: fetches the current page's HTML
// fragments, asks the agent for a page summary, and stores a formatted copy
// of the result in the enclosing function's `message` variable.
//
// `action` is accepted for signature parity with the other handlers but is
// not read here.
//
// Returns the summary data on success, or undefined when the agent reports a
// failure (the failure message is logged to the console).
async function handleGetPageSummary(action: any) {
    const htmlFragments = await browser.getHtmlFragments();
    const timerName = `Summarizing page`;
    console.time(timerName);
    try {
        const response = await agent.getPageSummary(undefined, htmlFragments);

        if (!response.success) {
            console.error("Attempt to get page summary failed");
            console.error(response.message);
            return;
        }

        message = "Page summary: \n" + JSON.stringify(response.data, null, 2);
        return response.data;
    } finally {
        // Stop the timer on every exit path (success, reported failure, or a
        // thrown error). Otherwise the label stays registered and the next
        // console.time() call with the same label emits a warning.
        console.timeEnd(timerName);
    }
}
87+
5188
return message;
5289
}

ts/packages/agents/browser/src/agent/discovery/schema/discoveryActions.mts

+3
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ export type FindUserActions = {
1414

1515
export type SummarizePage = {
1616
actionName: "summarizePage";
17+
parameters: {
18+
allowDuplicates?: boolean;
19+
};
1720
};
1821

1922
export type SaveUserActions = {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT License.
3+
4+
// A description of the page, including layout information and summary of content.
5+
export type PageDescription = {
6+
description: string;
7+
features: string[];
8+
entities: string[];
9+
possibleUserAction: string[];
10+
};
Original file line numberDiff line numberDiff line change
@@ -1,55 +1,71 @@
11
// Copyright (c) Microsoft Corporation.
22
// Licensed under the MIT License.
33

4-
export type SearchBox = {
5-
featureName: "searchInputBox";
6-
description: "Input box for searching on the page";
7-
parameters: {
8-
cssSelector: string;
9-
};
10-
};
11-
12-
export type SearchResultsList = {
13-
featureName: "searchResultsList";
14-
description: "List of products available from the search results";
15-
parameters: {
16-
cssSelector: string;
17-
};
18-
};
19-
20-
export type ProductDetailsCard = {
21-
featureName: "productDetailsCard";
22-
description: "A section that shows the product name, price, images and rating. This also gives an option to add the product to the shopping cart.";
23-
parameters: {
24-
cssSelector: string;
25-
};
26-
};
27-
28-
export type SearchForContent = {
29-
actionName: "searchForProduct";
30-
description: "Find content on the page";
31-
parameters: {
32-
value: string;
33-
cssSelector: string;
34-
};
35-
};
36-
374
export type LandingPage = {
385
description: "The default landing page for the site";
39-
features: SearchBox;
406
};
417

428
export type SearchResultsPage = {
439
description: "The search results page";
44-
features: SearchResultsList;
4510
};
4611

4712
export type ProductDetailsPage = {
4813
description: "A product details page, with focus on one product.";
49-
features: ProductDetailsCard;
5014
};
5115

5216
export type ShoppingCartPage = {
5317
description: "The shopping cart page for the site";
54-
features: SearchBox;
5518
};
19+
20+
export type PastOrderPage = {
    description: "The page showing a user's past orders";
};

// Fallback category for pages that match none of the more specific types.
export type UnknownPage = {
    description: "A page that does not meet the previous more-specific categories";
};

// All page categories recognized for commerce sites.
export type CommercePageTypes =
    | LandingPage
    | SearchResultsPage
    | ProductDetailsPage
    | ShoppingCartPage
    | PastOrderPage
    | UnknownPage;

export type CrosswordPage = {
    description: "The page showing a crossword puzzle";
};

export type NewsLandingPage = {
    description: "The page showing news headlines for the day";
};

export type SportsLandingPage = {
    description: "The page showing sports headlines for the day";
};

export type OpinionPage = {
    description: "The page showing editorial opinions for the day";
};

export type ArticlePage = {
    description: "The page showing an individual news article";
};

export type WeatherPage = {
    description: "The page showing weather headlines";
};

export type PuzzlesPage = {
    description: "The page showing a list of puzzles, such as sudoku, crossword, word matching games and more.";
};

// All page categories recognized for news sites. WeatherPage is included so
// that every news page type declared above is reachable through this union.
export type NewsPageTypes =
    | CrosswordPage
    | NewsLandingPage
    | SportsLandingPage
    | OpinionPage
    | ArticlePage
    | WeatherPage
    | PuzzlesPage
    | UnknownPage;

ts/packages/agents/browser/src/agent/discovery/schema/userActionsPool.mts

+26-1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ export type SearchForProductAction = {
2828
};
2929
};
3030

31+
// This allows users to select individual results on the search results page.
3132
export type SelectSearchResult = {
3233
actionName: "selectSearchResult";
3334
parameters: {
@@ -38,39 +39,63 @@ export type SelectSearchResult = {
3839

3940
export type NavigateToHomePage = {
4041
actionName: "navigateToHomePage";
42+
parameters: {
43+
linkCssSelector: string;
44+
};
4145
};
4246

4347
// Follow a link to view a store landing page
4448
export type NavigateToStorePage = {
4549
actionName: "navigateToStorePage";
50+
parameters: {
51+
linkCssSelector: string;
52+
};
4653
};
4754

4855
// Follow a link to view a product details page
4956
export type NavigateToProductPage = {
5057
actionName: "navigateToProductPage";
58+
parameters: {
59+
linkCssSelector: string;
60+
};
5161
};
5262

53-
// Follow a link to view a recipe details page
63+
// Follow a link to view a recipe details page. This link is typically named "Recipe" or "Recipes"
5464
export type NavigateToRecipePage = {
5565
actionName: "navigateToRecipePage";
66+
parameters: {
67+
linkCssSelector: string;
68+
};
5669
};
5770

5871
export type NavigateToListPage = {
5972
actionName: "navigateToListPage";
73+
parameters: {
74+
linkCssSelector: string;
75+
};
6076
};
6177

78+
// Navigate to the "Buy it again" page. This page may also be called Past Orders.
6279
export type NavigateToBuyItAgainPage = {
6380
actionName: "navigateToBuyItAgainPage";
81+
parameters: {
82+
linkCssSelector: string;
83+
};
6484
};
6585

86+
// This link opens the shopping cart. It's usually indicated by a cart or bag icon.
6687
export type NavigateToShoppingCartPage = {
6788
actionName: "navigateToShoppingCartPage";
89+
parameters: {
90+
linkCssSelector: string;
91+
};
6892
};
6993

7094
export type NavigateToOtherPage = {
7195
actionName: "navigateToOtherPage";
7296
parameters: {
7397
pageType: string;
98+
linkCssSelector: string;
7499
};
75100
};
76101

ts/packages/agents/browser/src/agent/discovery/translator.mts

+84
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,7 @@ export class SchemaDiscoveryAgent<T extends object> {
250250
userRequest?: string,
251251
fragments?: HtmlFragments[],
252252
screenshot?: string,
253+
pageSummary?: string,
253254
) {
254255
// prompt - present html, optional screenshot and list of candidate actions
255256
const bootstrapTranslator = this.getBootstrapTranslator(
@@ -273,6 +274,19 @@ export class SchemaDiscoveryAgent<T extends object> {
273274
`,
274275
});
275276
}
277+
if (pageSummary) {
278+
requestSection.push({
279+
type: "text",
280+
text: `
281+
282+
Here is a previously-generated summary of the page
283+
'''
284+
${pageSummary}
285+
'''
286+
`,
287+
});
288+
}
289+
276290
const promptSections = [
277291
...prefixSection,
278292
...screenshotSection,
@@ -303,4 +317,74 @@ export class SchemaDiscoveryAgent<T extends object> {
303317
]);
304318
return response;
305319
}
320+
321+
/**
 * Asks the model for a single "PageDescription" object describing the page.
 *
 * The prompt is assembled from the bootstrap prefix, an optional screenshot
 * section, the HTML fragment section, the PageDescription schema text and,
 * when supplied, the user's request.
 *
 * @param userRequest Optional user request text appended to the prompt.
 * @param fragments Optional HTML fragments of the page.
 * @param screenshot Optional screenshot passed to the screenshot prompt section.
 * @returns The translator's response; callers check `success` before reading `data`.
 */
async getPageSummary(
    userRequest?: string,
    fragments?: HtmlFragments[],
    screenshot?: string,
) {
    // The schema source file is read at call time; its text becomes part of
    // the prompt via the translator's validator below.
    const packageRoot = path.join("..", "..", "..");
    const resultsSchema = await fs.promises.readFile(
        fileURLToPath(
            new URL(
                path.join(
                    packageRoot,
                    "./src/agent/discovery/schema/pageSummary.mts",
                ),
                import.meta.url,
            ),
        ),
        "utf8",
    );

    const bootstrapTranslator = this.getBootstrapTranslator(
        "PageDescription",
        resultsSchema,
    );

    const screenshotSection = getScreenshotPromptSection(screenshot, fragments);
    const htmlSection = getHtmlPromptSection(fragments);
    const prefixSection = getBootstrapPrefixPromptSection();
    const requestSection: { type: string; text: string }[] = [];
    if (userRequest) {
        requestSection.push({
            type: "text",
            text: `

Here is user request
'''
${userRequest}
'''
`,
        });
    }
    const promptSections = [
        ...prefixSection,
        ...screenshotSection,
        ...htmlSection,
        {
            type: "text",
            text: `
Examine the layout information provided and determine the content of the page and the actions users can take on it.
Once you have this list, provide a SINGLE "PageDescription" response using the typescript schema below.

'''
${bootstrapTranslator.validator.getSchemaText()}
'''
`,
        },
        ...requestSection,
        {
            type: "text",
            text: `
The following is the COMPLETE JSON response object with 2 spaces of indentation and no properties with the value undefined:
`,
        },
    ];

    // The prompt sections are serialized to JSON and sent as one user message.
    const response = await bootstrapTranslator.translate("", [
        { role: "user", content: JSON.stringify(promptSections) },
    ]);
    return response;
}
306390
}

0 commit comments

Comments
 (0)