diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_multimodal_web_surfer.py b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_multimodal_web_surfer.py index 73c0799fff3..2c62caf2cad 100644 --- a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_multimodal_web_surfer.py +++ b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_multimodal_web_surfer.py @@ -10,12 +10,10 @@ from typing import ( Any, AsyncGenerator, - BinaryIO, Dict, List, Optional, Sequence, - cast, ) from urllib.parse import quote_plus @@ -31,6 +29,7 @@ AssistantMessage, ChatCompletionClient, LLMMessage, + ModelFamily, RequestUsage, SystemMessage, UserMessage, @@ -42,7 +41,6 @@ from ._events import WebSurferEvent from ._prompts import ( - WEB_SURFER_OCR_PROMPT, WEB_SURFER_QA_PROMPT, WEB_SURFER_QA_SYSTEM_MESSAGE, WEB_SURFER_TOOL_PROMPT_MM, @@ -444,6 +442,22 @@ async def _generate_reply(self, cancellation_token: CancellationToken) -> UserCo # Clone the messages, removing old screenshots history: List[LLMMessage] = remove_images(self._chat_history) + # Split the history, removing the last message + if len(history): + user_request = history.pop() + else: + user_request = UserMessage(content="Empty request.", source="user") + + # Truncate the history for smaller models + if self._model_client.model_info["family"] not in [ + ModelFamily.GPT_4O, + ModelFamily.O1, + ModelFamily.O3, + ModelFamily.GPT_4, + ModelFamily.GPT_35, + ]: + history = [] + # Ask the page for interactive elements, then prepare the state-of-mark screenshot rects = await self._playwright_controller.get_interactive_rects(self._page) viewport = await self._playwright_controller.get_visual_viewport(self._page) @@ -499,21 +513,31 @@ async def _generate_reply(self, cancellation_token: CancellationToken) -> UserCo other_targets.extend(self._format_target_list(rects_below, rects)) if len(other_targets) > 0: + if len(other_targets) > 30: + other_targets = other_targets[0:30] + other_targets.append("...") other_targets_str = ( - "Additional valid interaction targets (not shown) include:\n" + "\n".join(other_targets) + "\n\n" + "Additional valid interaction targets include (but are not limited to):\n" + + "\n".join(other_targets) + + "\n\n" ) else: other_targets_str = "" + state_description = "Your " + await self._get_state_description() tool_names = "\n".join([t["name"] for t in tools]) + page_title = await self._page.title() + prompt_message = None if self._model_client.model_info["vision"]: text_prompt = WEB_SURFER_TOOL_PROMPT_MM.format( - url=self._page.url, + state_description=state_description, visible_targets=visible_targets, other_targets_str=other_targets_str, focused_hint=focused_hint, tool_names=tool_names, + title=page_title, + url=self._page.url, ).strip() # Scale the screenshot for the MLM, and close the original @@ -522,26 +546,42 @@ async def _generate_reply(self, cancellation_token: CancellationToken) -> UserCo if self.to_save_screenshots: scaled_screenshot.save(os.path.join(self.debug_dir, "screenshot_scaled.png")) # type: ignore - # Add the message - history.append(UserMessage(content=[text_prompt, AGImage.from_pil(scaled_screenshot)], source=self.name)) + # Create the message + prompt_message = UserMessage( + content=[re.sub(r"(\n\s*){3,}", "\n\n", text_prompt), AGImage.from_pil(scaled_screenshot)], + source=self.name, + ) else: - visible_text = await self._playwright_controller.get_visible_text(self._page) - text_prompt = WEB_SURFER_TOOL_PROMPT_TEXT.format( - url=self._page.url, + state_description=state_description, visible_targets=visible_targets, other_targets_str=other_targets_str, focused_hint=focused_hint, tool_names=tool_names, - visible_text=visible_text.strip(), + title=page_title, + url=self._page.url, ).strip() - # Add the message - history.append(UserMessage(content=text_prompt, source=self.name)) + # Create the message + prompt_message = UserMessage(content=re.sub(r"(\n\s*){3,}", "\n\n", text_prompt), source=self.name) + + history.append(prompt_message) + history.append(user_request) + + # {history[-2].content if isinstance(history[-2].content, str) else history[-2].content[0]} + # print(f""" + # ================={len(history)}================= + # {history[-2].content} + # ===== + # {history[-1].content} + # =================================================== + # """) + # Make the request response = await self._model_client.create( history, tools=tools, extra_create_args={"tool_choice": "auto"}, cancellation_token=cancellation_token ) # , "parallel_tool_calls": False}) + self.model_usage.append(response.usage) message = response.content self._last_download = None @@ -716,23 +756,12 @@ async def _execute_tool( metadata_hash = hashlib.md5(page_metadata.encode("utf-8")).hexdigest() if metadata_hash != self._prior_metadata_hash: page_metadata = ( - "\nThe following metadata was extracted from the webpage:\n\n" + page_metadata.strip() + "\n" + "\n\nThe following metadata was extracted from the webpage:\n\n" + page_metadata.strip() + "\n" ) else: page_metadata = "" self._prior_metadata_hash = metadata_hash - # Describe the viewport of the new page in words - viewport = await self._playwright_controller.get_visual_viewport(self._page) - percent_visible = int(viewport["height"] * 100 / viewport["scrollHeight"]) - percent_scrolled = int(viewport["pageTop"] * 100 / viewport["scrollHeight"]) - if percent_scrolled < 1: # Allow some rounding error - position_text = "at the top of the page" - elif percent_scrolled + percent_visible >= 99: # Allow some rounding error - position_text = "at the bottom of the page" - else: - position_text = str(percent_scrolled) + "% down from the top of the page" - new_screenshot = await self._page.screenshot() if self.to_save_screenshots: current_timestamp = "_" + int(time.time()).__str__() @@ -748,25 +777,40 @@ async def _execute_tool( ) ) - ocr_text = ( - await self._get_ocr_text(new_screenshot, cancellation_token=cancellation_token) - if self.use_ocr is True - else await self._playwright_controller.get_visible_text(self._page) - ) - # Return the complete observation - page_title = await self._page.title() - message_content = f"{action_description}\n\n Here is a screenshot of the webpage: [{page_title}]({self._page.url}).\n The viewport shows {percent_visible}% of the webpage, and is positioned {position_text} {page_metadata}\n" - if self.use_ocr: - message_content += f"Automatic OCR of the page screenshot has detected the following text:\n\n{ocr_text}" - else: - message_content += f"The following text is visible in the viewport:\n\n{ocr_text}" + state_description = "The " + await self._get_state_description() + message_content = ( + f"{action_description}\n\n" + state_description + page_metadata + "\nHere is a screenshot of the page." + ) return [ - message_content, + re.sub(r"(\n\s*){3,}", "\n\n", message_content), # Removing blank lines AGImage.from_pil(PIL.Image.open(io.BytesIO(new_screenshot))), ] + async def _get_state_description(self) -> str: + assert self._playwright_controller is not None + assert self._page is not None + + # Describe the viewport of the new page in words + viewport = await self._playwright_controller.get_visual_viewport(self._page) + percent_visible = int(viewport["height"] * 100 / viewport["scrollHeight"]) + percent_scrolled = int(viewport["pageTop"] * 100 / viewport["scrollHeight"]) + if percent_scrolled < 1: # Allow some rounding error + position_text = "at the top of the page" + elif percent_scrolled + percent_visible >= 99: # Allow some rounding error + position_text = "at the bottom of the page" + else: + position_text = str(percent_scrolled) + "% down from the top of the page" + + visible_text = await self._playwright_controller.get_visible_text(self._page) + + # Return the complete observation + page_title = await self._page.title() + message_content = f"web browser is open to the page [{page_title}]({self._page.url}).\nThe viewport shows {percent_visible}% of the webpage, and is positioned {position_text}\n" + message_content += f"The following text is visible in the viewport:\n\n{visible_text}" + return message_content + def _target_name(self, target: str, rects: Dict[str, InteractiveRegion]) -> str | None: try: return rects[target]["aria_name"].strip() @@ -798,38 +842,6 @@ def _format_target_list(self, ids: List[str], rects: Dict[str, InteractiveRegion return targets - async def _get_ocr_text( - self, image: bytes | io.BufferedIOBase | PIL.Image.Image, cancellation_token: Optional[CancellationToken] = None - ) -> str: - scaled_screenshot = None - if isinstance(image, PIL.Image.Image): - scaled_screenshot = image.resize((self.MLM_WIDTH, self.MLM_HEIGHT)) - else: - pil_image = None - if not isinstance(image, io.BufferedIOBase): - pil_image = PIL.Image.open(io.BytesIO(image)) - else: - pil_image = PIL.Image.open(cast(BinaryIO, image)) - scaled_screenshot = pil_image.resize((self.MLM_WIDTH, self.MLM_HEIGHT)) - pil_image.close() - - # Add the multimodal message and make the request - messages: List[LLMMessage] = [] - messages.append( - UserMessage( - content=[ - WEB_SURFER_OCR_PROMPT, - AGImage.from_pil(scaled_screenshot), - ], - source=self.name, - ) - ) - response = await self._model_client.create(messages, cancellation_token=cancellation_token) - self.model_usage.append(response.usage) - scaled_screenshot.close() - assert isinstance(response.content, str) - return response.content - async def _summarize_page( self, question: str | None = None, diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_prompts.py b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_prompts.py index 59a0a7c95d5..d1f1885240e 100644 --- a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_prompts.py +++ b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_prompts.py @@ -1,43 +1,42 @@ WEB_SURFER_TOOL_PROMPT_MM = """ -Consider the following screenshot of a web browser, which is open to the page '{url}'. In this screenshot, interactive elements are outlined in bounding boxes of different colors. Each bounding box has a numeric ID label in the same color. Additional information about each visible label is listed below: +{state_description} + +Consider the following screenshot of the page. In this screenshot, interactive elements are outlined in bounding boxes of different colors. Each bounding box has a numeric ID label in the same color. Additional information about each visible label is listed below: {visible_targets}{other_targets_str}{focused_hint} -You are to respond to the most recent request by selecting an appropriate tool from the following set, or by answering the question directly if possible without tools: +You are to respond to my next request by selecting an appropriate tool from the following set, or by answering the question directly if possible: {tool_names} When deciding between tools, consider if the request can be best addressed by: - - the contents of the current viewport (in which case actions like clicking links, clicking buttons, inputting text, or hovering over an element might be most appropriate) - - contents found elsewhere on the full webpage (in which case actions like scrolling, summarization, or full-page Q&A might be most appropriate) - - on some other website entirely (in which case actions like performing a new web search might be the best option) + - the contents of the CURRENT VIEWPORT (in which case actions like clicking links, clicking buttons, inputting text, or hovering over an element, might be more appropriate) + - contents found elsewhere on the CURRENT WEBPAGE [{title}]({url}), in which case actions like scrolling, summarization, or full-page Q&A might be most appropriate + - on ANOTHER WEBSITE entirely (in which case actions like performing a new web search might be the best option) + +My request follows: """ WEB_SURFER_TOOL_PROMPT_TEXT = """ -Your web browser is open to the page '{url}'. The following text is visible in the viewport: - -``` -{visible_text} -``` +{state_description} You have also identified the following interactive components: {visible_targets}{other_targets_str}{focused_hint} -You are to respond to the most recent request by selecting an appropriate tool from the following set, or by answering the question directly if possible without tools: +You are to respond to my next request by selecting an appropriate tool from the following set, or by answering the question directly if possible: {tool_names} When deciding between tools, consider if the request can be best addressed by: - - the contents of the current viewport (in which case actions like clicking links, clicking buttons, inputting text, or hovering over an element might be most appropriate) - - contents found elsewhere on the full webpage (in which case actions like scrolling, summarization, or full-page Q&A might be most appropriate) - - on some other website entirely (in which case actions like performing a new web search might be the best option) -""" + - the contents of the CURRENT VIEWPORT (in which case actions like clicking links, clicking buttons, inputting text, or hovering over an element, might be more appropriate) + - contents found elsewhere on the CURRENT WEBPAGE [{title}]({url}), in which case actions like scrolling, summarization, or full-page Q&A might be most appropriate + - on ANOTHER WEBSITE entirely (in which case actions like performing a new web search might be the best option) -WEB_SURFER_OCR_PROMPT = """ -Please transcribe all visible text on this page, including both main content and the labels of UI elements. +My request follows: """ + WEB_SURFER_QA_SYSTEM_MESSAGE = """ You are a helpful assistant that can summarize long documents to answer question. """ diff --git a/python/packages/autogen-ext/tests/test_websurfer_agent.py b/python/packages/autogen-ext/tests/test_websurfer_agent.py index a2aa33a1093..37423bfe6a5 100644 --- a/python/packages/autogen-ext/tests/test_websurfer_agent.py +++ b/python/packages/autogen-ext/tests/test_websurfer_agent.py @@ -140,7 +140,7 @@ async def test_run_websurfer(monkeypatch: pytest.MonkeyPatch) -> None: result.messages[2] # type: ignore .content[0] # type: ignore .startswith( # type: ignore - "I am waiting a short period of time before taking further action.\n\n Here is a screenshot of the webpage:" + "I am waiting a short period of time before taking further action." ) ) # type: ignore url_after_sleep = agent._page.url # type: ignore