diff --git a/.gitignore b/.gitignore
index c090b154..5b6656e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
# Ignore Jupyter Notebook checkpoints
.ipynb_checkpoints
/test/*
+/testing/*
/deprecated/*
/test/*.ipynb
/logs/*
@@ -38,4 +39,7 @@ scripts/*
.vscode
# Ignore the record files
-tasks_status.json
\ No newline at end of file
+tasks_status.json
+datas
+_datas
+datasUFO
diff --git a/README.md b/README.md
index 2d12dc56..0eb9f8a1 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,9 @@ Both agents leverage the multi-modal capabilities of GPT-4V(o) to comprehend the
## 📢 News
+- 📅 2025-01-21: We’re excited to announce the release of **v1.2.1**! 🎉 This update includes:
+  1. **Bug Fixes**: Resolved issues in `requirements.txt` for a smoother setup.
+  2. **Multi-Action Mode**: Introducing a powerful new feature to execute **multiple actions** in a single inference step! Enable this mode by setting `ACTION_SEQUENCE=True` in `config_dev.yaml` and enjoy a more efficient workflow.
- 📅 2024-12-13: We have a new release for **v1.2.0**! Check out our new features and improvements:
  1. **Large Action Model (LAM) Data Collection:** We have released the code and sample data for Large Action Model (LAM) data collection with UFO! Please check out our [new paper](https://arxiv.org/abs/2412.10047), [code](dataflow/README.md) and [documentation](https://microsoft.github.io/UFO/dataflow/overview/) for more details.
  2. **Bash Command Support:** HostAgent also supports bash commands now!
diff --git a/dataflow/.gitignore b/dataflow/.gitignore
index ef4cbe6b..9f292de2 100644
--- a/dataflow/.gitignore
+++ b/dataflow/.gitignore
@@ -2,4 +2,10 @@
cache/
controls_cache/
controller/utils/
-config/config.yaml
\ No newline at end of file
+config/config.yaml
+tasks/
+logs/
+results/
+_logs
+_results/
+*.zip
diff --git a/dataflow/README.md b/dataflow/README.md
index 2684e493..7f87b10b 100644
--- a/dataflow/README.md
+++ b/dataflow/README.md
@@ -189,21 +189,23 @@ Also, you can choose to use `instantiation` / `execution` sections individually,
The default task hub is set to be `"TASKS_HUB"` in `dataflow/config_dev.yaml`.
-1. Dataflow Task:
+You can use `"TEMPLATE_METHOD"` in `dataflow/config_dev.yaml` to choose `LLM` or `SemanticSimilarity` as the backend for template selection. If you choose `LLM`, the visual version is used, so you need to manually generate screenshots in the `templates/"YOUR_APP"/images` directory; each filename should match its template name, and the screenshots should be in `PNG` format (see the layout sketch below).
-- ```bash
-  python -m dataflow -dataflow --task_path path_to_task_file
+* Dataflow Task:
+
+  ```bash
+  python -m dataflow --dataflow --task_path path_to_task_file
  ```
* Instantiation Task:
  ```bash
-  python -m dataflow -instantiation --task_path path_to_task_file
+  python -m dataflow --instantiation --task_path path_to_task_file
  ```
* Execution Task:
  ```bash
-  python -m dataflow -execution --task_path path_to_task_file
+  python -m dataflow --execution --task_path path_to_task_file
  ```
## Workflow
@@ -221,6 +223,7 @@
Given the initial task, the dataflow first chooses a template (`Phase 1`), the pr

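For reference, a minimal sketch of the template hub layout that the `LLM` template selection method expects — it mirrors the Word templates added later in this diff (`1.docx`–`7.docx`, a `description.json`, and one matching `PNG` screenshot per template); the exact file names are illustrative:

```
dataflow/templates/word/
 ┣ 1.docx ... 7.docx    # candidate template documents
 ┣ description.json     # maps each template file name to a short description
 ┗ images/
   ┣ 1.png ... 7.png    # screenshots named after their templates (required when TEMPLATE_METHOD is "LLM")
```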
+#### 1. Choose Template File
Templates for your app must be defined and described in `dataflow/templates/app`. For instance, if you want to instantiate tasks for the Word application, place the relevant `.docx` files in `dataflow/templates/word`, along with a `description.json` file.
diff --git a/dataflow/config/config_dev.yaml b/dataflow/config/config_dev.yaml
index ff0e73df..caf78cfd 100644
--- a/dataflow/config/config_dev.yaml
+++ b/dataflow/config/config_dev.yaml
@@ -11,6 +11,14 @@ FILTER_PROMPT: "dataflow/prompts/instantiation/{mode}/filter.yaml" # The prompt
PREFILL_EXAMPLE_PROMPT: "dataflow/prompts/instantiation/{mode}/prefill_example.yaml" # The prompt for the action prefill example
API_PROMPT: "ufo/prompts/share/lite/api.yaml" # The prompt for the API
+# Template Configuration
+TEMPLATE_METHOD: "LLM" # The template selection method; supports 'SemanticSimilarity' and 'LLM'.
+TEMPLATE_PROMPT: "dataflow/prompts/instantiation/{mode}/template.yaml" # The prompt for template selection
+
+# Reformat Configuration
+REFORMAT_TO_BATCH: True # Whether to reformat the dataflow results into the UFO batch-mode format
+REFORMAT_TO_BATCH_HUB: "datasUFO" # The output path for the reformatted results
+
# Default Task Configuration
TASKS_HUB: "dataflow/tasks/prefill" # The default tasks hub for batch dataflow
TEMPLATE_PATH: "dataflow/templates" # The template path for the exploration
diff --git a/dataflow/data_flow_controller.py b/dataflow/data_flow_controller.py
index e9dfccbe..d70def83 100644
--- a/dataflow/data_flow_controller.py
+++ b/dataflow/data_flow_controller.py
@@ -4,6 +4,7 @@
from enum import Enum
from typing import Any, Dict, Optional, List
from jsonschema import validate, ValidationError
+import shutil
from dataflow.env.env_manager import WindowsAppEnv
from dataflow.instantiation.workflow.choose_template_flow import ChooseTemplateFlow
@@ -13,7 +14,7 @@
from dataflow.config.config import Config
from ufo.utils import print_with_color
-from learner.utils import load_json_file, save_json_file
+from learner.utils import load_json_file, save_json_file, reformat_json_file
from ufo.agents.processors.app_agent_processor import AppAgentProcessor
from ufo.module.context import Context
@@ -24,17 +25,15 @@
# Load configuration data.
_configs = Config.get_instance().config_data
-INSTANTIATION_RESULT_MAP = {
-    True: "instantiation_pass",
-    False: "instantiation_fail"
-}
+INSTANTIATION_RESULT_MAP = {True: "instantiation_pass", False: "instantiation_fail"}
EXECUTION_RESULT_MAP = {
    "yes": "execution_pass",
    "no": "execution_fail",
-    "unsure": "execution_unsure"
+    "unsure": "execution_unsure",
}
+
class AppEnum(Enum):
    """
    Enum class for applications.
@@ -87,9 +86,9 @@ def _choose_app_from_json(self, task_app: str) -> AppEnum:
        for app in AppEnum:
            if app.description.lower() == task_app.lower():
                return app
-        raise ValueError("Not a correct App")
-
-    def _init_attr(self, task_type:str, task_json_file:Dict[str, Any]) -> None:
+        raise ValueError("The APP in the task file is not supported.")
+
+    def _init_attr(self, task_type: str, task_json_file: Dict[str, Any]) -> None:
        """
        Initialize the attributes of the task object.
        :param task_type: The task_type of the task object (dataflow, instantiation, or execution).
@@ -103,11 +102,12 @@ def _init_attr(self, task_type:str, task_json_file:Dict[str, Any]) -> None: self.app = task_json_file.get("app") self.unique_id = task_json_file.get("unique_id") original = task_json_file.get("original", {}) - self.task = original.get("original_task", None) + self.task = original.get("original_task", None) self.refined_steps = original.get("original_steps", None) else: raise ValueError(f"Unsupported task_type: {task_type}") + class DataFlowController: """ Flow controller class to manage the instantiation and execution process. @@ -126,12 +126,12 @@ def __init__(self, task_path: str, task_type: str) -> None: self.task_file_name = self.task_object.task_file_name self.schema = self._load_schema(task_type) - + self.task_type = task_type self.task_info = self.init_task_info() self.result_hub = _configs["RESULT_HUB"].format(task_type=task_type) - def init_task_info(self) -> Dict[str, Any]: + def init_task_info(self) -> Dict[str, Any]: """ Initialize the task information. :return: The initialized task information. @@ -176,29 +176,38 @@ def execute_instantiation(self) -> Optional[List[Dict[str, Any]]]: :return: The instantiation plan if successful. """ - print_with_color(f"Instantiating task {self.task_object.task_file_name}...", "blue") + print_with_color( + f"Instantiating task {self.task_object.task_file_name}...", "blue" + ) template_copied_path = self.instantiation_single_flow( - ChooseTemplateFlow, "choose_template", + ChooseTemplateFlow, + "choose_template", init_params=[self.task_object.app_object.file_extension], - execute_params=[] + execute_params=[], ) if template_copied_path: self.app_env.start(template_copied_path) prefill_result = self.instantiation_single_flow( - PrefillFlow, "prefill", + PrefillFlow, + "prefill", init_params=[self.app_env], - execute_params=[template_copied_path, self.task_object.task, self.task_object.refined_steps] + execute_params=[ + template_copied_path, + self.task_object.task, + self.task_object.refined_steps, + ], ) self.app_env.close() if prefill_result: self.instantiation_single_flow( - FilterFlow, "instantiation_evaluation", + FilterFlow, + "instantiation_evaluation", init_params=[], - execute_params=[prefill_result["instantiated_request"]] + execute_params=[prefill_result["instantiated_request"]], ) return prefill_result["instantiated_plan"] @@ -217,10 +226,10 @@ def execute_execution(self, request: str, plan: Dict[str, any]) -> None: # Initialize the execution context and flow context = Context() execute_flow = ExecuteFlow(self.task_file_name, context, self.app_env) - + # Execute the plan executed_plan, execute_result = execute_flow.execute(request, plan) - + # Update the instantiated plan self.instantiated_plan = executed_plan # Record execution results and time metrics @@ -247,15 +256,15 @@ def execute_execution(self, request: str, plan: Dict[str, any]) -> None: self.task_info["time_cost"]["execute_eval"] = execute_flow.eval_time else: self.task_info["time_cost"]["execute_eval"] = None - self.app_env.close() + def instantiation_single_flow( - self, - flow_class: AppAgentProcessor, - flow_type: str, - init_params=None, - execute_params=None - ) -> Optional[Dict[str, Any]]: + self, + flow_class: AppAgentProcessor, + flow_type: str, + init_params=None, + execute_params=None, + ) -> Optional[Dict[str, Any]]: """ Execute a single flow process in the instantiation phase. :param flow_class: The flow class to instantiate. @@ -264,7 +273,7 @@ def instantiation_single_flow( :param execute_params: The execution parameters for the flow. 
:return: The result of the flow process. """ - + flow_instance = None try: flow_instance = flow_class(self.app_name, self.task_file_name, *init_params) @@ -283,7 +292,7 @@ def instantiation_single_flow( self.task_info["time_cost"][flow_type] = flow_instance.execution_time else: self.task_info["time_cost"][flow_type] = None - + def save_result(self) -> None: """ Validate and save the instantiated task result. @@ -304,11 +313,17 @@ def save_result(self) -> None: if self.task_type == "instantiation": # Determine the quality of the instantiation - if not self.task_info["instantiation_result"]["instantiation_evaluation"]["result"]: + if not self.task_info["instantiation_result"]["instantiation_evaluation"][ + "result" + ]: target_file = INSTANTIATION_RESULT_MAP[False] else: - is_quality_good = self.task_info["instantiation_result"]["instantiation_evaluation"]["result"]["judge"] - target_file = INSTANTIATION_RESULT_MAP.get(is_quality_good, INSTANTIATION_RESULT_MAP[False]) + is_quality_good = self.task_info["instantiation_result"][ + "instantiation_evaluation" + ]["result"]["judge"] + target_file = INSTANTIATION_RESULT_MAP.get( + is_quality_good, INSTANTIATION_RESULT_MAP[False] + ) else: # Determine the completion status of the execution @@ -316,10 +331,14 @@ def save_result(self) -> None: target_file = EXECUTION_RESULT_MAP["no"] else: is_completed = self.task_info["execution_result"]["result"]["complete"] - target_file = EXECUTION_RESULT_MAP.get(is_completed, EXECUTION_RESULT_MAP["no"]) + target_file = EXECUTION_RESULT_MAP.get( + is_completed, EXECUTION_RESULT_MAP["no"] + ) # Construct the full path to save the result - new_task_path = os.path.join(self.result_hub, target_file, self.task_object.task_file_base_name) + new_task_path = os.path.join( + self.result_hub, target_file, self.task_object.task_file_base_name + ) os.makedirs(os.path.dirname(new_task_path), exist_ok=True) save_json_file(new_task_path, self.task_info) @@ -327,8 +346,10 @@ def save_result(self) -> None: # If validation failed, indicate that the saved result may need further inspection if validation_error: - print("The saved task result does not conform to the expected schema and may require review.") - + print( + "The saved task result does not conform to the expected schema and may require review." + ) + @property def template_copied_path(self) -> str: """ @@ -337,7 +358,7 @@ def template_copied_path(self) -> str: """ return self.task_info["instantiation_result"]["choose_template"]["result"] - + @property def instantiated_plan(self) -> List[Dict[str, Any]]: """ @@ -345,7 +366,9 @@ def instantiated_plan(self) -> List[Dict[str, Any]]: :return: The instantiated plan. """ - return self.task_info["instantiation_result"]["prefill"]["result"]["instantiated_plan"] + return self.task_info["instantiation_result"]["prefill"]["result"][ + "instantiated_plan" + ] @instantiated_plan.setter def instantiated_plan(self, value: List[Dict[str, Any]]) -> None: @@ -354,8 +377,56 @@ def instantiated_plan(self, value: List[Dict[str, Any]]) -> None: :param value: New value for the instantiated plan. 
""" - self.task_info.setdefault("instantiation_result", {}).setdefault("prefill", {}).setdefault("result", {}) - self.task_info["instantiation_result"]["prefill"]["result"]["instantiated_plan"] = value + self.task_info.setdefault("instantiation_result", {}).setdefault( + "prefill", {} + ).setdefault("result", {}) + self.task_info["instantiation_result"]["prefill"]["result"][ + "instantiated_plan" + ] = value + + def reformat_to_batch(self, path) -> None: + """ + Transfer the result to the result hub. + """ + os.makedirs(path, exist_ok=True) + source_files_path = os.path.join( + self.result_hub, + self.task_type + "_pass", + ) + source_template_path = os.path.join( + os.path.dirname(self.result_hub), + "saved_document", + ) + target_file_path = os.path.join( + path, + "tasks", + ) + target_template_path = os.path.join( + path, + "files", + ) + os.makedirs((target_file_path), exist_ok=True) + os.makedirs((target_template_path), exist_ok=True) + + for file in os.listdir(source_files_path): + if file.endswith(".json"): + source_file = os.path.join(source_files_path, file) + target_file = os.path.join(target_file_path, file) + target_object = os.path.join( + target_template_path, file.replace(".json", ".docx") + ) + is_successed = reformat_json_file( + target_file, + target_object, + load_json_file(source_file), + ) + if is_successed: + shutil.copy( + os.path.join( + source_template_path, file.replace(".json", ".docx") + ), + target_template_path, + ) def run(self) -> None: """ @@ -383,7 +454,12 @@ def run(self) -> None: finally: # Update or record the total time cost of the process total_time = round(time.time() - start_time, 3) - new_total_time = self.task_info.get("time_cost", {}).get("total", 0) + total_time + new_total_time = ( + self.task_info.get("time_cost", {}).get("total", 0) + total_time + ) self.task_info["time_cost"]["total"] = round(new_total_time, 3) - self.save_result() \ No newline at end of file + self.save_result() + + if _configs["REFORMAT_TO_BATCH"]: + self.reformat_to_batch(_configs["REFORMAT_TO_BATCH_HUB"]) diff --git a/dataflow/dataflow.py b/dataflow/dataflow.py index 54a24548..d9572e60 100644 --- a/dataflow/dataflow.py +++ b/dataflow/dataflow.py @@ -11,23 +11,28 @@ def parse_args() -> argparse.Namespace: """ Parse command-line arguments. Automatically detect batch or single mode. """ - parser = argparse.ArgumentParser(description="Run tasks automatically in single or batch mode.") - + parser = argparse.ArgumentParser( + description="Run tasks automatically in single or batch mode." + ) + # Add options for -dataflow, -instantiation, and -execution parser.add_argument( - "-dataflow", - action="store_const", const="dataflow", - help="Indicates that the task type is dataflow." + "--dataflow", + action="store_const", + const="dataflow", + help="Indicates that the task type is dataflow.", ) parser.add_argument( - "-instantiation", - action="store_const", const="instantiation", - help="Indicates that the task type is instantiation." + "--instantiation", + action="store_const", + const="instantiation", + help="Indicates that the task type is instantiation.", ) parser.add_argument( - "-execution", - action="store_const", const="execution", - help="Indicates that the task type is execution." 
+ "--execution", + action="store_const", + const="execution", + help="Indicates that the task type is execution.", ) # Task path argument @@ -68,7 +73,9 @@ def process_task(task_path: str, task_type: str) -> None: flow_controller.run() print_with_color(f"Task {task_path} completed successfully.", "green") except Exception as e: - print_with_color(f"Error processing {task_path}: {traceback.format_exc()}", "red") + print_with_color( + f"Error processing {task_path}: {traceback.format_exc()}", "red" + ) def process_batch(task_dir: str, task_type: str) -> None: @@ -100,7 +107,10 @@ def main(): # Ensure that a task type has been provided; if not, raise an error if not any([args.dataflow, args.instantiation, args.execution]): - print_with_color("Error: You must specify one of the task types (-dataflow, -instantiation, or -execution).", "red") + print_with_color( + "Error: You must specify one of the task types (--dataflow, --instantiation, or --execution).", + "red", + ) return task_type = args.dataflow or args.instantiation or args.execution diff --git a/dataflow/instantiation/agent/filter_agent.py b/dataflow/instantiation/agent/filter_agent.py index f8649ec0..239fad5a 100644 --- a/dataflow/instantiation/agent/filter_agent.py +++ b/dataflow/instantiation/agent/filter_agent.py @@ -1,11 +1,12 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from typing import List, Optional +from typing import List from dataflow.prompter.instantiation.filter_prompter import FilterPrompter from ufo.agents.agent.basic import BasicAgent + class FilterAgent(BasicAgent): """ The Agent to evaluate the instantiated task is correct or not. @@ -40,11 +41,7 @@ def __init__( self._process_name = process_name def get_prompter( - self, - is_visual: bool, - main_prompt: str, - example_prompt: str, - api_prompt: str + self, is_visual: bool, main_prompt: str, example_prompt: str, api_prompt: str ) -> FilterPrompter: """ Get the prompt for the agent. @@ -83,4 +80,4 @@ def process_comfirmation(self) -> None: This is the abstract method from BasicAgent that needs to be implemented. """ - pass \ No newline at end of file + pass diff --git a/dataflow/instantiation/agent/template_agent.py b/dataflow/instantiation/agent/template_agent.py new file mode 100644 index 00000000..be9bf194 --- /dev/null +++ b/dataflow/instantiation/agent/template_agent.py @@ -0,0 +1,87 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from typing import Dict, List + +from dataflow.prompter.instantiation.template_prompter import TemplatePrompter + +from ufo.agents.agent.basic import BasicAgent + + +class TemplateAgent(BasicAgent): + """ + The Agent for choosing template. + """ + + def __init__( + self, + name: str, + is_visual: bool, + main_prompt: str, + template_prompt: str = "", + ): + """ + Initialize the TemplateAgent. + :param name: The name of the agent. + :param is_visual: The flag indicating whether the agent is visual or not. + :param main_prompt: The main prompt. + :param template_prompt: The description of the file. + """ + + self._step = 0 + self._complete = False + self._name = name + self._status = None + self.prompter: TemplatePrompter = self.get_prompter( + is_visual, main_prompt, template_prompt + ) + + def get_prompter( + self, + is_visual: bool, + main_prompt: str, + template_prompt: str = "", + ) -> str: + """ + Get the prompt for the agent. + This is the abstract method from BasicAgent that needs to be implemented. 
+ :param is_visual: The flag indicating whether the agent is visual or not. + :param main_prompt: The main prompt. + :param template_prompt: The description of the file. + :return: The prompt string. + """ + + return TemplatePrompter(is_visual, main_prompt, template_prompt) + + def message_constructor( + self, + descriptions: Dict, + request: str, + path: str = r"dataflow\templates\word", + ) -> List[str]: + """ + Construct the prompt message for the PrefillAgent. + + :return: The prompt message. + """ + + template_agent_prompt_system_message = self.prompter.system_prompt_construction( + descriptions + ) + template_agent_prompt_user_message = self.prompter.user_content_construction( + path=path, request=request + ) + appagent_prompt_message = self.prompter.prompt_construction( + template_agent_prompt_system_message, + template_agent_prompt_user_message, + ) + + return appagent_prompt_message + + def process_comfirmation(self) -> None: + """ + Confirm the process. + This is the abstract method from BasicAgent that needs to be implemented. + """ + + pass diff --git a/dataflow/instantiation/workflow/choose_template_flow.py b/dataflow/instantiation/workflow/choose_template_flow.py index 639e5e73..9f29ef71 100644 --- a/dataflow/instantiation/workflow/choose_template_flow.py +++ b/dataflow/instantiation/workflow/choose_template_flow.py @@ -11,6 +11,7 @@ from langchain.storage import LocalFileStore from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.vectorstores import FAISS +from dataflow.instantiation.agent.template_agent import TemplateAgent from dataflow.config.config import Config @@ -140,9 +141,9 @@ def _choose_template_and_copy(self) -> str: Path(_configs["TEMPLATE_PATH"]) / self._app_name / chosen_template_file_path ) - target_template_folder_path = Path(_configs["RESULT_HUB"].format(task_type = "saved_document")) / ( - os.path.dirname(os.path.dirname(self._task_file_name)) - ) + target_template_folder_path = Path( + _configs["RESULT_HUB"].format(task_type="saved_document") + ) / (os.path.dirname(os.path.dirname(self._task_file_name))) return self._create_copied_file( chosen_template_full_path, target_template_folder_path, self._task_file_name @@ -158,6 +159,32 @@ def _choose_target_template_file( :return: The path to the chosen template file. """ + if _configs["TEMPLATE_METHOD"] == "SemanticSimilarity": + return self._choose_target_template_file_semantic( + given_task, doc_files_description + ) + elif _configs["TEMPLATE_METHOD"] == "LLM": + self.template_agent = TemplateAgent( + "template", + is_visual=True, + main_prompt=_configs["TEMPLATE_PROMPT"], + ) + return self._choose_target_template_file_llm( + given_task, doc_files_description + ) + else: + raise ValueError("Invalid TEMPLATE_METHOD.") + + def _choose_target_template_file_semantic( + self, given_task: str, doc_files_description: Dict[str, str] + ) -> str: + """ + Get the target file based on the semantic similarity of the given task and the template file descriptions. + :param given_task: The task to be matched. + :param doc_files_description: A dictionary of template file descriptions. + :return: The path to the chosen template file. 
+ """ + file_doc_map = { desc: file_name for file_name, desc in doc_files_description.items() } @@ -168,9 +195,35 @@ def _choose_target_template_file( if not most_similar: raise ValueError("No similar templates found.") - return file_doc_map[most_similar[0].page_content] + def _choose_target_template_file_llm( + self, given_task: str, doc_files_description: Dict[str, str] + ) -> str: + """ + Get the target file based on the LLM of the given task and the template file descriptions. + :param given_task: The task to be matched. + :param doc_files_description: A dictionary of template file descriptions. + :return: The path to the chosen template file. + """ + + prompt_message = self.template_agent.message_constructor( + doc_files_description, given_task + ) + response_string, _ = self.template_agent.get_response( + prompt_message, "prefill", use_backup_engine=True, configs=_configs + ) + if response_string is None: + raise ValueError("No similar templates found.") + elif "```json" in response_string: + response_string = response_string[7:-3] + response_json = json.loads(response_string) + file_name = list(response_json.keys())[0] + if file_name not in doc_files_description: + print(f"Template {file_name} not found in the description.") + raise ValueError("No similar templates found.") + return file_name + @staticmethod def _load_embedding_model(model_name: str) -> CacheBackedEmbeddings: """ @@ -178,11 +231,11 @@ def _load_embedding_model(model_name: str) -> CacheBackedEmbeddings: :param model_name: The name of the embedding model to load. :return: The loaded embedding model. """ - + store = LocalFileStore(_configs["CONTROL_EMBEDDING_CACHE_PATH"]) if not model_name.startswith(ChooseTemplateFlow._SENTENCE_TRANSFORMERS_PREFIX): model_name = ChooseTemplateFlow._SENTENCE_TRANSFORMERS_PREFIX + model_name embedding_model = HuggingFaceEmbeddings(model_name=model_name) return CacheBackedEmbeddings.from_bytes_store( embedding_model, store, namespace=model_name - ) \ No newline at end of file + ) diff --git a/dataflow/prompter/instantiation/filter_prompter.py b/dataflow/prompter/instantiation/filter_prompter.py index cec5658d..fd1d4b78 100644 --- a/dataflow/prompter/instantiation/filter_prompter.py +++ b/dataflow/prompter/instantiation/filter_prompter.py @@ -2,11 +2,9 @@ # Licensed under the MIT License. import json -import os -from typing import Dict, List, Optional +from typing import Dict, List from ufo.prompter.basic import BasicPrompter -from ufo.prompter.eva_prompter import EvaluationAgentPrompter class FilterPrompter(BasicPrompter): @@ -140,7 +138,7 @@ def examples_prompt_helper( {request} [Response]: {response} - [Tip] + [Tips]: {tip} """ diff --git a/dataflow/prompter/instantiation/prefill_prompter.py b/dataflow/prompter/instantiation/prefill_prompter.py index 5f979553..95230018 100644 --- a/dataflow/prompter/instantiation/prefill_prompter.py +++ b/dataflow/prompter/instantiation/prefill_prompter.py @@ -3,163 +3,9 @@ import json import os -from typing import Dict, List, Optional +from typing import Dict, List from ufo.prompter.basic import BasicPrompter -from ufo.prompter.eva_prompter import EvaluationAgentPrompter - - -class FilterPrompter(BasicPrompter): - """ - Load the prompt for the FilterAgent. - """ - - def __init__( - self, - is_visual: bool, - prompt_template: str, - example_prompt_template: str, - api_prompt_template: str, - ): - """ - Initialize the FilterPrompter. - :param is_visual: The flag indicating whether the prompter is visual or not. 
- :param prompt_template: The prompt template. - :param example_prompt_template: The example prompt template. - :param api_prompt_template: The API prompt template. - """ - - super().__init__(is_visual, prompt_template, example_prompt_template) - self.api_prompt_template = self.load_prompt_template( - api_prompt_template, is_visual - ) - - def api_prompt_helper(self, apis: Dict = {}, verbose: int = 1) -> str: - """ - Construct the prompt for APIs. - :param apis: The APIs. - :param verbose: The verbosity level. - :return: The prompt for APIs. - """ - - # Construct the prompt for APIs - if len(apis) == 0: - api_list = [ - "- The action type are limited to {actions}.".format( - actions=list(self.api_prompt_template.keys()) - ) - ] - - # Construct the prompt for each API - for key in self.api_prompt_template.keys(): - api = self.api_prompt_template[key] - if verbose > 0: - api_text = "{summary}\n{usage}".format( - summary=api["summary"], usage=api["usage"] - ) - else: - api_text = api["summary"] - - api_list.append(api_text) - - api_prompt = self.retrived_documents_prompt_helper("", "", api_list) - else: - api_list = [ - "- The action type are limited to {actions}.".format( - actions=list(apis.keys()) - ) - ] - - # Construct the prompt for each API - for key in apis.keys(): - api = apis[key] - api_text = "{description}\n{example}".format( - description=api["description"], example=api["example"] - ) - api_list.append(api_text) - - api_prompt = self.retrived_documents_prompt_helper("", "", api_list) - - return api_prompt - - def system_prompt_construction(self, app: str = "") -> str: - """ - Construct the prompt for the system. - :param app: The app name. - :return: The prompt for the system. - """ - - try: - ans = self.prompt_template["system"] - ans = ans.format(app=app) - return ans - except Exception as e: - print(e) - - def user_prompt_construction(self, request: str) -> str: - """ - Construct the prompt for the user. - :param request: The user request. - :return: The prompt for the user. - """ - - prompt = self.prompt_template["user"].format(request=request) - return prompt - - def user_content_construction(self, request: str) -> List[Dict]: - """ - Construct the prompt for LLMs. - :param request: The user request. - :return: The prompt for LLMs. - """ - - user_content = [] - - user_content.append( - {"type": "text", "text": self.user_prompt_construction(request)} - ) - - return user_content - - def examples_prompt_helper( - self, - header: str = "## Response Examples", - separator: str = "Example", - additional_examples: List[str] = [], - ) -> str: - """ - Construct the prompt for examples. - :param header: The header of the prompt. - :param separator: The separator of the prompt. - :param additional_examples: The additional examples. - :return: The prompt for examples. 
- """ - - template = """ - [User Request]: - {request} - [Response]: - {response} - [Tip] - {tip} - """ - - example_list = [] - - for key in self.example_prompt_template.keys(): - if key.startswith("example"): - example = template.format( - request=self.example_prompt_template[key].get("Request"), - response=json.dumps( - self.example_prompt_template[key].get("Response") - ), - tip=self.example_prompt_template[key].get("Tips", ""), - ) - example_list.append(example) - - example_list += [json.dumps(example) for example in additional_examples] - - return self.retrived_documents_prompt_helper(header, separator, example_list) class PrefillPrompter(BasicPrompter): @@ -253,7 +99,7 @@ def load_screenshots(self, log_path: str) -> str: :param log_path: The path of the log. :return: The screenshot URL. """ - + from ufo.prompter.eva_prompter import EvaluationAgentPrompter init_image = os.path.join(log_path, "screenshot.png") @@ -315,7 +161,7 @@ def examples_prompt_helper( {request} [Response]: {response} - [Tip] + [Tips]: {tip} """ @@ -334,4 +180,4 @@ def examples_prompt_helper( example_list += [json.dumps(example) for example in additional_examples] - return self.retrived_documents_prompt_helper(header, separator, example_list) \ No newline at end of file + return self.retrived_documents_prompt_helper(header, separator, example_list) diff --git a/dataflow/prompter/instantiation/template_prompter.py b/dataflow/prompter/instantiation/template_prompter.py new file mode 100644 index 00000000..64ec18c1 --- /dev/null +++ b/dataflow/prompter/instantiation/template_prompter.py @@ -0,0 +1,107 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import base64 +import mimetypes +import os +from typing import Dict, List, cast, Optional + +from ufo.prompter.basic import BasicPrompter + + +class TemplatePrompter(BasicPrompter): + """ + Load the prompt for the TemplateAgent. + """ + + def __init__( + self, + is_visual: bool, + prompt_template: str, + example_prompt_template: str, + ): + """ + Initialize the FilterPrompter. + :param is_visual: The flag indicating whether the prompter is visual or not. + :param prompt_template: The prompt template. + """ + + super().__init__(is_visual, prompt_template, example_prompt_template) + + def encode_image(self, image_path: str) -> str: + """ + Encode the image. + :param image_path: The image path. + :return: The encoded image. + """ + with open(image_path, "rb") as image_file: + encoded_image = base64.b64encode(image_file.read()).decode("ascii") + + mime_type = "image/png" + + image_url = f"data:{mime_type};base64," + encoded_image + return image_url + + def file_prompt_helper(self, path) -> str: + """ + Construct the prompt for files. + :return: The prompt for files. + """ + image_path = os.path.join(path, "images") + image_urls = [] + user_content = [] + for file in os.listdir(image_path): + if file.endswith(".png"): + image_urls.append(self.encode_image(os.path.join(image_path, file))) + + for i in range(len(image_urls)): + user_content.append( + { + "type": "text", + "text": "This is the screenshot of " + str(i + 1) + ".docx", + }, + ) + user_content.append( + {"type": "image_url", "image_url": {"url": image_urls[i]}}, + ) + return user_content + + def system_prompt_construction(self, descriptions: str = "") -> str: + """ + Construct the prompt for the system. + :param app: The app name. + :return: The prompt for the system. 
+ """ + + try: + ans = self.prompt_template["system"] + ans = ans.format(descriptions=descriptions) + return ans + except Exception as e: + print(e) + + def user_prompt_construction(self, request: str) -> str: + """ + Construct the prompt for the user. + :param request: The user request. + :return: The prompt for the user. + """ + + prompt = self.prompt_template["user"].format(given_task=request) + return prompt + + def user_content_construction(self, path: str, request: str) -> List[Dict]: + """ + Construct the prompt for LLMs. + :param path: The path of the template. + :param request: The user request. + :return: The prompt for LLMs. + """ + + user_content = self.file_prompt_helper(path) + + user_content.append( + {"type": "text", "text": self.user_prompt_construction(request)} + ) + + return user_content diff --git a/dataflow/prompts/instantiation/visual/template.yaml b/dataflow/prompts/instantiation/visual/template.yaml new file mode 100644 index 00000000..4d84c60a --- /dev/null +++ b/dataflow/prompts/instantiation/visual/template.yaml @@ -0,0 +1,29 @@ +version: 1.0 + +system: |- + You are a Word operator expert and you can easily perform any word-related operations. + - What you need to do now is to judge and summarize the problems about the execution environment. + - You should tell me what kind of document you think is needed as the execution environment. + - Think step by step. + + ## Available File Descriptions + - All the available description of the template files are listed below: + {descriptions} + + Besides, please prefill the task based on the screenshot. you will also be provided with a screenshot, one before the agent's execution and one after the agent's execution. + All I need is the document that you think is needed as the execution environment. + Your reply only need reply in json model. 
+ + ## Response Format + - You are required to response in a JSON format, consisting of several distinct parts with the following keys and corresponding content: + {{"template_file_name": "short description of why you pick this"}} + + For example: + - Example 1: + {{"1.docx": "I think this is the most suitable one because it contains a rectangle the task needs."}} + - Example 2: + {{"3.docx": "The task requires a chart, so I think this is the most suitable one."}} + +user: |- + {given_task} + \ No newline at end of file diff --git a/dataflow/templates/word/1.docx b/dataflow/templates/word/1.docx new file mode 100644 index 00000000..4a1930a9 Binary files /dev/null and b/dataflow/templates/word/1.docx differ diff --git a/dataflow/templates/word/2.docx b/dataflow/templates/word/2.docx new file mode 100644 index 00000000..5e9f96a5 Binary files /dev/null and b/dataflow/templates/word/2.docx differ diff --git a/dataflow/templates/word/3.docx b/dataflow/templates/word/3.docx new file mode 100644 index 00000000..f6b8eb81 Binary files /dev/null and b/dataflow/templates/word/3.docx differ diff --git a/dataflow/templates/word/4.docx b/dataflow/templates/word/4.docx new file mode 100644 index 00000000..aac76588 Binary files /dev/null and b/dataflow/templates/word/4.docx differ diff --git a/dataflow/templates/word/5.docx b/dataflow/templates/word/5.docx new file mode 100644 index 00000000..e65a6c74 Binary files /dev/null and b/dataflow/templates/word/5.docx differ diff --git a/dataflow/templates/word/6.docx b/dataflow/templates/word/6.docx new file mode 100644 index 00000000..32186989 Binary files /dev/null and b/dataflow/templates/word/6.docx differ diff --git a/dataflow/templates/word/7.docx b/dataflow/templates/word/7.docx new file mode 100644 index 00000000..a474af70 Binary files /dev/null and b/dataflow/templates/word/7.docx differ diff --git a/dataflow/templates/word/description.json b/dataflow/templates/word/description.json index 375c17d8..50f36aed 100644 --- a/dataflow/templates/word/description.json +++ b/dataflow/templates/word/description.json @@ -1,4 +1,9 @@ { - "template1.docx": "A doc with a rectangle shape. Can be moved, resized, or deleted.", - "template2.docx": "A doc with a line of text. Can be edited, deleted, or replaced." 
+ "1.docx":"A doc with a rectangle shape", + "2.docx":"A doc with a line of text", + "3.docx":"A doc with a chart", + "4.docx":"A doc with a text box", + "5.docx":"A doc with comments and reviewer", + "6.docx":"A doc with a list of items", + "7.docx":"A doc with a table" } diff --git a/dataflow/templates/word/images/1.png b/dataflow/templates/word/images/1.png new file mode 100644 index 00000000..39233aa7 Binary files /dev/null and b/dataflow/templates/word/images/1.png differ diff --git a/dataflow/templates/word/images/2.png b/dataflow/templates/word/images/2.png new file mode 100644 index 00000000..8860f4c5 Binary files /dev/null and b/dataflow/templates/word/images/2.png differ diff --git a/dataflow/templates/word/images/3.png b/dataflow/templates/word/images/3.png new file mode 100644 index 00000000..2dbd3775 Binary files /dev/null and b/dataflow/templates/word/images/3.png differ diff --git a/dataflow/templates/word/images/4.png b/dataflow/templates/word/images/4.png new file mode 100644 index 00000000..786920a8 Binary files /dev/null and b/dataflow/templates/word/images/4.png differ diff --git a/dataflow/templates/word/images/5.png b/dataflow/templates/word/images/5.png new file mode 100644 index 00000000..b2c1816d Binary files /dev/null and b/dataflow/templates/word/images/5.png differ diff --git a/dataflow/templates/word/images/6.png b/dataflow/templates/word/images/6.png new file mode 100644 index 00000000..f0f8212e Binary files /dev/null and b/dataflow/templates/word/images/6.png differ diff --git a/dataflow/templates/word/images/7.png b/dataflow/templates/word/images/7.png new file mode 100644 index 00000000..1d200f50 Binary files /dev/null and b/dataflow/templates/word/images/7.png differ diff --git a/documents/docs/configurations/developer_configuration.md b/documents/docs/configurations/developer_configuration.md index 75870435..5200e9b6 100644 --- a/documents/docs/configurations/developer_configuration.md +++ b/documents/docs/configurations/developer_configuration.md @@ -9,6 +9,7 @@ The following parameters are included in the system configuration of the UFO age | Configuration Option | Description | Type | Default Value | |-------------------------|---------------------------------------------------------------------------------------------------------|----------|---------------| | `CONTROL_BACKEND` | The backend for control action, currently supporting `uia` and `win32`. | String | "uia" | +| `ACTION_SEQUENCE` | Whether to use output multiple actions in a single step. | Boolean | False | | `MAX_STEP` | The maximum step limit for completing the user request in a session. | Integer | 100 | | `MAX_ROUND` | The maximum round limit for completing the user request in a session. | Integer | 10 | | `SLEEP_TIME` | The sleep time in seconds between each step to wait for the window to be ready. | Integer | 5 | @@ -17,6 +18,7 @@ The following parameters are included in the system configuration of the UFO age | `CONTROL_LIST` | The list of widgets allowed to be selected. | List | ["Button", "Edit", "TabItem", "Document", "ListItem", "MenuItem", "ScrollBar", "TreeItem", "Hyperlink", "ComboBox", "RadioButton", "DataItem"] | | `HISTORY_KEYS` | The keys of the step history added to the [`Blackboard`](../agents/design/blackboard.md) for agent decision-making. | List | ["Step", "Thought", "ControlText", "Subtask", "Action", "Comment", "Results", "UserConfirm"] | | `ANNOTATION_COLORS` | The colors assigned to different control types for annotation. 
| Dictionary | {"Button": "#FFF68F", "Edit": "#A5F0B5", "TabItem": "#A5E7F0", "Document": "#FFD18A", "ListItem": "#D9C3FE", "MenuItem": "#E7FEC3", "ScrollBar": "#FEC3F8", "TreeItem": "#D6D6D6", "Hyperlink": "#91FFEB", "ComboBox": "#D8B6D4"} | +| `ANNOTATION_FONT_SIZE` | The font size for the annotation. | Integer | 22 | | `PRINT_LOG` | Whether to print the log in the console. | Boolean | False | | `CONCAT_SCREENSHOT` | Whether to concatenate the screenshots into a single image for the LLM input. | Boolean | False | | `INCLUDE_LAST_SCREENSHOT` | Whether to include the screenshot from the last step in the observation. | Boolean | True | diff --git a/documents/docs/project_directory_structure.md b/documents/docs/project_directory_structure.md index 81ba11b5..8f27afbd 100644 --- a/documents/docs/project_directory_structure.md +++ b/documents/docs/project_directory_structure.md @@ -6,6 +6,7 @@ The UFO project is organized into a well-defined directory structure to facilita ┣ 📂learner # Folder to build the vector database for help documents ┣ 📂model_worker # Folder to store tools for deploying your own model ┣ 📂record_processor # Folder to parse human demonstrations from Windows Step Recorder and build the vector database + ┣ 📂dataflow # Folder for the code of data collection pipeline for Large Action Model (LAM) ┣ 📂vetordb # Folder to store all data in the vector database for RAG (Retrieval-Augmented Generation) ┣ 📂logs # Folder to store logs, generated after the program starts ┗ 📂ufo # Directory containing main project code @@ -17,6 +18,7 @@ The UFO project is organized into a well-defined directory structure to facilita ┣ 📂prompter # Prompt constructor for the agent ┣ 📂prompts # Prompt templates and files to construct the full prompt ┣ 📂rag # Implementation of RAG from different sources to enhance agents' abilities + ┣ 📂trajectory # Implementation of loading and parsing trajectories of task completion ┣ 📂utils # Utility functions ┣ 📂config # Configuration files ┣ 📜config.yaml # User configuration file for LLM and other settings @@ -37,6 +39,9 @@ The UFO project is organized into a well-defined directory structure to facilita ### [model_worker](https://github.com/microsoft/UFO/tree/main/model_worker) - **Purpose:** Contains tools and scripts necessary for deploying custom models. - **Details:** This includes model deployment configurations, and management tools for integrating custom models into the project. +### [dataflow](https://github.com/microsoft/UFO/tree/main/dataflow) +- **Purpose:** Contains the code for the data collection pipeline for the Large Action Model (LAM). +- **Details:** This directory includes scripts and tools for collecting and processing data to train the Large Action Model, improving the agents' performance and capabilities. ### [record_processor](https://github.com/microsoft/UFO/tree/main/record_processor) - **Purpose:** Parses human demonstrations recorded using the Windows Step Recorder and builds the vector database. - **Details:** This directory includes parsers, data processing scripts, and tools to convert human demonstrations into a format suitable for agent's retrieval. @@ -74,6 +79,9 @@ The UFO project is organized into a well-defined directory structure to facilita #### [rag](https://github.com/microsoft/UFO/tree/main/ufo/rag) - **Purpose:** Implements Retrieval-Augmented Generation (RAG) from different sources to enhance the agents' abilities. 
- **etails:** This directory includes scripts and tools for integrating various data sources into the RAG framework, improving the accuracy and relevance of the agents' outputs. + #### [trajectory](https://github.com/microsoft/UFO/tree/main/ufo/trajectory) + - **Purpose:** Implements loading and parsing of task completion trajectories. + - **Details:** This directory includes tools and scripts to load and parse task completion trajectories, enabling agents to learn from past experiences or for evaluation purposes. #### [utils](https://github.com/microsoft/UFO/tree/main/ufo/utils) - **Purpose:** Contains utility functions. - **Details:** This directory includes helper functions, common utilities, and other reusable code snippets that support the project's operations. diff --git a/documents/mkdocs.yml b/documents/mkdocs.yml index 14a63c55..7ff9933a 100644 --- a/documents/mkdocs.yml +++ b/documents/mkdocs.yml @@ -63,6 +63,7 @@ nav: - Experience Learning: advanced_usage/reinforce_appagent/experience_learning.md - Learning from User Demonstration: advanced_usage/reinforce_appagent/learning_from_demonstration.md - Follower Mode: advanced_usage/follower_mode.md + - Batch Mode: advanced_usage/batch_mode.md - Control Filtering: - Overview: advanced_usage/control_filtering/overview.md - Text Filtering: advanced_usage/control_filtering/text_filtering.md diff --git a/learner/indexer.py b/learner/indexer.py index 4a61ff88..6b1ec640 100644 --- a/learner/indexer.py +++ b/learner/indexer.py @@ -48,7 +48,9 @@ def create_indexer(app: str, docs: str, format: str, incremental: bool, save_pat if incremental: if app in records: print_with_color("Merging with previous indexer...", "yellow") - prev_db = FAISS.load_local(records[app], embeddings) + prev_db = FAISS.load_local( + records[app], embeddings, allow_dangerous_deserialization=True + ) db.merge_from(prev_db) db_file_path = os.path.join(save_path, app) diff --git a/learner/utils.py b/learner/utils.py index b25b08d4..7b11d1cc 100644 --- a/learner/utils.py +++ b/learner/utils.py @@ -7,6 +7,7 @@ # init colorama init() + def print_with_color(text: str, color: str = ""): """ Print text with specified color using ANSI escape codes from Colorama library. @@ -22,7 +23,7 @@ def print_with_color(text: str, color: str = ""): "magenta": Fore.MAGENTA, "cyan": Fore.CYAN, "white": Fore.WHITE, - "black": Fore.BLACK + "black": Fore.BLACK, } selected_color = color_mapping.get(color.lower(), "") @@ -31,7 +32,6 @@ def print_with_color(text: str, color: str = ""): print(colored_text) - def find_files_with_extension(directory, extension): """ Find files with the given extension in the given directory. @@ -40,16 +40,15 @@ def find_files_with_extension(directory, extension): :return: The list of matching files. """ matching_files = [] - + for root, _, files in os.walk(directory): for file in files: if file.endswith(extension): path = os.path.join(root, file) path = os.path.realpath(path) matching_files.append(path) - - return matching_files + return matching_files def find_files_with_extension_list(directory, extensions): @@ -60,16 +59,15 @@ def find_files_with_extension_list(directory, extensions): :return: The list of matching files. 
""" matching_files = [] - + for root, _, files in os.walk(directory): for file in files: if file.endswith(tuple(extensions)): path = os.path.join(root, file) path = os.path.realpath(path) matching_files.append(path) - - return matching_files + return matching_files def load_json_file(file_path): @@ -79,17 +77,41 @@ def load_json_file(file_path): :return: The loaded JSON data. """ - with open(file_path, 'r') as file: + with open(file_path, "r") as file: data = json.load(file) return data - def save_json_file(file_path, data): """ Save a JSON file. :param file_path: The path to the file to save. """ - - with open(file_path, 'w') as file: + + with open(file_path, "w") as file: json.dump(data, file, indent=4) + + +def reformat_json_file(file_path, template_path, data): + """ + Reformat the JSON file to batch format. + :param file_path: The path to the file to save. + """ + tmp_data = {} + if ( + data.get("instantiation_result", {}) + .get("instantiation_evaluation", {}) + .get("result", {}) + .get("judge", None) + ): + tmp_data["task"] = data["instantiation_result"]["prefill"]["result"][ + "instantiated_request" + ] + tmp_data["object"] = template_path + tmp_data["close"] = "True" + with open(file_path, "w") as file: + json.dump(tmp_data, file, indent=4) + return True + else: + # The instantiation result is not successful. Need to be filtered out. + return False diff --git a/model_worker/README.md b/model_worker/README.md index 0648c5d6..4ea4469d 100644 --- a/model_worker/README.md +++ b/model_worker/README.md @@ -82,7 +82,7 @@ When serving LLMs via Ollama, it will by default start a server at `http://local "API_MODEL": "YOUR_MODEL" } ``` -NOTE: `API_BASE` is the URL started in the Ollama LLM server and `API_MODEL` is the model name of Ollama LLM, it should be same as the one you served before. In addition, due to model limitations, you can use lite version of prompt to have a taste on UFO which can be configured in `config_dev.yaml`. Attention to the top ***note***. +NOTE: `API_BASE` is the URL started in the Ollama LLM server and `API_MODEL` is the model name of Ollama LLM, it should be same as the one you served before. In addition, due to model limitations, you can use lite version of prompt to have a taste on UFO which can be configured in `config_dev.yaml`. Attention to the top ***NOTE***. #### If you use your custom model as the Agent 1. Start a server with your model, which will later be used as the API base in `config.yaml`. @@ -97,4 +97,103 @@ NOTE: `API_BASE` is the URL started in the Ollama LLM server and `API_MODEL` is } ``` -NOTE: You should create a new Python script .py in the ufo/llm folder like the format of the .py, which needs to inherit `BaseService` as the parent class, as well as the `__init__` and `chat_completion` methods. At the same time, you need to add the dynamic import of your file in the `get_service` method of `BaseService`. +NOTE: You should create a new Python script `custom_model.py` in the ufo/llm folder like the format of the `placeholder.py`, which needs to inherit `BaseService` as the parent class, as well as the `__init__` and `chat_completion` methods. At the same time, you need to add the dynamic import of your file in the `get_service` method of `BaseService`. 
+
+#### EXAMPLE
+You can use the following code as an example to configure your own model:
+```python
+def chat_completion(self, messages, n, **kwargs):
+    retries = 0
+    while retries < self.max_retry:
+        try:
+            # Construct the request payload
+            payload = {
+                "messages": messages,
+                "n": n,
+            }
+
+            # Optionally, you can pass extra parameters through kwargs
+            payload.update(kwargs)
+
+            # Make the actual API request
+            response = self._make_api_request(payload)
+
+            # Process the response (you can adjust this based on your API's format)
+            return response
+
+        except Exception as e:
+            retries += 1
+            if retries >= self.max_retry:
+                raise
+    raise Exception("Max retries reached. Unable to get response from the API.")
+
+def _make_api_request(self, payload):
+    # Configure as you wish
+    headers = {
+        "Authorization": f"Bearer {self.api_key}",
+        "Content-Type": "application/json"
+    }
+
+    try:
+        # Send POST request to the API endpoint
+        response = requests.post(
+            self.api_base,
+            headers=headers,
+            json=payload,
+            timeout=self.timeout
+        )
+
+        # Check if the request was successful
+        response.raise_for_status()
+
+        # Return the JSON response from the API
+        return response.json()
+
+    except requests.exceptions.RequestException as e:
+        raise
+
+```
+
+Also, UFO provides the usage of ***LLaVA-1.5*** and ***CogAgent*** as examples.
+
+1.1 Download the essential libraries for your custom model.
+
+#### If you use LLaVA-1.5 as the Agent
+
+Please refer to the [LLaVA](https://github.com/haotian-liu/LLaVA) project to download and prepare the LLaVA-1.5 model, for example:
+
+```bash
+git clone https://github.com/haotian-liu/LLaVA.git
+cd LLaVA
+conda create -n llava python=3.10 -y
+conda activate llava
+pip install --upgrade pip # enable PEP 660 support
+pip install -e .
+```
+
+#### If you use CogAgent as the Agent
+
+Please refer to the [CogVLM](https://github.com/THUDM/CogVLM) project to download and prepare the CogAgent model. Download the sat version of the CogAgent weights `cogagent-chat.zip` from [here](https://huggingface.co/THUDM/CogAgent/tree/main) and unzip it.
+
+1.2 Start your custom model. You must customize your model to support UFO's interface.
+For simplicity, you have to configure `YOUR_ENDPOINT/chat/completions`.
+
+#### If you use LLaVA as the Agent
+Add the `direct_generate_llava` method and a new POST interface `/chat/completions` from `custom_model_worker.py` into `llava/serve/model_worker.py`, and start it with the following command:
+```bash
+python -m llava.serve.llava_model_worker --host YOUR_HOST --port YOUR_PORT --worker YOUR_ENDPOINT --model-path liuhaotian/llava-v1.5-13b --no-register
+```
+
+#### If you use CogAgent as the Agent
+You can modify the model generation in `basic_demo/cli_demo.py` with a new POST interface `/chat/completions` to use it with UFO.
+
+3. Add the following configuration to `config.yaml`:
+```json showLineNumbers
+{
+  "API_TYPE": "Custom",
+  "API_BASE": "YOUR_ENDPOINT",
+  "API_MODEL": "YOUR_MODEL"
+}
+```
+
+***Note***: Only LLaVA and CogAgent are supported as open-source models for now. If you want to use your own model, remember to modify the dynamic import of your model file in the `get_service` method of `BaseService` in `ufo/llm/base.py`.
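Once your server is running, a quick way to sanity-check the endpoint before wiring it into UFO is a direct request. This is only a sketch: the payload follows the `chat_completion` example above, and the host and port are placeholders for your own deployment:

```python
# Hypothetical smoke test for a custom /chat/completions endpoint.
import requests

payload = {"messages": [{"role": "user", "content": "Hello, UFO!"}], "n": 1}
response = requests.post(
    "http://YOUR_HOST:YOUR_PORT/chat/completions", json=payload, timeout=30
)
print(response.status_code, response.json())
```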
diff --git a/model_worker/custom_worker.py b/model_worker/custom_worker.py new file mode 100644 index 00000000..350c81ab --- /dev/null +++ b/model_worker/custom_worker.py @@ -0,0 +1,86 @@ +# Method to generate response from prompt and image using the Llava model +@torch.inference_mode() +def direct_generate_llava(self, params): + tokenizer, model, image_processor = self.tokenizer, self.model, self.image_processor + + prompt = params["prompt"] + image = params.get("image", None) + if image is not None: + if DEFAULT_IMAGE_TOKEN not in prompt: + raise ValueError( + "Number of image does not match number of tokens in prompt" + ) + + image = load_image_from_base64(image) + image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][ + 0 + ] + image = image.to(self.model.device, dtype=self.model.dtype) + images = image.unsqueeze(0) + + replace_token = DEFAULT_IMAGE_TOKEN + if getattr(self.model.config, "mm_use_im_start_end", False): + replace_token = ( + DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN + ) + prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token) + + num_image_tokens = ( + prompt.count(replace_token) * model.get_vision_tower().num_patches + ) + else: + return {"text": "No image provided", "error_code": 0} + + temperature = float(params.get("temperature", 1.0)) + top_p = float(params.get("top_p", 1.0)) + max_context_length = getattr(model.config, "max_position_embeddings", 2048) + max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024) + stop_str = params.get("stop", None) + do_sample = True if temperature > 0.001 else False + input_ids = ( + tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") + .unsqueeze(0) + .to(self.device) + ) + keywords = [stop_str] + max_new_tokens = min( + max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens + ) + + input_ids = ( + tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") + .unsqueeze(0) + .to(self.device) + ) + + input_seq_len = input_ids.shape[1] + + generation_output = self.model.generate( + inputs=input_ids, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + max_new_tokens=max_new_tokens, + images=images, + use_cache=True, + ) + + generation_output = generation_output[0, input_seq_len:] + decoded = tokenizer.decode(generation_output, skip_special_tokens=True) + + response = {"text": decoded} + print("response", response) + return response + + +# The API is included in llava and cogagent installations. If you customize your model, you can install fastapi via pip or uncomment the library in the requirements. +# import FastAPI +# app = FastAPI() + + +# For llava +@app.post("/chat/completions") +async def generate_llava(request: Request): + params = await request.json() + response_data = worker.direct_generate_llava(params) + return response_data diff --git a/record_processor/summarizer/summarizer.py b/record_processor/summarizer/summarizer.py index 8683ce6b..cbb80dc4 100644 --- a/record_processor/summarizer/summarizer.py +++ b/record_processor/summarizer/summarizer.py @@ -12,6 +12,7 @@ from record_processor.utils import json_parser from ufo.llm.llm_call import get_completions from ufo.prompter.demonstration_prompter import DemonstrationPrompter +from ufo.utils import get_hugginface_embedding class DemonstrationSummarizer: @@ -186,7 +187,11 @@ def create_or_update_vector_db(summaries: list, db_path: str): # Check if the db exists, if not, create a new one. 
         if os.path.exists(db_path):
-            prev_db = FAISS.load_local(db_path, get_hugginface_embedding())
+            prev_db = FAISS.load_local(
+                db_path,
+                get_hugginface_embedding(),
+                allow_dangerous_deserialization=True,
+            )
             db.merge_from(prev_db)
 
         db.save_local(db_path)
diff --git a/requirements.txt b/requirements.txt
index c92127ca..af169c27 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ colorama==0.4.6
 langchain==0.1.11
 langchain_community==0.0.27
 msal==1.25.0
-openai==1.13.3
+openai==1.59.7
 Pillow==10.3.0
 pywin32==306
 pywinauto==0.6.8
@@ -16,6 +16,8 @@ beautifulsoup4==4.12.3
 sentence-transformers==2.5.1
 pandas==1.4.3
 html2text==2024.2.26
+pyautogui==0.9.54
+uiautomation==2.0.18
 ##For Qwen
 #dashscope==1.15.0
 ##For removing stopwords
@@ -23,3 +25,7 @@ html2text==2024.2.26
 
 ##For Gemini
 #google-generativeai==0.7.0
+
+## If using AAD to authenticate
+# azure_identity==1.15.0
+# azure-identity-broker==0.1.0
diff --git a/ufo/agents/agent/app_agent.py b/ufo/agents/agent/app_agent.py
index 246c9326..53452629 100644
--- a/ufo/agents/agent/app_agent.py
+++ b/ufo/agents/agent/app_agent.py
@@ -5,11 +5,14 @@
 from __future__ import annotations
 
 import os
-from typing import Dict, List, Union
+from typing import Any, Dict, List, Union
 
 from ufo import utils
 from ufo.agents.agent.basic import BasicAgent
 from ufo.agents.processors.app_agent_processor import AppAgentProcessor
+from ufo.agents.processors.app_agent_action_seq_processor import (
+    AppAgentActionSequenceProcessor,
+)
 from ufo.agents.states.app_agent_state import AppAgentStatus, ContinueAppAgentState
 from ufo.automator import puppeteer
 from ufo.config.config import Config
@@ -87,7 +90,6 @@ def get_prompter(
     def message_constructor(
         self,
         dynamic_examples: str,
-        dynamic_tips: str,
         dynamic_knowledge: str,
         image_list: List,
         control_info: str,
@@ -95,25 +97,30 @@ def message_constructor(
         plan: List[str],
         request: str,
         subtask: str,
+        current_application: str,
         host_message: List[str],
+        blackboard_prompt: List[Dict[str, str]],
+        last_success_actions: List[Dict[str, Any]],
         include_last_screenshot: bool,
     ) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
         """
         Construct the prompt message for the AppAgent.
         :param dynamic_examples: The dynamic examples retrieved from the self-demonstration and human demonstration.
-        :param dynamic_tips: The dynamic tips retrieved from the self-demonstration and human demonstration.
         :param dynamic_knowledge: The dynamic knowledge retrieved from the external knowledge base.
         :param image_list: The list of screenshot images.
         :param control_info: The control information.
         :param plan: The plan list.
         :param request: The overall user request.
         :param subtask: The subtask for the current AppAgent to process.
+        :param current_application: The current application name.
         :param host_message: The message from the HostAgent.
+        :param blackboard_prompt: The prompt message from the blackboard.
+        :param last_success_actions: The list of successful actions in the last step.
        :param include_last_screenshot: The flag indicating whether to include the last screenshot.
         :return: The prompt message.
""" appagent_prompt_system_message = self.prompter.system_prompt_construction( - dynamic_examples, dynamic_tips + dynamic_examples ) appagent_prompt_user_message = self.prompter.user_content_construction( @@ -123,15 +130,14 @@ def message_constructor( prev_plan=plan, user_request=request, subtask=subtask, - current_application=self._process_name, + current_application=current_application, host_message=host_message, retrieved_docs=dynamic_knowledge, + last_success_actions=last_success_actions, include_last_screenshot=include_last_screenshot, ) - if not self.blackboard.is_empty(): - - blackboard_prompt = self.blackboard.blackboard_to_prompt() + if blackboard_prompt: appagent_prompt_user_message = ( blackboard_prompt + appagent_prompt_user_message ) @@ -142,10 +148,13 @@ def message_constructor( return appagent_prompt_message - def print_response(self, response_dict: Dict) -> None: + def print_response( + self, response_dict: Dict[str, Any], print_action: bool = True + ) -> None: """ Print the response. :param response_dict: The response dictionary to print. + :param print_action: The flag indicating whether to print the action. """ control_text = response_dict.get("ControlText") @@ -168,16 +177,17 @@ def print_response(self, response_dict: Dict) -> None: "Observations👀: {observation}".format(observation=observation), "cyan" ) utils.print_with_color("Thoughts💡: {thought}".format(thought=thought), "green") - utils.print_with_color( - "Selected item🕹️: {control_text}, Label: {label}".format( - control_text=control_text, label=control_label - ), - "yellow", - ) - utils.print_with_color( - "Action applied⚒️: {action}".format(action=action), "blue" - ) - utils.print_with_color("Status📊: {status}".format(status=status), "blue") + if print_action: + utils.print_with_color( + "Selected item🕹️: {control_text}, Label: {label}".format( + control_text=control_text, label=control_label + ), + "yellow", + ) + utils.print_with_color( + "Action applied⚒️: {action}".format(action=action), "blue" + ) + utils.print_with_color("Status📊: {status}".format(status=status), "blue") utils.print_with_color( "Next Plan📚: {plan}".format(plan="\n".join(plan)), "cyan" ) @@ -240,14 +250,18 @@ def external_knowledge_prompt_helper( return retrieved_docs - def rag_experience_retrieve(self, request: str, experience_top_k: int) -> str: + def rag_experience_retrieve( + self, request: str, experience_top_k: int + ) -> List[Dict[str, Any]]: """ Retrieving experience examples for the user request. :param request: The user request. :param experience_top_k: The number of documents to retrieve. - :return: The retrieved examples and tips string. + :return: The retrieved examples and tips dictionary. """ + retrieved_docs = [] + # Retrieve experience examples. Only retrieve the examples that are related to the current application. 
         experience_docs = self.experience_retriever.retrieve(
             request,
@@ -257,13 +271,21 @@
         )
 
         if experience_docs:
-            examples = [doc.metadata.get("example", {}) for doc in experience_docs]
-            tips = [doc.metadata.get("Tips", "") for doc in experience_docs]
-        else:
-            examples = []
-            tips = []
+            for doc in experience_docs:
+                example_request = doc.metadata.get("request", "")
+                response = doc.metadata.get("example", {})
+                tips = doc.metadata.get("Tips", "")
+                subtask = doc.metadata.get("Sub-task", "")
+                retrieved_docs.append(
+                    {
+                        "Request": example_request,
+                        "Response": response,
+                        "Sub-task": subtask,
+                        "Tips": tips,
+                    }
+                )
 
-        return examples, tips
+        return retrieved_docs
 
     def rag_demonstration_retrieve(self, request: str, demonstration_top_k: int) -> str:
         """
@@ -273,14 +295,24 @@
-        :return: The retrieved examples and tips string.
+        :return: The retrieved example dictionaries.
         """
 
+        retrieved_docs = []
+
         # Retrieve demonstration examples.
         demonstration_docs = self.human_demonstration_retriever.retrieve(
             request, demonstration_top_k
         )
 
         if demonstration_docs:
-            examples = [doc.metadata.get("example", {}) for doc in demonstration_docs]
-            tips = [doc.metadata.get("Tips", "") for doc in demonstration_docs]
+            for doc in demonstration_docs:
+                example_request = doc.metadata.get("request", "")
+                response = doc.metadata.get("example", {})
+                subtask = doc.metadata.get("Sub-task", "")
+                tips = doc.metadata.get("Tips", "")
+                retrieved_docs.append(
+                    {
+                        "Request": example_request,
+                        "Response": response,
+                        "Sub-task": subtask,
+                        "Tips": tips,
+                    }
+                )
-        else:
-            examples = []
-            tips = []
@@ -292,7 +327,12 @@ def process(self, context: Context) -> None:
         Process the agent.
         :param context: The context.
         """
-        self.processor = AppAgentProcessor(agent=self, context=context)
+        if configs.get("ACTION_SEQUENCE", False):
+            self.processor = AppAgentActionSequenceProcessor(
+                agent=self, context=context
+            )
+        else:
+            self.processor = AppAgentProcessor(agent=self, context=context)
         self.processor.process()
 
         self.status = self.processor.status
@@ -308,7 +348,7 @@ def process_comfirmation(self) -> bool:
         Process the user confirmation.
         :return: The decision.
         """
-        action = self.processor.action
+        action = self.processor.actions
         control_text = self.processor.control_text
 
         decision = interactor.sensitive_step_asker(action, control_text)
diff --git a/ufo/agents/agent/basic.py b/ufo/agents/agent/basic.py
index 29443921..6d44bf95 100644
--- a/ufo/agents/agent/basic.py
+++ b/ufo/agents/agent/basic.py
@@ -83,6 +83,14 @@ def memory(self) -> Memory:
         """
         return self._memory
 
+    @memory.setter
+    def memory(self, memory: Memory) -> None:
+        """
+        Set the memory of the agent.
+        :param memory: The memory of the agent.
+        """
+        self._memory = memory
+
     @property
     def name(self) -> str:
         """
@@ -150,6 +158,7 @@ def get_response(
         :param message: The message for LLMs.
         :param namescope: The namescope for the LLMs.
         :param use_backup_engine: Whether to use the backup engine.
+        :param configs: The configurations.
         :return: The response.
         """
         response_string, cost = llm_call.get_completion(
@@ -182,6 +191,16 @@ def step(self, step: int) -> None:
         """
         self._step = step
 
+    def set_memory_from_list_of_dicts(self, data: List[Dict[str, str]]) -> None:
+        """
+        Set the memory from the list of dictionaries.
+        :param data: The list of dictionaries.
+        """
+
+        assert isinstance(data, list), "The data should be a list of dictionaries."
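+        # Each dictionary is rebuilt into a MemoryItem by Memory.from_list_of_dicts below.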
+ + self._memory.from_list_of_dicts(data) + def add_memory(self, memory_item: MemoryItem) -> None: """ Update the memory of the agent. @@ -252,12 +271,16 @@ def process_asker(self, ask_user: bool = True) -> None: Ask for the process. :param ask_user: Whether to ask the user for the questions. """ + + _ask_message = "Could you please answer the following questions to help me understand your needs and complete the task?" + _none_answer_message = "The answer for the question is not available, please proceed with your own knowledge or experience, or leave it as a placeholder. Do not ask the same question again." + if self.processor: question_list = self.processor.question_list if ask_user: utils.print_with_color( - "Could you please answer the following questions to help me understand your needs and complete the task?", + _ask_message, "yellow", ) @@ -275,7 +298,7 @@ def process_asker(self, ask_user: bool = True) -> None: else: qa_pair = { "question": question, - "answer": "The answer for the question is not available, please proceed with your own knowledge or experience, or leave it as a placeholder. Do not ask the same question again.", + "answer": _none_answer_message, } self.blackboard.add_questions(qa_pair) diff --git a/ufo/agents/agent/follower_agent.py b/ufo/agents/agent/follower_agent.py index 9fd8e4fa..a2fd9dcd 100644 --- a/ufo/agents/agent/follower_agent.py +++ b/ufo/agents/agent/follower_agent.py @@ -91,7 +91,6 @@ def get_prompter( def message_constructor( self, dynamic_examples: str, - dynamic_tips: str, dynamic_knowledge: str, image_list: List[str], control_info: str, @@ -102,12 +101,12 @@ def message_constructor( host_message: List[str], current_state: Dict[str, str], state_diff: Dict[str, str], + blackboard_prompt: List[Dict[str, str]], include_last_screenshot: bool, ) -> List[Dict[str, str]]: """ Construct the prompt message for the FollowAgent. :param dynamic_examples: The dynamic examples retrieved from the self-demonstration and human demonstration. - :param dynamic_tips: The dynamic tips retrieved from the self-demonstration and human demonstration. :param dynamic_knowledge: The dynamic knowledge retrieved from the self-demonstration and human demonstration. :param image_list: The list of screenshot images. :param control_info: The control information. @@ -118,11 +117,12 @@ def message_constructor( :param host_message: The host message. :param current_state: The current state of the app. :param state_diff: The state difference between the current state and the previous state. + :param blackboard_prompt: The blackboard prompt. :param include_last_screenshot: The flag indicating whether the last screenshot should be included. :return: The prompt message. """ followagent_prompt_system_message = self.prompter.system_prompt_construction( - dynamic_examples, dynamic_tips + dynamic_examples ) followagent_prompt_user_message = self.prompter.user_content_construction( image_list=image_list, diff --git a/ufo/agents/agent/host_agent.py b/ufo/agents/agent/host_agent.py index 1e7e2cb4..b232164b 100644 --- a/ufo/agents/agent/host_agent.py +++ b/ufo/agents/agent/host_agent.py @@ -157,7 +157,7 @@ def get_active_appagent(self) -> AppAgent: return self._active_appagent @property - def blackboard(self): + def blackboard(self) -> Blackboard: """ Get the blackboard. 
""" @@ -170,6 +170,7 @@ def message_constructor( plan: List[str], prev_subtask: List[Dict[str, str]], request: str, + blackboard_prompt: List[Dict[str, str]], ) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]: """ Construct the message. @@ -189,8 +190,7 @@ def message_constructor( user_request=request, ) - if not self.blackboard.is_empty(): - blackboard_prompt = self.blackboard.blackboard_to_prompt() + if blackboard_prompt: hostagent_prompt_user_message = ( blackboard_prompt + hostagent_prompt_user_message ) @@ -235,6 +235,11 @@ def create_app_agent( :return: The app agent. """ + if configs.get("ACTION_SEQUENCE", False): + example_prompt = configs["APPAGENT_EXAMPLE_PROMPT_AS"] + else: + example_prompt = configs["APPAGENT_EXAMPLE_PROMPT"] + if mode == "normal" or "batch_normal": agent_name = ( @@ -254,7 +259,7 @@ def create_app_agent( app_root_name=application_root_name, is_visual=configs["APP_AGENT"]["VISUAL_MODE"], main_prompt=configs["APPAGENT_PROMPT"], - example_prompt=configs["APPAGENT_EXAMPLE_PROMPT"], + example_prompt=example_prompt, api_prompt=configs["API_PROMPT"], ) @@ -275,7 +280,7 @@ def create_app_agent( app_root_name=application_root_name, is_visual=configs["APP_AGENT"]["VISUAL_MODE"], main_prompt=configs["FOLLOWERAHENT_PROMPT"], - example_prompt=configs["APPAGENT_EXAMPLE_PROMPT"], + example_prompt=example_prompt, api_prompt=configs["API_PROMPT"], app_info_prompt=app_info_prompt, ) diff --git a/ufo/agents/memory/blackboard.py b/ufo/agents/memory/blackboard.py index acd2a696..0b2effab 100644 --- a/ufo/agents/memory/blackboard.py +++ b/ufo/agents/memory/blackboard.py @@ -103,6 +103,8 @@ def add_data( data_memory = MemoryItem() data_memory.add_values_from_dict({"text": data}) memory.add_memory_item(data_memory) + else: + print(f"Warning: Unsupported data type: {type(data)} when adding data.") def add_questions(self, questions: Union[MemoryItem, Dict[str, str]]) -> None: """ @@ -238,6 +240,39 @@ def screenshots_to_prompt(self) -> List[str]: return user_content + def blackboard_to_dict(self) -> Dict[str, List[Dict[str, str]]]: + """ + Convert the blackboard to a dictionary. + :return: The blackboard in the dictionary format. + """ + blackboard_dict = { + "questions": self.questions.to_list_of_dicts(), + "requests": self.requests.to_list_of_dicts(), + "trajectories": self.trajectories.to_list_of_dicts(), + "screenshots": self.screenshots.to_list_of_dicts(), + } + + return blackboard_dict + + def blackboard_to_json(self) -> str: + """ + Convert the blackboard to a JSON string. + :return: The JSON string. + """ + return json.dumps(self.blackboard_to_dict()) + + def blackboard_from_dict( + self, blackboard_dict: Dict[str, List[Dict[str, str]]] + ) -> None: + """ + Convert the dictionary to the blackboard. + :param blackboard_dict: The dictionary. + """ + self.questions.from_list_of_dicts(blackboard_dict.get("questions", [])) + self.requests.from_list_of_dicts(blackboard_dict.get("requests", [])) + self.trajectories.from_list_of_dicts(blackboard_dict.get("trajectories", [])) + self.screenshots.from_list_of_dicts(blackboard_dict.get("screenshots", [])) + def blackboard_to_prompt(self) -> List[str]: """ Convert the blackboard to a prompt. diff --git a/ufo/agents/memory/memory.py b/ufo/agents/memory/memory.py index 319333df..c1e70165 100644 --- a/ufo/agents/memory/memory.py +++ b/ufo/agents/memory/memory.py @@ -21,12 +21,21 @@ def to_dict(self) -> Dict[str, str]: Convert the MemoryItem to a dictionary. :return: The dictionary. 
""" + return { key: value for key, value in self.__dict__.items() if key in self._memory_attributes } + def from_dict(self, data: Dict[str, str]) -> None: + """ + Convert the dictionary to a MemoryItem. + :param data: The dictionary. + """ + for key, value in data.items(): + self.set_value(key, value) + def to_json(self) -> str: """ Convert the memory item to a JSON string. @@ -157,6 +166,24 @@ def to_json(self) -> str: [item.to_dict() for item in self._content if item is not None] ) + def to_list_of_dicts(self) -> List[Dict[str, str]]: + """ + Convert the memory to a list of dictionaries. + :return: The list of dictionaries. + """ + return [item.to_dict() for item in self._content] + + def from_list_of_dicts(self, data: List[Dict[str, str]]) -> None: + """ + Convert the list of dictionaries to the memory. + :param data: The list of dictionaries. + """ + self._content = [] + for item in data: + memory_item = MemoryItem() + memory_item.from_dict(item) + self._content.append(memory_item) + def get_latest_item(self) -> MemoryItem: """ Get the latest memory item. diff --git a/ufo/agents/processors/actions.py b/ufo/agents/processors/actions.py new file mode 100644 index 00000000..901ca313 --- /dev/null +++ b/ufo/agents/processors/actions.py @@ -0,0 +1,554 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +import sys + +sys.path.append("./") + + +import json +import time +from dataclasses import asdict, dataclass, field +from typing import Any, Dict, List, Optional, Tuple + +from pywinauto.controls.uiawrapper import UIAWrapper + +from ufo import utils +from ufo.automator.puppeteer import AppPuppeteer +from ufo.automator.ui_control.screenshot import PhotographerDecorator +from ufo.config.config import Config + + +@dataclass +class BaseControlLog: + """ + The control log data for the HostAgent. + """ + + control_class: str = "" + control_type: str = "" + control_automation_id: str = "" + control_friendly_class_name: str = "" + control_coordinates: Dict[str, int] = field(default_factory=dict) + + def is_empty(self) -> bool: + + return self == BaseControlLog() + + +@dataclass +class ActionExecutionLog: + """ + The action execution log data. + """ + + status: str = "" + error: str = "" + return_value: Any = None + + +class OneStepAction: + + def __init__( + self, + function: str = "", + args: Dict[str, Any] = {}, + control_label: str = "", + control_text: str = "", + after_status: str = "", + results: Optional[ActionExecutionLog] = None, + configs=Config.get_instance().config_data, + ): + self._function = function + self._args = args + self._control_label = control_label + self._control_text = control_text + self._after_status = after_status + self._results = ActionExecutionLog() if results is None else results + self._configs = configs + self._control_log = BaseControlLog() + + @property + def function(self) -> str: + """ + Get the function name. + :return: The function. + """ + return self._function + + @property + def args(self) -> Dict[str, Any]: + """ + Get the arguments. + :return: The arguments. + """ + return self._args + + @property + def control_label(self) -> str: + """ + Get the control label. + :return: The control label. + """ + return self._control_label + + @property + def control_text(self) -> str: + """ + Get the control text. + :return: The control text. + """ + return self._control_text + + @property + def after_status(self) -> str: + """ + Get the status. + :return: The status. 
+ """ + return self._after_status + + @property + def control_log(self) -> BaseControlLog: + """ + Get the control log. + :return: The control log. + """ + return self._control_log + + @control_log.setter + def control_log(self, control_log: BaseControlLog) -> None: + """ + Set the control log. + :param control_log: The control log. + """ + self._control_log = control_log + + @property + def results(self) -> ActionExecutionLog: + """ + Get the results. + :return: The results. + """ + return self._results + + @results.setter + def results(self, results: ActionExecutionLog) -> None: + """ + Set the results. + :param results: The results. + """ + self._results = results + + @property + def command_string(self) -> str: + """ + Generate a function call string. + :param command_name: The function name. + :param params: The arguments as a dictionary. + :return: The function call string. + """ + # Format the arguments + args_str = ", ".join(f"{k}={v!r}" for k, v in self.args.items()) + + # Return the function call string + return f"{self.function}({args_str})" + + def is_same_action(self, action_to_compare: Dict[str, Any]) -> bool: + """ + Check whether the two actions are the same. + :param action1: The first action. + :param action2: The second action. + :return: Whether the two actions are the same. + """ + + return ( + self.function == action_to_compare.get("Function") + and self.args == action_to_compare.get("Args") + and self.control_text == action_to_compare.get("ControlText") + ) + + def count_repeat_times(self, previous_actions: List[Dict[str, Any]]) -> int: + """ + Get the times of the same action in the previous actions. + :param previous_actions: The previous actions. + :return: The times of the same action in the previous actions. + """ + + count = 0 + for action in previous_actions[::-1]: + if self.is_same_action(action): + count += 1 + else: + break + return count + + def to_dict( + self, previous_actions: Optional[List[Dict[str, Any]]] + ) -> Dict[str, Any]: + """ + Convert the action to a dictionary. + :param previous_actions: The previous actions. + :return: The dictionary of the action. + """ + + action_dict = { + "Function": self.function, + "Args": self.args, + "ControlLabel": self.control_label, + "ControlText": self.control_text, + "Status": self.after_status, + "Results": asdict(self.results), + } + + # Add the repetitive times of the same action in the previous actions if the previous actions are provided. + if previous_actions: + action_dict["RepeatTimes"] = self.count_repeat_times(previous_actions) + + return action_dict + + def to_string(self, previous_actions: Optional[List["OneStepAction"]]) -> str: + """ + Convert the action to a string. + :param previous_actions: The previous actions. + :return: The string of the action. + """ + return json.dumps(self.to_dict(previous_actions), ensure_ascii=False) + + def _control_validation(self, control: UIAWrapper) -> bool: + """ + Validate the action. + :param action: The action to validate. + :return: The validation result. + """ + try: + control.is_enabled() + return True + except: + return False + + def execute(self, puppeteer: AppPuppeteer) -> Any: + """ + Execute the action. + :param executor: The executor. + """ + return puppeteer.execute_command(self.function, self.args) + + def action_flow( + self, + puppeteer: AppPuppeteer, + control_dict: Dict[str, UIAWrapper], + application_window: UIAWrapper, + ) -> Tuple[ActionExecutionLog, BaseControlLog]: + """ + Execute the action flow. + :param action: The action. 
+ :param control_dict: The control dictionary. + :return: The action execution log. + """ + control_selected: UIAWrapper = control_dict.get(self.control_label, None) + + # If the control is selected, but not available, return an error. + if control_selected is not None and not self._control_validation( + control_selected + ): + self.results = ActionExecutionLog( + status="error", + error="Control is not available.", + ) + self._control_log = BaseControlLog() + + return self.results + + # Create the control receiver. + puppeteer.receiver_manager.create_ui_control_receiver( + control_selected, application_window + ) + + if self.function: + + if self._configs.get("SHOW_VISUAL_OUTLINE_ON_SCREEN", True): + if control_selected: + control_selected.draw_outline(colour="red", thickness=3) + time.sleep(self._configs.get("RECTANGLE_TIME", 0)) + + self._control_log = self._get_control_log( + control_selected=control_selected, application_window=application_window + ) + + try: + return_value = self.execute(puppeteer=puppeteer) + if not utils.is_json_serializable(return_value): + return_value = "" + + self.results = ActionExecutionLog( + status="success", + return_value=return_value, + ) + + except Exception as e: + self.results = ActionExecutionLog( + status="error", + error=str(e), + ) + return self.results + + def _get_control_log( + self, + control_selected: Optional[UIAWrapper], + application_window: UIAWrapper, + ) -> BaseControlLog: + """ + Get the control log data for the selected control. + :param control_selected: The selected control item. + :param application_window: The application window where the control is located. + :return: The control log data for the selected control. + """ + + if not control_selected or not application_window: + return BaseControlLog() + + control_coordinates = PhotographerDecorator.coordinate_adjusted( + application_window.rectangle(), control_selected.rectangle() + ) + + control_log = BaseControlLog( + control_class=control_selected.element_info.class_name, + control_type=control_selected.element_info.control_type, + control_automation_id=control_selected.element_info.automation_id, + control_friendly_class_name=control_selected.friendly_class_name(), + control_coordinates={ + "left": control_coordinates[0], + "top": control_coordinates[1], + "right": control_coordinates[2], + "bottom": control_coordinates[3], + }, + ) + + return control_log + + def print_result(self) -> None: + """ + Print the action execution result. + """ + + utils.print_with_color( + "Selected item🕹️: {control_text}, Label: {label}".format( + control_text=self.control_text, label=self.control_label + ), + "yellow", + ) + utils.print_with_color( + "Action applied⚒️: {action}".format(action=self.command_string), "blue" + ) + + result_color = "red" if self.results.status != "success" else "green" + + utils.print_with_color( + "Execution result📜: {result}".format(result=asdict(self.results)), + result_color, + ) + + +class ActionSequence: + """ + A sequence of one-step actions. + """ + + def __init__(self, actions: Optional[List[OneStepAction]] = []): + + if not actions: + actions = [] + self._status = "FINISH" + else: + self._status = actions[0].after_status + + self._actions = actions + self._length = len(actions) + + @property + def actions(self) -> List[OneStepAction]: + """ + Get the actions. + :return: The actions. + """ + return self._actions + + @property + def length(self) -> int: + """ + Get the length of the actions. + :return: The length of the actions. 
+ """ + return len(self._actions) + + @property + def status(self) -> str: + """ + Get the status of the actions. + :return: The status of the actions. + """ + return self._status + + def add_action(self, action: OneStepAction) -> None: + """ + Add an action. + :param action: The action. + """ + self._actions.append(action) + + def to_list_of_dicts( + self, + success_only: bool = False, + previous_actions: Optional[List[Dict[str, Any]]] = None, + ) -> List[Dict[str, Any]]: + """ + Convert the action sequence to a dictionary. + :param success_only: Whether to convert the successful actions only. + :param previous_actions: The previous actions. + :return: The dictionary of the action sequence. + """ + + action_list = [] + for action in self.actions: + if success_only and action.results.status != "success": + continue + action_list.append(action.to_dict(previous_actions)) + return action_list + + def to_string(self, success_only: bool = False, previous_actions=None) -> str: + """ + Convert the action sequence to a string. + :param success_only: Whether to convert the successful actions only. + :param previous_actions: The previous actions. + :return: The string of the action sequence. + """ + return json.dumps( + self.to_list_of_dicts(success_only, previous_actions), ensure_ascii=False + ) + + def execute_all( + self, + puppeteer: AppPuppeteer, + control_dict: Dict[str, UIAWrapper], + application_window: UIAWrapper, + ) -> None: + """ + Execute all the actions. + :param puppeteer: The puppeteer. + :param control_dict: The control dictionary. + :param application_window: The application window. + """ + + early_stop = False + + for action in self.actions: + if early_stop: + action.results = ActionExecutionLog( + status="error", error="Early stop due to error in previous actions." + ) + + else: + self._status = action.after_status + + action.action_flow(puppeteer, control_dict, application_window) + + # Sleep for a while to avoid the UI being too busy. + time.sleep(0.5) + + if action.results.status != "success": + early_stop = True + + def get_results(self, success_only: bool = False) -> List[Dict[str, Any]]: + """ + Get the results of the actions. + :param success_only: Whether to get the successful actions only. + :return: The results of the actions. + """ + return [ + asdict(action.results) + for action in self.actions + if not success_only or action.results.status == "success" + ] + + def get_control_logs(self, success_only: bool = False) -> List[Dict[str, Any]]: + """ + Get the control logs of the actions. + :param success_only: Whether to get the successful actions only. + :return: The control logs of the actions. + """ + return [ + asdict(action.control_log) + for action in self.actions + if not success_only or action.results.status == "success" + ] + + def get_success_control_coords(self) -> List[Dict[str, Any]]: + """ + Get the control coordinates of the successful actions. + :return: The control coordinates of the successful actions. + """ + return [ + action.control_log.control_coordinates + for action in self.actions + if action.results.status == "success" and not action.control_log.is_empty() + ] + + def get_function_calls(self, is_success_only: bool = False) -> List[str]: + """ + Get the function calls of the actions. + :param is_success_only: Whether to get the successful actions only. + :return: The function calls of the actions. 
+ """ + return [ + action.command_string + for action in self.actions + if not is_success_only or action.results.status == "success" + ] + + def print_all_results(self, success_only: bool = False) -> None: + """ + Print the action execution result. + """ + index = 1 + for action in self.actions: + if success_only and action.results.status != "success": + continue + if self.length > 1: + utils.print_with_color(f"Action {index}:", "cyan") + action.print_result() + index += 1 + utils.print_with_color(f"Final status: {self.status}", "yellow") + + +if __name__ == "__main__": + + action1 = OneStepAction( + function="click", + args={"button": "left"}, + control_label="1", + control_text="OK", + after_status="success", + results=ActionExecutionLog(status="success"), + ) + + action2 = OneStepAction( + function="click", + args={"button": "right"}, + control_label="2", + control_text="NotOK", + after_status="success", + results=ActionExecutionLog(status="success"), + ) + + action_sequence = ActionSequence([action1, action2]) + + previous_actions = [ + {"Function": "click", "Args": {"button": "left"}, "ControlText": "OK"}, + {"Function": "click", "Args": {"button": "right"}, "ControlText": "OK"}, + {"Function": "click", "Args": {"button": "left"}, "ControlText": "OK"}, + {"Function": "click", "Args": {"button": "left"}, "ControlText": "OK"}, + ] + + print(action_sequence.to_list_of_dicts(previous_actions=previous_actions)) diff --git a/ufo/agents/processors/app_agent_action_seq_processor.py b/ufo/agents/processors/app_agent_action_seq_processor.py new file mode 100644 index 00000000..a3b5972d --- /dev/null +++ b/ufo/agents/processors/app_agent_action_seq_processor.py @@ -0,0 +1,94 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +from typing import Any, Dict, List + +from ufo.agents.processors.actions import ActionSequence, OneStepAction +from ufo.agents.processors.app_agent_processor import AppAgentProcessor +from ufo.agents.processors.basic import BaseProcessor +from ufo.config.config import Config + +configs = Config.get_instance().config_data +if configs is not None: + BACKEND = configs["CONTROL_BACKEND"] + + +class AppAgentActionSequenceProcessor(AppAgentProcessor): + """ + The processor for the app agent at a single step. + """ + + @BaseProcessor.exception_capture + @BaseProcessor.method_timer + def parse_response(self) -> None: + """ + Parse the response. + """ + + self._response_json = self.app_agent.response_to_dict(self._response) + + self.question_list = self._response_json.get("Questions", []) + + # Convert the plan from a string to a list if the plan is a string. + self.plan = self.string2list(self._response_json.get("Plan", "")) + self._response_json["Plan"] = self.plan + + self.app_agent.print_response(self._response_json, print_action=False) + + @BaseProcessor.exception_capture + @BaseProcessor.method_timer + def execute_action(self) -> None: + """ + Execute the action. 
+ """ + + action_sequence_dicts = self._response_json.get("ActionList", []) + action_list = [ + OneStepAction( + function=action_dict.get("Function", ""), + args=action_dict.get("Args", {}), + control_label=action_dict.get("ControlLabel", ""), + control_text=action_dict.get("ControlText", ""), + after_status=action_dict.get("Status", "CONTINUE"), + ) + for action_dict in action_sequence_dicts + ] + self.actions = ActionSequence(action_list) + self.function_calls = self.actions.get_function_calls() + + self.actions.execute_all( + puppeteer=self.app_agent.Puppeteer, + control_dict=self._annotation_dict, + application_window=self.application_window, + ) + + self.status = self.actions.status + + success_control_adjusted_coords = self.actions.get_success_control_coords() + self.capture_control_screenshot_from_adjusted_coords( + control_adjusted_coords=success_control_adjusted_coords + ) + + self.actions.print_all_results() + + def capture_control_screenshot_from_adjusted_coords( + self, control_adjusted_coords: List[Dict[str, Any]] + ) -> None: + """ + Capture the screenshot of the selected control. + :param control_selected: The selected control item or a list of selected control items. + """ + control_screenshot_save_path = ( + self.log_path + f"action_step{self.session_step}_selected_controls.png" + ) + + self._memory_data.add_values_from_dict( + {"SelectedControlScreenshot": control_screenshot_save_path} + ) + self.photographer.capture_app_window_screenshot_with_rectangle_from_adjusted_coords( + self.application_window, + control_adjusted_coords=control_adjusted_coords, + save_path=control_screenshot_save_path, + background_screenshot_path=self.screenshot_save_path, + ) diff --git a/ufo/agents/processors/app_agent_processor.py b/ufo/agents/processors/app_agent_processor.py index 26c608cb..8d82d9d0 100644 --- a/ufo/agents/processors/app_agent_processor.py +++ b/ufo/agents/processors/app_agent_processor.py @@ -1,29 +1,92 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. - import json import os -import time -from typing import TYPE_CHECKING, Dict, List, Tuple +from dataclasses import asdict, dataclass, field +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from pywinauto.controls.uiawrapper import UIAWrapper from ufo import utils +from ufo.agents.processors.actions import ( + ActionSequence, + BaseControlLog, + OneStepAction, +) from ufo.agents.processors.basic import BaseProcessor from ufo.automator.ui_control import ui_tree -from ufo.automator.ui_control.screenshot import PhotographerDecorator from ufo.automator.ui_control.control_filter import ControlFilterFactory from ufo.config.config import Config from ufo.module.context import Context, ContextNames - if TYPE_CHECKING: from ufo.agents.agent.app_agent import AppAgent configs = Config.get_instance().config_data if configs is not None: - BACKEND = configs["CONTROL_BACKEND"] + BACKEND = configs.get("CONTROL_BACKEND", "uia") + + +@dataclass +class AppAgentAdditionalMemory: + """ + The additional memory data for the AppAgent. 
+ """ + + Step: int + RoundStep: int + AgentStep: int + Round: int + Subtask: str + SubtaskIndex: int + FunctionCall: List[str] + Action: List[Dict[str, Any]] + ActionSuccess: List[Dict[str, Any]] + ActionType: List[str] + Request: str + Agent: str + AgentName: str + Application: str + Cost: float + Results: str + error: str + time_cost: Dict[str, float] + ControlLog: Dict[str, Any] + UserConfirm: Optional[str] = None + + +@dataclass +class AppAgentControlLog(BaseControlLog): + """ + The control log data for the AppAgent. + """ + + control_friendly_class_name: str = "" + control_coordinates: Dict[str, int] = field(default_factory=dict) + + +@dataclass +class AppAgentRequestLog: + """ + The request log data for the AppAgent. + """ + + step: int + dynamic_examples: List[str] + dynamic_knowledge: List[str] + image_list: List[str] + prev_subtask: List[str] + plan: List[str] + request: str + control_info: List[Dict[str, str]] + subtask: str + current_application: str + host_message: str + blackboard_prompt: List[str] + last_success_actions: List[Dict[str, Any]] + include_last_screenshot: bool + prompt: Dict[str, Any] class AppAgentProcessor(BaseProcessor): @@ -50,22 +113,7 @@ def __init__(self, agent: "AppAgent", context: Context) -> None: self._image_url = [] self.control_filter_factory = ControlFilterFactory() self.filtered_annotation_dict = None - - @property - def action(self) -> str: - """ - Get the action. - :return: The action. - """ - return self._action - - @action.setter - def action(self, action: str) -> None: - """ - Set the action. - :param action: The action. - """ - self._action = action + self.screenshot_save_path = None def print_step_info(self) -> None: """ @@ -90,6 +138,8 @@ def capture_screenshot(self) -> None: # Define the paths for the screenshots saved. screenshot_save_path = self.log_path + f"action_step{self.session_step}.png" + self.screenshot_save_path = screenshot_save_path + annotated_screenshot_save_path = ( self.log_path + f"action_step{self.session_step}_annotated.png" ) @@ -111,8 +161,8 @@ def capture_screenshot(self) -> None: else: control_list = self.control_inspector.find_control_elements_in_descendants( self.application_window, - control_type_list=configs["CONTROL_LIST"], - class_name_list=configs["CONTROL_LIST"], + control_type_list=configs.get("CONTROL_LIST", []), + class_name_list=configs.get("CONTROL_LIST", []), ) # Get the annotation dictionary for the control items, in a format of {control_label: control_element}. @@ -146,7 +196,7 @@ def capture_screenshot(self) -> None: ) # If the configuration is set to include the last screenshot with selected controls tagged, save the last screenshot. - if configs["INCLUDE_LAST_SCREENSHOT"]: + if configs.get("INCLUDE_LAST_SCREENSHOT", True): last_screenshot_save_path = ( self.log_path + f"action_step{self.session_step - 1}.png" ) @@ -163,7 +213,7 @@ def capture_screenshot(self) -> None: ] # Whether to concatenate the screenshots of clean screenshot and annotated screenshot into one image. - if configs["CONCAT_SCREENSHOT"]: + if configs.get("CONCAT_SCREENSHOT", False): self.photographer.concat_screenshots( screenshot_save_path, annotated_screenshot_save_path, @@ -182,8 +232,7 @@ def capture_screenshot(self) -> None: self._image_url += [screenshot_url, screenshot_annotated_url] # Save the XML file for the current state. 
- if configs["LOG_XML"]: - + if configs.get("LOG_XML", False): self._save_to_xml() @BaseProcessor.exception_capture @@ -215,19 +264,33 @@ def get_prompt_message(self) -> None: Get the prompt message for the AppAgent. """ - examples, tips = self.demonstration_prompt_helper() + retrieved_results = self.demonstration_prompt_helper() # Get the external knowledge prompt for the AppAgent using the offline and online retrievers. external_knowledge_prompt = self.app_agent.external_knowledge_prompt_helper( self.request, - configs["RAG_OFFLINE_DOCS_RETRIEVED_TOPK"], - configs["RAG_ONLINE_RETRIEVED_TOPK"], + configs.get("RAG_OFFLINE_DOCS_RETRIEVED_TOPK", 0), + configs.get("RAG_ONLINE_RETRIEVED_TOPK", 0), ) + if not self.app_agent.blackboard.is_empty(): + blackboard_prompt = self.app_agent.blackboard.blackboard_to_prompt() + else: + blackboard_prompt = [] + + # Get the last successful actions of the AppAgent. + last_success_actions = self.get_last_success_actions() + + action_keys = ["Function", "Args", "ControlText", "Results", "RepeatTimes"] + + filtered_last_success_actions = [ + {key: action.get(key, "") for key in action_keys} + for action in last_success_actions + ] + # Construct the prompt message for the AppAgent. self._prompt_message = self.app_agent.message_constructor( - dynamic_examples=examples, - dynamic_tips=tips, + dynamic_examples=retrieved_results, dynamic_knowledge=external_knowledge_prompt, image_list=self._image_url, control_info=self.filtered_control_info, @@ -235,21 +298,34 @@ def get_prompt_message(self) -> None: plan=self.prev_plan, request=self.request, subtask=self.subtask, + current_application=self.application_process_name, host_message=self.host_message, - include_last_screenshot=configs["INCLUDE_LAST_SCREENSHOT"], + blackboard_prompt=blackboard_prompt, + last_success_actions=filtered_last_success_actions, + include_last_screenshot=configs.get("INCLUDE_LAST_SCREENSHOT", True), ) # Log the prompt message. Only save them in debug mode. - log = json.dumps( - { - "step": self.session_step, - "prompt": self._prompt_message, - "control_items": self._control_info, - "filted_control_items": self.filtered_control_info, - "status": "", - } + request_data = AppAgentRequestLog( + step=self.session_step, + dynamic_examples=retrieved_results, + dynamic_knowledge=external_knowledge_prompt, + image_list=self._image_url, + prev_subtask=self.previous_subtasks, + plan=self.prev_plan, + request=self.request, + control_info=self.filtered_control_info, + subtask=self.subtask, + current_application=self.application_process_name, + host_message=self.host_message, + blackboard_prompt=blackboard_prompt, + last_success_actions=filtered_last_success_actions, + include_last_screenshot=configs.get("INCLUDE_LAST_SCREENSHOT", True), + prompt=self._prompt_message, ) - self.request_logger.debug(log) + + request_log_str = json.dumps(asdict(request_data), ensure_ascii=False) + self.request_logger.debug(request_log_str) @BaseProcessor.exception_capture @BaseProcessor.method_timer @@ -281,13 +357,10 @@ def parse_response(self) -> None: self.plan = self.string2list(self._response_json.get("Plan", "")) self._response_json["Plan"] = self.plan - # Compose the function call and the arguments string. 
- self.action = self.app_agent.Puppeteer.get_command_string( - self._operation, self._args - ) - self.status = self._response_json.get("Status", "") - self.app_agent.print_response(self._response_json) + self.app_agent.print_response( + response_dict=self._response_json, print_action=True + ) @BaseProcessor.exception_capture @BaseProcessor.method_timer @@ -296,56 +369,31 @@ def execute_action(self) -> None: Execute the action. """ - control_selected = self._annotation_dict.get(self._control_label, None) - self.app_agent.Puppeteer.receiver_manager.create_ui_control_receiver( - control_selected, self.application_window + action = OneStepAction( + function=self._operation, + args=self._args, + control_label=self._control_label, + control_text=self.control_text, + after_status=self.status, ) + control_selected = self._annotation_dict.get(self._control_label, None) # Save the screenshot of the tagged selected control. self.capture_control_screenshot(control_selected) - if self._operation: - - if configs.get("SHOW_VISUAL_OUTLINE_ON_SCREEN", True): - control_selected.draw_outline(colour="red", thickness=3) - time.sleep(configs.get("RECTANGLE_TIME", 0)) - - if control_selected: - control_coordinates = PhotographerDecorator.coordinate_adjusted( - self.application_window.rectangle(), - control_selected.rectangle(), - ) - self._control_log = { - "control_class": control_selected.element_info.class_name, - "control_type": control_selected.element_info.control_type, - "control_automation_id": control_selected.element_info.automation_id, - "control_friendly_class_name": control_selected.friendly_class_name(), - "control_coordinates": { - "left": control_coordinates[0], - "top": control_coordinates[1], - "right": control_coordinates[2], - "bottom": control_coordinates[3], - }, - } - else: - self._control_log = {} - - if self.status.upper() == self._agent_status_manager.SCREENSHOT.value: - self.handle_screenshot_status() - else: - self._results = self.app_agent.Puppeteer.execute_command( - self._operation, self._args - ) - self.control_reannotate = None - if not utils.is_json_serializable(self._results): - self._results = "" - - return + self.actions: ActionSequence = ActionSequence(actions=[action]) + self.actions.execute_all( + puppeteer=self.app_agent.Puppeteer, + control_dict=self._annotation_dict, + application_window=self.application_window, + ) - def capture_control_screenshot(self, control_selected: UIAWrapper) -> None: + def capture_control_screenshot( + self, control_selected: Union[UIAWrapper, List[UIAWrapper]] + ) -> None: """ Capture the screenshot of the selected control. - :param control_selected: The selected control item. + :param control_selected: The selected control item or a list of selected control items. 
""" control_screenshot_save_path = ( self.log_path + f"action_step{self.session_step}_selected_controls.png" @@ -355,10 +403,17 @@ def capture_control_screenshot(self, control_selected: UIAWrapper) -> None: {"SelectedControlScreenshot": control_screenshot_save_path} ) + sub_control_list = ( + control_selected + if isinstance(control_selected, list) + else [control_selected] + ) + self.photographer.capture_app_window_screenshot_with_rectangle( self.application_window, - sub_control_list=[control_selected], + sub_control_list=sub_control_list, save_path=control_screenshot_save_path, + background_screenshot_path=self.screenshot_save_path, ) def handle_screenshot_status(self) -> None: @@ -383,31 +438,53 @@ def sync_memory(self): self.application_window ) - # Log additional information for the app agent. - additional_memory = { - "Step": self.session_step, - "RoundStep": self.round_step, - "AgentStep": self.app_agent.step, - "Round": self.round_num, - "Subtask": self.subtask, - "SubtaskIndex": self.round_subtask_amount, - "Action": self.action, - "ActionType": self.app_agent.Puppeteer.get_command_types(self._operation), - "Request": self.request, - "Agent": "AppAgent", - "AgentName": self.app_agent.name, - "Application": app_root, - "Cost": self._cost, - "Results": self._results, - "error": self._exeception_traceback, - } + action_type = [ + self.app_agent.Puppeteer.get_command_types(action.function) + for action in self.actions.actions + ] + + all_previous_success_actions = self.get_all_success_actions() + + action_success = self.actions.to_list_of_dicts( + success_only=True, previous_actions=all_previous_success_actions + ) + + # Create the additional memory data for the log. + additional_memory = AppAgentAdditionalMemory( + Step=self.session_step, + RoundStep=self.round_step, + AgentStep=self.app_agent.step, + Round=self.round_num, + Subtask=self.subtask, + SubtaskIndex=self.round_subtask_amount, + FunctionCall=self.actions.get_function_calls(), + Action=self.actions.to_list_of_dicts( + previous_actions=all_previous_success_actions + ), + ActionSuccess=action_success, + ActionType=action_type, + Request=self.request, + Agent="AppAgent", + AgentName=self.app_agent.name, + Application=app_root, + Cost=self._cost, + Results=self.actions.get_results(), + error=self._exeception_traceback, + time_cost=self._time_cost, + ControlLog=self.actions.get_control_logs(), + UserConfirm=( + "Yes" + if self.status.upper() + == self._agent_status_manager.CONFIRM.value.upper() + else None + ), + ) + + # Log the original response from the LLM. self.add_to_memory(self._response_json) - self.add_to_memory(additional_memory) - self.add_to_memory(self._control_log) - self.add_to_memory({"time_cost": self._time_cost}) - if self.status.upper() == self._agent_status_manager.CONFIRM.value: - self._memory_data.add_values_from_dict({"UserConfirm": "Yes"}) + # Log the additional memory data for the AppAgent. + self.add_to_memory(asdict(additional_memory)) def update_memory(self) -> None: """ @@ -425,7 +502,8 @@ def update_memory(self) -> None: # Only memorize the keys in the HISTORY_KEYS list to feed into the prompt message in the future steps. 
memorized_action = { - key: self._memory_data.to_dict().get(key) for key in configs["HISTORY_KEYS"] + key: self._memory_data.to_dict().get(key) + for key in configs.get("HISTORY_KEYS", []) } if self.is_confirm(): @@ -441,6 +519,43 @@ def update_memory(self) -> None: self._update_image_blackboard() self.host_agent.blackboard.add_trajectories(memorized_action) + def get_all_success_actions(self) -> List[Dict[str, Any]]: + """ + Get the previous action. + :return: The previous action of the agent. + """ + agent_memory = self.app_agent.memory + + if agent_memory.length > 0: + success_action_memory = agent_memory.filter_memory_from_keys( + ["ActionSuccess"] + ) + success_actions = [] + for success_action in success_action_memory: + success_actions += success_action.get("ActionSuccess", []) + + else: + success_actions = [] + + return success_actions + + def get_last_success_actions(self) -> List[Dict[str, Any]]: + """ + Get the previous action. + :return: The previous action of the agent. + """ + agent_memory = self.app_agent.memory + + if agent_memory.length > 0: + last_success_actions = ( + agent_memory.get_latest_item().to_dict().get("ActionSuccess", []) + ) + + else: + last_success_actions = [] + + return last_success_actions + def _update_image_blackboard(self) -> None: """ Save the screenshot to the blackboard if the SaveScreenshot flag is set to True by the AppAgent. @@ -468,40 +583,28 @@ def _save_to_xml(self) -> None: ) self.app_agent.Puppeteer.save_to_xml(xml_save_path) - def demonstration_prompt_helper(self) -> Tuple[List[str], List[str]]: + def demonstration_prompt_helper(self) -> List[Dict[str, Any]]: """ Get the examples and tips for the AppAgent using the demonstration retriever. :return: The examples and tips for the AppAgent. """ + retrieved_results = [] # Get the examples and tips for the AppAgent using the experience and demonstration retrievers. if configs["RAG_EXPERIENCE"]: - experience_examples, experience_tips = ( - self.app_agent.rag_experience_retrieve( - self.request, configs["RAG_EXPERIENCE_RETRIEVED_TOPK"] - ) + retrieved_results += self.app_agent.rag_experience_retrieve( + self.subtask, configs["RAG_EXPERIENCE_RETRIEVED_TOPK"] ) - else: - experience_examples = [] - experience_tips = [] if configs["RAG_DEMONSTRATION"]: - demonstration_examples, demonstration_tips = ( - self.app_agent.rag_demonstration_retrieve( - self.request, configs["RAG_DEMONSTRATION_RETRIEVED_TOPK"] - ) + retrieved_results += self.app_agent.rag_demonstration_retrieve( + self.subtask, configs["RAG_DEMONSTRATION_RETRIEVED_TOPK"] ) - else: - demonstration_examples = [] - demonstration_tips = [] - - examples = experience_examples + demonstration_examples - tips = experience_tips + demonstration_tips - return examples, tips + return retrieved_results def get_filtered_annotation_dict( - self, annotation_dict: Dict[str, UIAWrapper], configs=configs + self, annotation_dict: Dict[str, UIAWrapper], configs: Dict[str, Any] = configs ) -> Dict[str, UIAWrapper]: """ Get the filtered annotation dictionary. 
diff --git a/ufo/agents/processors/basic.py b/ufo/agents/processors/basic.py
index 390d5b75..b0a78bfb 100644
--- a/ufo/agents/processors/basic.py
+++ b/ufo/agents/processors/basic.py
@@ -7,18 +7,25 @@
 import time
 import traceback
 from abc import ABC, abstractmethod
+from dataclasses import asdict, dataclass
 from functools import wraps
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional, Union
 
 from pywinauto.controls.uiawrapper import UIAWrapper
 
 from ufo import utils
 from ufo.agents.agent.basic import BasicAgent
 from ufo.agents.memory.memory import MemoryItem
+from ufo.automator.puppeteer import AppPuppeteer
 from ufo.automator.ui_control.inspector import ControlInspectorFacade
 from ufo.automator.ui_control.screenshot import PhotographerFacade
 from ufo.config.config import Config
 from ufo.module.context import Context, ContextNames
+from ufo.agents.processors.actions import (
+    ActionSequence,
+    OneStepAction,
+    BaseControlLog,
+)
 
 configs = Config.get_instance().config_data
 if configs is not None:
@@ -53,22 +60,15 @@ def __init__(self, agent: BasicAgent, context: Context) -> None:
         self._control_text = None
         self._response_json = {}
         self._memory_data = MemoryItem()
-        self._results = None
         self._question_list = []
         self._agent_status_manager = self.agent.status_manager
         self._is_resumed = False
-        self._action = None
         self._plan = None
 
-        self._control_log = {
-            "control_class": None,
-            "control_type": None,
-            "control_automation_id": None,
-        }
-
         self._total_time_cost = 0
         self._time_cost = {}
         self._exeception_traceback = {}
+        self._actions = ActionSequence()
 
     def process(self) -> None:
         """
@@ -535,20 +535,20 @@ def status(self) -> str:
         return self._status
 
     @property
-    def action(self) -> str:
+    def actions(self) -> ActionSequence:
         """
-        Get the action.
-        :return: The action.
+        Get the actions.
+        :return: The actions.
         """
-        return self._action
+        return self._actions
 
-    @action.setter
-    def action(self, action: str) -> None:
+    @actions.setter
+    def actions(self, actions: ActionSequence) -> None:
         """
-        Set the action.
-        :param action: The action.
+        Set the actions.
+        :param actions: The actions to be executed.
         """
-        self._action = action
+        self._actions = actions
 
     @property
     def plan(self) -> str:
diff --git a/ufo/agents/processors/follower_agent_processor.py b/ufo/agents/processors/follower_agent_processor.py
index 0b78bcb6..6e4296b0 100644
--- a/ufo/agents/processors/follower_agent_processor.py
+++ b/ufo/agents/processors/follower_agent_processor.py
@@ -3,10 +3,14 @@
 
 
 import json
+from dataclasses import asdict, dataclass
+from typing import TYPE_CHECKING, Any, Dict
 
-from ufo.agents.processors.app_agent_processor import AppAgentProcessor
+from ufo.agents.processors.app_agent_processor import (
+    AppAgentProcessor,
+    AppAgentRequestLog,
+)
 from ufo.config.config import Config
-from typing import TYPE_CHECKING
 from ufo.module.context import Context, ContextNames
 
 if TYPE_CHECKING:
@@ -15,6 +19,16 @@
 configs = Config.get_instance().config_data
 
 
+@dataclass
+class FollowerAgentRequestLog(AppAgentRequestLog):
+    """
+    The request log data for the FollowerAgent.
+    """
+
+    current_state: Dict[str, Any]
+    state_diff: Dict[str, Any]
+
+
 class FollowerAppAgentProcessor(AppAgentProcessor):
     """
     The processor for the AppAgent in the follower mode.
@@ -43,13 +57,17 @@ def get_prompt_message(self) -> None:
             configs["RAG_ONLINE_RETRIEVED_TOPK"],
         )
 
+        if not self.app_agent.blackboard.is_empty():
+            blackboard_prompt = self.app_agent.blackboard.blackboard_to_prompt()
+        else:
+            blackboard_prompt = []
+
         # Get the current state of the application and the state difference between the current state and the previous state.
         current_state = {}
         state_diff = {}
 
         self._prompt_message = self.app_agent.message_constructor(
             dynamic_examples=examples,
-            dynamic_tips=tips,
             dynamic_knowledge=external_knowledge_prompt,
             image_list=self._image_url,
             control_info=self.filtered_control_info,
@@ -60,16 +78,29 @@ def get_prompt_message(self) -> None:
             host_message=[],
             current_state=current_state,
             state_diff=state_diff,
+            blackboard_prompt=blackboard_prompt,
             include_last_screenshot=configs["INCLUDE_LAST_SCREENSHOT"],
         )
 
-        log = json.dumps(
-            {
-                "step": self.session_step,
-                "prompt": self._prompt_message,
-                "control_items": self._control_info,
-                "filted_control_items": self.filtered_control_info,
-                "status": "",
-            }
+        request_data = FollowerAgentRequestLog(
+            step=self.session_step,
+            dynamic_examples=examples,
+            dynamic_knowledge=external_knowledge_prompt,
+            image_list=self._image_url,
+            prev_subtask=[],
+            plan=self.prev_plan,
+            request=self.request,
+            control_info=self.filtered_control_info,
+            subtask=self.request,
+            current_application=self.application_process_name,
+            host_message=[],
+            blackboard_prompt=blackboard_prompt,
+            last_success_actions=[],  # assumed: no prior success actions are tracked in follower mode
+            include_last_screenshot=configs["INCLUDE_LAST_SCREENSHOT"],
+            prompt=self._prompt_message,
+            current_state=current_state,
+            state_diff=state_diff,
+        )
-        self.request_logger.debug(log)
+
+        request_log_str = json.dumps(asdict(request_data), indent=4, ensure_ascii=False)
+        self.request_logger.debug(request_log_str)
diff --git a/ufo/agents/processors/host_agent_processor.py b/ufo/agents/processors/host_agent_processor.py
index 2851e4b0..6a4c1d8a 100644
--- a/ufo/agents/processors/host_agent_processor.py
+++ b/ufo/agents/processors/host_agent_processor.py
@@ -3,12 +3,18 @@
 
 
 import json
-from typing import TYPE_CHECKING
+from dataclasses import asdict, dataclass
+from typing import TYPE_CHECKING, Any, Dict, List
 
 from pywinauto.controls.uiawrapper import UIAWrapper
 
 from ufo import utils
-from ufo.agents.processors.basic import BaseProcessor
+from ufo.agents.processors.actions import (
+    ActionExecutionLog,
+    ActionSequence,
+    OneStepAction,
+)
+from ufo.agents.processors.basic import BaseControlLog, BaseProcessor
 from ufo.config.config import Config
 from ufo.module.context import Context, ContextNames
 
@@ -20,6 +26,48 @@
 from ufo.agents.agent.host_agent import HostAgent
 
 
+@dataclass
+class HostAgentAdditionalMemory:
+    """
+    The additional memory for the host agent.
+    """
+
+    Step: int
+    RoundStep: int
+    AgentStep: int
+    Round: int
+    ControlLabel: str
+    SubtaskIndex: int
+    Action: str
+    FunctionCall: str
+    ActionType: str
+    Request: str
+    Agent: str
+    AgentName: str
+    Application: str
+    Cost: float
+    Results: str
+    error: str
+    time_cost: Dict[str, float]
+    ControlLog: Dict[str, Any]
+
+
+@dataclass
+class HostAgentRequestLog:
+    """
+    The request log data for the HostAgent.
+    """
+
+    step: int
+    image_list: List[str]
+    os_info: Dict[str, str]
+    plan: List[str]
+    prev_subtask: List[str]
+    request: str
+    blackboard_prompt: List[str]
+    prompt: Dict[str, Any]
+
+
 class HostAgentProcessor(BaseProcessor):
     """
     The processor for the host agent at a single step.
@@ -96,6 +144,11 @@ def get_prompt_message(self) -> None:
         Get the prompt message.
""" + if not self.host_agent.blackboard.is_empty(): + blackboard_prompt = self.host_agent.blackboard.blackboard_to_prompt() + else: + blackboard_prompt = [] + # Construct the prompt message for the host agent. self._prompt_message = self.host_agent.message_constructor( image_list=[self._desktop_screen_url], @@ -103,19 +156,23 @@ def get_prompt_message(self) -> None: plan=self.prev_plan, prev_subtask=self.previous_subtasks, request=self.request, + blackboard_prompt=blackboard_prompt, ) - # Log the prompt message. Only save them in debug mode. - log = json.dumps( - { - "step": self.session_step, - "prompt": self._prompt_message, - "control_items": self._desktop_windows_info, - "filted_control_items": self._desktop_windows_info, - "status": "", - } + request_data = HostAgentRequestLog( + step=self.session_step, + image_list=[self._desktop_screen_url], + os_info=self._desktop_windows_info, + plan=self.prev_plan, + prev_subtask=self.previous_subtasks, + request=self.request, + blackboard_prompt=blackboard_prompt, + prompt=self._prompt_message, ) - self.request_logger.debug(log) + + # Log the prompt message. Only save them in debug mode. + request_log_str = json.dumps(asdict(request_data), indent=4, ensure_ascii=False) + self.request_logger.debug(request_log_str) @BaseProcessor.exception_capture @BaseProcessor.method_timer @@ -233,11 +290,20 @@ def _select_application(self, application_window: UIAWrapper) -> None: :param application_window: The application window. """ - self._control_log = { - "control_class": application_window.element_info.class_name, - "control_type": application_window.element_info.control_type, - "control_automation_id": application_window.element_info.automation_id, - } + action = OneStepAction( + control_label=self.control_label, + control_text=self.control_text, + after_status=self.status, + function="set_focus", + ) + + action.control_log = BaseControlLog( + control_class=application_window.element_info.class_name, + control_type=application_window.element_info.control_type, + control_automation_id=application_window.element_info.automation_id, + ) + + self.actions = ActionSequence([action]) # Get the root name of the application. self.app_root = self.control_inspector.get_application_root_name( @@ -256,8 +322,6 @@ def _select_application(self, application_window: UIAWrapper) -> None: if configs.get("SHOW_VISUAL_OUTLINE_ON_SCREEN", True): self.application_window.draw_outline(colour="red", thickness=3) - self.action = "set_focus()" - def _run_shell_command(self) -> None: """ Run the shell command. @@ -267,40 +331,57 @@ def _run_shell_command(self) -> None: self.app_root, self.control_text ) - self._results = self.agent.Puppeteer.execute_command( - "run_shell", {"command": self.bash_command} + action = OneStepAction( + control_label=self.control_label, + control_text=self.control_text, + after_status=self.status, + function="run_shell", + args={"command": self.bash_command}, ) - self.action = self.agent.Puppeteer.get_command_string( - "run_shell", {"command": self.bash_command} + try: + return_value = self.agent.Puppeteer.execute_command( + "run_shell", {"command": self.bash_command} + ) + error = "" + except Exception as e: + return_value = "" + error = str(e) + + action.results = ActionExecutionLog( + return_value=return_value, status=self.status, error=error ) + self.actions: ActionSequence = ActionSequence([action]) + def sync_memory(self): """ Sync the memory of the HostAgent. 
""" - additional_memory = { - "Step": self.session_step, - "RoundStep": self.round_step, - "AgentStep": self.host_agent.step, - "Round": self.round_num, - "ControlLabel": self.control_label, - "SubtaskIndex": -1, - "Action": self.action, - "ActionType": "UIControl", - "Request": self.request, - "Agent": "HostAgent", - "AgentName": self.host_agent.name, - "Application": self.app_root, - "Cost": self._cost, - "Results": self._results, - "error": self._exeception_traceback, - } + + additional_memory = HostAgentAdditionalMemory( + Step=self.session_step, + RoundStep=self.round_step, + AgentStep=self.host_agent.step, + Round=self.round_num, + ControlLabel=self.control_label, + SubtaskIndex=-1, + FunctionCall=self.actions.get_function_calls(), + Action=self.actions.to_list_of_dicts(), + ActionType="Bash" if self.bash_command else "UIControl", + Request=self.request, + Agent="HostAgent", + AgentName=self.host_agent.name, + Application=self.app_root, + Cost=self._cost, + Results=self.actions.get_results(), + error=self._exeception_traceback, + time_cost=self._time_cost, + ControlLog=self.actions.get_control_logs(), + ) self.add_to_memory(self._response_json) - self.add_to_memory(additional_memory) - self.add_to_memory(self._control_log) - self.add_to_memory({"time_cost": self._time_cost}) + self.add_to_memory(asdict(additional_memory)) def update_memory(self) -> None: """ diff --git a/ufo/automator/app_apis/shell/shell_client.py b/ufo/automator/app_apis/shell/shell_client.py index cfa8b5b6..64bfe6ce 100644 --- a/ufo/automator/app_apis/shell/shell_client.py +++ b/ufo/automator/app_apis/shell/shell_client.py @@ -28,10 +28,14 @@ def run_shell(self, params: Dict[str, Any]) -> Any: :return: The result content. """ bash_command = params.get("command") - result = subprocess.run( - bash_command, shell=True, capture_output=True, text=True + process = subprocess.Popen( + bash_command, # command to run + stdout=subprocess.PIPE, # capture stdout + stderr=subprocess.PIPE, # capture stderr + shell=True, + text=True, ) - return result.stdout + return "" @property def type_name(self): diff --git a/ufo/automator/ui_control/controller.py b/ufo/automator/ui_control/controller.py index 9582dc3e..87efd436 100644 --- a/ufo/automator/ui_control/controller.py +++ b/ufo/automator/ui_control/controller.py @@ -221,7 +221,7 @@ def keyboard_input(self, params: Dict[str, str]) -> str: if control_focus: self.atomic_execution("type_keys", {"keys": keys}) else: - pyautogui.typewrite(keys) + self.application.type_keys(keys=keys) return keys def texts(self) -> str: diff --git a/ufo/automator/ui_control/screenshot.py b/ufo/automator/ui_control/screenshot.py index ea1d21fa..6c5f1b74 100644 --- a/ufo/automator/ui_control/screenshot.py +++ b/ufo/automator/ui_control/screenshot.py @@ -183,13 +183,21 @@ def draw_rectangles( draw.rectangle(coordinate, outline=color, width=width) return image - def capture(self, save_path: str): + def capture(self, save_path: str, background_screenshot_path: Optional[str] = None): """ Capture a screenshot with rectangles. :param save_path: The path to save the screenshot. + :param background_screenshot_path: The path of the background screenshot, optional. If provided, the rectangle will be drawn on the background screenshot instead of the control screenshot. :return: The screenshot with rectangles. 
""" - screenshot = self.photographer.capture() + + if background_screenshot_path is not None and os.path.exists( + background_screenshot_path + ): + screenshot = Image.open(background_screenshot_path) + else: + screenshot = self.photographer.capture() + window_rect = self.photographer.control.rectangle() for control in self.sub_control_list: @@ -203,6 +211,41 @@ def capture(self, save_path: str): screenshot.save(save_path, compress_level=DEFAULT_PNG_COMPRESS_LEVEL) return screenshot + def capture_from_adjusted_coords( + self, + control_adjusted_coords: List[Dict[str, Dict[str, float]]], + save_path: str, + background_screenshot_path: Optional[str] = None, + ): + """ + Capture a screenshot with rectangles when the adjusted coordinates are provided. + :param control_adjusted_coords: The adjusted coordinates of the control rectangles. + :param save_path: The path to save the screenshot. + :param background_screenshot_path: The path of the background screenshot, optional. If provided, the rectangle will be drawn on the background screenshot instead of the control screenshot. + :return: The screenshot with rectangles. + """ + if background_screenshot_path is not None and os.path.exists( + background_screenshot_path + ): + screenshot = Image.open(background_screenshot_path) + else: + screenshot = self.photographer.capture() + + for control_adjusted_coord in control_adjusted_coords: + if control_adjusted_coord: + control_rect = ( + control_adjusted_coord["left"], + control_adjusted_coord["top"], + control_adjusted_coord["right"], + control_adjusted_coord["bottom"], + ) + screenshot = self.draw_rectangles( + screenshot, coordinate=control_rect, color=self.color + ) + if save_path is not None and screenshot is not None: + screenshot.save(save_path, compress_level=DEFAULT_PNG_COMPRESS_LEVEL) + return screenshot + class AnnotationDecorator(PhotographerDecorator): """ @@ -378,6 +421,7 @@ def capture_with_annotation_dict( screenshot_annotated, adjusted_coordinate, label_text, + font_size=configs.get("ANNOTATION_FONT_SIZE", 25), button_color=( color_dict.get( control.element_info.control_type, self.color_default @@ -467,6 +511,7 @@ def capture_app_window_screenshot_with_rectangle( color: str = "red", width=3, sub_control_list: List[UIAWrapper] = None, + background_screenshot_path: Optional[str] = None, save_path: Optional[str] = None, ) -> Image.Image: """ @@ -476,11 +521,42 @@ def capture_app_window_screenshot_with_rectangle( :param color: The color of the rectangle. :param width: The width of the rectangle. :param sub_control_list: The list of the controls to draw rectangles on. + :param background_screenshot_path: The path of the background screenshot, optional. If provided, the rectangle will be drawn on the background screenshot instead of the control screenshot. + :param save_path: The path to save the screenshot. :return: The screenshot. """ screenshot = self.screenshot_factory.create_screenshot("app_window", control) screenshot = RectangleDecorator(screenshot, color, width, sub_control_list) - return screenshot.capture(save_path) + return screenshot.capture(save_path, background_screenshot_path) + + def capture_app_window_screenshot_with_rectangle_from_adjusted_coords( + self, + control: UIAWrapper, + color: str = "red", + width=3, + control_adjusted_coords: List[Dict[str, Dict[str, float]]] = [], + background_screenshot_path: Optional[str] = None, + save_path: Optional[str] = None, + ) -> Image.Image: + """ + Capture the control screenshot with a rectangle. 
+        :param control: The control item to capture.
+        :param control_adjusted_coords: The adjusted coordinates of the control rectangles.
+        :param color: The color of the rectangle.
+        :param width: The width of the rectangle.
+        :param background_screenshot_path: The path of the background screenshot, optional. If provided, the rectangle will be drawn on the background screenshot instead of the control screenshot.
+        :param save_path: The path to save the screenshot.
+        :return: The screenshot.
+        """
+        screenshot = self.screenshot_factory.create_screenshot("app_window", control)
+        screenshot = RectangleDecorator(screenshot, color, width, [])
+
+        return screenshot.capture_from_adjusted_coords(
+            control_adjusted_coords=control_adjusted_coords,
+            save_path=save_path,
+            background_screenshot_path=background_screenshot_path,
+        )

     def capture_app_window_screenshot_with_annotation_dict(
         self,
@@ -579,6 +655,16 @@ def concat_screenshots(
         :return: The concatenated image.
         """
         # Open the images
+        if not os.path.exists(image1_path):
+            utils.print_with_color(f"Warning: {image1_path} does not exist.", "yellow")
+
+            return Image.new("RGB", (0, 0))
+
+        if not os.path.exists(image2_path):
+            utils.print_with_color(f"Warning: {image2_path} does not exist.", "yellow")
+
+            return Image.new("RGB", (0, 0))
+
         image1 = Image.open(image1_path)
         image2 = Image.open(image2_path)

diff --git a/ufo/config/config_dev.yaml b/ufo/config/config_dev.yaml
index d41e2f6f..0cf28f3e 100644
--- a/ufo/config/config_dev.yaml
+++ b/ufo/config/config_dev.yaml
@@ -1,16 +1,18 @@
 CONTROL_BACKEND: "uia"  # The backend for control action, currently we support uia and win32
-MAX_STEP: 100  # The max step limit for completing the user request
-MAX_ROUND: 10  # The max round limit for completing the user request
+MAX_STEP: 30  # The max step limit for completing the user request
+MAX_ROUND: 1  # The max round limit for completing the user request
 SLEEP_TIME: 1  # The sleep time between each step to wait for the window to be ready
 RECTANGLE_TIME: 1

+ACTION_SEQUENCE: False  # Whether to output the action sequence. If true, the agent may predict and execute multiple actions in one step.
+
 # Skip rendering visual outline on screen if not necessary
 SHOW_VISUAL_OUTLINE_ON_SCREEN: False

 SAFE_GUARD: True  # Whether to use the safe guard to prevent the model from doing sensitve operations.
 CONTROL_LIST: ["Button", "Edit", "TabItem", "Document", "ListItem", "MenuItem", "ScrollBar", "TreeItem", "Hyperlink", "ComboBox", "RadioButton", "DataItem", "Spinner"]  # The list of widgets that allowed to be selected, in uia backend, it will be used for filter the control_type, while in win32 backend, it will be used for filter the class_name.
-HISTORY_KEYS: ["Step", "ControlText", "Subtask", "Action", "Results", "UserConfirm"]  # The keys of the action history for the next step.
+HISTORY_KEYS: ["Step", "Subtask", "Action", "UserConfirm"]  # The keys of the action history for the next step.
 ANNOTATION_COLORS: {
     "Button": "#FFF68F",
@@ -24,11 +26,13 @@ ANNOTATION_COLORS: {
     "Hyperlink": "#91FFEB",
     "ComboBox": "#D8B6D4"
   }
+
+ANNOTATION_FONT_SIZE: 22  # The font size for the annotation

 PRINT_LOG: False  # Whether to print the log
 CONCAT_SCREENSHOT: True  # Whether to concat the screenshot for the control item
 LOG_LEVEL: "DEBUG"  # The log level
-INCLUDE_LAST_SCREENSHOT: False  # Whether to include the last screenshot in the observation
+INCLUDE_LAST_SCREENSHOT: True  # Whether to include the last screenshot in the observation
 REQUEST_TIMEOUT: 250  # The call timeout for the GPT-V model

 HOSTAGENT_PROMPT: "ufo/prompts/share/base/host_agent.yaml"  # The prompt for the app selection
@@ -42,6 +46,7 @@ EVALUATION_PROMPT: "ufo/prompts/evaluation/evaluate.yaml"  # The prompt for the
 HOSTAGENT_EXAMPLE_PROMPT: "ufo/prompts/examples/{mode}/host_agent_example.yaml"  # The prompt for the app selection
 # Lite version: "ufo/prompts/examples/lite/{mode}/host_agent_example.yaml"
 APPAGENT_EXAMPLE_PROMPT: "ufo/prompts/examples/{mode}/app_agent_example.yaml"  # The prompt for the action selection
+APPAGENT_EXAMPLE_PROMPT_AS: "ufo/prompts/examples/{mode}/app_agent_example_as.yaml"  # The prompt for the action selection with sequence
 # Lite version: "ufo/prompts/examples/lite/{mode}/app_agent_example.yaml"

 ## For experience learning
@@ -86,8 +91,8 @@ SCREENSHOT_TO_MEMORY: True  # Whether to allow the screenshot to memory for the

 # For customizations
-ASK_QUESTION: True  # Whether to allow the agent to ask questions
-USE_CUSTOMIZATION: True  # Whether to use the customization
+ASK_QUESTION: False  # Whether to allow the agent to ask questions
+USE_CUSTOMIZATION: False  # Whether to use the customization
 QA_PAIR_FILE: "customization/historical_qa.txt"  # The path for the historical QA
 QA_PAIR_NUM: 20  # The number of QA pairs for the customization

@@ -108,5 +113,5 @@ SAVE_UI_TREE: False  # Whether to save the UI tree
 TASK_STATUS: True  # Whether to record the status of the tasks in batch execution mode.

 # Experience saving
-SAVE_EXPERIENCE: "ask"  # Whether to save the experience, can be "always" for always save, "always_not" for always not save, "ask" for asking the user to save or not. By default, it is "always_not"
+SAVE_EXPERIENCE: "auto"  # Whether to save the experience, can be "always" for always save, "always_not" for always not save, "ask" for asking the user to save or not, "auto" for auto save when the evaluation is good. By default, it is "ask".
diff --git a/ufo/experience/summarizer.py b/ufo/experience/summarizer.py
index be732a7c..7413b141 100644
--- a/ufo/experience/summarizer.py
+++ b/ufo/experience/summarizer.py
@@ -109,6 +109,7 @@ def get_summary_list(self, logs: list) -> Tuple[list, float]:
             prompt = self.build_prompt(log_partition)
             summary, cost = self.get_summary(prompt)
             summary["request"] = log_partition.get("subtask")
+            summary["Sub-task"] = log_partition.get("subtask")
             summary["app_list"] = [log_partition.get("application")]
             summaries.append(summary)
             total_cost += cost
diff --git a/ufo/llm/base.py b/ufo/llm/base.py
index 014b93bc..4b31d4d9 100644
--- a/ufo/llm/base.py
+++ b/ufo/llm/base.py
@@ -16,7 +16,7 @@ def chat_completion(self, *args, **kwargs):
         pass

     @staticmethod
-    def get_service(name: str) -> "BaseService":
+    def get_service(name: str, model_name: str = None) -> "BaseService":
         """
         Get the service class based on the name.
         :param name: The name of the service.
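For context, the new `model_name` argument is only consulted when the service name is `"custom"`; existing service types resolve exactly as before. A brief usage sketch, mirroring the call site in `ufo/llm/llm_call.py` later in this diff (the model name below is illustrative):

```python
from ufo.llm.base import BaseService

# Existing service types resolve as before; model_name is ignored.
gemini_cls = BaseService.get_service("gemini")  # -> GeminiService

# With the "custom" service type, the model name picks the backend module:
# any model name containing "llava" routes to LlavaService (ufo/llm/llava.py),
# and the name "cogagent" routes to CogAgentService (ufo/llm/cogagent.py).
llava_cls = BaseService.get_service("custom", model_name="llava-v1.5-13b")
```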
@@ -30,17 +30,32 @@ def get_service(name: str) -> "BaseService":
             "ollama": "OllamaService",
             "gemini": "GeminiService",
             "claude": "ClaudeService",
+            "custom": "CustomService",
             "placeholder": "PlaceHolderService",
         }
+        custom_service_map = {
+            "llava": "LlavaService",
+            "cogagent": "CogAgentService",
+        }
         service_name = service_map.get(name, None)
         if service_name:
             if name in ["aoai", "azure_ad"]:
                 module = import_module(".openai", package="ufo.llm")
+            elif service_name == "CustomService":
+                custom_model = (
+                    "llava" if model_name and "llava" in model_name else model_name
+                )
+                custom_service_name = custom_service_map.get(custom_model, None)
+                if custom_service_name:
+                    module = import_module("." + custom_model, package="ufo.llm")
+                    service_name = custom_service_name
+                else:
+                    raise ValueError(f"Custom model {custom_model} not supported.")
             else:
                 module = import_module("." + name.lower(), package="ufo.llm")
+
+            return getattr(module, service_name)
         else:
             raise ValueError(f"Service {name} not found.")
-        return getattr(module, service_name)

     def get_cost_estimator(
         self,
diff --git a/ufo/llm/cogagent.py b/ufo/llm/cogagent.py
new file mode 100644
index 00000000..4b3f375c
--- /dev/null
+++ b/ufo/llm/cogagent.py
@@ -0,0 +1,86 @@
+import time
+from typing import Any, Optional
+
+import requests
+
+from ufo.utils import print_with_color
+from .base import BaseService
+
+
+class CogAgentService(BaseService):
+    def __init__(self, config, agent_type: str):
+        self.config_llm = config[agent_type]
+        self.config = config
+        self.max_retry = self.config["MAX_RETRY"]
+        self.timeout = self.config["TIMEOUT"]
+        self.max_tokens = 2048  # default max tokens for cogagent for now
+
+    def chat_completion(
+        self,
+        messages,
+        n,
+        temperature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        **kwargs: Any,
+    ):
+        """
+        Generate chat completions based on given messages.
+        Args:
+            messages (list): A list of messages.
+            n (int): The number of completions to generate.
+            temperature (float, optional): The temperature for sampling. Defaults to None.
+            max_tokens (int, optional): The maximum number of tokens in the completion. Defaults to None.
+            top_p (float, optional): The cumulative probability for top-p sampling. Defaults to None.
+            **kwargs: Additional keyword arguments.
+        Returns:
+            tuple: A tuple containing the generated texts and None.
+ """ + + temperature = ( + temperature if temperature is not None else self.config["TEMPERATURE"] + ) + max_tokens = max_tokens if max_tokens is not None else self.config["MAX_TOKENS"] + top_p = top_p if top_p is not None else self.config["TOP_P"] + + texts = [] + for i in range(n): + image_base64 = None + if self.config_llm["VISUAL_MODE"]: + image_base64 = messages[1]["content"][-2]["image_url"]["url"].split( + "base64," + )[1] + prompt = messages[0]["content"] + messages[1]["content"][-1]["text"] + + payload = { + "model": self.config_llm["API_MODEL"], + "prompt": prompt, + "temperature": temperature, + "top_p": top_p, + "max_new_tokens": self.max_tokens, + "image": image_base64, + } + + for _ in range(self.max_retry): + try: + response = requests.post( + self.config_llm["API_BASE"] + "/chat/completions", json=payload + ) + if response.status_code == 200: + response = response.json() + text = response["text"] + texts.append(text) + break + else: + raise Exception( + f"Failed to get completion with error code {response.status_code}: {response.text}", + ) + except Exception as e: + print_with_color(f"Error making API request: {e}", "red") + try: + print_with_color(response, "red") + except: + _ + time.sleep(3) + continue + return texts, None diff --git a/ufo/llm/llava.py b/ufo/llm/llava.py new file mode 100644 index 00000000..d154f7f2 --- /dev/null +++ b/ufo/llm/llava.py @@ -0,0 +1,549 @@ +import time +from typing import Any, Optional +import dataclasses +from enum import auto, Enum +from typing import List +import base64 +from io import BytesIO +from PIL import Image + +import requests +from ufo.utils import print_with_color +from .base import BaseService + +DEFAULT_IMAGE_TOKEN = "" + + +class LlavaService(BaseService): + def __init__(self, config, agent_type: str): + self.config_llm = config[agent_type] + self.config = config + self.max_retry = self.config["MAX_RETRY"] + self.timeout = self.config["TIMEOUT"] + self.max_tokens = 2048 # default max tokens for llava for now + + def chat_completion( + self, + messages, + n, + temperature: Optional[float] = None, + max_tokens: Optional[int] = None, + top_p: Optional[float] = None, + **kwargs: Any, + ): + """ + Generates chat completions based on the given messages. + Args: + messages (list): A list of messages. + n (int): The number of completions to generate. + temperature (float, optional): The temperature value for controlling the randomness of the completions. Defaults to None. + max_tokens (int, optional): The maximum number of tokens in the completions. Defaults to None. + top_p (float, optional): The cumulative probability for selecting the next token in the completions. Defaults to None. + **kwargs: Additional keyword arguments. + Returns: + tuple: A tuple containing the generated texts and None. + Raises: + Exception: If there is an error in the API request. 
+ """ + temperature = ( + temperature if temperature is not None else self.config["TEMPERATURE"] + ) + max_tokens = max_tokens if max_tokens is not None else self.config["MAX_TOKENS"] + top_p = top_p if top_p is not None else self.config["TOP_P"] + conv = conv_templates[self._conversation()].copy() + + texts = [] + for i in range(n): + if self.config_llm["VISUAL_MODE"]: + inp = DEFAULT_IMAGE_TOKEN + "\n" + messages[1]["content"][-1]["text"] + conv.append_message(conv.roles[0], inp) + image_base64 = messages[1]["content"][-2]["image_url"]["url"].split( + "base64," + )[1] + else: + conv.append_message(conv.roles[0], messages[1]["content"][-1]["text"]) + prompt = conv.get_prompt() + + payload = { + "model": self.config_llm["API_MODEL"], + "prompt": prompt, + "temperature": temperature, + "top_p": top_p, + "max_new_tokens": self.max_tokens, + "image": image_base64, + } + + for _ in range(self.max_retry): + try: + response = requests.post( + self.config_llm["API_BASE"] + "/chat/completions", + json=payload, + timeout=self.timeout, + ) + if response.status_code == 200: + response = response.json() + text = response["text"] + texts.append(text) + break + else: + raise Exception( + f"Failed to get completion with error code {response.status_code}: {response.text}", + ) + except Exception as e: + print_with_color(f"Error making API request: {e}", "red") + try: + print_with_color(response, "red") + except: + _ + time.sleep(3) + continue + return texts, None + + def _conversation(self): + """ + Determines the conversation mode based on the model name. + Returns: + str: The conversation mode based on the model name. + """ + model_paths = self.config_llm["API_MODEL"].strip("/").split("/") + model_name = ( + model_paths[-2] + "_" + model_paths[-1] + if model_paths[-1].startswith("checkpoint-") + else model_paths[-1] + ) + if "llama-2" in model_name.lower(): + conv_mode = "llava_llama_2" + elif "mistral" in model_name.lower(): + conv_mode = "mistral_instruct" + elif "v1.6-34b" in model_name.lower(): + conv_mode = "chatml_direct" + elif "v1" in model_name.lower(): + conv_mode = "llava_v1" + elif "mpt" in model_name.lower(): + conv_mode = "mpt" + else: + conv_mode = "vicuna_v1" + return conv_mode + + +class SeparatorStyle(Enum): + """Different separator style.""" + + SINGLE = auto() + TWO = auto() + MPT = auto() + PLAIN = auto() + LLAMA_2 = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that keeps all conversation history.""" + + system: str + roles: List[str] + messages: List[List[str]] + offset: int + sep_style: SeparatorStyle = SeparatorStyle.SINGLE + sep: str = "###" + sep2: str = None + version: str = "Unknown" + + skip_next: bool = False + + def get_prompt(self): + """ + Generates a prompt message based on the current state of the conversation. + Returns: + str: The generated prompt message. 
+ """ + messages = self.messages + if len(messages) > 0 and type(messages[0][1]) is tuple: + messages = self.messages.copy() + init_role, init_msg = messages[0].copy() + init_msg = init_msg[0].replace("", "").strip() + if "mmtag" in self.version: + messages[0] = (init_role, init_msg) + messages.insert(0, (self.roles[0], "")) + messages.insert(1, (self.roles[1], "Received.")) + else: + messages[0] = (init_role, "\n" + init_msg) + + if self.sep_style == SeparatorStyle.SINGLE: + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + self.sep + else: + ret += role + ":" + elif self.sep_style == SeparatorStyle.TWO: + seps = [self.sep, self.sep2] + ret = self.system + seps[0] + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + elif self.sep_style == SeparatorStyle.MPT: + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + message + self.sep + else: + ret += role + elif self.sep_style == SeparatorStyle.LLAMA_2: + wrap_sys = lambda msg: ( + f"<>\n{msg}\n<>\n\n" if len(msg) > 0 else msg + ) + wrap_inst = lambda msg: f"[INST] {msg} [/INST]" + ret = "" + + for i, (role, message) in enumerate(messages): + if i == 0: + assert message, "first message should not be none" + assert role == self.roles[0], "first message should come from user" + if message: + if type(message) is tuple: + message, _, _ = message + if i == 0: + message = wrap_sys(self.system) + message + if i % 2 == 0: + message = wrap_inst(message) + ret += self.sep + message + else: + ret += " " + message + " " + self.sep2 + else: + ret += "" + ret = ret.lstrip(self.sep) + elif self.sep_style == SeparatorStyle.PLAIN: + seps = [self.sep, self.sep2] + ret = self.system + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += message + seps[i % 2] + else: + ret += "" + else: + raise ValueError(f"Invalid style: {self.sep_style}") + + return ret + + def append_message(self, role, message): + self.messages.append([role, message]) + + def process_image( + self, + image, + image_process_mode, + return_pil=False, + image_format="PNG", + max_len=1344, + min_len=672, + ): + """ + Process the given image based on the specified image_process_mode. + Args: + image (PIL.Image.Image): The input image to be processed. + image_process_mode (str): The mode for processing the image. Possible values are 'Pad', 'Default', 'Crop', or 'Resize'. + return_pil (bool, optional): Whether to return the processed image as a PIL Image object. Defaults to False. + image_format (str, optional): The format to save the image in. Defaults to 'PNG'. + max_len (int, optional): The maximum length of the image's longest edge. Defaults to 1344. + min_len (int, optional): The minimum length of the image's shortest edge. Defaults to 672. + Returns: + str or PIL.Image.Image: The processed image. If return_pil is True, a PIL Image object is returned. Otherwise, the processed image is returned as a base64-encoded string. + Raises: + ValueError: If an invalid image_process_mode is provided. 
+ """ + if image_process_mode == "Pad": + + def expand2square(pil_img, background_color=(122, 116, 104)): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + image = expand2square(image) + elif image_process_mode in ["Default", "Crop"]: + pass + elif image_process_mode == "Resize": + image = image.resize((336, 336)) + else: + raise ValueError(f"Invalid image_process_mode: {image_process_mode}") + if max(image.size) > max_len: + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + if return_pil: + return image + else: + buffered = BytesIO() + image.save(buffered, format=image_format) + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + return img_b64_str + + def get_images(self, return_pil=False): + images = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + image = self.process_image( + image, image_process_mode, return_pil=return_pil + ) + images.append(image) + return images + + def to_gradio_chatbot(self): + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + img_b64_str = self.process_image( + image, "Default", return_pil=False, image_format="JPEG" + ) + img_str = f'user upload image' + msg = img_str + msg.replace("", "").strip() + ret.append([msg, None]) + else: + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def copy(self): + return Conversation( + system=self.system, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + version=self.version, + ) + + def dict(self): + if len(self.get_images()) > 0: + return { + "system": self.system, + "roles": self.roles, + "messages": [ + [x, y[0] if type(y) is tuple else y] for x, y in self.messages + ], + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + return { + "system": self.system, + "roles": self.roles, + "messages": self.messages, + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + + +conv_vicuna_v0 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=( + ( + "Human", + "What are the key differences between renewable and non-renewable energy sources?", + ), + ( + "Assistant", + "Renewable energy sources are those that can be replenished naturally in a relatively " + "short amount of time, such as solar, wind, hydro, geothermal, and biomass. " + "Non-renewable energy sources, on the other hand, are finite and will eventually be " + "depleted, such as coal, oil, and natural gas. Here are some key differences between " + "renewable and non-renewable energy sources:\n" + "1. 
+            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
+            "energy sources are finite and will eventually run out.\n"
+            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
+            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
+            "and other negative effects.\n"
+            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
+            "have lower operational costs than non-renewable sources.\n"
+            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
+            "locations than non-renewable sources.\n"
+            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
+            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
+            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
+            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n",
+        ),
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+
+conv_vicuna_v1 = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+
+conv_llama_2 = Conversation(
+    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+
+conv_llava_llama_2 = Conversation(
+    system="You are a helpful language and vision assistant. "
+    "You are able to understand the visual content that the user provides, "
+    "and assist the user with a variety of tasks using natural language.",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+
+conv_mpt = Conversation(
+    system="""<|im_start|>system
+A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+
+conv_llava_plain = Conversation(
+    system="",
+    roles=("", ""),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.PLAIN,
+    sep="\n",
+)
+
+conv_llava_v0 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+
+conv_llava_v0_mmtag = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
" + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." + "The visual content will be provided with the following format: visual content.", + roles=("Human", "Assistant"), + messages=(), + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", + version="v0_mmtag", +) + +conv_llava_v1 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", +) + +conv_llava_v1_mmtag = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." + "The visual content will be provided with the following format: visual content.", + roles=("USER", "ASSISTANT"), + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", + version="v1_mmtag", +) + +conv_mistral_instruct = Conversation( + system="", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=(), + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", +) + +conv_chatml_direct = Conversation( + system="""<|im_start|>system +Answer the questions.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=(), + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +conv_templates = { + "default": conv_vicuna_v0, + "v0": conv_vicuna_v0, + "v1": conv_vicuna_v1, + "vicuna_v1": conv_vicuna_v1, + "llama_2": conv_llama_2, + "mistral_instruct": conv_mistral_instruct, + "chatml_direct": conv_chatml_direct, + "mistral_direct": conv_chatml_direct, + "plain": conv_llava_plain, + "v0_plain": conv_llava_plain, + "llava_v0": conv_llava_v0, + "v0_mmtag": conv_llava_v0_mmtag, + "llava_v1": conv_llava_v1, + "v1_mmtag": conv_llava_v1_mmtag, + "llava_llama_2": conv_llava_llama_2, + "mpt": conv_mpt, +} diff --git a/ufo/llm/llm_call.py b/ufo/llm/llm_call.py index b47ac9e5..1872c3ff 100644 --- a/ufo/llm/llm_call.py +++ b/ufo/llm/llm_call.py @@ -12,7 +12,7 @@ def get_completion( - messages, agent: str = "APP", use_backup_engine: bool = True, configs = configs + messages, agent: str = "APP", use_backup_engine: bool = True, configs=configs ) -> Tuple[str, float]: """ Get completion for the given messages. @@ -23,14 +23,17 @@ def get_completion( """ responses, cost = get_completions( - messages, agent=agent, use_backup_engine=use_backup_engine, n=1, configs = configs + messages, agent=agent, use_backup_engine=use_backup_engine, n=1, configs=configs ) return responses[0], cost def get_completions( - messages, agent: str = "APP", use_backup_engine: bool = True, n: int = 1, - configs = configs + messages, + agent: str = "APP", + use_backup_engine: bool = True, + n: int = 1, + configs=configs, ) -> Tuple[list, float]: """ Get completions for the given messages. 
@@ -57,7 +60,9 @@ def get_completions(
     api_type = configs[agent_type]["API_TYPE"]
     try:
         api_type_lower = api_type.lower()
-        service = BaseService.get_service(api_type_lower)
+        service = BaseService.get_service(
+            api_type_lower, configs[agent_type]["API_MODEL"].lower()
+        )
         if service:
             response, cost = service(configs, agent_type=agent_type).chat_completion(
                 messages, n
diff --git a/ufo/module/basic.py b/ufo/module/basic.py
index 659f24bf..e83b46d7 100644
--- a/ufo/module/basic.py
+++ b/ufo/module/basic.py
@@ -115,7 +115,7 @@ def run(self) -> None:
             self.subtask_amount += 1

         self.agent.blackboard.add_requests(
-            {"request_{i}".format(i=self.id), self.request}
+            {"request_{i}".format(i=self.id): self.request}
         )

         if self.application_window is not None:
@@ -357,6 +357,7 @@ def __init__(self, task: str, should_evaluate: bool, id: int) -> None:
         self._context = Context()
         self._init_context()
         self._finish = False
+        self._results = {}

         self._host_agent: HostAgent = AgentFactory.create_agent(
             "host",
@@ -531,6 +532,22 @@ def current_round(self) -> BaseRound:
         else:
             return self._rounds[self.total_rounds - 1]

+    @property
+    def results(self) -> Dict[str, str]:
+        """
+        Get the evaluation results of the session.
+        :return: The evaluation results of the session.
+        """
+        return self._results
+
+    @results.setter
+    def results(self, value: Dict[str, str]) -> None:
+        """
+        Set the evaluation results of the session.
+        :param value: The evaluation results of the session.
+        """
+        self._results = value
+
     def experience_saver(self) -> None:
         """
         Save the current trajectory as agent experience.
@@ -647,6 +664,8 @@ def evaluation(self) -> None:
         additional_info = {"level": "session", "request": requests, "id": 0}
         result.update(additional_info)

+        self.results = result
+
         self.cost += cost
         evaluator.print_response(result)

diff --git a/ufo/module/context.py b/ufo/module/context.py
index 8a0d01d7..5d55d5ca 100644
--- a/ufo/module/context.py
+++ b/ufo/module/context.py
@@ -9,6 +9,8 @@

 from pywinauto.controls.uiawrapper import UIAWrapper

+from ufo.utils import is_json_serializable, print_with_color
+

 class ContextNames(Enum):
     """
@@ -283,7 +285,7 @@ def add_to_structural_logs(self, data: Dict[str, Any]) -> None:
         if round_key is None or subtask_key is None:
             return

-        remaining_items = {key: data[key] for key in data if key not in ["a", "b"]}
+        remaining_items = {key: data[key] for key in data}
         self._context[ContextNames.STRUCTURAL_LOGS.name][round_key][subtask_key].append(
             remaining_items
         )
@@ -310,9 +312,39 @@ def filter_structural_logs(
         else:
             raise TypeError(f"Keys should be a string or a list of strings.")

-    def to_dict(self) -> Dict[str, Any]:
+    def to_dict(self, ensure_serializable: bool = False) -> Dict[str, Any]:
         """
         Convert the context to a dictionary.
+        :param ensure_serializable: Ensure the context is serializable.
         :return: The dictionary of the context.
         """
-        return self._context
+
+        import copy
+
+        context_dict = copy.deepcopy(self._context)
+
+        if ensure_serializable:
+
+            for key in ContextNames:
+                if key.name in context_dict:
+                    if not is_json_serializable(context_dict[key.name]):
+                        print_with_color(
+                            f"Warning: The value of Context.{key.name} is not serializable.",
+                            "yellow",
+                        )
+                        context_dict[key.name] = None
+
+        return context_dict
+
+    def from_dict(self, context_dict: Dict[str, Any]) -> None:
+        """
+        Load the context from a dictionary.
+        :param context_dict: The dictionary of the context.
+ """ + for key in ContextNames: + if key.name in context_dict: + self._context[key.name] = context_dict.get(key.name) + + # Sync the current round step and cost + self._sync_round_values() diff --git a/ufo/module/sessions/plan_reader.py b/ufo/module/sessions/plan_reader.py index a970dd00..65177482 100644 --- a/ufo/module/sessions/plan_reader.py +++ b/ufo/module/sessions/plan_reader.py @@ -25,7 +25,7 @@ def __init__(self, plan_file: str): with open(plan_file, "r") as f: self.plan = json.load(f) self.remaining_steps = self.get_steps() - self.support_apps = ["word", "excel", "powerpoint"] + self.support_apps = ["WINWORD.EXE", "EXCEL.EXE", "POWERPNT.EXE"] def get_close(self) -> bool: """ @@ -119,8 +119,10 @@ def get_host_request(self) -> str: if object_name in self.support_apps: request = task else: - request = f"Open the application of {task}. You must output the selected application with their control text and label even if it is already open." - + request = ( + f"Your task is '{task}'. And open the application of {object_name}. " + "You must output the selected application with their control text and label even if it is already open." + ) return request def next_step(self) -> Optional[str]: diff --git a/ufo/module/sessions/session.py b/ufo/module/sessions/session.py index f7b800ab..50034399 100644 --- a/ufo/module/sessions/session.py +++ b/ufo/module/sessions/session.py @@ -179,6 +179,12 @@ def run(self) -> None: elif save_experience == "ask": if interactor.experience_asker(): self.experience_saver() + + elif save_experience == "auto": + task_completed = self.results.get("complete", "no") + if task_completed.lower() == "yes": + self.experience_saver() + elif save_experience == "always_not": pass @@ -237,10 +243,10 @@ def next_request(self) -> str: self._finish = True return request - def request_to_evaluate(self) -> bool: + def request_to_evaluate(self) -> str: """ - Check if the session should be evaluated. - :return: True if the session should be evaluated, False otherwise. + Get the request to evaluate. + return: The request(s) to evaluate. """ request_memory = self._host_agent.blackboard.requests return request_memory.to_json() @@ -328,10 +334,10 @@ def next_request(self) -> str: else: return self.plan_reader.next_step() - def request_to_evaluate(self) -> bool: + def request_to_evaluate(self) -> str: """ - Check if the session should be evaluated. - :return: True if the session should be evaluated, False otherwise. + Get the request to evaluate. + return: The request(s) to evaluate. """ return self.plan_reader.get_task() @@ -402,6 +408,7 @@ def next_request(self) -> str: """ if self.total_rounds == 0: + utils.print_with_color(self.plan_reader.get_host_request(), "cyan") return self.plan_reader.get_host_request() else: self._finish = True @@ -483,7 +490,9 @@ def setup_application_environment(self): if self.object_name: suffix = os.path.splitext(self.object_name)[1] self.app_name = self.get_app_name(suffix) + print("app_name:", self.app_name) if self.app_name not in self.support_apps: + print(f"The app {self.app_name} is not supported.") return # The app is not supported, so we don't need to setup the environment. file = self.plan_reader.get_file_path() code_snippet = f"import os\nos.system('start {self.app_name} \"{file}\"')" @@ -497,13 +506,12 @@ def setup_application_environment(self): except Exception as e: print(f"An error occurred: {e}") - def request_to_evaluate(self) -> bool: + def request_to_evaluate(self) -> str: """ - Check if the session should be evaluated. 
-        :return: True if the session should be evaluated, False otherwise.
+        Get the request to evaluate.
+        :return: The request(s) to evaluate.
         """
-        request_memory = self._host_agent.blackboard.requests
-        return request_memory.to_json()
+        return self.plan_reader.get_task()

     def record_task_done(self) -> None:
         """
diff --git a/ufo/prompter/agent_prompter.py b/ufo/prompter/agent_prompter.py
index cf9f98a1..792312d9 100644
--- a/ufo/prompter/agent_prompter.py
+++ b/ufo/prompter/agent_prompter.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.

 import json
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional

 from ufo.config.config import Config
 from ufo.prompter.basic import BasicPrompter
@@ -206,9 +206,7 @@ def __init__(
         if configs.get("USE_APIS", False):
             self.app_api_prompt_template = self.app_prompter.load_api_prompt()

-    def system_prompt_construction(
-        self, additional_examples: List[str] = [], tips: List[str] = []
-    ) -> str:
+    def system_prompt_construction(self, additional_examples: List[str] = []) -> str:
         """
         Construct the prompt for app selection.
         :param additional_examples: The additional examples added to the prompt.
@@ -217,16 +215,15 @@
         apis = self.api_prompt_helper(verbose=1)
         examples = self.examples_prompt_helper(additional_examples=additional_examples)

-        tips_prompt = "\n".join(tips)
-
-        # Remove empty lines
-        tips_prompt = "\n".join(filter(None, tips_prompt.split("\n")))
-
-        system_key = "system" if self.is_visual else "system_nonvisual"
+        if configs.get("ACTION_SEQUENCE", False):
+            system_key = "system_as"
+        else:
+            system_key = "system"
+        if not self.is_visual:
+            system_key += "_nonvisual"

-        return self.prompt_template[system_key].format(
-            apis=apis, examples=examples, tips=tips_prompt
-        )
+        return self.prompt_template[system_key].format(apis=apis, examples=examples)

     def user_prompt_construction(
         self,
@@ -238,6 +235,7 @@
         current_application: str,
         host_message: List[str],
         retrieved_docs: str = "",
+        last_success_actions: List[Dict[str, Any]] = [],
     ) -> str:
         """
         Construct the prompt for action selection.
@@ -250,6 +248,7 @@
         :param current_application: The current application.
         :param host_message: The host message.
         :param retrieved_docs: The retrieved documents.
+        :param last_success_actions: The list of successful actions in the last step.
         return: The prompt for action selection.
         """
         prompt = self.prompt_template["user"].format(
@@ -261,6 +260,7 @@
             current_application=current_application,
             host_message=json.dumps(host_message),
             retrieved_docs=retrieved_docs,
+            last_success_actions=json.dumps(last_success_actions),
         )

         return prompt
@@ -276,6 +276,7 @@ def user_content_construction(
         current_application: str,
         host_message: List[str],
         retrieved_docs: str = "",
+        last_success_actions: List[Dict[str, Any]] = [],
         include_last_screenshot: bool = True,
     ) -> List[Dict[str, str]]:
         """
@@ -318,6 +319,7 @@
                         current_application=current_application,
                         host_message=host_message,
                         retrieved_docs=retrieved_docs,
+                        last_success_actions=last_success_actions,
                     ),
                 }
             )
@@ -328,38 +330,79 @@ def examples_prompt_helper(
         self,
         header: str = "## Response Examples",
         separator: str = "Example",
-        additional_examples: List[str] = [],
+        additional_examples: List[Dict[str, Any]] = [],
     ) -> str:
         """
         Construct the prompt for examples.
         :param examples: The examples.
         :param header: The header of the prompt.
         :param separator: The separator of the prompt.
+        :param additional_examples: The additional examples added to the prompt.
         return: The prompt for examples.
         """

         template = """
        [User Request]:
            {request}
+        [Sub-Task]:
+            {subtask}
+        [Tips]:
+            {tips}
        [Response]:
            {response}"""

-        example_list = []
-
-        for key in self.example_prompt_template.keys():
-            if key.startswith("example"):
-                example = template.format(
-                    request=self.example_prompt_template[key].get("Request"),
-                    response=json.dumps(
-                        self.example_prompt_template[key].get("Response")
-                    ),
+        if configs.get("ACTION_SEQUENCE", False):
+            for example in additional_examples:
+                example["Response"] = self.action2action_sequence(
+                    example.get("Response", {})
                 )
-                example_list.append(example)

-        example_list += [json.dumps(example) for example in additional_examples]
+        example_dict = [
+            self.example_prompt_template[key]
+            for key in self.example_prompt_template.keys()
+            if key.startswith("example")
+        ] + additional_examples
+
+        example_list = []
+
+        for example in example_dict:
+            example_str = template.format(
+                request=example.get("Request"),
+                subtask=example.get("Sub-task"),
+                tips=example.get("Tips"),
+                response=json.dumps(example.get("Response")),
+            )
+            example_list.append(example_str)

         return self.retrived_documents_prompt_helper(header, separator, example_list)

+    @staticmethod
+    def action2action_sequence(response: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Replace the single-action keys in the response with a one-element "ActionList" key.
+        :param response: The response of a single-action example.
+        :return: The response in the action-sequence format.
+        """
+        action_list = [
+            {
+                "Function": response.get("Function", ""),
+                "Args": response.get("Args", {}),
+                "Status": response.get("Status", "CONTINUE"),
+                "ControlLabel": response.get("ControlLabel", ""),
+                "ControlText": response.get("ControlText", ""),
+            }
+        ]
+
+        # Delete the single-action keys in the response
+        from copy import deepcopy
+
+        response_copy = deepcopy(response)
+        for key in ["Function", "Args", "Status", "ControlLabel", "ControlText"]:
+            response_copy.pop(key, None)
+        response_copy["ActionList"] = action_list
+
+        return response_copy
+
     def api_prompt_helper(self, verbose: int = 1) -> str:
         """
         Construct the prompt for APIs.
diff --git a/ufo/prompter/basic.py b/ufo/prompter/basic.py
index 8e6e174f..81406afa 100644
--- a/ufo/prompter/basic.py
+++ b/ufo/prompter/basic.py
@@ -3,7 +3,7 @@

 import os
 from abc import ABC, abstractmethod
-from typing import Dict, List, Union
+from typing import Dict, List

 import yaml

diff --git a/ufo/prompter/eva_prompter.py b/ufo/prompter/eva_prompter.py
index 59bb8076..3a72b08a 100644
--- a/ufo/prompter/eva_prompter.py
+++ b/ufo/prompter/eva_prompter.py
@@ -228,13 +228,11 @@ def get_step_trajectory(self, log: Dict[str, str]) -> Dict[str, str]:
             "Step": log.get("Step"),
             "Observation": log.get("Observation"),
             "Thought": log.get("Thought"),
-            "ControlLabel": log.get("ControlLabel"),
-            "ControlText": log.get("ControlText"),
             "Plan": log.get("Plan"),
             "Comment": log.get("Comment"),
             "Action": log.get("Action"),
             "Application": log.get("Application"),
-            "Results": log.get("Results"),
+            # "Results": log.get("Results"),
         }

         return step_trajectory
diff --git a/ufo/prompter/experience_prompter.py b/ufo/prompter/experience_prompter.py
index 77780f69..93f9ffba 100644
--- a/ufo/prompter/experience_prompter.py
+++ b/ufo/prompter/experience_prompter.py
@@ -1,12 +1,12 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import json
 from typing import Any, Dict, List

 from ufo.prompter.basic import BasicPrompter
 from ufo.experience.experience_parser import ExperienceLogLoader
+from ufo.utils import print_with_color


 class ExperiencePrompter(BasicPrompter):
@@ -186,7 +186,13 @@ def examples_prompt_helper(

         for key in self.example_prompt_template.keys():
             if key.startswith("example"):
-                response = self.example_prompt_template[key].get("Response")
+                response = self.example_prompt_template[key].get("Response", {})
+                if not response:
+                    print_with_color(
+                        f"Warning: The Response of the example {key} is empty.", "yellow"
+                    )
+                    continue
+                response["Tips"] = self.example_prompt_template[key].get("Tips")

                 example = template.format(
                     request=self.example_prompt_template[key].get("Request"),
diff --git a/ufo/prompts/examples/nonvisual/app_agent_example_as.yaml b/ufo/prompts/examples/nonvisual/app_agent_example_as.yaml
new file mode 100644
index 00000000..e096fe4e
--- /dev/null
+++ b/ufo/prompts/examples/nonvisual/app_agent_example_as.yaml
@@ -0,0 +1,304 @@
+version: 1.0
+
+example1:
+  Request: |-
+    My name is Zac. Please send an email to jack@outlook.com to thank him for his contribution to the open source project.
+  Sub-task: |-
+    Compose an email to send to Jack (jack@outlook.com) to thank him for his contribution to the open source project on the outlook application, using the name Zac.
+  Response:
+    Observation: |-
+      The screenshot shows that I am on the Main Page of Outlook. The Main Page has a list of control items and emails received. The new email editing window is not opened. The last action took effect by opening the Outlook application.
+    Thought: |-
+      Based on the screenshots and the control item list, I need to click the New Email button to open a New Email window for the one-step action.
+    Actions:
+      - Function: click_input
+        Args: {"button": "left", "double": false}
+        ControlLabel: 1
+        ControlText: New Email
+        Status: CONTINUE
+    Plan:
+      - (1) Input the email address of the receiver.
+      - (2) Input the title of the email. I need to input 'Thanks for your contribution on the open source.'.
+      - (3) Input the content of the email. I need to input 'Dear Jack,\\nI hope this message finds you well. I am writing to express my sincere gratitude for your outstanding contribution to our open-source project. Your dedication and expertise have truly made a significant impact, and we are incredibly grateful to have you on board.\\nYour commitment to the open-source community has not gone unnoticed, and your recent contributions have been instrumental in enhancing the functionality and quality of our project. It's through the efforts of individuals like you that we are able to create valuable resources that benefit the community as a whole.\\nYour code reviews, bug fixes, and innovative ideas have not only improved the project but have also inspired others to contribute their best. We recognize and appreciate the time and effort you've invested in making our open-source initiative a success.\\nPlease know that your contributions are highly valued, and we look forward to continued collaboration with someone as talented and dedicated as yourself. If there's anything you need or if you have further ideas you'd like to discuss, please don't hesitate to reach out.\\nOnce again, thank you for your exceptional contributions. We are fortunate to have you as part of our open-source community.\\nBest regards,\\nZac'.
+      - (4) Click the Send button to send the email.
+    Comment: |-
+      After I click the New Email button, the New Email window will be opened and available for composing the email.
+    SaveScreenshot:
+      {"save": false, "reason": ""}
+  Tips: |-
+    - Sending an email is a sensitive action that needs to be confirmed by the user before the execution.
+    - You need to draft the content of the email and send it to the receiver.
+
+example2:
+  Request: |-
+    Draft an email to Amy to ask her how she feels about the new project.
+  Sub-task: |-
+    Draft an email to send to Amy (amy@gmail.com) to ask her how she feels about the new project on the outlook application.
+  Response:
+    Observation: |-
+      The screenshot shows that I am on the editing window of a new email, and the 'To', 'CC', 'Title' and 'Email Body' blocks are visible and ready to input. The last action took effect by opening the Outlook window and jumping directly to the new email editing window.
+    Thought: |-
+      Based on the previous plan, I need to click the New Email button to open a New Email window. But the screenshot shows that the New Email window has already opened. I can now take multiple actions of filling in the fields of 'To', 'Title' and 'Email Body' in a single step.
+    Actions:
+      - Function: set_edit_text
+        Args: {"text": "amy@gmail.com"}
+        ControlLabel: 33
+        ControlText: To
+        Status: CONTINUE
+      - Function: set_edit_text
+        Args: {"text": "Inquiry about the Feedback on the New Project"}
+        ControlLabel: 34
+        ControlText: Title
+        Status: CONTINUE
+      - Function: set_edit_text
+        Args: {"text": "Dear Amy,\\n\\nI hope this message finds you well. I am writing to ask how you feel about the new project. Let me know if you have any concerns.\\n\\nBest regards,\\n[Sender's Name]"}
+        ControlLabel: 36
+        ControlText: Email Body
+        Status: FINISH
+    Plan:
+      - (1) After I draft the content of the email, the task is finished. I do not need to send the email since the user did not ask me to send it.
+    Comment: |-
+      I revised the previous plan based on the screenshot, since I observed that the New Email window has already opened and the title of the email has already been inputted. I cannot input the email address since it is not provided in the user request. Since the user did not ask me to send the email, the task is finished after I draft the content of the email.
+    SaveScreenshot:
+      {"save": false, "reason": ""}
+  Tips: |-
+    - The user only asked me to draft an email to Amy to ask her how she feels about the new project. I do not need to send the email since the user did not ask me to send it.
+    - You need to draft the content of the email and send it to the receiver. Use polite language when drafting the email.
+
+
+example3:
+  Request: |-
+    Search for the word 'UFO' in the document.
+  Sub-task: |-
+    Search for the word 'UFO' in the document on the Word application.
+  Response:
+    Observation: |-
+      The screenshot shows that I am on the editing window of a Word file. The search box is visible and the word 'UFO' is already inputted. The previous action of inputting 'UFO' took effect based on the screenshot of the last step.
+    Thought: |-
+      Based on the screenshots, the word 'UFO' is already inputted in the Edit control named 'Find'. I need to click the Find button to search for the word 'UFO' in the document, and the task is finished.
+    Actions:
+      - Function: click_input
+        Args: {"button": "left", "double": false}
+        ControlLabel: 59
+        ControlText: Find
+        Status: FINISH
+    Plan:
+      -
+    Comment: |-
+      I searched for the word 'UFO' in the document successfully. The user should view the search result.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Tips: |-
+ - If there are steps already taken (e.g., the required text is already inputted in the edit box), you do not need to repeat them. You can skip the steps that have already been taken and move to the next step.
+
+
+example4:
+ Request: |-
+ Send a message to Tom on Teams including (1) the text extracted from framework.docx, and (2) a detailed description of the workflow of the framework in the image of framework.png.
+ Sub-task: |-
+ Extract the full text from the framework.docx.
+ Response:
+ Observation: |-
+ The screenshot shows that I am on the main window of the Word file named 'framework.docx'. The text of the file, which I am interested in, is visible in the screenshot. The last action took effect by opening the document successfully, judging from the previous screenshot. I need to save the screenshot, as the text of the document is needed for composing the message.
+ Thought: |-
+ Based on the screenshots and the control item list, I need to use the 'texts' action to get the text of the Document control named 'framework.docx'. Given the previous plan, this completes the partial user request on the framework.docx, and therefore I need to set the status as 'FINISH' immediately.
+ Actions:
+ - Function: texts
+ Args: {}
+ ControlLabel: 83
+ ControlText: framework - Word
+ Status: FINISH
+ Plan:
+ -
+ Comment: |-
+ The partial task on 'framework.docx' is completed once I take the current action. The current sub-task is completed, and we should switch to the image of framework.png to complete the next task.
+ SaveScreenshot:
+ {"save": true, "reason": "The text of the document in the screenshot is needed for composing the message in further steps."}
+ Tips: |-
+ - In the Word application, the main canvas is a Document control, and the text of the document can be extracted by using the 'texts' action.
+
+
+example5:
+ Request: |-
+ Quickly remove all notes in the presentation.pptx, without looping through each slide.
+ Sub-task: |-
+ Remove all notes in the presentation.pptx file quickly without looping through each slide.
+ Response:
+ Observation: |-
+ The current screenshot shows the PowerPoint application with the 'presentation.pptx' file open. The 'File Tab' button is visible, which is needed to access the backstage view where the 'Check for Issues' option is located. This matches the next step in the previous plan. The application appears to be in the normal editing view, and no backstage view or other menus are open at this time.
+ Thought: |-
+ The user requested to do it quickly. To make the task more efficient, I need to use some fast but niche tricks using the 'Remove All Presentation Notes' function. To do so, I need to first click the 'File' button to open the menu.
+ Actions:
+ - Function: click_input
+ Args: {"button": "left", "double": false}
+ ControlLabel: 59
+ ControlText: File
+ Status: CONTINUE
+ Plan:
+ - (1) Click the 'Info' ListItem, to access the 'Info' page.
+ - (2) Click the 'Check for Issues' MenuItem, to open a menu for checking.
+ - (3) Click the 'Inspect Document', to open a document inspector.
+ - (4) Click the 'Inspect' button, to inspect the ppt for notes.
+ - (5) I should now find and click the 'Remove All Presentation Notes'. However, this button is usually at the bottom of the menu, and therefore I should apply wheel_mouse_input(wheel_dist=-20) to a ScrollBar to reach the menu bottom to make this button visible.
+ - (6) Once 'Remove All Presentation Notes' is visible, I should click it to remove all notes in the presentation.pptx file. This action is sensitive to the system and needs to be confirmed by the user.
+ Comment: |-
+ I plan to use the 'Remove All Presentation Notes' function. This is the fastest way to remove all notes in the presentation.pptx file.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Tips: |-
+ - The 'Remove All Presentation Notes' function is the fastest way to remove all notes in the presentation.pptx file. I should use this function to complete the task quickly instead of looping through each slide.
+ - The 'Remove All Presentation Notes' is usually at the bottom of the menu, and I should apply wheel_mouse_input(wheel_dist=-20) to a ScrollBar to reach the menu bottom to make this button visible.
+
+
+example6:
+ Request: |-
+ How many stars does the Imdiffusion repo have?
+ Sub-task: |-
+ Google search for the Imdiffusion repo on github and summarize the number of stars on the Imdiffusion repo page visually.
+ Response:
+ Observation: |-
+ I observe that the Edge browser is visible in the screenshot, with the Google search page opened.
+ Thought: |-
+ I need to input the text 'Imdiffusion GitHub' in the search box of Google to get to the Imdiffusion repo page from the search results. The search box is usually in a type of ComboBox. Then, I can click the "Search" button to search for the Imdiffusion repo on GitHub at the same step.
+ Actions:
+ - Function: set_edit_text
+ Args: {"text": "Imdiffusion GitHub"}
+ ControlLabel: 36
+ ControlText: 搜索
+ Status: CONTINUE
+ - Function: click_input
+ Args: {"button": "left", "double": false}
+ ControlLabel: 18
+ ControlText: 搜一搜
+ Status: CONTINUE
+ Plan:
+ - (1) Once the searched results are visible, click the Imdiffusion repo Hyperlink in the searched results to open the repo page.
+ - (2) Observe and summarize the number of stars on the Imdiffusion repo page, and reply to the user request.
+ Comment: |-
+ I plan to use Google search for the Imdiffusion repo on github and summarize the number of stars on the Imdiffusion repo page visually.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Tips: |-
+ - The search box is usually in a type of ComboBox.
+ - The number of stars of a Github repo page can be found on the repo page visually.
+
+
+example7:
+ Request: |-
+ Please remind me to get party dinner (5 people) preparation done before 5PM today with steps and notes.
+ Sub-task: |-
+ Add a task of 'Get party dinner (5 people) preparation done before 5PM today.' to the Microsoft To Do application, and set more details for the task, including adding steps and notes.
+ Response:
+ Observation: |-
+ The current screenshot shows that I am on the Microsoft To Do application. The application is open and there is an edit block at the bottom for inputting the task.
+ Thought: |-
+ I need to add a task of 'Get party dinner (5 people) preparation done before 5PM today.' to the edit block of 'Add a task in “My Day”'. After adding the task, I need to press the 'ENTER' key to submit the task.
+ Actions:
+ - Function: set_edit_text
+ Args: {"text": "Get party dinner (5 people) preparation done before 5PM today."}
+ ControlLabel: 19
+ ControlText: Add a task in “My Day”
+ Status: CONTINUE
+ - Function: keyboard_input
+ Args: {"keys": "{ENTER}", "control_focus": true}
+ ControlLabel: 19
+ ControlText: Add a task in “My Day”
+ Status: CONTINUE
+ Plan:
+ - (1) Click the ListItem of the task that was just created to set more details.
+ - (2) Add a step 1 for the task by inputting 'Menu Planning and Ingredient Acquisition' in the Add step edit control.
+ - (3) Add a step 2 for the task by inputting 'Meal Preparation Schedule' in the Add step edit control following step one.
+ - (4) Add a step 3 for the task by inputting 'Setting Up the Dining Area' in the Add step edit control following step two.
+ - (5) Add a note to this task in the 'Add note' edit block at the bottom by inputting 'Considering preferences, dietary restrictions and food allergies of the guests.'.
+ - (6) Close the details view by clicking the 'Dismiss detail view' button.
+ Comment: |-
+ After creating the task, I need to set more details for the reminder, including adding steps and notes.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Tips: |-
+ - You need to add more details for the task, including adding steps and notes.
+ - You need to add each step one by one and add notes to the task.
+
+
+
+example8:
+ Request: |-
+ Please create a slide from the meeting_notes.docx in the presentation1.pptx.
+ Sub-task: |-
+ Create a slide from the meeting_notes.docx in the presentation1.pptx file using the Copilot Add-in in the Microsoft PowerPoint application.
+ Response:
+ Observation: |-
+ The current screenshot shows that I am on the Microsoft PowerPoint application. The Copilot Add-in is visible in the screenshot.
+ Thought: |-
+ I need to click the Copilot Add-in, which can help me create a slide from the meeting_notes.docx. The Copilot Add-in can directly create a slide from the meeting_notes.docx.
+ Actions:
+ - Function: click_input
+ Args: {"button": "left", "double": false}
+ ControlLabel: 60
+ ControlText: Copilot
+ Status: CONTINUE
+ Plan:
+ - (1) Click the 'Create presentation from file' button to open a file picker.
+ - (2) When the 'meeting_notes' is visible in the file picker, select and click the "meeting_notes" ListItem to create a slide.
+ - (3) Click the 'Send' button to send the request to Copilot. This completes the task.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Comment: |-
+ I need to use the Copilot Add-in to create a slide from the meeting_notes.docx.
+ Tips: |-
+ - The Copilot Add-in can directly create a slide from the meeting_notes.docx. You need to use the Copilot Add-in to complete the task, instead of manually creating a slide from the meeting_notes.docx.
+
+
+example9:
+ Request: |-
+ Add a title slide to the presentation.pptx on its first slide with the title 'Project Update'.
+ Sub-task: |-
+ Add a title slide to the presentation.pptx on its first slide with the title 'Project Update'.
+ Response:
+ Observation: |-
+ The current screenshot shows that I am on the Microsoft PowerPoint application. The first slide of the presentation.pptx is visible in the screenshot and a title text box is on the top of the slide.
+ Thought: |-
+ I need to input the title 'Project Update' in the title text box of the first slide of the presentation.pptx. The title text box is on the canvas which is not a control item, thus I need to first estimate the relative fractional x and y coordinates of the point to click on and activate the title text box. The estimated coordinates of the point to click on are (0.35, 0.4).
+ Actions:
+ - Function: click_on_coordinates
+ Args: {"x": 0.35, "y": 0.4, "button": "left", "double": false}
+ ControlLabel: ""
+ ControlText: ""
+ Status: CONTINUE
+ Plan:
+ - (1) Input the title 'Project Update' in the title text box of the first slide of the presentation.pptx.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Comment: |-
+ I need to estimate the relative fractional x and y coordinates of the point to click on and activate the title text box, so that I can input the title 'Project Update'.
+ Tips: |-
+ - If the control item is not available in the control item list and screenshot, you can use the 'click_on_coordinates' API to click on a specific point in the application window.
+
+
+example10:
+ Request: |-
+ Fill in the information for the top 3 events one by one in the forms of the private Event Bookings web page.
+ Sub-task: |-
+ Fill out the form on the 'Private Event Bookings' web page with the extracted information for the top 3 events, one by one.
+ Response:
+ Observation: |-
+ The screenshot shows that I am on the 'Private Event Bookings' web page. The form for booking a private event is visible, and the first field of 'Event Type' has a default value of 'Wedding'.
+ Thought: |-
+ I need to first input the information for the 'Event Type' field, which is 'Restaurant Reservation'. However, the 'Event Type' field is already filled with 'Wedding'. I need to first click the 'Event Type' field to open the dropdown list and select 'Restaurant Reservation'.
+ Actions:
+ - Function: click_input
+ Args: {"button": "left", "double": false}
+ ControlLabel: 70
+ ControlText: Event Type
+ Status: CONTINUE
+ Plan:
+ - (1) Select 'Restaurant Reservation' from the dropdown list.
+ - (2) Input the information for the 'Event Name' field, which is 'Birthday Party'.
+ - (3) Input the information for the 'Event Date' field, which is '2022-12-25'.
+ - (4) Input the information for the 'Event Time' field, which is '18:00'.
+ - (5) Click the 'Submit' button to submit the form.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Comment: |-
+ I need to first click the 'Event Type' field to open the dropdown list and select 'Restaurant Reservation' to change the default value of 'Wedding'.
+ Tips: |-
+ - If the field is already filled with a default value, you need to first click on the field to open the dropdown list and select the correct value.
\ No newline at end of file
diff --git a/ufo/prompts/examples/visual/app_agent_example.yaml
index f1dce115..8c96c7b1 100644
--- a/ufo/prompts/examples/visual/app_agent_example.yaml
+++ b/ufo/prompts/examples/visual/app_agent_example.yaml
@@ -66,7 +66,7 @@ example2:
 SaveScreenshot:
 {"save": false, "reason": ""}
 Tips: |-
- - Sending an email is a sensitive action that needs to be confirmed by the user before the execution.
+ - The user only asked me to draft an email to Amy to ask her how she feels about the new project. I do not need to send the email since the user did not ask me to send it.
 - You need to draft the content of the email and send it to the receiver. Use polite language when drafting the email.
@@ -326,4 +326,8 @@ example10:
 - (4) Input the information for the 'Event Time' field, which is '18:00'.
 - (5) Click the 'Submit' button to submit the form.
 Comment: |-
- I need to first click the 'Event Type' field to open the dropdown list and select 'Restaurant Reservation' to change the default value of 'Wedding'.
\ No newline at end of file
+ I need to first click the 'Event Type' field to open the dropdown list and select 'Restaurant Reservation' to change the default value of 'Wedding'.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Tips: |-
+ - If the field is already filled with a default value, you need to first click on the field to open the dropdown list and select the correct value.
\ No newline at end of file
diff --git a/ufo/prompts/examples/visual/app_agent_example_as.yaml
new file mode 100644
index 00000000..d9490201
--- /dev/null
+++ b/ufo/prompts/examples/visual/app_agent_example_as.yaml
@@ -0,0 +1,305 @@
+version: 1.0
+
+example1:
+ Request: |-
+ My name is Zac. Please send an email to jack@outlook.com to thank him for his contribution to the open source project.
+ Sub-task: |-
+ Compose an email to send to Jack (jack@outlook.com) to thank him for his contribution to the open source project on the outlook application, using the name Zac.
+ Response:
+ Observation: |-
+ The screenshot shows that I am on the Main Page of Outlook. The Main Page has a list of control items and received emails. The new email editing window is not opened. The last action took effect by opening the Outlook application.
+ Thought: |-
+ Based on the screenshots and the control item list, I need to click the New Email button to open a New Email window for the one-step action.
+ ActionList:
+ - Function: click_input
+ Args: {"button": "left", "double": false}
+ ControlLabel: 1
+ ControlText: New Email
+ Status: CONTINUE
+ Plan:
+ - (1) Input the email address of the receiver.
+ - (2) Input the title of the email. I need to input 'Thanks for your contribution on the open source.'.
+ - (3) Input the content of the email. I need to input 'Dear Jack,\\nI hope this message finds you well. I am writing to express my sincere gratitude for your outstanding contribution to our open-source project. Your dedication and expertise have truly made a significant impact, and we are incredibly grateful to have you on board.\\nYour commitment to the open-source community has not gone unnoticed, and your recent contributions have been instrumental in enhancing the functionality and quality of our project. It's through the efforts of individuals like you that we are able to create valuable resources that benefit the community as a whole.\\nYour code reviews, bug fixes, and innovative ideas have not only improved the project but have also inspired others to contribute their best. We recognize and appreciate the time and effort you've invested in making our open-source initiative a success.\\nPlease know that your contributions are highly valued, and we look forward to continued collaboration with someone as talented and dedicated as yourself. If there's anything you need or if you have further ideas you'd like to discuss, please don't hesitate to reach out.\\nOnce again, thank you for your exceptional contributions. We are fortunate to have you as part of our open-source community.\\nBest regards,\\nZac'.
+ - (4) Click the Send button to send the email.
+ Comment: |-
+ After I click the New Email button, the New Email window will be opened and available for composing the email.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Tips: |-
+ - Sending an email is a sensitive action that needs to be confirmed by the user before the execution.
+ - You need to draft the content of the email and send it to the receiver.
+
+example2:
+ Request: |-
+ Draft an email to Amy to ask her how she feels about the new project.
+ Sub-task: |-
+ Draft an email to send to Amy (amy@gmail.com) to ask her how she feels about the new project on the outlook application.
+ Response:
+ Observation: |-
+ The screenshot shows that I am on the editing window of a new email, and the 'To', 'CC', 'Title' and 'Email Body' blocks are visible and ready to input. The last action took effect by opening the Outlook window and jumping to the new email editing window directly.
+ Thought: |-
+ Based on the previous plan, I need to click the New Email button to open a New Email window. But the screenshot shows that the New Email window has already opened. I can now take multiple actions of filling the fields of 'To', 'Title' and 'Email Body' in a single step.
+ ActionList:
+ - Function: set_edit_text
+ Args: {"text": "amy@gmail.com"}
+ ControlLabel: 33
+ ControlText: To
+ Status: CONTINUE
+ - Function: set_edit_text
+ Args: {"text": "Inquiry about the Feedback on the New Project"}
+ ControlLabel: 34
+ ControlText: Title
+ Status: CONTINUE
+ - Function: set_edit_text
+ Args: {"text": "Dear Amy,\\n\\nI hope this message finds you well. I am writing to ask how you feel about the new project. Let me know if you have any concerns.\\n\\nBest regards,\\n[Sender's Name]"}
+ ControlLabel: 36
+ ControlText: Email Body
+ Status: FINISH
+ Plan:
+ - (1) After I draft the content of the email, the task is finished. I do not need to send the email since the user did not ask me to send it.
+ Comment: |-
+ I revised the previous plan based on the screenshot since I observed that the New Email window has already opened and the title of the email has already been inputted. I cannot input the email address since it is not provided in the user request. Since the user did not ask me to send the email, the task is finished after I draft the content of the email.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Tips: |-
+ - The user only asked me to draft an email to Amy to ask her how she feels about the new project. I do not need to send the email since the user did not ask me to send it.
+ - You need to draft the content of the email and send it to the receiver. Use polite language when drafting the email.
+
+
+example3:
+ Request: |-
+ Search for the word 'UFO' in the document.
+ Sub-task: |-
+ Search for the word 'UFO' in the document on the Word application.
+ Response:
+ Observation: |-
+ The screenshot shows that I am on the editing window of a Word file. The search box is visible and the word 'UFO' is already inputted. The previous action of inputting 'UFO' took effect based on the screenshot of the last step.
+ Thought: |-
+ Based on the screenshots, the word 'UFO' is already inputted in the Edit control named 'Find'. I need to click the Find button to search for the word 'UFO' in the document, and the task is finished.
+ ActionList:
+ - Function: click_input
+ Args: {"button": "left", "double": false}
+ ControlLabel: 59
+ ControlText: Find
+ Status: FINISH
+ Plan:
+ -
+ Comment: |-
+ I searched for the word 'UFO' in the document successfully. The user should view the search result.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Tips: |-
+ - If there are steps already taken (e.g., the required text is already inputted in the edit box), you do not need to repeat them. You can skip the steps that have already been taken and move to the next step.
+
+
+example4:
+ Request: |-
+ Send a message to Tom on Teams including (1) the text extracted from framework.docx, and (2) a detailed description of the workflow of the framework in the image of framework.png.
+ Sub-task: |-
+ Extract the full text from the framework.docx.
+ Response:
+ Observation: |-
+ The screenshot shows that I am on the main window of the Word file named 'framework.docx'. The text of the file, which I am interested in, is visible in the screenshot. The last action took effect by opening the document successfully, judging from the previous screenshot. I need to save the screenshot, as the text of the document is needed for composing the message.
+ Thought: |-
+ Based on the screenshots and the control item list, I need to use the 'texts' action to get the text of the Document control named 'framework.docx'. Given the previous plan, this completes the partial user request on the framework.docx, and therefore I need to set the status as 'FINISH' immediately.
+ ActionList:
+ - Function: texts
+ Args: {}
+ ControlLabel: 83
+ ControlText: framework - Word
+ Status: FINISH
+ Plan:
+ -
+ Comment: |-
+ The partial task on 'framework.docx' is completed once I take the current action. The current sub-task is completed, and we should switch to the image of framework.png to complete the next task.
+ SaveScreenshot:
+ {"save": true, "reason": "The text of the document in the screenshot is needed for composing the message in further steps."}
+ Tips: |-
+ - In the Word application, the main canvas is a Document control, and the text of the document can be extracted by using the 'texts' action.
+
+
+example5:
+ Request: |-
+ Quickly remove all notes in the presentation.pptx, without looping through each slide.
+ Sub-task: |-
+ Remove all notes in the presentation.pptx file quickly without looping through each slide.
+ Response:
+ Observation: |-
+ The current screenshot shows the PowerPoint application with the 'presentation.pptx' file open. The 'File Tab' button is visible, which is needed to access the backstage view where the 'Check for Issues' option is located. This matches the next step in the previous plan. The application appears to be in the normal editing view, and no backstage view or other menus are open at this time.
+ Thought: |-
+ The user requested to do it quickly. To make the task more efficient, I need to use some fast but niche tricks using the 'Remove All Presentation Notes' function. To do so, I need to first click the 'File' button to open the menu.
+ ActionList:
+ - Function: click_input
+ Args: {"button": "left", "double": false}
+ ControlLabel: 59
+ ControlText: File
+ Status: CONTINUE
+ Plan:
+ - (1) Click the 'Info' ListItem, to access the 'Info' page.
+ - (2) Click the 'Check for Issues' MenuItem, to open a menu for checking.
+ - (3) Click the 'Inspect Document', to open a document inspector.
+ - (4) Click the 'Inspect' button, to inspect the ppt for notes.
+ - (5) I should now find and click the 'Remove All Presentation Notes'. However, this button is usually at the bottom of the menu, and therefore I should apply wheel_mouse_input(wheel_dist=-20) to a ScrollBar to reach the menu bottom to make this button visible.
+ - (6) Once 'Remove All Presentation Notes' is visible, I should click it to remove all notes in the presentation.pptx file. This action is sensitive to the system and needs to be confirmed by the user.
+ Comment: |-
+ I plan to use the 'Remove All Presentation Notes' function. This is the fastest way to remove all notes in the presentation.pptx file.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Tips: |-
+ - The 'Remove All Presentation Notes' function is the fastest way to remove all notes in the presentation.pptx file. I should use this function to complete the task quickly instead of looping through each slide.
+ - The 'Remove All Presentation Notes' is usually at the bottom of the menu, and I should apply wheel_mouse_input(wheel_dist=-20) to a ScrollBar to reach the menu bottom to make this button visible.
+
+
+example6:
+ Request: |-
+ How many stars does the Imdiffusion repo have?
+ Sub-task: |-
+ Google search for the Imdiffusion repo on github and summarize the number of stars on the Imdiffusion repo page visually.
+ Response:
+ Observation: |-
+ I observe that the Edge browser is visible in the screenshot, with the Google search page opened.
+ Thought: |-
+ I need to input the text 'Imdiffusion GitHub' in the search box of Google to get to the Imdiffusion repo page from the search results. The search box is usually in a type of ComboBox. Then, I can click the "Search" button to search for the Imdiffusion repo on GitHub at the same step.
+ ActionList:
+ - Function: set_edit_text
+ Args: {"text": "Imdiffusion GitHub"}
+ ControlLabel: 36
+ ControlText: 搜索
+ Status: CONTINUE
+ - Function: click_input
+ Args: {"button": "left", "double": false}
+ ControlLabel: 18
+ ControlText: 搜一搜
+ Status: CONTINUE
+ Plan:
+ - (1) Once the searched results are visible, click the Imdiffusion repo Hyperlink in the searched results to open the repo page.
+ - (2) Observe and summarize the number of stars on the Imdiffusion repo page, and reply to the user request.
+ Comment: |-
+ I plan to use Google search for the Imdiffusion repo on github and summarize the number of stars on the Imdiffusion repo page visually.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Tips: |-
+ - The search box is usually in a type of ComboBox.
+ - The number of stars of a Github repo page can be found on the repo page visually.
+
+
+example7:
+ Request: |-
+ Please remind me to get party dinner (5 people) preparation done before 5PM today with steps and notes.
+ Sub-task: |-
+ Add a task of 'Get party dinner (5 people) preparation done before 5PM today.' to the Microsoft To Do application, and set more details for the task, including adding steps and notes.
+ Response:
+ Observation: |-
+ The current screenshot shows that I am on the Microsoft To Do application. The application is open and there is an edit block at the bottom for inputting the task.
+ Thought: |-
+ I need to add a task of 'Get party dinner (5 people) preparation done before 5PM today.' to the edit block of 'Add a task in “My Day”'. After adding the task, I need to press the 'ENTER' key to submit the task.
+ ActionList:
+ - Function: set_edit_text
+ Args: {"text": "Get party dinner (5 people) preparation done before 5PM today."}
+ ControlLabel: 19
+ ControlText: Add a task in “My Day”
+ Status: CONTINUE
+ - Function: keyboard_input
+ Args: {"keys": "{ENTER}", "control_focus": true}
+ ControlLabel: 19
+ ControlText: Add a task in “My Day”
+ Status: CONTINUE
+ Plan:
+ - (1) Click the ListItem of the task that was just created to set more details.
+ - (2) Add a step 1 for the task by inputting 'Menu Planning and Ingredient Acquisition' in the Add step edit control.
+ - (3) Add a step 2 for the task by inputting 'Meal Preparation Schedule' in the Add step edit control following step one.
+ - (4) Add a step 3 for the task by inputting 'Setting Up the Dining Area' in the Add step edit control following step two.
+ - (5) Add a note to this task in the 'Add note' edit block at the bottom by inputting 'Considering preferences, dietary restrictions and food allergies of the guests.'.
+ - (6) Close the details view by clicking the 'Dismiss detail view' button.
+ Comment: |-
+ After creating the task, I need to set more details for the reminder, including adding steps and notes.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Tips: |-
+ - You need to add more details for the task, including adding steps and notes.
+ - You need to add each step one by one and add notes to the task.
+
+
+
+example8:
+ Request: |-
+ Please create a slide from the meeting_notes.docx in the presentation1.pptx.
+ Sub-task: |-
+ Create a slide from the meeting_notes.docx in the presentation1.pptx file using the Copilot Add-in in the Microsoft PowerPoint application.
+ Response:
+ Observation: |-
+ The current screenshot shows that I am on the Microsoft PowerPoint application. The Copilot Add-in is visible in the screenshot.
+ Thought: |-
+ I need to click the Copilot Add-in, which can help me create a slide from the meeting_notes.docx. The Copilot Add-in can directly create a slide from the meeting_notes.docx.
+ ActionList:
+ - Function: click_input
+ Args: {"button": "left", "double": false}
+ ControlLabel: 60
+ ControlText: Copilot
+ Status: CONTINUE
+ Plan:
+ - (1) Click the 'Create presentation from file' button to open a file picker.
+ - (2) When the 'meeting_notes' is visible in the file picker, select and click the "meeting_notes" ListItem to create a slide.
+ - (3) Click the 'Send' button to send the request to Copilot. This completes the task.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Comment: |-
+ I need to use the Copilot Add-in to create a slide from the meeting_notes.docx.
+ Tips: |-
+ - The Copilot Add-in can directly create a slide from the meeting_notes.docx. You need to use the Copilot Add-in to complete the task, instead of manually creating a slide from the meeting_notes.docx.
+
+
+example9:
+ Request: |-
+ Add a title slide to the presentation.pptx on its first slide with the title 'Project Update'.
+ Sub-task: |-
+ Add a title slide to the presentation.pptx on its first slide with the title 'Project Update'.
+ Response:
+ Observation: |-
+ The current screenshot shows that I am on the Microsoft PowerPoint application. The first slide of the presentation.pptx is visible in the screenshot and a title text box is on the top of the slide.
+ Thought: |-
+ I need to input the title 'Project Update' in the title text box of the first slide of the presentation.pptx. The title text box is on the canvas which is not a control item, thus I need to first estimate the relative fractional x and y coordinates of the point to click on and activate the title text box. The estimated coordinates of the point to click on are (0.35, 0.4).
+ ActionList:
+ - Function: click_on_coordinates
+ Args: {"x": 0.35, "y": 0.4, "button": "left", "double": false}
+ ControlLabel: ""
+ ControlText: ""
+ Status: CONTINUE
+ Plan:
+ - (1) Input the title 'Project Update' in the title text box of the first slide of the presentation.pptx.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Comment: |-
+ I need to estimate the relative fractional x and y coordinates of the point to click on and activate the title text box, so that I can input the title 'Project Update'.
+ Tips: |-
+ - If the control item is not available in the control item list and screenshot, you can use the 'click_on_coordinates' API to click on a specific point in the application window.
+
+
+example10:
+ Request: |-
+ Fill in the information for the top 3 events one by one in the forms of the private Event Bookings web page.
+ Sub-task: |-
+ Fill out the form on the 'Private Event Bookings' web page with the extracted information for the top 3 events, one by one.
+ Response:
+ Observation: |-
+ The screenshot shows that I am on the 'Private Event Bookings' web page. The form for booking a private event is visible, and the first field of 'Event Type' has a default value of 'Wedding'.
+ Thought: |-
+ I need to first input the information for the 'Event Type' field, which is 'Restaurant Reservation'. However, the 'Event Type' field is already filled with 'Wedding'. I need to first click the 'Event Type' field to open the dropdown list and select 'Restaurant Reservation'.
+ ActionList:
+ - Function: click_input
+ Args: {"button": "left", "double": false}
+ ControlLabel: 70
+ ControlText: Event Type
+ Status: CONTINUE
+ Plan:
+ - (1) Select 'Restaurant Reservation' from the dropdown list.
+ - (2) Input the information for the 'Event Name' field, which is 'Birthday Party'.
+ - (3) Input the information for the 'Event Date' field, which is '2022-12-25'.
+ - (4) Input the information for the 'Event Time' field, which is '18:00'.
+ - (5) Click the 'Submit' button to submit the form.
+ SaveScreenshot:
+ {"save": false, "reason": ""}
+ Comment: |-
+ I need to first click the 'Event Type' field to open the dropdown list and select 'Restaurant Reservation' to change the default value of 'Wedding'.
+ Tips: |-
+ - If the field is already filled with a default value, you need to first click on the field to open the dropdown list and select the correct value.
\ No newline at end of file
diff --git a/ufo/prompts/share/base/api.yaml
index 64c66fdd..47427700 100644
--- a/ufo/prompts/share/base/api.yaml
+++ b/ufo/prompts/share/base/api.yaml
@@ -4,7 +4,7 @@ click_input:
 class_name: |-
 ClickInputCommand
 usage: |-
- [1] API call: click_input(button: str, double: bool)
+ [1] API call: click_input(button: str, double: bool = False, pressed: str = None)
 [2] Args:
 - button: The mouse button to click. One of ''left'', ''right'', ''middle'' or ''x'' (Default: ''left'')
 - double: Whether to perform a double click or not (Default: False)'
diff --git a/ufo/prompts/share/base/app_agent.yaml
index 1d3d99a8..f2ade3b8 100644
--- a/ufo/prompts/share/base/app_agent.yaml
+++ b/ufo/prompts/share/base/app_agent.yaml
@@ -8,7 +8,7 @@ system: |-
 - You are provided the user request history for reference to decide the next step. These requests are the requests that you have completed before.
 - You are provided the [Step Trajectories Completed Previously], including historical actions, thoughts, and results of your previous steps for reference to decide the next step.
 - You are provided the blackboard, which records the information that you have saved at the previous steps, such as historical screenshots, thoughts. You may need to use them as reference for the next action.
- - You are required to select the control item and take **one-step** action on it to complete the sub-task for one step.
+ - You are required to select the control item and take **one-step** action on it to complete the sub-task.
 ## On screenshots
 - You are provided two versions of screenshots of the current application in a single image, one with annotation (right) and one without annotation (left).
@@ -32,7 +32,7 @@ system: |-
 - You are required to decide the status of the task after taking the current action, choose from the following actions, and fill in the "Status" field in the response.
- "CONTINUE": means the task is not finished and need further action. - "FINISH": means the current subtask is finished for the AppAgent in current application and no further actions are required, even there are more sub-tasks in the user request. - - "FAIL": means that you believe the task cannot be completed due to the current application state, incorrect application, or other reasons. You should provide a reason in the "Comment" field in the response. + - "FAIL": means that you believe the task cannot be completed due to the current application state, incorrect application, or other reasons. Alternatively, if you find the action repeated multiple times and not effective, you can also choose "FAIL". - "CONFIRM": means the current one-step action you are taking is sensitive to the system and need to be confirmed by the user before its execution. This does not apply to future actions after the current step. Below are some examples of sensitive actions, but they are not limited to these cases: [1] Taking the "Send" action for a message or email: The sending action (e.g. clicking the send button) is sensitive to the system and as the message or email is sent, it can not be retrieved. Thus, the user need to confirm the sending action. Note that inputting the message or email is not sensitive, but clicking the send button is sensitive. @@ -79,11 +79,10 @@ system: |- - When inputting the searched text on Google, you must use the Search Box, which is a ComboBox type of control item. Do not use the address bar to input the searched text. - You are given the help documents of the application or/and the online search results for completing the sub-task. You may use them to help you think about the next step and construct your planning. These information are for reference only, and may not be relevant, accurate or up-to-date. - The "UserConfirm" field in the action trajectory in the Blackboard is used to record the user's confirmation of the sensitive action. If the user confirms the action, the value of "UserConfirm" will be set to "Yes" and the action was executed. If the user does not confirm the action, the value of "UserConfirm" will be set to "No" and the action was not executed. + - If you see current application window pop-up a sub-window, but controls in the sub-window are not annotated in the screenshot, you can set the "Status" to "FINISH". This will allow the HostAgent to switch to the sub-window and continue the task. - User request and sub-task are different. Your working scope is limited to the current application window for the assigned sub-task. If you have completed the current sub-task and need to switch to another application window to complete the full user request, you MUST output "FINISH" in the "Status" field in the response. - Please review the [Step Trajectories Completed Previously] carefully to ensure that you are not repeating the same actions that have been taken before. - - ## Here are other tips for you to complete the sub-task: - {tips} + - You are also given . Each action contains the control text, the function, arguments, and the results of the action. The "RepeatTimes" indicates the number of times the action has been repeated. If the action been repeated (RepeatTimes>0), please consider not to repeat the action again at the current step, since it has been taken previously but not effective. 
 {examples}
@@ -116,7 +115,7 @@ system_nonvisual: |-
 - You are required to decide the status of the task after taking the current action, choose from the following actions, and fill in the "Status" field in the response.
 - "CONTINUE": means the task is not finished and need further action.
 - "FINISH": means the current subtask is finished for the AppAgent in current application and no further actions are required, even there are more sub-tasks in the user request.
- - "FAIL": means that you believe the task cannot be completed due to the current application state, incorrect application, or other reasons. You should provide a reason in the "Comment" field in the response.
+ - "FAIL": means that you believe the task cannot be completed due to the current application state, incorrect application, or other reasons. Alternatively, if you find that the action has been repeated multiple times without taking effect, you can also choose "FAIL".
 - "CONFIRM": means the current one-step action you are taking is sensitive to the system and need to be confirmed by the user before its execution. This does not apply to future actions after the current step. Below are some examples of sensitive actions, but they are not limited to these cases:
 [1] Taking the "Send" action for a message or email:
 The sending action (e.g. clicking the send button) is sensitive to the system and as the message or email is sent, it can not be retrieved. Thus, the user need to confirm the sending action. Note that inputting the message or email is not sensitive, but clicking the send button is sensitive.
@@ -164,9 +163,8 @@ system_nonvisual: |-
 - The "UserConfirm" field in the action trajectory in the Blackboard is used to record the user's confirmation of the sensitive action. If the user confirms the action, the value of "UserConfirm" will be set to "Yes" and the action was executed. If the user does not confirm the action, the value of "UserConfirm" will be set to "No" and the action was not executed.
 - User request and sub-task are different. Your working scope is limited to the current application window for the assigned sub-task. If you have completed the current sub-task and need to switch to another application window to complete the full user request, you MUST output "FINISH" in the "Status" field in the response.
 - Please review the [Step Trajectories Completed Previously] carefully to ensure that you are not repeating the same actions that have been taken before.
+ - You are also given the last successful actions. Each action contains the control text, the function, arguments, and the results of the action. The "RepeatTimes" indicates the number of times the action has been repeated. If the action has been repeated (RepeatTimes>0), please consider not repeating it again at the current step, since it has been taken previously but was not effective.
-
- ## Here are other tips for you to complete the sub-task:
- {tips}
 {examples}
@@ -179,7 +177,109 @@ user: |-
 {user_request}
 {prev_subtask}
 {subtask}
+ {last_success_actions}
 {current_application}
- {host_message}
+ {host_message}
 {prev_plan}
+
+system_as: |-
+ - You are the AppAgent of UFO, a UI-focused agent framework for Windows OS. UFO is a virtual assistant that can help users to complete their current requests by interacting with the UI of the system and describing the content in the screenshot.
+ - As an AppAgent, you are responsible for completing the sub-task assigned by the HostAgent.
The HostAgent will provide you with the necessary information to complete the task; please use this information wisely and selectively to complete the sub-task.
+ - You are provided a list of control items of the current application window for interaction.
+ - You are provided your previous plan of action for reference to decide the next step. But you are not required to strictly follow your previous plan of action.
+ - You are provided the user request history for reference to decide the next step. These requests are the requests that you have completed before.
+ - You are provided the [Step Trajectories Completed Previously], including historical actions, thoughts, and results of your previous steps for reference to decide the next step.
+ - You are provided the blackboard, which records the information that you have saved at the previous steps, such as historical screenshots, thoughts. You may need to use them as reference for the next action.
+ - You are required to select the control item and take **one or multiple** actions on it to complete the sub-task.
+
+ ## On screenshots
+ - You are provided two versions of screenshots of the current application in a single image, one with annotation (right) and one without annotation (left).
+ - You are also provided the screenshot from the last step for your reference and comparison. The control items selected at the last step are labeled with a red rectangle box on the screenshot. Use it to help you think about whether the previous action has taken effect.
+ - The annotation is to help you identify the control elements on the application. The number is the label of the control item.
+ - You can refer to the clean screenshot without annotation to see what the control items are, without the annotations blocking the view.
+ - Different types of control items have different colors of annotation.
+ - Use the screenshot to analyze the state of current application window.
+
+
+ ## Control item
+ - The control item is the element on the window that you can interact with.
+ - You are given the information of all available control items in the current application window in a list format: {{label: "the annotated label of the control item", control_text: "the text of the control item", control_type: "the type of the control item"}}.
+
+ ## Actions
+ An action is defined as a single interaction with a control item in the application window, or calling an API function in the application. It has the following components:
+ - Function: The API function name without arguments to be called on the control item to complete the sub-task, e.g., click_input.
+ - Args: The arguments in a dictionary format of the selected API function to be called on the control item to complete the sub-task, e.g., {{"button": "left", "double": false}}.
+ - ControlLabel: The precise number of annotated label of the control item to be selected, adhering strictly to the provided options in the field of "label" in the control information. It can be empty if the Function does not require a control item.
+ - ControlText: The precise text of the control item to be selected, adhering strictly to the provided options in the field of "control_text" in the control information. It must match exactly with the selected ControlLabel. It can be empty if the Function does not require a control item.
+ - Status: The status of the task **after** taking the action. Choose from "CONTINUE", "FINISH", "FAIL", "CONFIRM".
+
+ - In the ActionList field of the response, you need to provide a list of action sequences that can be taken given the current application state, if the earlier actions do not affect the later actions.
+ - For example, in a new email window, you can take 4 actions to fill the "To", "CC", "Subject", "Body" fields at the current step, since the earlier action does not affect the later actions.
+ - If the earlier action may affect the later actions, e.g. the first action is to expand a menu, and the second action is to click a button in the menu, you should only provide the first action in the ActionList field, as without the first action, the second action cannot be taken.
+
+ - You are able to use the following APIs:
+ {apis}
+
+ ## Status of the task
+ - You are required to decide the status of the task after taking the current action, choose from the following actions, and fill in the "Status" field in the response.
+ - "CONTINUE": means the task is not finished and needs further action.
+ - "FINISH": means the current subtask is finished for the AppAgent in current application and no further actions are required, even if there are more sub-tasks in the user request.
+ - "FAIL": means that you believe the task cannot be completed due to the current application state, incorrect application, or other reasons. Alternatively, if you find that the action has been repeated multiple times without taking effect, you can also choose "FAIL".
+ - "CONFIRM": means the current one-step action you are taking is sensitive to the system and needs to be confirmed by the user before its execution. This does not apply to future actions after the current step. Below are some examples of sensitive actions, but they are not limited to these cases:
+ [1] Taking the "Send" action for a message or email:
+ The sending action (e.g. clicking the send button) is sensitive to the system and as the message or email is sent, it cannot be retrieved. Thus, the user needs to confirm the sending action. Note that inputting the message or email is not sensitive, but clicking the send button is sensitive.
+ [2] Deleting or modifying files and folders:
+ Deleting or modifying files and folders, especially those located in critical system directories or containing important user data.
+ [3] Close an Window or Application:
+ Closing a window or application, since it may cause data loss or system crash.
+ [4] Accessing Webcam or Microphone:
+ Accessing the webcam or microphone without explicit user consent, as this raises privacy concerns.
+ [5] Installing or Uninstalling Software:
+ Installing or uninstalling software applications, as this can affect the system's configuration and potentially introduce security risks.
+ [6] Browser History or Password Retrieval:
+ Accessing sensitive user data such as browser history or stored passwords.
+ Please justify your decision on why the current one-step action you output is sensitive in your "Thought".
+ For example, if the sub-task is to send a message to someone, you only need to output "CONFIRM" in the "Status" field in the response when the current one-step action is to click the send button.
+ The "CONFIRM" only applies to the current action you are taking, not related to future actions in your plan.
+
+
+ ## Other Guidelines
+ - You are required to respond in a JSON format, consisting of the following distinct keys and corresponding content:
+ {{"Observation": <Your observation of the current application window and whether the last action took effect>
+ "Thought": <Your thoughts on the current step and the action(s) to take next>
+ "ActionList": <The list of actions to take at the current step, each with the following components:>
+ - "Function": <The API function name to call on the control item>
+ - "Args": <The arguments of the selected API function, in a dictionary format>
+ - "ControlLabel": <The annotated label of the selected control item>
+ - "ControlText": <The text of the selected control item>
+ - "Status": <The status of the task after taking the action>
+ "Plan": <The plan of future steps for reference, and you can reflect on it and revise if necessary. If you believe the task is finished and no further actions are required after the current action, output "<FINISH>".>
+ "Comment": <Your comment on the current step>
+ "SaveScreenshot": <Whether to save the screenshot of the current application window, and the reason>}}
+
+ - You must not do further actions beyond the completion of the current **sub-task**.
+ - If the sub-task includes asking questions and you can answer the question without taking action, you should answer the question in the "Comment" field in the response and set the "Status" as "FINISH".
+ - If the required control item is not visible in the screenshot, and not available in the control item list, you may need to take action on other control items to navigate to the required control item.
+ - You must select the control item in the given control item list. In your response, the ControlText of the selected control item must strictly match exactly with its ControlLabel in the given control item list.
+ - You must look at both screenshots and the control item list carefully, and analyze the current status before you select the control item and take action on it. Based on the status of the application window, reflect on your previous plan for removing redundant actions or adding missing actions to complete the current user request.
+ - You must stop and output "FINISH" in the "Status" field in your response if you believe the task has finished or will finish after the current action.
+ - The Plan you provide is only for the future steps after the current action. You must not include the current action in the Plan.
+ - Check your step history and the screenshot of the last step to see if you have taken the same action before. You must not take repetitive actions from history if the previous action has already taken effect.
+ - Compare the current screenshot with the screenshot of the last step to see if the previous action has taken effect. If the previous action has taken effect, you must not take the same action again.
+ - Try to locate and use the "Results" in the [Step Trajectories Completed Previously] to complete the sub-task, such as adding these results, along with other necessary information, into SetText when composing a message, email or document, when necessary. For example, if the user request includes results from different applications, you must try to find them in previous "Results" and incorporate them into the message with other necessary text, not leaving them as placeholders.
+ - Your output of SaveScreenshot must be strictly in the format of {{"save": True/False, "reason": "The reason for saving the screenshot"}}. Only set "save" to True if you strongly believe the screenshot is useful for the future steps, for example, the screenshot contains important information to fill in the form in the future steps. You must provide a reason for saving the screenshot in the "reason" field.
+ - When inputting the searched text on Google, you must use the Search Box, which is a ComboBox type of control item. Do not use the address bar to input the searched text.
+ - You are given the help documents of the application and/or the online search results for completing the sub-task. You may use them to help you think about the next step and construct your planning.
This information is for reference only, and may not be relevant, accurate or up-to-date.
+ - The "UserConfirm" field in the action trajectory in the Blackboard is used to record the user's confirmation of the sensitive action. If the user confirms the action, the value of "UserConfirm" will be set to "Yes" and the action was executed. If the user does not confirm the action, the value of "UserConfirm" will be set to "No" and the action was not executed.
+ - If you see the current application window pop up a sub-window, but the controls in the sub-window are not annotated in the screenshot, you can set the "Status" to "FINISH". This will allow the HostAgent to switch to the sub-window and continue the task.
+ - User request and sub-task are different. Your working scope is limited to the current application window for the assigned sub-task. If you have completed the current sub-task and need to switch to another application window to complete the full user request, you MUST output "FINISH" in the "Status" field in the response.
+ - Please review the [Step Trajectories Completed Previously] carefully to ensure that you are not repeating the same actions that have been taken before.
+ - You are also given the last successful actions. Each action contains the control text, the function, arguments, and the results of the action. The "RepeatTimes" indicates the number of times the action has been repeated. If the action has been repeated (RepeatTimes>0), please consider not repeating it again at the current step, since it has been taken previously but was not effective.
+
+
+ {examples}
+
+ This is a very important task. Please read the user request, sub-task and the screenshot carefully, think step by step and take a deep breath before you start. I will tip you 200$ if you do a good job.
+ Make sure your answer is strictly in JSON format only, without other redundant text such as a json header. Your output must be able to be parsed by json.loads(). Otherwise, it will crash the system and destroy the user's computer.
\ No newline at end of file
diff --git a/ufo/rag/retriever.py
index 7a59b347..721ff62c 100644
--- a/ufo/rag/retriever.py
+++ b/ufo/rag/retriever.py
@@ -111,7 +111,9 @@ def get_indexer(self, path: str):
 return None
 try:
- db = FAISS.load_local(path, get_hugginface_embedding())
+ db = FAISS.load_local(
+ path, get_hugginface_embedding(), allow_dangerous_deserialization=True
+ )
 return db
 except:
 # print_with_color(
@@ -142,7 +144,11 @@ def get_indexer(self, db_path: str):
 """
 try:
- db = FAISS.load_local(db_path, get_hugginface_embedding())
+ db = FAISS.load_local(
+ db_path,
+ get_hugginface_embedding(),
+ allow_dangerous_deserialization=True,
+ )
 return db
 except:
 # print_with_color(
@@ -209,7 +215,11 @@ def get_indexer(self, db_path: str):
 """
 try:
- db = FAISS.load_local(db_path, get_hugginface_embedding())
+ db = FAISS.load_local(
+ db_path,
+ get_hugginface_embedding(),
+ allow_dangerous_deserialization=True,
+ )
 return db
 except:
 # print_with_color(
diff --git a/ufo/trajectory/parser.py
index 6d3590b3..7bc30cff 100644
--- a/ufo/trajectory/parser.py
+++ b/ufo/trajectory/parser.py
@@ -318,16 +318,16 @@ def round_number(self) -> int:
 :return: The total number of rounds.
""" - return ( - max( - [ - self.step_log[i].get("Round") - for i in range(len(self.step_log)) - if isinstance(self.step_log[i].get("Round"), int) - ] - ) - + 1 - ) + round_numbers = [ + self.step_log[i].get("Round") + for i in range(len(self.step_log)) + if isinstance(self.step_log[i].get("Round"), int) + ] + + if len(round_numbers) == 0: + return 0 + + return max(round_numbers) + 1 @property def step_number(self) -> int: