Commit 9b7a46c: init

0 parents, commit 9b7a46c
File tree

378 files changed: +134337 -0 lines
@@ -0,0 +1,28 @@
---
name: Bug Report & Assistance Request
about: Create a report to help us improve
title: "[Bug/Assistance] "
labels: bug, help wanted
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Screenshots or Terminal Copy&Paste**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
- OS: [e.g. Ubuntu 22.04]
- Python: [e.g. 3.9]

**Additional context**
Add any other context about the problem here.
+17

@@ -0,0 +1,17 @@
---
name: Feature Request
about: Suggest an idea for this project
title: "[Feature] "
labels: enhancement
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Additional context**
Add any other context or screenshots about the feature request here.

.gitignore

+19
@@ -0,0 +1,19 @@
__pycache__
%*
.idea
.vscode
src/tasks/humaneval_x/env/vendor
logs
outputs
data/full
results
config.sh
download
.DS_Store
# local*
*.ipynb
.cache
src/server/tasks/card_game/result
.dockerfile
.dockerfile-cache
analysis

README.md

+180
@@ -0,0 +1,180 @@
# AgentBench

![](./assets/cover.jpg)

<p align="center">
   <a href="https://llmbench.ai" target="_blank">🌐 Website</a> | <a href="https://twitter.com/thukeg" target="_blank">🐦 Twitter</a> | <a href="mailto:[email protected]">✉️ Google Group</a> | <a href="https://arxiv.org/abs/2308.03688" target="_blank">📃 Paper</a>
</p>

<p align="center">
   👋 Join our <a href="https://join.slack.com/t/agentbenchcol-huw1944/shared_invite/zt-20ixabcuv-31cFLBAkqGQxQkJqrWVEVg" target="_blank">Slack</a> for <i>Q & A</i> or <i><b>collaboration</b> on the next version of AgentBench</i>!
</p>

## 📌Introducing AgentBench v0.2🎉

You are now browsing AgentBench v0.2. If you wish to use the older version, you can revert to [v0.1](https://github.com/THUDM/AgentBench/tree/v0.1).

Based on [v0.1](https://github.com/THUDM/AgentBench/tree/v0.1), we:

- Updated the framework architecture for easier use and extension
- Adjusted some task settings
- Added test results for more models
- Released the full data for the Dev and Test sets

# AgentBench: Evaluating LLMs as Agents

https://github.com/THUDM/AgentBench/assets/129033897/656eed6e-d9d9-4d07-b568-f43f5a451f04

**AgentBench** is the first benchmark designed to evaluate **LLM-as-Agent** across a diverse spectrum of environments. It encompasses 8 distinct environments to provide a more comprehensive evaluation of LLMs' ability to operate as autonomous agents in various scenarios. These environments include 5 freshly created domains, namely

- Operating System (OS)
- Database (DB)
- Knowledge Graph (KG)
- Digital Card Game (DCG)
- Lateral Thinking Puzzles (LTP)

as well as 3 recompiled from published datasets:

- House-Holding (HH) ([ALFWorld](https://github.com/alfworld/alfworld))
- Web Shopping (WS) ([WebShop](https://github.com/princeton-nlp/webshop))
- Web Browsing (WB) ([Mind2Web](https://github.com/OSU-NLP-Group/Mind2Web))

![](./assets/agentbench.png)

## Table of Contents

- [Dataset Summary](#dataset-summary)
- [Leaderboard](#leaderboard)
- [Quick Start](#quick-start)
- [Next Steps](#next-steps)
- [Citation](#citation)
## Dataset Summary

We offer two splits for each dataset: Dev and Test. Evaluating the multi-turn interactions requires an LLM to generate roughly 4k and 13k responses on the Dev and Test splits, respectively.

![](./assets/statistics.png)

## Leaderboard

Here are the scores on the test set (standard) of AgentBench.

![](./assets/leaderboard.png)

While LLMs are beginning to manifest their proficiency as agents, the gaps between models and the distance towards practical usability remain significant.

![](./assets/intro.png)
## Quick Start

This section will guide you through quickly using gpt-3.5-turbo-0613 as an agent to launch the `dbbench-std` and `os-std` tasks.
For the specific framework structure, please refer to [Framework Introduction](docs/Introduction_en.md).
For more detailed configuration and launch methods, please check [Configuration Guide](docs/Config_en.md)
and [Program Entrance Guide](docs/Entrance_en.md).

### Step 1. Prerequisites

Clone this repo and install the dependencies.

```bash
cd AgentBench
conda create -n agent-bench python=3.9
conda activate agent-bench
pip install -r requirements.txt
```

Ensure that [Docker](https://www.docker.com/) is properly installed.

```bash
docker ps
```

Build the required images for `dbbench-std` and `os-std`.

```bash
docker pull mysql
docker pull ubuntu
docker build -f data/os_interaction/res/dockerfiles/default data/os_interaction/res/dockerfiles --tag local-os/default
docker build -f data/os_interaction/res/dockerfiles/packages data/os_interaction/res/dockerfiles --tag local-os/packages
docker build -f data/os_interaction/res/dockerfiles/ubuntu data/os_interaction/res/dockerfiles --tag local-os/ubuntu
```
### Step 2. Configure the Agent

Fill in your OpenAI API key at the appropriate location in `configs/agents/openai-chat.yaml` (used by agents such as `gpt-3.5-turbo-0613`).

You can run `python -m src.client.agent_test` to check whether your agent is configured correctly.

By default, `gpt-3.5-turbo-0613` is started. You can replace it with other agents by modifying the parameters:

```bash
python -m src.client.agent_test --config configs/agents/api_agents.yaml --agent gpt-3.5-turbo-0613
```
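If you prefer to sanity-check your key outside the framework first, the request below mirrors what `configs/agents/openai-chat.yaml` describes (same URL, headers, and `temperature`). This is only an illustrative sketch, not part of the repo; the `OPENAI_API_KEY` environment variable and the test prompt are arbitrary choices.

```python
# Minimal sketch: one chat completion request using the same URL/headers/body style
# as configs/agents/openai-chat.yaml. Assumes `requests` is installed and the key is
# exported as OPENAI_API_KEY (an illustrative convention, not required by AgentBench).
import os
import requests

resp = requests.post(
    "https://api.openai.com/v1/chat/completions",
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}",
    },
    json={
        "model": "gpt-3.5-turbo-0613",
        "temperature": 0,
        "max_tokens": 512,
        "messages": [{"role": "user", "content": "Say hello in one word."}],
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```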
### Step 3. Start the task server

Task workers are started per task, and starting them manually can be cumbersome, so we provide an automated script.

This step assumes that ports 5000 through 5015 are available.

```bash
python -m src.start_task -a
```

This will launch five task_workers each for the `dbbench-std`, `os-std`, and `kg-std` tasks and automatically connect them to the controller on port 5000. **After executing this command, please allow approximately 1 minute for the task setup to complete.**
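If you want to verify the port assumption above before launching, a quick check like the following (an illustrative helper, not part of the repo) reports any port in the 5000-5015 range that is already bound on localhost:

```python
# Illustrative check (not AgentBench source): report which of ports 5000-5015
# are already in use on localhost before starting the controller and workers.
import socket

for port in range(5000, 5016):
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        if s.connect_ex(("127.0.0.1", port)) == 0:
            print(f"port {port} is already in use")
```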
### Step 4. Start the assigner

This step actually starts the tasks.

If everything is correctly configured so far, you can now initiate the task tests.

```bash
python -m src.assigner
```

## Next Steps

If you wish to launch more tasks or use other models, refer to [Configuration Guide](docs/Config_en.md) and [Program Entrance Guide](docs/Entrance_en.md).

For the environments of the remaining five tasks, you will need to download the Docker images we provide.

```
longinyu/agentbench-ltp
longinyu/agentbench-webshop
longinyu/agentbench-mind2web
longinyu/agentbench-card_game
longinyu/agentbench-alfworld
```
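To fetch all five at once, a small convenience sketch like the one below loops over the list; plain `docker pull <image>` commands work just as well.

```python
# Convenience sketch: pull the five prebuilt task images listed above.
# Equivalent to running `docker pull <image>` for each entry.
import subprocess

IMAGES = [
    "longinyu/agentbench-ltp",
    "longinyu/agentbench-webshop",
    "longinyu/agentbench-mind2web",
    "longinyu/agentbench-card_game",
    "longinyu/agentbench-alfworld",
]

for image in IMAGES:
    subprocess.run(["docker", "pull", image], check=True)
```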
The resource consumption of a single task_worker for the eight tasks is roughly as follows; take this into account when launching:

| Task Name | Start-up Speed | Memory Consumption |
| --------- | -------------- | ------------------ |
| webshop   | ~3 min         | ~15 GB             |
| mind2web  | ~5 min         | ~1 GB              |
| db        | ~20 s          | < 500 MB           |
| alfworld  | ~10 s          | < 500 MB           |
| card_game | ~5 s           | < 500 MB           |
| ltp       | ~5 s           | < 500 MB           |
| os        | ~5 s           | < 500 MB           |
| kg        | ~5 s           | < 500 MB           |

## Citation

```
@article{liu2023agentbench,
  title   = {AgentBench: Evaluating LLMs as Agents},
  author  = {Xiao Liu and Hao Yu and Hanchen Zhang and Yifan Xu and Xuanyu Lei and Hanyu Lai and Yu Gu and Hangliang Ding and Kaiwen Men and Kejuan Yang and Shudan Zhang and Xiang Deng and Aohan Zeng and Zhengxiao Du and Chenhui Zhang and Sheng Shen and Tianjun Zhang and Yu Su and Huan Sun and Minlie Huang and Yuxiao Dong and Jie Tang},
  year    = {2023},
  journal = {arXiv preprint arXiv:2308.03688}
}
```

assets/agentbench.png (6.01 MB)
assets/architecture.png (58.2 KB)
assets/cover.jpg (521 KB)
assets/intro.png (114 KB)
assets/leaderboard.png (163 KB)
assets/logo.png (32.1 KB)
assets/statistics.png (79.8 KB)

configs/agents/api_agents.yaml

+23
@@ -0,0 +1,23 @@
gpt-3.5-turbo-0613:
  import: "./openai-chat.yaml"
  parameters:
    name: "gpt-3.5-turbo-0613"
    body:
      model: "gpt-3.5-turbo-0613"
      max_tokens: 512

text-davinci-003:
  import: "./openai-text.yaml"
  parameters:
    name: "text-davinci-003"
    body:
      model: "text-davinci-003"
      max_tokens: 512

text-davinci-002:
  import: "./openai-text.yaml"
  parameters:
    name: "text-davinci-002"
    body:
      model: "text-davinci-002"
      max_tokens: 512
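The `import` key suggests that each agent entry is layered on top of the referenced YAML file, with the local keys overriding the imported ones. A rough sketch of such a merge (an assumption about the framework's behavior, not code from this repo) might look like:

```python
# Rough sketch (assumption, not AgentBench source): resolve an `import:` key inside a
# config entry by loading the referenced YAML (via PyYAML) and recursively overlaying
# the entry's own keys on top of it.
import os
import yaml

def deep_merge(base: dict, override: dict) -> dict:
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

def resolve_entry(entry: dict, base_dir: str) -> dict:
    entry = dict(entry)
    imported = entry.pop("import", None)
    if imported is None:
        return entry
    with open(os.path.join(base_dir, imported)) as f:
        base = yaml.safe_load(f) or {}
    return deep_merge(resolve_entry(base, base_dir), entry)

# e.g. resolve_entry(api_agents["gpt-3.5-turbo-0613"], "configs/agents") would yield
# the HTTPAgent module from openai-chat.yaml plus the merged model parameters.
```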

configs/agents/fs_agent.yaml

+23
@@ -0,0 +1,23 @@
default:
  module: "src.client.agents.FastChatAgent"
  parameters:
    name: "FastChat"
    controller_address: "http://localhost:55555"
    max_new_tokens: 512
    temperature: 0

vicuna-33b:
  parameters:
    model_name: "vicuna-33b-v1.3"

wizard-30b:
  parameters:
    model_name: "WizardLM-30B-V1.0-merged"

vicuna-13b:
  parameters:
    model_name: "vicuna-13b-v1.5"

vicuna-7b:
  parameters:
    model_name: "vicuna-7b-v1.5"

configs/agents/openai-chat.yaml

+13
@@ -0,0 +1,13 @@
module: src.client.agents.HTTPAgent
parameters:
  url: https://api.openai.com/v1/chat/completions
  headers:
    Content-Type: application/json
    Authorization: Bearer <% PUT-YOUR-OPENAI-KEY-HERE %>
  body:
    temperature: 0
  prompter:
    name: role_content_dict
    args:
      agent_role: assistant
  return_format: "{response[choices][0][message][content]}"
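The `return_format` value looks like a Python `str.format` template indexed into the parsed JSON response. A small illustration of that reading (an assumption about how `HTTPAgent` consumes it, not its actual source) is:

```python
# Illustration (assumption, not AgentBench source): a return_format string such as
# "{response[choices][0][message][content]}" can be applied with str.format, which
# indexes into the parsed JSON body of the HTTP response.
return_format = "{response[choices][0][message][content]}"

response = {
    "choices": [
        {"message": {"role": "assistant", "content": "Hello!"}}
    ]
}

print(return_format.format(response=response))  # -> Hello!
```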

configs/agents/openai-text.yaml

+14
@@ -0,0 +1,14 @@
module: src.client.agents.HTTPAgent
parameters:
  name: <% NAME %>
  url: https://api.openai.com/v1/chat/completions
  headers:
    Content-Type: application/json
    Authorization: Bearer <% PUT-YOUR-OPENAI-KEY-HERE %>
  body:
    model: <% NAME %>
    temperature: 0
  prompter:
    name: prompt_string
  return_format: "{response[choices][0][text]}"

configs/assignments/default.yaml

+17
@@ -0,0 +1,17 @@
import: definition.yaml

concurrency:
  task:
    dbbench-std: 5
    os-std: 5
  agent:
    gpt-3.5-turbo-0613: 5

assignments: # List[Assignment] | Assignment
  - agent: # "task": List[str] | str , "agent": List[str] | str
      - gpt-3.5-turbo-0613
    task:
      - dbbench-std
      - os-std

output: "outputs/{TIMESTAMP}"
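Judging from the inline comments, an assignment pairs one or more agents with one or more tasks, and each agent/task combination presumably becomes a run. A sketch of that cross-product reading (an interpretation of the config, not the assigner's actual code) is:

```python
# Interpretation sketch (not AgentBench source): expand an assignment entry, where
# "agent" and "task" may each be a single string or a list, into (agent, task) pairs.
from itertools import product

def expand(assignment: dict) -> list[tuple[str, str]]:
    agents = assignment["agent"]
    tasks = assignment["task"]
    agents = [agents] if isinstance(agents, str) else agents
    tasks = [tasks] if isinstance(tasks, str) else tasks
    return list(product(agents, tasks))

print(expand({"agent": ["gpt-3.5-turbo-0613"], "task": ["dbbench-std", "os-std"]}))
# -> [('gpt-3.5-turbo-0613', 'dbbench-std'), ('gpt-3.5-turbo-0613', 'os-std')]
```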

configs/assignments/definition.yaml

+11
@@ -0,0 +1,11 @@
definition:
  task:
    overwrite:
      module: src.client.TaskClient
      parameters:
        controller_address: "http://localhost:5000/api"
    import: ../tasks/task_assembly.yaml
  agent:
    import:
      - ../agents/api_agents.yaml
      - ../agents/fs_agent.yaml

configs/start_task.yaml

+6
@@ -0,0 +1,6 @@
definition:
  import: tasks/task_assembly.yaml

start:
  dbbench-std: 5
  os-std: 5

configs/tasks/alfworld.yaml

+22
@@ -0,0 +1,22 @@
default:
  module: src.server.tasks.alfworld.ALFWorld
  docker:
    image: longinyu/agentbench-alfworld
    command: umask 0; [ -f /root/.setup.sh ] && bash /root/.setup.sh;
  parameters:
    name: alfworld-std
    data_path: "/AgentBench/data/alfworld"
    config_path: "src/server/tasks/alfworld/configs/base_config.yaml"
    prompts_path: "src/server/tasks/alfworld/prompts/alfworld_multiturn_plan_first.json"
    split: "standard"
    max_step: 35

alfworld-dev:
  parameters:
    name: alfworld-dev
    split: "dev"

alfworld-std:
  parameters:
    name: alfworld-std
    split: "standard"
