Add vision capability for bots #413

Open · wants to merge 18 commits into main
7 changes: 6 additions & 1 deletion README.md
@@ -123,6 +123,11 @@ You can pass a string or an object for these fields. A model object must specify
"model": "gpt-4",
"url": "https://api.openai.com/v1/"
},
"vision_model": {
"api": "openai",
"model": "gpt-4o",
"url": "https://api.openai.com/v1/"
},
"embedding": {
"api": "openai",
"url": "https://api.openai.com/v1/",
@@ -131,7 +136,7 @@ You can pass a string or an object for these fields. A model object must specify

```

`model` is used for chat, `code_model` is used for newAction coding, and `embedding` is used to embed text for example selection. If `code_model` is not specified, then it will use `model` for coding.
`model` is used for chat, `code_model` is used for newAction coding, `vision_model` is used for image interpretation, and `embedding` is used to embed text for example selection. If `code_model` or `vision_model` is not specified, `model` will be used by default.

All apis have default models and urls, so those fields are optional. Note some apis have no embedding model, so they will default to word overlap to retrieve examples.
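Since the README text above notes that each of these fields accepts either a string or an object, a compact sketch of the string shorthand (illustrative only, not part of this diff) might look like:

```json
{
  "model": "gpt-4",
  "vision_model": "gpt-4o"
}
```

When only a string is given, the API and URL fall back to their defaults, and omitting `vision_model` entirely falls back to `model` as described above.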

4 changes: 3 additions & 1 deletion package.json
@@ -24,7 +24,9 @@
"yargs": "^17.7.2",
"socket.io": "^4.7.2",
"socket.io-client": "^4.7.2",
"express": "^4.18.2"
"express": "^4.18.2",
"node-canvas-webgl": "PrismarineJS/node-canvas-webgl",
"three": "^0.128.0"
},
"scripts": {
"postinstall": "patch-package",
26 changes: 20 additions & 6 deletions profiles/defaults/_default.json
@@ -9,6 +9,8 @@

"bot_responder": "You are a minecraft bot named $NAME that is currently in conversation with another AI bot. Both of you can take actions with the !command syntax, and actions take time to complete. You are currently busy with the following action: '$ACTION' but have received a new message. Decide whether to 'respond' immediately or 'ignore' it and wait for your current action to finish. Be conservative and only respond when necessary, like when you need to change/stop your action, or convey necessary information. Example 1: You:Building a house! !newAction('Build a house.').\nOther Bot: 'Come here!'\nYour decision: ignore\nExample 2: You:Collecting dirt !collectBlocks('dirt',10).\nOther Bot: 'No, collect some wood instead.'\nYour decision: respond\nExample 3: You:Coming to you now. !goToPlayer('billy',3).\nOther Bot: 'What biome are you in?'\nYour decision: respond\nActual Conversation: $TO_SUMMARIZE\nDecide by outputting ONLY 'respond' or 'ignore', nothing else. Your decision:",

"image_conversing": "Summarize the visible Minecraft screen by listing the types and arrangements of blocks, focusing on terrain, structures, and notable features in 400 chars.",

"modes": {
"self_preservation": true,
"unstuck": true,
@@ -65,14 +67,26 @@
{"role": "user", "content": "derg: (FROM OTHER BOT)Thanks, see you later!"},
{"role": "assistant", "content": "See you later. !endConversation(\"derg\")"}
],

[
{"role": "user", "content": "grombo_Xx: What do you see?"},
{"role": "assistant", "content": "Let me see... !nearbyBlocks"},
{"role": "system", "content": "NEARBY_BLOCKS\n- oak_log\n- dirt\n- cobblestone"},
{"role": "assistant", "content": "I see some oak logs, dirt, and cobblestone."}
{"role": "user", "content": "d7297: look here"},
{"role": "assistant", "content": "Okay! !lookAtPlayer(\"d7297\", \"at\")"},
{"role": "system", "content": "Code Output:\nLooking at player d7297. There is a giant waterfall behind d7297."},
{"role": "assistant", "content": "Woo, that is a cool waterfall!"}
],

[
{"role": "user", "content": "d7297: look there"},
{"role": "assistant", "content": "Okay! !lookAtPlayer(\"d7297\", \"with\")"},
{"role": "system", "content": "Code Output:\nLooking in the same direction as d7297\nThere is a castle made of stone."},
{"role": "assistant", "content": "Wow, that stone castle looks amazing!"}
],
[
{"role": "user", "content": "d7297: look 0 70 12 and describe what is there"},
{"role": "assistant", "content": "Sure! !lookAtPosition(0, 70, 12)"},
{"role": "system", "content": "Code Output:\nLooking in the same direction as d7297\nThere is a water fountain."},
{"role": "assistant", "content": "I see the water fountain! amazing!"}
],

[
{"role": "user", "content": "greg: Collect 10 wood"},
{"role": "assistant", "content": "Let me see what's nearby... !nearbyBlocks"},
1 change: 1 addition & 0 deletions settings.js
@@ -35,6 +35,7 @@ export default
"show_bot_views": false, // show bot's view in browser at localhost:3000, 3001...

"allow_insecure_coding": false, // allows newAction command and model can write/run code on your computer. enable at own risk
"allow_vision": false, // allows vision model to interpret screenshots as inputs
"code_timeout_mins": -1, // minutes code is allowed to run. -1 for no timeout
"relevant_docs_count": 5, // Parameter: -1 = all, 0 = no references, 5 = five references. If exceeding the maximum, all reference documents are returned.

3 changes: 3 additions & 0 deletions src/agent/agent.js
@@ -1,5 +1,6 @@
import { History } from './history.js';
import { Coder } from './coder.js';
import { VisionInterpreter } from './vision_interpreter.js';
import { Prompter } from '../models/prompter.js';
import { initModes } from './modes.js';
import { initBot } from '../utils/mcdata.js';
@@ -36,6 +37,8 @@ export class Agent {
this.history = new History(this);
console.log('Initializing coder...');
this.coder = new Coder(this);
console.log('Initializing vision interpreter...');
this.vision_interpreter = new VisionInterpreter(this, settings.allow_vision);
console.log('Initializing npc controller...');
this.npc = new NPCContoller(this);
console.log('Initializing memory bank...');
43 changes: 30 additions & 13 deletions src/agent/commands/actions.js
@@ -407,17 +407,34 @@ export const actionsList = [
return `Conversation with ${player_name} ended.`;
}
},
// { // commented for now, causes confusion with goal command
// name: '!npcGoal',
// description: 'Set a simple goal for an item or building to automatically work towards. Do not use for complex goals.',
// params: {
// 'name': { type: 'string', description: 'The name of the goal to set. Can be item or building name. If empty will automatically choose a goal.' },
// 'quantity': { type: 'int', description: 'The quantity of the goal to set. Default is 1.', domain: [1, Number.MAX_SAFE_INTEGER] }
// },
// perform: async function (agent, name=null, quantity=1) {
// await agent.npc.setGoal(name, quantity);
// agent.bot.emit('idle'); // to trigger the goal
// return 'Set npc goal: ' + agent.npc.data.curr_goal.name;
// }
// },
{
name: '!lookAtPlayer',
description: 'Look at a player or look in the same direction as the player.',
params: {
'player_name': {
type: 'string',
description: 'Name of the target player'
},
'direction': {
type: 'string',
description: 'How to look ("at": look at the player, "with": look in the same direction as the player)',
enum: ['at', 'with']
}
},
perform: runAsAction(async (agent, player_name, direction) => {
await agent.vision_interpreter.lookAtPlayer(player_name, direction);
})
},
{
name: '!lookAtPosition',
description: 'Look at specified coordinates.',
params: {
'x': { type: 'int', description: 'x coordinate' },
'y': { type: 'int', description: 'y coordinate' },
'z': { type: 'int', description: 'z coordinate' }
},
perform: runAsAction(async (agent, x, y, z) => {
await agent.vision_interpreter.lookAtPosition(x, y, z);
})
}
];
4 changes: 3 additions & 1 deletion src/agent/library/skills.js
@@ -1,7 +1,9 @@
import * as mc from "../../utils/mcdata.js";
import { Camera } from "../../utils/camera.js";
import * as world from "./world.js";
import pf from 'mineflayer-pathfinder';
import Vec3 from 'vec3';
import fs from 'fs';


export function log(bot, message) {
@@ -1350,4 +1352,4 @@ export async function activateNearestBlock(bot, type) {
await bot.activateBlock(block);
log(bot, `Activated ${type} at x:${block.position.x.toFixed(1)}, y:${block.position.y.toFixed(1)}, z:${block.position.z.toFixed(1)}.`);
return true;
}
}
104 changes: 104 additions & 0 deletions src/agent/vision_interpreter.js
@@ -0,0 +1,104 @@
import { Vec3 } from 'vec3';
import { Camera } from "../utils/camera.js";
import fs from 'fs';
import { log } from './library/skills.js';
import * as world from './library/world.js';

const pad = (str) => {
return '\n' + str + '\n';
}

export class VisionInterpreter {
constructor(agent, allow_vision) {
this.agent = agent;
this.allow_vision = allow_vision;
this.fp = './bots/'+agent.name+'/screenshots/';
}

async lookAtPlayer(player_name, direction) {
const bot = this.agent.bot;
const player = bot.players[player_name]?.entity;
if (!player) {
log(bot, `Could not find player ${player_name}`);
return;
}

let filename;
if (direction === 'with') {
await bot.look(player.yaw, player.pitch);
const camera = new Camera(bot, this.fp);
await new Promise(resolve => setTimeout(resolve, 500));
log(bot, `Looking in the same direction as ${player_name}`);
filename = await camera.capture();
} else {
await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z));
const camera = new Camera(bot, this.fp);
await new Promise(resolve => setTimeout(resolve, 500));
log(bot, `Looking at player ${player_name}`);
filename = await camera.capture();
}

if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) {
log(this.agent.bot, "Vision is disabled. Using text-based environment description instead.");
log(this.agent.bot, this._nearbyBlocks());
} else {
await this.analyzeImage(filename);
}
}

async lookAtPosition(x, y, z) {
const bot = this.agent.bot;
await bot.lookAt(new Vec3(x, y + 2, z));
const camera = new Camera(bot, this.fp);
await new Promise(resolve => setTimeout(resolve, 500));
log(bot, `Looking at coordinate ${x}, ${y}, ${z}`);

let filename = await camera.capture();

if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) {
log(this.agent.bot, "Vision is disabled. Using text-based environment description instead.");
log(this.agent.bot, this._nearbyBlocks());
} else {
await this.analyzeImage(filename);
}
}

async analyzeImage(filename) {
let prompt = this.agent.prompter.profile.image_conversing;
let res = null;

try {
const bot = this.agent.bot;
const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`);
const messages = this.agent.history.getHistory();
res = await this.agent.prompter.vision_model.sendVisionRequest(messages, prompt, imageBuffer);

if (res == 'Vision is only supported by certain models.') {
log(bot, "Vision may not be supported on this model. Using text-based environment description instead.");
log(bot, this._nearbyBlocks());
} else {
log(bot, res);
}

} catch (error) {
log(this.agent.bot, `Error analyzing image: ${error.message}`);
}
}

_nearbyBlocks() {
const bot = this.agent.bot;
let res = 'NEARBY_BLOCKS';

let blocks = world.getNearbyBlockTypes(bot);
for (let i = 0; i < blocks.length; i++) {
res += `\n- ${blocks[i]}`;
}
if (blocks.length == 0) {
res += ': none';
} else {
// Environmental Awareness
res += '\n- ' + world.getSurroundingBlocks(bot).join('\n- ')
res += `\n- First Solid Block Above Head: ${world.getFirstBlockAboveHead(bot, null, 32)}`;
}
return pad(res);
}
}
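The only contract VisionInterpreter relies on is that the configured vision model exposes a `sendVisionRequest(turns, systemMessage, imageBuffer)` method, as the Claude and Gemini wrappers below do. A minimal sketch of a custom provider (hypothetical names, not part of this diff):

```js
// Hypothetical provider sketch: VisionInterpreter only checks for the presence of
// sendVisionRequest and passes the chat history, a prompt, and a JPEG screenshot buffer.
export class MyVisionModel {
    async sendVisionRequest(turns, systemMessage, imageBuffer) {
        const image_b64 = imageBuffer.toString('base64'); // screenshots are read from disk as JPEG buffers
        // ...forward `turns`, `systemMessage`, and `image_b64` to whatever multimodal API this wraps...
        return 'A short text description of the screenshot.';
    }
}
```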
32 changes: 28 additions & 4 deletions src/models/claude.js
@@ -35,16 +35,40 @@ export class Claude {
res = resp.content[0].text;
}
catch (err) {
if (err.message.includes("does not support image input")) {
res = "Vision is only supported by certain models.";
} else {
res = "My brain disconnected, try again.";
}
console.log(err);
res = 'My brain disconnected, try again.';
}
return res;
}

async sendVisionRequest(turns, systemMessage, imageBuffer) {
const imageMessages = [...turns];
imageMessages.push({
role: "user",
content: [
{
type: "text",
text: systemMessage
},
{
type: "image",
source: {
type: "base64",
media_type: "image/jpeg",
data: imageBuffer.toString('base64')
}
}
]
});

return this.sendRequest(imageMessages, systemMessage);
}

async embed(text) {
throw new Error('Embeddings are not supported by Claude.');
}
}



45 changes: 45 additions & 0 deletions src/models/gemini.js
@@ -78,6 +78,51 @@ export class Gemini {
return text;
}

async sendVisionRequest(turns, systemMessage, imageBuffer) {
let model;
if (this.url) {
model = this.genAI.getGenerativeModel(
{ model: this.model_name || "gemini-1.5-flash" },
{ baseUrl: this.url },
{ safetySettings: this.safetySettings }
);
} else {
model = this.genAI.getGenerativeModel(
{ model: this.model_name || "gemini-1.5-flash" },
{ safetySettings: this.safetySettings }
);
}

const imagePart = {
inlineData: {
data: imageBuffer.toString('base64'),
mimeType: 'image/jpeg'
}
};

const stop_seq = '***';
const prompt = toSinglePrompt(turns, systemMessage, stop_seq, 'model');
let res = null;
try {
console.log('Awaiting Google API vision response...');
const result = await model.generateContent([prompt, imagePart]);
const response = await result.response;
const text = response.text();
console.log('Received.');
if (!text.includes(stop_seq)) return text;
const idx = text.indexOf(stop_seq);
res = text.slice(0, idx);
} catch (err) {
console.log(err);
if (err.message.includes("Image input modality is not enabled for models/")) {
res = "Vision is only supported by certain models.";
} else {
res = "An unexpected error occurred, please try again.";
}
}
return res;
}

async embed(text) {
let model;
if (this.url) {