Add vision capability for bots #413

Open · wants to merge 18 commits into main
7 changes: 6 additions & 1 deletion README.md
@@ -123,6 +123,11 @@ You can pass a string or an object for these fields. A model object must specify
"model": "gpt-4",
"url": "https://api.openai.com/v1/"
},
"vision_model": {
"api": "openai",
"model": "gpt-4o",
"url": "https://api.openai.com/v1/"
},
"embedding": {
"api": "openai",
"url": "https://api.openai.com/v1/",
@@ -131,7 +136,7 @@ You can pass a string or an object for these fields. A model object must specify

```

`model` is used for chat, `code_model` is used for newAction coding, and `embedding` is used to embed text for example selection. If `code_model` is not specified, then it will use `model` for coding.
`model` is used for chat, `code_model` is used for newAction coding, `vision_model` is used for image interpretation, and `embedding` is used to embed text for example selection. If `code_model` or `vision_model` is not specified, `model` will be used by default.

All apis have default models and urls, so those fields are optional. Note some apis have no embedding model, so they will default to word overlap to retrieve examples.
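Since the README text above notes that each of these fields accepts either a string or an object, a compact sketch of the string shorthand (illustrative only, not part of this diff) might look like:

```json
{
  "model": "gpt-4",
  "vision_model": "gpt-4o"
}
```

When only a string is given, the API and URL fall back to their defaults, and omitting `vision_model` entirely falls back to `model` as described above.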

4 changes: 3 additions & 1 deletion package.json
@@ -24,7 +24,9 @@
"yargs": "^17.7.2",
"socket.io": "^4.7.2",
"socket.io-client": "^4.7.2",
"express": "^4.18.2"
"express": "^4.18.2",
"node-canvas-webgl": "PrismarineJS/node-canvas-webgl",
"three": "^0.128.0"
},
"scripts": {
"postinstall": "patch-package",
26 changes: 20 additions & 6 deletions profiles/defaults/_default.json
@@ -9,6 +9,8 @@

"bot_responder": "You are a minecraft bot named $NAME that is currently in conversation with another AI bot. Both of you can take actions with the !command syntax, and actions take time to complete. You are currently busy with the following action: '$ACTION' but have received a new message. Decide whether to 'respond' immediately or 'ignore' it and wait for your current action to finish. Be conservative and only respond when necessary, like when you need to change/stop your action, or convey necessary information. Example 1: You:Building a house! !newAction('Build a house.').\nOther Bot: 'Come here!'\nYour decision: ignore\nExample 2: You:Collecting dirt !collectBlocks('dirt',10).\nOther Bot: 'No, collect some wood instead.'\nYour decision: respond\nExample 3: You:Coming to you now. !goToPlayer('billy',3).\nOther Bot: 'What biome are you in?'\nYour decision: respond\nActual Conversation: $TO_SUMMARIZE\nDecide by outputting ONLY 'respond' or 'ignore', nothing else. Your decision:",

"image_conversing": "Summarize the visible Minecraft screen by listing the types and arrangements of blocks, focusing on terrain, structures, and notable features in 400 chars.",

"modes": {
"self_preservation": true,
"unstuck": true,
@@ -65,14 +67,26 @@
{"role": "user", "content": "derg: (FROM OTHER BOT)Thanks, see you later!"},
{"role": "assistant", "content": "See you later. !endConversation(\"derg\")"}
],

[
{"role": "user", "content": "grombo_Xx: What do you see?"},
{"role": "assistant", "content": "Let me see... !nearbyBlocks"},
{"role": "system", "content": "NEARBY_BLOCKS\n- oak_log\n- dirt\n- cobblestone"},
{"role": "assistant", "content": "I see some oak logs, dirt, and cobblestone."}
{"role": "user", "content": "d7297: look here"},
{"role": "assistant", "content": "Okay! !lookAtPlayer(\"d7297\", \"at\")"},
{"role": "system", "content": "Code Output:\nLooking at player d7297. There is a giant waterfall behind d7297."},
{"role": "assistant", "content": "Woo, that is a cool waterfall!"}
],

[
{"role": "user", "content": "d7297: look there"},
{"role": "assistant", "content": "Okay! !lookAtPlayer(\"d7297\", \"with\")"},
{"role": "system", "content": "Code Output:\nLooking in the same direction as d7297\nThere is a castle made of stone."},
{"role": "assistant", "content": "Wow, that stone castle looks amazing!"}
],
[
{"role": "user", "content": "d7297: look 0 70 12 and describe what is there"},
{"role": "assistant", "content": "Sure! !lookAtPosition(0, 70, 12)"},
{"role": "system", "content": "Code Output:\nLooking in the same direction as d7297\nThere is a water fountain."},
{"role": "assistant", "content": "I see the water fountain! amazing!"}
],

[
{"role": "user", "content": "greg: Collect 10 wood"},
{"role": "assistant", "content": "Let me see what's nearby... !nearbyBlocks"},
1 change: 1 addition & 0 deletions settings.js
@@ -35,6 +35,7 @@ export default
"show_bot_views": false, // show bot's view in browser at localhost:3000, 3001...

"allow_insecure_coding": false, // allows newAction command and model can write/run code on your computer. enable at own risk
"allow_vision": false, // allows vision model to interpret screenshots as inputs
"code_timeout_mins": -1, // minutes code is allowed to run. -1 for no timeout
"relevant_docs_count": 5, // Parameter: -1 = all, 0 = no references, 5 = five references. If exceeding the maximum, all reference documents are returned.

3 changes: 3 additions & 0 deletions src/agent/agent.js
@@ -1,5 +1,6 @@
import { History } from './history.js';
import { Coder } from './coder.js';
import { VisionInterpreter } from './vision_interpreter.js';
import { Prompter } from '../models/prompter.js';
import { initModes } from './modes.js';
import { initBot } from '../utils/mcdata.js';
@@ -36,6 +37,8 @@ export class Agent {
this.history = new History(this);
console.log('Initializing coder...');
this.coder = new Coder(this);
console.log('Initializing vision interpreter...');
this.vision_interpreter = new VisionInterpreter(this, settings.allow_vision);
console.log('Initializing npc controller...');
this.npc = new NPCContoller(this);
console.log('Initializing memory bank...');
43 changes: 30 additions & 13 deletions src/agent/commands/actions.js
@@ -407,17 +407,34 @@ export const actionsList = [
return `Conversation with ${player_name} ended.`;
}
},
// { // commented for now, causes confusion with goal command
// name: '!npcGoal',
// description: 'Set a simple goal for an item or building to automatically work towards. Do not use for complex goals.',
// params: {
// 'name': { type: 'string', description: 'The name of the goal to set. Can be item or building name. If empty will automatically choose a goal.' },
// 'quantity': { type: 'int', description: 'The quantity of the goal to set. Default is 1.', domain: [1, Number.MAX_SAFE_INTEGER] }
// },
// perform: async function (agent, name=null, quantity=1) {
// await agent.npc.setGoal(name, quantity);
// agent.bot.emit('idle'); // to trigger the goal
// return 'Set npc goal: ' + agent.npc.data.curr_goal.name;
// }
// },
{
name: '!lookAtPlayer',
description: 'Look at a player or look in the same direction as the player.',
params: {
'player_name': {
type: 'string',
description: 'Name of the target player'
},
'direction': {
type: 'string',
description: 'How to look ("at": look at the player, "with": look in the same direction as the player)',
enum: ['at', 'with']
}
},
perform: runAsAction(async (agent, player_name, direction) => {
await agent.vision_interpreter.lookAtPlayer(player_name, direction);
})
},
{
name: '!lookAtPosition',
description: 'Look at specified coordinates.',
params: {
'x': { type: 'int', description: 'x coordinate' },
'y': { type: 'int', description: 'y coordinate' },
'z': { type: 'int', description: 'z coordinate' }
},
perform: runAsAction(async (agent, x, y, z) => {
await agent.vision_interpreter.lookAtPosition(x, y, z);
})
}
];
4 changes: 3 additions & 1 deletion src/agent/library/skills.js
@@ -1,7 +1,9 @@
import * as mc from "../../utils/mcdata.js";
import { Camera } from "../../utils/camera.js";
import * as world from "./world.js";
import pf from 'mineflayer-pathfinder';
import Vec3 from 'vec3';
import fs from 'fs';


export function log(bot, message) {
@@ -1350,4 +1352,4 @@ export async function activateNearestBlock(bot, type) {
await bot.activateBlock(block);
log(bot, `Activated ${type} at x:${block.position.x.toFixed(1)}, y:${block.position.y.toFixed(1)}, z:${block.position.z.toFixed(1)}.`);
return true;
}
}
104 changes: 104 additions & 0 deletions src/agent/vision_interpreter.js
@@ -0,0 +1,104 @@
import { Vec3 } from 'vec3';
import { Camera } from "../utils/camera.js";
import fs from 'fs';
import { log } from './library/skills.js';
import * as world from './library/world.js';

const pad = (str) => {
return '\n' + str + '\n';
}

export class VisionInterpreter {
constructor(agent, allow_vision) {
this.agent = agent;
this.allow_vision = allow_vision;
this.fp = './bots/'+agent.name+'/screenshots/';
}

async lookAtPlayer(player_name, direction) {
const bot = this.agent.bot;
const player = bot.players[player_name]?.entity;
if (!player) {
log(bot, `Could not find player ${player_name}`);
return;
}

let filename;
if (direction === 'with') {
await bot.look(player.yaw, player.pitch);
const camera = new Camera(bot, this.fp);
await new Promise(resolve => setTimeout(resolve, 500));
log(bot, `Looking in the same direction as ${player_name}`);
filename = await camera.capture();
} else {
await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z));
const camera = new Camera(bot, this.fp);
await new Promise(resolve => setTimeout(resolve, 500));
log(bot, `Looking at player ${player_name}`);
filename = await camera.capture();
}

if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) {
log(this.agent.bot, "Vision is disabled. Using text-based environment description instead.");
log(this.agent.bot, this._nearbyBlocks());
} else {
await this.analyzeImage(filename);
}
}

async lookAtPosition(x, y, z) {
const bot = this.agent.bot;
await bot.lookAt(new Vec3(x, y + 2, z));
const camera = new Camera(bot, this.fp);
await new Promise(resolve => setTimeout(resolve, 500));
log(bot, `Looking at coordinate ${x}, ${y}, ${z}`);

let filename = await camera.capture();

if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) {
log(this.agent.bot, "Vision is disabled. Using text-based environment description instead.");
log(this.agent.bot, this._nearbyBlocks());
} else {
await this.analyzeImage(filename);
}
}

async analyzeImage(filename) {
let prompt = this.agent.prompter.profile.image_conversing;
let res = null;

try {
const bot = this.agent.bot;
const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`);
const messages = this.agent.history.getHistory();
res = await this.agent.prompter.vision_model.sendVisionRequest(messages, prompt, imageBuffer);

if (res == 'Vision is only supported by certain models.') {
log(bot, "Vision may not be supported on this model. Using text-based environment description instead.");
log(bot, this._nearbyBlocks());
} else {
log(bot, res);
}

} catch (error) {
log(this.agent.bot, `Error analyzing image: ${error.message}`);
}
}

_nearbyBlocks() {
const bot = this.agent.bot;
let res = 'NEARBY_BLOCKS';

let blocks = world.getNearbyBlockTypes(bot);
for (let i = 0; i < blocks.length; i++) {
res += `\n- ${blocks[i]}`;
}
if (blocks.length == 0) {
res += ': none';
} else {
// Environmental Awareness
res += '\n- ' + world.getSurroundingBlocks(bot).join('\n- ')
res += `\n- First Solid Block Above Head: ${world.getFirstBlockAboveHead(bot, null, 32)}`;
}
return pad(res);
}
}
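The only contract VisionInterpreter relies on is that the configured vision model exposes a `sendVisionRequest(turns, systemMessage, imageBuffer)` method, as the Claude and Gemini wrappers below do. A minimal sketch of a custom provider (hypothetical names, not part of this diff):

```js
// Hypothetical provider sketch: VisionInterpreter only checks for the presence of
// sendVisionRequest and passes the chat history, a prompt, and a JPEG screenshot buffer.
export class MyVisionModel {
    async sendVisionRequest(turns, systemMessage, imageBuffer) {
        const image_b64 = imageBuffer.toString('base64'); // screenshots are read from disk as JPEG buffers
        // ...forward `turns`, `systemMessage`, and `image_b64` to whatever multimodal API this wraps...
        return 'A short text description of the screenshot.';
    }
}
```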
32 changes: 28 additions & 4 deletions src/models/claude.js
@@ -35,16 +35,40 @@ export class Claude {
res = resp.content[0].text;
}
catch (err) {
if (err.message.includes("does not support image input")) {
res = "Vision is only supported by certain models.";
} else {
res = "My brain disconnected, try again.";
}
console.log(err);
res = 'My brain disconnected, try again.';
}
return res;
}

async sendVisionRequest(turns, systemMessage, imageBuffer) {
const imageMessages = [...turns];
imageMessages.push({
role: "user",
content: [
{
type: "text",
text: systemMessage
},
{
type: "image",
source: {
type: "base64",
media_type: "image/jpeg",
data: imageBuffer.toString('base64')
}
}
]
});

return this.sendRequest(imageMessages, systemMessage);
}

async embed(text) {
throw new Error('Embeddings are not supported by Claude.');
}
}



45 changes: 45 additions & 0 deletions src/models/gemini.js
@@ -78,6 +78,51 @@ export class Gemini {
return text;
}

async sendVisionRequest(turns, systemMessage, imageBuffer) {
let model;
if (this.url) {
model = this.genAI.getGenerativeModel(
{ model: this.model_name || "gemini-1.5-flash" },
{ baseUrl: this.url },
{ safetySettings: this.safetySettings }
);
} else {
model = this.genAI.getGenerativeModel(
{ model: this.model_name || "gemini-1.5-flash" },
{ safetySettings: this.safetySettings }
);
}

const imagePart = {
inlineData: {
data: imageBuffer.toString('base64'),
mimeType: 'image/jpeg'
}
};

const stop_seq = '***';
const prompt = toSinglePrompt(turns, systemMessage, stop_seq, 'model');
let res = null;
try {
console.log('Awaiting Google API vision response...');
const result = await model.generateContent([prompt, imagePart]);
const response = await result.response;
const text = response.text();
console.log('Received.');
if (!text.includes(stop_seq)) return text;
const idx = text.indexOf(stop_seq);
res = text.slice(0, idx);
} catch (err) {
console.log(err);
if (err.message.includes("Image input modality is not enabled for models/")) {
res = "Vision is only supported by certain models.";
} else {
res = "An unexpected error occurred, please try again.";
}
}
return res;
}

async embed(text) {
let model;
if (this.url) {