continuedev · Fmstrat · Feb 25, 2025
diff --git a/core/config/types.ts b/core/config/types.ts
@@ -843,6 +843,7 @@ declare global {
     numThreads?: number;
     useMmap?: boolean;
     keepAlive?: number;
+    numGpu?: number;
     raw?: boolean;
     stream?: boolean;
     prediction?: Prediction;

diff --git a/core/control-plane/schema.ts b/core/control-plane/schema.ts
@@ -58,6 +58,7 @@ const modelDescriptionSchema = z.object({
       numThreads: z.number().optional(),
       useMmap: z.boolean().optional(),
       keepAlive: z.number().optional(),
+      numGpu: z.number().optional(),
       raw: z.boolean().optional(),
       stream: z.boolean().optional(),
     })

diff --git a/core/index.d.ts b/core/index.d.ts
@@ -903,6 +903,7 @@ export interface BaseCompletionOptions {
   numThreads?: number;
   useMmap?: boolean;
   keepAlive?: number;
+  numGpu?: number;
   raw?: boolean;
   stream?: boolean;
   prediction?: Prediction;

diff --git a/core/llm/llms/Ollama.ts b/core/llm/llms/Ollama.ts
@@ -33,12 +33,12 @@ interface OllamaModelFileParams {
   top_k?: number;
   top_p?: number;
   min_p?: number;
+  num_gpu?: number;
 
   // deprecated or not directly supported here:
   num_thread?: number;
   use_mmap?: boolean;
   num_gqa?: number;
-  num_gpu?: number;
   num_keep?: number;
   typical_p?: number;
   presence_penalty?: number;
@@ -264,6 +264,7 @@ class Ollama extends BaseLLM {
       num_thread: options.numThreads,
       use_mmap: options.useMmap,
       min_p: options.minP,
+      num_gpu: options.numGpu,
     };
   }
 

diff --git a/docs/docs/json-reference.md b/docs/docs/json-reference.md
@@ -155,6 +155,7 @@ Parameters that control the behavior of text generation and completion settings.
 - `maxTokens`: The maximum number of tokens to generate in a completion (default: `2048`).
 - `numThreads`: The number of threads used during the generation process. Available only for Ollama as `num_thread`.
 - `keepAlive`: For Ollama, this parameter sets the number of seconds to keep the model loaded after the last request, unloading it from memory if inactive (default: `1800` seconds, or 30 minutes).
+- `numGpu`: For Ollama, this parameter overrides the number of model layers offloaded to the GPU (i.e. loaded into VRAM). Passed to Ollama as `num_gpu`.
 - `useMmap`: For Ollama, this parameter allows the model to be mapped into memory. If disabled can enhance response time on low end devices but will slow down the stream.
 
 Example

diff --git a/docs/docs/reference.md b/docs/docs/reference.md
@@ -157,6 +157,7 @@ Parameters that control the behavior of text generation and completion settings.
 - `maxTokens`: The maximum number of tokens to generate in a completion (default: `2048`).
 - `numThreads`: The number of threads used during the generation process. Available only for Ollama as `num_thread`.
 - `keepAlive`: For Ollama, this parameter sets the number of seconds to keep the model loaded after the last request, unloading it from memory if inactive (default: `1800` seconds, or 30 minutes).
+- `numGpu`: For Ollama, this parameter overrides the number of model layers offloaded to the GPU (i.e. loaded into VRAM). Passed to Ollama as `num_gpu`.
 - `useMmap`: For Ollama, this parameter allows the model to be mapped into memory. If disabled can enhance response time on low end devices but will slow down the stream.
 
 Example

diff --git a/docs/i18n/zh-CN/docusaurus-plugin-content-docs/current/reference.md b/docs/i18n/zh-CN/docusaurus-plugin-content-docs/current/reference.md
@@ -155,6 +155,7 @@ Parameters that control the behavior of text generation and completion settings.
 - `maxTokens`: The maximum number of tokens to generate in a completion (default: `2048`).
 - `numThreads`: The number of threads used during the generation process. Available only for Ollama as `num_thread`.
 - `keepAlive`: For Ollama, this parameter sets the number of seconds to keep the model loaded after the last request, unloading it from memory if inactive (default: `1800` seconds, or 30 minutes).
+- `numGpu`: For Ollama, this parameter overrides the number of model layers offloaded to the GPU (i.e. loaded into VRAM). Passed to Ollama as `num_gpu`.
 - `useMmap`: For Ollama, this parameter allows the model to be mapped into memory. If disabled can enhance response time on low end devices but will slow down the stream.
 
 Example

diff --git a/extensions/vscode/config_schema.json b/extensions/vscode/config_schema.json
@@ -71,6 +71,11 @@
           "title": "Ollama keep_alive",
           "description": "The number of seconds after no requests are made to unload the model from memory. Defaults to 60*30 = 30min",
           "type": "integer"
+        },
+        "numGpu": {
+          "title": "Ollama num_gpu",
+          "description": "Override the number of model layers offloaded to the GPU when loading the model",
+          "type": "integer"
         }
       }
     },

diff --git a/packages/config-types/src/index.ts b/packages/config-types/src/index.ts
@@ -13,6 +13,7 @@ export const completionOptionsSchema = z.object({
   numThreads: z.number().optional(),
   useMmap: z.boolean().optional(),
   keepAlive: z.number().optional(),
+  numGpu: z.number().optional(),
   raw: z.boolean().optional(),
   stream: z.boolean().optional(),
 });