research embeddings

gespispace · gespispace · commit 17009c622fad · 2024-05-20T17:11:09.000+03:00
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -216,4 +216,4 @@
     "@xenova/transformers": "^2.17.1",
     "langchain": "^0.1.17"
   }
-}
+}
diff --git a/src/extension.ts b/src/extension.ts
@@ -11,6 +11,7 @@ import { tokenizer } from "./common/prompt/tokenizer";
 import { login } from "./common/auth";
 import { secretsStorage } from "./common/utils/secretStore";
 import { getSuppabaseClient } from "./common/auth/supabaseClient";
+import { startTest } from "./test";
 
 export async function activate(context: vscode.ExtensionContext) {
   FirecoderTelemetrySenderInstance.init(context);
@@ -40,6 +41,7 @@ export async function activate(context: vscode.ExtensionContext) {
 
   statusBar.init(context);
   await tokenizer.init();
+  startTest();
 
   context.subscriptions.push(
     vscode.commands.registerCommand(
diff --git a/src/hft.ts b/src/hft.ts
@@ -0,0 +1,122 @@
+import { Embeddings, type EmbeddingsParams } from "@langchain/core/embeddings";
+import { chunkArray } from "@langchain/core/utils/chunk_array";
+import { getSaveFolder } from "./common/download/utils";
+
+export interface HuggingFaceTransformersEmbeddingsParams
+  extends EmbeddingsParams {
+  /** Model id handed to the transformers.js `pipeline` call, e.g. "Xenova/all-MiniLM-L6-v2". */
+  modelName: string;
+
+  /**
+   * Optional timeout value.
+   * NOTE(review): this description was inherited from an OpenAI-based embedder;
+   * nothing in this file issues a request that uses it - confirm whether the
+   * option is still needed.
+   */
+  timeout?: number;
+
+  /**
+   * The maximum number of texts grouped into a single embedding call.
+   */
+  batchSize?: number;
+
+  /**
+   * Whether to replace new lines in the input text with spaces before
+   * embedding. The practice comes from OpenAI's embedding guidance (presumably
+   * kept for parity) and may not be suitable for all use cases.
+   */
+  stripNewLines?: boolean;
+}
+
+/**
+ * LangChain `Embeddings` implementation that computes embeddings locally by
+ * running a feature-extraction pipeline from `@xenova/transformers`.
+ *
+ * The pipeline is created lazily on first use, shared by all subsequent
+ * calls, and its model files are cached in the folder returned by
+ * `getSaveFolder()`.
+ *
+ * Failures while loading the model or running inference are propagated to the
+ * caller (after whatever retry policy `this.caller` applies) instead of being
+ * swallowed.
+ *
+ * @example
+ * ```typescript
+ * const model = new HuggingFaceTransformersEmbeddingsLocal({
+ *   modelName: "Xenova/all-MiniLM-L6-v2",
+ * });
+ *
+ * // Embed a single query
+ * const res = await model.embedQuery(
+ *   "What would be a good company name for a company that makes colorful socks?"
+ * );
+ * console.log({ res });
+ *
+ * // Embed multiple documents
+ * const documentRes = await model.embedDocuments(["Hello world", "Bye bye"]);
+ * console.log({ documentRes });
+ * ```
+ */
+export class HuggingFaceTransformersEmbeddingsLocal
+  extends Embeddings
+  implements HuggingFaceTransformersEmbeddingsParams
+{
+  /** Model id passed to the transformers.js pipeline. */
+  modelName = "Xenova/all-MiniLM-L6-v2";
+
+  /** Number of texts sent to the pipeline per call. */
+  batchSize = 1;
+
+  /** When true, "\n" in every input is replaced by a space before embedding. */
+  stripNewLines = true;
+
+  /** Stored for interface compatibility; not read anywhere in this class. */
+  timeout?: number;
+
+  /** Lazily created pipeline, shared by all calls once loading has started. */
+  private pipelinePromise?: Promise<any>;
+
+  /**
+   * @param fields Optional overrides for the defaults above; also forwarded
+   *   to the `Embeddings` base constructor.
+   */
+  constructor(fields?: Partial<HuggingFaceTransformersEmbeddingsParams>) {
+    super(fields ?? {});
+
+    this.modelName = fields?.modelName ?? this.modelName;
+    // Previously `batchSize` was accepted by the params type but never applied.
+    this.batchSize = fields?.batchSize ?? this.batchSize;
+    this.stripNewLines = fields?.stripNewLines ?? this.stripNewLines;
+    this.timeout = fields?.timeout;
+  }
+
+  /**
+   * Embeds every text, preserving input order.
+   *
+   * @param texts Documents to embed.
+   * @returns One embedding vector per input text.
+   * @throws Whatever the pipeline load or inference throws.
+   */
+  async embedDocuments(texts: string[]): Promise<number[][]> {
+    const batches = chunkArray(
+      texts.map((text) => this.prepareText(text)),
+      this.batchSize
+    );
+
+    const batchResponses = await Promise.all(
+      batches.map((batch) => this.runEmbedding(batch))
+    );
+
+    const embeddings: number[][] = [];
+    for (const batchResponse of batchResponses) {
+      embeddings.push(...batchResponse);
+    }
+    return embeddings;
+  }
+
+  /**
+   * Embeds a single query string.
+   *
+   * @param text Query to embed.
+   * @returns The embedding vector for `text`.
+   * @throws Whatever the pipeline load or inference throws.
+   */
+  async embedQuery(text: string): Promise<number[]> {
+    const [embedding] = await this.runEmbedding([this.prepareText(text)]);
+    return embedding;
+  }
+
+  /** Applies the `stripNewLines` option to one input text. */
+  private prepareText(text: string): string {
+    return this.stripNewLines ? text.replace(/\n/g, " ") : text;
+  }
+
+  /**
+   * Returns the shared pipeline, starting the load on first use.
+   *
+   * The promise is stored synchronously, before any `await`, so concurrent
+   * callers reuse a single load instead of each creating their own pipeline.
+   * A failed load is forgotten so that a later call can try again.
+   */
+  private getPipeline(): Promise<any> {
+    if (this.pipelinePromise !== undefined) {
+      return this.pipelinePromise;
+    }
+
+    const created: Promise<any> = (async () => {
+      const { pipeline } = await import("@xenova/transformers");
+      return pipeline("feature-extraction", this.modelName, {
+        cache_dir: await getSaveFolder(),
+        quantized: true,
+      });
+    })();
+
+    this.pipelinePromise = created;
+    created.catch(() => {
+      // Do not cache a rejected promise forever.
+      if (this.pipelinePromise === created) {
+        this.pipelinePromise = undefined;
+      }
+    });
+
+    return created;
+  }
+
+  /**
+   * Runs the pipeline on one batch through `this.caller`.
+   *
+   * @param texts Already-prepared texts of a single batch.
+   * @returns One embedding vector per text in the batch.
+   */
+  private async runEmbedding(texts: string[]): Promise<number[][]> {
+    return this.caller.call(async () => {
+      const pipe = await this.getPipeline();
+
+      // Pooling experiments recorded by the author (numbers kept verbatim;
+      // presumably similarity score and rank of the expected document):
+      //   cls  -> 0.537, place 1
+      //   mean -> 0.601, place 13
+      //   none -> unusable results
+      const output = await pipe(texts, {
+        pooling: "cls",
+        normalize: true,
+      });
+      return output.tolist();
+    });
+  }
+}
diff --git a/src/test.ts b/src/test.ts
@@ -0,0 +1,49 @@
+import { HuggingFaceTransformersEmbeddingsLocal } from "./hft";
+import { DirectoryLoader } from "langchain/document_loaders/fs/directory";
+import { TextLoader } from "langchain/document_loaders/fs/text";
+import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
+import { MemoryVectorStore } from "langchain/vectorstores/memory";
+
+/**
+ * Ad-hoc retrieval experiment: loads every `.ts` file under `sourceDir`,
+ * splits the files into chunks, embeds the chunks with
+ * `HuggingFaceTransformersEmbeddingsLocal` into an in-memory vector store and
+ * logs the `topK` chunks most similar to `query` together with their scores.
+ *
+ * All parameters are optional; the defaults reproduce the original
+ * hard-coded experiment, so `startTest()` behaves as before.
+ *
+ * Errors (missing directory, model download failure, ...) reject the returned
+ * promise; a caller that does not `await` this function should attach a
+ * `.catch()` handler.
+ *
+ * @param sourceDir Directory that is scanned for `.ts` files.
+ * @param query Natural-language query used for the similarity search.
+ * @param topK Number of results to log.
+ */
+export const startTest = async (
+  sourceDir = "/home/gespispace/helper/helper-coder/src",
+  query = "what properties do we send with each events to telemetry?",
+  topK = 20
+) => {
+  const loader = new DirectoryLoader(sourceDir, {
+    ".ts": (path) => new TextLoader(path),
+  });
+  const docs = await loader.load();
+
+  const splitter = RecursiveCharacterTextSplitter.fromLanguage("js", {
+    chunkSize: 4000,
+    chunkOverlap: 0,
+  });
+  const chunks = await splitter.splitDocuments(docs);
+
+  const embeddings = new HuggingFaceTransformersEmbeddingsLocal({
+    // Alternative model tried: "jinaai/jina-embeddings-v2-base-code"
+    modelName: "Xenova/bge-m3",
+    // Embed one batch at a time.
+    maxConcurrency: 1,
+  });
+  const vectorStore = await MemoryVectorStore.fromDocuments(chunks, embeddings);
+
+  const results = await vectorStore.similaritySearchWithScore(query, topK);
+  console.log(results);
+
+  // To smoke-test the embedder alone, call `embedQuery` / `embedDocuments`
+  // directly as shown in the class example in ./hft.
+};

Original file line number	Diff line number	Diff line change
`@@ -216,4 +216,4 @@`
`216`	`216`	`"@xenova/transformers": "^2.17.1",`
`217`	`217`	`"langchain": "^0.1.17"`
`218`	`218`	`}`
`219`		`-}`
	`219`	`+}`