ingest.py
# Standard library
import csv
import os
import pickle
from pathlib import Path

# Third-party
import faiss
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

load_dotenv()  # take environment variables from .env
# OpenAIEmbeddings reads OPENAI_API_KEY from the environment by itself;
# this lookup only keeps a local copy of the key
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Recursively collect every CSV file under Data/
file_paths = list(Path("Data/").glob("**/*.csv"))
docs = []
metadatas = []
# Split on newlines into chunks of at most ~1900 characters; note that
# CharacterTextSplitter only cuts at the separator, so a row joined into a
# single line below (no "\n") is kept as one chunk even if it is longer
text_splitter = CharacterTextSplitter(chunk_size=1900, separator="\n")
for file_path in file_paths:
    try:
        # 'with' ensures the file is closed even if parsing fails;
        # newline="" is how the csv module expects files to be opened
        with open(file_path, "r", newline="") as file:
            csv_reader = csv.DictReader(file)
            # Process each row in the CSV
            for row in csv_reader:
                # Concatenate column values into a single string, each value
                # prefixed with its column name
                data = " ".join(f"{k}: {v}" for k, v in row.items())
                # Split the row text into chunks
                splits = text_splitter.split_text(data)
                docs.extend(splits)
                # Associate each chunk with its source file
                metadatas.extend([{"source": str(file_path)}] * len(splits))
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
# Embed the chunks and build the FAISS vector store
store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
# Save the raw FAISS index separately: the underlying index object
# cannot be pickled directly
faiss.write_index(store.index, "docs.index")
# Strip the index, then pickle the rest of the store (texts and metadata)
store.index = None
with open("faiss_store.pkl", "wb") as f:
    pickle.dump(store, f)
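
# --- Usage sketch (not part of the original script) ---
# A minimal example of how a downstream script might reconstitute the vector
# store from the two artifacts written above. The function name load_store()
# is hypothetical; it assumes "docs.index" and "faiss_store.pkl" exist in the
# working directory. It is defined but never called here, so running
# ingest.py is unaffected.

def load_store():
    """Rebuild the FAISS vector store from docs.index and faiss_store.pkl."""
    index = faiss.read_index("docs.index")
    with open("faiss_store.pkl", "rb") as f:
        loaded = pickle.load(f)
    # Re-attach the raw index that was stripped out before pickling
    loaded.index = index
    return loaded

# Example query against the rebuilt store (query text is illustrative):
# store = load_store()
# results = store.similarity_search("your question here", k=4)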