Skip to content

Commit fd06466

Browse files
authoredApr 19, 2024
Add files via upload
1 parent 1eb0926 commit fd06466

File tree

1 file changed

+626
-0
lines changed

1 file changed

+626
-0
lines changed
 
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,626 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "abf98e2c-a73d-45d6-8dc8-c0c86a894adb",
6+
"metadata": {},
7+
"source": [
8+
"# Question-Answering Application Using LangChain and Chromadb"
9+
]
10+
},
11+
{
12+
"cell_type": "markdown",
13+
"id": "60f0ee27-75b0-4091-8714-95b632bbb20c",
14+
"metadata": {},
15+
"source": [
16+
"### Installing required libraries"
17+
]
18+
},
19+
{
20+
"cell_type": "code",
21+
"execution_count": 1,
22+
"id": "9465caf5-6ac9-4cb4-9f16-10e66fb6a1a7",
23+
"metadata": {},
24+
"outputs": [],
25+
"source": [
26+
"!pip install openai -q\n",
27+
"!pip install langchain -q\n",
28+
"!pip install -q chromadb\n",
29+
"!pip install python-dotenv -q\n",
30+
"!pip install tiktoken -q\n",
31+
"!pip install pypdf -q\n",
32+
"!pip install docx2txt -q"
33+
]
34+
},
35+
{
36+
"cell_type": "markdown",
37+
"id": "14685b00-308e-4148-9bbe-d6d639692533",
38+
"metadata": {},
39+
"source": [
40+
"### Load Environment Variables"
41+
]
42+
},
43+
{
44+
"cell_type": "code",
45+
"execution_count": 3,
46+
"id": "1d0c1517-8f25-47a2-bfbf-f38833544550",
47+
"metadata": {},
48+
"outputs": [
49+
{
50+
"name": "stdout",
51+
"output_type": "stream",
52+
"text": [
53+
"API Key Loaded: True\n"
54+
]
55+
}
56+
],
57+
"source": [
import os

from dotenv import find_dotenv, load_dotenv

# Read variables from the .env file located by find_dotenv(); override=True is
# passed so values from the file replace ones already set in the environment.
load_dotenv(find_dotenv(), override=True)

# Report only whether the key is present; the secret itself is never printed.
print("API Key Loaded:", os.environ.get('OPENAI_API_KEY') is not None)
65+
]
66+
},
67+
{
68+
"cell_type": "markdown",
69+
"id": "d87b1b89-0a66-4d5e-8fde-2cf99d0e1920",
70+
"metadata": {},
71+
"source": [
72+
"### Load documents with different formats"
73+
]
74+
},
75+
{
76+
"cell_type": "code",
77+
"execution_count": 4,
78+
"id": "1d657673-4dac-42ce-ab62-471ea9e4220c",
79+
"metadata": {},
80+
"outputs": [],
81+
"source": [
def extract_text_from_document(file):
    """Load a PDF or DOCX file with the matching LangChain document loader.

    The loader is selected from the file extension. The extension is compared
    case-insensitively, so names such as ``Report.PDF`` are accepted.

    Args:
        file: Path of the document to load.

    Returns:
        The value returned by the selected loader's ``load()`` method, or
        ``None`` when the extension is neither ``.pdf`` nor ``.docx``.
    """
    import os

    _, extension = os.path.splitext(file)
    # Normalise case so '.PDF' / '.Docx' are treated like '.pdf' / '.docx'.
    extension = extension.lower()

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader_cls = Docx2txtLoader
    else:
        print('Document format is not supported by our application!')
        return None

    print(f'Loading {file}')
    return loader_cls(file).load()
100+
]
101+
},
102+
{
103+
"cell_type": "markdown",
104+
"id": "075e173d-4832-48a2-be10-31008d0bb0db",
105+
"metadata": {},
106+
"source": [
107+
"### Chunking Strategies and splitting the documents"
108+
]
109+
},
110+
{
111+
"cell_type": "code",
112+
"execution_count": 5,
113+
"id": "823f7daa-1935-4613-a805-5769953cadca",
114+
"metadata": {},
115+
"outputs": [],
116+
"source": [
def split_text_into_chunks(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into chunks with RecursiveCharacterTextSplitter.

    Args:
        data: Documents to split; passed unchanged to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``. Defaults
            to 0, which was the previously hard-coded value.

    Returns:
        The chunks returned by the splitter's ``split_documents`` call.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)
123+
]
124+
},
125+
{
126+
"cell_type": "markdown",
127+
"id": "002a7874-dd21-4fb4-9496-4c36c97b2f46",
128+
"metadata": {},
129+
"source": [
130+
"### Create the function for answering questions"
131+
]
132+
},
133+
{
134+
"cell_type": "code",
135+
"execution_count": 6,
136+
"id": "6616d5ed-19d6-4f29-8a1f-2aed551c181a",
137+
"metadata": {},
138+
"outputs": [],
139+
"source": [
def generate_answer_from_vector_store(vector_store, question, model='gpt-4', temperature=1, k=3):
    """Answer a question with a RetrievalQA chain backed by ``vector_store``.

    A similarity retriever is obtained from the vector store and combined with
    a ``ChatOpenAI`` model in a ``RetrievalQA`` chain built with
    ``chain_type="stuff"``.

    Args:
        vector_store: Object exposing ``as_retriever(search_type=..., search_kwargs=...)``.
        question: The question passed to ``chain.invoke``.
        model: Model name given to ``ChatOpenAI``. Defaults to ``'gpt-4'``.
        temperature: Temperature given to ``ChatOpenAI``. The default of 1 is
            kept for backward compatibility; callers wanting less varied
            answers can pass a lower value.
        k: Value sent to the retriever as ``search_kwargs={'k': k}``.
            Defaults to 3.

    Returns:
        Whatever ``chain.invoke(question)`` returns. The testing cell in this
        notebook reads the answer text from its ``'result'`` key.
    """
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model=model, temperature=temperature)
    retriever = vector_store.as_retriever(
        search_type='similarity',
        search_kwargs={'k': k},
    )
    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    return chain.invoke(question)
153+
]
154+
},
155+
{
156+
"cell_type": "markdown",
157+
"id": "69eb9733-9d11-408b-b84c-dc552875ed1f",
158+
"metadata": {},
159+
"source": [
160+
"### Define the function that creates the embeddings"
161+
]
162+
},
163+
{
164+
"cell_type": "code",
165+
"execution_count": 7,
166+
"id": "ebb25535-d400-4873-8713-573f3cdd28c7",
167+
"metadata": {},
168+
"outputs": [],
169+
"source": [
def create_embeddings_chroma(chunks, persist_directory='./chroma_db'):
    """Build a Chroma vector store from document chunks.

    Args:
        chunks: Documents handed to ``Chroma.from_documents``.
        persist_directory: Forwarded to ``Chroma.from_documents`` as
            ``persist_directory``. Defaults to ``'./chroma_db'``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    from langchain.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    # Same embedding configuration as load_embeddings_chroma uses.
    embedder = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

    return Chroma.from_documents(
        chunks,
        embedder,
        persist_directory=persist_directory,
    )
178+
]
179+
},
180+
{
181+
"cell_type": "markdown",
182+
"id": "fc2ec332-9ddf-45de-9611-23e4ea1ff34d",
183+
"metadata": {},
184+
"source": [
185+
"### Define Load Embedding Function"
186+
]
187+
},
188+
{
189+
"cell_type": "code",
190+
"execution_count": 8,
191+
"id": "f747c7b3-7c65-4041-909d-79547607853a",
192+
"metadata": {},
193+
"outputs": [],
194+
"source": [
def load_embeddings_chroma(persist_directory='./chroma_db'):
    """Open a Chroma vector store for the given persistence directory.

    Args:
        persist_directory: Passed to the ``Chroma`` constructor as
            ``persist_directory``. Defaults to ``'./chroma_db'``.

    Returns:
        A ``Chroma`` instance constructed with that directory and an
        ``OpenAIEmbeddings`` embedding function.
    """
    from langchain.vectorstores import Chroma
    from langchain_openai import OpenAIEmbeddings

    # Same embedding configuration as create_embeddings_chroma uses.
    embedder = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

    return Chroma(
        persist_directory=persist_directory,
        embedding_function=embedder,
    )
202+
]
203+
},
204+
{
205+
"cell_type": "markdown",
206+
"id": "f1766950-f03a-4a23-b6a1-65fd70b6ddcf",
207+
"metadata": {},
208+
"source": [
209+
"### Testing"
210+
]
211+
},
212+
{
213+
"cell_type": "code",
214+
"execution_count": 9,
215+
"id": "49cc8bcb-79f2-428c-a577-7887f73aa6f6",
216+
"metadata": {},
217+
"outputs": [
218+
{
219+
"name": "stdout",
220+
"output_type": "stream",
221+
"text": [
222+
"Loading Files/GoogleGeminiFamily.pdf\n"
223+
]
224+
}
225+
],
226+
"source": [
# Index the sample document: load it, split it into chunks, then embed the
# chunks into a Chroma vector store.
SOURCE_FILE = 'Files/GoogleGeminiFamily.pdf'

data = extract_text_from_document(SOURCE_FILE)
# extract_text_from_document returns None for unsupported formats; stop here
# with a clear message instead of failing later inside the splitter.
if data is None:
    raise ValueError(f'Could not load {SOURCE_FILE}: unsupported document format')

chunks = split_text_into_chunks(data, chunk_size=256)
vector_store = create_embeddings_chroma(chunks)
230+
]
231+
},
232+
{
233+
"cell_type": "code",
234+
"execution_count": 12,
235+
"id": "53dc93e0-7818-46ba-bb23-2195b25c6c8e",
236+
"metadata": {},
237+
"outputs": [
238+
{
239+
"name": "stdout",
240+
"output_type": "stream",
241+
"text": [
242+
"The \"Gemini family\" likely refers to the members involved in the Gemini project at Google. This includes Google DeepMind (GDM), Google Research (GR), Knowledge and Information (K&I), Core ML, Cloud, Labs, and more.\n"
243+
]
244+
}
245+
],
246+
"source": [
# Ask a sample question against the vector store built in the previous cell.
question = 'What is Google Gemini Family?'
answer = generate_answer_from_vector_store(vector_store, question)

# Show only the generated text, which is read from the 'result' key.
print(answer['result'])
250+
]
251+
},
252+
{
253+
"cell_type": "code",
254+
"execution_count": null,
255+
"id": "7e634b6f-d093-4e60-883f-da73cb217208",
256+
"metadata": {},
257+
"outputs": [],
258+
"source": []
259+
},
260+
{
261+
"cell_type": "code",
262+
"execution_count": null,
263+
"id": "739f01ac-b0c3-40fb-a0f9-776f2864fc63",
264+
"metadata": {},
265+
"outputs": [],
266+
"source": []
267+
},
268+
{
269+
"cell_type": "code",
270+
"execution_count": null,
271+
"id": "ac2bc391-987b-48aa-a6a8-2c1f2a564286",
272+
"metadata": {},
273+
"outputs": [],
274+
"source": []
275+
},
276+
{
277+
"cell_type": "code",
278+
"execution_count": null,
279+
"id": "35603e84-2565-43ca-b417-c6c7c1b26d98",
280+
"metadata": {},
281+
"outputs": [],
282+
"source": []
283+
},
284+
{
285+
"cell_type": "code",
286+
"execution_count": null,
287+
"id": "c33b2365-6323-4e17-b66d-9b7d796cfd30",
288+
"metadata": {},
289+
"outputs": [],
290+
"source": []
291+
},
292+
{
293+
"cell_type": "code",
294+
"execution_count": null,
295+
"id": "0f570688-7aac-42f4-998e-f478ca2b5554",
296+
"metadata": {},
297+
"outputs": [],
298+
"source": []
299+
},
300+
{
301+
"cell_type": "code",
302+
"execution_count": null,
303+
"id": "1a2aca9c-4806-45fe-a9cd-a5beabbaf2b9",
304+
"metadata": {},
305+
"outputs": [],
306+
"source": []
307+
},
308+
{
309+
"cell_type": "code",
310+
"execution_count": null,
311+
"id": "c04ea573-6dd5-4daf-9863-954a643679be",
312+
"metadata": {},
313+
"outputs": [],
314+
"source": []
315+
},
316+
{
317+
"cell_type": "code",
318+
"execution_count": null,
319+
"id": "c2ca1315-5e66-4626-996c-954347792ced",
320+
"metadata": {},
321+
"outputs": [],
322+
"source": []
323+
},
324+
{
325+
"cell_type": "code",
326+
"execution_count": null,
327+
"id": "ae5d6c33-587c-400a-bbdf-b8dc6b095eb5",
328+
"metadata": {},
329+
"outputs": [],
330+
"source": []
331+
},
332+
{
333+
"cell_type": "code",
334+
"execution_count": null,
335+
"id": "f2a19225-faa2-4b2b-b3f8-925b97e10805",
336+
"metadata": {},
337+
"outputs": [],
338+
"source": []
339+
},
340+
{
341+
"cell_type": "code",
342+
"execution_count": null,
343+
"id": "af14f695-8bc3-41fe-9fac-fa11eb8854d8",
344+
"metadata": {},
345+
"outputs": [],
346+
"source": []
347+
},
348+
{
349+
"cell_type": "code",
350+
"execution_count": null,
351+
"id": "bb2ae2c7-99a0-4a17-ac13-9e3b0f759cfe",
352+
"metadata": {},
353+
"outputs": [],
354+
"source": []
355+
},
356+
{
357+
"cell_type": "code",
358+
"execution_count": null,
359+
"id": "525a71cd-42ad-4e59-aa4f-4d06cdf6946e",
360+
"metadata": {},
361+
"outputs": [],
362+
"source": []
363+
},
364+
{
365+
"cell_type": "code",
366+
"execution_count": null,
367+
"id": "0608142a-ff40-47cb-a7ab-36263eea6666",
368+
"metadata": {},
369+
"outputs": [],
370+
"source": []
371+
},
372+
{
373+
"cell_type": "code",
374+
"execution_count": null,
375+
"id": "991312b7-0509-4fe6-b3e8-b7a4fd75778d",
376+
"metadata": {},
377+
"outputs": [],
378+
"source": []
379+
},
380+
{
381+
"cell_type": "code",
382+
"execution_count": null,
383+
"id": "e52bbc40-2840-4eaa-ac53-12104760a2c7",
384+
"metadata": {},
385+
"outputs": [],
386+
"source": []
387+
},
388+
{
389+
"cell_type": "code",
390+
"execution_count": null,
391+
"id": "8e049c5a-0e98-4d72-ba16-cd73b5d408fb",
392+
"metadata": {},
393+
"outputs": [],
394+
"source": []
395+
},
396+
{
397+
"cell_type": "code",
398+
"execution_count": null,
399+
"id": "bb336154-bd7c-428e-9fcd-e81e9f16ac4a",
400+
"metadata": {},
401+
"outputs": [],
402+
"source": []
403+
},
404+
{
405+
"cell_type": "code",
406+
"execution_count": null,
407+
"id": "f8da1d23-d4f8-46ad-aaf9-8999db15fdcd",
408+
"metadata": {},
409+
"outputs": [],
410+
"source": []
411+
},
412+
{
413+
"cell_type": "code",
414+
"execution_count": null,
415+
"id": "4c990776-f9ec-4494-b321-73f413754949",
416+
"metadata": {},
417+
"outputs": [],
418+
"source": []
419+
},
420+
{
421+
"cell_type": "code",
422+
"execution_count": null,
423+
"id": "3bbfb4b5-b92d-4087-90f7-752000760634",
424+
"metadata": {},
425+
"outputs": [],
426+
"source": []
427+
},
428+
{
429+
"cell_type": "code",
430+
"execution_count": null,
431+
"id": "40da3482-2788-4dd9-97c4-b1abfb3dffde",
432+
"metadata": {},
433+
"outputs": [],
434+
"source": []
435+
},
436+
{
437+
"cell_type": "code",
438+
"execution_count": null,
439+
"id": "2a4af43c-046a-4036-8ebe-606ac585fa26",
440+
"metadata": {},
441+
"outputs": [],
442+
"source": []
443+
},
444+
{
445+
"cell_type": "code",
446+
"execution_count": null,
447+
"id": "e6b69d07-8552-47c9-a9b6-ad143a0888de",
448+
"metadata": {},
449+
"outputs": [],
450+
"source": []
451+
},
452+
{
453+
"cell_type": "code",
454+
"execution_count": null,
455+
"id": "9cf33d97-8185-4175-82d4-0b07dfe749a4",
456+
"metadata": {},
457+
"outputs": [],
458+
"source": []
459+
},
460+
{
461+
"cell_type": "code",
462+
"execution_count": null,
463+
"id": "285a7728-a011-4b0a-9b0f-c343ef5f5efc",
464+
"metadata": {},
465+
"outputs": [],
466+
"source": []
467+
},
468+
{
469+
"cell_type": "code",
470+
"execution_count": null,
471+
"id": "e3fa6b4d-284d-4a5b-b4b3-da45979201bf",
472+
"metadata": {},
473+
"outputs": [],
474+
"source": []
475+
},
476+
{
477+
"cell_type": "code",
478+
"execution_count": null,
479+
"id": "6cf6cf16-bd3b-44a9-90c2-e9397dca71ea",
480+
"metadata": {},
481+
"outputs": [],
482+
"source": []
483+
},
484+
{
485+
"cell_type": "code",
486+
"execution_count": null,
487+
"id": "44636c38-8218-4f2c-b537-12fb26b3ab34",
488+
"metadata": {},
489+
"outputs": [],
490+
"source": []
491+
},
492+
{
493+
"cell_type": "code",
494+
"execution_count": null,
495+
"id": "0525deb1-44c9-4aaf-8ee1-33a13d328dc5",
496+
"metadata": {},
497+
"outputs": [],
498+
"source": []
499+
},
500+
{
501+
"cell_type": "code",
502+
"execution_count": null,
503+
"id": "d700d7d2-fb46-46b7-9e24-600346b01438",
504+
"metadata": {},
505+
"outputs": [],
506+
"source": []
507+
},
508+
{
509+
"cell_type": "code",
510+
"execution_count": null,
511+
"id": "78d48ec2-22c3-41f4-bf79-0063c6da5cde",
512+
"metadata": {},
513+
"outputs": [],
514+
"source": []
515+
},
516+
{
517+
"cell_type": "code",
518+
"execution_count": null,
519+
"id": "95c90c49-0785-40c9-b829-8afa8a7c2e48",
520+
"metadata": {},
521+
"outputs": [],
522+
"source": []
523+
},
524+
{
525+
"cell_type": "code",
526+
"execution_count": null,
527+
"id": "ac3b7fb9-c11c-4db9-9e4f-d7c3e50da79b",
528+
"metadata": {},
529+
"outputs": [],
530+
"source": []
531+
},
532+
{
533+
"cell_type": "code",
534+
"execution_count": null,
535+
"id": "6de9e1e4-007b-4318-b800-105377decc2c",
536+
"metadata": {},
537+
"outputs": [],
538+
"source": []
539+
},
540+
{
541+
"cell_type": "code",
542+
"execution_count": null,
543+
"id": "55339b66-3306-4e2e-b91e-208a9eb659f3",
544+
"metadata": {},
545+
"outputs": [],
546+
"source": []
547+
},
548+
{
549+
"cell_type": "code",
550+
"execution_count": null,
551+
"id": "8cc33950-49e9-43b5-8ea6-b1a54d9b08d9",
552+
"metadata": {},
553+
"outputs": [],
554+
"source": []
555+
},
556+
{
557+
"cell_type": "code",
558+
"execution_count": null,
559+
"id": "6c9dd290-72da-444a-b710-87a0000eb07c",
560+
"metadata": {},
561+
"outputs": [],
562+
"source": []
563+
},
564+
{
565+
"cell_type": "code",
566+
"execution_count": null,
567+
"id": "ca513d66-486c-4406-8ab7-2462ff5fa11b",
568+
"metadata": {},
569+
"outputs": [],
570+
"source": []
571+
},
572+
{
573+
"cell_type": "code",
574+
"execution_count": null,
575+
"id": "09b47f9c-f61a-47c0-827a-faeeaede244e",
576+
"metadata": {},
577+
"outputs": [],
578+
"source": []
579+
},
580+
{
581+
"cell_type": "code",
582+
"execution_count": null,
583+
"id": "a2d61b94-0a95-49c7-851c-e27cdee11ac4",
584+
"metadata": {},
585+
"outputs": [],
586+
"source": []
587+
},
588+
{
589+
"cell_type": "code",
590+
"execution_count": null,
591+
"id": "2b1a3f61-bd96-4fbc-a8c4-82d63b7cd510",
592+
"metadata": {},
593+
"outputs": [],
594+
"source": []
595+
},
596+
{
597+
"cell_type": "code",
598+
"execution_count": null,
599+
"id": "3a0a7de4-ce55-4f67-9895-65e43ddaa6cf",
600+
"metadata": {},
601+
"outputs": [],
602+
"source": []
603+
}
604+
],
605+
"metadata": {
606+
"kernelspec": {
607+
"display_name": "Python 3 (ipykernel)",
608+
"language": "python",
609+
"name": "python3"
610+
},
611+
"language_info": {
612+
"codemirror_mode": {
613+
"name": "ipython",
614+
"version": 3
615+
},
616+
"file_extension": ".py",
617+
"mimetype": "text/x-python",
618+
"name": "python",
619+
"nbconvert_exporter": "python",
620+
"pygments_lexer": "ipython3",
621+
"version": "3.11.7"
622+
}
623+
},
624+
"nbformat": 4,
625+
"nbformat_minor": 5
626+
}

0 commit comments

Comments
 (0)
Please sign in to comment.