Doux Thibault committed on
Commit
3e299e4
·
1 Parent(s): 720638f

rag script and more requirements

Browse files
Files changed (2) hide show
  1. Modules/rag.py +81 -0
  2. requirements.txt +7 -1
Modules/rag.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ['TOKENIZERS_PARALLELISM'] = 'true'
3
+ os.environ['MISTRAL_API_KEY'] = "i5jSJkCFNGKfgIztloxTMjfckiFbYBj4"
4
+ os.environ['OPENAI_API_KEY'] = ""
5
+ os.environ['TAVILY_API_KEY'] = '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'
6
+
7
+ mistral_api_key = os.getenv("MISTRAL_API_KEY")
8
+ tavily_api_key = os.getenv("TAVILY_API_KEY")
9
+
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from langchain_community.document_loaders import WebBaseLoader
12
+ from langchain_community.vectorstores import Chroma, FAISS
13
+ from langchain_mistralai import MistralAIEmbeddings
14
+ from langchain_openai import OpenAIEmbeddings
15
+ from typing import Literal
16
+
17
+ from langchain_core.prompts import ChatPromptTemplate
18
+ from langchain_core.pydantic_v1 import BaseModel, Field
19
+ from langchain_mistralai import ChatMistralAI
20
+ from sentence_transformers import SentenceTransformer
21
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
22
+ from transformers import AutoModel, AutoTokenizer
23
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
24
+
25
+ urls = [
26
+ "https://lilianweng.github.io/posts/2023-06-23-agent/",
27
+ "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
28
+ "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
29
+ ]
30
+
31
+ docs = [WebBaseLoader(url).load() for url in urls]
32
+ docs_list = [item for sublist in docs for item in sublist]
33
+
34
+ text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
35
+ chunk_size=250, chunk_overlap=0
36
+ )
37
+ doc_splits = text_splitter.split_documents(docs_list)
38
+
39
+ ##################### EMBED #####################
40
+ # embeddings = MistralAIEmbeddings(mistral_api_key=mistral_api_key)
41
+ embeddings = OpenAIEmbeddings()
42
+ ############## VECTORSTORE ##################
43
+ # vectorstore = FAISS.from_documents(
44
+ # documents=doc_splits,
45
+ # embedding=embeddings
46
+ # )
47
+ vectorstore = Chroma.from_documents(
48
+ documents=doc_splits,
49
+ collection_name="rag-chroma",
50
+ embedding=embeddings
51
+ )
52
+ retriever = vectorstore.as_retriever()
53
+
54
+ # Data model
55
+ class RouteQuery(BaseModel):
56
+ """Route a user query to the most relevant datasource."""
57
+
58
+ datasource: Literal["vectorstore", "websearch"] = Field(
59
+ ...,
60
+ description="Given a user question choose to route it to web search or a vectorstore.",
61
+ )
62
+
63
+ # LLM with function call
64
+ # llm = ChatMistralAI(model="mistral-large-latest", mistral_api_key=mistral_api_key, temperature=0)
65
+
66
+ # structured_llm_router = llm.with_structured_output(RouteQuery)
67
+
68
+ # # Prompt
69
+ # system = """You are an expert at routing a user question to a vectorstore or web search.
70
+ # The vectorstore contains documents related to agents, prompt engineering, and adversarial attacks.
71
+ # Use the vectorstore for questions on these topics. For all else, use web-search."""
72
+ # route_prompt = ChatPromptTemplate.from_messages(
73
+ # [
74
+ # ("system", system),
75
+ # ("human", "{question}"),
76
+ # ]
77
+ # )
78
+
79
+ # question_router = route_prompt | structured_llm_router
80
+ # print(question_router.invoke({"question": "Who will the Bears draft first in the NFL draft?"}))
81
+ # print(question_router.invoke({"question": "What are the types of agent memory?"}))
requirements.txt CHANGED
@@ -5,4 +5,10 @@ langchain-mistralai
5
  pandas
6
  langchain-community
7
  st_audiorec
8
- openai-whisper
 
 
 
 
 
 
 
5
  pandas
6
  langchain-community
7
  st_audiorec
8
+ openai-whisper
9
+ tiktoken
10
+ langchain
11
+ bs4
12
+ chromadb
13
+ langgraph
14
+ langchainhub