{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RAG question answering over a Pinecone index\n",
    "\n",
    "Setup: connect to an existing Pinecone index, wrap it in a LangChain vector store, and create the OpenAI chat model. API keys and the index name are read from `config.get_settings()`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pinecone\n",
    "# NOTE(review): `Pinecone` from langchain.vectorstores is not used in this notebook; import kept as-is.\n",
    "from langchain.vectorstores import Pinecone\n",
    "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
    "from langchain_pinecone import PineconeVectorStore\n",
    "\n",
    "from config import get_settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "settings = get_settings()\n",
    "\n",
    "# Also export the keys, for any code that reads them from the environment.\n",
    "os.environ[\"PINECONE_API_KEY\"] = settings.PINECONE_API_KEY\n",
    "os.environ[\"OPENAI_API_KEY\"] = settings.OPENAI_API_KEY"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connect to the existing index.\n",
    "pc = pinecone.Pinecone(api_key=settings.PINECONE_API_KEY)\n",
    "index_name = settings.INDEX_NAME\n",
    "index = pc.Index(index_name)\n",
    "\n",
    "# Query embeddings; the embedding logic must match the one used during indexing.\n",
    "# The key is passed explicitly so this cell does not depend on environment state.\n",
    "embeddings = OpenAIEmbeddings(openai_api_key=settings.OPENAI_API_KEY)\n",
    "\n",
    "# NOTE: despite its name, `retriever` is the vector store itself. Later cells call\n",
    "# `retriever.as_retriever()` on it, so the name is kept.\n",
    "retriever = PineconeVectorStore(index=index, embedding=embeddings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chat model that writes the answers; key passed explicitly, like the embeddings above.\n",
    "llm = ChatOpenAI(model=\"gpt-4o\", openai_api_key=settings.OPENAI_API_KEY)"
   ]
}, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'llm' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[1], line 25\u001b[0m\n\u001b[1;32m 17\u001b[0m prompt \u001b[38;5;241m=\u001b[39m ChatPromptTemplate\u001b[38;5;241m.\u001b[39mfrom_messages(\n\u001b[1;32m 18\u001b[0m [\n\u001b[1;32m 19\u001b[0m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msystem\u001b[39m\u001b[38;5;124m\"\u001b[39m, system_prompt),\n\u001b[1;32m 20\u001b[0m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhuman\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{input}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m 21\u001b[0m ]\n\u001b[1;32m 22\u001b[0m )\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# Use your LLM instance\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m question_answer_chain \u001b[38;5;241m=\u001b[39m create_stuff_documents_chain(\u001b[43mllm\u001b[49m, prompt)\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# Create the RAG chain\u001b[39;00m\n\u001b[1;32m 28\u001b[0m rag_chain \u001b[38;5;241m=\u001b[39m create_retrieval_chain(retriever\u001b[38;5;241m.\u001b[39mas_retriever(), question_answer_chain)\n", "\u001b[0;31mNameError\u001b[0m: name 'llm' is not defined" ] } ], "source": [ "from langchain.chains import create_retrieval_chain\n", "from langchain.chains.combine_documents import create_stuff_documents_chain\n", "from langchain_core.prompts import ChatPromptTemplate\n", "\n", "# Define the system prompt\n", "system_prompt = (\n", " \"You are an assistant for question-answering tasks. \"\n", " \"Use the following pieces of retrieved context to answer \"\n", " \"the question. If you don't know the answer, say that you \"\n", " \"don't know. 
Use three sentences maximum and keep the \"\n", " \"answer concise.\"\n", " \"\\n\\n\"\n", " \"{context}\"\n", ")\n", "\n", "# Create the chat prompt template\n", "prompt = ChatPromptTemplate.from_messages(\n", " [\n", " (\"system\", system_prompt),\n", " (\"human\", \"{input}\"),\n", " ]\n", ")\n", "\n", "# Use your LLM instance\n", "question_answer_chain = create_stuff_documents_chain(llm, prompt)\n", "\n", "# Create the RAG chain\n", "rag_chain = create_retrieval_chain(retriever.as_retriever(), question_answer_chain)\n", "\n", "# Invoke the RAG chain with a sample question\n", "results = rag_chain.invoke({\"input\": \"pubmed_pdfs/PMC1474056.pdf\"})\n", "print(results)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input': 'What are the benefits of using electronic health records for patient care??', 'context': [Document(id='chunk_100_63', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='The use of sub-cellular localization data and functional\\nannotation as filters for the predictions increased their overlap\\nwith experimental complexes, as compared with the unfil-\\ntered predictions. This finding is in agreement with previousobservations that combining multiple sources of information\\nimproves the accuracy of function annotation as well as inter-\\naction prediction (9–11). Our method easily allows for theuse of additional biological filters when other types of\\ndata are available, such as synthetic gene lethality (36),\\nco-expression (37), and so on. 
This incremental addition oforthogonal information is also necessary to more accuratelyrepresent the conditions in the cellular milieu, where the\\npropensity of two protein structures to interact is not limited\\nonly by the physical chemistry of the interaction, but also byhigher levels of biological regulation, including compartmen-\\ntalization, expression, degradation, abundance and so on.'), Document(id='chunk_100_80', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='23. Ghaemmaghami,S., Huh,W.K., Bower,K., Howson,R.W., Belle,A.,\\nDephoure,N., O’Shea,E.K. and Weissman,J.S. (2003) Global analysis\\nof protein expression in yeast. Nature ,425, 737–741.\\n24. Dwight,S.S., Harris,M.A., Dolinski,K., Ball,C.A., Binkley,G.,\\nChristie,K.R., Fisk,D.G., Issel-Tarver,L., Schroeder,M., Sherlock,G.et al. (2002) Saccharomyces Genome Database (SGD) provides\\nsecondary gene annotation using the gene ontology (GO). Nucleic\\nAcids Res. ,30, 69–72.\\n25. Fawcett,T. (2003) ROC graphs: notes and practical considerations for\\ndata mining researchers. Technical Report HPL-2003-4, HP Labs,Palo Alto, CA, USA.\\n26. Pieper,U., Eswar,N., Davis,F.P., Braberg,H., Madhusudhan,M.S.,\\nRossi,A., Marti-Renom,M., Karchin,R., Webb,B.M., Eranian,D. et al.\\n(2006) MODBASE: a database of annotated comparative proteinstructure models and associated resources. Nucleic Acids Res. ,34,\\nD291–D295.\\n27. Eswar,N., John,B., Mirkovic,N., Fiser,A., Ilyin,V.A., Pieper,U.,'), Document(id='chunk_100_73', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='used to help bridge the resolution gap between electroncryo-microscopy (cryo-EM) density maps and atomic protein\\nstructures (41). 
Fitting of protein and protein domain models\\ninto density maps of large assemblies is already common, butdepending on the resolution, the information encoded in the\\nmap is often insufficient for an unambiguous determination\\nof the positions and orientations of the individual proteins(42). Models of the complexes predicted here may provideadditional restraints for a more accurate fitting of proteins\\ninto large complexes studied by cryo-EM and electron\\ncryo-tomography (14,43).\\nAs the number and size of experimentally determined\\nstructures of protein complexes increase, the number of\\ncomplexes that can be predicted and modeled using thesestructures as templates increases correspondingly, expanding\\nthe structural coverage of protein interaction space (44).'), Document(id='chunk_100_84', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='Chung,S., Vidal,M. and Gerstein,M. (2004) Annotation transfer\\nbetween genomes: protein–protein interologs and protein–DNA\\nregulogs. Genome Res. ,14, 1107–1118.\\n39. Bornberg-Bauer,E., Beaussart,F., Kummerfeld,S.K., Teichmann,S.A.\\nand Weiner,J.,III (2005) The evolution of domain arrangementsin proteins and interaction networks. Cell Mol. Life Sci. ,62,\\n435–445.\\n40. Han,J.D., Bertin,N., Hao,T., Goldberg,D.S., Berriz,G.F., Zhang,L.V.,\\nDupuy,D., Walhout,A.J., Cusick,M.E., Roth,F.P. et al. (2004) Evidence\\nfor dynamically organized modularity in the yeast protein-proteininteraction network. Nature ,430, 88–93.\\n41. Topf,M. and Sali,A. (2005) Combining electron microscopy and\\ncomparative protein structure modeling. Curr. Opin. Struct. Biol. ,15,\\n578–585.\\n42. Fabiola,F. and Chapman,M.S. (2005) Fitting of high-resolution\\nstructures into electron microscopy reconstruction images. Structure ,\\n13, 389–400.\\n43. Sali,A., Glaeser,R., Earnest,T. and Baumeister,W. 
(2003) From words')], 'answer': 'The benefits of using electronic health records (EHRs) for patient care include improved accuracy and accessibility of patient information, which enhances coordination and reduces errors in treatment. EHRs facilitate better communication among healthcare providers, leading to more informed decision-making and streamlined care processes. Additionally, EHRs support data analysis for improved healthcare outcomes and can enhance patient engagement by providing them access to their health information.'}\n" ] } ], "source": [ "results = rag_chain.invoke({\"input\": \"What are the benefits of using electronic health records for patient care??\"})\n", "\n", "# Extract the answer and the source documents\n", "print(results)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'input': 'How does the algorithm perform when multiple templates for binding modes are available?',\n", " 'context': [Document(id='chunk_100_67', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='The ability of the algorithm to choose the correct binding\\nmode when multiple templates are available was illustrated\\nby evaluation of three alternative binding modes that have\\nbeen structurally characterized between porcine pancreatica-amylase and camelid VHH domains (Figure 5). The\\nalgorithm successfully chose the native binding mode for\\nall three VHH domains. In addition, the statistical potentialscores that were computed for the native binding modesexhibit the same rank order as the affinity of the interactions\\nmeasured by total internal reflectance (33).\\nHowever, this example is also cautionary in that each VHH\\ndomain had one non-native mode that scored below the\\noptimal Z-score threshold, though only the native modes\\nproduced negative raw scores (Results). 
In a large-scalepredictive setting, if the native binding mode was not\\navailable as a template, the VHH domain would have been'),\n", " Document(id='chunk_100_39', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='how many of the predicted complexes were equivalent to,or were subcomplexes of, experimentally determined com-\\nplexes. Since the predictions are based on known structures,\\nthe sizes of the predicted complexes are far smaller than thoseobtained by biochemical methods such as tandem affinitypurification methods. For this reason, we elected not to use\\na metric that explicitly penalizes size differences [e.g. the\\nmetric defined in Ref. (16)].\\nBinding mode selection\\nThe ability of the potential to select the proper binding mode\\nwhen multiple template interfaces of different orientation are\\navailable was assessed. The test cases used were the struc-\\ntures of camelid VHH domains AMB7, AMD10 and AMD9bound to porcine pancreatic a-amylase (PPA) (PDB codes\\n1kxt, 1kxv and 1kxq, respectively). All three modes were\\nevaluated for each VHH–PPA complex using the interfacestatistical potential.\\nData sources\\nThe prediction algorithm uses three types of data: (i) target'),\n", " Document(id='chunk_100_68', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='produced negative raw scores (Results). In a large-scalepredictive setting, if the native binding mode was not\\navailable as a template, the VHH domain would have been\\npredicted to interact with PPA, but through an incorrectbinding mode. This example illustrates a connection between\\nthe observed performance and the underlying scoring scheme.\\nHowever, a systematic analysis of alternative binding modesin protein interactions, and the ability of our method to dis-tinguish them, remains a useful goal for the future.2950 Nucleic Acids Research, 2006, Vol. 34, No. 
10Network specificities\\nA more difficult test of the method is the prediction of\\nspecificities within interaction networks between homologous\\nproteins. To address this problem, the method was applied to\\npredict the specificities within the epidermal growth factorreceptor (EGFR) and tumor necrosis factor b(TNFb) net-\\nworks of ligand receptor interactions (data not shown). In\\nboth networks the method failed to recapitulate known bind-'),\n", " Document(id='chunk_100_56', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='structural assessment, PSI-BLAST (32) was used to\\npredict binary interactions by detecting similarities between\\nS.cerevisiae proteins and the template complexes. An overlap\\nof 929 binary interactions was observed between the set of\\n36 790 (2.5%) predictions and the 19 424 (4.8%) experimen-\\ntally observed binary interactions.\\nAlternate binding modes\\nThe ability of the algorithm to correctly select the native\\nbinding mode when alternate templates are available wastested. The native binding mode was correctly selected for\\nall three VHH domains interacting with porcine pancreatic\\na-amylase (Figure 5). In addition, the statistical potential\\nscores that were computed for the native binding modesexhibit the same rank-order as the affinity measured experi-\\nmentally by total internal reflectance (33).\\nCo-complexed domains\\nAn extension process merged predicted complexes containing\\ndifferent domains of a single target protein (Figure 1c). This')],\n", " 'answer': 'The algorithm successfully chooses the native binding mode for all three VHH domains interacting with porcine pancreatic a-amylase when multiple templates are available. The statistical potential scores for the native binding modes exhibit the same rank order as the experimentally measured affinity. 
However, each VHH domain had one non-native mode that scored below the optimal Z-score threshold, indicating some caution is necessary in large-scale predictive settings.'}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results = rag_chain.invoke({\"input\": \"How does the algorithm perform when multiple templates for binding modes are available?\"})\n", "\n", "\n", "# Ensure the output includes the answer and source documents\n", "\n", "results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "pinecone-env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.10" } }, "nbformat": 4, "nbformat_minor": 2 }