{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/larawehbe/Documents/fakkerai/sehatech/pinecone-env/lib/python3.11/site-packages/pinecone/data/index.py:1: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from tqdm.autonotebook import tqdm\n",
      "/Users/larawehbe/Documents/fakkerai/sehatech/pinecone-env/lib/python3.11/site-packages/langchain_openai/embeddings/base.py:281: UserWarning: WARNING! return_source_documents is not default parameter.\n",
      "                    return_source_documents was transferred to model_kwargs.\n",
      "                    Please confirm that return_source_documents is what you intended.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "import pinecone\n",
    "from langchain.vectorstores import Pinecone  # not referenced by any cell in this notebook; import kept as-is\n",
    "from langchain_openai import OpenAIEmbeddings\n",
    "from langchain_pinecone import PineconeVectorStore\n",
    "\n",
    "from config import get_settings\n",
    "\n",
    "# Load project settings (API keys and the index name are read from here)\n",
    "settings = get_settings()\n",
    "\n",
    "# Also export the Pinecone key through the environment, as the original cell did\n",
    "os.environ['PINECONE_API_KEY'] = settings.PINECONE_API_KEY\n",
    "\n",
    "# Initialize Pinecone and connect to the existing index\n",
    "pc = pinecone.Pinecone(api_key=settings.PINECONE_API_KEY)\n",
    "index_name = settings.INDEX_NAME\n",
    "index = pc.Index(index_name)\n",
    "\n",
    "# Initialize embeddings (ensure your embedding logic matches the one used during indexing).\n",
    "# `return_source_documents` is not an OpenAIEmbeddings option: it was being moved into\n",
    "# model_kwargs (see the UserWarning this cell used to emit), so it is no longer passed.\n",
    "embeddings = OpenAIEmbeddings(openai_api_key=settings.OPENAI_API_KEY)\n",
    "\n",
    "# Create the vector store on top of the Pinecone index.\n",
    "# Reuse `embeddings` so the explicitly configured API key is used, instead of building a\n",
    "# second OpenAIEmbeddings() that depends on OPENAI_API_KEY already being in the environment.\n",
    "# NOTE: despite its name, `retriever` is the vector store; later cells call `retriever.as_retriever()`.\n",
    "retriever = PineconeVectorStore(index=index, embedding=embeddings)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "# The chat model below is built without an explicit key, so the key from the\n",
    "# project settings is exported through the environment first.\n",
    "os.environ.update(OPENAI_API_KEY=settings.OPENAI_API_KEY)\n",
    "\n",
    "# Define the LLM\n",
    "CHAT_MODEL = \"gpt-4o\"\n",
    "llm = ChatOpenAI(model=CHAT_MODEL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'llm' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 25\u001b[0m\n\u001b[1;32m     17\u001b[0m prompt \u001b[38;5;241m=\u001b[39m ChatPromptTemplate\u001b[38;5;241m.\u001b[39mfrom_messages(\n\u001b[1;32m     18\u001b[0m     [\n\u001b[1;32m     19\u001b[0m         (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msystem\u001b[39m\u001b[38;5;124m\"\u001b[39m, system_prompt),\n\u001b[1;32m     20\u001b[0m         (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhuman\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{input}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m     21\u001b[0m     ]\n\u001b[1;32m     22\u001b[0m )\n\u001b[1;32m     24\u001b[0m \u001b[38;5;66;03m# Use your LLM instance\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m question_answer_chain \u001b[38;5;241m=\u001b[39m create_stuff_documents_chain(\u001b[43mllm\u001b[49m, prompt)\n\u001b[1;32m     27\u001b[0m \u001b[38;5;66;03m# Create the RAG chain\u001b[39;00m\n\u001b[1;32m     28\u001b[0m rag_chain \u001b[38;5;241m=\u001b[39m create_retrieval_chain(retriever\u001b[38;5;241m.\u001b[39mas_retriever(), question_answer_chain)\n",
      "\u001b[0;31mNameError\u001b[0m: name 'llm' is not defined"
     ]
    }
   ],
   "source": [
    "from langchain.chains import create_retrieval_chain\n",
    "from langchain.chains.combine_documents import create_stuff_documents_chain\n",
    "from langchain_core.prompts import ChatPromptTemplate\n",
    "\n",
    "# System instructions, joined with single spaces. `{context}` is a template variable\n",
    "# (presumably filled with the retrieved documents by the stuff-documents chain).\n",
    "instruction_parts = [\n",
    "    \"You are an assistant for question-answering tasks.\",\n",
    "    \"Use the following pieces of retrieved context to answer\",\n",
    "    \"the question. If you don't know the answer, say that you\",\n",
    "    \"don't know. Use three sentences maximum and keep the\",\n",
    "    \"answer concise.\",\n",
    "]\n",
    "system_prompt = \" \".join(instruction_parts) + \"\\n\\n\" + \"{context}\"\n",
    "\n",
    "# Chat prompt: the system instructions followed by the user's `{input}`\n",
    "chat_messages = [\n",
    "    (\"system\", system_prompt),\n",
    "    (\"human\", \"{input}\"),\n",
    "]\n",
    "prompt = ChatPromptTemplate.from_messages(chat_messages)\n",
    "\n",
    "# Chain that passes the documents and the prompt to the LLM (`llm` comes from the previous cell)\n",
    "question_answer_chain = create_stuff_documents_chain(llm, prompt)\n",
    "\n",
    "# RAG chain: retrieve from the vector store, then answer with the chain above\n",
    "doc_retriever = retriever.as_retriever()\n",
    "rag_chain = create_retrieval_chain(doc_retriever, question_answer_chain)\n",
    "\n",
    "# Invoke the RAG chain with a sample input and show the raw result\n",
    "sample_payload = {\"input\": \"pubmed_pdfs/PMC1474056.pdf\"}\n",
    "results = rag_chain.invoke(sample_payload)\n",
    "print(results)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'input': 'What are the benefits of using electronic health records for patient care??', 'context': [Document(id='chunk_100_63', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='The use of sub-cellular localization data and functional\\nannotation as ﬁlters for the predictions increased their overlap\\nwith experimental complexes, as compared with the unﬁl-\\ntered predictions. This ﬁnding is in agreement with previousobservations that combining multiple sources of information\\nimproves the accuracy of function annotation as well as inter-\\naction prediction (9–11). Our method easily allows for theuse of additional biological ﬁlters when other types of\\ndata are available, such as synthetic gene lethality (36),\\nco-expression (37), and so on. This incremental addition oforthogonal information is also necessary to more accuratelyrepresent the conditions in the cellular milieu, where the\\npropensity of two protein structures to interact is not limited\\nonly by the physical chemistry of the interaction, but also byhigher levels of biological regulation, including compartmen-\\ntalization, expression, degradation, abundance and so on.'), Document(id='chunk_100_80', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='23. Ghaemmaghami,S., Huh,W.K., Bower,K., Howson,R.W., Belle,A.,\\nDephoure,N., O’Shea,E.K. and Weissman,J.S. (2003) Global analysis\\nof protein expression in yeast. Nature ,425, 737–741.\\n24. Dwight,S.S., Harris,M.A., Dolinski,K., Ball,C.A., Binkley,G.,\\nChristie,K.R., Fisk,D.G., Issel-Tarver,L., Schroeder,M., Sherlock,G.et al. (2002) Saccharomyces Genome Database (SGD) provides\\nsecondary gene annotation using the gene ontology (GO). Nucleic\\nAcids Res. ,30, 69–72.\\n25. Fawcett,T. (2003) ROC graphs: notes and practical considerations for\\ndata mining researchers. Technical Report HPL-2003-4, HP Labs,Palo Alto, CA, USA.\\n26. 
Pieper,U., Eswar,N., Davis,F.P., Braberg,H., Madhusudhan,M.S.,\\nRossi,A., Marti-Renom,M., Karchin,R., Webb,B.M., Eranian,D. et al.\\n(2006) MODBASE: a database of annotated comparative proteinstructure models and associated resources. Nucleic Acids Res. ,34,\\nD291–D295.\\n27. Eswar,N., John,B., Mirkovic,N., Fiser,A., Ilyin,V.A., Pieper,U.,'), Document(id='chunk_100_73', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='used to help bridge the resolution gap between electroncryo-microscopy (cryo-EM) density maps and atomic protein\\nstructures (41). Fitting of protein and protein domain models\\ninto density maps of large assemblies is already common, butdepending on the resolution, the information encoded in the\\nmap is often insufﬁcient for an unambiguous determination\\nof the positions and orientations of the individual proteins(42). Models of the complexes predicted here may provideadditional restraints for a more accurate ﬁtting of proteins\\ninto large complexes studied by cryo-EM and electron\\ncryo-tomography (14,43).\\nAs the number and size of experimentally determined\\nstructures of protein complexes increase, the number of\\ncomplexes that can be predicted and modeled using thesestructures as templates increases correspondingly, expanding\\nthe structural coverage of protein interaction space (44).'), Document(id='chunk_100_84', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='Chung,S., Vidal,M. and Gerstein,M. (2004) Annotation transfer\\nbetween genomes: protein–protein interologs and protein–DNA\\nregulogs. Genome Res. ,14, 1107–1118.\\n39. Bornberg-Bauer,E., Beaussart,F., Kummerfeld,S.K., Teichmann,S.A.\\nand Weiner,J.,III (2005) The evolution of domain arrangementsin proteins and interaction networks. Cell Mol. Life Sci. ,62,\\n435–445.\\n40. Han,J.D., Bertin,N., Hao,T., Goldberg,D.S., Berriz,G.F., Zhang,L.V.,\\nDupuy,D., Walhout,A.J., Cusick,M.E., Roth,F.P. et al. 
(2004) Evidence\\nfor dynamically organized modularity in the yeast protein-proteininteraction network. Nature ,430, 88–93.\\n41. Topf,M. and Sali,A. (2005) Combining electron microscopy and\\ncomparative protein structure modeling. Curr. Opin. Struct. Biol. ,15,\\n578–585.\\n42. Fabiola,F. and Chapman,M.S. (2005) Fitting of high-resolution\\nstructures into electron microscopy reconstruction images. Structure ,\\n13, 389–400.\\n43. Sali,A., Glaeser,R., Earnest,T. and Baumeister,W. (2003) From words')], 'answer': 'The benefits of using electronic health records (EHRs) for patient care include improved accuracy and accessibility of patient information, which enhances coordination and reduces errors in treatment. EHRs facilitate better communication among healthcare providers, leading to more informed decision-making and streamlined care processes. Additionally, EHRs support data analysis for improved healthcare outcomes and can enhance patient engagement by providing them access to their health information.'}\n"
     ]
    }
   ],
   "source": [
    "question = \"What are the benefits of using electronic health records for patient care??\"\n",
    "results = rag_chain.invoke({\"input\": question})\n",
    "\n",
    "# Extract the answer and the source documents.\n",
    "# The chain result is a dict with 'input', 'context' (the retrieved documents) and 'answer'.\n",
    "answer = results[\"answer\"]\n",
    "source_documents = results[\"context\"]\n",
    "\n",
    "# Show a compact summary instead of dumping the full page content of every document\n",
    "print(\"Answer:\", answer)\n",
    "print(\"Sources:\")\n",
    "for doc in source_documents:\n",
    "    print(f\"- {doc.metadata.get('source')} (page {doc.metadata.get('page')})\")\n",
    "\n",
    "# NOTE(review): in the previously saved run, every retrieved chunk came from a\n",
    "# protein-interaction paper, so the answer may not be grounded in the context - confirm index contents."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input': 'How does the algorithm perform when multiple templates for binding modes are available?',\n",
       " 'context': [Document(id='chunk_100_67', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='The ability of the algorithm to choose the correct binding\\nmode when multiple templates are available was illustrated\\nby evaluation of three alternative binding modes that have\\nbeen structurally characterized between porcine pancreatica-amylase and camelid VHH domains (Figure 5). The\\nalgorithm successfully chose the native binding mode for\\nall three VHH domains. In addition, the statistical potentialscores that were computed for the native binding modesexhibit the same rank order as the afﬁnity of the interactions\\nmeasured by total internal reﬂectance (33).\\nHowever, this example is also cautionary in that each VHH\\ndomain had one non-native mode that scored below the\\noptimal Z-score threshold, though only the native modes\\nproduced negative raw scores (Results). In a large-scalepredictive setting, if the native binding mode was not\\navailable as a template, the VHH domain would have been'),\n",
       "  Document(id='chunk_100_39', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='how many of the predicted complexes were equivalent to,or were subcomplexes of, experimentally determined com-\\nplexes. Since the predictions are based on known structures,\\nthe sizes of the predicted complexes are far smaller than thoseobtained by biochemical methods such as tandem afﬁnitypuriﬁcation methods. For this reason, we elected not to use\\na metric that explicitly penalizes size differences [e.g. the\\nmetric deﬁned in Ref. (16)].\\nBinding mode selection\\nThe ability of the potential to select the proper binding mode\\nwhen multiple template interfaces of different orientation are\\navailable was assessed. The test cases used were the struc-\\ntures of camelid VHH domains AMB7, AMD10 and AMD9bound to porcine pancreatic a-amylase (PPA) (PDB codes\\n1kxt, 1kxv and 1kxq, respectively). All three modes were\\nevaluated for each VHH–PPA complex using the interfacestatistical potential.\\nData sources\\nThe prediction algorithm uses three types of data: (i) target'),\n",
       "  Document(id='chunk_100_68', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='produced negative raw scores (Results). In a large-scalepredictive setting, if the native binding mode was not\\navailable as a template, the VHH domain would have been\\npredicted to interact with PPA, but through an incorrectbinding mode. This example illustrates a connection between\\nthe observed performance and the underlying scoring scheme.\\nHowever, a systematic analysis of alternative binding modesin protein interactions, and the ability of our method to dis-tinguish them, remains a useful goal for the future.2950 Nucleic Acids Research, 2006, Vol. 34, No. 10Network specificities\\nA more difﬁcult test of the method is the prediction of\\nspeciﬁcities within interaction networks between homologous\\nproteins. To address this problem, the method was applied to\\npredict the speciﬁcities within the epidermal growth factorreceptor (EGFR) and tumor necrosis factor b(TNFb) net-\\nworks of ligand receptor interactions (data not shown). In\\nboth networks the method failed to recapitulate known bind-'),\n",
       "  Document(id='chunk_100_56', metadata={'page': 0.0, 'source': 'pubmed_pdfs/PMC1474056.pdf'}, page_content='structural assessment, PSI-BLAST (32) was used to\\npredict binary interactions by detecting similarities between\\nS.cerevisiae proteins and the template complexes. An overlap\\nof 929 binary interactions was observed between the set of\\n36 790 (2.5%) predictions and the 19 424 (4.8%) experimen-\\ntally observed binary interactions.\\nAlternate binding modes\\nThe ability of the algorithm to correctly select the native\\nbinding mode when alternate templates are available wastested. The native binding mode was correctly selected for\\nall three VHH domains interacting with porcine pancreatic\\na-amylase (Figure 5). In addition, the statistical potential\\nscores that were computed for the native binding modesexhibit the same rank-order as the afﬁnity measured experi-\\nmentally by total internal reﬂectance (33).\\nCo-complexed domains\\nAn extension process merged predicted complexes containing\\ndifferent domains of a single target protein (Figure 1c). This')],\n",
       " 'answer': 'The algorithm successfully chooses the native binding mode for all three VHH domains interacting with porcine pancreatic a-amylase when multiple templates are available. The statistical potential scores for the native binding modes exhibit the same rank order as the experimentally measured affinity. However, each VHH domain had one non-native mode that scored below the optimal Z-score threshold, indicating some caution is necessary in large-scale predictive settings.'}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Ask about binding-mode selection when several templates are available\n",
    "question = \"How does the algorithm perform when multiple templates for binding modes are available?\"\n",
    "payload = {\"input\": question}\n",
    "results = rag_chain.invoke(payload)\n",
    "\n",
    "# Leave `results` as the last expression so the notebook displays the whole dict\n",
    "# (input, retrieved context documents, and answer).\n",
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pinecone-env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}