diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..95780644263ecc62dd91867afd13c6f1fcc2d920 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,35 +1,4 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Auto detect text files and perform LF normalization
+* text=auto
+assets/llamole.png filter=lfs diff=lfs merge=lfs -text
+assets/ui_example.png filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e65bbe3e61c5d2c38a82a44df6ea59e2ab926331
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,172 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+# custom .gitignore
+cache/
+output/
+wandb/
+# Ignore everything in data directory
+run_script*
+
+saves/*
+!saves/README.md
+
+.gradio
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..b09cd7856d58590578ee1a4f3ad45d1310a97f87
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README.md b/README.md
index f71dbcd28ab6d925eb7db48e7e9739d92d410223..d0757408fe88552cf1840fb904e5a7fdaa0ed4a2 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,190 @@
---
title: Llamole
-emoji: 🐢
-colorFrom: gray
-colorTo: purple
+app_file: launch.py
sdk: gradio
sdk_version: 5.15.0
-app_file: app.py
-pinned: false
---
+
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+# 🔬 🧪 Llamole: Multimodal Large Language Models for Inverse Molecular Design with Retrosynthetic Planning
+
+[![](https://img.shields.io/badge/Paper-pink?style=plastic&logo=arXiv)](https://arxiv.org/abs/2410.04223)
+[![](https://img.shields.io/badge/GitHub-blue?style=plastic&logo=github)](https://github.com/liugangcode/Llamole)
+[![](https://img.shields.io/badge/Huggingface-orange?style=plastic&logo=huggingface)](https://huggingface.co/collections/liuganghuggingface/llamole-collection-67073a2e76b47d5fea909434)
+
+
+
+## 🌟 Introduction
+
+**Llamole** is a multimodal Large Language Model (LLM) that integrates a base LLM with the Graph Diffusion Transformer and Graph Neural Networks for multi-conditional molecular generation and multi-step reaction inference within texts.
+
+📄 **[Paper](https://arxiv.org/abs/2410.04223)**: *Multimodal Large Language Models for Inverse Molecular Design with Retrosynthetic Planning*
+
+
+
+
+ 🔍 Abstract
+
+ While large language models (LLMs) have integrated images, adapting them to graphs remains challenging, limiting their applications in materials and drug design. This difficulty stems from the need for coherent autoregressive generation across texts and graphs. To address this, we introduce Llamole, the first multimodal LLM capable of interleaved text and graph generation, enabling molecular inverse design with retrosynthetic planning. Llamole integrates a base LLM with the Graph Diffusion Transformer and Graph Neural Networks for multi-conditional molecular generation and reaction inference within texts, while the LLM, with enhanced molecular understanding, flexibly controls activation among the different graph modules. Additionally, Llamole integrates A* search with LLM-based cost functions for efficient retrosynthetic planning. We create benchmarking datasets and conduct extensive experiments to evaluate Llamole against in-context learning and supervised fine-tuning. Llamole significantly outperforms 14 adapted LLMs across 12 metrics for controllable molecular design and retrosynthetic planning.
+
+
+
+
+![Llamole Design](assets/llamole.png)
+
+
+
+---
+
+## 🛠️ Environment Setup
+
+Initialize the environment by following these steps:
+
+```bash
+conda create --name llamole python=3.11 -y
+conda activate llamole
+./install_environment.sh
+```
+
+Alternatively, you can install all required dependencies using the `requirements.sh` script.
+
+---
+
+## 🚀 Model Usage
+
+### 🔧 Requirements
+
+- **Hardware**: A single V100 or A6000 GPU for inference.
+- **Configuration Files**:
+ - `config/train/{model}_lora.yaml`
+  - `config/generate/{model}_{task}.yaml`
+
+  Here, `{model}` is one of `llama`, `mistral`, or `qwen`, and `{task}` is `drug` or `material`, matching the files under `config/`.
+
+### 📥 Automatic Model Download
+
+On the first run, the necessary models will be automatically downloaded, including:
+
+1. **Base LLMs** (please ensure you have access to these models):
+   - [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B)
+ - [Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct)
+ - [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)
+
+2. **Pretrained Graph Models**:
+ - **Graph Decoder**: [Graph Diffusion Transformer](https://huggingface.co/liuganghuggingface/Llamole-Pretrained-GraphDiT)
+ - **Graph Encoder**: [GIN-based Encoder](https://huggingface.co/liuganghuggingface/Llamole-Pretrained-GraphEncoder)
+ - **Graph Predictor**: [GIN-based Predictor](https://huggingface.co/liuganghuggingface/Llamole-Pretrained-GNNPredictor)
+
+3. **Adapters and Connectors** for integrating the base LLM with pretrained graph models.
+
+### 🗂️ Manual Model Download
+
+If you prefer to download the models manually, refer to the [Huggingface collection](https://huggingface.co/collections/liuganghuggingface/llamole-collection-67073a2e76b47d5fea909434) and place them in the following directories (see the download sketch after this list):
+
+- `saves/graph_decoder`
+- `saves/graph_encoder`
+- `saves/graph_predictor`
+- `saves/{model_name}-Adapter`
+
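+One possible way to script this, assuming `huggingface_hub` is available (it ships as a dependency of `transformers`), is the following sketch; the repo IDs come from the collection above, and only the Qwen2 adapter is shown, so substitute the Llama or Mistral adapter repo as needed:
+
+```python
+# Sketch: fetch the pretrained graph modules and one adapter into saves/.
+# Repo IDs are taken from the Llamole Huggingface collection linked above.
+from huggingface_hub import snapshot_download
+
+repos = {
+    "liuganghuggingface/Llamole-Pretrained-GraphDiT": "saves/graph_decoder",
+    "liuganghuggingface/Llamole-Pretrained-GraphEncoder": "saves/graph_encoder",
+    "liuganghuggingface/Llamole-Pretrained-GNNPredictor": "saves/graph_predictor",
+    "liuganghuggingface/Llamole-Qwen2-7B-Instruct-Adapter": "saves/Qwen2-7B-Instruct-Adapter",
+}
+
+for repo_id, local_dir in repos.items():
+    snapshot_download(repo_id=repo_id, local_dir=local_dir)
+```
+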
+---
+
+### 🕸️ Gradio-based Web UI
+
+Launch the web interface using Gradio:
+
+```bash
+python launch.py
+```
+
+The default base LLM is **Qwen2-7B-Instruct**. To change it, modify the `args_dict` variable in `launch.py`; a hypothetical sketch of this override follows this paragraph. Upon launch, the web UI will appear as shown below:
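+
+As a purely illustrative sketch (the authoritative key names live in `args_dict` inside `launch.py`; the names below mirror the generation configs in `config/generate/` and are assumptions):
+
+```python
+# Hypothetical: key names mirror config/generate/*.yaml; check args_dict
+# in launch.py for the actual names before relying on this.
+args_dict = {
+    "model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+    "adapter_name_or_path": "saves/Mistral-7B-Instruct-v0.3-Adapter",
+    "graph_lm_connector_path": "saves/Mistral-7B-Instruct-v0.3-Adapter/connector",
+    "template": "mistral",
+}
+```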
+
+
+
+![Web UI Example](assets/ui_example.png)
+
+
+
+---
+
+### 💻 Command Line Usage
+
+For command-line evaluation, specify the path to the configuration file:
+
+```bash
+python main.py eval config/generate/qwen_material.yaml
+```
+
+You can modify the configuration files to suit your custom datasets.
+
+**Note**: Examples of training and evaluation datasets are available in the `data` folder. For more details, refer to `data/dataset_info.json`. To test generation on all MolQA questions, first download the dataset by running:
+
+```bash
+python main.py download_data
+```
+
+Then, update the configuration files to point to the downloaded dataset, using the names registered in `data/dataset_info.json`; a short sketch follows.
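+
+As a minimal sketch (assuming PyYAML is available in the environment), retargeting the example configs at the full MolQA splits registered in `data/dataset_info.json` is a one-key change per file; note that round-tripping through PyYAML drops the comments inside the configs:
+
+```python
+# Sketch: point the example configs at the full MolQA splits.
+# Dataset names come from data/dataset_info.json.
+import yaml
+
+updates = {
+    "config/generate/qwen_material.yaml": "molqa_material",  # was molqa_material_examples
+    "config/train/qwen_lora.yaml": "molqa_train",            # was molqa_train_examples
+}
+
+for path, dataset in updates.items():
+    with open(path) as f:
+        cfg = yaml.safe_load(f)
+    cfg["dataset"] = dataset
+    with open(path, "w") as f:
+        yaml.dump(cfg, f, sort_keys=False)
+```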
+
+---
+
+## 📚 Supervised Fine-Tuning
+
+The codebase supports multimodal graph-text supervised fine-tuning. Follow these steps:
+
+1. **Download MolQA Training Data**:
+
+ ```bash
+ python main.py download_data
+ ```
+   Then, update the configuration files in the `config` folder to point to the downloaded training data (for example, set `dataset: molqa_train`, as in the configuration sketch in the Command Line Usage section). If you skip this step, the command in step 2 will train only on the example training set.
+
+2. **Run Fine-Tuning**:
+
+ ```bash
+ python main.py train config/train/mistral_lora.yaml
+ ```
+
+   During the first run, the pretrained graph models will be downloaded to the `saves` folder. Modify the configuration files as needed for your setup. An 80 GB A100 GPU is recommended for supervised fine-tuning.
+
+---
+## 📖 Citation
+
+If you find this repository useful, please cite our paper:
+
+```bibtex
+@misc{liu2024llamole,
+ title={Multimodal Large Language Models for Inverse Molecular Design with Retrosynthetic Planning},
+ author={Gang Liu and Michael Sun and Wojciech Matusik and Meng Jiang and Jie Chen},
+ year={2024},
+ eprint={2410.04223},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG},
+ url={https://arxiv.org/abs/2410.04223},
+}
+```
+
+```bibtex
+@article{liu2024graphdit,
+ title={Graph Diffusion Transformers for Multi-Conditional Molecular Generation},
+ author={Liu, Gang and Xu, Jiaxin and Luo, Tengfei and Jiang, Meng},
+ journal={Thirty-Eighth Annual Conference on Neural Information Processing Systems},
+ year={2024}
+}
+```
+
+---
+
+## 📄 Acknowledgments
+
+This codebase is built upon **[Llama-Factory](https://github.com/hiyouga/LLaMA-Factory)**. We extend our gratitude for their open-source contributions.
+
+---
+
+## 📂 Additional Resources
+
+🔗 **Huggingface Models**: Llamole is developed with three variants (adapters) and three pretrained graph modules (encoder, decoder, predictor):
+- **Base LLM Variant 1**: [Llama-3.1-8B-Instruct](https://huggingface.co/liuganghuggingface/Llamole-Llama-3.1-8B-Instruct-Adapter)
+- **Base LLM Variant 2**: [Qwen2-7B-Instruct](https://huggingface.co/liuganghuggingface/Llamole-Qwen2-7B-Instruct-Adapter)
+- **Base LLM Variant 3**: [Mistral-7B-Instruct-v0.3](https://huggingface.co/liuganghuggingface/Llamole-Mistral-7B-Instruct-v0.3-Adapter)
+- **Pretrained Graph Decoder** for multi-conditional molecular generation: [Graph Diffusion Transformer](https://huggingface.co/liuganghuggingface/Llamole-Pretrained-GraphDiT)
+- **Pretrained Graph Predictor** for one-step reaction prediction: [GNN Predictor](https://huggingface.co/liuganghuggingface/Llamole-Pretrained-GNNPredictor)
+- **Pretrained Graph Encoder** for enhanced molecule understanding: [Graph Encoder](https://huggingface.co/liuganghuggingface/Llamole-Pretrained-GraphEncoder)
+
+---
diff --git a/assets/llamole.png b/assets/llamole.png
new file mode 100644
index 0000000000000000000000000000000000000000..219559fac13f78692b09fc18d490de6481a6f7eb
--- /dev/null
+++ b/assets/llamole.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f98ad05d897399cd4db1c618f78509e19b0b832f3aa2d938121bbc5a6e77166
+size 771154
diff --git a/assets/ui_example.png b/assets/ui_example.png
new file mode 100644
index 0000000000000000000000000000000000000000..f45c02874b33de5f64a5daf3188e181a0ab407db
--- /dev/null
+++ b/assets/ui_example.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36a5f766fa0f9da927fea34d046939d88fc07ddbdaa3f35244f5bf6ea873b246
+size 150578
diff --git a/config/generate/llama_drug.yaml b/config/generate/llama_drug.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bae79f29709d14eed525cd369ae70d3e6d27d87b
--- /dev/null
+++ b/config/generate/llama_drug.yaml
@@ -0,0 +1,30 @@
+### model
+model_name_or_path: meta-llama/Meta-Llama-3.1-8B-Instruct
+new_special_tokens: ,,,,,,,,
+graph_decoder_path: saves/graph_decoder
+graph_encoder_path: saves/graph_encoder
+graph_predictor_path: saves/graph_predictor
+adapter_name_or_path: saves/Llama-3.1-8B-Instruct-Adapter
+graph_lm_connector_path: saves/Llama-3.1-8B-Instruct-Adapter/connector
+### generation
+max_length: 512
+max_new_tokens: 128
+temperature: 0.6
+top_p: 0.9
+### method
+finetuning_type: lora
+do_train: false
+flash_attn: disabled
+learned_query_size: 8
+### dataset
+dataset: molqa_drug_examples
+template: llama3
+cutoff_len: 128
+overwrite_cache: true
+preprocessing_num_workers: 16
+output_dir: null
+bf16: true
+pure_bf16: true
+### eval
+per_device_eval_batch_size: 6
+report_to: 'none'
\ No newline at end of file
diff --git a/config/generate/llama_material.yaml b/config/generate/llama_material.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e06e2c848f08c5dd77ee58431d22e26ad941f7c8
--- /dev/null
+++ b/config/generate/llama_material.yaml
@@ -0,0 +1,30 @@
+### model
+model_name_or_path: meta-llama/Meta-Llama-3.1-8B-Instruct
+new_special_tokens: ,,,,,,,,
+graph_decoder_path: saves/graph_decoder
+graph_encoder_path: saves/graph_encoder
+graph_predictor_path: saves/graph_predictor
+adapter_name_or_path: saves/Llama-3.1-8B-Instruct-Adapter
+graph_lm_connector_path: saves/Llama-3.1-8B-Instruct-Adapter/connector
+### generation
+max_length: 512
+max_new_tokens: 128
+temperature: 0.6
+top_p: 0.9
+### method
+finetuning_type: lora
+do_train: false
+flash_attn: disabled
+learned_query_size: 8
+### dataset
+dataset: molqa_material_examples
+template: llama3
+cutoff_len: 128
+overwrite_cache: true
+preprocessing_num_workers: 16
+output_dir: null
+bf16: true
+pure_bf16: true
+### eval
+per_device_eval_batch_size: 6
+report_to: 'none'
\ No newline at end of file
diff --git a/config/generate/mistral_drug.yaml b/config/generate/mistral_drug.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bc6ad005380b534e5e75bdedeed3b540882a64cd
--- /dev/null
+++ b/config/generate/mistral_drug.yaml
@@ -0,0 +1,30 @@
+### model
+model_name_or_path: mistralai/Mistral-7B-Instruct-v0.3
+new_special_tokens: ,,,,,,,,
+graph_decoder_path: saves/graph_decoder
+graph_encoder_path: saves/graph_encoder
+graph_predictor_path: saves/graph_predictor
+adapter_name_or_path: saves/Mistral-7B-Instruct-v0.3-Adapter
+graph_lm_connector_path: saves/Mistral-7B-Instruct-v0.3-Adapter/connector
+### generation
+max_length: 512
+max_new_tokens: 128
+temperature: 0.6
+top_p: 0.9
+### method
+finetuning_type: lora
+do_train: false
+flash_attn: disabled
+learned_query_size: 8
+### dataset
+dataset: molqa_drug_examples
+template: mistral
+cutoff_len: 128
+overwrite_cache: true
+preprocessing_num_workers: 16
+output_dir: null
+bf16: true
+pure_bf16: true
+### eval
+per_device_eval_batch_size: 6
+report_to: 'none'
\ No newline at end of file
diff --git a/config/generate/mistral_material.yaml b/config/generate/mistral_material.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5f578dbf11926055588cc2fe2c449ee9b9d7e1d6
--- /dev/null
+++ b/config/generate/mistral_material.yaml
@@ -0,0 +1,30 @@
+### model
+model_name_or_path: mistralai/Mistral-7B-Instruct-v0.3
+new_special_tokens: ,,,,,,,,
+graph_decoder_path: saves/graph_decoder
+graph_encoder_path: saves/graph_encoder
+graph_predictor_path: saves/graph_predictor
+adapter_name_or_path: saves/Mistral-7B-Instruct-v0.3-Adapter
+graph_lm_connector_path: saves/Mistral-7B-Instruct-v0.3-Adapter/connector
+### generation
+max_length: 512
+max_new_tokens: 128
+temperature: 0.6
+top_p: 0.9
+### method
+finetuning_type: lora
+do_train: false
+flash_attn: disabled
+learned_query_size: 8
+### dataset
+dataset: molqa_material_examples
+template: mistral
+cutoff_len: 128
+overwrite_cache: true
+preprocessing_num_workers: 16
+output_dir: null
+bf16: true
+pure_bf16: true
+### eval
+per_device_eval_batch_size: 6
+report_to: 'none'
\ No newline at end of file
diff --git a/config/generate/qwen_drug.yaml b/config/generate/qwen_drug.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b9babecd750cb21f5167c56e47e9f9290c3f6f05
--- /dev/null
+++ b/config/generate/qwen_drug.yaml
@@ -0,0 +1,30 @@
+### model
+model_name_or_path: Qwen/Qwen2-7B-Instruct
+new_special_tokens: ,,,,,,,,
+graph_decoder_path: saves/graph_decoder
+graph_encoder_path: saves/graph_encoder
+graph_predictor_path: saves/graph_predictor
+adapter_name_or_path: saves/Qwen2-7B-Instruct-Adapter
+graph_lm_connector_path: saves/Qwen2-7B-Instruct-Adapter/connector
+### generation
+max_length: 512
+max_new_tokens: 128
+temperature: 0.6
+top_p: 0.9
+### method
+finetuning_type: lora
+do_train: false
+flash_attn: disabled
+learned_query_size: 8
+### dataset
+dataset: molqa_drug_examples
+template: qwen
+cutoff_len: 128
+overwrite_cache: true
+preprocessing_num_workers: 16
+output_dir: null
+bf16: true
+pure_bf16: true
+### eval
+per_device_eval_batch_size: 6
+report_to: 'none'
\ No newline at end of file
diff --git a/config/generate/qwen_material.yaml b/config/generate/qwen_material.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..71ea710bd40c5768d2be0871fb59107e881dd37c
--- /dev/null
+++ b/config/generate/qwen_material.yaml
@@ -0,0 +1,30 @@
+### model
+model_name_or_path: Qwen/Qwen2-7B-Instruct
+new_special_tokens: ,,,,,,,,
+graph_decoder_path: saves/graph_decoder
+graph_encoder_path: saves/graph_encoder
+graph_predictor_path: saves/graph_predictor
+adapter_name_or_path: saves/Qwen2-7B-Instruct-Adapter
+graph_lm_connector_path: saves/Qwen2-7B-Instruct-Adapter/connector
+### generation
+max_length: 512
+max_new_tokens: 128
+temperature: 0.6
+top_p: 0.9
+### method
+finetuning_type: lora
+do_train: false
+flash_attn: disabled
+learned_query_size: 8
+### dataset
+dataset: molqa_material_examples
+template: qwen
+cutoff_len: 128
+overwrite_cache: true
+preprocessing_num_workers: 16
+output_dir: null
+bf16: true
+pure_bf16: true
+### eval
+per_device_eval_batch_size: 6
+report_to: 'none'
\ No newline at end of file
diff --git a/config/train/llama_lora.yaml b/config/train/llama_lora.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..17543117e02510c1190a3e3d797f0ccd447e1919
--- /dev/null
+++ b/config/train/llama_lora.yaml
@@ -0,0 +1,49 @@
+### model
+model_name_or_path: meta-llama/Meta-Llama-3.1-8B-Instruct
+new_special_tokens: ,,,,,,,,
+graph_decoder_path: saves/graph_decoder
+graph_encoder_path: saves/graph_encoder
+graph_predictor_path: saves/graph_predictor
+graph_lm_connector_path: null #specify when resume
+### method
+stage: mmsft
+do_train: true
+finetuning_type: lora
+lora_target: all
+flash_attn: disabled
+learned_query_size: 8
+### dataset
+dataset: molqa_train_examples
+template: llama3
+cutoff_len: 2048
+overwrite_cache: true
+preprocessing_num_workers: 16
+### output
+output_dir: saves/Llama-3.1-8B-Instruct-Adapter
+logging_steps: 10
+save_steps: 2000
+plot_loss: true
+overwrite_output_dir: true
+### to resume
+# overwrite_output_dir: false
+### train
+per_device_train_batch_size: 10
+gradient_accumulation_steps: 2
+learning_rate: 1.0e-4
+num_train_epochs: 4.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+ddp_timeout: 180000000
+bf16: true
+pure_bf16: true
+### train loss
+loss_weight_retro: 1
+loss_weight_design: 1
+loss_weight_lm: 1
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 6
+eval_strategy: steps
+eval_steps: 2000
+### specify if connected to wandb
+report_to: 'none'
\ No newline at end of file
diff --git a/config/train/mistral_lora.yaml b/config/train/mistral_lora.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a2446b9b2ad1c81d493a3d1d18cf087c2154a1c2
--- /dev/null
+++ b/config/train/mistral_lora.yaml
@@ -0,0 +1,48 @@
+### model
+model_name_or_path: mistralai/Mistral-7B-Instruct-v0.3
+new_special_tokens: ,,,,,,,,
+graph_decoder_path: saves/graph_decoder
+graph_encoder_path: saves/graph_encoder
+graph_predictor_path: saves/graph_predictor
+graph_lm_connector_path: null #specify when resume
+### method
+stage: mmsft
+do_train: true
+finetuning_type: lora
+lora_target: all
+flash_attn: disabled
+learned_query_size: 8
+### dataset
+dataset: molqa_train_examples
+template: mistral
+cutoff_len: 2048
+overwrite_cache: true
+preprocessing_num_workers: 16
+### output
+output_dir: saves/Mistral-7B-Instruct-v0.3-Adapter
+logging_steps: 10
+save_steps: 2000
+plot_loss: true
+overwrite_output_dir: true
+### to resume
+# overwrite_output_dir: false
+### train
+per_device_train_batch_size: 6
+gradient_accumulation_steps: 2
+learning_rate: 1.0e-4
+num_train_epochs: 4.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+ddp_timeout: 180000000
+bf16: true
+pure_bf16: true
+loss_weight_retro: 1
+loss_weight_design: 1
+loss_weight_lm: 1
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 6
+eval_strategy: steps
+eval_steps: 2000
+### specify if connected to wandb
+report_to: 'none'
\ No newline at end of file
diff --git a/config/train/qwen_lora.yaml b/config/train/qwen_lora.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4110f3b8c27b69f7931097bad04f290c2581e7f8
--- /dev/null
+++ b/config/train/qwen_lora.yaml
@@ -0,0 +1,48 @@
+### model
+model_name_or_path: Qwen/Qwen2-7B-Instruct
+new_special_tokens: ,,,,,,,,
+graph_decoder_path: saves/graph_decoder
+graph_encoder_path: saves/graph_encoder
+graph_predictor_path: saves/graph_predictor
+graph_lm_connector_path: null #specify when resume
+### method
+stage: mmsft
+do_train: true
+finetuning_type: lora
+lora_target: all
+flash_attn: disabled
+learned_query_size: 8
+### dataset
+dataset: molqa_train_examples
+template: qwen
+cutoff_len: 2048
+overwrite_cache: true
+preprocessing_num_workers: 16
+### output
+output_dir: saves/Qwen2-7B-Instruct-Adapter
+logging_steps: 10
+save_steps: 2000
+plot_loss: true
+overwrite_output_dir: true
+### to resume
+# overwrite_output_dir: false
+### train
+per_device_train_batch_size: 6
+gradient_accumulation_steps: 2
+learning_rate: 1.0e-4
+num_train_epochs: 4.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+ddp_timeout: 180000000
+bf16: true
+pure_bf16: true
+loss_weight_retro: 1
+loss_weight_design: 1
+loss_weight_lm: 1
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 6
+eval_strategy: steps
+eval_steps: 2000
+### specify if connected to wandb
+report_to: 'none'
\ No newline at end of file
diff --git a/data/dataset_info.json b/data/dataset_info.json
new file mode 100644
index 0000000000000000000000000000000000000000..65736361cfb62706c4e026bacba8a2e9ab135805
--- /dev/null
+++ b/data/dataset_info.json
@@ -0,0 +1,20 @@
+{
+ "molqa_train_examples": {
+ "file_name": "molqa_train_examples.json"
+ },
+ "molqa_material_examples": {
+ "file_name": "molqa_material_examples.json"
+ },
+ "molqa_drug_examples": {
+ "file_name": "molqa_drug_examples.json"
+ },
+ "molqa_train": {
+ "file_name": "molqa_train.json"
+ },
+ "molqa_material": {
+ "file_name": "molqa_material.json"
+ },
+ "molqa_drug": {
+ "file_name": "molqa_drug.json"
+ }
+}
\ No newline at end of file
diff --git a/data/molqa_drug_examples.json b/data/molqa_drug_examples.json
new file mode 100644
index 0000000000000000000000000000000000000000..5a5e37dfa0077ef4a349aa7ee70fd4e0d19fa07e
--- /dev/null
+++ b/data/molqa_drug_examples.json
@@ -0,0 +1,75 @@
+[
+ {
+ "instruction": "Can you design and synthesize a molecule that inhibits both Human Immunodeficiency Virus and Beta-Secretase 1, with a molecular weight of around 335 and two aromatic rings connected to aliphatic chains?",
+ "input": "",
+ "output": "To satisfy the requirements: The molecule has a moderately complex structure, consisting of aromatic rings connected to aliphatic chains, and featuring carboxyl and hydroxyl functional groups. Therefore, the designed molecule is: CC(=O)OCCC(C)Oc1ccc(Cl)cc1Oc1ccccc1. This is step 1 in the retrosynthesis process. To synthesize CC(=O)OCCC(C)Oc1ccc(Cl)cc1Oc1ccccc1, follow these procedures: A solution of (R)-acetic acid 3-(toluene-4-sulfonyloxy)-butyl ester (5.89 g, 21 mmol) and 4-chloro-2-phenoxy-phenol (5.0 g, 23 mmol) in DMF (50 mL) is treated with cesium carbonate (7.4 g, 23 mmol). The solution is heated to 60\u00b0 C. and stirred overnight. The reaction is cooled and quenched with 1N HCl. The solution is partitioned in EtOAc and water. The organic is separated, washed with brine, and dried over sodium sulfate. The organic is filtered, and the solvent is removed to afford acetic acid 3-(4-chloro-2-phenoxy-phenoxy)-butyl ester, which is then diluted in methanol (100 mL) and treated with potassium carbonate (5.68 g, 40 mmol). The reaction is stirred for 2 hours at rt. The reaction is then partitioned in EtOAc and water. The organic layer is separated, washed with brine, and dried over sodium sulfate. The organic is filtered and the solvent is removed to afford the crude product. The crude is purified by silica gel column chromatography using 1/1 hexanes/EtOAc to elute the pure product. The solvent is removed to afford 4.35 g (72%) of the desired product. 1H NMR (400 MHz, CDCl3); MS (ES+) m/z mass calcd for C16H17ClO3 292, found 293 (M+1, 100%). The applied reaction is: CC(=O)OCCC(C)Oc1ccc(Cl)cc1Oc1ccccc1>>CC(=O)OCCC(C)OS(=O)(=O)c1ccc(C)cc1.O=C(O)O.Oc1ccc(Cl)cc1Oc1ccccc1.[Cs+].[Cs+], which requires the reactants: CC(=O)OCCC(C)OS(=O)(=O)c1ccc(C)cc1 (available), O=C(O)O (available), Oc1ccc(Cl)cc1Oc1ccccc1 (not available, needs to be synthesized), [Cs+] (available), [Cs+] (available). This is step 2 in the retrosynthesis process. To synthesize Oc1ccc(Cl)cc1Oc1ccccc1, follow these procedures: A \u221240\u00b0 C. solution of 4-chloro-2-phenoxy-1-methoxy-benzene (7.16 g, 30.5 mmol) in dry CH2Cl2 (70 mL) is treated dropwise with borontribromide (22.9 g, 91.5 mmol) and then warmed to 0\u00b0 C. and stirred for 3 h under N2. The reaction is diluted with Et2O and quenched with water. The organic layer is dried (Na2SO4), and the solvent is removed in vacuo to afford 7.11 g (100%) of the title compound. Rf=0.30 (4/1 hexanes/acetone). 1H NMR (400 MHz, CDCl3); MS (ES\u2212) m/z mass calcd for C12H9O2Cl 220, found 219 (M\u22121, 100%). The applied reaction is: Oc1ccc(Cl)cc1Oc1ccccc1>>COc1ccc(Cl)cc1Oc1ccccc1, which requires the reactants: COc1ccc(Cl)cc1Oc1ccccc1 (not available, needs to be synthesized). This is step 3 in the retrosynthesis process. To synthesize COc1ccc(Cl)cc1Oc1ccccc1, follow these procedures: A mixture of 2-bromo-4-chloro-1-methoxy-benzene (8.0 g, 36.1 mmol), phenol (6.80 g, 72.2 mmol), cesium carbonate (23.54 g, 72.2 mmol), copper (I) chloride (1.79 g, 18.1 mmol) and 2,2,6,6-tetramethyl-3,5-heptanedione (1.66 g, 9.00 mmol) in dry 1-methyl-2-pyrrolidinone (80 mL) is heated to 120\u00b0 C. for 20 hours under N2. The reaction is cooled, filtered and the filtrate quenched with 1 N HCl (50 mL). The filtrate is diluted with Et2O and extracted with water. 
The organic layer is dried (Na2SO4), and the solvent is removed in vacuo to afford crude product that is absorbed on silica gel and purified by flash chromatography using 9/1 hexanes/EtOAc to afford 7.42 g (88%) of the title compound. Rf=0.37 (4/1 hexanes/EtOAc). The applied reaction is: COc1ccc(Cl)cc1Oc1ccccc1>>CC(C)(C)C(=O)CC(=O)C(C)(C)C.COc1ccc(Cl)cc1Br.O=C(O)O.[Cs+].[Cs+].[O-]c1ccccc1, which requires the reactants: CC(C)(C)C(=O)CC(=O)C(C)(C)C (available), COc1ccc(Cl)cc1Br (available), O=C(O)O (available), [Cs+] (available), [Cs+] (available), [O-]c1ccccc1 (available).",
+ "property": {
+ "HIV": 1.0,
+ "BACE": 1.0,
+ "SC": 3.1,
+ "SA": 2.46
+ },
+ "retro": [
+ null,
+ null,
+ null
+ ]
+ },
+ {
+ "instruction": "What is a possible molecular structure that can penetrate the blood-brain barrier and inhibit Human Immunodeficiency Virus, given a molecular weight of 209.249 and specific ring and bond characteristics?",
+ "input": "",
+ "output": "To satisfy the requirements: The molecule has an aromatic ring and a common functional group of carboxyl, which suggests a certain level of structural complexity, likely influencing its physical and chemical properties, and indicating a moderate synthetic complexity score. Therefore, the designed molecule is: CC(C)(C)OC(=O)Nc1ccc(N)cn1. This is step 1 in the retrosynthesis process. To synthesize CC(C)(C)OC(=O)Nc1ccc(N)cn1, follow these procedures: To a suspension of (tert-butoxy)-N-(5-nitro(2-pyridyl))carboxamide (0.27 g, 1.13 mmol) in methanol (2 mL), ethyl acetate (4 mL) and TEA (0.16 mL) was added 10% Pd/C (60 mg, 0.056 mmol) under argon. The reaction mixture was hydrogenated under 1 atm H2 for 20 hr, filtered through Celite and concentrated in vacuo to give N-(5-amino(2-pyridyl))(tert-butoxy)carboxamide (0.226 g, 97%). 1H-NMR (DMSO-d6): \u03b4 1.40 (s, 9H), 4.92 (br s, 2H), 6.89-6.91 (dd, 1H), 7.35-7.37 (d, 1H), 7.58 (d, 1H), 9.06 (s, 1H). The applied reaction is: CC(C)(C)OC(=O)Nc1ccc(N)cn1>>CC(C)(C)OC(=O)Nc1ccc([N+](=O)[O-])cn1 with the template [#7;a:4]:[c:3]:[c:2]-[NH2;D1;+0:1]>>O=[N+;H0;D3:1](-[O-])-[c:2]:[c:3]:[#7;a:4], which requires the reactants: CC(C)(C)OC(=O)Nc1ccc([N+](=O)[O-])cn1 (available).",
+ "property": {
+ "BBBP": 1.0,
+ "HIV": 1.0,
+ "SC": 2.03,
+ "SA": 2.08
+ },
+ "retro": [
+ 2965
+ ]
+ },
+ {
+ "instruction": "What molecular structure can be designed to exhibit Blood-Brain Barrier Penetration and Human Immunodeficiency Virus Inhibition while featuring a specific combination of molecular weight, ring count, and functional groups?",
+ "input": "",
+ "output": "To satisfy the requirements: The molecule features a complex structure with two aromatic rings and a high number of rotatable bonds, indicating potential difficulties in synthesis. Its chemical properties are influenced by the presence of functional groups such as carboxyl and amino groups. The predicted properties suggest high substrate specificity for CYP2C9 and moderate substrate specificity for CYP3A4, while also indicating a low risk of carcinogenicity and high clearance in microsomes. The molecule's hydration free energy and skin reaction predictions are also noteworthy. Overall, the molecule's unique combination of structural features and predicted chemical properties makes it an interesting candidate for further investigation. Therefore, the designed molecule is: CC(=O)c1sc(NS(=O)(=O)c2cc(Br)c(Cl)s2)nc1C. This is step 1 in the retrosynthesis process. To synthesize CC(=O)c1sc(NS(=O)(=O)c2cc(Br)c(Cl)s2)nc1C, follow these procedures: The title compound was prepared from 5-acety1-2-amino-4-methylthiazole (42 mg) and 4-bromo-5-chlorothiophene-2-sulfonyl chloride (80 mg) as described in the synthetic METHOD B to give a white solid (11.7 mg) with purity >90%: MS (pos) m/z 415.3, 417.3. The applied reaction is: CC(=O)c1sc(NS(=O)(=O)c2cc(Br)c(Cl)s2)nc1C>>CC(=O)c1sc(N)nc1C.O=S(=O)(Cl)c1cc(Br)c(Cl)s1 with the template [#16;a:5]:[c:4]-[S;H0;D4;+0:1](=[O;D1;H0:2])(=[O;D1;H0:3])-[NH;D2;+0:9]-[c:7](:[#16;a:6]):[#7;a:8]>>Cl-[S;H0;D4;+0:1](=[O;D1;H0:2])(=[O;D1;H0:3])-[c:4]:[#16;a:5].[#16;a:6]:[c:7](:[#7;a:8])-[NH2;D1;+0:9], which requires the reactants: CC(=O)c1sc(N)nc1C (available), O=S(=O)(Cl)c1cc(Br)c(Cl)s1 (available).",
+ "property": {
+ "BBBP": 1.0,
+ "HIV": 1.0,
+ "SC": 2.83,
+ "SA": 2.74
+ },
+ "retro": [
+ 145329
+ ]
+ },
+ {
+ "instruction": "Can you design a molecule that penetrates the blood-brain barrier, inhibits Human Immunodeficiency Virus, and doesn't inhibit Beta-Secretase 1, while having a specific set of structural properties?",
+ "input": "",
+ "output": "To satisfy the requirements: The molecule exhibits a moderate level of structural complexity, featuring three aromatic rings and an aliphatic chain, with a molecular weight of 216.27. Its synthetic complexity score is 2.88, indicating it may be moderately challenging to synthesize. This structure is predicted to have a relatively low likelihood of mutagenicity, with an AMES score of 0.608. It also shows moderate to low likelihood of blood-brain barrier penetration, with a BBB_Martins score of 0.576. The molecule's oral bioavailability is predicted to be moderate, with a Bioavailability_Ma score of 0.797. Additionally, it is predicted to have a low to moderate interaction with various cytochrome P450 enzymes. Overall, the molecule's drug utility is influenced by its structural features, bioactivity, and pharmacokinetic properties, which suggest it may have potential as a therapeutic agent. Therefore, the designed molecule is: c1csc(Nc2nccn3ccnc23)c1. This is step 1 in the retrosynthesis process. To synthesize c1csc(Nc2nccn3ccnc23)c1, follow these procedures: To a solution of (3-bromo-imidazo[1,2-a]pyrazin-8-yl)-methyl-amine (50 mg, 0.2 mmol) in tetrahydrofuran (2 ml) under nitrogen was added 2-thiopheneboronic acid (41 mg, 0.3 mmol), K2CO3 (1.1 ml of a 1 M solution in water) and 16 mg (0.1 mmol) of PdCl2(Dppf)CH2Cl2 (16 mg, 0.1 eq). The mixture was heated at 70\u00b0 C. in a sealed tube overnight. The product was precipitated by adding methanol. The filtrate was evaporated and purified by preparative thin layer chromatography on silica gel to give additional title compound for a total of 45 mg (100% yield). The applied reaction is: c1csc(Nc2nccn3ccnc23)c1>>CNc1nccn2c(Br)cnc12.O=C(O)O.OB(O)c1cccs1.[K+].[K+] with the template [#7;a:4]:[c:3]:[cH;D2;+0:1]:[#7;a:2].[#7:5]-[c;H0;D3;+0:6]1:[cH;D2;+0:9]:[c:8]:[cH;D2;+0:7]:[s;H0;D2;+0:10]:1>>Br-[c;H0;D3;+0:1](:[#7;a:2]):[c:3]:[#7;a:4].[#7:5]-[CH3;D1;+0:6].O-B(-O)-[c;H0;D3;+0:7]1:[c:8]:[cH;D2;+0:9]:c:[s;H0;D2;+0:10]:1, which requires the reactants: CNc1nccn2c(Br)cnc12 (available), O=C(O)O (available), OB(O)c1cccs1 (available), [K+] (available), [K+] (available).",
+ "property": {
+ "BBBP": 1.0,
+ "HIV": 1.0,
+ "BACE": 0.0,
+ "SC": 2.88,
+ "SA": 2.75
+ },
+ "retro": [
+ 155241
+ ]
+ },
+ {
+ "instruction": "What molecule can be designed and synthesized to penetrate the blood-brain barrier and inhibit Human Immunodeficiency Virus, given the constraints of a molecular weight around 209.245 and a single aromatic ring?",
+ "input": "",
+ "output": "To satisfy the requirements: The molecule featuring an aromatic ring and an aliphatic chain, with functional groups including hydroxyl and carboxyl, exhibits moderate synthetic complexity and moderate lipophilicity, which may influence its physical and chemical properties. Therefore, the designed molecule is: CC(C)(C)OC(=O)Nc1ccccc1O. This is step 1 in the retrosynthesis process. To synthesize CC(C)(C)OC(=O)Nc1ccccc1O, follow these procedures: To a solution of 2-aminophenol (2.84 g) in dichloromethane (120 ml) was added di-tert-butyl dicarbonate (6.55 g). The mixture was stirred at room temperature for 18 hours. The mixture was partitioned between water and dichloromethane. The organic extracts were dried (MgSO4) and evaporated to give a solid which was purified by column chromatography on silica eluting with a mixture of ethyl acetate and hexane (20:80) to give 2-tert-butyloxycarbonylaminophenol (1.80 g);. The applied reaction is: CC(C)(C)OC(=O)Nc1ccccc1O>>CC(C)(C)OC(=O)OC(=O)OC(C)(C)C.Nc1ccccc1[O-] with the template [C;D1;H3:1]-[C:2](-[C;D1;H3:3])(-[C;D1;H3:4])-[#8:5]-[C;H0;D3;+0:6](=[O;D1;H0:7])-[NH;D2;+0:8]-[c:9]>>[C;D1;H3:1]-[C:2](-[C;D1;H3:3])(-[C;D1;H3:4])-[#8:5]-[C;H0;D3;+0:6](=[O;D1;H0:7])-O-[C;H0;D3;+0:6](=[O;D1;H0:7])-[#8:5]-[C:2](-[C;D1;H3:1])(-[C;D1;H3:3])-[C;D1;H3:4].[NH2;D1;+0:8]-[c:9], which requires the reactants: CC(C)(C)OC(=O)OC(=O)OC(C)(C)C (available), Nc1ccccc1[O-] (available).",
+ "property": {
+ "BBBP": 1.0,
+ "HIV": 1.0,
+ "SC": 1.7,
+ "SA": 1.73
+ },
+ "retro": [
+ 38065
+ ]
+ }
+]
\ No newline at end of file
diff --git a/data/molqa_material_examples.json b/data/molqa_material_examples.json
new file mode 100644
index 0000000000000000000000000000000000000000..9f3cfcb8802b1be01035924bcdd61bcd4640b303
--- /dev/null
+++ b/data/molqa_material_examples.json
@@ -0,0 +1,85 @@
+[
+ {
+ "instruction": "What is the optimal molecular design and synthesis route for a polymer with high CO2 gas permeability and low permeability to N2 and O2, featuring an aromatic ring and specific functional groups?",
+ "input": "",
+ "output": "To satisfy the requirements: The polymer, featuring an aromatic ring and functional groups such as carboxyl and hydroxyl, exhibits a moderate synthetic complexity score and high gas permeability, indicating potential for use in gas separation membranes with good efficiency. Therefore, the designed molecule is: *c1cc(C(=O)OCC)c(*)[nH]1. This is step 1 in the retrosynthesis process. To synthesize CCOC(=O)c1cc[nH]c1, follow these procedures: Ethanol (3.2 mL, 54 mmol) and 4-dimethylamino pyridine (DMAP) (54 mg, 0.45 mmol) were added to a mixture of pyrrole-3-carboxylic acid (500 mg, 4.5 mmol) and dicyclohexylcarbodiimide (1.11 g, 5.4 mmol) (Aldrich, Milwaukee, Wis.) in tetrahydrofuran (THF) (15 mL). After heating at 60\u00b0 C. for 10 hours, the reaction was cooled. The precipitate was filtered off, washed with ethyl acetate, the combined filtrate was concentrated and purified on a silica gel column to give 500 mg (81%) of 1H-pyrrole-3-carboxylic acid ethyl ester as a colorless oil. The applied reaction is: CCOC(=O)c1cc[nH]c1>>C(=NC1CCCCC1)=NC1CCCCC1.CCO.O=C(O)c1cc[nH]c1 with the template [#7;a:5]:[c:4]:[c:3]-[C;H0;D3;+0:1](=[O;D1;H0:2])-[O;H0;D2;+0:7]-[C:6]>>O-[C;H0;D3;+0:1](=[O;D1;H0:2])-[c:3]:[c:4]:[#7;a:5].[C:6]-[OH;D1;+0:7], which requires the reactants: C(=NC1CCCCC1)=NC1CCCCC1 (available), CCO (available), O=C(O)c1cc[nH]c1 (available).",
+ "property": {
+ "CO2": 0.94,
+ "N2": 0.0,
+ "O2": 0.0,
+ "FFV": 0.381,
+ "SC": 2.28,
+ "SA": 4.21
+ },
+ "retro": [
+ 36155
+ ]
+ },
+ {
+ "instruction": "Can a polymer with a molecular weight of 184.22, one aromatic ring, and specific gas permeability, thermal conductivity, and electrical properties be designed and synthesized for gas separation applications?",
+ "input": "",
+ "output": "To satisfy the requirements: The polymer has an aromatic ring incorporated within its structure, which can contribute to its overall rigidity and mechanical strength. The presence of the sulfonamide group can also enhance the polymer's chemical stability and resistance to degradation. The predicted thermal properties, such as melting point and thermal conductivity, suggest that the polymer may exhibit suitable thermal behavior for various applications. Additionally, the predicted dielectric constant and electrical conductivity indicate that the polymer may possess suitable electrical properties. The gas permeability coefficient and gas diffusion coefficient suggest that the polymer may be suitable for gas separation applications. Overall, the polymer's material utility seems promising for creating polymeric materials with desirable properties. Therefore, the designed molecule is: *Nc1cc(*)ccc1S(=O)(=O)NC. This is step 1 in the retrosynthesis process. To synthesize CNS(=O)(=O)c1ccccc1N, follow these procedures: N-Methyl-2-nitrobenzenesulfonamide (1.0 g, 4.6 mmol), 10% palladium on carbon (100 mg) and absolute ethanol (30 mL) were placed in a Parr bottle, the bottle placed on a Parr apparatus under a hydrogen atmosphere (45 psi), and shaken for 1 hour, The Parr bottle was evacuated and flushed with nitrogen, then the mixture was filtered through a disposable syringe filter. The filtrate was concentrated in vacuo to yield 0.81 g of 2-amino-N-methylbenzenesulfonamide. The applied reaction is: CNS(=O)(=O)c1ccccc1N>>CNS(=O)(=O)c1ccccc1[N+](=O)[O-] with the template [NH2;D1;+0:1]-[c:2]>>O=[N+;H0;D3:1](-[O-])-[c:2], which requires the reactants: CNS(=O)(=O)c1ccccc1[N+](=O)[O-] (available).",
+ "property": {
+ "CO2": 1.746,
+ "O2": 1.247,
+ "FFV": 0.37,
+ "TC": 0.192,
+ "SC": 2.8,
+ "SA": 4.17
+ },
+ "retro": [
+ 3
+ ]
+ },
+ {
+ "instruction": "Can a molecule be designed and synthesized to exhibit high CO2 permeability, moderate fractional free volume, and specific thermal conductivity, while incorporating aromatic and carboxyl functional groups?",
+ "input": "",
+ "output": "To satisfy the requirements: The polymer, with its aromatic ring and carboxyl functional group, exhibits a moderate synthetic complexity score, suggesting a relatively accessible molecular structure for synthesis. Its molecular structure and functional groups are expected to impart specific physical and chemical properties, influencing its material utility for polymeric materials. Therefore, the designed molecule is: *CC(*)OC(=O)c1cc(C)cc(Cl)c1. This is step 1 in the retrosynthesis process. To synthesize CCOC(=O)c1cc(C)cc(Cl)c1, follow these procedures: The title compound was synthesized from 3-chloro-5-methyl-benzoic acid under the same conditions as for Compound b1. The applied reaction is: CCOC(=O)c1cc(C)cc(Cl)c1>>CCOC(=O)c1ccc(Br)c(C(F)(F)F)c1.Cc1cc(Cl)cc(C(=O)O)c1, which requires the reactants: CCOC(=O)c1ccc(Br)c(C(F)(F)F)c1 (not available, needs to be synthesized), Cc1cc(Cl)cc(C(=O)O)c1 (available). This is step 2 in the retrosynthesis process. To synthesize CCOC(=O)c1ccc(Br)c(C(F)(F)F)c1, follow these procedures: Potassium carbonate (1.5 g, 11.2 mmol) and ethyl iodide (1.2 g, 7.4 mmol) were added to a solution of 4-bromo-3-trifluoromethyl-benzoic acid (1.0 g, 3.7 mmol) in DMF (5 ml), and the mixture was stirred at room temperature for 24 hours. The reaction mixture was diluted with ethyl acetate, and the organic layer was washed with water and saturated-saline, and dried over anhydrous sodium sulfate. The drying agent was removed by filtration. After concentration under reduced pressure, the resulting residue was purified by silica gel column chromatography (ethyl acetate/hexane) to yield the title compound (1.03 g, 94%) as a brown oily substance. The applied reaction is: CCOC(=O)c1ccc(Br)c(C(F)(F)F)c1>>CCI.O=C(O)c1ccc(Br)c(C(F)(F)F)c1.O=C([O-])[O-].[K+].[K+], which requires the reactants: CCI (available), O=C(O)c1ccc(Br)c(C(F)(F)F)c1 (available), O=C([O-])[O-] (available), [K+] (available), [K+] (available).",
+ "property": {
+ "CO2": 0.94,
+ "N2": 0.0,
+ "O2": 0.0,
+ "FFV": 0.375,
+ "TC": 0.223,
+ "SC": 2.58,
+ "SA": 4.01
+ },
+ "retro": [
+ null,
+ null
+ ]
+ },
+ {
+ "instruction": "What molecular structure can be designed and synthesized to achieve CO2 permeability of 1.743 Barrer, N2 impermeability, and specific thermal conductivity and fractional free volume properties, while incorporating an aromatic ring and limited rotatable bonds?",
+ "input": "",
+ "output": "To satisfy the requirements: The polymer is a complex molecular structure incorporating an aromatic ring and a bromine functional group, which may provide enhanced thermal and mechanical properties, potentially useful for gas separation applications. Therefore, the designed molecule is: *c1ccc(*)c(CBr)c1. This is step 1 in the retrosynthesis process. To synthesize BrCc1ccccc1, follow these procedures: 4.65 g (10 mM) of compound 1a were treated, while stirring, with 40 ml of 2 N HBr in glacial acetic acid for 45 min. at 20\u00b0 in the absence of moisture. The amino acid derivative dissolved with CO2 evolution. The reaction solution was added dropwise with vigorous stirring to 250 ml of absolute ether which resulted in the precipitation of 2HBr.H-Arg-pNA. The ethereal phase was sucked off, whereupon the solid phase was washed 4 times with portions of 100 ml of abs. ether in order to substantially remove benzyl bromide which had formed as a by-product as well as excess HBr and AcOH. The residue was dissolved in 50 ml of MeOH, the pH was adjusted to 4.5 by the addition of Et3N, and the solution was concentrated to dryness in vacuo at 30\u00b0. The resulting product was dissolved in 75 ml of MeOH and passed through a column of \"Sephadex\" LH-20 (cross-linked dextran gel) equilibrated with MeOH. From a fraction of the eluate there were obtained 4.18 g (91.6% of the theory) of amorphous compound 1b which was homogeneous in the SS as shown by TLC. Elementary analysis and calculation from the empirical formula C12H20N6O3Br2 gave the following values: C=31.15% (31.60%), H=4.35% (4.42%), N=18.84% (18.43%) and Br=34.81% (35.03%). The applied reaction is: BrCc1ccccc1>>CC(C)(C#N)/N=N/C(C)(C)C#N.Cc1ccc(S(=O)(=O)Cl)cc1.O=C1CCC(=O)N1Br, which requires the reactants: CC(C)(C#N)/N=N/C(C)(C)C#N (available), Cc1ccc(S(=O)(=O)Cl)cc1 (available), O=C1CCC(=O)N1Br (available).",
+ "property": {
+ "CO2": 1.743,
+ "N2": 0.0,
+ "O2": 4.692,
+ "FFV": 0.411,
+ "TC": 0.256,
+ "SC": 2.25,
+ "SA": 5.14
+ },
+ "retro": [
+ null
+ ]
+ },
+ {
+ "instruction": "Can you design a molecule with moderate synthetic complexity, featuring a carboxyl functional group and an aromatic ring, that exhibits CO2 gas permeability of 9.176 Barrer, O2 gas permeability of 1.494 Barrer, fractional free volume of 0.374, and thermal conductivity of 0.228 W m^{-1} K^{-1}?",
+ "input": "",
+ "output": "To satisfy the requirements: The polymer exhibits a moderate synthetic complexity score, indicating that its synthesis is relatively feasible. The presence of an aromatic ring and a carboxyl functional group in its scaffold contributes to its moderate complexity. Therefore, the designed molecule is: *OC(=O)c1ccc(*)s1. This is step 1 in the retrosynthesis process. To synthesize O=C([O-])c1cccs1, follow these procedures: To a 15-mL polypropylene centrifuge tube was added 3.86 mL of 50 mM potassium phosphate buffer (pH 7.0), 1.0 mL of a suspension of 22.1 mg dry cell weight E. coli SW132 wet cells (prepared as described in Example 10) in 50 mM potassium phosphate buffer (pH 7.0), and 0.1691 g of 2-thiophenecarbonitrile. The final concentration of 2-thiophencarbonitrile was 0.307 M. The reaction mixture was mixed on a rotating platform at 27\u00b0 C. After 30 min, 7.50 mL of 95:5 acetonitrile/water containing 0.30 M N,N-dimethylbenzamide (HPLC external standard) was added to the reaction, the resulting mixture centrifuged, and a 0.100 mL of the supernatant mixed with 0.900 mL of acetonitrile and analyzed by HPLC. The conversion of 2-thiophenecarbonitrile was 99.5%, and the yields of 2-thiophenecarboxamide and 2-thiophenecarboxylic acid were 98% and 0%, respectively. The applied reaction is: O=C([O-])c1cccs1>>CN(C)C(=O)c1ccccc1.N#Cc1cccs1.O=P([O-])([O-])O.[2H]C([2H])([2H])C#N.[K+].[K+].[K+].[OH-], which requires the reactants: CN(C)C(=O)c1ccccc1 (available), N#Cc1cccs1 (available), O=P([O-])([O-])O (available), [2H]C([2H])([2H])C#N (available), [K+] (available), [K+] (available), [K+] (available), [OH-] (available).",
+ "property": {
+ "CO2": 9.176,
+ "O2": 1.494,
+ "FFV": 0.374,
+ "TC": 0.228,
+ "SC": 2.23,
+ "SA": 4.78
+ },
+ "retro": [
+ null
+ ]
+ }
+]
\ No newline at end of file
diff --git a/data/molqa_train_examples.json b/data/molqa_train_examples.json
new file mode 100644
index 0000000000000000000000000000000000000000..ff85e69a1791a0553a7ea45bbeaaeeaab6d5717a
--- /dev/null
+++ b/data/molqa_train_examples.json
@@ -0,0 +1,62 @@
+[
+ {
+ "instruction": "What is a feasible molecular design that meets the criteria of a molecular weight of 243.306, two rings including one aromatic and one aliphatic, two rotatable bonds, and two hydrogen bond acceptors, with a balanced hydrophobic and hydrophilic profile?",
+ "input": "",
+ "output": "To satisfy the requirements: The molecule has a scaffold consisting of a central aromatic ring linked to an amino-functionalized aliphatic chain, featuring a carboxyl group. Its structural complexity is moderate, with a balance of hydrophobic and hydrophilic regions, which could influence its physical and chemical properties. Therefore, the designed molecule is: C#CCC1CCN(C(=O)Oc2ccccc2)CC1. This is step 1 in the retrosynthesis process. To synthesize C#CCC1CCN(C(=O)Oc2ccccc2)CC1, follow these procedures: Phenyl chloroformate (6.2 g, 40.2 mmol) was added to a solution of tert-butyl 4-(prop-2-ynyl)piperidine-1-carboxylate (1.65 g, 13.4 mmol) according to general procedure 1. Yield=0.600 g, 34%. m/z MH+=244.08. HPLC rt=10.3 min. The applied reaction is: C#CCC1CCN(C(=O)Oc2ccccc2)CC1>>C#CCC1CCN(C(=O)OC(C)(C)C)CC1.O=C(Cl)Oc1ccccc1 with the template [#8:5]-[C;H0;D3;+0:4](=[O;D1;H0:6])-[N;H0;D3;+0:1](-[C:2])-[C:3]>>C-C(-C)(-C)-O-C(=O)-[N;H0;D3;+0:1](-[C:2])-[C:3].Cl-[C;H0;D3;+0:4](-[#8:5])=[O;D1;H0:6], which requires the reactants: C#CCC1CCN(C(=O)OC(C)(C)C)CC1 (available), O=C(Cl)Oc1ccccc1 (available).",
+ "property": {
+ "SC": 2.42,
+ "SA": 2.22
+ },
+ "retro": [
+ 180575
+ ]
+ },
+ {
+ "instruction": "Can you design and synthesize a molecule with a molecular weight of around 206, containing 2 rings including 1 aromatic and 1 aliphatic, and 3 rotatable bonds, with no hydrogen bond donors and 2 acceptors?",
+ "input": "",
+ "output": "To satisfy the requirements: The molecule has a relatively low synthetic complexity score and a moderate synthetic accessibility score, indicating that it is relatively easy to synthesize. Its molecular structure is composed of an aromatic ring and an aliphatic chain, which may influence its physical and chemical properties. Therefore, the designed molecule is: Cc1ccc(C)n1CCN1CCCC1=O. This is step 1 in the retrosynthesis process. To synthesize Cc1ccc(C)n1CCN1CCCC1=O, follow these procedures: 5.7 g (0.05 mmol) of acetonylacetone and 6.4 g (0.05 mmol) of 1-(2-aminoethyl)-2-pyrrolidinone are heated under reflux in 70 ml of methanol for 2 h. The reaction mixture is then concentrated and, after the concentration, the crude reaction product is then precipitated by addition of petroleum ether. The crude product is recrystallized from diethyl ether. Yield: 3.2 g (31% of theory), Melting point: 66\u00b0-68\u00b0 C. Elemental analysis: C12H18N2O (206.29) calculated: C 69.9 H 8.8 N 13.6 0 7.8 found: C 69.4 H 8.7 N 13.7 0 8.11. The applied reaction is: Cc1ccc(C)n1CCN1CCCC1=O>>CC(=O)CCC(C)=O.NCCN1CCCC1=O with the template [C:7]-[n;H0;D3;+0:8]1:[c;H0;D3;+0:1](-[C;D1;H3:2]):[cH;D2;+0:3]:[cH;D2;+0:4]:[c;H0;D3;+0:5]:1-[C;D1;H3:6]>>O=[C;H0;D3;+0:1](-[C;D1;H3:2])-[CH2;D2;+0:3]-[CH2;D2;+0:4]-[C;H0;D3;+0:5](=O)-[C;D1;H3:6].[C:7]-[NH2;D1;+0:8], which requires the reactants: CC(=O)CCC(C)=O (available), NCCN1CCCC1=O (available).",
+ "property": {
+ "SC": 2.69,
+ "SA": 2.34
+ },
+ "retro": [
+ 18636
+ ]
+ },
+ {
+ "instruction": "Can you design and synthesize a molecule with a molecular weight around 377, incorporating four rings including two aromatic and two aliphatic rings, and six rotatable bonds, with no hydrogen bond donors and four acceptors?",
+ "input": "",
+ "output": "To satisfy the requirements: The molecule has a moderately complex structure built around a core scaffold that incorporates aromatic rings and aliphatic chains, with the presence of functional groups and a moderate number of rotatable bonds, contributing to its moderate synthetic complexity score. Therefore, the designed molecule is: Cc1ccccc1N1CCN(CCCCN2C(=O)c3ccccc3C2=O)CC1. This is step 1 in the retrosynthesis process. To synthesize Cc1ccccc1N1CCN(CCCCN2C(=O)c3ccccc3C2=O)CC1, follow these procedures: The title compound was synthesized applying the general procedure III step A using 1-(2-methylphenyl)piperazine (1.99 mmol, 350 mg), N-(4-bromobutyl)phthalimide (1.99 mmol, 560 mg), and K2CO3 (4.97 mmol, 686 mg) in 6 mL of acetonitrile. White solid 750 mg (99%). 1H NMR (400 MHz, Chloroform-d) \u03b4 7.92-7.79 (m, 2H), 7.72 (dt, J=5.5, 3.1 Hz, 2H), 7.16 (t, J=7.6 Hz, 2H), 7.08-6.88 (m, 2H), 3.74 (t, J=7.1 Hz, 2H), 2.93 (t, J=4.8 Hz, 4H), 2.60 (s, 4H), 2.52-2.39 (m, 2H), 2.30 (s, 3H), 1.75 (p, J=7.3 Hz, 2H), 1.59 (tt, J=9.7, 5.9 Hz, 2H). The applied reaction is: Cc1ccccc1N1CCN(CCCCN2C(=O)c3ccccc3C2=O)CC1>>Cc1ccccc1[NH+]1CC[NH2+]CC1.O=C(O)O.O=C1c2ccccc2C(=O)N1CCCCBr.[K+].[K+], which requires the reactants: Cc1ccccc1[NH+]1CC[NH2+]CC1 (available), O=C(O)O (available), O=C1c2ccccc2C(=O)N1CCCCBr (available), [K+] (available), [K+] (available).",
+ "property": {
+ "SC": 2.93,
+ "SA": 1.97
+ },
+ "retro": [
+ null
+ ]
+ },
+ {
+ "instruction": "Can you design a molecule with a molecular weight around 242, one aromatic ring, three rotatable bonds, and three hydrogen bond acceptors, and describe its synthesis process?",
+ "input": "",
+ "output": "To satisfy the requirements: The molecule has an aromatic ring and an aliphatic chain, with the presence of carboxyl, amino, and halogen groups, which contribute to its moderate synthetic complexity score, indicating a relatively challenging synthesis process. Therefore, the designed molecule is: COC(=O)c1ccccc1N(C)C(=O)CCl. This is step 1 in the retrosynthesis process. To synthesize COC(=O)c1ccccc1N(C)C(=O)CCl, follow these procedures: 25 Grams of methyl o-methylaminobenzoate was dissolved in 125 ml of acetone, to this solution was added a solution prepared by dissolving 20.92 g of potassium carbonate in 60 ml of water. Under an ice-cooled condition, 17.95 g of chloroacetyl chloride was added dropwise thereto, then the reaction mixture was stired at a room temperature for 1 hour. Acetone was removed by distillation, the residue obtained was extracted with chloroform, then the chloroform layer was washed with water, dried and the solvent was removed by distillation. The residue thus obtained was purified by a silica gel column chromatography to obtain 16.80 g of N-methyl-o-methoxycarbonyl-\u03b1-chloroacetanilide in the form of colorless oily substance. The applied reaction is: COC(=O)c1ccccc1N(C)C(=O)CCl>>CNc1ccccc1C(=O)OC.O=C(Cl)CCl.O=C(O)O.[K+].[K+] with the template [C:2]-[C;H0;D3;+0:1](=[O;D1;H0:3])-[N;H0;D3;+0:5](-[C;D1;H3:4])-[c:6]>>Cl-[C;H0;D3;+0:1](-[C:2])=[O;D1;H0:3].[C;D1;H3:4]-[NH;D2;+0:5]-[c:6], which requires the reactants: CNc1ccccc1C(=O)OC (available), O=C(Cl)CCl (available), O=C(O)O (available), [K+] (available), [K+] (available).",
+ "property": {
+ "SC": 2.13,
+ "SA": 2.07
+ },
+ "retro": [
+ 39003
+ ]
+ },
+ {
+ "instruction": "Can you design a molecule with a molecular weight of 317.186, three aromatic rings, and two rotatable bonds, and outline a synthesis pathway for it?",
+ "input": "",
+ "output": "To satisfy the requirements: The molecule features a complex aromatic scaffold with multiple rings, exhibiting moderate synthetic complexity and potential bioavailability, while also showing possible interactions with nuclear receptors and stress response pathways. Therefore, the designed molecule is: COc1ccccc1-c1cn2cc(Br)cc(C)c2n1. This is step 1 in the retrosynthesis process. To synthesize COc1ccccc1-c1cn2cc(Br)cc(C)c2n1, follow these procedures: A mixture of 2-bromo-2\u2032-methoxyacetophenone (0.500 g, 2.183 mmol) and 2-amino-5-bromo-3-methylpyridine (0.408 g, 2.183 mmol) in ethanol (8 mL) was heated to 80\u00b0 C. for 16 hours in a pressure vessel. A yellow thick precipitate formed. Water was added and the solid was extracted with ethyl acetate (some DCM and MeOH were added to help in the dissolution). The organic extracts were then washed with brine, dried over Na2SO4, filtered and concentrated and the residue was purified by silica gel chromatography (Biotage SNAP 100 g, 10 to 20% ethyl acetate in hexanes) to give 220 (0.375 g, 1.182 mmol, 54.2% yield) as a white solid. LRMS (ESI): calc. 316.02 found 317.1 (MH)+. The applied reaction is: COc1ccccc1-c1cn2cc(Br)cc(C)c2n1>>COc1ccccc1C(=O)CBr.Cc1cc(Br)cnc1N.[OH-] with the template [c:4]:[c;H0;D3;+0:3](:[c:5])-[c;H0;D3;+0:2]1:[cH;D2;+0:1]:[n;H0;D3;+0:8](:[c:9]):[c:7]:[n;H0;D2;+0:6]:1>>Br-[CH2;D2;+0:1]-[C;H0;D3;+0:2](=O)-[c;H0;D3;+0:3](:[c:4]):[c:5].[NH2;D1;+0:6]-[c:7]:[n;H0;D2;+0:8]:[c:9], which requires the reactants: COc1ccccc1C(=O)CBr (available), Cc1cc(Br)cnc1N (available), [OH-] (available).",
+ "property": {
+ "SC": 2.94,
+ "SA": 2.19
+ },
+ "retro": [
+ 10919
+ ]
+ }
+]
\ No newline at end of file
diff --git a/data/property_ranges.json b/data/property_ranges.json
new file mode 100644
index 0000000000000000000000000000000000000000..cad714371b949ecdb69cd0574d61ff202aa56cc5
--- /dev/null
+++ b/data/property_ranges.json
@@ -0,0 +1,42 @@
+{
+ "BBBP": {
+ "min": 0.0,
+ "max": 1.0
+ },
+ "HIV": {
+ "min": 0.0,
+ "max": 1.0
+ },
+ "BACE": {
+ "min": 0.0,
+ "max": 1.0
+ },
+ "CO2": {
+ "min": 0.94,
+ "max": 1019.265
+ },
+ "N2": {
+ "min": 0.0,
+ "max": 73.417
+ },
+ "O2": {
+ "min": 0.0,
+ "max": 122.94
+ },
+ "FFV": {
+ "min": 0.324,
+ "max": 0.434
+ },
+ "TC": {
+ "min": 0.117,
+ "max": 0.38
+ },
+ "SC": {
+ "min": 1.0,
+ "max": 5.0
+ },
+ "SA": {
+ "min": 1.0,
+ "max": 8.48
+ }
+}
\ No newline at end of file
diff --git a/install_environment.sh b/install_environment.sh
new file mode 100644
index 0000000000000000000000000000000000000000..878ec6131eea1bca3b9ee7a4afcb394a89e2e54b
--- /dev/null
+++ b/install_environment.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+# Set non-interactive frontend
+export DEBIAN_FRONTEND=noninteractive
+
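+# Usage: run inside the 'llamole' (or 'llama_factory') conda environment:
+#   conda activate llamole && bash install_environment.sh
+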
+# Create and activate the environment first if needed:
+# conda create --name llamole python=3.11 -y
+# conda activate llamole
+
+# Get the name of the currently active conda environment
+current_env=$(basename "$CONDA_PREFIX")
+
+# Check that the current environment is 'llamole' or 'llama_factory'
+if [ "$current_env" != "llamole" ] && [ "$current_env" != "llama_factory" ]; then
+    echo "Current environment: $current_env"
+    echo "The active conda environment is neither 'llama_factory' nor 'llamole'."
+    echo "Please activate one of these environments before running this script."
+    echo "You can activate an environment using one of these commands:"
+    echo "conda activate llama_factory"
+    echo "conda activate llamole"
+    exit 1
+fi
+
+echo "Running in conda environment: $current_env"
+
+# "pandas>=2.0.0" \
+# Install packages using pip
+pip install --no-cache-dir \
+ pyarrow \
+ "pandas>=1.5.3" \
+ "rdkit==2023.9.6" \
+ pyyaml \
+ ipykernel \
+ packaging \
+ gdown \
+ "fcd_torch==1.0.7" \
+ "omegaconf==2.3.0" \
+ "imageio==2.26.0" \
+ wandb \
+ pandarallel \
+ scipy \
+ einops \
+ sentencepiece \
+ tiktoken \
+ protobuf \
+ uvicorn \
+ pydantic \
+ fastapi \
+ sse-starlette \
+ "matplotlib>=3.7.0" \
+ fire \
+ "numpy<2.0.0" \
+ gradio
+
+pip install --no-cache-dir hydra-core --upgrade
+
+# Install PyTorch
+pip install --no-cache-dir torch
+
+# Install PyTorch Geometric and related packages
+pip install --no-cache-dir torch_geometric
+
+# for retro reaction
+pip install rdchiral
+pip install nltk
+
+# Install transformers and related packages
+pip install --no-cache-dir \
+ "transformers>=4.41.3" \
+ "datasets>=2.16.0" \
+ "accelerate>=0.30.1" \
+ "peft>=0.11.1" \
+ "trl>=0.8.6" \
+ "gradio>=4.0.0"
+
+# Install mini-moses from GitHub
+pip install --no-cache-dir git+https://github.com/igor-krawczuk/mini-moses
+
+echo "Installation complete!"
\ No newline at end of file
diff --git a/launch.py b/launch.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e31fcd739f403ca3ce24861e176ac545d76e987
--- /dev/null
+++ b/launch.py
@@ -0,0 +1,238 @@
+# Copyright 2024 Llamole Team
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import yaml
+import numpy as np
+import gradio as gr
+import random
+from rdkit import Chem
+from rdkit.Chem import Draw
+from rdkit.Chem import AllChem
+
+from src.webui.workflow import load_model_and_tokenizer, process_input, generate
+from src.webui.elements import create_input_components
+
+# Load candidates
+with open('data/molqa_material_examples.json', 'r') as f:
+ material_examples = json.load(f)
+
+with open('data/molqa_drug_examples.json', 'r') as f:
+ drug_examples = json.load(f)
+
+# Add type to each drug example (material examples are typed below,
+# together with their property processing)
+for example in drug_examples:
+    example['type'] = 'Drug'
+
+# Function to process property values
+def process_property(value):
+ return 1e-8 if value == 0 else value
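+# e.g. process_property(0) -> 1e-8 and process_property(0.94) -> 0.94, so a
+# true zero-valued gas property in an example is kept as a (tiny) constraint
+# instead of being treated as "unspecified" (0) by generate_and_visualize below.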
+
+# Add type to each material example and replace its zero-valued properties
+for example in material_examples:
+ example['type'] = 'Material'
+ for prop in ['CO2', 'N2', 'O2', 'FFV']:
+ if prop in example['property']:
+ example['property'][prop] = process_property(example['property'][prop])
+
+# Combine examples
+all_examples = material_examples + drug_examples
+
+# Get default values from the first drug example
+default_values = drug_examples[0]
+
+# Load property ranges and arguments
+with open('data/property_ranges.json', 'r') as f:
+ property_ranges = json.load(f)
+
+# with open('config/generate/qwen_material.yaml', 'r') as file:
+with open('config/generate/llama_material.yaml', 'r') as file:
+ args_dict = yaml.safe_load(file)
+
+# Load model and tokenizer outside the function
+model, tokenizer, generating_args = load_model_and_tokenizer(args_dict)
+
+def format_example(example):
+ formatted = [example['instruction']]
+
+ # Determine if it's a drug or material example based on properties
+ is_drug = any(prop in example.get('property', {}) for prop in ["HIV", "BBBP", "BACE"])
+ formatted.append("Drug" if is_drug else "Material")
+
+ # Handle drug properties
+ for prop in ["HIV", "BBBP", "BACE"]:
+ value = example.get('property', {}).get(prop, float('nan'))
+ formatted.append(value if not np.isnan(value) else "NAN")
+
+ # Handle material properties
+ for prop in ["CO2", "N2", "O2", "FFV", "TC"]:
+ value = example.get('property', {}).get(prop, float('nan'))
+ formatted.append(value if not np.isnan(value) else 0) # 0 represents NAN for material properties
+
+ # Handle synthetic properties
+ for prop in ["SC", "SA"]:
+ value = example.get('property', {}).get(prop, float('nan'))
+ formatted.append(value if not np.isnan(value) else float('nan'))
+
+ return formatted
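+# Each formatted example is a flat list: [instruction, type, HIV, BBBP, BACE,
+# CO2, N2, O2, FFV, TC, SC, SA], matching the order of the input components
+# wired to the Generate button below.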
+
+# Prepare examples
+formatted_examples = [format_example(example) for example in all_examples]
+
+def random_example(examples):
+ example = random.choice(examples)
+ property_type = example['type']
+
+ outputs = [example['instruction'], property_type]
+
+ for prop in ["HIV", "BBBP", "BACE"]:
+ outputs.append(example['property'].get(prop, "NAN"))
+
+ for prop in ["CO2", "N2", "O2", "FFV", "TC"]:
+ outputs.append(example['property'].get(prop, 0))
+
+ for prop in ["SC", "SA"]:
+ outputs.append(example['property'].get(prop, float('nan')))
+
+ return outputs
+
+def generate_and_visualize(instruction, property_type, HIV, BBBP, BACE, CO2, N2, O2, FFV, TC, SC, SA):
+ properties = {
+ "HIV": float('nan') if HIV == "NAN" else HIV,
+ "BBBP": float('nan') if BBBP == "NAN" else BBBP,
+ "BACE": float('nan') if BACE == "NAN" else BACE,
+ "CO2": float('nan') if CO2 == 0 else CO2,
+ "N2": float('nan') if N2 == 0 else N2,
+ "O2": float('nan') if O2 == 0 else O2,
+ "FFV": float('nan') if FFV == 0 else FFV,
+ "TC": float('nan') if TC == 0 else TC,
+ "SC": SC,
+ "SA": SA
+ }
+
+ # Filter out NaN values
+ properties = {k: v for k, v in properties.items() if not np.isnan(v)}
+
+ print('instruction', instruction)
+ print('properties', properties)
+ results = run_molqa(instruction, **properties)
+
+ llm_response = results.get('llm_response', 'No response generated')
+ llm_smiles = results.get('llm_smiles')
+    llm_reactions = results.get('llm_reactions', [])
+
+ molecule_img = visualize_molecule(llm_smiles) if llm_smiles else None
+
+ reaction_steps = []
+ reaction_imgs = []
+ if llm_reactions:
+ for i, reaction_dict in enumerate(llm_reactions):
+ reaction = reaction_dict.get('reaction')
+ if reaction:
+ reaction_steps.append(f"Step {i+1}: {reaction}")
+ reaction_imgs.append(visualize_reaction(reaction))
+
+ return (
+ llm_response,
+ llm_smiles if llm_smiles else "No SMILES generated",
+ molecule_img,
+ gr.JSON(value=reaction_steps, visible=bool(reaction_steps)),
+ gr.Gallery(value=reaction_imgs, visible=bool(reaction_imgs))
+ )
+
+def run_molqa(instruction: str, **properties) -> dict:
+ # Filter out properties with NaN values
+ filtered_properties = {k: v for k, v in properties.items() if not np.isnan(v)}
+
+ input_data = {
+ "instruction": instruction,
+ "input": "",
+ "property": filtered_properties
+ }
+
+ dataloader, gen_kwargs = process_input(input_data, model, tokenizer, generating_args)
+ generated_results = generate(model, dataloader, gen_kwargs)
+
+ return generated_results
+
+def visualize_molecule(smiles: str) -> np.ndarray:
+ mol = Chem.MolFromSmiles(smiles)
+ if mol is not None:
+ img = Draw.MolToImage(mol)
+ return np.array(img)
+ return np.zeros((300, 300, 3), dtype=np.uint8)
+
+def visualize_reaction(reaction: str) -> np.ndarray:
+ rxn = AllChem.ReactionFromSmarts(reaction, useSmiles=True)
+ if rxn is not None:
+ img = Draw.ReactionToImage(rxn)
+ return np.array(img)
+ return np.zeros((300, 300, 3), dtype=np.uint8)
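+# Reaction strings are SMILES of the form "A>>B" (the MolQA examples use the
+# retrosynthetic product>>reactants direction); ReactionFromSmarts with
+# useSmiles=True parses this notation directly.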
+
+# Define property names and their full descriptions
+property_names = {
+ "HIV": "HIV virus replication inhibition",
+ "BBBP": "Blood-brain barrier permeability",
+ "BACE": "Human β-secretase 1 inhibition",
+ "CO2": "CO2 Perm",
+ "N2": "N2 Perm",
+ "O2": "O2 Perm",
+ "FFV": "Fractional free volume",
+ "TC": "Thermal conductivity",
+ "SC": "Heuristic Synthetic Scores (SCScore)",
+ "SA": "Synthetic Synthetic Scores (SAScore)"
+}
+
+# Define outputs
+outputs = [
+ gr.Textbox(label="Overall LLM Response"),
+ gr.Textbox(label="Generated SMILES"),
+ gr.Image(label="Generated Molecule"),
+ gr.JSON(label="Reaction Steps"),
+ gr.Gallery(label="Reaction Visualizations")
+]
+
+with gr.Blocks() as iface:
+ gr.Markdown("# Llamole Demo Interface")
+ gr.Markdown("Enter an instruction and property values to generate a molecule design.")
+
+ interface, instruction, property_type, drug_properties, material_properties, synthetic_properties = create_input_components(default_values, property_names, property_ranges)
+
+ random_btn = gr.Button("Random Example")
+ generate_btn = gr.Button("Generate")
+
+ for output in outputs:
+ output.render()
+
+ # Update the inputs for the generate button
+ all_inputs = [instruction, property_type]
+ all_inputs.extend(drug_properties.values())
+ all_inputs.extend(material_properties.values())
+ all_inputs.extend(synthetic_properties.values())
+
+ generate_btn.click(generate_and_visualize, inputs=all_inputs, outputs=outputs)
+ random_btn.click(
+ random_example,
+ inputs=gr.State(all_examples),
+ outputs=all_inputs
+ )
+
+if __name__ == "__main__":
+ iface.launch(share=True)
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ef96f0db4d33180cbc6df272f2478ebbd05835b
--- /dev/null
+++ b/main.py
@@ -0,0 +1,66 @@
+# Copyright 2024 Llamole Team
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/summarization/run_summarization.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os
+from src.train.tuner import run_train, merge_adapter
+from src.eval.workflow import run_eval
+from tqdm import tqdm
+
+from huggingface_hub import hf_hub_download
+
+def download_data():
+ repo_id = "liuganghuggingface/Llamole-MolQA"
+ files_to_download = [
+ "molqa_drug.json",
+ "molqa_material.json",
+ "molqa_train.json"
+ ]
+ local_dir = "data"
+
+ # Create the data directory if it doesn't exist
+ os.makedirs(local_dir, exist_ok=True)
+
+ print(f"Downloading files from {repo_id} to {local_dir}/")
+ for file in tqdm(files_to_download, desc="Downloading files"):
+ try:
+ hf_hub_download(
+ repo_id=repo_id,
+ filename=file,
+ repo_type="dataset",
+ local_dir=local_dir,
+ local_dir_use_symlinks=False
+ )
+ print(f"Successfully downloaded: {file}")
+ except Exception as e:
+ print(f"Error downloading {file}: {e}")
+
+ print("Download complete!")
+
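+# Typical invocations (the command defaults to 'train' when omitted; any
+# remaining argv, e.g. a YAML config path, is presumably parsed downstream
+# by run_train / run_eval):
+#   python main.py download_data
+#   python main.py train
+#   python main.py eval
+#   python main.py export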
+if __name__ == "__main__":
+ command = sys.argv.pop(1) if len(sys.argv) != 1 else 'train'
+ if command == 'train':
+ run_train()
+ elif command == 'export':
+ merge_adapter()
+ elif command == 'eval':
+ run_eval()
+ elif command == 'download_data':
+ download_data()
+ else:
+ print(f"Invalid command: {command}. Please use 'train', 'export', 'eval', or 'download_data'.")
+ sys.exit(1)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a2fe4f69c2186fa1f41ecd3cc1c645fe9ef217d6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,30 @@
+accelerate==0.33.0
+badam==1.2.2
+bitsandbytes==0.44.1
+datasets==2.21.0
+deepspeed==0.15.2
+galore_torch==1.0
+gradio==5.0.1
+huggingface_hub==0.24.5
+jieba==0.42.1
+matplotlib==3.9.2
+MoD==0.3.0
+modelscope==1.18.1
+nltk==3.9.1
+numpy==2.1.2
+optuna==3.6.1
+packaging==24.1
+pandas==1.5.3
+peft==0.12.0
+PyYAML==6.0.2
+rdchiral==1.1.0
+rdkit==2023.9.6
+rouge_chinese==1.0.3
+safetensors==0.4.5
+torch==2.4.0
+torch_geometric==2.6.1
+transformers==4.44.0
+trl==0.9.6
+typing_extensions==4.12.2
+vllm==0.6.2
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/data/__init__.py b/src/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa8b6b9c00d80fa71c76337d15dc5f6cd8aa5175
--- /dev/null
+++ b/src/data/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# from .collator import KTODataCollatorWithPadding, PairwiseDataCollatorWithPadding
+from .data_utils import Role, split_dataset
+from .loader import get_dataset
+from .template import TEMPLATES, Template, get_template_and_fix_tokenizer
+from .collator import DataCollatorForSeqGraph
+
+__all__ = [
+ "Role",
+ "split_dataset",
+ "get_dataset",
+ "TEMPLATES",
+ "Template",
+ "get_template_and_fix_tokenizer",
+    "DataCollatorForSeqGraph",
+]
diff --git a/src/data/aligner.py b/src/data/aligner.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d33041d5f5b48196dcf4d25e87cce3d3c179fa1
--- /dev/null
+++ b/src/data/aligner.py
@@ -0,0 +1,233 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+from functools import partial
+from typing import TYPE_CHECKING, Any, Dict, List, Union, Tuple
+
+from datasets import Features
+
+from ..extras.logging import get_logger
+from .data_utils import Role
+
+if TYPE_CHECKING:
+ from datasets import Dataset, IterableDataset
+ from transformers import Seq2SeqTrainingArguments
+
+ from ..hparams import DataArguments
+ from .parser import DatasetAttr
+
+
+logger = get_logger(__name__)
+
+# NOTE: the angle-bracket special-token literals used throughout this file
+# (<mol_start>, <mol_end>, <molecule>, <design_start>, <design_end>,
+# <design_body>, <callback_start>, <callback_end>, <retro_start>, <retro_end>,
+# <retro_body>) are reconstructed; the original literals were stripped when
+# the diff was rendered as text.
+def extract_all_smiles(text):
+    pattern = r'<mol_start>(.*?)<mol_end>'
+    return re.findall(pattern, text)
+
+def replace_all_smiles(text):
+    pattern = r'<mol_start>.*?<mol_end>'
+    return re.sub(pattern, '<molecule>', text)
+
+def replace_smiles_with_callback(text):
+    def replace_mol(match):
+        design_end = match.group(1)
+        smiles = match.group(2)
+        # return f'{design_end}<callback_start>{smiles}<callback_end>'
+        return f'{design_end}<molecule><callback_start>{smiles}<callback_end>'
+
+    pattern = r'(<design_end>)<mol_start>(.*?)<mol_end>'
+    text = re.sub(pattern, replace_mol, text)
+
+    # Replace remaining molecules that are not immediately after <design_end>
+    remaining_pattern = r'<mol_start>.*?<mol_end>'
+    text = re.sub(remaining_pattern, '<molecule>', text)
+
+    return text
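+
+# For example (with the token names assumed above),
+# replace_all_smiles("The designed molecule is: <mol_start>CCO<mol_end>.")
+# returns "The designed molecule is: <molecule>.", while extract_all_smiles
+# on the same text returns ["CCO"].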
+
+def dict_to_list(data_dict, mol_properties):
+ return [data_dict.get(prop, None) for prop in mol_properties]
+
+def insert_bodies(text, num_insertions, retro_labels):
+    design_pattern = r'<design_start>(.*?)<design_end>'
+    retro_pattern = r'(This is step \d+ in the retrosynthesis process\..*?<retro_start>.*?<retro_end>)(.*?)(?=This is step \d+|$)'
+
+    def replace_design(match):
+        return '<design_start>' + ''.join(['<design_body>'] * num_insertions) + '<design_end>'
+
+    def replace_retro(match, label):
+        step_content = match.group(1)
+        remaining_text = match.group(2)
+        retro_match = re.search(r'<retro_start>(.*?)<retro_end>', step_content)
+        if retro_match and label is not None:
+            modified_content = '<retro_start>' + ''.join(['<retro_body>'] * num_insertions) + '<retro_end>'
+            return re.sub(r'<retro_start>.*?<retro_end>', modified_content, step_content)
+        return step_content + remaining_text
+
+ text = re.sub(design_pattern, replace_design, text)
+
+ steps = re.finditer(retro_pattern, text)
+ modified_text = ""
+ last_end = 0
+
+ for i, step in enumerate(steps):
+ label = retro_labels[i] if i < len(retro_labels) else None
+ modified_text += text[last_end:step.start()] + replace_retro(step, label)
+ last_end = step.end()
+
+ modified_text += text[last_end:]
+ return modified_text
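+
+# insert_bodies collapses the free-form text between the design/retro markers
+# into num_insertions placeholder body tokens (cf. data_args.learned_query_size
+# used by convert_molqa below), creating one fixed-size slot per design or
+# retro step.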
+
+def extract_retro_products(text):
+    pattern = r'<retro_start>(.*?)>>'
+    matches = re.findall(pattern, text)
+    return [match.strip() for match in matches]
+
+def convert_molqa(
+ examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments"
+) -> Dict[str, List[Any]]:
+ r"""
+ Converts alpaca format dataset to the standard format.
+ """
+ outputs = {"prompt": [], "response": [], "system": [], "molecules": [], "property": [], "retro_labels": [], "retro_products": []}
+
+ mol_properties = ['BBBP', 'HIV', 'BACE', 'CO2', 'N2', 'O2', 'FFV', 'TC', 'SC', 'SA']
+ for i in range(len(examples[dataset_attr.prompt])):
+ prompt = []
+ if dataset_attr.history and isinstance(examples[dataset_attr.history][i], list):
+ for old_prompt, old_response in examples[dataset_attr.history][i]:
+ prompt.append({"role": Role.USER.value, "content": old_prompt})
+ prompt.append({"role": Role.ASSISTANT.value, "content": old_response})
+
+ content = []
+ if dataset_attr.prompt and examples[dataset_attr.prompt][i]:
+ content.append(examples[dataset_attr.prompt][i])
+
+ if dataset_attr.query and examples[dataset_attr.query][i]:
+ content.append(examples[dataset_attr.query][i])
+
+ prompt.append({"role": Role.USER.value, "content": "\n".join(content)}) # "prompt\nquery"
+
+ if dataset_attr.response and isinstance(examples[dataset_attr.response][i], str): # normal example
+ current_response = examples[dataset_attr.response][i]
+ smiles_list = extract_all_smiles(current_response)
+ modified_response = replace_smiles_with_callback(current_response)
+ retro_labels = examples[dataset_attr.retro][i] if dataset_attr.retro else []
+ retro_products = extract_retro_products(current_response)
+ modified_response = insert_bodies(modified_response, data_args.learned_query_size, retro_labels)
+ # modified_response = insert_bodies(modified_response, dataset_attr.learned_query_size, retro_labels)
+ response = [{"role": Role.ASSISTANT.value, "content": modified_response}]
+        else:  # unsupervised example: nothing to parse from the response
+            response = []
+            smiles_list, retro_labels, retro_products = [], [], []
+
+ outputs["prompt"].append(prompt)
+ outputs["response"].append(response)
+ outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "")
+ outputs["molecules"].append(smiles_list)
+ outputs["property"].append(dict_to_list(examples[dataset_attr.property][i], mol_properties))
+ outputs["retro_labels"].append(retro_labels)
+ outputs["retro_products"].append(retro_products)
+
+ return outputs
+
+def align_dataset(
+ dataset: Union["Dataset", "IterableDataset"],
+ dataset_attr: "DatasetAttr",
+ data_args: "DataArguments",
+ training_args: "Seq2SeqTrainingArguments",
+) -> Tuple[Union["Dataset", "IterableDataset"], Dict[int, str]]:
+ r"""
+ Aligns the dataset and maps unique SMILES strings to molecule IDs.
+
+ This function performs the following operations:
+ 1. Converts the dataset to the required format (molqa).
+ 2. Extracts all unique SMILES strings from the dataset.
+ 3. Maps each unique SMILES string to a unique integer ID (0, 1, 2, ...).
+    4. Updates the 'molecules' and 'retro_products' fields of each example to contain the mapped IDs.
+
+ The aligned dataset contains the following fields:
+ prompt: [{"role": "user", "content": "..."}] * (2T - 1)
+ response: [{"role": "assistant", "content": "..."}] * N (N > 1 for ranking dataset)
+ system: "..."
+        molecules: [List of molecule IDs mapped from SMILES strings]
+        property: [List of float values]
+        retro_labels: [List of int values]
+        retro_products: [List of molecule IDs mapped from SMILES strings]
+
+ Args:
+ dataset (Union["Dataset", "IterableDataset"]): The input dataset.
+ dataset_attr (DatasetAttr): Attributes of the dataset.
+ data_args (DataArguments): Arguments for data processing.
+ training_args (Seq2SeqTrainingArguments): Arguments for training.
+
+ Returns:
+ Tuple[Union["Dataset", "IterableDataset"], Dict[int, str]]:
+ - The aligned and converted dataset with molecule IDs.
+ - A dictionary mapping molecule IDs to their SMILES strings.
+ """
+ assert dataset_attr.formatting == "molqa"
+
+ features = Features.from_dict(
+ {
+ "prompt": [
+ {"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}}
+ ],
+ "response": [
+ {"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}}
+ ],
+ "system": {"dtype": "string", "_type": "Value"},
+ "molecules": [{'dtype': "string", "_type": "Value"}],
+ "property": [{"dtype": "float", "_type": "Value"}],
+ "retro_labels": [{"dtype": "int32", "_type": "Value"}],
+ "retro_products": [{'dtype': "string", "_type": "Value"}],
+ }
+ )
+
+ convert_func = partial(convert_molqa, dataset_attr=dataset_attr, data_args=data_args)
+ aligned = dataset.map(
+ convert_func,
+ batched=True,
+ remove_columns=['instruction', 'input', 'output', 'property', 'retro'],
+ features=features,
+ num_proc=data_args.preprocessing_num_workers,
+ load_from_cache_file=(not data_args.overwrite_cache) or (training_args.local_process_index != 0),
+ desc="Converting molqa format of dataset"
+ )
+
+ # Extract all unique SMILES strings and map them to molecule IDs
+ all_smiles = set()
+ for item in aligned:
+ all_smiles.update(item['molecules'])
+ all_smiles.update(item['retro_products'])
+
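+    # e.g. sorted unique SMILES ["CCO", "c1ccccc1"] map to
+    # smiles_to_id = {"CCO": 0, "c1ccccc1": 1} and id_to_smiles = {0: "CCO", 1: "c1ccccc1"}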
+ smiles_to_id = {smiles: idx for idx, smiles in enumerate(sorted(all_smiles))}
+ id_to_smiles = {idx: smiles for smiles, idx in smiles_to_id.items()}
+
+ def map_smiles_to_id(example, smiles_to_id):
+ example['molecules'] = [smiles_to_id[smiles] for smiles in example['molecules']]
+ example['retro_products'] = [smiles_to_id[smiles] for smiles in example['retro_products']]
+ return example
+
+ smiles_convert_func = partial(map_smiles_to_id, smiles_to_id=smiles_to_id)
+
+ aligned = aligned.map(
+ smiles_convert_func,
+ desc="Mapping SMILES to molecule IDs",
+ )
+
+ return aligned, id_to_smiles
\ No newline at end of file
diff --git a/src/data/collator.py b/src/data/collator.py
new file mode 100644
index 0000000000000000000000000000000000000000..989ac073a842146529da1c9bb69146788b9dec31
--- /dev/null
+++ b/src/data/collator.py
@@ -0,0 +1,165 @@
+import torch
+import numpy as np
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
+
+from torch_geometric.data import Batch as PyGBatch
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+from transformers.utils import PaddingStrategy
+
+def pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs):
+ """
+ Pads without triggering the warning about how using the pad function is sub-optimal when using a fast tokenizer.
+ """
+ # To avoid errors when using Feature extractors
+ if not hasattr(tokenizer, "deprecation_warnings"):
+ return tokenizer.pad(*pad_args, **pad_kwargs)
+
+ # Save the state of the warning, then disable it
+ warning_state = tokenizer.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False)
+ tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
+
+ try:
+ padded = tokenizer.pad(*pad_args, **pad_kwargs)
+ finally:
+ # Restore the state of the warning.
+ tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = warning_state
+
+ return padded
+
+@dataclass
+class DataCollatorForSeqGraph:
+ """
+    Data collator that dynamically pads the tokenized inputs and labels, and
+    batches the per-example PyG graphs (molecule, design, and retro-product
+    graphs) referenced by each feature's molecule IDs.
+ """
+ tokenizer: PreTrainedTokenizerBase
+ mol_id_to_pyg: Dict[str, Any]
+ model: Optional[Any] = None
+ padding: Union[bool, str, PaddingStrategy] = True
+ max_length: Optional[int] = None
+ pad_to_multiple_of: Optional[int] = None
+ label_pad_token_id: int = -100
+ return_tensors: str = "pt"
+
+ def __call__(self, features, return_tensors=None):
+ if return_tensors is None:
+ return_tensors = self.return_tensors
+
+ label_name = "label" if "label" in features[0].keys() else "labels"
+ labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
+ if labels is not None and all(label is None for label in labels):
+ labels = None
+
+ # Store molecule_ids, retro_labels, and retro_product_ids separately and remove from non_labels_features
+ molecule_ids_list = []
+ retro_labels_list = []
+ retro_products_list = []
+ non_labels_features = []
+ for feature in features:
+ new_feature = {k: v for k, v in feature.items() if k != label_name}
+ if 'molecule_ids' in new_feature:
+ molecule_ids_list.append(new_feature['molecule_ids'])
+ del new_feature['molecule_ids']
+ else:
+ molecule_ids_list.append(None)
+ if 'retro_labels' in new_feature:
+ retro_labels_list.append(new_feature['retro_labels'])
+ del new_feature['retro_labels']
+ else:
+ retro_labels_list.append(None)
+ if 'retro_product_ids' in new_feature:
+ retro_products_list.append(new_feature['retro_product_ids'])
+ del new_feature['retro_product_ids']
+ else:
+ retro_products_list.append(None)
+ non_labels_features.append(new_feature)
+
+ # Convert molecule IDs to PyG Data objects
+ molecule_graphs_list = []
+ design_graphs_list = []
+ for seq_idx, molecule_ids in enumerate(molecule_ids_list):
+ if molecule_ids is not None and len(molecule_ids) > 0:
+ for pos, mol_id in enumerate(molecule_ids):
+ if pos == 0:
+ design_graphs_list.append(self.mol_id_to_pyg[mol_id])
+ if mol_id != self.label_pad_token_id and mol_id in self.mol_id_to_pyg:
+ molecule_graphs_list.append(self.mol_id_to_pyg[mol_id])
+
+ # Convert retro_product_ids to PyG Data objects
+ retro_product_graphs_list = []
+ for seq_idx, retro_product_ids in enumerate(retro_products_list):
+ if retro_product_ids is not None and len(retro_product_ids) > 0:
+ for pos, mol_id in enumerate(retro_product_ids):
+ if mol_id != self.label_pad_token_id and mol_id in self.mol_id_to_pyg:
+ retro_product_graphs_list.append(self.mol_id_to_pyg[mol_id])
+
+ # Batch the PyG Data objects
+ if molecule_graphs_list:
+ batched_graphs = PyGBatch.from_data_list(molecule_graphs_list)
+ else:
+ batched_graphs = None
+
+ if design_graphs_list:
+ batched_design_graphs = PyGBatch.from_data_list(design_graphs_list)
+ else:
+ batched_design_graphs = None
+
+ if retro_product_graphs_list:
+ batched_retro_products = PyGBatch.from_data_list(retro_product_graphs_list)
+ else:
+ batched_retro_products = None
+
+ # Pad retro_labels
+ if retro_labels_list and any(retro_labels is not None for retro_labels in retro_labels_list):
+ max_retro_length = max(len(retro_labels) for retro_labels in retro_labels_list if retro_labels is not None)
+ padded_retro_labels = [
+ retro_labels + [self.label_pad_token_id] * (max_retro_length - len(retro_labels)) if retro_labels is not None else [self.label_pad_token_id] * max_retro_length
+ for retro_labels in retro_labels_list
+ ]
+ else:
+ padded_retro_labels = None
+
+ # Pad other features
+ batch = pad_without_fast_tokenizer_warning(
+ self.tokenizer,
+ non_labels_features,
+ padding=self.padding,
+ max_length=self.max_length,
+ pad_to_multiple_of=self.pad_to_multiple_of,
+ return_tensors=return_tensors,
+ )
+
+ batch["molecule_graphs"] = batched_graphs
+ batch["design_graphs"] = batched_design_graphs
+ batch["retro_product_graphs"] = batched_retro_products
+ batch["retro_labels"] = torch.tensor(padded_retro_labels, dtype=torch.int64)
+
+ # Pad labels
+ if labels is not None:
+ max_label_length = max(len(l) for l in labels)
+ if self.pad_to_multiple_of is not None:
+ max_label_length = (
+ (max_label_length + self.pad_to_multiple_of - 1)
+ // self.pad_to_multiple_of
+ * self.pad_to_multiple_of
+ )
+
+ padding_side = self.tokenizer.padding_side
+ padded_labels = [
+ label + [self.label_pad_token_id] * (max_label_length - len(label))
+ if padding_side == "right"
+ else [self.label_pad_token_id] * (max_label_length - len(label)) + label
+ for label in labels
+ ]
+ batch["labels"] = torch.tensor(padded_labels, dtype=torch.int64)
+
+ # Prepare decoder_input_ids
+ if (
+ labels is not None
+ and self.model is not None
+ and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
+ ):
+ decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(labels=batch["labels"])
+ batch["decoder_input_ids"] = decoder_input_ids
+
+ return batch
\ No newline at end of file
diff --git a/src/data/data_utils.py b/src/data/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..561aadb7720af4b085ff59a364b9744daa16b954
--- /dev/null
+++ b/src/data/data_utils.py
@@ -0,0 +1,82 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum, unique
+from typing import TYPE_CHECKING, Dict, List, Sequence, Set, Union
+
+from datasets import concatenate_datasets, interleave_datasets
+
+from ..extras.logging import get_logger
+
+
+if TYPE_CHECKING:
+ from datasets import Dataset, IterableDataset
+ from transformers import Seq2SeqTrainingArguments
+
+ from ..hparams import DataArguments
+
+
+logger = get_logger(__name__)
+
+
+SLOTS = Sequence[Union[str, Set[str], Dict[str, str]]]
+
+
+@unique
+class Role(str, Enum):
+ USER = "user"
+ ASSISTANT = "assistant"
+ SYSTEM = "system"
+ FUNCTION = "function"
+ OBSERVATION = "observation"
+
+
+def merge_dataset(
+ all_datasets: List[Union["Dataset", "IterableDataset"]],
+ data_args: "DataArguments",
+ training_args: "Seq2SeqTrainingArguments",
+) -> Union["Dataset", "IterableDataset"]:
+ if len(all_datasets) == 1:
+ return all_datasets[0]
+ elif data_args.mix_strategy == "concat":
+ if data_args.streaming:
+ logger.warning("The samples between different datasets will not be mixed in streaming mode.")
+ return concatenate_datasets(all_datasets)
+ elif data_args.mix_strategy.startswith("interleave"):
+ if not data_args.streaming:
+ logger.warning("We recommend using `mix_strategy=concat` in non-streaming mode.")
+ return interleave_datasets(
+ datasets=all_datasets,
+ probabilities=data_args.interleave_probs,
+ seed=training_args.seed,
+ stopping_strategy="first_exhausted" if data_args.mix_strategy.endswith("under") else "all_exhausted",
+ )
+ else:
+ raise ValueError("Unknown mixing strategy.")
+
+
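+# split_dataset treats val_size as a fraction when 0 < val_size <= 1 and as an
+# absolute example count when val_size > 1: e.g. val_size=0.1 gives a 90/10
+# train/eval split, while val_size=1000 reserves exactly 1000 eval examples.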
+def split_dataset(
+ dataset: Union["Dataset", "IterableDataset"], data_args: "DataArguments", training_args: "Seq2SeqTrainingArguments"
+) -> Dict[str, "Dataset"]:
+ if training_args.do_train:
+ if data_args.val_size > 1e-6: # Split the dataset
+ val_size = int(data_args.val_size) if data_args.val_size > 1 else data_args.val_size
+ dataset = dataset.train_test_split(test_size=val_size, seed=training_args.seed)
+ return {"train_dataset": dataset["train"], "eval_dataset": dataset["test"]}
+ else:
+ if data_args.streaming:
+ dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed)
+ return {"train_dataset": dataset}
+ else: # do_eval or do_predict
+ return {"eval_dataset": dataset}
diff --git a/src/data/formatter.py b/src/data/formatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..2086900afefa97481f8e30838b69ec4827bbf3ad
--- /dev/null
+++ b/src/data/formatter.py
@@ -0,0 +1,139 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import re
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import List, Literal, Optional, Tuple, Union
+
+from .data_utils import SLOTS
+from .tool_utils import DefaultToolUtils, GLM4ToolUtils
+
+
+@dataclass
+class Formatter(ABC):
+ slots: SLOTS = field(default_factory=list)
+ tool_format: Optional[Literal["default", "glm4"]] = None
+
+ @abstractmethod
+ def apply(self, **kwargs) -> SLOTS: ...
+
+ def extract(self, content: str) -> Union[str, List[Tuple[str, str]]]:
+ raise NotImplementedError
+
+
+@dataclass
+class EmptyFormatter(Formatter):
+ def __post_init__(self):
+ has_placeholder = False
+ for slot in filter(lambda s: isinstance(s, str), self.slots):
+ if re.search(r"\{\{[a-zA-Z_][a-zA-Z0-9_]*\}\}", slot):
+ has_placeholder = True
+
+ if has_placeholder:
+ raise ValueError("Empty formatter should not contain any placeholder.")
+
+ def apply(self, **kwargs) -> SLOTS:
+ return self.slots
+
+
+@dataclass
+class StringFormatter(Formatter):
+ def __post_init__(self):
+ has_placeholder = False
+ for slot in filter(lambda s: isinstance(s, str), self.slots):
+ if re.search(r"\{\{[a-zA-Z_][a-zA-Z0-9_]*\}\}", slot):
+ has_placeholder = True
+
+ if not has_placeholder:
+ raise ValueError("A placeholder is required in the string formatter.")
+
+ def apply(self, **kwargs) -> SLOTS:
+ elements = []
+ for slot in self.slots:
+ if isinstance(slot, str):
+ for name, value in kwargs.items():
+ if not isinstance(value, str):
+ raise RuntimeError("Expected a string, got {}".format(value))
+
+ slot = slot.replace("{{" + name + "}}", value, 1)
+ elements.append(slot)
+ elif isinstance(slot, (dict, set)):
+ elements.append(slot)
+ else:
+ raise RuntimeError("Input must be string, set[str] or dict[str, str], got {}".format(type(slot)))
+
+ return elements
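+
+# e.g. StringFormatter(slots=["<|user|>\n{{content}}"]).apply(content="Hi")
+# returns ["<|user|>\nHi"].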
+
+@dataclass
+class FunctionFormatter(Formatter):
+ def __post_init__(self):
+ if self.tool_format == "default":
+ self.slots = DefaultToolUtils.get_function_slots() + self.slots
+ elif self.tool_format == "glm4":
+ self.slots = GLM4ToolUtils.get_function_slots() + self.slots
+ else:
+ raise NotImplementedError("Tool format {} was not found.".format(self.tool_format))
+
+ def apply(self, **kwargs) -> SLOTS:
+ content = kwargs.pop("content")
+ functions: List[Tuple[str, str]] = []
+ try:
+ tool_calls = json.loads(content)
+ if not isinstance(tool_calls, list): # parallel function call
+ tool_calls = [tool_calls]
+
+ for tool_call in tool_calls:
+ functions.append((tool_call["name"], json.dumps(tool_call["arguments"], ensure_ascii=False)))
+
+ except json.JSONDecodeError:
+ functions = []
+
+ elements = []
+ for name, arguments in functions:
+ for slot in self.slots:
+ if isinstance(slot, str):
+ slot = slot.replace("{{name}}", name).replace("{{arguments}}", arguments)
+ elements.append(slot)
+ elif isinstance(slot, (dict, set)):
+ elements.append(slot)
+ else:
+ raise RuntimeError("Input must be string, set[str] or dict[str, str], got {}".format(type(slot)))
+
+ return elements
+
+
+@dataclass
+class ToolFormatter(Formatter):
+ def __post_init__(self):
+ if self.tool_format == "default":
+ self._tool_formatter = DefaultToolUtils.tool_formatter
+ self._tool_extractor = DefaultToolUtils.tool_extractor
+ elif self.tool_format == "glm4":
+ self._tool_formatter = GLM4ToolUtils.tool_formatter
+ self._tool_extractor = GLM4ToolUtils.tool_extractor
+ else:
+ raise NotImplementedError("Tool format {} was not found.".format(self.tool_format))
+
+ def apply(self, **kwargs) -> SLOTS:
+ content = kwargs.pop("content")
+ try:
+ tools = json.loads(content)
+ return [self._tool_formatter(tools) if len(tools) != 0 else ""]
+ except json.JSONDecodeError:
+ return [""]
+
+ def extract(self, content: str) -> Union[str, List[Tuple[str, str]]]:
+ return self._tool_extractor(content)
diff --git a/src/data/loader.py b/src/data/loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..0647d46e70e155e86eff42ac0243a2276a4ed83e
--- /dev/null
+++ b/src/data/loader.py
@@ -0,0 +1,149 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import os
+import sys
+from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Tuple, Union
+from functools import partial
+
+import numpy as np
+from datasets import load_dataset, load_from_disk
+
+# from ..extras.constants import FILEEXT2TYPE
+from ..extras.logging import get_logger
+from ..extras.misc import has_tokenized_data
+from .aligner import align_dataset
+from .data_utils import merge_dataset
+from .parser import get_dataset_attr
+# from .preprocess import get_preprocess_and_print_func
+from .template import get_template_and_fix_tokenizer
+
+from .processors.mmsupervised import (
+ preprocess_mmsupervised_dataset,
+ print_supervised_dataset_example,
+ encode_graph_pyg
+)
+
+if TYPE_CHECKING:
+ from datasets import Dataset, IterableDataset
+ from transformers import PreTrainedTokenizer, ProcessorMixin, Seq2SeqTrainingArguments
+
+ from ..hparams import DataArguments, ModelArguments
+ from .parser import DatasetAttr
+
+
+logger = get_logger(__name__)
+
+
+def load_single_dataset(
+ dataset_attr: "DatasetAttr",
+ model_args: "ModelArguments",
+ data_args: "DataArguments",
+ training_args: "Seq2SeqTrainingArguments",
+) -> Tuple[Union["Dataset", "IterableDataset"], Dict[int, str]]:
+ logger.info("Loading dataset {}...".format(dataset_attr))
+
+ data_files = []
+ assert dataset_attr.load_from == "file"
+
+    data_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
+    data_files.append(data_path)
+    data_path = data_path.split(".")[-1]  # file extension, e.g. "json", selects the generic loader
+
+ if "trust_remote_code" in inspect.signature(load_dataset).parameters: # for datasets==2.16.0
+ kwargs = {"trust_remote_code": True}
+ else:
+ kwargs = {}
+
+ dataset = load_dataset(
+ path=data_path,
+ name=None,
+ data_dir=None,
+ data_files=data_files,
+ split=data_args.split,
+ cache_dir=model_args.cache_dir,
+ token=model_args.hf_hub_token,
+ streaming=False,
+ **kwargs,
+ )
+
+ converted_dataset, mol_id_to_smiles = align_dataset(dataset, dataset_attr, data_args, training_args)
+ return converted_dataset, mol_id_to_smiles
+
+def get_dataset(
+ model_args: "ModelArguments",
+ data_args: "DataArguments",
+ training_args: "Seq2SeqTrainingArguments",
+ tokenizer: "PreTrainedTokenizer",
+) -> Tuple[Dict[int, Any], Union["Dataset", "IterableDataset"]]:
+
+ template = get_template_and_fix_tokenizer(tokenizer, data_args.template, data_args.tool_format)
+ if data_args.train_on_prompt and template.efficient_eos:
+ raise ValueError("Current template does not support `train_on_prompt`.")
+ print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer)
+
+ # Load tokenized dataset
+ if data_args.tokenized_path is not None:
+ if has_tokenized_data(data_args.tokenized_path):
+ mol_id_to_pyg = encode_graph_pyg(data_path=data_args.tokenized_path)
+ logger.warning("Loading dataset from disk will ignore other data arguments.")
+ dataset = load_from_disk(data_args.tokenized_path)
+ logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path))
+            print_function(next(iter(dataset)))
+ return mol_id_to_pyg, dataset
+
+    # Load and preprocess the raw dataset
+ with training_args.main_process_first(desc="load dataset"):
+ # current only support one dataset
+ dataset_attr = get_dataset_attr(data_args)
+ dataset, mol_id_to_smiles = load_single_dataset(dataset_attr, model_args, data_args, training_args)
+
+ with training_args.main_process_first(desc="pre-process dataset"):
+ preprocess_func = partial(
+ preprocess_mmsupervised_dataset,
+ template=template,
+ tokenizer=tokenizer,
+ data_args=data_args,
+ )
+
+ column_names = list(next(iter(dataset)).keys())
+ kwargs = dict(
+ num_proc=data_args.preprocessing_num_workers,
+ load_from_cache_file=(not data_args.overwrite_cache) or (training_args.local_process_index != 0),
+ desc="Running tokenizer on dataset",
+ )
+
+ dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs)
+
+ if data_args.tokenized_path is not None:
+ if training_args.should_save:
+ dataset.save_to_disk(data_args.tokenized_path)
+ mol_id_to_pyg = encode_graph_pyg(data_path=data_args.tokenized_path, mol_id_to_smiles=mol_id_to_smiles)
+ logger.info("Tokenized dataset saved at {}.".format(data_args.tokenized_path))
+ logger.info("Please restart the training with `tokenized_path: {}`.".format(data_args.tokenized_path))
+ sys.exit(0)
+ else:
+ mol_id_to_pyg = encode_graph_pyg(mol_id_to_smiles=mol_id_to_smiles)
+
+ if training_args.should_log:
+ try:
+ print_function(next(iter(dataset)))
+ except StopIteration:
+ raise RuntimeError("Cannot find valid samples.")
+
+ return mol_id_to_pyg, dataset
diff --git a/src/data/parser.py b/src/data/parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..8364e965e693506fe49fc824dfcffb21c16abf69
--- /dev/null
+++ b/src/data/parser.py
@@ -0,0 +1,113 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional
+
+from ..extras.constants import DATA_CONFIG
+from ..extras.misc import use_modelscope
+
+
+if TYPE_CHECKING:
+ from ..hparams import DataArguments
+
+
+@dataclass
+class DatasetAttr:
+ r"""
+ Dataset attributes.
+ """
+
+ # basic configs
+ load_from: Literal["hf_hub", "ms_hub", "script", "file"]
+ dataset_name: str
+ formatting: Literal["alpaca", "sharegpt", "molqa"] = "molqa"
+ ranking: bool = False
+ # extra configs
+ subset: Optional[str] = None
+ folder: Optional[str] = None
+ num_samples: Optional[int] = None
+ # common columns
+ system: Optional[str] = None
+ tools: Optional[str] = None
+ images: Optional[str] = None
+ # rlhf columns
+ chosen: Optional[str] = None
+ rejected: Optional[str] = None
+ kto_tag: Optional[str] = None
+ # alpaca columns
+ prompt: Optional[str] = "instruction"
+ query: Optional[str] = "input"
+ response: Optional[str] = "output"
+ history: Optional[str] = None
+ # sharegpt columns
+ messages: Optional[str] = "conversations"
+ # sharegpt tags
+ role_tag: Optional[str] = "from"
+ content_tag: Optional[str] = "value"
+ user_tag: Optional[str] = "human"
+ assistant_tag: Optional[str] = "gpt"
+ observation_tag: Optional[str] = "observation"
+ function_tag: Optional[str] = "function_call"
+ system_tag: Optional[str] = "system"
+ # molqa columns
+ property: Optional[str] = "property"
+ retro: Optional[str] = "retro"
+
+ def __repr__(self) -> str:
+ return self.dataset_name
+
+ def set_attr(self, key: str, obj: Dict[str, Any], default: Optional[Any] = None) -> None:
+ setattr(self, key, obj.get(key, default))
+
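+
+# DATA_CONFIG (a JSON file under `dataset_dir`) maps dataset names to entries
+# whose keys mirror the DatasetAttr fields above. A hypothetical entry:
+#
+#   {"molqa": {"file_name": "molqa.json",
+#              "formatting": "molqa",
+#              "columns": {"prompt": "instruction", "query": "input",
+#                          "response": "output", "property": "property",
+#                          "retro": "retro"}}}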
+def get_dataset_attr(data_args: "DataArguments") -> "DatasetAttr":
+ if data_args.dataset is not None:
+ dataset_name = data_args.dataset.strip()
+ else:
+ raise ValueError("Please specify the dataset name.")
+
+ try:
+ with open(os.path.join(data_args.dataset_dir, DATA_CONFIG), "r") as f:
+ dataset_info = json.load(f)
+ except Exception as err:
+ raise ValueError(
+ "Cannot open {} due to {}.".format(os.path.join(data_args.dataset_dir, DATA_CONFIG), str(err))
+ ) from err
+
+ if dataset_name not in dataset_info:
+ raise ValueError("Undefined dataset {} in {}.".format(dataset_name, DATA_CONFIG))
+
+ dataset_attr = DatasetAttr("file", dataset_name=dataset_info[dataset_name]["file_name"])
+
+ dataset_attr.set_attr("formatting", dataset_info[dataset_name], default="molqa")
+ dataset_attr.set_attr("ranking", dataset_info[dataset_name], default=False)
+ dataset_attr.set_attr("subset", dataset_info[dataset_name])
+ dataset_attr.set_attr("folder", dataset_info[dataset_name])
+ dataset_attr.set_attr("num_samples", dataset_info[dataset_name])
+
+ if "columns" in dataset_info[dataset_name]:
+ column_names = ["system", "tools", "images", "chosen", "rejected", "kto_tag"]
+ assert dataset_attr.formatting == "molqa"
+ column_names.extend(["prompt", "query", "response", "history", "property", "retro"])
+
+ for column_name in column_names:
+ dataset_attr.set_attr(column_name, dataset_info[dataset_name]["columns"])
+
+ return dataset_attr
\ No newline at end of file
diff --git a/src/data/processors/__init__.py b/src/data/processors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/data/processors/mmsupervised.py b/src/data/processors/mmsupervised.py
new file mode 100644
index 0000000000000000000000000000000000000000..7597e902c6ad7d7fc41a530362a2ed30d84930a8
--- /dev/null
+++ b/src/data/processors/mmsupervised.py
@@ -0,0 +1,335 @@
+# Copyright 2024 Llamole Team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple
+
+from ...extras.constants import IGNORE_INDEX, BOND_INDEX, NO_LABEL_INDEX
+from ...extras.logging import get_logger
+
+if TYPE_CHECKING:
+ from transformers import PreTrainedTokenizer, ProcessorMixin
+
+ from ...hparams import DataArguments
+ from ..template import Template
+
+import os
+from rdkit import Chem
+import torch
+from torch_geometric.data import Data, Batch
+import pickle
+
+logger = get_logger(__name__)
+
+
+def infer_seqlen(source_len: int, target_len: int, cutoff_len: int) -> Tuple[int, int]:
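+ """
+ Computes how many source and target tokens to keep so that a (source, target)
+ pair fits within `cutoff_len`: a short side keeps its full length and the
+ remaining budget goes to the other side; otherwise the budget is split
+ proportionally between the two.
+ """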
+ if target_len * 2 < cutoff_len: # truncate source
+ max_target_len = cutoff_len
+ elif source_len * 2 < cutoff_len: # truncate target
+ max_target_len = cutoff_len - source_len
+ else: # truncate both
+ max_target_len = int(cutoff_len * (target_len / (source_len + target_len)))
+
+ new_target_len = min(max_target_len, target_len)
+ new_source_len = max(cutoff_len - new_target_len, 0)
+ return new_source_len, new_target_len
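+
+# Illustrative values for the budget logic above (computed by hand):
+#   infer_seqlen(source_len=8, target_len=3, cutoff_len=10) -> (7, 3)  # short target, source gets the rest
+#   infer_seqlen(source_len=8, target_len=8, cutoff_len=10) -> (5, 5)  # comparable lengths, proportional split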
+
+def encode_graph_pyg(
+ data_path: Optional[str] = None, mol_id_to_smiles: Optional[Dict[str, str]] = None
+) -> Dict[str, Data]:
+ """
+ Converts molecule data to a dictionary of PyTorch Geometric Data objects, with caching functionality.
+ Uses a sparse representation for efficiency.
+
+ Args:
+ data_path (Optional[str]): Path to the Hugging Face dataset folder.
+ mol_id_to_smiles (Optional[Dict[str, str]]): Dictionary where keys are molecule IDs
+ and values are SMILES strings.
+
+ Returns:
+ Dict[str, Data]: Dictionary where keys are molecule IDs and values are
+ PyTorch Geometric Data objects.
+
+ Raises:
+ ValueError: If both data_path and mol_id_to_smiles are None, or if a
+ SMILES string cannot be parsed.
+ """
+ print(f"Current execution directory: {os.getcwd()}")
+
+ if data_path is None and mol_id_to_smiles is None:
+ raise ValueError("Either data_path or mol_id_to_smiles must be provided.")
+
+ if data_path is not None:
+ cache_file = os.path.join(data_path, "pyg_molecule.pickle")
+
+ # Try to load cached data
+ if os.path.exists(cache_file):
+ try:
+ with open(cache_file, "rb") as f:
+ return pickle.load(f)
+ except Exception as e:
+ logger.warning(f"Failed to load cached data, rebuilding: {e}")
+
+ mol_id_to_pyg = {}
+
+ for mol_id, smiles in mol_id_to_smiles.items():
+ mol = Chem.MolFromSmiles(smiles)
+ if mol is None:
+ raise ValueError(f"Invalid SMILES string for molecule {mol_id}: {smiles}")
+
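+ # Node features: one categorical type index per heavy atom, where the index
+ # is atomic number - 2 and the wildcard atom "*" takes the last slot (119 - 2).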
+ type_idx = []
+ heavy_atom_indices = []
+ for atom in mol.GetAtoms():
+ if atom.GetAtomicNum() != 1: # Exclude hydrogen atoms
+ type_idx.append(
+ 119 - 2 if atom.GetSymbol() == "*" else atom.GetAtomicNum() - 2
+ )
+ heavy_atom_indices.append(atom.GetIdx())
+
+ x = torch.LongTensor(type_idx)
+
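+ # Edges: each bond between heavy atoms is stored in both directions, with its
+ # BOND_INDEX type as a categorical edge attribute.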
+ edge_index = []
+ edge_attr = []
+ for bond in mol.GetBonds():
+ start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
+ if start in heavy_atom_indices and end in heavy_atom_indices:
+ start_new, end_new = heavy_atom_indices.index(
+ start
+ ), heavy_atom_indices.index(end)
+ edge_index.extend([[start_new, end_new], [end_new, start_new]])
+ bond_type = BOND_INDEX[bond.GetBondType()]
+ edge_attr.extend([bond_type, bond_type])
+
+ if edge_index:
+ edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
+ edge_attr = torch.tensor(edge_attr, dtype=torch.long)
+ else: # no bonds (e.g. a single-atom molecule)
+ edge_index = torch.empty((2, 0), dtype=torch.long)
+ edge_attr = torch.empty((0,), dtype=torch.long)
+
+ # Create PyG Data object
+ data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
+
+ mol_id_to_pyg[mol_id] = data
+
+ # Save cached data if data_path is provided
+ if data_path is not None:
+ with open(cache_file, "wb") as f:
+ pickle.dump(mol_id_to_pyg, f)
+
+ print(f"Saved PyG data to {cache_file}")
+
+ return mol_id_to_pyg
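+
+# Minimal usage sketch (hypothetical molecule ID and SMILES, illustration only):
+#   graphs = encode_graph_pyg(mol_id_to_smiles={"mol_0": "CCO"})
+#   graphs["mol_0"].x           # LongTensor([4, 4, 6]): C, C, O type indices
+#   graphs["mol_0"].edge_index  # shape [2, 4]: both bonds stored in each direction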
+
+def encode_supervised_example(
+ prompt: Sequence[Dict[str, str]],
+ response: Sequence[Dict[str, str]],
+ system: Optional[str],
+ molecule_ids: List[int],
+ retro_product_ids: List[int],
+ retro_labels: List[int],
+ template: "Template",
+ tokenizer: "PreTrainedTokenizer",
+ data_args: "DataArguments",
+) -> Tuple[List[int], List[int], List[int], List[int], List[int]]:
+
+ messages = prompt + response
+ input_ids, labels = [], []
+ final_molecule_ids = []
+ final_product_ids = []
+ final_retro_labels = []
+
+ encoded_pairs = template.encode_multiturn(tokenizer, messages, system)
+ special_tokens = [
+ "",
+ "",
+ "",
+ "",
+ "",
+ "