phongnp2010 committed · Commit e94ce8c · verified · 1 Parent(s): d727134

Create README.md

Files changed (1): README.md (+90, -0)

README.md ADDED

# 1. Import necessary libraries
import os
import torch
import time
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from accelerate import Accelerator
import config_train as cfg

accelerator = Accelerator()

# 2. Model and dataset configuration
model_name = cfg.model_name
# new_model = "Llama-2-7b-chat-finetune-qlora"
# new_model = "/mnt/md1/check_point_text_recognition/ckpt_chatbot/checkpoint-53390"
new_model = "/mnt/md1/check_point_text_recognition/ckpt_chatbot/241202/checkpoint-2700"
# device_map = {"":0}

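# Note: config_train is imported above but not included in this repo. A minimal,
# hypothetical sketch of what it presumably provides (attribute names kept, values
# are assumptions, not the real configuration):
#
#   # config_train.py
#   model_name = "meta-llama/Llama-2-7b-chat-hf"  # assumed base model id
#   device_map = {"": 0}                          # assumed: place the whole model on GPU 0
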
# 3. Tokenizer and PEFT configuration
# Load the LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

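# Optional sanity check (a hedged sketch, not part of the original script): LLaMA
# tokenizers ship without a pad token, which is why pad_token is mapped to eos_token
# above. The lines below confirm the setting and show how long a wrapped prompt is
# relative to the max_length=2048 used later; sample_prompt is a made-up example.
# sample_prompt = "How to train a LLM model"
# encoded = tokenizer(f"<s>[INST] {sample_prompt} [/INST]", return_tensors="pt")
# print(tokenizer.pad_token, tokenizer.eos_token, encoded.input_ids.shape[1])
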
# 4. Load the model for inference
'''
If the model is loaded in full precision (float32), it requires more memory.
For large models like LLaMA-2 7B, this can consume a significant amount of GPU memory.
'''
# Step 1: Load the base model
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_name,             # the original base model's name or path
#     device_map=device_map,  # or specify your device
# )
'''
Mixed precision: FP16 uses 16-bit floating-point numbers, which reduces memory usage
and lets the model fit into GPU memory more easily. This can slightly reduce numerical
accuracy, but for most NLP tasks the difference is negligible.
'''
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=cfg.device_map,
)
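# Alternative (not used in this script, a hedged sketch): the BitsAndBytesConfig
# imported above can load the base model in 4-bit NF4 instead of FP16, cutting memory
# further. Assumes bitsandbytes is installed; the values are illustrative. Note that
# merging LoRA weights into a quantized base model may not be supported by all peft
# versions.
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     device_map=cfg.device_map,
# )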
# Step 2: Load the fine-tuned LoRA adapter (saved with trainer.model.save_pretrained)
model = PeftModel.from_pretrained(base_model, new_model)  # `new_model` is the path where the adapter was saved

# Step 3: Merge the LoRA weights into the base model
model = model.merge_and_unload()
model, tokenizer = accelerator.prepare(model, tokenizer)  # Wrap model and tokenizer with Accelerator

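# Optional (a hedged sketch, not in the original workflow): after merge_and_unload()
# the LoRA weights live in the model itself, so it can be saved as a standalone
# checkpoint and reloaded later without peft. The output path is hypothetical.
# merged_dir = "/tmp/llama2-chat-merged"  # hypothetical output path
# accelerator.unwrap_model(model).save_pretrained(merged_dir)
# tokenizer.save_pretrained(merged_dir)
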
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# 5. Run the text-generation pipeline with the fine-tuned model
# prompt = "How can I learn to optimize my webpage for search engines?"

prompt_path = "/mnt/md1/check_point_text_recognition/ckpt_chatbot/prompt_for_test.txt"

prompt = '''
How to train a LLM model
'''  # example prompt; it is overwritten by input() in the loop below

# base_model was modified in place by merge_and_unload(), so it already carries the fine-tuned weights
pipe = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=2048)
while True:
    prompt = input("Type your question: ")  # typing '0' exits; anything else triggers a run
    if prompt != '0':
        # The prompt sent to the model is read from prompt_path, not from stdin
        with open(prompt_path, 'r') as file:
            text = file.read().strip()
        start = time.time()
        result = pipe(f"<s>[INST] {text} [/INST]")
        result = result[0]['generated_text']
        answer = result.split('[/INST]')[1].split('</s>')[0].strip()
        print('Answer:', answer)
        print('time:', time.time() - start)
    else:
        print('Thank you!')
        exit(0)
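
# Alternative to the pipeline above (a hedged sketch): calling model.generate()
# directly gives finer control over decoding (max_new_tokens, sampling, temperature).
# The prompt string is illustrative.
# inputs = tokenizer("<s>[INST] How to train a LLM model [/INST]", return_tensors="pt").to(model.device)
# output_ids = model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.7)
# print(tokenizer.decode(output_ids[0], skip_special_tokens=True))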