Steven10429 committed on
Commit cfd08d1 · 1 Parent(s): 039130e
Files changed (1)
  1. app.py +28 -27
app.py CHANGED
@@ -1,6 +1,6 @@
  import os
  import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
  from peft import PeftModel, PeftConfig
  from huggingface_hub import login, create_repo, HfApi
  import gradio as gr
@@ -10,6 +10,8 @@ from gradio_log import Log
  import logging
 
 
+
+
  MEMORY = int(os.getenv("MEMORY", 16)[:-2]) # 64Gi
  CPU_CORES = int(os.getenv("CPU_CORES", 4)) # 4
  SPACE_AUTHOR_NAME = os.getenv("SPACE_AUTHOR_NAME", "Steven10429") # str
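A side note on the unchanged MEMORY line above: the [:-2] slice assumes a Kubernetes-style value such as "64Gi". A small illustrative sketch (the variable names here are placeholders, not part of app.py):

# Illustration only: how the MEMORY env var is expected to look.
mem_str = "64Gi"                 # e.g. the Space's memory limit
memory_gib = int(mem_str[:-2])   # drops the trailing "Gi" -> 64
# Caveat: if MEMORY is unset, os.getenv(..., 16) returns the int 16,
# which cannot be sliced; a string default such as "16Gi" would be needed.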
@@ -122,15 +124,16 @@ def download_and_merge_model(base_model_name, lora_model_name, output_dir, devic
      6. 调整合并模型嵌入层尺寸并保存
      """
      os.makedirs("temp", exist_ok=True)
+     bnb_config = BitsAndBytesConfig(load_in_8bit=True)
      log.info("Loading base model...")
-     model = AutoModelForCausalLM.from_pretrained(base_model_name, low_cpu_mem_usage=True, device_map="auto", force_download=True, trust_remote_code=True, torch_dtype=torch.float16, cache_dir="temp")
+     model = AutoModelForCausalLM.from_pretrained(base_model_name, low_cpu_mem_usage=True, device_map="auto", force_download=True, trust_remote_code=True, quantization_config=bnb_config, cache_dir="temp")
      log.info("Loading adapter tokenizer...")
      adapter_tokenizer = AutoTokenizer.from_pretrained(lora_model_name, trust_remote_code=True, device_map="auto", force_download=True)
      log.info("Resizing token embeddings...")
      added_tokens_decoder = adapter_tokenizer.added_tokens_decoder
      model.resize_token_embeddings(adapter_tokenizer.vocab_size + len(added_tokens_decoder))
      log.info("Loading LoRA adapter...")
-     peft_model = PeftModel.from_pretrained(model, lora_model_name, low_cpu_mem_usage=True, device_map="auto", force_download=True, trust_remote_code=True, torch_dtype=torch.float16, cache_dir="temp")
+     peft_model = PeftModel.from_pretrained(model, lora_model_name, low_cpu_mem_usage=True, device_map="auto", force_download=True, trust_remote_code=True, quantization_config=bnb_config, cache_dir="temp")
      log.info("Merging and unloading model...")
      model = peft_model.merge_and_unload()
      log.info("Saving model...")
@@ -192,25 +195,25 @@ def quantize(model_path, repo_id, quant_method=None):
      os.makedirs(model_output_dir, exist_ok=True)
 
      # 中间文件保存在 model_output 目录下
-     guff_16_path =f"./{repo_id}-f16.gguf"
+     guff_8_path =f"./{repo_id}-q8_0.gguf"
 
-     if not os.path.exists(guff_16_path):
+     if not os.path.exists(guff_8_path):
          log.info(f"正在将模型转换为GGML格式")
          convert_script = os.path.join(llamacpp_dir, "convert_hf_to_gguf.py")
-         convert_cmd = f"python {convert_script} {model_path} --outfile {guff_16_path}"
+         convert_cmd = f"python {convert_script} {model_path} --outfile {guff_8_path} --outtype q8_0"
          print(f"syscall:[{convert_cmd}]")
          os.system(convert_cmd)
      else:
          log.info(f"GGML中间文件已存在,跳过转换")
 
-     if quant_method == "fp16":
-         return guff_16_path # for upload to hub
+     if quant_method.lower() == "q8_0":
+         return guff_8_path # for upload to hub
 
      # 最终文件保存在 model_output 目录下
      final_path = os.path.join(model_output_dir, f"{repo_id}-{quant_method}.gguf")
      log.info(f"正在进行{quant_method}量化")
      quantize_bin = os.path.join(llamacpp_dir, "build", "bin", "llama-quantize")
-     quant_cmd = f"{quantize_bin} {guff_16_path} {final_path} {quant_method}"
+     quant_cmd = f"{quantize_bin} {guff_8_path} {final_path} {quant_method}"
      print(f"syscall:[{quant_cmd}]")
 
      if not os.path.exists(final_path):
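For reference, the two llama.cpp steps this function drives, written as a standalone sketch; the checkout path, output names and the Q4_K_M target are assumptions, and llama-quantize is presumed to be already built under build/bin:

import subprocess

llamacpp_dir = "./llama.cpp"     # assumed llama.cpp checkout
merged_dir = "./merged_model"    # HF-format model from the merge step
gguf_q8 = "./model-q8_0.gguf"    # intermediate GGUF written by the converter

# 1) Convert the HF checkpoint to GGUF, writing q8_0 weights directly.
subprocess.run(
    ["python", f"{llamacpp_dir}/convert_hf_to_gguf.py", merged_dir,
     "--outfile", gguf_q8, "--outtype", "q8_0"],
    check=True,
)

# 2) Re-quantize the q8_0 GGUF to a smaller scheme with llama-quantize.
subprocess.run(
    [f"{llamacpp_dir}/build/bin/llama-quantize", gguf_q8, "./model-Q4_K_M.gguf", "Q4_K_M"],
    check=True,
)

Starting from a q8_0 intermediate rather than the earlier f16 file roughly halves the size of the temporary GGUF, at a small potential cost in fidelity for the downstream quantizations.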
@@ -230,14 +233,12 @@ tags:
  - text-generation
  - peft{quantization}
  library_name: transformers
- base_model: {base_model_name}
+ base_model: {base_model_name}{lora_model_name}
  widget:
  - messages:
    - role: user
      content: What is your favorite condiment?
  license: other
- datasets:
- - {lora_model_name}
  ---
  # Model
 
@@ -310,29 +311,29 @@ def process_model(base_model_name, lora_model_name, repo_name, quant_methods, hf
          )
          log.info("Upload completed.")
 
-         # remove model for space limit
+         if len(quant_methods) > 0:
+             quantize(output_dir, repo_name, "Q8_0")
+         # remove model for space limit
          shutil.rmtree(model_path)
-
+         log.info("Removed model from local")
+
          os.makedirs(os.path.join(output_dir, "quantized"), exist_ok=True)
          if len(quant_methods) > 0:
-             quantize(output_dir, repo_name, "fp16") # for
-             # 量化模型
              for quant_method in quant_methods:
                  quantize(output_dir, repo_name, quant_method=quant_method)
              os.system(f"mv ./{repo_name}-f16.gguf ./{output_dir}/quantized/")
 
-         api.upload_folder(
-             folder_path=os.path.join(output_dir, "quantized"),
-             path_in_repo="quantized",
-             repo_id=repo_name,
-             repo_type="model",
-             num_workers=os.cpu_count() if os.cpu_count() > 4 else 4,
-             print_report_every=10,
-         )
+             api.upload_large_folder(
+                 folder_path=os.path.join(output_dir, "quantized"),
+                 repo_id=repo_name,
+                 repo_type="model",
+                 num_workers=os.cpu_count() if os.cpu_count() > 4 else 4,
+                 print_report_every=10,
+             )
 
-         # rm -rf model_path
-         shutil.rmtree(model_path)
-         log.info("Removed model from local")
+             # rm -rf model_path
+             shutil.rmtree(model_path)
+             log.info("Removed model from local")
 
      except Exception as e:
          error_message = f"Error during processing: {e}"
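For reference, a minimal standalone sketch of the upload call this hunk switches to; the repo ID and folder are hypothetical. upload_large_folder is the resumable, multi-worker counterpart of upload_folder and does not accept a path_in_repo argument, which is consistent with that keyword being dropped here:

from huggingface_hub import HfApi

api = HfApi()  # assumes login() was already called with a write token
api.upload_large_folder(
    folder_path="./model_output/quantized",   # hypothetical local folder
    repo_id="user/merged-model-gguf",         # hypothetical target repo
    repo_type="model",
    num_workers=4,             # parallel hashing/upload workers
    print_report_every=10,     # progress report interval
)

Because path_in_repo is unavailable, the folder's contents land at the repo root rather than under a quantized/ prefix as with the previous upload_folder call.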
 