Steven10429 committed on
Commit cc859c9 · verified · 1 Parent(s): 9b649b8

Update app.py

Files changed (1)
  1. app.py +25 -16
app.py CHANGED
@@ -8,6 +8,8 @@ from huggingface_hub import login, create_repo, HfApi
 import gradio as gr
 import queue
 import time
+import shutil
+

 # global log
 log_queue = queue.Queue()
@@ -46,9 +48,7 @@ def check_system_resources(model_name):
     log("Checking system resources...")
     system_memory = psutil.virtual_memory()
     total_memory_gb = system_memory.total / (1024 ** 3)
-    available_memory_gb = system_memory.available / (1024 ** 3)
     log(f"Total system memory: {total_memory_gb:.1f}GB")
-    log(f"Available memory: {available_memory_gb:.1f}GB")

     model_size_gb = get_model_size_in_gb(model_name)
     required_memory_gb = model_size_gb * 2.5  # reserve extra memory
@@ -66,9 +66,9 @@ def check_system_resources(model_name):
     else:
         log("❌ No GPU detected.")

-    if available_memory_gb >= required_memory_gb:
+    if total_memory_gb >= required_memory_gb:
         log("✅ Sufficient CPU memory available; using CPU.")
-        return "cpu", available_memory_gb
+        return "cpu", total_memory_gb
     else:
         raise MemoryError(f"❌ Insufficient system memory (requires {required_memory_gb:.1f}GB, available {available_memory_gb:.1f}GB).")

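Note: with this hunk the CPU fallback is gated on total rather than available system memory. A minimal standalone sketch of the check as it now behaves (check_cpu_memory is a hypothetical helper name; the 2.5× headroom factor comes from the code above):

import psutil

def check_cpu_memory(model_size_gb, headroom=2.5):
    # Same comparison as above: total RAM against an estimated requirement.
    total_memory_gb = psutil.virtual_memory().total / (1024 ** 3)
    required_memory_gb = model_size_gb * headroom
    if total_memory_gb >= required_memory_gb:
        return "cpu", total_memory_gb
    raise MemoryError(f"Insufficient system memory (requires {required_memory_gb:.1f}GB, total {total_memory_gb:.1f}GB).")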
@@ -117,10 +117,9 @@ def download_and_merge_model(base_model_name, lora_model_name, output_dir, devic
     model = AutoModelForCausalLM.from_pretrained(base_model_name, low_cpu_mem_usage=True)
     log("Loading adapter tokenizer...")
     adapter_tokenizer = AutoTokenizer.from_pretrained(lora_model_name)
-    if adapter_tokenizer.pad_token != model.config.pad_token:
-        log("Resizing token embeddings...")
-        added_tokens_decoder = adapter_tokenizer.added_tokens_decoder
-        model.resize_token_embeddings(adapter_tokenizer.vocab_size + len(added_tokens_decoder))
+    log("Resizing token embeddings...")
+    added_tokens_decoder = adapter_tokenizer.added_tokens_decoder
+    model.resize_token_embeddings(adapter_tokenizer.vocab_size + len(added_tokens_decoder))
     log("Loading LoRA adapter...")
     peft_model = PeftModel.from_pretrained(model, lora_model_name, low_cpu_mem_usage=True)
     log("Merging and unloading model...")
@@ -157,7 +156,9 @@ def clone_llamacpp_and_download_build():
     log("llama.cpp build completed.")
     # return to the original directory
     os.chdir(os.path.dirname(llamacpp_dir))
-
+
+def remove_illegal_chars_in_path(text):
+    return text.replace(".", "_").replace(":", "_").replace("/", "_")

 @timeit
 def quantize(model_path, repo_id, quant_method=None):
@@ -180,12 +181,13 @@ def quantize(model_path, repo_id, quant_method=None):
     os.makedirs(model_output_dir, exist_ok=True)

     # intermediate files are kept in the model_output directory
-    temp_gguf_path = os.path.join(model_output_dir, f"{repo_id}-f16.gguf")
+    guff_16 = os.path.join(model_output_dir, f"{repo_id}-f16.gguf")

-    if not os.path.exists(temp_gguf_path):
+    if not os.path.exists(guff_16):
         log(f"正在将模型转换为GGML格式")
         convert_script = os.path.join(llamacpp_dir, "convert_hf_to_gguf.py")
-        convert_cmd = f"python {convert_script} {model_path} --outfile {temp_gguf_path}"
+        convert_cmd = f"python {convert_script} {model_path} --outfile {guff_16}"
+        print(f"syscall:[{convert_cmd}]")
         os.system(convert_cmd)
     else:
         log(f"GGML中间文件已存在,跳过转换")
@@ -194,7 +196,8 @@ def quantize(model_path, repo_id, quant_method=None):
     final_path = os.path.join(model_output_dir, f"{repo_id}-{quant_method}.gguf")
     log(f"正在进行{quant_method}量化")
     quantize_bin = os.path.join(llamacpp_dir, "build", "bin", "llama-quantize")
-    quant_cmd = f"{quantize_bin} {temp_gguf_path} {final_path} {quant_method}"
+    quant_cmd = f"{quantize_bin} {guff_16} {final_path} {quant_method}"
+    print(f"syscall:[{quant_cmd}]")

     if not os.path.exists(final_path):
         os.system(quant_cmd)
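Note: the two commands assembled here form the usual llama.cpp flow: convert the merged Hugging Face checkpoint to an f16 GGUF, then quantize that file. A rough sketch of the same two steps using subprocess.run instead of os.system (all paths and the Q4_K_M method are placeholder choices):

import subprocess
from pathlib import Path

llamacpp_dir = Path("llama.cpp")        # placeholder clone location
model_path = Path("merged_model")       # placeholder merged HF checkpoint directory
guff_16 = Path("model-f16.gguf")        # intermediate full-precision GGUF
final_path = Path("model-Q4_K_M.gguf")  # quantized output

# Step 1: HF checkpoint -> f16 GGUF, via llama.cpp's conversion script.
subprocess.run(
    ["python", str(llamacpp_dir / "convert_hf_to_gguf.py"), str(model_path), "--outfile", str(guff_16)],
    check=True,
)

# Step 2: f16 GGUF -> quantized GGUF, via the llama-quantize binary.
subprocess.run(
    [str(llamacpp_dir / "build" / "bin" / "llama-quantize"), str(guff_16), str(final_path), "Q4_K_M"],
    check=True,
)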
@@ -226,7 +229,8 @@ def process_model(base_model_name, lora_model_name, repo_name, quant_methods, hf
         adapter_config = PeftConfig.from_pretrained(lora_model_name)
         base_model_name = adapter_config.base_model_name_or_path
         if repo_name.strip().lower() == "auto":
-            repo_name = f"{username}/{base_model_name.split('/')[-1]}_{lora_model_name.split('/')[-1]}"
+            repo_name = f"{base_model_name.split('/')[-1]}_{lora_model_name.split('/')[-1]}"
+            repo_name = remove_illegal_chars_in_path(repo_name)

         device = setup_environment(base_model_name)
         repo_name = create_hf_repo(repo_name)
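Note: an "auto" repo name is now built from the two model ids and then sanitized by the new remove_illegal_chars_in_path helper, instead of being prefixed with the username. A quick illustration with made-up model ids:

base_model_name = "Qwen/Qwen2.5-7B-Instruct"   # made-up example
lora_model_name = "someuser/my-lora"           # made-up example

repo_name = f"{base_model_name.split('/')[-1]}_{lora_model_name.split('/')[-1]}"
# -> "Qwen2.5-7B-Instruct_my-lora"
repo_name = remove_illegal_chars_in_path(repo_name)
# -> "Qwen2_5-7B-Instruct_my-lora"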
@@ -238,15 +242,20 @@ def process_model(base_model_name, lora_model_name, repo_name, quant_methods, hf

         # quantize the model
         for quant_method in quant_methods:
-            quantize(f"{output_dir}/model.gguf", repo_name, bits=8, quant_method=quant_method)
+            quantize(output_dir, repo_name, quant_method=quant_method)

         # upload the merged model and the quantized models
         api.upload_large_folder(
             folder_path=model_path,
             repo_id=repo_name,
             repo_type="model",
-            num_workers=4,
+            num_workers=os.cpu_count() if os.cpu_count() > 4 else 4,
+            print_report_every=10,
         )
+
+        # rm -rf model_path
+        shutil.rmtree(model_path)
+
         return "\n".join(current_logs)
     except Exception as e:
         error_message = f"Error during processing: {e}"
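Note: the upload now scales the worker count with the machine's CPU count, prints its progress report every 10 seconds, and removes the local folder afterwards to free disk space. A minimal sketch of that tail end of the pipeline (token, repo id, and local path are placeholders; num_workers and print_report_every are the huggingface_hub upload_large_folder parameters used in the hunk):

import os
import shutil
from huggingface_hub import HfApi

api = HfApi(token="hf_xxx")            # placeholder token
model_path = "output/merged-model"     # placeholder local folder with the merged weights
repo_name = "username/merged-model"    # placeholder repo id

api.upload_large_folder(
    folder_path=model_path,
    repo_id=repo_name,
    repo_type="model",
    num_workers=os.cpu_count() if os.cpu_count() > 4 else 4,
    print_report_every=10,
)

# Free local disk space once everything is on the Hub.
shutil.rmtree(model_path)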
 