Update app.py
app.py
CHANGED
@@ -8,6 +8,8 @@ from huggingface_hub import login, create_repo, HfApi
 import gradio as gr
 import queue
 import time
+import shutil
+

 # 全局日志
 log_queue = queue.Queue()
@@ -46,9 +48,7 @@ def check_system_resources(model_name):
     log("Checking system resources...")
     system_memory = psutil.virtual_memory()
     total_memory_gb = system_memory.total / (1024 ** 3)
-    available_memory_gb = system_memory.available / (1024 ** 3)
     log(f"Total system memory: {total_memory_gb:.1f}GB")
-    log(f"Available memory: {available_memory_gb:.1f}GB")

     model_size_gb = get_model_size_in_gb(model_name)
     required_memory_gb = model_size_gb * 2.5 # 预留额外内存
@@ -66,9 +66,9 @@ def check_system_resources(model_name):
     else:
         log("❌ No GPU detected.")

-        if available_memory_gb >= required_memory_gb:
+        if total_memory_gb >= required_memory_gb:
             log("✅ Sufficient CPU memory available; using CPU.")
-            return "cpu", available_memory_gb
+            return "cpu", total_memory_gb
         else:
             raise MemoryError(f"❌ Insufficient system memory (requires {required_memory_gb:.1f}GB, available {available_memory_gb:.1f}GB).")

@@ -117,10 +117,9 @@ def download_and_merge_model(base_model_name, lora_model_name, output_dir, devic
     model = AutoModelForCausalLM.from_pretrained(base_model_name, low_cpu_mem_usage=True)
     log("Loading adapter tokenizer...")
     adapter_tokenizer = AutoTokenizer.from_pretrained(lora_model_name)
-
-
-
-    model.resize_token_embeddings(adapter_tokenizer.vocab_size + len(added_tokens_decoder))
+    log("Resizing token embeddings...")
+    added_tokens_decoder = adapter_tokenizer.added_tokens_decoder
+    model.resize_token_embeddings(adapter_tokenizer.vocab_size + len(added_tokens_decoder))
     log("Loading LoRA adapter...")
     peft_model = PeftModel.from_pretrained(model, lora_model_name, low_cpu_mem_usage=True)
     log("Merging and unloading model...")
@@ -157,7 +156,9 @@ def clone_llamacpp_and_download_build():
     log("llama.cpp build completed.")
     # 返回到原始目录
     os.chdir(os.path.dirname(llamacpp_dir))
-
+
+def remove_illegal_chars_in_path(text):
+    return text.replace(".", "_").replace(":", "_").replace("/", "_")

 @timeit
 def quantize(model_path, repo_id, quant_method=None):
@@ -180,12 +181,13 @@ def quantize(model_path, repo_id, quant_method=None):
     os.makedirs(model_output_dir, exist_ok=True)

     # 中间文件保存在 model_output 目录下
-
+    guff_16 = os.path.join(model_output_dir, f"{repo_id}-f16.gguf")

-    if not os.path.exists(
+    if not os.path.exists(guff_16):
         log(f"正在将模型转换为GGML格式")
         convert_script = os.path.join(llamacpp_dir, "convert_hf_to_gguf.py")
-        convert_cmd = f"python {convert_script} {model_path} --outfile {
+        convert_cmd = f"python {convert_script} {model_path} --outfile {guff_16}"
+        print(f"syscall:[{convert_cmd}]")
         os.system(convert_cmd)
     else:
         log(f"GGML中间文件已存在,跳过转换")
@@ -194,7 +196,8 @@ def quantize(model_path, repo_id, quant_method=None):
     final_path = os.path.join(model_output_dir, f"{repo_id}-{quant_method}.gguf")
     log(f"正在进行{quant_method}量化")
     quantize_bin = os.path.join(llamacpp_dir, "build", "bin", "llama-quantize")
-    quant_cmd = f"{
+    quant_cmd = f"{quantize_bin} {guff_16} {final_path} {quant_method}"
+    print(f"syscall:[{quant_cmd}]")

     if not os.path.exists(final_path):
         os.system(quant_cmd)
@@ -226,7 +229,8 @@ def process_model(base_model_name, lora_model_name, repo_name, quant_methods, hf
         adapter_config = PeftConfig.from_pretrained(lora_model_name)
         base_model_name = adapter_config.base_model_name_or_path
         if repo_name.strip().lower() == "auto":
-            repo_name = f"{
+            repo_name = f"{base_model_name.split('/')[-1]}_{lora_model_name.split('/')[-1]}"
+            repo_name = remove_illegal_chars_in_path(repo_name)

         device = setup_environment(base_model_name)
         repo_name = create_hf_repo(repo_name)
@@ -238,15 +242,20 @@ def process_model(base_model_name, lora_model_name, repo_name, quant_methods, hf

         # 量化模型
         for quant_method in quant_methods:
-            quantize(
+            quantize(output_dir, repo_name, quant_method=quant_method)

         # 上传合并后的模型和量化模型
         api.upload_large_folder(
             folder_path=model_path,
             repo_id=repo_name,
             repo_type="model",
-            num_workers=4,
+            num_workers=os.cpu_count() if os.cpu_count() > 4 else 4,
+            print_report_every=10,
         )
+
+        # rm -rf model_path
+        shutil.rmtree(model_path)
+
         return "\n".join(current_logs)
     except Exception as e:
         error_message = f"Error during processing: {e}"
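
The resize_token_embeddings change in download_and_merge_model grows the base model's embedding matrix to cover the tokens that the adapter's tokenizer added on top of the base vocabulary, before the LoRA weights are attached and merged. A minimal standalone sketch of that sequence (the model and adapter names below are placeholders, not taken from this commit):

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_name = "org/base-model"    # placeholder
lora_model_name = "org/lora-adapter"  # placeholder

model = AutoModelForCausalLM.from_pretrained(base_model_name, low_cpu_mem_usage=True)
adapter_tokenizer = AutoTokenizer.from_pretrained(lora_model_name)

# Grow the embedding table so ids introduced by the adapter's added tokens are valid.
added_tokens_decoder = adapter_tokenizer.added_tokens_decoder
model.resize_token_embeddings(adapter_tokenizer.vocab_size + len(added_tokens_decoder))

# Attach the LoRA weights, then fold them back into the base weights.
peft_model = PeftModel.from_pretrained(model, lora_model_name, low_cpu_mem_usage=True)
merged_model = peft_model.merge_and_unload()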
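
The updated quantize() reduces to two shell commands run against a local llama.cpp checkout: convert_hf_to_gguf.py writes an f16 GGUF intermediate, and the llama-quantize binary writes the final quantized file, with each step skipped when its output already exists. A rough equivalent using subprocess instead of os.system (the llamacpp_dir and output locations here are assumptions, not taken from this commit):

import os
import subprocess

def quantize_to_gguf(model_path, repo_id, quant_method,
                     llamacpp_dir="llama.cpp", model_output_dir="model_output"):
    """Sketch of the convert-then-quantize flow; paths are illustrative."""
    os.makedirs(model_output_dir, exist_ok=True)
    gguf_f16 = os.path.join(model_output_dir, f"{repo_id}-f16.gguf")
    final_path = os.path.join(model_output_dir, f"{repo_id}-{quant_method}.gguf")

    # Step 1: convert the merged HF checkpoint to an f16 GGUF intermediate.
    if not os.path.exists(gguf_f16):
        convert_script = os.path.join(llamacpp_dir, "convert_hf_to_gguf.py")
        subprocess.run(["python", convert_script, model_path, "--outfile", gguf_f16], check=True)

    # Step 2: quantize the f16 GGUF with the llama-quantize binary built from llama.cpp.
    if not os.path.exists(final_path):
        quantize_bin = os.path.join(llamacpp_dir, "build", "bin", "llama-quantize")
        subprocess.run([quantize_bin, gguf_f16, final_path, quant_method], check=True)

    return final_path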