Filip committed
Commit ed56d3f · 1 Parent(s): fe01251
Files changed (2)
  1. app.py +64 -33
  2. requirements.txt +4 -3
app.py CHANGED
@@ -3,70 +3,100 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor, Tex
 import torch
 import gc
 import os
+from accelerate import init_empty_weights
+from accelerate.utils import load_checkpoint_in_model
+import psutil
 
 # Enable better CPU performance
 torch.set_num_threads(4)
 device = "cpu"
 
+def get_free_memory():
+    """Get available memory in GB"""
+    return psutil.virtual_memory().available / (1024 * 1024 * 1024)
+
+def load_model_in_chunks(model_path, chunk_size_gb=2):
+    """Load model in chunks to manage memory"""
+    config = AutoModelForCausalLM.from_pretrained(model_path, return_dict=False).config
+
+    with init_empty_weights():
+        empty_model = AutoModelForCausalLM.from_config(config)
+
+    # Get checkpoint files
+    index_path = os.path.join(model_path, "model.safetensors.index.json")
+    if os.path.exists(index_path):
+        checkpoint_files = [
+            os.path.join(model_path, f"model-{i:05d}-of-{5:05d}.safetensors")
+            for i in range(1, 6)
+        ]
+    else:
+        checkpoint_files = [os.path.join(model_path, "pytorch_model.bin")]
+
+    # Load each chunk
+    for checkpoint in checkpoint_files:
+        if get_free_memory() < 2:  # If less than 2GB free
+            gc.collect()
+            torch.cuda.empty_cache()
+
+        load_checkpoint_in_model(empty_model, checkpoint)
+        gc.collect()
+
+    return empty_model
+
 def load_model():
     model_name = "forestav/unsloth_vision_radiography_finetune"
-    base_model_name = "unsloth/Llama-3.2-11B-Vision-Instruct"  # Correct base model
+    base_model_name = "unsloth/Llama-3.2-11B-Vision-Instruct"
 
     print("Loading tokenizer and processor...")
-    # Load tokenizer from base model
     tokenizer = AutoTokenizer.from_pretrained(
         base_model_name,
-        trust_remote_code=True
+        trust_remote_code=True,
+        cache_dir="model_cache"
     )
 
-    # Load processor from base model
     processor = AutoProcessor.from_pretrained(
         base_model_name,
-        trust_remote_code=True
+        trust_remote_code=True,
+        cache_dir="model_cache"
     )
 
-    print("Loading model...")
-    # Load model with CPU optimizations
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        device_map="cpu",
-        torch_dtype=torch.float32,
-        low_cpu_mem_usage=True,
-        offload_folder="offload",
-        offload_state_dict=True,
-        trust_remote_code=True
-    )
+    print("Loading model in chunks...")
+    model = load_model_in_chunks(model_name)
 
-    print("Quantizing model...")
+    print("Optimizing model for CPU...")
+    # Convert to float32 and quantize
+    model = model.to(torch.float32)
     model = torch.quantization.quantize_dynamic(
         model,
-        {torch.nn.Linear},
+        {torch.nn.Linear, torch.nn.Conv2d},
         dtype=torch.qint8
     )
 
     return model, tokenizer, processor
 
-# Create offload directory if it doesn't exist
+# Create cache directories
+os.makedirs("model_cache", exist_ok=True)
 os.makedirs("offload", exist_ok=True)
 
+print(f"Available memory before loading: {get_free_memory():.2f} GB")
+
 # Initialize model and tokenizer globally
 print("Starting model initialization...")
 try:
     model, tokenizer, processor = load_model()
-    print("Model loaded and quantized successfully!")
+    print("Model loaded successfully!")
+    print(f"Available memory after loading: {get_free_memory():.2f} GB")
 except Exception as e:
     print(f"Error loading model: {str(e)}")
     raise
 
 def analyze_image(image, instruction):
     try:
-        # Clear memory
         gc.collect()
 
         if instruction.strip() == "":
             instruction = "You are an expert radiographer. Describe accurately what you see in this image."
 
-        # Prepare the messages
         messages = [
             {"role": "user", "content": [
                 {"type": "image"},
@@ -74,14 +104,17 @@ def analyze_image(image, instruction):
             ]}
         ]
 
-        # Process the image and text
+        # Process with memory checks
+        if get_free_memory() < 2:
+            gc.collect()
+
         inputs = processor(
             images=image,
             text=tokenizer.apply_chat_template(messages, add_generation_prompt=True),
             return_tensors="pt"
         )
 
-        # Generate with conservative settings for CPU
+        # Generate with minimal memory usage
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
@@ -90,14 +123,13 @@ def analyze_image(image, instruction):
                 min_p=0.1,
                 use_cache=True,
                 pad_token_id=tokenizer.eos_token_id,
-                num_beams=1
+                num_beams=1,
+                do_sample=False  # Disable sampling to save memory
             )
 
-        # Decode the response
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-        # Clean up
-        del outputs
+        del outputs, inputs
         gc.collect()
 
         return response
@@ -116,7 +148,7 @@ with gr.Blocks() as demo:
             image_input = gr.Image(
                 type="pil",
                 label="Upload Medical Image",
-                max_pixels=1500000  # Limit image size
+                max_pixels=1000000  # Reduced max image size
             )
             instruction_input = gr.Textbox(
                 label="Custom Instruction (optional)",
@@ -128,7 +160,6 @@ with gr.Blocks() as demo:
         with gr.Column():
             output_text = gr.Textbox(label="Analysis Result", lines=10)
 
-    # Handle the submission
     submit_btn.click(
         fn=analyze_image,
         inputs=[image_input, instruction_input],
@@ -137,11 +168,11 @@ with gr.Blocks() as demo:
 
     gr.Markdown("""
    ### Notes:
-    - The model runs on CPU and may take several moments to process each image
-    - For best results, upload images smaller than 1.5MP
+    - The model runs on CPU and may take several minutes to process each image
+    - For best results, upload images smaller than 1MP
+    - Initial loading may take some time
    - Please be patient during processing
    """)
 
-# Launch the app
 if __name__ == "__main__":
     demo.launch()
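The CPU optimization this commit keeps (and extends to `Conv2d`) is PyTorch dynamic quantization. A minimal standalone sketch of the same `torch.quantization.quantize_dynamic` call pattern used in app.py; the toy model below is purely illustrative and not part of the commit:

```python
import torch
import torch.nn as nn

# Toy stand-in for the fine-tuned vision model (illustrative only).
toy = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 4))

# Same call pattern as app.py: eligible Linear layers are swapped for
# dynamically quantized (int8-weight) equivalents; activations stay float.
qtoy = torch.quantization.quantize_dynamic(toy, {nn.Linear}, dtype=torch.qint8)

print(type(qtoy[0]).__name__)              # a dynamically quantized Linear module
with torch.no_grad():
    print(qtoy(torch.randn(1, 16)).shape)  # forward pass still works: torch.Size([1, 4])
```

Note that PyTorch dynamic quantization mainly targets `Linear` and recurrent layers; `Conv2d` modules listed in the set are generally left unquantized rather than raising an error.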
requirements.txt CHANGED
@@ -1,5 +1,6 @@
 gradio
 torch
-transformers
-bitsandbytes
-accelerate
+transformers>=4.36.0
+accelerate
+psutil
+safetensors
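`psutil` and `safetensors` are added to support the memory checks and sharded checkpoint loading introduced in app.py, while `bitsandbytes` is dropped. As an aside (not part of the commit), the shard filenames that `load_model_in_chunks` hard-codes could also be read from `model.safetensors.index.json`, whose standard `weight_map` entry maps parameter names to shard files; a minimal sketch, with an illustrative helper name:

```python
import json
import os

def shard_files_from_index(model_path):
    """Return the unique shard filenames referenced by the safetensors index."""
    with open(os.path.join(model_path, "model.safetensors.index.json")) as f:
        index = json.load(f)
    return sorted(set(index["weight_map"].values()))
```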