Upload app.py
app.py CHANGED
@@ -215,60 +215,58 @@ def stream_chat(input_images: List[Image.Image], caption_type: str, caption_tone
     for i in range(0, len(input_images), batch_size):
         batch = input_images[i:i+batch_size]
         # Preprocess image
-        for input_image in batch:
-            image = input_image.resize((384, 384), Image.LANCZOS)
-            pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
-            pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
-            # … remainder of the old loop body (old lines 224-270) not rendered in the diff view …
-            all_captions.append(caption)
+        for input_image in input_images:
+            try:
+                image = input_image.resize((384, 384), Image.LANCZOS)
+                pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
+                pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
+                pixel_values = pixel_values.to(device)
+            except ValueError as e:
+                print(f"Error processing image: {e}")
+                print("Skipping this image and continuing...")
+                continue
+
+            # Embed image
+            with torch.amp.autocast_mode.autocast(device, enabled=True):
+                vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
+                image_features = vision_outputs.hidden_states
+                embedded_images = image_adapter(image_features).to(device)
+
+            # Tokenize the prompt
+            prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
+
+            # Embed prompt
+            prompt_embeds = text_model.model.embed_tokens(prompt.to(device))
+            assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
+            embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
+            eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
+
+            # Construct prompts
+            inputs_embeds = torch.cat([
+                embedded_bos.expand(embedded_images.shape[0], -1, -1),
+                embedded_images.to(dtype=embedded_bos.dtype),
+                prompt_embeds.expand(embedded_images.shape[0], -1, -1),
+                eot_embed.expand(embedded_images.shape[0], -1, -1),
+            ], dim=1)
+
+            input_ids = torch.cat([
+                torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
+                torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
+                prompt,
+                torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
+            ], dim=1).to(device)
+            attention_mask = torch.ones_like(input_ids)
+
+            generate_ids = text_model.generate(input_ids=input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, do_sample=True,
+                                               suppress_tokens=None, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature)
+
+            # Trim off the prompt
+            generate_ids = generate_ids[:, input_ids.shape[1]:]
+            if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
+                generate_ids = generate_ids[:, :-1]
+
+            caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
+            all_captions.append(caption.strip())
 
         if pbar:
             pbar.update(len(batch))