"""Gradio app that turns product image URLs into Fashion-CLIP embeddings.

The app accepts a JSON document listing products (``product_id`` + ``url``),
downloads each image, encodes the images in batches with
``FashionCLIPEncoder``, L2-normalizes the resulting vectors and, when
requested, upserts them into a Pinecone index.
"""

import io
import json
import os
from typing import Optional

import gradio as gr
import numpy as np
import requests
from dotenv import load_dotenv
from PIL import Image, UnidentifiedImageError
from pinecone import Pinecone

from encoder import FashionCLIPEncoder

# Load environment variables
load_dotenv()

# Constants
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_NAMESPACE = os.getenv("PINECONE_NAMESPACE")
REQUESTS_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Number of images handed to the encoder in one call.
BATCH_SIZE = 30
# Number of vectors sent per upsert call; Pinecone limits the size of a
# single upsert request, so large uploads are split into chunks.
UPSERT_BATCH_SIZE = 100

# Initialize services
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)
encoder = FashionCLIPEncoder()


def _error_result(product, message):
    """Build the error record for *product* without assuming its shape.

    ``product`` comes straight from user-supplied JSON, so it may be missing
    keys or not be a dict at all; missing fields are reported as ``None``.
    """
    is_mapping = isinstance(product, dict)
    return {
        "product_id": product.get("product_id") if is_mapping else None,
        "image_url": product.get("url") if is_mapping else None,
        "error": message,
    }


def download_image_as_pil(url: str, timeout: int = 10) -> Optional[Image.Image]:
    """Download *url* and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The decoded image converted to RGB, or ``None`` when the request
        fails, the response is not a 200 with an image content type, or the
        payload cannot be decoded. Failures are printed, never raised.
    """
    try:
        # stream=True lets us inspect status and headers before pulling the
        # body; the context manager guarantees the connection is released.
        with requests.get(
            url, stream=True, headers=REQUESTS_HEADERS, timeout=timeout
        ) as response:
            content_type = response.headers.get('Content-Type', '')
            if response.status_code != 200 or 'image' not in content_type:
                print(
                    f"Skipping URL {url}: status {response.status_code}, "
                    f"content type {content_type!r}"
                )
                return None
            # response.content applies content-encoding decoding (gzip,
            # deflate), which reading response.raw directly does not.
            payload = response.content
        return Image.open(io.BytesIO(payload)).convert("RGB")
    except UnidentifiedImageError:
        print(f"Unidentified image file from URL: {url}")
    except Exception as e:
        print(f"Error downloading image: {e}")
    return None


def process_batch(batch_products, batch_images, results):
    """Encode one batch of images and append one record per product.

    Args:
        batch_products: Product dicts (``product_id`` and ``url`` keys), in
            the same order as ``batch_images``.
        batch_images: Images to pass to ``encoder.encode_images``.
        results: List that receives the output records; mutated in place.

    Every product yields exactly one record: a success record holding the
    L2-normalized embedding, or an error record. If encoding the batch fails,
    all products in the batch get an error record.
    """
    try:
        # Generate embeddings; materialize inside the try so that any failure
        # while producing them is attributed to the whole batch.
        embeddings = list(encoder.encode_images(batch_images))
    except Exception as e:
        results.extend(_error_result(product, str(e)) for product in batch_products)
        return

    for product, embedding in zip(batch_products, embeddings):
        # Handle each product on its own so a failure here can never add a
        # second, contradictory record for a product that already succeeded.
        try:
            norm = np.linalg.norm(embedding)
            # `not norm > 0` rejects both a zero norm and NaN, either of
            # which would otherwise yield an unusable vector.
            if not norm > 0:
                raise ValueError(
                    "Embedding has zero or non-finite norm; cannot normalize"
                )
            embedding_normalized = embedding / norm
            record = {
                "product_id": product["product_id"],
                "image_url": product["url"],
                "embedding": embedding_normalized.tolist(),
                "embedding_preview": embedding_normalized[:5].tolist(),
                "success": True
            }
        except Exception as e:
            record = _error_result(product, str(e))
        results.append(record)


def batch_process_images(json_input: str):
    """Generate embeddings for every product described in *json_input*.

    Args:
        json_input: JSON text of the form
            ``{"products": [{"product_id": ..., "url": ...}, ...],
            "upload_to_pinecone": bool}``.

    Returns:
        ``{"processing_results": [...]}`` with one record per product, plus a
        ``"pinecone_upload"`` entry when an upload was requested. If the
        upload fails, that entry is ``{"error": ...}`` and the processing
        results are still returned. On invalid input, ``{"error": ...}``.
        This function does not raise; it is used directly as a Gradio handler.
    """
    try:
        # Parse JSON input
        data = json.loads(json_input)
        if not isinstance(data, dict):
            return {"error": "JSON input must be an object with a 'products' list."}

        products = data.get("products", [])
        upload_to_pinecone = data.get("upload_to_pinecone", False)

        if not products:
            return {"error": "No products provided in JSON input."}
        if not isinstance(products, list):
            return {"error": "'products' must be a list of product objects."}

        results = []
        batch_products, batch_images = [], []

        for product in products:
            # Reject malformed entries individually instead of letting one
            # bad entry abort the whole request.
            if (
                not isinstance(product, dict)
                or "product_id" not in product
                or "url" not in product
            ):
                results.append(_error_result(
                    product, "Each product requires 'product_id' and 'url'"
                ))
                continue

            # Download image
            image = download_image_as_pil(product["url"])
            if image is None:
                results.append(_error_result(product, "Failed to download image"))
                continue

            batch_products.append(product)
            batch_images.append(image)

            # Process batch when reaching batch size
            if len(batch_images) == BATCH_SIZE:
                process_batch(batch_products, batch_images, results)
                batch_products, batch_images = [], []

        # Process remaining images in the last batch
        if batch_images:
            process_batch(batch_products, batch_images, results)

        response = {"processing_results": results}

        # Upload to Pinecone if requested
        if upload_to_pinecone:
            try:
                response["pinecone_upload"] = upload_vector_to_pinecone(results)
            except Exception as e:
                # Keep the computed embeddings even when the upload fails.
                response["pinecone_upload"] = {"error": str(e)}

        return response

    except json.JSONDecodeError:
        return {"error": "Invalid JSON format"}
    except Exception as e:
        return {"error": f"Unexpected error: {str(e)}"}


def upload_vector_to_pinecone(processed_results):
    """Upload embeddings to Pinecone.

    Args:
        processed_results: Records as produced by ``process_batch``; records
            that carry an ``error`` key or lack an ``embedding`` are skipped.

    Returns:
        ``{"uploaded_count": n}`` where ``n`` is the number of vectors sent.

    Raises:
        Whatever ``index.upsert`` raises; the caller decides how to report it.
    """
    vectors_to_upsert = [
        {
            # Pinecone record IDs must be strings; JSON input may carry numbers.
            'id': str(result['product_id']),
            'values': result['embedding'],
            'metadata': {
                'image_url': result['image_url']
            }
        }
        for result in processed_results
        if 'error' not in result and 'embedding' in result
    ]

    # Upsert in bounded chunks so the request size does not grow with the
    # number of products.
    for start in range(0, len(vectors_to_upsert), UPSERT_BATCH_SIZE):
        index.upsert(
            vectors=vectors_to_upsert[start:start + UPSERT_BATCH_SIZE],
            namespace=PINECONE_NAMESPACE
        )

    return {"uploaded_count": len(vectors_to_upsert)}


# Example JSON input
EXAMPLE_INPUT = {
    "products": [
        {
            "product_id": "1",
            "url": "https://cdn.shopify.com/s/files/1/0522/2239/4534/files/CT21355-22_1024x1024.webp"
        }
    ],
    "upload_to_pinecone": False
}

# Markdown shown below the interface. Kept at column 0 so Markdown does not
# render the text as an indented code block.
ARTICLE_MARKDOWN = """
### Input JSON Format:
```json
{
    "products": [
        {
            "product_id": "string",
            "url": "string"
        }
    ],
    "upload_to_pinecone": boolean
}
```

### Features:
- Batch processing of multiple images
- Custom product ID support
- Embedding generation using Fashion CLIP
- Optional Pinecone database integration
- Error handling and detailed results

Make sure to set up your environment variables in a .env file:
- PINECONE_API_KEY
- PINECONE_INDEX_NAME
- PINECONE_NAMESPACE
"""

# Gradio Interface
iface = gr.Interface(
    fn=batch_process_images,
    inputs=gr.Code(
        label="Input JSON",
        language="json",
        value=json.dumps(EXAMPLE_INPUT, indent=4)
    ),
    outputs=gr.JSON(label="Processing Results"),
    title="Fashion CLIP Embedding Generator",
    description="Provide JSON input with product IDs, URLs, and Pinecone upload preference to generate embeddings.",
    article=ARTICLE_MARKDOWN
)

# Launch Gradio App
if __name__ == "__main__":
    iface.launch()