"""Gradio viewer for random samples of the PhotoshopRequest daily-dump dataset."""

import io
import json
import os
import random
from datetime import datetime, timezone

import gradio as gr
import numpy as np
from datasets import load_dataset
from PIL import Image

# Hugging Face access token (needed if the dataset is gated); may be None.
access_token = os.environ.get("HUGGINGFACE_TOKEN")


class DatasetViewer:
    """Holds the dataset in memory and serves random samples for display."""

    def __init__(self):
        self.dataset = None
        self.dataset_size = 0
        self.last_refresh_time = None
        # Maximum (width, height) for displayed images.
        self.max_display_size = (800, 600)
        self.load_dataset()

    def resize_image(self, image):
        """Resize *image* to fit within max_display_size, keeping aspect ratio.

        Accepts a PIL Image, a numpy array, or raw encoded bytes; returns a
        numpy array suitable for gradio's Image component.
        """
        if isinstance(image, np.ndarray):
            # Convert numpy array to PIL Image
            image = Image.fromarray(image)
        elif isinstance(image, bytes):
            # Convert encoded bytes to PIL Image
            image = Image.open(io.BytesIO(image))

        # Scaling factor that fits the image inside the bounding box.
        width_ratio = self.max_display_size[0] / image.width
        height_ratio = self.max_display_size[1] / image.height
        scale_factor = min(width_ratio, height_ratio)

        # Only shrink oversized images; never upscale.
        if scale_factor < 1:
            new_width = int(image.width * scale_factor)
            new_height = int(image.height * scale_factor)
            image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        # Convert back to numpy array for gradio.
        return np.array(image)

    def load_dataset(self):
        """Load the complete dataset into memory (non-streaming)."""
        self.dataset = load_dataset(
            "taesiri/PhotoshopRequest-DailyDump-January-2025-RandomSample",
            split="train",
            token=access_token,
        )
        self.dataset_size = len(self.dataset)
        # Aware UTC timestamp: get_info() labels this value "UTC", so a naive
        # local datetime.now() would display the wrong time.
        self.last_refresh_time = datetime.now(timezone.utc)

    def get_next_samples(self, num_samples=5):
        """Return a flat tuple of (markdown, source_image, edited_image) per sample.

        Samples *num_samples* random posts (fewer if the dataset is smaller),
        so the tuple length is 3 * min(num_samples, dataset_size).
        """
        indices = random.sample(
            range(self.dataset_size), min(num_samples, self.dataset_size)
        )

        results = []
        for idx in indices:
            sample = self.dataset[idx]

            # Post information and permalink back to the subreddit.
            post_id = sample["post_id"]
            title = sample["title"]
            reddit_url = (
                f"https://www.reddit.com/r/PhotoshopRequest/comments/{post_id}"
            )

            # Extract selftext if available; missing keys, null json_data, or
            # malformed JSON all fall back to an empty description.
            selftext = ""
            try:
                selftext = json.loads(sample["json_data"])["post"]["selftext"]
            except (KeyError, TypeError, ValueError):
                print(f"No selftext found for post {post_id}")

            markdown_text = f"# {title}\n\n{selftext}\n\n[View post on r/PhotoshopRequest]({reddit_url})"

            # Append the triple (post_info, source_image, edited_image),
            # resizing images before adding them to the results.
            results.append(markdown_text)
            results.append(self.resize_image(sample["source_image"]))
            results.append(self.resize_image(sample["edited_image"]))

        return tuple(results)

    def get_info(self):
        """Return dataset information as a markdown string."""
        return f"""
Dataset Size: {self.dataset_size} items
Last Refreshed: {self.last_refresh_time.strftime('%Y-%m-%d %H:%M:%S UTC')}
"""


def create_interface():
    """Build the Gradio Blocks UI wired to a DatasetViewer instance."""
    viewer = DatasetViewer()

    with gr.Blocks() as demo:
        gr.Markdown("# PhotoshopRequest Dataset Viewer")
        gr.Markdown("""
This is a viewer for the PhotoshopRequest dataset. Each sample shows a Photoshop editing request post.
Click the 'Show New Samples' button to see **5 random samples** from the dataset.

**Layout**: For each sample, you'll see:
1. The post title and description
2. The source image (left) and edited result (right)
""")

        # Create 5 sets of outputs: one markdown plus a source/edited image row
        # per sample, matching the flat tuple shape of get_next_samples().
        outputs = []
        for i in range(5):
            post_info = gr.Markdown()
            outputs.append(post_info)
            with gr.Row():
                source = gr.Image(label=f"Source Image {i+1}")
                edited = gr.Image(label=f"Edited Image {i+1}")
            outputs.extend([source, edited])

        sample_button = gr.Button("Show New Samples")
        info_md = gr.Markdown()

        # Set up event handlers: fetch samples, then refresh the info panel.
        sample_button.click(
            viewer.get_next_samples,
            outputs=outputs,
        ).then(
            viewer.get_info,
            outputs=[info_md],
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()