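"""Streamlit demo: retrieve movie scenes from natural-language descriptions.

The app encodes a text query with a SentenceTransformer and ranks
pre-computed (here, simulated) visual, scene, and object features per clip
by weighted cosine similarity.
"""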
import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class VideoRetrieval:
    def __init__(self, use_dummy_data=True):
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
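        # The encoder loads once per session here (the app keeps this object
        # in st.session_state); st.cache_resource would be an alternative for
        # sharing the model across sessions.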
        if use_dummy_data:
            self.create_dummy_data()
        else:
            self.load_data()

    def create_dummy_data(self):
        """Create dummy features and metadata for demonstration"""
        # Create dummy features
        n_clips = 20
        feature_dim = 384  # matching the dimension of all-MiniLM-L6-v2
        self.features = {
            'visual_features': np.random.randn(n_clips, feature_dim),
            'scene_features': np.random.randn(n_clips, feature_dim),
            'object_features': np.random.randn(n_clips, feature_dim)
        }
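        # Because these vectors are random, similarity scores against a real
        # query embedding carry no semantic meaning; rankings in demo mode
        # are effectively arbitrary.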
        # Create dummy metadata
        movie_titles = [
            "The Matrix", "Inception", "The Dark Knight", "Pulp Fiction",
            "The Shawshank Redemption", "Forrest Gump", "The Godfather",
            "Fight Club", "Interstellar", "The Silence of the Lambs"
        ]
        descriptions = [
            "A dramatic confrontation in a dark room where the truth is revealed",
            "A high-stakes chase through a crowded city street",
            "An emotional reunion between long-lost friends",
            "A tense negotiation that determines the fate of many",
            "A quiet moment of reflection before a life-changing decision"
        ]
        # Sample YouTube clips (famous movie scenes)
        youtube_clips = [
            "https://www.youtube.com/watch?v=kcsNbQRU5TI",  # Matrix - Red Pill Blue Pill
            "https://www.youtube.com/watch?v=YoHD9XEInc0",  # Inception - Hallway Fight
            "https://www.youtube.com/watch?v=ZWCAf-xLV2k",  # Dark Knight - Interrogation
            "https://www.youtube.com/watch?v=Jomr9SAjcyw",  # Pulp Fiction - Restaurant
            "https://www.youtube.com/watch?v=SQ7_5MMbPYs",  # Shawshank - Hope Speech
        ]
        data = []
        for i in range(n_clips):
            data.append({
                'clip_id': f'clip_{i}',
                'movie_title': movie_titles[i % len(movie_titles)],
                'description': descriptions[i % len(descriptions)],
                'timestamp': f'{(i*5):02d}:00 - {(i*5+3):02d}:00',
                'duration': '3:00',
                'youtube_url': youtube_clips[i % len(youtube_clips)]
            })
        self.clips_df = pd.DataFrame(data)

    def load_data(self):
        """Load actual pre-computed features and metadata"""
        try:
            self.features = {
                'visual_features': np.load('path_to_visual_features.npy'),
                'scene_features': np.load('path_to_scene_features.npy'),
                'object_features': np.load('path_to_object_features.npy')
            }
            self.clips_df = pd.read_csv('clips_metadata.csv')
        except FileNotFoundError as e:
            st.error(f"Error loading data: {e}. Falling back to dummy data.")
            self.create_dummy_data()

    def encode_query(self, query_text):
        """Encode the text query into embeddings"""
        return self.text_model.encode(query_text)

    def compute_similarity(self, query_embedding, feature_type='visual_features'):
        """Compute similarity between query and video features"""
        similarities = cosine_similarity(
            query_embedding.reshape(1, -1),
            self.features[feature_type]
        )
        return similarities[0]

    def retrieve_clips(self, query_text, top_k=3):
        """Retrieve top-k most relevant clips based on query"""
        # Encode query
        query_embedding = self.encode_query(query_text)
        # Compute similarities for different feature types
        similarities = {}
        weights = {
            'visual_features': 0.4,
            'scene_features': 0.3,
            'object_features': 0.3
        }
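        # Late fusion: score each feature space independently, then combine
        # the per-space cosine similarities as a weighted sum,
        #   combined[i] = sum_f weights[f] * cos(query, features[f][i])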
        for feat_type, weight in weights.items():
            similarities[feat_type] = self.compute_similarity(query_embedding, feat_type) * weight
        # Combine similarities
        combined_similarities = sum(similarities.values())
        # Get top-k indices (argsort is ascending, so take the tail and reverse)
        top_indices = np.argsort(combined_similarities)[-top_k:][::-1]
        # Return clip information
        results = []
        for idx in top_indices:
            clip = self.clips_df.iloc[idx]
            results.append({
                'clip_id': clip['clip_id'],
                'movie_title': clip['movie_title'],
                'description': clip['description'],
                'timestamp': clip['timestamp'],
                'youtube_url': clip['youtube_url'],
                'similarity_score': float(combined_similarities[idx])  # plain float for JSON serialization
            })
        return results
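
# Example of headless usage outside Streamlit (illustrative only; the query
# string is made up and this snippet is not executed by the app):
#     retrieval = VideoRetrieval(use_dummy_data=True)
#     for hit in retrieval.retrieve_clips("a tense standoff in the rain", top_k=3):
#         print(hit['movie_title'], round(hit['similarity_score'], 3))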


def main():
    st.set_page_config(
        page_title="Movie Scene Retrieval System",
        page_icon="🎬",
        layout="wide"
    )
    st.title("🎬 Movie Scene Retrieval System")
    st.write("""
    Search for movie scenes using natural language descriptions.
    The system will retrieve the most relevant 2-3 minute clips based on your query.

    *Note: This is a demo version using simulated data.*
    """)
    # Initialize the retrieval system once per session
    if 'retrieval_system' not in st.session_state:
        st.session_state.retrieval_system = VideoRetrieval(use_dummy_data=True)
    retrieval_system = st.session_state.retrieval_system
    # Search interface
    col1, col2 = st.columns([3, 1])
    with col1:
        query = st.text_input(
            "Enter your scene description:",
            placeholder="e.g., A dramatic confrontation between two characters in a dark room"
        )
    with col2:
        num_results = st.slider("Number of results:", min_value=1, max_value=5, value=3)
    if st.button("🔍 Search", type="primary"):
        if not query:
            st.warning("Please enter a scene description.")
        else:
            with st.spinner("Searching for relevant clips..."):
                results = retrieval_system.retrieve_clips(query, top_k=num_results)
            for result in results:
                with st.container():
                    st.subheader(result['movie_title'])
                    cols = st.columns([2, 1])
                    with cols[0]:
                        st.markdown("**Scene Description:**")
                        st.write(result['description'])
                        st.text(f"⏱️ Timestamp: {result['timestamp']}")
                        # Add video player
                        if result['youtube_url']:
                            st.video(result['youtube_url'])
                    with cols[1]:
                        st.markdown("**Relevance Score:**")
                        # Clamp to [0, 1]: st.progress requires that range
                        score = min(1.0, max(0.0, result['similarity_score']))
                        st.progress(score)
                        st.text(f"{score:.2%} match")
                        # Add direct YouTube link
                        st.markdown(f"[🔗 Watch on YouTube]({result['youtube_url']})")
                        st.text("Click to open in a new tab")
                    st.divider()

    # Sidebar with additional information
    with st.sidebar:
        st.header("ℹ️ About")
        st.write("""
        This demo system simulates a video retrieval engine that uses:
        - 🎥 Visual scene understanding
        - 👥 Character interaction analysis
        - 🎯 Object detection
        - 🏃 Action recognition

        In a production system, these features would be pre-computed
        from actual movie clips using state-of-the-art AI models.
        """)

        st.header("⚖️ Feature Weights")
        st.write("Current weights used for similarity computation:")
        st.write("- 🎬 Visual Features: 40%")
        st.write("- 🏞️ Scene Features: 30%")
        st.write("- 📦 Object Features: 30%")


if __name__ == "__main__":
    main()
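
# ---------------------------------------------------------------------------
# Sketch of how the .npy files expected by load_data() might be produced
# offline. This is an illustrative assumption, not part of the app: it reuses
# the text encoder on per-clip captions, whereas a production system would
# extract visual/scene/object features from the clips themselves, as the
# sidebar notes.
#
#     model = SentenceTransformer('all-MiniLM-L6-v2')
#     captions = [...]  # hypothetical: one text description per clip
#     np.save('path_to_visual_features.npy', model.encode(captions))
# ---------------------------------------------------------------------------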