import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class VideoRetrieval:
    def __init__(self, use_dummy_data=True):
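        # 'all-MiniLM-L6-v2' produces 384-dimensional sentence embeddings; the
        # weights are downloaded from the Hugging Face Hub on first use. In a
        # Streamlit app, loading the model inside a function wrapped with
        # @st.cache_resource would avoid reloading it on every rerun (an
        # optional optimization, not done here).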
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        if use_dummy_data:
            self.create_dummy_data()
        else:
            self.load_data()
        
    def create_dummy_data(self):
        """Create dummy features and metadata for demonstration"""
        # Create dummy features
        n_clips = 20
        feature_dim = 384  # matching the dimension of all-MiniLM-L6-v2
        
        self.features = {
            'visual_features': np.random.randn(n_clips, feature_dim),
            'scene_features': np.random.randn(n_clips, feature_dim),
            'object_features': np.random.randn(n_clips, feature_dim)
        }
        
        # Create dummy metadata
        movie_titles = [
            "The Matrix", "Inception", "The Dark Knight", "Pulp Fiction",
            "The Shawshank Redemption", "Forrest Gump", "The Godfather",
            "Fight Club", "Interstellar", "The Silence of the Lambs"
        ]
        
        descriptions = [
            "A dramatic confrontation in a dark room where the truth is revealed",
            "A high-stakes chase through a crowded city street",
            "An emotional reunion between long-lost friends",
            "A tense negotiation that determines the fate of many",
            "A quiet moment of reflection before a life-changing decision"
        ]
        
        # Sample YouTube clips (famous movie scenes)
        youtube_clips = [
            "https://www.youtube.com/watch?v=kcsNbQRU5TI",  # Matrix - Red Pill Blue Pill
            "https://www.youtube.com/watch?v=YoHD9XEInc0",  # Inception - Hallway Fight
            "https://www.youtube.com/watch?v=ZWCAf-xLV2k",  # Dark Knight - Interrogation
            "https://www.youtube.com/watch?v=Jomr9SAjcyw",  # Pulp Fiction - Restaurant
            "https://www.youtube.com/watch?v=SQ7_5MMbPYs",  # Shawshank - Hope Speech
        ]
        
        data = []
        for i in range(n_clips):
            data.append({
                'clip_id': f'clip_{i}',
                'movie_title': movie_titles[i % len(movie_titles)],
                'description': descriptions[i % len(descriptions)],
                'timestamp': f'{(i*5):02d}:00 - {(i*5+3):02d}:00',
                'duration': '3:00',
                'youtube_url': youtube_clips[i % len(youtube_clips)]
            })
        
        self.clips_df = pd.DataFrame(data)
        
    def load_data(self):
        """Load actual pre-computed features and metadata"""
        try:
            self.features = {
                'visual_features': np.load('path_to_visual_features.npy'),
                'scene_features': np.load('path_to_scene_features.npy'),
                'object_features': np.load('path_to_object_features.npy')
            }
            self.clips_df = pd.read_csv('clips_metadata.csv')
        except FileNotFoundError as e:
            st.error(f"Error loading data: {e}. Falling back to dummy data.")
            self.create_dummy_data()
    
    def encode_query(self, query_text):
        """Encode the text query into embeddings"""
        return self.text_model.encode(query_text)
    
    def compute_similarity(self, query_embedding, feature_type='visual_features'):
        """Compute similarity between query and video features"""
        similarities = cosine_similarity(
            query_embedding.reshape(1, -1),
            self.features[feature_type]
        )
        return similarities[0]
    
    def retrieve_clips(self, query_text, top_k=3):
        """Retrieve top-k most relevant clips based on query"""
        # Encode query
        query_embedding = self.encode_query(query_text)
        
        # Compute similarities for different feature types
        similarities = {}
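        # Heuristic fusion weights (they sum to 1.0); with real features these
        # would be tuned on held-out data rather than fixed by hand.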
        weights = {
            'visual_features': 0.4,
            'scene_features': 0.3,
            'object_features': 0.3
        }
        
        for feat_type, weight in weights.items():
            similarities[feat_type] = self.compute_similarity(query_embedding, feat_type) * weight
            
        # Combine similarities
        combined_similarities = sum(similarities.values())
        
        # Get indices of the top-k scores (argsort sorts ascending, so take the last k and reverse)
        top_indices = np.argsort(combined_similarities)[-top_k:][::-1]
        
        # Return clip information
        results = []
        for idx in top_indices:
            results.append({
                'clip_id': self.clips_df.iloc[idx]['clip_id'],
                'movie_title': self.clips_df.iloc[idx]['movie_title'],
                'description': self.clips_df.iloc[idx]['description'],
                'timestamp': self.clips_df.iloc[idx]['timestamp'],
                'youtube_url': self.clips_df.iloc[idx]['youtube_url'],
                'similarity_score': float(combined_similarities[idx])  # Convert to float for JSON serialization
            })
        
        return results
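
# Minimal standalone usage sketch (no Streamlit UI), assuming dummy data; the
# query string is illustrative only:
#
#   retrieval = VideoRetrieval(use_dummy_data=True)
#   for hit in retrieval.retrieve_clips("a tense chase scene", top_k=3):
#       print(hit['movie_title'], hit['similarity_score'])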

def main():
    st.set_page_config(
        page_title="Movie Scene Retrieval System",
        page_icon="🎬",
        layout="wide"
    )
    
    st.title("🎬 Movie Scene Retrieval System")
    st.write("""
    Search for movie scenes using natural language descriptions.
    The system will retrieve the most relevant 2-3 minute clips based on your query.
    
    *Note: This is a demo version using simulated data.*
    """)
    
    # Create the retrieval system once per session and reuse it across reruns
    if 'retrieval_system' not in st.session_state:
        st.session_state.retrieval_system = VideoRetrieval(use_dummy_data=True)
    retrieval_system = st.session_state.retrieval_system
    
    # Search interface
    col1, col2 = st.columns([3, 1])
    
    with col1:
        query = st.text_input(
            "Enter your scene description:",
            placeholder="e.g., A dramatic confrontation between two characters in a dark room"
        )
    
    with col2:
        num_results = st.slider("Number of results:", min_value=1, max_value=5, value=3)
    
    if st.button("πŸ” Search", type="primary"):
        if not query:
            st.warning("Please enter a scene description.")
        else:
            with st.spinner("Searching for relevant clips..."):
                results = retrieval_system.retrieve_clips(query, top_k=num_results)
                
                for i, result in enumerate(results, 1):
                    with st.container():
                        st.subheader(f"{result['movie_title']}")
                        cols = st.columns([2, 1])
                        
                        with cols[0]:
                            st.markdown(f"**Scene Description:**")
                            st.write(result['description'])
                            st.text(f"⏱️ Timestamp: {result['timestamp']}")
                            
                            # Add video player
                            if result['youtube_url']:
                                st.video(result['youtube_url'])
                        
                        with cols[1]:
                            st.markdown("**Relevance Score:**")
                            score = min(1.0, max(0.0, result['similarity_score']))
                            st.progress(score)
                            st.text(f"{score:.2%} match")
                            
                            # Add direct YouTube link
                            st.markdown(f"[πŸ”— Watch on YouTube]({result['youtube_url']})")
                            st.text("Click to open in a new tab")
                        
                        st.divider()
    
    # Sidebar with additional information
    with st.sidebar:
        st.header("ℹ️ About")
        st.write("""
        This demo system simulates a video retrieval engine that uses:
        
        - 🎥 Visual scene understanding
        - 👥 Character interaction analysis
        - 🎯 Object detection
        - 🎭 Action recognition
        
        In a production system, these features would be pre-computed
        from actual movie clips using state-of-the-art AI models.
        """)
        
        st.header("βš™οΈ Feature Weights")
        st.write("Current weights used for similarity computation:")
        st.write("- 🎬 Visual Features: 40%")
        st.write("- 🏞️ Scene Features: 30%")
        st.write("- πŸ“¦ Object Features: 30%")

if __name__ == "__main__":
    main()
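
# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py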