Spaces:

awacke1
/

CodeCompetitionClaudeVsGPT

Running

App Files Files Community

awacke1 committed on Dec 19, 2024

Commit

8630bc3

verified ·

1 Parent(s): 832c41b

Update app.py

Browse files

Files changed (1) hide show

app.py +72 -43

app.py CHANGED Viewed

@@ -58,72 +58,101 @@ class VideoSearch:
     def fetch_dataset_rows(self):
         """Fetch dataset from Hugging Face API with debug and caching"""
         try:
-            # First try to load from local cache
-            cache_file = "dataset_cache.json"
-            if os.path.exists(cache_file):
-                st.info("Loading from cache...")
-                with open(cache_file, 'r', encoding='utf-8') as f:
-                    data = json.load(f)
-                return pd.DataFrame(data)
             st.info("Fetching from Hugging Face API...")
             url = "https://datasets-server.huggingface.co/first-rows?dataset=omegalabsinc%2Fomega-multimodal&config=default&split=train"
-            # Add debug output
-            st.write(f"Requesting URL: {url}")
             response = requests.get(url, timeout=30)
             st.write(f"Response status: {response.status_code}")
             if response.status_code == 200:
                 data = response.json()
-                # Debug output
-                st.write("Response structure:", list(data.keys()))
                 if 'rows' in data:
-                    rows = data['rows']
-                    # Cache the response
-                    with open(cache_file, 'w', encoding='utf-8') as f:
-                        json.dump(rows, f)
-                    df = pd.DataFrame(rows)
                     # Debug output
-                    st.write("DataFrame columns:", list(df.columns))
                     st.write("Number of rows:", len(df))
                     return df
                 else:
                     st.error("No 'rows' found in API response")
-                    st.write("API Response:", data)
-                    # Try loading example data
-                    example_file = "example_data.json"
-                    if os.path.exists(example_file):
-                        st.info("Loading example data...")
-                        with open(example_file, 'r', encoding='utf-8') as f:
-                            example_data = json.load(f)
-                        return pd.DataFrame(example_data)
-                    return None
             else:
                 st.error(f"API request failed with status code: {response.status_code}")
-                if response.status_code == 404:
-                    st.error("Dataset not found - check the dataset name and configuration")
-                try:
-                    error_details = response.json()
-                    st.write("Error details:", error_details)
-                except:
-                    st.write("Raw response:", response.text)
-                return None
         except Exception as e:
             st.error(f"Error fetching dataset: {str(e)}")
-            import traceback
-            st.write("Traceback:", traceback.format_exc())
-            return None
     def load_dataset(self):
         try:

     def fetch_dataset_rows(self):
         """Fetch the dataset's first rows from the Hugging Face datasets server.

         Requests the ``first-rows`` endpoint and flattens the response: each
         element of ``data['rows']`` is a wrapper whose actual record lives
         under its ``'row'`` key. Progress and diagnostics are reported through
         Streamlit.

         Returns:
             pandas.DataFrame: one row per record extracted from the API
             response. If the request fails, the response has no ``'rows'``
             key, no usable record can be extracted, or any exception is
             raised, the result of ``self.load_example_data()`` is returned
             instead.
         """
         url = "https://datasets-server.huggingface.co/first-rows?dataset=omegalabsinc%2Fomega-multimodal&config=default&split=train"
         try:
             st.info("Fetching from Hugging Face API...")
             response = requests.get(url, timeout=30)
             st.write(f"Response status: {response.status_code}")

             if response.status_code != 200:
                 st.error(f"API request failed with status code: {response.status_code}")
                 return self.load_example_data()

             data = response.json()
             if 'rows' not in data:
                 st.error("No 'rows' found in API response")
                 st.write("Raw API Response:", data)
                 return self.load_example_data()

             # Unwrap the nested 'row' payloads; entries that are not dicts or
             # carry no 'row' key are skipped rather than aborting the fetch.
             processed_rows = [
                 entry['row']
                 for entry in data['rows']
                 if isinstance(entry, dict) and 'row' in entry
             ]
             if not processed_rows:
                 # An empty, column-less DataFrame is useless downstream, so
                 # treat it like every other failure and use the example data.
                 st.error("No usable rows found in API response")
                 return self.load_example_data()

             df = pd.DataFrame(processed_rows)
             # Debug output
             st.write("DataFrame columns after processing:", list(df.columns))
             st.write("Number of rows:", len(df))
             return df
         except Exception as e:
             # UI boundary: report the problem and keep the app usable.
             st.error(f"Error fetching dataset: {str(e)}")
             return self.load_example_data()
+    def load_example_data(self):
+        """Load example data as fallback"""
+        example_data = [
+            {
+                "video_id": "cd21da96-fcca-4c94-a60f-0b1e4e1e29fc",
+                "youtube_id": "IO-vwtyicn4",
+                "description": "This video shows a close-up of an ancient text carved into a surface, with the text appearing to be in a cursive script.",
+                "views": 45489,
+                "start_time": 1452,
+                "end_time": 1458,
+                "video_embed": [0.014160037972033024, -0.003111184574663639, -0.016604168340563774],
+                "description_embed": [-0.05835828185081482, 0.02589797042310238, 0.11952091753482819]
+            },
+            {
+                "video_id": "a8ebde7d-d717-4c1e-8be4-bdb4bc0c544f",
+                "youtube_id": "mo4rEyF7gTE",
+                "description": "This video shows a close-up view of a classical architectural structure, featuring stone statues with ornate details.",
+                "views": 4468,
+                "start_time": 318,
+                "end_time": 324,
+                "video_embed": [0.015160037972033024, -0.004111184574663639, -0.017604168340563774],
+                "description_embed": [-0.06835828185081482, 0.03589797042310238, 0.12952091753482819]
+            },
+            {
+                "video_id": "d1be64a6-22e2-4fbd-a176-20749e7c3d8a",
+                "youtube_id": "IO-vwtyicn4",
+                "description": "This video shows a weathered ancient painting depicting figures in classical style with vibrant colors preserved.",
+                "views": 45489,
+                "start_time": 1698,
+                "end_time": 1704,
+                "video_embed": [0.016160037972033024, -0.005111184574663639, -0.018604168340563774],
+                "description_embed": [-0.07835828185081482, 0.04589797042310238, 0.13952091753482819]
+            }
+        ]
+        return pd.DataFrame(example_data)
+    def prepare_features(self):
+        """Prepare and cache embeddings"""
+        try:
+            if 'video_embed' not in self.dataset.columns:
+                st.warning("Using example data embeddings")
+                self.dataset = self.load_example_data()
+            # Convert string representations of embeddings back to numpy arrays
+            try:
+                self.video_embeds = np.array([json.loads(e) if isinstance(e, str) else e
+                                            for e in self.dataset.video_embed])
+                self.text_embeds = np.array([json.loads(e) if isinstance(e, str) else e
+                                           for e in self.dataset.description_embed])
+            except Exception as e:
+                st.error(f"Error converting embeddings: {e}")
+                num_rows = len(self.dataset)
+                self.video_embeds = np.random.randn(num_rows, 384)
+                self.text_embeds = np.random.randn(num_rows, 384)
+        except Exception as e:
+            st.error(f"Error preparing features: {e}")
+            # Create random embeddings as fallback
+            num_rows = len(self.dataset)
+            self.video_embeds = np.random.randn(num_rows, 384)
+            self.text_embeds = np.random.randn(num_rows, 384)
     def load_dataset(self):
         try: