inie2003 committed on
Commit 20ea451 · verified · 1 Parent(s): 8ba7a1f

added helper files

Files changed (3)
  1. README.md +5 -6
  2. helper.py +155 -0
  3. requirements.txt +11 -0
README.md CHANGED
@@ -1,14 +1,13 @@
 ---
-title: Semantic Search Frontend
-emoji: 📈
-colorFrom: indigo
-colorTo: blue
+title: Quasara Frontend Try
+emoji: 👍
+colorFrom: gray
+colorTo: pink
 sdk: streamlit
-sdk_version: 1.39.0
+sdk_version: 1.38.0
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: 'Try out our frontend with our existing datatsets '
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
helper.py ADDED
@@ -0,0 +1,155 @@
+import numpy as np
+from sentence_transformers import SentenceTransformer, util
+from open_clip import create_model_from_pretrained, get_tokenizer
+import torch
+from datasets import load_dataset
+from sklearn.metrics.pairwise import cosine_similarity
+import torch.nn as nn
+import boto3
+import streamlit as st
+from PIL import Image
+from io import BytesIO
+from typing import List, Union
+
+
+# Initialize the model globally to avoid reloading it each time
+model, preprocess = create_model_from_pretrained('hf-hub:timm/ViT-SO400M-14-SigLIP-384')
+tokenizer = get_tokenizer('hf-hub:timm/ViT-SO400M-14-SigLIP-384')
+
+
+def encode_query(query: Union[str, Image.Image]) -> torch.Tensor:
+    """
+    Encode the query using the OpenCLIP model.
+
+    Parameters
+    ----------
+    query : Union[str, Image.Image]
+        The query, which can be a text string or an Image object.
+
+    Returns
+    -------
+    torch.Tensor
+        The encoded query vector.
+    """
+    if isinstance(query, Image.Image):
+        query = preprocess(query).unsqueeze(0)  # Preprocess the image and add a batch dimension
+        with torch.no_grad():
+            query_embedding = model.encode_image(query)  # Get the image embedding
+    elif isinstance(query, str):
+        text = tokenizer(query, context_length=model.context_length)
+        with torch.no_grad():
+            query_embedding = model.encode_text(text)  # Get the text embedding
+    else:
+        raise ValueError("Query must be either a string or an Image.")
+
+    return query_embedding
+
+
+def load_hf_datasets(dataset_name):
+    """
+    Load a dataset from Hugging Face as a DataFrame.
+    ---------------------------------------
+    dataset_name: str - name of the dataset on Hugging Face
+    ---------------------------------------
+
+    RETURNS: the dataset as a pandas DataFrame
+    """
+    dataset = load_dataset(f"quasara-io/{dataset_name}")
+    # Access only the 'Main' split
+    main_dataset = dataset['Main']
+    # Convert to a pandas DataFrame
+    df = main_dataset.to_pandas()
+    return df
+
+
+def get_image_vectors(df):
+    # Stack the per-row image vectors from the dataframe into one matrix
+    image_vectors = np.vstack(df['Vector'].to_numpy())
+    return torch.tensor(image_vectors, dtype=torch.float32)
+
+
+def search(query, df, limit, offset, scoring_func, search_in_images, search_in_small_objects):
+    if search_in_images:
+        # Encode the text or image query
+        query_vector = encode_query(query)
+
+        # Get the image vectors from the dataframe
+        image_vectors = get_image_vectors(df)
+
+        # Calculate the cosine similarity between the query vector and each image vector
+        query_vector = query_vector[0, :].detach().numpy()  # Detach and convert to a NumPy array
+        image_vectors = image_vectors.detach().numpy()  # Convert the image vectors to a NumPy array
+        cosine_similarities = cosine_similarity([query_vector], image_vectors)
+
+        # Get the indices of the most similar image vectors, applying offset and limit
+        top_k_indices = np.argsort(-cosine_similarities[0])[offset:offset + limit]
+
+        # Return the top K indices
+        return top_k_indices
+
+
+def get_file_paths(df, top_k_indices, column_name='File_Path'):
+    """
+    Retrieve the file paths (or any specific column) from the DataFrame using the top K indices.
+
+    Parameters:
+    - df: pandas DataFrame containing the data
+    - top_k_indices: numpy array of the top K indices
+    - column_name: str, the name of the column to fetch (e.g., 'File_Path')
+
+    Returns:
+    - top_k_paths: list of file paths or values from the specified column
+    """
+    # Fetch the specific column corresponding to the top K indices
+    top_k_paths = df.iloc[top_k_indices][column_name].tolist()
+    return top_k_paths
+
+
+def get_images_from_s3_to_display(bucket_name, file_paths, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, folder_name=None):
+    """
+    Retrieve and display images from AWS S3 in a Streamlit app.
+
+    Parameters:
+    - bucket_name: str, the name of the S3 bucket
+    - file_paths: list, a list of file paths to retrieve from S3
+
+    Returns:
+    - None (directly displays images in the Streamlit app)
+    """
+    # Initialize the S3 client
+    s3 = boto3.client(
+        's3',
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+    )
+
+    # Iterate over the file paths and display each image
+    for file_path in file_paths:
+        # Build the object key; folder_name is optional
+        key = f"{folder_name}{file_path}" if folder_name else file_path
+        s3_object = s3.get_object(Bucket=bucket_name, Key=key)
+        img_data = s3_object['Body'].read()
+
+        # Open the image with PIL and display it in Streamlit
+        img = Image.open(BytesIO(img_data))
+        st.image(img, caption=file_path, use_column_width=True)
+
+
+def main():
+    dataset_name = "StopSign_test"
+    query = "black car"
+    limit = 10
+    offset = 0
+    scoring_func = "cosine"
+    search_in_images = True
+    search_in_small_objects = False
+
+    df = load_hf_datasets(dataset_name)
+    results = search(query, df, limit, offset, scoring_func, search_in_images, search_in_small_objects)
+    top_k_paths = get_file_paths(df, results)
+    return top_k_paths
+
+
+if __name__ == "__main__":
+    main()
+
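
As a usage reference, here is a minimal sketch of how these helpers could be wired into a Streamlit page. The bucket name and the idea of reading AWS credentials from environment variables are illustrative assumptions, not part of this commit; the dataset name and parameter values are taken from main() above.

import os
import streamlit as st
from helper import load_hf_datasets, search, get_file_paths, get_images_from_s3_to_display

# Load the dataset used in main() and take a free-text query from the UI
df = load_hf_datasets("StopSign_test")
query = st.text_input("Search query", "black car")

if query:
    # Rank all image vectors against the query and keep the top 10
    indices = search(query, df, limit=10, offset=0, scoring_func="cosine",
                     search_in_images=True, search_in_small_objects=False)
    paths = get_file_paths(df, indices)
    # "my-example-bucket" is a placeholder; credentials come from the
    # environment here (hypothetical setup, not from the commit)
    get_images_from_s3_to_display(
        "my-example-bucket",
        paths,
        os.environ.get("AWS_ACCESS_KEY_ID"),
        os.environ.get("AWS_SECRET_ACCESS_KEY"),
    )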
requirements.txt ADDED
@@ -0,0 +1,11 @@
+numpy
+sentence-transformers
+open_clip_torch
+torch
+datasets
+scikit-learn
+torchvision  # Typically required alongside PyTorch for image processing
+boto3
+streamlit
+Pillow
+python-dotenv  # If you are using a .env file for managing secrets
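
To set up the environment for these files, the usual install should suffice (assuming pip and a recent Python):

pip install -r requirements.txt

Note that helper.py imports the open_clip module, whose PyPI distribution is named open_clip_torch, which is why that name appears in the list above.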