import pickle
import time
from pathlib import Path

import requests
import umap
from numba.typed import List
import torch
from sentence_transformers import SentenceTransformer


def check_resources(files_dict, basemap_path, mapper_params_path):
    """
    Check if all required resources are present.

    Every missing resource is reported with a printed message; the check
    does not stop at the first missing file.

    Args:
        files_dict (dict): Dictionary mapping filenames to their download URLs
        basemap_path (str): Path to the basemap pickle file
        mapper_params_path (str): Path to the UMAP mapper parameters pickle file

    Returns:
        bool: True if all resources are present, False otherwise
    """
    all_files_present = True

    # Check downloaded files (only the keys, i.e. the local filenames, matter)
    for filename in files_dict:
        if not Path(filename).exists():
            print(f"Missing file: {filename}")
            all_files_present = False

    # Check basemap
    if not Path(basemap_path).exists():
        print(f"Missing basemap file: {basemap_path}")
        all_files_present = False

    # Check mapper params
    if not Path(mapper_params_path).exists():
        print(f"Missing mapper params file: {mapper_params_path}")
        all_files_present = False

    return all_files_present


def download_required_files(files_dict, *, timeout=60):
    """
    Download required files from URLs only if they don't exist.

    Args:
        files_dict (dict): Dictionary mapping filenames to their download URLs
        timeout (float | tuple | None): Timeout forwarded to ``requests.get``
            for each download. Defaults to 60.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    print(f"Checking required files: {time.strftime('%Y-%m-%d %H:%M:%S')}")

    files_to_download = {
        filename: url
        for filename, url in files_dict.items()
        if not Path(filename).exists()
    }

    if not files_to_download:
        print("All files already present, skipping downloads")
        return

    print(f"Downloading missing files: {list(files_to_download.keys())}")
    for filename, url in files_to_download.items():
        print(f"Downloading {filename}...")
        response = requests.get(url, timeout=timeout)
        # Fail before touching the disk: otherwise an HTTP error page would be
        # saved under `filename` and later existence checks would treat the
        # resource as present.
        response.raise_for_status()
        with open(filename, "wb") as f:
            f.write(response.content)


def setup_basemap_data(basemap_path):
    """
    Load and setup the base map data.

    NOTE: ``pickle`` can execute arbitrary code while loading; only use this
    with files from a trusted source.

    Args:
        basemap_path (str): Path to the basemap pickle file

    Returns:
        The object stored in the pickle file (presumably a DataFrame, judging
        by the local variable name -- confirm against the producer).
    """
    print(f"Getting basemap data: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    # Context manager so the file handle is closed even if unpickling fails.
    with open(basemap_path, 'rb') as f:
        basedata_df = pickle.load(f)
    return basedata_df


def setup_mapper(mapper_params_path):
    """
    Setup and configure the UMAP mapper.

    The pickle file is read as a dict with two optional keys:
    ``'umap_params'`` (applied through ``set_params``) and
    ``'umap_attributes'`` (assigned directly as attributes on the mapper).

    NOTE: ``pickle`` can execute arbitrary code while loading; only use this
    with files from a trusted source.

    Args:
        mapper_params_path (str): Path to the UMAP mapper parameters pickle file

    Returns:
        umap.UMAP: The mapper with the stored parameters and attributes applied.
    """
    print(f"Getting Mapper: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    # Context manager so the file handle is closed even if unpickling fails.
    with open(mapper_params_path, 'rb') as f:
        params_new = pickle.load(f)

    print("setting up mapper...")
    mapper = umap.UMAP()

    # 'target_backend' is dropped before calling set_params (presumably it is
    # not a parameter this umap.UMAP accepts -- TODO confirm).
    umap_params = {
        k: v
        for k, v in params_new.get('umap_params', {}).items()
        if k != 'target_backend'
    }
    mapper.set_params(**umap_params)

    umap_attributes = params_new.get('umap_attributes', {})
    for attr, value in umap_attributes.items():
        if attr != 'embedding_':
            setattr(mapper, attr, value)

    # The embedding is handled separately: it is wrapped in a numba typed List
    # before being attached to the mapper.
    if 'embedding_' in umap_attributes:
        mapper.embedding_ = List(umap_attributes['embedding_'])

    return mapper


def setup_embedding_model(model_name):
    """
    Setup the SentenceTransformer model.

    Args:
        model_name (str): Name or path of the SentenceTransformer model

    Returns:
        SentenceTransformer: The loaded model.
    """
    print(f"Setting up language model: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    # NOTE(review): `device` is only reported here; it is not passed to
    # SentenceTransformer, so the model chooses its device itself. Confirm
    # whether the printed device is meant to be enforced.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model = SentenceTransformer(model_name)
    return model