import time
from typing import Dict

import pytube
import torch
import whisper


class EndpointHandler():
    def __init__(self, path=""):
        # Load the model once at startup so each request only pays for inference.
        MODEL_NAME = "tiny.en"
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"whisper will use: {device}")

        t0 = time.time()
        self.model = whisper.load_model(MODEL_NAME).to(device)
        t1 = time.time()
        total = t1 - t0
        print(f"Finished loading model in {total} seconds")

    def __call__(self, data: Dict[str, str]) -> Dict[str, dict]:
        """
        Args:
            data (:obj:`dict`): includes the URL of the video to transcribe
                under the "inputs" key.
        Return:
            A :obj:`dict` with the transcript and the video's metadata.
        """
        # Process input
        print("data", data)
        video_url = data.pop("inputs", data)

        decode_options = {
            # Set language to None to support multilingual input, but it
            # takes longer to process because Whisper first has to detect
            # the language. (Running in verbose mode showed how much time
            # was spent on the language-detection step.)
            "language": "en",
            "verbose": True,
        }

        yt = pytube.YouTube(video_url)
        video_info = {
            "id": yt.video_id,
            "thumbnail": yt.thumbnail_url,
            "title": yt.title,
            "views": yt.views,
            "length": yt.length,
            # Although this might seem redundant since we already have the id,
            # it lets the video be opened in one click from the API response.
            "url": f"https://www.youtube.com/watch?v={yt.video_id}",
        }

        # Download the audio-only stream to a local file for Whisper.
        stream = yt.streams.filter(only_audio=True)[0]
        path_to_audio = f"{yt.video_id}.mp3"
        stream.download(filename=path_to_audio)

        t0 = time.time()
        transcript = self.model.transcribe(path_to_audio, **decode_options)
        t1 = time.time()

        for segment in transcript["segments"]:
            # Remove the tokens array; it makes the response too verbose.
            segment.pop("tokens", None)

        total = t1 - t0
        print(f"Finished transcription in {total} seconds")

        # Postprocess the prediction
        return {"transcript": transcript, "video": video_info}
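
# --- Local usage sketch (not part of the endpoint contract) ---
# A minimal way to exercise the handler outside the deployed endpoint,
# assuming ffmpeg plus the pytube, torch, and openai-whisper packages are
# installed. The YouTube URL below is a placeholder; any public video works.
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({"inputs": "https://www.youtube.com/watch?v=VIDEO_ID"})
    print(result["video"]["title"])
    # transcribe() returns the full text under "text" alongside "segments".
    print(result["transcript"]["text"][:200])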