import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import librosa

# Determine the device
if torch.cuda.is_available():  # for CUDA
    device = torch.device("cuda")
elif torch.backends.mps.is_available():  # for Apple MPS
    device = torch.device("mps")
else:  # fallback for CPU
    device = torch.device("cpu")

# Load the audio processor and speech-to-text model
stt_processor = AutoProcessor.from_pretrained("openai/whisper-large", language="en")
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large")

# Move the model to the device
stt_model.to(device)


def transcribe_audio(audio_path: str) -> str:
    """Transcribes an audio file to English text with Whisper."""
    try:
        # Whisper expects 16 kHz mono audio
        audio, sr = librosa.load(audio_path, sr=16000)
        inputs = stt_processor(audio, sampling_rate=16000, return_tensors="pt")
        input_features = inputs.input_features.to(device)
        with torch.no_grad():
            predicted_ids = stt_model.generate(input_features, language="en", task="transcribe")
        transcript = stt_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    except Exception as e:
        return f"Error during transcription: {str(e)}"
    return transcript


def extract_action_items(transcript: str) -> str:
    """
    Extracts action items from a transcript using the Qwen2.5-7B-Instruct model.

    See the example code in the model card:
    https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
    """
    model_name = "Qwen/Qwen2.5-7B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    messages = [
        {"role": "system", "content": "You are the perfect action item extraction system."},
        {"role": "user", "content": """Extract the action items from the voice note transcript using the following format:\n\n[
  {"title 1": "description 1"},
  {"title 2": "description 2"},
  ...
]"""},
        {"role": "user", "content": f"transcript: {transcript}"}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    # Strip the prompt tokens so only the newly generated text is decoded
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(response)
    return response


def transcribe_and_extract_action_items(audio_path):
    transcript = transcribe_audio(audio_path)
    action_items_text = extract_action_items(transcript)
    return transcript, action_items_text


##################################################
# Gradio Interface
##################################################

# Define the Gradio interface components
input_audio = gr.Audio(
    type="filepath",
    label="Upload or Record Audio"
)
output_transcript = gr.Textbox(
    label="Transcript",
    lines=10,
    placeholder="The transcribed text will appear here..."
)
output_action_items = gr.Textbox(
    label="Action Items",
    lines=10,
    placeholder="Extracted action items will appear here..."
)

# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe_and_extract_action_items,
    inputs=input_audio,
    outputs=[output_transcript, output_action_items],
    title="Audio Transcription and Action Item Extraction",
    description=(
        "Upload or record an audio clip. The system will transcribe the audio "
        "and extract actionable items from the transcript."
    ),
    theme="default"
)

# Launch the interface
if __name__ == "__main__":
    interface.launch()