import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import librosa
# Determine the device
if torch.cuda.is_available():  # for CUDA
    device = torch.device("cuda")
elif torch.backends.mps.is_available():  # for Apple MPS
    device = torch.device("mps")
else:  # fallback for CPU
    device = torch.device("cpu")
# Load the speech-to-text processor and model once at startup
stt_processor = AutoProcessor.from_pretrained("openai/whisper-large")
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-large"
)
# Move the model to the device
stt_model.to(device)

# Load the instruction-tuned LLM and its tokenizer once at startup as well,
# rather than re-downloading and re-loading the 7B model on every request.
# See the example code in the model card: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
llm_name = "Qwen/Qwen2.5-7B-Instruct"
llm = AutoModelForCausalLM.from_pretrained(
    llm_name,
    torch_dtype="auto",
    device_map="auto"
)
llm_tokenizer = AutoTokenizer.from_pretrained(llm_name)

def transcribe_audio(audio_path: str) -> str:
    try:
        # Whisper expects 16 kHz mono audio; librosa resamples on load
        audio, sr = librosa.load(audio_path, sr=16000)
        inputs = stt_processor(audio, sampling_rate=16000, return_tensors="pt")
        input_features = inputs.input_features.to(device)
        with torch.no_grad():
            # Force English transcription instead of relying on language auto-detection
            predicted_ids = stt_model.generate(input_features, language="en", task="transcribe")
        return stt_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    except Exception as e:
        return f"Error during transcription: {str(e)}"
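
# Example usage (assumes a local recording "note.wav" exists):
# print(transcribe_audio("note.wav"))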

def extract_action_items(transcript: str) -> str:
    """
    Extracts action items from a transcript using the Qwen2.5-7B-Instruct
    model loaded above.
    """
    messages = [
        {"role": "system", "content": "You are the perfect action item extraction system."},
        {"role": "user", "content": """Extract the action items from the voice note transcript using the following format:

[
    {"title 1": "description 1"},
    {"title 2": "description 2"},
    ...
]"""},
        {"role": "user", "content": f"transcript: {transcript}"}
    ]
    text = llm_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm.device)
    generated_ids = llm.generate(
        **model_inputs,
        max_new_tokens=512
    )
    # Strip the prompt tokens so only the newly generated completion remains
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    return llm_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
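
# Illustrative example of the shape the prompt asks for (not actual model output):
# [
#     {"Schedule demo": "Set up a product demo with the client for next week."},
#     {"Send notes": "Email the meeting summary to the team by Friday."}
# ]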

def transcribe_and_extract_action_items(audio_path):
    transcript = transcribe_audio(audio_path)
    action_items_text = extract_action_items(transcript)
    return transcript, action_items_text
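
# Note: if transcription fails, the error string is forwarded to the LLM as
# the "transcript". A stricter pipeline might short-circuit instead, e.g.:
#   if transcript.startswith("Error during transcription"):
#       return transcript, ""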
##################################################
# Gradio Interface
##################################################
# Define the Gradio interface components
input_audio = gr.Audio(
    type="filepath",
    label="Upload or Record Audio"
)
output_transcript = gr.Textbox(
    label="Transcript",
    lines=10,
    placeholder="The transcribed text will appear here..."
)
output_action_items = gr.Textbox(
    label="Action Items",
    lines=10,
    placeholder="Extracted action items will appear here..."
)
# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe_and_extract_action_items,
    inputs=input_audio,
    outputs=[output_transcript, output_action_items],
    title="Audio Transcription and Action Item Extraction",
    description=(
        "Upload or record an audio clip. The system will transcribe the audio "
        "and extract actionable items from the transcript."
    ),
    theme="default"
)
# Launch the interface
if __name__ == "__main__":
    interface.launch()
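    # On Hugging Face Spaces the plain launch() above is all that's needed;
    # running locally, interface.launch(share=True) would also create a
    # temporary public link.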