import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import librosa

# Determine the device
if torch.cuda.is_available(): # for CUDA
    device = torch.device("cuda")
elif torch.backends.mps.is_available(): # for Apple MPS
    device = torch.device("mps")
else: # fallback for CPU
    device = torch.device("cpu")

# Load the audio processor and model
stt_processor = AutoProcessor.from_pretrained("openai/whisper-large", language='en')
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-large"
)
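
# Note (assumption, not in the original app): whisper-large is roughly 6 GB in
# float32. On CUDA it could be loaded in half precision to cut memory use, e.g.
# AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large", torch_dtype=torch.float16)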

# Move the model to the device
stt_model.to(device)

def transcribe_audio(audio_path: str) -> str:
    try:
        # Load the audio at the 16 kHz sampling rate Whisper expects
        audio, sr = librosa.load(audio_path, sr=16000)
        inputs = stt_processor(audio, sampling_rate=sr, return_tensors="pt")
        input_features = inputs.input_features.to(device)
        with torch.no_grad():
            predicted_ids = stt_model.generate(input_features, language="en")
        transcript = stt_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        return transcript
    except Exception as e:
        return f"Error during transcription: {str(e)}"
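

# The Whisper feature extractor pads or truncates input to 30 seconds, so very
# long voice notes would be cut off by transcribe_audio above. A minimal sketch
# (assumption: naive fixed-size chunking with no overlap is acceptable) of how
# a longer recording could be transcribed piecewise:
def transcribe_long_audio(audio_path: str, chunk_seconds: int = 30) -> str:
    audio, sr = librosa.load(audio_path, sr=16000)
    chunk_size = chunk_seconds * sr
    pieces = []
    for start in range(0, len(audio), chunk_size):
        chunk = audio[start:start + chunk_size]
        inputs = stt_processor(chunk, sampling_rate=sr, return_tensors="pt")
        with torch.no_grad():
            predicted_ids = stt_model.generate(inputs.input_features.to(device), language="en")
        pieces.append(stt_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])
    return " ".join(pieces)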


def extract_action_items(transcript: str) -> str:
    """
    Extracts action items from a transcript using the Llama-3.1-8B-Instruct model.
    see example code in the model card: https://huggingface.co/Qwen/Qwen/Qwen2.5-7B-Instruct
    """
    model_name = "Qwen/Qwen2.5-7B-Instruct"

    # Note: the model and tokenizer are reloaded on every call; in a long-running
    # app they could be loaded once at module level, like the Whisper model above.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    messages = [
        {"role": "system", "content": "You are the perfect action item extraction system."},
        {"role": "user", "content": """Extract the action items from the voice note transcript using the following format:\n\n
         [
             {"title 1": "description 1"},
             "title 2": "description 2"},
             ...
         ]"""},
        {"role": "user", "content": f"transcript: {transcript}"}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(response)
    return response
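

# The prompt above asks the model for a JSON-style list, but extract_action_items
# returns the decoded response as plain text. A minimal sketch (assumption: the
# model actually emits valid JSON, which is not guaranteed) of how the response
# could be parsed into Python objects, falling back to the raw text otherwise:
def parse_action_items(response: str):
    import json
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        return response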


def transcribe_and_extract_action_items(audio_path):
    transcript = transcribe_audio(audio_path)
    action_items_text = extract_action_items(transcript)
    return transcript, action_items_text

##################################################
# Gradio Interface
##################################################

# Define the Gradio interface components
input_audio = gr.Audio(
    type="filepath",
    label="Upload or Record Audio"
)

output_transcript = gr.Textbox(
    label="Transcript",
    lines=10,
    placeholder="The transcribed text will appear here..."
)

output_action_items = gr.Textbox(
    label="Action Items",
    lines=10,
    placeholder="Extracted action items will appear here..."
)

# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe_and_extract_action_items,
    inputs=input_audio,
    outputs=[output_transcript, output_action_items],
    title="Audio Transcription and Action Item Extraction",
    description=(
        "Upload or record an audio clip. The system will transcribe the audio "
        "and extract actionable items from the transcript."
    ),
    theme="default"
)

# Launch the interface
if __name__ == "__main__":
    interface.launch()
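
# Usage note (assumption, not from the original code): interface.launch() serves
# the app locally at http://127.0.0.1:7860 by default; passing share=True, e.g.
# interface.launch(share=True), creates a temporary public Gradio link.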