import os
import re

import whisper
import validators
from pytube import YouTube
import pandas as pd
import plotly_express as px
import nltk
import plotly.graph_objects as go
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import streamlit as st

nltk.download('punkt')

from nltk import sent_tokenize

def run():

    st.set_page_config(
        page_title="Home",
        page_icon="πŸ“ž",
    )
    
    auth_token = os.environ.get("auth_token")
    
    @st.experimental_singleton()
    def load_models():
        '''Load and cache the ASR, sentiment and cross-encoder models'''
        asr_model = whisper.load_model("small")
        q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
        q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
        cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
        # Build the FinBERT sentiment pipeline once so sentiment_pipe and fin_ext can reuse it
        remote_clx = pipeline("text-classification", model=q_model, tokenizer=q_tokenizer)
        
        return asr_model, q_model, q_tokenizer, cross_encoder, remote_clx
        
    asr_model, q_model, q_tokenizer, cross_encoder, remote_clx = load_models()
    
    @st.experimental_memo(suppress_st_warning=True)
    def inference(link, upload):
        '''Convert Youtube video or Audio upload to text'''
        
        if validators.url(link):
        
          yt = YouTube(link)
          title = yt.title
          path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
          # transcribe() accepts decoding options as keyword arguments
          results = asr_model.transcribe(path, without_timestamps=True)
          
          return results, title
          
        elif upload:
          results = asr_model.transcribe(upload)
          
          return results, "Transcribed Earnings Audio"
          
    @st.experimental_memo(suppress_st_warning=True)
    def sentiment_pipe(earnings_text):
        '''Determine the sentiment of the text'''
        
        # Classify sentiment sentence-by-sentence with the cached FinBERT pipeline
        earnings_sentiment = remote_clx(sent_tokenize(earnings_text))
        
        return earnings_sentiment    
        
    
    def preprocess_plain_text(text,window_size=3):
        '''Preprocess text for semantic search'''
        
        text = text.encode("ascii", "ignore").decode()  # unicode
        text = re.sub(r"https*\S+", " ", text)  # url
        text = re.sub(r"@\S+", " ", text)  # mentions
        text = re.sub(r"#\S+", " ", text)  # hastags
        text = re.sub(r"\s{2,}", " ", text)  # over spaces
        #text = re.sub("[^.,!?%$A-Za-z0-9]+", " ", text)  # special characters except .,!?
        
        #break into lines and remove leading and trailing space on each
        lines = [line.strip() for line in text.splitlines()]
        
        # #break multi-headlines into a line each
        chunks = [phrase.strip() for line in lines for phrase in line.split("  ")]
        
        # # drop blank lines
        text = '\n'.join(chunk for chunk in chunks if chunk)
        
        ## Split the text into sentences; transcripts have no blank-line paragraph breaks,
        ## so the whole text is effectively treated as a single paragraph here
        paragraphs = []
        for paragraph in text.replace('\n',' ').split("\n\n"):
            if len(paragraph.strip()) > 0:
                paragraphs.append(sent_tokenize(paragraph.strip()))
    
        #We combine up to window_size sentences into a passage. You can choose smaller or larger values
        #Smaller values: context from other sentences might get lost
        #Larger values: more context from the paragraph remains, but passages are longer
        passages = []
        for paragraph in paragraphs:
            for start_idx in range(0, len(paragraph), window_size):
                end_idx = min(start_idx+window_size, len(paragraph))
                passages.append(" ".join(paragraph[start_idx:end_idx]))
            
        print(f"Sentences: {sum([len(p) for p in paragraphs])}")
        print(f"Passages: {len(passages)}")
    
        return passages
        
    def display_df_as_table(model, top_k, score='score'):
        '''Display the hits with text and scores as a table'''
        
        # NOTE: relies on `passages` (built by preprocess_plain_text) being available in the calling scope
        df = pd.DataFrame([(hit[score], passages[hit['corpus_id']]) for hit in model[0:top_k]], columns=['Score', 'Text'])
        df['Score'] = df['Score'].round(2)
        
        return df   
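    # Hedged sketch (not part of the original file): the cross_encoder loaded above is
    # presumably used to score passages from preprocess_plain_text against a user query.
    # display_df_as_table expects a list of hits shaped like {'corpus_id': i, 'score': s},
    # e.g. built as follows (query text is hypothetical):
    #   query = "What is the revenue guidance for next year?"
    #   scores = cross_encoder.predict([[query, passage] for passage in passages])
    #   hits = sorted(({'corpus_id': i, 'score': float(s)} for i, s in enumerate(scores)),
    #                 key=lambda hit: hit['score'], reverse=True)
    #   st.table(display_df_as_table(hits, top_k=5))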
        
    def make_spans(text, results):
        '''Pair each sentence with its predicted sentiment label'''
        results_list = [result['label'] for result in results]
        facts_spans = list(zip(sent_tokenize(text), results_list))
        return facts_spans
    
    ##Fiscal Sentiment by Sentence
    def fin_ext(text):
        '''Classify each sentence and return (sentence, label) spans'''
        results = remote_clx(sent_tokenize(text))
        return make_spans(text, results)
        
if __name__ == "__main__":
    run()