nickmuchi committed
Commit 5f135e9 · 1 Parent(s): 8c0f3f2

Update app.py

Files changed (1)
  1. app.py +109 -104
app.py CHANGED
@@ -14,112 +14,117 @@ nltk.download('punkt')
 
 from nltk import sent_tokenize
 
-st.set_page_config(
-    page_title="Home",
-    page_icon="📞",
-)
-
-auth_token = os.environ.get("auth_token")
-
-@st.experimental_singleton()
-def load_models():
-    asr_model = whisper.load_model("small")
-    q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
-    q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
-    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
-
-    return asr_model, q_model, q_tokenizer, cross_encoder
-
-asr_model, q_model, q_tokenizer, cross_encoder = load_models()
-
-@st.experimental_memo(suppress_st_warning=True)
-def inference(link, upload):
-    '''Convert YouTube video or audio upload to text'''
-
-    if validators.url(link):
-
-        yt = YouTube(link)
-        title = yt.title
-        path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
-        options = whisper.DecodingOptions(without_timestamps=True)
-        results = asr_model.transcribe(path)
-
-        return results, yt.title
-
-    elif upload:
-        results = asr_model.transcribe(upload)
-
-        return results, "Transcribed Earnings Audio"
-
-@st.experimental_memo(suppress_st_warning=True)
-def sentiment_pipe(earnings_text):
-    '''Determine the sentiment of the text'''
-
-    remote_clx = pipeline("text-classification", model=q_model, tokenizer=q_tokenizer)
-
-    earnings_sentiment = remote_clx(sent_tokenize(earnings_text))
-
-    return earnings_sentiment
-
-
-def preprocess_plain_text(text, window_size=3):
-    '''Preprocess text for semantic search'''
-
-    text = text.encode("ascii", "ignore").decode()  # strip non-ASCII
-    text = re.sub(r"https*\S+", " ", text)  # urls
-    text = re.sub(r"@\S+", " ", text)  # mentions
-    text = re.sub(r"#\S+", " ", text)  # hashtags
-    text = re.sub(r"\s{2,}", " ", text)  # excess whitespace
-    #text = re.sub("[^.,!?%$A-Za-z0-9]+", " ", text)  # special characters except .,!?
-
-    # break into lines and remove leading and trailing space on each
-    lines = [line.strip() for line in text.splitlines()]
-
-    # break multi-headlines into a line each
-    chunks = [phrase.strip() for line in lines for phrase in line.split("  ")]
-
-    # drop blank lines
-    text = '\n'.join(chunk for chunk in chunks if chunk)
-
-    ## We split this article into paragraphs and then every paragraph into sentences
-    paragraphs = []
-    for paragraph in text.replace('\n', ' ').split("\n\n"):
-        if len(paragraph.strip()) > 0:
-            paragraphs.append(sent_tokenize(paragraph.strip()))
-
-    # We combine up to 3 sentences into a passage. You can choose smaller or larger values for window_size
-    # Smaller values: context from other sentences might get lost
-    # Larger values: more context from the paragraph remains, but results are longer
-    window_size = window_size
-    passages = []
-    for paragraph in paragraphs:
-        for start_idx in range(0, len(paragraph), window_size):
-            end_idx = min(start_idx + window_size, len(paragraph))
-            passages.append(" ".join(paragraph[start_idx:end_idx]))
-
-    print(f"Sentences: {sum([len(p) for p in paragraphs])}")
-    print(f"Passages: {len(passages)}")
-
-    return passages
-
-def display_df_as_table(model, top_k, score='score'):
-    '''Display the df with text and scores as a table'''
-
-    df = pd.DataFrame([(hit[score], passages[hit['corpus_id']]) for hit in model[0:top_k]], columns=['Score', 'Text'])
-    df['Score'] = round(df['Score'], 2)
-
-    return df
-
-def make_spans(text, results):
-    results_list = []
-    for i in range(len(results)):
-        results_list.append(results[i]['label'])
-    facts_spans = []
-    facts_spans = list(zip(sent_tokenizer(text), results_list))
-    return facts_spans
-
-## Fiscal Sentiment by Sentence
-def fin_ext(text):
-    results = remote_clx(sent_tokenizer(text))
-    return make_spans(text, results)
+def run():
+
+    st.set_page_config(
+        page_title="Home",
+        page_icon="📞",
+    )
+
+    auth_token = os.environ.get("auth_token")
+
+    @st.experimental_singleton()
+    def load_models():
+        asr_model = whisper.load_model("small")
+        q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
+        q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
+        cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
+
+        return asr_model, q_model, q_tokenizer, cross_encoder
+
+    asr_model, q_model, q_tokenizer, cross_encoder = load_models()
+
+    @st.experimental_memo(suppress_st_warning=True)
+    def inference(link, upload):
+        '''Convert YouTube video or audio upload to text'''
+
+        if validators.url(link):
+
+            yt = YouTube(link)
+            title = yt.title
+            path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
+            options = whisper.DecodingOptions(without_timestamps=True)
+            results = asr_model.transcribe(path)
+
+            return results, yt.title
+
+        elif upload:
+            results = asr_model.transcribe(upload)
+
+            return results, "Transcribed Earnings Audio"
+
+    @st.experimental_memo(suppress_st_warning=True)
+    def sentiment_pipe(earnings_text):
+        '''Determine the sentiment of the text'''
+
+        remote_clx = pipeline("text-classification", model=q_model, tokenizer=q_tokenizer)
+
+        earnings_sentiment = remote_clx(sent_tokenize(earnings_text))
+
+        return earnings_sentiment
+
+    def preprocess_plain_text(text, window_size=3):
+        '''Preprocess text for semantic search'''
+
+        text = text.encode("ascii", "ignore").decode()  # strip non-ASCII
+        text = re.sub(r"https*\S+", " ", text)  # urls
+        text = re.sub(r"@\S+", " ", text)  # mentions
+        text = re.sub(r"#\S+", " ", text)  # hashtags
+        text = re.sub(r"\s{2,}", " ", text)  # excess whitespace
+        #text = re.sub("[^.,!?%$A-Za-z0-9]+", " ", text)  # special characters except .,!?
+
+        # break into lines and remove leading and trailing space on each
+        lines = [line.strip() for line in text.splitlines()]
+
+        # break multi-headlines into a line each
+        chunks = [phrase.strip() for line in lines for phrase in line.split("  ")]
+
+        # drop blank lines
+        text = '\n'.join(chunk for chunk in chunks if chunk)
+
+        ## We split this article into paragraphs and then every paragraph into sentences
+        paragraphs = []
+        for paragraph in text.replace('\n', ' ').split("\n\n"):
+            if len(paragraph.strip()) > 0:
+                paragraphs.append(sent_tokenize(paragraph.strip()))
+
+        # We combine up to 3 sentences into a passage. You can choose smaller or larger values for window_size
+        # Smaller values: context from other sentences might get lost
+        # Larger values: more context from the paragraph remains, but results are longer
+        window_size = window_size
+        passages = []
+        for paragraph in paragraphs:
+            for start_idx in range(0, len(paragraph), window_size):
+                end_idx = min(start_idx + window_size, len(paragraph))
+                passages.append(" ".join(paragraph[start_idx:end_idx]))
+
+        print(f"Sentences: {sum([len(p) for p in paragraphs])}")
+        print(f"Passages: {len(passages)}")
+
+        return passages
+
+    def display_df_as_table(model, top_k, score='score'):
+        '''Display the df with text and scores as a table'''
+
+        df = pd.DataFrame([(hit[score], passages[hit['corpus_id']]) for hit in model[0:top_k]], columns=['Score', 'Text'])
+        df['Score'] = round(df['Score'], 2)
+
+        return df
+
+    def make_spans(text, results):
+        results_list = []
+        for i in range(len(results)):
+            results_list.append(results[i]['label'])
+        facts_spans = []
+        facts_spans = list(zip(sent_tokenizer(text), results_list))
+        return facts_spans
+
+    ## Fiscal Sentiment by Sentence
+    def fin_ext(text):
+        results = remote_clx(sent_tokenizer(text))
+        return make_spans(text, results)
+
+if __name__ == "__main__":
+    run()
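
The substance of the change: every module-level statement (page config, model loading, and the helper definitions) moves inside a single run() entry point, which is invoked under an if __name__ == "__main__": guard. A minimal sketch of the pattern, assuming the intent is to let app.py be imported (for example by other pages of a multipage Streamlit app) without triggering UI calls or model downloads at import time; the page body below is illustrative, not from the repo:

    import streamlit as st

    def run():
        # st.set_page_config must be the first Streamlit command executed on a page,
        # so it runs first inside the entry point rather than at import time.
        st.set_page_config(page_title="Home", page_icon="📞")
        st.write("page body goes here")

    # The guard keeps `import app` side-effect free; `streamlit run app.py`
    # still executes the full page, since Streamlit runs the script as __main__.
    if __name__ == "__main__":
        run()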