nickmuchi committed
Commit 5f135e9 · 1 Parent(s): 8c0f3f2

Update app.py

Files changed (1)
  1. app.py +109 -104
app.py CHANGED
@@ -14,112 +14,117 @@ nltk.download('punkt')
 
 from nltk import sent_tokenize
 
-st.set_page_config(
-    page_title="Home",
-    page_icon="📞",
-)
-
-auth_token = os.environ.get("auth_token")
-
-@st.experimental_singleton()
-def load_models():
-    asr_model = whisper.load_model("small")
-    q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
-    q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
-    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
-
-    return asr_model, q_model, q_tokenizer, cross_encoder
-
-asr_model, q_model, q_tokenizer, cross_encoder = load_models()
-
-@st.experimental_memo(suppress_st_warning=True)
-def inference(link, upload):
-    '''Convert YouTube video or audio upload to text'''
-
-    if validators.url(link):
-
-        yt = YouTube(link)
-        title = yt.title
-        path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
-        options = whisper.DecodingOptions(without_timestamps=True)
-        results = asr_model.transcribe(path)
-
-        return results, yt.title
-
-    elif upload:
-        results = asr_model.transcribe(upload)
-
-        return results, "Transcribed Earnings Audio"
-
-@st.experimental_memo(suppress_st_warning=True)
-def sentiment_pipe(earnings_text):
-    '''Determine the sentiment of the text'''
-
-    remote_clx = pipeline("text-classification", model=q_model, tokenizer=q_tokenizer)
-
-    earnings_sentiment = remote_clx(sent_tokenize(earnings_text))
-
-    return earnings_sentiment
-
-
-def preprocess_plain_text(text, window_size=3):
-    '''Preprocess text for semantic search'''
-
-    text = text.encode("ascii", "ignore").decode()  # strip non-ASCII
-    text = re.sub(r"https*\S+", " ", text)  # urls
-    text = re.sub(r"@\S+", " ", text)  # mentions
-    text = re.sub(r"#\S+", " ", text)  # hashtags
-    text = re.sub(r"\s{2,}", " ", text)  # excess whitespace
-    #text = re.sub("[^.,!?%$A-Za-z0-9]+", " ", text)  # special characters except .,!?
-
-    # break into lines and remove leading and trailing space on each
-    lines = [line.strip() for line in text.splitlines()]
-
-    # break multi-headlines into a line each
-    chunks = [phrase.strip() for line in lines for phrase in line.split("  ")]
-
-    # drop blank lines
-    text = '\n'.join(chunk for chunk in chunks if chunk)
-
-    ## We split this article into paragraphs and then every paragraph into sentences
-    paragraphs = []
-    for paragraph in text.replace('\n', ' ').split("\n\n"):
-        if len(paragraph.strip()) > 0:
-            paragraphs.append(sent_tokenize(paragraph.strip()))
-
-    # We combine up to 3 sentences into a passage. You can choose smaller or larger values for window_size
-    # Smaller values: context from other sentences might get lost
-    # Larger values: more context from the paragraph remains, but results are longer
-    window_size = window_size
-    passages = []
-    for paragraph in paragraphs:
-        for start_idx in range(0, len(paragraph), window_size):
-            end_idx = min(start_idx + window_size, len(paragraph))
-            passages.append(" ".join(paragraph[start_idx:end_idx]))
-
-    print(f"Sentences: {sum([len(p) for p in paragraphs])}")
-    print(f"Passages: {len(passages)}")
-
-    return passages
-
-def display_df_as_table(model, top_k, score='score'):
-    '''Display the df with text and scores as a table'''
-
-    df = pd.DataFrame([(hit[score], passages[hit['corpus_id']]) for hit in model[0:top_k]], columns=['Score', 'Text'])
-    df['Score'] = round(df['Score'], 2)
-
-    return df
-
-def make_spans(text, results):
-    results_list = []
-    for i in range(len(results)):
-        results_list.append(results[i]['label'])
-    facts_spans = []
-    facts_spans = list(zip(sent_tokenizer(text), results_list))
-    return facts_spans
-
-## Fiscal Sentiment by Sentence
-def fin_ext(text):
-    results = remote_clx(sent_tokenizer(text))
-    return make_spans(text, results)
+def run():
+
+    st.set_page_config(
+        page_title="Home",
+        page_icon="📞",
+    )
+
+    auth_token = os.environ.get("auth_token")
+
+    @st.experimental_singleton()
+    def load_models():
+        asr_model = whisper.load_model("small")
+        q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
+        q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
+        cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
+
+        return asr_model, q_model, q_tokenizer, cross_encoder
+
+    asr_model, q_model, q_tokenizer, cross_encoder = load_models()
+
+    @st.experimental_memo(suppress_st_warning=True)
+    def inference(link, upload):
+        '''Convert YouTube video or audio upload to text'''
+
+        if validators.url(link):
+
+            yt = YouTube(link)
+            title = yt.title
+            path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
+            options = whisper.DecodingOptions(without_timestamps=True)
+            results = asr_model.transcribe(path)
+
+            return results, yt.title
+
+        elif upload:
+            results = asr_model.transcribe(upload)
+
+            return results, "Transcribed Earnings Audio"
+
+    @st.experimental_memo(suppress_st_warning=True)
+    def sentiment_pipe(earnings_text):
+        '''Determine the sentiment of the text'''
+
+        remote_clx = pipeline("text-classification", model=q_model, tokenizer=q_tokenizer)
+
+        earnings_sentiment = remote_clx(sent_tokenize(earnings_text))
+
+        return earnings_sentiment
+
+    def preprocess_plain_text(text, window_size=3):
+        '''Preprocess text for semantic search'''
+
+        text = text.encode("ascii", "ignore").decode()  # strip non-ASCII
+        text = re.sub(r"https*\S+", " ", text)  # urls
+        text = re.sub(r"@\S+", " ", text)  # mentions
+        text = re.sub(r"#\S+", " ", text)  # hashtags
+        text = re.sub(r"\s{2,}", " ", text)  # excess whitespace
+        #text = re.sub("[^.,!?%$A-Za-z0-9]+", " ", text)  # special characters except .,!?
+
+        # break into lines and remove leading and trailing space on each
+        lines = [line.strip() for line in text.splitlines()]
+
+        # break multi-headlines into a line each
+        chunks = [phrase.strip() for line in lines for phrase in line.split("  ")]
+
+        # drop blank lines
+        text = '\n'.join(chunk for chunk in chunks if chunk)
+
+        ## We split this article into paragraphs and then every paragraph into sentences
+        paragraphs = []
+        for paragraph in text.replace('\n', ' ').split("\n\n"):
+            if len(paragraph.strip()) > 0:
+                paragraphs.append(sent_tokenize(paragraph.strip()))
+
+        # We combine up to 3 sentences into a passage. You can choose smaller or larger values for window_size
+        # Smaller values: context from other sentences might get lost
+        # Larger values: more context from the paragraph remains, but results are longer
+        window_size = window_size
+        passages = []
+        for paragraph in paragraphs:
+            for start_idx in range(0, len(paragraph), window_size):
+                end_idx = min(start_idx + window_size, len(paragraph))
+                passages.append(" ".join(paragraph[start_idx:end_idx]))
+
+        print(f"Sentences: {sum([len(p) for p in paragraphs])}")
+        print(f"Passages: {len(passages)}")
+
+        return passages
+
+    def display_df_as_table(model, top_k, score='score'):
+        '''Display the df with text and scores as a table'''
+
+        df = pd.DataFrame([(hit[score], passages[hit['corpus_id']]) for hit in model[0:top_k]], columns=['Score', 'Text'])
+        df['Score'] = round(df['Score'], 2)
+
+        return df
+
+    def make_spans(text, results):
+        results_list = []
+        for i in range(len(results)):
+            results_list.append(results[i]['label'])
+        facts_spans = []
+        facts_spans = list(zip(sent_tokenizer(text), results_list))
+        return facts_spans
+
+    ## Fiscal Sentiment by Sentence
+    def fin_ext(text):
+        results = remote_clx(sent_tokenizer(text))
+        return make_spans(text, results)
+
+if __name__ == "__main__":
+    run()
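
The substance of the change: every module-level statement (page config, model loading, and the helper definitions) moves inside a single run() entry point, which is invoked under an if __name__ == "__main__": guard. A minimal sketch of the pattern, assuming the intent is to let app.py be imported (for example by other pages of a multipage Streamlit app) without triggering UI calls or model downloads at import time; the page body below is illustrative, not from the repo:

    import streamlit as st

    def run():
        # st.set_page_config must be the first Streamlit command executed on a page,
        # so it runs first inside the entry point rather than at import time.
        st.set_page_config(page_title="Home", page_icon="📞")
        st.write("page body goes here")

    # The guard keeps `import app` side-effect free; `streamlit run app.py`
    # still executes the full page, since Streamlit runs the script as __main__.
    if __name__ == "__main__":
        run()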