"""Text Keyword Match.

Scores sentences against an input keyword with BLEU and returns the
best-matching ones.
"""

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

# Resources needed by the tokenizers, stopword list and lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


class scoreText(object):
    """
    A class used to score sentences based on the input keyword.
    """

    def __init__(self):
        self.sentences = []

    def cleanText(self, sentences):
        """
        Eliminates duplicate sentences and strips special characters.
        """
        try:
            # set() drops exact duplicates (order is not preserved).
            sentences = list(set(sentences))
            mainBody = []
            for text in sentences:
                # Remove punctuation and other special characters.
                text = re.sub("[-()\"#/@&^*;:<>{}`+=~|!?,]", "", text)
                mainBody.append(text)
            return mainBody
        except Exception as e:
            print("Error occurred in text clean", e)

    def preProcessText(self, sentences):
        """
        Tokenizes a sentence and lemmatizes its words, dropping stopwords.
        """
        try:
            word_tokens = word_tokenize(sentences)
            # Keep only non-stopword tokens, reduced to their lemmas.
            wordlist = [lemmatizer.lemmatize(w)
                        for w in word_tokens if w not in stop_words]
            return wordlist
        except Exception as e:
            print("Error occurred in text preprocessing", e)

    def scoreText(self, keyword, sentences):
        """
        Compares each sentence against the keyword using BLEU scoring.
        """
        try:
            sentences = self.cleanText(sentences)
            keywordList = self.preProcessText(keyword)

            scoredSentencesList = []
            for i in range(len(sentences)):
                wordlist = self.preProcessText(sentences[i])

                # The keyword tokens act as the single reference; the
                # sentence tokens are the hypothesis.
                reference = [keywordList]
                chencherry = SmoothingFunction()

                # BLEU at four n-gram orders, all with method1 smoothing.
                bleu_score_1 = sentence_bleu(
                    reference, wordlist, weights=(1, 0, 0, 0),
                    smoothing_function=chencherry.method1)
                bleu_score_2 = sentence_bleu(
                    reference, wordlist, weights=(0.5, 0.5, 0, 0),
                    smoothing_function=chencherry.method1)
                bleu_score_3 = sentence_bleu(
                    reference, wordlist, weights=(0.33, 0.33, 0.34, 0),
                    smoothing_function=chencherry.method1)
                bleu_score_4 = sentence_bleu(
                    reference, wordlist, weights=(0.25, 0.25, 0.25, 0.25),
                    smoothing_function=chencherry.method1)

                # Weighted average that favours higher-order n-gram matches.
                bleu_score = (4 * bleu_score_4 + 3 * bleu_score_3 +
                              2 * bleu_score_2 + bleu_score_1) / 10

                scoredSentencesList.append([bleu_score, sentences[i]])
            return scoredSentencesList

        except Exception as e:
            print("Error occurred in score text", e)

    def sortText(self, scoredText):
        """
        Returns the three top-scoring sentences.
        """
        try:
            scoredTexts = sorted(scoredText, key=lambda x: x[0], reverse=True)
            # Keep only the sentence text of the top three entries.
            scoredTexts = [v[1] for v in scoredTexts[:3]]
            return scoredTexts
        except Exception as e:
            print("Error occurred in sorting text", e)

    def sentenceMatch(self, keyword, paragraph):
        """
        Splits the paragraph into sentences, scores them against the keyword
        via scoreText and sortText, and returns the best-matching sentences.
        """
        try:
            sentencesList = sent_tokenize(paragraph)
            scoredSentence = self.scoreText(keyword, sentencesList)
            sortedSentence = self.sortText(scoredSentence)
            return sortedSentence
        except Exception as e:
            print("Error occurred in sentence match", e)