File size: 4,588 Bytes
45315e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
''' Text Keyword Match'''
# --------------------------------
# Date : 19-06-2020
# Project : Text Keyword Match
# Category : NLP/NLTK sentence Scoring
# Company : weblineindia
# Department : AI/ML
# --------------------------------
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# One-time corpus fetch: Open Multilingual WordNet data, presumably required
# by WordNetLemmatizer on newer NLTK versions — TODO confirm minimum version.
nltk.download('omw-1.4')
# Module-level singletons shared by all scoreText instances/methods.
lemmatizer = WordNetLemmatizer()
# English stopword set used to filter tokens during preprocessing.
stop_words = set(stopwords.words('english'))


class scoreText(object):
    """
    Score sentences against an input keyword.

    Sentences are compared with the keyword using smoothed BLEU scoring
    (1- to 4-gram), and the top matches can be retrieved. All methods are
    best-effort: on any exception they print a message and return None.
    """

    def __init__(self):
        # Kept for backward compatibility; no method in this class reads it.
        self.sentences = []

    def cleanText(self, sentences):
        """
        Eliminate duplicate sentences and strip punctuation/symbols.

        :param sentences: list of sentence strings
        :return: list of cleaned sentences (order is NOT preserved, since
                 de-duplication goes through a set), or None on error
        """
        try:
            # set() drops exact duplicates; apostrophes and periods survive
            # the character class below, everything else listed is removed.
            return [re.sub("[-()\"#/@&&^*();:<>{}`+=~|!?,]", "", text)
                    for text in set(sentences)]
        except Exception as e:
            # Best-effort: report and fall through (implicitly returns None).
            print("Error occured in text clean", e)

    def preProcessText(self, sentences):
        """
        Tokenize one sentence and lemmatize its words, dropping stopwords.

        :param sentences: a single sentence string
        :return: list of lemmatized non-stopword tokens, or None on error
        """
        try:
            # Tokenize words in a sentence, then lemmatize while filtering
            # out English stopwords (module-level set).
            word_tokens = word_tokenize(sentences)
            return [lemmatizer.lemmatize(w)
                    for w in word_tokens if w not in stop_words]
        except Exception as e:
            print("Error occured in text preprocessing", e)

    # similarity of subject
    def scoreText(self, keyword, sentences):
        """
        Compare each sentence with the keyword via BLEU scoring.

        :param keyword: keyword phrase to match against
        :param sentences: list of candidate sentence strings
        :return: list of [score, sentence] pairs (order follows the
                 de-duplicated sentence list), or None on error
        """
        try:
            # Remove symbols and duplicates from the candidate sentences.
            sentences = self.cleanText(sentences)

            # Tokenization and lemmatization of the keyword; the keyword
            # tokens act as the single BLEU reference for every sentence.
            keywordList = self.preProcessText(keyword)
            reference = [keywordList]
            # Smoothing avoids hard zeros when higher n-grams never overlap.
            # Hoisted out of the loop: it is loop-invariant.
            chencherry = SmoothingFunction()

            scoredSentencesList = []
            for sentence in sentences:
                # Tokenization and lemmatization of the candidate sentence.
                wordlist = self.preProcessText(sentence)

                # sentence_bleu is evaluated at cumulative 1-, 2-, 3- and
                # 4-gram weightings; a weighted average of the four is the
                # final score for the sentence.
                bleu_score_1 = sentence_bleu(
                    reference, wordlist, weights=(1, 0, 0, 0), smoothing_function=chencherry.method1)
                bleu_score_2 = sentence_bleu(
                    reference, wordlist, weights=(0.5, 0.5, 0, 0), smoothing_function=chencherry.method1)
                bleu_score_3 = sentence_bleu(
                    reference, wordlist, weights=(0.33, 0.33, 0.34, 0), smoothing_function=chencherry.method1)
                bleu_score_4 = sentence_bleu(
                    reference, wordlist, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=chencherry.method1)
                bleu_score = (4*bleu_score_4 + 3*bleu_score_3 +
                              2*bleu_score_2 + bleu_score_1)/10

                # Pair the score with its sentence for later ranking.
                scoredSentencesList.append([bleu_score, sentence])
            return scoredSentencesList

        except Exception as e:
            print("Error occured in score text", e)

    def sortText(self, scoredText):
        """
        Return the sentences of the 3 top-scored [score, sentence] pairs.

        :param scoredText: list of [score, sentence] pairs
        :return: up to 3 sentence strings, highest score first, or None
                 on error
        """
        try:
            ranked = sorted(scoredText, key=lambda pair: pair[0], reverse=True)
            return [sentence for _, sentence in ranked[:3]]
        except Exception as e:
            print("Error occured in sorting text", e)

    def sentenceMatch(self, keyword, paragraph):
        """
        Split a paragraph into sentences and return the best keyword matches.

        :param keyword: keyword phrase to match against
        :param paragraph: free-text paragraph to search
        :return: up to 3 best-matching sentences, or None on error
        """
        try:
            sentencesList = sent_tokenize(paragraph)
            scoredSentence = self.scoreText(keyword, sentencesList)
            return self.sortText(scoredSentence)
        except Exception as e:
            print("Error occured in sentence match", e)