Upload 8 files
- app.py +24 -0
- data_analysis.py +15 -0
- data_cleaning.py +61 -0
- data_preparing.py +0 -0
- data_splitting.py +0 -0
- model.py +0 -0
- model_callbacks.py +13 -0
- predict.py +10 -0
app.py
ADDED
@@ -0,0 +1,24 @@
+from flask import Flask, request, render_template, jsonify
+from predict import predict_language
+import joblib
+import tensorflow as tf
+import h5py
+
+model = tf.keras.models.load_model('models\\full_language_identifcation_modelf.h5')
+model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
+CountVectorizer = joblib.load('models\\cv.joblib')
+LabelEncoder = joblib.load('models\\le.joblib')
+
+
+app = Flask(__name__)
+
+@app.route('/', methods=['GET', 'POST'])
+def predict():
+    if request.method == 'POST':
+        text = request.form['text']
+        prediction = predict_language(text, model, CountVectorizer, LabelEncoder)  # Call the prediction function
+        return render_template('result.html', prediction=prediction, text=text)
+    return render_template('index.html')
+
+if __name__ == '__main__':
+    app.run(debug=True)
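A minimal sketch of exercising the route above once the server is running locally. The form field name `text` comes from the handler in app.py; the sample sentence, the `requests` dependency, and the default port 5000 are assumptions.

```python
import requests  # assumes the requests package is installed

# POST a sample sentence to the locally running Flask app (Flask's default port).
response = requests.post(
    "http://127.0.0.1:5000/",
    data={"text": "Bonjour tout le monde"},  # hypothetical input sentence
)
print(response.status_code)   # 200 if the form was accepted
print(response.text[:200])    # start of the rendered result.html page
```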
data_analysis.py
ADDED
@@ -0,0 +1,15 @@
+import pandas as pd
+
+df = pd.read_csv('data\\dataset.csv')
+
+# df.head()
+
+# df.info()
+
+# df.isnull().sum()
+
+# df.language.value_counts()
+
+# df.text[0]
+
+# df.language[0]
data_cleaning.py
ADDED
@@ -0,0 +1,61 @@
+from data_analysis import df
+from nltk.tokenize import word_tokenize
+import re
+import pandas as pd
+import nltk
+
+# Removing duplicates
+# df = df.drop_duplicates(subset='Text')
+# df = df.reset_index(drop=True)
+
+nltk.download('punkt')
+# Initialize the set of non-alphanumeric characters to remove
+nonalphanumeric = ['\'', '.', ',', '\"', ':', ';', '!', '@', '#', '$', '%', '^', '&',
+                   '*', '(', ')', '-', '_', '+', '=', '[', ']', '{', '}', '\\', '?',
+                   '/', '>', '<', '|', ' ']
+
+def clean_text(text):
+    """
+    Function to clean and preprocess text data.
+    """
+    # Tokenize the text using NLTK's word tokenizer
+    tokens = word_tokenize(text)
+
+    # Remove standalone punctuation tokens and lowercase the rest
+    words = [word.lower() for word in tokens if word not in nonalphanumeric]
+
+    # Join the cleaned words back into a single string
+    cleaned_text = " ".join(words)
+
+    return cleaned_text
+
+def remove_english(text):
+    """
+    Function that takes text as input and returns the text without English words.
+    """
+    pat = "[a-zA-Z]+"
+    text = re.sub(pat, "", text)
+    return text
+
+
+# Applying the clean_text function to all rows in the 'Text' column
+# df['clean_text'] = df['Text'].apply(clean_text)
+
+
+
+# # Removing English from Chinese text
+# df_Chinese = df[df['language']=='Chinese']  # Chinese data in the dataset
+
+# clean_text = df.loc[df.language=='Chinese']['clean_text']
+# clean_text = clean_text.apply(remove_english)  # removing English words
+# df_Chinese.loc[:,'clean_text'] = clean_text
+
+# # Concatenate the original DataFrame with the cleaned Chinese text DataFrame
+# df = pd.concat([df, df_Chinese], axis=0, ignore_index=True)
+
+# # Drop rows with 'Chinese' language from the original DataFrame
+# df = df[~df['language'].isin(['Chinese'])].reset_index(drop=True)
+
+
+# # Shuffling the DataFrame and resetting the index
+# df = df.sample(frac=1).reset_index(drop=True)
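A small sketch of what the two helpers above do to a sample string. Importing from `data_cleaning` also triggers the `data_analysis` import, so `data\\dataset.csv` must exist; the mixed-language example sentence is made up.

```python
from data_cleaning import clean_text, remove_english

sample = "Hello, world! 你好，世界"           # made-up mixed-language input
print(clean_text(sample))                     # standalone ASCII punctuation dropped, tokens lowercased
print(remove_english(clean_text(sample)))     # Latin-script words stripped as well
```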
data_preparing.py
ADDED
File without changes
data_splitting.py
ADDED
File without changes
model.py
ADDED
File without changes
model_callbacks.py
ADDED
@@ -0,0 +1,13 @@
+import tensorflow as tf
+
+
+def lr_scheduler(epoch, lr):
+    if epoch < 3:
+        return lr
+    else:
+        return lr * tf.math.exp(-0.1)
+
+
+early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
+lr_scheduler_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)
+optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
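A sketch of wiring these callbacks and the optimizer into training. The real architecture and data splits belong in model.py and data_splitting.py, which are empty in this commit, so a tiny stand-in model and random data are used purely for illustration.

```python
import numpy as np
import tensorflow as tf
from model_callbacks import early_stopping, lr_scheduler_callback, optimizer

# Stand-in data: 100 samples, 20 features, 3 classes (all hypothetical).
X = np.random.rand(100, 20).astype("float32")
y = np.random.randint(0, 3, size=(100,))

# Stand-in model, only to show the callbacks being passed to fit().
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation="relu", input_shape=(20,)),
    tf.keras.layers.Dense(3, activation="softmax"),
])
model.compile(optimizer=optimizer,
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
model.fit(X, y,
          validation_split=0.2,
          epochs=5,
          callbacks=[early_stopping, lr_scheduler_callback])
```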
predict.py
ADDED
@@ -0,0 +1,10 @@
+from data_cleaning import clean_text
+import numpy as np
+
+
+def predict_language(text, model, cv, le):
+    cleaned_text = clean_text(text)
+    text_vectorized = cv.transform([cleaned_text])
+    prediction = model.predict(text_vectorized)
+    predicted_label = le.inverse_transform([np.argmax(prediction)])[0]  # Get the first element of the list
+    return predicted_label
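A sketch of calling `predict_language` directly, outside the Flask route. The model, vectorizer, and label-encoder paths mirror what app.py loads; the sample sentence is invented.

```python
import joblib
import tensorflow as tf
from predict import predict_language

# Same artifacts that app.py loads; paths assume the repo's models/ directory.
model = tf.keras.models.load_model('models\\full_language_identifcation_modelf.h5')
cv = joblib.load('models\\cv.joblib')
le = joblib.load('models\\le.joblib')

print(predict_language("Ceci est une phrase en français.", model, cv, le))
```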