Upload 8 files
- app.py +24 -0
- data_analysis.py +15 -0
- data_cleaning.py +61 -0
- data_preparing.py +0 -0
- data_splitting.py +0 -0
- model.py +0 -0
- model_callbacks.py +13 -0
- predict.py +10 -0
app.py
ADDED
@@ -0,0 +1,24 @@
+from flask import Flask, request, render_template, jsonify
+from predict import predict_language
+import joblib
+import tensorflow as tf
+import h5py
+
+model = tf.keras.models.load_model('models\\full_language_identifcation_modelf.h5')
+model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
+CountVectorizer = joblib.load('models\\cv.joblib')
+LabelEncoder = joblib.load('models\\le.joblib')
+
+
+app = Flask(__name__)
+
+@app.route('/', methods=['GET', 'POST'])
+def predict():
+    if request.method == 'POST':
+        text = request.form['text']
+        prediction = predict_language(text, model, CountVectorizer, LabelEncoder)  # Call the prediction function
+        return render_template('result.html', prediction=prediction, text=text)
+    return render_template('index.html')
+
+if __name__ == '__main__':
+    app.run(debug=True)
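A minimal sketch of exercising the route above once the server is running locally. The form field name `text` comes from the handler in app.py; the sample sentence, the `requests` dependency, and the default port 5000 are assumptions.

```python
import requests  # assumes the requests package is installed

# POST a sample sentence to the locally running Flask app (Flask's default port).
response = requests.post(
    "http://127.0.0.1:5000/",
    data={"text": "Bonjour tout le monde"},  # hypothetical input sentence
)
print(response.status_code)   # 200 if the form was accepted
print(response.text[:200])    # start of the rendered result.html page
```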
data_analysis.py
ADDED
@@ -0,0 +1,15 @@
+import pandas as pd
+
+df = pd.read_csv('data\\dataset.csv')
+
+# df.head()
+
+# df.info()
+
+# df.isnull().sum()
+
+# df.language.value_counts()
+
+# df.text[0]
+
+# df.language[0]
data_cleaning.py
ADDED
@@ -0,0 +1,61 @@
+from data_analysis import df
+from nltk.tokenize import word_tokenize
+import re
+import pandas as pd
+import nltk
+
+# Removing duplicates
+# df = df.drop_duplicates(subset='Text')
+# df = df.reset_index(drop=True)
+
+nltk.download('punkt')
+# Initialize the set of non-alphanumeric characters to remove
+nonalphanumeric = ['\'', '.', ',', '\"', ':', ';', '!', '@', '#', '$', '%', '^', '&',
+                   '*', '(', ')', '-', '_', '+', '=', '[', ']', '{', '}', '\\', '?',
+                   '/', '>', '<', '|', ' ']
+
+def clean_text(text):
+    """
+    Function to clean and preprocess text data.
+    """
+    # Tokenize the text using NLTK's word tokenizer
+    tokens = word_tokenize(text)
+
+    # Remove standalone punctuation tokens and lowercase the rest
+    words = [word.lower() for word in tokens if word not in nonalphanumeric]
+
+    # Join the cleaned words back into a single string
+    cleaned_text = " ".join(words)
+
+    return cleaned_text
+
+def remove_english(text):
+    """
+    Function that takes text as input and returns the text without English words.
+    """
+    pat = "[a-zA-Z]+"
+    text = re.sub(pat, "", text)
+    return text
+
+
+# Applying the clean_text function to all rows in the 'Text' column
+# df['clean_text'] = df['Text'].apply(clean_text)
+
+
+
+# # Removing English from Chinese text
+# df_Chinese = df[df['language']=='Chinese']  # Chinese data in the dataset
+
+# clean_text = df.loc[df.language=='Chinese']['clean_text']
+# clean_text = clean_text.apply(remove_english)  # removing English words
+# df_Chinese.loc[:,'clean_text'] = clean_text
+
+# # Concatenate the original DataFrame with the cleaned Chinese text DataFrame
+# df = pd.concat([df, df_Chinese], axis=0, ignore_index=True)
+
+# # Drop rows with 'Chinese' language from the original DataFrame
+# df = df[~df['language'].isin(['Chinese'])].reset_index(drop=True)
+
+
+# # Shuffling the DataFrame and resetting the index
+# df = df.sample(frac=1).reset_index(drop=True)
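A small sketch of what the two helpers above do to a sample string. Importing from `data_cleaning` also triggers the `data_analysis` import, so `data\\dataset.csv` must exist; the mixed-language example sentence is made up.

```python
from data_cleaning import clean_text, remove_english

sample = "Hello, world! 你好，世界"           # made-up mixed-language input
print(clean_text(sample))                     # standalone ASCII punctuation dropped, tokens lowercased
print(remove_english(clean_text(sample)))     # Latin-script words stripped as well
```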
data_preparing.py
ADDED
File without changes
data_splitting.py
ADDED
File without changes
model.py
ADDED
File without changes
model_callbacks.py
ADDED
@@ -0,0 +1,13 @@
+import tensorflow as tf
+
+
+def lr_scheduler(epoch, lr):
+    if epoch < 3:
+        return lr
+    else:
+        return lr * tf.math.exp(-0.1)
+
+
+early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
+lr_scheduler_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)
+optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
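A sketch of wiring these callbacks and the optimizer into training. The real architecture and data splits belong in model.py and data_splitting.py, which are empty in this commit, so a tiny stand-in model and random data are used purely for illustration.

```python
import numpy as np
import tensorflow as tf
from model_callbacks import early_stopping, lr_scheduler_callback, optimizer

# Stand-in data: 100 samples, 20 features, 3 classes (all hypothetical).
X = np.random.rand(100, 20).astype("float32")
y = np.random.randint(0, 3, size=(100,))

# Stand-in model, only to show the callbacks being passed to fit().
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation="relu", input_shape=(20,)),
    tf.keras.layers.Dense(3, activation="softmax"),
])
model.compile(optimizer=optimizer,
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
model.fit(X, y,
          validation_split=0.2,
          epochs=5,
          callbacks=[early_stopping, lr_scheduler_callback])
```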
predict.py
ADDED
@@ -0,0 +1,10 @@
+from data_cleaning import clean_text
+import numpy as np
+
+
+def predict_language(text, model, cv, le):
+    cleaned_text = clean_text(text)
+    text_vectorized = cv.transform([cleaned_text])
+    prediction = model.predict(text_vectorized)
+    predicted_label = le.inverse_transform([np.argmax(prediction)])[0]  # Get the first element of the list
+    return predicted_label
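A sketch of calling `predict_language` directly, outside the Flask route. The model, vectorizer, and label-encoder paths mirror what app.py loads; the sample sentence is invented.

```python
import joblib
import tensorflow as tf
from predict import predict_language

# Same artifacts that app.py loads; paths assume the repo's models/ directory.
model = tf.keras.models.load_model('models\\full_language_identifcation_modelf.h5')
cv = joblib.load('models\\cv.joblib')
le = joblib.load('models\\le.joblib')

print(predict_language("Ceci est une phrase en français.", model, cv, le))
```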