# Part 1: word-level text generation with a Keras Embedding + LSTM/GRU stack
import os
import re

import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, GRU, Dropout, Layer, MultiHeadAttention, LayerNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Word2Vec settings
max_worker = os.cpu_count()
vector_size = 1000
window_size = 500
min_count = 1

# Sequence settings: model context window and generated sentence length
context_length = 4096
sentence_length = 5

# Transformer encoder settings
embed_dim = 128
num_heads = 80
feed_forward_dim = 512
dropout_rate_transformer = 0.1
epsilon = 1e-6

# Recurrent (LSTM/GRU) stack settings
lstm_units = [16000, 16000, 16000, 16000, 8000, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096]
gru_units = [2048, 2048, 2048, 2048, 1024, 1024, 512, 512, 512, 512, 512, 512, 512, 512, 256, 256, 256, 256, 128, 64, 32, 16, 8, 4, 2, 1]
dropout_rate_rnn = 0.2
return_sequences = True

# Embedding layer settings
input_dim = 10000
output_dim = 1000
input_length = context_length

# Output layer settings
dense_units = input_dim
activation = "softmax"

# Training settings
loss = "sparse_categorical_crossentropy"
optimizer = "adam"
metrics = ["accuracy"]
epochs = 60
batch_size = 64

class TransformerEncoder(Layer):
    """Standard Transformer encoder block: self-attention and a feed-forward network, each with a residual connection and layer normalization."""

    def __init__(self, embed_dim, num_heads, feed_forward_dim, dropout_rate=0.1):
        super(TransformerEncoder, self).__init__()
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dropout1 = Dropout(dropout_rate)
        self.norm1 = LayerNormalization(epsilon=epsilon)
        self.dense1 = Dense(feed_forward_dim, activation="relu")
        self.dense2 = Dense(embed_dim)
        self.dropout2 = Dropout(dropout_rate)
        self.norm2 = LayerNormalization(epsilon=epsilon)

    def call(self, inputs, training=None):
        # Self-attention sub-layer with residual connection
        attention_output = self.attention(inputs, inputs)
        attention_output = self.dropout1(attention_output, training=training)
        out1 = self.norm1(inputs + attention_output)

        # Position-wise feed-forward sub-layer with residual connection
        dense_output = self.dense1(out1)
        dense_output = self.dense2(dense_output)
        dense_output = self.dropout2(dense_output, training=training)
        return self.norm2(out1 + dense_output)

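# Note: TransformerEncoder above is defined but never added to the Sequential model built in
# train_model(). A possible (untested) way to wire it in would be to place it right after the
# Embedding layer, with the embedding's output_dim matched to embed_dim, e.g.:
#
#     Embedding(input_dim=input_dim, output_dim=embed_dim, input_length=input_length),
#     TransformerEncoder(embed_dim, num_heads, feed_forward_dim, dropout_rate_transformer),
#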
def train_model(X, y, tokenizer):
    nn_model = Sequential([
        # Token embedding
        Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length),

        # Stacked LSTM layers
        *[LSTM(units, return_sequences=True, dropout=dropout_rate_rnn) for units in lstm_units[:-1]],
        LSTM(lstm_units[-1], return_sequences=True, dropout=dropout_rate_rnn),

        # Stacked GRU layers; the last one collapses the sequence into a single vector
        *[GRU(units, return_sequences=True, dropout=dropout_rate_rnn) for units in gru_units[:-1]],
        GRU(gru_units[-1], return_sequences=False, dropout=dropout_rate_rnn),

        # Softmax over the vocabulary
        Dense(dense_units, activation=activation)
    ])

    nn_model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    print("Training the model...")
    nn_model.fit(X, y, epochs=epochs, batch_size=batch_size)
    return nn_model

def generate_sentence(model, tokenizer, start_word, sentence_length, temperature=1.0):
    sentence = [start_word]
    for _ in range(sentence_length - 1):
        # Encode the sentence so far and pad it to the model's context length
        sequence = tokenizer.texts_to_sequences([' '.join(sentence)])
        sequence = pad_sequences(sequence, maxlen=context_length, padding='post')
        predicted_probs = model.predict(sequence)[0]

        # Temperature sampling: rescale the log-probabilities and renormalize
        predicted_probs = np.asarray(predicted_probs).astype('float64')
        predicted_probs = np.log(predicted_probs + 1e-10) / temperature
        predicted_probs = np.exp(predicted_probs) / np.sum(np.exp(predicted_probs))

        # Sample the next token index and map it back to a word
        predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)
        next_word = tokenizer.index_word.get(predicted_index, '')

        if not next_word:
            break
        sentence.append(next_word)

    return ' '.join(sentence)

file_path = input("Enter the file path of the dataset: ")
try:
    with open(file_path, "r", encoding="utf-8") as f:
        dataset = f.readlines()
except FileNotFoundError:
    print("File not found!")
    exit()

# Word-level tokenization of the raw lines
tokenized_sentences = [re.findall(r'\b\w+\b', sentence.lower()) for sentence in dataset]

# Word2Vec embeddings are trained here but are not wired into the Keras model below
word2vec_model = Word2Vec(tokenized_sentences, vector_size=vector_size, window=window_size, min_count=min_count, workers=max_worker)

# Integer-encode the sentences for the Keras model
tokenizer = Tokenizer()
tokenizer.fit_on_texts([' '.join(sentence) for sentence in tokenized_sentences])
sequences = tokenizer.texts_to_sequences([' '.join(sentence) for sentence in tokenized_sentences])
X = pad_sequences(sequences, maxlen=context_length, padding='post')
# Target = last token of each sequence (note: this token also remains inside the padded input X)
y = np.array([seq[-1] if len(seq) > 0 else 0 for seq in sequences])

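# Added sanity check (not in the original script): the softmax layer has input_dim (= 10000)
# classes and the Embedding only accepts indices below input_dim, while X and y hold raw
# Tokenizer indices. Warn early if the fitted vocabulary is larger than the model can represent.
vocab_size = len(tokenizer.word_index) + 1
if vocab_size > input_dim:
    print(f"Warning: tokenizer vocabulary ({vocab_size}) exceeds input_dim ({input_dim}); "
          f"token indices outside that range will break training.")
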
model = train_model(X, y, tokenizer)

start_word = input("Enter a starting word: ")
print("\nGenerated sentence:", generate_sentence(model, tokenizer, start_word, sentence_length, temperature=1.0))

# Part 2: PyTorch sentiment classifier ("thought engine")
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Toy sentiment dataset: (sentence, label) pairs with 1 = positive, 0 = negative
data = [
    ("Bu harika bir gün", 1),           # "This is a wonderful day"
    ("Bugün çok mutluyum", 1),          # "I am very happy today"
    ("Kötü bir deneyim yaşadım", 0),    # "I had a bad experience"
    ("Berbat bir gündü", 0),            # "It was a terrible day"
    ("Güzel bir film izledim", 1),      # "I watched a nice film"
    ("Bugün hiç iyi hissetmiyorum", 0)  # "I don't feel good at all today"
]

# Build the vocabulary from the toy dataset; 0 is reserved for padding, 1 for unknown words
word_to_index = {"<PAD>": 0, "<UNK>": 1}
for sentence, _ in data:
    for word in sentence.split():
        if word not in word_to_index:
            word_to_index[word] = len(word_to_index)

VOCAB_SIZE = len(word_to_index)


def tokenize(sentence):
    return [word_to_index.get(word, word_to_index["<UNK>"]) for word in sentence.split()]

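# Example (illustrative, using the vocabulary built above from the toy dataset):
#   tokenize("Bu harika bir gün") -> [2, 3, 4, 5]; unseen words map to word_to_index["<UNK>"] == 1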
class TextDataset(Dataset):
    """Wraps the (sentence, label) pairs as (token id tensor, float label) examples."""

    def __init__(self, data):
        self.data = [(torch.tensor(tokenize(sentence), dtype=torch.long), label) for sentence, label in data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx][0], torch.tensor(self.data[idx][1], dtype=torch.float)

dataset = TextDataset(data)
# collate_fn pads the variable-length token sequences in each batch and stacks the labels
dataloader = DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=lambda batch: (
        torch.nn.utils.rnn.pad_sequence([item[0] for item in batch], batch_first=True),
        torch.stack([item[1] for item in batch])
    )
)

class DusunceMotoru(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, hidden_dim, output_dim):
        super(DusunceMotoru, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # batch_first=True so the encoder accepts (batch, seq, embed) tensors,
        # matching the output of nn.Embedding with the batch-first DataLoader above
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True),
            num_layers=num_layers
        )

        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.gru = nn.GRU(hidden_dim, hidden_dim, num_layers, batch_first=True)

        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        x, _ = self.lstm(x)
        x, _ = self.gru(x)
        # Classify from the final time step
        x = self.fc(x[:, -1, :])
        return torch.sigmoid(x)

# A distinct variable name is used here so the Keras text-generation model above
# (still referenced as `model` at the end of the script) is not overwritten.
sentiment_model = DusunceMotoru(VOCAB_SIZE, embed_dim=128, num_heads=8, num_layers=2, hidden_dim=256, output_dim=1)

optimizer = optim.Adam(sentiment_model.parameters(), lr=0.001)
loss_fn = nn.BCELoss()

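# Added sanity check (not in the original script): confirm the classifier maps a padded
# batch of token ids to one sigmoid score per sentence before training starts.
_dummy_batch = torch.zeros((2, 5), dtype=torch.long)  # two sentences of length 5, all <PAD>
assert sentiment_model(_dummy_batch).shape == (2, 1)
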
EPOCHS = 100
for epoch in range(EPOCHS):
    total_loss = 0
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = sentiment_model(inputs).squeeze()
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss:.4f}")

def tahmin_yap(sentence):
    sentiment_model.eval()
    tokens = tokenize(sentence)
    input_tensor = torch.tensor([tokens], dtype=torch.long)
    with torch.no_grad():
        output = sentiment_model(input_tensor)
    return "Positive" if output.item() > 0.5 else "Negative"

test_sentence = start_word
düşünme_sonucu = f"Sentence: {test_sentence} -> Prediction: {tahmin_yap(test_sentence)}"
print("\nGenerated sentence:", generate_sentence(model, tokenizer, düşünme_sonucu, sentence_length, temperature=1.0))