I used TensorFlow in Python to create a model that classifies whether a review is positive or negative. I started by loading previously created reviews, then I created a text file and added my own reviews: I went on Instagram to find some reviews of College Board's AP tests and added them to the text file. The closer the first number in the array was to 0, the more positive the review was, while the closer it was to 1, the more negative the review was. The 2nd number in the array was the confidence of the model: the closer that number was to one, the higher the confidence of the model.
from tensorflow import keras
import numpy as np
# Load the IMDB review dataset that ships with Keras, keeping only the
# 10,000 most frequent words.
data = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = data.load_data(
    num_words=10000
)

# Word -> id lookup. Every id is shifted up by three so that ids 0-3 are
# free for the special tokens registered right after.
word_index = data.get_word_index()
word_index = {word: rank + 3 for word, rank in word_index.items()}
word_index.update(
    {
        "<PAD>": 0,  # padding
        "<START>": 1,  # start of a review
        "<UNK>": 2,  # unknown word
        "<UNUSED>": 3,  # unused
    }
)

# Inverse lookup (id -> word), used to turn encoded reviews back into text.
reverse_word_index = {idx: word for word, idx in word_index.items()}

# Force every review to exactly 250 ids: longer reviews are cut down to
# maxlen and shorter ones are filled at the end with the <PAD> id.
train_data = keras.preprocessing.sequence.pad_sequences(
    train_data,
    maxlen=250,
    padding="post",
    value=word_index["<PAD>"],
)
test_data = keras.preprocessing.sequence.pad_sequences(
    test_data,
    maxlen=250,
    padding="post",
    value=word_index["<PAD>"],
)
def decode_review(text):
    """Turn a sequence of word ids back into a readable string.

    Each id in ``text`` is looked up in ``reverse_word_index``; ids that
    have no entry are rendered as "?". The words are joined with single
    spaces.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, "?"))
    return " ".join(words)
# Model: embedding -> average pooling -> small dense layer -> sigmoid.
# NOTE(review): the embedding table has 88000 rows although the dataset is
# loaded with num_words=10000; the extra rows are never seen in training.
model = keras.Sequential(
    [
        # Learns a 16-dimensional vector for every word id.
        keras.layers.Embedding(88000, 16),
        # Averages the word vectors of a review into one 16-d vector.
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(16, activation="relu"),
        # Single sigmoid unit: one value between 0 and 1 per review.
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model.summary()
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Hold out the first 10,000 training reviews for validation and train on
# the remainder.
x_val, x_train = train_data[:10000], train_data[10000:]
y_val, y_train = train_labels[:10000], train_labels[10000:]

fitModel = model.fit(
    x_train,
    y_train,
    epochs=40,
    batch_size=512,  # number of reviews fed to the model per step
    validation_data=(x_val, y_val),
    verbose=1,
)

# Score the trained model on the untouched test split, then persist it.
results = model.evaluate(test_data, test_labels)
print(results)
model.save("model.h5")
def review_encode(s, vocab=None, num_words=10000):
    """Encode a tokenised review as the integer ids the model expects.

    The result always starts with the <START> id (1). Each word is
    lower-cased and looked up in ``vocab``; words that are missing are
    encoded as <UNK> (2).

    The IMDB data in this file is loaded with ``num_words=10000``, so the
    model only ever trained on ids below 10000. The full word index holds
    far more words than that, so ids at or above ``num_words`` are also
    mapped to <UNK> instead of being passed through to the model.

    Args:
        s: Iterable of word strings (already split, punctuation removed).
        vocab: Mapping from lower-case word to integer id. Defaults to the
            module-level ``word_index``.
        num_words: Vocabulary size used at training time; ids greater than
            or equal to it become <UNK>. Pass ``None`` to disable the cap.

    Returns:
        A list of integer ids, one per word, preceded by the <START> id.
    """
    if vocab is None:
        vocab = word_index

    encoded = [1]  # <START>
    for word in s:
        token_id = vocab.get(word.lower(), 2)
        if num_words is not None and token_id >= num_words:
            token_id = 2  # too rare to have been seen in training -> <UNK>
        encoded.append(token_id)
    return encoded
# Reload the saved model and score every review in test.txt (one per line).
model = keras.models.load_model("model.h5")

# Characters removed from a review before its words are looked up. Both
# parentheses are listed so that ")" is stripped as well as "(".
_PUNCTUATION_TABLE = str.maketrans("", "", ",.():\"")

with open("test.txt", encoding="utf-8") as f:
    for line in f:
        # split() with no argument discards leading/trailing whitespace and
        # collapses runs of spaces, so no empty "words" reach the encoder.
        nline = line.translate(_PUNCTUATION_TABLE).split()
        if not nline:
            continue  # blank line: nothing to classify
        encode = review_encode(nline)
        # Pad (or cut) to the same fixed length of 250 used for training.
        encode = keras.preprocessing.sequence.pad_sequences(
            [encode], value=word_index["<PAD>"], padding="post", maxlen=250
        )
        predict = model.predict(encode)
        print(line)
        print(encode)
        print(predict[0])  # sigmoid output of the model for this review