lstm_toxic.py
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 22 00:19:44 2019
@author: tanma
"""
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from sklearn.metrics import roc_auc_score
import keras.backend as K
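# Swap in the CuDNN-accelerated recurrent layers when a GPU is available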
if len(K.tensorflow_backend._get_available_gpus()) > 0:
    from keras.layers import CuDNNLSTM as LSTM
    from keras.layers import CuDNNGRU as GRU
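# Preprocessing and training configuration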
MAX_SEQUENCE_LENGTH = 100
MAX_VOCAB_SIZE = 20000
EMBEDDING_DIM = 50
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128
EPOCHS = 5
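# Load pre-trained GloVe vectors into a word -> vector dictionary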
word2vec = {}
# Open in text mode so the keys are str and match the tokenizer's word index
with open(os.path.join('Glove Data/glove.6B.%sd.txt' % EMBEDDING_DIM), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
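# Load the training data; each comment has six binary toxicity labels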
train = pd.read_csv("Toxic Dataset/train.csv")
sentences = train["comment_text"].fillna("DUMMY_VALUE").values
possible_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
targets = train[possible_labels].values
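# Tokenize the comments and pad/truncate them to a fixed sequence length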
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
word2idx = tokenizer.word_index
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
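# Build the embedding matrix; words without a GloVe vector stay as all-zero rows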
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
    if i < MAX_VOCAB_SIZE:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
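# Embedding layer initialized with the GloVe weights and kept frozen during training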
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False
)
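# Bidirectional LSTM over the embedded sequence, max-pooled over time,
# with one sigmoid output per label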
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)
# x = LSTM(15, return_sequences=True)(x)
x = Bidirectional(LSTM(15, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
output = Dense(len(possible_labels), activation="sigmoid")(x)
model = Model(input_, output)
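# Binary cross-entropy treats each of the six labels as an independent binary prediction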
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr=0.01),
    metrics=['accuracy']
)
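# Train with 20% of the data held out for validation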
model.fit(
    data,
    targets,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT
)
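# Tokenize and pad a single raw comment, then return the per-label probabilities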
def custom_input(string):
    # texts_to_sequences expects a list of texts, so wrap the single comment
    input_seq = tokenizer.texts_to_sequences([string])
    input_seq = pad_sequences(input_seq, maxlen=MAX_SEQUENCE_LENGTH)
    predicted = model.predict(input_seq)[0]
    return predicted
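# Read a comment from stdin and print each label whose probability
# exceeds 20% of the summed probabilities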
take_input = str(input())
predictions = custom_input(take_input)
gt = sum(predictions)
for i, prob in enumerate(predictions):
    if prob > 0.2 * gt:
        print(possible_labels[i])