-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathintent_classifier.py
137 lines (108 loc) · 4.08 KB
/
intent_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import numpy as np
import pandas as pd
import tensorflow_datasets as tfds
import tensorflow as tf
from sklearn import preprocessing
from keras.utils import to_categorical
import matplotlib.pyplot as plt
from nlp_pipeline import get_nel
# Turn off tensorflow_datasets progress bars for this process.
tfds.disable_progress_bar()
# The saved intent classifier is loaded eagerly when this module is imported;
# get_model() hands out this same module-level instance.
model = tf.keras.models.load_model("models/model_intent_classify")
def load_data(filename):
    """Load the intent CSV, encode its labels and balance the two classes.

    Reads ``filename`` with pandas, label-encodes the ``intent`` column into
    a new integer ``label`` column, and randomly undersamples the larger
    class so that both classes end up with the same number of rows.

    Args:
        filename: Anything ``pd.read_csv`` accepts; the CSV must contain an
            ``intent`` column.

    Returns:
        The balanced DataFrame (label-0 rows first, then label-1 rows),
        including the added ``label`` column.

    Raises:
        ValueError: If ``intent`` does not hold exactly two distinct classes.
    """
    data = pd.read_csv(filename)
    label_encoder = preprocessing.LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['intent'])

    # value_counts() is ordered by frequency, not by label value, so its
    # first entry is the majority class whichever label that happens to be.
    # Take the target size from the smallest count instead of assuming that
    # label 0 is always the larger class.
    counts = data['label'].value_counts()
    if len(counts) != 2:
        raise ValueError(
            "expected exactly 2 intent classes, found %d" % len(counts))
    minority_size = int(counts.min())

    balanced_parts = []
    for label_value in (0, 1):
        rows = data[data['label'] == label_value]
        if len(rows) > minority_size:
            # Only the larger class is downsampled; the smaller one is kept
            # whole and in its original order.
            rows = rows.sample(minority_size)
        balanced_parts.append(rows)
    data = pd.concat(balanced_parts, axis=0)

    print("Total number of examples:", data.shape[0])
    print("Class counts:", data['label'].value_counts())
    return data
def preprocess_data(data, vocab_size=1000, max_length=200):
    """Split a labelled frame into batched datasets and fit a text encoder.

    Args:
        data: DataFrame with a ``parent`` column (the text) and a ``label``
            column; it is copied, not modified.
        vocab_size: ``max_tokens`` given to the TextVectorization layer.
        max_length: ``output_sequence_length`` given to the same layer.

    Returns:
        ``(train_dataset, test_dataset, encoder)``: an 80% shuffled and
        batched training set, the batched remainder as the test set, and a
        TextVectorization layer adapted on the training text only.
    """
    shuffle_buffer = 10000
    batch_size = 64

    def to_dataset(rows):
        # Pair each text with its label as one dataset element.
        return tf.data.Dataset.from_tensor_slices(
            (rows['parent'].values, rows['label'].values))

    frame = data.copy()
    # Fixed seed keeps the 80/20 split reproducible; the test rows are
    # whatever index labels were not drawn for training.
    train_rows = frame.sample(frac=0.8, random_state=42)
    test_rows = frame.drop(train_rows.index)

    train_dataset = to_dataset(train_rows)
    test_dataset = to_dataset(test_rows)

    # Only the training stream is shuffled; both are batched and prefetched.
    train_dataset = train_dataset.shuffle(shuffle_buffer)
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
    test_dataset = test_dataset.batch(batch_size)
    test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

    encoder = tf.keras.layers.TextVectorization(
        max_tokens=vocab_size,
        output_sequence_length=max_length,
    )
    # The vocabulary is learned from the training text alone (labels dropped).
    training_text = train_dataset.map(lambda text, label: text)
    encoder.adapt(training_text)

    return train_dataset, test_dataset, encoder
def build_model(encoder, vocab_size, embedding_dim=64, lstm_units=64):
    """Build the stacked bidirectional-LSTM intent classifier.

    Args:
        encoder: Adapted text-vectorization layer placed first in the model;
            its vocabulary length sets the embedding's input dimension.
        vocab_size: Not used by the model (the embedding is sized from the
            encoder's actual vocabulary); kept so existing calls keep working.
        embedding_dim: Output dimension of the embedding layer.
        lstm_units: Units of the first LSTM; the second LSTM uses half as
            many (at least 1).

    Returns:
        An uncompiled ``tf.keras.Sequential`` ending in a single
        un-activated unit, i.e. it outputs one raw logit per example.
    """
    # Previously embedding_dim and lstm_units were accepted but silently
    # ignored in favour of hard-coded 64/64/32; the defaults reproduce that
    # exact architecture.
    second_lstm_units = max(1, lstm_units // 2)
    network = tf.keras.Sequential([
        encoder,
        tf.keras.layers.Embedding(
            len(encoder.get_vocabulary()), embedding_dim, mask_zero=True),
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(lstm_units, return_sequences=True)),
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(second_lstm_units)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1)
    ])
    network.summary()
    return network
def train_model(model, train_dataset, test_dataset, epochs=5,
                save_path='/models/model_intent_classifier',
                validation_steps=30):
    """Compile, fit and save the intent classifier.

    Args:
        model: Keras model to train; it is compiled here with a
            from-logits binary cross-entropy loss, so it should output raw
            scores rather than probabilities.
        train_dataset: Batched dataset used for fitting.
        test_dataset: Batched dataset used as validation data.
        epochs: Number of training epochs.
        save_path: Where the trained model is written.
        validation_steps: Number of validation batches evaluated per epoch;
            pass ``None`` to validate on the whole ``test_dataset``.

    Returns:
        ``(history, model)``: the object returned by ``model.fit`` and the
        trained model.
    """
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(1e-4),
        metrics=['accuracy']
    )

    history = model.fit(
        train_dataset,
        epochs=epochs,
        validation_data=test_dataset,
        validation_steps=validation_steps
    )

    # NOTE(review): the default save path is absolute and named
    # "model_intent_classifier", while this module loads its model from the
    # relative path "models/model_intent_classify" - confirm which location
    # is intended; pass save_path explicitly to write where the loader reads.
    model.save(save_path)
    return history, model
def predict_text(model, encoder, text):
    """Classify one piece of text as ``'chitchat'`` or ``'reddit'``.

    Args:
        model: Object whose ``predict`` accepts a one-element array of
            strings and returns the raw (pre-sigmoid) score at ``[0][0]``.
        encoder: Unused; kept so existing callers do not break.
        text: The text to classify.

    Returns:
        A dict with ``'intent'`` (the predicted label) and ``'confidence'``
        (the sigmoid of the raw score as a plain float rounded to 3
        decimals; values below 0.5 correspond to ``'chitchat'``).
    """
    import math

    prediction = model.predict(np.array([text]))
    logit = float(prediction[0][0])

    # Sigmoid written so that exp() is only ever called on a non-positive
    # argument, which cannot overflow for extreme scores.
    if logit >= 0:
        proba = 1.0 / (1.0 + math.exp(-logit))
    else:
        exp_logit = math.exp(logit)
        proba = exp_logit / (1.0 + exp_logit)

    # A sigmoid output lies in [0, 1], so comparing it with 0 could never
    # select 'chitchat'; the decision boundary is 0.5 (raw score 0).
    label = 'chitchat' if proba < 0.5 else 'reddit'
    return {'intent': label, 'confidence': round(proba, 3)}
def plot_history(history):
    """Plot training vs. validation accuracy and loss side by side.

    Draws onto a new 16x8 pyplot figure: accuracy in the left panel, loss in
    the right one. The figure is neither shown nor returned; call
    ``plt.show()`` or ``plt.savefig(...)`` afterwards as needed.

    Args:
        history: Object returned by ``model.fit`` whose ``history`` dict
            holds the ``accuracy``, ``val_accuracy``, ``loss`` and
            ``val_loss`` series.
    """
    plt.figure(figsize=(16, 8))
    # The 1x2 grid previously had only its left (accuracy) panel filled in;
    # the loss curves go in the right panel.
    for position, metric in enumerate(('accuracy', 'loss'), start=1):
        validation_metric = 'val_' + metric
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric])
        plt.plot(history.history[validation_metric])
        plt.xlabel('epoch')
        plt.ylabel(metric)
        plt.legend([metric, validation_metric])
def get_model():
    """Return the shared intent classifier, loading it if it is unset.

    Reads and, when needed, rebinds the module-level ``model``. The module
    already assigns ``model`` at import time, so the load below only runs if
    that global has since been reset to ``None``.

    Returns:
        The module-level model instance.
    """
    global model
    # Identity check: `== None` would defer to the object's own __eq__,
    # which is not guaranteed to return a plain bool or the expected answer.
    if model is None:
        model = tf.keras.models.load_model("models/model_intent_classify")
    return model
def classify(prompt, topics):
    """Label a prompt as ``"chitchat"`` or ``"topic"``.

    Args:
        prompt: Text handed to the shared model as a one-element batch.
        topics: Compared against ``0``; any value that is not equal to 0
            forces the ``"topic"`` result.

    Returns:
        ``"chitchat"`` only when the model's raw score is negative and
        ``topics`` equals 0; ``"topic"`` in every other case.
    """
    classifier = get_model()
    scores = classifier.predict(np.array([prompt]))
    print("predict score", scores[0][0])

    has_topics = True if topics != 0 else False
    is_chitchat = scores[0][0] < 0 and not has_topics
    return "chitchat" if is_chitchat else "topic"