Final: After testing various configurations, the final model is trained for 100 epochs with a learning rate of 0.00005 on the full dataset, producing an accuracy of 87.137%.
Tickloop committed May 17, 2022
1 parent 1cd523a commit 2f7a2d3
Showing 4 changed files with 43 additions and 18 deletions.
21 changes: 12 additions & 9 deletions main.py
@@ -22,9 +22,11 @@ def train(model, datasets, mask_tree, max_epochs, eta):
optimizer = Adam(model.parameters(), lr=eta)
loss_criterion = CrossEntropyLoss()
interactions = { ep : { word : {} for word, label in datasets['train'] } for ep in range(max_epochs) }

for epoch in range(max_epochs):
i = 0
model.train()

for correct_word, correct_word_labels in datasets['train']:
features = get_default_features()
i += 1
@@ -34,7 +36,7 @@ def train(model, datasets, mask_tree, max_epochs, eta):
optimizer.zero_grad()

outputs = model(features)
guessed_word = get_word_beam_search(outputs, mask_tree)
guessed_word = get_word_beam_search(outputs, mask_tree, k=3)

word_loss = loss_criterion(outputs, correct_word_labels)

@@ -53,25 +55,26 @@ def train(model, datasets, mask_tree, max_epochs, eta):

if guessed_word == correct_word:
break


model.eval()
val_acc[epoch], _ = accuracy(model, datasets['train'], mask_tree)
# val_loss[epoch] = avg_loss(model, datasets['val'], mask_tree)
print(f"Epoch {epoch} / {max_epochs}, loss => {losses[epoch]}, full_acc => {val_acc[epoch]}")
val_loss[epoch] = avg_loss(model, datasets['train'], mask_tree)
print(f"Epoch {epoch} / {max_epochs}, loss => {losses[epoch]}, val_acc => {val_acc[epoch]}, val_loss => {val_loss[epoch]}")

if val_acc[epoch] > max_val_acc:
save_model(model, "100epoch_bigger_train_beam_4")
save_model(model, "100epoch_bigger_full")
max_val_acc = val_acc[epoch]

return losses, interactions

if __name__ == "__main__":
splits = [1.0, 0.0, 0]
splits = [1.0, 0, 0]
mask_tree = get_mask_tree("data/official.txt")
dataset = get_dataset("data/official.txt")
datasets = get_split_dataset(dataset, splits)

b1 = BaseModel(in_features=26 * 12)
b1_loss, interaction_history = train(b1, datasets, mask_tree, max_epochs=100, eta=0.00005)

save_history(interaction_history, "interaction_history_17.json")
save_loss(b1_loss, "100epoch_bigger_train_beam_4.npy")
save_history(interaction_history, "final_interaction_history.json")
save_loss(b1_loss, "100epoch_bigger_full.npy")
2 changes: 0 additions & 2 deletions models.py
@@ -24,7 +24,6 @@ def __init__(self, in_features):

self.flatten = nn.Flatten(start_dim=0)
self.activation = nn.ReLU()
self.softmax = nn.Softmax(dim=0)

def forward(self, x):
output = self.flatten(x)
@@ -33,7 +32,6 @@ def forward(self, x):
outputs = torch.empty((5, 26))
for i, layer in enumerate(self.output_char_layers):
outputs[i] = layer(output)
# outputs[i] = self.softmax(outputs[i])

return outputs

3 changes: 1 addition & 2 deletions utils.py
@@ -252,7 +252,7 @@ def get_mask_tree(wordlist_path : str) -> dict:
return mask_tree


def get_word_beam_search(outputs : torch.Tensor, mask_tree : dict) -> str:
def get_word_beam_search(outputs : torch.Tensor, mask_tree : dict, k : int = 3) -> str:
"""
To convert the output of our model to a word that can be made sense of, we use this function.
Rather than taking the argmax independent of the underlying word distribution, we carry out a beam search to find the optimal
@@ -264,7 +264,6 @@ def get_word_beam_search(outputs : torch.Tensor, mask_tree : dict) -> str:
"""
# initialize
soft_outputs = torch.nn.functional.softmax(outputs, dim=1)
k = 3
mask = mask_tree[0]
mask = torch.tensor(mask)
mask = mask * soft_outputs[0]
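The hunk above only shows the first lines of the masked beam search, so here is a minimal sketch of the idea, not the repository's exact implementation. It assumes a hypothetical prefix_mask(prefix) helper that returns a length-26 0/1 tensor of letters allowed after a given prefix; in the real code that role is played by lookups into mask_tree, whose structure is not visible in this diff.

import torch

def beam_search_sketch(outputs: torch.Tensor, prefix_mask, k: int = 3) -> str:
    # outputs: (5, 26) logits, one row of letter scores per character position
    probs = torch.nn.functional.softmax(outputs, dim=1)
    beams = [("", 1.0)]  # (prefix, probability) pairs
    for pos in range(outputs.shape[0]):
        candidates = []
        for prefix, score in beams:
            # zero out letters the wordlist forbids after this prefix
            masked = probs[pos] * prefix_mask(prefix)
            top = torch.topk(masked, k)
            for p, idx in zip(top.values, top.indices):
                if p > 0:
                    candidates.append((prefix + chr(ord('a') + idx.item()), score * p.item()))
        # keep only the k highest-scoring prefixes before moving to the next position
        beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:k]
    return beams[0][0] if beams else ""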
35 changes: 30 additions & 5 deletions visualize.py
@@ -142,7 +142,7 @@ def accuracy_on_output(one_epoch_interaction : dict) -> float:
count += 1
return round(100. * acc / count, 3)

def accuracy_on_dataset(model_path : str, wordlist_path : str, dataset_name : str) -> tuple:
def accuracy_on_dataset(model_path : str, wordlist_path : str, dataset_name : str, k : int = 3) -> tuple:
"""
Given a model path, wordlist path, and the dataset name from {'train', 'test', 'val'},
finds the accuracy on the given dataset.
@@ -157,16 +157,19 @@ def accuracy_on_dataset(model_path : str, wordlist_path : str, dataset_name : st
`dataset_name`: The default split on the loaded wordlist will be [0.8, 0.05, 0.15] for
{'train', 'val', 'test'}. The dataset_name specifies which dataset to use to find this accuracy.
`k`: The number of words to track in beam search. Increasing this number makes search slower.
Return:
`results`: A dict, storing the attempts that the model made for each word in the specified dataset.
`accuracy`: A float multiplied by 100 to give % of accuracy
"""
splits = [1.0, 0, 0]
splits = [0.8, 0.05, 0]
dataset = get_dataset(wordlist_path)
datasets = get_split_dataset(dataset, splits)
mask_tree = get_mask_tree(wordlist_path)

model = torch.load(model_path)
model.eval()
acc, count = 0., 0.
results = {word : {} for word, label in datasets[dataset_name]}

@@ -179,7 +182,7 @@ def accuracy_on_dataset(model_path : str, wordlist_path : str, dataset_name : st

for attempt in range(6):
output = model(features)
guessed_word = get_word_beam_search(output, mask_tree)
guessed_word = get_word_beam_search(output, mask_tree, k)
feedback = get_feedback(guessed_word, correct_word)
features = get_updated_features(features, feedback, guessed_word)
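A quick usage sketch of the updated signature, with the beam width threaded through the new k parameter (the checkpoint path below is illustrative, not necessarily one produced by this commit):

results, acc = accuracy_on_dataset("models/100epoch_bigger_full", "data/official.txt", "train", k=3)
print(f"train accuracy: {acc}%")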

@@ -254,6 +257,20 @@ def show_guess_distribution(results : dict):
plt.show()
plt.close()

def k_variation_beam_search(model_name : str) -> None:
print(model_name)
ks = [1, 3, 5, 10]
results, acc = {}, {}
for k in ks:
results[k], acc[k] = accuracy_on_dataset(model_name, "data/official.txt", "train", k)

for k in ks:
print(f"Accuracy for k = {k}: {acc[k]}%")
show_guess_distribution(results[k])

print("")


def print_model_statistics(model_name : str) -> None:
"""
This is used to quickly see the statistics like interaction history and accuracy on different
@@ -283,7 +300,7 @@ def print_model_statistics(model_name : str) -> None:
print(f"Words guessed in vocab(val): {in_vocab['val']}%")
print(f"Words guessed in vocab(test): {in_vocab['test']}%")

show_guess_distribution(results['train'])
# show_guess_distribution(results['train'])
print("")

if __name__ == "__main__":
@@ -303,8 +320,16 @@ def print_model_statistics(model_name : str) -> None:
# print_model_statistics("models/100epoch_bigger_train_beam")
# print_model_statistics("models/100epoch_bigger_train_beam_2")
# print_model_statistics("models/100epoch_bigger_train_beam_3")
print_model_statistics("models/100epoch_bigger_train_beam_4")
# print_model_statistics("models/100epoch_bigger_train_beam_4")

# print_model_statistics("models/25epoch_biggest_train_beam")
# print_model_statistics("models/25epoch_biggest_train_beam_2")

# k_variation_beam_search("models/100epoch_bigger_train_beam_4")

# print_model_statistics("models/25epoch_bigger_train_beam_k1")
# print_model_statistics("models/25epoch_bigger_train_beam_k3")
# print_model_statistics("models/25epoch_bigger_train_beam_k5")
# print_model_statistics("models/25epoch_bigger_train_beam_k10")

print_model_statistics("models/100epoch_bigger_train")
