diff --git a/nbval/plugin.py b/nbval/plugin.py
index 4200ec8..104e0ca 100644
--- a/nbval/plugin.py
+++ b/nbval/plugin.py
@@ -428,12 +428,22 @@ def compare_outputs(self, test, ref, skip_compare=None):
         test_keys = set(testing_outs.keys())
 
         if ref_keys - test_keys:
-            self.comparison_traceback.append(
-                cc.FAIL
-                + "Missing output fields from running code: %s"
-                % (ref_keys - test_keys)
-                + cc.ENDC
-            )
+            if ref_keys == {'evalue', 'ename'}:
+                self.comparison_traceback.append(
+                    cc.FAIL
+                    + "Expected error:\n %s: %r" % (
+                        '\n'.join(reference_outs['ename']),
+                        '\n'.join(reference_outs['evalue'])
+                    )
+                    + cc.ENDC
+                )
+            else:
+                self.comparison_traceback.append(
+                    cc.FAIL
+                    + "Missing output fields from running code: %s"
+                    % (ref_keys - test_keys)
+                    + cc.ENDC
+                )
             return False
         elif test_keys - ref_keys:
             self.comparison_traceback.append(
@@ -570,7 +580,7 @@ def runtest(self):
 
         # Poll the shell channel to get a message
         try:
-            self.parent.kernel.await_reply(msg_id, timeout=timeout)
+            kernel.await_reply(msg_id, timeout=timeout)
         except Empty:  # Timeout reached
             # Try to interrupt kernel, as this will give us traceback:
             kernel.interrupt()
@@ -582,6 +592,13 @@ def runtest(self):
         # TODO: Only store if comparing with nbdime, to save on memory usage
         self.test_outputs = outs
 
+        # Cells where the reference is not run, will not check outputs:
+        unrun = self.cell.execution_count is None
+        if unrun and self.cell.outputs:
+            self.raise_cell_error('Unrun reference cell has outputs')
+
+        cell_has_error = False
+
         # Now get the outputs from the iopub channel
         while True:
             # The iopub channel broadcasts a range of messages. We keep reading
@@ -692,6 +709,7 @@ def runtest(self):
             # cell execution. Therefore raise a cell error and pass the
             # traceback information.
             elif msg_type == 'error':
+                cell_has_error = True
                 # Store error in output first
                 out['ename'] = reply['ename']
                 out['evalue'] = reply['evalue']
@@ -700,9 +718,9 @@ def runtest(self):
                 if not self.options['check_exception']:
                     # Ensure we flush iopub before raising error
                     try:
-                        self.parent.kernel.await_idle(msg_id, self.output_timeout)
+                        kernel.await_idle(msg_id, self.output_timeout)
                     except Empty:
-                        self.stop()
+                        kernel.stop()
                         raise RuntimeError('Timed out waiting for idle kernel!')
                     traceback = '\n' + '\n'.join(reply['traceback'])
                     if out['ename'] == 'KeyboardInterrupt' and self.parent.timed_out:
@@ -718,10 +736,11 @@ def runtest(self):
 
         outs[:] = coalesce_streams(outs)
 
-        # Cells where the reference is not run, will not check outputs:
-        unrun = self.cell.execution_count is None
-        if unrun and self.cell.outputs:
-            self.raise_cell_error('Unrun reference cell has outputs')
+        if self.options['check_exception'] and unrun and not cell_has_error:
+            # If unrun, we cannot rely on output comparison for checking errors
+            self.raise_cell_error(
+                "Expected error",
+                "Expected cell to produce an error, but none was produced!")
 
         # Compare if the outputs have the same number of lines
         # and throw an error if it fails
diff --git a/tests/test_expected_exceptions.py b/tests/test_expected_exceptions.py
new file mode 100644
index 0000000..d949d3b
--- /dev/null
+++ b/tests/test_expected_exceptions.py
@@ -0,0 +1,89 @@
+import os
+
+import nbformat
+import pytest
+
+from utils import build_nb
+
+
+pytest_plugins = "pytester"
+
+
+def test_run_raises(testdir):
+    # This test uses the testdir fixture from pytester, which is useful for
+    # testing pytest plugins. It writes a notebook to a temporary dir
+    # and then runs pytest.
+
+    # Setup notebook to test:
+    sources = [
+        # In [1]:
+        "",  # No error produced, when one is expected
+        # In [2]:
+        "raise ValueError('foo')",  # Wrong ename
+        # In [3]:
+        "raise ValueError('foo')",  # Wrong evalue
+    ]
+    # Build notebook marked as run:
+    nb = build_nb(sources, mark_run=True)
+
+    nb.cells[0].metadata.tags = ['raises-exception']
+    nb.cells[0].outputs.append(
+        nbformat.v4.new_output(
+            'error',
+            ename='ValueError',
+            evalue='foo',
+            traceback=['foobar', 'bob'],  # Should be ignored
+        )
+    )
+
+    nb.cells[1].metadata.tags = ['raises-exception']
+    nb.cells[1].outputs.append(
+        nbformat.v4.new_output(
+            'error',
+            ename='TypeError',  # Expected TypeError, got ValueError
+            evalue='foo',
+            traceback=['foobar', 'bob'],  # Should be ignored
+        )
+    )
+
+    nb.cells[2].metadata.tags = ['raises-exception']
+    nb.cells[2].outputs.append(
+        nbformat.v4.new_output(
+            'error',
+            ename='ValueError',
+            evalue='bar',  # Expected bar, got foo
+            traceback=['foobar', 'bob'],  # Should be ignored
+        )
+    )
+
+    # Write notebook to test dir
+    nbformat.write(nb, os.path.join(
+        str(testdir.tmpdir), 'test_expected_exceptions.ipynb'))
+
+    # Run tests
+    result = testdir.runpytest_subprocess('--nbval', '--current-env', '-s')
+    result.assert_outcomes(failed=3)
+
+
+
+def test_unrun_raises(testdir):
+    # This test uses the testdir fixture from pytester, which is useful for
+    # testing pytest plugins. It writes a notebook to a temporary dir
+    # and then runs pytest.
+
+    # Setup notebook to test:
+    sources = [
+        # In [1]:
+        "pass",
+    ]
+    # Build unrun notebook:
+    nb = build_nb(sources, mark_run=False)
+    nb.cells[0].metadata.tags = ['raises-exception']
+
+    # Write notebook to test dir
+    nbformat.write(nb, os.path.join(
+        str(testdir.tmpdir), 'test_expected_exceptions.ipynb'))
+
+    # Run tests
+    result = testdir.runpytest_subprocess('--nbval', '--current-env', '-s')
+    result.assert_outcomes(failed=1)
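For context, the behaviour these tests exercise can be reproduced by hand with a small notebook built via nbformat: a run code cell tagged 'raises-exception' whose stored error output carries the expected ename/evalue, with the stored traceback ignored by the comparison. The sketch below mirrors the first test case; the notebook file name and the final pytest invocation are illustrative and not part of this diff.

    import nbformat

    # Minimal sketch: one cell that is expected to raise, with the reference
    # error stored in its outputs so nbval can compare ename/evalue.
    nb = nbformat.v4.new_notebook()
    cell = nbformat.v4.new_code_cell("raise ValueError('foo')", execution_count=1)
    cell.metadata.tags = ['raises-exception']   # tells nbval an error is expected here
    cell.outputs.append(nbformat.v4.new_output(
        'error',
        ename='ValueError',   # compared against the error raised when the cell is run
        evalue='foo',
        traceback=[],         # tracebacks are not compared
    ))
    nb.cells.append(cell)

    nbformat.write(nb, 'expected_exception_demo.ipynb')   # illustrative file name
    # Run with: pytest --nbval --current-env expected_exception_demo.ipynb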