Skip to content

Commit

Permalink
Merge pull request broadinstitute#5042 from broadinstitute/ks_deep_yaml
Browse files Browse the repository at this point in the history
Deep Yaml DOS checks BA-5722
  • Loading branch information
kshakir authored Jun 24, 2019
2 parents d78d5ad + 3856852 commit 0f1f434
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 18 deletions.
6 changes: 3 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ More information can be found in the [documentation on localization](https://cro

#### Yaml node limits

Yaml parsing now checks for cycles, and limits the maximum number of parsed nodes to a configurable value. See
[the documentation on configuring Yaml](https://cromwell.readthedocs.io/en/stable/Configuring/#yaml) for more
information.
YAML parsing now checks for cycles, and limits the maximum number of parsed nodes to a configurable value. It also
limits the nesting depth of sequences and mappings. See [the documentation on configuring
YAML](https://cromwell.readthedocs.io/en/stable/Configuring/#yaml) for more information.

### API Changes

Expand Down
18 changes: 18 additions & 0 deletions docs/Configuring.md
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,8 @@ The default threshold value is 100, just like the default for the heartbeat batc

### YAML

**Maximum number of nodes**

Cromwell will throw an error when detecting cyclic loops in YAML inputs. However, one can craft small acyclic YAML
documents that consume significant amounts of memory or CPU. To limit the amount of processing during parsing, there is
a limit on the number of nodes parsed per YAML document.
Expand All @@ -539,3 +541,19 @@ yaml {
```

The default limit is 1,000,000 nodes.

**Maximum nesting depth**

There is a limit on the maximum depth of nested YAML. If you decide to increase this value, you will likely also need
to increase the Java Virtual Machine's thread stack size using
[either `-Xss` or `-XX:ThreadStackSize`](https://docs.oracle.com/javase/8/docs/technotes/tools/unix/java.html).

This limit may be configured via the configuration value:

```hocon
yaml {
max-depth = 1000
}
```

The default limit is a maximum nesting depth of 1,000.
2 changes: 2 additions & 0 deletions wom/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
yaml {
# The maximum total number of nodes (scalars + sequences + mappings) parsed per YAML document.
# Parsing fails once this count is exceeded, to guard against billion-laughs-style documents.
max-nodes = 1000000
# The maximum nesting depth of sequences and mappings parsed per YAML document.
# Raising this may also require raising the JVM thread stack size (-Xss / -XX:ThreadStackSize).
max-depth = 1000
}
86 changes: 71 additions & 15 deletions wom/src/main/scala/wom/util/YamlUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -10,36 +10,37 @@ import eu.timepit.refined.refineV
import io.circe.{Json, ParsingFailure}
import net.ceedubs.ficus.Ficus._
import net.ceedubs.ficus.readers.ValueReader
import org.yaml.snakeyaml.composer.Composer
import org.yaml.snakeyaml.constructor.Constructor
import org.yaml.snakeyaml.nodes.Node
import org.yaml.snakeyaml.parser.ParserImpl
import org.yaml.snakeyaml.reader.StreamReader
import org.yaml.snakeyaml.resolver.Resolver

import scala.collection.JavaConverters._

object YamlUtils {

private[util] implicit val refinedNonNegativeReader: ValueReader[Int Refined NonNegative] = {
(config: Config, path: String) => {
val int = config.getInt(path)
refineV[NonNegative](int) match {
case Left(error) => throw new BadValue(path, error)
case Right(refinedInt) => refinedInt
}
}
}

private val defaultMaxNodes = ConfigFactory.load().as[Int Refined NonNegative]("yaml.max-nodes")

/**
* Parses the yaml, detecting loops, and a maximum number of nodes.
*
* See: https://en.wikipedia.org/wiki/Billion_laughs_attack
*
* @param yaml The yaml string.
* @param maxNodes Maximum number of yaml nodes to parse.
* @param maxDepth Maximum nested depth of yaml to parse.
* @return The parsed yaml.
*/
def parse(yaml: String, maxNodes: Int Refined NonNegative = defaultMaxNodes): Either[ParsingFailure, Json] = {
def parse(yaml: String,
maxNodes: Int Refined NonNegative = defaultMaxNodes,
maxDepth: Int Refined NonNegative = defaultMaxDepth
): Either[ParsingFailure, Json] = {
try {
val snakeYamlParser = new org.yaml.snakeyaml.Yaml()
val parsed = snakeYamlParser.load[AnyRef](new StringReader(yaml))
val yamlConstructor = new Constructor()
val yamlComposer = new MaxDepthComposer(yaml, maxDepth)
yamlConstructor.setComposer(yamlComposer)
val parsed = yamlConstructor.getSingleData(classOf[AnyRef])

// Use identity comparisons to check if two nodes are the same and do NOT recurse into them checking equality
val identityHashMap = new java.util.IdentityHashMap[AnyRef, java.lang.Boolean]()
// Since we don't actually need the values, wrap the Map in a Set
Expand All @@ -52,11 +53,66 @@ object YamlUtils {
}
}

/**
  * Ficus reader producing a non-negative refined Int from a config path.
  *
  * Throws a Typesafe Config `BadValue` (rather than returning a Left) when the
  * configured integer is negative, matching how other config errors surface.
  */
private[util] implicit val refinedNonNegativeReader: ValueReader[Int Refined NonNegative] = {
  (config: Config, path: String) => {
    refineV[NonNegative](config.getInt(path)) match {
      case Right(nonNegativeInt) => nonNegativeInt
      case Left(refinementError) => throw new BadValue(path, refinementError)
    }
  }
}

/**
  * Reflective handle to SnakeYAML Composer's private `recursiveNodes` field,
  * opened for access once and cached for reuse.
  */
private lazy val composerRecursiveNodesField = {
  val recursiveNodes = classOf[Composer].getDeclaredField("recursiveNodes")
  recursiveNodes.setAccessible(true)
  recursiveNodes
}

// The "yaml" section of the loaded application configuration (see reference.conf).
private val yamlConfig = ConfigFactory.load().getConfig("yaml")
// Default cap on total parsed YAML nodes, read from "yaml.max-nodes".
private val defaultMaxNodes = yamlConfig.as[Int Refined NonNegative]("max-nodes")
// Default cap on nesting depth of sequences/mappings, read from "yaml.max-depth".
private val defaultMaxDepth = yamlConfig.as[Int Refined NonNegative]("max-depth")

/**
  * A SnakeYAML Composer that aborts composition when node nesting exceeds `maxDepth`,
  * failing fast before deep recursion can trigger a StackOverflowError.
  *
  * @param yaml     The raw YAML document to compose.
  * @param maxDepth Maximum permitted nesting depth of sequences and mappings.
  */
private class MaxDepthComposer(yaml: String, maxDepth: Int Refined NonNegative)
  extends Composer(
    new ParserImpl(new StreamReader(new StringReader(yaml))),
    new Resolver()
  ) {

  // SnakeYAML's private set of nodes currently being composed, obtained reflectively;
  // its size is the current nesting depth at any point during composition.
  private lazy val inFlightNodes: java.util.Set[Node] = getRecursiveNodeSet(this)

  // Abort with an IllegalArgumentException once the depth limit is passed.
  private def checkDepth(): Unit =
    if (inFlightNodes.size > maxDepth.value) {
      throw new IllegalArgumentException(s"Parsing halted at node depth $maxDepth")
    }

  // Each compose* entry point verifies the depth budget before delegating to SnakeYAML.
  override def composeScalarNode(anchor: String): Node = {
    checkDepth()
    super.composeScalarNode(anchor)
  }

  override def composeSequenceNode(anchor: String): Node = {
    checkDepth()
    super.composeSequenceNode(anchor)
  }

  override def composeMappingNode(anchor: String): Node = {
    checkDepth()
    super.composeMappingNode(anchor)
  }
}

/** A "pointer" reference to a mutable count. */
private class Counter {
// Running total of visited YAML nodes — presumably incremented by the loop-detection
// traversal and compared against max-nodes; confirm against searchForLoops.
var count = 0L
}

/**
  * Reads SnakeYAML Composer's existing-but-private set of in-progress nested nodes.
  *
  * @param composer The composer instance whose internal state is inspected.
  * @return The composer's `recursiveNodes` set, accessed via reflection.
  */
private def getRecursiveNodeSet(composer: Composer): java.util.Set[Node] = {
  val rawFieldValue = composerRecursiveNodesField.get(composer)
  rawFieldValue.asInstanceOf[java.util.Set[Node]]
}

/**
* Looks for loops and large documents in yaml parsed by SnakeYaml.
*
Expand Down
14 changes: 14 additions & 0 deletions wom/src/test/scala/wom/util/YamlUtilsSpec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,20 @@ class YamlUtilsSpec extends FlatSpec with Matchers with TableDrivenPropertyCheck
exception should have message "Loop detection halted at 1000000 nodes" // <-- Updating here? Also get the docs too!
}

it should "fail to parse deeply nested yaml sequence with the default configuration" in {
// One million nested flow sequences — far beyond the default max-depth of 1,000.
val nesting = 1000000
val yaml = ("[" * nesting) + ("]" * nesting)
// Expect a Left containing the depth-limit error rather than a StackOverflowError.
val exception: Exception = YamlUtils.parse(yaml).left.value
exception should have message "Parsing halted at node depth 1000" // <-- Updating here? Also get the docs too!
}

it should "fail to parse deeply nested yaml mapping with the default configuration" in {
// One million nested flow mappings — far beyond the default max-depth of 1,000.
val nesting = 1000000
val yaml = ("{a:" * nesting) + "b" + ("}" * nesting)
// Expect a Left containing the depth-limit error rather than a StackOverflowError.
val exception: Exception = YamlUtils.parse(yaml).left.value
exception should have message "Parsing halted at node depth 1000" // <-- Updating here? Also get the docs too!
}

it should "not parse a config with a negative value" in {
import wom.util.YamlUtils.refinedNonNegativeReader
the[BadValue] thrownBy {
Expand Down

0 comments on commit 0f1f434

Please sign in to comment.