Add bit string literals for ease of encoding BIT STRINGs

BIT STRING is distinct from OCTET STRING in that it has a leading byte specifying the number of "unimportant" bits in the last octet. This adds syntax (b`...`) similar to hex literals, which generates this prefix in a convenient fashion. See language.txt for details. Includes changes to ascii2der, as well as changing the existing der2ascii heuristic for well-formed BIT STRINGs to use the new syntax instead.
google · Jul 1, 2021 · 07da533 · 07da533
1 parent c0da311
commit 07da533
Show file tree

Hide file tree

Showing 7 changed files with 299 additions and 13 deletions.
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
@@ -12,4 +12,5 @@
 David Benjamin <[email protected]>
 Carl Mehner <[email protected]>
 Eric Roman <[email protected]>
+Miguel Young de la Sota <[email protected]>
 Victor Vasiliev <[email protected]>
diff --git a/cmd/ascii2der/scanner.go b/cmd/ascii2der/scanner.go
@@ -284,6 +284,53 @@ again:
 		if s.pos.Offset+1 < len(s.text) && s.text[s.pos.Offset+1] == '"' {
 			return s.parseUTF32String()
 		}
+	case 'b':
+		if s.pos.Offset+1 < len(s.text) && s.text[s.pos.Offset+1] == '`' {
+			s.advance() // Skip the b.
+			s.advance() // Skip the `.
+			bitStr, ok := s.consumeUpTo('`')
+			if !ok {
+				return token{}, &parseError{s.pos, errors.New("unmatched `")}
+			}
+
+			// The leading byte is the number of "extra" bits at the end.
+			var bitCount int
+			var sawPipe bool
+			value := []byte{0}
+			for i, r := range bitStr {
+				switch r {
+				case '0', '1':
+					if bitCount%8 == 0 {
+						value = append(value, 0)
+					}
+					if r == '1' {
+						value[bitCount/8+1] |= 1 << uint(7-bitCount%8)
+					}
+					bitCount++
+				case '|':
+					if sawPipe {
+						return token{}, &parseError{s.pos, errors.New("duplicate |")}
+					}
+
+					// bitsRemaining is the number of bits remaining in the output that haven't
+					// been used yet. There cannot be more than that many bits past the |.
+					bitsRemaining := (len(value)-1)*8 - bitCount
+					inputRemaining := len(bitStr) - i - 1
+					if inputRemaining > bitsRemaining {
+						return token{}, &parseError{s.pos, fmt.Errorf("expected at most %v explicit padding bits; found %v", bitsRemaining, inputRemaining)}
+					}
+
+					sawPipe = true
+					value[0] = byte(bitsRemaining)
+				default:
+					return token{}, &parseError{s.pos, fmt.Errorf("unexpected rune %q", r)}
+				}
+			}
+			if !sawPipe {
+				value[0] = byte((len(value)-1)*8 - bitCount)
+			}
+			return token{Kind: tokenBytes, Value: value, Pos: s.pos}, nil
+		}
 	case '`':
 		s.advance()
 		hexStr, ok := s.consumeUpTo('`')

diff --git a/cmd/ascii2der/scanner_test.go b/cmd/ascii2der/scanner_test.go
@@ -118,6 +118,8 @@ indefinite long-form:2`,
 	{"[long-form:2 SEQUENCE]", []token{{Kind: tokenBytes, Value: []byte{0x3f, 0x80, 0x10}}, {Kind: tokenEOF}}, true},
 	// Bad hex bytes.
 	{"`hi there!`", nil, false},
+	// Bad bit characters.
+	{"b`hi there!`", nil, false},
 	// UTF-16 literals are parsed correctly.
 	{
 		`u""`,
@@ -224,6 +226,110 @@ indefinite long-form:2`,
 		},
 		true,
 	},
+	// BIT STRING literals are parsed correctly.
+	{
+		"b``",
+		[]token{
+			{Kind: tokenBytes, Value: []byte{0x00}},
+			{Kind: tokenEOF},
+		},
+		true,
+	},
+	{
+		"b`1`",
+		[]token{
+			{Kind: tokenBytes, Value: []byte{0x07, 0x100 - (1 << 7)}},
+			{Kind: tokenEOF},
+		},
+		true,
+	},
+	{
+		"b`11`",
+		[]token{
+			{Kind: tokenBytes, Value: []byte{0x06, 0x100 - (1 << 6)}},
+			{Kind: tokenEOF},
+		},
+		true,
+	},
+	{
+		"b`111`",
+		[]token{
+			{Kind: tokenBytes, Value: []byte{0x05, 0x100 - (1 << 5)}},
+			{Kind: tokenEOF},
+		},
+		true,
+	},
+	{
+		"b`1111`",
+		[]token{
+			{Kind: tokenBytes, Value: []byte{0x04, 0x100 - (1 << 4)}},
+			{Kind: tokenEOF},
+		},
+		true,
+	},
+	{
+		"b`11111`",
+		[]token{
+			{Kind: tokenBytes, Value: []byte{0x03, 0x100 - (1 << 3)}},
+			{Kind: tokenEOF},
+		},
+		true,
+	},
+	{
+		"b`111111`",
+		[]token{
+			{Kind: tokenBytes, Value: []byte{0x02, 0x100 - (1 << 2)}},
+			{Kind: tokenEOF},
+		},
+		true,
+	},
+	{
+		"b`1111111`",
+		[]token{
+			{Kind: tokenBytes, Value: []byte{0x01, 0x100 - (1 << 1)}},
+			{Kind: tokenEOF},
+		},
+		true,
+	},
+	{
+		"b`1010101001010101`",
+		[]token{
+			{Kind: tokenBytes, Value: []byte{0x00, 0xaa, 0x55}},
+			{Kind: tokenEOF},
+		},
+		true,
+	},
+	{
+		"b`101010100101`",
+		[]token{
+			{Kind: tokenBytes, Value: []byte{0x04, 0xaa, 0x50}},
+			{Kind: tokenEOF},
+		},
+		true,
+	},
+	// We can stick a | in the middle of a BIT STRING to add "explicit" padding.
+	{
+		"b`101010100|1010101`",
+		[]token{
+			{Kind: tokenBytes, Value: []byte{0x07, 0xaa, 0x55}},
+			{Kind: tokenEOF},
+		},
+		true,
+	},
+	// If explicit padding does not end at a byte boundary, the remaining padding
+	// bits are zero.
+	{
+		"b`101010100101|010`",
+		[]token{
+			{Kind: tokenBytes, Value: []byte{0x04, 0xaa, 0x54}},
+			{Kind: tokenEOF},
+		},
+		true,
+	},
+	// Padding that passes a byte boundary is an error.
+	{"b`0000000|01`", nil, false},
+	// Extra |s are an error.
+	{"b`0|0|0`", nil, false},
 	// Bad or truncated escape sequences.
 	{`"\`, nil, false},
 	{`"\x`, nil, false},
@@ -267,6 +373,7 @@ indefinite long-form:2`,
 	{`"hello`, nil, false},
 	{`u"hello`, nil, false},
 	{`U"hello`, nil, false},
+	{"b`0101", nil, false},
 	// Long-form with invalid number.
 	{"long-form:", nil, false},
 	{"long-form:garbage", nil, false},

diff --git a/cmd/der2ascii/main.go b/cmd/der2ascii/main.go
@@ -15,8 +15,8 @@
 package main
 
 import (
-	"encoding/pem"
 	"encoding/hex"
+	"encoding/pem"
 	"flag"
 	"fmt"
 	"io/ioutil"

diff --git a/cmd/der2ascii/writer.go b/cmd/der2ascii/writer.go
@@ -355,6 +355,37 @@ func derToASCIIImpl(out *bytes.Buffer, in []byte, indent int, stopAtEOC bool) []
 					// Emit the remaining as a DER element.
 					derToASCIIImpl(out, elem.body[1:], indent+1, false) // Adds a trailing newline.
 					addLine(out, indent, "}")
+				} else if len(elem.body) == 1 && elem.body[0] == 0 {
+					addLine(out, indent, fmt.Sprintf("%s b`` }", header))
+				} else if len(elem.body) > 1 && len(elem.body) <= 5 && elem.body[0] < 8 {
+					// Convert to a b`` literal when the leading byte is valid and the
+					// number of data octets is at most 4; we limit the length for
+					// readability.
+					bits := new(strings.Builder)
+
+					// The first octet is the number of unused bits.
+					significant := 8 - elem.body[0]
+					for i, octet := range elem.body[1:] {
+						// Last octet gets some special handling.
+						isLast := i == len(elem.body)-2
+						for j := 0; j < 8; j++ {
+							if isLast && int(significant) == j {
+								if octet == 0 {
+									break
+								}
+								bits.WriteRune('|')
+							}
+
+							if octet&0x80 == 0 {
+								bits.WriteRune('0')
+							} else {
+								bits.WriteRune('1')
+							}
+							octet <<= 1
+						}
+					}
+
+					addLine(out, indent, fmt.Sprintf("%s b`%s` }", header, bits))
 				} else if len(elem.body) > 1 && elem.body[0] < 8 {
 					// The first byte is the number of unused bits.
 					addLine(out, indent, fmt.Sprintf("%s %s %s }", header, bytesToString(elem.body[:1]), bytesToString(elem.body[1:])))

diff --git a/cmd/der2ascii/writer_test.go b/cmd/der2ascii/writer_test.go
@@ -128,25 +128,86 @@ var derToASCIITests = []convertFuncTest{
 		[]byte{0x03, 0x03, 0x00, 0x30, 0x00},
 		"BIT_STRING {\n  `00`\n  SEQUENCE {}\n}\n",
 	},
-	// BIT STRINGs are encoded normally if the contents are not an element.
+	// BIT STRINGs are encoded as bit string literals if the contents are not an
+	// element.
 	{
 		[]byte{0x03, 0x03, 0x00, 0x00, 0x00},
-		"BIT_STRING { `00` `0000` }\n",
+		"BIT_STRING { b`0000000000000000` }\n",
 	},
-	// BIT STRINGs are encoded normally if the leading byte is non-zero.
+	{
+		[]byte{0x03, 0x01, 0x00},
+		"BIT_STRING { b`` }\n",
+	},
+	{
+		[]byte{0x03, 0x02, 0x07, 0x100 - (1 << 7)},
+		"BIT_STRING { b`1` }\n",
+	},
+	{
+		[]byte{0x03, 0x02, 0x06, 0x100 - (1 << 6)},
+		"BIT_STRING { b`11` }\n",
+	},
+	{
+		[]byte{0x03, 0x02, 0x05, 0x100 - (1 << 5)},
+		"BIT_STRING { b`111` }\n",
+	},
+	{
+		[]byte{0x03, 0x02, 0x04, 0x100 - (1 << 4)},
+		"BIT_STRING { b`1111` }\n",
+	},
+	{
+		[]byte{0x03, 0x02, 0x03, 0x100 - (1 << 3)},
+		"BIT_STRING { b`11111` }\n",
+	},
+	{
+		[]byte{0x03, 0x02, 0x02, 0x100 - (1 << 2)},
+		"BIT_STRING { b`111111` }\n",
+	},
+	{
+		[]byte{0x03, 0x02, 0x01, 0x100 - (1 << 1)},
+		"BIT_STRING { b`1111111` }\n",
+	},
+	{
+		[]byte{0x03, 0x02, 0x00, 0xff},
+		"BIT_STRING { b`11111111` }\n",
+	},
+	{
+		[]byte{0x03, 0x03, 0x07, 0xff, 0x100 - (1 << 7)},
+		"BIT_STRING { b`111111111` }\n",
+	},
+	// The above, but with padding.
+	{
+		[]byte{0x03, 0x02, 0x07, 0xc0},
+		"BIT_STRING { b`1|1000000` }\n",
+	},
+	// BIT STRINGs are encoded as bit string literals if they are at most 32
+	// bits.
 	{
 		[]byte{0x03, 0x05, 0x01, 0x30, 0x80, 0x00, 0x00},
-		"BIT_STRING { `01` `30800000` }\n",
+		"BIT_STRING { b`0011000010000000000000000000000` }\n",
+	},
+	// The above, but with non-trivial padding.
+	{
+		[]byte{0x03, 0x05, 0x01, 0x30, 0x80, 0x00, 0xff},
+		"BIT_STRING { b`0011000010000000000000001111111|1` }\n",
+	},
+	// BIT STRINGs with more than four data octets are hex-encoded instead.
+	{
+		[]byte{0x03, 0x06, 0x01, 0x30, 0x80, 0xaa, 0x55, 0xaa},
+		"BIT_STRING { `01` `3080aa55aa` }\n",
 	},
 	// BIT STRINGs do not attempt to separate the leading byte if invalid.
 	{
 		[]byte{0x03, 0x05, 0xff, 0x30, 0x80, 0x00, 0x00},
 		"BIT_STRING { `ff30800000` }\n",
 	},
-	// Empty BIT STRINGs do not emit extra whitspace.
+	// Empty BIT STRINGs with non-zero leading byte are always invalid.
 	{
-		[]byte{0x03, 0x01, 0x00},
-		"BIT_STRING { `00` }\n",
+		[]byte{0x03, 0x01, 0x07},
+		"BIT_STRING { `07` }\n",
+	},
+	{
+		[]byte{0x03, 0x01, 0x08},
+		"BIT_STRING { `08` }\n",
 	},
 	// OBJECT IDENTIFIERs are pretty-printed if possible.
 	{

diff --git a/language.txt b/language.txt
@@ -76,6 +76,32 @@ Unicode code point."
 `AbCdEf`
 
 
+# Bit string literals.
+
+# A backtick string beginning with 'b' is a bit string literal. 0 or 1
+# characters denote bits in a bit string. | characters are interpreted as below.
+# No other characters may appear. They emit the contents of the bit string's DER
+# encoding as a BIT STRING. (Big-endian bit order, prefixed with the number of
+# trailing padding bits)
+
+# This encodes as `00aa`.
+b`10101010`
+
+# This encodes as `04a0`.
+b`1010`
+
+# A single | may appear, which marks the beginning of explicit padding bits. BER
+# permits any bit sequence in the padding bits. However, it is an error for
+# padding to cross the byte boundary.
+
+# This encodes as `04aa`.
+b`1010|1010`
+
+# This is an error, since only four padding bits are available for the user to
+# specify.
+# b`1010|10101`
+
+
 # Integers.
 
 # Tokens which match /-?[0-9]+/ are integer tokens. They emit the contents of
@@ -288,11 +314,24 @@ SEQUENCE `aabbcc`
 #    c. If the tag is BOOLEAN and the body is valid, encode as TRUE or FALSE.
 #       Otherwise encode as a hex literal.
 #
-#    d. If the tag is BIT STRING and the body is non-empty, encode the first
-#       byte as a separate hex literal. If this value is non-zero, encode the
-#       remainder as a raw byte string. Otherwise, apply step g to encode the
-#       body. This is to account for X.509 incorrectly using BIT STRING
-#       instead of OCTET STRING for SubjectPublicKeyInfo and signatures.
+#    d. If the tag is a BIT STRING:
+#       
+#       i.   If the body is a valid bit string, contains a whole number of
+#            bytes, and may be parsed as a series of BER elements with no
+#            trailing data, encode as `00` followed by recursing into the body
+#            as in step g. This accounts for X.509 incorrectly using BIT STRING
+#            instead of OCTET STRING for SubjectPublicKeyInfo and signatures.
+#
+#       ii.  If the body is a valid bit string with at most 32 bits, encode as a
+#            bit string literal. If any padding bits are non-zero, they are
+#            encoded explicitly.
+#
+#       iii. If the body is a valid bit string with more than 32 bits, encode as
+#            a pair of hex literals, containing the initial byte and the data
+#            bytes.
+#
+#       iv.  Otherwise, the body is not a valid bit string. Encode as a single
+#            hex literal.
 #
 #    e. If the tag is BMPString, decode the body as UTF-16 and encode as a
 #       UTF-16 literal. Unpaired surrogates and unprintable code points are