Skip to content

princemaple/abnf_parsec

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

AbnfParsec

hex.pm hex.pm hex.pm github.com

ABNF in and parser out.

Parses ABNF with a parser written with nimble_parsec, emits parser consists of nimble_parsec combinators.

Features

  • Brevity - flattens unnecessary nesting in parsed ABNF
  • Easy to config and customize
  • Full test coverage

Installation

The package is available on Hex and can be installed by adding abnf_parsec to your list of dependencies in mix.exs:

def deps do
  [
    {:abnf_parsec, "~> 1.0", runtime: false}
  ]
end

Documentation can be generated with ExDoc and published on HexDocs. Once published, the docs can be found at https://hexdocs.pm/abnf_parsec.

Text / Byte mode

In some RFCs, literals in ABNF could be used to describe the byte representation instead of the text codepoints. There is no clear distinction between them for us to detect automatically. Hence a text / byte mode is added for the user to set.

defmodule TextParser do
  use AbnfParsec,
    mode: :text, # the default, can be omitted
    abnf: """
    ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
      / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
      / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
      / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
      / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
      / %xD0000-DFFFD / %xE1000-EFFFD
    """
end

defmodule ByteParser do
  use AbnfParsec,
    mode: :byte,
    abnf: """
    ; from RFC 5322 + UTF8-non-ascii
    atext = ALPHA / DIGIT / "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" /
      "-" / "/" / "=" / "?" / "^" / "_" / "`" / "{" / "|" / "}" / "~" / UTF8-non-ascii

    ; from RFC 3629
    UTF8-non-ascii  =   UTF8-2 / UTF8-3 / UTF8-4

    UTF8-2      = %xC2-DF UTF8-tail
    UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
          %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
    UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
          %xF4 %x80-8F 2( UTF8-tail )
    UTF8-tail   = %x80-BF
    """
end

Usage

defmodule IPv4Parser do
  use AbnfParsec,
    abnf: """
    ip = first dot second dot third dot fourth
    dot = "."
    dec-octet =
      "25" %x30-35      /   ; 250-255
      "2" %x30-34 DIGIT /   ; 200-249
      "1" 2DIGIT        /   ; 100-199
      %x31-39 DIGIT     /   ; 10-99
      DIGIT                 ; 0-9
    first = dec-octet
    second = dec-octet
    third = dec-octet
    fourth = dec-octet
    """,
    unbox: ["dec-octet"],
    ignore: ["dot"],
    parse: :ip
end

# IPv4Parser.ip("192.168.0.1")
# IPv4Parser.parse("127.0.0.1")
# IPv4Parser.parse!("10.0.0.1")

iex> IPv4Parser.parse! "10.0.0.1"
[ip: [first: '10', second: '0', third: '0', fourth: '1']]
defmodule JsonParser do
  use AbnfParsec,
    abnf_file: "test/fixture/json.abnf",
    parse: :json_text,
    transform: %{
      "string" => {:reduce, {List, :to_string, []}},
      "int" => [{:reduce, {List, :to_string, []}}, {:map, {String, :to_integer, []}}],
      "frac" => {:reduce, {List, :to_string, []}},
      "null" => {:replace, nil},
      "true" => {:replace, true},
      "false" => {:replace, false}
    },
    untag: ["member"],
    unwrap: ["int", "frac"],
    unbox: [
      "JSON-text",
      "null",
      "true",
      "false",
      "digit1-9",
      "decimal-point",
      "escape",
      "unescaped",
      "char"
    ],
    ignore: [
      "name-separator",
      "value-separator",
      "quotation-mark",
      "begin-object",
      "end-object",
      "begin-array",
      "end-array"
    ]
end

json = """
  {"a": {"b": 1, "c": [true]}, "d": null, "e": "e\\te"}
  """

# JsonParser.json_text(json)
# JsonParser.parse(json)
# JsonParser.parse!(json)

iex> JsonParser.parse! """
...> {"a": {"b": 1, "c": [true]}, "d": null, "e": "e\\te"}
...> """
[
  object: [
    [
      string: ["a"],
      value: [
        object: [
          [string: ["b"], value: [number: [int: 1]]],
          [string: ["c"], value: [array: [value: [true]]]]
        ]
      ]
    ],
    [string: ["d"], value: [nil]],
    [string: ["e"], value: [string: ["e\\te"]]]
  ]
]

For more details of options for customization, see abnf_parsec.ex

What does it do, really?

For example, ABNF of ABNF 😉 is parsed

rulelist       =  1*( rule / (*c-wsp c-nl) )

rule           =  rulename defined-as elements c-nl
                      ; continues if next line starts
                      ;  with white space

rulename       =  ALPHA *(ALPHA / DIGIT / "-")

defined-as     =  *c-wsp ("=" / "=/") *c-wsp
                      ; basic rules definition and
                      ;  incremental alternatives

elements       =  alternation *c-wsp

c-wsp          =  WSP / (c-nl WSP)

c-nl           =  comment / CRLF
                      ; comment or newline

comment        =  ";" *(WSP / VCHAR) CRLF

alternation    =  concatenation
                 *(*c-wsp "/" *c-wsp concatenation)

concatenation  =  repetition *(1*c-wsp repetition)

repetition     =  [repeat] element

repeat         =  1*DIGIT / (*DIGIT "*" *DIGIT)

element        =  rulename / group / option /
                 char-val / num-val / prose-val

group          =  "(" *c-wsp alternation *c-wsp ")"

option         =  "[" *c-wsp alternation *c-wsp "]"

char-val       =  DQUOTE *(%x20-21 / %x23-7E) DQUOTE
                      ; quoted string of SP and VCHAR
                      ;  without DQUOTE

num-val        =  "%" (bin-val / dec-val / hex-val)

bin-val        =  "b" 1*BIT
                 [ 1*("." 1*BIT) / ("-" 1*BIT) ]
                      ; series of concatenated bit values
                      ;  or single ONEOF range

dec-val        =  "d" 1*DIGIT
                 [ 1*("." 1*DIGIT) / ("-" 1*DIGIT) ]

hex-val        =  "x" 1*HEXDIG
                 [ 1*("." 1*HEXDIG) / ("-" 1*HEXDIG) ]

prose-val      =  "<" *(%x20-3D / %x3F-7E) ">"
                      ; bracketed string of SP and VCHAR
                      ;  without angles
                      ; prose description, to be used as
                      ;  last resort

into

[
  rule: [
    rulename: "rulelist",
    repetition: [
      repeat: [min: 1],
      alternation: [
        rulename: "rule",
        concatenation: [
          repetition: [repeat: [], rulename: "c-wsp"],
          rulename: "c-nl"
        ]
      ]
    ]
  ],
  rule: [
    rulename: "rule",
    concatenation: [
      rulename: "rulename",
      rulename: "defined-as",
      rulename: "elements",
      rulename: "c-nl"
    ],
    comment: "continues if next line starts",
    comment: "with white space"
  ],
  rule: [
    rulename: "rulename",
    concatenation: [
      rulename: "ALPHA",
      repetition: [
        repeat: [],
        alternation: [{:rulename, "ALPHA"}, {:rulename, "DIGIT"}, "-"]
      ]
    ]
  ],
  rule: [
    rulename: "defined-as",
    concatenation: [
      repetition: [repeat: [], rulename: "c-wsp"],
      alternation: ["=", "=/"],
      repetition: [repeat: [], rulename: "c-wsp"]
    ],
    comment: "basic rules definition and",
    comment: "incremental alternatives"
  ],
  rule: [
    rulename: "elements",
    concatenation: [
      rulename: "alternation",
      repetition: [repeat: [], rulename: "c-wsp"]
    ]
  ],
  rule: [
    rulename: "c-wsp",
    alternation: [
      rulename: "WSP",
      concatenation: [rulename: "c-nl", rulename: "WSP"]
    ]
  ],
  rule: [
    rulename: "c-nl",
    alternation: [rulename: "comment", rulename: "CRLF"],
    comment: "comment or newline"
  ],
  rule: [
    rulename: "comment",
    concatenation: [
      ";",
      {:repetition,
       [repeat: [], alternation: [rulename: "WSP", rulename: "VCHAR"]]},
      {:rulename, "CRLF"}
    ]
  ],
  rule: [
    rulename: "alternation",
    concatenation: [
      rulename: "concatenation",
      repetition: [
        repeat: [],
        concatenation: [
          {:repetition, [repeat: [], rulename: "c-wsp"]},
          "/",
          {:repetition, [repeat: [], rulename: "c-wsp"]},
          {:rulename, "concatenation"}
        ]
      ]
    ]
  ],
  rule: [
    rulename: "concatenation",
    concatenation: [
      rulename: "repetition",
      repetition: [
        repeat: [],
        concatenation: [
          repetition: [repeat: [min: 1], rulename: "c-wsp"],
          rulename: "repetition"
        ]
      ]
    ]
  ],
  rule: [
    rulename: "repetition",
    concatenation: [option: [rulename: "repeat"], rulename: "element"]
  ],
  rule: [
    rulename: "repeat",
    alternation: [
      repetition: [repeat: [min: 1], rulename: "DIGIT"],
      concatenation: [
        {:repetition, [repeat: [], rulename: "DIGIT"]},
        "*",
        {:repetition, [repeat: [], rulename: "DIGIT"]}
      ]
    ]
  ],
  rule: [
    rulename: "element",
    alternation: [
      rulename: "rulename",
      rulename: "group",
      rulename: "option",
      rulename: "char-val",
      rulename: "num-val",
      rulename: "prose-val"
    ]
  ],
  rule: [
    rulename: "group",
    concatenation: [
      "(",
      {:repetition, [repeat: [], rulename: "c-wsp"]},
      {:rulename, "alternation"},
      {:repetition, [repeat: [], rulename: "c-wsp"]},
      ")"
    ]
  ],
  rule: [
    rulename: "option",
    concatenation: [
      "[",
      {:repetition, [repeat: [], rulename: "c-wsp"]},
      {:rulename, "alternation"},
      {:repetition, [repeat: [], rulename: "c-wsp"]},
      "]"
    ]
  ],
  rule: [
    rulename: "char-val",
    concatenation: [
      rulename: "DQUOTE",
      repetition: [
        repeat: [],
        alternation: [
          num_range: [{:base, "x"}, "20", "21"],
          num_range: [{:base, "x"}, "23", "7E"]
        ]
      ],
      rulename: "DQUOTE"
    ],
    comment: "quoted string of SP and VCHAR",
    comment: "without DQUOTE"
  ],
  rule: [
    rulename: "num-val",
    concatenation: [
      "%",
      {:alternation,
       [rulename: "bin-val", rulename: "dec-val", rulename: "hex-val"]}
    ]
  ],
  rule: [
    rulename: "bin-val",
    concatenation: [
      "b",
      {:repetition, [repeat: [min: 1], rulename: "BIT"]},
      {:option,
       [
         alternation: [
           repetition: [
             repeat: [min: 1],
             concatenation: [
               ".",
               {:repetition, [repeat: [min: 1], rulename: "BIT"]}
             ]
           ],
           concatenation: [
             "-",
             {:repetition, [repeat: [min: 1], rulename: "BIT"]}
           ]
         ]
       ]}
    ],
    comment: "series of concatenated bit values",
    comment: "or single ONEOF range"
  ],
  rule: [
    rulename: "dec-val",
    concatenation: [
      "d",
      {:repetition, [repeat: [min: 1], rulename: "DIGIT"]},
      {:option,
       [
         alternation: [
           repetition: [
             repeat: [min: 1],
             concatenation: [
               ".",
               {:repetition, [repeat: [min: 1], rulename: "DIGIT"]}
             ]
           ],
           concatenation: [
             "-",
             {:repetition, [repeat: [min: 1], rulename: "DIGIT"]}
           ]
         ]
       ]}
    ]
  ],
  rule: [
    rulename: "hex-val",
    concatenation: [
      "x",
      {:repetition, [repeat: [min: 1], rulename: "HEXDIG"]},
      {:option,
       [
         alternation: [
           repetition: [
             repeat: [min: 1],
             concatenation: [
               ".",
               {:repetition, [repeat: [min: 1], rulename: "HEXDIG"]}
             ]
           ],
           concatenation: [
             "-",
             {:repetition, [repeat: [min: 1], rulename: "HEXDIG"]}
           ]
         ]
       ]}
    ]
  ],
  rule: [
    rulename: "prose-val",
    concatenation: [
      "<",
      {:repetition,
       [
         repeat: [],
         alternation: [
           num_range: [{:base, "x"}, "20", "3D"],
           num_range: [{:base, "x"}, "3F", "7E"]
         ]
       ]},
      ">"
    ],
    comment: "bracketed string of SP and VCHAR",
    comment: "without angles",
    comment: "prose description, to be used as",
    comment: "last resort"
  ]
]

And generated parser looks like:

[
  defparsec(
    :rulelist,
    tag(
      times(choice([parsec(:rule), repeat(parsec(:c_wsp)) |> parsec(:c_nl)]), min: 1),
      :rulelist
    )
  ),
  defparsec(
    :rule,
    tag(parsec(:rulename) |> parsec(:defined_as) |> parsec(:elements) |> parsec(:c_nl), :rule)
  ),
  defparsec(
    :rulename,
    tag(
      parsec(:core_alpha)
      |> repeat(choice([parsec(:core_alpha), parsec(:core_digit), string("-")])),
      :rulename
    )
  ),
  defparsec(
    :defined_as,
    tag(
      repeat(parsec(:c_wsp)) |> choice([string("="), string("=/")]) |> repeat(parsec(:c_wsp)),
      :defined_as
    )
  ),
  defparsec(:elements, tag(parsec(:alternation) |> repeat(parsec(:c_wsp)), :elements)),
  defparsec(:c_wsp, tag(choice([parsec(:core_wsp), parsec(:c_nl) |> parsec(:core_wsp)]), :c_wsp)),
  defparsec(:c_nl, tag(choice([parsec(:comment), parsec(:core_crlf)]), :c_nl)),
  defparsec(
    :comment,
    tag(
      string(";")
      |> repeat(choice([parsec(:core_wsp), parsec(:core_vchar)]))
      |> parsec(:core_crlf),
      :comment
    )
  ),
  defparsec(
    :alternation,
    tag(
      parsec(:concatenation)
      |> repeat(
        repeat(parsec(:c_wsp))
        |> string("/")
        |> repeat(parsec(:c_wsp))
        |> parsec(:concatenation)
      ),
      :alternation
    )
  ),
  defparsec(
    :concatenation,
    tag(
      parsec(:repetition) |> repeat(times(parsec(:c_wsp), min: 1) |> parsec(:repetition)),
      :concatenation
    )
  ),
  defparsec(:repetition, tag(optional(parsec(:repeat)) |> parsec(:element), :repetition)),
  defparsec(
    :repeat,
    tag(
      choice([
        times(parsec(:core_digit), min: 1),
        repeat(parsec(:core_digit)) |> string("*") |> repeat(parsec(:core_digit))
      ]),
      :repeat
    )
  ),
  defparsec(
    :element,
    tag(
      choice([
        parsec(:rulename),
        parsec(:group),
        parsec(:option),
        parsec(:char_val),
        parsec(:num_val),
        parsec(:prose_val)
      ]),
      :element
    )
  ),
  defparsec(
    :group,
    tag(
      string("(")
      |> repeat(parsec(:c_wsp))
      |> parsec(:alternation)
      |> repeat(parsec(:c_wsp))
      |> string(")"),
      :group
    )
  ),
  defparsec(
    :option,
    tag(
      string("[")
      |> repeat(parsec(:c_wsp))
      |> parsec(:alternation)
      |> repeat(parsec(:c_wsp))
      |> string("]"),
      :option
    )
  ),
  defparsec(
    :char_val,
    tag(
      parsec(:core_dquote)
      |> repeat(choice([ascii_char([32..33]), ascii_char([35..126])]))
      |> parsec(:core_dquote),
      :char_val
    )
  ),
  defparsec(
    :num_val,
    tag(string("%") |> choice([parsec(:bin_val), parsec(:dec_val), parsec(:hex_val)]), :num_val)
  ),
  defparsec(
    :bin_val,
    tag(
      string("b")
      |> times(parsec(:core_bit), min: 1)
      |> optional(
        choice([
          times(string(".") |> times(parsec(:core_bit), min: 1), min: 1),
          string("-") |> times(parsec(:core_bit), min: 1)
        ])
      ),
      :bin_val
    )
  ),
  defparsec(
    :dec_val,
    tag(
      string("d")
      |> times(parsec(:core_digit), min: 1)
      |> optional(
        choice([
          times(string(".") |> times(parsec(:core_digit), min: 1), min: 1),
          string("-") |> times(parsec(:core_digit), min: 1)
        ])
      ),
      :dec_val
    )
  ),
  defparsec(
    :hex_val,
    tag(
      string("x")
      |> times(parsec(:core_hexdig), min: 1)
      |> optional(
        choice([
          times(string(".") |> times(parsec(:core_hexdig), min: 1), min: 1),
          string("-") |> times(parsec(:core_hexdig), min: 1)
        ])
      ),
      :hex_val
    )
  ),
  defparsec(
    :prose_val,
    tag(
      string("<") |> repeat(choice([ascii_char([32..61]), ascii_char([63..126])])) |> string(">"),
      :prose_val
    )
  )
]