diff --git a/doc/ocamlmark.md b/doc/ocamlmark.md
new file mode 100644
index 00000000..9d470eab
--- /dev/null
+++ b/doc/ocamlmark.md
@@ -0,0 +1,329 @@
+`ocamlmark` – An `ocamldoc` to CommonMark bi-directional translation
+====================================================================
+
+The `ocamlmark` language allows to write OCaml documentation strings
+and `.mld` files in [CommonMark].
+
+It is just the CommonMark language with the following tweaks:
+
+1. Link references `[label]` whose `label` starts with `!` cannot be
+ defined in documents and are used to stand for the `{!…}`,
+ `{!modules…}` and `{!indexlist}` references and directives
+ of `ocamldoc`.
+2. An extension to specify heading reference labels to match the
+ `{[0-5]+:label text}` `ocamldoc` heading construct. We reuse
+ [Pandoc's heading identifiers][pandoc-heading-ids]'s syntax.
+3. In paragraphs, a few more construct are detected to support
+ `ocamldoc`'s [`@-tags`][ocamldoc-@tag].
+
+See the [full translation](#translation).
+
+## Short example
+
+The following show equivalent `ocamldoc` and `ocamlmark` text side-by-side:
+
+```
+{1:mapping Mapping list elements} | # Mapping list elements {#mapping}
+ |
+Use function {!List.map} to map the | Use function [!List.map] to map the
+elements of a list. Other | elements of a list. Other
+{{!List.iterators}iterators} are | [iterators][!List.iterators] are
+available. | available.
+```
+
+And this body of `ocamlmark` warns on the reference definition that it
+is illegal, because its label starts with a `!`.
+
+```
+See my wonderful [website][!website]
+
+[!website]: https://example.org
+```
+
+[CommonMark]: https://spec.commonmark.org/current
+[ocamldoc]: https://ocaml.org/manual/ocamldoc.html#ss:ocamldoc-syntax
+[ocamldoc-ast]: https://github.com/ocaml-doc/odoc-parser/blob/main/src/ast.ml
+[ocamldoc-@tag]: https://ocaml.org/manual/ocamldoc.html#ss:ocamldoc-tags
+[pandoc-heading-ids]: https://pandoc.org/MANUAL.html#heading-identifiers
+
+## Translation
+
+The `ocamldoc` language is defined [here][ocamldoc]
+([AST][ocamldoc-ast]). The `CommonMark` language is defined
+[here][commonmark].
+
+
+ The ←, ↔ and → arrows indicate translation directions.
+````
+ocamldoc ocamlmark
+--------------------------------------------------------
+{[0-5] text} ↔ #… text
+{[0-5]:label text} ↔ #… text {#label}
+{b text} ↔ **text**
+{i text} ↔ *text*
+{e text} ↔ *text*
+{C text} ↔ text
+{L text} ↔ text
+{R text} ↔ text
+{ol {- }… ↔ 1. …
+{ul {- }… ↔ * …
+{{:url} text} ↔ [text](url)
+{{!ref} text} ↔ [text][!ref]
+[string] ↔ `string` (string not starting with `)
+[`string] ↔ `` `string ``
+{!ref} ↔ [!ref]
+{!modules m…} ↔ [!modules m…] (in its own paragraph)
+{!indexlist} ↔ [!indexlist] (in its own paragraph)
+{^ text} → text
+{%html: …%} ← text
+{_ text} → text
+{%html: …%} ← text
+{[ string ]} ↔ ```
+ string
+ ```
+ or indented code block
+{@info[ string ]} ↔ ```info
+ string
+ ```
+{v string v} ↔ ```verb
+ string
+ ```
+{%latex: string %} ↔ ```=latex
+{% string %} string
+ ```
+{%html: string %} ← HTML block or raw HTML or
+ ```=html
+ string
+ ```
+{%texi: string %} ← ```=texi
+ string
+ ```
+{%man: string %} ← ```=man
+ string
+ ```
+
+
+{%html:
%} ← *** (thematic break)
+??? ← > … (block quotes)
+
+@authors string ↔ @authors string
+@deprecated text ↔ @deprected text
+@param id text ↔ @param id text
+@raise Exc text ↔ @raise string text
+@return text ↔ @return text
+@see url text ↔ @see url string
+@since string ↔ @since string
+@before string ↔ @before string
+@version string ↔ @version string
+````
+
+### Documentation comment structure
+
+CommonMark is sensitive to leading spaces on lines and `ocamldoc`
+comments are usually indented.
+
+`ocamlmark` defines a start column to define the first relevant
+character of lines in documentation comments. The start column is set
+to the first non-space character after the `**` marker (this means right
+after it if a newline immediately follows). In this example, the first
+character of the line is the `c` character:
+
+ (** c
+ This is an indented code block.
+ And this is not *)
+
+On every line of the comment, leading spaces are trimmed up to the
+start column. If there are less spaces than that, the line starts on the
+first non space character as if it was indented to the start column.
+
+These three comments are equivalent:
+
+```
+ (** This is multi-line
+ text *)
+
+ (**This is multi-line
+ text *)
+
+ (** This is multi-line
+ text *)
+```
+
+## Notes and unresolved questions
+
+Here are a few notes and questions on the above translation and the
+[current implementation] in `odoc-parser` that translates CommonMark to
+the `ocamldoc` abstract syntax tree.
+
+ 0. Comment structure. We should likely refine the strategy for fenced code
+ blocks, so that we can have them flush left to benefit from 80 columns.
+ For now if you do that you lose indentation up to the start column.
+
+ 1. Tags are a litte bit unclear. `odoc` changed their semantics: they
+ are no longer allowed inline. We could simply define them as
+ paragraphs and the first inline defines the tag. Personally I
+ would rather be in favor of deprecating most them in `ocamlmark`
+ and let them be OCaml attributes for the ones that are useful. This
+ somehow already happened for `@deprecated` which became an
+ alert. `@version`, `@since` and `@before` would also be useful for
+ enriching the compiler alerting mecanisms and `@raise` would provide
+ information for static analysers. However there are new `odoc`
+ specific hint tags that are needed (e.g. `@canonical`, `@open`,
+ `@inline`) and don't make sense as attributes.
+
+ 2. The image link reference translation is not really
+ statisfactory. After all these years it's time for `odoc` to
+ really solve the [page assets problem][page-assets].
+
+ 3. The above table does not include the latest `ocamldoc` additions
+ (LATEX maths) and upcoming additions (tables).
+ `cmarkit` supports both though.
+
+ 4. We need a way to handle the heading 0 level trick of `.mld`
+ files. It's not natural to use, for toplevel sections, level 2 in
+ `.mld` files and level 1 in modules. The simplest would be for
+ `.mld` files to take a first level 1 heading as defining the
+ heading 0. But for that we need to know we are parsing a `.mld`
+ files, the current `odoc-parser` API does not tell us that.
+
+ 5. At the moment `odoc` does not support `ocamldoc`'s [raw HTML
+ tags][ocamldoc-html], [no decision][odoc-html] has been taken yet
+ but it wouldn't be hard to translate it in either direction.
+
+ 6. CommonMark is less restricted than `ocamldoc`. It can have headings
+ in list items. When this occurs we warn and drop the construct.
+
+ 7. What should we do with thematic breaks ? For now we translated
+ them to raw HTML.
+
+ 8. Should we treat the concatenation of comments as a document ?
+ This only influences link reference definition. Maybe we could
+ just accumulate link references as we go. This makes it slightly
+ at odds with CommonMark semantics were link definitions are
+ allowed to occur anywhere.
+
+ But it would be nice to be able to reuse link reference definitions
+ from one comment to the other, especially given disadvantage 4 (see
+ below). Doing it in reverse (from bottom to top) and accumulating
+ refs would allow to have long URLs as link reference definitions at
+ the end of the doc.
+
+ 9. For the `ocamldoc` to `ocamlmark` translation we don't have a
+ syntax for `{%backend:` *when these are used inlines*, except for
+ raw HTML if the raw HTML satisfies the complicated contraints of
+ CommonMark raw HTML. Could be a problem for that translation
+ direction if people used that.
+
+11. Ordered list can specify their starting point in `CommonMark`
+ but can't in `ocamldoc`, we issue a warning.
+
+12. There's no notion of hard breaks in `ocamldoc`, we issue a warning
+ to indicate that.
+
+13. For now the super and sub scripts will be translated by virtue of
+ raw HTML but this will only show up in the HTML backend. We could
+ try to be smarter on raw HTML and try to parse ``/``
+ markup in inlines.
+
+14. For simplicity when we get a `[!modules: …]` directive in the
+ middle of a paragraph, we drop it. `ocamldoc` is smarter, it warns
+ and splits the paragraph. Could be done at the cost of less clear
+ control flow, not sure it's a big deal the way it works now, you
+ have been warned as they say.
+
+15. `{!indexlist}` is still [unsupported][indexlist] in `odoc` and likely
+ forever. For now we recognize it and warn about it.
+
+16. There's no notion of link title in `ocamldoc`, we issue a warning
+ and drop them.
+
+[ocamldoc-html]: https://ocaml.org/manual/ocamldoc.html#sss:ocamldoc-html-tags
+[page-assets]: https://github.com/ocaml/odoc/issues/59
+[indexlist]: https://github.com/ocaml/odoc/issues/577
+[odoc-html]: https://github.com/ocaml/odoc/issues/576
+[current implementation]: https://github.com/dbuenzli/odoc-parser/tree/ocamlmark
+
+## Advantages
+
+1. CommonMark is found in a lot of other systems including
+ Git{Hub,Lab}, Discourse, etc. It is also increasingly being used
+ in API documentation tools, `rustdoc`, `Documenter.jl`, etc.
+
+2. Since it's everywhere, that's one less thing to learn for newcomers and
+ one less thing to have in the head for old-timers. But *only if* we
+ eventually get rid of the `ocamldoc` syntax.
+
+3. `.mld` files become (slightly special) `.md` files which means they
+ get a bit of rendering in various systems like Git{Lab,Hub} etc. As
+ per CommonMark specification, `ocamldoc` references will become
+ plain text in these renders, so we get "graceful degradation". We
+ can also lean on the CommonMark support of editors to edit them.
+
+4. One of the stated philosophy of CommonMark is that its formatting
+ should be publishable as-is without looking like it's been marked
+ up. This looks like a good fit for documentation in programming
+ language comments.
+
+
+## Disadvantages
+
+1. CommonMark never errors. If you get your markup wrong, you just get
+ text. You may not realize your outputs are garbled if you don't
+ check the renderings.
+
+2. If the `ocamldoc` language is kept. You now have two syntaxes for
+ writing documentation. Who can possibly think that syntax choice
+ is good ?
+
+3. The syntax for inline code clashes with the backtick of polymorphic
+ variants. If you want to inline a polymorphic variant case you need
+ to write, ``` `` `Myvariant `` ``` which is quite annoying to
+ remember, read and write.
+
+4. CommonMark has no syntax to break long URLs. Neither has `ocamldoc`
+ but I [hope] it can be added.
+
+ [hope]: https://github.com/ocaml/odoc/issues/865
+
+5. CommonMark is a rather complicated beast to parse full of weird
+ corner and pathological cases. This could make some tooling more
+ complicated than it needs to be (e.g. `ocp-indent` or `merlin` which
+ currenly completes in comments ? `odoc-parser` could mediate that
+ though).
+
+6. OCaml's doc comments becomes constrained by the CommonMark
+ specification and tied to what CommonMark can be. If `ocamlmark`
+ deviates too much, benefits are lost.
+
+
+## Personal opinions
+
+1. Disadvantage 2. should really be taken seriously, it basically thwarts
+ advantage 2. If both syntax are offered and officially supported then
+ as a user of the eco-system I have to a) choose one b) still learn the
+ other, since others will have made a different choice.
+
+ If `ocamlmark` is to be pursued, it should become the official
+ language, `ocamldoc` should be deprecated and we should devise
+ source-to-source comment migration tool so code bases can gradually
+ move to it effortlessly. Nothing insurmountable.
+
+2. I'm also a bit concerned by disadvantage 1. – personally I mostly read
+ the renderings when I devise docs but I suspect many people don't,
+ could be a problem for doc QA.
+
+3. To really assess advantage 4. I'd need more than the toy examples I
+ played with for making the POC. Does it really *look* good ?
+
+ One thing I missed is the easily scannable `ocamldoc` headers and
+ their labels. But maybe better syntax highlighting in comments
+ could fix that.
+
+ In general I find the `ocamldoc` code span syntax much more
+ readable:
+
+ * `` `'a` `` vs `['a]`
+ * `` `'a'` `` vs `['a']`
+ * `` `"string"` `` vs `["string"]`
+ * ``` `` `A `` ``` vs ``[`A]``
diff --git a/dune-project b/dune-project
index 6d8660be..6f26c3eb 100644
--- a/dune-project
+++ b/dune-project
@@ -1,6 +1,6 @@
(lang dune 2.8)
(name odoc-parser)
-(version 1.0.0)
+(version 2.0.0)
(generate_opam_files true)
@@ -21,5 +21,5 @@ understood by ocamldoc.")
astring
result
camlp-streams
+ cmarkit
(ppx_expect :with-test)))
-
diff --git a/odoc-parser.opam b/odoc-parser.opam
index 05b794a2..ec2eab23 100644
--- a/odoc-parser.opam
+++ b/odoc-parser.opam
@@ -1,6 +1,6 @@
# This file is generated by dune, edit dune-project instead
opam-version: "2.0"
-version: "1.0.0"
+version: "2.0.0"
synopsis: "Parser for ocaml documentation comments"
description: """
Odoc_parser is a library for parsing the contents of OCaml documentation
@@ -20,6 +20,7 @@ depends: [
"ocaml" {>= "4.02.0"}
"astring"
"result"
+ "cmarkit"
"camlp-streams"
"ppx_expect" {with-test}
("ocaml" {< "4.04.1" & with-test} | "sexplib0" {with-test})
@@ -37,4 +38,3 @@ build: [
"@runtest" {with-test}
]
]
-
diff --git a/odoc-parser.opam.template b/odoc-parser.opam.template
index 47ac9f5d..f8dd0985 100644
--- a/odoc-parser.opam.template
+++ b/odoc-parser.opam.template
@@ -6,6 +6,7 @@ depends: [
"ocaml" {>= "4.02.0"}
"astring"
"result"
+ "cmarkit"
"camlp-streams"
"ppx_expect" {with-test}
("ocaml" {< "4.04.1" & with-test} | "sexplib0" {with-test})
@@ -23,4 +24,3 @@ build: [
"@runtest" {with-test}
]
]
-
diff --git a/src/dune b/src/dune
index 36bd5bf2..2da3e56c 100644
--- a/src/dune
+++ b/src/dune
@@ -7,4 +7,4 @@
(backend bisect_ppx))
(flags
(:standard -w -50))
- (libraries astring result camlp-streams))
+ (libraries astring result cmarkit camlp-streams))
diff --git a/src/ocamlmark.ml b/src/ocamlmark.ml
new file mode 100644
index 00000000..27cf6935
--- /dev/null
+++ b/src/ocamlmark.ml
@@ -0,0 +1,647 @@
+
+let strf = Printf.sprintf
+
+(* Syntax sniffing *)
+
+type syntax = [ `Ocamldoc | `Ocamlmark ]
+
+let default_sniff =
+ (* We default to ocamldoc because while we do not treat @-tags
+ yet. Note that if there's no signal, e.g. pure text the results
+ should be the same. *)
+ `Ocamldoc
+
+let sniff_syntax ?(default = default_sniff) ~text () =
+ let ocamldoc_brace_next = function
+ | '0' .. '9' | 'b' | 'i' | 'e' | 'C' | 'L' | 'R' | 'o' | 'u' | '{'
+ | '!' | 'm' | '^' | '_' | '[' | 'v' | '%' -> true
+ | _ -> false
+ in
+ let is_blank = function ' ' | '\t' -> true | _ -> false in
+ let is_uppercase = function 'A' .. 'Z' -> true | _ -> false in
+ let ocamlmark_ref_next = function '!' -> true | _ -> false in
+ let ocamlmark_link_next = function '[' | '(' | ':' -> true | _ -> false in
+ let rec loop odoc cmark s max i =
+ if i >= max || Int.abs (odoc - cmark) > 3 then
+ if odoc = cmark then default else
+ if odoc > cmark then `Ocamldoc else `Ocamlmark
+ else
+ let prev = if i = 0 then 0 else i - 1 in
+ let next = i + 1 (* assert (next <= max) *) in
+ let odoc, cmark = match s.[i] with
+ | '{' when ocamldoc_brace_next s.[next] -> (odoc + 1), cmark
+ | '[' when ocamlmark_ref_next s.[next] -> odoc, (cmark + 1)
+ | '[' -> (* ocamldoc code spans *) odoc + 1, cmark
+ | ']' when ocamlmark_link_next s.[next] -> (odoc - 1), (cmark + 1)
+ | '`' when not (is_uppercase s.[next]) -> odoc, (cmark + 1)
+ | '#' when prev = 0 || is_blank s.[prev] -> odoc, (cmark + 1)
+ | _ -> odoc, cmark
+ in
+ loop odoc cmark s max (i + 1)
+ in
+ loop 0 0 text (String.length text - 1) 0
+
+(* ocamlmark parsing *)
+
+open Cmarkit
+
+(* Text location and comment massaging.
+
+ One slight annoyance is that CommonMark is sensitive to leading
+ blanks on lines and ocamldoc comments are usually indented by [n]
+ spaces up the … of (** … *). So we can't just feed it the comment
+ text: we would mostly get CommonMark indented code blocks.
+
+ So we massage the comment to trim up to [n] initial spaces after
+ newlines. [n] being the number of columns until … in (** … *). We
+ need to remember how much we trimmed on each line in order to patch
+ the locations reported by cmarkit. Below we keep pass that info
+ around using the [~locator] argument. *)
+
+let comment_col ~location = location.Lexing.pos_cnum - location.Lexing.pos_bol
+
+let massage_comment ~location b s =
+ let rec next_non_space s ~max i =
+ if i > max || not (s.[i] = ' ') then i else next_non_space s ~max (i + 1)
+ in
+ let rec find_after_trim ~max_trim s max ~start i =
+ if i - start + 1 > max_trim || i > max || s.[i] <> ' ' then i else
+ find_after_trim ~max_trim s max ~start (i + 1)
+ in
+ let flush b s start last = Buffer.add_substring b s start (last - start + 1)in
+ let rec loop b s acc ~max_trim max start k =
+ if k > max then
+ (flush b s start max;
+ (location, Array.of_list (List.rev acc)), Buffer.contents b) else
+ if not (s.[k] = '\n' || s.[k] = '\r')
+ then loop b s acc ~max_trim max start (k + 1) else
+ let next = k + 1 in
+ let next =
+ if s.[k] = '\r' && next <= max && s.[next] = '\n' then next + 1 else
+ next
+ in
+ let after_trim = find_after_trim ~max_trim s max ~start:next next in
+ let trim = after_trim - next in
+ flush b s start (next - 1);
+ loop b s (trim :: acc) ~max_trim max after_trim after_trim
+ in
+ if s = "" then (location, [|0|]), s else
+ let max = String.length s - 1 in
+ let nsp = next_non_space s ~max 0 in
+ let max_trim = comment_col ~location + nsp in
+ loop b s [nsp (* trim *)] ~max_trim max nsp nsp
+
+let textloc_to_loc ~locator textloc =
+ (* Note: if you get an [Invalid_argument] from this function suspect a bug
+ in cmarkit's location computation. *)
+ let point_of_line_and_byte_pos ~locator:(location, line_trim_counts) l pos =
+ let line_num, line_pos = l in
+ let line = location.Lexing.pos_lnum + line_num - 1 in
+ let column = line_trim_counts.(line_num - 1) + (pos - line_pos) in
+ let column = match line_num with
+ | 1 -> comment_col ~location + column
+ | _ -> column
+ in
+ { Loc.line; column }
+ in
+ let file = Textloc.file textloc in
+ let first_line = Textloc.first_line textloc in
+ let first_byte = Textloc.first_byte textloc in
+ let last_line = Textloc.last_line textloc in
+ let last_byte = Textloc.last_byte textloc + 1 in
+ let start = point_of_line_and_byte_pos ~locator first_line first_byte in
+ let end_ = point_of_line_and_byte_pos ~locator last_line last_byte in
+ { Loc.file; start; end_ }
+
+let meta_to_loc ~locator meta = textloc_to_loc ~locator (Meta.textloc meta)
+
+(* Sometimes we need to munge a bit the cmarkit metas and textlocs.
+ These function do that. They are not general and make assumptions
+ about the nature of data they apply to. E.g. most assume the
+ textloc is on the same line. *)
+
+let chop_end_of_meta_textloc ~count meta =
+ let textloc = Meta.textloc meta in
+ let last_line = Textloc.last_line textloc in
+ let last_byte = Textloc.last_byte textloc - count in
+ let textloc = Textloc.set_last textloc ~last_byte ~last_line in
+ Meta.with_textloc ~keep_id:true meta textloc
+
+let split_info_string_locs ~left_count ~right_count m =
+ if right_count = 0 then (Meta.textloc m, Textloc.none) else
+ let textloc = Meta.textloc m in
+ let line = Textloc.first_line textloc in
+ let last_byte = Textloc.first_byte textloc + left_count - 1 in
+ let first_byte = Textloc.last_byte textloc - right_count + 1 in
+ Textloc.set_last textloc ~last_byte ~last_line:line,
+ Textloc.set_first textloc ~first_byte ~first_line:line
+
+let textloc_of_sub textloc ~first ~last (* in textloc relative space *) =
+ let file = Textloc.file textloc in
+ let line = Textloc.first_line textloc in
+ let first_byte = Textloc.first_byte textloc + first in
+ let last_byte = Textloc.first_byte textloc + last in
+ Textloc.v ~file ~first_byte ~last_byte ~first_line:line ~last_line:line
+
+(* Warnings *)
+
+let warn_unsupported_hard_break =
+ "Hard breaks are unsupported in ocamlmark, using a soft break."
+
+let warn_unsupported_header_nesting =
+ "Headers in list items are unsupported in ocamlmark, dropped."
+
+let warn_heading_level_6 =
+ "Heading level 6 is unsupported in ocamlmark, using 5."
+
+let warn_unsupported_list_start_number start =
+ strf "List start numbers are unsupported in ocamlmark, replacing %d with 1."
+ start
+
+let warn_unsupported_cmark kind =
+ strf "%s are unsupported in ocamlmark, dropped." kind
+
+let warn_unsupported_indexlist =
+ "[!indexlist] is unsupported in ocamlmark, dropped."
+
+let warn_not_in_own_paragraph directive =
+ strf "%s must be alone in its own paragraph, dropped." directive
+
+let warn_unsupported_link_title =
+ "Link titles are unsupported in ocamlmark, dropped."
+
+let warn_illegal_label_def l =
+ strf "Label %S: label definitions can't start with ! in ocamlmark" l
+
+let warn_illegal_image_ref l =
+ strf "Image cannot reference OCaml reference %S." l
+
+let warn ~loc:location message warns = { Warning.location; message } :: warns
+
+let warn_unsupported_cmark ~locator kind meta (acc, warns) =
+ let msg = warn_unsupported_cmark kind in
+ (acc, warn ~loc:(meta_to_loc ~locator meta) msg warns)
+
+let warn_unsupported_header_nesting ~locator meta (acc, warns) =
+ let msg = warn_unsupported_header_nesting in
+ (acc, warn ~loc:(meta_to_loc ~locator meta) msg warns)
+
+(* This handle ocamlmark's implementation of ocamldoc's reference
+ syntax during CommonMark parsing. We forbid link reference
+ definitions whose label start with a ! character and on inline
+ reference links resolve these labels to synthetic label definitions
+ that we process specially during the AST translation. *)
+
+type ocamldoc_reference =
+[ `Reference of string Loc.with_location
+| `Modules of string Loc.with_location list
+| `Indexlist ]
+
+let ocamldoc_reference : ocamldoc_reference Meta.key = Meta.key ()
+let make_ocamldoc_reference label ref =
+ (* Synthetic label definition with the ocaml reference *)
+ let meta = Meta.add ocamldoc_reference ref (Label.meta label) in
+ Label.with_meta meta label
+
+let label_is_ocamldoc_reference l = String.starts_with ~prefix:"!" (Label.key l)
+let indexlist_directive = "!indexlist"
+let modules_directive = "!modules:"
+
+let is_blank = function ' ' | '\t' -> true | _ -> false
+let rec next_blank s ~max i =
+ if i > max || is_blank s.[i] then i else next_blank s ~max (i + 1)
+
+let rec next_nonblank s ~max i =
+ if i > max || not (is_blank s.[i]) then i else next_nonblank s ~max (i + 1)
+
+let parse_ocamldoc_modules_directive ~locator l =
+ let next_line = function [] -> None | l :: ls -> Some (ls, l) in
+ let flush_tok s meta acc first last =
+ let textloc = textloc_of_sub (Meta.textloc meta) ~first ~last in
+ let loc = textloc_to_loc ~locator textloc in
+ Loc.at loc (String.sub s first (last - first + 1)) :: acc
+ in
+ let rec parse_toks lines s meta acc max start =
+ let nb = next_nonblank s ~max start in
+ if nb > max then match next_line lines with
+ | None -> Some (`Modules (List.rev acc))
+ | Some (ls, (_, (s, m))) -> parse_toks ls s m acc (String.length s - 1) 0
+ else
+ let bl = next_blank s ~max nb in
+ let acc = flush_tok s meta acc nb (bl - 1) in
+ parse_toks lines s meta acc max bl
+ in
+ match next_line (Label.text l) with
+ | None -> None
+ | Some (lines, (_, (s, m))) ->
+ match String.index_opt s ':' with
+ | None -> None
+ | Some colon -> parse_toks lines s m [] (String.length s - 1) (colon + 1)
+
+let try_parse_ocamldoc_reference ~locator l =
+ match Label.key l (* we match on the normalized label *) with
+ | r when String.equal r indexlist_directive -> Some `Indexlist
+ | r when String.starts_with ~prefix:modules_directive r ->
+ parse_ocamldoc_modules_directive ~locator l
+ | r when String.starts_with ~prefix:"!" r ->
+ let loc = textloc_to_loc ~locator (Meta.textloc (Label.meta l)) in
+ let text = Label.text_to_string l (* the unormalized text *) in
+ let ref = String.sub text 1 (String.length text - 1) in
+ Some (`Reference (Loc.at loc ref))
+ | _ -> None
+
+let define_label ~locator warns label = (* Called on link reference defs *)
+ if not (label_is_ocamldoc_reference label) then Some label else
+ let loc = textloc_to_loc ~locator (Meta.textloc (Label.meta label)) in
+ let label = Label.text_to_string label in
+ warns := warn ~loc (warn_illegal_label_def label) !warns;
+ None
+
+let link_label_ref ~locator _warns label = (* Called on reference links *)
+ match try_parse_ocamldoc_reference ~locator label with
+ | None -> None
+ | Some ref -> Some (make_ocamldoc_reference label ref)
+
+let image_label_ref ~locator warns label = (* Called on images *)
+ if not (label_is_ocamldoc_reference label) then None else
+ let loc = textloc_to_loc ~locator (Meta.textloc (Label.meta label)) in
+ let label = Label.text_to_string label in
+ warns := warn ~loc (warn_illegal_image_ref label) !warns;
+ None
+
+let ocamldoc_reference_resolver ~locator warns = function
+| `Def (Some _, _) -> None (* XXX we could warn on multiple def here *)
+| `Def (None, k) -> define_label ~locator warns k
+| `Ref (_, _, (Some _ as k)) -> k
+| `Ref (`Link, ref, None) -> link_label_ref ~locator warns ref
+| `Ref (`Image, ref, None) -> image_label_ref ~locator warns ref
+
+(* Translating blocks and inlines. *)
+
+(* A few type definitions for better variant typing. *)
+
+type inlines_acc = Ast.inline_element Ast.with_location list * Warning.t list
+type ast_acc = Ast.t * Warning.t list
+type nestable_ast_acc =
+ Ast.nestable_block_element Ast.with_location list * Warning.t list
+
+(* Inline translations *)
+
+let link_definition defs l = match Inline.Link.reference_definition defs l with
+| Some (Link_definition.Def (ld, _)) -> ld
+| Some _ -> assert false (* if we parse without cmarkit extensions *)
+| None -> assert false (* assert [l]'s referenced label is not synthetic *)
+
+let autolink_to_inline_element ~locator a m (is, warns) =
+ let loc = meta_to_loc ~locator m in
+ let link, link_loc = Inline.Autolink.link a in
+ let link_loc = meta_to_loc ~locator link_loc in
+ let text = [Loc.at link_loc (`Word link)] in
+ Loc.at loc (`Link (link, text)) :: is, warns
+
+let break_to_inline_element ~locator br m (is, warns) =
+ let loc = meta_to_loc ~locator m in
+ let warns = match Inline.Break.type' br with
+ | `Soft -> warns
+ | `Hard -> warn ~loc warn_unsupported_hard_break warns
+ in
+ Loc.at loc (`Space "\n") :: is, warns
+
+let code_span_to_inline_element ~locator cs m (is, warns) =
+ let loc = meta_to_loc ~locator m in
+ let code = Inline.Code_span.code cs in
+ Loc.at loc (`Code_span code) :: is, warns
+
+let raw_html_to_inline_element ~locator html m (is, warns) =
+ let loc = meta_to_loc ~locator m in
+ let html = String.concat "\n" (List.map Block_line.tight_to_string html) in
+ Loc.at loc (`Raw_markup (Some "html", html)) :: is, warns
+
+let image_to_inline_element ~locator defs i m (is, warns) =
+ (* We map to raw html, ocamldoc's ast should have a case for that. *)
+ let escape esc b s = Buffer.clear b; esc b s; Buffer.contents b in
+ let pct_esc = escape Cmarkit_html.buffer_add_pct_encoded_string in
+ let html_esc = escape Cmarkit_html.buffer_add_html_escaped_string in
+ let loc = meta_to_loc ~locator m in
+ let b = Buffer.create 255 in
+ let ld = link_definition defs i in
+ let link = match Link_definition.dest ld with
+ | None -> "" | Some (link, _) -> pct_esc b link
+ in
+ let title = match Link_definition.title ld with
+ | None -> ""
+ | Some title ->
+ let title = List.map Block_line.tight_to_string title in
+ html_esc b (String.concat "\n" title)
+ in
+ let alt =
+ let ls = Inline.to_plain_text ~break_on_soft:false (Inline.Link.text i) in
+ html_esc b (String.concat "\n" (List.map (String.concat "") ls))
+ in
+ let img =
+ String.concat ""
+ [{|
"|}]
+ in
+ Loc.at loc (`Raw_markup (Some "html", img)) :: is, warns
+
+let text_to_inline_elements ~locator s meta (is, warns as acc) =
+ (* [s] is on a single source line (but may have newlines because of
+ character references) we need to tokenize it for ocamldoc's ast. *)
+ let flush_tok s meta acc is_space first last =
+ let textloc = textloc_of_sub (Meta.textloc meta) ~first ~last in
+ let loc = textloc_to_loc ~locator textloc in
+ let s = String.sub s first (last - first + 1) in
+ Loc.at loc (if is_space then `Space s else `Word s) :: acc
+ in
+ let rec tokenize s meta acc max start is_space =
+ if start > max then (List.rev_append acc is, warns) else
+ let next_start =
+ if is_space then next_nonblank s ~max start else next_blank s ~max start
+ in
+ let acc = flush_tok s meta acc is_space start (next_start - 1) in
+ tokenize s meta acc max next_start (not is_space)
+ in
+ let max = String.length s - 1 in
+ if max < 0 then acc else tokenize s meta [] max 0 (is_blank s.[0])
+
+let rec ocamldoc_reference_to_inline_element ~locator defs l m ref (is, warns) =
+ let loc = meta_to_loc ~locator m in
+ let kind, text, warns = match Inline.Link.reference l with
+ | `Ref ((`Collapsed | `Shortcut), _, _) -> `Simple, [], warns
+ | `Ref (`Full, _, _) ->
+ let i = Inline.Link.text l in
+ let text, warns = inline_to_inline_elements ~locator defs ([], warns) i in
+ `With_text, text, warns
+ | `Inline _ -> assert false
+ in
+ Loc.at loc (`Reference (kind, ref, text)) :: is, warns
+
+and link_reference_to_inline_element ~locator defs l m (is, warns) =
+ let loc = meta_to_loc ~locator m in
+ let ld = link_definition defs l in
+ let link = match Link_definition.dest ld with None -> "" | Some (l,_) -> l in
+ let warns = match Link_definition.title ld with
+ | None -> warns
+ | Some title ->
+ let textloc = Block_line.tight_list_textloc title in
+ let loc = textloc_to_loc ~locator textloc in
+ warn ~loc warn_unsupported_link_title warns
+ in
+ let text, warns =
+ inline_to_inline_elements ~locator defs ([], warns) (Inline.Link.text l)
+ in
+ Loc.at loc (`Link (link, text)) :: is, warns
+
+and link_to_inline_element ~locator defs l m (is, warns as acc) =
+ match Inline.Link.reference l with
+ | `Inline _ -> link_reference_to_inline_element ~locator defs l m acc
+ | `Ref (_, _, def) ->
+ match Meta.find ocamldoc_reference (Label.meta def) with
+ | None -> link_reference_to_inline_element ~locator defs l m acc
+ | Some `Reference ref ->
+ ocamldoc_reference_to_inline_element ~locator defs l m ref acc
+ | Some `Indexlist ->
+ let w = warn_not_in_own_paragraph "[!indexlist]" in
+ is, warn ~loc:(meta_to_loc ~locator m) w warns
+ | Some `Modules _ ->
+ let w = warn_not_in_own_paragraph "[!modules …]" in
+ is, warn ~loc:(meta_to_loc ~locator m) w warns
+
+and emphasis_to_inline_element ~locator defs style e m (is, warns) =
+ let loc = meta_to_loc ~locator m in
+ let i = Inline.Emphasis.inline e in
+ let inlines, warns = inline_to_inline_elements ~locator defs ([], warns) i in
+ Loc.at loc (`Styled (style, inlines)) :: is, warns
+
+and inline_to_inline_elements ~locator defs acc i : inlines_acc = match i with
+| Inline.Autolink (a, m) ->
+ autolink_to_inline_element ~locator a m acc
+| Inline.Break (b, m) ->
+ break_to_inline_element ~locator b m acc
+| Inline.Code_span (cs, m) ->
+ code_span_to_inline_element ~locator cs m acc
+| Inline.Emphasis (e, m) ->
+ emphasis_to_inline_element ~locator defs `Emphasis e m acc
+| Inline.Image (i, m) ->
+ image_to_inline_element ~locator defs i m acc
+| Inline.Inlines (is, _m) ->
+ let inline = inline_to_inline_elements ~locator defs in
+ List.fold_left inline acc (List.rev is)
+| Inline.Link (l, m) ->
+ link_to_inline_element ~locator defs l m acc
+| Inline.Raw_html (html, m) ->
+ raw_html_to_inline_element ~locator html m acc
+| Inline.Strong_emphasis (e, m) ->
+ emphasis_to_inline_element ~locator defs `Bold e m acc
+| Inline.Text (t, m) ->
+ text_to_inline_elements ~locator t m acc
+| _ -> assert false
+
+(* Heading label support - CommonMark extension. Parses a potential
+ final {#id} in heading inlines. In [id] braces must be escaped
+ otherwise parsing fails; if the rightmost brace is escaped it's
+ not a heading label. The parse runs from right to left *)
+
+let parse_heading_label s =
+ let rec loop s max prev i =
+ if i < 0 then None else match s.[i] with
+ | '{' as c ->
+ if i > 0 && s.[i - 1] = '\\' then loop s max c (i - 1) else
+ if prev = '#' then Some i else None
+ | '}' as c ->
+ if i > 0 && s.[i - 1] = '\\' then loop s max c (i - 1) else None
+ | c -> loop s max c (i - 1)
+ in
+ let max = String.length s - 1 in
+ let last = (* [last] is rightmost non blank, if any. *)
+ let k = ref max in
+ while not (!k < 0) && is_blank s.[!k] do decr k done; !k
+ in
+ if last < 1 || s.[last] <> '}' || s.[last - 1] = '\\' then None else
+ match loop s max s.[last] (last - 1) with
+ | None -> None
+ | Some first ->
+ let chop = max - first + 1 in
+ let text = String.sub s 0 first in
+ let first = first + 2 and last = last - 1 in (* remove delims *)
+ let label = String.sub s first (last - first + 1) in
+ Some (text, chop, label)
+
+let heading_inline_and_label h =
+ (* cmarkit claims it's already normalized but let's be defensive :-) *)
+ match Inline.normalize (Block.Heading.inline h) with
+ | Inline.Text (t, m) as inline ->
+ begin match parse_heading_label t with
+ | None -> inline, None
+ | Some (t, chop, label) ->
+ let m = chop_end_of_meta_textloc ~count:chop m in
+ Inline.Text (t, m), Some label
+ end
+ | Inline.Inlines (is, m0) as inline ->
+ begin match List.rev is with
+ | Inline.Text (t, m1) :: ris ->
+ begin match parse_heading_label t with
+ | None -> inline, None
+ | Some (t, chop, label) ->
+ let m0 = chop_end_of_meta_textloc ~count:chop m0 in
+ let m1 = chop_end_of_meta_textloc ~count:chop m1 in
+ Inline.Inlines (List.rev (Inline.Text (t, m1) :: ris), m0),
+ Some label
+ end
+ | _ -> inline, None
+ end
+ | inline -> inline, None
+
+(* Block translations *)
+
+let raw_paragraph ~loc ~raw_loc backend raw =
+ Loc.at loc (`Paragraph [Loc.at raw_loc (`Raw_markup (Some backend, raw))])
+
+let code_block_to_nestable_block_element ~locator cb m (bs, warns) =
+ let loc = meta_to_loc ~locator m in
+ let code = Block.Code_block.code cb in
+ let code_loc = textloc_to_loc ~locator (Block_line.list_textloc code) in
+ let code = String.concat "\n" (List.map Block_line.to_string code) in
+ match Block.Code_block.info_string cb with
+ | None -> Loc.at loc (`Code_block (None, Loc.at code_loc code)) :: bs, warns
+ | Some (info, im) ->
+ match Block.Code_block.language_of_info_string info with
+ | None ->
+ Loc.at loc (`Code_block (None, Loc.at code_loc code)) :: bs, warns
+ | Some ("verb", _) ->
+ Loc.at loc (`Verbatim code) :: bs, warns
+ | Some ("=html", _) ->
+ (raw_paragraph ~loc ~raw_loc:code_loc "html" code) :: bs, warns
+ | Some ("=latex", _) ->
+ (raw_paragraph ~loc ~raw_loc:code_loc "latex" code) :: bs, warns
+ | Some ("=texi", _) ->
+ (raw_paragraph ~loc ~raw_loc:code_loc "texi" code) :: bs, warns
+ | Some ("=man", _) ->
+ (raw_paragraph ~loc ~raw_loc:code_loc "man" code) :: bs, warns
+ | Some (lang, env) ->
+ let left_count = String.length lang in
+ let right_count = String.length env in
+ let lang_loc, env_loc =
+ split_info_string_locs ~left_count ~right_count im
+ in
+ let env =
+ if env = "" then None else
+ Some (Loc.at (textloc_to_loc ~locator env_loc) env)
+ in
+ let lang = Loc.at (textloc_to_loc ~locator lang_loc) lang in
+ let metadata = Some (lang, env) in
+ Loc.at loc (`Code_block (metadata, Loc.at code_loc code)) :: bs,
+ warns
+
+let html_block_to_nestable_block_element ~locator html m (bs, warns) =
+ let loc = meta_to_loc ~locator m in
+ let html = String.concat "\n" (List.map fst html) in
+ raw_paragraph ~loc ~raw_loc:loc "html" html :: bs, warns
+
+let heading_to_block_element ~locator defs h m (bs, warns) =
+ let loc = meta_to_loc ~locator m in
+ let level, warns = match Block.Heading.level h with
+ | 6 -> 5, warn ~loc warn_heading_level_6 warns
+ | level -> level, warns
+ in
+ let inline, label = heading_inline_and_label h in
+ let inlines, warns =
+ inline_to_inline_elements ~locator defs ([], warns) inline
+ in
+ Loc.at loc (`Heading (level, label, inlines)) :: bs, warns
+
+let try_ocamldoc_reference_directive ~locator i (bs, warns) = match i with
+| Inline.Link (l, meta) ->
+ begin match Inline.Link.reference l with
+ | `Inline _ -> None
+ | `Ref (_, _, def) ->
+ match Meta.find ocamldoc_reference (Label.meta def) with
+ | None | Some (`Reference _) -> None
+ | Some (`Modules _ as m) ->
+ let loc = meta_to_loc ~locator meta in
+ Some (Loc.at loc m :: bs, warns)
+ | Some `Indexlist ->
+ let loc = meta_to_loc ~locator meta in
+ Some (bs, warn ~loc warn_unsupported_indexlist warns)
+ end
+| _ -> None
+
+let paragraph_to_nestable_block_element ~locator defs p m (bs, warns as acc) =
+ (* TODO Parse inlines for @tags support. *)
+ let loc = meta_to_loc ~locator m in
+ let i = Block.Paragraph.inline p in
+ match try_ocamldoc_reference_directive ~locator i acc with
+ | Some acc -> acc
+ | None ->
+ let is, warns = inline_to_inline_elements ~locator defs ([], warns) i in
+ Loc.at loc (`Paragraph is) :: bs, warns
+
+let thematic_break_to_nestable_block_element ~locator m (bs, warns) =
+ let loc = meta_to_loc ~locator m in
+ (raw_paragraph ~loc ~raw_loc:loc "html" "
") :: bs, warns
+
+let rec list_to_nestable_block_element ~locator defs l m (bs, warns) =
+ let loc = meta_to_loc ~locator m in
+ let style = `Heavy (* Note this is a layout property of ocamldoc *) in
+ let kind, warns = match Block.List'.type' l with
+ | `Unordered _ -> `Unordered, warns
+ | `Ordered (start, _) ->
+ `Ordered, (if start = 1 then warns else
+ warn ~loc (warn_unsupported_list_start_number start) warns)
+ in
+ let add_item ~locator (acc, warns) (i, _meta) =
+ let b = Block.List_item.block i in
+ let bs, warns =
+ block_to_nestable_block_elements ~locator defs ([], warns) b
+ in
+ (bs :: acc, warns)
+ in
+ let ritems = List.rev (Block.List'.items l) in
+ let items, warns = List.fold_left (add_item ~locator) ([], warns) ritems in
+ Loc.at loc (`List (kind, style, items)) :: bs, warns
+
+and block_to_nestable_block_elements ~locator defs acc b : nestable_ast_acc =
+ match b with
+ | Block.Blocks (bs, _) ->
+ let block = block_to_nestable_block_elements ~locator defs in
+ List.fold_left block acc (List.rev bs)
+ | Block.Code_block (c, m) ->
+ code_block_to_nestable_block_element ~locator c m acc
+ | Block.Heading (_, m) ->
+ warn_unsupported_header_nesting ~locator m acc
+ | Block.Html_block (html, m) ->
+ html_block_to_nestable_block_element ~locator html m acc
+ | Block.List (l, m) ->
+ list_to_nestable_block_element ~locator defs l m acc
+ | Block.Paragraph (p, m) ->
+ paragraph_to_nestable_block_element ~locator defs p m acc
+ | Block.Block_quote (_, m) ->
+ warn_unsupported_cmark ~locator "Block quotes" m acc
+ | Block.Thematic_break (_, m) ->
+ thematic_break_to_nestable_block_element ~locator m acc
+ | Block.Blank_line _
+ | Block.Link_reference_definition _ -> (* layout cases *) acc
+ | _ -> assert false
+
+let rec block_to_ast ~locator defs acc b : ast_acc = match b with
+| Block.Heading (h, m) -> heading_to_block_element ~locator defs h m acc
+| Block.Blocks (bs, _) ->
+ List.fold_left (block_to_ast ~locator defs) acc (List.rev bs)
+| b ->
+ (* We can't go directy with acc because of nestable typing. *)
+ let bs, ws = acc in
+ let bs', ws = block_to_nestable_block_elements ~locator defs ([], ws) b in
+ List.rev_append (List.rev (bs' :> Ast.t)) bs, ws
+
+(* Parsing comments *)
+
+let parse_comment ?buffer:b ~location ~text:s () : Ast.t * Warning.t list =
+ let b = match b with
+ | None -> Buffer.create (String.length s)
+ | Some b -> Buffer.reset b; b
+ in
+ let locator, text = massage_comment ~location b s in
+ let warns = ref [] and file = location.Lexing.pos_fname in
+ let resolver = ocamldoc_reference_resolver ~locator warns in
+ let doc = Doc.of_string ~resolver ~file ~locs:true ~strict:true text in
+ block_to_ast ~locator (Doc.defs doc) ([], !warns) (Doc.block doc)
diff --git a/src/ocamlmark.mli b/src/ocamlmark.mli
new file mode 100644
index 00000000..9f75d018
--- /dev/null
+++ b/src/ocamlmark.mli
@@ -0,0 +1,18 @@
+(** [ocamlmark] support. *)
+
+(** {1:syntax Syntax sniffing} *)
+
+type syntax = [ `Ocamldoc | `Ocamlmark ]
+(** The type for document syntaxes. *)
+
+val sniff_syntax : ?default:syntax -> text:string -> unit -> syntax
+(** [sniff_syntax ~text] sniffes a syntax for [text], defaulting
+ to [default] if nothing smells (defaults to [`Ocamldoc]). *)
+
+(** {1:parsing ocamlmark parsing} *)
+
+val parse_comment :
+ ?buffer:Buffer.t -> location:Lexing.position -> text:string -> unit ->
+ Ast.t * Warning.t list
+(** [parse_comment ~location ~text] parses the ocamlmark [text] assuming it
+ corresponds to [location]. [buffer] is used as a scratch buffer. *)
diff --git a/src/odoc_parser.ml b/src/odoc_parser.ml
index 2d9fdd5d..e7992b0b 100644
--- a/src/odoc_parser.ml
+++ b/src/odoc_parser.ml
@@ -99,8 +99,7 @@ let position_of_point : t -> Loc.point -> Lexing.position =
let pos_fname = original_pos.pos_fname in
{ Lexing.pos_bol; pos_lnum; pos_cnum; pos_fname }
-(* The main entry point for this module *)
-let parse_comment ~location ~text =
+let ocamldoc_parse_comment ~location ~text =
let warnings = ref [] in
let reversed_newlines = reversed_newlines ~input:text in
let token_stream =
@@ -116,6 +115,14 @@ let parse_comment ~location ~text =
let ast, warnings = Syntax.parse warnings token_stream in
{ ast; warnings; reversed_newlines; original_pos = location }
+(* The main entry point for this module *)
+let parse_comment ~location ~text = match Ocamlmark.sniff_syntax ~text () with
+| `Ocamldoc -> ocamldoc_parse_comment ~location ~text
+| `Ocamlmark ->
+ let ast, warnings = Ocamlmark.parse_comment ~location ~text () in
+ let reversed_newlines = reversed_newlines ~input:text in
+ { ast; warnings; reversed_newlines; original_pos = location }
+
(* Accessor functions, as [t] is opaque *)
let warnings t = t.warnings
let ast t = t.ast
diff --git a/src/odoc_parser.mli b/src/odoc_parser.mli
index 5dbd4a08..c9c69a96 100644
--- a/src/odoc_parser.mli
+++ b/src/odoc_parser.mli
@@ -22,7 +22,7 @@ module Loc = Loc
(** Warnings produced during parsing. *)
module Warning : sig
type t = Warning.t = { location : Loc.span; message : string }
- (** Warnings are represented as record containing the human-readable text
+ (** Warnings are represented as record containing the human-readable text
of the warning alongside the location of the offending text in the source *)
val pp : Format.formatter -> t -> unit