diff --git a/doc/ocamlmark.md b/doc/ocamlmark.md new file mode 100644 index 00000000..9d470eab --- /dev/null +++ b/doc/ocamlmark.md @@ -0,0 +1,329 @@ +`ocamlmark` – An `ocamldoc` to CommonMark bi-directional translation +==================================================================== + +The `ocamlmark` language allows to write OCaml documentation strings +and `.mld` files in [CommonMark]. + +It is just the CommonMark language with the following tweaks: + +1. Link references `[label]` whose `label` starts with `!` cannot be + defined in documents and are used to stand for the `{!…}`, + `{!modules…}` and `{!indexlist}` references and directives + of `ocamldoc`. +2. An extension to specify heading reference labels to match the + `{[0-5]+:label text}` `ocamldoc` heading construct. We reuse + [Pandoc's heading identifiers][pandoc-heading-ids]'s syntax. +3. In paragraphs, a few more construct are detected to support + `ocamldoc`'s [`@-tags`][ocamldoc-@tag]. + +See the [full translation](#translation). + +## Short example + +The following show equivalent `ocamldoc` and `ocamlmark` text side-by-side: + +``` +{1:mapping Mapping list elements} | # Mapping list elements {#mapping} + | +Use function {!List.map} to map the | Use function [!List.map] to map the +elements of a list. Other | elements of a list. Other +{{!List.iterators}iterators} are | [iterators][!List.iterators] are +available. | available. +``` + +And this body of `ocamlmark` warns on the reference definition that it +is illegal, because its label starts with a `!`. + +``` +See my wonderful [website][!website] + +[!website]: https://example.org +``` + +[CommonMark]: https://spec.commonmark.org/current +[ocamldoc]: https://ocaml.org/manual/ocamldoc.html#ss:ocamldoc-syntax +[ocamldoc-ast]: https://github.com/ocaml-doc/odoc-parser/blob/main/src/ast.ml +[ocamldoc-@tag]: https://ocaml.org/manual/ocamldoc.html#ss:ocamldoc-tags +[pandoc-heading-ids]: https://pandoc.org/MANUAL.html#heading-identifiers + +## Translation + +The `ocamldoc` language is defined [here][ocamldoc] +([AST][ocamldoc-ast]). The `CommonMark` language is defined +[here][commonmark]. + + + The ←, ↔ and → arrows indicate translation directions. +```` +ocamldoc ocamlmark +-------------------------------------------------------- +{[0-5] text} ↔ #… text +{[0-5]:label text} ↔ #… text {#label} +{b text} ↔ **text** +{i text} ↔ *text* +{e text} ↔ *text* +{C text} ↔ text +{L text} ↔ text +{R text} ↔ text +{ol {- }… ↔ 1. … +{ul {- }… ↔ * … +{{:url} text} ↔ [text](url) +{{!ref} text} ↔ [text][!ref] +[string] ↔ `string` (string not starting with `) +[`string] ↔ `` `string `` +{!ref} ↔ [!ref] +{!modules m…} ↔ [!modules m…] (in its own paragraph) +{!indexlist} ↔ [!indexlist] (in its own paragraph) +{^ text} → text +{%html: …%} ← text +{_ text} → text +{%html: …%} ← text +{[ string ]} ↔ ``` + string + ``` + or indented code block +{@info[ string ]} ↔ ```info + string + ``` +{v string v} ↔ ```verb + string + ``` +{%latex: string %} ↔ ```=latex +{% string %} string + ``` +{%html: string %} ← HTML block or raw HTML or + ```=html + string + ``` +{%texi: string %} ← ```=texi + string + ``` +{%man: string %} ← ```=man + string + ``` +text + +{%html:
%} ← *** (thematic break) +??? ← > … (block quotes) + +@authors string ↔ @authors string +@deprecated text ↔ @deprected text +@param id text ↔ @param id text +@raise Exc text ↔ @raise string text +@return text ↔ @return text +@see url text ↔ @see url string +@since string ↔ @since string +@before string ↔ @before string +@version string ↔ @version string +```` + +### Documentation comment structure + +CommonMark is sensitive to leading spaces on lines and `ocamldoc` +comments are usually indented. + +`ocamlmark` defines a start column to define the first relevant +character of lines in documentation comments. The start column is set +to the first non-space character after the `**` marker (this means right +after it if a newline immediately follows). In this example, the first +character of the line is the `c` character: + + (** c + This is an indented code block. + And this is not *) + +On every line of the comment, leading spaces are trimmed up to the +start column. If there are less spaces than that, the line starts on the +first non space character as if it was indented to the start column. + +These three comments are equivalent: + +``` + (** This is multi-line + text *) + + (**This is multi-line + text *) + + (** This is multi-line + text *) +``` + +## Notes and unresolved questions + +Here are a few notes and questions on the above translation and the +[current implementation] in `odoc-parser` that translates CommonMark to +the `ocamldoc` abstract syntax tree. + + 0. Comment structure. We should likely refine the strategy for fenced code + blocks, so that we can have them flush left to benefit from 80 columns. + For now if you do that you lose indentation up to the start column. + + 1. Tags are a litte bit unclear. `odoc` changed their semantics: they + are no longer allowed inline. We could simply define them as + paragraphs and the first inline defines the tag. Personally I + would rather be in favor of deprecating most them in `ocamlmark` + and let them be OCaml attributes for the ones that are useful. This + somehow already happened for `@deprecated` which became an + alert. `@version`, `@since` and `@before` would also be useful for + enriching the compiler alerting mecanisms and `@raise` would provide + information for static analysers. However there are new `odoc` + specific hint tags that are needed (e.g. `@canonical`, `@open`, + `@inline`) and don't make sense as attributes. + + 2. The image link reference translation is not really + statisfactory. After all these years it's time for `odoc` to + really solve the [page assets problem][page-assets]. + + 3. The above table does not include the latest `ocamldoc` additions + (LATEX maths) and upcoming additions (tables). + `cmarkit` supports both though. + + 4. We need a way to handle the heading 0 level trick of `.mld` + files. It's not natural to use, for toplevel sections, level 2 in + `.mld` files and level 1 in modules. The simplest would be for + `.mld` files to take a first level 1 heading as defining the + heading 0. But for that we need to know we are parsing a `.mld` + files, the current `odoc-parser` API does not tell us that. + + 5. At the moment `odoc` does not support `ocamldoc`'s [raw HTML + tags][ocamldoc-html], [no decision][odoc-html] has been taken yet + but it wouldn't be hard to translate it in either direction. + + 6. CommonMark is less restricted than `ocamldoc`. It can have headings + in list items. When this occurs we warn and drop the construct. + + 7. What should we do with thematic breaks ? For now we translated + them to raw HTML. + + 8. Should we treat the concatenation of comments as a document ? + This only influences link reference definition. Maybe we could + just accumulate link references as we go. This makes it slightly + at odds with CommonMark semantics were link definitions are + allowed to occur anywhere. + + But it would be nice to be able to reuse link reference definitions + from one comment to the other, especially given disadvantage 4 (see + below). Doing it in reverse (from bottom to top) and accumulating + refs would allow to have long URLs as link reference definitions at + the end of the doc. + + 9. For the `ocamldoc` to `ocamlmark` translation we don't have a + syntax for `{%backend:` *when these are used inlines*, except for + raw HTML if the raw HTML satisfies the complicated contraints of + CommonMark raw HTML. Could be a problem for that translation + direction if people used that. + +11. Ordered list can specify their starting point in `CommonMark` + but can't in `ocamldoc`, we issue a warning. + +12. There's no notion of hard breaks in `ocamldoc`, we issue a warning + to indicate that. + +13. For now the super and sub scripts will be translated by virtue of + raw HTML but this will only show up in the HTML backend. We could + try to be smarter on raw HTML and try to parse ``/`` + markup in inlines. + +14. For simplicity when we get a `[!modules: …]` directive in the + middle of a paragraph, we drop it. `ocamldoc` is smarter, it warns + and splits the paragraph. Could be done at the cost of less clear + control flow, not sure it's a big deal the way it works now, you + have been warned as they say. + +15. `{!indexlist}` is still [unsupported][indexlist] in `odoc` and likely + forever. For now we recognize it and warn about it. + +16. There's no notion of link title in `ocamldoc`, we issue a warning + and drop them. + +[ocamldoc-html]: https://ocaml.org/manual/ocamldoc.html#sss:ocamldoc-html-tags +[page-assets]: https://github.com/ocaml/odoc/issues/59 +[indexlist]: https://github.com/ocaml/odoc/issues/577 +[odoc-html]: https://github.com/ocaml/odoc/issues/576 +[current implementation]: https://github.com/dbuenzli/odoc-parser/tree/ocamlmark + +## Advantages + +1. CommonMark is found in a lot of other systems including + Git{Hub,Lab}, Discourse, etc. It is also increasingly being used + in API documentation tools, `rustdoc`, `Documenter.jl`, etc. + +2. Since it's everywhere, that's one less thing to learn for newcomers and + one less thing to have in the head for old-timers. But *only if* we + eventually get rid of the `ocamldoc` syntax. + +3. `.mld` files become (slightly special) `.md` files which means they + get a bit of rendering in various systems like Git{Lab,Hub} etc. As + per CommonMark specification, `ocamldoc` references will become + plain text in these renders, so we get "graceful degradation". We + can also lean on the CommonMark support of editors to edit them. + +4. One of the stated philosophy of CommonMark is that its formatting + should be publishable as-is without looking like it's been marked + up. This looks like a good fit for documentation in programming + language comments. + + +## Disadvantages + +1. CommonMark never errors. If you get your markup wrong, you just get + text. You may not realize your outputs are garbled if you don't + check the renderings. + +2. If the `ocamldoc` language is kept. You now have two syntaxes for + writing documentation. Who can possibly think that syntax choice + is good ? + +3. The syntax for inline code clashes with the backtick of polymorphic + variants. If you want to inline a polymorphic variant case you need + to write, ``` `` `Myvariant `` ``` which is quite annoying to + remember, read and write. + +4. CommonMark has no syntax to break long URLs. Neither has `ocamldoc` + but I [hope] it can be added. + + [hope]: https://github.com/ocaml/odoc/issues/865 + +5. CommonMark is a rather complicated beast to parse full of weird + corner and pathological cases. This could make some tooling more + complicated than it needs to be (e.g. `ocp-indent` or `merlin` which + currenly completes in comments ? `odoc-parser` could mediate that + though). + +6. OCaml's doc comments becomes constrained by the CommonMark + specification and tied to what CommonMark can be. If `ocamlmark` + deviates too much, benefits are lost. + + +## Personal opinions + +1. Disadvantage 2. should really be taken seriously, it basically thwarts + advantage 2. If both syntax are offered and officially supported then + as a user of the eco-system I have to a) choose one b) still learn the + other, since others will have made a different choice. + + If `ocamlmark` is to be pursued, it should become the official + language, `ocamldoc` should be deprecated and we should devise + source-to-source comment migration tool so code bases can gradually + move to it effortlessly. Nothing insurmountable. + +2. I'm also a bit concerned by disadvantage 1. – personally I mostly read + the renderings when I devise docs but I suspect many people don't, + could be a problem for doc QA. + +3. To really assess advantage 4. I'd need more than the toy examples I + played with for making the POC. Does it really *look* good ? + + One thing I missed is the easily scannable `ocamldoc` headers and + their labels. But maybe better syntax highlighting in comments + could fix that. + + In general I find the `ocamldoc` code span syntax much more + readable: + + * `` `'a` `` vs `['a]` + * `` `'a'` `` vs `['a']` + * `` `"string"` `` vs `["string"]` + * ``` `` `A `` ``` vs ``[`A]`` diff --git a/dune-project b/dune-project index 6d8660be..6f26c3eb 100644 --- a/dune-project +++ b/dune-project @@ -1,6 +1,6 @@ (lang dune 2.8) (name odoc-parser) -(version 1.0.0) +(version 2.0.0) (generate_opam_files true) @@ -21,5 +21,5 @@ understood by ocamldoc.") astring result camlp-streams + cmarkit (ppx_expect :with-test))) - diff --git a/odoc-parser.opam b/odoc-parser.opam index 05b794a2..ec2eab23 100644 --- a/odoc-parser.opam +++ b/odoc-parser.opam @@ -1,6 +1,6 @@ # This file is generated by dune, edit dune-project instead opam-version: "2.0" -version: "1.0.0" +version: "2.0.0" synopsis: "Parser for ocaml documentation comments" description: """ Odoc_parser is a library for parsing the contents of OCaml documentation @@ -20,6 +20,7 @@ depends: [ "ocaml" {>= "4.02.0"} "astring" "result" + "cmarkit" "camlp-streams" "ppx_expect" {with-test} ("ocaml" {< "4.04.1" & with-test} | "sexplib0" {with-test}) @@ -37,4 +38,3 @@ build: [ "@runtest" {with-test} ] ] - diff --git a/odoc-parser.opam.template b/odoc-parser.opam.template index 47ac9f5d..f8dd0985 100644 --- a/odoc-parser.opam.template +++ b/odoc-parser.opam.template @@ -6,6 +6,7 @@ depends: [ "ocaml" {>= "4.02.0"} "astring" "result" + "cmarkit" "camlp-streams" "ppx_expect" {with-test} ("ocaml" {< "4.04.1" & with-test} | "sexplib0" {with-test}) @@ -23,4 +24,3 @@ build: [ "@runtest" {with-test} ] ] - diff --git a/src/dune b/src/dune index 36bd5bf2..2da3e56c 100644 --- a/src/dune +++ b/src/dune @@ -7,4 +7,4 @@ (backend bisect_ppx)) (flags (:standard -w -50)) - (libraries astring result camlp-streams)) + (libraries astring result cmarkit camlp-streams)) diff --git a/src/ocamlmark.ml b/src/ocamlmark.ml new file mode 100644 index 00000000..27cf6935 --- /dev/null +++ b/src/ocamlmark.ml @@ -0,0 +1,647 @@ + +let strf = Printf.sprintf + +(* Syntax sniffing *) + +type syntax = [ `Ocamldoc | `Ocamlmark ] + +let default_sniff = + (* We default to ocamldoc because while we do not treat @-tags + yet. Note that if there's no signal, e.g. pure text the results + should be the same. *) + `Ocamldoc + +let sniff_syntax ?(default = default_sniff) ~text () = + let ocamldoc_brace_next = function + | '0' .. '9' | 'b' | 'i' | 'e' | 'C' | 'L' | 'R' | 'o' | 'u' | '{' + | '!' | 'm' | '^' | '_' | '[' | 'v' | '%' -> true + | _ -> false + in + let is_blank = function ' ' | '\t' -> true | _ -> false in + let is_uppercase = function 'A' .. 'Z' -> true | _ -> false in + let ocamlmark_ref_next = function '!' -> true | _ -> false in + let ocamlmark_link_next = function '[' | '(' | ':' -> true | _ -> false in + let rec loop odoc cmark s max i = + if i >= max || Int.abs (odoc - cmark) > 3 then + if odoc = cmark then default else + if odoc > cmark then `Ocamldoc else `Ocamlmark + else + let prev = if i = 0 then 0 else i - 1 in + let next = i + 1 (* assert (next <= max) *) in + let odoc, cmark = match s.[i] with + | '{' when ocamldoc_brace_next s.[next] -> (odoc + 1), cmark + | '[' when ocamlmark_ref_next s.[next] -> odoc, (cmark + 1) + | '[' -> (* ocamldoc code spans *) odoc + 1, cmark + | ']' when ocamlmark_link_next s.[next] -> (odoc - 1), (cmark + 1) + | '`' when not (is_uppercase s.[next]) -> odoc, (cmark + 1) + | '#' when prev = 0 || is_blank s.[prev] -> odoc, (cmark + 1) + | _ -> odoc, cmark + in + loop odoc cmark s max (i + 1) + in + loop 0 0 text (String.length text - 1) 0 + +(* ocamlmark parsing *) + +open Cmarkit + +(* Text location and comment massaging. + + One slight annoyance is that CommonMark is sensitive to leading + blanks on lines and ocamldoc comments are usually indented by [n] + spaces up the … of (** … *). So we can't just feed it the comment + text: we would mostly get CommonMark indented code blocks. + + So we massage the comment to trim up to [n] initial spaces after + newlines. [n] being the number of columns until … in (** … *). We + need to remember how much we trimmed on each line in order to patch + the locations reported by cmarkit. Below we keep pass that info + around using the [~locator] argument. *) + +let comment_col ~location = location.Lexing.pos_cnum - location.Lexing.pos_bol + +let massage_comment ~location b s = + let rec next_non_space s ~max i = + if i > max || not (s.[i] = ' ') then i else next_non_space s ~max (i + 1) + in + let rec find_after_trim ~max_trim s max ~start i = + if i - start + 1 > max_trim || i > max || s.[i] <> ' ' then i else + find_after_trim ~max_trim s max ~start (i + 1) + in + let flush b s start last = Buffer.add_substring b s start (last - start + 1)in + let rec loop b s acc ~max_trim max start k = + if k > max then + (flush b s start max; + (location, Array.of_list (List.rev acc)), Buffer.contents b) else + if not (s.[k] = '\n' || s.[k] = '\r') + then loop b s acc ~max_trim max start (k + 1) else + let next = k + 1 in + let next = + if s.[k] = '\r' && next <= max && s.[next] = '\n' then next + 1 else + next + in + let after_trim = find_after_trim ~max_trim s max ~start:next next in + let trim = after_trim - next in + flush b s start (next - 1); + loop b s (trim :: acc) ~max_trim max after_trim after_trim + in + if s = "" then (location, [|0|]), s else + let max = String.length s - 1 in + let nsp = next_non_space s ~max 0 in + let max_trim = comment_col ~location + nsp in + loop b s [nsp (* trim *)] ~max_trim max nsp nsp + +let textloc_to_loc ~locator textloc = + (* Note: if you get an [Invalid_argument] from this function suspect a bug + in cmarkit's location computation. *) + let point_of_line_and_byte_pos ~locator:(location, line_trim_counts) l pos = + let line_num, line_pos = l in + let line = location.Lexing.pos_lnum + line_num - 1 in + let column = line_trim_counts.(line_num - 1) + (pos - line_pos) in + let column = match line_num with + | 1 -> comment_col ~location + column + | _ -> column + in + { Loc.line; column } + in + let file = Textloc.file textloc in + let first_line = Textloc.first_line textloc in + let first_byte = Textloc.first_byte textloc in + let last_line = Textloc.last_line textloc in + let last_byte = Textloc.last_byte textloc + 1 in + let start = point_of_line_and_byte_pos ~locator first_line first_byte in + let end_ = point_of_line_and_byte_pos ~locator last_line last_byte in + { Loc.file; start; end_ } + +let meta_to_loc ~locator meta = textloc_to_loc ~locator (Meta.textloc meta) + +(* Sometimes we need to munge a bit the cmarkit metas and textlocs. + These function do that. They are not general and make assumptions + about the nature of data they apply to. E.g. most assume the + textloc is on the same line. *) + +let chop_end_of_meta_textloc ~count meta = + let textloc = Meta.textloc meta in + let last_line = Textloc.last_line textloc in + let last_byte = Textloc.last_byte textloc - count in + let textloc = Textloc.set_last textloc ~last_byte ~last_line in + Meta.with_textloc ~keep_id:true meta textloc + +let split_info_string_locs ~left_count ~right_count m = + if right_count = 0 then (Meta.textloc m, Textloc.none) else + let textloc = Meta.textloc m in + let line = Textloc.first_line textloc in + let last_byte = Textloc.first_byte textloc + left_count - 1 in + let first_byte = Textloc.last_byte textloc - right_count + 1 in + Textloc.set_last textloc ~last_byte ~last_line:line, + Textloc.set_first textloc ~first_byte ~first_line:line + +let textloc_of_sub textloc ~first ~last (* in textloc relative space *) = + let file = Textloc.file textloc in + let line = Textloc.first_line textloc in + let first_byte = Textloc.first_byte textloc + first in + let last_byte = Textloc.first_byte textloc + last in + Textloc.v ~file ~first_byte ~last_byte ~first_line:line ~last_line:line + +(* Warnings *) + +let warn_unsupported_hard_break = + "Hard breaks are unsupported in ocamlmark, using a soft break." + +let warn_unsupported_header_nesting = + "Headers in list items are unsupported in ocamlmark, dropped." + +let warn_heading_level_6 = + "Heading level 6 is unsupported in ocamlmark, using 5." + +let warn_unsupported_list_start_number start = + strf "List start numbers are unsupported in ocamlmark, replacing %d with 1." + start + +let warn_unsupported_cmark kind = + strf "%s are unsupported in ocamlmark, dropped." kind + +let warn_unsupported_indexlist = + "[!indexlist] is unsupported in ocamlmark, dropped." + +let warn_not_in_own_paragraph directive = + strf "%s must be alone in its own paragraph, dropped." directive + +let warn_unsupported_link_title = + "Link titles are unsupported in ocamlmark, dropped." + +let warn_illegal_label_def l = + strf "Label %S: label definitions can't start with ! in ocamlmark" l + +let warn_illegal_image_ref l = + strf "Image cannot reference OCaml reference %S." l + +let warn ~loc:location message warns = { Warning.location; message } :: warns + +let warn_unsupported_cmark ~locator kind meta (acc, warns) = + let msg = warn_unsupported_cmark kind in + (acc, warn ~loc:(meta_to_loc ~locator meta) msg warns) + +let warn_unsupported_header_nesting ~locator meta (acc, warns) = + let msg = warn_unsupported_header_nesting in + (acc, warn ~loc:(meta_to_loc ~locator meta) msg warns) + +(* This handle ocamlmark's implementation of ocamldoc's reference + syntax during CommonMark parsing. We forbid link reference + definitions whose label start with a ! character and on inline + reference links resolve these labels to synthetic label definitions + that we process specially during the AST translation. *) + +type ocamldoc_reference = +[ `Reference of string Loc.with_location +| `Modules of string Loc.with_location list +| `Indexlist ] + +let ocamldoc_reference : ocamldoc_reference Meta.key = Meta.key () +let make_ocamldoc_reference label ref = + (* Synthetic label definition with the ocaml reference *) + let meta = Meta.add ocamldoc_reference ref (Label.meta label) in + Label.with_meta meta label + +let label_is_ocamldoc_reference l = String.starts_with ~prefix:"!" (Label.key l) +let indexlist_directive = "!indexlist" +let modules_directive = "!modules:" + +let is_blank = function ' ' | '\t' -> true | _ -> false +let rec next_blank s ~max i = + if i > max || is_blank s.[i] then i else next_blank s ~max (i + 1) + +let rec next_nonblank s ~max i = + if i > max || not (is_blank s.[i]) then i else next_nonblank s ~max (i + 1) + +let parse_ocamldoc_modules_directive ~locator l = + let next_line = function [] -> None | l :: ls -> Some (ls, l) in + let flush_tok s meta acc first last = + let textloc = textloc_of_sub (Meta.textloc meta) ~first ~last in + let loc = textloc_to_loc ~locator textloc in + Loc.at loc (String.sub s first (last - first + 1)) :: acc + in + let rec parse_toks lines s meta acc max start = + let nb = next_nonblank s ~max start in + if nb > max then match next_line lines with + | None -> Some (`Modules (List.rev acc)) + | Some (ls, (_, (s, m))) -> parse_toks ls s m acc (String.length s - 1) 0 + else + let bl = next_blank s ~max nb in + let acc = flush_tok s meta acc nb (bl - 1) in + parse_toks lines s meta acc max bl + in + match next_line (Label.text l) with + | None -> None + | Some (lines, (_, (s, m))) -> + match String.index_opt s ':' with + | None -> None + | Some colon -> parse_toks lines s m [] (String.length s - 1) (colon + 1) + +let try_parse_ocamldoc_reference ~locator l = + match Label.key l (* we match on the normalized label *) with + | r when String.equal r indexlist_directive -> Some `Indexlist + | r when String.starts_with ~prefix:modules_directive r -> + parse_ocamldoc_modules_directive ~locator l + | r when String.starts_with ~prefix:"!" r -> + let loc = textloc_to_loc ~locator (Meta.textloc (Label.meta l)) in + let text = Label.text_to_string l (* the unormalized text *) in + let ref = String.sub text 1 (String.length text - 1) in + Some (`Reference (Loc.at loc ref)) + | _ -> None + +let define_label ~locator warns label = (* Called on link reference defs *) + if not (label_is_ocamldoc_reference label) then Some label else + let loc = textloc_to_loc ~locator (Meta.textloc (Label.meta label)) in + let label = Label.text_to_string label in + warns := warn ~loc (warn_illegal_label_def label) !warns; + None + +let link_label_ref ~locator _warns label = (* Called on reference links *) + match try_parse_ocamldoc_reference ~locator label with + | None -> None + | Some ref -> Some (make_ocamldoc_reference label ref) + +let image_label_ref ~locator warns label = (* Called on images *) + if not (label_is_ocamldoc_reference label) then None else + let loc = textloc_to_loc ~locator (Meta.textloc (Label.meta label)) in + let label = Label.text_to_string label in + warns := warn ~loc (warn_illegal_image_ref label) !warns; + None + +let ocamldoc_reference_resolver ~locator warns = function +| `Def (Some _, _) -> None (* XXX we could warn on multiple def here *) +| `Def (None, k) -> define_label ~locator warns k +| `Ref (_, _, (Some _ as k)) -> k +| `Ref (`Link, ref, None) -> link_label_ref ~locator warns ref +| `Ref (`Image, ref, None) -> image_label_ref ~locator warns ref + +(* Translating blocks and inlines. *) + +(* A few type definitions for better variant typing. *) + +type inlines_acc = Ast.inline_element Ast.with_location list * Warning.t list +type ast_acc = Ast.t * Warning.t list +type nestable_ast_acc = + Ast.nestable_block_element Ast.with_location list * Warning.t list + +(* Inline translations *) + +let link_definition defs l = match Inline.Link.reference_definition defs l with +| Some (Link_definition.Def (ld, _)) -> ld +| Some _ -> assert false (* if we parse without cmarkit extensions *) +| None -> assert false (* assert [l]'s referenced label is not synthetic *) + +let autolink_to_inline_element ~locator a m (is, warns) = + let loc = meta_to_loc ~locator m in + let link, link_loc = Inline.Autolink.link a in + let link_loc = meta_to_loc ~locator link_loc in + let text = [Loc.at link_loc (`Word link)] in + Loc.at loc (`Link (link, text)) :: is, warns + +let break_to_inline_element ~locator br m (is, warns) = + let loc = meta_to_loc ~locator m in + let warns = match Inline.Break.type' br with + | `Soft -> warns + | `Hard -> warn ~loc warn_unsupported_hard_break warns + in + Loc.at loc (`Space "\n") :: is, warns + +let code_span_to_inline_element ~locator cs m (is, warns) = + let loc = meta_to_loc ~locator m in + let code = Inline.Code_span.code cs in + Loc.at loc (`Code_span code) :: is, warns + +let raw_html_to_inline_element ~locator html m (is, warns) = + let loc = meta_to_loc ~locator m in + let html = String.concat "\n" (List.map Block_line.tight_to_string html) in + Loc.at loc (`Raw_markup (Some "html", html)) :: is, warns + +let image_to_inline_element ~locator defs i m (is, warns) = + (* We map to raw html, ocamldoc's ast should have a case for that. *) + let escape esc b s = Buffer.clear b; esc b s; Buffer.contents b in + let pct_esc = escape Cmarkit_html.buffer_add_pct_encoded_string in + let html_esc = escape Cmarkit_html.buffer_add_html_escaped_string in + let loc = meta_to_loc ~locator m in + let b = Buffer.create 255 in + let ld = link_definition defs i in + let link = match Link_definition.dest ld with + | None -> "" | Some (link, _) -> pct_esc b link + in + let title = match Link_definition.title ld with + | None -> "" + | Some title -> + let title = List.map Block_line.tight_to_string title in + html_esc b (String.concat "\n" title) + in + let alt = + let ls = Inline.to_plain_text ~break_on_soft:false (Inline.Link.text i) in + html_esc b (String.concat "\n" (List.map (String.concat "") ls)) + in + let img = + String.concat "" + [{||}; alt; {|"|}] + in + Loc.at loc (`Raw_markup (Some "html", img)) :: is, warns + +let text_to_inline_elements ~locator s meta (is, warns as acc) = + (* [s] is on a single source line (but may have newlines because of + character references) we need to tokenize it for ocamldoc's ast. *) + let flush_tok s meta acc is_space first last = + let textloc = textloc_of_sub (Meta.textloc meta) ~first ~last in + let loc = textloc_to_loc ~locator textloc in + let s = String.sub s first (last - first + 1) in + Loc.at loc (if is_space then `Space s else `Word s) :: acc + in + let rec tokenize s meta acc max start is_space = + if start > max then (List.rev_append acc is, warns) else + let next_start = + if is_space then next_nonblank s ~max start else next_blank s ~max start + in + let acc = flush_tok s meta acc is_space start (next_start - 1) in + tokenize s meta acc max next_start (not is_space) + in + let max = String.length s - 1 in + if max < 0 then acc else tokenize s meta [] max 0 (is_blank s.[0]) + +let rec ocamldoc_reference_to_inline_element ~locator defs l m ref (is, warns) = + let loc = meta_to_loc ~locator m in + let kind, text, warns = match Inline.Link.reference l with + | `Ref ((`Collapsed | `Shortcut), _, _) -> `Simple, [], warns + | `Ref (`Full, _, _) -> + let i = Inline.Link.text l in + let text, warns = inline_to_inline_elements ~locator defs ([], warns) i in + `With_text, text, warns + | `Inline _ -> assert false + in + Loc.at loc (`Reference (kind, ref, text)) :: is, warns + +and link_reference_to_inline_element ~locator defs l m (is, warns) = + let loc = meta_to_loc ~locator m in + let ld = link_definition defs l in + let link = match Link_definition.dest ld with None -> "" | Some (l,_) -> l in + let warns = match Link_definition.title ld with + | None -> warns + | Some title -> + let textloc = Block_line.tight_list_textloc title in + let loc = textloc_to_loc ~locator textloc in + warn ~loc warn_unsupported_link_title warns + in + let text, warns = + inline_to_inline_elements ~locator defs ([], warns) (Inline.Link.text l) + in + Loc.at loc (`Link (link, text)) :: is, warns + +and link_to_inline_element ~locator defs l m (is, warns as acc) = + match Inline.Link.reference l with + | `Inline _ -> link_reference_to_inline_element ~locator defs l m acc + | `Ref (_, _, def) -> + match Meta.find ocamldoc_reference (Label.meta def) with + | None -> link_reference_to_inline_element ~locator defs l m acc + | Some `Reference ref -> + ocamldoc_reference_to_inline_element ~locator defs l m ref acc + | Some `Indexlist -> + let w = warn_not_in_own_paragraph "[!indexlist]" in + is, warn ~loc:(meta_to_loc ~locator m) w warns + | Some `Modules _ -> + let w = warn_not_in_own_paragraph "[!modules …]" in + is, warn ~loc:(meta_to_loc ~locator m) w warns + +and emphasis_to_inline_element ~locator defs style e m (is, warns) = + let loc = meta_to_loc ~locator m in + let i = Inline.Emphasis.inline e in + let inlines, warns = inline_to_inline_elements ~locator defs ([], warns) i in + Loc.at loc (`Styled (style, inlines)) :: is, warns + +and inline_to_inline_elements ~locator defs acc i : inlines_acc = match i with +| Inline.Autolink (a, m) -> + autolink_to_inline_element ~locator a m acc +| Inline.Break (b, m) -> + break_to_inline_element ~locator b m acc +| Inline.Code_span (cs, m) -> + code_span_to_inline_element ~locator cs m acc +| Inline.Emphasis (e, m) -> + emphasis_to_inline_element ~locator defs `Emphasis e m acc +| Inline.Image (i, m) -> + image_to_inline_element ~locator defs i m acc +| Inline.Inlines (is, _m) -> + let inline = inline_to_inline_elements ~locator defs in + List.fold_left inline acc (List.rev is) +| Inline.Link (l, m) -> + link_to_inline_element ~locator defs l m acc +| Inline.Raw_html (html, m) -> + raw_html_to_inline_element ~locator html m acc +| Inline.Strong_emphasis (e, m) -> + emphasis_to_inline_element ~locator defs `Bold e m acc +| Inline.Text (t, m) -> + text_to_inline_elements ~locator t m acc +| _ -> assert false + +(* Heading label support - CommonMark extension. Parses a potential + final {#id} in heading inlines. In [id] braces must be escaped + otherwise parsing fails; if the rightmost brace is escaped it's + not a heading label. The parse runs from right to left *) + +let parse_heading_label s = + let rec loop s max prev i = + if i < 0 then None else match s.[i] with + | '{' as c -> + if i > 0 && s.[i - 1] = '\\' then loop s max c (i - 1) else + if prev = '#' then Some i else None + | '}' as c -> + if i > 0 && s.[i - 1] = '\\' then loop s max c (i - 1) else None + | c -> loop s max c (i - 1) + in + let max = String.length s - 1 in + let last = (* [last] is rightmost non blank, if any. *) + let k = ref max in + while not (!k < 0) && is_blank s.[!k] do decr k done; !k + in + if last < 1 || s.[last] <> '}' || s.[last - 1] = '\\' then None else + match loop s max s.[last] (last - 1) with + | None -> None + | Some first -> + let chop = max - first + 1 in + let text = String.sub s 0 first in + let first = first + 2 and last = last - 1 in (* remove delims *) + let label = String.sub s first (last - first + 1) in + Some (text, chop, label) + +let heading_inline_and_label h = + (* cmarkit claims it's already normalized but let's be defensive :-) *) + match Inline.normalize (Block.Heading.inline h) with + | Inline.Text (t, m) as inline -> + begin match parse_heading_label t with + | None -> inline, None + | Some (t, chop, label) -> + let m = chop_end_of_meta_textloc ~count:chop m in + Inline.Text (t, m), Some label + end + | Inline.Inlines (is, m0) as inline -> + begin match List.rev is with + | Inline.Text (t, m1) :: ris -> + begin match parse_heading_label t with + | None -> inline, None + | Some (t, chop, label) -> + let m0 = chop_end_of_meta_textloc ~count:chop m0 in + let m1 = chop_end_of_meta_textloc ~count:chop m1 in + Inline.Inlines (List.rev (Inline.Text (t, m1) :: ris), m0), + Some label + end + | _ -> inline, None + end + | inline -> inline, None + +(* Block translations *) + +let raw_paragraph ~loc ~raw_loc backend raw = + Loc.at loc (`Paragraph [Loc.at raw_loc (`Raw_markup (Some backend, raw))]) + +let code_block_to_nestable_block_element ~locator cb m (bs, warns) = + let loc = meta_to_loc ~locator m in + let code = Block.Code_block.code cb in + let code_loc = textloc_to_loc ~locator (Block_line.list_textloc code) in + let code = String.concat "\n" (List.map Block_line.to_string code) in + match Block.Code_block.info_string cb with + | None -> Loc.at loc (`Code_block (None, Loc.at code_loc code)) :: bs, warns + | Some (info, im) -> + match Block.Code_block.language_of_info_string info with + | None -> + Loc.at loc (`Code_block (None, Loc.at code_loc code)) :: bs, warns + | Some ("verb", _) -> + Loc.at loc (`Verbatim code) :: bs, warns + | Some ("=html", _) -> + (raw_paragraph ~loc ~raw_loc:code_loc "html" code) :: bs, warns + | Some ("=latex", _) -> + (raw_paragraph ~loc ~raw_loc:code_loc "latex" code) :: bs, warns + | Some ("=texi", _) -> + (raw_paragraph ~loc ~raw_loc:code_loc "texi" code) :: bs, warns + | Some ("=man", _) -> + (raw_paragraph ~loc ~raw_loc:code_loc "man" code) :: bs, warns + | Some (lang, env) -> + let left_count = String.length lang in + let right_count = String.length env in + let lang_loc, env_loc = + split_info_string_locs ~left_count ~right_count im + in + let env = + if env = "" then None else + Some (Loc.at (textloc_to_loc ~locator env_loc) env) + in + let lang = Loc.at (textloc_to_loc ~locator lang_loc) lang in + let metadata = Some (lang, env) in + Loc.at loc (`Code_block (metadata, Loc.at code_loc code)) :: bs, + warns + +let html_block_to_nestable_block_element ~locator html m (bs, warns) = + let loc = meta_to_loc ~locator m in + let html = String.concat "\n" (List.map fst html) in + raw_paragraph ~loc ~raw_loc:loc "html" html :: bs, warns + +let heading_to_block_element ~locator defs h m (bs, warns) = + let loc = meta_to_loc ~locator m in + let level, warns = match Block.Heading.level h with + | 6 -> 5, warn ~loc warn_heading_level_6 warns + | level -> level, warns + in + let inline, label = heading_inline_and_label h in + let inlines, warns = + inline_to_inline_elements ~locator defs ([], warns) inline + in + Loc.at loc (`Heading (level, label, inlines)) :: bs, warns + +let try_ocamldoc_reference_directive ~locator i (bs, warns) = match i with +| Inline.Link (l, meta) -> + begin match Inline.Link.reference l with + | `Inline _ -> None + | `Ref (_, _, def) -> + match Meta.find ocamldoc_reference (Label.meta def) with + | None | Some (`Reference _) -> None + | Some (`Modules _ as m) -> + let loc = meta_to_loc ~locator meta in + Some (Loc.at loc m :: bs, warns) + | Some `Indexlist -> + let loc = meta_to_loc ~locator meta in + Some (bs, warn ~loc warn_unsupported_indexlist warns) + end +| _ -> None + +let paragraph_to_nestable_block_element ~locator defs p m (bs, warns as acc) = + (* TODO Parse inlines for @tags support. *) + let loc = meta_to_loc ~locator m in + let i = Block.Paragraph.inline p in + match try_ocamldoc_reference_directive ~locator i acc with + | Some acc -> acc + | None -> + let is, warns = inline_to_inline_elements ~locator defs ([], warns) i in + Loc.at loc (`Paragraph is) :: bs, warns + +let thematic_break_to_nestable_block_element ~locator m (bs, warns) = + let loc = meta_to_loc ~locator m in + (raw_paragraph ~loc ~raw_loc:loc "html" "
") :: bs, warns + +let rec list_to_nestable_block_element ~locator defs l m (bs, warns) = + let loc = meta_to_loc ~locator m in + let style = `Heavy (* Note this is a layout property of ocamldoc *) in + let kind, warns = match Block.List'.type' l with + | `Unordered _ -> `Unordered, warns + | `Ordered (start, _) -> + `Ordered, (if start = 1 then warns else + warn ~loc (warn_unsupported_list_start_number start) warns) + in + let add_item ~locator (acc, warns) (i, _meta) = + let b = Block.List_item.block i in + let bs, warns = + block_to_nestable_block_elements ~locator defs ([], warns) b + in + (bs :: acc, warns) + in + let ritems = List.rev (Block.List'.items l) in + let items, warns = List.fold_left (add_item ~locator) ([], warns) ritems in + Loc.at loc (`List (kind, style, items)) :: bs, warns + +and block_to_nestable_block_elements ~locator defs acc b : nestable_ast_acc = + match b with + | Block.Blocks (bs, _) -> + let block = block_to_nestable_block_elements ~locator defs in + List.fold_left block acc (List.rev bs) + | Block.Code_block (c, m) -> + code_block_to_nestable_block_element ~locator c m acc + | Block.Heading (_, m) -> + warn_unsupported_header_nesting ~locator m acc + | Block.Html_block (html, m) -> + html_block_to_nestable_block_element ~locator html m acc + | Block.List (l, m) -> + list_to_nestable_block_element ~locator defs l m acc + | Block.Paragraph (p, m) -> + paragraph_to_nestable_block_element ~locator defs p m acc + | Block.Block_quote (_, m) -> + warn_unsupported_cmark ~locator "Block quotes" m acc + | Block.Thematic_break (_, m) -> + thematic_break_to_nestable_block_element ~locator m acc + | Block.Blank_line _ + | Block.Link_reference_definition _ -> (* layout cases *) acc + | _ -> assert false + +let rec block_to_ast ~locator defs acc b : ast_acc = match b with +| Block.Heading (h, m) -> heading_to_block_element ~locator defs h m acc +| Block.Blocks (bs, _) -> + List.fold_left (block_to_ast ~locator defs) acc (List.rev bs) +| b -> + (* We can't go directy with acc because of nestable typing. *) + let bs, ws = acc in + let bs', ws = block_to_nestable_block_elements ~locator defs ([], ws) b in + List.rev_append (List.rev (bs' :> Ast.t)) bs, ws + +(* Parsing comments *) + +let parse_comment ?buffer:b ~location ~text:s () : Ast.t * Warning.t list = + let b = match b with + | None -> Buffer.create (String.length s) + | Some b -> Buffer.reset b; b + in + let locator, text = massage_comment ~location b s in + let warns = ref [] and file = location.Lexing.pos_fname in + let resolver = ocamldoc_reference_resolver ~locator warns in + let doc = Doc.of_string ~resolver ~file ~locs:true ~strict:true text in + block_to_ast ~locator (Doc.defs doc) ([], !warns) (Doc.block doc) diff --git a/src/ocamlmark.mli b/src/ocamlmark.mli new file mode 100644 index 00000000..9f75d018 --- /dev/null +++ b/src/ocamlmark.mli @@ -0,0 +1,18 @@ +(** [ocamlmark] support. *) + +(** {1:syntax Syntax sniffing} *) + +type syntax = [ `Ocamldoc | `Ocamlmark ] +(** The type for document syntaxes. *) + +val sniff_syntax : ?default:syntax -> text:string -> unit -> syntax +(** [sniff_syntax ~text] sniffes a syntax for [text], defaulting + to [default] if nothing smells (defaults to [`Ocamldoc]). *) + +(** {1:parsing ocamlmark parsing} *) + +val parse_comment : + ?buffer:Buffer.t -> location:Lexing.position -> text:string -> unit -> + Ast.t * Warning.t list +(** [parse_comment ~location ~text] parses the ocamlmark [text] assuming it + corresponds to [location]. [buffer] is used as a scratch buffer. *) diff --git a/src/odoc_parser.ml b/src/odoc_parser.ml index 2d9fdd5d..e7992b0b 100644 --- a/src/odoc_parser.ml +++ b/src/odoc_parser.ml @@ -99,8 +99,7 @@ let position_of_point : t -> Loc.point -> Lexing.position = let pos_fname = original_pos.pos_fname in { Lexing.pos_bol; pos_lnum; pos_cnum; pos_fname } -(* The main entry point for this module *) -let parse_comment ~location ~text = +let ocamldoc_parse_comment ~location ~text = let warnings = ref [] in let reversed_newlines = reversed_newlines ~input:text in let token_stream = @@ -116,6 +115,14 @@ let parse_comment ~location ~text = let ast, warnings = Syntax.parse warnings token_stream in { ast; warnings; reversed_newlines; original_pos = location } +(* The main entry point for this module *) +let parse_comment ~location ~text = match Ocamlmark.sniff_syntax ~text () with +| `Ocamldoc -> ocamldoc_parse_comment ~location ~text +| `Ocamlmark -> + let ast, warnings = Ocamlmark.parse_comment ~location ~text () in + let reversed_newlines = reversed_newlines ~input:text in + { ast; warnings; reversed_newlines; original_pos = location } + (* Accessor functions, as [t] is opaque *) let warnings t = t.warnings let ast t = t.ast diff --git a/src/odoc_parser.mli b/src/odoc_parser.mli index 5dbd4a08..c9c69a96 100644 --- a/src/odoc_parser.mli +++ b/src/odoc_parser.mli @@ -22,7 +22,7 @@ module Loc = Loc (** Warnings produced during parsing. *) module Warning : sig type t = Warning.t = { location : Loc.span; message : string } - (** Warnings are represented as record containing the human-readable text + (** Warnings are represented as record containing the human-readable text of the warning alongside the location of the offending text in the source *) val pp : Format.formatter -> t -> unit