Skip to content

Commit

Permalink
Emit comment semantic tokens for SNA and text in the right margin
Browse files Browse the repository at this point in the history
  • Loading branch information
nberth committed Oct 23, 2023
1 parent 19d5894 commit 86e6cca
Show file tree
Hide file tree
Showing 13 changed files with 161 additions and 82 deletions.
16 changes: 11 additions & 5 deletions src/lsp/cobol_lsp/lsp_document.ml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ module TYPES = struct
doc_cache_pplog: Cobol_preproc.Trace.log;
doc_cache_tokens: Cobol_parser.Outputs.tokens_with_locs;
doc_cache_comments: Cobol_preproc.Text.comments;
doc_cache_ignored: Cobol_common.Srcloc.lexloc list;
doc_cache_parsed: (Cobol_ptree.compilation_group * CUs.t) option;
doc_cache_diags: DIAGS.Set.serializable;
}
Expand Down Expand Up @@ -116,7 +117,8 @@ let lazy_references ptree cus defs =
(* Empty artifact set used when the parser memorizes nothing.
   Bug fixed: a stale pre-commit line ([comments = [] }]) was left interleaved
   with the updated fields, yielding an invalid record expression. *)
let no_artifacts =
  Cobol_parser.Outputs.{ tokens = lazy [];
                         pplog = Cobol_preproc.Trace.empty;
                         rev_comments = [];
                         rev_ignored = [] }

let gather_parsed_data ptree =
Cobol_typeck.analyze_compilation_group ptree |>
Expand Down Expand Up @@ -216,15 +218,17 @@ let retrieve_parsed_data: document -> parsed_data = function
(** Caching utilities *)

(** [to_cache doc] builds the serializable cache record for [doc], forcing the
    lazily computed token list and flattening delayed diagnostics.
    Bug fixed: stale pre-commit lines (old [comments] pattern and the old
    [doc_cache_comments = comments;] field) were left interleaved with their
    replacements, duplicating bindings. *)
let to_cache ({ project; textdoc; parsed; diags;
                artifacts = { pplog; tokens;
                              rev_comments; rev_ignored; _ }; _ } as doc) =
  {
    doc_cache_filename = Lsp_project.relative_path_for ~uri:(uri doc) project;
    doc_cache_checksum = Digest.string (Lsp.Text_document.text textdoc);
    doc_cache_langid = Lsp.Text_document.languageId textdoc;
    doc_cache_version = Lsp.Text_document.version textdoc;
    doc_cache_pplog = pplog;
    doc_cache_tokens = Lazy.force tokens;
    doc_cache_comments = rev_comments;
    doc_cache_ignored = rev_ignored;
    doc_cache_parsed = Option.map (fun { ptree; cus; _ } -> ptree, cus) parsed;
    doc_cache_diags = DIAGS.Set.apply_delayed_formatting diags;
  }
Expand All @@ -239,7 +243,8 @@ let of_cache ~project
doc_cache_version = version;
doc_cache_pplog = pplog;
doc_cache_tokens = tokens;
doc_cache_comments = comments;
doc_cache_comments = rev_comments;
doc_cache_ignored = rev_ignored;
doc_cache_parsed = parsed;
doc_cache_diags = diags } =
let absolute_filename = Lsp_project.absolute_path_for ~filename project in
Expand All @@ -260,7 +265,8 @@ let of_cache ~project
{ ptree; cus; definitions; references })
parsed
in
{ doc with artifacts = { pplog; tokens = lazy tokens; comments };
{ doc with artifacts = { pplog; tokens = lazy tokens;
rev_comments; rev_ignored };
diags = DIAGS.Set.of_serializable diags;
parsed }

Expand Down
6 changes: 4 additions & 2 deletions src/lsp/cobol_lsp/lsp_request.ml
Original file line number Diff line number Diff line change
Expand Up @@ -216,11 +216,13 @@ let handle_semtoks_full,
handle_semtoks_range =
let handle registry ?range (doc: TextDocumentIdentifier.t) =
try_with_document_data registry doc
~f:begin fun ~doc:{ artifacts = { pplog; tokens; comments; _ };
~f:begin fun ~doc:{ artifacts = { pplog; tokens;
rev_comments; rev_ignored; _ };
_ } Lsp_document.{ ptree; _ } ->
let data =
Lsp_semtoks.data ~filename:(Lsp.Uri.to_path doc.uri) ~range
~pplog ~comments ~tokens:(Lazy.force tokens) ~ptree
~pplog ~rev_comments ~rev_ignored
~tokens:(Lazy.force tokens) ~ptree
in
Some (SemanticTokens.create ~data ())
end
Expand Down
39 changes: 29 additions & 10 deletions src/lsp/cobol_lsp/lsp_semtoks.ml
Original file line number Diff line number Diff line change
Expand Up @@ -553,17 +553,32 @@ let semtoks_from_ptree ~filename ?range ptree =

end) ptree [] |> List.rev

(* Semantic tokens for comments recorded (in reverse order) by the
   preprocessor; only comments located in [filename] and intersecting [range]
   (when given) are kept.  Folding over the reversed list while consing yields
   tokens back in source order.
   Bug fixed: the old [List.filter_map]-based version's lines were left
   interleaved with the new fold-based version, making the function invalid. *)
let semtoks_of_comments ~filename ?range rev_comments =
  rev_comments |>
  List.fold_left begin fun acc -> function
    | Cobol_preproc.Text.{ comment_loc = s, _ as lexloc; _ }
      when s.Lexing.pos_fname = filename &&
           Option.fold range
             ~some:(fun r -> Lsp_position.intersects_lexloc r lexloc)
             ~none:true ->
        semtok TOKTYP.comment lexloc :: acc
    | _ ->
        acc
  end []

(* Semantic tokens for source regions the preprocessor ignored, given in
   reverse order.  They are decorated like comments, for lack of a better
   suited token type in the set of available ones; this could be improved
   later with some client-side code or configuration. *)
let semtoks_of_ignored ~filename ?range rev_ignored =
  let in_file_and_range ((start_pos, _) as lexloc) =
    start_pos.Lexing.pos_fname = filename &&
    Option.fold range
      ~some:(fun r -> Lsp_position.intersects_lexloc r lexloc)
      ~none:true
  in
  (* Reversing first restores source order; filtering and mapping then keep
     exactly the in-scope locations, as in a fold that conses onto an
     accumulator. *)
  List.rev rev_ignored
  |> List.filter in_file_and_range
  |> List.map (semtok TOKTYP.comment)

let semtoks_of_preproc_statements ~filename ?range pplog =
List.rev @@ List.fold_left begin fun acc -> function
Expand Down Expand Up @@ -646,11 +661,13 @@ let ensure_sorted name ~filename cmp l =
List.fast_sort cmp l


(* Entry point: gathers semantic tokens from every producer (plain tokens,
   parse tree, comments, ignored regions, preprocessor statements).
   Bug fixed: the stale pre-commit header (taking [~comments]) and the old
   [semtoks3]/[semtoks4] bindings were left interleaved with the updated
   five-source version, duplicating the header and two let-bindings. *)
let data ~filename ~range ~tokens ~pplog
    ~rev_comments ~rev_ignored ~ptree : int array =
  let semtoks1 = semtoks_of_non_ambigious_tokens ~filename ?range tokens in
  let semtoks2 = semtoks_from_ptree ~filename ?range ptree in
  let semtoks3 = semtoks_of_comments ~filename ?range rev_comments in
  let semtoks4 = semtoks_of_ignored ~filename ?range rev_ignored in
  let semtoks5 = semtoks_of_preproc_statements ~filename ?range pplog in
(* NB: In *principle* all those lists are already sorted w.r.t lexical
locations in [filename]. We just check that for now and raise a warning,
in case. *)
Expand All @@ -660,8 +677,10 @@ let data ~filename ~range ~tokens ~pplog ~comments ~ptree : int array =
let semtoks1 = ensure_sorted "nonambiguous" ~filename compare_semtoks semtoks1
and semtoks2 = ensure_sorted "ptree" ~filename compare_semtoks semtoks2
and semtoks3 = ensure_sorted "comments" ~filename compare_semtoks semtoks3
and semtoks4 = ensure_sorted "preproc" ~filename compare_semtoks semtoks4 in
and semtoks4 = ensure_sorted "ignored" ~filename compare_semtoks semtoks4
and semtoks5 = ensure_sorted "preproc" ~filename compare_semtoks semtoks5 in
relative_semtoks
List.(merge compare_semtoks semtoks1 @@
merge compare_semtoks semtoks2 @@
merge compare_semtoks semtoks3 @@ semtoks4)
merge compare_semtoks semtoks3 @@
merge compare_semtoks semtoks4 @@ semtoks5)
3 changes: 2 additions & 1 deletion src/lsp/cobol_lsp/lsp_semtoks.mli
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ val data
-> range: Lsp.Types.Range.t option
-> tokens: Cobol_parser.Outputs.tokens_with_locs
-> pplog: Cobol_preproc.Trace.log
-> comments: Cobol_preproc.Text.comments
-> rev_comments: Cobol_preproc.Text.comments
-> rev_ignored: Cobol_common.Srcloc.lexloc list
-> ptree: Cobol_ptree.compilation_group
-> int array
37 changes: 10 additions & 27 deletions src/lsp/cobol_parser/parser_engine.ml
Original file line number Diff line number Diff line change
Expand Up @@ -419,8 +419,9 @@ let aggregate_output (type m) (ps: m state) res
| Eidetic ->
let artifacts =
{ tokens = Tokzr.parsed_tokens ps.preproc.tokzr;
pplog = Cobol_preproc.log ps.preproc.pp;
comments = Cobol_preproc.comments ps.preproc.pp } in
pplog = Cobol_preproc.rev_log ps.preproc.pp;
rev_comments = Cobol_preproc.rev_comments ps.preproc.pp;
rev_ignored = Cobol_preproc.rev_ignored ps.preproc.pp } in
WithArtifacts (res, artifacts)

(** Simple parsing *)
Expand Down Expand Up @@ -509,32 +510,14 @@ let parse_with_history ?(save_stage = 10) rwps =
(rewindable_parser_state rwps).preproc.tokzr;
res, rwps


(* Locate the most recent saved parser state preceding [position].
   Bug fixed: the removed helper [lexing_postion_of] (whose logic moved into
   [Cobol_preproc.position_at]) and the stale line calling it were left in
   place alongside the new inline [match], duplicating the binding of
   [lexpos]. *)
let find_history_event_preceding ~position ({ store; _ } as rwps) =
  let lexpos = match position with
    | Lexing pos ->
        pos
    | Indexed { line; char } ->
        let ps = rewindable_parser_state rwps in
        Cobol_preproc.position_at ~line ~char ps.preproc.pp
  in
let rec aux = function
| [] ->
raise Not_found
Expand Down
3 changes: 2 additions & 1 deletion src/lsp/cobol_parser/parser_outputs.ml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ type artifacts =
{
tokens: tokens_with_locs Lazy.t;
pplog: Cobol_preproc.Trace.log;
comments: Cobol_preproc.Text.comments;
rev_comments: Cobol_preproc.Text.comments;
rev_ignored: lexloc list;
}

(** The output of parsing functions depends on its memorization abilities:
Expand Down
30 changes: 26 additions & 4 deletions src/lsp/cobol_preproc/preproc_engine.ml
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,33 @@ and preprocessor_persist =
let diags { diags; reader; _ } = DIAGS.Set.union diags @@ Src_reader.diags reader
let add_diag lp d = { lp with diags = DIAGS.Set.cons d lp.diags }
let add_diags lp d = { lp with diags = DIAGS.Set.union d lp.diags }
(* Simple accessors over the preprocessor state.
   Bug fixed: stale pre-commit accessors ([log], [comments], [newline_cnums],
   and a duplicate [source_format]) were left interleaved with the renamed
   [rev_*] accessors, defining several names twice. *)
let position { reader; _ } = Src_reader.position reader
let source_format { reader; _ } = Src_reader.source_format reader
let rev_log { pplog; _ } = pplog
let rev_comments { reader; _ } = Src_reader.rev_comments reader
let rev_ignored { reader; _ } = Src_reader.rev_ignored reader

(** [position_at ~line ~char pp] computes a lexing position that corresponds to
    the given line and character indexes (all starting at 0) in the input
    already read by [pp].
    @raise Not_found if no complete line was processed yet.  If the given line
    index does not correspond to already processed lines, the current position
    of the reader is returned instead. *)
let position_at ~line ~char { reader; _ } =
let rev_newline_cnums = Src_reader.rev_newline_cnums reader in
if rev_newline_cnums = []
then raise Not_found
else
let lexpos = Src_reader.position reader in
try
(* NOTE(review): assumes [rev_newline_cnums] lists, most recent first, the
   character offsets of line starts, so indexing from the end of the reversed
   list yields the beginning-of-line offset for [line] — confirm against
   [Src_reader]. *)
let pos_bol =
try List.nth rev_newline_cnums (lexpos.pos_lnum - line - 1)
with Not_found | Invalid_argument _ -> 0
in
Lexing.{ lexpos with pos_bol;
pos_cnum = pos_bol + char;
pos_lnum = line + 1 }
with Failure _ ->
lexpos

let with_reader lp reader =
if lp.reader == reader then lp else { lp with reader }
Expand Down
7 changes: 4 additions & 3 deletions src/lsp/cobol_preproc/preproc_engine.mli
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,12 @@ val reset_preprocessor_for_string
val diags: preprocessor -> Cobol_common.Diagnostics.Set.t
val add_diag: preprocessor -> Cobol_common.Diagnostics.t -> preprocessor
val add_diags: preprocessor -> Cobol_common.Diagnostics.Set.t -> preprocessor
val position: preprocessor -> Lexing.position
val position_at: line:int -> char: int -> preprocessor -> Lexing.position
val source_format: preprocessor -> Src_format.any
(* Bug fixed: stale pre-commit declarations ([log], [comments],
   [newline_cnums]) removed — they were left interleaved with the renamed
   [rev_*] accessors below, declaring superseded values. *)
val rev_log: preprocessor -> Preproc_trace.log
val rev_comments: preprocessor -> Text.comments
val rev_ignored: preprocessor -> lexloc list

val next_chunk: preprocessor -> Text.text * preprocessor

Expand Down
42 changes: 28 additions & 14 deletions src/lsp/cobol_preproc/src_lexer.mll
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@

(* Named regular expressions for the fixed-format source lexer.
   Bug fixed: the stale pre-commit definition of [sna] was left directly above
   its updated replacement, defining the name twice. *)
let newline = '\r'* '\n'
let nnl = _ # ['\r' '\n'] (* anything but newline *)
let sna = nnl nnl nnl nnl nnl nnl (* 6 chars; TODO: exclude tabs *)
let spaces = ([' ' '\t']*)
let blank = [' ' '\009' '\r']
let nonblank = nnl # blank
Expand Down Expand Up @@ -131,42 +131,56 @@ let cdir_word =

rule fixed_line state
= shortest
| sna ' ' | '\t' (* nominal *)
| sna (* nominal *)
{
fixed_indicator (Src_lexing.sna state lexbuf) lexbuf
}
| '\t'
{
fixed_nominal_line (Src_lexing.flush_continued state) lexbuf
}
| (nnl* newline) (* blank line (too short) *)
{
Src_lexing.new_line (Src_lexing.sna state lexbuf) lexbuf
}
| (nnl* eof) (* blank line (too short), without newline character *)
{
Src_lexing.(flush @@ eof (Src_lexing.sna state lexbuf) lexbuf)
}
and fixed_indicator state
= parse
| ' ' | '\t' (* second tab *) (* nominal *)
{
fixed_nominal_line (Src_lexing.flush_continued state) lexbuf
}
| sna '-' (* continuation line *)
| '-' (* continuation line *)
{
fixed_continue_line state lexbuf
}
| sna ('$' as marker) (* compiler directive *)
| ('$' as marker) (* compiler directive *)
{
fixed_mf_cdir_line (String.make 1 marker) state lexbuf
}
| sna ('>' as marker)
| ('>' as marker)
{
maybe_fixed_cdir_line marker state lexbuf
}
| sna (['*' '/'] as marker) (* comment line *)
| (['*' '/'] as marker) (* comment line *)
{
comment_line marker state lexbuf
}
| sna ['d' 'D']
| ['d' 'D']
{
fixed_debug_line state lexbuf
}
| sna (_#['\n' '\r'] as c) (* unknown indicator *)
| (_#['\n' '\r'] as c) (* unknown indicator *)
{
Src_lexing.unexpected Char ~c ~knd:"indicator" state lexbuf
~k:(fun state -> fixed_nominal_line (Src_lexing.flush_continued state))
}
| (nnl* newline) (* blank line (too short) *)
{
Src_lexing.new_line state lexbuf
}
| (nnl* eof) (* blank line (too short), without newline character *)
| epsilon
{
Src_lexing.(flush @@ eof state lexbuf)
gobble_line (Src_lexing.sna state lexbuf) lexbuf
}
and xopen_line state (* X/Open free-form *)
= parse (* (note no continuation line indicator) *)
Expand Down
Loading

0 comments on commit 86e6cca

Please sign in to comment.