diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 2d2340d4..0c8e88a2 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -22,4 +22,4 @@ jobs: env: VERSION: ${{ matrix.emacs_version }} run: >- - make docker CMD="make lint lint-changelog" + make docker CMD="make unit lint lint-changelog" diff --git a/.gitignore b/.gitignore index 9a66d3ec..ad07b996 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .log .tmp auto +vendor/buttercup-*/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 067591b2..0ca6cc35 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ The format is based on [Keep a Changelog]. ### Enhancements ### Formatters ### Bugs fixed +* The point alignment algorithm, which has been slightly wrong since + 2019, has been fixed to more correctly use dynamic programming to + maintain the position of point. Also, some cases of a nil pointer + error during point alignment have been fixed ([#290]). * `apheleia-indent-lisp-buffer` updated to apply local variables after calling major-mode. Also includes setting for `indent-tabs-mode` ([#286]). * [Formatter scripts](scripts/formatters) will now work on Windows if Emacs @@ -14,6 +18,7 @@ The format is based on [Keep a Changelog]. [#286]: https://github.com/radian-software/apheleia/pull/286 [#285]: https://github.com/radian-software/apheleia/issues/285 +[#290]: https://github.com/radian-software/apheleia/pull/290 ## 4.1 (released 2024-02-25) ### Enhancements diff --git a/Makefile b/Makefile index 1e2b7aa1..ce9e944a 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,7 @@ TAG ?= latest # The order is important for compilation. for_compile := \ apheleia-utils.el \ + apheleia-dp.el \ apheleia-formatter-context.el \ apheleia-log.el \ apheleia-formatters.el \ @@ -96,18 +97,33 @@ fmt-build-common: ## Build a Docker image with just the common base fmt-docker: ## Start a Docker shell for testing formatters @scripts/docker-run.bash -e FORMATTERS "apheleia-formatters:$(TAG)" "$(CMD)" +APHELEIA_FT := -L test/formatters -l apheleia-ft + .PHONY: fmt-lint fmt-lint: ## Do basic linting for formatter configuration - @test/formatters/run-func.bash apheleia-ft-lint + @test/shared/run-func.bash apheleia-ft-lint $(APHELEIA_FT) .PHONY: fmt-check fmt-changed: ## Get list of changed formatters on this PR - @test/formatters/run-func.bash apheleia-ft-changed + @test/shared/run-func.bash apheleia-ft-changed $(APHELEIA_FT) .PHONY: fmt-test # env var: FORMATTERS fmt-test: ## Actually run formatter tests - @test/formatters/run-func.bash apheleia-ft-test + @test/shared/run-func.bash apheleia-ft-test $(APHELEIA_FT) .PHONY: lint-changelog lint-changelog: ## Report an error if the changelog wasn't updated @scripts/lint-changelog.bash + +BUTTERCUP_VER := 1.34 +BUTTERCUP := vendor/buttercup-$(BUTTERCUP_VER) + +$(BUTTERCUP): + @rm -rf $(BUTTERCUP) && mkdir -p $(BUTTERCUP) + @curl -fsSL https://github.com/jorgenschaefer/emacs-buttercup/archive/refs/tags/v$(BUTTERCUP_VER).tar.gz -o $(BUTTERCUP).tar.gz + @tar -xf $(BUTTERCUP).tar.gz --strip-components=1 -C $(BUTTERCUP) + @rm $(BUTTERCUP).tar.gz + +.PHONY: unit +unit: $(BUTTERCUP) ## Run unit tests + @$(BUTTERCUP)/bin/buttercup test/unit -L $(BUTTERCUP) -L . diff --git a/apheleia-dp.el b/apheleia-dp.el new file mode 100644 index 00000000..5a7d9fb0 --- /dev/null +++ b/apheleia-dp.el @@ -0,0 +1,181 @@ +;;; apheleia-dp.el --- Dynamic programming -*- lexical-binding: t -*- + +;; SPDX-License-Identifier: MIT + +;;; Commentary: + +;; The dynamic programming implementation for edit distance. + +;;; Code: + +;; I wrote this code in 2019 and completely forgot how it worked by +;; 2024. So, for the next person who has to touch it, here is an +;; explanation. +;; +;; This is essentially the standard dynamic programming algorithm for +;; string alignment, which is the same thing as edit distance, which +;; is to say Levenshtein distance, that you learn in Algorithms 101. +;; +;; Step 1 is we take two strings and compute the edit distance between +;; them. This is the minimum sequence of insertions, deletions, and +;; substitutions (changing one character for another) that you can use +;; to transform the first string into the second, or vice versa. +;; +;; As an example suppose that we have the string "HEL|LO" where point +;; is represented by the pipe character. The formatter changes the +;; string to "HEO". We want to figure out where to put point. We want +;; to obtain the result "HE|O" as this will seem intuitive to the +;; user. +;; +;; For step 1 we determine that the edit distance is 2 because you can +;; transform "HELLO" into "HEO" by two deletions (deleting the two +;; Ls). +;; +;; To implement step 1 we use dynamic programming. Specifically we +;; construct a six-by-four table (because the lengths of the strings +;; are five and three, respectively, and we add one to each +;; dimension). Each entry in the table will show the edit distance +;; between two substrings, specifically column i1 and row i2 is the +;; edit distance between the first i1 chars of the first string and +;; the first i2 chars of the second string. Zero-indexed. +;; +;; We start out by filling in the first row and column. The upper-left +;; cell is zero because it is the edit distance from the empty string +;; to itself. The remainder of the first row and column are increasing +;; integers because they are the edit distance from the empty string +;; to a string of length n, or vice versa, which is always n +;; insertions or deletions. +;; +;; Then the dynamic programming part is filling in the rest of the +;; table based on previously computed entries. For each cell we have +;; the following choices: +;; +;; 1. Add one to the value from the cell above. Thinking about the +;; prefixes, this corresponds to inserting a char from the second +;; string, which is an insertion in terms of edit distance. +;; +;; 2. Add one to the cell on the left. This corresponds to inserting a +;; char from the first string, which is a deletion in terms of edit +;; distance. +;; +;; 3. Add one to the cell diagonally above and to the left. This +;; corresponds to a substitution in terms of edit distance. But if +;; the chars at this row and column happen to be the same between +;; the first and second strings, it is a substitution of the same +;; char for itself, so not actually a substitution - thus we only +;; add one if the chars differ. +;; +;; Here is the edit distance table for "HELLO" and "HEO": +;; +;; - H E L L O +;; - 0 1 2 3 4 5 +;; H 1 0 1 2 3 4 +;; E 2 1 0 1 2 3 +;; O 3 2 1 1 2 2 +;; +;; Step 2 is to take this table and convert it into the sequence of +;; editing operations that transforms the first string to the second. +;; Imagine that each time we fill in a cell, we draw an arrow from +;; that cell to the cell that we used as the basis for its value (one +;; of the 1, 2, 3 options in the list above). Starting at the +;; lower-right cell, we can follow the arrows, which will point a +;; unique path to the upper-left. These are the cells that the path +;; will traverse for the example table above: +;; +;; - H E L L O +;; - * 1 2 3 4 5 +;; H 1 * 1 2 3 4 +;; E 2 1 * * * 3 +;; O 3 2 1 1 2 * +;; +;; Tracing to the upper-left, we get this sequence of operations: +;; +;; 1. O - Leave alone +;; 2. L - Delete +;; 3. L - Delete +;; 4. E - Leave alone +;; 5. H - Leave alone +;; +;; Step 3 is to take this sequence of operations and use it to compute +;; the correct offset within the modified string. Since we start with +;; "HEL|LO", note that operations 1-2 will not affect point since they +;; occur after point (and the numerical value of point is the number +;; of chars before point, no matter how many are after it), while +;; operations 3-5 are relevant for point. But we do need to go through +;; operations 1-2 during step 2 to determine the correct path through +;; the dynamic programming table. +;; +;; When going through steps 3-5, we may adjust point. We assume point +;; will start at the same position in the modified string as it was in +;; the original string, and then possibly make changes to it. In the +;; case of a substitution or no-op, we don't move point. In the case +;; of a deletion that occurs before point, we need to decrease point. +;; In the case of an insertion that occurs before point, we need to +;; increase point. +;; +;; For our given example, we have two no-ops and one deletion that +;; occur in steps 3-5 before point, so "HEL|LO" becomes "HE|O" with +;; point changing from 3 to 2 (assuming the start of the string is 0). +;; +;; This example is tested in the unit tests file if you want to look +;; there to verify usage. + +(require 'cl-lib) + +(cl-defun apheleia--edit-distance-table (s1 s2) + "Align strings S1 and S2 for minimum edit distance. +Return the dynamic programming table as a hash table which maps +cons of integers (I1 . I2) to the edit distance between the first +I1 characters of S1 and the first I2 characters of S2." + (let ((table (make-hash-table :test #'equal))) + (dotimes (i1 (1+ (length s1))) + (puthash (cons i1 0) i1 table)) + (dotimes (i2 (1+ (length s2))) + (puthash (cons 0 i2) i2 table)) + (dotimes (i1 (length s1)) + ;; Iterate from 1 to length+1. + (cl-incf i1) + (dotimes (i2 (length s2)) + (cl-incf i2) + (let ((ins (1+ (gethash (cons i1 (1- i2)) table))) + (del (1+ (gethash (cons (1- i1) i2) table))) + (sub (gethash (cons (1- i1) (1- i2)) table))) + (unless (= (aref s1 (1- i1)) (aref s2 (1- i2))) + (cl-incf sub)) + (puthash (cons i1 i2) (min ins del sub) table)))) + table)) + +(defun apheleia--align-point (s1 s2 p1) + "Given strings S1 and S2 and index P1 in S1, return matching index P2 in S2. +If S1 and S2 are the same, then P1 and P2 will also be the same. +Otherwise, the text of S2 surrounding P2 is \"similar\" to the +text of S1 surrounding P1." + (let* ((table (apheleia--edit-distance-table s1 s2)) + (i1 (length s1)) + (i2 (length s2)) + (p2 p1)) + (while (not (= i1 i2 0)) + (let ((ins (1+ (gethash (cons i1 (1- i2)) table 9999))) + (del (1+ (gethash (cons (1- i1) i2) table 9999))) + (sub (gethash (cons (1- i1) (1- i2)) table 9999))) + (unless (and (> 0 i1) (> 0 i2) + (= (aref s1 (1- i1)) (aref s2 (1- i2)))) + (cl-incf sub)) + (let ((cost (min ins del sub))) + (cond + ((= cost sub) + (cl-decf i1) + (cl-decf i2)) + ((= cost ins) + (cl-decf i2) + (when (< i1 p1) + (cl-incf p2))) + ((= cost del) + (cl-decf i1) + (when (< i1 p1) + (cl-decf p2))))))) + p2)) + +(provide 'apheleia-dp) + +;;; apheleia-dp.el ends here diff --git a/apheleia-rcs.el b/apheleia-rcs.el index 2e10c12d..e4629210 100644 --- a/apheleia-rcs.el +++ b/apheleia-rcs.el @@ -9,59 +9,12 @@ ;;; Code: +(require 'apheleia-dp) (require 'apheleia-log) (require 'cl-lib) (require 'subr-x) -(cl-defun apheleia--edit-distance-table (s1 s2) - "Align strings S1 and S2 for minimum edit distance. -Return the dynamic programming table as has table which maps cons -of integers (I1 . I2) to the edit distance between the first I1 -characters of S1 and the first I2 characters of S2." - (let ((table (make-hash-table :test #'equal))) - (dotimes (i1 (1+ (length s1))) - (puthash (cons i1 0) i1 table)) - (dotimes (i2 (1+ (length s2))) - (puthash (cons 0 i2) i2 table)) - (dotimes (i1 (length s1)) - ;; Iterate from 1 to length+1. - (cl-incf i1) - (dotimes (i2 (length s2)) - (cl-incf i2) - (let ((ins (1+ (gethash (cons i1 (1- i2)) table))) - (del (1+ (gethash (cons (1- i1) i2) table))) - (sub (gethash (cons (1- i1) (1- i2)) table))) - (unless (= (aref s1 (1- i1)) (aref s2 (1- i2))) - (cl-incf sub)) - (puthash (cons i1 i2) (min ins del sub) table)))) - table)) - -(defun apheleia--align-point (s1 s2 p1) - "Given strings S1 and S2 and index P1 in S1, return matching index P2 in S2. -If S1 and S2 are the same, then P1 and P2 will also be the same. -Otherwise, the text of S2 surrounding P2 is \"similar\" to the -text of S1 surrounding P1." - (let* ((table (apheleia--edit-distance-table s1 s2)) - (i1 (length s1)) - (i2 (length s2))) - (while (> i1 p1) - (let ((ins (1+ (gethash (cons i1 (1- i2)) table))) - (del (1+ (gethash (cons (1- i1) i2) table))) - (sub (gethash (cons (1- i1) (1- i2)) table))) - (unless (= (aref s1 (1- i1)) (aref s2 (1- i2))) - (cl-incf sub)) - (let ((cost (min ins del sub))) - (cond - ((= cost ins) - (cl-decf i2)) - ((= cost del) - (cl-decf i1)) - ((= cost sub) - (cl-decf i1) - (cl-decf i2)))))) - i2)) - (defun apheleia--map-rcs-patch (func) "Map over the RCS patch in the current buffer. For each RCS patch command, FUNC is called with an alist that has diff --git a/scripts/check-line-length.bash b/scripts/check-line-length.bash index 52a419cb..20f66d92 100755 --- a/scripts/check-line-length.bash +++ b/scripts/check-line-length.bash @@ -9,6 +9,8 @@ find=( -name .log -prune -o -path ./scripts/pnp-bin.js -prune -o -path ./test/formatters -prune -o + -path ./test/unit -prune -o + -path ./vendor -prune -o -name "*.elc" -o -type f -print ) diff --git a/test/formatters/run-func.bash b/test/shared/run-func.bash similarity index 53% rename from test/formatters/run-func.bash rename to test/shared/run-func.bash index 621edde8..f5d6f96d 100755 --- a/test/formatters/run-func.bash +++ b/test/shared/run-func.bash @@ -2,11 +2,13 @@ set -euo pipefail +func="$1" +shift + # Avoid using git to get project directory, due to # https://github.com/radian-software/apheleia/pull/89#issuecomment-1107319617 -cd "$(dirname "$0")" -repo="$(cd ../.. && pwd)" +cd "$(dirname "$0")/../.." -exec emacs --batch -L "${repo}" -L . -l apheleia-ft \ - --eval "(setq debug-on-error t)" -f "$1" +exec emacs --batch -L . "$@" \ + --eval "(setq debug-on-error t)" -f "${func}" diff --git a/test/unit/apheleia-unit-test.el b/test/unit/apheleia-unit-test.el new file mode 100644 index 00000000..c5d54cc5 --- /dev/null +++ b/test/unit/apheleia-unit-test.el @@ -0,0 +1,97 @@ +;; -*- lexical-binding: t -*- + +;; `apheleia-unit-tests' - unit tests using ERT. + +(require 'apheleia) + +;; Using buttercup because ert makes it really hard to write tabular +;; tests that report enough context to debug when they fail. +(require 'buttercup) + +(require 'map) + +(defun apheleia-unit-find-vars (form) + (cond + ((symbolp form) + (list form)) + ((listp form) + (mapcan #'apheleia-unit-find-vars (cdr form))))) + +(describe "apheleia--edit-distance-table" + (cl-flet ((table-error + (before-str after-str expected-table) + (let* ((hash (apheleia--edit-distance-table before-str after-str)) + (table + (mapcar + (lambda (i2) + (mapcar + (lambda (i1) + (gethash (cons i1 i2) hash)) + (number-sequence 0 (length before-str)))) + (number-sequence 0 (length after-str))))) + (unless (equal table expected-table) + table)))) + (cl-macrolet ((testcases + (description &rest specs) + `(it ,description + ,@(mapcar + (lambda (spec) + `(expect + (table-error ,@spec) + :to-be nil)) + specs)))) + (testcases + "computes the example from apheleia-dp file header" + ("hello" "heo" + '((0 1 2 3 4 5) + (1 0 1 2 3 4) + (2 1 0 1 2 3) + (3 2 1 1 2 2))))))) + +(describe "apheleia--align-point" + (cl-flet ((alignment-error + (before-spec after-spec) + (let* ((before-pos (string-match "|" before-spec)) + (after-pos (string-match "|" after-spec)) + (before-str (replace-regexp-in-string "|" "" before-spec)) + (after-str (replace-regexp-in-string "|" "" after-spec)) + (real-after-pos (apheleia--align-point before-str after-str before-pos))) + (unless (= after-pos real-after-pos) + (concat (substring after-str 0 real-after-pos) "|" + (substring after-str real-after-pos)))))) + (cl-macrolet ((testcases + (description &rest specs) + `(it ,description + ,@(mapcar + (lambda (spec) + (cl-destructuring-bind (before-spec after-spec) spec + `(expect + (alignment-error ,before-spec ,after-spec) + :to-be nil))) + specs)))) + (testcases + "does normal alignments" + ("hel|lo" + "he|o") + ("hello| world" + "helo| word") + ("hello | world" + "hello|world")) + (testcases + "solves issue #2" + (" if (node.type === \"CallExpression\" && (node.callee.type === \"Import\" @@ (node.callee.type === \"Identifier\" && node.callee.name === \"require\"))) { + //| + } +" + " if ( + node.type === \"CallExpression\" && + (node.callee.type === \"Import\" @@ + (node.callee.type === \"Identifier\" && node.callee.name === \"require\")) + ) { + //| + } +")) + (testcases + "solves issue #290" + (" |