diff --git a/Documentation/config/diff.adoc b/Documentation/config/diff.adoc index 1135a62a0ad3de..ac0635bb3bee1c 100644 --- a/Documentation/config/diff.adoc +++ b/Documentation/config/diff.adoc @@ -218,6 +218,11 @@ endif::git-diff[] Set this option to `true` to make the diff driver cache the text conversion outputs. See linkgit:gitattributes[5] for details. +`diff..process`:: + The command to run as a long-running diff process that + provides hunks to Git's diff pipeline. + See linkgit:gitattributes[5] for details. + `diff.indentHeuristic`:: Set this option to `false` to disable the default heuristics that shift diff hunk boundaries to make patches easier to read. diff --git a/Documentation/diff-algorithm-option.adoc b/Documentation/diff-algorithm-option.adoc index 8e3a0b63d784d8..4d7e2ec35f97ea 100644 --- a/Documentation/diff-algorithm-option.adoc +++ b/Documentation/diff-algorithm-option.adoc @@ -18,3 +18,6 @@ For instance, if you configured the `diff.algorithm` variable to a non-default value and want to use the default one, then you have to use `--diff-algorithm=default` option. ++ +If you explicitly choose a diff algorithm, it also bypasses +`diff..process` (see linkgit:gitattributes[5]). diff --git a/Documentation/diff-options.adoc b/Documentation/diff-options.adoc index c8242e24627eef..a884445211ed8e 100644 --- a/Documentation/diff-options.adoc +++ b/Documentation/diff-options.adoc @@ -833,7 +833,9 @@ endif::git-format-patch[] to use this option with linkgit:git-log[1] and friends. `--no-ext-diff`:: - Disallow external diff drivers. + Disallow external diff helpers, including + `diff..command` and `diff..process` + (see linkgit:gitattributes[5]). `--textconv`:: `--no-textconv`:: diff --git a/Documentation/gitattributes.adoc b/Documentation/gitattributes.adoc index bd76167a45eb71..8c8cb051c7075a 100644 --- a/Documentation/gitattributes.adoc +++ b/Documentation/gitattributes.adoc @@ -784,6 +784,16 @@ with the above configuration, i.e. `j-c-diff`, with 7 parameters, just like `GIT_EXTERNAL_DIFF` program is called. See linkgit:git[1] for details. +An external diff driver replaces the patch Git would otherwise +produce for the path: Git runs the command and shows its output in +place of its own. Output features that post-process Git's diff do +not apply to it; word diff, function context (`-W`), `--color-moved`, +and coloring all act on Git's builtin diff, not the driver's output. +The driver is consulted only when Git generates a textual patch. The +summary formats (`--stat`, `--numstat`, `--shortstat`, and +`--dirstat`), `git blame`, and `git log -L` do not run it and +continue to use Git's builtin diff. + If the program is able to ignore certain changes (similar to `git diff --ignore-space-change`), then also set the option `trustExitCode` to true. It is then expected to return exit code 1 if @@ -821,6 +831,249 @@ NOTE: If `diff..command` is defined for path with the (see above), and adding `diff..algorithm` has no effect, as the algorithm is not passed to the external diff driver. +Using an external diff process +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If `diff..process` is defined, Git sends the old and new file +content to an external tool and receives back a list of changed +regions (pairs of line ranges in the old and new file). Git uses +these instead of its builtin diff algorithm, but still controls +all output formatting, so features like word diff, function context, +color, and blame work normally. This is achieved by using the +long-running process protocol (described in +Documentation/technical/long-running-process-protocol.adoc). +Unlike `diff..command`, which replaces Git's output entirely, +the diff process feeds results back into the standard pipeline. If +both are configured for a path, `diff..command` takes precedence +for the output it produces: since it replaces the diff, the process is +not consulted there. + +First, in `.gitattributes`, assign the `diff` attribute for paths. + +------------------------ +*.c diff=cdiff +------------------------ + +Then, define a "diff..process" configuration to specify +the diff process command. + +---------------------------------------------------------------- +[diff "cdiff"] + process = /path/to/diff-process-tool +---------------------------------------------------------------- + +When Git encounters the first file that needs to be diffed, it starts +the process and performs the handshake. In the handshake, the welcome +message sent by Git is "git-diff-client", only version 1 is supported, +and the supported capability is "hunks" (the changed regions +described below). + +For each file, Git sends a list of "key=value" pairs terminated with +a flush packet, followed by the old and new file content as packetized +data, each terminated with a flush packet. The pathname is relative +to the repository root. When `diff..textconv` is also set, +the tool receives the textconv-transformed content rather than the +raw blob, matching what the consuming feature itself diffs: patch +output is textconv'd, the summary formats (noted below) are not, and +`git blame` applies textconv only under `--textconv`. Git does not +send binary files to the diff process. + +----------------------- +packet: git> command=hunks +packet: git> pathname=path/file.c +packet: git> 0000 +packet: git> OLD_CONTENT +packet: git> 0000 +packet: git> NEW_CONTENT +packet: git> 0000 +----------------------- + +The tool is expected to respond with zero or more hunk lines, +a flush packet, and a status packet terminated with a flush packet. +Each hunk line has the form: + + `hunk ` + +where `` and `` identify a range of lines in +the old file, and `` and `` identify the +replacement range in the new file. Start values are 1-based and +counts are non-negative. Ranges must not extend beyond the end of +the file. For example, `hunk 3 2 3 4` means that 2 lines starting +at line 3 in the old file were replaced by 4 lines starting at +line 3 in the new file. An `` of 0 means no lines were +removed (pure insertion); a `` of 0 means no lines were +added (pure deletion). A start value of 0 is accepted when +the corresponding count is 0 (e.g., `hunk 0 0 1 5` for a newly +added file), matching what `git diff` itself emits for empty +file sides. Git ignores any extra whitespace-separated tokens after +``, so a future protocol version can append fields to a +hunk line (for example a "moved" marker) without older clients +rejecting it. + +Lines are delimited by newlines. A file `"foo\nbar\n"` and a +file `"foo\nbar"` both have 2 lines. + +Hunks must be listed in order and must not overlap. Any line not +covered by a hunk is treated as unchanged and is paired, in order, +with the unchanged lines on the other side. Each run of unchanged +lines between two hunks (and the run before the first hunk and +after the last) must therefore be the same length on both sides, +not merely equal in total. For the hunks `1 3 1 5` and `10 2 12 2` +below, lines 4-9 of the old file and lines 6-11 of the new file are +both the six unchanged lines between the two hunks. A response that +balances only the total unchanged count but misaligns one of these +runs is rejected, and Git falls back to the builtin diff. + +Git does not check that the lines a hunk leaves unchanged are +byte-for-byte identical between the two sides; it pairs them by +position and shows the new side as context. A tool may therefore +report lines that differ textually (a pure reformatting, say) as +unchanged, and the diff reflects that judgment. This is +the point of a semantic backend, but it means a misbehaving tool can +produce a diff whose context does not match the old blob; as with +`git diff -w`, such a patch may not apply against the old content. + +----------------------- +packet: git< hunk 1 3 1 5 +packet: git< hunk 10 2 12 2 +packet: git< 0000 +packet: git< status=success +packet: git< 0000 +----------------------- + +If the tool responds with hunks and "success", Git marks those lines +as changed and feeds them into the standard diff pipeline. Git may +still slide or regroup those changes against matching context for +display, exactly as it compacts its own diffs, so the tool controls +which lines are reported as changed, not the precise hunk boundaries. +Patch output features (word diff, function context, color) work +normally, as do summary formats like `--stat`. Not every feature +consults the process, though; see "Which features consult the diff +process" below for the full picture and the reasoning behind it. + +If no hunk lines precede the flush, followed by "success", Git +treats the files as having no changes: `git diff` produces no output, +`git diff --exit-code` and `--quiet` report success even though the +stored blobs differ, and `git blame` skips the commit, attributing +lines to earlier commits. +The one exception is a change that only adds or removes the file's +trailing newline: it cannot be expressed as line hunks, so when the +line content otherwise matches Git keeps the builtin diff for that +file (preserving the `\ No newline at end of file` marker) instead of +treating the two sides as equal. + +----------------------- +packet: git< 0000 +packet: git< status=success +packet: git< 0000 +----------------------- + +If the tool returns invalid hunks (out of bounds, overlapping, or +mismatched unchanged line counts), Git warns and falls back to the +builtin diff algorithm. + +In case the tool cannot or does not want to process the content, +it is expected to respond with an "error" status. Git warns and +falls back to the builtin diff algorithm for this file. The tool +remains available for subsequent files. + +----------------------- +packet: git< 0000 +packet: git< status=error +packet: git< 0000 +----------------------- + +In case the tool cannot or does not want to process the content as +well as any future content for the lifetime of the Git process, it +is expected to respond with an "abort" status. Git silently falls +back to the builtin diff algorithm for this file and does not send +further requests to the tool. + +----------------------- +packet: git< 0000 +packet: git< status=abort +packet: git< 0000 +----------------------- + +If the tool dies during the communication or does not adhere to the +protocol then Git will stop the process and fall back to the builtin +diff algorithm. Git warns once and does not restart the process for +subsequent files. + +Tools should ignore unknown keys in the per-file request to remain +forward-compatible. Future versions of Git may send additional +`command=` values; tools that receive an unrecognized command should +respond with `status=error` rather than terminating. + +Which features consult the diff process +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The diff process answers a single question: given two blobs, which +line ranges differ? Whether a particular feature consults it follows +from whether that is the question the feature is really asking. + +Features that ask "which lines changed" use the tool's hunks in place +of the builtin algorithm: + +- `git diff` patch output, together with everything layered on it: + word diff, function context (`-W`), `--color-moved`, and the `@@` + hunk headers. These operate on the lines the patch step already + emitted, so they reflect the tool's hunks without any further + negotiation. +- `git blame`: a commit whose change the tool reports as equivalent is + skipped, and its lines are attributed to an earlier commit. +- `git log -L`: both the line-range display and the underlying range + tracking consult the tool, so a commit it reports as equivalent is + dropped from the log (its tracked range maps across unchanged) + rather than selected and then shown with an empty diff. +- `--stat`, `--numstat`, and `--shortstat`: the inserted and deleted + counts come from the tool's hunks, so a file the tool calls + equivalent contributes no stat line, matching the empty patch that + `git diff` produces for it. These summary formats do not apply + textconv (just as the builtin summary path does not), so the tool + is consulted on the raw blob content even when a `textconv` is also + configured for patch output; this mirrors how builtin `--stat` + already counts raw lines rather than the textconv'd view. The + line-counting `--dirstat=lines` uses these same counts; the default + `--dirstat`, which weighs byte changes, is computed on its own and + does not consult the tool. + +Features that ask a different question do not consult the process, by +design: + +- The pickaxe `-G` searches the textual diff for a pattern; it + asks "does this string appear in the diff," not "did these lines + change." (`-S` runs at an earlier stage and is likewise unaffected.) +- `git patch-id` must produce a stable hash for `git rebase` and + cherry-pick detection; deriving it from a configured tool would make + equal patches hash differently from machine to machine. +- The merge machinery (`git merge-tree`, `rerere`) computes merge + content and conflict signatures rather than display output, so the + tool's hunks must not alter its results. +- `git range-diff` diffs patch text, not source blobs, so source-file + hunks do not apply to it. +- `--check` reports whitespace errors in added lines using the builtin + diff's notion of which lines are added, not the tool's. It can + therefore flag (and exit non-zero on) a line the tool treats as + unchanged and that `git diff` shows as context. Whitespace breakage + is a property of the literal bytes, so `--check` keeps the builtin + partition deliberately; a future change could wire it to the tool if + matching `git diff` exactly became desirable. +- `--raw`, `--name-only`, and `--name-status` compare object ids at + the tree level and never run a line-level diff at all. + +Combined diffs (`--cc` and merge diffs) ask "which lines changed" but +still use the builtin algorithm, and may consult the process in a +later change; their protocol would have to be extended from a single +old/new pair to one comparison per merge parent. + +`--no-ext-diff` and `--diff-algorithm` bypass the process entirely, +for every feature listed above. The whitespace-ignoring options +(`-w`, `--ignore-space-change`, `--ignore-blank-lines`, and the like), +`-I`, and `--anchored` also bypass it for the affected files: +the tool is never told about these options, so it could not honor +them, and Git falls back to the builtin diff, which does. + Defining a custom hunk-header ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/Makefile b/Makefile index 1cec251f4387cf..1314c104634310 100644 --- a/Makefile +++ b/Makefile @@ -811,6 +811,7 @@ TEST_BUILTINS_OBJS += test-csprng.o TEST_BUILTINS_OBJS += test-date.o TEST_BUILTINS_OBJS += test-delete-gpgsig.o TEST_BUILTINS_OBJS += test-delta.o +TEST_BUILTINS_OBJS += test-diff-process-backend.o TEST_BUILTINS_OBJS += test-dir-iterator.o TEST_BUILTINS_OBJS += test-drop-caches.o TEST_BUILTINS_OBJS += test-dump-cache-tree.o @@ -1140,6 +1141,7 @@ LIB_OBJS += diff-delta.o LIB_OBJS += diff-merges.o LIB_OBJS += diff-lib.o LIB_OBJS += diff-no-index.o +LIB_OBJS += diff-process.o LIB_OBJS += diff.o LIB_OBJS += diffcore-break.o LIB_OBJS += diffcore-delta.o diff --git a/blame.c b/blame.c index 126e2324162353..a914430e0270b9 100644 --- a/blame.c +++ b/blame.c @@ -19,6 +19,8 @@ #include "tag.h" #include "trace2.h" #include "blame.h" +#include "diff-process.h" +#include "xdiff-interface.h" #include "alloc.h" #include "commit-slab.h" #include "bloom.h" @@ -314,17 +316,25 @@ static struct commit *fake_working_tree_commit(struct repository *r, -static int diff_hunks(mmfile_t *file_a, mmfile_t *file_b, - xdl_emit_hunk_consume_func_t hunk_func, void *cb_data, int xdl_opts) +static int diff_hunks_xpp(mmfile_t *file_a, mmfile_t *file_b, + xdl_emit_hunk_consume_func_t hunk_func, + void *cb_data, xpparam_t *xpp) { - xpparam_t xpp = {0}; xdemitconf_t xecfg = {0}; xdemitcb_t ecb = {NULL}; - xpp.flags = xdl_opts; xecfg.hunk_func = hunk_func; ecb.priv = cb_data; - return xdi_diff(file_a, file_b, &xpp, &xecfg, &ecb); + return xdi_diff(file_a, file_b, xpp, &xecfg, &ecb); +} + +static int diff_hunks(mmfile_t *file_a, mmfile_t *file_b, + xdl_emit_hunk_consume_func_t hunk_func, void *cb_data, int xdl_opts) +{ + xpparam_t xpp = {0}; + + xpp.flags = xdl_opts; + return diff_hunks_xpp(file_a, file_b, hunk_func, cb_data, &xpp); } static const char *get_next_line(const char *start, const char *end) @@ -1946,6 +1956,7 @@ static void pass_blame_to_parent(struct blame_scoreboard *sb, struct blame_origin *parent, int ignore_diffs) { mmfile_t file_p, file_o; + xpparam_t xpp = {0}; struct blame_chunk_cb_data d; struct blame_entry *newdest = NULL; @@ -1964,10 +1975,27 @@ static void pass_blame_to_parent(struct blame_scoreboard *sb, &sb->num_read_blob, ignore_diffs); sb->num_get_patch++; - if (diff_hunks(&file_p, &file_o, blame_chunk_cb, &d, sb->xdl_opts)) - die("unable to generate diff (%s -> %s)", - oid_to_hex(&parent->commit->object.oid), - oid_to_hex(&target->commit->object.oid)); + xpp.flags = sb->xdl_opts; + /* + * Whitespace-ignoring options are not communicated to the diff + * process, so it could not honor them; consult it only when none + * are set, and otherwise use the builtin diff (which does honor + * them via xpp.flags), matching the bypass in + * diff_process_fill_hunks() for "git diff". If the process + * considers the files equivalent, skip the diff so blame looks + * past this commit. + */ + if ((xpp.flags & (XDF_WHITESPACE_FLAGS | XDF_IGNORE_BLANK_LINES)) || + diff_process_fill_hunks(&sb->revs->diffopt, target->path, + &file_p, &file_o, &xpp) + != DIFF_PROCESS_EQUIVALENT) { + if (diff_hunks_xpp(&file_p, &file_o, blame_chunk_cb, + &d, &xpp)) + die("unable to generate diff (%s -> %s)", + oid_to_hex(&parent->commit->object.oid), + oid_to_hex(&target->commit->object.oid)); + } + free(xpp.external_hunks); /* The rest are the same as the parent */ blame_chunk(&d.dstq, &d.srcq, INT_MAX, d.offset, INT_MAX, 0, parent, target, 0); diff --git a/builtin/log.c b/builtin/log.c index d027ce1e0bc833..7821a61143c9d5 100644 --- a/builtin/log.c +++ b/builtin/log.c @@ -2217,6 +2217,13 @@ int cmd_format_patch(int argc, if (argc > 1) die(_("unrecognized argument: %s"), argv[1]); + /* + * Disable diff..process so that patches generated by + * format-patch are always based on the builtin diff algorithm + * and can be applied reliably. + */ + rev.diffopt.flags.no_diff_process = 1; + if (rev.diffopt.output_format & DIFF_FORMAT_NAME) die(_("--name-only does not make sense")); if (rev.diffopt.output_format & DIFF_FORMAT_NAME_STATUS) diff --git a/diff-process.c b/diff-process.c new file mode 100644 index 00000000000000..9a161c6f10a97d --- /dev/null +++ b/diff-process.c @@ -0,0 +1,350 @@ +/* + * Diff process backend: communicates with a long-running external + * tool via the pkt-line protocol to obtain custom line-matching + * results. The tool controls which lines are marked as changed + * while the display shows the file content (after any textconv + * transformation, if configured). + * + * Protocol: pkt-line over stdin/stdout, following the pattern of + * the long-running filter process protocol (see convert.c). + * + * Handshake: + * git> git-diff-client / version=1 / flush + * tool< git-diff-server / version=1 / flush + * git> capability=hunks / flush + * tool< capability=hunks / flush + * + * Per-file: + * git> command=hunks / pathname= / flush + * git> / flush + * git> / flush + * tool< hunk + * tool< ... / flush + * tool< status=success / flush + * + * When the tool returns no hunks with status=success, it considers + * the files equivalent. Git will skip the diff for that file. + */ + +#include "git-compat-util.h" +#include "diff-process.h" +#include "diff.h" +#include "gettext.h" +#include "repository.h" +#include "sigchain.h" +#include "userdiff.h" +#include "sub-process.h" +#include "pkt-line.h" +#include "strbuf.h" +#include "xdiff/xdiff.h" + +#define CAP_HUNKS (1u << 0) + +struct diff_subprocess { + struct subprocess_entry subprocess; + unsigned int supported_capabilities; +}; + +static int start_diff_process_fn(struct subprocess_entry *subprocess) +{ + static int versions[] = { 1, 0 }; + static struct subprocess_capability capabilities[] = { + { "hunks", CAP_HUNKS }, + { NULL, 0 } + }; + struct diff_subprocess *entry = + container_of(subprocess, struct diff_subprocess, subprocess); + + return subprocess_handshake(subprocess, "git-diff", + versions, NULL, + capabilities, + &entry->supported_capabilities); +} + +static struct diff_subprocess *get_or_launch_process( + struct userdiff_driver *drv) +{ + struct diff_subprocess *entry; + + if (drv->diff_subprocess) + return drv->diff_subprocess; + + entry = xcalloc(1, sizeof(*entry)); + if (subprocess_start_command(&entry->subprocess, drv->process, + start_diff_process_fn)) { + free(entry); + drv->diff_process_failed = 1; + return NULL; + } + + drv->diff_subprocess = entry; + return entry; +} + +static int send_file_content(int fd, const char *buf, long size) +{ + int ret = 0; + + if (size < 0) + return -1; + if (size > 0) + ret = write_packetized_from_buf_no_flush(buf, size, fd); + if (ret) + return ret; + return packet_flush_gently(fd); +} + +static int parse_hunk_line(const char *line, struct xdl_hunk *hunk) +{ + char *end; + + /* + * Format: "hunk " + * All numbers must be non-negative decimal with no leading + * whitespace or sign characters. + */ + if (!skip_prefix(line, "hunk ", &line)) + return -1; + + if (!isdigit(*line)) + return -1; + errno = 0; + hunk->old_start = strtol(line, &end, 10); + if (errno || end == line || *end++ != ' ') + return -1; + line = end; + + if (!isdigit(*line)) + return -1; + errno = 0; + hunk->old_count = strtol(line, &end, 10); + if (errno || end == line || *end++ != ' ') + return -1; + line = end; + + if (!isdigit(*line)) + return -1; + errno = 0; + hunk->new_start = strtol(line, &end, 10); + if (errno || end == line || *end++ != ' ') + return -1; + line = end; + + if (!isdigit(*line)) + return -1; + errno = 0; + hunk->new_count = strtol(line, &end, 10); + /* + * Accept (and ignore) any trailing space-separated tokens after + * the four counts, so future protocol versions can append fields + * to a hunk line (e.g. a "moved" or "semantic" marker) without + * older clients rejecting it. Mirrors the request-side rule that + * tools ignore unknown keys. + */ + if (errno || end == line || (*end != '\0' && *end != ' ')) + return -1; + + /* + * git diff emits start=0 when count=0 (empty file side). + * Normalize to 1-based so downstream validation can assume start >= 1. + */ + if (!hunk->old_count && !hunk->old_start) + hunk->old_start = 1; + if (!hunk->new_count && !hunk->new_start) + hunk->new_start = 1; + + return 0; +} + +static enum diff_process_result get_hunks( + struct userdiff_driver *drv, + const char *path, + const char *old_buf, long old_size, + const char *new_buf, long new_size, + struct xdl_hunk **hunks_out, + size_t *nr_hunks_out) +{ + struct diff_subprocess *backend; + struct child_process *process; + int fd_in, fd_out; + struct strbuf status = STRBUF_INIT; + struct xdl_hunk *hunks = NULL; + struct xdl_hunk hunk; + size_t nr_hunks = 0, alloc_hunks = 0, max_hunks; + int len; + char *line; + + backend = get_or_launch_process(drv); + if (!backend) + return DIFF_PROCESS_ERROR; + + if (!(backend->supported_capabilities & CAP_HUNKS)) + return DIFF_PROCESS_SKIP; + + process = subprocess_get_child_process(&backend->subprocess); + fd_in = process->in; + fd_out = process->out; + + sigchain_push(SIGPIPE, SIG_IGN); + + /* Send request */ + if (packet_write_fmt_gently(fd_in, "command=hunks\n") || + packet_write_fmt_gently(fd_in, "pathname=%s\n", path) || + packet_flush_gently(fd_in)) + goto comm_error; + + /* Send old file content */ + if (send_file_content(fd_in, old_buf, old_size)) + goto comm_error; + + /* Send new file content */ + if (send_file_content(fd_in, new_buf, new_size)) + goto comm_error; + + /* + * Hunks are non-overlapping and each covers at least one line, so + * a valid response cannot contain more hunks than the two files + * have lines, which is bounded by their byte sizes. Cap the + * accumulation accordingly so a misbehaving tool that floods hunk + * lines cannot drive unbounded memory growth before validation. + */ + max_hunks = (size_t)old_size + (size_t)new_size + 1; + + /* Read hunks until flush packet */ + while ((len = packet_read_line_gently(fd_out, NULL, &line)) >= 0 && + line) { + if (parse_hunk_line(line, &hunk) < 0) + goto comm_error; + if (nr_hunks >= max_hunks) { + warning(_("diff process '%s' sent too many hunks" + " for '%s'"), drv->process, path); + goto comm_error; + } + ALLOC_GROW(hunks, nr_hunks + 1, alloc_hunks); + hunks[nr_hunks++] = hunk; + } + if (len < 0) + goto comm_error; + + /* Read status */ + if (subprocess_read_status(fd_out, &status)) + goto comm_error; + + if (!strcmp(status.buf, "success")) { + *hunks_out = hunks; + *nr_hunks_out = nr_hunks; + strbuf_release(&status); + sigchain_pop(SIGPIPE); + return DIFF_PROCESS_OK; + } + + if (!strcmp(status.buf, "abort")) { + /* + * The tool voluntarily withdrew: stop sending requests + * but do not warn (this is not a failure). + */ + backend->supported_capabilities &= ~CAP_HUNKS; + free(hunks); + strbuf_release(&status); + sigchain_pop(SIGPIPE); + return DIFF_PROCESS_SKIP; + } + + /* status=error or unknown status */ + free(hunks); + strbuf_release(&status); + sigchain_pop(SIGPIPE); + return DIFF_PROCESS_ERROR; + +comm_error: + /* + * Communication failure (broken pipe, malformed response). + * Tear down the process and mark as failed so we do not + * retry on every subsequent file. + */ + drv->diff_process_failed = 1; + drv->diff_subprocess = NULL; + subprocess_stop_command(&backend->subprocess); + free(backend); + free(hunks); + strbuf_release(&status); + sigchain_pop(SIGPIPE); + return DIFF_PROCESS_ERROR; +} + +/* + * Whether exactly one of the two blobs ends in a newline. A change + * that only adds or removes the trailing newline is not expressible as + * line hunks, so a tool comparing lines reports the files as equal. + */ +static int eof_newline_differs(const mmfile_t *a, const mmfile_t *b) +{ + int a_nl = a->size > 0 && a->ptr[a->size - 1] == '\n'; + int b_nl = b->size > 0 && b->ptr[b->size - 1] == '\n'; + return a_nl != b_nl; +} + +enum diff_process_result diff_process_fill_hunks( + struct diff_options *diffopt, + const char *path, + const mmfile_t *file_a, + const mmfile_t *file_b, + xpparam_t *xpp) +{ + struct userdiff_driver *drv; + struct xdl_hunk *ext_hunks = NULL; + size_t nr = 0; + enum diff_process_result res; + + if (!diffopt || !path) + return DIFF_PROCESS_SKIP; + if (diffopt->flags.no_diff_process || diffopt->ignore_driver_algorithm) + return DIFF_PROCESS_SKIP; + /* + * Whitespace-ignoring, regex-ignore (-I) and anchored options + * change which lines count as different, but the tool is never + * told about them, so its hunks could not honor them. Rather + * than silently override the user's request, fall back to the + * builtin diff, which does honor these flags. + */ + if ((diffopt->xdl_opts & (XDF_WHITESPACE_FLAGS | XDF_IGNORE_BLANK_LINES)) || + diffopt->ignore_regex_nr || diffopt->anchors_nr) + return DIFF_PROCESS_SKIP; + + drv = userdiff_find_by_path(diffopt->repo->index, path); + if (!drv || !drv->process) + return DIFF_PROCESS_SKIP; + if (drv->diff_process_failed) + return DIFF_PROCESS_SKIP; + + res = get_hunks(drv, path, + file_a->ptr, file_a->size, + file_b->ptr, file_b->size, + &ext_hunks, &nr); + if (res == DIFF_PROCESS_OK) { + if (!nr) { + free(ext_hunks); + /* + * Zero hunks means the tool considers the line + * content identical, but it cannot express a + * trailing-newline-only change. When that is the + * actual difference, fall back to the builtin diff + * so the "\ No newline at end of file" marker is + * preserved instead of reporting the files equal. + */ + if (eof_newline_differs(file_a, file_b)) + return DIFF_PROCESS_SKIP; + return DIFF_PROCESS_EQUIVALENT; + } + xpp->external_hunks = ext_hunks; + xpp->external_hunks_nr = nr; + return DIFF_PROCESS_OK; + } + if (res == DIFF_PROCESS_ERROR) { + warning(_("diff process '%s' failed for '%s'," + " falling back to builtin diff"), + drv->process, path); + return DIFF_PROCESS_ERROR; + } + return DIFF_PROCESS_SKIP; +} diff --git a/diff-process.h b/diff-process.h new file mode 100644 index 00000000000000..d34b42f8116a04 --- /dev/null +++ b/diff-process.h @@ -0,0 +1,39 @@ +#ifndef DIFF_PROCESS_H +#define DIFF_PROCESS_H + +#include "xdiff/xdiff.h" + +struct diff_options; + +enum diff_process_result { + DIFF_PROCESS_ERROR = -1, /* tool failure: warned, fell back */ + DIFF_PROCESS_OK = 0, /* hunks populated in xpp */ + DIFF_PROCESS_SKIP, /* no process configured: use builtin */ + DIFF_PROCESS_EQUIVALENT, /* tool says files are equivalent */ +}; + +/* + * Consult the diff process configured for 'path' and populate + * xpp->external_hunks with the returned hunks. + * + * Handles driver lookup, flag checks (--no-ext-diff, + * --diff-algorithm), subprocess management, and error reporting. + * + * Returns DIFF_PROCESS_OK when hunks are populated in xpp. + * The caller owns xpp->external_hunks and must free() it. + * + * Returns DIFF_PROCESS_EQUIVALENT when the tool returns no hunks + * (files are considered identical); caller should skip diff/blame. + * Returns DIFF_PROCESS_SKIP when no process applies; caller + * should use the builtin diff algorithm. + * Returns DIFF_PROCESS_ERROR on tool failure (already warned); + * caller should fall back to the builtin diff algorithm. + */ +enum diff_process_result diff_process_fill_hunks( + struct diff_options *diffopt, + const char *path, + const mmfile_t *file_a, + const mmfile_t *file_b, + xpparam_t *xpp); + +#endif /* DIFF_PROCESS_H */ diff --git a/diff.c b/diff.c index 2a9d0d86871139..a260ecfaf48e3c 100644 --- a/diff.c +++ b/diff.c @@ -25,6 +25,7 @@ #include "utf8.h" #include "odb.h" #include "userdiff.h" +#include "diff-process.h" #include "submodule.h" #include "hashmap.h" #include "mem-pool.h" @@ -4055,6 +4056,17 @@ static void builtin_diff(const char *name_a, xpp.ignore_regex_nr = o->ignore_regex_nr; xpp.anchors = o->anchors; xpp.anchors_nr = o->anchors_nr; + + if (diff_process_fill_hunks(o, name_a, + &mf1, &mf2, &xpp) + == DIFF_PROCESS_EQUIVALENT) { + if (textconv_one) + free(mf1.ptr); + if (textconv_two) + free(mf2.ptr); + goto free_ab_and_return; + } + xecfg.ctxlen = o->context; xecfg.interhunkctxlen = o->interhunkcontext; xecfg.flags = XDL_EMIT_FUNCNAMES; @@ -4135,6 +4147,7 @@ static void builtin_diff(const char *name_a, } else if (xdi_diff_outf(&mf1, &mf2, NULL, fn_out_consume, &ecbdata, &xpp, &xecfg)) die("unable to generate diff for %s", one->path); + free(xpp.external_hunks); if (o->word_diff) free_diff_words_data(&ecbdata); if (textconv_one) @@ -4248,9 +4261,24 @@ static void builtin_diffstat(const char *name_a, const char *name_b, xecfg.ctxlen = o->context; xecfg.interhunkctxlen = o->interhunkcontext; xecfg.flags = XDL_EMIT_NO_HUNK_HDR; - if (xdi_diff_outf(&mf1, &mf2, NULL, + /* + * Consult the diff process so --stat reflects the + * tool's view of which lines changed rather than the + * builtin line diff. As in the builtin path, --stat + * does not apply textconv, so the tool is fed the same + * raw mmfiles the stat itself diffs (unlike builtin_diff, + * which consults the process on textconv'd content). + * When the tool reports the files as equivalent we skip + * xdiff entirely, leaving added and deleted at zero so + * the file is pruned below, just as builtin_diff() emits + * no patch for an equivalent file. + */ + if (diff_process_fill_hunks(o, name_a, &mf1, &mf2, &xpp) + != DIFF_PROCESS_EQUIVALENT && + xdi_diff_outf(&mf1, &mf2, NULL, diffstat_consume, diffstat, &xpp, &xecfg)) die("unable to generate diffstat for %s", one->path); + free(xpp.external_hunks); if (DIFF_FILE_VALID(one) && DIFF_FILE_VALID(two)) { struct diffstat_file *file = @@ -5927,6 +5955,17 @@ static int diff_opt_submodule(const struct option *opt, return 0; } +static int diff_opt_ext_diff(const struct option *opt, + const char *arg, int unset) +{ + struct diff_options *options = opt->value; + + BUG_ON_OPT_ARG(arg); + options->flags.allow_external = !unset; + options->flags.no_diff_process = unset; + return 0; +} + static int diff_opt_textconv(const struct option *opt, const char *arg, int unset) { @@ -6257,8 +6296,9 @@ struct option *add_diff_options(const struct option *opts, N_("exit with 1 if there were differences, 0 otherwise")), OPT_BOOL(0, "quiet", &options->flags.quick, N_("disable all output of the program")), - OPT_BOOL(0, "ext-diff", &options->flags.allow_external, - N_("allow an external diff helper to be executed")), + OPT_CALLBACK_F(0, "ext-diff", options, NULL, + N_("allow an external diff helper to be executed"), + PARSE_OPT_NOARG, diff_opt_ext_diff), OPT_CALLBACK_F(0, "textconv", options, NULL, N_("run external text conversion filters when comparing binary files"), PARSE_OPT_NOARG, diff_opt_textconv), diff --git a/diff.h b/diff.h index bb5cddaf3499e9..bc7da6986a4aac 100644 --- a/diff.h +++ b/diff.h @@ -173,6 +173,11 @@ struct diff_flags { */ unsigned allow_external; + /** + * Disables diff..process. Set by --no-ext-diff. + */ + unsigned no_diff_process; + /** * For communication between the calling program and the options parser; * tell the calling program to signal the presence of difference using diff --git a/line-log.c b/line-log.c index 5fc75ae275e03a..5da7bcdaeb330b 100644 --- a/line-log.c +++ b/line-log.c @@ -7,6 +7,7 @@ #include "tag.h" #include "tree.h" #include "diff.h" +#include "diff-process.h" #include "commit.h" #include "decorate.h" #include "repository.h" @@ -330,12 +331,15 @@ static int collect_diff_cb(long start_a, long count_a, return 0; } -static int collect_diff(mmfile_t *parent, mmfile_t *target, struct diff_ranges *out) +static int collect_diff(struct diff_options *diffopt, const char *path, + mmfile_t *parent, mmfile_t *target, + struct diff_ranges *out) { struct collect_diff_cbdata cbdata = {NULL}; xpparam_t xpp; xdemitconf_t xecfg; xdemitcb_t ecb; + int ret = 0; memset(&xpp, 0, sizeof(xpp)); memset(&xecfg, 0, sizeof(xecfg)); @@ -345,7 +349,20 @@ static int collect_diff(mmfile_t *parent, mmfile_t *target, struct diff_ranges * xecfg.hunk_func = collect_diff_cb; memset(&ecb, 0, sizeof(ecb)); ecb.priv = &cbdata; - return xdi_diff(parent, target, &xpp, &xecfg, &ecb); + + /* + * Consult the diff process so range tracking agrees with the + * diff that will be shown. When the tool reports the files as + * equivalent we collect no ranges, so the tracked range maps + * across unchanged and the commit drops out of the log, rather + * than being selected here but rendered with an empty diff by + * the process-aware builtin_diff(). + */ + if (diff_process_fill_hunks(diffopt, path, parent, target, &xpp) + != DIFF_PROCESS_EQUIVALENT) + ret = xdi_diff(parent, target, &xpp, &xecfg, &ecb); + free(xpp.external_hunks); + return ret; } /* @@ -927,7 +944,8 @@ static int process_diff_filepair(struct rev_info *rev, } diff_ranges_init(&diff); - if (collect_diff(&file_parent, &file_target, &diff)) + if (collect_diff(&rev->diffopt, pair->two->path, + &file_parent, &file_target, &diff)) die("unable to generate diff for %s", pair->one->path); /* NEEDSWORK should apply some heuristics to prevent mismatches */ diff --git a/meson.build b/meson.build index 3247697f74aae1..aa532f5200a916 100644 --- a/meson.build +++ b/meson.build @@ -328,6 +328,7 @@ libgit_sources = [ 'diff-merges.c', 'diff-lib.c', 'diff-no-index.c', + 'diff-process.c', 'diff.c', 'diffcore-break.c', 'diffcore-delta.c', diff --git a/sub-process.c b/sub-process.c index 2d5c965169727b..3cef42b0880e3f 100644 --- a/sub-process.c +++ b/sub-process.c @@ -49,7 +49,7 @@ int subprocess_read_status(int fd, struct strbuf *status) return (len < 0) ? len : 0; } -void subprocess_stop(struct hashmap *hashmap, struct subprocess_entry *entry) +void subprocess_stop_command(struct subprocess_entry *entry) { if (!entry) return; @@ -57,7 +57,14 @@ void subprocess_stop(struct hashmap *hashmap, struct subprocess_entry *entry) entry->process.clean_on_exit = 0; kill(entry->process.pid, SIGTERM); finish_command(&entry->process); +} +void subprocess_stop(struct hashmap *hashmap, struct subprocess_entry *entry) +{ + if (!entry) + return; + + subprocess_stop_command(entry); hashmap_remove(hashmap, &entry->ent, NULL); } @@ -72,7 +79,7 @@ static void subprocess_exit_handler(struct child_process *process) finish_command(process); } -int subprocess_start(struct hashmap *hashmap, struct subprocess_entry *entry, const char *cmd, +int subprocess_start_command(struct subprocess_entry *entry, const char *cmd, subprocess_start_fn startfn) { int err; @@ -96,15 +103,26 @@ int subprocess_start(struct hashmap *hashmap, struct subprocess_entry *entry, co return err; } - hashmap_entry_init(&entry->ent, strhash(cmd)); - err = startfn(entry); if (err) { error("initialization for subprocess '%s' failed", cmd); - subprocess_stop(hashmap, entry); + subprocess_stop_command(entry); return err; } + return 0; +} + +int subprocess_start(struct hashmap *hashmap, struct subprocess_entry *entry, const char *cmd, + subprocess_start_fn startfn) +{ + int err; + + err = subprocess_start_command(entry, cmd, startfn); + if (err) + return err; + + hashmap_entry_init(&entry->ent, strhash(cmd)); hashmap_add(hashmap, &entry->ent); return 0; } diff --git a/sub-process.h b/sub-process.h index bfc3959a1b4894..45f1b8e5e3212f 100644 --- a/sub-process.h +++ b/sub-process.h @@ -52,10 +52,17 @@ int cmd2process_cmp(const void *unused_cmp_data, */ typedef int(*subprocess_start_fn)(struct subprocess_entry *entry); -/* Start a subprocess and add it to the subprocess hashmap. */ +/* Start a subprocess and run the startfn (typically handshake). */ +int subprocess_start_command(struct subprocess_entry *entry, const char *cmd, + subprocess_start_fn startfn); + +/* Start a subprocess, run startfn, and add it to the subprocess hashmap. */ int subprocess_start(struct hashmap *hashmap, struct subprocess_entry *entry, const char *cmd, subprocess_start_fn startfn); +/* Kill a subprocess. */ +void subprocess_stop_command(struct subprocess_entry *entry); + /* Kill a subprocess and remove it from the subprocess hashmap. */ void subprocess_stop(struct hashmap *hashmap, struct subprocess_entry *entry); diff --git a/t/helper/meson.build b/t/helper/meson.build index 3235f10ab8aae1..6abcda4afb89c0 100644 --- a/t/helper/meson.build +++ b/t/helper/meson.build @@ -12,6 +12,7 @@ test_tool_sources = [ 'test-date.c', 'test-delete-gpgsig.c', 'test-delta.c', + 'test-diff-process-backend.c', 'test-dir-iterator.c', 'test-drop-caches.c', 'test-dump-cache-tree.c', diff --git a/t/helper/test-diff-process-backend.c b/t/helper/test-diff-process-backend.c new file mode 100644 index 00000000000000..a5a70dd40b5e3a --- /dev/null +++ b/t/helper/test-diff-process-backend.c @@ -0,0 +1,341 @@ +/* + * Test backend for the long-running diff process protocol + * (see diff-process.c and Documentation/gitattributes.adoc). + * + * Usage: test-tool diff-process-backend --mode= [--log=] + * + * Implements the server side of the pkt-line handshake and a per-file + * response loop. The --mode= switch selects the response shape + * (success, error, abort, crash, malformed hunks). + * + * Per-file request from Git: + * + * packet: git> command=hunks + * packet: git> pathname= + * packet: git> 0000 + * packet: git> OLD_CONTENT + * packet: git> 0000 + * packet: git> NEW_CONTENT + * packet: git> 0000 + * + * Response varies by --mode (default: whole-file): + * + * whole-file packet: git< hunk 1 1 + * fixed-hunk packet: git< hunk 5 2 5 2 + * no-hunks (no hunk packets) + * bad-hunk packet: git< hunk 999 1 999 1 + * bad-parse packet: git< garbage not a hunk + * bad-sync packet: git< hunk 1 2 1 1 + * bad-gap packet: git< hunk 1 1 3 1 + * multi-hunk packet: git< hunk 5 2 5 2 + * packet: git< hunk 9 2 9 2 + * flood packet: git< hunk 1 1 1 1 (x100000) + * overlap packet: git< hunk 1 5 1 5 + * packet: git< hunk 3 2 3 2 + * no-cap (omits capability=hunks during handshake) + * error (status=error instead of status=success) + * abort (status=abort instead of status=success) + * crash exit(1) before sending any response + * + * All non-error/abort modes end with: + * + * packet: git< 0000 + * packet: git< status=success + * packet: git< 0000 + * + * Each request is logged to --log as: + * + * command= pathname= old= new= + */ + +#include "test-tool.h" +#include "pkt-line.h" +#include "parse-options.h" +#include "strbuf.h" + +static FILE *logfile; + +enum mode { + MODE_WHOLE_FILE, + MODE_FIXED_HUNK, + MODE_NO_HUNKS, + MODE_BAD_HUNK, + MODE_BAD_PARSE, + MODE_BAD_SYNC, + MODE_BAD_GAP, + MODE_MULTI_HUNK, + MODE_FLOOD, + MODE_OVERLAP, + MODE_NO_CAP, + MODE_ERROR, + MODE_ABORT, + MODE_CRASH, +}; + +static enum mode parse_mode(const char *s) +{ + if (!strcmp(s, "whole-file")) + return MODE_WHOLE_FILE; + if (!strcmp(s, "fixed-hunk")) + return MODE_FIXED_HUNK; + if (!strcmp(s, "no-hunks")) + return MODE_NO_HUNKS; + if (!strcmp(s, "bad-hunk")) + return MODE_BAD_HUNK; + if (!strcmp(s, "bad-parse")) + return MODE_BAD_PARSE; + if (!strcmp(s, "bad-sync")) + return MODE_BAD_SYNC; + if (!strcmp(s, "bad-gap")) + return MODE_BAD_GAP; + if (!strcmp(s, "multi-hunk")) + return MODE_MULTI_HUNK; + if (!strcmp(s, "flood")) + return MODE_FLOOD; + if (!strcmp(s, "overlap")) + return MODE_OVERLAP; + if (!strcmp(s, "no-cap")) + return MODE_NO_CAP; + if (!strcmp(s, "error")) + return MODE_ERROR; + if (!strcmp(s, "abort")) + return MODE_ABORT; + if (!strcmp(s, "crash")) + return MODE_CRASH; + die("unknown --mode=%s", s); +} + +/* + * Read "key=value" packets up to a flush, capturing "command" and + * "pathname". Returns 1 if a request was read, 0 on EOF. + * + * The first packet uses the gentle variant so that a clean shutdown + * by Git (EOF) does not produce a spurious "the remote end hung up + * unexpectedly" on stderr. Subsequent packets use the non-gentle + * variant: once inside a request, truncation is a protocol violation + * and dying loudly is the correct response. + */ +static int read_request_header(char **command, char **pathname) +{ + int first = 1; + char *line; + + *command = *pathname = NULL; + for (;;) { + const char *value; + + if (first) { + if (packet_read_line_gently(0, NULL, &line) < 0) + return 0; + first = 0; + } else { + line = packet_read_line(0, NULL); + } + if (!line) + break; + if (skip_prefix(line, "command=", &value)) + *command = xstrdup(value); + else if (skip_prefix(line, "pathname=", &value)) + *pathname = xstrdup(value); + } + return 1; +} + +static size_t count_lines(const struct strbuf *buf) +{ + size_t lines = 0; + + for (size_t i = 0; i < buf->len; i++) + if (buf->buf[i] == '\n') + lines++; + + return lines + (buf->len > 0 && buf->buf[buf->len - 1] != '\n'); +} + +static void send_status(const char *status) +{ + packet_flush(1); + packet_write_fmt(1, "%s\n", status); + packet_flush(1); +} + +static void respond(enum mode mode, + const struct strbuf *old_buf, + const struct strbuf *new_buf) +{ + switch (mode) { + case MODE_ERROR: + send_status("status=error"); + return; + case MODE_ABORT: + send_status("status=abort"); + return; + case MODE_CRASH: + exit(1); + case MODE_FIXED_HUNK: + packet_write_fmt(1, "hunk 5 2 5 2\n"); + break; + case MODE_BAD_HUNK: + packet_write_fmt(1, "hunk 999 1 999 1\n"); + break; + case MODE_BAD_PARSE: + packet_write_fmt(1, "garbage not a hunk\n"); + break; + case MODE_BAD_SYNC: + packet_write_fmt(1, "hunk 1 2 1 1\n"); + break; + case MODE_BAD_GAP: + /* + * Globally balanced (1 changed line on each side, so the + * total unchanged counts match) but the gap before the + * change differs between sides: old line 1 vs new line 3. + * Exercises the per-gap lockstep-alignment check. + */ + packet_write_fmt(1, "hunk 1 1 3 1\n"); + break; + case MODE_MULTI_HUNK: + /* + * Two valid, non-overlapping, gap-aligned hunks. Exercises + * the accepting branch of the per-gap lockstep check with a + * non-zero previous-hunk end (the realistic two-region case). + */ + packet_write_fmt(1, "hunk 5 2 5 2\n"); + packet_write_fmt(1, "hunk 9 2 9 2\n"); + break; + case MODE_FLOOD: { + /* + * Emit far more hunks than any small file has lines, so Git + * trips its accumulation cap and falls back before reading + * them all. + */ + int i; + for (i = 0; i < 100000; i++) + packet_write_fmt(1, "hunk 1 1 1 1\n"); + break; + } + case MODE_OVERLAP: + packet_write_fmt(1, "hunk 1 5 1 5\n"); + packet_write_fmt(1, "hunk 3 2 3 2\n"); + break; + case MODE_NO_HUNKS: + break; + case MODE_NO_CAP: + case MODE_WHOLE_FILE: { + size_t old_lines = count_lines(old_buf); + size_t new_lines = count_lines(new_buf); + /* + * Match git diff output: start=0 when count=0 + * (empty file side), 1 otherwise. + */ + packet_write_fmt(1, "hunk %"PRIuMAX" %"PRIuMAX + " %"PRIuMAX" %"PRIuMAX"\n", + (uintmax_t)(old_lines ? 1 : 0), + (uintmax_t)old_lines, + (uintmax_t)(new_lines ? 1 : 0), + (uintmax_t)new_lines); + break; + } + } + send_status("status=success"); +} + +static void command_loop(enum mode mode) +{ + for (;;) { + char *command = NULL, *pathname = NULL; + struct strbuf obuf = STRBUF_INIT; + struct strbuf nbuf = STRBUF_INIT; + + if (!read_request_header(&command, &pathname)) + break; /* EOF: Git closed its end */ + + read_packetized_to_strbuf(0, &obuf, 0); + read_packetized_to_strbuf(0, &nbuf, 0); + + if (logfile) { + fprintf(logfile, + "command=%s pathname=%s old=%.*s new=%.*s\n", + command ? command : "(none)", + pathname ? pathname : "(none)", + (int)(strchrnul(obuf.buf, '\n') - obuf.buf), + obuf.buf, + (int)(strchrnul(nbuf.buf, '\n') - nbuf.buf), + nbuf.buf); + fflush(logfile); + } + + respond(mode, &obuf, &nbuf); + + free(command); + free(pathname); + strbuf_release(&obuf); + strbuf_release(&nbuf); + } +} + +static void handshake(enum mode mode) +{ + char *line; + + line = packet_read_line(0, NULL); + if (!line || strcmp(line, "git-diff-client")) + die("bad welcome: '%s'", line ? line : "(eof)"); + line = packet_read_line(0, NULL); + if (!line || strcmp(line, "version=1")) + die("bad version: '%s'", line ? line : "(eof)"); + if (packet_read_line(0, NULL)) + die("expected flush after version"); + + packet_write_fmt(1, "git-diff-server\n"); + packet_write_fmt(1, "version=1\n"); + packet_flush(1); + + /* Drain capabilities advertised by Git */ + while ((line = packet_read_line(0, NULL))) + ; /* drain */ + + /* Respond with our capabilities (or none for no-cap mode) */ + if (mode != MODE_NO_CAP) + packet_write_fmt(1, "capability=hunks\n"); + packet_flush(1); +} + +static const char *const usage_str[] = { + "test-tool diff-process-backend --mode= [--log=]", + NULL +}; + +int cmd__diff_process_backend(int argc, const char **argv) +{ + const char *mode_str = NULL, *log_path = NULL; + enum mode mode = MODE_WHOLE_FILE; + struct option options[] = { + OPT_STRING(0, "mode", &mode_str, "mode", + "response shape: whole-file (default), fixed-hunk," + " no-hunks, bad-hunk, bad-sync, overlap, error," + " abort, crash"), + OPT_STRING(0, "log", &log_path, "path", + "append per-request summary to this file"), + OPT_END() + }; + + argc = parse_options(argc, argv, NULL, options, usage_str, 0); + if (argc) + usage_with_options(usage_str, options); + + if (mode_str) + mode = parse_mode(mode_str); + + if (log_path) { + logfile = fopen(log_path, "a"); + if (!logfile) + die_errno("failed to open log '%s'", log_path); + } + + handshake(mode); + command_loop(mode); + + if (logfile && fclose(logfile)) + die_errno("error closing log"); + return 0; +} diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c index b71a22b43bbc9e..3c3f95269c6279 100644 --- a/t/helper/test-tool.c +++ b/t/helper/test-tool.c @@ -22,6 +22,7 @@ static struct test_cmd cmds[] = { { "date", cmd__date }, { "delete-gpgsig", cmd__delete_gpgsig }, { "delta", cmd__delta }, + { "diff-process-backend", cmd__diff_process_backend }, { "dir-iterator", cmd__dir_iterator }, { "drop-caches", cmd__drop_caches }, { "dump-cache-tree", cmd__dump_cache_tree }, diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h index f2885b33d58aa8..a5bb7555162c8e 100644 --- a/t/helper/test-tool.h +++ b/t/helper/test-tool.h @@ -15,6 +15,7 @@ int cmd__csprng(int argc, const char **argv); int cmd__date(int argc, const char **argv); int cmd__delta(int argc, const char **argv); int cmd__delete_gpgsig(int argc, const char **argv); +int cmd__diff_process_backend(int argc, const char **argv); int cmd__dir_iterator(int argc, const char **argv); int cmd__drop_caches(int argc, const char **argv); int cmd__dump_cache_tree(int argc, const char **argv); diff --git a/t/meson.build b/t/meson.build index 3219264fe7d497..6afbfa6a87155b 100644 --- a/t/meson.build +++ b/t/meson.build @@ -512,6 +512,7 @@ integration_tests = [ 't4072-diff-max-depth.sh', 't4073-diff-stat-name-width.sh', 't4074-diff-shifted-matched-group.sh', + 't4080-diff-process.sh', 't4100-apply-stat.sh', 't4101-apply-nonl.sh', 't4102-apply-rename.sh', diff --git a/t/t4080-diff-process.sh b/t/t4080-diff-process.sh new file mode 100755 index 00000000000000..07a2ad7403706e --- /dev/null +++ b/t/t4080-diff-process.sh @@ -0,0 +1,832 @@ +#!/bin/sh + +test_description='diff process via long-running process' + +. ./test-lib.sh + +# See t/helper/test-diff-process-backend.c for the backend implementation +# and available --mode= options. + +BACKEND="test-tool diff-process-backend" + +test_expect_success 'setup' ' + echo "*.c diff=cdiff" >.gitattributes && + git add .gitattributes && + + # boundary.c: 10 lines, changes at 5-6 and 9-10. + # Used by: hunk boundaries, error fallback, crash, bad hunks, overlap. + cat >boundary.c <<-\EOF && + line1 + line2 + line3 + line4 + OLD5 + OLD6 + line7 + line8 + OLD9 + OLD10 + EOF + git add boundary.c && + + # worddiff.c: single-line function, value changes 1 -> 999. + # Used by: word-diff, --diff-algorithm, --no-ext-diff, --stat. + cat >worddiff.c <<-\EOF && + int value(void) { return 1; } + EOF + git add worddiff.c && + + # newfile.c: single-line function, value changes 42 -> 99. + # Used by: modified file, --exit-code, multiple drivers. + cat >newfile.c <<-\EOF && + int new_func(void) { return 42; } + EOF + git add newfile.c && + + # logtest.c: single-line function for log/format-patch tests. + # Needs two commits so log -1 has a diff. + cat >logtest.c <<-\EOF && + int logfunc(void) { return 1; } + EOF + git add logtest.c && + + # two.c/one.c: two-file pair for error/abort/startup-failure tests. + cat >one.c <<-\EOF && + int first(void) { return 1; } + EOF + cat >two.c <<-\EOF && + int second(void) { return 2; } + EOF + git add one.c two.c && + + git commit -m "initial" && + + # Second commit for logtest.c (so log -1 has something to show). + cat >logtest.c <<-\EOF && + int logfunc(void) { return 2; } + EOF + git add logtest.c && + git commit -m "change logtest.c" && + + # Working tree modifications (not committed). + cat >boundary.c <<-\EOF && + line1 + line2 + line3 + line4 + NEW5 + NEW6 + line7 + line8 + NEW9 + NEW10 + EOF + + cat >worddiff.c <<-\EOF && + int value(void) { return 999; } + EOF + + cat >newfile.c <<-\EOF && + int new_func(void) { return 99; } + EOF + + cat >one.c <<-\EOF && + int first(void) { return 10; } + EOF + + cat >two.c <<-\EOF + int second(void) { return 20; } + EOF +' + +# +# Core behavior: the tool controls which lines are marked as changed. +# + +test_expect_success 'diff process hunk boundaries affect output' ' + # The file has changes at lines 5-6 and 9-10, but fixed-hunk + # only reports lines 5-6 as changed. Lines 9-10 should not + # appear as changed in the output. + git -c diff.cdiff.process="$BACKEND --mode=fixed-hunk" \ + diff boundary.c >actual && + test_grep "^-OLD5" actual && + test_grep "^-OLD6" actual && + test_grep "^+NEW5" actual && + test_grep "^+NEW6" actual && + test_grep ! "^-OLD9" actual && + test_grep ! "^-OLD10" actual && + test_grep ! "^+NEW9" actual && + test_grep ! "^+NEW10" actual +' + +test_expect_success 'diff process accepts valid multi-hunk output' ' + # multi-hunk reports both changed regions (5-6 and 9-10) as two + # gap-aligned hunks. This exercises the accepting branch of the + # per-gap lockstep check (non-zero previous-hunk end) and must + # produce a correct two-region diff with the lines between the + # hunks kept as context. + git -c diff.cdiff.process="$BACKEND --mode=multi-hunk" \ + diff boundary.c >actual 2>stderr && + test_grep "^-OLD5" actual && + test_grep "^+NEW5" actual && + test_grep "^-OLD9" actual && + test_grep "^+NEW9" actual && + test_grep "^ line7" actual && + test_grep "^ line8" actual && + test_must_be_empty stderr +' + +test_expect_success 'diff process works with modified file' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff -- newfile.c >actual 2>stderr && + test_grep "return 99" actual && + test_grep "pathname=newfile.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process works with added file (empty old side)' ' + cat >added.c <<-\EOF && + int added(void) { return 1; } + EOF + git add added.c && + + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff --cached -- added.c >actual 2>stderr && + test_grep "added" actual && + test_grep "pathname=added.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process works with deleted file (empty new side)' ' + git add added.c && + git commit -m "commit added.c" && + git rm added.c && + + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff --cached -- added.c >actual 2>stderr && + test_grep "deleted file" actual && + test_grep "pathname=added.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process skipped for binary files' ' + printf "\\0binary" >binary.c && + git add binary.c && + git commit -m "add binary" && + printf "\\0changed" >binary.c && + + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff -- binary.c >actual && + test_grep "Binary files" actual && + test_path_is_missing backend.log +' + +test_expect_success 'diff process not consulted for unmatched driver' ' + echo "not tracked by cdiff" >unmatched.txt && + git add unmatched.txt && + git commit -m "add unmatched.txt" && + + echo "modified" >unmatched.txt && + + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff -- unmatched.txt >actual && + test_grep "modified" actual && + test_path_is_missing backend.log +' + +test_expect_success 'multiple drivers use separate processes' ' + echo "*.h diff=hdiff" >>.gitattributes && + git add .gitattributes && + + cat >multi.h <<-\EOF && + int header(void) { return 1; } + EOF + git add multi.h && + git commit -m "add multi.h" && + + cat >multi.h <<-\EOF && + int header(void) { return 2; } + EOF + + test_when_finished "rm -f backend-c.log backend-h.log" && + git -c diff.cdiff.process="$BACKEND --log=backend-c.log" \ + -c diff.hdiff.process="$BACKEND --log=backend-h.log" \ + diff -- newfile.c multi.h >actual 2>stderr && + test_grep "pathname=newfile.c" backend-c.log && + test_grep "pathname=multi.h" backend-h.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process works alongside textconv' ' + write_script uppercase-filter <<-\EOF && + tr "a-z" "A-Z" <"$1" + EOF + + cat >textconv.c <<-\EOF && + hello world + EOF + git add textconv.c && + git commit -m "add textconv.c" && + + cat >textconv.c <<-\EOF && + goodbye world + EOF + + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.textconv="./uppercase-filter" \ + -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff -- textconv.c >actual 2>stderr && + # The diff process receives textconv-transformed (uppercase) content. + test_grep "pathname=textconv.c" backend.log && + test_grep "old=HELLO WORLD" backend.log && + test_grep "new=GOODBYE WORLD" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process --stat is fed raw, not textconv, content' ' + # Reuses textconv.c from the previous test (committed "hello + # world", modified to "goodbye world"). Unlike patch output, + # --stat does not apply textconv, so the tool sees raw lowercase + # content here even with a textconv configured. + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.textconv="./uppercase-filter" \ + -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff --stat -- textconv.c >actual 2>stderr && + test_grep "pathname=textconv.c" backend.log && + test_grep "old=hello world" backend.log && + test_grep "new=goodbye world" backend.log && + test_must_be_empty stderr +' + +# +# Downstream features: word diff, log, equivalent files, exit code. +# + +test_expect_success 'diff process with --word-diff' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff --word-diff worddiff.c >actual 2>stderr && + test_grep "\[-1;-\]" actual && + test_grep "{+999;+}" actual && + test_grep "pathname=worddiff.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process works with git log -p' ' + # With no-hunks mode, the tool says the files are equivalent, + # so log -p should show the commit but no diff content. + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + log -1 -p -- logtest.c >actual 2>stderr && + test_grep "change logtest.c" actual && + test_grep ! "return 2" actual && + test_grep "command=hunks pathname=logtest.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process no hunks suppresses diff output' ' + cat >nohunks.c <<-\EOF && + int zero(void) { return 0; } + EOF + git add nohunks.c && + git commit -m "add nohunks.c" && + + cat >nohunks.c <<-\EOF && + int zero(void) { return 999; } + EOF + + git -c diff.cdiff.process="$BACKEND --mode=no-hunks" \ + diff nohunks.c >actual && + test_must_be_empty actual +' + +test_expect_success 'diff process no hunks with --exit-code returns success' ' + git -c diff.cdiff.process="$BACKEND --mode=no-hunks" \ + diff --exit-code nohunks.c +' + +test_expect_success 'diff process falls back for trailing-newline-only change' ' + test_when_finished "rm -f backend.log" && + printf "a\nb\nc\n" >eofnl.c && + git add eofnl.c && + git commit -m "add eofnl.c" && + printf "a\nb\nc" >eofnl.c && + # Same lines, only the final newline removed. The tool reports + # no hunks (it sees identical lines), but that change is not + # expressible as hunks, so git falls back to the builtin diff + # rather than treating the files as equivalent. + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + diff eofnl.c >actual 2>stderr && + test_grep "No newline at end of file" actual && + test_grep "pathname=eofnl.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process falls back for added file (empty old side)' ' + test_when_finished "rm -f backend.log" && + printf "x\ny\nz\n" >addnl.c && + git add addnl.c && + # The empty old side has no trailing newline while the new side + # does, so the newline fallback shows the addition rather than + # letting no-hunks suppress the whole new file. + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + diff --cached addnl.c >actual 2>stderr && + test_grep "^+x" actual && + test_grep "pathname=addnl.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process with --exit-code and hunks returns failure' ' + test_expect_code 1 git -c diff.cdiff.process="$BACKEND" \ + diff --exit-code newfile.c +' + +test_expect_success 'diff process feeds --numstat counts' ' + # fixed-hunk reports only lines 5-6 as changed, so the stat + # counts come from the tool (2/2), not the builtin diff (4/4). + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=fixed-hunk --log=backend.log" \ + diff --numstat boundary.c >actual 2>stderr && + printf "2\t2\tboundary.c\n" >expect && + test_cmp expect actual && + test_grep "command=hunks pathname=boundary.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process --numstat sums multi-hunk counts' ' + # multi-hunk reports both 2-line regions (5-6 and 9-10), so the + # counts add up across both hunks: 4 inserted, 4 deleted. This + # exercises the two-region hunk path through builtin_diffstat. + git -c diff.cdiff.process="$BACKEND --mode=multi-hunk" \ + diff --numstat boundary.c >actual && + printf "4\t4\tboundary.c\n" >expect && + test_cmp expect actual +' + +test_expect_success 'diff process equivalent files produce no --stat line' ' + # A file the tool calls equivalent contributes no stat line, + # matching the empty patch that git diff produces for it. + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + diff --stat worddiff.c >actual 2>stderr && + test_must_be_empty actual && + test_grep "command=hunks pathname=worddiff.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process feeds --shortstat counts' ' + # fixed-hunk reports lines 5-6 only, so the summary counts come + # from the tool (2 insertions, 2 deletions), not builtin (4/4). + git -c diff.cdiff.process="$BACKEND --mode=fixed-hunk" \ + diff --shortstat boundary.c >actual && + test_grep "2 insertions" actual && + test_grep "2 deletions" actual +' + +test_expect_success 'diff process equivalent file makes --stat --exit-code succeed' ' + # The tool reports worddiff.c equivalent, so --exit-code reports + # no change (0); the builtin diff would report a change (1). + git -c diff.cdiff.process="$BACKEND --mode=no-hunks" \ + diff --stat --exit-code worddiff.c && + test_expect_code 1 git diff --no-ext-diff --stat --exit-code worddiff.c +' + +test_expect_success 'diff process --numstat with mixed equivalent and changed files' ' + test_when_finished "rm -f c.log h.log" && + # Self-contained fixtures: *.c uses whole-file (changed); *.mh + # uses no-hunks (equivalent). + echo "*.mh diff=hdiff" >>.gitattributes && + git add .gitattributes && + printf "int a(void) { return 1; }\n" >mixed.c && + printf "int b(void) { return 1; }\n" >mixed.mh && + git add mixed.c mixed.mh && + git commit -m "add mixed fixtures" && + printf "int a(void) { return 2; }\n" >mixed.c && + printf "int b(void) { return 2; }\n" >mixed.mh && + git -c diff.cdiff.process="$BACKEND --mode=whole-file --log=c.log" \ + -c diff.hdiff.process="$BACKEND --mode=no-hunks --log=h.log" \ + diff --numstat mixed.c mixed.mh >actual 2>stderr && + test_grep "mixed.c" actual && + test_grep ! "mixed.mh" actual && + test_grep "pathname=mixed.c" c.log && + test_grep "pathname=mixed.mh" h.log && + test_must_be_empty stderr +' + +test_expect_success POSIXPERM 'diff process keeps mode-only change in --stat' ' + test_when_finished "rm -f backend.log" && + cat >modeonly.c <<-\EOF && + int m(void) { return 1; } + EOF + git add modeonly.c && + git commit -m "add modeonly.c" && + cat >modeonly.c <<-\EOF && + int m(void) { return 2; } + EOF + git add modeonly.c && + test_chmod +x modeonly.c && + git commit -m "edit and chmod modeonly.c" && + # Content and mode both changed, but no-hunks reports the content + # equivalent. The tool is consulted (counts are zero, not the + # builtin 1/1), yet the mode change keeps the file from being + # pruned. + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + diff --stat HEAD^ HEAD >actual 2>stderr && + test_grep "modeonly.c" actual && + test_grep "command=hunks pathname=modeonly.c" backend.log && + test_grep ! "1 insertion" actual && + test_must_be_empty stderr +' + +test_expect_success 'diff process not consulted for default --dirstat' ' + # Default --dirstat counts via its own path and never contacts + # the tool, so the change is still reported even though no-hunks + # would call it equivalent. (--dirstat=lines uses the stat path.) + test_when_finished "rm -f backend.log" && + mkdir -p dsub && + printf "a\nb\nc\n" >dsub/d.c && + git add dsub/d.c && + git commit -m "add dsub/d.c" && + printf "a\nB\nc\n" >dsub/d.c && + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + diff --dirstat=0 dsub/d.c >actual && + test_grep "dsub" actual && + test_path_is_missing backend.log +' + +# +# Bypass mechanisms: flags and commands that skip the diff process. +# + +test_expect_success 'diff process bypassed by --diff-algorithm' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff --diff-algorithm=patience worddiff.c >actual && + test_grep "return 999" actual && + test_path_is_missing backend.log +' + +test_expect_success 'diff process bypassed by --no-ext-diff' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff --no-ext-diff worddiff.c >actual && + test_grep "return 999" actual && + test_path_is_missing backend.log +' + +test_expect_success 'diff process not used by format-patch' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + format-patch -1 --stdout -- logtest.c >actual && + test_grep "return 2" actual && + test_path_is_missing backend.log +' + +test_expect_success 'diff process bypassed under whitespace-ignoring flags' ' + test_when_finished "rm -f backend.log" && + printf "a\nb\nc\n" >wsbypass.c && + git add wsbypass.c && + git commit -m "add wsbypass.c" && + printf "a\n b \nc\n" >wsbypass.c && + # The tool is never told about these options and could not honor + # them, so git bypasses the process for each (covering the whole + # XDF_WHITESPACE_FLAGS | XDF_IGNORE_BLANK_LINES mask, not just -w). + for opt in -w -b --ignore-space-at-eol --ignore-blank-lines + do + rm -f backend.log && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff $opt wsbypass.c >actual 2>stderr && + test_path_is_missing backend.log && + test_must_be_empty stderr || + return 1 + done && + # -w additionally suppresses the whitespace-only change via the + # builtin diff that now runs. + git -c diff.cdiff.process="$BACKEND" diff -w wsbypass.c >actual && + test_must_be_empty actual +' + +# +# Error handling and fallback. +# + +test_expect_success 'diff process fallback on tool error status' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=error --log=backend.log" \ + diff boundary.c >actual 2>stderr && + # Fallback produces the full builtin diff (both change regions). + test_grep "^-OLD5" actual && + test_grep "^+NEW5" actual && + test_grep "^-OLD9" actual && + test_grep "^+NEW9" actual && + # Tool was contacted (it replied with error, not crash). + test_grep "command=hunks pathname=boundary.c" backend.log && + test_grep "diff process.*failed" stderr +' + +test_expect_success 'diff process error keeps tool available for next file' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=error --log=backend.log" \ + diff -- one.c two.c >actual 2>stderr && + # Unlike abort, error keeps the tool available: both files + # are sent to the tool (and both fall back). + test_grep "pathname=one.c" backend.log && + test_grep "pathname=two.c" backend.log && + test_grep "return 10" actual && + test_grep "return 20" actual && + test_grep "diff process.*failed" stderr +' + +test_expect_success 'diff process abort disables for session' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=abort --log=backend.log" \ + diff -- one.c two.c >actual 2>stderr && + # Both files should still produce diff output via fallback. + test_grep "return 10" actual && + test_grep "return 20" actual && + # The tool aborts on the first file and git clears its + # capability. The second file never contacts the tool. + test_grep "pathname=one.c" backend.log && + test_grep ! "pathname=two.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process fallback on tool crash' ' + git -c diff.cdiff.process="$BACKEND --mode=crash" \ + diff boundary.c >actual 2>stderr && + test_grep "^-OLD5" actual && + test_grep "^+NEW5" actual && + test_grep "^-OLD9" actual && + test_grep "^+NEW9" actual && + # Crash is a communication failure, so a warning is emitted. + test_grep "diff process.*failed" stderr +' + +test_expect_success 'diff process startup failure only warns once' ' + git -c diff.cdiff.process="/nonexistent/tool" \ + diff -- one.c two.c >actual 2>stderr && + # Both files produce diff output via fallback. + test_grep "return 10" actual && + test_grep "return 20" actual && + # Sentinel prevents repeated warnings: only one, not one per file. + test_grep "diff process.*failed" stderr >warnings && + test_line_count = 1 warnings +' + + +test_expect_success 'diff process fallback on bad hunks' ' + git -c diff.cdiff.process="$BACKEND --mode=bad-hunk" \ + diff boundary.c >actual 2>stderr && + test_grep "^-OLD5" actual && + test_grep "^+NEW5" actual && + test_grep "^-OLD9" actual && + test_grep "^+NEW9" actual && + test_grep "exceeds.*lines" stderr +' + +test_expect_success 'diff process fallback on mismatched unchanged totals' ' + cat >synctest.c <<-\EOF && + line1 + line2 + line3 + EOF + git add synctest.c && + git commit -m "add synctest.c" && + + cat >synctest.c <<-\EOF && + line1 + changed + line3 + EOF + + # bad-sync reports hunk 1 2 1 1: marks 2 old lines and 1 new + # line as changed, leaving 1 unchanged old vs 2 unchanged new. + # The synchronization invariant fails and git falls back. + git -c diff.cdiff.process="$BACKEND --mode=bad-sync" \ + diff synctest.c >actual 2>stderr && + test_grep "changed" actual && + test_grep "unchanged line count mismatch" stderr +' + +test_expect_success 'diff process fallback on misaligned hunk gap' ' + # bad-gap reports hunk 1 1 3 1 on boundary.c: one changed line + # on each side, so the total unchanged counts match, but the + # unchanged run before the change differs (old line 1 vs new + # line 3). A global count check would accept this and emit a + # corrupt diff; the per-gap lockstep check rejects it and git + # falls back to the builtin algorithm. + git -c diff.cdiff.process="$BACKEND --mode=bad-gap" \ + diff boundary.c >actual 2>stderr && + # The builtin fallback shows both changed regions as additions + # (a corrupt-accepted hunk would show NEW5 only as context). + test_grep "^+NEW5" actual && + test_grep "^+NEW9" actual && + test_grep "unchanged run before hunk differs" stderr +' + +test_expect_success 'diff process fallback on overlapping hunks' ' + # boundary.c has 10 lines, so both hunks are in bounds + # but they overlap at lines 3-5, triggering the ordering check. + git -c diff.cdiff.process="$BACKEND --mode=overlap" \ + diff boundary.c >actual 2>stderr && + test_grep "NEW5" actual && + test_grep "overlaps with previous" stderr +' + +test_expect_success 'diff process fallback on malformed hunk line' ' + git -c diff.cdiff.process="$BACKEND --mode=bad-parse" \ + diff boundary.c >actual 2>stderr && + test_grep "^-OLD5" actual && + test_grep "^+NEW5" actual +' + +test_expect_success 'diff process caps a flood of hunks and falls back' ' + # flood emits far more hunks than the file has lines. Git must + # stop accumulating and fall back to the builtin diff rather than + # grow memory without bound. + git -c diff.cdiff.process="$BACKEND --mode=flood" \ + diff boundary.c >actual 2>stderr && + test_grep "^-OLD5" actual && + test_grep "too many hunks" stderr +' + +test_expect_success 'diff process skipped when tool omits capability' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=no-cap --log=backend.log" \ + diff boundary.c >actual 2>stderr && + # Builtin diff runs: all changes appear, including lines 9-10 + # that a tool-provided hunk would have narrowed away. + test_grep "^-OLD5" actual && + test_grep "^-OLD9" actual && + # The process started (the handshake created the log) but was + # never sent a per-file request, so no hunks command is logged. + test_path_is_file backend.log && + test_grep ! "command=hunks" backend.log && + test_must_be_empty stderr +' + +# +# Blame integration. +# + +test_expect_success 'blame uses tool-provided hunks' ' + cat >blame-hunk.c <<-\EOF && + line1 + line2 + line3 + line4 + original5 + original6 + line7 + line8 + line9 + line10 + EOF + git add blame-hunk.c && + git commit -m "add blame-hunk.c" && + ORIG=$(git rev-parse --short HEAD) && + + cat >blame-hunk.c <<-\EOF && + line1 + line2 + line3 + line4 + changed5 + changed6 + line7 + line8 + changed9 + changed10 + EOF + git add blame-hunk.c && + git commit -m "change blame-hunk.c" && + CHANGE=$(git rev-parse --short HEAD) && + + # With fixed-hunk mode the tool reports only lines 5-6 as changed, + # so blame should attribute lines 9-10 to the original commit + # even though the builtin diff would show them as changed. + git -c diff.cdiff.process="$BACKEND --mode=fixed-hunk" \ + blame blame-hunk.c >actual && + sed -n "9p" actual >line9 && + sed -n "10p" actual >line10 && + test_grep "$ORIG" line9 && + test_grep "$ORIG" line10 && + sed -n "5p" actual >line5 && + sed -n "6p" actual >line6 && + test_grep "$CHANGE" line5 && + test_grep "$CHANGE" line6 +' + +test_expect_success 'blame skips commits with no hunks from diff process' ' + cat >blame.c <<-\EOF && + int main(void) { + return 0; + } + EOF + git add blame.c && + git commit -m "add blame.c" && + ORIG_COMMIT=$(git rev-parse --short HEAD) && + + cat >blame.c <<-\EOF && + int main(void) + { + return 0; + } + EOF + git add blame.c && + git commit -m "reformat blame.c" && + BLAME_COMMIT=$(git rev-parse --short HEAD) && + + # Without no-hunks mode, blame attributes the change. + git blame blame.c >without && + test_grep "$BLAME_COMMIT" without && + + # With no-hunks mode, the process considers the files equivalent + # and blame skips the reformat commit, attributing to the original. + git -c diff.cdiff.process="$BACKEND --mode=no-hunks" \ + blame blame.c >with && + test_grep ! "$BLAME_COMMIT" with && + test_grep "$ORIG_COMMIT" with +' + +test_expect_success 'blame --no-ext-diff bypasses diff process' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + blame --no-ext-diff blame.c >actual && + # Without the process, blame attributes the reformat commit normally. + test_grep "$BLAME_COMMIT" actual && + test_path_is_missing backend.log +' + +test_expect_success 'blame --no-ext-diff uses builtin hunks' ' + # fixed-hunk mode would narrow blame to lines 5-6, but + # --no-ext-diff should bypass it and use the builtin diff. + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=fixed-hunk --log=backend.log" \ + blame --no-ext-diff blame-hunk.c >actual && + # Builtin diff attributes lines 9-10 to the change commit. + sed -n "9p" actual >line9 && + test_grep "$CHANGE" line9 && + test_path_is_missing backend.log +' + +test_expect_success 'blame -w bypasses diff process' ' + test_when_finished "rm -f backend.log" && + printf "alpha\nbeta\ngamma\n" >blamew.c && + git add blamew.c && + git commit -m "add blamew.c" && + orig=$(git rev-parse --short HEAD) && + printf "alpha\n beta \ngamma\n" >blamew.c && + git commit -am "reindent beta" && + reindent=$(git rev-parse --short HEAD) && + # blame -w must ignore the whitespace-only change and attribute + # beta to the original commit, not the reindent commit. The tool + # is never told about -w, so blame must bypass it (not let tool + # hunks override -w). + git -c diff.cdiff.process="$BACKEND --mode=whole-file --log=backend.log" \ + blame -w blamew.c >actual && + sed -n "2p" actual >line2 && + test_grep "$orig" line2 && + test_grep ! "$reindent" line2 && + test_path_is_missing backend.log +' + +# +# Line-log (git log -L) range tracking. +# + +test_expect_success 'diff process drops equivalent commit from log -L' ' + test_when_finished "rm -f backend.log" && + cat >linelog.c <<-\EOF && + int tracked(void) { return 1; } + EOF + git add linelog.c && + git commit -m "add linelog.c" && + + cat >linelog.c <<-\EOF && + int tracked(void) { return 2; } + EOF + git commit -am "change tracked line" && + + # Builtin line tracking selects the change commit. + git log --no-ext-diff -L1,1:linelog.c --format="%s" >builtin && + test_grep "change tracked line" builtin && + + # With the tool reporting the change as equivalent, tracking + # drops the commit (the range maps across unchanged) instead of + # selecting it and rendering an empty diff. + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + log -L1,1:linelog.c --format="%s" >actual && + test_grep ! "change tracked line" actual && + # The creating commit still appears, so the change commit was + # selectively dropped rather than the whole log going empty. + test_grep "add linelog.c" actual && + test_grep "command=hunks pathname=linelog.c" backend.log +' + +test_done diff --git a/userdiff.c b/userdiff.c index b5412e6bc3ecd3..7547874aa2569d 100644 --- a/userdiff.c +++ b/userdiff.c @@ -509,6 +509,13 @@ int userdiff_config(const char *k, const char *v) drv->algorithm = drv->algorithm_owned; return ret; } + if (!strcmp(type, "process")) { + int ret; + FREE_AND_NULL(drv->process_owned); + ret = git_config_string(&drv->process_owned, k, v); + drv->process = drv->process_owned; + return ret; + } return 0; } diff --git a/userdiff.h b/userdiff.h index 827361b0bc9569..a98eabe3770cc6 100644 --- a/userdiff.h +++ b/userdiff.h @@ -3,6 +3,7 @@ #include "notes-cache.h" +struct diff_subprocess; struct index_state; struct repository; @@ -31,6 +32,10 @@ struct userdiff_driver { char *textconv_owned; struct notes_cache *textconv_cache; int textconv_want_cache; + const char *process; + char *process_owned; + struct diff_subprocess *diff_subprocess; + unsigned diff_process_failed : 1; }; enum userdiff_driver_type { USERDIFF_DRIVER_TYPE_BUILTIN = 1<<0, diff --git a/xdiff-interface.c b/xdiff-interface.c index db6938689f0a9e..1fa16af6681cf2 100644 --- a/xdiff-interface.c +++ b/xdiff-interface.c @@ -124,7 +124,12 @@ int xdi_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdemitconf_t co if (mf1->size > MAX_XDIFF_SIZE || mf2->size > MAX_XDIFF_SIZE) return -1; - if (!xecfg->ctxlen && !(xecfg->flags & XDL_EMIT_FUNCCONTEXT)) + /* + * External hunks reference line numbers in the original content; + * trimming the tail would change line counts and invalidate them. + */ + if (!xpp->external_hunks && + !xecfg->ctxlen && !(xecfg->flags & XDL_EMIT_FUNCCONTEXT)) trim_common_tail(&a, &b); return xdl_diff(&a, &b, xpp, xecfg, xecb); diff --git a/xdiff/xdiff.h b/xdiff/xdiff.h index dc370712e92860..dd4915fe16ff19 100644 --- a/xdiff/xdiff.h +++ b/xdiff/xdiff.h @@ -78,6 +78,16 @@ typedef struct s_mmbuffer { long size; } mmbuffer_t; +/* + * Hunk descriptor for externally computed diffs. + * Line numbers are 1-based; a start of 0 is accepted when + * count is 0 (empty file side, matching git diff output). + */ +struct xdl_hunk { + long old_start, old_count; + long new_start, new_count; +}; + typedef struct s_xpparam { unsigned long flags; @@ -88,6 +98,10 @@ typedef struct s_xpparam { /* See Documentation/diff-options.adoc. */ char **anchors; size_t anchors_nr; + + /* Externally computed hunks: bypass the diff algorithm. Owned by caller. */ + struct xdl_hunk *external_hunks; + size_t external_hunks_nr; } xpparam_t; typedef struct s_xdemitcb { diff --git a/xdiff/xdiffi.c b/xdiff/xdiffi.c index c5a892f91e00c0..7fe22557f59136 100644 --- a/xdiff/xdiffi.c +++ b/xdiff/xdiffi.c @@ -1085,16 +1085,147 @@ static void xdl_mark_ignorable_regex(xdchange_t *xscr, const xdfenv_t *xe, } } +/* + * Populate the changed[] arrays from externally supplied hunks, + * bypassing the diff algorithm. Validates that hunks are in order, + * non-overlapping, and within bounds. + * + * Returns 0 on success, -1 on validation failure. + */ +static int xdl_populate_hunks_from_external(xdfenv_t *xe, + struct xdl_hunk *hunks, + size_t nr_hunks) +{ + size_t i; + long j, prev_old_end = 0, prev_new_end = 0; + + /* + * xdl_prepare_env() may dirty changed[] via xdl_cleanup_records(). + * Clear them so only the external hunks are marked. + */ + xdl_clear_changed(&xe->xdf1); + xdl_clear_changed(&xe->xdf2); + + for (i = 0; i < nr_hunks; i++) { + struct xdl_hunk *h = &hunks[i]; + + if (h->old_count < 0 || h->new_count < 0) { + warning("diff process hunk %"PRIuMAX": " + "negative count (old=%ld, new=%ld)", + (uintmax_t)(i + 1), + h->old_count, h->new_count); + return -1; + } + if (h->old_start < 1 || h->new_start < 1) { + warning("diff process hunk %"PRIuMAX": " + "start must be >= 1 (old=%ld, new=%ld)", + (uintmax_t)(i + 1), + h->old_start, h->new_start); + return -1; + } + + /* + * Range must fit: start + count - 1 <= nrec, + * rewritten to avoid overflow. Same for both sides. + * + * When count is 0 (pure insert/delete) the check + * reduces to 0 > nrec - start + 1, which rejects + * start > nrec + 1 and allows start == nrec + 1 + * (the position after the last line). + */ + if (h->old_count > (long)xe->xdf1.nrec - h->old_start + 1) { + warning("diff process hunk %"PRIuMAX": " + "old range %ld+%ld exceeds %lu lines", + (uintmax_t)(i + 1), + h->old_start, h->old_count, + (unsigned long)xe->xdf1.nrec); + return -1; + } + if (h->new_count > (long)xe->xdf2.nrec - h->new_start + 1) { + warning("diff process hunk %"PRIuMAX": " + "new range %ld+%ld exceeds %lu lines", + (uintmax_t)(i + 1), + h->new_start, h->new_count, + (unsigned long)xe->xdf2.nrec); + return -1; + } + + /* Ordering: no overlap with previous hunk (adjacent is OK) */ + if (h->old_start < prev_old_end || + h->new_start < prev_new_end) { + warning("diff process hunk %"PRIuMAX": " + "overlaps with previous hunk", + (uintmax_t)(i + 1)); + return -1; + } + + /* + * Lockstep alignment: the run of unchanged lines between + * the previous hunk and this one must be the same length + * on both sides. xdl_build_script() walks the two files + * together over unchanged lines, so a mismatched gap + * desynchronizes it and yields a corrupt script even + * though the *total* unchanged counts may still match. + */ + if (h->old_start - prev_old_end != + h->new_start - prev_new_end) { + warning("diff process hunk %"PRIuMAX": " + "unchanged run before hunk differs between " + "sides (old %ld, new %ld)", + (uintmax_t)(i + 1), + h->old_start - prev_old_end, + h->new_start - prev_new_end); + return -1; + } + + for (j = 0; j < h->old_count; j++) + xe->xdf1.changed[h->old_start - 1 + j] = true; + for (j = 0; j < h->new_count; j++) + xe->xdf2.changed[h->new_start - 1 + j] = true; + + prev_old_end = h->old_start + h->old_count; + prev_new_end = h->new_start + h->new_count; + } + + /* + * The trailing run of unchanged lines after the last hunk must + * also match. Together with the per-hunk gap check above, this + * guarantees the two files stay synchronized for + * xdl_build_script()'s lockstep walk, and it subsumes the + * weaker "total unchanged counts match" invariant. + */ + if ((long)xe->xdf1.nrec - prev_old_end != + (long)xe->xdf2.nrec - prev_new_end) { + warning("diff process: unchanged line count mismatch " + "(old: %ld trailing, new: %ld trailing)", + (long)xe->xdf1.nrec - prev_old_end, + (long)xe->xdf2.nrec - prev_new_end); + return -1; + } + + return 0; +} + int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdemitconf_t const *xecfg, xdemitcb_t *ecb) { xdchange_t *xscr; xdfenv_t xe; emit_func_t ef = xecfg->hunk_func ? xdl_call_hunk_func : xdl_emit_diff; - if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) { + if (xpp->external_hunks) { + if (xdl_prepare_env(mf1, mf2, xpp, &xe) < 0) + return -1; + if (xdl_populate_hunks_from_external(&xe, + xpp->external_hunks, + xpp->external_hunks_nr) == 0) + goto diff_done; + xdl_free_env(&xe); + } + if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) return -1; - } + +diff_done: if (xdl_change_compact(&xe.xdf1, &xe.xdf2, xpp->flags) < 0 || xdl_change_compact(&xe.xdf2, &xe.xdf1, xpp->flags) < 0 || xdl_build_script(&xe, &xscr) < 0) { diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index 11bada2608a7a4..f4ab93533286da 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -471,3 +471,13 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, return 0; } + +/* + * Reset the changed[] array so that no lines are marked as changed. + * Also clears the sentinel slots at changed[-1] and changed[nrec] + * that xdl_change_compact() relies on during backward scans. + */ +void xdl_clear_changed(xdfile_t *xdf) +{ + memset(xdf->changed - 1, 0, (xdf->nrec + 2) * sizeof(bool)); +} diff --git a/xdiff/xprepare.h b/xdiff/xprepare.h index 947d9fc1bb8cf9..0413baf07bcc90 100644 --- a/xdiff/xprepare.h +++ b/xdiff/xprepare.h @@ -28,6 +28,7 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdfenv_t *xe); void xdl_free_env(xdfenv_t *xe); +void xdl_clear_changed(xdfile_t *xdf);