Skip to content

Commit

Permalink
utils.git: Handle output with embedded '\r' (CR) character
Browse files Browse the repository at this point in the history
In the diff output, be it from `git format-patch`, `git log -p`, or
`git diff`, there can be '\r' (CR) characters embedded.  The same is
true for configuration variables values.

Turning on `text` aka. `universal_newlines` translates '\r' into '\n',
which might lead to unidiff.UnidiffParseError, as the fragment of the line
after '\r' (aka. CR, sometimes shown as ^M) may not start with '+' for
the added line, '-' for the removed line, or ' ' for the context line.

  UnidiffParseError: Hunk diff line expected

For cases where we can expect for '\r' characters to be possible in the
Git command output, use binary mode with subprocess, and then decode it.

However, unidiff and _parse_commit_text() expects '\n' as newline, and
wouldn't work correctly with '\r\n' (for example unidiff would mis-detect
changed file names, and _parse_commit_text() wouldn't be able to extract
authorship info), so there was a need to replace '\r\n' in a few places
by '\n'.

.
Additionally, this commit fixes an intermittent Heisen-BUG, that sometimes
happened when running tests, namely unexpected end of patch.

When parsing `git log -p` output (in .log_p() method), don't stop
processing output if the subprocess finished - there can still be
something in the buffer.
  • Loading branch information
jnareb committed Oct 25, 2024
1 parent 5956b6a commit 8b84601
Show file tree
Hide file tree
Showing 5 changed files with 249 additions and 19 deletions.
80 changes: 62 additions & 18 deletions src/diffannotator/utils/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,17 @@ class ChangeSet(PatchSet):
# https://git-scm.com/docs/hash-function-transition

def __init__(self, patch_source: Union[StringIO, TextIO, str], commit_id: str,
prev: Optional[str] = None,
prev: Optional[str] = None, newline: Optional[str] = '\n',
*args, **kwargs):
"""ChangeSet class constructor
:param patch_source: patch source to be parsed by PatchSet (parent class)
:param commit_id: oid of the "after" commit (tree-ish) for the change
:param prev: previous state, when ChangeSet is generated with .unidiff(),
or `None` it the change corresponds to a commit (assumed first-parent)
:param newline: determines how to parse newline characters from the stream,
and how the stream was opened (one of None i.e. universal newline,
'', '\n', '\r', and '\r\n' - same as `open`)
:param args: passed to PatchSet constructor
:param kwargs: passed to PatchSet constructor (recommended);
PatchSet uses `encoding` (str) and `metadata_only` (bool): :raw-html:`<br />`
Expand All @@ -95,6 +98,17 @@ def __init__(self, patch_source: Union[StringIO, TextIO, str], commit_id: str,
(i.e. hunks without content) which is around 2.5-6 times faster;
it will still validate the diff metadata consistency and get counts
"""
# with universal newline you don't need to do translation, because it is already done
# with '\n' as newline you don't need to do translation, because it has correct EOLs
# this means that `newline` can be '\r' or '\r\n'
if newline is not None and newline != '\n':
# TODO: create a proper wrapper around io.StringIO.readline()
if isinstance(patch_source, StringIO):
patch_source = patch_source.getvalue()
elif isinstance(patch_source, TextIOWrapper):
patch_source = patch_source.read()
patch_source = StringIO(patch_source.replace(newline, '\n'))

super().__init__(patch_source, *args, **kwargs)
self.commit_id = commit_id
self.prev = prev
Expand Down Expand Up @@ -122,8 +136,19 @@ def __init__(self, patch_source: Union[StringIO, TextIO, str], commit_id: str,
# override
@classmethod
def from_filename(cls, filename: Union[str, Path], encoding: str = DEFAULT_ENCODING,
errors: Optional[str] = ENCODING_ERRORS, newline: Optional[str] = None) -> 'ChangeSet':
"""Return a PatchSet instance given a diff filename."""
errors: Optional[str] = ENCODING_ERRORS, newline: Optional[str] = '\n') -> 'ChangeSet':
"""Return a ChangeSet instance given a diff filename.
:param filename: path to a diff file with the patch to parse
:param encoding: the name of the encoding used to decode or encode
the diff file
:param errors: specifies how encoding and decoding errors are to be
handled (one of None, 'strict', 'ignore', 'replace', 'backslashreplace',
or 'surrogateescape' - same as `open`)
:param newline: determines how to parse newline characters from the stream
(one of None, '', '\n', '\r', and '\r\n' - same as `open`)
:return: instance of ChangeSet class
"""
# NOTE: unconditional `file_path = Path(filename)` would also work
if isinstance(filename, Path):
file_path = filename
Expand All @@ -139,7 +164,7 @@ def from_filename(cls, filename: Union[str, Path], encoding: str = DEFAULT_ENCOD

# slightly modified contents of PatchSet.from_filename() alternate constructor
with file_path.open(mode='r', encoding=encoding, errors=errors, newline=newline) as fp:
obj = cls(fp, commit_id=commit_id) # PatchSet.from_filename() has type mismatch
obj = cls(fp, commit_id=commit_id, newline=newline) # PatchSet.from_filename() has type mismatch

# adjust commit_id if we were able to retrieve commit metadata from file
if commit_id != '' and obj.commit_metadata is not None:
Expand Down Expand Up @@ -203,6 +228,9 @@ def _parse_commit_text(commit_text: str, with_parents_line: bool = True,
"""
# based on `parse_commit_text` from gitweb/gitweb.perl in git project
# NOTE: cannot use .splitlines() here
if '\r\n' in commit_text:
# in case commit_text came from a patch file with CRLF line endings
commit_text = commit_text.replace('\r\n', '\n')
commit_lines = commit_text.split('\n')[:-1] # remove trailing '' after last '\n'

if not commit_lines:
Expand Down Expand Up @@ -576,14 +604,16 @@ def format_patch(self,
else:
cmd.extend(revision_range)

# NOTE: specifying `encoding` or `errors` turns on `text` == `universal_newlines`
# and you cannot say `text=False` and/or `universal_newlines=False` to turn it off
# The output of the `git format-patch` command can contain embedded `\r` (CR)
process = subprocess.run(cmd,
capture_output=True, check=True,
encoding='utf-8', errors=self.encoding_errors)
capture_output=True, check=True)
# MAYBE: better checks for process.returncode, and examine process.stderr
if process.returncode == 0:
return process.stdout
return process.stdout.decode(encoding='utf-8', errors=self.encoding_errors)
else:
return process.stderr
return process.stderr.decode(encoding='utf-8', errors=self.encoding_errors)

def list_files(self, commit: str = 'HEAD') -> list[str]:
"""Retrieve list of files at given revision in a repository
Expand Down Expand Up @@ -913,20 +943,25 @@ def commit_with_patch(_commit_id: str, _commit_data: StringIO) -> ChangeSet:
## DEBUG (TODO: switch to logger.debug())
#print(f"{cmd=}")

# NOTE: specifying `encoding` or `errors` turns on `text` == `universal_newlines`
# and you cannot say `text=False` and/or `universal_newlines=False` to turn it off
# The output of the `git log -p` command can contain embedded `\r` (CR)
process = subprocess.Popen(
cmd,
bufsize=1, # line buffered
#bufsize=1, # line buffered
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL,
encoding='utf-8',
errors=self.encoding_errors,
text=True,
stderr=subprocess.DEVNULL, # TODO: consider capturing stderr
)

commit_data = StringIO()
commit_id: Optional[str] = None
while process.poll() is None:
log_p_line = process.stdout.readline()
# if we are waiting on the process, readline can return empty line
# if the process ends, we can still have lines in buffer
while (log_p_line_raw := process.stdout.readline()) or process.poll() is None:
log_p_line = log_p_line_raw.decode(
encoding='utf-8',
errors=self.encoding_errors,
)
if log_p_line:
if not commit_id and log_p_line[0] != '\0':
# first line in output
Expand Down Expand Up @@ -1188,6 +1223,8 @@ def get_current_branch(self) -> Union[str, None]:
# but exit with non-zero status silently if HEAD is not a symbolic ref, but detached HEAD
process = subprocess.run(cmd,
capture_output=True, check=True,
# branch names cannot contain '\r' (CR) character,
# see https://git-scm.com/docs/git-check-ref-format
text=True, errors=self.encoding_errors)
except subprocess.CalledProcessError:
return None
Expand All @@ -1213,6 +1250,8 @@ def resolve_symbolic_ref(self, ref: str = 'HEAD') -> Union[str, None]:
# but exit with non-zero status silently if `ref` is not a symbolic ref
process = subprocess.run(cmd,
capture_output=True, check=True,
# branch names and symbolic refereces cannot contain '\r' (CR),
# see https://git-scm.com/docs/git-check-ref-format
text=True, errors=self.encoding_errors)
except subprocess.CalledProcessError:
return None
Expand Down Expand Up @@ -1268,6 +1307,8 @@ def check_merged_into(self, commit: str, ref_pattern: Union[str, list[str]] = 'H
]
process = subprocess.run(cmd,
capture_output=True, check=True,
# branch and other refs names cannot contain '\r' (CR),
# see https://git-scm.com/docs/git-check-ref-format
text=True, errors=self.encoding_errors)
return process.stdout.splitlines()

Expand Down Expand Up @@ -1305,6 +1346,7 @@ def count_commits(self,
cmd.append('--first-parent')
process = subprocess.run(cmd,
capture_output=True, check=True,
# `git rev-list --count <start>` returns a number, no '\r' possible
encoding='utf-8', errors=self.encoding_errors)

return int(process.stdout)
Expand Down Expand Up @@ -1361,6 +1403,8 @@ def find_roots(self, start_from: str = StartLogFrom.CURRENT) -> list[str]:
]
process = subprocess.run(cmd,
capture_output=True, check=True,
# the Git command above returns list of commit identifiers
# separated by newlines, therefore no '\r' in output possible
text=True, errors=self.encoding_errors)
return process.stdout.splitlines()

Expand Down Expand Up @@ -1388,9 +1432,9 @@ def get_config(self, name: str, value_type: Optional[str] = None) -> Union[str,

try:
process = subprocess.run(cmd,
capture_output=True, check=True,
text=True, errors=self.encoding_errors)
return process.stdout.strip()
capture_output=True, check=True)
return process.stdout.decode(errors=self.encoding_errors).strip()

except subprocess.CalledProcessError as err:
# This command will fail with non-zero status upon error. Some exit codes are:
# - The section or key is invalid (ret=1),
Expand Down
2 changes: 1 addition & 1 deletion tests/test_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,7 @@ def test_Bug_from_changeset():
# see tests/test_utils_git.py::test_ChangeSet_from_filename
commit_id = 'c0dcf39b046d1b4ff6de14ac99ad9a1b10487512'
filename_diff = f'tests/test_dataset/tqdm-1/{commit_id}.diff_with_raw'
changeset = ChangeSet.from_filename(filename_diff)
changeset = ChangeSet.from_filename(filename_diff, newline='\r\n')

bug = Bug.from_patchset(patch_id=commit_id, patch_set=changeset)
assert [commit_id] == list(bug.patches.keys()), \
Expand Down
3 changes: 3 additions & 0 deletions tests/test_dataset/qtile/.gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# do not perform any end-of-line conversions;
# *.diff files here need to keep LF as end of line, but have embedded CR
*.diff -text
175 changes: 175 additions & 0 deletions tests/test_dataset/qtile/4424a39ba5d6374cc18b98297f6de8a82c37ab6a.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
diff --git a/bin/qtile-cmd b/bin/qtile-cmd
deleted file mode 100755
index a2136ee6f3..0000000000
--- a/bin/qtile-cmd
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import sys
-
-this_dir = os.path.dirname(__file__)
-base_dir = os.path.abspath(os.path.join(this_dir, ".."))
-sys.path.insert(0, base_dir)
-
-if __name__ == '__main__':
- from libqtile.scripts import qtile_cmd
- qtile_cmd.main()
diff --git a/libqtile/scripts/qtile_cmd.py b/libqtile/scripts/cmd_obj.py
old mode 100755
new mode 100644
similarity index 92%
rename from libqtile/scripts/qtile_cmd.py
rename to libqtile/scripts/cmd_obj.py
index 56c70df2f6..3d4e95072c
--- a/libqtile/scripts/qtile_cmd.py
+++ b/libqtile/scripts/cmd_obj.py
@@ -164,35 +164,8 @@ def print_base_objects() -> None:
print("\n".join(actions))


-def main() -> None:
+def cmd_obj(args) -> None:
"Runs tool according to specified arguments."
- description = 'Simple tool to expose qtile.command functionality to shell.'
- epilog = textwrap.dedent('''\
- Examples:
- qtile-cmd
- qtile-cmd -o cmd
- qtile-cmd -o cmd -f prev_layout -i
- qtile-cmd -o cmd -f prev_layout -a 3 # prev_layout on group 3
- qtile-cmd -o group 3 -f focus_back''')
- fmt = argparse.RawDescriptionHelpFormatter
-
- parser = argparse.ArgumentParser(description=description, epilog=epilog,
- formatter_class=fmt)
- parser.add_argument('--object', '-o', dest='obj_spec', nargs='+',
- help='Specify path to object (space separated). '
- 'If no --function flag display available commands. '
- 'Use `cmd` to specify root command.')
- parser.add_argument('--function', '-f', default="help",
- help='Select function to execute.')
- parser.add_argument('--args', '-a', nargs='+', default=[],
- help='Set arguments supplied to function.')
- parser.add_argument('--info', '-i', action='store_true',
- help='With both --object and --function args prints documentation for function.')
- parser.add_argument(
- "--socket", "-s",
- help='Path of the Qtile IPC socket.'
- )
- args = parser.parse_args()

if args.obj_spec:
sock_file = args.socket or find_sockfile()
@@ -214,5 +187,29 @@ def main() -> None:
sys.exit(1)


-if __name__ == "__main__":
- main()
+def add_subcommand(subparsers):
+ epilog = textwrap.dedent('''\
+ Examples:
+ qtile cmd-obj
+ qtile cmd-obj -o cmd
+ qtile cmd-obj -o cmd -f prev_layout -i
+ qtile cmd-obj -o cmd -f prev_layout -a 3 # prev_layout on group 3
+ qtile cmd-obj -o group 3 -f focus_back''')
+ description = 'qtile.command functionality exposed to the shell.'
+ parser = subparsers.add_parser("cmd-obj", help=description, epilog=epilog,
+ formatter_class=argparse.RawDescriptionHelpFormatter)
+ parser.add_argument('--object', '-o', dest='obj_spec', nargs='+',
+ help='Specify path to object (space separated). '
+ 'If no --function flag display available commands. '
+ 'Use `cmd` to specify root command.')
+ parser.add_argument('--function', '-f', default="help",
+ help='Select function to execute.')
+ parser.add_argument('--args', '-a', nargs='+', default=[],
+ help='Set arguments supplied to function.')
+ parser.add_argument('--info', '-i', action='store_true',
+ help='With both --object and --function args prints documentation for function.')
+ parser.add_argument(
+ "--socket", "-s",
+ help='Path of the Qtile IPC socket.'
+ )
+ parser.set_defaults(func=cmd_obj)
diff --git a/libqtile/scripts/main.py b/libqtile/scripts/main.py
index ffd2314fb3..f0a15671fd 100644
--- a/libqtile/scripts/main.py
+++ b/libqtile/scripts/main.py
@@ -1,7 +1,7 @@
import argparse
import sys

-from libqtile.scripts import run_cmd, shell, start, top
+from libqtile.scripts import cmd_obj, run_cmd, shell, start, top

try:
import pkg_resources
@@ -26,6 +26,7 @@ def main():
shell.add_subcommand(subparsers)
top.add_subcommand(subparsers)
run_cmd.add_subcommand(subparsers)
+ cmd_obj.add_subcommand(subparsers)

# backward compat hack: `qtile` with no args (or non-subcommand args)
# should default to `qtile start`. it seems impolite for commands to do
diff --git a/scripts/dqtile-cmd b/scripts/dqtile-cmd
index b133f3c0ee..ace056a31e 100755
--- a/scripts/dqtile-cmd
+++ b/scripts/dqtile-cmd
@@ -7,7 +7,7 @@ usage() {

"

- qtile-cmd -h | sed "s/qtile-cmd/dqtile-cmd/"
+ qtile cmd-obj -h | sed "s/qtile cmd-obj/dqtile-cmd/"


echo "
@@ -21,7 +21,7 @@ case $1 in
--force-dmenu) FORCE_DMENU=1; shift;;
esac

-action=$(qtile-cmd $@)
+action=$(qtile cmd-obj $@)

# Path to menu application
if [[ -n $(command -v rofi) ]] && [[ -z "$FORCE_DMENU" ]]; then
@@ -45,7 +45,7 @@ action=$(echo "$action"| cut -f 1 | sed -e 's/ *$//g')
if [ "$action_info" -eq "10" ]; then
# only run when -f is present (then -i makes sense)
if [[ $action == *"-f"* ]]; then
- info=$(qtile-cmd $action -i)
+ info=$(qtile cmd-obj $action -i)
action=$($menu -mesg "$global_mesg<b>Help</b>$info" -filter "$action -a ")
fi;
fi;
diff --git a/setup.cfg b/setup.cfg
index 81a3f39845..b0851c0b29 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -59,7 +59,6 @@ share/man/man1 =
[options.entry_points]
console_scripts =
qtile = libqtile.scripts.main:main
- qtile-cmd = libqtile.scripts.qtile_cmd:main

[options.extras_require]
doc =
diff --git a/test/test_qtile_cmd.py b/test/test_qtile_cmd.py
index bbfcf129e2..6f537e3238 100644
--- a/test/test_qtile_cmd.py
+++ b/test/test_qtile_cmd.py
@@ -69,8 +69,8 @@ class ServerConfig(Config):


def run_qtile_cmd(args):
- cmd = os.path.join(os.path.dirname(__file__), '..', 'bin', 'qtile-cmd')
- argv = [cmd]
+ cmd = os.path.join(os.path.dirname(__file__), '..', 'bin', 'qtile')
+ argv = [cmd, "cmd-obj"]
argv.extend(args.split())
pipe = subprocess.Popen(argv, stdout=subprocess.PIPE)
output, _ = pipe.communicate()
Expand Down
8 changes: 8 additions & 0 deletions tests/test_utils_git.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,3 +356,11 @@ def test_ChangeSet_from_filename():
# NOTE: this depends on the test file used!
assert changeset_diff_full.commit_metadata['message'].count('\n') == 1, \
"The commit message has exactly one line, ending in '\\n'"


def test_ChangeSet_from_patch_file_with_cr():
diff_filename = 'tests/test_dataset/qtile/4424a39ba5d6374cc18b98297f6de8a82c37ab6a.diff'

ChangeSet.from_filename(diff_filename)

# there were no exceptions

0 comments on commit 8b84601

Please sign in to comment.