diff options
author | Tao Klerks <tao@klerks.biz> | 2022-04-30 21:26:52 +0200 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2022-05-04 19:30:01 +0200 |
commit | f7b5ff607fa1a62a480399afb6ccb9691735bf79 (patch) | |
tree | fbf3bc4cda3b6e879c0d2652c6ac32e96767d447 /git-p4.py | |
parent | Git 2.36 (diff) | |
download | git-f7b5ff607fa1a62a480399afb6ccb9691735bf79.tar.xz git-f7b5ff607fa1a62a480399afb6ccb9691735bf79.zip |
git-p4: improve encoding handling to support inconsistent encodings
git-p4 is designed to run correctly under python2.7 and python3, but
its functional behavior wrt importing user-entered text differs across
these environments:
Under python2, git-p4 "naively" writes the Perforce bytestream into git
metadata (and does not set an "encoding" header on the commits); this
means that any non-utf-8 byte sequences end up creating invalidly-encoded
commit metadata in git.
Under python3, git-p4 attempts to decode the Perforce bytestream as utf-8
data, and fails badly (with an unhelpful error) when non-utf-8 data is
encountered.
Perforce clients (especially p4v) encourage user entry of changelist
descriptions (and user full names) in OS-local encoding, and store the
resulting bytestream to the server unmodified - such that different
clients can end up creating mutually-unintelligible messages. The most
common inconsistency, in many Perforce environments, is likely to be utf-8
(typical in linux) vs cp-1252 (typical in windows).
Make the changelist-description- and user-fullname-handling code
python-runtime-agnostic, introducing three "strategies" selectable via
config:
- 'passthrough', behaving as previously under python2,
- 'strict', behaving as previously under python3, and
- 'fallback', favoring utf-8 but supporting a secondary encoding when
utf-8 decoding fails, and finally escaping high-range bytes if the
decoding with the secondary encoding also fails.
Keep the python2 default behavior as-is ('legacy' strategy), but switch
the python3 default strategy to 'fallback' with default fallback encoding
'cp1252'.
Also include tests exercising these encoding strategies, documentation for
the new config, and improve the user-facing error messages when decoding
does fail.
Signed-off-by: Tao Klerks <tao@klerks.biz>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 'git-p4.py')
-rwxr-xr-x | git-p4.py | 123 |
1 files changed, 107 insertions, 16 deletions
@@ -15,6 +15,7 @@ # pylint: disable=too-many-statements,too-many-instance-attributes # pylint: disable=too-many-branches,too-many-nested-blocks # +import struct import sys if sys.version_info.major < 3 and sys.version_info.minor < 7: sys.stderr.write("git-p4: requires Python 2.7 or later.\n") @@ -54,6 +55,9 @@ defaultLabelRegexp = r'[a-zA-Z0-9_\-.]+$' # The block size is reduced automatically if required defaultBlockSize = 1<<20 +defaultMetadataDecodingStrategy = 'passthrough' if sys.version_info.major == 2 else 'fallback' +defaultFallbackMetadataEncoding = 'cp1252' + p4_access_checked = False re_ko_keywords = re.compile(br'\$(Id|Header)(:[^$\n]+)?\$') @@ -203,6 +207,70 @@ else: def encode_text_stream(s): return s.encode('utf_8') if isinstance(s, unicode) else s + +class MetadataDecodingException(Exception): + def __init__(self, input_string): + self.input_string = input_string + + def __str__(self): + return """Decoding perforce metadata failed! +The failing string was: +--- +{} +--- +Consider setting the git-p4.metadataDecodingStrategy config option to +'fallback', to allow metadata to be decoded using a fallback encoding, +defaulting to cp1252.""".format(self.input_string) + + +encoding_fallback_warning_issued = False +encoding_escape_warning_issued = False +def metadata_stream_to_writable_bytes(s): + encodingStrategy = gitConfig('git-p4.metadataDecodingStrategy') or defaultMetadataDecodingStrategy + fallbackEncoding = gitConfig('git-p4.metadataFallbackEncoding') or defaultFallbackMetadataEncoding + if not isinstance(s, bytes): + return s.encode('utf_8') + if encodingStrategy == 'passthrough': + return s + try: + s.decode('utf_8') + return s + except UnicodeDecodeError: + if encodingStrategy == 'fallback' and fallbackEncoding: + global encoding_fallback_warning_issued + global encoding_escape_warning_issued + try: + if not encoding_fallback_warning_issued: + print("\nCould not decode value as utf-8; using configured fallback encoding %s: %s" % (fallbackEncoding, s)) + print("\n(this warning is only displayed once during an import)") + encoding_fallback_warning_issued = True + return s.decode(fallbackEncoding).encode('utf_8') + except Exception as exc: + if not encoding_escape_warning_issued: + print("\nCould not decode value with configured fallback encoding %s; escaping bytes over 127: %s" % (fallbackEncoding, s)) + print("\n(this warning is only displayed once during an import)") + encoding_escape_warning_issued = True + escaped_bytes = b'' + # bytes and strings work very differently in python2 vs python3... + if str is bytes: + for byte in s: + byte_number = struct.unpack('>B', byte)[0] + if byte_number > 127: + escaped_bytes += b'%' + escaped_bytes += hex(byte_number)[2:].upper() + else: + escaped_bytes += byte + else: + for byte_number in s: + if byte_number > 127: + escaped_bytes += b'%' + escaped_bytes += hex(byte_number).upper().encode()[2:] + else: + escaped_bytes += bytes([byte_number]) + return escaped_bytes + + raise MetadataDecodingException(s) + def decode_path(path): """Decode a given string (bytes or otherwise) using configured path encoding options """ @@ -702,11 +770,12 @@ def p4CmdList(cmd, stdin=None, stdin_mode='w+b', cb=None, skip_info=False, if bytes is not str: # Decode unmarshalled dict to use str keys and values, except for: # - `data` which may contain arbitrary binary data - # - `depotFile[0-9]*`, `path`, or `clientFile` which may contain non-UTF8 encoded text + # - `desc` or `FullName` which may contain non-UTF8 encoded text handled below, eagerly converted to bytes + # - `depotFile[0-9]*`, `path`, or `clientFile` which may contain non-UTF8 encoded text, handled by decode_path() decoded_entry = {} for key, value in entry.items(): key = key.decode() - if isinstance(value, bytes) and not (key in ('data', 'path', 'clientFile') or key.startswith('depotFile')): + if isinstance(value, bytes) and not (key in ('data', 'desc', 'FullName', 'path', 'clientFile') or key.startswith('depotFile')): value = value.decode() decoded_entry[key] = value # Parse out data if it's an error response @@ -716,6 +785,10 @@ def p4CmdList(cmd, stdin=None, stdin_mode='w+b', cb=None, skip_info=False, if skip_info: if 'code' in entry and entry['code'] == 'info': continue + if 'desc' in entry: + entry['desc'] = metadata_stream_to_writable_bytes(entry['desc']) + if 'FullName' in entry: + entry['FullName'] = metadata_stream_to_writable_bytes(entry['FullName']) if cb is not None: cb(entry) else: @@ -1435,7 +1508,13 @@ class P4UserMap: for output in p4CmdList(["users"]): if "User" not in output: continue - self.users[output["User"]] = output["FullName"] + " <" + output["Email"] + ">" + # "FullName" is bytes. "Email" on the other hand might be bytes + # or unicode string depending on whether we are running under + # python2 or python3. To support + # git-p4.metadataDecodingStrategy=fallback, self.users dict values + # are always bytes, ready to be written to git. + emailbytes = metadata_stream_to_writable_bytes(output["Email"]) + self.users[output["User"]] = output["FullName"] + b" <" + emailbytes + b">" self.emails[output["Email"]] = output["User"] mapUserConfigRegex = re.compile(r"^\s*(\S+)\s*=\s*(.+)\s*<(\S+)>\s*$", re.VERBOSE) @@ -1445,26 +1524,28 @@ class P4UserMap: user = mapUser[0][0] fullname = mapUser[0][1] email = mapUser[0][2] - self.users[user] = fullname + " <" + email + ">" + fulluser = fullname + " <" + email + ">" + self.users[user] = metadata_stream_to_writable_bytes(fulluser) self.emails[email] = user - s = '' + s = b'' for (key, val) in self.users.items(): - s += "%s\t%s\n" % (key.expandtabs(1), val.expandtabs(1)) + keybytes = metadata_stream_to_writable_bytes(key) + s += b"%s\t%s\n" % (keybytes.expandtabs(1), val.expandtabs(1)) - open(self.getUserCacheFilename(), 'w').write(s) + open(self.getUserCacheFilename(), 'wb').write(s) self.userMapFromPerforceServer = True def loadUserMapFromCache(self): self.users = {} self.userMapFromPerforceServer = False try: - cache = open(self.getUserCacheFilename(), 'r') + cache = open(self.getUserCacheFilename(), 'rb') lines = cache.readlines() cache.close() for line in lines: - entry = line.strip().split("\t") - self.users[entry[0]] = entry[1] + entry = line.strip().split(b"\t") + self.users[entry[0].decode('utf_8')] = entry[1] except IOError: self.getUserMapFromPerforceServer() @@ -3020,7 +3101,8 @@ class P4Sync(Command, P4UserMap): if userid in self.users: return self.users[userid] else: - return "%s <a@b>" % userid + userid_bytes = metadata_stream_to_writable_bytes(userid) + return b"%s <a@b>" % userid_bytes def streamTag(self, gitStream, labelName, labelDetails, commit, epoch): """ Stream a p4 tag. @@ -3043,9 +3125,10 @@ class P4Sync(Command, P4UserMap): email = self.make_email(owner) else: email = self.make_email(self.p4UserId()) - tagger = "%s %s %s" % (email, epoch, self.tz) - gitStream.write("tagger %s\n" % tagger) + gitStream.write("tagger ") + gitStream.write(email) + gitStream.write(" %s %s\n" % (epoch, self.tz)) print("labelDetails=",labelDetails) if 'Description' in labelDetails: @@ -3138,12 +3221,12 @@ class P4Sync(Command, P4UserMap): self.gitStream.write("commit %s\n" % branch) self.gitStream.write("mark :%s\n" % details["change"]) self.committedChanges.add(int(details["change"])) - committer = "" if author not in self.users: self.getUserMapFromPerforceServer() - committer = "%s %s %s" % (self.make_email(author), epoch, self.tz) - self.gitStream.write("committer %s\n" % committer) + self.gitStream.write("committer ") + self.gitStream.write(self.make_email(author)) + self.gitStream.write(" %s %s\n" % (epoch, self.tz)) self.gitStream.write("data <<EOT\n") self.gitStream.write(details["desc"]) @@ -4055,6 +4138,14 @@ class P4Clone(P4Sync): if self.useClientSpec_from_options: system(["git", "config", "--bool", "git-p4.useclientspec", "true"]) + # persist any git-p4 encoding-handling config options passed in for clone: + if gitConfig('git-p4.metadataDecodingStrategy'): + system(["git", "config", "git-p4.metadataDecodingStrategy", gitConfig('git-p4.metadataDecodingStrategy')]) + if gitConfig('git-p4.metadataFallbackEncoding'): + system(["git", "config", "git-p4.metadataFallbackEncoding", gitConfig('git-p4.metadataFallbackEncoding')]) + if gitConfig('git-p4.pathEncoding'): + system(["git", "config", "git-p4.pathEncoding", gitConfig('git-p4.pathEncoding')]) + return True class P4Unshelve(Command): |