diff options
author | Junio C Hamano <gitster@pobox.com> | 2018-05-08 08:59:22 +0200 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2018-05-08 08:59:22 +0200 |
commit | 1ac0ce4d32ab7a3546e7e84a562625576208c7db (patch) | |
tree | 23ebffb680b2fe7b69453bb55d8170f69d767a69 /utf8.c | |
parent | Merge branch 'ab/nuke-emacs-contrib' (diff) | |
parent | convert: add round trip check based on 'core.checkRoundtripEncoding' (diff) | |
download | git-1ac0ce4d32ab7a3546e7e84a562625576208c7db.tar.xz git-1ac0ce4d32ab7a3546e7e84a562625576208c7db.zip |
Merge branch 'ls/checkout-encoding'
The new "checkout-encoding" attribute can ask Git to convert the
contents to the specified encoding when checking out to the working
tree (and the other way around when checking in).
* ls/checkout-encoding:
convert: add round trip check based on 'core.checkRoundtripEncoding'
convert: add tracing for 'working-tree-encoding' attribute
convert: check for detectable errors in UTF encodings
convert: add 'working-tree-encoding' attribute
utf8: add function to detect a missing UTF-16/32 BOM
utf8: add function to detect prohibited UTF-16/32 BOM
utf8: teach same_encoding() alternative UTF encoding names
strbuf: add a case insensitive starts_with()
strbuf: add xstrdup_toupper()
strbuf: remove unnecessary NUL assignment in xstrdup_tolower()
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 65 |
1 files changed, 63 insertions, 2 deletions
@@ -401,18 +401,40 @@ out: strbuf_release(&sb_dst); } +/* + * Returns true (1) if the src encoding name matches the dst encoding + * name directly or one of its alternative names. E.g. UTF-16BE is the + * same as UTF16BE. + */ +static int same_utf_encoding(const char *src, const char *dst) +{ + if (istarts_with(src, "utf") && istarts_with(dst, "utf")) { + /* src[3] or dst[3] might be '\0' */ + int i = (src[3] == '-' ? 4 : 3); + int j = (dst[3] == '-' ? 4 : 3); + return !strcasecmp(src+i, dst+j); + } + return 0; +} + int is_encoding_utf8(const char *name) { if (!name) return 1; - if (!strcasecmp(name, "utf-8") || !strcasecmp(name, "utf8")) + if (same_utf_encoding("utf-8", name)) return 1; return 0; } int same_encoding(const char *src, const char *dst) { - if (is_encoding_utf8(src) && is_encoding_utf8(dst)) + static const char utf8[] = "UTF-8"; + + if (!src) + src = utf8; + if (!dst) + dst = utf8; + if (same_utf_encoding(src, dst)) return 1; return !strcasecmp(src, dst); } @@ -538,6 +560,45 @@ char *reencode_string_len(const char *in, int insz, } #endif +static int has_bom_prefix(const char *data, size_t len, + const char *bom, size_t bom_len) +{ + return data && bom && (len >= bom_len) && !memcmp(data, bom, bom_len); +} + +static const char utf16_be_bom[] = {0xFE, 0xFF}; +static const char utf16_le_bom[] = {0xFF, 0xFE}; +static const char utf32_be_bom[] = {0x00, 0x00, 0xFE, 0xFF}; +static const char utf32_le_bom[] = {0xFF, 0xFE, 0x00, 0x00}; + +int has_prohibited_utf_bom(const char *enc, const char *data, size_t len) +{ + return ( + (same_utf_encoding("UTF-16BE", enc) || + same_utf_encoding("UTF-16LE", enc)) && + (has_bom_prefix(data, len, utf16_be_bom, sizeof(utf16_be_bom)) || + has_bom_prefix(data, len, utf16_le_bom, sizeof(utf16_le_bom))) + ) || ( + (same_utf_encoding("UTF-32BE", enc) || + same_utf_encoding("UTF-32LE", enc)) && + (has_bom_prefix(data, len, utf32_be_bom, sizeof(utf32_be_bom)) || + has_bom_prefix(data, len, utf32_le_bom, sizeof(utf32_le_bom))) + ); +} + +int is_missing_required_utf_bom(const char *enc, const char *data, size_t len) +{ + return ( + (same_utf_encoding(enc, "UTF-16")) && + !(has_bom_prefix(data, len, utf16_be_bom, sizeof(utf16_be_bom)) || + has_bom_prefix(data, len, utf16_le_bom, sizeof(utf16_le_bom))) + ) || ( + (same_utf_encoding(enc, "UTF-32")) && + !(has_bom_prefix(data, len, utf32_be_bom, sizeof(utf32_be_bom)) || + has_bom_prefix(data, len, utf32_le_bom, sizeof(utf32_le_bom))) + ); +} + /* * Returns first character length in bytes for multi-byte `text` according to * `encoding`. |