diff options
Diffstat (limited to 'modules/typesniffer')
-rw-r--r-- | modules/typesniffer/typesniffer.go | 143 | ||||
-rw-r--r-- | modules/typesniffer/typesniffer_test.go | 137 |
2 files changed, 280 insertions, 0 deletions
diff --git a/modules/typesniffer/typesniffer.go b/modules/typesniffer/typesniffer.go new file mode 100644 index 0000000..6aec5c2 --- /dev/null +++ b/modules/typesniffer/typesniffer.go @@ -0,0 +1,143 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package typesniffer + +import ( + "bytes" + "fmt" + "io" + "net/http" + "regexp" + "strings" + + "code.gitea.io/gitea/modules/util" +) + +// Use at most this many bytes to determine Content Type. +const sniffLen = 1024 + +const ( + // SvgMimeType MIME type of SVG images. + SvgMimeType = "image/svg+xml" + // ApplicationOctetStream MIME type of binary files. + ApplicationOctetStream = "application/octet-stream" +) + +var ( + svgComment = regexp.MustCompile(`(?s)<!--.*?-->`) + svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`) + svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`) +) + +// SniffedType contains information about a blob's type. +type SniffedType struct { + contentType string +} + +// IsText detects if the content format is plain text. 
+func (ct SniffedType) IsText() bool { + return strings.Contains(ct.contentType, "text/") +} + +// IsImage detects if data is an image format +func (ct SniffedType) IsImage() bool { + return strings.Contains(ct.contentType, "image/") +} + +// IsSvgImage detects if data is an SVG image format +func (ct SniffedType) IsSvgImage() bool { + return strings.Contains(ct.contentType, SvgMimeType) +} + +// IsPDF detects if data is a PDF format +func (ct SniffedType) IsPDF() bool { + return strings.Contains(ct.contentType, "application/pdf") +} + +// IsVideo detects if data is an video format +func (ct SniffedType) IsVideo() bool { + return strings.Contains(ct.contentType, "video/") +} + +// IsAudio detects if data is an video format +func (ct SniffedType) IsAudio() bool { + return strings.Contains(ct.contentType, "audio/") +} + +// IsRepresentableAsText returns true if file content can be represented as +// plain text or is empty. +func (ct SniffedType) IsRepresentableAsText() bool { + return ct.IsText() || ct.IsSvgImage() +} + +// IsBrowsableBinaryType returns whether a non-text type can be displayed in a browser +func (ct SniffedType) IsBrowsableBinaryType() bool { + return ct.IsImage() || ct.IsSvgImage() || ct.IsPDF() || ct.IsVideo() || ct.IsAudio() +} + +// GetMimeType returns the mime type +func (ct SniffedType) GetMimeType() string { + return strings.SplitN(ct.contentType, ";", 2)[0] +} + +// DetectContentType extends http.DetectContentType with more content types. Defaults to text/unknown if input is empty. 
+func DetectContentType(data []byte) SniffedType { + if len(data) == 0 { + return SniffedType{"text/unknown"} + } + + ct := http.DetectContentType(data) + + if len(data) > sniffLen { + data = data[:sniffLen] + } + + // SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888 + + detectByHTML := strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html") + detectByXML := strings.Contains(ct, "text/xml") + if detectByHTML || detectByXML { + dataProcessed := svgComment.ReplaceAll(data, nil) + dataProcessed = bytes.TrimSpace(dataProcessed) + if detectByHTML && svgTagRegex.Match(dataProcessed) || + detectByXML && svgTagInXMLRegex.Match(dataProcessed) { + ct = SvgMimeType + } + } + + if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) { + // The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg". + // So remove the "ID3" prefix and detect again, if result is text, then it must be text content. + // This works especially because audio files contain many unprintable/invalid characters like `0x00` + ct2 := http.DetectContentType(data[3:]) + if strings.HasPrefix(ct2, "text/") { + ct = ct2 + } + } + + if ct == "application/ogg" { + dataHead := data + if len(dataHead) > 256 { + dataHead = dataHead[:256] // only need to do a quick check for the file header + } + if bytes.Contains(dataHead, []byte("theora")) || bytes.Contains(dataHead, []byte("dirac")) { + ct = "video/ogg" // ogg is only used for some video formats, and it's not popular + } else { + ct = "audio/ogg" // for most cases, it is used as an audio container + } + } + return SniffedType{ct} +} + +// DetectContentTypeFromReader guesses the content type contained in the reader. 
+func DetectContentTypeFromReader(r io.Reader) (SniffedType, error) { + buf := make([]byte, sniffLen) + n, err := util.ReadAtMost(r, buf) + if err != nil { + return SniffedType{}, fmt.Errorf("DetectContentTypeFromReader io error: %w", err) + } + buf = buf[:n] + + return DetectContentType(buf), nil +} diff --git a/modules/typesniffer/typesniffer_test.go b/modules/typesniffer/typesniffer_test.go new file mode 100644 index 0000000..f6fa07e --- /dev/null +++ b/modules/typesniffer/typesniffer_test.go @@ -0,0 +1,137 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package typesniffer + +import ( + "bytes" + "encoding/base64" + "encoding/hex" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestDetectContentTypeLongerThanSniffLen(t *testing.T) { + // Pre-condition: Shorter than sniffLen detects SVG. + assert.Equal(t, "image/svg+xml", DetectContentType([]byte(`<!-- Comment --><svg></svg>`)).contentType) + // Longer than sniffLen detects something else, because the comment terminator lies beyond the sniffed prefix. 
+ assert.NotEqual(t, "image/svg+xml", DetectContentType([]byte(`<!-- `+strings.Repeat("x", sniffLen)+` --><svg></svg>`)).contentType) +} + +func TestIsTextFile(t *testing.T) { + assert.True(t, DetectContentType([]byte{}).IsText()) + assert.True(t, DetectContentType([]byte("lorem ipsum")).IsText()) +} + +func TestIsSvgImage(t *testing.T) { + assert.True(t, DetectContentType([]byte("<svg></svg>")).IsSvgImage()) + assert.True(t, DetectContentType([]byte(" <svg></svg>")).IsSvgImage()) + assert.True(t, DetectContentType([]byte(`<svg width="100"></svg>`)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?><svg></svg>`)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(`<!-- Comment --> + <svg></svg>`)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(`<!-- Multiple --> + <!-- Comments --> + <svg></svg>`)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(`<!-- Multiline + Comment --> + <svg></svg>`)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(`<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1 Basic//EN" + "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd"> + <svg></svg>`)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?> + <!-- Comment --> + <svg></svg>`)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?> + <!-- Multiple --> + <!-- Comments --> + <svg></svg>`)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?> + <!-- Multiline + Comment --> + <svg></svg>`)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?> + <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> + <!-- Multiline + Comment --> + <svg></svg>`)).IsSvgImage()) + + // DetectContentType should work for incomplete data, because only the beginning bytes are used for detection + assert.True(t, 
DetectContentType([]byte(`<svg>....`)).IsSvgImage()) + + assert.False(t, DetectContentType([]byte{}).IsSvgImage()) + assert.False(t, DetectContentType([]byte("svg")).IsSvgImage()) + assert.False(t, DetectContentType([]byte("<svgfoo></svgfoo>")).IsSvgImage()) + assert.False(t, DetectContentType([]byte("text<svg></svg>")).IsSvgImage()) + assert.False(t, DetectContentType([]byte("<html><body><svg></svg></body></html>")).IsSvgImage()) + assert.False(t, DetectContentType([]byte(`<script>"<svg></svg>"</script>`)).IsSvgImage()) + assert.False(t, DetectContentType([]byte(`<!-- <svg></svg> inside comment --> + <foo></foo>`)).IsSvgImage()) + assert.False(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?> + <!-- <svg></svg> inside comment --> + <foo></foo>`)).IsSvgImage()) + + assert.False(t, DetectContentType([]byte(` +<!-- comment1 --> +<div> + <!-- comment2 --> + <svg></svg> +</div> +`)).IsSvgImage()) + + assert.False(t, DetectContentType([]byte(` +<!-- comment1 +--> +<div> + <!-- comment2 +--> + <svg></svg> +</div> +`)).IsSvgImage()) + assert.False(t, DetectContentType([]byte(`<html><body><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg></svg></body></html>`)).IsSvgImage()) + assert.False(t, DetectContentType([]byte(`<html><body><?xml version="1.0" encoding="UTF-8"?><svg></svg></body></html>`)).IsSvgImage()) +} + +func TestIsPDF(t *testing.T) { + pdf, _ := base64.StdEncoding.DecodeString("JVBERi0xLjYKJcOkw7zDtsOfCjIgMCBvYmoKPDwvTGVuZ3RoIDMgMCBSL0ZpbHRlci9GbGF0ZURlY29kZT4+CnN0cmVhbQp4nF3NPwsCMQwF8D2f4s2CNYk1baF0EHRwOwg4iJt/NsFb/PpevUE4Mjwe") + assert.True(t, DetectContentType(pdf).IsPDF()) + assert.False(t, DetectContentType([]byte("plain text")).IsPDF()) +} + +func TestIsVideo(t *testing.T) { + mp4, _ := base64.StdEncoding.DecodeString("AAAAGGZ0eXBtcDQyAAAAAGlzb21tcDQyAAEI721vb3YAAABsbXZoZAAAAADaBlwX2gZcFwAAA+gA") + assert.True(t, DetectContentType(mp4).IsVideo()) + assert.False(t, 
DetectContentType([]byte("plain text")).IsVideo()) +} + +func TestIsAudio(t *testing.T) { + mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl") + assert.True(t, DetectContentType(mp3).IsAudio()) + assert.False(t, DetectContentType([]byte("plain text")).IsAudio()) + + assert.True(t, DetectContentType([]byte("ID3Toy\000")).IsAudio()) + assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ...")).IsText()) // test ID3 tag for plain text + assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."+"🌛"[0:2])).IsText()) // test ID3 tag with incomplete UTF8 char +} + +func TestDetectContentTypeFromReader(t *testing.T) { + mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl") + st, err := DetectContentTypeFromReader(bytes.NewReader(mp3)) + require.NoError(t, err) + assert.True(t, st.IsAudio()) +} + +func TestDetectContentTypeOgg(t *testing.T) { + oggAudio, _ := hex.DecodeString("4f67675300020000000000000000352f0000000000007dc39163011e01766f72626973000000000244ac0000000000000071020000000000b8014f6767530000") + st, err := DetectContentTypeFromReader(bytes.NewReader(oggAudio)) + require.NoError(t, err) + assert.True(t, st.IsAudio()) + + oggVideo, _ := hex.DecodeString("4f676753000200000000000000007d9747ef000000009b59daf3012a807468656f7261030201001e00110001e000010e00020000001e00000001000001000001") + st, err = DetectContentTypeFromReader(bytes.NewReader(oggVideo)) + require.NoError(t, err) + assert.True(t, st.IsVideo()) +} |