From e68b9d00a6e05b3a941f63ffb696f91e554ac5ec Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 18 Oct 2024 20:33:49 +0200 Subject: Adding upstream version 9.0.3. Signed-off-by: Daniel Baumann --- modules/typesniffer/typesniffer.go | 143 ++++++++++++++++++++++++++++++++ modules/typesniffer/typesniffer_test.go | 137 ++++++++++++++++++++++++++++++ 2 files changed, 280 insertions(+) create mode 100644 modules/typesniffer/typesniffer.go create mode 100644 modules/typesniffer/typesniffer_test.go (limited to 'modules/typesniffer') diff --git a/modules/typesniffer/typesniffer.go b/modules/typesniffer/typesniffer.go new file mode 100644 index 0000000..6aec5c2 --- /dev/null +++ b/modules/typesniffer/typesniffer.go @@ -0,0 +1,143 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package typesniffer + +import ( + "bytes" + "fmt" + "io" + "net/http" + "regexp" + "strings" + + "code.gitea.io/gitea/modules/util" +) + +// Use at most this many bytes to determine Content Type. +const sniffLen = 1024 + +const ( + // SvgMimeType MIME type of SVG images. + SvgMimeType = "image/svg+xml" + // ApplicationOctetStream MIME type of binary files. + ApplicationOctetStream = "application/octet-stream" +) + +var ( + svgComment = regexp.MustCompile(`(?s)`) + svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(|>))\s*)*\s*(?:(|>))\s*)* sniffLen { + data = data[:sniffLen] + } + + // SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888 + + detectByHTML := strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html") + detectByXML := strings.Contains(ct, "text/xml") + if detectByHTML || detectByXML { + dataProcessed := svgComment.ReplaceAll(data, nil) + dataProcessed = bytes.TrimSpace(dataProcessed) + if detectByHTML && svgTagRegex.Match(dataProcessed) || + detectByXML && svgTagInXMLRegex.Match(dataProcessed) { + ct = SvgMimeType + } + } + + if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) { + // The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg". + // So remove the "ID3" prefix and detect again, if result is text, then it must be text content. + // This works especially because audio files contain many unprintable/invalid characters like `0x00` + ct2 := http.DetectContentType(data[3:]) + if strings.HasPrefix(ct2, "text/") { + ct = ct2 + } + } + + if ct == "application/ogg" { + dataHead := data + if len(dataHead) > 256 { + dataHead = dataHead[:256] // only need to do a quick check for the file header + } + if bytes.Contains(dataHead, []byte("theora")) || bytes.Contains(dataHead, []byte("dirac")) { + ct = "video/ogg" // ogg is only used for some video formats, and it's not popular + } else { + ct = "audio/ogg" // for most cases, it is used as an audio container + } + } + return SniffedType{ct} +} + +// DetectContentTypeFromReader guesses the content type contained in the reader. +func DetectContentTypeFromReader(r io.Reader) (SniffedType, error) { + buf := make([]byte, sniffLen) + n, err := util.ReadAtMost(r, buf) + if err != nil { + return SniffedType{}, fmt.Errorf("DetectContentTypeFromReader io error: %w", err) + } + buf = buf[:n] + + return DetectContentType(buf), nil +} diff --git a/modules/typesniffer/typesniffer_test.go b/modules/typesniffer/typesniffer_test.go new file mode 100644 index 0000000..f6fa07e --- /dev/null +++ b/modules/typesniffer/typesniffer_test.go @@ -0,0 +1,137 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package typesniffer + +import ( + "bytes" + "encoding/base64" + "encoding/hex" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestDetectContentTypeLongerThanSniffLen(t *testing.T) { + // Pre-condition: Shorter than sniffLen detects SVG. + assert.Equal(t, "image/svg+xml", DetectContentType([]byte(``)).contentType) + // Longer than sniffLen detects something else. + assert.NotEqual(t, "image/svg+xml", DetectContentType([]byte(``)).contentType) +} + +func TestIsTextFile(t *testing.T) { + assert.True(t, DetectContentType([]byte{}).IsText()) + assert.True(t, DetectContentType([]byte("lorem ipsum")).IsText()) +} + +func TestIsSvgImage(t *testing.T) { + assert.True(t, DetectContentType([]byte("")).IsSvgImage()) + assert.True(t, DetectContentType([]byte(" ")).IsSvgImage()) + assert.True(t, DetectContentType([]byte(``)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(``)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(` + `)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(` + + `)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(` + `)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(` + `)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(` + + `)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(` + + + `)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(` + + `)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(` + + + `)).IsSvgImage()) + + // the DetectContentType should work for incomplete data, because only beginning bytes are used for detection + assert.True(t, DetectContentType([]byte(`....`)).IsSvgImage()) + + assert.False(t, DetectContentType([]byte{}).IsSvgImage()) + assert.False(t, DetectContentType([]byte("svg")).IsSvgImage()) + assert.False(t, DetectContentType([]byte("")).IsSvgImage()) + assert.False(t, DetectContentType([]byte("text")).IsSvgImage()) + assert.False(t, DetectContentType([]byte("")).IsSvgImage()) + assert.False(t, DetectContentType([]byte(``)).IsSvgImage()) + assert.False(t, DetectContentType([]byte(` + `)).IsSvgImage()) + assert.False(t, DetectContentType([]byte(` + + `)).IsSvgImage()) + + assert.False(t, DetectContentType([]byte(` + +
+ + +
+`)).IsSvgImage()) + + assert.False(t, DetectContentType([]byte(` + +
+ + +
+`)).IsSvgImage()) + assert.False(t, DetectContentType([]byte(``)).IsSvgImage()) + assert.False(t, DetectContentType([]byte(``)).IsSvgImage()) +} + +func TestIsPDF(t *testing.T) { + pdf, _ := base64.StdEncoding.DecodeString("JVBERi0xLjYKJcOkw7zDtsOfCjIgMCBvYmoKPDwvTGVuZ3RoIDMgMCBSL0ZpbHRlci9GbGF0ZURlY29kZT4+CnN0cmVhbQp4nF3NPwsCMQwF8D2f4s2CNYk1baF0EHRwOwg4iJt/NsFb/PpevUE4Mjwe") + assert.True(t, DetectContentType(pdf).IsPDF()) + assert.False(t, DetectContentType([]byte("plain text")).IsPDF()) +} + +func TestIsVideo(t *testing.T) { + mp4, _ := base64.StdEncoding.DecodeString("AAAAGGZ0eXBtcDQyAAAAAGlzb21tcDQyAAEI721vb3YAAABsbXZoZAAAAADaBlwX2gZcFwAAA+gA") + assert.True(t, DetectContentType(mp4).IsVideo()) + assert.False(t, DetectContentType([]byte("plain text")).IsVideo()) +} + +func TestIsAudio(t *testing.T) { + mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl") + assert.True(t, DetectContentType(mp3).IsAudio()) + assert.False(t, DetectContentType([]byte("plain text")).IsAudio()) + + assert.True(t, DetectContentType([]byte("ID3Toy\000")).IsAudio()) + assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ...")).IsText()) // test ID3 tag for plain text + assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."+"🌛"[0:2])).IsText()) // test ID3 tag with incomplete UTF8 char +} + +func TestDetectContentTypeFromReader(t *testing.T) { + mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl") + st, err := DetectContentTypeFromReader(bytes.NewReader(mp3)) + require.NoError(t, err) + assert.True(t, st.IsAudio()) +} + +func TestDetectContentTypeOgg(t *testing.T) { + oggAudio, _ := hex.DecodeString("4f67675300020000000000000000352f0000000000007dc39163011e01766f72626973000000000244ac0000000000000071020000000000b8014f6767530000") + st, err := DetectContentTypeFromReader(bytes.NewReader(oggAudio)) + require.NoError(t, err) + assert.True(t, st.IsAudio()) + + oggVideo, _ := hex.DecodeString("4f676753000200000000000000007d9747ef000000009b59daf3012a807468656f7261030201001e00110001e000010e00020000001e00000001000001000001") + st, err = DetectContentTypeFromReader(bytes.NewReader(oggVideo)) + require.NoError(t, err) + assert.True(t, st.IsVideo()) +} -- cgit v1.2.3