Adding upstream version 9.0.0.

Signed-off-by: Daniel Baumann <daniel@debian.org>
author: Daniel Baumann <daniel@debian.org> 2024-10-18 20:33:49 +0200
committer: Daniel Baumann <daniel@debian.org> 2024-10-18 20:33:49 +0200
commit: dd136858f1ea40ad3c94191d647487fa4f31926c (patch)
tree: 58fec94a7b2a12510c9664b21793f1ed560c6518 /modules/typesniffer
parent: Initial commit. (diff)
download: forgejo-dd136858f1ea40ad3c94191d647487fa4f31926c.tar.xz
forgejo-dd136858f1ea40ad3c94191d647487fa4f31926c.zip
2 files changed, 280 insertions, 0 deletions
diff --git a/modules/typesniffer/typesniffer.go b/modules/typesniffer/typesniffer.go
new file mode 100644
index 0000000..6aec5c2
--- /dev/null
+++ b/modules/typesniffer/typesniffer.go
@@ -0,0 +1,143 @@
+// Copyright 2021 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package typesniffer
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"net/http"
+	"regexp"
+	"strings"
+
+	"code.gitea.io/gitea/modules/util"
+)
+
+// Use at most this many bytes to determine Content Type.
+const sniffLen = 1024
+
+const (
+	// SvgMimeType MIME type of SVG images.
+	SvgMimeType = "image/svg+xml"
+	// ApplicationOctetStream MIME type of binary files.
+	ApplicationOctetStream = "application/octet-stream"
+)
+
+var (
+	svgComment       = regexp.MustCompile(`(?s)<!--.*?-->`)
+	svgTagRegex      = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
+	svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
+)
+
+// SniffedType contains information about a blob's type.
+type SniffedType struct {
+	contentType string
+}
+
+// IsText detects if content format is plain text.
+func (ct SniffedType) IsText() bool {
+	return strings.Contains(ct.contentType, "text/")
+}
+
+// IsImage detects if data is an image format
+func (ct SniffedType) IsImage() bool {
+	return strings.Contains(ct.contentType, "image/")
+}
+
+// IsSvgImage detects if data is an SVG image format
+func (ct SniffedType) IsSvgImage() bool {
+	return strings.Contains(ct.contentType, SvgMimeType)
+}
+
+// IsPDF detects if data is a PDF format
+func (ct SniffedType) IsPDF() bool {
+	return strings.Contains(ct.contentType, "application/pdf")
+}
+
+// IsVideo detects if data is a video format
+func (ct SniffedType) IsVideo() bool {
+	return strings.Contains(ct.contentType, "video/")
+}
+
+// IsAudio detects if data is an audio format
+func (ct SniffedType) IsAudio() bool {
+	return strings.Contains(ct.contentType, "audio/")
+}
+
+// IsRepresentableAsText returns true if file content can be represented as
+// plain text or is empty.
+func (ct SniffedType) IsRepresentableAsText() bool {
+	return ct.IsText() || ct.IsSvgImage()
+}
+
+// IsBrowsableBinaryType returns whether a non-text type can be displayed in a browser
+func (ct SniffedType) IsBrowsableBinaryType() bool {
+	return ct.IsImage() || ct.IsSvgImage() || ct.IsPDF() || ct.IsVideo() || ct.IsAudio()
+}
+
+// GetMimeType returns the mime type
+func (ct SniffedType) GetMimeType() string {
+	return strings.SplitN(ct.contentType, ";", 2)[0]
+}
+
+// DetectContentType extends http.DetectContentType with more content types. Defaults to text/unknown if input is empty.
+func DetectContentType(data []byte) SniffedType {
+	if len(data) == 0 {
+		return SniffedType{"text/unknown"}
+	}
+
+	ct := http.DetectContentType(data)
+
+	if len(data) > sniffLen {
+		data = data[:sniffLen]
+	}
+
+	// SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888
+
+	detectByHTML := strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html")
+	detectByXML := strings.Contains(ct, "text/xml")
+	if detectByHTML || detectByXML {
+		dataProcessed := svgComment.ReplaceAll(data, nil)
+		dataProcessed = bytes.TrimSpace(dataProcessed)
+		if detectByHTML && svgTagRegex.Match(dataProcessed) ||
+			detectByXML && svgTagInXMLRegex.Match(dataProcessed) {
+			ct = SvgMimeType
+		}
+	}
+
+	if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) {
+		// The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg".
+		// So remove the "ID3" prefix and detect again, if result is text, then it must be text content.
+		// This works especially because audio files contain many unprintable/invalid characters like `0x00`
+		ct2 := http.DetectContentType(data[3:])
+		if strings.HasPrefix(ct2, "text/") {
+			ct = ct2
+		}
+	}
+
+	if ct == "application/ogg" {
+		dataHead := data
+		if len(dataHead) > 256 {
+			dataHead = dataHead[:256] // only need to do a quick check for the file header
+		}
+		if bytes.Contains(dataHead, []byte("theora")) || bytes.Contains(dataHead, []byte("dirac")) {
+			ct = "video/ogg" // ogg is only used for some video formats, and it's not popular
+		} else {
+			ct = "audio/ogg" // for most cases, it is used as an audio container
+		}
+	}
+	return SniffedType{ct}
+}
+
+// DetectContentTypeFromReader guesses the content type contained in the reader.
+func DetectContentTypeFromReader(r io.Reader) (SniffedType, error) {
+	buf := make([]byte, sniffLen)
+	n, err := util.ReadAtMost(r, buf)
+	if err != nil {
+		return SniffedType{}, fmt.Errorf("DetectContentTypeFromReader io error: %w", err)
+	}
+	buf = buf[:n]
+
+	return DetectContentType(buf), nil
+}
diff --git a/modules/typesniffer/typesniffer_test.go b/modules/typesniffer/typesniffer_test.go
new file mode 100644
index 0000000..f6fa07e
--- /dev/null
+++ b/modules/typesniffer/typesniffer_test.go
@@ -0,0 +1,137 @@
+// Copyright 2021 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package typesniffer
+
+import (
+	"bytes"
+	"encoding/base64"
+	"encoding/hex"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestDetectContentTypeLongerThanSniffLen(t *testing.T) {
+	// Pre-condition: Shorter than sniffLen detects SVG.
+	assert.Equal(t, "image/svg+xml", DetectContentType([]byte(`<!-- Comment --><svg></svg>`)).contentType)
+	// Longer than sniffLen detects something else.
+	assert.NotEqual(t, "image/svg+xml", DetectContentType([]byte(`<!-- `+strings.Repeat("x", sniffLen)+` --><svg></svg>`)).contentType)
+}
+
+func TestIsTextFile(t *testing.T) {
+	assert.True(t, DetectContentType([]byte{}).IsText())
+	assert.True(t, DetectContentType([]byte("lorem ipsum")).IsText())
+}
+
+func TestIsSvgImage(t *testing.T) {
+	assert.True(t, DetectContentType([]byte("<svg></svg>")).IsSvgImage())
+	assert.True(t, DetectContentType([]byte("    <svg></svg>")).IsSvgImage())
+	assert.True(t, DetectContentType([]byte(`<svg width="100"></svg>`)).IsSvgImage())
+	assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?><svg></svg>`)).IsSvgImage())
+	assert.True(t, DetectContentType([]byte(`<!-- Comment -->
+	<svg></svg>`)).IsSvgImage())
+	assert.True(t, DetectContentType([]byte(`<!-- Multiple -->
+	<!-- Comments -->
+	<svg></svg>`)).IsSvgImage())
+	assert.True(t, DetectContentType([]byte(`<!-- Multiline
+	Comment -->
+	<svg></svg>`)).IsSvgImage())
+	assert.True(t, DetectContentType([]byte(`<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1 Basic//EN"
+	"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd">
+	<svg></svg>`)).IsSvgImage())
+	assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
+	<!-- Comment -->
+	<svg></svg>`)).IsSvgImage())
+	assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
+	<!-- Multiple -->
+	<!-- Comments -->
+	<svg></svg>`)).IsSvgImage())
+	assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
+	<!-- Multiline
+	Comment -->
+	<svg></svg>`)).IsSvgImage())
+	assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
+	<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+	<!-- Multiline
+	Comment -->
+	<svg></svg>`)).IsSvgImage())
+
+	// the DetectContentType should work for incomplete data, because only beginning bytes are used for detection
+	assert.True(t, DetectContentType([]byte(`<svg>....`)).IsSvgImage())
+
+	assert.False(t, DetectContentType([]byte{}).IsSvgImage())
+	assert.False(t, DetectContentType([]byte("svg")).IsSvgImage())
+	assert.False(t, DetectContentType([]byte("<svgfoo></svgfoo>")).IsSvgImage())
+	assert.False(t, DetectContentType([]byte("text<svg></svg>")).IsSvgImage())
+	assert.False(t, DetectContentType([]byte("<html><body><svg></svg></body></html>")).IsSvgImage())
+	assert.False(t, DetectContentType([]byte(`<script>"<svg></svg>"</script>`)).IsSvgImage())
+	assert.False(t, DetectContentType([]byte(`<!-- <svg></svg> inside comment -->
+	<foo></foo>`)).IsSvgImage())
+	assert.False(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
+	<!-- <svg></svg> inside comment -->
+	<foo></foo>`)).IsSvgImage())
+
+	assert.False(t, DetectContentType([]byte(`
+<!-- comment1 -->
+<div>
+	<!-- comment2 -->
+	<svg></svg>
+</div>
+`)).IsSvgImage())
+
+	assert.False(t, DetectContentType([]byte(`
+<!-- comment1
+-->
+<div>
+	<!-- comment2
+-->
+	<svg></svg>
+</div>
+`)).IsSvgImage())
+	assert.False(t, DetectContentType([]byte(`<html><body><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg></svg></body></html>`)).IsSvgImage())
+	assert.False(t, DetectContentType([]byte(`<html><body><?xml version="1.0" encoding="UTF-8"?><svg></svg></body></html>`)).IsSvgImage())
+}
+
+func TestIsPDF(t *testing.T) {
+	pdf, _ := base64.StdEncoding.DecodeString("JVBERi0xLjYKJcOkw7zDtsOfCjIgMCBvYmoKPDwvTGVuZ3RoIDMgMCBSL0ZpbHRlci9GbGF0ZURlY29kZT4+CnN0cmVhbQp4nF3NPwsCMQwF8D2f4s2CNYk1baF0EHRwOwg4iJt/NsFb/PpevUE4Mjwe")
+	assert.True(t, DetectContentType(pdf).IsPDF())
+	assert.False(t, DetectContentType([]byte("plain text")).IsPDF())
+}
+
+func TestIsVideo(t *testing.T) {
+	mp4, _ := base64.StdEncoding.DecodeString("AAAAGGZ0eXBtcDQyAAAAAGlzb21tcDQyAAEI721vb3YAAABsbXZoZAAAAADaBlwX2gZcFwAAA+gA")
+	assert.True(t, DetectContentType(mp4).IsVideo())
+	assert.False(t, DetectContentType([]byte("plain text")).IsVideo())
+}
+
+func TestIsAudio(t *testing.T) {
+	mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl")
+	assert.True(t, DetectContentType(mp3).IsAudio())
+	assert.False(t, DetectContentType([]byte("plain text")).IsAudio())
+
+	assert.True(t, DetectContentType([]byte("ID3Toy\000")).IsAudio())
+	assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ...")).IsText())          // test ID3 tag for plain text
+	assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."+"🌛"[0:2])).IsText()) // test ID3 tag with incomplete UTF8 char
+}
+
+func TestDetectContentTypeFromReader(t *testing.T) {
+	mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl")
+	st, err := DetectContentTypeFromReader(bytes.NewReader(mp3))
+	require.NoError(t, err)
+	assert.True(t, st.IsAudio())
+}
+
+func TestDetectContentTypeOgg(t *testing.T) {
+	oggAudio, _ := hex.DecodeString("4f67675300020000000000000000352f0000000000007dc39163011e01766f72626973000000000244ac0000000000000071020000000000b8014f6767530000")
+	st, err := DetectContentTypeFromReader(bytes.NewReader(oggAudio))
+	require.NoError(t, err)
+	assert.True(t, st.IsAudio())
+
+	oggVideo, _ := hex.DecodeString("4f676753000200000000000000007d9747ef000000009b59daf3012a807468656f7261030201001e00110001e000010e00020000001e00000001000001000001")
+	st, err = DetectContentTypeFromReader(bytes.NewReader(oggVideo))
+	require.NoError(t, err)
+	assert.True(t, st.IsVideo())
+}
author	Daniel Baumann <daniel@debian.org>	2024-10-18 20:33:49 +0200
committer	Daniel Baumann <daniel@debian.org>	2024-10-18 20:33:49 +0200
commit	dd136858f1ea40ad3c94191d647487fa4f31926c (patch)
tree	58fec94a7b2a12510c9664b21793f1ed560c6518 /modules/typesniffer
parent	Initial commit. (diff)
download	forgejo-dd136858f1ea40ad3c94191d647487fa4f31926c.tar.xz forgejo-dd136858f1ea40ad3c94191d647487fa4f31926c.zip