diff options
author | Daniel Baumann <daniel@debian.org> | 2024-10-18 20:33:49 +0200 |
---|---|---|
committer | Daniel Baumann <daniel@debian.org> | 2024-10-18 20:33:49 +0200 |
commit | dd136858f1ea40ad3c94191d647487fa4f31926c (patch) | |
tree | 58fec94a7b2a12510c9664b21793f1ed560c6518 /modules/typesniffer/typesniffer.go | |
parent | Initial commit. (diff) | |
download | forgejo-debian.tar.xz forgejo-debian.zip |
Adding upstream version 9.0.0.HEADupstream/9.0.0upstreamdebian
Signed-off-by: Daniel Baumann <daniel@debian.org>
Diffstat (limited to '')
-rw-r--r-- | modules/typesniffer/typesniffer.go | 143 |
1 files changed, 143 insertions, 0 deletions
diff --git a/modules/typesniffer/typesniffer.go b/modules/typesniffer/typesniffer.go new file mode 100644 index 0000000..6aec5c2 --- /dev/null +++ b/modules/typesniffer/typesniffer.go @@ -0,0 +1,143 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package typesniffer + +import ( + "bytes" + "fmt" + "io" + "net/http" + "regexp" + "strings" + + "code.gitea.io/gitea/modules/util" +) + +// Use at most this many bytes to determine Content Type. +const sniffLen = 1024 + +const ( + // SvgMimeType MIME type of SVG images. + SvgMimeType = "image/svg+xml" + // ApplicationOctetStream MIME type of binary files. + ApplicationOctetStream = "application/octet-stream" +) + +var ( + svgComment = regexp.MustCompile(`(?s)<!--.*?-->`) + svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`) + svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`) +) + +// SniffedType contains information about a blobs type. +type SniffedType struct { + contentType string +} + +// IsText etects if content format is plain text. +func (ct SniffedType) IsText() bool { + return strings.Contains(ct.contentType, "text/") +} + +// IsImage detects if data is an image format +func (ct SniffedType) IsImage() bool { + return strings.Contains(ct.contentType, "image/") +} + +// IsSvgImage detects if data is an SVG image format +func (ct SniffedType) IsSvgImage() bool { + return strings.Contains(ct.contentType, SvgMimeType) +} + +// IsPDF detects if data is a PDF format +func (ct SniffedType) IsPDF() bool { + return strings.Contains(ct.contentType, "application/pdf") +} + +// IsVideo detects if data is an video format +func (ct SniffedType) IsVideo() bool { + return strings.Contains(ct.contentType, "video/") +} + +// IsAudio detects if data is an video format +func (ct SniffedType) IsAudio() bool { + return strings.Contains(ct.contentType, "audio/") +} + +// IsRepresentableAsText returns true if file content can be represented as +// plain text or is empty. +func (ct SniffedType) IsRepresentableAsText() bool { + return ct.IsText() || ct.IsSvgImage() +} + +// IsBrowsableBinaryType returns whether a non-text type can be displayed in a browser +func (ct SniffedType) IsBrowsableBinaryType() bool { + return ct.IsImage() || ct.IsSvgImage() || ct.IsPDF() || ct.IsVideo() || ct.IsAudio() +} + +// GetMimeType returns the mime type +func (ct SniffedType) GetMimeType() string { + return strings.SplitN(ct.contentType, ";", 2)[0] +} + +// DetectContentType extends http.DetectContentType with more content types. Defaults to text/unknown if input is empty. +func DetectContentType(data []byte) SniffedType { + if len(data) == 0 { + return SniffedType{"text/unknown"} + } + + ct := http.DetectContentType(data) + + if len(data) > sniffLen { + data = data[:sniffLen] + } + + // SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888 + + detectByHTML := strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html") + detectByXML := strings.Contains(ct, "text/xml") + if detectByHTML || detectByXML { + dataProcessed := svgComment.ReplaceAll(data, nil) + dataProcessed = bytes.TrimSpace(dataProcessed) + if detectByHTML && svgTagRegex.Match(dataProcessed) || + detectByXML && svgTagInXMLRegex.Match(dataProcessed) { + ct = SvgMimeType + } + } + + if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) { + // The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg". + // So remove the "ID3" prefix and detect again, if result is text, then it must be text content. + // This works especially because audio files contain many unprintable/invalid characters like `0x00` + ct2 := http.DetectContentType(data[3:]) + if strings.HasPrefix(ct2, "text/") { + ct = ct2 + } + } + + if ct == "application/ogg" { + dataHead := data + if len(dataHead) > 256 { + dataHead = dataHead[:256] // only need to do a quick check for the file header + } + if bytes.Contains(dataHead, []byte("theora")) || bytes.Contains(dataHead, []byte("dirac")) { + ct = "video/ogg" // ogg is only used for some video formats, and it's not popular + } else { + ct = "audio/ogg" // for most cases, it is used as an audio container + } + } + return SniffedType{ct} +} + +// DetectContentTypeFromReader guesses the content type contained in the reader. +func DetectContentTypeFromReader(r io.Reader) (SniffedType, error) { + buf := make([]byte, sniffLen) + n, err := util.ReadAtMost(r, buf) + if err != nil { + return SniffedType{}, fmt.Errorf("DetectContentTypeFromReader io error: %w", err) + } + buf = buf[:n] + + return DetectContentType(buf), nil +} |