diff options
author | Daniel Baumann <daniel@debian.org> | 2024-10-18 20:33:49 +0200 |
---|---|---|
committer | Daniel Baumann <daniel@debian.org> | 2024-12-12 23:57:56 +0100 |
commit | e68b9d00a6e05b3a941f63ffb696f91e554ac5ec (patch) | |
tree | 97775d6c13b0f416af55314eb6a89ef792474615 /modules/markup/mdstripper | |
parent | Initial commit. (diff) | |
download | forgejo-e68b9d00a6e05b3a941f63ffb696f91e554ac5ec.tar.xz forgejo-e68b9d00a6e05b3a941f63ffb696f91e554ac5ec.zip |
Adding upstream version 9.0.3.
Signed-off-by: Daniel Baumann <daniel@debian.org>
Diffstat (limited to 'modules/markup/mdstripper')
-rw-r--r-- | modules/markup/mdstripper/mdstripper.go | 199 | ||||
-rw-r--r-- | modules/markup/mdstripper/mdstripper_test.go | 85 |
2 files changed, 284 insertions, 0 deletions
diff --git a/modules/markup/mdstripper/mdstripper.go b/modules/markup/mdstripper/mdstripper.go new file mode 100644 index 0000000..2a69d95 --- /dev/null +++ b/modules/markup/mdstripper/mdstripper.go @@ -0,0 +1,199 @@ +// Copyright 2019 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package mdstripper + +import ( + "bytes" + "io" + "net/url" + "strings" + "sync" + + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/markup/common" + "code.gitea.io/gitea/modules/setting" + + "github.com/yuin/goldmark" + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/extension" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/renderer" + "github.com/yuin/goldmark/renderer/html" + "github.com/yuin/goldmark/text" +) + +var ( + giteaHostInit sync.Once + giteaHost *url.URL +) + +type stripRenderer struct { + localhost *url.URL + links []string + empty bool +} + +func (r *stripRenderer) Render(w io.Writer, source []byte, doc ast.Node) error { + return ast.Walk(doc, func(n ast.Node, entering bool) (ast.WalkStatus, error) { + if !entering { + return ast.WalkContinue, nil + } + switch v := n.(type) { + case *ast.Text: + if !v.IsRaw() { + _, prevSibIsText := n.PreviousSibling().(*ast.Text) + coalesce := prevSibIsText + r.processString( + w, + v.Text(source), + coalesce) + if v.SoftLineBreak() { + r.doubleSpace(w) + } + } + return ast.WalkContinue, nil + case *ast.Link: + r.processLink(v.Destination) + return ast.WalkSkipChildren, nil + case *ast.AutoLink: + // This could be a reference to an issue or pull - if so convert it + r.processAutoLink(w, v.URL(source)) + return ast.WalkSkipChildren, nil + } + return ast.WalkContinue, nil + }) +} + +func (r *stripRenderer) doubleSpace(w io.Writer) { + if !r.empty { + _, _ = w.Write([]byte{'\n'}) + } +} + +func (r *stripRenderer) processString(w io.Writer, text []byte, coalesce bool) { + // Always break-up words + if !coalesce { + r.doubleSpace(w) + } + _, _ = w.Write(text) + r.empty = false +} + +// ProcessAutoLinks to detect and handle links to issues and pulls +func (r *stripRenderer) processAutoLink(w io.Writer, link []byte) { + linkStr := string(link) + u, err := url.Parse(linkStr) + if err != nil { + // Process out of band + r.links = append(r.links, linkStr) + return + } + + // Note: we're not attempting to match the URL scheme (http/https) + host := strings.ToLower(u.Host) + if host != "" && host != strings.ToLower(r.localhost.Host) { + // Process out of band + r.links = append(r.links, linkStr) + return + } + + // We want: /user/repo/issues/3 + parts := strings.Split(strings.TrimPrefix(u.EscapedPath(), r.localhost.EscapedPath()), "/") + if len(parts) != 5 || parts[0] != "" { + // Process out of band + r.links = append(r.links, linkStr) + return + } + + var sep string + if parts[3] == "issues" { + sep = "#" + } else if parts[3] == "pulls" { + sep = "!" + } else { + // Process out of band + r.links = append(r.links, linkStr) + return + } + + _, _ = w.Write([]byte(parts[1])) + _, _ = w.Write([]byte("/")) + _, _ = w.Write([]byte(parts[2])) + _, _ = w.Write([]byte(sep)) + _, _ = w.Write([]byte(parts[4])) +} + +func (r *stripRenderer) processLink(link []byte) { + // Links are processed out of band + r.links = append(r.links, string(link)) +} + +// GetLinks returns the list of link data collected while parsing +func (r *stripRenderer) GetLinks() []string { + return r.links +} + +// AddOptions adds given option to this renderer. +func (r *stripRenderer) AddOptions(...renderer.Option) { + // no-op +} + +// StripMarkdown parses markdown content by removing all markup and code blocks +// in order to extract links and other references +func StripMarkdown(rawBytes []byte) (string, []string) { + buf, links := StripMarkdownBytes(rawBytes) + return string(buf), links +} + +var ( + stripParser parser.Parser + once = sync.Once{} +) + +// StripMarkdownBytes parses markdown content by removing all markup and code blocks +// in order to extract links and other references +func StripMarkdownBytes(rawBytes []byte) ([]byte, []string) { + once.Do(func() { + gdMarkdown := goldmark.New( + goldmark.WithExtensions(extension.Table, + extension.Strikethrough, + extension.TaskList, + extension.DefinitionList, + common.FootnoteExtension, + common.Linkify, + ), + goldmark.WithParserOptions( + parser.WithAttribute(), + parser.WithAutoHeadingID(), + ), + goldmark.WithRendererOptions( + html.WithUnsafe(), + ), + ) + stripParser = gdMarkdown.Parser() + }) + stripper := &stripRenderer{ + localhost: getGiteaHost(), + links: make([]string, 0, 10), + empty: true, + } + reader := text.NewReader(rawBytes) + doc := stripParser.Parse(reader) + var buf bytes.Buffer + if err := stripper.Render(&buf, rawBytes, doc); err != nil { + log.Error("Unable to strip: %v", err) + } + return buf.Bytes(), stripper.GetLinks() +} + +// getGiteaHostName returns a normalized string with the local host name, with no scheme or port information +func getGiteaHost() *url.URL { + giteaHostInit.Do(func() { + var err error + if giteaHost, err = url.Parse(setting.AppURL); err != nil { + giteaHost = &url.URL{} + } + }) + return giteaHost +} diff --git a/modules/markup/mdstripper/mdstripper_test.go b/modules/markup/mdstripper/mdstripper_test.go new file mode 100644 index 0000000..ea34df0 --- /dev/null +++ b/modules/markup/mdstripper/mdstripper_test.go @@ -0,0 +1,85 @@ +// Copyright 2019 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package mdstripper + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestMarkdownStripper(t *testing.T) { + type testItem struct { + markdown string + expectedText []string + expectedLinks []string + } + + list := []testItem{ + { + ` +## This is a title + +This is [one](link) to paradise. +This **is emphasized**. +This: should coalesce. + +` + "```" + ` +This is a code block. +This should not appear in the output at all. +` + "```" + ` + +* Bullet 1 +* Bullet 2 + +A HIDDEN ` + "`" + `GHOST` + "`" + ` IN THIS LINE. + `, + []string{ + "This is a title", + "This is", + "to paradise.", + "This", + "is emphasized", + ".", + "This: should coalesce.", + "Bullet 1", + "Bullet 2", + "A HIDDEN", + "IN THIS LINE.", + }, + []string{ + "link", + }, + }, + { + "Simply closes: #29 yes", + []string{ + "Simply closes: #29 yes", + }, + []string{}, + }, + { + "Simply closes: !29 yes", + []string{ + "Simply closes: !29 yes", + }, + []string{}, + }, + } + + for _, test := range list { + text, links := StripMarkdown([]byte(test.markdown)) + rawlines := strings.Split(text, "\n") + lines := make([]string, 0, len(rawlines)) + for _, line := range rawlines { + line := strings.TrimSpace(line) + if line != "" { + lines = append(lines, line) + } + } + assert.EqualValues(t, test.expectedText, lines) + assert.EqualValues(t, test.expectedLinks, links) + } +} |