summaryrefslogtreecommitdiffstats
path: root/modules/markup/mdstripper/mdstripper.go
blob: 2a69d952244abf17b56ce39a81a6b5bf7808c573 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package mdstripper

import (
	"bytes"
	"io"
	"net/url"
	"strings"
	"sync"

	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/markup/common"
	"code.gitea.io/gitea/modules/setting"

	"github.com/yuin/goldmark"
	"github.com/yuin/goldmark/ast"
	"github.com/yuin/goldmark/extension"
	"github.com/yuin/goldmark/parser"
	"github.com/yuin/goldmark/renderer"
	"github.com/yuin/goldmark/renderer/html"
	"github.com/yuin/goldmark/text"
)

// giteaHost caches the URL parsed from setting.AppURL; giteaHostInit makes
// sure that parsing happens only once. Both are managed by getGiteaHost and
// should be read through it rather than directly.
var (
	giteaHostInit sync.Once
	giteaHost     *url.URL
)

// stripRenderer walks a parsed markdown document, writing its plain text to a
// writer and collecting link targets on the side.
//
//   - localhost is the URL of this instance; auto-links are compared against
//     its host and path to recognise local issue and pull request URLs.
//   - links accumulates the link targets that are reported through GetLinks
//     instead of being written to the output.
//   - empty is true until processString has written text for the first time;
//     while it is true no separating newline is emitted.
type stripRenderer struct {
	localhost *url.URL
	links     []string
	empty     bool
}

// Render walks doc and writes the plain text it contains to w, while link
// targets are gathered in r.links. source must be the markdown the document
// was parsed from, because node contents are read back out of it.
//
// Only three node kinds are acted upon; all others are simply traversed:
//   - links: the destination is recorded and the children are skipped, so the
//     link label does not reach the output;
//   - auto-links: handed to processAutoLink, children skipped;
//   - non-raw text: written through processString, followed by a separator
//     when the node ends in a soft line break.
//
// The returned error is whatever ast.Walk reports.
func (r *stripRenderer) Render(w io.Writer, source []byte, doc ast.Node) error {
	visit := func(node ast.Node, entering bool) (ast.WalkStatus, error) {
		// Every node is handled once, on the way in.
		if !entering {
			return ast.WalkContinue, nil
		}

		switch typed := node.(type) {
		case *ast.Link:
			r.processLink(typed.Destination)
			return ast.WalkSkipChildren, nil

		case *ast.AutoLink:
			// This may reference an issue or a pull request, in which case
			// it is converted to its short form.
			r.processAutoLink(w, typed.URL(source))
			return ast.WalkSkipChildren, nil

		case *ast.Text:
			if typed.IsRaw() {
				break
			}
			// Text directly following another text node continues the same
			// run and must not be split by a separator.
			_, followsText := node.PreviousSibling().(*ast.Text)
			r.processString(w, typed.Text(source), followsText)
			if typed.SoftLineBreak() {
				r.doubleSpace(w)
			}
		}

		return ast.WalkContinue, nil
	}

	return ast.Walk(doc, visit)
}

// doubleSpace writes a single newline to w as a separator between pieces of
// text. Nothing is written while r.empty is still set, so the output never
// starts with a separator. The result of the write is deliberately ignored.
func (r *stripRenderer) doubleSpace(w io.Writer) {
	if r.empty {
		return
	}
	_, _ = w.Write([]byte("\n"))
}

// processString writes text to w and marks the renderer as no longer empty.
// When coalesce is false the text starts a new run and a separator is
// requested first through doubleSpace; when it is true the text is appended
// directly to what was written before. The result of the write is ignored.
func (r *stripRenderer) processString(w io.Writer, text []byte, coalesce bool) {
	startsNewRun := !coalesce
	if startsNewRun {
		// Keep words from adjacent, unrelated nodes apart.
		r.doubleSpace(w)
	}

	_, _ = w.Write(text)
	r.empty = false
}

// processAutoLink handles a bare URL found in the text. A URL that points at
// this instance (same host, compared case-insensitively, or no host at all)
// and whose path, once the instance's own base path has been removed, has the
// shape /user/repo/issues/N or /user/repo/pulls/N is written to w in short
// form: "user/repo#N" for issues and "user/repo!N" for pulls. Every other
// URL, including one that cannot be parsed, is appended unchanged to r.links
// and nothing is written.
func (r *stripRenderer) processAutoLink(w io.Writer, link []byte) {
	raw := string(link)
	// collect records the URL for out-of-band processing.
	collect := func() {
		r.links = append(r.links, raw)
	}

	parsed, err := url.Parse(raw)
	if err != nil {
		collect()
		return
	}

	// Note: the URL scheme (http/https) is intentionally not compared.
	if host := strings.ToLower(parsed.Host); host != "" && host != strings.ToLower(r.localhost.Host) {
		collect()
		return
	}

	// We want: /user/repo/issues/3, which splits into exactly five segments
	// with an empty first one.
	// NOTE(review): when the instance's base path ends in "/", trimming it
	// also removes the leading slash, so the check below fails and the link
	// is collected out of band instead - confirm this is intended.
	relative := strings.TrimPrefix(parsed.EscapedPath(), r.localhost.EscapedPath())
	segments := strings.Split(relative, "/")
	if len(segments) != 5 || segments[0] != "" {
		collect()
		return
	}

	var marker string
	switch segments[3] {
	case "issues":
		marker = "#"
	case "pulls":
		marker = "!"
	default:
		collect()
		return
	}

	// Emit owner/repo<marker>index piece by piece; write results are ignored.
	for _, piece := range []string{segments[1], "/", segments[2], marker, segments[4]} {
		_, _ = w.Write([]byte(piece))
	}
}

// processLink records the destination of a markdown link. Links are never
// written to the output; they are collected in r.links for out-of-band
// processing.
func (r *stripRenderer) processLink(link []byte) {
	destination := string(link)
	r.links = append(r.links, destination)
}

// GetLinks returns the link targets collected while rendering, in the order
// in which they were encountered. The renderer's own slice is returned, not a
// copy.
func (r *stripRenderer) GetLinks() []string {
	return r.links
}

// AddOptions accepts renderer options and discards them: stripRenderer has
// nothing to configure, so this is a deliberate no-op (presumably kept so the
// type offers the same method set as a goldmark renderer - confirm).
func (r *stripRenderer) AddOptions(_ ...renderer.Option) {}

// StripMarkdown is the string-returning variant of StripMarkdownBytes: it
// strips the markup from rawBytes and returns the remaining text as a string
// together with the link targets that were found.
func StripMarkdown(rawBytes []byte) (string, []string) {
	stripped, links := StripMarkdownBytes(rawBytes)
	return string(stripped), links
}

// stripParser is the markdown parser shared by all calls to
// StripMarkdownBytes; once guards its one-time construction there.
var (
	stripParser parser.Parser
	once        sync.Once
)

// StripMarkdownBytes parses rawBytes as markdown and returns the text that
// remains after the markup has been stripped, together with the link targets
// found in the document.
//
// The parser is built on the first call and reused afterwards. If rendering
// fails the error is logged and whatever output was produced up to that point
// is still returned.
func StripMarkdownBytes(rawBytes []byte) ([]byte, []string) {
	once.Do(func() {
		extensions := goldmark.WithExtensions(
			extension.Table,
			extension.Strikethrough,
			extension.TaskList,
			extension.DefinitionList,
			common.FootnoteExtension,
			common.Linkify,
		)
		parserOptions := goldmark.WithParserOptions(
			parser.WithAttribute(),
			parser.WithAutoHeadingID(),
		)
		// NOTE(review): only the parser of this instance is kept below, so
		// the renderer option presumably has no effect here - confirm.
		rendererOptions := goldmark.WithRendererOptions(
			html.WithUnsafe(),
		)
		stripParser = goldmark.New(extensions, parserOptions, rendererOptions).Parser()
	})

	stripper := &stripRenderer{
		localhost: getGiteaHost(),
		links:     make([]string, 0, 10),
		empty:     true,
	}

	doc := stripParser.Parse(text.NewReader(rawBytes))

	var out bytes.Buffer
	err := stripper.Render(&out, rawBytes, doc)
	if err != nil {
		log.Error("Unable to strip: %v", err)
	}
	return out.Bytes(), stripper.GetLinks()
}

// getGiteaHost returns the URL of this instance as parsed from
// setting.AppURL. Parsing happens once, on the first call; if it fails an
// empty URL is stored instead, so the result is never nil.
func getGiteaHost() *url.URL {
	giteaHostInit.Do(func() {
		parsed, err := url.Parse(setting.AppURL)
		if err != nil {
			parsed = &url.URL{}
		}
		giteaHost = parsed
	})
	return giteaHost
}