services/gitdiff/highlightdiff.go


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227

// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package gitdiff

import (
	"strings"

	"code.gitea.io/gitea/modules/highlight"

	"github.com/sergi/go-diff/diffmatchpatch"
)

// token is a html tag or entity, eg: "<span ...>", "</span>", "&lt;"
func extractHTMLToken(s string) (before, token, after string, valid bool) {
	for pos1 := 0; pos1 < len(s); pos1++ {
		if s[pos1] == '<' {
			pos2 := strings.IndexByte(s[pos1:], '>')
			if pos2 == -1 {
				return "", "", s, false
			}
			return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
		} else if s[pos1] == '&' {
			pos2 := strings.IndexByte(s[pos1:], ';')
			if pos2 == -1 {
				return "", "", s, false
			}
			return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
		}
	}
	return "", "", s, true
}

// HighlightCodeDiff is used to do diff with highlighted HTML code.
// It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
// The HTML tags and entities will be replaced by Unicode placeholders: "<span>{TEXT}</span>" => "\uE000{TEXT}\uE001"
// These Unicode placeholders are friendly to the diff.
// Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities.
// It's guaranteed that the tags in final diff result are paired correctly.
type HighlightCodeDiff struct {
	placeholderBegin    rune
	placeholderMaxCount int
	placeholderIndex    int
	PlaceholderTokenMap map[rune]string
	tokenPlaceholderMap map[string]rune

	placeholderOverflowCount int

	lineWrapperTags []string
}

func NewHighlightCodeDiff() *HighlightCodeDiff {
	return &HighlightCodeDiff{
		placeholderBegin:    rune(0x100000), // Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD)
		placeholderMaxCount: 64000,
		PlaceholderTokenMap: map[rune]string{},
		tokenPlaceholderMap: map[string]rune{},
	}
}

// NextPlaceholder returns 0 if no more placeholder can be used
// the diff is done line by line, usually there are only a few (no more than 10) placeholders in one line
// so the placeholderMaxCount is impossible to be exhausted in real cases.
func (hcd *HighlightCodeDiff) NextPlaceholder() rune {
	for hcd.placeholderIndex < hcd.placeholderMaxCount {
		r := hcd.placeholderBegin + rune(hcd.placeholderIndex)
		hcd.placeholderIndex++
		// only use non-existing (not used by code) rune as placeholders
		if _, ok := hcd.PlaceholderTokenMap[r]; !ok {
			return r
		}
	}
	return 0 // no more available placeholder
}

func (hcd *HighlightCodeDiff) isInPlaceholderRange(r rune) bool {
	return hcd.placeholderBegin <= r && r < hcd.placeholderBegin+rune(hcd.placeholderMaxCount)
}

func (hcd *HighlightCodeDiff) CollectUsedRunes(code string) {
	for _, r := range code {
		if hcd.isInPlaceholderRange(r) {
			// put the existing rune (used by code) in map, then this rune won't be used a placeholder anymore.
			hcd.PlaceholderTokenMap[r] = ""
		}
	}
}

func (hcd *HighlightCodeDiff) diffWithHighlight(filename, language, codeA, codeB string) []diffmatchpatch.Diff {
	hcd.CollectUsedRunes(codeA)
	hcd.CollectUsedRunes(codeB)

	highlightCodeA, _ := highlight.Code(filename, language, codeA)
	highlightCodeB, _ := highlight.Code(filename, language, codeB)

	convertedCodeA := hcd.ConvertToPlaceholders(string(highlightCodeA))
	convertedCodeB := hcd.ConvertToPlaceholders(string(highlightCodeB))

	diffs := diffMatchPatch.DiffMain(convertedCodeA, convertedCodeB, true)
	diffs = diffMatchPatch.DiffCleanupSemantic(diffs)
	diffs = diffMatchPatch.DiffCleanupEfficiency(diffs)

	for i := range diffs {
		hcd.recoverOneDiff(&diffs[i])
	}
	return diffs
}

// convertToPlaceholders totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
func (hcd *HighlightCodeDiff) ConvertToPlaceholders(htmlCode string) string {
	var tagStack []string
	res := strings.Builder{}

	firstRunForLineTags := hcd.lineWrapperTags == nil

	var beforeToken, token string
	var valid bool

	// the standard chroma highlight HTML is "<span class="line [hl]"><span class="cl"> ... </span></span>"
	for {
		beforeToken, token, htmlCode, valid = extractHTMLToken(htmlCode)
		if !valid || token == "" {
			break
		}
		// write the content before the token into result string, and consume the token in the string
		res.WriteString(beforeToken)

		// the line wrapper tags should be removed before diff
		if strings.HasPrefix(token, `<span class="line`) || strings.HasPrefix(token, `<span class="cl"`) {
			if firstRunForLineTags {
				// if this is the first run for converting, save the line wrapper tags for later use, they should be added back
				hcd.lineWrapperTags = append(hcd.lineWrapperTags, token)
			}
			htmlCode = strings.TrimSuffix(htmlCode, "</span>")
			continue
		}

		var tokenInMap string
		if strings.HasSuffix(token, "</") { // for closing tag
			if len(tagStack) == 0 {
				break // invalid diff result, no opening tag but see closing tag
			}
			// make sure the closing tag in map is related to the open tag, to make the diff algorithm can match the opening/closing tags
			// the closing tag will be recorded in the map by key "</span><!-- <span the-opening> -->" for "<span the-opening>"
			tokenInMap = token + "<!-- " + tagStack[len(tagStack)-1] + "-->"
			tagStack = tagStack[:len(tagStack)-1]
		} else if token[0] == '<' { // for opening tag
			tokenInMap = token
			tagStack = append(tagStack, token)
		} else if token[0] == '&' { // for html entity
			tokenInMap = token
		} // else: impossible

		// remember the placeholder and token in the map
		placeholder, ok := hcd.tokenPlaceholderMap[tokenInMap]
		if !ok {
			placeholder = hcd.NextPlaceholder()
			if placeholder != 0 {
				hcd.tokenPlaceholderMap[tokenInMap] = placeholder
				hcd.PlaceholderTokenMap[placeholder] = tokenInMap
			}
		}

		if placeholder != 0 {
			res.WriteRune(placeholder) // use the placeholder to replace the token
		} else {
			// unfortunately, all private use runes has been exhausted, no more placeholder could be used, no more converting
			// usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma.
			hcd.placeholderOverflowCount++
			if strings.HasPrefix(token, "&") {
				// when the token is a html entity, something must be outputted even if there is no placeholder.
				res.WriteRune(0xFFFD)      // replacement character TODO: how to handle this case more gracefully?
				res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result.
			}
		}
	}

	// write the remaining string
	res.WriteString(htmlCode)
	return res.String()
}

func (hcd *HighlightCodeDiff) recoverOneDiff(diff *diffmatchpatch.Diff) {
	diff.Text = hcd.Recover(diff.Text)
}

func (hcd *HighlightCodeDiff) Recover(src string) string {
	sb := strings.Builder{}
	var tagStack []string

	for _, r := range src {
		token, ok := hcd.PlaceholderTokenMap[r]
		if !ok || token == "" {
			sb.WriteRune(r) // if the rune is not a placeholder, write it as it is
			continue
		}
		var tokenToRecover string
		if strings.HasPrefix(token, "</") { // for closing tag
			// only get the tag itself, ignore the trailing comment (for how the comment is generated, see the code in `convert` function)
			tokenToRecover = token[:strings.IndexByte(token, '>')+1]
			if len(tagStack) == 0 {
				continue // if no opening tag in stack yet, skip the closing tag
			}
			tagStack = tagStack[:len(tagStack)-1]
		} else if token[0] == '<' { // for opening tag
			tokenToRecover = token
			tagStack = append(tagStack, token)
		} else if token[0] == '&' { // for html entity
			tokenToRecover = token
		} // else: impossible
		sb.WriteString(tokenToRecover)
	}

	if len(tagStack) > 0 {
		// close all opening tags
		for i := len(tagStack) - 1; i >= 0; i-- {
			tagToClose := tagStack[i]
			// get the closing tag "</span>" from "<span class=...>" or "<span>"
			pos := strings.IndexAny(tagToClose, " >")
			if pos != -1 {
				sb.WriteString("</" + tagToClose[1:pos] + ">")
			} // else: impossible. every tag was pushed into the stack by the code above and is valid HTML opening tag
		}
	}

	return sb.String()
}